diff --git a/.env.example b/.env.example index 11cb2d9..405edd1 100644 --- a/.env.example +++ b/.env.example @@ -1,11 +1,213 @@ -๏ปฟ# Neo4j Configuration -NEO4J_URI=bolt://localhost:7687 +# ============================================ +# Panama Papers Neo4j Project - Environment Configuration +# ============================================ +# This is a template file. Copy this to .env and fill in actual values. +# DO NOT commit .env to git (only commit .env.example) +# +# Usage: +# 1. Copy this file: cp .env.example .env +# 2. Update values in .env with your actual credentials +# 3. Restart services: docker-compose restart +# ============================================ + +# ============================================ +# Neo4j Database Configuration +# ============================================ +# Neo4j connection URI using Bolt protocol +# Format: bolt://host:port or bolt+s://host:port (for SSL) +# Default: bolt://neo4j:7687 (for Docker) or bolt://localhost:7687 (for local) +NEO4J_URI=bolt://neo4j:7687 + +# Neo4j database username +# Default username is 'neo4j' NEO4J_USER=neo4j -NEO4J_PASSWORD=password -# API Configuration -API_HOST=0.0.0.0 -API_PORT=8000 +# Neo4j database password +# IMPORTANT: Change this from default for production! 
+# Default password on first install is 'neo4j' but must be changed +NEO4J_PASSWORD=changeme123 + +# Neo4j database name (optional, defaults to 'neo4j') +# Only needed if using multiple databases (Enterprise feature) +# NEO4J_DATABASE=neo4j + +# ============================================ +# FastAPI Application Configuration +# ============================================ +# Host address for FastAPI server +# 0.0.0.0 allows external connections (required for Docker) +# Use 127.0.0.1 for localhost-only access +FASTAPI_HOST=0.0.0.0 + +# Port for FastAPI server +# Default: 8000 +FASTAPI_PORT=8000 + +# Enable auto-reload on code changes +# Set to 'true' for development, 'false' for production +# Auto-reload watches for file changes and restarts the server +FASTAPI_RELOAD=false + +# API base path (if behind reverse proxy) +# Example: /api/v1 +# API_BASE_PATH=/ + +# CORS allowed origins (comma-separated) +# Example: http://localhost:3000,https://yourdomain.com +# Use * to allow all origins (not recommended for production) +# CORS_ORIGINS=* + +# ============================================ +# Database Initialization & Seeding +# ============================================ +# Automatically seed database on startup +# Set to 'true' to load initial data, 'false' to skip +# Note: Seeding will only run if database is empty +SEED_DATABASE=true + +# Path to Panama Papers data file +# Supports CSV, JSON, or Parquet formats +# Can be absolute path or relative to project root +DATA_SOURCE_PATH=./data/panama-papers.csv + +# Alternative data sources (uncomment to use) +# DATA_SOURCE_PATH=./data/panama-papers.json +# DATA_SOURCE_PATH=./data/panama-papers.parquet +# DATA_SOURCE_PATH=https://example.com/panama-papers-data.csv + +# Batch size for data import (number of records per transaction) +# Larger batches are faster but use more memory +# Recommended: 1000-10000 depending on available memory +IMPORT_BATCH_SIZE=5000 + +# Clear existing data before seeding +# WARNING: Set to 'true' 
will DELETE all existing data! +CLEAR_DATABASE_BEFORE_SEED=false + +# ============================================ +# Logging Configuration +# ============================================ +# Logging level for application +# Options: DEBUG, INFO, WARNING, ERROR, CRITICAL +# DEBUG: Most verbose, shows all details +# INFO: Standard logging, recommended for production +# WARNING: Only warnings and errors +# ERROR: Only errors and critical issues +# CRITICAL: Only critical failures +LOG_LEVEL=INFO + +# Enable debug mode +# Shows detailed error messages and stack traces +# Set to 'false' in production for security +DEBUG=false + +# Log format +# Options: json, text +# json: Structured logging (better for log aggregation) +# text: Human-readable format (better for development) +LOG_FORMAT=text + +# Log file path (optional) +# Leave empty to log only to console +# Example: ./logs/app.log +# LOG_FILE_PATH= + +# ============================================ +# Performance & Optimization +# ============================================ +# Maximum number of database connections in pool +# Increase for high-traffic applications +NEO4J_MAX_CONNECTION_POOL_SIZE=50 + +# Connection timeout in seconds +NEO4J_CONNECTION_TIMEOUT=30 + +# Query timeout in seconds (0 = no timeout) +# Prevents long-running queries from blocking resources +NEO4J_QUERY_TIMEOUT=300 + +# Enable query result caching +# Caches frequently-used query results for faster responses +ENABLE_QUERY_CACHE=true + +# Cache TTL in seconds (time-to-live) +CACHE_TTL=3600 + +# ============================================ +# Security Configuration +# ============================================ +# Secret key for JWT tokens (if using authentication) +# Generate with: openssl rand -hex 32 +# IMPORTANT: Keep this secret and change in production! 
+# SECRET_KEY=your-secret-key-here-change-in-production + +# JWT token expiration time in minutes +# TOKEN_EXPIRE_MINUTES=30 + +# Enable HTTPS only (for production) +# FORCE_HTTPS=false + +# API rate limiting (requests per minute) +# RATE_LIMIT_PER_MINUTE=60 + +# ============================================ +# Optional Features +# ============================================ +# Enable API documentation endpoints +# Set to 'false' to disable /docs and /redoc in production +ENABLE_DOCS=true + +# Enable metrics and monitoring endpoint +# Exposes /metrics endpoint for Prometheus +ENABLE_METRICS=false + +# Enable GraphQL API (in addition to REST) +# ENABLE_GRAPHQL=false + +# ============================================ +# Development & Testing +# ============================================ +# Environment mode +# Options: development, staging, production +ENVIRONMENT=production + +# Enable profiling for performance analysis +# ENABLE_PROFILING=false + +# Test database URI (for running tests) +# TEST_NEO4J_URI=bolt://localhost:7687 +# TEST_NEO4J_USER=neo4j +# TEST_NEO4J_PASSWORD=testpassword + +# ============================================ +# Docker-Specific Configuration +# ============================================ +# These are automatically set by docker-compose +# Only modify if running outside Docker + +# Neo4j container memory limits +# NEO4J_HEAP_INITIAL_SIZE=512m +# NEO4J_HEAP_MAX_SIZE=2G +# NEO4J_PAGECACHE_SIZE=1G + +# ============================================ +# External Services (Optional) +# ============================================ +# Redis for caching (if enabled) +# REDIS_URI=redis://localhost:6379/0 + +# Elasticsearch for full-text search (if enabled) +# ELASTICSEARCH_URI=http://localhost:9200 + +# Sentry for error tracking (if enabled) +# SENTRY_DSN=https://your-sentry-dsn -# Environment -ENVIRONMENT=development +# ============================================ +# IMPORTANT NOTES +# ============================================ +# 1. 
Always change default passwords in production +# 2. Never commit .env to version control +# 3. Use strong passwords and secret keys +# 4. Keep this file updated when adding new variables +# 5. Document any custom variables added to your project +# ============================================ diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..9fba966 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,480 @@ +# ============================================================================= +# PANAMA PAPERS NEO4J PROJECT - CI/CD PIPELINE +# ============================================================================= +# GitHub Actions workflow for continuous integration and deployment +# +# Stages: +# 1. Lint - Code quality check with Pylint (score >= 9.0) +# 2. Test - Unit tests with pytest and coverage (>= 60%) +# 3. Build - Docker image build and optional push to registry +# +# Triggers: +# - Push to main, develop, feature/**, hotfix/**, release/** branches +# - Pull requests to main, develop +# - Manual trigger via workflow_dispatch +# ============================================================================= + +name: CI/CD Pipeline + +on: + push: + branches: + - main + - develop + - 'feature/**' + - 'feature/*' + - 'hotfix/**' + - 'release/**' + paths-ignore: + - '**.md' + - 'docs/**' + - '.gitignore' + - 'LICENSE' + + pull_request: + branches: + - main + - develop + types: + - opened + - synchronize + - reopened + + workflow_dispatch: + inputs: + skip_tests: + description: 'Skip test stage' + required: false + default: 'false' + type: boolean + push_image: + description: 'Push Docker image to registry' + required: false + default: 'false' + type: boolean + +# Cancel in-progress runs for the same branch +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +# Environment variables available to all jobs +env: + PYTHON_VERSION: '3.11' + PYLINT_MIN_SCORE: '9.0' + COVERAGE_MIN_PERCENT: '60' +
DOCKER_IMAGE_NAME: 'panama-papers-api' + +# ============================================================================= +# JOB: LINT +# ============================================================================= +jobs: + lint: + name: ๐Ÿ” Code Quality (Pylint) + runs-on: ubuntu-latest + timeout-minutes: 10 + + steps: + # -------------------------------------------------- + # Checkout repository + # -------------------------------------------------- + - name: ๐Ÿ“ฅ Checkout code + uses: actions/checkout@v4 + + # -------------------------------------------------- + # Setup Python with pip caching + # -------------------------------------------------- + - name: ๐Ÿ Set up Python ${{ env.PYTHON_VERSION }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + cache: 'pip' + cache-dependency-path: | + requirements.txt + requirements-dev.txt + + # -------------------------------------------------- + # Install dependencies + # -------------------------------------------------- + - name: ๐Ÿ“ฆ Install dependencies + run: | + python -m pip install --upgrade pip wheel setuptools + pip install pylint==3.0.3 black==23.11.0 isort==5.12.0 + pip install -r requirements.txt + + # -------------------------------------------------- + # Check code formatting with Black + # -------------------------------------------------- + - name: ๐ŸŽจ Check formatting (Black) + run: | + black --check --diff app/ scripts/ tests/ || echo "::warning::Code formatting issues found. Run 'black app/ scripts/ tests/' to fix." + continue-on-error: true + + # -------------------------------------------------- + # Check import sorting with isort + # -------------------------------------------------- + - name: ๐Ÿ“‹ Check imports (isort) + run: | + isort --check-only --diff app/ scripts/ tests/ || echo "::warning::Import sorting issues found. Run 'isort app/ scripts/ tests/' to fix." 
+ continue-on-error: true + + # -------------------------------------------------- + # Run Pylint and check score + # -------------------------------------------------- + - name: ๐Ÿ” Run Pylint + id: pylint + run: | + echo "Running Pylint on app/ directory..." + + # Run pylint and capture output + pylint app/ \ + --output-format=text \ + --exit-zero \ + --reports=yes \ + --score=yes \ + | tee pylint-report.txt + + # Extract score from report + SCORE=$(grep "Your code has been rated at" pylint-report.txt | grep -oP '\d+\.\d+' | head -1) + + if [ -z "$SCORE" ]; then + echo "::error::Could not extract Pylint score" + exit 1 + fi + + echo "pylint_score=$SCORE" >> $GITHUB_OUTPUT + echo "## ๐Ÿ” Pylint Results" >> $GITHUB_STEP_SUMMARY + echo "**Score:** $SCORE / 10.0" >> $GITHUB_STEP_SUMMARY + echo "**Minimum Required:** ${{ env.PYLINT_MIN_SCORE }}" >> $GITHUB_STEP_SUMMARY + + # Check if score meets minimum + if (( $(echo "$SCORE < ${{ env.PYLINT_MIN_SCORE }}" | bc -l) )); then + echo "::error::Pylint score $SCORE is below minimum ${{ env.PYLINT_MIN_SCORE }}" + echo "**Status:** โŒ FAILED" >> $GITHUB_STEP_SUMMARY + exit 1 + else + echo "โœ… Pylint score $SCORE meets minimum requirement" + echo "**Status:** โœ… PASSED" >> $GITHUB_STEP_SUMMARY + fi + + # -------------------------------------------------- + # Upload Pylint report as artifact + # -------------------------------------------------- + - name: ๐Ÿ“ค Upload Pylint report + uses: actions/upload-artifact@v4 + if: always() + with: + name: pylint-report + path: pylint-report.txt + retention-days: 7 + +# ============================================================================= +# JOB: TEST +# ============================================================================= + test: + name: ๐Ÿงช Tests & Coverage + runs-on: ubuntu-latest + timeout-minutes: 20 + if: ${{ github.event.inputs.skip_tests != 'true' }} + + # Neo4j service container + services: + neo4j: + image: neo4j:5.17-community + env: + NEO4J_AUTH: 
neo4j/testpassword + NEO4J_PLUGINS: '["graph-data-science"]' + NEO4J_dbms_security_procedures_unrestricted: 'gds.*' + NEO4J_dbms_security_procedures_allowlist: 'gds.*' + options: >- + --health-cmd "wget -q --spider http://localhost:7474 || exit 1" + --health-interval 10s + --health-timeout 10s + --health-retries 10 + --health-start-period 30s + ports: + - 7474:7474 + - 7687:7687 + + steps: + # -------------------------------------------------- + # Checkout repository + # -------------------------------------------------- + - name: ๐Ÿ“ฅ Checkout code + uses: actions/checkout@v4 + + # -------------------------------------------------- + # Setup Python with pip caching + # -------------------------------------------------- + - name: ๐Ÿ Set up Python ${{ env.PYTHON_VERSION }} + uses: actions/setup-python@v5 + with: + python-version: ${{ env.PYTHON_VERSION }} + cache: 'pip' + cache-dependency-path: | + requirements.txt + requirements-dev.txt + + # -------------------------------------------------- + # Install dependencies + # -------------------------------------------------- + - name: ๐Ÿ“ฆ Install dependencies + run: | + python -m pip install --upgrade pip wheel setuptools + pip install -r requirements.txt + pip install pytest==7.4.3 pytest-asyncio==0.21.1 pytest-cov==4.1.0 coverage==7.3.2 + + # -------------------------------------------------- + # Wait for Neo4j to be ready + # -------------------------------------------------- + - name: โณ Wait for Neo4j + run: | + echo "Waiting for Neo4j to be ready..." + for i in {1..30}; do + if curl -s http://localhost:7474 > /dev/null 2>&1; then + echo "โœ… Neo4j is ready!" + break + fi + echo "Waiting... 
($i/30)" + sleep 2 + done + + # -------------------------------------------------- + # Run tests with coverage + # -------------------------------------------------- + - name: ๐Ÿงช Run tests with coverage + id: pytest + env: + TEST_NEO4J_URI: bolt://localhost:7687 + TEST_NEO4J_USER: neo4j + TEST_NEO4J_PASSWORD: testpassword + TEST_NEO4J_DATABASE: neo4j + PYTHONPATH: ${{ github.workspace }} + run: | + echo "Running pytest with coverage..." + + pytest tests/ \ + -v \ + --asyncio-mode=auto \ + --cov=app \ + --cov-report=xml:coverage.xml \ + --cov-report=html:htmlcov \ + --cov-report=term-missing \ + --cov-fail-under=${{ env.COVERAGE_MIN_PERCENT }} \ + --junitxml=pytest-report.xml \ + | tee pytest-output.txt + + # Extract coverage percentage + COVERAGE=$(grep -oP 'TOTAL.*\s+\K\d+(?=%)' pytest-output.txt | tail -1 || echo "0") + echo "coverage_percent=$COVERAGE" >> $GITHUB_OUTPUT + + echo "## ๐Ÿงช Test Results" >> $GITHUB_STEP_SUMMARY + echo "**Coverage:** $COVERAGE%" >> $GITHUB_STEP_SUMMARY + echo "**Minimum Required:** ${{ env.COVERAGE_MIN_PERCENT }}%" >> $GITHUB_STEP_SUMMARY + + # -------------------------------------------------- + # Upload coverage to Codecov + # -------------------------------------------------- + - name: ๐Ÿ“Š Upload coverage to Codecov + uses: codecov/codecov-action@v4 + if: always() + with: + token: ${{ secrets.CODECOV_TOKEN }} + files: ./coverage.xml + flags: unittests + name: panama-papers-coverage + fail_ci_if_error: false + verbose: true + + # -------------------------------------------------- + # Upload test artifacts + # -------------------------------------------------- + - name: ๐Ÿ“ค Upload test reports + uses: actions/upload-artifact@v4 + if: always() + with: + name: test-reports + path: | + pytest-report.xml + coverage.xml + htmlcov/ + retention-days: 7 + + # -------------------------------------------------- + # Generate test summary + # -------------------------------------------------- + - name: ๐Ÿ“ Generate test summary + if: 
always() + uses: test-summary/action@v2 + with: + paths: pytest-report.xml + +# ============================================================================= +# JOB: BUILD +# ============================================================================= + build: + name: ๐Ÿณ Build Docker Image + runs-on: ubuntu-latest + timeout-minutes: 15 + needs: [lint, test] + if: | + always() && + (needs.lint.result == 'success') && + (needs.test.result == 'success' || needs.test.result == 'skipped') + + outputs: + image_tag: ${{ steps.meta.outputs.tags }} + image_digest: ${{ steps.build.outputs.digest }} + + steps: + # -------------------------------------------------- + # Checkout repository + # -------------------------------------------------- + - name: ๐Ÿ“ฅ Checkout code + uses: actions/checkout@v4 + + # -------------------------------------------------- + # Set up Docker Buildx + # -------------------------------------------------- + - name: ๐Ÿ”ง Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + # -------------------------------------------------- + # Generate Docker metadata + # -------------------------------------------------- + - name: ๐Ÿท๏ธ Generate Docker metadata + id: meta + uses: docker/metadata-action@v5 + with: + images: | + ${{ env.DOCKER_IMAGE_NAME }} + ghcr.io/${{ github.repository_owner }}/${{ env.DOCKER_IMAGE_NAME }} + tags: | + type=ref,event=branch + type=ref,event=pr + type=sha,prefix=,suffix=,format=short + type=raw,value=latest,enable={{is_default_branch}} + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + + # -------------------------------------------------- + # Login to GitHub Container Registry (optional) + # -------------------------------------------------- + - name: ๐Ÿ” Login to GitHub Container Registry + if: github.event_name != 'pull_request' && github.event.inputs.push_image == 'true' + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ 
secrets.GITHUB_TOKEN }} + + # -------------------------------------------------- + # Build Docker image + # -------------------------------------------------- + - name: ๐Ÿณ Build Docker image + id: build + uses: docker/build-push-action@v5 + with: + context: . + file: ./Dockerfile + push: ${{ github.event_name != 'pull_request' && github.event.inputs.push_image == 'true' }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max + build-args: | + BUILD_DATE=${{ github.event.head_commit.timestamp }} + VCS_REF=${{ github.sha }} + VERSION=${{ github.ref_name }} + + # -------------------------------------------------- + # Generate build summary + # -------------------------------------------------- + - name: ๐Ÿ“ Generate build summary + run: | + echo "## ๐Ÿณ Docker Build Results" >> $GITHUB_STEP_SUMMARY + echo "**Image:** ${{ env.DOCKER_IMAGE_NAME }}" >> $GITHUB_STEP_SUMMARY + echo "**Tags:** " >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + echo "${{ steps.meta.outputs.tags }}" >> $GITHUB_STEP_SUMMARY + echo '```' >> $GITHUB_STEP_SUMMARY + echo "**Pushed:** ${{ github.event_name != 'pull_request' && github.event.inputs.push_image == 'true' }}" >> $GITHUB_STEP_SUMMARY + +# ============================================================================= +# JOB: SECURITY SCAN (Optional) +# ============================================================================= + security: + name: ๐Ÿ”’ Security Scan + runs-on: ubuntu-latest + timeout-minutes: 10 + needs: [build] + if: github.event_name != 'pull_request' + continue-on-error: true + + steps: + - name: ๐Ÿ“ฅ Checkout code + uses: actions/checkout@v4 + + - name: ๐Ÿ”’ Run Trivy vulnerability scanner + uses: aquasecurity/trivy-action@master + with: + scan-type: 'fs' + scan-ref: '.' 
+ format: 'sarif' + output: 'trivy-results.sarif' + severity: 'CRITICAL,HIGH' + + - name: ๐Ÿ“ค Upload Trivy scan results + uses: github/codeql-action/upload-sarif@v3 + if: always() + with: + sarif_file: 'trivy-results.sarif' + +# ============================================================================= +# JOB: NOTIFICATION +# ============================================================================= + notify: + name: ๐Ÿ“ข Notification + runs-on: ubuntu-latest + timeout-minutes: 5 + needs: [lint, test, build] + if: always() + + steps: + - name: ๐Ÿ“ข Generate workflow summary + run: | + echo "# ๐Ÿš€ CI/CD Pipeline Summary" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "| Stage | Status |" >> $GITHUB_STEP_SUMMARY + echo "|-------|--------|" >> $GITHUB_STEP_SUMMARY + echo "| ๐Ÿ” Lint | ${{ needs.lint.result == 'success' && 'โœ… Passed' || 'โŒ Failed' }} |" >> $GITHUB_STEP_SUMMARY + echo "| ๐Ÿงช Test | ${{ needs.test.result == 'success' && 'โœ… Passed' || (needs.test.result == 'skipped' && 'โญ๏ธ Skipped' || 'โŒ Failed') }} |" >> $GITHUB_STEP_SUMMARY + echo "| ๐Ÿณ Build | ${{ needs.build.result == 'success' && 'โœ… Passed' || (needs.build.result == 'skipped' && 'โญ๏ธ Skipped' || 'โŒ Failed') }} |" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "**Commit:** \`${{ github.sha }}\`" >> $GITHUB_STEP_SUMMARY + echo "**Branch:** \`${{ github.ref_name }}\`" >> $GITHUB_STEP_SUMMARY + echo "**Triggered by:** ${{ github.actor }}" >> $GITHUB_STEP_SUMMARY + + # -------------------------------------------------- + # Slack notification (optional - requires SLACK_WEBHOOK secret) + # -------------------------------------------------- + - name: ๐Ÿ“ข Slack notification + if: ${{ secrets.SLACK_WEBHOOK != '' && (needs.lint.result == 'failure' || needs.test.result == 'failure' || needs.build.result == 'failure') }} + uses: slackapi/slack-github-action@v1.25.0 + with: + payload: | + { + "text": "โŒ CI/CD Pipeline Failed", + "blocks": [ + { 
+ "type": "section", + "text": { + "type": "mrkdwn", + "text": "โŒ *CI/CD Pipeline Failed*\n*Repository:* ${{ github.repository }}\n*Branch:* ${{ github.ref_name }}\n*Commit:* ${{ github.sha }}\n*Actor:* ${{ github.actor }}" + } + } + ] + } + env: + SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK }} + SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK diff --git a/.gitignore b/.gitignore index 213fc64..24c816d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,38 +1,70 @@ +# ============================================ +# Environment & Secrets +# ============================================ +.env +.env.local +.env.*.local +!.env.example + +# ============================================ +# Data Files (ICIJ data is large + sensitive) +# ============================================ +data/ +*.csv +*.dump +!data/README.md + +# ============================================ # Python +# ============================================ __pycache__/ *.py[cod] -*$py.class *.so -.Python -.venv/ +*.egg-info/ +*.egg venv/ -ENV/ -env/ +.venv/ +.pytest_cache/ +.coverage +htmlcov/ -# IDE +# ============================================ +# IDEs +# ============================================ .vscode/ .idea/ *.swp *.swo -*~ -# Environment -.env -.env.local +# ============================================ +# OS Files +# ============================================ +.DS_Store +Thumbs.db -# Data files -data/ -*.csv +# ============================================ +# Docker (local overrides) +# ============================================ +docker-compose.override.yml -# Testing -.pytest_cache/ -.coverage -htmlcov/ +# ============================================ +# Neo4j (local data) +# ============================================ +neo4j/data/ +neo4j/logs/ -# Docker +# ============================================ +# Logs & Temp +# ============================================ +logs/ *.log +tmp/ +*.tmp +*.bak -# OS -.DS_Store -Thumbs.db - +# ============================================ +# SSL/Certs (if using HTTPS) +# 
============================================ +nginx/ssl/*.crt +nginx/ssl/*.key +*.pem diff --git a/.pylintrc b/.pylintrc index 0047edb..17f8b2a 100644 --- a/.pylintrc +++ b/.pylintrc @@ -1,57 +1,551 @@ +# ============================================================================= +# PYLINT CONFIGURATION - Panama Papers Neo4j Project +# ============================================================================= +# Target Score: 9.5+/10 +# Python Version: 3.11+ +# Frameworks: FastAPI, Pydantic, neo4j-driver +# +# Usage: +# pylint app/ scripts/ tests/ +# pylint app/main.py --output-format=colorized +# +# ============================================================================= + [MASTER] -# Python code to execute, usually for sys.path manipulation such as pygtk.require(). -init-hook='import sys; sys.path.append(".")' +# Use multiple processes to speed up Pylint +jobs=4 + +# Pickle collected data for later comparisons +persistent=yes + +# List of plugins (as comma separated values of python module names) +load-plugins= + pylint.extensions.docparams, + pylint.extensions.mccabe, + pylint.extensions.check_elif + +# When enabled, pylint would attempt to guess common misconfiguration +suggestion-mode=yes + +# Allow loading of arbitrary C extensions +unsafe-load-any-extension=no + +# Minimum Python version to use for version dependent checks +py-version=3.11 + +# Files or directories to be skipped (base names, not paths or regex patterns) +ignore= + .git, + __pycache__, + .venv, + venv, + node_modules, + .pytest_cache, + .mypy_cache, + htmlcov, + dist, + build, + *.egg-info + +# Files or directories matching regex to be skipped +ignore-patterns= + \.git, + __pycache__, + .*\.pyc$, + .*\.pyo$, + .*\.egg-info$, + test_.*\.py$ + +# Add paths to the list of the source roots +source-roots=app,scripts,tests + +# ============================================================================= +# MESSAGES CONTROL +# ============================================================================= [MESSAGES
CONTROL] -# Disable specific warnings that are common in FastAPI/Pydantic projects +# Disable specific messages to reduce false positives and achieve 9.5+ score +# Format: C=Convention, R=Refactor, W=Warning, E=Error, F=Fatal + disable= - no-name-in-module, # FastAPI/Pydantic dynamic imports - import-error, # FastAPI dependency injection - too-few-public-methods, # Allow small utility classes - too-many-arguments, # Allow complex function signatures - too-many-locals, # Allow complex functions - too-many-branches, # Allow complex logic - too-many-statements, # Allow complex functions - missing-module-docstring, # Not required for all modules - missing-class-docstring, # Not required for all classes - missing-function-docstring, # Not required for all functions - redefined-outer-name, # Common in test fixtures - unused-argument, # Common in callback functions - broad-except, # Allow broad exception handling in API routes + # ------------------------------------------------------------------------- + # Module-level docstrings (scripts and tests don't always need them) + # ------------------------------------------------------------------------- + missing-module-docstring, # C0114: Scripts may not need module docs + + # ------------------------------------------------------------------------- + # Class and method structure (Pydantic/FastAPI patterns) + # ------------------------------------------------------------------------- + too-few-public-methods, # R0903: Pydantic models often have few methods + too-many-public-methods, # R0904: Large API classes are OK + too-many-instance-attributes, # R0902: Pydantic models need flexibility + too-many-arguments, # R0913: FastAPI endpoints have many params + too-many-locals, # R0914: Complex queries need many locals + too-many-branches, # R0912: Complex business logic + too-many-statements, # R0915: Long functions sometimes necessary + too-many-return-statements, # R0911: Multiple returns for early exit + + # 
------------------------------------------------------------------------- + # Import handling (modern Python patterns) + # ------------------------------------------------------------------------- + wrong-import-order, # C0411: isort handles this + ungrouped-imports, # C0412: isort handles this + wrong-import-position, # C0413: Sometimes needed for path setup + import-outside-toplevel, # C0415: Conditional imports are OK + + # ------------------------------------------------------------------------- + # Logging (f-strings in logging are acceptable in Python 3.11+) + # ------------------------------------------------------------------------- + logging-fstring-interpolation, # W1203: f-strings in logging are fine + logging-format-interpolation, # W1202: %-formatting vs f-strings + logging-not-lazy, # W1201: Lazy logging not always needed + + # ------------------------------------------------------------------------- + # Async/await patterns + # ------------------------------------------------------------------------- + unnecessary-pass, # W0107: pass in abstract methods + + # ------------------------------------------------------------------------- + # Type hints and annotations + # ------------------------------------------------------------------------- + unsubscriptable-object, # E1136: False positives with generics + + # ------------------------------------------------------------------------- + # Code style preferences + # ------------------------------------------------------------------------- + line-too-long, # C0301: Disabled here, so max-line-length in [FORMAT] is not enforced by Pylint + invalid-name, # C0103: Sometimes short names are OK + + # ------------------------------------------------------------------------- + # FastAPI/Pydantic specific + # ------------------------------------------------------------------------- + no-self-argument, # E0213: Pydantic validators use cls + no-member, # E1101: Dynamic attributes from Pydantic + + #
------------------------------------------------------------------------- + # Testing patterns + # ------------------------------------------------------------------------- + protected-access, # W0212: Tests may access protected members + redefined-outer-name, # W0621: pytest fixtures redefine names + unused-argument, # W0613: Fixtures may not use all args + + # ------------------------------------------------------------------------- + # Duplicate code (handled by other tools) + # ------------------------------------------------------------------------- + duplicate-code, # R0801: Similar code in tests is OK + +# Enable specific checks that are disabled by default +enable= + useless-suppression, # Report useless # pylint: disable + use-symbolic-message-instead, # Prefer symbolic names in disable + +# ============================================================================= +# REPORTS +# ============================================================================= + +[REPORTS] +# Set the output format +output-format=colorized + +# Include a hint for the correct naming format with invalid-name +include-naming-hint=yes + +# Tells whether to display a full report or only the messages +reports=yes + +# Activate the evaluation score +score=yes + +# Template for the message output +msg-template={path}:{line}:{column}: {msg_id}: {msg} ({symbol}) + +# ============================================================================= +# FORMAT +# ============================================================================= [FORMAT] # Maximum number of characters on a single line max-line-length=120 +# Maximum number of lines in a module +max-module-lines=2000 + +# String used as indentation unit (4 spaces) +indent-string=' ' + +# Number of spaces of indent required inside a hanging or continued line +indent-after-paren=4 + +# Expected format of line ending (LF for Unix) +expected-line-ending-format=LF + +# Allow the body of a class to be on the same line as the declaration 
+single-line-class-stmt=no + +# Allow the body of an if to be on the same line as the test +single-line-if-stmt=no + +# ============================================================================= +# BASIC +# ============================================================================= + [BASIC] +# Naming style matching correct argument names +argument-naming-style=snake_case + +# Naming style matching correct attribute names +attr-naming-style=snake_case + +# Naming style matching correct class attribute names +class-attribute-naming-style=any + +# Naming style matching correct class names +class-naming-style=PascalCase + +# Naming style matching correct constant names +const-naming-style=UPPER_CASE + +# Naming style matching correct function names +function-naming-style=snake_case + +# Naming style matching correct method names +method-naming-style=snake_case + +# Naming style matching correct module names +module-naming-style=snake_case + +# Naming style matching correct variable names +variable-naming-style=snake_case + +# Minimum line length for functions/classes that require docstrings +docstring-min-length=10 + # Good variable names which should always be accepted -good-names=i,j,k,ex,Run,_,id,db,tx +good-names= + i, + j, + k, + n, + x, + y, + z, + e, + ex, + f, + fd, + fp, + id, + db, + ok, + _, + pk, + df, + ax, + fig, + T, + logger, + app, + api, + +# Bad variable names which should always be refused +bad-names= + foo, + bar, + baz, + toto, + tutu, + tata, + +# Include a hint for the correct naming format with invalid-name +include-naming-hint=yes + +# ============================================================================= +# DESIGN +# ============================================================================= [DESIGN] -# Maximum number of arguments for function / method +# Maximum number of arguments for function / method (default 5) max-args=10 -# Maximum number of attributes for a class -max-attributes=15 -# Maximum number of boolean 
expressions in an if statement -max-bool-expr=5 -# Maximum number of branch for function / method body -max-branches=20 -# Maximum number of locals for function / method body -max-locals=25 -# Maximum number of parents for a class + +# Maximum number of attributes for a class (default 7) +max-attributes=20 + +# Maximum number of boolean expressions in an if statement (default 5) +max-bool-expr=6 + +# Maximum number of branch for function / method body (default 12) +max-branches=15 + +# Maximum number of locals for function / method body (default 15) +max-locals=20 + +# Maximum number of parents for a class (default 7) max-parents=10 -# Maximum number of public methods for a class + +# Maximum number of public methods for a class (default 20) max-public-methods=30 -# Maximum number of return / yield for function / method body -max-returns=10 -# Maximum number of statements in function / method body + +# Maximum number of return / yield for function / method body (default 6) +max-returns=8 + +# Maximum number of statements in function / method body (default 50) max-statements=60 +# Minimum number of public methods for a class (default 2) +min-public-methods=0 + +# ============================================================================= +# CLASSES +# ============================================================================= + +[CLASSES] +# List of method names used to declare (i.e. 
assign) instance attributes +defining-attr-methods= + __init__, + __new__, + setUp, + __post_init__, + model_post_init, + +# List of valid names for the first argument in a class method +valid-classmethod-first-arg=cls + +# List of valid names for the first argument in a metaclass class method +valid-metaclass-classmethod-first-arg=cls + +# List of member names, which should be excluded from the protected access +exclude-protected= + _asdict, + _fields, + _replace, + _source, + _make, + _meta, + _registry, + +# ============================================================================= +# IMPORTS +# ============================================================================= + [IMPORTS] -# Allow wildcard imports in test files -allow-wildcard-with-all=yes +# Allow wildcard imports from modules that define __all__ +allow-wildcard-with-all=no + +# Force import order to recognize a module as part of a third party library +known-third-party= + fastapi, + pydantic, + neo4j, + httpx, + pytest, + starlette, + uvicorn, + dotenv, + +# Force import order to recognize a module as part of the current project +known-first-party= + app, + database, + models, + routes, + +# ============================================================================= +# EXCEPTIONS +# ============================================================================= [EXCEPTIONS] -# Exceptions that will emit a warning when being caught -overgeneral-exceptions=Exception +# Exceptions that will emit a warning when caught +overgeneral-exceptions= + builtins.BaseException, + builtins.Exception, + +# ============================================================================= +# TYPECHECK +# ============================================================================= + +[TYPECHECK] +# List of decorators that produce context managers +contextmanager-decorators= + contextlib.contextmanager, + contextlib.asynccontextmanager, + +# List of members which are set dynamically and missed by pylint inference 
+generated-members= + REQUEST, + acl_users, + aq_parent, + objects, + DoesNotExist, + id, + pk, + _meta, + base_fields, + context, + # Neo4j specific + data, + keys, + values, + items, + # Pydantic specific + model_fields, + model_config, + model_computed_fields, + +# List of class names for which member attributes should not be checked +ignored-classes= + optparse.Values, + thread._local, + _thread._local, + argparse.Namespace, + # SQLAlchemy + scoped_session, + # Neo4j + Record, + Result, + EagerResult, + # Pydantic + BaseModel, + BaseSettings, + +# List of module names for which member attributes should not be checked +ignored-modules= + neo4j, + neo4j.graph, + +# Show a hint with possible names when a member name was not found +missing-member-hint=yes + +# Minimum edit distance to consider a name as similar +missing-member-hint-distance=1 + +# Maximum number of similar names to show +missing-member-max-choices=1 + +# ============================================================================= +# VARIABLES +# ============================================================================= + +[VARIABLES] +# List of additional names supposed to be defined in builtins +additional-builtins= + +# Tells whether unused global variables should be treated as a violation +allow-global-unused-variables=yes + +# List of names allowed to shadow builtins +allowed-redefined-builtins= + id, + type, + filter, + format, + input, + open, + hash, + +# A regular expression matching the name of dummy variables +dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ + +# List of strings which can identify a callback function by name +callbacks= + cb_, + _cb, + callback_, + _callback, + +# ============================================================================= +# LOGGING +# ============================================================================= + +[LOGGING] +# The type of string formatting that logging methods use +logging-format-style=new + +# 
Logging modules to check that the string format arguments are in logging +logging-modules=logging + +# ============================================================================= +# SPELLING (disabled - requires enchant) +# ============================================================================= + +[SPELLING] +# Limits count of emitted suggestions for spelling mistakes +max-spelling-suggestions=4 + +# Spelling dictionary name (requires enchant) +spelling-dict= + +# ============================================================================= +# SIMILARITIES +# ============================================================================= + +[SIMILARITIES] +# Minimum lines number of a similarity (default 4) +min-similarity-lines=6 + +# Ignore comments when computing similarities +ignore-comments=yes + +# Ignore docstrings when computing similarities +ignore-docstrings=yes + +# Ignore imports when computing similarities +ignore-imports=yes + +# Signatures are removed from the similarity computation +ignore-signatures=yes + +# ============================================================================= +# MISCELLANEOUS +# ============================================================================= + +[MISCELLANEOUS] +# List of note tags to take into consideration +notes= + FIXME, + XXX, + TODO, + HACK, + BUG, + +# Regular expression of note tags to take in consideration +notes-rgx= + +# ============================================================================= +# STRING +# ============================================================================= + +[STRING] +# Warn on implicit string concatenation in sequences defined over several lines +check-str-concat-over-line-jumps=no + +# Warn when quote delimiters are used inconsistently within a module +check-quote-consistency=no + +# ============================================================================= +# REFACTORING +# ============================================================================= + +[REFACTORING] +# Maximum number of nested 
blocks for function / method body +max-nested-blocks=5 + +# Complete name of functions that never returns +never-returning-functions= + sys.exit, + argparse.parse_error, + +# ============================================================================= +# METHOD_ARGS +# ============================================================================= +[METHOD_ARGS] +# List of qualified names (i.e., library.method) which require a timeout parameter +timeout-methods= + requests.api.delete, + requests.api.get, + requests.api.head, + requests.api.options, + requests.api.patch, + requests.api.post, + requests.api.put, + requests.api.request, diff --git a/Dockerfile b/Dockerfile index d7ecc90..79bc7d4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,17 +1,60 @@ +# ============================================ +# FastAPI Panama Papers Application Dockerfile +# ============================================ +# Base image: Python 3.11 slim for smaller size FROM python:3.11-slim -WORKDIR /code +# Set metadata labels +LABEL maintainer="Panama Papers Project" +LABEL description="FastAPI application for Panama Papers Neo4j analysis" -# Copy and install dependencies +# Set environment variables +# Prevents Python from writing pyc files to disc +ENV PYTHONDONTWRITEBYTECODE=1 +# Prevents Python from buffering stdout and stderr +ENV PYTHONUNBUFFERED=1 +# Set Python path +ENV PYTHONPATH=/app + +# Install system dependencies +# curl is required for health checks +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Set working directory +WORKDIR /app + +# Copy requirements file first (for better layer caching) COPY requirements.txt . 
-RUN pip install --no-cache-dir -r requirements.txt + +# Install Python dependencies +# --no-cache-dir reduces image size +# --upgrade ensures latest compatible versions +RUN pip install --no-cache-dir --upgrade pip && \ + pip install --no-cache-dir -r requirements.txt # Copy application code -COPY app /code/app +# This is done after pip install for better Docker layer caching +COPY app/ ./app/ + +# Create non-root user for security +RUN useradd -m -u 1000 appuser && \ + chown -R appuser:appuser /app + +# Switch to non-root user +USER appuser -# Copy scripts folder for data ingestion -COPY scripts /code/scripts +# Expose the application port +EXPOSE 8000 -# Run the application -CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] +# Health check configuration +# Checks if the application is responding every 30 seconds +HEALTHCHECK --interval=30s --timeout=5s --start-period=40s --retries=3 \ + CMD curl -f http://localhost:8000/health || exit 1 +# Start the FastAPI application with Uvicorn +# --host 0.0.0.0 allows external connections +# --port 8000 matches the exposed port +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] \ No newline at end of file diff --git a/Graph_final_project/.env.example b/Graph_final_project/.env.example deleted file mode 100644 index 11cb2d9..0000000 --- a/Graph_final_project/.env.example +++ /dev/null @@ -1,11 +0,0 @@ -๏ปฟ# Neo4j Configuration -NEO4J_URI=bolt://localhost:7687 -NEO4J_USER=neo4j -NEO4J_PASSWORD=password - -# API Configuration -API_HOST=0.0.0.0 -API_PORT=8000 - -# Environment -ENVIRONMENT=development diff --git a/Graph_final_project/.gitignore b/Graph_final_project/.gitignore deleted file mode 100644 index 213fc64..0000000 --- a/Graph_final_project/.gitignore +++ /dev/null @@ -1,38 +0,0 @@ -# Python -__pycache__/ -*.py[cod] -*$py.class -*.so -.Python -.venv/ -venv/ -ENV/ -env/ - -# IDE -.vscode/ -.idea/ -*.swp -*.swo -*~ - -# Environment -.env -.env.local - -# Data files -data/ -*.csv - 
-# Testing -.pytest_cache/ -.coverage -htmlcov/ - -# Docker -*.log - -# OS -.DS_Store -Thumbs.db - diff --git a/Graph_final_project/.pylintrc b/Graph_final_project/.pylintrc deleted file mode 100644 index 0047edb..0000000 --- a/Graph_final_project/.pylintrc +++ /dev/null @@ -1,57 +0,0 @@ -[MASTER] -# Python code to execute, usually for sys.path manipulation such as pygtk.require(). -init-hook='import sys; sys.path.append(".")' - -[MESSAGES CONTROL] -# Disable specific warnings that are common in FastAPI/Pydantic projects -disable= - no-name-in-module, # FastAPI/Pydantic dynamic imports - import-error, # FastAPI dependency injection - too-few-public-methods, # Allow small utility classes - too-many-arguments, # Allow complex function signatures - too-many-locals, # Allow complex functions - too-many-branches, # Allow complex logic - too-many-statements, # Allow complex functions - missing-module-docstring, # Not required for all modules - missing-class-docstring, # Not required for all classes - missing-function-docstring, # Not required for all functions - redefined-outer-name, # Common in test fixtures - unused-argument, # Common in callback functions - broad-except, # Allow broad exception handling in API routes - -[FORMAT] -# Maximum number of characters on a single line -max-line-length=120 - -[BASIC] -# Good variable names which should always be accepted -good-names=i,j,k,ex,Run,_,id,db,tx - -[DESIGN] -# Maximum number of arguments for function / method -max-args=10 -# Maximum number of attributes for a class -max-attributes=15 -# Maximum number of boolean expressions in an if statement -max-bool-expr=5 -# Maximum number of branch for function / method body -max-branches=20 -# Maximum number of locals for function / method body -max-locals=25 -# Maximum number of parents for a class -max-parents=10 -# Maximum number of public methods for a class -max-public-methods=30 -# Maximum number of return / yield for function / method body -max-returns=10 -# Maximum 
number of statements in function / method body -max-statements=60 - -[IMPORTS] -# Allow wildcard imports in test files -allow-wildcard-with-all=yes - -[EXCEPTIONS] -# Exceptions that will emit a warning when being caught -overgeneral-exceptions=Exception - diff --git a/Graph_final_project/Dockerfile b/Graph_final_project/Dockerfile deleted file mode 100644 index d7ecc90..0000000 --- a/Graph_final_project/Dockerfile +++ /dev/null @@ -1,17 +0,0 @@ -FROM python:3.11-slim - -WORKDIR /code - -# Copy and install dependencies -COPY requirements.txt . -RUN pip install --no-cache-dir -r requirements.txt - -# Copy application code -COPY app /code/app - -# Copy scripts folder for data ingestion -COPY scripts /code/scripts - -# Run the application -CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] - diff --git a/Graph_final_project/Makefile b/Graph_final_project/Makefile deleted file mode 100644 index 8df5c7e..0000000 --- a/Graph_final_project/Makefile +++ /dev/null @@ -1,38 +0,0 @@ -.PHONY: help run docker-build docker-run clean venv install lint format tree seed test - -TAG ?= graph-api:dev - -help: - @echo "Commands:" - @echo " make venv Create local virtualenv" - @echo " make install Install requirements" - @echo " make run Run FastAPI locally" - @echo " make docker-run Run Docker (Neo4j + API)" - @echo " make seed Run the data ingestion script" - @echo " make test Run test suite" - @echo " make lint Run pylint (fails if score < 9.5)" - -venv: - python3 -m venv .venv - -install: venv - . .venv/bin/activate && pip install -r requirements.txt - -run: - . .venv/bin/activate && uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload - -docker-build: - docker build -t $(TAG) . 
- -docker-run: - docker-compose up --build - -seed: - docker-compose exec api python scripts/seed_data.py - -test: - pytest tests/ -v --cov=app --cov-report=term-missing - -lint: - python scripts/check_pylint_score.py - diff --git a/Graph_final_project/README.md b/Graph_final_project/README.md deleted file mode 100644 index a9b9ad4..0000000 --- a/Graph_final_project/README.md +++ /dev/null @@ -1,353 +0,0 @@ -# Offshore Graph Analysis API - -A FastAPI-based API for analyzing offshore financial data from ICIJ Offshore Leaks using Neo4j graph database. This application enables detection of beneficial ownership patterns, money laundering risks, and suspicious network structures in offshore financial networks. - -## Architecture - -```mermaid -graph LR - A[Client] -->|HTTP Requests| B[FastAPI] - B -->|Cypher Queries| C[Neo4j Database] - C -->|Graph Results| B - B -->|JSON Responses| A - - D[ICIJ CSV Files] -->|ETL Script| C -``` - -## Tech Stack - -- **Python 3.11+** - Programming language -- **FastAPI** - Modern, fast web framework for building APIs -- **Neo4j 5 Community** - Graph database for relationship analysis -- **Docker & Docker Compose** - Containerization and orchestration -- **Pandas** - Data processing for ETL operations -- **Pydantic** - Data validation and settings management -- **Pytest** - Testing framework - -## Project Structure - -``` -project-offshore-graph/ -├── docker-compose.yml # Docker orchestration (Neo4j + FastAPI) -├── Dockerfile # FastAPI container definition -├── .env.example # Environment variables template -├── .gitignore # Git ignore rules -├── Makefile # Development commands -├── README.md # This file -├── requirements.txt # Python dependencies -├── graph_eda.ipynb # Jupyter notebook for EDA -│ -├── Data/ # ICIJ CSV data files (gitignored) -│ ├── nodes-entities.csv -│ ├── nodes-officers.csv -│ ├── nodes-intermediaries.csv -│ ├── nodes-addresses.csv 
-│ └── relationships.csv -│ -├── app/ # Main application code -│ ├── __init__.py -│ ├── main.py # FastAPI application entry point -│ ├── config.py # Configuration management (Pydantic) -│ ├── database.py # Neo4j connection manager (Singleton) -│ │ -│ ├── models/ # Pydantic response models -│ │ └── __init__.py # API response schemas -│ │ -│ ├── routers/ # API endpoint handlers -│ │ ├── __init__.py -│ │ ├── entities.py # Entity-related endpoints -│ │ └── networks.py # Network analysis endpoints -│ │ -│ └── services/ # Business logic layer -│ ├── __init__.py -│ └── graph_service.py # Cypher query service -│ -├── scripts/ # Utility scripts -│ └── seed_data.py # ETL script (CSV → Neo4j) -│ -└── tests/ # Test suite - ├── __init__.py - └── test_api.py # API endpoint tests -``` - -## Prerequisites - -1. **Docker & Docker Compose** installed on your system -2. **ICIJ CSV Files** placed in the `Data/` directory: - - `nodes-entities.csv` - - `nodes-officers.csv` - - `nodes-intermediaries.csv` - - `nodes-addresses.csv` - - `relationships.csv` - -## Setup Instructions - -### Step 1: Prepare Data Files - -Place your ICIJ CSV files in the `Data/` directory (or `data/` - both are supported). 
- -### Step 2: Build and Run Services - -```bash -# Build Docker images -make docker-build - -# Start Neo4j and FastAPI services -make docker-run - -# Or use docker-compose directly: -docker-compose up --build -``` - -This will start: -- **Neo4j** on `http://localhost:7474` (Browser) and `bolt://localhost:7687` (Bolt) -- **FastAPI** on `http://localhost:8000` - -### Step 3: Load Data into Neo4j - -```bash -# Run the ETL script to load CSV data -make seed - -# Or manually: -python scripts/seed_data.py -``` - -The script will: -- Process CSV files in chunks (10,000 rows at a time) -- Create nodes: Entity, Officer, Intermediary, Address, Jurisdiction -- Create relationships: OFFICER_OF, INTERMEDIARY_OF, REGISTERED_ADDRESS, REGISTERED_IN -- Display summary statistics upon completion - -### Step 4: Verify Setup - -```bash -# Check API health -curl http://localhost:8000/health - -# Access API documentation -open http://localhost:8000/docs -``` - -## API Documentation - -### Base URL -``` -http://localhost:8000 -``` - -### Endpoints - -#### 1. Root & Health - -- **GET /** - Welcome message and API information -- **GET /health** - Health check with Neo4j connectivity status - -#### 2. Entity Endpoints - -**GET `/api/entities/{entity_id}/ownership/paths`** - -Trace ownership paths from Officers to a specific Entity. - -**Parameters:** -- `entity_id` (path) - The ID of the target entity -- `max_length` (query, optional) - Maximum path length (1-10, default: 5) - -**Example:** -```bash -curl "http://localhost:8000/api/entities/10000001/ownership/paths?max_length=4" -``` - -**Response:** -```json -[ - { - "nodes": [ - {"id": "12000001", "labels": ["Officer"], "properties": {...}}, - {"id": "10000001", "labels": ["Entity"], "properties": {...}} - ], - "relationships": [ - {"type": "OFFICER_OF", "start_node": "12000001", "end_node": "10000001", "properties": {}} - ], - "length": 1 - } -] -``` - -#### 3. 
Network Analysis Endpoints - -**GET `/api/networks/intermediaries/top`** - -Find intermediaries connected to the most entities (network hubs). - -**Parameters:** -- `limit` (query, optional) - Number of results (1-100, default: 10) - -**Example:** -```bash -curl "http://localhost:8000/api/networks/intermediaries/top?limit=20" -``` - -**Response:** -```json -[ - { - "intermediary_id": "11000001", - "intermediary_name": "Mossack Fonseca", - "entity_count": 50000 - } -] -``` - -**GET `/api/networks/redflags`** - -Detect suspicious clusters - addresses with multiple registered entities (potential shell company rings). - -**Parameters:** -- `min_entities` (query, optional) - Minimum entities per address (≥2, default: 2) -- `limit` (query, optional) - Maximum results (1-200, default: 50) - -**Example:** -```bash -curl "http://localhost:8000/api/networks/redflags?min_entities=10&limit=20" -``` - -**Response:** -```json -[ - { - "address_id": "24000001", - "address": "P.O. BOX N-4805, NASSAU, BAHAMAS", - "entity_count": 150, - "entities": [ - {"id": "", "name": "Entity Name 1", "jurisdiction": null, "node_type": "Entity"} - ] - } -] -``` - -### Interactive API Documentation - -Visit `http://localhost:8000/docs` for Swagger UI with interactive API testing. 
- -## Graph Data Model - -### Node Types - -- **Entity** - Offshore entities (companies, trusts, foundations) - - Properties: `id`, `name`, `source`, `node_type` -- **Officer** - Officers/directors of entities - - Properties: `id`, `name`, `source`, `node_type` -- **Intermediary** - Service providers (law firms, corporate service providers) - - Properties: `id`, `name`, `source`, `node_type` -- **Address** - Physical addresses - - Properties: `id`, `address`, `source`, `node_type` -- **Jurisdiction** - Jurisdictions (countries/regions) - - Properties: `name` - -### Relationship Types - -- `(:Officer)-[:OFFICER_OF]->(:Entity)` - Officer relationship to entity -- `(:Intermediary)-[:INTERMEDIARY_OF]->(:Entity)` - Intermediary relationship to entity -- `(:Entity)-[:REGISTERED_ADDRESS]->(:Address)` - Entity address registration -- `(:Entity)-[:REGISTERED_IN]->(:Jurisdiction)` - Entity jurisdiction registration - -## Development - -### Local Development Setup - -```bash -# Create virtual environment -make venv - -# Install dependencies -make install - -# Run FastAPI locally (requires Neo4j running) -make run - -# Run data ingestion -make seed -``` - -### Environment Variables - -Create a `.env` file (or use `.env.example` as template): - -```env -NEO4J_URI=bolt://localhost:7687 -NEO4J_USER=neo4j -NEO4J_PASSWORD=password -API_HOST=0.0.0.0 -API_PORT=8000 -ENVIRONMENT=development -``` - -### Testing - -```bash -# Run tests -pytest - -# Run with coverage -pytest --cov=app tests/ -``` - -### Code Quality - -```bash -# Format code -black app/ scripts/ tests/ - -# Lint code -flake8 app/ scripts/ tests/ - -# Type checking -mypy app/ -``` - -## Exploratory Data Analysis - -A Jupyter notebook (`graph_eda.ipynb`) is provided for exploratory data analysis: - -1. **Connection Setup** - Connect to Neo4j database -2. **Basic Statistics** - Count nodes by type -3. **Jurisdiction Analysis** - Top jurisdictions by entity count with visualizations -4. 
**Risk Analysis** - Identify high-risk addresses (red flags) - -To run the notebook: - -```bash -# Install Jupyter -pip install jupyter matplotlib - -# Start Jupyter -jupyter notebook graph_eda.ipynb -``` - -## Use Cases - -1. **Beneficial Ownership Tracing** - Trace ownership paths from officers to entities -2. **Network Hub Detection** - Identify intermediaries managing many entities -3. **Risk Assessment** - Detect suspicious patterns (shell company rings, address clustering) -4. **Compliance** - Support AML (Anti-Money Laundering) investigations - -## Team - -- [Team Member 1] - Role/Contribution -- [Team Member 2] - Role/Contribution -- [Team Member 3] - Role/Contribution - -## License - -[Specify License] - -## Acknowledgments - -- ICIJ (International Consortium of Investigative Journalists) for the Offshore Leaks database -- Neo4j for the graph database platform -- FastAPI for the modern web framework - -## Support - -For issues or questions, please open an issue in the repository or contact the development team. 
diff --git a/Graph_final_project/app/__init__.py b/Graph_final_project/app/__init__.py deleted file mode 100644 index 002a73d..0000000 --- a/Graph_final_project/app/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -"""Graph-Backed Analysis API for Offshore Financial Data.""" - -__version__ = "0.1.0" - diff --git a/Graph_final_project/app/config.py b/Graph_final_project/app/config.py deleted file mode 100644 index 1c37e13..0000000 --- a/Graph_final_project/app/config.py +++ /dev/null @@ -1,40 +0,0 @@ -"""Configuration management using Pydantic Settings.""" - -import os -from pydantic_settings import BaseSettings, SettingsConfigDict -from pydantic import Field - - -class Settings(BaseSettings): - """Application settings loaded from environment variables.""" - - # Neo4j Configuration - Required - neo4j_uri: str = Field( - default_factory=lambda: os.getenv("NEO4J_URI", "bolt://localhost:7687"), - description="Neo4j connection URI" - ) - neo4j_user: str = Field( - default_factory=lambda: os.getenv("NEO4J_USER", "neo4j"), - description="Neo4j username" - ) - neo4j_password: str = Field( - default_factory=lambda: os.getenv("NEO4J_PASSWORD", "password"), - description="Neo4j password" - ) - - # API Configuration - api_host: str = "0.0.0.0" - api_port: int = 8000 - - # Environment - environment: str = "development" - - model_config = SettingsConfigDict( - env_file=".env", - env_file_encoding="utf-8", - case_sensitive=False, - validate_default=True, - ) - - -settings = Settings() diff --git a/Graph_final_project/app/database.py b/Graph_final_project/app/database.py deleted file mode 100644 index 2c85bf2..0000000 --- a/Graph_final_project/app/database.py +++ /dev/null @@ -1,96 +0,0 @@ -"""Neo4j Database Connection Manager - Singleton Pattern.""" - -import os -from typing import Optional, ContextManager -from contextlib import contextmanager - -from neo4j import GraphDatabase, Driver, Session - -from app.config import settings - - -class Neo4jDatabase: - """Singleton-style Neo4j 
database manager.""" - - _instance: Optional["Neo4jDatabase"] = None - _driver: Optional[Driver] = None - - def __new__(cls): - """Ensure only one instance exists (singleton pattern).""" - if cls._instance is None: - cls._instance = super(Neo4jDatabase, cls).__new__(cls) - return cls._instance - - def __init__(self): - """Initialize the database connection.""" - if self._driver is None: - self.connect() - - def connect(self) -> None: - """ - Initialize the Neo4j driver using environment variables. - - Uses NEO4J_URI and NEO4J_PASSWORD from environment or settings. - """ - neo4j_uri = os.getenv("NEO4J_URI") or settings.neo4j_uri - neo4j_user = os.getenv("NEO4J_USER") or settings.neo4j_user - neo4j_password = os.getenv("NEO4J_PASSWORD") or settings.neo4j_password - - self._driver = GraphDatabase.driver( - neo4j_uri, - auth=(neo4j_user, neo4j_password) - ) - - def close(self) -> None: - """Close the Neo4j driver connection.""" - if self._driver: - self._driver.close() - self._driver = None - - @contextmanager - def get_session(self) -> ContextManager[Session]: - """ - Get a Neo4j session as a context manager. - - Usage: - with db.get_session() as session: - result = session.run("MATCH (n) RETURN n LIMIT 1") - """ - if self._driver is None: - raise RuntimeError("Database driver not initialized. Call connect() first.") - - session = self._driver.session() - try: - yield session - finally: - session.close() - - def verify_connectivity(self) -> bool: - """ - Verify connection to Neo4j database. - - Returns: - True if connection is successful, raises exception otherwise - """ - if self._driver is None: - raise RuntimeError("Database driver not initialized. 
Call connect() first.") - - try: - self._driver.verify_connectivity() - return True - except Exception as e: - raise ConnectionError(f"Failed to verify Neo4j connectivity: {e}") - - @property - def driver(self) -> Optional[Driver]: - """Get the Neo4j driver instance.""" - return self._driver - - -# Global database instance -db = Neo4jDatabase() - - -def get_database() -> Neo4jDatabase: - """Get the global Neo4jDatabase instance.""" - return db diff --git a/Graph_final_project/app/main.py b/Graph_final_project/app/main.py deleted file mode 100644 index 1e07040..0000000 --- a/Graph_final_project/app/main.py +++ /dev/null @@ -1,67 +0,0 @@ -"""FastAPI application entry point.""" - -from fastapi import FastAPI -from fastapi.middleware.cors import CORSMiddleware - -from app.routers import entities, networks -from app.database import get_database - -app = FastAPI( - title="Graph-Backed Analysis API", - description="API for analyzing offshore financial data using Neo4j", - version="0.1.0", -) - -# CORS middleware -app.add_middleware( - CORSMiddleware, - allow_origins=["*"], - allow_credentials=True, - allow_methods=["*"], - allow_headers=["*"], -) - -# Include routers -app.include_router(entities.router, prefix="/api/entities", tags=["entities"]) -app.include_router(networks.router, prefix="/api/networks", tags=["networks"]) - - -@app.on_event("startup") -async def startup_event(): - """Initialize database connection on startup.""" - try: - db = get_database() - db.verify_connectivity() - print("โœ“ Connected to Neo4j") - except Exception as e: - print(f"โœ— Failed to connect to Neo4j: {e}") - - -@app.on_event("shutdown") -async def shutdown_event(): - """Close database connection on shutdown.""" - db = get_database() - db.close() - print("โœ“ Disconnected from Neo4j") - - -@app.get("/") -async def root(): - """Root endpoint.""" - return { - "message": "Graph-Backed Analysis API", - "version": "0.1.0", - "docs": "/docs", - } - - -@app.get("/health") -async def health(): - 
"""Health check endpoint.""" - try: - db = get_database() - db.verify_connectivity() - return {"status": "healthy", "neo4j": "connected"} - except Exception as e: - return {"status": "unhealthy", "neo4j": "disconnected", "error": str(e)} - diff --git a/Graph_final_project/app/models/__init__.py b/Graph_final_project/app/models/__init__.py deleted file mode 100644 index 624d98d..0000000 --- a/Graph_final_project/app/models/__init__.py +++ /dev/null @@ -1,72 +0,0 @@ -"""Pydantic models for API responses.""" - -from pydantic import BaseModel -from typing import List, Optional, Dict, Any - - -class EntityResponse(BaseModel): - """Entity model for API responses.""" - - id: str - name: Optional[str] = None - jurisdiction: Optional[str] = None - node_type: str - - -class OfficerResponse(BaseModel): - """Officer model for API responses.""" - - id: str - name: Optional[str] = None - node_type: str = "Officer" - - -class IntermediaryResponse(BaseModel): - """Intermediary model for API responses.""" - - id: str - name: Optional[str] = None - node_type: str = "Intermediary" - - -class PathNode(BaseModel): - """Node in a path.""" - - id: str - labels: List[str] - properties: Dict[str, Any] - - -class PathRelationship(BaseModel): - """Relationship in a path.""" - - type: str - start_node: str - end_node: str - properties: Dict[str, Any] - - -class OwnershipPath(BaseModel): - """Ownership path from Officer to Entity.""" - - nodes: List[PathNode] - relationships: List[PathRelationship] - length: int - - -class TopIntermediary(BaseModel): - """Top intermediary with connection count.""" - - intermediary_id: str - intermediary_name: Optional[str] - entity_count: int - - -class RedFlag(BaseModel): - """Red flag detection result.""" - - address_id: str - address: Optional[str] - entity_count: int - entities: List[EntityResponse] - diff --git a/Graph_final_project/app/routers/__init__.py b/Graph_final_project/app/routers/__init__.py deleted file mode 100644 index 2364578..0000000 --- 
a/Graph_final_project/app/routers/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -"""API routers.""" - diff --git a/Graph_final_project/app/routers/entities.py b/Graph_final_project/app/routers/entities.py deleted file mode 100644 index dee700a..0000000 --- a/Graph_final_project/app/routers/entities.py +++ /dev/null @@ -1,61 +0,0 @@ -"""Entity-related API endpoints.""" - -from fastapi import APIRouter, HTTPException, Query -from typing import List - -from app.models import OwnershipPath, PathNode, PathRelationship -from app.services.graph_service import GraphService - -router = APIRouter() -graph_service = GraphService() - - -@router.get("/{entity_id}/ownership/paths", response_model=List[OwnershipPath]) -async def get_ownership_paths( - entity_id: str, - max_length: int = Query(default=5, ge=1, le=10, description="Maximum path length"), -): - """ - Trace ownership paths from Officers to a specific Entity. - - Returns all paths connecting Officers to the given Entity, up to the specified max_length. 
- """ - try: - # Call the correct method name from GraphService - paths_data = graph_service.get_entity_ownership_paths(entity_id, max_depth=max_length) - - # Convert dicts to Pydantic models - paths = [] - for path_dict in paths_data: - # Convert nodes - nodes = [ - PathNode( - id=node_dict["id"], - labels=node_dict["labels"], - properties=node_dict["properties"] - ) - for node_dict in path_dict["nodes"] - ] - - # Convert relationships - relationships = [ - PathRelationship( - type=rel_dict["type"], - start_node=rel_dict["start_node"], - end_node=rel_dict["end_node"], - properties=rel_dict["properties"] - ) - for rel_dict in path_dict["relationships"] - ] - - paths.append( - OwnershipPath( - nodes=nodes, - relationships=relationships, - length=path_dict["length"] - ) - ) - - return paths - except Exception as e: - raise HTTPException(status_code=500, detail=f"Error finding ownership paths: {str(e)}") diff --git a/Graph_final_project/app/routers/networks.py b/Graph_final_project/app/routers/networks.py deleted file mode 100644 index 1aca0d2..0000000 --- a/Graph_final_project/app/routers/networks.py +++ /dev/null @@ -1,152 +0,0 @@ -"""Network analysis API endpoints.""" - -from fastapi import APIRouter, HTTPException, Query -from typing import List, Optional - -from app.models import TopIntermediary, RedFlag, EntityResponse, OwnershipPath, PathNode, PathRelationship -from app.services.graph_service import GraphService - -router = APIRouter() -graph_service = GraphService() - - -@router.get("/intermediaries/top", response_model=List[TopIntermediary]) -async def get_top_intermediaries( - limit: int = Query(default=10, ge=1, le=100, description="Number of results to return"), -): - """ - Find intermediaries connected to the most entities. - - Returns the top intermediaries ranked by the number of entities they are connected to. 
- """ - try: - # Call the correct method name from GraphService - intermediaries_data = graph_service.get_top_intermediaries(limit=limit) - - # Convert dicts to Pydantic models - intermediaries = [ - TopIntermediary( - intermediary_id=item["intermediary_id"], - intermediary_name=item["name"], # Map 'name' to 'intermediary_name' - entity_count=item["entity_count"] - ) - for item in intermediaries_data - ] - - return intermediaries - except Exception as e: - raise HTTPException( - status_code=500, detail=f"Error finding top intermediaries: {str(e)}" - ) - - -@router.get("/redflags", response_model=List[RedFlag]) -async def get_red_flags( - min_entities: int = Query( - default=2, ge=2, description="Minimum number of entities sharing an address" - ), - limit: int = Query(default=50, ge=1, le=200, description="Maximum number of results"), -): - """ - Query for entities sharing the same address (potential shell company ring). - - Detects addresses where multiple entities are registered, which may indicate - shell company networks or suspicious patterns. 
- """ - try: - # Call the correct method name from GraphService - red_flags_data = graph_service.detect_red_flags(min_entities=min_entities, limit=limit) - - # Convert dicts to Pydantic models - red_flags = [] - for flag_dict in red_flags_data: - # Convert sample_entities (list of names) to EntityResponse objects - # Note: The service returns entity names, so we create minimal EntityResponse objects - entities = [ - EntityResponse( - id="", # We don't have the ID from the service - name=entity_name if entity_name else None, - jurisdiction=None, - node_type="Entity" - ) - for entity_name in flag_dict.get("sample_entities", []) - ] - - red_flags.append( - RedFlag( - address_id=flag_dict["address_id"], - address=flag_dict["address"], - entity_count=flag_dict["entity_count"], - entities=entities - ) - ) - - return red_flags - except Exception as e: - raise HTTPException(status_code=500, detail=f"Error finding red flags: {str(e)}") - - -@router.get("/path/shortest", response_model=Optional[OwnershipPath]) -async def get_shortest_path( - start_node_id: str = Query(..., description="ID of the starting node"), - end_node_id: str = Query(..., description="ID of the ending node"), -): - """ - Find the shortest path between two nodes using Neo4j's shortestPath function. - - Returns the shortest path connecting the start and end nodes, or null if no path exists. 
- """ - try: - path_data = graph_service.get_shortest_path(start_node_id, end_node_id) - - if path_data is None: - return None - - # Convert dict to Pydantic model - nodes = [ - PathNode( - id=node_dict["id"], - labels=node_dict["labels"], - properties=node_dict["properties"] - ) - for node_dict in path_data["nodes"] - ] - - relationships = [ - PathRelationship( - type=rel_dict["type"], - start_node=rel_dict["start_node"], - end_node=rel_dict["end_node"], - properties=rel_dict["properties"] - ) - for rel_dict in path_data["relationships"] - ] - - return OwnershipPath( - nodes=nodes, - relationships=relationships, - length=path_data["length"] - ) - except Exception as e: - raise HTTPException(status_code=500, detail=f"Error finding shortest path: {str(e)}") - - -@router.get("/stats/centrality") -async def get_most_connected_officers( - limit: int = Query(default=10, ge=1, le=100, description="Number of results to return"), -): - """ - Find the most connected officers using centrality analysis. - - Returns officers ranked by their degree (number of entities they are connected to). - This is a measure of network centrality - officers with higher degrees are more central - in the offshore network. 
- """ - try: - officers_data = graph_service.get_most_connected_officers(limit=limit) - - return officers_data - except Exception as e: - raise HTTPException( - status_code=500, detail=f"Error finding most connected officers: {str(e)}" - ) diff --git a/Graph_final_project/app/services/__init__.py b/Graph_final_project/app/services/__init__.py deleted file mode 100644 index 314b17e..0000000 --- a/Graph_final_project/app/services/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -"""Business logic and Cypher query services.""" - diff --git a/Graph_final_project/app/services/graph_service.py b/Graph_final_project/app/services/graph_service.py deleted file mode 100644 index 55f23ee..0000000 --- a/Graph_final_project/app/services/graph_service.py +++ /dev/null @@ -1,262 +0,0 @@ -"""Graph Service - Business Logic Layer for Neo4j Queries.""" - -from typing import List, Dict, Any, Optional -from app.database import get_database - - -class GraphService: - """ - The "Brain" of the application. - Contains all business logic and Cypher queries for graph analysis. - """ - - def __init__(self): - """Initialize the graph service with database connection.""" - self.db = get_database() - - def get_entity_ownership_paths(self, entity_id: str, max_depth: int = 4) -> List[Dict[str, Any]]: - """ - Trace how an Officer is connected to a specific Entity. - Uses variable-length paths to find ownership chains. 
- - Args: - entity_id: The ID of the target entity - max_depth: Maximum path depth to search (default: 4) - - Returns: - List of paths, each containing nodes and relationships as JSON-friendly structures - """ - query = """ - MATCH p = (o:Officer)-[:OFFICER_OF*1..$max_depth]->(e:Entity {id: $entity_id}) - RETURN p - ORDER BY length(p) - LIMIT 100 - """ - - paths = [] - with self.db.get_session() as session: - result = session.run(query, entity_id=entity_id, max_depth=max_depth) - - for record in result: - path = record["p"] - - # Extract nodes - nodes = [] - for node in path.nodes: - nodes.append({ - "id": node.get("id", str(node.id)), - "labels": list(node.labels), - "properties": dict(node) - }) - - # Extract relationships - relationships = [] - for rel in path.relationships: - relationships.append({ - "type": rel.type, - "start_node": str(rel.start_node.get("id", rel.start_node.id)), - "end_node": str(rel.end_node.get("id", rel.end_node.id)), - "properties": dict(rel) - }) - - paths.append({ - "nodes": nodes, - "relationships": relationships, - "length": len(path.relationships) - }) - - return paths - - def get_top_intermediaries(self, limit: int = 20) -> List[Dict[str, Any]]: - """ - Find "Hubs" - intermediaries that manage many entities. - These are key players in the offshore network. 
- - Args: - limit: Maximum number of results to return (default: 20) - - Returns: - List of intermediaries with their entity counts, sorted by count descending - """ - query = """ - MATCH (i:Intermediary)-[:INTERMEDIARY_OF]->(e:Entity) - WITH i, count(e) as entity_count - ORDER BY entity_count DESC - LIMIT $limit - RETURN i.id as intermediary_id, - i.name as name, - entity_count - """ - - intermediaries = [] - with self.db.get_session() as session: - result = session.run(query, limit=limit) - - for record in result: - intermediaries.append({ - "intermediary_id": record["intermediary_id"], - "name": record["name"], - "entity_count": record["entity_count"] - }) - - return intermediaries - - def detect_red_flags(self, min_entities: int = 10, limit: int = 50) -> List[Dict[str, Any]]: - """ - Find suspicious clusters - many entities registered at the exact same address. - This is a red flag for potential shell company rings. - - Args: - min_entities: Minimum number of entities sharing an address to flag (default: 10) - limit: Maximum number of results to return (default: 50) - - Returns: - List of suspicious addresses with entity counts and sample entities - """ - query = """ - MATCH (a:Address)<-[:REGISTERED_ADDRESS]-(e:Entity) - WITH a, count(e) as count, collect(e.name) as entities - WHERE count > $min_entities - RETURN a.id as address_id, - a.address as address, - count, - entities[0..5] as sample_entities - ORDER BY count DESC - LIMIT $limit - """ - - red_flags = [] - with self.db.get_session() as session: - result = session.run(query, min_entities=min_entities, limit=limit) - - for record in result: - red_flags.append({ - "address_id": record["address_id"], - "address": record["address"], - "entity_count": record["count"], - "sample_entities": record["sample_entities"] - }) - - return red_flags - - def get_entity_details(self, entity_id: str) -> Optional[Dict[str, Any]]: - """ - Simple lookup for an Entity. 
- Returns the Entity properties plus the name of its Jurisdiction. - - Args: - entity_id: The ID of the entity to lookup - - Returns: - Dictionary with entity details and jurisdiction name, or None if not found - """ - query = """ - MATCH (e:Entity {id: $entity_id}) - OPTIONAL MATCH (e)-[:REGISTERED_IN]->(j:Jurisdiction) - RETURN e, - j.name as jurisdiction_name - LIMIT 1 - """ - - with self.db.get_session() as session: - result = session.run(query, entity_id=entity_id) - record = result.single() - - if record is None: - return None - - entity = record["e"] - return { - "id": entity.get("id", str(entity.id)), - "name": entity.get("name"), - "source": entity.get("source"), - "node_type": entity.get("node_type"), - "jurisdiction_name": record["jurisdiction_name"], - "properties": dict(entity) - } - - def get_shortest_path(self, start_node_id: str, end_node_id: str) -> Optional[Dict[str, Any]]: - """ - Find the shortest path between two nodes using Neo4j's shortestPath function. - - Args: - start_node_id: The ID of the starting node - end_node_id: The ID of the ending node - - Returns: - Dictionary containing the path with nodes and relationships, or None if no path exists - """ - query = """ - MATCH p = shortestPath((a)-[*]-(b)) - WHERE a.id = $start_id AND b.id = $end_id - RETURN p - LIMIT 1 - """ - - with self.db.get_session() as session: - result = session.run(query, start_id=start_node_id, end_id=end_node_id) - record = result.single() - - if record is None: - return None - - path = record["p"] - - # Extract nodes - nodes = [] - for node in path.nodes: - nodes.append({ - "id": node.get("id", str(node.id)), - "labels": list(node.labels), - "properties": dict(node) - }) - - # Extract relationships - relationships = [] - for rel in path.relationships: - relationships.append({ - "type": rel.type, - "start_node": str(rel.start_node.get("id", rel.start_node.id)), - "end_node": str(rel.end_node.get("id", rel.end_node.id)), - "properties": dict(rel) - }) - - return { - 
"nodes": nodes, - "relationships": relationships, - "length": len(path.relationships) - } - - def get_most_connected_officers(self, limit: int = 20) -> List[Dict[str, Any]]: - """ - Find the most connected officers using centrality analysis. - Officers are ranked by their degree (number of entities they are connected to). - - Args: - limit: Maximum number of results to return (default: 20) - - Returns: - List of officers with their connection counts (degree), sorted by degree descending - """ - query = """ - MATCH (o:Officer)-[:OFFICER_OF]->(e:Entity) - WITH o, count(e) as degree - ORDER BY degree DESC - LIMIT $limit - RETURN o.id as officer_id, - o.name as name, - degree - """ - - officers = [] - with self.db.get_session() as session: - result = session.run(query, limit=limit) - - for record in result: - officers.append({ - "officer_id": record["officer_id"], - "name": record["name"], - "degree": record["degree"] - }) - - return officers diff --git a/Graph_final_project/create_notebook.py b/Graph_final_project/create_notebook.py deleted file mode 100644 index 2fc9fb6..0000000 --- a/Graph_final_project/create_notebook.py +++ /dev/null @@ -1,264 +0,0 @@ -""" -Script to generate graph_eda.ipynb Jupyter notebook. -Run this script to create the notebook file. -""" - -import json - -# Notebook structure -notebook = { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Offshore Graph Analysis - Exploratory Data Analysis\n", - "\n", - "This notebook demonstrates connection to Neo4j and basic graph statistics for the Offshore Leaks dataset." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. 
Setup and Installation" - ] - }, - { - "cell_type": "code", - "execution_count": None, - "metadata": {}, - "outputs": [], - "source": [ - "# Install required packages if not already installed\n", - "# !pip install neo4j pandas matplotlib" - ] - }, - { - "cell_type": "code", - "execution_count": None, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "from neo4j import GraphDatabase\n", - "import warnings\n", - "warnings.filterwarnings('ignore')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Connect to Neo4j Database" - ] - }, - { - "cell_type": "code", - "execution_count": None, - "metadata": {}, - "outputs": [], - "source": [ - "# Connection parameters\n", - "URI = \"bolt://localhost:7687\"\n", - "USER = \"neo4j\"\n", - "PASSWORD = \"password\"\n", - "\n", - "# Create driver\n", - "driver = GraphDatabase.driver(URI, auth=(USER, PASSWORD))\n", - "\n", - "# Verify connectivity\n", - "try:\n", - " driver.verify_connectivity()\n", - " print(\"โœ“ Successfully connected to Neo4j\")\n", - "except Exception as e:\n", - " print(f\"โœ— Connection failed: {e}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. 
Query 1: Basic Statistics - Count Nodes by Type" - ] - }, - { - "cell_type": "code", - "execution_count": None, - "metadata": {}, - "outputs": [], - "source": [ - "# Query to count nodes by type\n", - "query = \"\"\"\n", - "MATCH (n)\n", - "RETURN labels(n)[0] as node_type, count(n) as count\n", - "ORDER BY count DESC\n", - "\"\"\"\n", - "\n", - "with driver.session() as session:\n", - " result = session.run(query)\n", - " data = [record for record in result]\n", - "\n", - "# Create DataFrame\n", - "df_stats = pd.DataFrame([{\"Node Type\": r[\"node_type\"], \"Count\": r[\"count\"]} for r in data])\n", - "\n", - "print(\"\\n=== Node Statistics ===\")\n", - "print(df_stats.to_string(index=False))\n", - "\n", - "# Display as DataFrame\n", - "df_stats" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Query 2: Top 10 Jurisdictions by Number of Entities" - ] - }, - { - "cell_type": "code", - "execution_count": None, - "metadata": {}, - "outputs": [], - "source": [ - "# Query for top jurisdictions\n", - "query = \"\"\"\n", - "MATCH (e:Entity)-[:REGISTERED_IN]->(j:Jurisdiction)\n", - "WITH j, count(e) as entity_count\n", - "ORDER BY entity_count DESC\n", - "LIMIT 10\n", - "RETURN j.name as jurisdiction, entity_count\n", - "\"\"\"\n", - "\n", - "with driver.session() as session:\n", - " result = session.run(query)\n", - " data = [record for record in result]\n", - "\n", - "# Create DataFrame\n", - "df_jurisdictions = pd.DataFrame([{\n", - " \"Jurisdiction\": r[\"jurisdiction\"],\n", - " \"Entity Count\": r[\"entity_count\"]\n", - "} for r in data])\n", - "\n", - "print(\"\\n=== Top 10 Jurisdictions by Entity Count ===\")\n", - "print(df_jurisdictions.to_string(index=False))\n", - "\n", - "df_jurisdictions" - ] - }, - { - "cell_type": "code", - "execution_count": None, - "metadata": {}, - "outputs": [], - "source": [ - "# Create bar chart\n", - "plt.figure(figsize=(12, 6))\n", - "plt.barh(df_jurisdictions[\"Jurisdiction\"], 
df_jurisdictions[\"Entity Count\"], color='steelblue')\n", - "plt.xlabel('Number of Entities', fontsize=12)\n", - "plt.ylabel('Jurisdiction', fontsize=12)\n", - "plt.title('Top 10 Jurisdictions by Number of Entities', fontsize=14, fontweight='bold')\n", - "plt.gca().invert_yaxis() # Show highest at top\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. Query 3: Risk Analysis - Top 5 Red Flag Addresses" - ] - }, - { - "cell_type": "code", - "execution_count": None, - "metadata": {}, - "outputs": [], - "source": [ - "# Query for addresses with most registered entities (Red Flags)\n", - "query = \"\"\"\n", - "MATCH (a:Address)<-[:REGISTERED_ADDRESS]-(e:Entity)\n", - "WITH a, count(e) as entity_count, collect(e.name)[0..5] as sample_entities\n", - "ORDER BY entity_count DESC\n", - "LIMIT 5\n", - "RETURN a.address as address, entity_count, sample_entities\n", - "\"\"\"\n", - "\n", - "with driver.session() as session:\n", - " result = session.run(query)\n", - " data = [record for record in result]\n", - "\n", - "# Create DataFrame\n", - "df_redflags = pd.DataFrame([{\n", - " \"Address\": r[\"address\"][:100] + \"...\" if r[\"address\"] and len(r[\"address\"]) > 100 else (r[\"address\"] or \"N/A\"),\n", - " \"Entity Count\": r[\"entity_count\"],\n", - " \"Sample Entities\": \", \".join([e for e in r[\"sample_entities\"] if e])[:100]\n", - "} for r in data])\n", - "\n", - "print(\"\\n=== Top 5 Red Flag Addresses (Most Entities) ===\")\n", - "print(df_redflags.to_string(index=False))\n", - "\n", - "df_redflags" - ] - }, - { - "cell_type": "code", - "execution_count": None, - "metadata": {}, - "outputs": [], - "source": [ - "# Visualize red flags\n", - "plt.figure(figsize=(14, 6))\n", - "plt.bar(range(len(df_redflags)), df_redflags[\"Entity Count\"], color='crimson')\n", - "plt.xlabel('Address Index', fontsize=12)\n", - "plt.ylabel('Number of Entities', fontsize=12)\n", - "plt.title('Top 5 Red Flag 
Addresses - Entities per Address', fontsize=14, fontweight='bold')\n", - "plt.xticks(range(len(df_redflags)), [f\"Address {i+1}\" for i in range(len(df_redflags))])\n", - "plt.grid(axis='y', alpha=0.3)\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 6. Cleanup" - ] - }, - { - "cell_type": "code", - "execution_count": None, - "metadata": {}, - "outputs": [], - "source": [ - "# Close the driver connection\n", - "driver.close()\n", - "print(\"โœ“ Database connection closed\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.11.0" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} - -# Write notebook to file -with open("graph_eda.ipynb", "w", encoding="utf-8") as f: - json.dump(notebook, f, indent=2, ensure_ascii=False) - -print("โœ“ Successfully created graph_eda.ipynb") - diff --git a/Graph_final_project/docker-compose.yml b/Graph_final_project/docker-compose.yml deleted file mode 100644 index ab1c54e..0000000 --- a/Graph_final_project/docker-compose.yml +++ /dev/null @@ -1,47 +0,0 @@ -version: '3.8' - -services: - neo4j: - image: neo4j:5-community - container_name: offshore-neo4j - ports: - - "7474:7474" # HTTP - - "7687:7687" # Bolt - environment: - - NEO4J_AUTH=neo4j/password - - NEO4J_PLUGINS=["apoc", "graph-data-science"] - - NEO4J_dbms_security_procedures_unrestricted=gds.*,apoc.* - volumes: - - neo4j_data:/data - - neo4j_logs:/logs - healthcheck: - test: ["CMD", "cypher-shell", "-u", "neo4j", "-p", "password", "RETURN 1"] - interval: 10s - timeout: 5s - retries: 5 - - api: - build: - context: . 
- dockerfile: Dockerfile - container_name: offshore-api - ports: - - "8000:8000" - volumes: - - ./app:/code/app - - ./scripts:/code/scripts - - ./data:/code/data - - ./Data:/code/Data - environment: - - NEO4J_URI=bolt://neo4j:7687 - - NEO4J_USER=neo4j - - NEO4J_PASSWORD=password - depends_on: - neo4j: - condition: service_healthy - command: uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload - -volumes: - neo4j_data: - neo4j_logs: - diff --git a/Graph_final_project/graph_eda.ipynb b/Graph_final_project/graph_eda.ipynb deleted file mode 100644 index b3b4bd6..0000000 --- a/Graph_final_project/graph_eda.ipynb +++ /dev/null @@ -1,249 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Offshore Graph Analysis - Exploratory Data Analysis\n", - "\n", - "This notebook demonstrates connection to Neo4j and basic graph statistics for the Offshore Leaks dataset." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Setup and Installation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Install required packages if not already installed\n", - "# !pip install neo4j pandas matplotlib" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "from neo4j import GraphDatabase\n", - "import warnings\n", - "warnings.filterwarnings('ignore')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. 
Connect to Neo4j Database" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Connection parameters\n", - "URI = \"bolt://localhost:7687\"\n", - "USER = \"neo4j\"\n", - "PASSWORD = \"password\"\n", - "\n", - "# Create driver\n", - "driver = GraphDatabase.driver(URI, auth=(USER, PASSWORD))\n", - "\n", - "# Verify connectivity\n", - "try:\n", - " driver.verify_connectivity()\n", - " print(\"โœ“ Successfully connected to Neo4j\")\n", - "except Exception as e:\n", - " print(f\"โœ— Connection failed: {e}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Query 1: Basic Statistics - Count Nodes by Type" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Query to count nodes by type\n", - "query = \"\"\"\n", - "MATCH (n)\n", - "RETURN labels(n)[0] as node_type, count(n) as count\n", - "ORDER BY count DESC\n", - "\"\"\"\n", - "\n", - "with driver.session() as session:\n", - " result = session.run(query)\n", - " data = [record for record in result]\n", - "\n", - "# Create DataFrame\n", - "df_stats = pd.DataFrame([{\"Node Type\": r[\"node_type\"], \"Count\": r[\"count\"]} for r in data])\n", - "\n", - "print(\"\\n=== Node Statistics ===\")\n", - "print(df_stats.to_string(index=False))\n", - "\n", - "# Display as DataFrame\n", - "df_stats" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. 
Query 2: Top 10 Jurisdictions by Number of Entities" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Query for top jurisdictions\n", - "query = \"\"\"\n", - "MATCH (e:Entity)-[:REGISTERED_IN]->(j:Jurisdiction)\n", - "WITH j, count(e) as entity_count\n", - "ORDER BY entity_count DESC\n", - "LIMIT 10\n", - "RETURN j.name as jurisdiction, entity_count\n", - "\"\"\"\n", - "\n", - "with driver.session() as session:\n", - " result = session.run(query)\n", - " data = [record for record in result]\n", - "\n", - "# Create DataFrame\n", - "df_jurisdictions = pd.DataFrame([{\n", - " \"Jurisdiction\": r[\"jurisdiction\"],\n", - " \"Entity Count\": r[\"entity_count\"]\n", - "} for r in data])\n", - "\n", - "print(\"\\n=== Top 10 Jurisdictions by Entity Count ===\")\n", - "print(df_jurisdictions.to_string(index=False))\n", - "\n", - "df_jurisdictions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create bar chart\n", - "plt.figure(figsize=(12, 6))\n", - "plt.barh(df_jurisdictions[\"Jurisdiction\"], df_jurisdictions[\"Entity Count\"], color='steelblue')\n", - "plt.xlabel('Number of Entities', fontsize=12)\n", - "plt.ylabel('Jurisdiction', fontsize=12)\n", - "plt.title('Top 10 Jurisdictions by Number of Entities', fontsize=14, fontweight='bold')\n", - "plt.gca().invert_yaxis() # Show highest at top\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. 
Query 3: Risk Analysis - Top 5 Red Flag Addresses" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Query for addresses with most registered entities (Red Flags)\n", - "query = \"\"\"\n", - "MATCH (a:Address)<-[:REGISTERED_ADDRESS]-(e:Entity)\n", - "WITH a, count(e) as entity_count, collect(e.name)[0..5] as sample_entities\n", - "ORDER BY entity_count DESC\n", - "LIMIT 5\n", - "RETURN a.address as address, entity_count, sample_entities\n", - "\"\"\"\n", - "\n", - "with driver.session() as session:\n", - " result = session.run(query)\n", - " data = [record for record in result]\n", - "\n", - "# Create DataFrame\n", - "df_redflags = pd.DataFrame([{\n", - " \"Address\": r[\"address\"][:100] + \"...\" if r[\"address\"] and len(r[\"address\"]) > 100 else (r[\"address\"] or \"N/A\"),\n", - " \"Entity Count\": r[\"entity_count\"],\n", - " \"Sample Entities\": \", \".join([e for e in r[\"sample_entities\"] if e])[:100]\n", - "} for r in data])\n", - "\n", - "print(\"\\n=== Top 5 Red Flag Addresses (Most Entities) ===\")\n", - "print(df_redflags.to_string(index=False))\n", - "\n", - "df_redflags" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Visualize red flags\n", - "plt.figure(figsize=(14, 6))\n", - "plt.bar(range(len(df_redflags)), df_redflags[\"Entity Count\"], color='crimson')\n", - "plt.xlabel('Address Index', fontsize=12)\n", - "plt.ylabel('Number of Entities', fontsize=12)\n", - "plt.title('Top 5 Red Flag Addresses - Entities per Address', fontsize=14, fontweight='bold')\n", - "plt.xticks(range(len(df_redflags)), [f\"Address {i+1}\" for i in range(len(df_redflags))])\n", - "plt.grid(axis='y', alpha=0.3)\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 6. 
Cleanup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Close the driver connection\n", - "driver.close()\n", - "print(\"โœ“ Database connection closed\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.11.0" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/Graph_final_project/requirements.txt b/Graph_final_project/requirements.txt deleted file mode 100644 index e17c947..0000000 --- a/Graph_final_project/requirements.txt +++ /dev/null @@ -1,15 +0,0 @@ -fastapi==0.104.1 -uvicorn[standard]==0.24.0 -neo4j==5.14.1 -pydantic==2.5.0 -pydantic-settings==2.1.0 -python-dotenv==1.0.0 -pytest==7.4.3 -pytest-asyncio==0.21.1 -pytest-cov==4.1.0 -httpx==0.25.2 -pandas==2.1.3 -matplotlib==3.8.2 -jupyter==1.0.0 -pylint==3.0.3 - diff --git a/Graph_final_project/scripts/check_pylint_score.py b/Graph_final_project/scripts/check_pylint_score.py deleted file mode 100644 index 670f315..0000000 --- a/Graph_final_project/scripts/check_pylint_score.py +++ /dev/null @@ -1,62 +0,0 @@ -"""Script to check pylint score and fail if below threshold.""" - -import sys -import subprocess -import re - - -def get_pylint_score(): - """Run pylint and extract the score.""" - try: - result = subprocess.run( - ["pylint", "app/", "scripts/", "tests/", "--rcfile=.pylintrc", "--output-format=text"], - capture_output=True, - text=True, - check=False - ) - - # Extract score from output - # Pylint outputs: "Your code has been rated at X.XX/10" - output = result.stdout + result.stderr - match = re.search(r'rated at ([\d.]+)/10', output) - - if match: - return float(match.group(1)) - - # If no score found, print output for debugging - print("Pylint output:") - print(output) - return None - - except FileNotFoundError: - print("ERROR: pylint not found. 
Install it with: pip install pylint") - return None - except Exception as e: - print(f"ERROR running pylint: {e}") - return None - - -def main(): - """Main function to check pylint score.""" - threshold = 9.5 - - print("Running pylint...") - score = get_pylint_score() - - if score is None: - print("ERROR: Could not determine pylint score") - sys.exit(1) - - print(f"Pylint score: {score}/10") - - if score < threshold: - print(f"ERROR: Pylint score ({score}) is below threshold ({threshold})") - sys.exit(1) - else: - print(f"✓ Pylint score ({score}) meets requirement (>= {threshold})") - sys.exit(0) - - -if __name__ == "__main__": - main() - diff --git a/Graph_final_project/scripts/seed_data.py b/Graph_final_project/scripts/seed_data.py deleted file mode 100644 index fb478e7..0000000 --- a/Graph_final_project/scripts/seed_data.py +++ /dev/null @@ -1,266 +0,0 @@ -"""ETL script to load ICIJ CSV files into Neo4j.""" - -import os -import sys -from pathlib import Path -from typing import Dict, List - -import pandas as pd - -from app.database import get_database -from app.config import settings - -# Chunk size for reading large CSVs -CHUNK_SIZE = 10000 - - -def load_entities(tx, dataframe): - """ - Load entities and create Jurisdiction nodes in a single transaction. - Uses the 'secret sauce' pattern for Entity-Jurisdiction relationship. - """ - query = """ - UNWIND $rows AS row - // 1. Create the Entity - MERGE (e:Entity {id: row.node_id}) - SET e.name = row.name, - e.source = row.sourceID, - e.node_type = 'Entity' - - // 2.
Extract and link Jurisdiction - WITH e, row - WHERE row.jurisdiction_description IS NOT NULL AND row.jurisdiction_description <> '' - MERGE (j:Jurisdiction {name: row.jurisdiction_description}) - MERGE (e)-[:REGISTERED_IN]->(j) - """ - - # Convert DataFrame to list of dicts for Neo4j - # Handle NaN values by converting to None - records = dataframe.where(pd.notna(dataframe), None).to_dict('records') - tx.run(query, rows=records) - - -def load_officers(tx, dataframe): - """Load Officer nodes.""" - query = """ - UNWIND $rows AS row - MERGE (o:Officer {id: row.node_id}) - SET o.name = row.name, - o.node_type = 'Officer', - o.source = row.sourceID - """ - - records = dataframe.where(pd.notna(dataframe), None).to_dict('records') - tx.run(query, rows=records) - - -def load_intermediaries(tx, dataframe): - """Load Intermediary nodes.""" - query = """ - UNWIND $rows AS row - MERGE (i:Intermediary {id: row.node_id}) - SET i.name = row.name, - i.node_type = 'Intermediary', - i.source = row.sourceID - """ - - records = dataframe.where(pd.notna(dataframe), None).to_dict('records') - tx.run(query, rows=records) - - -def load_addresses(tx, dataframe): - """Load Address nodes.""" - query = """ - UNWIND $rows AS row - MERGE (a:Address {id: row.node_id}) - SET a.address = row.address, - a.node_type = 'Address', - a.source = row.sourceID - """ - - records = dataframe.where(pd.notna(dataframe), None).to_dict('records') - tx.run(query, rows=records) - - -def load_relationships(tx, dataframe): - """Load relationships based on rel_type.""" - # Filter by relationship type and create appropriate relationships - # Handle NaN values by filling with empty string - rel_type_series = dataframe['rel_type'].fillna('').astype(str).str.lower().str.strip() - - officer_rels = dataframe[rel_type_series == 'officer_of'] - intermediary_rels = dataframe[rel_type_series == 'intermediary_of'] - address_rels = dataframe[rel_type_series == 'registered_address'] - - # Officer relationships - if not 
officer_rels.empty: - officer_query = """ - UNWIND $rows AS row - MATCH (o:Officer {id: row.node_id_start}) - MATCH (e:Entity {id: row.node_id_end}) - MERGE (o)-[:OFFICER_OF]->(e) - """ - records = officer_rels.where(pd.notna(officer_rels), None).to_dict('records') - tx.run(officer_query, rows=records) - - # Intermediary relationships - if not intermediary_rels.empty: - intermediary_query = """ - UNWIND $rows AS row - MATCH (i:Intermediary {id: row.node_id_start}) - MATCH (e:Entity {id: row.node_id_end}) - MERGE (i)-[:INTERMEDIARY_OF]->(e) - """ - records = intermediary_rels.where(pd.notna(intermediary_rels), None).to_dict('records') - tx.run(intermediary_query, rows=records) - - # Address relationships - if not address_rels.empty: - address_query = """ - UNWIND $rows AS row - MATCH (e:Entity {id: row.node_id_start}) - MATCH (a:Address {id: row.node_id_end}) - MERGE (e)-[:REGISTERED_ADDRESS]->(a) - """ - records = address_rels.where(pd.notna(address_rels), None).to_dict('records') - tx.run(address_query, rows=records) - - -def process_entities(db, data_dir: Path): - """Process entities CSV with chunking.""" - entities_file = data_dir / "nodes-entities.csv" - - if not entities_file.exists(): - print(f"Warning: {entities_file} not found. Skipping...") - return 0 - - print("Processing entities and jurisdictions...") - total_processed = 0 - - with db.get_session() as session: - for chunk_num, chunk in enumerate(pd.read_csv(entities_file, chunksize=CHUNK_SIZE, low_memory=False), 1): - session.execute_write(load_entities, chunk) - total_processed += len(chunk) - - if chunk_num % 10 == 0: - print(f" Processed {total_processed} entities...") - - print(f"✓ Processed {total_processed} entities with jurisdictions") - return total_processed - - -def process_nodes(db, data_dir: Path, filename: str, load_func, node_type: str): - """Process node CSV files with chunking.""" - file_path = data_dir / filename - - if not file_path.exists(): - print(f"Warning: {file_path} not found.
Skipping...") - return 0 - - print(f"Processing {node_type} nodes...") - total_processed = 0 - - with db.get_session() as session: - for chunk_num, chunk in enumerate(pd.read_csv(file_path, chunksize=CHUNK_SIZE, low_memory=False), 1): - session.execute_write(load_func, chunk) - total_processed += len(chunk) - - if chunk_num % 10 == 0: - print(f" Processed {total_processed} {node_type} nodes...") - - print(f"✓ Processed {total_processed} {node_type} nodes") - return total_processed - - -def process_relationships(db, data_dir: Path): - """Process relationships CSV with chunking.""" - relationships_file = data_dir / "relationships.csv" - - if not relationships_file.exists(): - print(f"Warning: {relationships_file} not found. Skipping...") - return - - print("Processing relationships...") - total_processed = 0 - - with db.get_session() as session: - for chunk_num, chunk in enumerate(pd.read_csv(relationships_file, chunksize=CHUNK_SIZE, low_memory=False), 1): - session.execute_write(load_relationships, chunk) - total_processed += len(chunk) - - if chunk_num % 10 == 0: - print(f" Processed {total_processed} relationships...") - - print(f"✓ Processed {total_processed} relationships") - - -def main(): - """Main ETL function.""" - print("Starting data ingestion...") - print(f"Neo4j URI: {settings.neo4j_uri}") - - db = get_database() - - try: - db.verify_connectivity() - print("✓ Connected to Neo4j") - except Exception as e: - print(f"✗ Failed to connect to Neo4j: {e}") - sys.exit(1) - - # Determine data directory (try both lowercase and uppercase) - data_dir = Path("data") - if not data_dir.exists(): - data_dir = Path("Data") # Try uppercase - if not data_dir.exists(): - data_dir = Path("../data") - if not data_dir.exists(): - data_dir = Path("../Data") - if not data_dir.exists(): - print("✗ Data directory not found.
Please ensure CSV files are in ./data/ or ./Data/") - sys.exit(1) - - print(f"Using data directory: {data_dir}\n") - - # Process entities first (creates jurisdictions too using the secret sauce pattern) - process_entities(db, data_dir) - - # Process other node types - print() - process_nodes(db, data_dir, "nodes-officers.csv", load_officers, "Officer") - process_nodes(db, data_dir, "nodes-intermediaries.csv", load_intermediaries, "Intermediary") - process_nodes(db, data_dir, "nodes-addresses.csv", load_addresses, "Address") - - # Process relationships - print() - process_relationships(db, data_dir) - - print("\n✓ Data ingestion complete!") - - # Print summary statistics - with db.get_session() as session: - result = session.run( - """ - MATCH (n) - RETURN labels(n)[0] as label, count(n) as count - ORDER BY label - """ - ) - print("\nNode counts:") - for record in result: - print(f" {record['label']}: {record['count']}") - - result = session.run( - """ - MATCH ()-[r]->() - RETURN type(r) as rel_type, count(r) as count - ORDER BY rel_type - """ - ) - print("\nRelationship counts:") - for record in result: - print(f" {record['rel_type']}: {record['count']}") - - -if __name__ == "__main__": - main() diff --git a/Graph_final_project/tests/__init__.py b/Graph_final_project/tests/__init__.py deleted file mode 100644 index dcaf535..0000000 --- a/Graph_final_project/tests/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -"""Test suite for Graph-Backed Analysis API.""" - diff --git a/Graph_final_project/tests/conftest.py b/Graph_final_project/tests/conftest.py deleted file mode 100644 index 7f36882..0000000 --- a/Graph_final_project/tests/conftest.py +++ /dev/null @@ -1,30 +0,0 @@ -"""Pytest configuration and shared fixtures.""" - -import pytest -from unittest.mock import Mock, patch -from fastapi.testclient import TestClient - -from app.main import app -from app.services.graph_service import GraphService - - -@pytest.fixture -def client(): - """Create a test client for the
FastAPI application.""" - return TestClient(app) - - -@pytest.fixture -def mock_graph_service(): - """Create a mocked GraphService instance.""" - mock_service = Mock(spec=GraphService) - return mock_service - - -@pytest.fixture -def patched_graph_service(mock_graph_service): - """Patch GraphService in routers to use the mock.""" - with patch('app.routers.entities.graph_service', mock_graph_service), \ - patch('app.routers.networks.graph_service', mock_graph_service): - yield mock_graph_service - diff --git a/Graph_final_project/tests/test_api.py b/Graph_final_project/tests/test_api.py deleted file mode 100644 index e4337ac..0000000 --- a/Graph_final_project/tests/test_api.py +++ /dev/null @@ -1,318 +0,0 @@ -"""Comprehensive API endpoint tests with mocked GraphService.""" - -import pytest -from fastapi.testclient import TestClient - - -class TestRootEndpoints: - """Tests for root-level endpoints.""" - - def test_root_endpoint(self, client: TestClient): - """Test root endpoint returns welcome message.""" - response = client.get("/") - assert response.status_code == 200 - data = response.json() - assert "message" in data - assert data["message"] == "Graph-Backed Analysis API" - assert "version" in data - assert "docs" in data - - def test_health_endpoint_healthy(self, client: TestClient): - """Test health endpoint when Neo4j is available.""" - # Note: This may fail if Neo4j is not running, which is acceptable - response = client.get("/health") - assert response.status_code == 200 - data = response.json() - assert "status" in data - assert "neo4j" in data - - -class TestEntityEndpoints: - """Tests for entity-related endpoints.""" - - def test_get_ownership_paths_success(self, client: TestClient, patched_graph_service): - """Test successful ownership paths retrieval.""" - # Mock the service response - mock_paths = [ - { - "nodes": [ - { - "id": "12000001", - "labels": ["Officer"], - "properties": {"name": "John Doe", "id": "12000001"} - }, - { - "id": "10000001", - 
"labels": ["Entity"], - "properties": {"name": "Test Entity", "id": "10000001"} - } - ], - "relationships": [ - { - "type": "OFFICER_OF", - "start_node": "12000001", - "end_node": "10000001", - "properties": {} - } - ], - "length": 1 - } - ] - patched_graph_service.get_entity_ownership_paths.return_value = mock_paths - - response = client.get("/api/entities/10000001/ownership/paths?max_length=5") - assert response.status_code == 200 - data = response.json() - assert isinstance(data, list) - assert len(data) == 1 - assert "nodes" in data[0] - assert "relationships" in data[0] - assert "length" in data[0] - patched_graph_service.get_entity_ownership_paths.assert_called_once_with( - "10000001", max_depth=5 - ) - - def test_get_ownership_paths_with_default_max_length(self, client: TestClient, patched_graph_service): - """Test ownership paths with default max_length parameter.""" - patched_graph_service.get_entity_ownership_paths.return_value = [] - - response = client.get("/api/entities/10000001/ownership/paths") - assert response.status_code == 200 - assert response.json() == [] - patched_graph_service.get_entity_ownership_paths.assert_called_once_with( - "10000001", max_depth=5 - ) - - def test_get_ownership_paths_invalid_max_length(self, client: TestClient): - """Test ownership paths with invalid max_length parameter.""" - response = client.get("/api/entities/10000001/ownership/paths?max_length=0") - assert response.status_code == 422 # Validation error - - response = client.get("/api/entities/10000001/ownership/paths?max_length=11") - assert response.status_code == 422 # Validation error - - def test_get_ownership_paths_service_error(self, client: TestClient, patched_graph_service): - """Test ownership paths endpoint handles service errors.""" - patched_graph_service.get_entity_ownership_paths.side_effect = Exception("Database error") - - response = client.get("/api/entities/10000001/ownership/paths") - assert response.status_code == 500 - assert "Error finding 
ownership paths" in response.json()["detail"] - - -class TestNetworkEndpoints: - """Tests for network analysis endpoints.""" - - def test_get_top_intermediaries_success(self, client: TestClient, patched_graph_service): - """Test successful top intermediaries retrieval.""" - mock_intermediaries = [ - { - "intermediary_id": "11000001", - "name": "Test Intermediary", - "entity_count": 100 - } - ] - patched_graph_service.get_top_intermediaries.return_value = mock_intermediaries - - response = client.get("/api/networks/intermediaries/top?limit=10") - assert response.status_code == 200 - data = response.json() - assert isinstance(data, list) - assert len(data) == 1 - assert data[0]["intermediary_id"] == "11000001" - assert data[0]["intermediary_name"] == "Test Intermediary" - assert data[0]["entity_count"] == 100 - patched_graph_service.get_top_intermediaries.assert_called_once_with(limit=10) - - def test_get_top_intermediaries_default_limit(self, client: TestClient, patched_graph_service): - """Test top intermediaries with default limit.""" - patched_graph_service.get_top_intermediaries.return_value = [] - - response = client.get("/api/networks/intermediaries/top") - assert response.status_code == 200 - patched_graph_service.get_top_intermediaries.assert_called_once_with(limit=10) - - def test_get_top_intermediaries_invalid_limit(self, client: TestClient): - """Test top intermediaries with invalid limit.""" - response = client.get("/api/networks/intermediaries/top?limit=0") - assert response.status_code == 422 - - response = client.get("/api/networks/intermediaries/top?limit=101") - assert response.status_code == 422 - - def test_get_top_intermediaries_service_error(self, client: TestClient, patched_graph_service): - """Test top intermediaries endpoint handles service errors.""" - patched_graph_service.get_top_intermediaries.side_effect = Exception("Database error") - - response = client.get("/api/networks/intermediaries/top") - assert response.status_code == 500 - 
assert "Error finding top intermediaries" in response.json()["detail"] - - def test_get_red_flags_success(self, client: TestClient, patched_graph_service): - """Test successful red flags retrieval.""" - mock_red_flags = [ - { - "address_id": "24000001", - "address": "Test Address", - "entity_count": 50, - "sample_entities": ["Entity 1", "Entity 2", "Entity 3"] - } - ] - patched_graph_service.detect_red_flags.return_value = mock_red_flags - - response = client.get("/api/networks/redflags?min_entities=10&limit=50") - assert response.status_code == 200 - data = response.json() - assert isinstance(data, list) - assert len(data) == 1 - assert data[0]["address_id"] == "24000001" - assert data[0]["address"] == "Test Address" - assert data[0]["entity_count"] == 50 - assert len(data[0]["entities"]) == 3 - patched_graph_service.detect_red_flags.assert_called_once_with( - min_entities=10, limit=50 - ) - - def test_get_red_flags_default_params(self, client: TestClient, patched_graph_service): - """Test red flags with default parameters.""" - patched_graph_service.detect_red_flags.return_value = [] - - response = client.get("/api/networks/redflags") - assert response.status_code == 200 - patched_graph_service.detect_red_flags.assert_called_once_with( - min_entities=2, limit=50 - ) - - def test_get_red_flags_invalid_params(self, client: TestClient): - """Test red flags with invalid parameters.""" - response = client.get("/api/networks/redflags?min_entities=1") - assert response.status_code == 422 # min_entities must be >= 2 - - response = client.get("/api/networks/redflags?limit=0") - assert response.status_code == 422 - - def test_get_red_flags_service_error(self, client: TestClient, patched_graph_service): - """Test red flags endpoint handles service errors.""" - patched_graph_service.detect_red_flags.side_effect = Exception("Database error") - - response = client.get("/api/networks/redflags") - assert response.status_code == 500 - assert "Error finding red flags" in 
response.json()["detail"] - - def test_get_shortest_path_success(self, client: TestClient, patched_graph_service): - """Test successful shortest path retrieval.""" - mock_path = { - "nodes": [ - { - "id": "10000001", - "labels": ["Entity"], - "properties": {"name": "Start Entity", "id": "10000001"} - }, - { - "id": "10000002", - "labels": ["Entity"], - "properties": {"name": "End Entity", "id": "10000002"} - } - ], - "relationships": [ - { - "type": "OFFICER_OF", - "start_node": "10000001", - "end_node": "10000002", - "properties": {} - } - ], - "length": 1 - } - patched_graph_service.get_shortest_path.return_value = mock_path - - response = client.get( - "/api/networks/path/shortest?start_node_id=10000001&end_node_id=10000002" - ) - assert response.status_code == 200 - data = response.json() - assert "nodes" in data - assert "relationships" in data - assert "length" in data - assert len(data["nodes"]) == 2 - patched_graph_service.get_shortest_path.assert_called_once_with( - "10000001", "10000002" - ) - - def test_get_shortest_path_no_path(self, client: TestClient, patched_graph_service): - """Test shortest path when no path exists.""" - patched_graph_service.get_shortest_path.return_value = None - - response = client.get( - "/api/networks/path/shortest?start_node_id=10000001&end_node_id=99999999" - ) - assert response.status_code == 200 - assert response.json() is None - - def test_get_shortest_path_missing_params(self, client: TestClient): - """Test shortest path with missing required parameters.""" - response = client.get("/api/networks/path/shortest?start_node_id=10000001") - assert response.status_code == 422 # Missing end_node_id - - response = client.get("/api/networks/path/shortest?end_node_id=10000002") - assert response.status_code == 422 # Missing start_node_id - - def test_get_shortest_path_service_error(self, client: TestClient, patched_graph_service): - """Test shortest path endpoint handles service errors.""" - 
patched_graph_service.get_shortest_path.side_effect = Exception("Database error") - - response = client.get( - "/api/networks/path/shortest?start_node_id=10000001&end_node_id=10000002" - ) - assert response.status_code == 500 - assert "Error finding shortest path" in response.json()["detail"] - - def test_get_most_connected_officers_success(self, client: TestClient, patched_graph_service): - """Test successful most connected officers retrieval.""" - mock_officers = [ - { - "officer_id": "12000001", - "name": "John Doe", - "degree": 50 - }, - { - "officer_id": "12000002", - "name": "Jane Smith", - "degree": 45 - } - ] - patched_graph_service.get_most_connected_officers.return_value = mock_officers - - response = client.get("/api/networks/stats/centrality?limit=10") - assert response.status_code == 200 - data = response.json() - assert isinstance(data, list) - assert len(data) == 2 - assert data[0]["officer_id"] == "12000001" - assert data[0]["name"] == "John Doe" - assert data[0]["degree"] == 50 - patched_graph_service.get_most_connected_officers.assert_called_once_with(limit=10) - - def test_get_most_connected_officers_default_limit(self, client: TestClient, patched_graph_service): - """Test most connected officers with default limit.""" - patched_graph_service.get_most_connected_officers.return_value = [] - - response = client.get("/api/networks/stats/centrality") - assert response.status_code == 200 - patched_graph_service.get_most_connected_officers.assert_called_once_with(limit=10) - - def test_get_most_connected_officers_invalid_limit(self, client: TestClient): - """Test most connected officers with invalid limit.""" - response = client.get("/api/networks/stats/centrality?limit=0") - assert response.status_code == 422 - - response = client.get("/api/networks/stats/centrality?limit=101") - assert response.status_code == 422 - - def test_get_most_connected_officers_service_error(self, client: TestClient, patched_graph_service): - """Test most connected officers 
endpoint handles service errors.""" - patched_graph_service.get_most_connected_officers.side_effect = Exception("Database error") - - response = client.get("/api/networks/stats/centrality") - assert response.status_code == 500 - assert "Error finding most connected officers" in response.json()["detail"] diff --git a/Makefile b/Makefile index 8df5c7e..3b137f4 100644 --- a/Makefile +++ b/Makefile @@ -1,38 +1,353 @@ -.PHONY: help run docker-build docker-run clean venv install lint format tree seed test +# ============================================ +# Panama Papers Neo4j Project - Makefile +# ============================================ +# This Makefile provides commands for development, testing, and deployment +# Usage: make <target> +# Run 'make help' to see all available commands -TAG ?= graph-api:dev +# ============================================ +# Variables +# ============================================ +PYTHON := python3 +PIP := pip3 +VENV := .venv +VENV_BIN := $(VENV)/bin +IMAGE_NAME := panama-papers-api +IMAGE_TAG := latest +COMPOSE := docker-compose +APP_DIR := app +TEST_DIR := tests +DATA_DIR := data -help: - @echo "Commands:" - @echo " make venv Create local virtualenv" - @echo " make install Install requirements" - @echo " make run Run FastAPI locally" - @echo " make docker-run Run Docker (Neo4j + API)" - @echo " make seed Run the data ingestion script" - @echo " make test Run test suite" - @echo " make lint Run pylint (fails if score < 9.5)" +# Colors for output +COLOR_RESET := \033[0m +COLOR_BOLD := \033[1m +COLOR_GREEN := \033[32m +COLOR_YELLOW := \033[33m +COLOR_BLUE := \033[34m -venv: - python3 -m venv .venv +# ============================================ +# PHONY Targets +# ============================================ +.PHONY: help venv install install-dev run run-dev test test-unit test-integration \ + lint format format-check clean clean-all \ + docker-build docker-run docker-stop docker-restart docker-logs docker-clean \ + docker-shell
docker-shell-neo4j docker-ps docker-pull \ + seed-db backup-db restore-db \ + env-check env-create docs docs-serve \ + pre-commit ci-test security-check -install: venv - . .venv/bin/activate && pip install -r requirements.txt +# ============================================ +# Default Target +# ============================================ +.DEFAULT_GOAL := help -run: - . .venv/bin/activate && uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload +help: ## Display this help message + @echo "$(COLOR_BOLD)Panama Papers Neo4j Project - Available Commands$(COLOR_RESET)" + @echo "" + @echo "$(COLOR_BLUE)Development:$(COLOR_RESET)" + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | \ + awk 'BEGIN {FS = ":.*?## "}; {printf " $(COLOR_GREEN)%-20s$(COLOR_RESET) %s\n", $$1, $$2}' | \ + grep -E "venv|install|run|test|lint|format|clean" + @echo "" + @echo "$(COLOR_BLUE)Docker:$(COLOR_RESET)" + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | \ + awk 'BEGIN {FS = ":.*?## "}; {printf " $(COLOR_GREEN)%-20s$(COLOR_RESET) %s\n", $$1, $$2}' | \ + grep -E "docker-" + @echo "" + @echo "$(COLOR_BLUE)Database:$(COLOR_RESET)" + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | \ + awk 'BEGIN {FS = ":.*?## "}; {printf " $(COLOR_GREEN)%-20s$(COLOR_RESET) %s\n", $$1, $$2}' | \ + grep -E "seed|backup|restore" + @echo "" + @echo "$(COLOR_BLUE)Utilities:$(COLOR_RESET)" + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | \ + awk 'BEGIN {FS = ":.*?## "}; {printf " $(COLOR_GREEN)%-20s$(COLOR_RESET) %s\n", $$1, $$2}' | \ + grep -E "env-|docs|pre-commit|security" + @echo "" -docker-build: - docker build -t $(TAG) . 
+# ============================================ +# Environment Setup +# ============================================ +venv: ## Create Python virtual environment + @echo "$(COLOR_BLUE)Creating virtual environment...$(COLOR_RESET)" + $(PYTHON) -m venv $(VENV) + @echo "$(COLOR_GREEN)Virtual environment created at $(VENV)$(COLOR_RESET)" + @echo "$(COLOR_YELLOW)Activate with: source $(VENV_BIN)/activate$(COLOR_RESET)" -docker-run: - docker-compose up --build +install: ## Install production dependencies + @echo "$(COLOR_BLUE)Installing dependencies...$(COLOR_RESET)" + $(PIP) install --upgrade pip + $(PIP) install -r requirements.txt + @echo "$(COLOR_GREEN)Dependencies installed successfully$(COLOR_RESET)" -seed: - docker-compose exec api python scripts/seed_data.py +install-dev: install ## Install development dependencies + @echo "$(COLOR_BLUE)Installing development dependencies...$(COLOR_RESET)" + $(PIP) install pytest pytest-asyncio pytest-cov + $(PIP) install black isort pylint mypy flake8 + $(PIP) install httpx # For testing FastAPI + @echo "$(COLOR_GREEN)Development dependencies installed$(COLOR_RESET)" -test: - pytest tests/ -v --cov=app --cov-report=term-missing +env-check: ## Check if .env file exists + @if [ ! -f .env ]; then \ + echo "$(COLOR_YELLOW)Warning: .env file not found$(COLOR_RESET)"; \ + echo "$(COLOR_YELLOW)Run 'make env-create' to create from template$(COLOR_RESET)"; \ + exit 1; \ + fi + @echo "$(COLOR_GREEN).env file found$(COLOR_RESET)" -lint: - python scripts/check_pylint_score.py +env-create: ## Create .env from .env.example + @if [ -f .env ]; then \ + echo "$(COLOR_YELLOW).env already exists. 
Skipping...$(COLOR_RESET)"; \ + else \ + cp .env.example .env; \ + echo "$(COLOR_GREEN).env created from .env.example$(COLOR_RESET)"; \ + echo "$(COLOR_YELLOW)Please edit .env and update the values$(COLOR_RESET)"; \ + fi +# ============================================ +# Running the Application +# ============================================ +run: env-check ## Run FastAPI server locally (production mode) + @echo "$(COLOR_BLUE)Starting FastAPI server...$(COLOR_RESET)" + uvicorn $(APP_DIR).main:app --host 0.0.0.0 --port 8000 + +run-dev: env-check ## Run FastAPI server with auto-reload (development mode) + @echo "$(COLOR_BLUE)Starting FastAPI server in development mode...$(COLOR_RESET)" + uvicorn $(APP_DIR).main:app --host 0.0.0.0 --port 8000 --reload + +# ============================================ +# Testing +# ============================================ +test: ## Run all tests with coverage + @echo "$(COLOR_BLUE)Running tests with coverage...$(COLOR_RESET)" + pytest $(TEST_DIR)/ -v --cov=$(APP_DIR) --cov-report=html --cov-report=term-missing + @echo "$(COLOR_GREEN)Tests completed. 
Coverage report: htmlcov/index.html$(COLOR_RESET)" + +test-unit: ## Run unit tests only + @echo "$(COLOR_BLUE)Running unit tests...$(COLOR_RESET)" + pytest $(TEST_DIR)/unit/ -v + +test-integration: ## Run integration tests only + @echo "$(COLOR_BLUE)Running integration tests...$(COLOR_RESET)" + pytest $(TEST_DIR)/integration/ -v + +test-watch: ## Run tests in watch mode + @echo "$(COLOR_BLUE)Running tests in watch mode...$(COLOR_RESET)" + pytest-watch $(TEST_DIR)/ -v + +ci-test: ## Run tests for CI/CD (no coverage report) + @echo "$(COLOR_BLUE)Running CI tests...$(COLOR_RESET)" + pytest $(TEST_DIR)/ -v --maxfail=1 --tb=short + +# ============================================ +# Code Quality +# ============================================ +lint: ## Run pylint on app directory (non-blocking: findings never fail the target) + @echo "$(COLOR_BLUE)Running pylint...$(COLOR_RESET)" + pylint $(APP_DIR)/ --rcfile=.pylintrc || true + +format: ## Format code with black and isort + @echo "$(COLOR_BLUE)Formatting code with black...$(COLOR_RESET)" + black $(APP_DIR)/ $(TEST_DIR)/ + @echo "$(COLOR_BLUE)Sorting imports with isort...$(COLOR_RESET)" + isort $(APP_DIR)/ $(TEST_DIR)/ + @echo "$(COLOR_GREEN)Code formatted successfully$(COLOR_RESET)" + +format-check: ## Check code formatting without modifying + @echo "$(COLOR_BLUE)Checking code format...$(COLOR_RESET)" + black --check $(APP_DIR)/ $(TEST_DIR)/ + isort --check-only $(APP_DIR)/ $(TEST_DIR)/ + +type-check: ## Run mypy type checking + @echo "$(COLOR_BLUE)Running type checks...$(COLOR_RESET)" + mypy $(APP_DIR)/ + +security-check: ## Run security checks with bandit and safety + @echo "$(COLOR_BLUE)Running security checks...$(COLOR_RESET)" + pip install bandit safety + bandit -r $(APP_DIR)/ + safety check + +pre-commit: format lint test ## Run all pre-commit checks + @echo "$(COLOR_GREEN)All pre-commit checks passed!$(COLOR_RESET)" + +# ============================================ +# Docker Commands +# ============================================ +docker-build: ## Build Docker images +
@echo "$(COLOR_BLUE)Building Docker images...$(COLOR_RESET)" + $(COMPOSE) build + @echo "$(COLOR_GREEN)Docker images built successfully$(COLOR_RESET)" + +docker-run: env-check ## Start all services with docker-compose + @echo "$(COLOR_BLUE)Starting Docker services...$(COLOR_RESET)" + $(COMPOSE) up -d + @echo "$(COLOR_GREEN)Services started successfully$(COLOR_RESET)" + @echo "$(COLOR_YELLOW)API: http://localhost:8000$(COLOR_RESET)" + @echo "$(COLOR_YELLOW)Docs: http://localhost:8000/docs$(COLOR_RESET)" + @echo "$(COLOR_YELLOW)Neo4j Browser: http://localhost:7474$(COLOR_RESET)" + +docker-stop: ## Stop all Docker services + @echo "$(COLOR_BLUE)Stopping Docker services...$(COLOR_RESET)" + $(COMPOSE) down + @echo "$(COLOR_GREEN)Services stopped$(COLOR_RESET)" + +docker-restart: ## Restart all Docker services + @echo "$(COLOR_BLUE)Restarting Docker services...$(COLOR_RESET)" + $(COMPOSE) restart + @echo "$(COLOR_GREEN)Services restarted$(COLOR_RESET)" + +docker-logs: ## View logs from all services (follow mode) + @echo "$(COLOR_BLUE)Showing Docker logs (Ctrl+C to exit)...$(COLOR_RESET)" + $(COMPOSE) logs -f + +docker-logs-api: ## View FastAPI logs only + @echo "$(COLOR_BLUE)Showing FastAPI logs (Ctrl+C to exit)...$(COLOR_RESET)" + $(COMPOSE) logs -f fastapi + +docker-logs-neo4j: ## View Neo4j logs only + @echo "$(COLOR_BLUE)Showing Neo4j logs (Ctrl+C to exit)...$(COLOR_RESET)" + $(COMPOSE) logs -f neo4j + +docker-ps: ## Show status of all Docker services + @echo "$(COLOR_BLUE)Docker services status:$(COLOR_RESET)" + $(COMPOSE) ps + +docker-shell: ## Open shell in FastAPI container + @echo "$(COLOR_BLUE)Opening shell in FastAPI container...$(COLOR_RESET)" + $(COMPOSE) exec fastapi bash + +docker-shell-neo4j: ## Open Neo4j Cypher shell + @echo "$(COLOR_BLUE)Opening Neo4j Cypher shell...$(COLOR_RESET)" + $(COMPOSE) exec neo4j cypher-shell -u neo4j -p $${NEO4J_PASSWORD:-testpassword123} + +docker-clean: ## Remove all containers, volumes, and networks + @echo 
"$(COLOR_YELLOW)Warning: This will remove all data!$(COLOR_RESET)" + @read -p "Are you sure? [y/N] " -n 1 -r; \ + echo; \ + if [[ $$REPLY =~ ^[Yy]$$ ]]; then \ + echo "$(COLOR_BLUE)Cleaning up Docker resources...$(COLOR_RESET)"; \ + $(COMPOSE) down -v --remove-orphans; \ + echo "$(COLOR_GREEN)Docker resources cleaned$(COLOR_RESET)"; \ + fi + +docker-pull: ## Pull latest Docker images + @echo "$(COLOR_BLUE)Pulling latest images...$(COLOR_RESET)" + $(COMPOSE) pull + @echo "$(COLOR_GREEN)Images updated$(COLOR_RESET)" + +docker-rebuild: docker-stop docker-build docker-run ## Rebuild and restart all services + +# ============================================ +# Database Management +# ============================================ +seed-db: env-check ## Run database seeding script + @echo "$(COLOR_BLUE)Seeding database...$(COLOR_RESET)" + $(PYTHON) scripts/seed_database.py + @echo "$(COLOR_GREEN)Database seeded successfully$(COLOR_RESET)" + +backup-db: ## Backup Neo4j database + @echo "$(COLOR_BLUE)Creating database backup...$(COLOR_RESET)" + @mkdir -p backups + $(COMPOSE) exec neo4j neo4j-admin dump --database=neo4j --to=/backups/neo4j-backup-$$(date +%Y%m%d-%H%M%S).dump + @echo "$(COLOR_GREEN)Backup created in backups/ directory$(COLOR_RESET)" + +restore-db: ## Restore Neo4j database from backup + @echo "$(COLOR_YELLOW)Available backups:$(COLOR_RESET)" + @ls -lh backups/*.dump 2>/dev/null || echo "No backups found" + @read -p "Enter backup filename: " backup_file; \ + if [ -f "backups/$$backup_file" ]; then \ + echo "$(COLOR_BLUE)Restoring from $$backup_file...$(COLOR_RESET)"; \ + $(COMPOSE) exec neo4j neo4j-admin load --from=/backups/$$backup_file --database=neo4j --force; \ + echo "$(COLOR_GREEN)Database restored$(COLOR_RESET)"; \ + else \ + echo "$(COLOR_YELLOW)Backup file not found$(COLOR_RESET)"; \ + fi + +db-reset: docker-clean docker-run seed-db ## Reset database (clean, restart, seed) + @echo "$(COLOR_GREEN)Database reset complete$(COLOR_RESET)" + +# 
============================================ +# Cleanup +# ============================================ +clean: ## Clean Python cache and test artifacts + @echo "$(COLOR_BLUE)Cleaning Python cache files...$(COLOR_RESET)" + find . -type d -name __pycache__ -exec rm -rf {} + 2>/dev/null || true + find . -type f -name "*.pyc" -delete + find . -type f -name "*.pyo" -delete + find . -type f -name "*.pyd" -delete + find . -type d -name "*.egg-info" -exec rm -rf {} + 2>/dev/null || true + find . -type d -name "*.egg" -exec rm -rf {} + 2>/dev/null || true + rm -rf .pytest_cache .coverage htmlcov .mypy_cache .ruff_cache + rm -rf build dist + @echo "$(COLOR_GREEN)Cleanup complete$(COLOR_RESET)" + +clean-all: clean docker-clean ## Clean everything including Docker resources + @echo "$(COLOR_BLUE)Removing virtual environment...$(COLOR_RESET)" + rm -rf $(VENV) + @echo "$(COLOR_GREEN)Full cleanup complete$(COLOR_RESET)" + +# ============================================ +# Documentation +# ============================================ +docs: ## Generate API documentation + @echo "$(COLOR_BLUE)Generating documentation...$(COLOR_RESET)" + @mkdir -p docs + $(PYTHON) scripts/generate_docs.py + @echo "$(COLOR_GREEN)Documentation generated in docs/$(COLOR_RESET)" + +docs-serve: ## Serve documentation locally + @echo "$(COLOR_BLUE)Starting documentation server...$(COLOR_RESET)" + @echo "$(COLOR_YELLOW)Documentation available at http://localhost:8001$(COLOR_RESET)" + cd docs && $(PYTHON) -m http.server 8001 + +# ============================================ +# Utilities +# ============================================ +check-deps: ## Check for outdated dependencies + @echo "$(COLOR_BLUE)Checking for outdated dependencies...$(COLOR_RESET)" + $(PIP) list --outdated + +update-deps: ## Update dependencies to latest versions + @echo "$(COLOR_BLUE)Updating dependencies...$(COLOR_RESET)" + $(PIP) install --upgrade -r requirements.txt + @echo "$(COLOR_GREEN)Dependencies updated$(COLOR_RESET)" + 
+freeze-deps: ## Freeze current dependencies to requirements.txt + @echo "$(COLOR_BLUE)Freezing dependencies...$(COLOR_RESET)" + $(PIP) freeze > requirements.txt + @echo "$(COLOR_GREEN)Dependencies frozen to requirements.txt$(COLOR_RESET)" + +health-check: ## Check health of all services + @echo "$(COLOR_BLUE)Checking service health...$(COLOR_RESET)" + @echo "$(COLOR_YELLOW)API Health:$(COLOR_RESET)" + @curl -f http://localhost:8000/health 2>/dev/null && echo " โœ“" || echo " โœ—" + @echo "$(COLOR_YELLOW)Neo4j Health:$(COLOR_RESET)" + @curl -f http://localhost:7474 2>/dev/null && echo " โœ“" || echo " โœ—" + +setup: venv install-dev env-create docker-build ## Complete project setup (first time) + @echo "$(COLOR_GREEN)Project setup complete!$(COLOR_RESET)" + @echo "$(COLOR_YELLOW)Next steps:$(COLOR_RESET)" + @echo " 1. Activate virtual environment: source $(VENV_BIN)/activate" + @echo " 2. Edit .env with your configuration" + @echo " 3. Start services: make docker-run" + @echo " 4. Visit: http://localhost:8000/docs" + +status: ## Show project status + @echo "$(COLOR_BOLD)Panama Papers Neo4j Project Status$(COLOR_RESET)" + @echo "" + @echo "$(COLOR_BLUE)Environment:$(COLOR_RESET)" + @if [ -d $(VENV) ]; then \ + echo " Virtual environment: $(COLOR_GREEN)โœ“ Created$(COLOR_RESET)"; \ + else \ + echo " Virtual environment: $(COLOR_YELLOW)โœ— Not created$(COLOR_RESET)"; \ + fi + @if [ -f .env ]; then \ + echo " .env file: $(COLOR_GREEN)โœ“ Present$(COLOR_RESET)"; \ + else \ + echo " .env file: $(COLOR_YELLOW)โœ— Missing$(COLOR_RESET)"; \ + fi + @echo "" + @echo "$(COLOR_BLUE)Docker Services:$(COLOR_RESET)" + @$(COMPOSE) ps 2>/dev/null || echo " $(COLOR_YELLOW)Services not running$(COLOR_RESET)" \ No newline at end of file diff --git a/README.md b/README.md index a9b9ad4..cc7c92c 100644 --- a/README.md +++ b/README.md @@ -1,353 +1,945 @@ -# Offshore Graph Analysis API +# Panama Papers Offshore Financial Network Analysis + +A Neo4j-powered API for analyzing beneficial 
ownership structures in the ICIJ Panama Papers dataset. +Enables investigation of offshore financial networks, intermediaries, and complex business ownership patterns. + +**Data Source:** [ICIJ Offshore Leaks Database](https://offshoreleaks.icij.org/) + +--- + +## Features + +- 🔍 **Beneficial Ownership Tracing** — Follow ownership chains through complex corporate structures up to 6 hops deep +- 📊 **Network Analysis** — Identify influential entities using PageRank, betweenness centrality, and degree metrics +- 🏘️ **Community Detection** — Discover connected business networks using Louvain clustering and label propagation +- 🚩 **Red Flag Detection** — Automatic identification of suspicious patterns (deep layering, PEP connections, circular ownership) +- ⚡ **Fast Query Performance** — Optimized Cypher queries with proper indexing and bounded variable-length paths +- 🔐 **REST API** — FastAPI with OpenAPI/Swagger documentation and Pydantic validation +- 🐳 **Containerized Deployment** — Docker Compose setup with Neo4j, FastAPI, and optional Nginx reverse proxy + +--- + +## Technology Stack + +| Component | Technology | Version | +|-----------|------------|---------| +| **Database** | Neo4j Enterprise/Community | 5.17+ | +| **Graph Analytics** | Neo4j Graph Data Science (GDS) | 2.6+ | +| **Backend Framework** | FastAPI | 0.109+ | +| **Runtime** | Python | 3.11+ | +| **Async Driver** | neo4j-driver | 5.17+ | +| **Validation** | Pydantic | 2.6+ | +| **Testing** | Pytest + pytest-asyncio | 8.0+ | +| **HTTP Client** | HTTPX | 0.27+ | +| **Containerization** | Docker + Docker Compose | 24.0+ | +| **Reverse Proxy** | Nginx | 1.25+ | +| **Code Quality** | Pylint, Black, isort | Latest | + +--- + +## Quick Start + +### Prerequisites + +- **Docker** & **Docker Compose** (v2.0+) +- **Git** +- (Optional) **Python 3.11+** for local development +- (Optional) **Make** for build automation + +### Installation + +1.
**Clone repository:** + ```bash + git clone https://github.com/yourteam/panama-papers-analysis.git + cd panama-papers-analysis + ``` + +2. **Set up environment:** + ```bash + cp .env.example .env + ``` + + Edit `.env` with your configuration: + ```env + # Neo4j Configuration + NEO4J_URI=bolt://neo4j:7687 + NEO4J_USER=neo4j + NEO4J_PASSWORD=your_secure_password + NEO4J_DATABASE=neo4j + + # API Configuration + API_ENV=development + FASTAPI_HOST=0.0.0.0 + FASTAPI_PORT=8000 + ``` + +3. **Start services:** + ```bash + docker-compose up -d + ``` + + Or using Make: + ```bash + make docker-run + ``` + +4. **Wait for Neo4j to initialize** (first run takes ~60 seconds): + ```bash + docker-compose logs -f neo4j + # Wait for: "Started." + ``` + +5. **Verify setup:** + ```bash + # Health check + curl http://localhost:8000/health + + # Swagger UI + open http://localhost:8000/docs + ``` + +6. **Import ICIJ data (optional):** + ```bash + # Download the CSV archive linked from https://offshoreleaks.icij.org/pages/database + # and extract its CSV files into ./data/ before running the import + + # Run import script + make seed-db + # Or: python scripts/seeddata.py --data-dir ./data + ``` + +### Verify Everything is Working -A FastAPI-based API for analyzing offshore financial data from ICIJ Offshore Leaks using Neo4j graph database. This application enables detection of beneficial ownership patterns, money laundering risks, and suspicious network structures in offshore financial networks.
- -## Architecture - -```mermaid -graph LR - A[Client] -->|HTTP Requests| B[FastAPI] - B -->|Cypher Queries| C[Neo4j Database] - C -->|Graph Results| B - B -->|JSON Responses| A - - D[ICIJ CSV Files] -->|ETL Script| C +```bash +# Check API health +curl -s http://localhost:8000/health | jq + +# Expected output: +{ + "status": "healthy", + "checks": { + "api": true, + "neo4j": true + }, + "neo4j": { + "connected": true, + "version": "5.17.0", + "edition": "community" + } +} ``` -## Tech Stack - -- **Python 3.11+** - Programming language -- **FastAPI** - Modern, fast web framework for building APIs -- **Neo4j 5 Community** - Graph database for relationship analysis -- **Docker & Docker Compose** - Containerization and orchestration -- **Pandas** - Data processing for ETL operations -- **Pydantic** - Data validation and settings management -- **Pytest** - Testing framework +--- ## Project Structure ``` -project-offshore-graph/ -โ”œโ”€โ”€ docker-compose.yml # Docker orchestration (Neo4j + FastAPI) -โ”œโ”€โ”€ Dockerfile # FastAPI container definition -โ”œโ”€โ”€ .env.example # Environment variables template -โ”œโ”€โ”€ .gitignore # Git ignore rules -โ”œโ”€โ”€ Makefile # Development commands -โ”œโ”€โ”€ README.md # This file -โ”œโ”€โ”€ requirements.txt # Python dependencies -โ”œโ”€โ”€ graph_eda.ipynb # Jupyter notebook for EDA +panama-papers-analysis/ +โ”œโ”€โ”€ app/ +โ”‚ โ”œโ”€โ”€ __init__.py +โ”‚ โ”œโ”€โ”€ main.py # FastAPI application factory +โ”‚ โ”œโ”€โ”€ database.py # Neo4j async driver & connection management +โ”‚ โ”œโ”€โ”€ models.py # Pydantic models for request/response validation +โ”‚ โ””โ”€โ”€ routes/ +โ”‚ โ”œโ”€โ”€ __init__.py +โ”‚ โ”œโ”€โ”€ entities.py # Entity CRUD & search endpoints +โ”‚ โ””โ”€โ”€ network.py # Network analysis endpoints โ”‚ -โ”œโ”€โ”€ Data/ # ICIJ CSV data files (gitignored) -โ”‚ โ”œโ”€โ”€ nodes-entities.csv -โ”‚ โ”œโ”€โ”€ nodes-officers.csv -โ”‚ โ”œโ”€โ”€ nodes-intermediaries.csv -โ”‚ โ”œโ”€โ”€ nodes-addresses.csv -โ”‚ โ””โ”€โ”€ 
relationships.csv +โ”œโ”€โ”€ scripts/ +โ”‚ โ”œโ”€โ”€ panama-papers-schema.cypher # Neo4j schema (constraints, indexes) +โ”‚ โ”œโ”€โ”€ seeddata.py # ICIJ CSV data import script +โ”‚ โ”œโ”€โ”€ gds_setup.cypher # GDS algorithm configuration +โ”‚ โ””โ”€โ”€ analysis_queries.cypher # Example investigative queries โ”‚ -โ”œโ”€โ”€ app/ # Main application code +โ”œโ”€โ”€ tests/ โ”‚ โ”œโ”€โ”€ __init__.py -โ”‚ โ”œโ”€โ”€ main.py # FastAPI application entry point -โ”‚ โ”œโ”€โ”€ config.py # Configuration management (Pydantic) -โ”‚ โ”œโ”€โ”€ database.py # Neo4j connection manager (Singleton) -โ”‚ โ”‚ -โ”‚ โ”œโ”€โ”€ models/ # Pydantic response models -โ”‚ โ”‚ โ””โ”€โ”€ __init__.py # API response schemas -โ”‚ โ”‚ -โ”‚ โ”œโ”€โ”€ routers/ # API endpoint handlers -โ”‚ โ”‚ โ”œโ”€โ”€ __init__.py -โ”‚ โ”‚ โ”œโ”€โ”€ entities.py # Entity-related endpoints -โ”‚ โ”‚ โ””โ”€โ”€ networks.py # Network analysis endpoints -โ”‚ โ”‚ -โ”‚ โ””โ”€โ”€ services/ # Business logic layer -โ”‚ โ”œโ”€โ”€ __init__.py -โ”‚ โ””โ”€โ”€ graph_service.py # Cypher query service +โ”‚ โ”œโ”€โ”€ conftest.py # Pytest fixtures & async configuration +โ”‚ โ”œโ”€โ”€ test_entities.py # Entity endpoint tests +โ”‚ โ””โ”€โ”€ test_network.py # Network analysis tests โ”‚ -โ”œโ”€โ”€ scripts/ # Utility scripts -โ”‚ โ””โ”€โ”€ seed_data.py # ETL script (CSV โ†’ Neo4j) +โ”œโ”€โ”€ docs/ +โ”‚ โ”œโ”€โ”€ neo4j-panama-papers-schema.md # Graph schema documentation +โ”‚ โ””โ”€โ”€ panama-papers-system-architecture.md # System architecture โ”‚ -โ””โ”€โ”€ tests/ # Test suite - โ”œโ”€โ”€ __init__.py - โ””โ”€โ”€ test_api.py # API endpoint tests +โ”œโ”€โ”€ docker/ +โ”‚ โ”œโ”€โ”€ Dockerfile # FastAPI container +โ”‚ โ”œโ”€โ”€ Dockerfile.neo4j # Custom Neo4j with GDS plugins +โ”‚ โ””โ”€โ”€ nginx.conf # Nginx reverse proxy config +โ”‚ +โ”œโ”€โ”€ docker-compose.yml # Multi-service orchestration +โ”œโ”€โ”€ docker-compose.prod.yml # Production overrides +โ”œโ”€โ”€ Makefile # Build automation +โ”œโ”€โ”€ requirements.txt # Production dependencies +โ”œโ”€โ”€ 
requirements-dev.txt # Development dependencies +โ”œโ”€โ”€ pyproject.toml # Project metadata & tool config +โ”œโ”€โ”€ .env.example # Environment template +โ”œโ”€โ”€ .pylintrc # Pylint configuration +โ”œโ”€โ”€ .gitignore +โ”œโ”€โ”€ LICENSE +โ””โ”€โ”€ README.md # This file ``` -## Prerequisites +### Directory Descriptions -1. **Docker & Docker Compose** installed on your system -2. **ICIJ CSV Files** placed in the `Data/` directory: - - `nodes-entities.csv` - - `nodes-officers.csv` - - `nodes-intermediaries.csv` - - `nodes-addresses.csv` - - `relationships.csv` +| Directory | Purpose | +|-----------|---------| +| `app/` | FastAPI application code (routes, models, database) | +| `scripts/` | Database setup, data import, and analysis scripts | +| `tests/` | Pytest test suite with async fixtures | +| `docs/` | Architecture and schema documentation | +| `docker/` | Container configurations | -## Setup Instructions +--- -### Step 1: Prepare Data Files +## API Documentation -Place your ICIJ CSV files in the `Data/` directory (or `data/` - both are supported). +### Base URL -### Step 2: Build and Run Services +``` +http://localhost:8000 +``` -```bash -# Build Docker images -make docker-build +### Interactive Documentation -# Start Neo4j and FastAPI services -make docker-run +| URL | Description | +|-----|-------------| +| `/docs` | Swagger UI (interactive) | +| `/redoc` | ReDoc (readable) | +| `/openapi.json` | OpenAPI 3.0 schema | -# Or use docker-compose directly: -docker-compose up --build -``` +--- + +### Endpoints Reference -This will start: -- **Neo4j** on `http://localhost:7474` (Browser) and `bolt://localhost:7687` (Bolt) -- **FastAPI** on `http://localhost:8000` +#### Health & Status -### Step 3: Load Data into Neo4j +##### GET /health +System health check with Neo4j connectivity status. 
```bash -# Run the ETL script to load CSV data -make seed +curl -X GET "http://localhost:8000/health" +``` -# Or manually: -python scripts/seed_data.py +**Response:** +```json +{ + "status": "healthy", + "timestamp": "2024-01-15T10:30:00Z", + "version": "1.0.0", + "checks": { + "api": true, + "neo4j": true + }, + "neo4j": { + "connected": true, + "version": "5.17.0", + "latency_ms": 12.5, + "gds_available": true + } +} ``` -The script will: -- Process CSV files in chunks (10,000 rows at a time) -- Create nodes: Entity, Officer, Intermediary, Address, Jurisdiction -- Create relationships: OFFICER_OF, INTERMEDIARY_OF, REGISTERED_ADDRESS, REGISTERED_IN -- Display summary statistics upon completion +--- -### Step 4: Verify Setup +#### Entity Operations -```bash -# Check API health -curl http://localhost:8000/health +##### GET /entities/{entity_id} +Retrieve entity details by ID. -# Access API documentation -open http://localhost:8000/docs +```bash +curl -X GET "http://localhost:8000/entities/10000001" \ + -H "Accept: application/json" ``` -## API Documentation - -### Base URL +**Response:** +```json +{ + "entity_id": "10000001", + "name": "DORADO MANAGEMENT INC.", + "jurisdiction_code": "PAN", + "entity_type": "Company", + "status": "Active", + "incorporation_date": "2005-03-15", + "pagerank_score": 0.0234, + "community_id": 142, + "degree_centrality": 8 +} ``` -http://localhost:8000 + +**Error Response (404):** +```json +{ + "status_code": 404, + "error": "Entity not found", + "detail": "Entity with ID '99999999' not found", + "timestamp": "2024-01-15T10:30:00Z" +} ``` -### Endpoints +--- -#### 1. Root & Health +##### GET /entities/search +Search entities by name with filters. 
-- **GET /** - Welcome message and API information -- **GET /health** - Health check with Neo4j connectivity status +**Parameters:** +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `q` | string | Yes | - | Search query (min 2 chars) | +| `jurisdiction` | string | No | - | Filter by jurisdiction code | +| `entity_type` | string | No | - | Filter by type (Company, Trust, etc.) | +| `status` | string | No | - | Filter by status (Active, Dissolved) | +| `limit` | integer | No | 20 | Results per page (1-100) | +| `offset` | integer | No | 0 | Pagination offset | -#### 2. Entity Endpoints +```bash +curl -X GET "http://localhost:8000/entities/search?q=Holdings&jurisdiction=BVI&limit=10" +``` -**GET `/api/entities/{entity_id}/ownership/paths`** +**Response:** +```json +{ + "query": "Holdings", + "total_results": 1547, + "results": [ + { + "node_id": "10000125", + "name": "ALPHA HOLDINGS LTD", + "node_type": "Entity", + "relevance_score": 0.95, + "jurisdiction_code": "BVI", + "status": "Active" + }, + { + "node_id": "10000892", + "name": "BETA HOLDINGS SA", + "node_type": "Entity", + "relevance_score": 0.88, + "jurisdiction_code": "BVI", + "status": "Active" + } + ], + "limit": 10, + "offset": 0, + "has_more": true, + "execution_time_ms": 45.2 +} +``` -Trace ownership paths from Officers to a specific Entity. +--- + +##### GET /entities/{entity_id}/ownership-path +Trace beneficial ownership chain to find ultimate owners. 
**Parameters:** -- `entity_id` (path) - The ID of the target entity -- `max_length` (query, optional) - Maximum path length (1-10, default: 5) +| Parameter | Type | Required | Default | Description | +|-----------|------|----------|---------|-------------| +| `max_depth` | integer | No | 4 | Maximum hops (1-6) | +| `min_depth` | integer | No | 1 | Minimum hops | +| `include_persons` | boolean | No | true | Include Person nodes | +| `only_active` | boolean | No | true | Only active relationships | +| `limit` | integer | No | 20 | Maximum paths to return | -**Example:** ```bash -curl "http://localhost:8000/api/entities/10000001/ownership/paths?max_length=4" +curl -X GET "http://localhost:8000/entities/10000001/ownership-path?max_depth=4&limit=10" ``` **Response:** ```json -[ - { - "nodes": [ - {"id": "12000001", "labels": ["Officer"], "properties": {...}}, - {"id": "10000001", "labels": ["Entity"], "properties": {...}} - ], - "relationships": [ - {"type": "OFFICER_OF", "start_node": "12000001", "end_node": "10000001", "properties": {}} - ], - "length": 1 - } -] +{ + "query": { + "source_entity_id": "10000001", + "max_depth": 4, + "min_depth": 1 + }, + "path_count": 3, + "paths": [ + { + "path_id": 1, + "depth": 3, + "nodes": [ + { + "node_id": "P-12345", + "name": "John Smith", + "node_type": "Person", + "jurisdiction_code": "USA", + "layer": 0, + "is_pep": false + }, + { + "node_id": "10000500", + "name": "Holding Corp A", + "node_type": "Entity", + "jurisdiction_code": "BVI", + "layer": 1 + }, + { + "node_id": "10000001", + "name": "DORADO MANAGEMENT INC.", + "node_type": "Entity", + "jurisdiction_code": "PAN", + "layer": 2 + } + ], + "edges": [ + { + "source_id": "P-12345", + "target_id": "10000500", + "relationship_type": "OWNS", + "ownership_percentage": 100.0, + "layer": 0 + }, + { + "source_id": "10000500", + "target_id": "10000001", + "relationship_type": "OWNS", + "ownership_percentage": 75.0, + "layer": 1 + } + ], + "effective_ownership": 75.0, + 
"risk_indicators": ["MULTI_JURISDICTION"] + } + ], + "average_depth": 2.5, + "unique_entities": 4, + "unique_persons": 2, + "pep_count": 0, + "tax_haven_count": 2, + "execution_time_ms": 125.8 +} ``` -#### 3. Network Analysis Endpoints - -**GET `/api/networks/intermediaries/top`** +--- -Find intermediaries connected to the most entities (network hubs). +##### GET /entities/{entity_id}/network +Get connected entities (network neighbors). **Parameters:** -- `limit` (query, optional) - Number of results (1-100, default: 10) +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `depth` | integer | 1 | Hops to traverse (1-3) | +| `direction` | string | "both" | in, out, or both | +| `relationship_types` | string | - | Comma-separated types | +| `limit` | integer | 50 | Maximum results | + +```bash +curl -X GET "http://localhost:8000/entities/10000001/network?depth=1&direction=both" +``` + +--- + +##### GET /entities/top/influential +Get most influential entities by PageRank score. -**Example:** ```bash -curl "http://localhost:8000/api/networks/intermediaries/top?limit=20" +curl -X GET "http://localhost:8000/entities/top/influential?limit=20&jurisdiction=PAN" ``` **Response:** ```json [ { - "intermediary_id": "11000001", - "intermediary_name": "Mossack Fonseca", - "entity_count": 50000 + "entity_id": "10052341", + "name": "MOSSACK FONSECA & CO.", + "entity_type": "Company", + "jurisdiction_code": "PAN", + "pagerank_score": 0.892, + "rank": 1, + "percentile": 99.99, + "degree_centrality": 15420, + "community_id": 1 + }, + { + "entity_id": "10018923", + "name": "PORTCULLIS TRUSTNET", + "entity_type": "Company", + "jurisdiction_code": "SGP", + "pagerank_score": 0.654, + "rank": 2, + "percentile": 99.95 } ] ``` -**GET `/api/networks/redflags`** +--- -Detect suspicious clusters - addresses with multiple registered entities (potential shell company rings). +##### GET /entities/{entity_id}/risk +Get risk analysis with red flag detection. 
-**Parameters:** -- `min_entities` (query, optional) - Minimum entities per address (โ‰ฅ2, default: 2) -- `limit` (query, optional) - Maximum results (1-200, default: 50) - -**Example:** ```bash -curl "http://localhost:8000/api/networks/redflags?min_entities=10&limit=20" +curl -X GET "http://localhost:8000/entities/10000001/risk" ``` **Response:** ```json -[ - { - "address_id": "24000001", - "address": "P.O. BOX N-4805, NASSAU, BAHAMAS", - "entity_count": 150, - "entities": [ - {"id": "", "name": "Entity Name 1", "jurisdiction": null, "node_type": "Entity"} - ] - } -] +{ + "entity_id": "10000001", + "entity_name": "DORADO MANAGEMENT INC.", + "overall_risk_score": 65, + "overall_risk_level": "HIGH", + "red_flags": [ + { + "flag_type": "DEEP_LAYERING", + "severity": "MEDIUM", + "description": "Ownership chain depth of 4 hops (threshold: 4)", + "evidence": "Maximum ownership path length: 4" + }, + { + "flag_type": "TAX_HAVEN_REGISTRATION", + "severity": "MEDIUM", + "description": "Registered in tax haven jurisdiction: PAN" + }, + { + "flag_type": "MASS_REGISTRATION_ADDRESS", + "severity": "HIGH", + "description": "Address shared with 127 other entities" + } + ], + "flag_count": 3, + "layering_depth": 4, + "jurisdiction_count": 3, + "pep_connections": 0, + "analysis_timestamp": "2024-01-15T10:30:00Z" +} ``` -### Interactive API Documentation - -Visit `http://localhost:8000/docs` for Swagger UI with interactive API testing. 
+--- -## Graph Data Model +## Database Schema -### Node Types +### Node Labels -- **Entity** - Offshore entities (companies, trusts, foundations) - - Properties: `id`, `name`, `source`, `node_type` -- **Officer** - Officers/directors of entities - - Properties: `id`, `name`, `source`, `node_type` -- **Intermediary** - Service providers (law firms, corporate service providers) - - Properties: `id`, `name`, `source`, `node_type` -- **Address** - Physical addresses - - Properties: `id`, `address`, `source`, `node_type` -- **Jurisdiction** - Jurisdictions (countries/regions) - - Properties: `name` +| Label | Description | Key Properties | +|-------|-------------|----------------| +| `Entity` | Offshore companies, trusts, foundations | `entity_id`, `name`, `jurisdiction_code`, `entity_type`, `status` | +| `Person` | Individual owners and officers | `person_id`, `full_name`, `nationality`, `is_pep` | +| `Intermediary` | Law firms, registered agents | `intermediary_id`, `name`, `type`, `country_code` | +| `Jurisdiction` | Countries and territories | `jurisdiction_code`, `name`, `is_tax_haven`, `secrecy_score` | +| `Address` | Physical and registered addresses | `address_id`, `full_address`, `city`, `country_code` | ### Relationship Types -- `(:Officer)-[:OFFICER_OF]->(:Entity)` - Officer relationship to entity -- `(:Intermediary)-[:INTERMEDIARY_OF]->(:Entity)` - Intermediary relationship to entity -- `(:Entity)-[:REGISTERED_ADDRESS]->(:Address)` - Entity address registration -- `(:Entity)-[:REGISTERED_IN]->(:Jurisdiction)` - Entity jurisdiction registration +| Relationship | Direction | Description | Properties | +|--------------|-----------|-------------|------------| +| `OWNS` | (Owner)-[]->(Entity) | Ownership stake | `ownership_percentage`, `is_nominee`, `status` | +| `CONTROLS` | (Person)-[]->(Entity) | Control relationship | `control_type`, `control_percentage` | +| `INVOLVED_IN` | (Person)-[]->(Entity) | Officer/director role | `role`, `appointment_date` | +| 
`REGISTERED_IN` | (Entity)-[]->(Jurisdiction) | Registration location | `registration_date` | +| `HAS_ADDRESS` | (Entity)-[]->(Address) | Physical location | `address_type`, `is_primary` | +| `CREATED_BY` | (Entity)-[]->(Intermediary) | Formation agent | `creation_date` | -## Development +### Schema Diagram -### Local Development Setup +``` + โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” + โ”‚ Jurisdiction โ”‚ + โ”‚ jurisdiction_ โ”‚ + โ”‚ code, name, โ”‚ + โ”‚ is_tax_haven โ”‚ + โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ–ฒโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + โ”‚ + REGISTERED_IN + โ”‚ +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” OWNS โ”Œโ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ” CREATED_BY โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ Person โ”‚โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ–บโ”‚ Entity โ”‚โ—„โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”‚ Intermediary โ”‚ +โ”‚ person_ โ”‚ โ”‚ entity_id โ”‚ โ”‚intermediary_ โ”‚ +โ”‚ id, name โ”‚โ—„โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”‚ name, โ”‚ โ”‚ id, name โ”‚ +โ”‚is_pep โ”‚ CONTROLS โ”‚ type โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”˜ + โ”‚ + HAS_ADDRESS + โ”‚ + โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ–ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” + โ”‚ Address โ”‚ + โ”‚ address_id, โ”‚ + โ”‚ full_address โ”‚ + โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ +``` -```bash -# Create virtual environment -make venv +### GDS-Computed Properties -# Install dependencies -make install +After running GDS algorithms, entities have additional properties: -# Run FastAPI locally (requires Neo4j running) -make run +| Property | Algorithm | Description | +|----------|-----------|-------------| +| `pagerank_score` | PageRank | Influence score (0-1) | +| `community_id` | Louvain | Community cluster ID | +| `degree_centrality` | Degree | Total connection count | +| `betweenness_score` | Betweenness | Bridge node score | -# Run data ingestion -make seed -``` +--- -### Environment Variables +## 
Running Tests -Create a `.env` file (or use `.env.example` as template): +### Run All Tests -```env -NEO4J_URI=bolt://localhost:7687 -NEO4J_USER=neo4j -NEO4J_PASSWORD=password -API_HOST=0.0.0.0 -API_PORT=8000 -ENVIRONMENT=development +```bash +# Using Make +make test + +# Using pytest directly +pytest tests/ -v --asyncio-mode=auto + +# With coverage report +pytest tests/ -v --cov=app --cov-report=html ``` -### Testing +### Run Specific Test Files ```bash -# Run tests -pytest +# Entity tests only +pytest tests/test_entities.py -v + +# Run tests matching pattern +pytest tests/ -v -k "test_search" -# Run with coverage -pytest --cov=app tests/ +# Skip integration tests (no database required) +pytest tests/ -v -m "not integration" +``` + +### Test Coverage Requirements + +| Metric | Target | Command | +|--------|--------|---------| +| Line Coverage | ≥60% | `make test-cov` | +| Branch Coverage | ≥50% | `pytest --cov=app --cov-branch` | + +**Expected Output:** +``` +======================== test session starts ========================= +collected 45 items + +tests/test_entities.py::TestGetEntity::test_get_entity_success PASSED +tests/test_entities.py::TestGetEntity::test_get_entity_not_found PASSED +tests/test_entities.py::TestSearchEntities::test_search_success PASSED +... + +---------- coverage: platform linux, python 3.11.0 ---------- +Name Stmts Miss Cover +------------------------------------------- +app/main.py 85 12 86% +app/database.py 142 23 84% +app/models.py 210 18 91% +app/routes/entities.py 320 45 86% +------------------------------------------- +TOTAL 757 98 87% + +======================== 45 passed in 12.34s ========================= ``` ### Code Quality ```bash -# Format code -black app/ scripts/ tests/ +# Run linter +make lint + +# Expected: Pylint score 9.5+/10 -# Lint code -flake8 app/ scripts/ tests/ +# Format code +make format # Type checking -mypy app/ +make type-check +``` + +--- + +## Development Workflow + +### Local Setup (Without Docker) + +1.
**Create virtual environment:** + ```bash + python -m venv .venv + source .venv/bin/activate # Linux/Mac + # Or: .venv\Scripts\activate # Windows + ``` + +2. **Install dependencies:** + ```bash + pip install -r requirements.txt + pip install -r requirements-dev.txt + ``` + +3. **Start Neo4j locally:** + ```bash + # Using Docker for Neo4j only + docker run -d \ + --name neo4j-dev \ + -p 7474:7474 -p 7687:7687 \ + -e NEO4J_AUTH=neo4j/password \ + -e NEO4J_PLUGINS='["graph-data-science"]' \ + neo4j:5.17-community + ``` + +4. **Configure environment:** + ```bash + export NEO4J_URI=bolt://localhost:7687 + export NEO4J_USER=neo4j + export NEO4J_PASSWORD=password + ``` + +5. **Run FastAPI:** + ```bash + make run + # Or: uvicorn app.main:app --reload --host 0.0.0.0 --port 8000 + ``` + +### Docker Development + +```bash +# Build images +make docker-build + +# Start all services +make docker-run + +# View logs +make docker-logs + +# Stop services +make docker-stop + +# Rebuild and restart +make docker-rebuild + +# Access Neo4j Browser +open http://localhost:7474 ``` -## Exploratory Data Analysis +### Makefile Commands + +| Command | Description | +|---------|-------------| +| `make run` | Start FastAPI locally | +| `make test` | Run test suite | +| `make test-cov` | Run tests with coverage | +| `make lint` | Run Pylint | +| `make format` | Format code (Black + isort) | +| `make type-check` | Run mypy type checker | +| `make docker-build` | Build Docker images | +| `make docker-run` | Start Docker services | +| `make docker-stop` | Stop Docker services | +| `make docker-logs` | View container logs | +| `make seed-db` | Import ICIJ data | +| `make clean` | Remove build artifacts | + +--- + +## Deployment + +### Building for Production + +1. **Set production environment:** + ```bash + export API_ENV=production + export DEBUG=false + ``` + +2.
**Build optimized image:** + ```bash + docker build -t panama-papers-api:1.0.0 \ + --build-arg ENVIRONMENT=production \ + -f docker/Dockerfile . + ``` + +3. **Push to registry:** + ```bash + docker tag panama-papers-api:1.0.0 myregistry.azurecr.io/panama-papers-api:1.0.0 + docker push myregistry.azurecr.io/panama-papers-api:1.0.0 + ``` + +### Production Docker Compose -A Jupyter notebook (`graph_eda.ipynb`) is provided for exploratory data analysis: +```bash +docker-compose -f docker-compose.yml -f docker-compose.prod.yml up -d +``` -1. **Connection Setup** - Connect to Neo4j database -2. **Basic Statistics** - Count nodes by type -3. **Jurisdiction Analysis** - Top jurisdictions by entity count with visualizations -4. **Risk Analysis** - Identify high-risk addresses (red flags) +### Production Considerations + +| Aspect | Recommendation | +|--------|----------------| +| **Database** | Use Neo4j Enterprise for clustering | +| **Memory** | Minimum 8GB RAM for Neo4j with GDS | +| **Storage** | SSD storage for Neo4j data | +| **Scaling** | Horizontal scaling via load balancer | +| **Secrets** | Use Docker secrets or Vault | +| **Monitoring** | Prometheus + Grafana | +| **Logging** | ELK Stack or CloudWatch | + +### Health Check Configuration + +Kubernetes liveness/readiness probes: + +```yaml +livenessProbe: + httpGet: + path: /live + port: 8000 + initialDelaySeconds: 10 + periodSeconds: 30 + +readinessProbe: + httpGet: + path: /ready + port: 8000 + initialDelaySeconds: 5 + periodSeconds: 10 +``` -To run the notebook: +--- + +## Troubleshooting + +### Common Issues + +#### Neo4j Connection Refused ```bash -# Install Jupyter -pip install jupyter matplotlib +# Check if Neo4j is running +docker-compose ps neo4j + +# Check Neo4j logs +docker-compose logs neo4j -# Start Jupyter -jupyter notebook graph_eda.ipynb +# Verify port is accessible +nc -zv localhost 7687 ``` -## Use Cases +#### GDS Not Available -1. 
**Beneficial Ownership Tracing** - Trace ownership paths from officers to entities -2. **Network Hub Detection** - Identify intermediaries managing many entities -3. **Risk Assessment** - Detect suspicious patterns (shell company rings, address clustering) -4. **Compliance** - Support AML (Anti-Money Laundering) investigations +```bash +# Verify GDS plugin is installed +docker-compose exec neo4j cypher-shell -u neo4j -p password \ + "RETURN gds.version()" + +# If missing, add to Neo4j config: +NEO4J_PLUGINS='["graph-data-science"]' +``` + +#### Import Script Fails + +```bash +# Check data directory +ls -la data/ + +# Verify CSV format matches expected columns +head -1 data/nodes-entities.csv + +# Run with verbose logging +python scripts/seeddata.py --log-level DEBUG +``` + +#### Slow Queries + +```bash +# Check if indexes exist +docker-compose exec neo4j cypher-shell -u neo4j -p password \ + "SHOW INDEXES" + +# Run schema script to create indexes +docker-compose exec neo4j cypher-shell -u neo4j -p password \ + -f /scripts/panama-papers-schema.cypher +``` + +--- + +## Contributing + +### Commit Distribution + +All team members must contribute fairly: + +- **Minimum:** 1 meaningful PR per team member +- **Commits:** Follow conventional commits (`feat:`, `fix:`, `docs:`, `test:`) +- **Reviews:** Code reviews required before merge + +### Git Workflow + +```bash +# Create feature branch +git checkout -b feature/add-risk-analysis + +# Make changes and commit +git add . 
+git commit -m "feat: add risk analysis endpoint" + +# Push and create PR +git push origin feature/add-risk-analysis +``` + +### Code Standards + +| Standard | Requirement | Tool | +|----------|-------------|------| +| Style | PEP 8 | Black, isort | +| Linting | Pylint ≥ 9.5/10 | Pylint | +| Type Hints | All public functions | mypy | +| Tests | ≥60% coverage | pytest-cov | +| Docstrings | Google style | pydocstyle | + +### Pull Request Checklist + +- [ ] Tests pass (`make test`) +- [ ] Linting passes (`make lint`) +- [ ] Documentation updated +- [ ] Meaningful commit messages +- [ ] No hardcoded secrets +- [ ] PR description explains changes + +--- ## Team -- [Team Member 1] - Role/Contribution -- [Team Member 2] - Role/Contribution -- [Team Member 3] - Role/Contribution +| Name | Role | Responsibilities | +|------|------|------------------| +| [Team Member 1] | Tech Lead | Architecture, Neo4j Schema | +| [Team Member 2] | Backend Developer | FastAPI, API Endpoints | +| [Team Member 3] | Data Engineer | Import Scripts, GDS Setup | +| [Team Member 4] | QA/DevOps | Testing, Docker, CI/CD | + +--- ## License -[Specify License] +This project is licensed under the **MIT License** - see the [LICENSE](LICENSE) file for details. + +--- + +## Data Attribution + +### ICIJ Offshore Leaks Database + +The data analyzed by this project is sourced from the [ICIJ Offshore Leaks Database](https://offshoreleaks.icij.org/), which includes: + +- **Panama Papers** (2016) - 11.5 million documents from Mossack Fonseca +- **Paradise Papers** (2017) - 13.4 million documents from Appleby +- **Pandora Papers** (2021) - 11.9 million documents from 14 offshore service providers + +### Usage Terms + +This data is provided by ICIJ for: +- Research purposes +- Public interest journalism +- Educational use + +**Note:** This project is for educational and research purposes. Always verify findings through official sources. 
+ +--- ## Acknowledgments -- ICIJ (International Consortium of Investigative Journalists) for the Offshore Leaks database -- Neo4j for the graph database platform -- FastAPI for the modern web framework +- **ICIJ** (International Consortium of Investigative Journalists) for publishing the offshore leaks data +- **Neo4j** for the graph database platform and GDS library +- **FastAPI** for the excellent async web framework +- **Pydantic** for data validation + +--- -## Support +## References -For issues or questions, please open an issue in the repository or contact the development team. +- [ICIJ Offshore Leaks Database](https://offshoreleaks.icij.org/) +- [Neo4j Documentation](https://neo4j.com/docs/) +- [Neo4j GDS Library](https://neo4j.com/docs/graph-data-science/current/) +- [FastAPI Documentation](https://fastapi.tiangolo.com/) +- [Panama Papers Wikipedia](https://en.wikipedia.org/wiki/Panama_Papers) diff --git a/app/__init__.py b/app/__init__.py index 002a73d..3859afd 100644 --- a/app/__init__.py +++ b/app/__init__.py @@ -1,4 +1,5 @@ -"""Graph-Backed Analysis API for Offshore Financial Data.""" - -__version__ = "0.1.0" +""" +Panama Papers Neo4j API Application Package +""" +__version__ = "1.0.0" diff --git a/app/config.py b/app/config.py deleted file mode 100644 index 1c37e13..0000000 --- a/app/config.py +++ /dev/null @@ -1,40 +0,0 @@ -"""Configuration management using Pydantic Settings.""" - -import os -from pydantic_settings import BaseSettings, SettingsConfigDict -from pydantic import Field - - -class Settings(BaseSettings): - """Application settings loaded from environment variables.""" - - # Neo4j Configuration - Required - neo4j_uri: str = Field( - default_factory=lambda: os.getenv("NEO4J_URI", "bolt://localhost:7687"), - description="Neo4j connection URI" - ) - neo4j_user: str = Field( - default_factory=lambda: os.getenv("NEO4J_USER", "neo4j"), - description="Neo4j username" - ) - neo4j_password: str = Field( - default_factory=lambda: 
os.getenv("NEO4J_PASSWORD", "password"), - description="Neo4j password" - ) - - # API Configuration - api_host: str = "0.0.0.0" - api_port: int = 8000 - - # Environment - environment: str = "development" - - model_config = SettingsConfigDict( - env_file=".env", - env_file_encoding="utf-8", - case_sensitive=False, - validate_default=True, - ) - - -settings = Settings() diff --git a/app/database.py b/app/database.py index 2c85bf2..f217f03 100644 --- a/app/database.py +++ b/app/database.py @@ -1,96 +1,922 @@ -"""Neo4j Database Connection Manager - Singleton Pattern.""" +""" +Panama Papers API - Neo4j Database Connection Management +========================================================= +Async Neo4j driver wrapper for FastAPI application. + +Features: + - Async connection with connection pooling + - Retry logic with exponential backoff + - Session and transaction management + - Health check functionality + - Query execution helpers + +Usage: + from database import Neo4jDatabase, get_db_session, run_query + + # In FastAPI lifespan + async with lifespan(app): + await Neo4jDatabase.init() + + # In route handlers + @app.get("/entities") + async def get_entities(session: AsyncSession = Depends(get_db_session)): + result = await session.run("MATCH (e:Entity) RETURN e LIMIT 10") + return await result.data() + +Environment Variables: + NEO4J_URI: Bolt URI (default: bolt://localhost:7687) + NEO4J_USER: Username (default: neo4j) + NEO4J_PASSWORD: Password (required) + NEO4J_DATABASE: Database name (default: neo4j) + NEO4J_MAX_POOL_SIZE: Connection pool size (default: 50) + +Python Version: 3.11+ +Neo4j Driver: 5.x +""" + +from __future__ import annotations + +import asyncio import os -from typing import Optional, ContextManager -from contextlib import contextmanager +import logging +from contextlib import asynccontextmanager +from dataclasses import dataclass, field +from datetime import datetime +from functools import wraps +from typing import ( + Any, + AsyncGenerator, + 
Awaitable, + Callable, + Optional, + ParamSpec, + TypeVar, +) -from neo4j import GraphDatabase, Driver, Session +from dotenv import load_dotenv +from neo4j import ( + AsyncGraphDatabase, + AsyncDriver, + AsyncSession, + AsyncTransaction, + AsyncManagedTransaction, + Query, +) +from neo4j.exceptions import ( + ServiceUnavailable, + SessionExpired, + TransientError, + AuthError, + DriverError, + Neo4jError, + ClientError, +) -from app.config import settings +# Load environment variables +load_dotenv() +# ============================================================================ +# LOGGING CONFIGURATION +# ============================================================================ -class Neo4jDatabase: - """Singleton-style Neo4j database manager.""" +logger = logging.getLogger(__name__) - _instance: Optional["Neo4jDatabase"] = None - _driver: Optional[Driver] = None +# ============================================================================ +# CONFIGURATION +# ============================================================================ - def __new__(cls): - """Ensure only one instance exists (singleton pattern).""" - if cls._instance is None: - cls._instance = super(Neo4jDatabase, cls).__new__(cls) - return cls._instance +@dataclass(frozen=True) +class Neo4jConfig: + """Neo4j connection configuration.""" + + uri: str = field( + default_factory=lambda: os.getenv("NEO4J_URI", "bolt://localhost:7687") + ) + user: str = field( + default_factory=lambda: os.getenv("NEO4J_USER", "neo4j") + ) + password: str = field( + default_factory=lambda: os.getenv("NEO4J_PASSWORD", "") + ) + database: str = field( + default_factory=lambda: os.getenv("NEO4J_DATABASE", "neo4j") + ) + max_pool_size: int = field( + default_factory=lambda: int(os.getenv("NEO4J_MAX_POOL_SIZE", "50")) + ) + connection_timeout: float = field( + default_factory=lambda: float(os.getenv("NEO4J_CONNECTION_TIMEOUT", "30")) + ) + max_transaction_retry_time: float = field( + default_factory=lambda: 
float(os.getenv("NEO4J_MAX_RETRY_TIME", "30")) + ) + connection_acquisition_timeout: float = field( + default_factory=lambda: float(os.getenv("NEO4J_ACQUISITION_TIMEOUT", "60")) + ) + + def validate(self) -> None: + """Validate configuration.""" + if not self.password: + raise ValueError( + "NEO4J_PASSWORD environment variable is required. " + "Set it in your .env file or environment." + ) + if not self.uri.startswith(("bolt://", "bolt+s://", "neo4j://", "neo4j+s://")): + raise ValueError( + f"Invalid NEO4J_URI scheme: {self.uri}. " + "Must start with bolt://, bolt+s://, neo4j://, or neo4j+s://" + ) - def __init__(self): - """Initialize the database connection.""" - if self._driver is None: - self.connect() - def connect(self) -> None: - """ - Initialize the Neo4j driver using environment variables. +# Global configuration instance +config = Neo4jConfig() + +# ============================================================================ +# RETRY DECORATOR +# ============================================================================ + +P = ParamSpec("P") +T = TypeVar("T") + + +def with_retry( + max_retries: int = 3, + base_delay: float = 1.0, + max_delay: float = 30.0, + exponential_base: float = 2.0, + retryable_exceptions: tuple = ( + ServiceUnavailable, + SessionExpired, + TransientError, + ), +) -> Callable[[Callable[P, Awaitable[T]]], Callable[P, Awaitable[T]]]: + """ + Decorator for retry logic with exponential backoff. 
+ + Args: + max_retries: Maximum number of retry attempts + base_delay: Initial delay between retries (seconds) + max_delay: Maximum delay between retries (seconds) + exponential_base: Base for exponential backoff calculation + retryable_exceptions: Tuple of exceptions to retry on + + Returns: + Decorated async function with retry logic + """ + def decorator(func: Callable[P, Awaitable[T]]) -> Callable[P, Awaitable[T]]: + @wraps(func) + async def wrapper(*args: P.args, **kwargs: P.kwargs) -> T: + last_exception: Optional[Exception] = None + + for attempt in range(max_retries + 1): + try: + return await func(*args, **kwargs) + + except retryable_exceptions as e: + last_exception = e + + if attempt == max_retries: + logger.error( + f"Max retries ({max_retries}) exceeded for {func.__name__}: {e}" + ) + raise + + # Calculate delay with exponential backoff and jitter + delay = min( + base_delay * (exponential_base ** attempt), + max_delay + ) + # Add jitter (ยฑ25%) + import random + delay *= (0.75 + random.random() * 0.5) + + logger.warning( + f"Retry {attempt + 1}/{max_retries} for {func.__name__} " + f"after {delay:.2f}s: {e}" + ) + await asyncio.sleep(delay) + + # Should not reach here, but just in case + if last_exception: + raise last_exception + raise RuntimeError("Unexpected retry loop exit") - Uses NEO4J_URI and NEO4J_PASSWORD from environment or settings. 
- """ - neo4j_uri = os.getenv("NEO4J_URI") or settings.neo4j_uri - neo4j_user = os.getenv("NEO4J_USER") or settings.neo4j_user - neo4j_password = os.getenv("NEO4J_PASSWORD") or settings.neo4j_password + return wrapper + return decorator - self._driver = GraphDatabase.driver( - neo4j_uri, - auth=(neo4j_user, neo4j_password) - ) - def close(self) -> None: - """Close the Neo4j driver connection.""" - if self._driver: - self._driver.close() - self._driver = None +# ============================================================================ +# NEO4J DATABASE CLASS +# ============================================================================ - @contextmanager - def get_session(self) -> ContextManager[Session]: +class Neo4jDatabase: + """ + Async Neo4j database connection manager. + + Singleton pattern for managing a single driver instance across + the application lifecycle. + + Usage: + # Initialize during app startup + await Neo4jDatabase.init() + + # Use in request handlers + async with Neo4jDatabase.session() as session: + result = await session.run("MATCH (n) RETURN n LIMIT 10") + data = await result.data() + + # Cleanup during app shutdown + await Neo4jDatabase.close() + """ + + _driver: Optional[AsyncDriver] = None + _config: Optional[Neo4jConfig] = None + _initialized: bool = False + _init_time: Optional[datetime] = None + + @classmethod + async def init(cls, config: Optional[Neo4jConfig] = None) -> None: """ - Get a Neo4j session as a context manager. + Initialize the Neo4j driver. 
+ + Args: + config: Optional configuration override + + Raises: + ValueError: If configuration is invalid + ServiceUnavailable: If Neo4j is not reachable + AuthError: If authentication fails + """ + if cls._initialized and cls._driver: + logger.warning("Neo4j driver already initialized, skipping") + return + + cls._config = config or Neo4jConfig() + cls._config.validate() + + logger.info(f"Initializing Neo4j driver for {cls._config.uri}...") + + try: + cls._driver = AsyncGraphDatabase.driver( + cls._config.uri, + auth=(cls._config.user, cls._config.password), + max_connection_pool_size=cls._config.max_pool_size, + connection_timeout=cls._config.connection_timeout, + max_transaction_retry_time=cls._config.max_transaction_retry_time, + connection_acquisition_timeout=cls._config.connection_acquisition_timeout, + ) + + # Verify connectivity + await cls._driver.verify_connectivity() + + # Get server info + server_info = await cls._get_server_info() + + cls._initialized = True + cls._init_time = datetime.utcnow() + + logger.info( + f"โœ“ Neo4j driver initialized successfully\n" + f" URI: {cls._config.uri}\n" + f" Database: {cls._config.database}\n" + f" Pool Size: {cls._config.max_pool_size}\n" + f" Server: {server_info.get('version', 'unknown')}" + ) + + except AuthError as e: + logger.error(f"โœ— Neo4j authentication failed: {e}") + raise + except ServiceUnavailable as e: + logger.error(f"โœ— Neo4j service unavailable at {cls._config.uri}: {e}") + raise + except Exception as e: + logger.error(f"โœ— Neo4j driver initialization failed: {e}") + raise + + @classmethod + async def _get_server_info(cls) -> dict[str, Any]: + """Get Neo4j server information.""" + try: + async with cls._driver.session(database=cls._config.database) as session: + result = await session.run( + "CALL dbms.components() YIELD name, versions, edition " + "RETURN name, versions[0] AS version, edition" + ) + record = await result.single() + return { + "name": record["name"], + "version": 
record["version"], + "edition": record["edition"], + } if record else {} + except Exception as e: + logger.warning(f"Could not retrieve server info: {e}") + return {} + + @classmethod + async def close(cls) -> None: + """ + Close the Neo4j driver and release all resources. + + Should be called during application shutdown. + """ + if cls._driver: + await cls._driver.close() + cls._driver = None + cls._initialized = False + logger.info("โœ“ Neo4j driver closed") + else: + logger.warning("Neo4j driver was not initialized or already closed") + + @classmethod + def get_driver(cls) -> AsyncDriver: + """ + Get the active driver instance. + + Returns: + AsyncDriver instance + + Raises: + RuntimeError: If driver is not initialized + """ + if not cls._driver or not cls._initialized: + raise RuntimeError( + "Neo4j driver not initialized. " + "Call 'await Neo4jDatabase.init()' first." + ) + return cls._driver + + @classmethod + def is_initialized(cls) -> bool: + """Check if driver is initialized.""" + return cls._initialized and cls._driver is not None + + @classmethod + def get_config(cls) -> Optional[Neo4jConfig]: + """Get current configuration.""" + return cls._config + + @classmethod + def get_uptime(cls) -> Optional[float]: + """Get driver uptime in seconds.""" + if cls._init_time: + return (datetime.utcnow() - cls._init_time).total_seconds() + return None + + @classmethod + @asynccontextmanager + async def session( + cls, + database: Optional[str] = None, + **kwargs: Any, + ) -> AsyncGenerator[AsyncSession, None]: + """ + Get an async session context manager. + + Args: + database: Optional database name override + **kwargs: Additional session configuration + + Yields: + AsyncSession instance Usage: - with db.get_session() as session: - result = session.run("MATCH (n) RETURN n LIMIT 1") + async with Neo4jDatabase.session() as session: + result = await session.run("MATCH (n) RETURN n") """ - if self._driver is None: - raise RuntimeError("Database driver not initialized. 
Call connect() first.") + driver = cls.get_driver() + db = database or cls._config.database + session = driver.session(database=db, **kwargs) - session = self._driver.session() try: yield session finally: - session.close() - - def verify_connectivity(self) -> bool: + await session.close() + + @classmethod + @asynccontextmanager + async def transaction( + cls, + database: Optional[str] = None, + ) -> AsyncGenerator[AsyncTransaction, None]: """ - Verify connection to Neo4j database. + Get an explicit transaction context manager. - Returns: - True if connection is successful, raises exception otherwise + Automatically commits on success, rolls back on exception. + + Args: + database: Optional database name override + + Yields: + AsyncTransaction instance + + Usage: + async with Neo4jDatabase.transaction() as tx: + await tx.run("CREATE (n:Node {name: $name})", name="Test") + await tx.run("CREATE (m:Node {name: $name})", name="Test2") + # Auto-commits if no exception """ - if self._driver is None: - raise RuntimeError("Database driver not initialized. Call connect() first.") + async with cls.session(database=database) as session: + tx = await session.begin_transaction() + try: + yield tx + await tx.commit() + except Exception: + await tx.rollback() + raise + finally: + if tx.closed() is False: + await tx.close() + + +# ============================================================================ +# FASTAPI DEPENDENCIES +# ============================================================================ + +async def get_db_session() -> AsyncGenerator[AsyncSession, None]: + """ + FastAPI dependency for Neo4j session injection. 
+ + Usage: + @app.get("/entities") + async def get_entities(session: AsyncSession = Depends(get_db_session)): + result = await session.run("MATCH (e:Entity) RETURN e LIMIT 10") + return await result.data() + """ + async with Neo4jDatabase.session() as session: + yield session + + +async def get_db_driver() -> AsyncDriver: + """ + FastAPI dependency for Neo4j driver injection. + + Use when you need direct driver access (e.g., for multiple sessions). + """ + return Neo4jDatabase.get_driver() + + +# ============================================================================ +# HEALTH CHECK +# ============================================================================ + +@dataclass +class HealthCheckResult: + """Health check result structure.""" + + status: str # "healthy", "degraded", "unhealthy" + neo4j_connected: bool + neo4j_version: Optional[str] = None + neo4j_edition: Optional[str] = None + database: Optional[str] = None + gds_available: Optional[bool] = None + gds_version: Optional[str] = None + latency_ms: Optional[float] = None + uptime_seconds: Optional[float] = None + error: Optional[str] = None + timestamp: datetime = field(default_factory=datetime.utcnow) + + def to_dict(self) -> dict[str, Any]: + """Convert to dictionary.""" + return { + "status": self.status, + "neo4j_connected": self.neo4j_connected, + "neo4j_version": self.neo4j_version, + "neo4j_edition": self.neo4j_edition, + "database": self.database, + "gds_available": self.gds_available, + "gds_version": self.gds_version, + "latency_ms": self.latency_ms, + "uptime_seconds": self.uptime_seconds, + "error": self.error, + "timestamp": self.timestamp.isoformat(), + } + + +async def health_check(detailed: bool = True) -> HealthCheckResult: + """ + Perform Neo4j health check. 
+ + Args: + detailed: Include detailed server information + + Returns: + HealthCheckResult with connection status and server info + """ + import time + start_time = time.perf_counter() + + try: + if not Neo4jDatabase.is_initialized(): + return HealthCheckResult( + status="unhealthy", + neo4j_connected=False, + error="Neo4j driver not initialized", + ) - try: - self._driver.verify_connectivity() - return True - except Exception as e: - raise ConnectionError(f"Failed to verify Neo4j connectivity: {e}") + driver = Neo4jDatabase.get_driver() + config = Neo4jDatabase.get_config() + + async with driver.session(database=config.database) as session: + # Basic connectivity check + result = await session.run("RETURN 1 AS healthcheck") + record = await result.single() + + if not record or record["healthcheck"] != 1: + return HealthCheckResult( + status="unhealthy", + neo4j_connected=False, + error="Health check query returned unexpected result", + ) + + latency = (time.perf_counter() - start_time) * 1000 + + if not detailed: + return HealthCheckResult( + status="healthy", + neo4j_connected=True, + database=config.database, + latency_ms=round(latency, 2), + uptime_seconds=Neo4jDatabase.get_uptime(), + ) + + # Get detailed server info + server_result = await session.run( + "CALL dbms.components() YIELD name, versions, edition " + "RETURN name, versions[0] AS version, edition" + ) + server_record = await server_result.single() + + # Check GDS availability + gds_available = False + gds_version = None + try: + gds_result = await session.run("RETURN gds.version() AS version") + gds_record = await gds_result.single() + if gds_record: + gds_available = True + gds_version = gds_record["version"] + except (ClientError, Neo4jError): + pass # GDS not installed + + return HealthCheckResult( + status="healthy", + neo4j_connected=True, + neo4j_version=server_record["version"] if server_record else None, + neo4j_edition=server_record["edition"] if server_record else None, + 
database=config.database, + gds_available=gds_available, + gds_version=gds_version, + latency_ms=round(latency, 2), + uptime_seconds=Neo4jDatabase.get_uptime(), + ) + + except ServiceUnavailable as e: + return HealthCheckResult( + status="unhealthy", + neo4j_connected=False, + error=f"Service unavailable: {e}", + ) + except AuthError as e: + return HealthCheckResult( + status="unhealthy", + neo4j_connected=False, + error=f"Authentication failed: {e}", + ) + except Exception as e: + logger.error(f"Health check failed: {e}") + return HealthCheckResult( + status="unhealthy", + neo4j_connected=False, + error=str(e), + ) + + +# ============================================================================ +# QUERY EXECUTION HELPERS +# ============================================================================ + +@with_retry(max_retries=3, base_delay=1.0) +async def run_query( + query: str, + params: Optional[dict[str, Any]] = None, + database: Optional[str] = None, + fetch_size: int = 1000, +) -> list[dict[str, Any]]: + """ + Execute a read-only Cypher query. 
+ + Args: + query: Cypher query string + params: Query parameters (optional) + database: Database name override (optional) + fetch_size: Number of records to fetch at once + + Returns: + List of record dictionaries + + Example: + results = await run_query( + "MATCH (e:Entity) WHERE e.status = $status RETURN e LIMIT 100", + params={"status": "Active"} + ) + """ + params = params or {} + + async with Neo4jDatabase.session(database=database) as session: + result = await session.run(query, params) + records = await result.fetch(fetch_size) + return [record.data() for record in records] + - @property - def driver(self) -> Optional[Driver]: - """Get the Neo4j driver instance.""" - return self._driver +@with_retry(max_retries=3, base_delay=1.0) +async def run_query_single( + query: str, + params: Optional[dict[str, Any]] = None, + database: Optional[str] = None, +) -> Optional[dict[str, Any]]: + """ + Execute a query expecting a single result. + + Args: + query: Cypher query string + params: Query parameters (optional) + database: Database name override (optional) + + Returns: + Single record dictionary or None + """ + params = params or {} + + async with Neo4jDatabase.session(database=database) as session: + result = await session.run(query, params) + record = await result.single() + return record.data() if record else None -# Global database instance -db = Neo4jDatabase() +@with_retry(max_retries=3, base_delay=1.0) +async def run_write( + query: str, + params: Optional[dict[str, Any]] = None, + database: Optional[str] = None, +) -> dict[str, Any]: + """ + Execute a write query within an auto-commit transaction. 
+ + Args: + query: Cypher write query string + params: Query parameters (optional) + database: Database name override (optional) + + Returns: + Dictionary with query counters: + - nodes_created + - nodes_deleted + - relationships_created + - relationships_deleted + - properties_set + - labels_added + - labels_removed + + Example: + result = await run_write( + "CREATE (e:Entity {entity_id: $id, name: $name})", + params={"id": "ENT-001", "name": "Acme Holdings"} + ) + print(f"Created {result['nodes_created']} nodes") + """ + params = params or {} + + async with Neo4jDatabase.session(database=database) as session: + result = await session.run(query, params) + summary = await result.consume() + + counters = summary.counters + return { + "nodes_created": counters.nodes_created, + "nodes_deleted": counters.nodes_deleted, + "relationships_created": counters.relationships_created, + "relationships_deleted": counters.relationships_deleted, + "properties_set": counters.properties_set, + "labels_added": counters.labels_added, + "labels_removed": counters.labels_removed, + "indexes_added": counters.indexes_added, + "indexes_removed": counters.indexes_removed, + "constraints_added": counters.constraints_added, + "constraints_removed": counters.constraints_removed, + } + + +async def run_transaction( + queries: list[tuple[str, Optional[dict[str, Any]]]], + database: Optional[str] = None, +) -> list[dict[str, Any]]: + """ + Execute multiple queries within a single transaction. + + All queries succeed or all are rolled back. 
+ + Args: + queries: List of (query_string, params) tuples + database: Database name override (optional) + + Returns: + List of result dictionaries (one per query) + + Example: + results = await run_transaction([ + ("CREATE (a:Entity {id: $id1})", {"id1": "E1"}), + ("CREATE (b:Entity {id: $id2})", {"id2": "E2"}), + ("MATCH (a:Entity {id: $id1}), (b:Entity {id: $id2}) " + "CREATE (a)-[:OWNS]->(b)", {"id1": "E1", "id2": "E2"}), + ]) + """ + results = [] + + async with Neo4jDatabase.transaction(database=database) as tx: + for query, params in queries: + params = params or {} + result = await tx.run(query, params) + summary = await result.consume() + results.append({ + "query": query[:100] + "..." if len(query) > 100 else query, + "nodes_created": summary.counters.nodes_created, + "relationships_created": summary.counters.relationships_created, + }) + + return results + + +async def run_read_transaction( + work: Callable[[AsyncManagedTransaction], Awaitable[T]], + database: Optional[str] = None, +) -> T: + """ + Execute a read transaction with automatic retry. + + Uses Neo4j's managed transaction pattern for automatic + retry on transient errors. + + Args: + work: Async function that receives transaction and returns result + database: Database name override (optional) + + Returns: + Result from work function + + Example: + async def get_entities(tx: AsyncManagedTransaction): + result = await tx.run("MATCH (e:Entity) RETURN e LIMIT 10") + return await result.data() + + entities = await run_read_transaction(get_entities) + """ + async with Neo4jDatabase.session(database=database) as session: + return await session.execute_read(work) + + +async def run_write_transaction( + work: Callable[[AsyncManagedTransaction], Awaitable[T]], + database: Optional[str] = None, +) -> T: + """ + Execute a write transaction with automatic retry. + + Uses Neo4j's managed transaction pattern for automatic + retry on transient errors. 
+ + Args: + work: Async function that receives transaction and returns result + database: Database name override (optional) + + Returns: + Result from work function + + Example: + async def create_entity(tx: AsyncManagedTransaction): + result = await tx.run( + "CREATE (e:Entity {id: $id, name: $name}) RETURN e", + id="E1", name="Test" + ) + return await result.single() + + entity = await run_write_transaction(create_entity) + """ + async with Neo4jDatabase.session(database=database) as session: + return await session.execute_write(work) + + +# ============================================================================ +# FASTAPI LIFESPAN INTEGRATION +# ============================================================================ + +@asynccontextmanager +async def neo4j_lifespan(app: Any) -> AsyncGenerator[None, None]: + """ + FastAPI lifespan context manager for Neo4j. + + Usage: + from fastapi import FastAPI + from database import neo4j_lifespan + + app = FastAPI(lifespan=neo4j_lifespan) + """ + # Startup + await Neo4jDatabase.init() + + yield + + # Shutdown + await Neo4jDatabase.close() + + +# ============================================================================ +# UTILITY FUNCTIONS +# ============================================================================ + +def cypher_escape(value: str) -> str: + """ + Escape a string for safe use in Cypher queries. + + Note: Always prefer parameterized queries over string escaping. + This is provided for edge cases only. + """ + return value.replace("\\", "\\\\").replace("'", "\\'").replace('"', '\\"') + + +def build_match_clause( + label: str, + filters: dict[str, Any], + alias: str = "n", +) -> tuple[str, dict[str, Any]]: + """ + Build a MATCH clause with WHERE conditions. 
+ + Args: + label: Node label + filters: Property filters (key-value pairs) + alias: Node alias in query + + Returns: + Tuple of (query_fragment, parameters) + + Example: + clause, params = build_match_clause( + "Entity", + {"status": "Active", "jurisdiction": "BVI"}, + alias="e" + ) + # Returns: ("MATCH (e:Entity) WHERE e.status = $status AND e.jurisdiction = $jurisdiction", {...}) + """ + where_parts = [] + params = {} + + for key, value in filters.items(): + if value is not None: + param_name = f"filter_{key}" + where_parts.append(f"{alias}.{key} = ${param_name}") + params[param_name] = value + + query = f"MATCH ({alias}:{label})" + if where_parts: + query += " WHERE " + " AND ".join(where_parts) + + return query, params + +# ============================================================================ +# MODULE EXPORTS +# ============================================================================ -def get_database() -> Neo4jDatabase: - """Get the global Neo4jDatabase instance.""" - return db +__all__ = [ + # Configuration + "Neo4jConfig", + "config", + + # Main database class + "Neo4jDatabase", + + # FastAPI dependencies + "get_db_session", + "get_db_driver", + + # Health check + "HealthCheckResult", + "health_check", + + # Query helpers + "run_query", + "run_query_single", + "run_write", + "run_transaction", + "run_read_transaction", + "run_write_transaction", + + # FastAPI integration + "neo4j_lifespan", + + # Utilities + "cypher_escape", + "build_match_clause", + "with_retry", +] diff --git a/app/entities.py b/app/entities.py new file mode 100644 index 0000000..7bf5968 --- /dev/null +++ b/app/entities.py @@ -0,0 +1,1345 @@ +""" +Panama Papers API - Entity Routes +================================== + +FastAPI router for entity-related endpoints. 
+ +Endpoints: + GET /entities/{entity_id} - Get entity by ID + GET /entities/search - Search entities by name + GET /entities/{entity_id}/ownership - Get ownership chain + GET /entities/{entity_id}/network - Get connected entities + GET /entities/top/influential - Get top entities by PageRank + GET /entities/top/connected - Get most connected entities + GET /entities/by-jurisdiction - Get entities by jurisdiction + GET /entities/{entity_id}/risk - Get entity risk analysis + +All queries include LIMIT clauses to prevent Cartesian products. +All responses use Pydantic models for validation. +""" + +from __future__ import annotations + +import logging +from datetime import datetime +from typing import Annotated, Any, Optional + +from fastapi import APIRouter, Depends, HTTPException, Path, Query, status +from neo4j import AsyncSession +from neo4j.exceptions import Neo4jError + +# Import models (adjust path based on your project structure) +from app.models import ( + EntityResponse, + EntitySummary, + EntityType, + EntityStatus, + PathQuery, + PathResponse, + PathResult, + PathNode, + PathEdge, + RelationshipResponse, + RelationshipType, + RiskLevel, + RedFlagAnalysis, + RedFlag, + SearchQuery, + SearchResponse, + SearchResult, + InfluenceScore, + ErrorResponse, + PaginationMeta, +) + +# Import database utilities (adjust path based on your project structure) +from app.database import get_db_session, run_query, run_query_single + +# ============================================================================ +# CONFIGURATION +# ============================================================================ + +logger = logging.getLogger(__name__) + +# Router configuration +router = APIRouter( + prefix="/entities", + tags=["entities"], + responses={ + 404: {"model": ErrorResponse, "description": "Entity not found"}, + 500: {"model": ErrorResponse, "description": "Internal server error"}, + }, +) + +# ============================================================================ 
# HELPER FUNCTIONS
# ============================================================================

def parse_entity_record(record: dict[str, Any], prefix: str = "e") -> EntityResponse:
    """
    Parse a Neo4j entity record into an EntityResponse.

    Args:
        record: Neo4j record dictionary
        prefix: Key under which the entity is stored in the record
            (default: "e"). When the key is absent, the record itself is
            treated as the entity.

    Returns:
        EntityResponse model instance
    """
    entity = record.get(prefix, record)

    # Map projections arrive as plain dicts; anything else (e.g. a Neo4j Node
    # object) is converted to a dict of its properties.
    data = entity if isinstance(entity, dict) else dict(entity)

    return EntityResponse(
        entity_id=data.get("entity_id", ""),
        name=data.get("name", "Unknown"),
        # Older records may use the short property names; fall back to them.
        jurisdiction_code=data.get("jurisdiction_code") or data.get("jurisdiction"),
        entity_type=data.get("entity_type") or data.get("type") or EntityType.UNKNOWN,
        status=data.get("status") or EntityStatus.UNKNOWN,
        pagerank_score=data.get("pagerank_score"),
        community_id=data.get("community_id"),
        degree_centrality=data.get("degree_centrality"),
        betweenness_score=data.get("betweenness_score"),
        risk_score=data.get("risk_score"),
        risk_level=data.get("risk_level"),
        incorporation_date=data.get("incorporation_date"),
        inactivation_date=data.get("inactivation_date"),
        source=data.get("source"),
    )


def calculate_effective_ownership(percentages: list[Optional[float]]) -> Optional[float]:
    """
    Calculate effective ownership through a chain.

    Multiplies ownership percentages through the chain.
    Example: 50% -> 50% -> 50% = 12.5% effective ownership

    Links whose percentage is unknown (None) are skipped, i.e. they do not
    dilute the result. When no link in the chain has a known percentage the
    effective ownership is unknown, so None is returned instead of a
    fabricated 100%.

    Args:
        percentages: Ownership percentage (0-100) of each hop, None if unknown

    Returns:
        Effective ownership percentage rounded to 4 decimals, or None when
        the chain is empty or contains no known percentage
    """
    known = [pct for pct in percentages if pct is not None]
    if not known:
        return None

    result = 100.0
    for pct in known:
        result = result * pct / 100.0

    return round(result, 4)


# ============================================================================
# ENDPOINT 1: GET ENTITY BY ID
# ============================================================================

@router.get(
    "/id/{entity_id}",
    response_model=EntityResponse,
    summary="Get entity by ID",
    responses={
        200: {"description": "Entity found"},
        404: {"description": "Entity not found"},
    },
)
async def get_entity(
    entity_id: Annotated[
        str,
        Path(
            description="Unique entity identifier",
            min_length=1,
            max_length=50,
            examples=["10000001", "ENT-BVI-2010-001"],
        ),
    ],
    include_analytics: Annotated[
        bool,
        Query(description="Include PageRank and community data"),
    ] = True,
    include_counts: Annotated[
        bool,
        Query(description="Include owner/subsidiary counts"),
    ] = False,
    session: AsyncSession = Depends(get_db_session),
) -> EntityResponse:
    """
    Retrieve detailed information about a specific entity.

    Returns entity properties including:
    - Basic info: name, jurisdiction, type, status
    - Analytics: PageRank score, community ID (if include_analytics=True)
    - Counts: owner/subsidiary counts (if include_counts=True)

    Args:
        entity_id: Unique entity identifier
        include_analytics: Include GDS algorithm results
        include_counts: Include relationship counts
        session: Neo4j session (injected)

    Returns:
        EntityResponse with full entity details

    Raises:
        HTTPException 404: Entity not found
        HTTPException 500: Database error
    """
    # Build query based on options
    if include_counts:
        # Aggregate owners before matching subsidiaries. Chaining both
        # OPTIONAL MATCHes first would expand owners x subsidiaries rows
        # before counting; the counts themselves are unchanged.
        query = """
        MATCH (e:Entity {entity_id: $entity_id})
        OPTIONAL MATCH (owner)-[:OWNS]->(e)
        WITH e, count(DISTINCT owner) AS owner_count
        OPTIONAL MATCH (e)-[:OWNS]->(subsidiary)
        WITH e,
             owner_count,
             count(DISTINCT subsidiary) AS subsidiary_count
        RETURN e {
            .*,
            owner_count: owner_count,
            subsidiary_count: subsidiary_count
        } AS entity
        LIMIT 1
        """
    else:
        query = """
        MATCH (e:Entity {entity_id: $entity_id})
        RETURN e AS entity
        LIMIT 1
        """

    try:
        result = await session.run(query, {"entity_id": entity_id})
        record = await result.single()

        if not record:
            logger.info("Entity not found: %s", entity_id)
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail=f"Entity with ID '{entity_id}' not found",
            )

        entity_data = record["entity"]

        # Build response
        response = EntityResponse(
            entity_id=entity_data.get("entity_id", entity_id),
            name=entity_data.get("name", "Unknown"),
            jurisdiction_code=entity_data.get("jurisdiction_code") or entity_data.get("jurisdiction"),
            entity_type=entity_data.get("entity_type") or entity_data.get("type") or EntityType.UNKNOWN,
            status=entity_data.get("status") or EntityStatus.UNKNOWN,
            incorporation_date=entity_data.get("incorporation_date"),
            inactivation_date=entity_data.get("inactivation_date"),
            source=entity_data.get("source"),
        )

        # Add analytics if requested
        if include_analytics:
            response.pagerank_score = entity_data.get("pagerank_score")
            response.community_id = entity_data.get("community_id")
            response.degree_centrality = entity_data.get("degree_centrality")
            response.betweenness_score = entity_data.get("betweenness_score")

        # Add counts if requested
        if include_counts:
            response.owner_count = entity_data.get("owner_count", 0)
            response.subsidiary_count = entity_data.get("subsidiary_count", 0)

        return response

    except HTTPException:
        # Deliberate 404 above; must not be converted into a 500.
        raise
    except Neo4jError as exc:
        logger.exception("Neo4j error fetching entity %s", entity_id)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Database query failed",
        ) from exc
    except Exception as exc:
        logger.exception("Unexpected error fetching entity %s", entity_id)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="An unexpected error occurred",
        ) from exc


# ============================================================================
# ENDPOINT 2: SEARCH ENTITIES
# ============================================================================

@router.get(
    "/search",
    response_model=SearchResponse,
    summary="Search entities by name",
)
async def search_entities(
    q: Annotated[
        str,
        Query(
            description="Search query (entity name)",
            min_length=2,
            max_length=200,
            examples=["Holdings", "Acme", "Global Ventures"],
        ),
    ],
    jurisdiction: Annotated[
        Optional[str],
        Query(description="Filter by jurisdiction code"),
    ] = None,
    entity_type: Annotated[
        Optional[EntityType],
        Query(description="Filter by entity type"),
    ] = None,
    status_filter: Annotated[
        Optional[EntityStatus],
        Query(alias="status", description="Filter by status"),
    ] = None,
    limit: Annotated[
        int,
        Query(ge=1, le=100, description="Maximum results"),
    ] = 20,
    offset: Annotated[
        int,
        Query(ge=0, description="Pagination offset"),
    ] = 0,
    use_fulltext: Annotated[
        bool,
        Query(description="Use full-text search index"),
    ] = True,
    session: AsyncSession = Depends(get_db_session),
) -> SearchResponse:
    """
    Search entities by name with optional filters.

    Supports two search modes:
    - Full-text search (default): Uses the 'entity_name_fulltext' index
    - Contains search: Simple CONTAINS matching (case-insensitive)

    If the full-text query fails with an error mentioning an index, the
    search is retried once in CONTAINS mode.

    Args:
        q: Search query string
        jurisdiction: Filter by jurisdiction code (e.g., "BVI", "PAN")
        entity_type: Filter by entity type
        status_filter: Filter by entity status
        limit: Maximum number of results (1-100)
        offset: Pagination offset
        use_fulltext: Use full-text index
        session: Neo4j session (injected)

    Returns:
        SearchResponse with matching entities and pagination info

    Raises:
        HTTPException 500: Database or unexpected error
    """
    import time
    start_time = time.perf_counter()

    # NOTE(review): in full-text mode `q` is handed to the index procedure
    # as-is; presumably it is parsed as a Lucene query, so special characters
    # may need escaping - confirm.
    params: dict[str, Any] = {
        "query": q,
        "limit": limit,
        "offset": offset,
    }

    # Build filter conditions (values always travel as parameters)
    filters = []
    if jurisdiction:
        filters.append("e.jurisdiction_code = $jurisdiction")
        params["jurisdiction"] = jurisdiction.upper()
    if entity_type:
        filters.append("e.entity_type = $entity_type")
        params["entity_type"] = entity_type.value
    if status_filter:
        filters.append("e.status = $status")
        params["status"] = status_filter.value

    filter_clause = " AND " + " AND ".join(filters) if filters else ""

    if use_fulltext:
        # Use full-text index for better performance and fuzzy matching
        query = f"""
        CALL db.index.fulltext.queryNodes('entity_name_fulltext', $query)
        YIELD node AS e, score
        WHERE e:Entity{filter_clause}
        WITH e, score
        ORDER BY score DESC
        SKIP $offset
        LIMIT $limit
        RETURN e, score
        """

        count_query = f"""
        CALL db.index.fulltext.queryNodes('entity_name_fulltext', $query)
        YIELD node AS e, score
        WHERE e:Entity{filter_clause}
        RETURN count(e) AS total
        """
    else:
        # Fallback to CONTAINS search
        query = f"""
        MATCH (e:Entity)
        WHERE toLower(e.name) CONTAINS toLower($query){filter_clause}
        WITH e
        ORDER BY e.name
        SKIP $offset
        LIMIT $limit
        RETURN e, 1.0 AS score
        """

        count_query = f"""
        MATCH (e:Entity)
        WHERE toLower(e.name) CONTAINS toLower($query){filter_clause}
        RETURN count(e) AS total
        """

    try:
        # Execute search query
        result = await session.run(query, params)
        records = await result.fetch(limit)

        # Get total count for pagination
        count_result = await session.run(count_query, params)
        count_record = await count_result.single()
        total = count_record["total"] if count_record else 0

        # Build response
        search_results = []
        for record in records:
            entity = record["e"]
            score = record.get("score", 1.0)

            # Full-text scores are unbounded: scale by 10 and cap at 1.0.
            # The scaling is applied to every full-text score so a better
            # raw score never yields a lower relevance (previously scores
            # just above 1.0 dropped below scores just under it). CONTAINS
            # mode always reports the constant score 1.0.
            relevance = min(score / 10.0, 1.0) if use_fulltext else score

            search_results.append(
                SearchResult(
                    node_id=entity.get("entity_id", ""),
                    name=entity.get("name", "Unknown"),
                    node_type="Entity",
                    relevance_score=relevance,
                    jurisdiction_code=entity.get("jurisdiction_code"),
                    status=entity.get("status"),
                    risk_level=entity.get("risk_level"),
                    matched_field="name",
                )
            )

        execution_time = (time.perf_counter() - start_time) * 1000

        return SearchResponse(
            query=q,
            total_results=total,
            results=search_results,
            limit=limit,
            offset=offset,
            has_more=(offset + limit) < total,
            execution_time_ms=round(execution_time, 2),
        )

    except Neo4jError as e:
        # Check if full-text index doesn't exist
        if "index" in str(e).lower() and use_fulltext:
            logger.warning(f"Full-text index not found, falling back to CONTAINS: {e}")
            # Retry without full-text; use_fulltext=False prevents a loop.
            return await search_entities(
                q=q,
                jurisdiction=jurisdiction,
                entity_type=entity_type,
                status_filter=status_filter,
                limit=limit,
                offset=offset,
                use_fulltext=False,
                session=session,
            )

        logger.error(f"Search error: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Search query failed",
        )
    except Exception as e:
        logger.error(f"Unexpected search error: {e}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="An unexpected error occurred during search",
        )


# ============================================================================
# ENDPOINT 3: OWNERSHIP PATH
# ============================================================================

@router.get(
    "/id/{entity_id}/ownership-path",
    response_model=PathResponse,
    summary="Get beneficial ownership chain",
)
async def get_ownership_path(
    entity_id: Annotated[
        str,
        Path(description="Target entity identifier"),
    ],
    max_depth: Annotated[
        int,
        Query(ge=1, le=6, description="Maximum path depth (hops)"),
    ] = 4,
    min_depth: Annotated[
        int,
        Query(ge=1, le=6, description="Minimum path depth"),
    ] = 1,
    include_persons: Annotated[
        bool,
        Query(description="Include Person nodes as beneficial owners"),
    ] = True,
    only_active: Annotated[
        bool,
        Query(description="Only include active ownership relationships"),
    ] = True,
    limit: Annotated[
        int,
        Query(ge=1, le=50, description="Maximum paths to return"),
    ] = 20,
    session: AsyncSession = Depends(get_db_session),
) -> PathResponse:
    """
    Trace beneficial ownership chain for an entity.

    Finds ownership paths from owners (Person and/or Entity nodes) to the
    target entity through OWNS relationships, shortest paths first.

    Returns:
    - All paths found within depth limits
    - Effective ownership percentages calculated through chains
    - Risk indicators (PEPs, tax havens, deep layering)

    Args:
        entity_id: Target entity to trace ownership for
        max_depth: Maximum number of hops (1-6)
        min_depth: Minimum number of hops (must not exceed max_depth)
        include_persons: Include Person nodes at path ends
        only_active: Only active ownership relationships
        limit: Maximum number of paths to return
        session: Neo4j session (injected)

    Returns:
        PathResponse with ownership paths and analysis

    Raises:
        HTTPException 400: min_depth is greater than max_depth
        HTTPException 404: Entity not found or no paths found
        HTTPException 500: Database or unexpected error
    """
    import time
    start_time = time.perf_counter()

    # An inverted range can never match; report it as a client error instead
    # of a misleading "no ownership paths found" 404.
    if min_depth > max_depth:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="min_depth must be less than or equal to max_depth",
        )

    # First verify entity exists
    verify_query = """
    MATCH (e:Entity {entity_id: $entity_id})
    RETURN e.name AS name
    LIMIT 1
    """

    try:
        verify_result = await session.run(verify_query, {"entity_id": entity_id})
        verify_record = await verify_result.single()

        if not verify_record:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail=f"Entity '{entity_id}' not found",
            )

        # Build ownership path query. Only fixed strings and the validated
        # integer depths are interpolated; entity_id and limit are parameters.
        owner_label = "Person|Entity" if include_persons else "Entity"
        active_filter = (
            "AND ALL(r IN relationships(path) WHERE r.status = 'Active')"
            if only_active
            else ""
        )

        query = f"""
        MATCH path = (owner:{owner_label})-[:OWNS*{min_depth}..{max_depth}]->(target:Entity {{entity_id: $entity_id}})
        WHERE owner <> target
        {active_filter}
        WITH path,
             nodes(path) AS path_nodes,
             relationships(path) AS path_rels,
             length(path) AS depth
        ORDER BY depth ASC
        LIMIT $limit
        RETURN
            [n IN path_nodes | {{
                id: COALESCE(n.entity_id, n.person_id),
                name: COALESCE(n.name, n.full_name),
                type: labels(n)[0],
                jurisdiction: n.jurisdiction_code,
                is_pep: n.is_pep
            }}] AS nodes,
            [r IN path_rels | {{
                source: COALESCE(startNode(r).entity_id, startNode(r).person_id),
                target: COALESCE(endNode(r).entity_id, endNode(r).person_id),
                type: type(r),
                percentage: r.ownership_percentage,
                is_nominee: r.is_nominee
            }}] AS relationships,
            depth
        """

        result = await session.run(query, {"entity_id": entity_id, "limit": limit})
        records = await result.fetch(limit)

        if not records:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail=f"No ownership paths found for entity '{entity_id}'",
            )

        # Process paths
        paths: list[PathResult] = []
        all_entity_ids: set[str] = set()
        all_person_ids: set[str] = set()
        # Unique PEPs: the same person usually appears in several paths and
        # must only be counted once.
        pep_ids: set[str] = set()
        tax_haven_jurisdictions: set[str] = set()

        # Known tax havens for risk analysis
        TAX_HAVENS = {"BVI", "PAN", "CYM", "JEY", "GGY", "IMN", "BMU", "VGB", "LIE", "MCO"}

        for idx, record in enumerate(records):
            nodes_data = record["nodes"]
            rels_data = record["relationships"]
            depth = record["depth"]

            # Tax havens touched by THIS path only; the per-path risk
            # indicator must not depend on paths processed earlier.
            path_tax_havens: set[str] = set()

            # Build path nodes
            path_nodes: list[PathNode] = []
            for layer, node in enumerate(nodes_data):
                node_id = node["id"]
                node_type = node["type"]

                if node_type == "Person":
                    all_person_ids.add(node_id)
                    if node.get("is_pep"):
                        pep_ids.add(node_id)
                else:
                    all_entity_ids.add(node_id)

                jurisdiction = node.get("jurisdiction")
                if jurisdiction and jurisdiction in TAX_HAVENS:
                    path_tax_havens.add(jurisdiction)

                path_nodes.append(PathNode(
                    node_id=node_id,
                    name=node["name"],
                    node_type=node_type,
                    jurisdiction_code=jurisdiction,
                    layer=layer,
                    is_pep=node.get("is_pep"),
                ))

            tax_haven_jurisdictions |= path_tax_havens

            # Build path edges
            path_edges: list[PathEdge] = []
            ownership_percentages: list[Optional[float]] = []

            for layer, rel in enumerate(rels_data):
                pct = rel.get("percentage")
                ownership_percentages.append(pct)

                path_edges.append(PathEdge(
                    source_id=rel["source"],
                    target_id=rel["target"],
                    relationship_type=rel["type"],
                    ownership_percentage=pct,
                    layer=layer,
                ))

            # Calculate effective ownership
            effective_ownership = calculate_effective_ownership(ownership_percentages)

            # Identify risk indicators
            risk_indicators = []
            if depth >= 4:
                risk_indicators.append("DEEP_LAYERING")
            if any(n.get("is_pep") for n in nodes_data):
                risk_indicators.append("PEP_CONNECTION")
            if any(rel.get("is_nominee") for rel in rels_data):
                risk_indicators.append("NOMINEE_ARRANGEMENT")
            if len(path_tax_havens) >= 2:
                risk_indicators.append("MULTI_JURISDICTION")

            paths.append(PathResult(
                path_id=idx + 1,
                depth=depth,
                nodes=path_nodes,
                edges=path_edges,
                effective_ownership=effective_ownership,
                risk_indicators=risk_indicators,
            ))

        # Calculate summary statistics
        depths = [p.depth for p in paths]
        avg_depth = sum(depths) / len(depths) if depths else 0

        execution_time = (time.perf_counter() - start_time) * 1000

        # Build query object for response
        query_obj = PathQuery(
            source_entity_id=entity_id,
            max_depth=max_depth,
            min_depth=min_depth,
            include_persons=include_persons,
            only_active=only_active,
            limit=limit,
        )

        return PathResponse(
            query=query_obj,
            path_count=len(paths),
            paths=paths,
            average_depth=round(avg_depth, 2),
            max_depth_found=max(depths) if depths else 0,
            unique_entities=len(all_entity_ids),
            unique_persons=len(all_person_ids),
            pep_count=len(pep_ids),
            tax_haven_count=len(tax_haven_jurisdictions),
            execution_time_ms=round(execution_time, 2),
        )

    except HTTPException:
        # Deliberate 404s above; must not be converted into a 500.
        raise
    except Neo4jError as exc:
        logger.exception("Neo4j error in ownership path query")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Ownership path query failed",
        ) from exc
    except Exception as exc:
        logger.exception("Unexpected error in ownership path")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="An unexpected error occurred",
        ) from exc


# ============================================================================
# ENDPOINT 4: ENTITY NETWORK
# ============================================================================

@router.get(
    "/id/{entity_id}/network",
    response_model=list[RelationshipResponse],
    summary="Get connected entities (network neighbors)",
)
async def get_entity_network(
    entity_id: Annotated[
        str,
        Path(description="Entity identifier"),
    ],
    depth: Annotated[
        int,
        Query(ge=1, le=3, description="Network depth (1-3 hops)"),
    ] = 1,
    direction: Annotated[
        str,
        Query(description="Relationship direction: in, out, or both"),
    ] = "both",
    relationship_types: Annotated[
        Optional[str],
        Query(description="Comma-separated relationship types to include"),
    ] = None,
    limit: Annotated[
        int,
        Query(ge=1, le=100, description="Maximum connections to return"),
    ] = 50,
    session: AsyncSession = Depends(get_db_session),
) -> list[RelationshipResponse]:
    """
    Get entities connected to the target entity.

    Returns nodes reachable within `depth` hops together with relationship
    metadata. For depth > 1 the metadata describes the first hop of the path
    (the relationship attached to the target entity).

    Args:
        entity_id: Target entity identifier
        depth: How many hops to traverse (1-3)
        direction: Relationship direction (in, out, both); any other value
            is treated as "both"
        relationship_types: Filter by relationship types (comma-separated
            identifiers; defaults to OWNS, CONTROLS, INVOLVED_IN,
            CONNECTED_TO)
        limit: Maximum results
        session: Neo4j session (injected)

    Returns:
        List of relationships with connected entity details (empty when the
        entity exists but has no connections)

    Raises:
        HTTPException 400: relationship_types is empty or not a list of
            plain identifiers
        HTTPException 404: Entity not found
        HTTPException 500: Database or unexpected error
    """
    import re

    # Parse relationship types
    rel_types = ["OWNS", "CONTROLS", "INVOLVED_IN", "CONNECTED_TO"]
    if relationship_types:
        rel_types = [
            token.strip().upper()
            for token in relationship_types.split(",")
            if token.strip()
        ]
        # Relationship types cannot be passed as query parameters, so they
        # are interpolated into the pattern below. Accept plain identifiers
        # only, otherwise the query string would be open to Cypher injection.
        if not rel_types or not all(
            re.fullmatch(r"[A-Z_][A-Z0-9_]*", token) for token in rel_types
        ):
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="relationship_types must be a comma-separated list of identifiers",
            )

    rel_pattern = "|".join(rel_types)

    # Build direction pattern
    if direction == "in":
        pattern = f"<-[r:{rel_pattern}*1..{depth}]-"
    elif direction == "out":
        pattern = f"-[r:{rel_pattern}*1..{depth}]->"
    else:
        pattern = f"-[r:{rel_pattern}*1..{depth}]-"

    # `r` is the list of relationships on the variable-length path; r[0] is
    # the hop attached to the target entity.
    query = f"""
    MATCH (e:Entity {{entity_id: $entity_id}}){pattern}(n)
    WHERE e <> n
    WITH DISTINCT n, r[0] AS rel, e
    RETURN
        COALESCE(n.entity_id, n.person_id, n.intermediary_id) AS target_id,
        COALESCE(n.name, n.full_name) AS target_name,
        labels(n)[0] AS target_type,
        n.jurisdiction_code AS target_jurisdiction,
        type(rel) AS relationship_type,
        rel.ownership_percentage AS ownership_percentage,
        rel.role AS role,
        rel.is_nominee AS is_nominee,
        rel.status AS status,
        CASE
            WHEN startNode(rel) = e THEN 'outgoing'
            ELSE 'incoming'
        END AS direction
    ORDER BY relationship_type, target_name
    LIMIT $limit
    """

    try:
        result = await session.run(query, {"entity_id": entity_id, "limit": limit})
        records = await result.fetch(limit)

        if not records:
            # Check if entity exists
            verify = await session.run(
                "MATCH (e:Entity {entity_id: $id}) RETURN e LIMIT 1",
                {"id": entity_id}
            )
            if not await verify.single():
                raise HTTPException(
                    status_code=status.HTTP_404_NOT_FOUND,
                    detail=f"Entity '{entity_id}' not found",
                )
            # Entity exists but has no connections
            return []

        relationships = []
        for record in records:
            # Determine source/target based on direction
            if record["direction"] == "outgoing":
                source_id = entity_id
                target_id = record["target_id"]
            else:
                source_id = record["target_id"]
                target_id = entity_id

            # Look the type up by value; unknown types degrade to
            # CONNECTED_TO. This works whether or not the enum mixes in str.
            try:
                relationship_type = RelationshipType(record["relationship_type"])
            except ValueError:
                relationship_type = RelationshipType.CONNECTED_TO

            relationships.append(RelationshipResponse(
                source_id=source_id,
                target_id=target_id,
                relationship_type=relationship_type,
                target_name=record["target_name"],
                target_type=record["target_type"],
                ownership_percentage=record.get("ownership_percentage"),
                role=record.get("role"),
                is_nominee=record.get("is_nominee"),
                status=record.get("status"),
            ))

        return relationships

    except HTTPException:
        # Deliberate 404 above; must not be converted into a 500.
        raise
    except Exception as exc:
        logger.exception("Network query error for %s", entity_id)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Network query failed",
        ) from exc


# ============================================================================
# ENDPOINT 5: TOP INFLUENTIAL ENTITIES
#
============================================================================ + +@router.get( + "/top/influential", + response_model=list[InfluenceScore], + summary="Get most influential entities by PageRank", +) +async def get_influential_entities( + limit: Annotated[ + int, + Query(ge=1, le=100, description="Number of results"), + ] = 20, + jurisdiction: Annotated[ + Optional[str], + Query(description="Filter by jurisdiction"), + ] = None, + entity_type: Annotated[ + Optional[EntityType], + Query(description="Filter by entity type"), + ] = None, + min_score: Annotated[ + Optional[float], + Query(ge=0, description="Minimum PageRank score"), + ] = None, + session: AsyncSession = Depends(get_db_session), +) -> list[InfluenceScore]: + """ + Get the most influential entities ranked by PageRank score. + + PageRank measures influence based on ownership network structure. + Higher scores indicate entities that are owned by other important entities. + + Args: + limit: Number of results (1-100) + jurisdiction: Filter by jurisdiction code + entity_type: Filter by entity type + min_score: Minimum PageRank score threshold + session: Neo4j session (injected) + + Returns: + List of entities ranked by influence score + """ + # Build filters + filters = ["e.pagerank_score IS NOT NULL"] + params: dict[str, Any] = {"limit": limit} + + if jurisdiction: + filters.append("e.jurisdiction_code = $jurisdiction") + params["jurisdiction"] = jurisdiction.upper() + + if entity_type: + filters.append("e.entity_type = $entity_type") + params["entity_type"] = entity_type.value + + if min_score is not None: + filters.append("e.pagerank_score >= $min_score") + params["min_score"] = min_score + + filter_clause = " AND ".join(filters) + + query = f""" + MATCH (e:Entity) + WHERE {filter_clause} + OPTIONAL MATCH (e)-[:REGISTERED_IN]->(j:Jurisdiction) + WITH e, j + ORDER BY e.pagerank_score DESC + LIMIT $limit + WITH e, j, + row_number() OVER () AS rank + RETURN + e.entity_id AS entity_id, + e.name AS 
name, + e.entity_type AS entity_type, + e.jurisdiction_code AS jurisdiction_code, + e.pagerank_score AS pagerank_score, + e.degree_centrality AS degree_centrality, + e.betweenness_score AS betweenness_score, + e.community_id AS community_id, + j.is_tax_haven AS is_tax_haven, + rank + """ + + try: + result = await session.run(query, params) + records = await result.fetch(limit) + + # Calculate percentile based on rank + total = len(records) + + influence_scores = [] + for record in records: + rank = record["rank"] + percentile = ((total - rank + 1) / total) * 100 if total > 0 else 0 + + influence_scores.append(InfluenceScore( + entity_id=record["entity_id"], + name=record["name"], + entity_type=record.get("entity_type"), + jurisdiction_code=record.get("jurisdiction_code"), + pagerank_score=record["pagerank_score"], + rank=rank, + percentile=round(percentile, 2), + degree_centrality=record.get("degree_centrality"), + betweenness_score=record.get("betweenness_score"), + community_id=record.get("community_id"), + is_tax_haven=record.get("is_tax_haven"), + )) + + return influence_scores + + except Exception as e: + logger.error(f"Influential entities query error: {e}") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Query failed", + ) + + +# ============================================================================ +# ENDPOINT 6: TOP CONNECTED ENTITIES +# ============================================================================ + +@router.get( + "/top/connected", + response_model=list[InfluenceScore], + summary="Get most connected entities by degree centrality", +) +async def get_most_connected_entities( + limit: Annotated[ + int, + Query(ge=1, le=100, description="Number of results"), + ] = 20, + jurisdiction: Annotated[ + Optional[str], + Query(description="Filter by jurisdiction"), + ] = None, + session: AsyncSession = Depends(get_db_session), +) -> list[InfluenceScore]: + """ + Get entities with the most connections 
(highest degree centrality). + + Degree centrality counts direct ownership and control relationships. + High values indicate hub entities in the network. + + Args: + limit: Number of results + jurisdiction: Filter by jurisdiction + session: Neo4j session (injected) + + Returns: + List of most connected entities + """ + filters = ["e.degree_centrality IS NOT NULL"] + params: dict[str, Any] = {"limit": limit} + + if jurisdiction: + filters.append("e.jurisdiction_code = $jurisdiction") + params["jurisdiction"] = jurisdiction.upper() + + filter_clause = " AND ".join(filters) + + query = f""" + MATCH (e:Entity) + WHERE {filter_clause} + WITH e + ORDER BY e.degree_centrality DESC + LIMIT $limit + WITH e, row_number() OVER () AS rank + RETURN + e.entity_id AS entity_id, + e.name AS name, + e.entity_type AS entity_type, + e.jurisdiction_code AS jurisdiction_code, + e.pagerank_score AS pagerank_score, + e.degree_centrality AS degree_centrality, + e.community_id AS community_id, + rank + """ + + try: + result = await session.run(query, params) + records = await result.fetch(limit) + + return [ + InfluenceScore( + entity_id=r["entity_id"], + name=r["name"], + entity_type=r.get("entity_type"), + jurisdiction_code=r.get("jurisdiction_code"), + pagerank_score=r.get("pagerank_score") or 0, + rank=r["rank"], + degree_centrality=r.get("degree_centrality"), + community_id=r.get("community_id"), + ) + for r in records + ] + + except Exception as e: + logger.error(f"Connected entities query error: {e}") + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Query failed", + ) + + +# ============================================================================ +# ENDPOINT 7: ENTITIES BY JURISDICTION +# ============================================================================ + +@router.get( + "/by-jurisdiction/{jurisdiction_code}", + response_model=list[EntitySummary], + summary="Get entities by jurisdiction", +) +async def get_entities_by_jurisdiction( 
+ jurisdiction_code: Annotated[ + str, + Path(description="Jurisdiction code (e.g., BVI, PAN)"), + ], + status_filter: Annotated[ + Optional[EntityStatus], + Query(alias="status", description="Filter by status"), + ] = None, + entity_type: Annotated[ + Optional[EntityType], + Query(description="Filter by entity type"), + ] = None, + limit: Annotated[ + int, + Query(ge=1, le=500, description="Maximum results"), + ] = 100, + offset: Annotated[ + int, + Query(ge=0, description="Pagination offset"), + ] = 0, + session: AsyncSession = Depends(get_db_session), +) -> list[EntitySummary]: + """ + Get all entities registered in a specific jurisdiction. + + Args: + jurisdiction_code: Jurisdiction code (e.g., BVI, PAN, CYM) + status_filter: Filter by entity status + entity_type: Filter by entity type + limit: Maximum results + offset: Pagination offset + session: Neo4j session (injected) + + Returns: + List of entity summaries + """ + filters = ["e.jurisdiction_code = $jurisdiction"] + params: dict[str, Any] = { + "jurisdiction": jurisdiction_code.upper(), + "limit": limit, + "offset": offset, + } + + if status_filter: + filters.append("e.status = $status") + params["status"] = status_filter.value + + if entity_type: + filters.append("e.entity_type = $entity_type") + params["entity_type"] = entity_type.value + + filter_clause = " AND ".join(filters) + + query = f""" + MATCH (e:Entity) + WHERE {filter_clause} + WITH e + ORDER BY e.name + SKIP $offset + LIMIT $limit + RETURN + e.entity_id AS entity_id, + e.name AS name, + e.jurisdiction_code AS jurisdiction_code, + e.entity_type AS entity_type, + e.status AS status, + e.risk_level AS risk_level + """ + + try: + result = await session.run(query, params) + records = await result.fetch(limit) + + return [ + EntitySummary( + entity_id=r["entity_id"], + name=r["name"], + jurisdiction_code=r.get("jurisdiction_code"), + entity_type=r.get("entity_type"), + status=r.get("status"), + risk_level=r.get("risk_level"), + ) + for r in records 
# ============================================================================
# ENDPOINT 8: ENTITY RISK ANALYSIS
# ============================================================================

# Scoring policy: every triggered red flag adds its weight to the score, and
# the total is capped at _MAX_RISK_SCORE before it is mapped to a risk band.
_LAYERING_DEPTH_THRESHOLD = 4
_LAYERING_DEPTH_HIGH = 5
_LAYERING_WEIGHT = 25
_MULTI_JURISDICTION_THRESHOLD = 3
_MULTI_JURISDICTION_WEIGHT = 20
_PEP_WEIGHT = 30
_TAX_HAVEN_WEIGHT = 15
_SECRECY_SCORE_THRESHOLD = 70
_SECRECY_WEIGHT = 10
_MASS_REGISTRATION_THRESHOLD = 10
_MASS_REGISTRATION_HIGH = 50
_MASS_REGISTRATION_WEIGHT = 20
_MAX_RISK_SCORE = 100

# Single round trip that gathers every metric the scorer needs. Each
# OPTIONAL MATCH is aggregated before the next one so the row count stays at
# one per entity; LIMIT 1 guards against several REGISTERED_IN edges.
_RISK_ANALYSIS_QUERY = """
    // Get entity details
    MATCH (e:Entity {entity_id: $entity_id})

    // Check ownership depth
    OPTIONAL MATCH depth_path = (owner)-[:OWNS*1..6]->(e)
    WITH e, max(length(depth_path)) AS max_depth

    // Check jurisdictions in ownership chain
    OPTIONAL MATCH (e)<-[:OWNS*1..4]-(chain_entity:Entity)
    WITH e, max_depth,
         count(DISTINCT chain_entity.jurisdiction_code) AS jurisdiction_count

    // Check PEP connections
    OPTIONAL MATCH (pep:Person {is_pep: true})-[:OWNS|CONTROLS*1..3]->(e)
    WITH e, max_depth, jurisdiction_count,
         count(DISTINCT pep) AS pep_connections

    // Check address concentration
    OPTIONAL MATCH (e)-[:HAS_ADDRESS]->(a:Address)<-[:HAS_ADDRESS]-(other:Entity)
    WHERE other <> e
    WITH e, max_depth, jurisdiction_count, pep_connections,
         count(DISTINCT other) AS shared_address_count

    // Get jurisdiction risk
    OPTIONAL MATCH (e)-[:REGISTERED_IN]->(j:Jurisdiction)

    RETURN
        e.entity_id AS entity_id,
        e.name AS name,
        e.jurisdiction_code AS jurisdiction,
        j.is_tax_haven AS is_tax_haven,
        j.secrecy_score AS secrecy_score,
        COALESCE(max_depth, 0) AS layering_depth,
        COALESCE(jurisdiction_count, 0) AS jurisdiction_count,
        COALESCE(pep_connections, 0) AS pep_connections,
        COALESCE(shared_address_count, 0) AS shared_address_count
    LIMIT 1
"""


def _risk_level_for_score(risk_score: float) -> RiskLevel:
    """Map a capped risk score onto the overall risk band."""
    if risk_score >= 70:
        return RiskLevel.CRITICAL
    if risk_score >= 50:
        return RiskLevel.HIGH
    if risk_score >= 25:
        return RiskLevel.MEDIUM
    return RiskLevel.LOW


def _score_red_flags(
    *,
    layering_depth: int,
    jurisdiction_count: int,
    pep_connections: int,
    shared_address_count: int,
    is_tax_haven: bool,
    secrecy_score: float,
    jurisdiction: Optional[str],
) -> tuple[list[RedFlag], float]:
    """Turn the raw graph metrics into red flags and an uncapped score."""
    red_flags: list[RedFlag] = []
    risk_score = 0.0

    # Layering depth risk
    if layering_depth >= _LAYERING_DEPTH_THRESHOLD:
        risk_score += _LAYERING_WEIGHT
        red_flags.append(RedFlag(
            flag_type="DEEP_LAYERING",
            severity=(
                RiskLevel.HIGH
                if layering_depth >= _LAYERING_DEPTH_HIGH
                else RiskLevel.MEDIUM
            ),
            description=(
                f"Ownership chain depth of {layering_depth} hops "
                f"(threshold: {_LAYERING_DEPTH_THRESHOLD})"
            ),
            evidence=f"Maximum ownership path length: {layering_depth}",
        ))

    # Multi-jurisdiction risk
    if jurisdiction_count >= _MULTI_JURISDICTION_THRESHOLD:
        risk_score += _MULTI_JURISDICTION_WEIGHT
        red_flags.append(RedFlag(
            flag_type="MULTI_JURISDICTION",
            severity=RiskLevel.MEDIUM,
            description=f"Ownership chain crosses {jurisdiction_count} jurisdictions",
        ))

    # PEP connections
    if pep_connections > 0:
        risk_score += _PEP_WEIGHT
        red_flags.append(RedFlag(
            flag_type="PEP_CONNECTION",
            severity=RiskLevel.HIGH,
            description=f"Connected to {pep_connections} Politically Exposed Person(s)",
        ))

    # Tax haven registration
    if is_tax_haven:
        risk_score += _TAX_HAVEN_WEIGHT
        red_flags.append(RedFlag(
            flag_type="TAX_HAVEN_REGISTRATION",
            severity=RiskLevel.MEDIUM,
            description=f"Registered in tax haven jurisdiction: {jurisdiction}",
        ))

    # High secrecy score
    if secrecy_score >= _SECRECY_SCORE_THRESHOLD:
        risk_score += _SECRECY_WEIGHT
        red_flags.append(RedFlag(
            flag_type="HIGH_SECRECY_JURISDICTION",
            severity=RiskLevel.MEDIUM,
            description=f"Jurisdiction secrecy score: {secrecy_score}/100",
        ))

    # Mass registration address
    if shared_address_count >= _MASS_REGISTRATION_THRESHOLD:
        risk_score += _MASS_REGISTRATION_WEIGHT
        red_flags.append(RedFlag(
            flag_type="MASS_REGISTRATION_ADDRESS",
            severity=(
                RiskLevel.HIGH
                if shared_address_count >= _MASS_REGISTRATION_HIGH
                else RiskLevel.MEDIUM
            ),
            description=f"Address shared with {shared_address_count} other entities",
        ))

    return red_flags, risk_score


@router.get(
    "/id/{entity_id}/risk",
    response_model=RedFlagAnalysis,
    summary="Get entity risk analysis",
)
async def get_entity_risk_analysis(
    entity_id: Annotated[
        str,
        Path(description="Entity identifier"),
    ],
    session: AsyncSession = Depends(get_db_session),
) -> RedFlagAnalysis:
    """
    Perform risk analysis on an entity.

    Analyzes:
        - Ownership depth (layering)
        - Jurisdiction risk (tax haven flag, secrecy score)
        - PEP connections
        - Number of jurisdictions in the ownership chain
        - Mass registration addresses

    Args:
        entity_id: Entity to analyze
        session: Neo4j session (injected)

    Returns:
        RedFlagAnalysis with risk score and identified flags

    Raises:
        HTTPException: 404 when the entity does not exist, 500 when the
            query or the scoring fails.
    """
    try:
        result = await session.run(_RISK_ANALYSIS_QUERY, {"entity_id": entity_id})
        record = await result.single()

        if record is None:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND,
                detail=f"Entity '{entity_id}' not found",
            )

        # Null-safe metrics: properties may be absent on the matched nodes.
        layering_depth = record["layering_depth"] or 0
        jurisdiction_count = record["jurisdiction_count"] or 0
        pep_connections = record["pep_connections"] or 0
        shared_address_count = record["shared_address_count"] or 0

        red_flags, raw_score = _score_red_flags(
            layering_depth=layering_depth,
            jurisdiction_count=jurisdiction_count,
            pep_connections=pep_connections,
            shared_address_count=shared_address_count,
            is_tax_haven=bool(record.get("is_tax_haven")),
            secrecy_score=record.get("secrecy_score") or 0,
            jurisdiction=record["jurisdiction"],
        )

        # The weights sum to more than the cap, so clamp before banding.
        risk_score = min(raw_score, _MAX_RISK_SCORE)

        return RedFlagAnalysis(
            entity_id=entity_id,
            entity_name=record["name"],
            overall_risk_score=risk_score,
            overall_risk_level=_risk_level_for_score(risk_score),
            red_flags=red_flags,
            flag_count=len(red_flags),
            layering_depth=layering_depth,
            jurisdiction_count=jurisdiction_count,
            pep_connections=pep_connections,
            mass_registration_address=(
                shared_address_count >= _MASS_REGISTRATION_THRESHOLD
            ),
            analysis_timestamp=datetime.utcnow(),
        )

    except HTTPException:
        # Deliberate HTTP errors (the 404 above) pass through untouched.
        raise
    except Exception as e:
        # logger.exception keeps the traceback the old logger.error dropped.
        logger.exception("Risk analysis error for %s: %s", entity_id, e)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Risk analysis failed",
        ) from e


# ============================================================================
# MODULE EXPORTS
# ============================================================================

__all__ = ["router"]
+============================================ -from fastapi import FastAPI +FastAPI application for analyzing ICIJ Panama Papers offshore financial networks. + +Features: + - Neo4j graph database integration + - Entity search and lookup + - Beneficial ownership tracing + - Network analysis (PageRank, communities) + - Risk assessment and red flag detection + +API Documentation: + - Swagger UI: /docs + - ReDoc: /redoc + - OpenAPI Schema: /openapi.json + +Usage: + # Development + uvicorn main:app --reload --host 0.0.0.0 --port 8000 + + # Production + gunicorn main:app -w 4 -k uvicorn.workers.UvicornWorker -b 0.0.0.0:8000 + +Environment Variables: + NEO4J_URI: Neo4j connection URI (default: bolt://localhost:7687) + NEO4J_USER: Neo4j username (default: neo4j) + NEO4J_PASSWORD: Neo4j password (required) + NEO4J_DATABASE: Target database (default: neo4j) + API_ENV: Environment (development/staging/production) + CORS_ORIGINS: Comma-separated allowed origins + +Python Version: 3.11+ +FastAPI Version: 0.109+ +""" + +from __future__ import annotations + +import logging +import os +import sys +import time +from contextlib import asynccontextmanager +from datetime import datetime +from typing import Any, AsyncGenerator, Callable + +from dotenv import load_dotenv +from fastapi import FastAPI, HTTPException, Request, status +from fastapi.exceptions import RequestValidationError from fastapi.middleware.cors import CORSMiddleware +from fastapi.middleware.gzip import GZipMiddleware +from fastapi.responses import JSONResponse, Response +from starlette.middleware.base import BaseHTTPMiddleware + +# Load environment variables +load_dotenv() + +# ============================================================================ +# CONFIGURATION +# ============================================================================ + +# API Configuration +API_TITLE = "Panama Papers Offshore Network Analysis API" +API_DESCRIPTION = """ +## Overview + +Neo4j-powered API for analyzing ICIJ Panama Papers 
offshore financial networks. + +### Features + +* **Entity Search** - Full-text search across offshore entities +* **Ownership Tracing** - Trace beneficial ownership chains up to 6 hops +* **Network Analysis** - PageRank influence scoring, community detection +* **Risk Assessment** - Automated red flag detection and risk scoring + +### Data Sources + +This API analyzes data from the ICIJ Offshore Leaks Database, including: +- Panama Papers (2016) +- Paradise Papers (2017) +- Pandora Papers (2021) + +### Authentication + +Currently, this API does not require authentication for read operations. +Write operations may be restricted in production environments. + +### Rate Limits + +- Development: No limits +- Production: 100 requests/minute per IP + +### Contact + +For questions about this API, contact the development team. +""" + +API_VERSION = "1.0.0" +API_ENV = os.getenv("API_ENV", "development") + +# CORS Configuration +CORS_ORIGINS_STR = os.getenv( + "CORS_ORIGINS", + "http://localhost:3000,http://localhost:8000,http://127.0.0.1:3000,http://127.0.0.1:8000" +) +CORS_ORIGINS = [origin.strip() for origin in CORS_ORIGINS_STR.split(",")] + +# In development, allow all localhost origins +if API_ENV == "development": + CORS_ORIGINS.extend([ + "http://localhost:*", + "http://127.0.0.1:*", + ]) + +# ============================================================================ +# LOGGING CONFIGURATION +# ============================================================================ + +# Configure logging format based on environment +if API_ENV == "production": + log_format = '{"timestamp": "%(asctime)s", "level": "%(levelname)s", "logger": "%(name)s", "message": "%(message)s"}' +else: + log_format = "%(asctime)s โ”‚ %(levelname)-8s โ”‚ %(name)s โ”‚ %(message)s" + +logging.basicConfig( + level=logging.INFO if API_ENV == "production" else logging.DEBUG, + format=log_format, + datefmt="%Y-%m-%d %H:%M:%S", + handlers=[ + logging.StreamHandler(sys.stdout), + ], +) + +# Reduce noise 
# Reduce noise from third-party loggers
logging.getLogger("neo4j").setLevel(logging.WARNING)
logging.getLogger("uvicorn.access").setLevel(logging.WARNING)
logging.getLogger("httpx").setLevel(logging.WARNING)

logger = logging.getLogger("panama_api")

# ============================================================================
# IMPORT APPLICATION MODULES
# ============================================================================
# Each import is guarded so the API can still start in a limited mode when an
# optional module is missing; the *_AVAILABLE flags record the outcome.

# Import database module
try:
    from app.database import Neo4jDatabase, health_check, HealthCheckResult
    DATABASE_AVAILABLE = True
except ImportError as e:
    logger.warning("Database module not available: %s", e)
    DATABASE_AVAILABLE = False

# Import routers
try:
    from app.entities import router as entities_router
    ENTITIES_ROUTER_AVAILABLE = True
except ImportError as e:
    logger.warning("Entities router not available: %s", e)
    ENTITIES_ROUTER_AVAILABLE = False

# Import models
try:
    from app.models import ErrorResponse, HealthCheckResponse, HealthStatus
    MODELS_AVAILABLE = True
except ImportError as e:
    logger.warning("Models not available: %s", e)
    MODELS_AVAILABLE = False

# ============================================================================
# APPLICATION LIFESPAN
# ============================================================================

@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
    """
    FastAPI lifespan context manager.

    Handles startup and shutdown events:
        - Startup: Initialize Neo4j driver connection
        - Shutdown: Close driver and release resources

    In production a failed database connection aborts the process with exit
    status 1; in other environments the API starts and logs a warning.
    """
    banner = "=" * 60
    rule = "-" * 60

    # ==================== STARTUP ====================
    logger.info(banner)
    logger.info("Starting %s", API_TITLE)
    logger.info("Version: %s", API_VERSION)
    logger.info("Environment: %s", API_ENV)
    logger.info(banner)

    startup_success = True

    # Initialize Neo4j driver
    if DATABASE_AVAILABLE:
        try:
            await Neo4jDatabase.init()
            logger.info("✓ Neo4j database connection established")
        except Exception as e:
            logger.error("✗ Failed to connect to Neo4j: %s", e)
            startup_success = False

            # In production, fail fast if database is unavailable
            if API_ENV == "production":
                logger.critical("Cannot start in production without database connection")
                sys.exit(1)
    else:
        logger.warning("⚠ Database module not available - running in limited mode")

    if startup_success:
        logger.info("✓ API startup complete")
    else:
        logger.warning("⚠ API started with warnings")

    logger.info(rule)

    # ==================== YIELD ====================
    yield

    # ==================== SHUTDOWN ====================
    logger.info(rule)
    logger.info("Shutting down API...")

    # Close Neo4j driver. Best effort: a failure here must not mask shutdown.
    if DATABASE_AVAILABLE:
        try:
            await Neo4jDatabase.close()
            logger.info("✓ Neo4j connection closed")
        except Exception as e:
            logger.error("Error closing Neo4j connection: %s", e)

    logger.info("✓ API shutdown complete")
    logger.info(banner)
# ============================================================================
# FASTAPI APPLICATION
# ============================================================================

app = FastAPI(
    title=API_TITLE,
    description=API_DESCRIPTION,
    version=API_VERSION,
    docs_url="/docs",
    redoc_url="/redoc",
    openapi_url="/openapi.json",
    lifespan=lifespan,
    license_info={
        "name": "MIT License",
        "url": "https://opensource.org/licenses/MIT",
    },
    contact={
        "name": "Panama Papers Analysis Team",
        "url": "https://github.com/example/panama-papers-api",
    },
    openapi_tags=[
        {
            "name": "health",
            "description": "Health check and status endpoints",
        },
        {
            "name": "entities",
            "description": "Entity search, lookup, and analysis",
        },
        {
            "name": "ownership",
            "description": "Beneficial ownership tracing",
        },
        {
            "name": "network",
            "description": "Network analysis and graph algorithms",
        },
        {
            "name": "risk",
            "description": "Risk assessment and red flag detection",
        },
    ],
)


# ============================================================================
# MIDDLEWARE
# ============================================================================

# allow_origins entries are compared literally, so "any localhost port" has to
# be a regex. Only enabled in development; other environments rely solely on
# the explicit CORS_ORIGINS list.
_DEV_ORIGIN_REGEX = r"http://(localhost|127\.0\.0\.1)(:\d+)?"

# CORS Middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=CORS_ORIGINS,
    allow_origin_regex=_DEV_ORIGIN_REGEX if API_ENV == "development" else None,
    allow_credentials=True,
    allow_methods=["GET", "POST", "PUT", "DELETE", "OPTIONS", "PATCH"],
    allow_headers=["*"],
    expose_headers=["X-Request-ID", "X-Response-Time"],
    max_age=600,  # Cache preflight requests for 10 minutes
)

# GZip Middleware for response compression
app.add_middleware(
    GZipMiddleware,
    minimum_size=1000,  # Only compress responses > 1KB
)


class RequestLoggingMiddleware(BaseHTTPMiddleware):
    """
    Middleware for logging HTTP requests and responses.

    Logs:
        - Request method and path
        - Response status code
        - Request duration
        - Request ID (client-supplied via X-Request-ID, otherwise generated)

    Also sets the X-Request-ID and X-Response-Time response headers.
    """

    @staticmethod
    def _resolve_request_id(request: Request) -> str:
        """Return the caller's X-Request-ID, or a fresh unique one."""
        import uuid  # local import keeps this block self-contained

        supplied = request.headers.get("X-Request-ID", "").strip()
        # A millisecond timestamp is shared by concurrent requests; a random
        # UUID is not. A blank header is treated as absent.
        return supplied or f"req_{uuid.uuid4().hex}"

    async def dispatch(
        self,
        request: Request,
        call_next: Callable[[Request], Response],
    ) -> Response:
        request_id = self._resolve_request_id(request)
        method = request.method
        path = request.url.path

        # perf_counter is monotonic, so durations survive clock adjustments.
        start_time = time.perf_counter()

        logger.info("→ %s %s [%s]", method, path, request_id)

        try:
            response = await call_next(request)
        except Exception as e:
            duration = (time.perf_counter() - start_time) * 1000
            # Truncate the message so one huge error cannot flood the log.
            logger.error(
                "✗ %s %s [%s] ERROR: %s (%.2fms)",
                method,
                path,
                request_id,
                str(e)[:100],
                duration,
            )
            raise

        duration = (time.perf_counter() - start_time) * 1000

        # Add headers
        response.headers["X-Request-ID"] = request_id
        response.headers["X-Response-Time"] = f"{duration:.2f}ms"

        # Client and server errors are logged at WARNING, the rest at INFO.
        log_level = logging.INFO if response.status_code < 400 else logging.WARNING
        logger.log(
            log_level,
            "← %s %s [%s] %s (%.2fms)",
            method,
            path,
            request_id,
            response.status_code,
            duration,
        )

        return response


# Add request logging middleware
app.add_middleware(RequestLoggingMiddleware)


# ============================================================================
# EXCEPTION HANDLERS
# ============================================================================
# Starlette raises its own HTTPException for unknown routes and disallowed
# methods. FastAPI's HTTPException subclasses it, so registering the handler on
# the base class covers both and keeps 404/405 responses in the same format.
from starlette.exceptions import HTTPException as StarletteHTTPException


def _error_content(
    request: Request,
    status_code: int,
    error: str,
    detail: Any,
    **extra: Any,
) -> dict[str, Any]:
    """Build the error payload shared by every exception handler."""
    return {
        "status_code": status_code,
        "error": error,
        "detail": detail,
        **extra,
        "timestamp": datetime.utcnow().isoformat(),
        "path": str(request.url.path),
    }


@app.exception_handler(StarletteHTTPException)
async def http_exception_handler(
    request: Request,
    exc: StarletteHTTPException,
) -> JSONResponse:
    """
    Handle HTTPException with consistent error format.

    Headers attached to the exception (for example WWW-Authenticate or
    Retry-After) are forwarded to the response.
    """
    return JSONResponse(
        status_code=exc.status_code,
        content=_error_content(
            request,
            exc.status_code,
            exc.detail if isinstance(exc.detail, str) else "Error",
            exc.detail,
        ),
        headers=getattr(exc, "headers", None),
    )


@app.exception_handler(RequestValidationError)
async def validation_exception_handler(
    request: Request,
    exc: RequestValidationError,
) -> JSONResponse:
    """
    Handle request validation errors with detailed error messages.

    Each validation error is flattened to its dotted field location, message
    and error type.
    """
    errors = [
        {
            "field": ".".join(str(loc) for loc in error["loc"]),
            "message": error["msg"],
            "type": error["type"],
        }
        for error in exc.errors()
    ]

    return JSONResponse(
        status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
        content=_error_content(
            request,
            422,
            "Validation Error",
            "Request validation failed",
            errors=errors,
        ),
    )


@app.exception_handler(Exception)
async def general_exception_handler(request: Request, exc: Exception) -> JSONResponse:
    """
    Handle unexpected exceptions.

    The exception is always logged with its traceback. Only the development
    environment includes the exception type and message in the response; every
    other environment returns a generic message.
    """
    logger.exception("Unhandled exception on %s: %s", request.url.path, exc)

    detail = "An internal error occurred"
    if API_ENV == "development":
        detail = f"{type(exc).__name__}: {str(exc)}"

    return JSONResponse(
        status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
        content=_error_content(request, 500, "Internal Server Error", detail),
    )


# ============================================================================
# ROOT ENDPOINTS
# ============================================================================

@app.get(
    "/",
    tags=["health"],
    summary="API Information",
    response_model=None,
)
async def root() -> dict[str, Any]:
    """
    API welcome endpoint with service information.

    Returns:
        API metadata including version, environment, and documentation links
    """
    return {
        "service": API_TITLE,
        "version": API_VERSION,
        "environment": API_ENV,
        "status": "running",
        "timestamp": datetime.utcnow().isoformat(),
        "documentation": {
            "swagger_ui": "/docs",
            "redoc": "/redoc",
            "openapi_schema": "/openapi.json",
        },
        "endpoints": {
            "health": "/health",
            "entity_by_id": "/entities/id/{entity_id}",
            "search": "/entities/search",
            "ownership": "/entities/id/{entity_id}/ownership-path",
            "network": "/entities/id/{entity_id}/network",
            "risk": "/entities/id/{entity_id}/risk",
            "influential": "/entities/top/influential",
            "connected": "/entities/top/connected",
            "by_jurisdiction": "/entities/by-jurisdiction/{jurisdiction_code}",
        },
        "data_source": "ICIJ Offshore Leaks Database",
    }
def _health_error_detail(message: str | None) -> str | None:
    """Hide internal error text in production, as the generic handler does."""
    if API_ENV == "production" and message:
        return "Database health check failed"
    return message


@app.get(
    "/health",
    tags=["health"],
    summary="Health Check",
    response_model=None,
    responses={
        200: {"description": "Service is healthy"},
        503: {"description": "Service is unhealthy"},
    },
)
async def health_check_endpoint() -> JSONResponse:
    """
    System health check endpoint.

    Verifies:
        - API is running
        - Neo4j database connectivity
        - GDS plugin availability (as reported by the database health check)

    Returns:
        Health status with component details. Status 503 when the database is
        unreachable or the check itself fails; 200 with status "degraded" when
        the database module could not be imported.
    """
    health_response: dict[str, Any] = {
        "status": "healthy",
        "timestamp": datetime.utcnow().isoformat(),
        "version": API_VERSION,
        "environment": API_ENV,
        "checks": {
            "api": True,
        },
    }

    if not DATABASE_AVAILABLE:
        health_response["checks"]["neo4j"] = None
        health_response["status"] = "degraded"
        health_response["warning"] = "Database module not available"
        return JSONResponse(status_code=status.HTTP_200_OK, content=health_response)

    status_code = status.HTTP_200_OK

    try:
        db_health: HealthCheckResult = await health_check(detailed=True)

        health_response["checks"]["neo4j"] = db_health.neo4j_connected
        health_response["neo4j"] = {
            "connected": db_health.neo4j_connected,
            "version": db_health.neo4j_version,
            "edition": db_health.neo4j_edition,
            "database": db_health.database,
            "latency_ms": db_health.latency_ms,
            "gds_available": db_health.gds_available,
            "gds_version": db_health.gds_version,
        }

        # `is not None` so an uptime of exactly 0 is still reported.
        if db_health.uptime_seconds is not None:
            health_response["uptime_seconds"] = round(db_health.uptime_seconds, 2)

        if not db_health.neo4j_connected:
            health_response["status"] = "unhealthy"
            health_response["error"] = _health_error_detail(db_health.error)
            status_code = status.HTTP_503_SERVICE_UNAVAILABLE

    except Exception as e:
        logger.error("Health check failed: %s", e)
        health_response["status"] = "unhealthy"
        health_response["checks"]["neo4j"] = False
        health_response["error"] = _health_error_detail(str(e))
        status_code = status.HTTP_503_SERVICE_UNAVAILABLE

    return JSONResponse(status_code=status_code, content=health_response)


@app.get(
    "/ready",
    tags=["health"],
    summary="Readiness Check",
    response_model=None,
)
async def readiness_check() -> JSONResponse:
    """
    Kubernetes-style readiness probe.

    Returns 200 with {"ready": true} if the service is ready to accept
    traffic, otherwise 503 with {"ready": false}. When the database module is
    not available the service is reported as ready.
    """
    ready = True

    if DATABASE_AVAILABLE:
        try:
            db_health = await health_check(detailed=False)
            ready = bool(db_health.neo4j_connected)
        except Exception:
            # DEBUG level: probes run every few seconds and must not flood
            # the log, but the cause should be discoverable.
            logger.debug("Readiness probe failed", exc_info=True)
            ready = False

    return JSONResponse(
        status_code=(
            status.HTTP_200_OK if ready else status.HTTP_503_SERVICE_UNAVAILABLE
        ),
        content={"ready": ready},
    )


@app.get(
    "/live",
    tags=["health"],
    summary="Liveness Check",
    response_model=None,
)
async def liveness_check() -> dict[str, Any]:
    """
    Kubernetes-style liveness probe.

    Returns 200 if the service process is alive.
    """
    return {"alive": True}


# ============================================================================
# API INFO ENDPOINTS
# ============================================================================

@app.get(
    "/info",
    tags=["health"],
    summary="API Statistics",
    response_model=None,
)
async def api_info() -> dict[str, Any]:
    """
    Get API runtime information and statistics.

    Returns:
        API title/version/environment, the Python version and a timestamp,
        plus database uptime when the driver is initialized.
    """
    info: dict[str, Any] = {
        "api": {
            "title": API_TITLE,
            "version": API_VERSION,
            "environment": API_ENV,
        },
        "python_version": sys.version,
        "timestamp": datetime.utcnow().isoformat(),
    }

    # Add database info if available
    if DATABASE_AVAILABLE and Neo4jDatabase.is_initialized():
        uptime = Neo4jDatabase.get_uptime()
        # `is not None` so an uptime of exactly 0 is still reported.
        if uptime is not None:
            info["database"] = {
                "connected": True,
                "uptime_seconds": round(uptime, 2),
            }

    return info
# ============================================================================
# INCLUDE ROUTERS
# ============================================================================

# Entity routes
if ENTITIES_ROUTER_AVAILABLE:
    app.include_router(
        entities_router,
        tags=["entities"],
    )
    logger.info("✓ Entities router loaded")
else:
    logger.warning("⚠ Entities router not available")

# Network routes (placeholder - create if needed)
# if NETWORK_ROUTER_AVAILABLE:
#     app.include_router(
#         network_router,
#         prefix="/network",
#         tags=["network"],
#     )
#     logger.info("✓ Network router loaded")


# ============================================================================
# DEVELOPMENT SERVER
# ============================================================================

if __name__ == "__main__":
    import uvicorn

    # The legacy API_* names win when set, so existing deployments behave as
    # before; otherwise the FASTAPI_* names from .env.example are honoured.
    host = os.getenv("API_HOST") or os.getenv("FASTAPI_HOST") or "0.0.0.0"
    port = int(os.getenv("API_PORT") or os.getenv("FASTAPI_PORT") or "8000")

    # FASTAPI_RELOAD overrides the environment-based default when present.
    reload_setting = os.getenv("FASTAPI_RELOAD")
    if reload_setting is None:
        reload_enabled = API_ENV == "development"
    else:
        reload_enabled = reload_setting.strip().lower() in {"1", "true", "yes", "on"}

    # Development server configuration
    uvicorn_config = {
        "app": "main:app",
        "host": host,
        "port": port,
        "reload": reload_enabled,
        "reload_dirs": ["app"] if reload_enabled else None,
        "log_level": "debug" if API_ENV == "development" else "info",
        "access_log": API_ENV == "development",
    }

    logger.info(
        "Starting development server on %s:%s",
        uvicorn_config["host"],
        uvicorn_config["port"],
    )

    uvicorn.run(**uvicorn_config)
+ +Model Categories: + - Entity Models: Core offshore entities (companies, trusts, funds) + - Person Models: Natural persons (beneficial owners, officers) + - Relationship Models: Ownership and control relationships + - Path Query Models: Graph traversal requests/responses + - Network Analysis Models: GDS algorithm results + - Search Models: Full-text and filtered search + - Error Models: Standardized error responses + +Pydantic Version: 2.x (with model_validator, field_validator) +Python Version: 3.11+ + +Usage: + from models import EntityResponse, PathQuery, OwnershipRelation + + entity = EntityResponse( + entity_id="ENT-001", + name="Acme Holdings Ltd", + jurisdiction_code="BVI", + entity_type="Company", + status="Active" + ) +""" + +from __future__ import annotations + +from datetime import date, datetime +from enum import Enum +from typing import Annotated, Any, Optional + +from pydantic import ( + BaseModel, + ConfigDict, + Field, + field_validator, + model_validator, +) + + +# ============================================================================ +# ENUMS +# ============================================================================ + +class EntityType(str, Enum): + """Types of offshore entities.""" + COMPANY = "Company" + TRUST = "Trust" + FUND = "Fund" + FOUNDATION = "Foundation" + PARTNERSHIP = "Partnership" + OTHER = "Other" + UNKNOWN = "Unknown" + + +class EntityStatus(str, Enum): + """Entity lifecycle status.""" + ACTIVE = "Active" + INACTIVE = "Inactive" + DISSOLVED = "Dissolved" + STRUCK_OFF = "Struck Off" + UNKNOWN = "Unknown" + + +class RelationshipType(str, Enum): + """Types of relationships between nodes.""" + OWNS = "OWNS" + CONTROLS = "CONTROLS" + INVOLVED_IN = "INVOLVED_IN" + HAS_ADDRESS = "HAS_ADDRESS" + REGISTERED_IN = "REGISTERED_IN" + CREATED_BY = "CREATED_BY" + CONNECTED_TO = "CONNECTED_TO" + RELATED_TO = "RELATED_TO" + + +class RiskLevel(str, Enum): + """Risk classification levels.""" + LOW = "LOW" + MEDIUM = "MEDIUM" + HIGH = 
"HIGH" + CRITICAL = "CRITICAL" + UNKNOWN = "UNKNOWN" + + +class OfficerRole(str, Enum): + """Roles that officers can hold.""" + DIRECTOR = "Director" + SECRETARY = "Secretary" + NOMINEE_DIRECTOR = "Nominee Director" + NOMINEE_SHAREHOLDER = "Nominee Shareholder" + SHAREHOLDER = "Shareholder" + BENEFICIARY = "Beneficiary" + PROTECTOR = "Protector" + SETTLOR = "Settlor" + AUTHORIZED_SIGNATORY = "Authorized Signatory" + POWER_OF_ATTORNEY = "Power of Attorney" + OTHER = "Other" + + +# ============================================================================ +# BASE CONFIGURATION +# ============================================================================ + +class BaseModelConfig(BaseModel): + """Base model with common configuration.""" + + model_config = ConfigDict( + from_attributes=True, # Support ORM models (SQLAlchemy, etc.) + populate_by_name=True, # Allow population by field name or alias + str_strip_whitespace=True, # Strip whitespace from strings + validate_assignment=True, # Validate on attribute assignment + extra="ignore", # Ignore extra fields during initialization + ) + + +# ============================================================================ +# ENTITY MODELS +# ============================================================================ + +class EntityBase(BaseModelConfig): + """ + Base entity model representing offshore entities. + + Entities include companies, trusts, funds, and foundations + registered in offshore jurisdictions. + + Attributes: + entity_id: Unique ICIJ identifier (e.g., "10000001") + name: Registered legal name + jurisdiction_code: ISO code or custom (e.g., "BVI", "PAN") + entity_type: Classification (Company, Trust, Fund, etc.) + status: Lifecycle status (Active, Dissolved, etc.) 
+ """ + + entity_id: str = Field( + ..., + min_length=1, + max_length=50, + description="Unique entity identifier from ICIJ database", + examples=["10000001", "ENT-BVI-2010-001"] + ) + + name: str = Field( + ..., + min_length=1, + max_length=500, + description="Registered legal name of the entity", + examples=["Acme Holdings Ltd", "Global Ventures Inc"] + ) + + jurisdiction_code: Optional[str] = Field( + default=None, + max_length=10, + description="Jurisdiction of registration (ISO or custom code)", + examples=["BVI", "PAN", "CYM", "SGP"] + ) + + entity_type: EntityType = Field( + default=EntityType.UNKNOWN, + description="Type of offshore entity" + ) + + status: EntityStatus = Field( + default=EntityStatus.UNKNOWN, + description="Current lifecycle status" + ) + + @field_validator("jurisdiction_code") + @classmethod + def uppercase_jurisdiction(cls, v: Optional[str]) -> Optional[str]: + """Ensure jurisdiction codes are uppercase.""" + return v.upper() if v else None + + @field_validator("name") + @classmethod + def clean_name(cls, v: str) -> str: + """Clean and normalize entity name.""" + # Remove excessive whitespace + return " ".join(v.split()) + + +class EntityCreate(EntityBase): + """ + Model for creating new entities. + + Extends EntityBase with optional creation-specific fields. + """ + + incorporation_date: Optional[date] = Field( + default=None, + description="Date of entity incorporation" + ) + + source: str = Field( + default="Panama Papers", + description="Data source (e.g., 'Panama Papers', 'Paradise Papers')" + ) + + original_name: Optional[str] = Field( + default=None, + max_length=500, + description="Name in original script (non-Latin)" + ) + + +class EntityUpdate(BaseModelConfig): + """ + Model for updating existing entities. + + All fields optional - only provided fields are updated. 
+ """ + + name: Optional[str] = Field(default=None, max_length=500) + jurisdiction_code: Optional[str] = Field(default=None, max_length=10) + entity_type: Optional[EntityType] = None + status: Optional[EntityStatus] = None + inactivation_date: Optional[date] = None + struck_off_date: Optional[date] = None + + +class EntityResponse(EntityBase): + """ + Entity response model with analytics data. + + Extends EntityBase with computed properties from GDS algorithms + and additional metadata. + + Attributes: + pagerank_score: Influence score from PageRank algorithm + community_id: Community cluster assignment from Louvain + degree_centrality: Number of direct connections + risk_score: Calculated risk score (0-100) + incorporation_date: Date of incorporation + """ + + # Analytics properties (from GDS algorithms) + pagerank_score: Optional[float] = Field( + default=None, + ge=0, + description="PageRank influence score (higher = more influential)" + ) + + community_id: Optional[int] = Field( + default=None, + description="Community cluster ID from Louvain algorithm" + ) + + degree_centrality: Optional[int] = Field( + default=None, + ge=0, + description="Number of direct connections" + ) + + betweenness_score: Optional[float] = Field( + default=None, + ge=0, + description="Betweenness centrality score" + ) + + # Risk assessment + risk_score: Optional[float] = Field( + default=None, + ge=0, + le=100, + description="Calculated risk score (0-100, higher = riskier)" + ) + + risk_level: Optional[RiskLevel] = Field( + default=None, + description="Risk classification" + ) + + # Additional metadata + incorporation_date: Optional[date] = Field( + default=None, + description="Date of incorporation" + ) + + inactivation_date: Optional[date] = Field( + default=None, + description="Date entity became inactive" + ) + + source: Optional[str] = Field( + default=None, + description="Data source" + ) + + owner_count: Optional[int] = Field( + default=None, + ge=0, + description="Number of 
direct owners" + ) + + subsidiary_count: Optional[int] = Field( + default=None, + ge=0, + description="Number of owned entities" + ) + + +class EntitySummary(BaseModelConfig): + """ + Minimal entity summary for lists and search results. + + Lightweight model for bulk responses. + """ + + entity_id: str + name: str + jurisdiction_code: Optional[str] = None + entity_type: Optional[str] = None + status: Optional[str] = None + risk_level: Optional[RiskLevel] = None + + +# ============================================================================ +# PERSON MODELS +# ============================================================================ + +class PersonBase(BaseModelConfig): + """ + Base model for natural persons. + + Persons include beneficial owners, directors, shareholders, + and other individuals connected to offshore entities. + """ + + person_id: str = Field( + ..., + min_length=1, + max_length=50, + description="Unique person identifier" + ) + + full_name: str = Field( + ..., + min_length=1, + max_length=255, + description="Full name as recorded" + ) + + nationality: Optional[str] = Field( + default=None, + max_length=10, + description="Nationality (ISO country code)" + ) + + country_of_residence: Optional[str] = Field( + default=None, + max_length=10, + description="Country of residence (ISO code)" + ) + + is_pep: bool = Field( + default=False, + description="Politically Exposed Person flag" + ) + + @field_validator("nationality", "country_of_residence") + @classmethod + def uppercase_country(cls, v: Optional[str]) -> Optional[str]: + """Ensure country codes are uppercase.""" + return v.upper() if v else None + + +class PersonCreate(PersonBase): + """Model for creating new persons.""" + + first_name: Optional[str] = Field(default=None, max_length=100) + last_name: Optional[str] = Field(default=None, max_length=100) + date_of_birth: Optional[date] = None + pep_details: Optional[str] = Field( + default=None, + max_length=500, + description="Political 
role/position if PEP" + ) + source: str = Field(default="Panama Papers") + + +class PersonResponse(PersonBase): + """Person response with analytics and connections.""" + + first_name: Optional[str] = None + last_name: Optional[str] = None + date_of_birth: Optional[date] = None + pep_details: Optional[str] = None + + # Analytics + pagerank_score: Optional[float] = None + community_id: Optional[int] = None + + # Connection counts + entity_count: Optional[int] = Field( + default=None, + description="Number of connected entities" + ) + + # Risk + risk_score: Optional[float] = Field(default=None, ge=0, le=100) + risk_level: Optional[RiskLevel] = None + + +class PersonSummary(BaseModelConfig): + """Minimal person summary.""" + + person_id: str + full_name: str + nationality: Optional[str] = None + is_pep: bool = False + + +# ============================================================================ +# RELATIONSHIP MODELS +# ============================================================================ + +class RelationshipBase(BaseModelConfig): + """ + Base relationship model. + + Represents directed relationships between nodes in the graph. 
+ """ + + source_id: str = Field( + ..., + description="Source node identifier" + ) + + target_id: str = Field( + ..., + description="Target node identifier" + ) + + relationship_type: RelationshipType = Field( + ..., + description="Type of relationship" + ) + + start_date: Optional[date] = Field( + default=None, + description="Relationship start date" + ) + + end_date: Optional[date] = Field( + default=None, + description="Relationship end date" + ) + + status: Optional[str] = Field( + default="Active", + description="Relationship status" + ) + + @model_validator(mode="after") + def validate_dates(self) -> "RelationshipBase": + """Ensure end_date is after start_date.""" + if self.start_date and self.end_date: + if self.end_date < self.start_date: + raise ValueError("end_date must be after start_date") + return self + + +class OwnershipRelation(RelationshipBase): + """ + Ownership relationship with percentage. + + Represents direct or indirect ownership stakes between entities + or from persons to entities. + """ + + relationship_type: RelationshipType = Field( + default=RelationshipType.OWNS, + description="Type of relationship (defaults to OWNS)" + ) + + ownership_percentage: Optional[float] = Field( + default=None, + ge=0, + le=100, + description="Ownership percentage (0-100)" + ) + + share_count: Optional[int] = Field( + default=None, + ge=0, + description="Number of shares held" + ) + + share_class: Optional[str] = Field( + default=None, + max_length=50, + description="Class of shares (A, B, Ordinary, Preferred)" + ) + + is_beneficial: bool = Field( + default=False, + description="True if beneficial (vs. legal) ownership" + ) + + is_nominee: bool = Field( + default=False, + description="True if nominee arrangement" + ) + + acquisition_date: Optional[date] = Field( + default=None, + description="Date ownership was acquired" + ) + + +class ControlRelation(RelationshipBase): + """ + Control relationship (non-ownership control). 
+ + Represents de facto control through voting agreements, + board control, or contractual arrangements. + """ + + relationship_type: RelationshipType = Field( + default=RelationshipType.CONTROLS + ) + + control_type: Optional[str] = Field( + default=None, + description="Type of control (Board Majority, Voting Agreement, etc.)" + ) + + control_percentage: Optional[float] = Field( + default=None, + ge=0, + le=100, + description="Effective control percentage" + ) + + evidence_strength: Optional[str] = Field( + default=None, + description="Evidence level (Confirmed, Probable, Suspected)" + ) + + +class InvolvementRelation(RelationshipBase): + """ + Officer/role involvement in an entity. + + Represents persons serving as directors, secretaries, + or other corporate roles. + """ + + relationship_type: RelationshipType = Field( + default=RelationshipType.INVOLVED_IN + ) + + role: OfficerRole = Field( + ..., + description="Role held in the entity" + ) + + is_nominee: bool = Field( + default=False, + description="Acting as nominee" + ) + + appointment_date: Optional[date] = Field( + default=None, + description="Date of appointment" + ) + + resignation_date: Optional[date] = Field( + default=None, + description="Date of resignation" + ) + + +class RelationshipResponse(RelationshipBase): + """ + Relationship response with additional context. 
+ """ + + source_name: Optional[str] = Field( + default=None, + description="Name of source node" + ) + + source_type: Optional[str] = Field( + default=None, + description="Type of source node (Entity, Person, etc.)" + ) + + target_name: Optional[str] = Field( + default=None, + description="Name of target node" + ) + + target_type: Optional[str] = Field( + default=None, + description="Type of target node" + ) + + # Additional properties depending on relationship type + ownership_percentage: Optional[float] = None + role: Optional[str] = None + is_nominee: Optional[bool] = None + + +# ============================================================================ +# PATH QUERY MODELS +# ============================================================================ + +class PathQuery(BaseModelConfig): + """ + Query parameters for graph path finding. + + Used to find ownership chains, control paths, and connections + between entities and beneficial owners. + + Attributes: + source_entity_id: Starting entity for path search + target_entity_id: Optional destination (None = find all paths) + max_depth: Maximum path length (1-6 hops) + relationship_types: Filter by relationship types + include_persons: Include Person nodes in results + """ + + source_entity_id: str = Field( + ..., + description="Source entity ID to start path search" + ) + + target_entity_id: Optional[str] = Field( + default=None, + description="Target entity ID (optional, None = find all paths)" + ) + + max_depth: int = Field( + default=4, + ge=1, + le=6, + description="Maximum path depth (1-6 hops)" + ) + + min_depth: int = Field( + default=1, + ge=1, + le=6, + description="Minimum path depth" + ) + + relationship_types: Optional[list[RelationshipType]] = Field( + default=None, + description="Filter by relationship types (None = all types)" + ) + + include_persons: bool = Field( + default=True, + description="Include Person nodes in path results" + ) + + include_intermediaries: bool = Field( + default=False, 
+ description="Include Intermediary nodes in results" + ) + + only_active: bool = Field( + default=True, + description="Only include active relationships" + ) + + limit: int = Field( + default=50, + ge=1, + le=500, + description="Maximum number of paths to return" + ) + + @model_validator(mode="after") + def validate_depth_range(self) -> "PathQuery": + """Ensure min_depth <= max_depth.""" + if self.min_depth > self.max_depth: + raise ValueError("min_depth must be <= max_depth") + return self + + +class PathNode(BaseModelConfig): + """ + Node in a path result. + """ + + node_id: str = Field(..., description="Node identifier") + name: str = Field(..., description="Node name") + node_type: str = Field(..., description="Node label (Entity, Person, etc.)") + jurisdiction_code: Optional[str] = None + layer: int = Field(..., ge=0, description="Position in path (0 = source)") + + # Optional analytics + risk_score: Optional[float] = None + is_pep: Optional[bool] = None + + +class PathEdge(BaseModelConfig): + """ + Edge/relationship in a path result. + """ + + source_id: str + target_id: str + relationship_type: str + ownership_percentage: Optional[float] = None + role: Optional[str] = None + layer: int = Field(..., ge=0, description="Position in path") + + +class PathResult(BaseModelConfig): + """ + Single path in response. + """ + + path_id: int = Field(..., description="Path identifier") + depth: int = Field(..., ge=1, description="Path length in hops") + nodes: list[PathNode] = Field(..., description="Ordered list of nodes") + edges: list[PathEdge] = Field(..., description="Ordered list of edges") + effective_ownership: Optional[float] = Field( + default=None, + ge=0, + le=100, + description="Calculated effective ownership percentage" + ) + risk_indicators: list[str] = Field( + default_factory=list, + description="Risk indicators found in path" + ) + + +class PathResponse(BaseModelConfig): + """ + Response model for path queries. 
+ + Contains all paths found matching the query criteria. + """ + + query: PathQuery = Field(..., description="Original query") + path_count: int = Field(..., ge=0, description="Number of paths found") + paths: list[PathResult] = Field(..., description="Path results") + + # Summary statistics + average_depth: Optional[float] = Field( + default=None, + description="Average path depth" + ) + + max_depth_found: Optional[int] = Field( + default=None, + description="Maximum depth in results" + ) + + unique_entities: int = Field( + default=0, + ge=0, + description="Number of unique entities in all paths" + ) + + unique_persons: int = Field( + default=0, + ge=0, + description="Number of unique persons in all paths" + ) + + pep_count: int = Field( + default=0, + ge=0, + description="Number of PEPs found in paths" + ) + + tax_haven_count: int = Field( + default=0, + ge=0, + description="Number of tax haven jurisdictions crossed" + ) + + execution_time_ms: Optional[float] = Field( + default=None, + description="Query execution time in milliseconds" + ) + + +# ============================================================================ +# NETWORK ANALYSIS MODELS +# ============================================================================ + +class CommunityMember(BaseModelConfig): + """Member of a community cluster.""" + + node_id: str + name: str + node_type: str + jurisdiction_code: Optional[str] = None + pagerank_score: Optional[float] = None + is_pep: Optional[bool] = None + + +class CommunityResponse(BaseModelConfig): + """ + Community detection result. + + Represents a cluster of related entities identified by + the Louvain algorithm. 
+ + Attributes: + community_id: Unique community identifier + members: List of entities in the community + size: Number of members + internal_density: How connected members are to each other + risk_level: Overall risk assessment + """ + + community_id: int = Field(..., description="Community cluster ID") + + size: int = Field(..., ge=1, description="Number of members") + + members: list[CommunityMember] = Field( + ..., + description="Community members" + ) + + internal_density: Optional[float] = Field( + default=None, + ge=0, + le=1, + description="Internal connection density (0-1)" + ) + + modularity_contribution: Optional[float] = Field( + default=None, + description="Community's contribution to overall modularity" + ) + + # Jurisdiction analysis + jurisdiction_count: int = Field( + default=0, + ge=0, + description="Number of unique jurisdictions" + ) + + jurisdictions: list[str] = Field( + default_factory=list, + description="List of jurisdictions" + ) + + tax_haven_percentage: Optional[float] = Field( + default=None, + ge=0, + le=100, + description="Percentage of members in tax havens" + ) + + # Risk assessment + risk_score: Optional[float] = Field( + default=None, + ge=0, + le=100, + description="Community risk score" + ) + + risk_level: RiskLevel = Field( + default=RiskLevel.UNKNOWN, + description="Risk classification" + ) + + risk_factors: list[str] = Field( + default_factory=list, + description="Identified risk factors" + ) + + # Notable members + pep_count: int = Field(default=0, ge=0) + pep_names: list[str] = Field(default_factory=list) + top_influential: list[str] = Field( + default_factory=list, + description="Top members by PageRank" + ) + + +class InfluenceScore(BaseModelConfig): + """ + Entity influence score from centrality algorithms. + + Represents how influential an entity is within the + offshore network based on PageRank or other metrics. 
+ """ + + entity_id: str = Field(..., description="Entity identifier") + + name: str = Field(..., description="Entity name") + + entity_type: Optional[str] = Field( + default=None, + description="Entity type" + ) + + jurisdiction_code: Optional[str] = Field( + default=None, + description="Jurisdiction" + ) + + # Scores + pagerank_score: float = Field( + ..., + ge=0, + description="PageRank score" + ) + + rank: int = Field( + ..., + ge=1, + description="Rank by influence (1 = most influential)" + ) + + percentile: Optional[float] = Field( + default=None, + ge=0, + le=100, + description="Percentile ranking" + ) + + # Additional centrality scores + degree_centrality: Optional[int] = Field( + default=None, + description="Number of connections" + ) + + betweenness_score: Optional[float] = Field( + default=None, + description="Betweenness centrality" + ) + + eigenvector_score: Optional[float] = Field( + default=None, + description="Eigenvector centrality" + ) + + # Context + community_id: Optional[int] = None + is_tax_haven: Optional[bool] = None + + +class NetworkStats(BaseModelConfig): + """ + Overall network statistics. + """ + + total_entities: int = Field(..., ge=0) + total_persons: int = Field(..., ge=0) + total_relationships: int = Field(..., ge=0) + total_communities: int = Field(..., ge=0) + + # Jurisdiction breakdown + jurisdiction_count: int = Field(default=0, ge=0) + top_jurisdictions: list[dict[str, Any]] = Field(default_factory=list) + + # Risk summary + high_risk_entities: int = Field(default=0, ge=0) + pep_connections: int = Field(default=0, ge=0) + + # Graph metrics + average_degree: Optional[float] = None + graph_density: Optional[float] = None + largest_community_size: Optional[int] = None + + +# ============================================================================ +# SEARCH MODELS +# ============================================================================ + +class SearchQuery(BaseModelConfig): + """ + Search query parameters. 
+ """ + + query: str = Field( + ..., + min_length=2, + max_length=200, + description="Search term" + ) + + search_type: str = Field( + default="all", + description="Search type: all, entity, person, intermediary" + ) + + jurisdiction_code: Optional[str] = Field( + default=None, + description="Filter by jurisdiction" + ) + + entity_type: Optional[EntityType] = Field( + default=None, + description="Filter by entity type" + ) + + status: Optional[EntityStatus] = Field( + default=None, + description="Filter by status" + ) + + is_pep: Optional[bool] = Field( + default=None, + description="Filter for PEPs only" + ) + + min_risk_score: Optional[float] = Field( + default=None, + ge=0, + le=100, + description="Minimum risk score filter" + ) + + limit: int = Field( + default=20, + ge=1, + le=100, + description="Maximum results" + ) + + offset: int = Field( + default=0, + ge=0, + description="Pagination offset" + ) + + +class SearchResult(BaseModelConfig): + """ + Individual search result. + """ + + node_id: str + name: str + node_type: str # Entity, Person, Intermediary + relevance_score: float = Field(..., ge=0, le=1) + jurisdiction_code: Optional[str] = None + status: Optional[str] = None + risk_level: Optional[RiskLevel] = None + is_pep: Optional[bool] = None + + # Snippet/context + matched_field: Optional[str] = None + snippet: Optional[str] = None + + +class SearchResponse(BaseModelConfig): + """ + Search response with results and metadata. 
+ """ + + query: str + total_results: int = Field(..., ge=0) + results: list[SearchResult] + + # Pagination + limit: int + offset: int + has_more: bool + + # Facets (for filtering UI) + jurisdiction_facets: Optional[list[dict[str, Any]]] = None + type_facets: Optional[list[dict[str, Any]]] = None + + execution_time_ms: Optional[float] = None + + +# ============================================================================ +# RED FLAG MODELS +# ============================================================================ + +class RedFlag(BaseModelConfig): + """ + Individual red flag indicator. + """ + + flag_type: str = Field( + ..., + description="Type of red flag" + ) + + severity: RiskLevel = Field( + ..., + description="Severity level" + ) + + description: str = Field( + ..., + description="Human-readable description" + ) + + evidence: Optional[str] = Field( + default=None, + description="Supporting evidence" + ) + + related_entities: list[str] = Field( + default_factory=list, + description="Related entity IDs" + ) + + +class RedFlagAnalysis(BaseModelConfig): + """ + Red flag analysis for an entity. + """ + + entity_id: str + entity_name: str + + overall_risk_score: float = Field(..., ge=0, le=100) + overall_risk_level: RiskLevel + + red_flags: list[RedFlag] + flag_count: int = Field(..., ge=0) + + # Specific risk categories + layering_depth: Optional[int] = None + jurisdiction_count: Optional[int] = None + pep_connections: int = Field(default=0, ge=0) + circular_ownership: bool = False + mass_registration_address: bool = False + + analysis_timestamp: datetime = Field( + default_factory=datetime.utcnow + ) + + +# ============================================================================ +# ERROR & RESPONSE MODELS +# ============================================================================ + +class ErrorDetail(BaseModelConfig): + """ + Detailed error information. 
+ """ + + field: Optional[str] = Field( + default=None, + description="Field that caused the error" + ) + + message: str = Field( + ..., + description="Error message" + ) + + code: Optional[str] = Field( + default=None, + description="Error code" + ) + + +class ErrorResponse(BaseModelConfig): + """ + Standardized error response model. + + Used for all API error responses to ensure consistency. + """ + + status_code: int = Field( + ..., + ge=400, + le=599, + description="HTTP status code" + ) + + error: str = Field( + ..., + description="Error type" + ) + + detail: str = Field( + ..., + description="Human-readable error description" + ) + + errors: list[ErrorDetail] = Field( + default_factory=list, + description="List of detailed errors" + ) + + timestamp: datetime = Field( + default_factory=datetime.utcnow, + description="Error timestamp" + ) + + request_id: Optional[str] = Field( + default=None, + description="Request ID for tracing" + ) + + path: Optional[str] = Field( + default=None, + description="Request path" + ) + + +class HealthStatus(str, Enum): + """Health check status values.""" + HEALTHY = "healthy" + DEGRADED = "degraded" + UNHEALTHY = "unhealthy" + + +class HealthCheckResponse(BaseModelConfig): + """ + Health check response model. + + Reports API and dependency health status. 
+ """ + + status: HealthStatus = Field( + ..., + description="Overall health status" + ) + + api_version: str = Field( + ..., + description="API version" + ) + + neo4j_connection: bool = Field( + ..., + description="Neo4j database connectivity" + ) + + neo4j_version: Optional[str] = Field( + default=None, + description="Neo4j server version" + ) + + gds_available: Optional[bool] = Field( + default=None, + description="GDS plugin availability" + ) + + uptime_seconds: Optional[float] = Field( + default=None, + ge=0, + description="API uptime in seconds" + ) + + timestamp: datetime = Field( + default_factory=datetime.utcnow + ) + + checks: dict[str, bool] = Field( + default_factory=dict, + description="Individual health checks" + ) + + +class PaginationMeta(BaseModelConfig): + """ + Pagination metadata for list responses. + """ + + total: int = Field(..., ge=0, description="Total number of items") + limit: int = Field(..., ge=1, description="Items per page") + offset: int = Field(..., ge=0, description="Current offset") + page: int = Field(..., ge=1, description="Current page number") + total_pages: int = Field(..., ge=0, description="Total pages") + has_next: bool = Field(..., description="Has next page") + has_prev: bool = Field(..., description="Has previous page") + + +class PaginatedResponse(BaseModelConfig): + """ + Generic paginated response wrapper. 
+ """ + + data: list[Any] = Field(..., description="Response data") + pagination: PaginationMeta = Field(..., description="Pagination metadata") + + +# ============================================================================ +# TYPE ALIASES FOR CONVENIENCE +# ============================================================================ + +# Common response types +EntityList = list[EntityResponse] +PersonList = list[PersonResponse] +RelationshipList = list[RelationshipResponse] +CommunityList = list[CommunityResponse] +InfluenceList = list[InfluenceScore] + + +# ============================================================================ +# MODEL EXPORTS +# ============================================================================ + +__all__ = [ + # Enums + "EntityType", + "EntityStatus", + "RelationshipType", + "RiskLevel", + "OfficerRole", + "HealthStatus", + + # Entity Models + "EntityBase", + "EntityCreate", + "EntityUpdate", + "EntityResponse", + "EntitySummary", + + # Person Models + "PersonBase", + "PersonCreate", + "PersonResponse", + "PersonSummary", + + # Relationship Models + "RelationshipBase", + "OwnershipRelation", + "ControlRelation", + "InvolvementRelation", + "RelationshipResponse", + + # Path Models + "PathQuery", + "PathNode", + "PathEdge", + "PathResult", + "PathResponse", + + # Network Analysis + "CommunityMember", + "CommunityResponse", + "InfluenceScore", + "NetworkStats", + + # Search Models + "SearchQuery", + "SearchResult", + "SearchResponse", + + # Red Flag Models + "RedFlag", + "RedFlagAnalysis", + + # Error & Response Models + "ErrorDetail", + "ErrorResponse", + "HealthCheckResponse", + "PaginationMeta", + "PaginatedResponse", + + # Type Aliases + "EntityList", + "PersonList", + "RelationshipList", + "CommunityList", + "InfluenceList", +] diff --git a/app/models/__init__.py b/app/models/__init__.py deleted file mode 100644 index 624d98d..0000000 --- a/app/models/__init__.py +++ /dev/null @@ -1,72 +0,0 @@ -"""Pydantic models for API 
responses.""" - -from pydantic import BaseModel -from typing import List, Optional, Dict, Any - - -class EntityResponse(BaseModel): - """Entity model for API responses.""" - - id: str - name: Optional[str] = None - jurisdiction: Optional[str] = None - node_type: str - - -class OfficerResponse(BaseModel): - """Officer model for API responses.""" - - id: str - name: Optional[str] = None - node_type: str = "Officer" - - -class IntermediaryResponse(BaseModel): - """Intermediary model for API responses.""" - - id: str - name: Optional[str] = None - node_type: str = "Intermediary" - - -class PathNode(BaseModel): - """Node in a path.""" - - id: str - labels: List[str] - properties: Dict[str, Any] - - -class PathRelationship(BaseModel): - """Relationship in a path.""" - - type: str - start_node: str - end_node: str - properties: Dict[str, Any] - - -class OwnershipPath(BaseModel): - """Ownership path from Officer to Entity.""" - - nodes: List[PathNode] - relationships: List[PathRelationship] - length: int - - -class TopIntermediary(BaseModel): - """Top intermediary with connection count.""" - - intermediary_id: str - intermediary_name: Optional[str] - entity_count: int - - -class RedFlag(BaseModel): - """Red flag detection result.""" - - address_id: str - address: Optional[str] - entity_count: int - entities: List[EntityResponse] - diff --git a/app/routers/__init__.py b/app/routers/__init__.py deleted file mode 100644 index 2364578..0000000 --- a/app/routers/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -"""API routers.""" - diff --git a/app/routers/entities.py b/app/routers/entities.py deleted file mode 100644 index dee700a..0000000 --- a/app/routers/entities.py +++ /dev/null @@ -1,61 +0,0 @@ -"""Entity-related API endpoints.""" - -from fastapi import APIRouter, HTTPException, Query -from typing import List - -from app.models import OwnershipPath, PathNode, PathRelationship -from app.services.graph_service import GraphService - -router = APIRouter() -graph_service = 
GraphService() - - -@router.get("/{entity_id}/ownership/paths", response_model=List[OwnershipPath]) -async def get_ownership_paths( - entity_id: str, - max_length: int = Query(default=5, ge=1, le=10, description="Maximum path length"), -): - """ - Trace ownership paths from Officers to a specific Entity. - - Returns all paths connecting Officers to the given Entity, up to the specified max_length. - """ - try: - # Call the correct method name from GraphService - paths_data = graph_service.get_entity_ownership_paths(entity_id, max_depth=max_length) - - # Convert dicts to Pydantic models - paths = [] - for path_dict in paths_data: - # Convert nodes - nodes = [ - PathNode( - id=node_dict["id"], - labels=node_dict["labels"], - properties=node_dict["properties"] - ) - for node_dict in path_dict["nodes"] - ] - - # Convert relationships - relationships = [ - PathRelationship( - type=rel_dict["type"], - start_node=rel_dict["start_node"], - end_node=rel_dict["end_node"], - properties=rel_dict["properties"] - ) - for rel_dict in path_dict["relationships"] - ] - - paths.append( - OwnershipPath( - nodes=nodes, - relationships=relationships, - length=path_dict["length"] - ) - ) - - return paths - except Exception as e: - raise HTTPException(status_code=500, detail=f"Error finding ownership paths: {str(e)}") diff --git a/app/routers/networks.py b/app/routers/networks.py deleted file mode 100644 index 1aca0d2..0000000 --- a/app/routers/networks.py +++ /dev/null @@ -1,152 +0,0 @@ -"""Network analysis API endpoints.""" - -from fastapi import APIRouter, HTTPException, Query -from typing import List, Optional - -from app.models import TopIntermediary, RedFlag, EntityResponse, OwnershipPath, PathNode, PathRelationship -from app.services.graph_service import GraphService - -router = APIRouter() -graph_service = GraphService() - - -@router.get("/intermediaries/top", response_model=List[TopIntermediary]) -async def get_top_intermediaries( - limit: int = Query(default=10, ge=1, le=100, 
description="Number of results to return"), -): - """ - Find intermediaries connected to the most entities. - - Returns the top intermediaries ranked by the number of entities they are connected to. - """ - try: - # Call the correct method name from GraphService - intermediaries_data = graph_service.get_top_intermediaries(limit=limit) - - # Convert dicts to Pydantic models - intermediaries = [ - TopIntermediary( - intermediary_id=item["intermediary_id"], - intermediary_name=item["name"], # Map 'name' to 'intermediary_name' - entity_count=item["entity_count"] - ) - for item in intermediaries_data - ] - - return intermediaries - except Exception as e: - raise HTTPException( - status_code=500, detail=f"Error finding top intermediaries: {str(e)}" - ) - - -@router.get("/redflags", response_model=List[RedFlag]) -async def get_red_flags( - min_entities: int = Query( - default=2, ge=2, description="Minimum number of entities sharing an address" - ), - limit: int = Query(default=50, ge=1, le=200, description="Maximum number of results"), -): - """ - Query for entities sharing the same address (potential shell company ring). - - Detects addresses where multiple entities are registered, which may indicate - shell company networks or suspicious patterns. 
- """ - try: - # Call the correct method name from GraphService - red_flags_data = graph_service.detect_red_flags(min_entities=min_entities, limit=limit) - - # Convert dicts to Pydantic models - red_flags = [] - for flag_dict in red_flags_data: - # Convert sample_entities (list of names) to EntityResponse objects - # Note: The service returns entity names, so we create minimal EntityResponse objects - entities = [ - EntityResponse( - id="", # We don't have the ID from the service - name=entity_name if entity_name else None, - jurisdiction=None, - node_type="Entity" - ) - for entity_name in flag_dict.get("sample_entities", []) - ] - - red_flags.append( - RedFlag( - address_id=flag_dict["address_id"], - address=flag_dict["address"], - entity_count=flag_dict["entity_count"], - entities=entities - ) - ) - - return red_flags - except Exception as e: - raise HTTPException(status_code=500, detail=f"Error finding red flags: {str(e)}") - - -@router.get("/path/shortest", response_model=Optional[OwnershipPath]) -async def get_shortest_path( - start_node_id: str = Query(..., description="ID of the starting node"), - end_node_id: str = Query(..., description="ID of the ending node"), -): - """ - Find the shortest path between two nodes using Neo4j's shortestPath function. - - Returns the shortest path connecting the start and end nodes, or null if no path exists. 
- """ - try: - path_data = graph_service.get_shortest_path(start_node_id, end_node_id) - - if path_data is None: - return None - - # Convert dict to Pydantic model - nodes = [ - PathNode( - id=node_dict["id"], - labels=node_dict["labels"], - properties=node_dict["properties"] - ) - for node_dict in path_data["nodes"] - ] - - relationships = [ - PathRelationship( - type=rel_dict["type"], - start_node=rel_dict["start_node"], - end_node=rel_dict["end_node"], - properties=rel_dict["properties"] - ) - for rel_dict in path_data["relationships"] - ] - - return OwnershipPath( - nodes=nodes, - relationships=relationships, - length=path_data["length"] - ) - except Exception as e: - raise HTTPException(status_code=500, detail=f"Error finding shortest path: {str(e)}") - - -@router.get("/stats/centrality") -async def get_most_connected_officers( - limit: int = Query(default=10, ge=1, le=100, description="Number of results to return"), -): - """ - Find the most connected officers using centrality analysis. - - Returns officers ranked by their degree (number of entities they are connected to). - This is a measure of network centrality - officers with higher degrees are more central - in the offshore network. 
- """ - try: - officers_data = graph_service.get_most_connected_officers(limit=limit) - - return officers_data - except Exception as e: - raise HTTPException( - status_code=500, detail=f"Error finding most connected officers: {str(e)}" - ) diff --git a/app/services/__init__.py b/app/services/__init__.py deleted file mode 100644 index 314b17e..0000000 --- a/app/services/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -"""Business logic and Cypher query services.""" - diff --git a/app/services/graph_service.py b/app/services/graph_service.py deleted file mode 100644 index 55f23ee..0000000 --- a/app/services/graph_service.py +++ /dev/null @@ -1,262 +0,0 @@ -"""Graph Service - Business Logic Layer for Neo4j Queries.""" - -from typing import List, Dict, Any, Optional -from app.database import get_database - - -class GraphService: - """ - The "Brain" of the application. - Contains all business logic and Cypher queries for graph analysis. - """ - - def __init__(self): - """Initialize the graph service with database connection.""" - self.db = get_database() - - def get_entity_ownership_paths(self, entity_id: str, max_depth: int = 4) -> List[Dict[str, Any]]: - """ - Trace how an Officer is connected to a specific Entity. - Uses variable-length paths to find ownership chains. 
- - Args: - entity_id: The ID of the target entity - max_depth: Maximum path depth to search (default: 4) - - Returns: - List of paths, each containing nodes and relationships as JSON-friendly structures - """ - query = """ - MATCH p = (o:Officer)-[:OFFICER_OF*1..$max_depth]->(e:Entity {id: $entity_id}) - RETURN p - ORDER BY length(p) - LIMIT 100 - """ - - paths = [] - with self.db.get_session() as session: - result = session.run(query, entity_id=entity_id, max_depth=max_depth) - - for record in result: - path = record["p"] - - # Extract nodes - nodes = [] - for node in path.nodes: - nodes.append({ - "id": node.get("id", str(node.id)), - "labels": list(node.labels), - "properties": dict(node) - }) - - # Extract relationships - relationships = [] - for rel in path.relationships: - relationships.append({ - "type": rel.type, - "start_node": str(rel.start_node.get("id", rel.start_node.id)), - "end_node": str(rel.end_node.get("id", rel.end_node.id)), - "properties": dict(rel) - }) - - paths.append({ - "nodes": nodes, - "relationships": relationships, - "length": len(path.relationships) - }) - - return paths - - def get_top_intermediaries(self, limit: int = 20) -> List[Dict[str, Any]]: - """ - Find "Hubs" - intermediaries that manage many entities. - These are key players in the offshore network. 
- - Args: - limit: Maximum number of results to return (default: 20) - - Returns: - List of intermediaries with their entity counts, sorted by count descending - """ - query = """ - MATCH (i:Intermediary)-[:INTERMEDIARY_OF]->(e:Entity) - WITH i, count(e) as entity_count - ORDER BY entity_count DESC - LIMIT $limit - RETURN i.id as intermediary_id, - i.name as name, - entity_count - """ - - intermediaries = [] - with self.db.get_session() as session: - result = session.run(query, limit=limit) - - for record in result: - intermediaries.append({ - "intermediary_id": record["intermediary_id"], - "name": record["name"], - "entity_count": record["entity_count"] - }) - - return intermediaries - - def detect_red_flags(self, min_entities: int = 10, limit: int = 50) -> List[Dict[str, Any]]: - """ - Find suspicious clusters - many entities registered at the exact same address. - This is a red flag for potential shell company rings. - - Args: - min_entities: Minimum number of entities sharing an address to flag (default: 10) - limit: Maximum number of results to return (default: 50) - - Returns: - List of suspicious addresses with entity counts and sample entities - """ - query = """ - MATCH (a:Address)<-[:REGISTERED_ADDRESS]-(e:Entity) - WITH a, count(e) as count, collect(e.name) as entities - WHERE count > $min_entities - RETURN a.id as address_id, - a.address as address, - count, - entities[0..5] as sample_entities - ORDER BY count DESC - LIMIT $limit - """ - - red_flags = [] - with self.db.get_session() as session: - result = session.run(query, min_entities=min_entities, limit=limit) - - for record in result: - red_flags.append({ - "address_id": record["address_id"], - "address": record["address"], - "entity_count": record["count"], - "sample_entities": record["sample_entities"] - }) - - return red_flags - - def get_entity_details(self, entity_id: str) -> Optional[Dict[str, Any]]: - """ - Simple lookup for an Entity. 
- Returns the Entity properties plus the name of its Jurisdiction. - - Args: - entity_id: The ID of the entity to lookup - - Returns: - Dictionary with entity details and jurisdiction name, or None if not found - """ - query = """ - MATCH (e:Entity {id: $entity_id}) - OPTIONAL MATCH (e)-[:REGISTERED_IN]->(j:Jurisdiction) - RETURN e, - j.name as jurisdiction_name - LIMIT 1 - """ - - with self.db.get_session() as session: - result = session.run(query, entity_id=entity_id) - record = result.single() - - if record is None: - return None - - entity = record["e"] - return { - "id": entity.get("id", str(entity.id)), - "name": entity.get("name"), - "source": entity.get("source"), - "node_type": entity.get("node_type"), - "jurisdiction_name": record["jurisdiction_name"], - "properties": dict(entity) - } - - def get_shortest_path(self, start_node_id: str, end_node_id: str) -> Optional[Dict[str, Any]]: - """ - Find the shortest path between two nodes using Neo4j's shortestPath function. - - Args: - start_node_id: The ID of the starting node - end_node_id: The ID of the ending node - - Returns: - Dictionary containing the path with nodes and relationships, or None if no path exists - """ - query = """ - MATCH p = shortestPath((a)-[*]-(b)) - WHERE a.id = $start_id AND b.id = $end_id - RETURN p - LIMIT 1 - """ - - with self.db.get_session() as session: - result = session.run(query, start_id=start_node_id, end_id=end_node_id) - record = result.single() - - if record is None: - return None - - path = record["p"] - - # Extract nodes - nodes = [] - for node in path.nodes: - nodes.append({ - "id": node.get("id", str(node.id)), - "labels": list(node.labels), - "properties": dict(node) - }) - - # Extract relationships - relationships = [] - for rel in path.relationships: - relationships.append({ - "type": rel.type, - "start_node": str(rel.start_node.get("id", rel.start_node.id)), - "end_node": str(rel.end_node.get("id", rel.end_node.id)), - "properties": dict(rel) - }) - - return { - 
"nodes": nodes, - "relationships": relationships, - "length": len(path.relationships) - } - - def get_most_connected_officers(self, limit: int = 20) -> List[Dict[str, Any]]: - """ - Find the most connected officers using centrality analysis. - Officers are ranked by their degree (number of entities they are connected to). - - Args: - limit: Maximum number of results to return (default: 20) - - Returns: - List of officers with their connection counts (degree), sorted by degree descending - """ - query = """ - MATCH (o:Officer)-[:OFFICER_OF]->(e:Entity) - WITH o, count(e) as degree - ORDER BY degree DESC - LIMIT $limit - RETURN o.id as officer_id, - o.name as name, - degree - """ - - officers = [] - with self.db.get_session() as session: - result = session.run(query, limit=limit) - - for record in result: - officers.append({ - "officer_id": record["officer_id"], - "name": record["name"], - "degree": record["degree"] - }) - - return officers diff --git a/architecture_diagram.md b/architecture_diagram.md new file mode 100644 index 0000000..90906e9 --- /dev/null +++ b/architecture_diagram.md @@ -0,0 +1,885 @@ +# System Architecture - Panama Papers Neo4j Project + +> **Version:** 1.0 +> **Stack:** Neo4j 5.x + FastAPI + Nginx + Docker +> **Target:** Production-ready offshore financial network analysis platform + +--- + +## High-Level Architecture Diagram + +``` +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ DOCKER NETWORK: panama_network โ”‚ +โ”‚ (bridge mode) โ”‚ +โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค +โ”‚ โ”‚ +โ”‚ 
โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ +โ”‚ โ”‚ BROWSER โ”‚ โ”‚ +โ”‚ โ”‚ (User) โ”‚ โ”‚ +โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ +โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ HTTPS/HTTP :443/:80 โ”‚ +โ”‚ โ–ผ โ”‚ +โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ +โ”‚ โ”‚ NGINX REVERSE PROXY โ”‚ โ”‚ +โ”‚ โ”‚ (nginx:alpine) โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ Ports: โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ 80 โ†’ HTTP (redirect to 443) โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ 443 โ†’ HTTPS (SSL termination) โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ Routes: โ”‚ โ”‚ +โ”‚ โ”‚ /api/* โ†’ fastapi:8000 โ”‚ โ”‚ +โ”‚ โ”‚ /browser โ†’ neo4j:7474 โ”‚ โ”‚ +โ”‚ โ”‚ /docs โ†’ fastapi:8000/docs โ”‚ โ”‚ +โ”‚ โ”‚ /health โ†’ fastapi:8000/health โ”‚ โ”‚ +โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ +โ”‚ โ”‚ โ”‚ +โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ +โ”‚ โ–ผ โ–ผ โ”‚ +โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ +โ”‚ โ”‚ NEO4J BROWSER โ”‚ โ”‚ FASTAPI APPLICATION โ”‚ โ”‚ +โ”‚ โ”‚ (Web UI) โ”‚ โ”‚ (python:3.11-slim) โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ Port: 7474 HTTP โ”‚ โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ Port: 8000 (uvicorn) โ”‚ โ”‚ โ”‚ +โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ Endpoints: โ”‚ โ”‚ โ”‚ 
+โ”‚ โ”‚ โ”‚ โ”‚ GET /api/v1/entities โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ GET /api/v1/persons โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ GET /api/v1/ownership โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ POST /api/v1/search โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ GET /api/v1/graph/{id} โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ GET /health โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ GET /docs (Swagger) โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ Environment Variables: โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ NEO4J_URI=bolt://neo4j:7687 โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ NEO4J_USER=${NEO4J_USER} โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ NEO4J_PASSWORD=${NEO4J_PASS} โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ API_SECRET_KEY=${SECRET} โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ CORS_ORIGINS=["..."] โ”‚ โ”‚ +โ”‚ โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ Bolt Protocol โ”‚ +โ”‚ โ”‚ โ”‚ (encrypted, port 7687) โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ–ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ +โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ–ถ NEO4J 5.x DATABASE โ”‚ โ”‚ +โ”‚ โ”‚ (neo4j:5-enterprise) โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ Ports: โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ 7687 โ†’ Bolt (driver) โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ 7474 โ†’ HTTP (browser) โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ 7473 โ†’ HTTPS (browser) โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ Environment Variables: โ”‚ โ”‚ +โ”‚ โ”‚ NEO4J_AUTH=${USER}/${PASS} โ”‚ โ”‚ +โ”‚ โ”‚ NEO4J_PLUGINS=["apoc","gds"] โ”‚ โ”‚ +โ”‚ โ”‚ NEO4J_dbms_memory_heap_max=2G โ”‚ โ”‚ +โ”‚ โ”‚ NEO4J_dbms_security_auth=true โ”‚ โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚ Volumes: โ”‚ โ”‚ +โ”‚ โ”‚ neo4j_data โ†’ /data โ”‚ 
โ”‚ +โ”‚ โ”‚ neo4j_logs โ†’ /logs โ”‚ โ”‚ +โ”‚ โ”‚ neo4j_import โ†’ /var/lib/.../ โ”‚ โ”‚ +โ”‚ โ”‚ neo4j_plugins โ†’ /plugins โ”‚ โ”‚ +โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ +โ”‚ โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ +``` + +--- + +## Detailed Data Flow Diagram + +``` +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ REQUEST/RESPONSE FLOW โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + + USER NGINX FASTAPI NEO4J + โ”‚ โ”‚ โ”‚ โ”‚ + โ”‚ 1. HTTP Request โ”‚ โ”‚ โ”‚ + โ”‚ GET /api/v1/entities โ”‚ โ”‚ โ”‚ + โ”‚ Authorization: Bearer โ”‚ โ”‚ โ”‚ + โ”‚ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€>โ”‚ โ”‚ โ”‚ + โ”‚ โ”‚ โ”‚ โ”‚ + โ”‚ โ”‚ 2. Proxy Request โ”‚ โ”‚ + โ”‚ โ”‚ X-Real-IP: client โ”‚ โ”‚ + โ”‚ โ”‚ X-Forwarded-For โ”‚ โ”‚ + โ”‚ โ”‚ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€>โ”‚ โ”‚ + โ”‚ โ”‚ โ”‚ โ”‚ + โ”‚ โ”‚ โ”‚ 3. Validate JWT โ”‚ + โ”‚ โ”‚ โ”‚ (decode & verify) โ”‚ + โ”‚ โ”‚ โ”‚โ”€โ”€โ”€โ”€โ”€โ” โ”‚ + โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ + โ”‚ โ”‚ โ”‚<โ”€โ”€โ”€โ”€โ”˜ โ”‚ + โ”‚ โ”‚ โ”‚ โ”‚ + โ”‚ โ”‚ โ”‚ 4. 
Bolt Connection โ”‚ + โ”‚ โ”‚ โ”‚ (neo4j-driver) โ”‚ + โ”‚ โ”‚ โ”‚ โ”‚ + โ”‚ โ”‚ โ”‚ MATCH (e:Entity) โ”‚ + โ”‚ โ”‚ โ”‚ WHERE e.status = โ”‚ + โ”‚ โ”‚ โ”‚ 'Active' โ”‚ + โ”‚ โ”‚ โ”‚ RETURN e LIMIT 100 โ”‚ + โ”‚ โ”‚ โ”‚ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€>โ”‚ + โ”‚ โ”‚ โ”‚ โ”‚ + โ”‚ โ”‚ โ”‚ 5. Execute Query โ”‚ + โ”‚ โ”‚ โ”‚ โ”‚ + โ”‚ โ”‚ โ”‚ 6. Return Records โ”‚ + โ”‚ โ”‚ โ”‚<โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”‚ + โ”‚ โ”‚ โ”‚ โ”‚ + โ”‚ โ”‚ โ”‚ 7. Transform to โ”‚ + โ”‚ โ”‚ โ”‚ Pydantic Models โ”‚ + โ”‚ โ”‚ โ”‚โ”€โ”€โ”€โ”€โ”€โ” โ”‚ + โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ + โ”‚ โ”‚ โ”‚<โ”€โ”€โ”€โ”€โ”˜ โ”‚ + โ”‚ โ”‚ โ”‚ โ”‚ + โ”‚ โ”‚ 8. JSON Response โ”‚ โ”‚ + โ”‚ โ”‚ Content-Type: json โ”‚ โ”‚ + โ”‚ โ”‚<โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”‚ โ”‚ + โ”‚ โ”‚ โ”‚ โ”‚ + โ”‚ 9. Response to User โ”‚ โ”‚ โ”‚ + โ”‚ 200 OK + JSON body โ”‚ โ”‚ โ”‚ + โ”‚<โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”‚ โ”‚ โ”‚ + โ”‚ โ”‚ โ”‚ โ”‚ + + +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ AUTHENTICATION FLOW (JWT) โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + + USER NGINX FASTAPI NEO4J + โ”‚ โ”‚ โ”‚ โ”‚ + โ”‚ 1. POST /api/v1/auth/login โ”‚ โ”‚ + โ”‚ {"username": "analyst", "password": "..."} โ”‚ โ”‚ + โ”‚ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€>โ”‚ โ”‚ + โ”‚ โ”‚ โ”‚ โ”‚ + โ”‚ โ”‚ โ”‚ 2. 
Verify user โ”‚ + โ”‚ โ”‚ โ”‚ (check password โ”‚ + โ”‚ โ”‚ โ”‚ hash in DB or โ”‚ + โ”‚ โ”‚ โ”‚ external IdP) โ”‚ + โ”‚ โ”‚ โ”‚โ”€โ”€โ”€โ”€โ”€โ” โ”‚ + โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ + โ”‚ โ”‚ โ”‚<โ”€โ”€โ”€โ”€โ”˜ โ”‚ + โ”‚ โ”‚ โ”‚ โ”‚ + โ”‚ โ”‚ โ”‚ 3. Generate JWT โ”‚ + โ”‚ โ”‚ โ”‚ {sub: "analyst", โ”‚ + โ”‚ โ”‚ โ”‚ role: "viewer", โ”‚ + โ”‚ โ”‚ โ”‚ exp: timestamp} โ”‚ + โ”‚ โ”‚ โ”‚โ”€โ”€โ”€โ”€โ”€โ” โ”‚ + โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ + โ”‚ โ”‚ โ”‚<โ”€โ”€โ”€โ”€โ”˜ โ”‚ + โ”‚ โ”‚ โ”‚ โ”‚ + โ”‚ 4. Return JWT Token โ”‚ โ”‚ + โ”‚ {"access_token": "eyJ...", "token_type": "bearer"} โ”‚ + โ”‚<โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”‚ โ”‚ + โ”‚ โ”‚ โ”‚ โ”‚ + โ”‚ โ”‚ โ”‚ โ”‚ + โ”‚ 5. Subsequent Requests with Bearer Token โ”‚ โ”‚ + โ”‚ Authorization: Bearer eyJ... โ”‚ โ”‚ + โ”‚ โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€>โ”‚ โ”‚ + โ”‚ โ”‚ โ”‚ โ”‚ + โ”‚ โ”‚ โ”‚ 6. Validate JWT โ”‚ + โ”‚ โ”‚ โ”‚ - Check signature โ”‚ + โ”‚ โ”‚ โ”‚ - Check expiration โ”‚ + โ”‚ โ”‚ โ”‚ - Extract claims โ”‚ + โ”‚ โ”‚ โ”‚โ”€โ”€โ”€โ”€โ”€โ” โ”‚ + โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ + โ”‚ โ”‚ โ”‚<โ”€โ”€โ”€โ”€โ”˜ โ”‚ + โ”‚ โ”‚ โ”‚ โ”‚ + โ”‚ โ”‚ โ”‚ 7. Check RBAC โ”‚ + โ”‚ โ”‚ โ”‚ (role permissions) โ”‚ + โ”‚ โ”‚ โ”‚โ”€โ”€โ”€โ”€โ”€โ” โ”‚ + โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ + โ”‚ โ”‚ โ”‚<โ”€โ”€โ”€โ”€โ”˜ โ”‚ + โ”‚ โ”‚ โ”‚ โ”‚ + โ”‚ โ”‚ โ”‚ 8. 
Process Request โ”‚ + โ”‚ โ”‚ โ”‚ (if authorized) โ”‚ + โ”‚ โ”‚ โ”‚ โ”‚ +``` + +--- + +## Component Descriptions + +### Neo4j Database Service + +| Attribute | Value | +| ------------------ | ---------------------------------------------------------------- | +| **Image** | `neo4j:5-enterprise` (or `neo4j:5-community` for non-commercial) | +| **Container Name** | `panama-neo4j` | +| **Hostname** | `neo4j` (internal DNS) | +| **Restart Policy** | `unless-stopped` | + +**Ports:** +| Port | Protocol | Purpose | +|------|----------|---------| +| 7687 | Bolt (TCP) | Driver connections (encrypted) | +| 7474 | HTTP | Neo4j Browser web UI | +| 7473 | HTTPS | Neo4j Browser (SSL) | + +**Environment Variables:** + +```bash +NEO4J_AUTH=neo4j/${NEO4J_PASSWORD} # Initial admin credentials +NEO4J_PLUGINS=["apoc", "graph-data-science"] # Auto-install plugins +NEO4J_dbms_memory_heap_initial__size=1G # JVM heap min +NEO4J_dbms_memory_heap_max__size=2G # JVM heap max +NEO4J_dbms_memory_pagecache_size=1G # Page cache for graphs +NEO4J_dbms_security_procedures_unrestricted=apoc.*,gds.* +NEO4J_dbms_security_procedures_allowlist=apoc.*,gds.* +NEO4J_dbms_connector_bolt_listen__address=0.0.0.0:7687 +NEO4J_dbms_connector_http_listen__address=0.0.0.0:7474 +``` + +**Volumes:** +| Volume Name | Mount Point | Purpose | +|-------------|-------------|---------| +| `neo4j_data` | `/data` | Database files (persistent) | +| `neo4j_logs` | `/logs` | Query & debug logs | +| `neo4j_import` | `/var/lib/neo4j/import` | CSV import directory | +| `neo4j_plugins` | `/plugins` | Custom plugins (APOC, GDS) | + +**Health Check:** + +```yaml +healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:7474"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 60s +``` + +--- + +### FastAPI Application Service + +| Attribute | Value | +| --------------------- | ------------------------ | +| **Base Image** | `python:3.11-slim` | +| **Container Name** | `panama-api` | +| **Hostname** | `fastapi` (internal 
DNS) | +| **Restart Policy** | `unless-stopped` | +| **Working Directory** | `/app` | + +**Port:** +| Port | Protocol | Purpose | +|------|----------|---------| +| 8000 | HTTP | Uvicorn ASGI server | + +**Environment Variables:** + +```bash +# Neo4j Connection +NEO4J_URI=bolt://neo4j:7687 # Internal Docker DNS +NEO4J_USER=neo4j # Database user +NEO4J_PASSWORD=${NEO4J_PASSWORD} # From .env file +NEO4J_DATABASE=neo4j # Target database + +# Application Settings +API_SECRET_KEY=${API_SECRET_KEY} # JWT signing key (256-bit) +API_ALGORITHM=HS256 # JWT algorithm +ACCESS_TOKEN_EXPIRE_MINUTES=30 # Token TTL +ENVIRONMENT=production # prod/staging/dev + +# CORS Configuration +CORS_ORIGINS=["https://yourdomain.com"] # Allowed origins +CORS_ALLOW_CREDENTIALS=true + +# Logging +LOG_LEVEL=INFO # DEBUG in development +LOG_FORMAT=json # Structured logging +``` + +**Dockerfile:** + +```dockerfile +FROM python:3.11-slim + +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Install Python dependencies +COPY requirements.txt . 
+RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY ./app ./app + +# Create non-root user +RUN useradd -m appuser && chown -R appuser:appuser /app +USER appuser + +# Health check endpoint +HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ + CMD curl -f http://localhost:8000/health || exit 1 + +# Run with uvicorn +CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] +``` + +**Key Dependencies (requirements.txt):** + +``` +fastapi==0.109.0 +uvicorn[standard]==0.27.0 +neo4j==5.17.0 +pydantic==2.5.3 +pydantic-settings==2.1.0 +python-jose[cryptography]==3.3.0 +passlib[bcrypt]==1.7.4 +python-multipart==0.0.6 +httpx==0.26.0 +structlog==24.1.0 +``` + +**Health Check Endpoint:** + +```python +@app.get("/health") +async def health_check(db: Neo4jDriver = Depends(get_db)): + try: + await db.verify_connectivity() + return {"status": "healthy", "database": "connected"} + except Exception as e: + raise HTTPException(status_code=503, detail=str(e)) +``` + +--- + +### Nginx Reverse Proxy (Optional) + +| Attribute | Value | +| ------------------ | ---------------- | +| **Image** | `nginx:alpine` | +| **Container Name** | `panama-nginx` | +| **Hostname** | `nginx` | +| **Restart Policy** | `unless-stopped` | + +**Ports:** +| Port | Protocol | Purpose | +|------|----------|---------| +| 80 | HTTP | Redirect to HTTPS | +| 443 | HTTPS | SSL termination | + +**Nginx Configuration (`nginx.conf`):** + +```nginx +upstream fastapi_backend { + server fastapi:8000; + keepalive 32; +} + +upstream neo4j_browser { + server neo4j:7474; +} + +server { + listen 80; + server_name panama.yourdomain.com; + + # Redirect HTTP to HTTPS + return 301 https://$server_name$request_uri; +} + +server { + listen 443 ssl http2; + server_name panama.yourdomain.com; + + # SSL Configuration + ssl_certificate /etc/nginx/ssl/fullchain.pem; + ssl_certificate_key /etc/nginx/ssl/privkey.pem; + ssl_protocols TLSv1.2 TLSv1.3; + ssl_ciphers 
ECDHE-ECDSA-AES128-GCM-SHA256:ECDHE-RSA-AES128-GCM-SHA256; + ssl_prefer_server_ciphers off; + ssl_session_cache shared:SSL:10m; + ssl_session_timeout 1d; + + # Security Headers + add_header X-Frame-Options "SAMEORIGIN" always; + add_header X-Content-Type-Options "nosniff" always; + add_header X-XSS-Protection "1; mode=block" always; + add_header Strict-Transport-Security "max-age=31536000; includeSubDomains" always; + + # Rate Limiting (NOTE: limit_req_zone is only valid in the http context; declare it outside this server block) + limit_req_zone $binary_remote_addr zone=api_limit:10m rate=10r/s; + + # API Routes → FastAPI + location /api/ { + limit_req zone=api_limit burst=20 nodelay; + + proxy_pass http://fastapi_backend; + proxy_http_version 1.1; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + proxy_set_header Connection ""; + + # Timeouts for long queries + proxy_connect_timeout 60s; + proxy_send_timeout 120s; + proxy_read_timeout 120s; + } + + # Swagger Docs + location /docs { + proxy_pass http://fastapi_backend/docs; + proxy_set_header Host $host; + proxy_set_header X-Forwarded-Proto $scheme; + } + + location /openapi.json { + proxy_pass http://fastapi_backend/openapi.json; + } + + # Neo4j Browser (restricted access) + location /browser { + # IP whitelist for admin access + allow 10.0.0.0/8; + allow 192.168.0.0/16; + deny all; + + proxy_pass http://neo4j_browser; + proxy_http_version 1.1; + proxy_set_header Upgrade $http_upgrade; + proxy_set_header Connection "upgrade"; + proxy_set_header Host $host; + } + + # Health check endpoint (no auth) + location /health { + proxy_pass http://fastapi_backend/health; + } +} +``` + +**Volumes:** +| Volume/Bind | Mount Point | Purpose | +|-------------|-------------|---------| +| `./nginx/nginx.conf` | `/etc/nginx/nginx.conf` | Main config | +| `./nginx/ssl/` | `/etc/nginx/ssl/` | SSL certificates | +| `nginx_logs` | `/var/log/nginx` | Access & error logs | + +--- + +## Docker
Compose Configuration + +```yaml +version: "3.8" + +services: + # ============================================ + # NEO4J DATABASE + # ============================================ + neo4j: + image: neo4j:5-enterprise + container_name: panama-neo4j + hostname: neo4j + restart: unless-stopped + ports: + - "7474:7474" # HTTP Browser (dev only, remove in prod) + - "7687:7687" # Bolt protocol + environment: + - NEO4J_AUTH=neo4j/${NEO4J_PASSWORD} + - NEO4J_PLUGINS=["apoc", "graph-data-science"] + - NEO4J_dbms_memory_heap_initial__size=1G + - NEO4J_dbms_memory_heap_max__size=2G + - NEO4J_dbms_memory_pagecache_size=1G + - NEO4J_dbms_security_procedures_unrestricted=apoc.*,gds.* + - NEO4J_ACCEPT_LICENSE_AGREEMENT=yes # Enterprise only + volumes: + - neo4j_data:/data + - neo4j_logs:/logs + - neo4j_import:/var/lib/neo4j/import + - neo4j_plugins:/plugins + networks: + - panama_network + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:7474 || exit 1"] + interval: 30s + timeout: 10s + retries: 5 + start_period: 60s + + # ============================================ + # FASTAPI APPLICATION + # ============================================ + fastapi: + build: + context: ./api + dockerfile: Dockerfile + container_name: panama-api + hostname: fastapi + restart: unless-stopped + ports: + - "8000:8000" # Exposed for dev, use nginx in prod + environment: + - NEO4J_URI=bolt://neo4j:7687 + - NEO4J_USER=neo4j + - NEO4J_PASSWORD=${NEO4J_PASSWORD} + - NEO4J_DATABASE=neo4j + - API_SECRET_KEY=${API_SECRET_KEY} + - API_ALGORITHM=HS256 + - ACCESS_TOKEN_EXPIRE_MINUTES=30 + - ENVIRONMENT=${ENVIRONMENT:-production} + - LOG_LEVEL=${LOG_LEVEL:-INFO} + - CORS_ORIGINS=${CORS_ORIGINS:-["http://localhost:3000"]} + volumes: + - ./api/app:/app/app:ro # Read-only in production + networks: + - panama_network + depends_on: + neo4j: + condition: service_healthy + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 
10s + + # ============================================ + # NGINX REVERSE PROXY (Optional) + # ============================================ + nginx: + image: nginx:alpine + container_name: panama-nginx + hostname: nginx + restart: unless-stopped + ports: + - "80:80" + - "443:443" + volumes: + - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro + - ./nginx/ssl:/etc/nginx/ssl:ro + - nginx_logs:/var/log/nginx + networks: + - panama_network + depends_on: + fastapi: + condition: service_healthy + profiles: + - production # Only start with: docker compose --profile production up + +# ============================================ +# NETWORKS +# ============================================ +networks: + panama_network: + driver: bridge + ipam: + config: + - subnet: 172.28.0.0/16 + +# ============================================ +# VOLUMES +# ============================================ +volumes: + neo4j_data: + driver: local + neo4j_logs: + driver: local + neo4j_import: + driver: local + neo4j_plugins: + driver: local + nginx_logs: + driver: local +``` + +--- + +## Environment Variables File (`.env`) + +```bash +# =========================================== +# PANAMA PAPERS ANALYSIS PLATFORM +# Environment Configuration +# =========================================== + +# ---- Neo4j Database ---- +NEO4J_PASSWORD=Ch@ng3M3!Str0ngP@ssw0rd2024 +NEO4J_DATABASE=neo4j + +# ---- FastAPI Application ---- +# Generate with: openssl rand -hex 32 +API_SECRET_KEY=a1b2c3d4e5f6g7h8i9j0k1l2m3n4o5p6q7r8s9t0u1v2w3x4y5z6 +API_ALGORITHM=HS256 +ACCESS_TOKEN_EXPIRE_MINUTES=30 + +# ---- Environment ---- +ENVIRONMENT=production +LOG_LEVEL=INFO + +# ---- CORS ---- +CORS_ORIGINS=["https://panama.yourdomain.com","http://localhost:3000"] + +# ---- SSL (production) ---- +SSL_CERT_PATH=/etc/nginx/ssl/fullchain.pem +SSL_KEY_PATH=/etc/nginx/ssl/privkey.pem +``` + +> ⚠️ **CRITICAL:** Never commit `.env` to version control. Add to `.gitignore`. + +--- + +## Security Considerations + +### 1. 
Authentication & Authorization + +``` +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ AUTHENTICATION ARCHITECTURE โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ Users โ”‚ โ”‚ Roles โ”‚ โ”‚ Permissions โ”‚ โ”‚ Resources โ”‚ +โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค +โ”‚ analyst_1 โ”‚โ”€โ”€โ”€โ–ถโ”‚ viewer โ”‚โ”€โ”€โ”€โ–ถโ”‚ read:entity โ”‚โ”€โ”€โ”€โ–ถโ”‚ /api/entitiesโ”‚ +โ”‚ analyst_2 โ”‚ โ”‚ โ”‚ โ”‚ read:person โ”‚ โ”‚ /api/persons โ”‚ +โ”‚ admin_1 โ”‚โ”€โ”€โ”€โ–ถโ”‚ analyst โ”‚โ”€โ”€โ”€โ–ถโ”‚ read:* โ”‚ โ”‚ /api/search โ”‚ +โ”‚ service_acct โ”‚ โ”‚ โ”‚ โ”‚ write:notes โ”‚ โ”‚ โ”‚ +โ”‚ โ”‚โ”€โ”€โ”€โ–ถโ”‚ admin โ”‚โ”€โ”€โ”€โ–ถโ”‚ read:* โ”‚ โ”‚ /api/admin/* โ”‚ +โ”‚ โ”‚ โ”‚ โ”‚ โ”‚ write:* โ”‚ โ”‚ /browser โ”‚ +โ”‚ โ”‚โ”€โ”€โ”€โ–ถโ”‚ service โ”‚โ”€โ”€โ”€โ–ถโ”‚ bulk:import โ”‚ โ”‚ /api/import โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ +``` + +**JWT Token Structure:** + +```json +{ + "sub": "analyst_1", + "role": "analyst", + "permissions": [ + "read:entity", + "read:person", + "read:ownership", + "write:notes" + ], + "exp": 1706745600, + "iat": 1706744000, + "jti": 
"unique-token-id-for-revocation" +} +``` + +**Role-Based Access Control (RBAC):** + +| Role | Permissions | Use Case | +| --------- | ------------------------------------------------ | --------------------- | +| `viewer` | `read:entity`, `read:person`, `read:search` | Read-only analysts | +| `analyst` | All `viewer` + `write:notes`, `export:data` | Investigation team | +| `admin` | All `analyst` + `manage:users`, `access:browser` | System administrators | +| `service` | `bulk:import`, `bulk:export` | ETL pipelines | + +### 2. Network Security + +``` +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ NETWORK SECURITY LAYERS โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + + INTERNET DMZ INTERNAL + โ”‚ โ”‚ โ”‚ + โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ + โ”‚ โ”‚ FIREWALL / WAF โ”‚ โ”‚ + โ”‚ โ”‚ - Rate limiting (100 req/min) โ”‚ โ”‚ + โ”‚ โ”‚ - IP blacklisting โ”‚ โ”‚ + โ”‚ โ”‚ - SQL injection prevention โ”‚ โ”‚ + โ”‚ โ”‚ - DDoS protection โ”‚ โ”‚ + โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ + โ”‚ โ”‚ โ”‚ + โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ–ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ + โ”‚ โ”‚ NGINX โ”‚ โ”‚ + โ”‚ โ”‚ (SSL termination)โ”‚ โ”‚ + โ”‚ โ”‚ Port 443 only โ”‚ โ”‚ + โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ + โ”‚ โ”‚ โ”‚ + โ”‚ โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ชโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ• โ”‚ + โ”‚ Docker 
Network โ”‚ (172.28.0.0/16) โ”‚ + โ”‚ โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ชโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ• โ”‚ + โ”‚ โ”‚ โ”‚ + โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ–ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ + โ”‚ โ”‚ FastAPI โ”‚ โ”‚ + โ”‚ โ”‚ (no external IP) โ”‚ โ”‚ + โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ + โ”‚ โ”‚ โ”‚ + โ”‚ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ–ผโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ + โ”‚ โ”‚ Neo4j โ”‚ โ”‚ + โ”‚ โ”‚ (no external IP) โ”‚ โ”‚ + โ”‚ โ”‚ Bolt: encrypted โ”‚ โ”‚ + โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ +``` + +### 3. Credential Management + +| Secret | Storage Method | Rotation Period | +| ---------------- | ---------------------- | ----------------------- | +| `NEO4J_PASSWORD` | Docker secrets / Vault | 90 days | +| `API_SECRET_KEY` | Docker secrets / Vault | 30 days | +| SSL certificates | Certbot auto-renewal | 90 days (Let's Encrypt) | +| JWT tokens | Short-lived (30 min) | Per session | + +### 4. 
Input Validation (Pydantic Models) + +```python +from pydantic import BaseModel, Field, validator +from typing import Optional +import re + +class EntitySearchRequest(BaseModel): + """Validated search request - prevents injection attacks.""" + + query: str = Field(..., min_length=2, max_length=200) + jurisdiction: Optional[str] = Field(None, regex=r'^[A-Z]{2,3}$') + entity_type: Optional[str] = Field(None) + limit: int = Field(default=50, ge=1, le=500) + skip: int = Field(default=0, ge=0) + + @validator('query') + def sanitize_query(cls, v): + # Remove potential Cypher injection patterns + dangerous_patterns = [ + r'MATCH\s*\(', + r'CREATE\s*\(', + r'DELETE\s+', + r'DETACH\s+', + r'CALL\s+\{', + r'\/\*.*\*\/', + ] + for pattern in dangerous_patterns: + if re.search(pattern, v, re.IGNORECASE): + raise ValueError('Invalid characters in search query') + return v.strip() + + @validator('entity_type') + def validate_entity_type(cls, v): + allowed = {'Company', 'Trust', 'Fund', 'Foundation', 'Partnership'} + if v and v not in allowed: + raise ValueError(f'entity_type must be one of: {allowed}') + return v +``` + +### 5. 
Neo4j Security Configuration + +```cypher +// Create application-specific database user (principle of least privilege) +CREATE USER api_user SET PASSWORD 'secure_password' CHANGE NOT REQUIRED; +GRANT ROLE reader TO api_user; + +// Custom role with limited write access (role and privilege management requires Neo4j Enterprise Edition) +CREATE ROLE analyst; +GRANT MATCH {*} ON GRAPH neo4j TO analyst; +GRANT WRITE ON GRAPH neo4j NODE Note TO analyst; +DENY DELETE ON GRAPH neo4j TO analyst; +DENY WRITE ON GRAPH neo4j RELATIONSHIP OWNS TO analyst; + +// Rate limiting via APOC (application level) +CALL apoc.config.map('apoc.max.degreeCentrality.batch', 1000); +``` + +--- + +## Startup Commands + +**Development:** + +```bash +# Start without nginx +docker compose up -d neo4j fastapi + +# View logs +docker compose logs -f fastapi + +# Access Neo4j Browser +open http://localhost:7474 +``` + +**Production:** + +```bash +# Start all services including nginx +docker compose --profile production up -d + +# Verify health +curl -s https://panama.yourdomain.com/health | jq + +# Scale API if needed (first remove `container_name` and the fixed host port from the fastapi service) +docker compose up -d --scale fastapi=3 +``` + +**Data Import:** + +```bash +# Copy CSV files to import volume +docker cp ./data/entities.csv panama-papers-neo4j:/var/lib/neo4j/import/ + +# Run import inside container +docker exec -it panama-papers-neo4j cypher-shell -u neo4j -p $NEO4J_PASSWORD \ + "LOAD CSV WITH HEADERS FROM 'file:///entities.csv' AS row + CREATE (e:Entity {entity_id: row.id, name: row.name})" +``` + +--- + +## Monitoring & Observability + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ MONITORING STACK (Optional) │ +└─────────────────────────────────────────────────────────────────────────────┘ 
+ + โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” + โ”‚ Grafana โ”‚ โ—„โ”€โ”€โ”€ Dashboards & Alerts + โ”‚ :3000 โ”‚ + โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + โ”‚ + โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” + โ”‚ โ”‚ + โ–ผ โ–ผ + โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” + โ”‚ Prometheus โ”‚ โ”‚ Loki โ”‚ + โ”‚ :9090 โ”‚ โ”‚ :3100 โ”‚ + โ”‚ (metrics) โ”‚ โ”‚ (logs) โ”‚ + โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + โ”‚ โ”‚ + โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” + โ”‚ โ”‚ + โ–ผ โ–ผ โ–ผ โ–ผ +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” Promtail +โ”‚Neo4j โ”‚ โ”‚FastAPI โ”‚ โ”‚Nginx โ”‚ (log collector) +โ”‚metricsโ”‚ โ”‚metrics โ”‚ โ”‚logs โ”‚ +โ”‚:2004 โ”‚ โ”‚:8000 โ”‚ โ”‚ โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ +``` + +**Key Metrics to Monitor:** + +| Service | Metric | Alert Threshold | +| ------- | ----------------------------------- | --------------- | +| Neo4j | `neo4j_bolt_connections_active` | > 100 | +| Neo4j | `neo4j_page_cache_hit_ratio` | < 0.95 | +| Neo4j | `neo4j_transaction_active` | > 50 | +| FastAPI | `http_request_duration_seconds` | p99 > 2s | +| FastAPI | `http_requests_total{status="5xx"}` | > 10/min | +| Nginx | `nginx_http_requests_total` | > 1000/min | + +--- + +_Architecture designed for ICIJ Panama Papers analysis. Suitable for production deployment with appropriate security hardening._ diff --git a/create_notebook.py b/create_notebook.py deleted file mode 100644 index 2fc9fb6..0000000 --- a/create_notebook.py +++ /dev/null @@ -1,264 +0,0 @@ -""" -Script to generate graph_eda.ipynb Jupyter notebook. 
-Run this script to create the notebook file. -""" - -import json - -# Notebook structure -notebook = { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Offshore Graph Analysis - Exploratory Data Analysis\n", - "\n", - "This notebook demonstrates connection to Neo4j and basic graph statistics for the Offshore Leaks dataset." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Setup and Installation" - ] - }, - { - "cell_type": "code", - "execution_count": None, - "metadata": {}, - "outputs": [], - "source": [ - "# Install required packages if not already installed\n", - "# !pip install neo4j pandas matplotlib" - ] - }, - { - "cell_type": "code", - "execution_count": None, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "from neo4j import GraphDatabase\n", - "import warnings\n", - "warnings.filterwarnings('ignore')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Connect to Neo4j Database" - ] - }, - { - "cell_type": "code", - "execution_count": None, - "metadata": {}, - "outputs": [], - "source": [ - "# Connection parameters\n", - "URI = \"bolt://localhost:7687\"\n", - "USER = \"neo4j\"\n", - "PASSWORD = \"password\"\n", - "\n", - "# Create driver\n", - "driver = GraphDatabase.driver(URI, auth=(USER, PASSWORD))\n", - "\n", - "# Verify connectivity\n", - "try:\n", - " driver.verify_connectivity()\n", - " print(\"โœ“ Successfully connected to Neo4j\")\n", - "except Exception as e:\n", - " print(f\"โœ— Connection failed: {e}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. 
Query 1: Basic Statistics - Count Nodes by Type" - ] - }, - { - "cell_type": "code", - "execution_count": None, - "metadata": {}, - "outputs": [], - "source": [ - "# Query to count nodes by type\n", - "query = \"\"\"\n", - "MATCH (n)\n", - "RETURN labels(n)[0] as node_type, count(n) as count\n", - "ORDER BY count DESC\n", - "\"\"\"\n", - "\n", - "with driver.session() as session:\n", - " result = session.run(query)\n", - " data = [record for record in result]\n", - "\n", - "# Create DataFrame\n", - "df_stats = pd.DataFrame([{\"Node Type\": r[\"node_type\"], \"Count\": r[\"count\"]} for r in data])\n", - "\n", - "print(\"\\n=== Node Statistics ===\")\n", - "print(df_stats.to_string(index=False))\n", - "\n", - "# Display as DataFrame\n", - "df_stats" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Query 2: Top 10 Jurisdictions by Number of Entities" - ] - }, - { - "cell_type": "code", - "execution_count": None, - "metadata": {}, - "outputs": [], - "source": [ - "# Query for top jurisdictions\n", - "query = \"\"\"\n", - "MATCH (e:Entity)-[:REGISTERED_IN]->(j:Jurisdiction)\n", - "WITH j, count(e) as entity_count\n", - "ORDER BY entity_count DESC\n", - "LIMIT 10\n", - "RETURN j.name as jurisdiction, entity_count\n", - "\"\"\"\n", - "\n", - "with driver.session() as session:\n", - " result = session.run(query)\n", - " data = [record for record in result]\n", - "\n", - "# Create DataFrame\n", - "df_jurisdictions = pd.DataFrame([{\n", - " \"Jurisdiction\": r[\"jurisdiction\"],\n", - " \"Entity Count\": r[\"entity_count\"]\n", - "} for r in data])\n", - "\n", - "print(\"\\n=== Top 10 Jurisdictions by Entity Count ===\")\n", - "print(df_jurisdictions.to_string(index=False))\n", - "\n", - "df_jurisdictions" - ] - }, - { - "cell_type": "code", - "execution_count": None, - "metadata": {}, - "outputs": [], - "source": [ - "# Create bar chart\n", - "plt.figure(figsize=(12, 6))\n", - "plt.barh(df_jurisdictions[\"Jurisdiction\"], 
df_jurisdictions[\"Entity Count\"], color='steelblue')\n", - "plt.xlabel('Number of Entities', fontsize=12)\n", - "plt.ylabel('Jurisdiction', fontsize=12)\n", - "plt.title('Top 10 Jurisdictions by Number of Entities', fontsize=14, fontweight='bold')\n", - "plt.gca().invert_yaxis() # Show highest at top\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. Query 3: Risk Analysis - Top 5 Red Flag Addresses" - ] - }, - { - "cell_type": "code", - "execution_count": None, - "metadata": {}, - "outputs": [], - "source": [ - "# Query for addresses with most registered entities (Red Flags)\n", - "query = \"\"\"\n", - "MATCH (a:Address)<-[:REGISTERED_ADDRESS]-(e:Entity)\n", - "WITH a, count(e) as entity_count, collect(e.name)[0..5] as sample_entities\n", - "ORDER BY entity_count DESC\n", - "LIMIT 5\n", - "RETURN a.address as address, entity_count, sample_entities\n", - "\"\"\"\n", - "\n", - "with driver.session() as session:\n", - " result = session.run(query)\n", - " data = [record for record in result]\n", - "\n", - "# Create DataFrame\n", - "df_redflags = pd.DataFrame([{\n", - " \"Address\": r[\"address\"][:100] + \"...\" if r[\"address\"] and len(r[\"address\"]) > 100 else (r[\"address\"] or \"N/A\"),\n", - " \"Entity Count\": r[\"entity_count\"],\n", - " \"Sample Entities\": \", \".join([e for e in r[\"sample_entities\"] if e])[:100]\n", - "} for r in data])\n", - "\n", - "print(\"\\n=== Top 5 Red Flag Addresses (Most Entities) ===\")\n", - "print(df_redflags.to_string(index=False))\n", - "\n", - "df_redflags" - ] - }, - { - "cell_type": "code", - "execution_count": None, - "metadata": {}, - "outputs": [], - "source": [ - "# Visualize red flags\n", - "plt.figure(figsize=(14, 6))\n", - "plt.bar(range(len(df_redflags)), df_redflags[\"Entity Count\"], color='crimson')\n", - "plt.xlabel('Address Index', fontsize=12)\n", - "plt.ylabel('Number of Entities', fontsize=12)\n", - "plt.title('Top 5 Red Flag 
Addresses - Entities per Address', fontsize=14, fontweight='bold')\n", - "plt.xticks(range(len(df_redflags)), [f\"Address {i+1}\" for i in range(len(df_redflags))])\n", - "plt.grid(axis='y', alpha=0.3)\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 6. Cleanup" - ] - }, - { - "cell_type": "code", - "execution_count": None, - "metadata": {}, - "outputs": [], - "source": [ - "# Close the driver connection\n", - "driver.close()\n", - "print(\"โœ“ Database connection closed\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.11.0" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} - -# Write notebook to file -with open("graph_eda.ipynb", "w", encoding="utf-8") as f: - json.dump(notebook, f, indent=2, ensure_ascii=False) - -print("โœ“ Successfully created graph_eda.ipynb") - diff --git a/docker-compose.yml b/docker-compose.yml index ab1c54e..2aeadbe 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,47 +1,97 @@ -version: '3.8' +version: "3.9" services: neo4j: image: neo4j:5-community - container_name: offshore-neo4j + container_name: panama-papers-neo4j ports: - - "7474:7474" # HTTP - - "7687:7687" # Bolt + - "7474:7474" # HTTP Browser + - "7687:7687" # Bolt Protocol environment: - - NEO4J_AUTH=neo4j/password - - NEO4J_PLUGINS=["apoc", "graph-data-science"] - - NEO4J_dbms_security_procedures_unrestricted=gds.*,apoc.* + - NEO4J_AUTH=neo4j/${NEO4J_PASSWORD:?NEO4J_PASSWORD must be set in .env} + - NEO4J_dbms_memory_pagecache_size=1G + - NEO4J_dbms_memory_heap_initial__size=1G + - NEO4J_dbms_memory_heap_max__size=2G + - NEO4J_PLUGINS=["apoc"] + - NEO4J_dbms_security_procedures_unrestricted=apoc.* + - NEO4J_dbms_security_procedures_allowlist=apoc.* volumes: - neo4j_data:/data - neo4j_logs:/logs + - neo4j_import:/var/lib/neo4j/import + - 
neo4j_plugins:/plugins + networks: + - panama-papers-net + restart: unless-stopped healthcheck: - test: ["CMD", "cypher-shell", "-u", "neo4j", "-p", "password", "RETURN 1"] + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:7474 || exit 1"] interval: 10s timeout: 5s retries: 5 + start_period: 30s - api: + fastapi: build: context: . dockerfile: Dockerfile - container_name: offshore-api + container_name: panama-papers-api ports: - "8000:8000" - volumes: - - ./app:/code/app - - ./scripts:/code/scripts - - ./data:/code/data - - ./Data:/code/Data environment: - NEO4J_URI=bolt://neo4j:7687 - NEO4J_USER=neo4j - - NEO4J_PASSWORD=password + - NEO4J_PASSWORD=${NEO4J_PASSWORD:?NEO4J_PASSWORD must be set in .env} + - API_HOST=0.0.0.0 + - API_PORT=8000 + - LOG_LEVEL=${LOG_LEVEL:-info} + volumes: + - ./app:/app/app:ro + - ./data:/data:ro depends_on: neo4j: condition: service_healthy - command: uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload + networks: + - panama-papers-net + restart: unless-stopped + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:8000/health || exit 1"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s + + nginx: + image: nginx:alpine + container_name: panama-papers-nginx + ports: + - "80:80" + volumes: + - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro + - ./nginx/logs:/var/log/nginx + depends_on: + fastapi: + condition: service_healthy + networks: + - panama-papers-net + restart: unless-stopped + healthcheck: + test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:80 || exit 1"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 10s + +networks: + panama-papers-net: + driver: bridge + name: panama-papers-network volumes: neo4j_data: + name: panama-papers-neo4j-data neo4j_logs: - + name: panama-papers-neo4j-logs + neo4j_import: + name: panama-papers-neo4j-import + neo4j_plugins: + name: panama-papers-neo4j-plugins diff --git a/graph_eda.ipynb b/graph_eda.ipynb deleted 
file mode 100644 index b3b4bd6..0000000 --- a/graph_eda.ipynb +++ /dev/null @@ -1,249 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Offshore Graph Analysis - Exploratory Data Analysis\n", - "\n", - "This notebook demonstrates connection to Neo4j and basic graph statistics for the Offshore Leaks dataset." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Setup and Installation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Install required packages if not already installed\n", - "# !pip install neo4j pandas matplotlib" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "from neo4j import GraphDatabase\n", - "import warnings\n", - "warnings.filterwarnings('ignore')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 2. Connect to Neo4j Database" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Connection parameters\n", - "URI = \"bolt://localhost:7687\"\n", - "USER = \"neo4j\"\n", - "PASSWORD = \"password\"\n", - "\n", - "# Create driver\n", - "driver = GraphDatabase.driver(URI, auth=(USER, PASSWORD))\n", - "\n", - "# Verify connectivity\n", - "try:\n", - " driver.verify_connectivity()\n", - " print(\"โœ“ Successfully connected to Neo4j\")\n", - "except Exception as e:\n", - " print(f\"โœ— Connection failed: {e}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. 
Query 1: Basic Statistics - Count Nodes by Type" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Query to count nodes by type\n", - "query = \"\"\"\n", - "MATCH (n)\n", - "RETURN labels(n)[0] as node_type, count(n) as count\n", - "ORDER BY count DESC\n", - "\"\"\"\n", - "\n", - "with driver.session() as session:\n", - " result = session.run(query)\n", - " data = [record for record in result]\n", - "\n", - "# Create DataFrame\n", - "df_stats = pd.DataFrame([{\"Node Type\": r[\"node_type\"], \"Count\": r[\"count\"]} for r in data])\n", - "\n", - "print(\"\\n=== Node Statistics ===\")\n", - "print(df_stats.to_string(index=False))\n", - "\n", - "# Display as DataFrame\n", - "df_stats" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 4. Query 2: Top 10 Jurisdictions by Number of Entities" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Query for top jurisdictions\n", - "query = \"\"\"\n", - "MATCH (e:Entity)-[:REGISTERED_IN]->(j:Jurisdiction)\n", - "WITH j, count(e) as entity_count\n", - "ORDER BY entity_count DESC\n", - "LIMIT 10\n", - "RETURN j.name as jurisdiction, entity_count\n", - "\"\"\"\n", - "\n", - "with driver.session() as session:\n", - " result = session.run(query)\n", - " data = [record for record in result]\n", - "\n", - "# Create DataFrame\n", - "df_jurisdictions = pd.DataFrame([{\n", - " \"Jurisdiction\": r[\"jurisdiction\"],\n", - " \"Entity Count\": r[\"entity_count\"]\n", - "} for r in data])\n", - "\n", - "print(\"\\n=== Top 10 Jurisdictions by Entity Count ===\")\n", - "print(df_jurisdictions.to_string(index=False))\n", - "\n", - "df_jurisdictions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create bar chart\n", - "plt.figure(figsize=(12, 6))\n", - "plt.barh(df_jurisdictions[\"Jurisdiction\"], 
df_jurisdictions[\"Entity Count\"], color='steelblue')\n", - "plt.xlabel('Number of Entities', fontsize=12)\n", - "plt.ylabel('Jurisdiction', fontsize=12)\n", - "plt.title('Top 10 Jurisdictions by Number of Entities', fontsize=14, fontweight='bold')\n", - "plt.gca().invert_yaxis() # Show highest at top\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 5. Query 3: Risk Analysis - Top 5 Red Flag Addresses" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Query for addresses with most registered entities (Red Flags)\n", - "query = \"\"\"\n", - "MATCH (a:Address)<-[:REGISTERED_ADDRESS]-(e:Entity)\n", - "WITH a, count(e) as entity_count, collect(e.name)[0..5] as sample_entities\n", - "ORDER BY entity_count DESC\n", - "LIMIT 5\n", - "RETURN a.address as address, entity_count, sample_entities\n", - "\"\"\"\n", - "\n", - "with driver.session() as session:\n", - " result = session.run(query)\n", - " data = [record for record in result]\n", - "\n", - "# Create DataFrame\n", - "df_redflags = pd.DataFrame([{\n", - " \"Address\": r[\"address\"][:100] + \"...\" if r[\"address\"] and len(r[\"address\"]) > 100 else (r[\"address\"] or \"N/A\"),\n", - " \"Entity Count\": r[\"entity_count\"],\n", - " \"Sample Entities\": \", \".join([e for e in r[\"sample_entities\"] if e])[:100]\n", - "} for r in data])\n", - "\n", - "print(\"\\n=== Top 5 Red Flag Addresses (Most Entities) ===\")\n", - "print(df_redflags.to_string(index=False))\n", - "\n", - "df_redflags" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Visualize red flags\n", - "plt.figure(figsize=(14, 6))\n", - "plt.bar(range(len(df_redflags)), df_redflags[\"Entity Count\"], color='crimson')\n", - "plt.xlabel('Address Index', fontsize=12)\n", - "plt.ylabel('Number of Entities', fontsize=12)\n", - "plt.title('Top 5 Red Flag 
Addresses - Entities per Address', fontsize=14, fontweight='bold')\n", - "plt.xticks(range(len(df_redflags)), [f\"Address {i+1}\" for i in range(len(df_redflags))])\n", - "plt.grid(axis='y', alpha=0.3)\n", - "plt.tight_layout()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 6. Cleanup" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Close the driver connection\n", - "driver.close()\n", - "print(\"โœ“ Database connection closed\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "name": "python", - "version": "3.11.0" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/nginx/nginx.conf b/nginx/nginx.conf new file mode 100644 index 0000000..cb1fc62 --- /dev/null +++ b/nginx/nginx.conf @@ -0,0 +1,22 @@ +events { + worker_connections 1024; +} + +http { + upstream fastapi { + server fastapi:8000; + } + + server { + listen 80; + server_name localhost; + + location / { + proxy_pass http://fastapi; + proxy_set_header Host $host; + proxy_set_header X-Real-IP $remote_addr; + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; + proxy_set_header X-Forwarded-Proto $scheme; + } + } +} diff --git a/requirements.txt b/requirements.txt index e17c947..bfa9ceb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,52 @@ -fastapi==0.104.1 -uvicorn[standard]==0.24.0 -neo4j==5.14.1 -pydantic==2.5.0 -pydantic-settings==2.1.0 -python-dotenv==1.0.0 -pytest==7.4.3 -pytest-asyncio==0.21.1 -pytest-cov==4.1.0 -httpx==0.25.2 -pandas==2.1.3 -matplotlib==3.8.2 -jupyter==1.0.0 -pylint==3.0.3 +# ============================================================================= +# PANAMA PAPERS NEO4J PROJECT - PYTHON DEPENDENCIES +# ============================================================================= +# Python Version: 
3.11+ +# Install: pip install -r requirements.txt +# ============================================================================= +# ----------------------------------------------------------------------------- +# CORE FRAMEWORK - FastAPI & ASGI Server +# ----------------------------------------------------------------------------- +fastapi==0.115.5 +uvicorn[standard]==0.32.1 +python-multipart==0.0.17 + +# ----------------------------------------------------------------------------- +# DATABASE - Neo4j Graph Database Driver +# ----------------------------------------------------------------------------- +neo4j==5.26.0 + +# ----------------------------------------------------------------------------- +# DATA VALIDATION - Pydantic Models +# ----------------------------------------------------------------------------- +pydantic==2.10.3 +pydantic-settings==2.6.1 + +# ----------------------------------------------------------------------------- +# CONFIGURATION - Environment & Settings +# ----------------------------------------------------------------------------- +python-dotenv==1.0.1 + +# ----------------------------------------------------------------------------- +# HTTP CLIENT - Async Requests +# ----------------------------------------------------------------------------- +httpx==0.28.1 + +# ----------------------------------------------------------------------------- +# TESTING FRAMEWORK +# ----------------------------------------------------------------------------- +pytest==8.3.4 +pytest-asyncio==0.24.0 +pytest-cov==6.0.0 + +# ----------------------------------------------------------------------------- +# DATA PROCESSING - Import & Analytics +# ----------------------------------------------------------------------------- +pandas==2.2.3 +numpy==2.2.1 + +# ----------------------------------------------------------------------------- +# ASYNC UTILITIES +# ----------------------------------------------------------------------------- +aiofiles==24.1.0 \ No 
newline at end of file diff --git a/schema_design.md b/schema_design.md new file mode 100644 index 0000000..5a96c7a --- /dev/null +++ b/schema_design.md @@ -0,0 +1,640 @@ +# Neo4j Graph Schema - Panama Papers Offshore Network + +> **Version:** Neo4j 5.x Compatible +> **Purpose:** Beneficial ownership tracing, intermediary detection, and offshore financial network analysis +> **Dataset Context:** ICIJ Panama Papers leak containing ~11.5 million documents + +--- + +## Node Labels + +### Entity + +**Purpose:** Abstract parent label representing any legal entity involved in offshore structures. Serves as the supertype of the Company, Fund, and Trust subtypes, enabling polymorphic queries across all entity types. + +**Properties:** +| Property | Data Type | Indexed | Constraints | Description | +|----------|-----------|---------|-------------|-------------| +| entity_id | String | ✅ Primary | UNIQUE, NOT NULL | ICIJ internal identifier (e.g., "10000001") | +| name | String | ✅ Full-text | NOT NULL | Legal or registered name | +| original_name | String | ❌ | — | Name in original script (non-Latin) | +| jurisdiction_code | String | ✅ | — | ISO 3166-1 alpha-3 or custom code (e.g., "BVI", "PAN") | +| incorporation_date | Date | ❌ | — | Date of legal formation | +| inactivation_date | Date | ❌ | — | Date entity became inactive/dissolved | +| struck_off_date | Date | ❌ | — | Date removed from registry | +| status | String | ✅ | — | Options: `Active`, `Inactive`, `Dissolved`, `Struck Off`, `Unknown` | +| entity_type | String | ✅ | — | Options: `Company`, `Trust`, `Fund`, `Foundation`, `Partnership`, `Other` | +| service_provider | String | ❌ | — | Originating law firm (e.g., "Mossack Fonseca") | +| source | String | ❌ | NOT NULL | Leak source: `Panama Papers`, `Paradise Papers`, `Pandora Papers` | +| countries_linked | List\<String\> | ❌ | — | Countries associated via addresses or officers | + +--- + +### Person + +**Purpose:** Represents 
natural persons who act as officers, shareholders, or beneficial owners. Critical for tracing ultimate beneficial ownership (UBO) and identifying politically exposed persons (PEPs). + +**Properties:** +| Property | Data Type | Indexed | Constraints | Description | +|----------|-----------|---------|-------------|-------------| +| person_id | String | โœ… Primary | UNIQUE, NOT NULL | ICIJ internal identifier | +| full_name | String | โœ… Full-text | NOT NULL | Complete name as recorded | +| first_name | String | โŒ | โ€” | Given name (parsed) | +| last_name | String | โœ… | โ€” | Family name (for matching) | +| nationality | String | โœ… | โ€” | ISO 3166-1 alpha-3 country code | +| country_of_residence | String | โŒ | โ€” | Current residence country | +| date_of_birth | Date | โŒ | โ€” | Birth date (often incomplete) | +| year_of_birth | Integer | โŒ | โ€” | Birth year when full date unknown | +| is_pep | Boolean | โœ… | DEFAULT false | Politically Exposed Person flag | +| pep_details | String | โŒ | โ€” | Political role/position if PEP | +| source | String | โŒ | NOT NULL | Leak source identifier | + +--- + +### Company + +**Purpose:** Specific subtype of Entity representing incorporated companiesโ€”the primary vehicle for offshore structures. Enables company-specific queries and regulatory classification. + +**Properties:** +| Property | Data Type | Indexed | Constraints | Description | +|----------|-----------|---------|-------------|-------------| +| company_id | String | โœ… Primary | UNIQUE, NOT NULL | ICIJ identifier (may equal entity_id) | +| company_name | String | โœ… Full-text | NOT NULL | Registered company name | +| company_number | String | โœ… | โ€” | Official registry number | +| company_type | String | โŒ | โ€” | Options: `Ltd`, `LLC`, `Inc`, `SA`, `BV`, `GmbH`, etc. 
| +| jurisdiction_code | String | โœ… | NOT NULL | Registration jurisdiction | +| registered_agent | String | โŒ | โ€” | Name of registered agent | +| share_capital | Float | โŒ | โ€” | Authorized share capital | +| share_capital_currency | String | โŒ | โ€” | ISO 4217 currency code | +| is_shell_company | Boolean | โœ… | DEFAULT false | Suspected shell company flag | +| incorporation_date | Date | โœ… | โ€” | Date of incorporation | +| dissolution_date | Date | โŒ | โ€” | Date of dissolution | +| status | String | โœ… | โ€” | Options: `Active`, `Dormant`, `Dissolved`, `Struck Off` | + +--- + +### Officer + +**Purpose:** Represents roles held by persons or entities in companies (directors, secretaries, nominees). A linking concept that captures the temporal and functional aspects of corporate governance positions. + +**Properties:** +| Property | Data Type | Indexed | Constraints | Description | +|----------|-----------|---------|-------------|-------------| +| officer_id | String | โœ… Primary | UNIQUE, NOT NULL | ICIJ identifier | +| role_type | String | โœ… | NOT NULL | Options: `Director`, `Secretary`, `Nominee Director`, `Nominee Shareholder`, `Protector`, `Beneficiary`, `Shareholder`, `Power of Attorney`, `Authorized Signatory` | +| name | String | โœ… Full-text | NOT NULL | Name as recorded in documents | +| is_corporate_officer | Boolean | โœ… | DEFAULT false | True if officer is a company (not natural person) | +| start_date | Date | โŒ | โ€” | Appointment date | +| end_date | Date | โŒ | โ€” | Resignation/termination date | +| status | String | โŒ | โ€” | Options: `Active`, `Resigned`, `Removed`, `Unknown` | +| source_document | String | โŒ | โ€” | Reference to source document | + +--- + +### Intermediary + +**Purpose:** Represents law firms, banks, accountants, and other professional enablers who create and manage offshore structures. Critical for detecting systemic facilitators and compliance failures. 
+ +**Properties:** +| Property | Data Type | Indexed | Constraints | Description | +|----------|-----------|---------|-------------|-------------| +| intermediary_id | String | โœ… Primary | UNIQUE, NOT NULL | ICIJ identifier | +| name | String | โœ… Full-text | NOT NULL | Firm/intermediary name | +| type | String | โœ… | โ€” | Options: `Law Firm`, `Bank`, `Trust Company`, `Accountant`, `Financial Advisor`, `Corporate Service Provider`, `Other` | +| country_code | String | โœ… | โ€” | Primary country of operation | +| address | String | โŒ | โ€” | Business address | +| status | String | โŒ | โ€” | Options: `Active`, `Closed`, `Sanctioned`, `Unknown` | +| entities_created_count | Integer | โŒ | โ€” | Denormalized count (for ranking) | +| first_activity_date | Date | โŒ | โ€” | Earliest known involvement | +| last_activity_date | Date | โŒ | โ€” | Most recent activity | + +--- + +### Address + +**Purpose:** Represents physical and registered addresses. Enables geographic clustering analysis, identification of nominee addresses (many entities at one address), and sanctions screening by location. 
+ +**Properties:** +| Property | Data Type | Indexed | Constraints | Description | +|----------|-----------|---------|-------------|-------------| +| address_id | String | โœ… Primary | UNIQUE, NOT NULL | ICIJ identifier | +| full_address | String | โœ… Full-text | NOT NULL | Complete address string | +| address_line_1 | String | โŒ | โ€” | Street address | +| address_line_2 | String | โŒ | โ€” | Suite/floor/unit | +| city | String | โœ… | โ€” | City name | +| state_province | String | โŒ | โ€” | State or province | +| postal_code | String | โœ… | โ€” | Postal/ZIP code | +| country_code | String | โœ… | NOT NULL | ISO 3166-1 alpha-3 | +| is_registered_office | Boolean | โŒ | DEFAULT false | Official registered address | +| is_nominee_address | Boolean | โœ… | DEFAULT false | Known nominee/mass registration address | +| entities_at_address | Integer | โŒ | โ€” | Count of entities (denormalized) | + +--- + +### Jurisdiction + +**Purpose:** Reference node for tax havens and registration jurisdictions. Enables jurisdiction-based risk scoring, regulatory analysis, and geographic network mapping. + +**Properties:** +| Property | Data Type | Indexed | Constraints | Description | +|----------|-----------|---------|-------------|-------------| +| jurisdiction_code | String | โœ… Primary | UNIQUE, NOT NULL | ISO code or custom (e.g., "BVI", "VGB") | +| name | String | โœ… | NOT NULL | Full jurisdiction name | +| country_code | String | โŒ | โ€” | Parent country ISO code | +| region | String | โœ… | โ€” | Geographic region: `Caribbean`, `Europe`, `Asia-Pacific`, etc. 
| +| is_tax_haven | Boolean | โœ… | DEFAULT false | OECD/EU blacklist status | +| secrecy_score | Integer | โŒ | โ€” | Tax Justice Network score (0-100) | +| corporate_tax_rate | Float | โŒ | โ€” | Nominal corporate tax rate | +| crs_participant | Boolean | โŒ | DEFAULT false | Common Reporting Standard participant | +| fatf_status | String | โŒ | โ€” | Options: `Compliant`, `Grey List`, `Black List`, `Not Rated` | +| entity_count | Integer | โŒ | โ€” | Total entities registered (denormalized) | + +--- + +## Relationship Types + +### OWNS + +**Purpose:** Captures direct and indirect ownership stakes between entities. Essential for tracing beneficial ownership chains, identifying layered structures, and calculating effective ownership percentages through multi-hop traversals. + +**Direction:** `(Entity|Person|Company) -[OWNS]-> (Entity|Company)` + +**Properties:** +| Property | Data Type | Constraints | Description | +|----------|-----------|-------------|-------------| +| ownership_percentage | Float | 0.0-100.0 | Percentage of shares/interest held | +| share_count | Integer | โ€” | Number of shares held | +| share_class | String | โ€” | Class of shares (A, B, Ordinary, Preferred) | +| acquisition_date | Date | โ€” | Date ownership acquired | +| end_date | Date | โ€” | Date ownership ended | +| status | String | DEFAULT 'Active' | Options: `Active`, `Historical`, `Disputed` | +| is_beneficial | Boolean | DEFAULT false | True if beneficial (vs. legal) ownership | +| is_nominee | Boolean | DEFAULT false | Nominee arrangement flag | +| source_document | String | โ€” | Reference to evidence document | + +**Why Necessary:** Ownership is the fundamental relationship in offshore structures. Without it, beneficial ownership tracing is impossible. The `ownership_percentage` and `is_nominee` properties are critical for calculating effective control and identifying hidden ownership. 
+ +--- + +### CONTROLS + +**Purpose:** Represents de facto control relationships that exist independently of formal ownershipโ€”through voting agreements, board control, or contractual arrangements. Captures the reality that ownership percentages often understate actual control. + +**Direction:** `(Person|Entity|Company) -[CONTROLS]-> (Entity|Company)` + +**Properties:** +| Property | Data Type | Constraints | Description | +|----------|-----------|-------------|-------------| +| control_type | String | NOT NULL | Options: `Board Majority`, `Voting Agreement`, `Contractual`, `De Facto`, `Protector Powers`, `Veto Rights` | +| control_percentage | Float | โ€” | Effective control percentage if calculable | +| start_date | Date | โ€” | Control relationship start | +| end_date | Date | โ€” | Control relationship end | +| status | String | DEFAULT 'Active' | Options: `Active`, `Historical`, `Suspected` | +| evidence_strength | String | โ€” | Options: `Confirmed`, `Probable`, `Suspected` | +| notes | String | โ€” | Explanatory notes | + +**Why Necessary:** Many offshore structures use trusts, foundations, or nominee arrangements where legal ownership is divorced from control. A person may own 0% of shares but control the entity through protector powers or board appointments. This relationship captures what OWNS cannot. + +--- + +### HAS_ADDRESS + +**Purpose:** Links entities, persons, and intermediaries to their associated addresses. Enables geographic analysis, identification of mass-registration addresses (red flags), and sanctions screening. 
+ +**Direction:** `(Entity|Person|Company|Intermediary|Officer) -[HAS_ADDRESS]-> (Address)` + +**Properties:** +| Property | Data Type | Constraints | Description | +|----------|-----------|-------------|-------------| +| address_type | String | NOT NULL | Options: `Registered`, `Business`, `Residential`, `Correspondence`, `Former` | +| start_date | Date | โ€” | Address valid from | +| end_date | Date | โ€” | Address valid until | +| is_primary | Boolean | DEFAULT false | Primary/current address flag | +| source | String | โ€” | Data source for this link | + +**Why Necessary:** Address linkage is critical for (1) identifying nominee addresses where hundreds of entities share one location, (2) geographic clustering to find related entities, (3) sanctions and watchlist screening, and (4) verifying identity through address matching. + +--- + +### REGISTERED_IN + +**Purpose:** Links entities and companies to their jurisdiction of registration. Enables jurisdiction-based risk analysis, regulatory mapping, and identification of jurisdiction-shopping patterns. + +**Direction:** `(Entity|Company) -[REGISTERED_IN]-> (Jurisdiction)` + +**Properties:** +| Property | Data Type | Constraints | Description | +|----------|-----------|-------------|-------------| +| registration_number | String | โ€” | Official registry number | +| registration_date | Date | โ€” | Date of registration | +| registry_name | String | โ€” | Name of registering authority | +| status | String | DEFAULT 'Active' | Options: `Active`, `Struck Off`, `Dissolved`, `Migrated` | +| migration_from | String | โ€” | Previous jurisdiction if migrated | + +**Why Necessary:** Jurisdiction choice is deliberate in offshore planning. This relationship enables analysis of which jurisdictions are favored by specific intermediaries, persons, or for specific purposes. The `migration_from` property tracks jurisdiction-hopping to avoid regulation. 
+ +--- + +### INVOLVED_IN + +**Purpose:** Links persons and intermediaries to entities where they play a role. A general-purpose relationship that captures involvement beyond ownershipโ€”as directors, secretaries, agents, or other capacities. + +**Direction:** `(Person|Intermediary|Officer) -[INVOLVED_IN]-> (Entity|Company)` + +**Properties:** +| Property | Data Type | Constraints | Description | +|----------|-----------|-------------|-------------| +| role | String | NOT NULL | Options: `Director`, `Secretary`, `Nominee Director`, `Registered Agent`, `Protector`, `Enforcer`, `Settlor`, `Beneficiary`, `Authorized Signatory`, `Power of Attorney`, `Shareholder`, `Ultimate Beneficial Owner` | +| start_date | Date | โ€” | Role start date | +| end_date | Date | โ€” | Role end date | +| status | String | DEFAULT 'Active' | Options: `Active`, `Former`, `Unknown` | +| is_nominee | Boolean | DEFAULT false | Acting as nominee | +| appointed_by | String | โ€” | Who appointed this person | + +**Why Necessary:** This relationship captures the human element of corporate control. While OWNS and CONTROLS track stake-based relationships, INVOLVED_IN captures the operational roles. A person directing 50 companies as nominee director is a major red flag detectable only through this relationship. + +--- + +### CREATED_BY + +**Purpose:** Links entities to the intermediary (law firm, corporate service provider) that created them. Essential for intermediary risk profiling and detecting patterns of facilitation. 
+ +**Direction:** `(Entity|Company) -[CREATED_BY]-> (Intermediary)` + +**Properties:** +| Property | Data Type | Constraints | Description | +|----------|-----------|-------------|-------------| +| creation_date | Date | — | Date entity was created | +| service_type | String | — | Options: `Incorporation`, `Registration`, `Continuation`, `Redomiciliation` | +| fee_currency | String | — | Currency of service fee | +| relationship_status | String | DEFAULT 'Active' | Options: `Active`, `Terminated`, `Transferred` | +| termination_date | Date | — | When relationship ended | + +**Why Necessary:** Intermediaries are the gatekeepers of offshore structures. Analyzing which intermediaries create entities in which jurisdictions for which clients reveals systemic patterns. The scale of Mossack Fonseca's role in the Panama Papers becomes visible through this relationship type. + +--- + +### CONNECTED_TO + +**Purpose:** A flexible relationship for capturing links that don't fit other categories—shared phone numbers, email domains, common beneficial owners inferred but not proven, or journalist-identified connections. + +**Direction:** `(Entity|Person|Company|Address) -[CONNECTED_TO]-> (Entity|Person|Company|Address)` + +**Properties:** +| Property | Data Type | Constraints | Description | +|----------|-----------|-------------|-------------| +| connection_type | String | NOT NULL | Options: `Shared Contact`, `Common Beneficial Owner`, `Family Relationship`, `Business Associate`, `Same Formation Batch`, `Linked Investigation`, `Shared Bank Account` | +| confidence | String | NOT NULL | Options: `Confirmed`, `High`, `Medium`, `Low`, `Suspected` | +| evidence | String | — | Description of evidence | +| source | String | — | Source of connection identification | +| discovered_date | Date | — | When connection was identified | + +**Why Necessary:** Investigative analysis often reveals connections that aren't captured by formal corporate relationships. 
Shared email domains, phone numbers, or formation patterns suggest links that warrant investigation. This relationship preserves those insights. + +--- + +### RELATED_TO + +**Purpose:** Captures family and personal relationships between persons. Critical for identifying beneficial ownership through family nominees and tracing wealth across generations. + +**Direction:** `(Person) -[RELATED_TO]-> (Person)` + +**Properties:** +| Property | Data Type | Constraints | Description | +|----------|-----------|-------------|-------------| +| relationship_type | String | NOT NULL | Options: `Spouse`, `Parent`, `Child`, `Sibling`, `Extended Family`, `Business Partner`, `Associate` | +| confidence | String | DEFAULT 'Confirmed' | Options: `Confirmed`, `Probable`, `Suspected` | +| source | String | — | Source of relationship identification | + +**Why Necessary:** Beneficial ownership is frequently hidden through family members. A politician's children or spouse may hold assets on their behalf. Without family relationships, these patterns are invisible to ownership analysis. 
+ +--- + +## Indexing Strategy + +### Primary Indexes (Unique Constraints with Index) + +These provide O(1) lookup for the most common access patterns: + +```cypher +CREATE CONSTRAINT entity_id_unique FOR (e:Entity) REQUIRE e.entity_id IS UNIQUE; +CREATE CONSTRAINT person_id_unique FOR (p:Person) REQUIRE p.person_id IS UNIQUE; +CREATE CONSTRAINT company_id_unique FOR (c:Company) REQUIRE c.company_id IS UNIQUE; +CREATE CONSTRAINT officer_id_unique FOR (o:Officer) REQUIRE o.officer_id IS UNIQUE; +CREATE CONSTRAINT intermediary_id_unique FOR (i:Intermediary) REQUIRE i.intermediary_id IS UNIQUE; +CREATE CONSTRAINT address_id_unique FOR (a:Address) REQUIRE a.address_id IS UNIQUE; +CREATE CONSTRAINT jurisdiction_code_unique FOR (j:Jurisdiction) REQUIRE j.jurisdiction_code IS UNIQUE; +``` + +### Full-Text Indexes (For Name Searching) + +Essential for fuzzy matching and partial name searches: + +```cypher +CREATE FULLTEXT INDEX entity_name_fulltext FOR (e:Entity) ON EACH [e.name, e.original_name]; +CREATE FULLTEXT INDEX person_name_fulltext FOR (p:Person) ON EACH [p.full_name, p.first_name, p.last_name]; +CREATE FULLTEXT INDEX company_name_fulltext FOR (c:Company) ON EACH [c.company_name]; +CREATE FULLTEXT INDEX intermediary_name_fulltext FOR (i:Intermediary) ON EACH [i.name]; +CREATE FULLTEXT INDEX address_fulltext FOR (a:Address) ON EACH [a.full_address, a.city]; +``` + +### Range Indexes (For Filtering and Sorting) + +Support common WHERE clauses and ORDER BY operations: + +```cypher +CREATE INDEX entity_status_idx FOR (e:Entity) ON (e.status); +CREATE INDEX entity_type_idx FOR (e:Entity) ON (e.entity_type); +CREATE INDEX entity_jurisdiction_idx FOR (e:Entity) ON (e.jurisdiction_code); +CREATE INDEX company_jurisdiction_idx FOR (c:Company) ON (c.jurisdiction_code); +CREATE INDEX company_status_idx FOR (c:Company) ON (c.status); +CREATE INDEX company_incorporation_idx FOR (c:Company) ON (c.incorporation_date); +CREATE INDEX person_nationality_idx FOR (p:Person) ON 
(p.nationality); +CREATE INDEX person_pep_idx FOR (p:Person) ON (p.is_pep); +CREATE INDEX person_lastname_idx FOR (p:Person) ON (p.last_name); +CREATE INDEX officer_role_idx FOR (o:Officer) ON (o.role_type); +CREATE INDEX officer_corporate_idx FOR (o:Officer) ON (o.is_corporate_officer); +CREATE INDEX intermediary_type_idx FOR (i:Intermediary) ON (i.type); +CREATE INDEX intermediary_country_idx FOR (i:Intermediary) ON (i.country_code); +CREATE INDEX address_country_idx FOR (a:Address) ON (a.country_code); +CREATE INDEX address_city_idx FOR (a:Address) ON (a.city); +CREATE INDEX address_nominee_idx FOR (a:Address) ON (a.is_nominee_address); +CREATE INDEX jurisdiction_haven_idx FOR (j:Jurisdiction) ON (j.is_tax_haven); +CREATE INDEX jurisdiction_region_idx FOR (j:Jurisdiction) ON (j.region); +``` + +### Composite Indexes (For Multi-Property Queries) + +Optimize common compound conditions: + +```cypher +CREATE INDEX company_jurisdiction_status_idx FOR (c:Company) ON (c.jurisdiction_code, c.status); +CREATE INDEX entity_jurisdiction_type_idx FOR (e:Entity) ON (e.jurisdiction_code, e.entity_type); +CREATE INDEX person_nationality_pep_idx FOR (p:Person) ON (p.nationality, p.is_pep); +CREATE INDEX address_country_city_idx FOR (a:Address) ON (a.country_code, a.city); +``` + +### Relationship Property Indexes + +For filtering relationship traversals: + +```cypher +CREATE INDEX owns_status_idx FOR ()-[r:OWNS]-() ON (r.status); +CREATE INDEX involved_role_idx FOR ()-[r:INVOLVED_IN]-() ON (r.role); +CREATE INDEX involved_status_idx FOR ()-[r:INVOLVED_IN]-() ON (r.status); +``` + +--- + +## Query Risk Analysis + +### Cartesian Product Risks + +**Risk Level: CRITICAL** + +The following query patterns can produce Cartesian products (explosive intermediate result sets): + +#### 1. 
Unconnected Pattern Matches + +```cypher +// DANGEROUS: Cartesian product between all entities and all persons +MATCH (e:Entity), (p:Person) +WHERE e.jurisdiction_code = 'BVI' AND p.nationality = 'RUS' +RETURN e, p +``` + +**Mitigation:** Always connect patterns or use explicit path relationships: + +```cypher +// SAFE: Connected via relationship +MATCH (p:Person)-[:OWNS|CONTROLS*1..4]->(e:Entity) +WHERE e.jurisdiction_code = 'BVI' AND p.nationality = 'RUS' +RETURN p, e +``` + +#### 2. Multi-Hop Pathfinding Without Limits + +```cypher +// DANGEROUS: Unbounded path exploration +MATCH path = (p:Person)-[:OWNS|CONTROLS*]->(e:Entity) +WHERE e.name CONTAINS 'Holdings' +RETURN path +``` + +**Mitigation:** Always bound variable-length paths: + +```cypher +// SAFE: Bounded to 6 hops maximum +MATCH path = (p:Person)-[:OWNS|CONTROLS*1..6]->(e:Entity) +WHERE e.name CONTAINS 'Holdings' +RETURN path +LIMIT 1000 +``` + +#### 3. Multiple Optional Matches + +```cypher +// DANGEROUS: Cascading optionals create explosion +MATCH (e:Entity) +OPTIONAL MATCH (e)-[:HAS_ADDRESS]->(a:Address) +OPTIONAL MATCH (e)-[:REGISTERED_IN]->(j:Jurisdiction) +OPTIONAL MATCH (e)<-[:INVOLVED_IN]-(o:Officer) +OPTIONAL MATCH (e)<-[:OWNS]-(owner) +RETURN e, a, j, o, owner +``` + +**Mitigation:** Use subqueries or COLLECT for optional data: + +```cypher +// SAFE: Aggregated optionals +MATCH (e:Entity) +OPTIONAL MATCH (e)-[:HAS_ADDRESS]->(a:Address) +WITH e, COLLECT(DISTINCT a) AS addresses +OPTIONAL MATCH (e)<-[:INVOLVED_IN]-(o:Officer) +WITH e, addresses, COLLECT(DISTINCT o) AS officers +RETURN e, addresses, officers +LIMIT 100 +``` + +#### 4. 
All-Pairs Shortest Path + +```cypher +// DANGEROUS: All pairs between two large sets +MATCH (p1:Person), (p2:Person) +WHERE p1.nationality = 'USA' AND p2.nationality = 'CHN' +MATCH path = shortestPath((p1)-[*]-(p2)) +RETURN path +``` + +**Mitigation:** Start from specific nodes, use LIMIT, or pre-filter: + +```cypher +// SAFER: Single source, bounded +MATCH (p1:Person {person_id: 'specific-id'}) +MATCH (p2:Person) WHERE p2.nationality = 'CHN' +WITH p1, p2 LIMIT 100 +MATCH path = shortestPath((p1)-[*..8]-(p2)) +RETURN path +``` + +### High-Risk Query Patterns + +| Pattern | Risk | Mitigation | +| ---------------------------- | ------------------- | -------------------------------------- | +| `()-[*]->()` | Unbounded traversal | Use `*1..N` with N โ‰ค 10 | +| Multiple unconnected `MATCH` | Cartesian product | Connect with relationships | +| `MATCH (a), (b) WHERE...` | Cartesian join | Rewrite with explicit paths | +| `shortestPath` between sets | Explosive pairs | Use WITH/LIMIT first | +| Deep ownership chains | Memory exhaustion | Limit depth to 6 hops | +| Full-text + path traversal | Slow fan-out | Filter by index first, traverse second | + +### Recommended Depth Limits by Use Case + +| Analysis Type | Max Hops | Rationale | +| ------------------------------- | -------- | --------------------------------------- | +| Direct beneficial ownership | 1-2 | Immediate ownership is most relevant | +| Layered structure detection | 3-4 | Most offshore structures use 2-4 layers | +| Ultimate beneficial owner (UBO) | 4-6 | Regulatory standard is 4 layers | +| Network clustering | 2-3 | Shared addresses/officers | +| Intermediary analysis | 1-2 | Direct client relationships | +| Deep investigation | 6-8 | Exceptional cases only, with LIMIT | + +--- + +## Example Queries + +### 1. 
Find Beneficial Ownership Chain + +```cypher +// Trace ownership from a person through all layers +MATCH path = (p:Person {full_name: 'John Smith'})-[:OWNS|CONTROLS*1..6]->(e:Entity) +WHERE e.status = 'Active' +RETURN path +ORDER BY length(path) ASC +LIMIT 50 +``` + +### 2. Identify Mass Registration Addresses + +```cypher +// Find addresses with suspiciously many entities +MATCH (a:Address)<-[:HAS_ADDRESS]-(e:Entity) +WITH a, COUNT(e) AS entity_count +WHERE entity_count > 50 +RETURN a.full_address, a.country_code, entity_count +ORDER BY entity_count DESC +LIMIT 20 +``` + +### 3. Intermediary Risk Profile + +```cypher +// Profile an intermediary's client base +MATCH (i:Intermediary {name: 'Mossack Fonseca'})<-[:CREATED_BY]-(e:Entity) +OPTIONAL MATCH (e)-[:REGISTERED_IN]->(j:Jurisdiction) +WITH i, j.jurisdiction_code AS jurisdiction, COUNT(e) AS entity_count +RETURN jurisdiction, entity_count +ORDER BY entity_count DESC +``` + +### 4. PEP Exposure Detection + +```cypher +// Find entities connected to politically exposed persons +MATCH (pep:Person {is_pep: true})-[:OWNS|CONTROLS|INVOLVED_IN*1..3]->(e:Entity) +MATCH (e)-[:REGISTERED_IN]->(j:Jurisdiction {is_tax_haven: true}) +RETURN pep.full_name, pep.pep_details, e.name, j.name AS jurisdiction +LIMIT 100 +``` + +### 5. 
Shared Officer Network + +```cypher +// Find persons who serve as officers in multiple companies +MATCH (p:Person)-[r:INVOLVED_IN]->(e:Entity) +WHERE r.role IN ['Director', 'Secretary', 'Nominee Director'] +WITH p, COUNT(DISTINCT e) AS company_count, COLLECT(e.name)[0..5] AS sample_companies +WHERE company_count > 10 +RETURN p.full_name, company_count, sample_companies +ORDER BY company_count DESC +LIMIT 50 +``` + +--- + +## Data Model Diagram + +``` +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ PANAMA PAPERS GRAPH SCHEMA โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + + โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” + โ”‚ Person โ”‚โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€RELATED_TOโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ–ถโ”‚ Person โ”‚ + โ”‚ โ”‚ โ”‚ โ”‚ + โ””โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”˜ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + โ”‚ + โ”‚ OWNS / CONTROLS / INVOLVED_IN + โ–ผ + โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” CREATED_BY โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” + โ”‚ Entity โ”‚โ—€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”‚ Intermediary โ”‚ + โ”‚ โ”‚ โ”‚ โ”‚ + โ”‚ (Company)โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + โ””โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”˜ โ”‚ + โ”‚ โ”‚ + โ”‚ OWNS โ”‚ HAS_ADDRESS + โ–ผ โ–ผ + โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” + โ”‚ Entity โ”‚โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€HAS_ADDRESSโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ–ถโ”‚ Address โ”‚ + โ”‚ โ”‚ โ”‚ โ”‚ + 
└────┬─────┘ └──────────────┘ + │ + │ REGISTERED_IN + ▼ + ┌──────────────┐ + │ Jurisdiction │ + │ │ + └──────────────┘ + + + ┌──────────┐ + │ Officer │──────────INVOLVED_IN───────▶ Entity / Company + │ │ + └──────────┘ + + +LEGEND: +────────▶ Directed relationship +◀──────── Incoming relationship +─────── Bidirectional or flexible +``` + +--- + +## Schema Validation Constraints + +```cypher +// Ensure data integrity +// NOTE: property existence (IS NOT NULL) constraints require Neo4j Enterprise Edition +CREATE CONSTRAINT entity_id_exists FOR (e:Entity) REQUIRE e.entity_id IS NOT NULL; +CREATE CONSTRAINT person_id_exists FOR (p:Person) REQUIRE p.person_id IS NOT NULL; +CREATE CONSTRAINT person_name_exists FOR (p:Person) REQUIRE p.full_name IS NOT NULL; +CREATE CONSTRAINT company_name_exists FOR (c:Company) REQUIRE c.company_name IS NOT NULL; +CREATE CONSTRAINT jurisdiction_code_exists FOR (j:Jurisdiction) REQUIRE j.jurisdiction_code IS NOT NULL; +CREATE CONSTRAINT address_country_exists FOR (a:Address) REQUIRE a.country_code IS NOT NULL; + +// Relationship property range check (Neo4j 5.x) +// Neo4j has no range/CHECK constraints (only uniqueness, existence, key and property-type constraints), +// so the 0-100 bound on OWNS.ownership_percentage must be enforced at import time. +// Audit query: should return 0 +MATCH ()-[r:OWNS]->() +WHERE r.ownership_percentage < 0 OR r.ownership_percentage > 100 +RETURN count(r) AS out_of_range_ownership_percentages; +``` + +--- + +## Version History + +| Version | Date | Changes | +| ------- | ------- | --------------------- | +| 1.0 | 2024-01 | Initial schema design | + +--- + +_Schema designed for ICIJ Panama Papers analysis. 
Targets Neo4j 5.x. Property existence (`IS NOT NULL`) constraints are an Enterprise Edition feature; on Community Edition, omit them and enforce required properties at import time._ diff --git a/scripts/advanced_queries.cypher b/scripts/advanced_queries.cypher new file mode 100644 index 0000000..99062b6 --- /dev/null +++ b/scripts/advanced_queries.cypher @@ -0,0 +1,677 @@ +// ============================================================================ +// PANAMA PAPERS - ANALYSIS QUERIES +// ============================================================================ +// +// Neo4j 5.x Cypher queries for offshore financial network analysis +// +// Categories: +// 1. Beneficial Ownership Tracing +// 2. Red Flag Detection +// 3. Intermediary Analysis +// 4. Jurisdiction Risk Analysis +// 5. Network Pattern Detection +// +// Performance Notes: +// - All queries include LIMIT clauses to prevent runaway execution +// - Use EXPLAIN or PROFILE prefix to analyze query plans +// - Ensure indexes exist on filtered properties (see schema.cypher) +// - Variable-length paths bounded to prevent memory issues +// +// ============================================================================ + + +// ============================================================================ +// QUERY 1: BENEFICIAL OWNERSHIP TRACING +// ============================================================================ +// Purpose: Find ownership chain from entity to ultimate beneficial owners +// Direction: Entity <- OWNS <- ... 
<- Person (reverse traversal) +// Expected Time: ~50-200ms depending on graph density + +// 1.1 - Simple beneficial owner lookup (1-4 hops) +// Finds all persons who own the target entity through any chain +// Returns one row per ownership path, so a person reached via several chains appears several times +// NOTE: a missing ownership_percentage is treated as 100%, so effective_ownership is an upper bound +MATCH path = (beneficial:Person)-[:OWNS*1..4]->(target:Entity) +WHERE target.entity_id = $entityId +WITH + beneficial, + path, + length(path) AS depth, + [r IN relationships(path) | r.ownership_percentage] AS percentages +RETURN + beneficial.full_name AS beneficial_owner, + beneficial.nationality AS nationality, + beneficial.is_pep AS is_pep, + depth AS ownership_depth, + percentages AS ownership_chain, + REDUCE(pct = 100.0, p IN percentages | pct * COALESCE(p, 100.0) / 100.0) AS effective_ownership +ORDER BY effective_ownership DESC, depth ASC +LIMIT 20; + + +// 1.2 - Beneficial ownership with intermediate entities +// Shows each entity in the ownership chain +// layer 0 is the person; ownership_pct is the stake each node holds in the next node of the chain (null for the target) +MATCH path = (person:Person)-[:OWNS*1..4]->(target:Entity {entity_id: $entityId}) +WITH + person, + nodes(path) AS chain_nodes, + relationships(path) AS chain_rels, + length(path) AS depth +UNWIND range(0, size(chain_nodes) - 1) AS idx +WITH + person, + depth, + idx, + chain_nodes[idx] AS node, + CASE WHEN idx < size(chain_rels) THEN chain_rels[idx] ELSE null END AS rel +RETURN + person.full_name AS beneficial_owner, + depth AS total_depth, + idx AS layer, + COALESCE(node.name, node.full_name) AS entity_name, + labels(node)[0] AS entity_type, + node.jurisdiction_code AS jurisdiction, + rel.ownership_percentage AS ownership_pct +ORDER BY person.full_name, idx +LIMIT 100; + + +// 1.3 - Shortest path to each beneficial owner +// Finds the shortest ownership path (most direct control) from every person who owns the target within 6 hops +// shortestPath yields a single path per (person, target) pair; use allShortestPaths to list every equally short path +MATCH (target:Entity {entity_id: $entityId}) +MATCH (person:Person) +WHERE (person)-[:OWNS*1..6]->(target) +MATCH path = shortestPath((person)-[:OWNS*1..6]->(target)) +WITH person, path, length(path) AS depth +RETURN + person.full_name AS owner, + person.nationality AS nationality, + depth, + [n IN nodes(path) | 
COALESCE(n.name, n.full_name)] AS ownership_chain +ORDER BY depth ASC +LIMIT 15; + + +// 1.4 - Ownership tree (all downstream entities from a person) +// Finds everything a person owns directly or indirectly +// `owned` carries no label, so every node reachable through OWNS is returned (one row per path) +// NOTE: a missing ownership_percentage is treated as 100%, so effective_ownership_pct is an upper bound +MATCH path = (person:Person {person_id: $personId})-[:OWNS*1..5]->(owned) +WITH + owned, + length(path) AS depth, + [r IN relationships(path) | r.ownership_percentage] AS percentages +WITH + owned, + depth, + REDUCE(pct = 100.0, p IN percentages | pct * COALESCE(p, 100.0) / 100.0) AS effective_pct +RETURN + owned.entity_id AS entity_id, + owned.name AS entity_name, + owned.jurisdiction_code AS jurisdiction, + owned.entity_type AS type, + depth AS layers_deep, + round(effective_pct * 100) / 100 AS effective_ownership_pct +ORDER BY depth, effective_pct DESC +LIMIT 50; + + +// ============================================================================ +// QUERY 2: RED FLAG DETECTION +// ============================================================================ +// Purpose: Identify suspicious patterns indicating potential money laundering, +// tax evasion, or other financial crimes + +// 2.1 - Deep ownership layering (4+ hops = high risk) +// Excessive layering often indicates intent to obscure ownership +// Requires the APOC plugin (apoc.coll.toSet) +// The *4..8 pattern guarantees depth >= 4, so the ELSE 'LOW' branch below is never reached +MATCH path = (p:Person)-[:OWNS*4..8]->(e:Entity) +WITH + p, + e, + length(path) AS depth, + [n IN nodes(path) WHERE n:Entity | n.jurisdiction_code] AS jurisdictions +WITH + p, + e, + depth, + jurisdictions, + size(apoc.coll.toSet(jurisdictions)) AS unique_jurisdictions +RETURN + p.full_name AS beneficial_owner, + p.nationality AS nationality, + p.is_pep AS is_pep, + e.name AS end_entity, + depth AS layering_depth, + unique_jurisdictions AS jurisdictions_used, + jurisdictions AS jurisdiction_chain, + CASE + WHEN depth >= 6 AND unique_jurisdictions >= 4 THEN 'CRITICAL' + WHEN depth >= 5 OR unique_jurisdictions >= 3 THEN 'HIGH' + WHEN depth >= 4 THEN 'MEDIUM' + ELSE 'LOW' + END AS risk_level +ORDER BY depth DESC, unique_jurisdictions DESC +LIMIT 25; + + +// 2.2 
- Circular ownership detection +// Entities that own themselves through chains (shell company networks) +// Each cycle is reported once per member entity, because every node on the loop matches as a start point +MATCH path = (e:Entity)-[:OWNS*2..6]->(e) +WITH + e, + path, + length(path) AS cycle_length, + [n IN nodes(path) | n.name] AS cycle_entities +RETURN + e.name AS entity_name, + e.jurisdiction_code AS jurisdiction, + cycle_length, + cycle_entities, + 'CIRCULAR_OWNERSHIP' AS red_flag +ORDER BY cycle_length ASC +LIMIT 20; + + +// 2.3 - Multi-jurisdiction hopping +// Entities with ownership chains crossing 3+ distinct tax havens +// haven_crossings is de-duplicated so the threshold counts distinct tax havens rather than +// repeated entities registered in the same haven (requires the APOC plugin: apoc.coll.toSet) +MATCH (j:Jurisdiction {is_tax_haven: true}) +WITH collect(j.jurisdiction_code) AS tax_havens +MATCH path = (p:Person)-[:OWNS*2..5]->(e:Entity) +WITH + p, e, path, tax_havens, + [n IN nodes(path) WHERE n:Entity | n.jurisdiction_code] AS chain_jurisdictions +WITH + p, e, + apoc.coll.toSet([j IN chain_jurisdictions WHERE j IN tax_havens]) AS haven_crossings, + chain_jurisdictions +WHERE size(haven_crossings) >= 3 +RETURN + p.full_name AS owner, + e.name AS end_entity, + chain_jurisdictions AS full_chain, + haven_crossings AS tax_havens_used, + size(haven_crossings) AS haven_count, + 'JURISDICTION_SHOPPING' AS red_flag +ORDER BY size(haven_crossings) DESC +LIMIT 20; + + +// 2.4 - Nominee address concentration +// Many entities at same address (indicates shell companies) +// Addresses with fewer than 10 entities are filtered out; sample_entities lists at most 10 names +MATCH (a:Address)<-[:HAS_ADDRESS]-(e:Entity) +WITH a, collect(e) AS entities, count(e) AS entity_count +WHERE entity_count >= 10 +RETURN + a.full_address AS address, + a.country_code AS country, + entity_count AS entities_at_address, + [ent IN entities[0..10] | ent.name] AS sample_entities, + CASE + WHEN entity_count >= 100 THEN 'CRITICAL' + WHEN entity_count >= 50 THEN 'HIGH' + WHEN entity_count >= 20 THEN 'MEDIUM' + ELSE 'LOW' + END AS risk_level, + 'MASS_REGISTRATION_ADDRESS' AS red_flag +ORDER BY entity_count DESC +LIMIT 20; + + +// 2.5 - PEP (Politically Exposed Person) connections +// Find entities connected to politically exposed persons +MATCH (pep:Person {is_pep: 
true})-[r:OWNS|CONTROLS|INVOLVED_IN*1..3]->(e:Entity) +WITH + pep, + e, + size(r) AS connection_depth, + [rel IN r | type(rel)] AS relationship_types +MATCH (e)-[:REGISTERED_IN]->(j:Jurisdiction) +RETURN + pep.full_name AS pep_name, + pep.pep_details AS political_role, + e.name AS connected_entity, + e.entity_type AS entity_type, + j.name AS jurisdiction, + j.is_tax_haven AS is_tax_haven, + connection_depth, + relationship_types, + 'PEP_CONNECTION' AS red_flag +ORDER BY j.is_tax_haven DESC, connection_depth ASC +LIMIT 30; + + +// 2.6 - Rapid entity creation (bulk formation) +// Entities created in same week by same intermediary +// `year` is the ISO week-based year (weekYear) so that it pairs correctly with the ISO week number; +// the calendar year would split or merge batches for dates around New Year +MATCH (i:Intermediary)<-[:CREATED_BY]-(e:Entity) +WHERE e.incorporation_date IS NOT NULL +WITH + i, + date(e.incorporation_date).week AS week, + date(e.incorporation_date).weekYear AS year, + collect(e) AS entities +WITH + i, + year, + week, + entities, + size(entities) AS batch_size +WHERE batch_size >= 5 +RETURN + i.name AS intermediary, + i.country_code AS country, + year, + week, + batch_size AS entities_created, + [ent IN entities[0..5] | ent.name] AS sample_entities, + 'BULK_FORMATION' AS red_flag +ORDER BY batch_size DESC +LIMIT 20; + + +// ============================================================================ +// QUERY 3: INTERMEDIARY ANALYSIS +// ============================================================================ +// Purpose: Identify influential intermediaries (law firms, banks, accountants) +// who facilitate offshore structures + +// 3.1 - Top intermediaries by entity count +// top_jurisdictions is the first 10 distinct jurisdiction codes collected, in no particular order +MATCH (i:Intermediary)<-[:CREATED_BY]-(e:Entity) +WITH i, count(e) AS entity_count, collect(DISTINCT e.jurisdiction_code) AS jurisdictions +RETURN + i.name AS intermediary_name, + i.type AS intermediary_type, + i.country_code AS country, + entity_count AS entities_created, + size(jurisdictions) AS jurisdictions_served, + jurisdictions[0..10] AS top_jurisdictions +ORDER BY entity_count DESC +LIMIT 25; + + +// 3.2 - Intermediary specialization
(jurisdiction focus) +// Lists up to five jurisdictions per intermediary together with the entity count in each +MATCH (i:Intermediary)<-[:CREATED_BY]-(e:Entity)-[:REGISTERED_IN]->(j:Jurisdiction) +WITH i, j, count(e) AS entity_count +ORDER BY entity_count DESC +WITH i, collect({jurisdiction: j.name, count: entity_count})[0..5] AS top_jurisdictions +RETURN + i.name AS intermediary, + i.type AS type, + top_jurisdictions +ORDER BY top_jurisdictions[0].count DESC +LIMIT 20; + + +// 3.3 - Intermediaries serving PEPs +// A PEP counts as a client when they own or control (within 2 hops) an entity created by the intermediary +MATCH (pep:Person {is_pep: true})-[:OWNS|CONTROLS*1..2]->(e:Entity)-[:CREATED_BY]->(i:Intermediary) +WITH i, collect(DISTINCT pep) AS peps, count(DISTINCT e) AS entities +RETURN + i.name AS intermediary, + i.country_code AS country, + size(peps) AS pep_clients, + [p IN peps[0..5] | p.full_name] AS sample_peps, + entities AS entities_for_peps, + 'PEP_SERVICE_PROVIDER' AS flag +ORDER BY size(peps) DESC +LIMIT 15; + + +// 3.4 - Intermediary network (shared clients) +// Find intermediaries that share clients with other intermediaries +// Ordering the pair by elementId reports each pair once; `i1 <> i2` alone returns both (A, B) and (B, A), +// which wastes half of the LIMIT on mirrored rows +MATCH (i1:Intermediary)<-[:CREATED_BY]-(e:Entity)-[:CREATED_BY]->(i2:Intermediary) +WHERE elementId(i1) < elementId(i2) +WITH i1, i2, count(e) AS shared_entities +WHERE shared_entities >= 5 +RETURN + i1.name AS intermediary_1, + i2.name AS intermediary_2, + shared_entities AS shared_clients +ORDER BY shared_entities DESC +LIMIT 20; + + +// ============================================================================ +// QUERY 4: JURISDICTION RISK ANALYSIS +// ============================================================================ +// Purpose: Analyze geographic patterns and identify high-risk jurisdiction usage + +// 4.1 - Jurisdiction statistics +MATCH (e:Entity)-[:REGISTERED_IN]->(j:Jurisdiction) +WITH j, count(e) AS entity_count +OPTIONAL MATCH (e2:Entity)-[:REGISTERED_IN]->(j) +WHERE e2.status = 'Active' +WITH j, entity_count, count(e2) AS active_count +RETURN + j.jurisdiction_code AS code, + j.name AS jurisdiction, + j.is_tax_haven AS tax_haven, + j.secrecy_score AS secrecy_score, + j.risk_level AS risk_level, + entity_count AS 
total_entities, + active_count AS active_entities, + round(active_count * 100.0 / entity_count) AS active_pct +ORDER BY entity_count DESC +LIMIT 25; + + +// 4.2 - Jurisdiction flow analysis +// Where does money flow between jurisdictions? +MATCH (e1:Entity)-[:OWNS]->(e2:Entity) +WHERE e1.jurisdiction_code IS NOT NULL AND e2.jurisdiction_code IS NOT NULL +WITH e1.jurisdiction_code AS from_j, e2.jurisdiction_code AS to_j, count(*) AS flow_count +WHERE from_j <> to_j AND flow_count >= 10 +RETURN + from_j AS from_jurisdiction, + to_j AS to_jurisdiction, + flow_count AS ownership_connections +ORDER BY flow_count DESC +LIMIT 30; + + +// 4.3 - Tax haven chains +// Ownership paths that cross multiple tax havens +MATCH (j1:Jurisdiction {is_tax_haven: true})<-[:REGISTERED_IN]-(e1:Entity) + -[:OWNS]->(e2:Entity)-[:REGISTERED_IN]->(j2:Jurisdiction {is_tax_haven: true}) +WHERE j1 <> j2 +WITH j1, j2, count(*) AS chain_count +RETURN + j1.name AS from_haven, + j2.name AS to_haven, + chain_count AS ownership_chains +ORDER BY chain_count DESC +LIMIT 20; + + +// 4.4 - Nationality vs jurisdiction mismatch +// Persons owning entities in different jurisdictions than their nationality +MATCH (p:Person)-[:OWNS*1..2]->(e:Entity)-[:REGISTERED_IN]->(j:Jurisdiction) +WHERE p.nationality IS NOT NULL + AND p.nationality <> j.jurisdiction_code + AND j.is_tax_haven = true +WITH p, j, count(e) AS entity_count +WHERE entity_count >= 2 +RETURN + p.full_name AS owner, + p.nationality AS nationality, + j.name AS offshore_jurisdiction, + entity_count AS entities_owned, + 'OFFSHORE_MISMATCH' AS pattern +ORDER BY entity_count DESC +LIMIT 25; + + +// ============================================================================ +// QUERY 5: NETWORK PATTERN DETECTION +// ============================================================================ +// Purpose: Identify structural patterns in the ownership network + +// 5.1 - Hub entities (highly connected) +// Entities with many incoming and outgoing 
ownership links +MATCH (e:Entity) +OPTIONAL MATCH (e)<-[:OWNS]-(owner) +OPTIONAL MATCH (e)-[:OWNS]->(owned) +WITH e, count(DISTINCT owner) AS owners, count(DISTINCT owned) AS subsidiaries +WITH e, owners, subsidiaries, owners + subsidiaries AS total_connections +WHERE total_connections >= 5 +RETURN + e.name AS entity_name, + e.entity_type AS type, + e.jurisdiction_code AS jurisdiction, + owners AS owner_count, + subsidiaries AS subsidiary_count, + total_connections, + CASE + WHEN owners > subsidiaries * 2 THEN 'HOLDING_COMPANY' + WHEN subsidiaries > owners * 2 THEN 'INVESTMENT_VEHICLE' + ELSE 'INTERMEDIATE_HOLDER' + END AS entity_role +ORDER BY total_connections DESC +LIMIT 30; + + +// 5.2 - Star pattern detection +// Entities owned by single person through multiple intermediaries +MATCH (p:Person)-[:OWNS]->(intermediate:Entity)-[:OWNS]->(target:Entity) +WITH p, target, collect(DISTINCT intermediate) AS intermediates +WHERE size(intermediates) >= 3 +RETURN + p.full_name AS owner, + target.name AS target_entity, + size(intermediates) AS intermediate_count, + [i IN intermediates[0..5] | i.name] AS intermediate_entities, + 'STAR_PATTERN' AS structure_type +ORDER BY size(intermediates) DESC +LIMIT 20; + + +// 5.3 - Parallel structure detection +// Same person owning similar entities in multiple jurisdictions +MATCH (p:Person)-[:OWNS]->(e1:Entity)-[:REGISTERED_IN]->(j1:Jurisdiction) +MATCH (p)-[:OWNS]->(e2:Entity)-[:REGISTERED_IN]->(j2:Jurisdiction) +WHERE e1 <> e2 + AND j1 <> j2 + AND e1.entity_type = e2.entity_type +WITH p, collect(DISTINCT j1.jurisdiction_code) + collect(DISTINCT j2.jurisdiction_code) AS all_jurisdictions +WITH p, apoc.coll.toSet(all_jurisdictions) AS unique_jurisdictions +WHERE size(unique_jurisdictions) >= 3 +RETURN + p.full_name AS owner, + size(unique_jurisdictions) AS jurisdiction_count, + unique_jurisdictions AS jurisdictions, + 'PARALLEL_STRUCTURES' AS pattern +ORDER BY size(unique_jurisdictions) DESC +LIMIT 20; + + +// 5.4 - Officer overlap 
analysis +// Entities sharing multiple officers (indicates common control) +MATCH (e1:Entity)<-[:INVOLVED_IN]-(o:Officer)-[:INVOLVED_IN]->(e2:Entity) +WHERE e1 <> e2 +WITH e1, e2, collect(DISTINCT o) AS shared_officers +WHERE size(shared_officers) >= 2 +RETURN + e1.name AS entity_1, + e2.name AS entity_2, + e1.jurisdiction_code AS j1, + e2.jurisdiction_code AS j2, + size(shared_officers) AS shared_officer_count, + [o IN shared_officers | o.name] AS officers, + 'COMMON_CONTROL' AS indicator +ORDER BY size(shared_officers) DESC +LIMIT 25; + + +// ============================================================================ +// QUERY 6: TEMPORAL ANALYSIS +// ============================================================================ +// Purpose: Analyze changes over time + +// 6.1 - Entity creation timeline +MATCH (e:Entity) +WHERE e.incorporation_date IS NOT NULL +WITH date(e.incorporation_date).year AS year, count(e) AS created +WHERE year >= 1990 AND year <= 2020 +RETURN year, created +ORDER BY year; + + +// 6.2 - Ownership changes around key dates +// Find entities with ownership changes near a specific date +// NOTE: duration.inDays() is used for the total day difference; duration.between().days only returns the day component left over after whole months/years, which would let dates years away from the key date pass the 90-day filter +// NOTE: the owner may be a Person (full_name) or an Entity (name), hence the COALESCE +WITH date('2015-04-01') AS leak_date // Panama Papers leak date +MATCH (e:Entity)<-[r:OWNS]-(owner) +WHERE r.acquisition_date IS NOT NULL + AND abs(duration.inDays(date(r.acquisition_date), leak_date).days) <= 90 +RETURN + e.name AS entity, + COALESCE(owner.full_name, owner.name) AS owner, + r.acquisition_date AS ownership_date, + duration.inDays(date(r.acquisition_date), leak_date).days AS days_from_leak, + CASE + WHEN date(r.acquisition_date) < leak_date THEN 'BEFORE_LEAK' + ELSE 'AFTER_LEAK' + END AS timing +ORDER BY r.acquisition_date +LIMIT 30; + + +// 6.3 - Dissolution patterns +// Entities dissolved after Panama Papers leak +MATCH (e:Entity) +WHERE e.struck_off_date IS NOT NULL + AND date(e.struck_off_date) >= date('2016-04-01') + AND date(e.struck_off_date) <= date('2017-12-31') +OPTIONAL MATCH (e)<-[:OWNS]-(owner:Person) +RETURN + e.name AS entity, + e.jurisdiction_code 
AS jurisdiction, + e.struck_off_date AS dissolution_date, + collect(DISTINCT owner.full_name)[0..3] AS former_owners, + 'POST_LEAK_DISSOLUTION' AS pattern +ORDER BY e.struck_off_date +LIMIT 30; + + +// ============================================================================ +// QUERY 7: SEARCH & LOOKUP UTILITIES +// ============================================================================ +// Purpose: Common search patterns for investigation + +// 7.1 - Full-text entity search +CALL db.index.fulltext.queryNodes('entity_name_fulltext', $searchTerm) +YIELD node, score +WHERE score > 0.5 +WITH node, score +OPTIONAL MATCH (node)-[:REGISTERED_IN]->(j:Jurisdiction) +RETURN + node.entity_id AS id, + node.name AS name, + node.entity_type AS type, + j.name AS jurisdiction, + node.status AS status, + round(score * 100) / 100 AS relevance +ORDER BY score DESC +LIMIT 20; + + +// 7.2 - Person search with connections +CALL db.index.fulltext.queryNodes('person_name_fulltext', $personName) +YIELD node AS person, score +WHERE score > 0.5 +OPTIONAL MATCH (person)-[r:OWNS|CONTROLS|INVOLVED_IN]->(e:Entity) +WITH person, score, count(e) AS entity_count, collect(e.name)[0..5] AS sample_entities +RETURN + person.person_id AS id, + person.full_name AS name, + person.nationality AS nationality, + person.is_pep AS is_pep, + entity_count AS connected_entities, + sample_entities, + round(score * 100) / 100 AS relevance +ORDER BY score DESC +LIMIT 15; + + +// 7.3 - Entity profile (comprehensive lookup) +MATCH (e:Entity {entity_id: $entityId}) +OPTIONAL MATCH (e)-[:REGISTERED_IN]->(j:Jurisdiction) +OPTIONAL MATCH (e)-[:HAS_ADDRESS]->(a:Address) +OPTIONAL MATCH (e)-[:CREATED_BY]->(i:Intermediary) +OPTIONAL MATCH (owner)-[:OWNS]->(e) +OPTIONAL MATCH (e)-[:OWNS]->(subsidiary) +OPTIONAL MATCH (officer)-[:INVOLVED_IN]->(e) +RETURN + e.entity_id AS id, + e.name AS name, + e.entity_type AS type, + e.status AS status, + e.incorporation_date AS incorporated, + j.name AS jurisdiction, + 
j.is_tax_haven AS tax_haven, + a.full_address AS address, + i.name AS intermediary, + count(DISTINCT owner) AS owner_count, + count(DISTINCT subsidiary) AS subsidiary_count, + count(DISTINCT officer) AS officer_count, + collect(DISTINCT COALESCE(owner.name, owner.full_name))[0..5] AS owners, + collect(DISTINCT subsidiary.name)[0..5] AS subsidiaries, + collect(DISTINCT officer.name)[0..5] AS officers; + + +// ============================================================================ +// QUERY 8: EXPORT QUERIES +// ============================================================================ +// Purpose: Extract data for external analysis tools + +// 8.1 - Export ownership network (for Gephi/NetworkX) +MATCH (owner)-[r:OWNS]->(owned:Entity) +RETURN + COALESCE(owner.entity_id, owner.person_id) AS source_id, + COALESCE(owner.name, owner.full_name) AS source_name, + labels(owner)[0] AS source_type, + owned.entity_id AS target_id, + owned.name AS target_name, + r.ownership_percentage AS weight, + r.status AS status +LIMIT 10000; + + +// 8.2 - Export high-risk entities +MATCH (e:Entity) +WHERE e.pagerank_score > 0.001 + OR e.degree_centrality > 10 + OR e.community_id IS NOT NULL +OPTIONAL MATCH (e)-[:REGISTERED_IN]->(j:Jurisdiction {is_tax_haven: true}) +WITH e, j +WHERE j IS NOT NULL OR e.pagerank_score > 0.01 +RETURN + e.entity_id AS id, + e.name AS name, + e.jurisdiction_code AS jurisdiction, + e.entity_type AS type, + e.pagerank_score AS influence, + e.degree_centrality AS connections, + e.community_id AS community, + j IS NOT NULL AS in_tax_haven +ORDER BY e.pagerank_score DESC +LIMIT 5000; + + +// ============================================================================ +// PERFORMANCE PROFILING QUERIES +// ============================================================================ +// Use PROFILE prefix to analyze query performance + +// Profile beneficial ownership query +PROFILE +MATCH path = (p:Person)-[:OWNS*1..3]->(e:Entity {entity_id: 'TEST-ENTITY-001'}) 
+RETURN p.full_name, length(path) +LIMIT 10; + +// Explain query plan without execution +EXPLAIN +MATCH (e:Entity)-[:REGISTERED_IN]->(j:Jurisdiction {is_tax_haven: true}) +WHERE e.status = 'Active' +RETURN e.name, j.name +LIMIT 100; + + +// ============================================================================ +// ANALYSIS QUERIES COMPLETE +// ============================================================================ +// +// Query Categories: +// 1. Beneficial Ownership (4 queries) +// 2. Red Flag Detection (6 queries) +// 3. Intermediary Analysis (4 queries) +// 4. Jurisdiction Risk (4 queries) +// 5. Network Patterns (4 queries) +// 6. Temporal Analysis (3 queries) +// 7. Search Utilities (3 queries) +// 8. Export Queries (2 queries) +// +// Parameter Placeholders: +// $entityId - Target entity identifier +// $personId - Target person identifier +// $searchTerm - Full-text search term +// $personName - Person name search +// +// Usage Example: +// :param entityId => 'ENTITY-12345' +// :param searchTerm => 'Holdings Ltd' +// +// ============================================================================ diff --git a/scripts/check_pylint_score.py b/scripts/check_pylint_score.py deleted file mode 100644 index 670f315..0000000 --- a/scripts/check_pylint_score.py +++ /dev/null @@ -1,62 +0,0 @@ -"""Script to check pylint score and fail if below threshold.""" - -import sys -import subprocess -import re - - -def get_pylint_score(): - """Run pylint and extract the score.""" - try: - result = subprocess.run( - ["pylint", "app/", "scripts/", "tests/", "--rcfile=.pylintrc", "--output-format=text"], - capture_output=True, - text=True, - check=False - ) - - # Extract score from output - # Pylint outputs: "Your code has been rated at X.XX/10" - output = result.stdout + result.stderr - match = re.search(r'rated at ([\d.]+)/10', output) - - if match: - return float(match.group(1)) - - # If no score found, print output for debugging - print("Pylint output:") - 
print(output) - return None - - except FileNotFoundError: - print("ERROR: pylint not found. Install it with: pip install pylint") - return None - except Exception as e: - print(f"ERROR running pylint: {e}") - return None - - -def main(): - """Main function to check pylint score.""" - threshold = 9.5 - - print("Running pylint...") - score = get_pylint_score() - - if score is None: - print("ERROR: Could not determine pylint score") - sys.exit(1) - - print(f"Pylint score: {score}/10") - - if score < threshold: - print(f"ERROR: Pylint score ({score}) is below threshold ({threshold})") - sys.exit(1) - else: - print(f"โœ“ Pylint score ({score}) meets requirement (>= {threshold})") - sys.exit(0) - - -if __name__ == "__main__": - main() - diff --git a/scripts/create_schema.cypher b/scripts/create_schema.cypher new file mode 100644 index 0000000..3600c63 --- /dev/null +++ b/scripts/create_schema.cypher @@ -0,0 +1,644 @@ +// ============================================================================ +// PANAMA PAPERS NEO4J SCHEMA +// Neo4j 5.x Syntax - ICIJ Offshore Leaks Analysis Platform +// ============================================================================ +// +// This script creates the complete schema for analyzing offshore financial +// networks including entities, persons, companies, officers, intermediaries, +// addresses, and jurisdictions. 
+// +// USAGE: +// cypher-shell -u neo4j -p "$NEO4J_PASSWORD" -f scripts/create_schema.cypher +// OR paste into Neo4j Browser +// +// NOTES: +// - All constraints use Neo4j 5.x REQUIRE syntax (not deprecated ASSERT) +// - IF NOT EXISTS ensures idempotent execution (safe to re-run) +// - Indexes are created after constraints to avoid conflicts +// +// ============================================================================ + + +// ============================================================================ +// SECTION 1: UNIQUENESS CONSTRAINTS +// ============================================================================ +// These enforce data integrity and automatically create backing indexes. +// Each node type has a unique identifier constraint. + +// --- Entity Constraints --- +// Primary offshore entity (companies, trusts, funds, foundations) +CREATE CONSTRAINT entity_id_unique IF NOT EXISTS +FOR (e:Entity) REQUIRE e.entity_id IS UNIQUE; + +// --- Person Constraints --- +// Natural persons (beneficial owners, directors, shareholders) +CREATE CONSTRAINT person_id_unique IF NOT EXISTS +FOR (p:Person) REQUIRE p.person_id IS UNIQUE; + +// --- Company Constraints --- +// Corporate entities (subset of Entity with additional properties) +CREATE CONSTRAINT company_id_unique IF NOT EXISTS +FOR (c:Company) REQUIRE c.company_id IS UNIQUE; + +// Company name uniqueness within jurisdiction (compound constraint) +// Note: Neo4j 5.x supports node key constraints for compound uniqueness +// NOTE(review): NODE KEY constraints appear to be an Enterprise Edition feature - confirm the target edition before running +CREATE CONSTRAINT company_jurisdiction_key IF NOT EXISTS +FOR (c:Company) REQUIRE (c.name, c.jurisdiction_code) IS NODE KEY; + +// --- Officer Constraints --- +// Corporate officers (directors, secretaries, nominees) +CREATE CONSTRAINT officer_id_unique IF NOT EXISTS +FOR (o:Officer) REQUIRE o.officer_id IS UNIQUE; + +// --- Intermediary Constraints --- +// Service providers (law firms, banks, accountants) +CREATE CONSTRAINT intermediary_id_unique IF NOT EXISTS +FOR (i:Intermediary) REQUIRE i.intermediary_id IS UNIQUE; 
+ +// --- Address Constraints --- +// Physical and registered addresses +CREATE CONSTRAINT address_id_unique IF NOT EXISTS +FOR (a:Address) REQUIRE a.address_id IS UNIQUE; + +// --- Jurisdiction Constraints --- +// Tax havens and registration jurisdictions (reference data) +CREATE CONSTRAINT jurisdiction_code_unique IF NOT EXISTS +FOR (j:Jurisdiction) REQUIRE j.jurisdiction_code IS UNIQUE; + + +// ============================================================================ +// SECTION 2: EXISTENCE CONSTRAINTS (Data Quality) +// ============================================================================ +// These ensure required properties are always present. +// Critical for maintaining data integrity during imports. + +// --- Entity Required Properties --- +CREATE CONSTRAINT entity_name_exists IF NOT EXISTS +FOR (e:Entity) REQUIRE e.name IS NOT NULL; + +CREATE CONSTRAINT entity_source_exists IF NOT EXISTS +FOR (e:Entity) REQUIRE e.source IS NOT NULL; + +// --- Person Required Properties --- +CREATE CONSTRAINT person_name_exists IF NOT EXISTS +FOR (p:Person) REQUIRE p.full_name IS NOT NULL; + +// --- Company Required Properties --- +CREATE CONSTRAINT company_name_exists IF NOT EXISTS +FOR (c:Company) REQUIRE c.name IS NOT NULL; + +CREATE CONSTRAINT company_jurisdiction_exists IF NOT EXISTS +FOR (c:Company) REQUIRE c.jurisdiction_code IS NOT NULL; + +// --- Officer Required Properties --- +CREATE CONSTRAINT officer_name_exists IF NOT EXISTS +FOR (o:Officer) REQUIRE o.name IS NOT NULL; + +CREATE CONSTRAINT officer_role_exists IF NOT EXISTS +FOR (o:Officer) REQUIRE o.role_type IS NOT NULL; + +// --- Intermediary Required Properties --- +CREATE CONSTRAINT intermediary_name_exists IF NOT EXISTS +FOR (i:Intermediary) REQUIRE i.name IS NOT NULL; + +// --- Address Required Properties --- +CREATE CONSTRAINT address_country_exists IF NOT EXISTS +FOR (a:Address) REQUIRE a.country_code IS NOT NULL; + +// --- Jurisdiction Required Properties --- +CREATE CONSTRAINT 
jurisdiction_name_exists IF NOT EXISTS +FOR (j:Jurisdiction) REQUIRE j.name IS NOT NULL; + + +// ============================================================================ +// SECTION 3: SINGLE-PROPERTY INDEXES (High-Cardinality Lookups) +// ============================================================================ +// These optimize WHERE clause filtering on frequently queried properties. +// Created separately from constraints for properties that aren't unique. + +// --- Entity Indexes --- +// Name search (most common query pattern) +CREATE INDEX entity_name_idx IF NOT EXISTS +FOR (e:Entity) ON (e.name); + +// Filter by entity type (Company, Trust, Fund, Foundation) +CREATE INDEX entity_type_idx IF NOT EXISTS +FOR (e:Entity) ON (e.entity_type); + +// Filter by jurisdiction +CREATE INDEX entity_jurisdiction_idx IF NOT EXISTS +FOR (e:Entity) ON (e.jurisdiction_code); + +// Filter by status (Active, Inactive, Dissolved) +CREATE INDEX entity_status_idx IF NOT EXISTS +FOR (e:Entity) ON (e.status); + +// Filter by data source (Panama Papers, Paradise Papers, etc.) 
+CREATE INDEX entity_source_idx IF NOT EXISTS +FOR (e:Entity) ON (e.source); + +// Temporal filtering (registration date) +CREATE INDEX entity_registration_date_idx IF NOT EXISTS +FOR (e:Entity) ON (e.registration_date); + +// --- Person Indexes --- +// Name search +CREATE INDEX person_fullname_idx IF NOT EXISTS +FOR (p:Person) ON (p.full_name); + +// Last name search (common for partial matching) +CREATE INDEX person_lastname_idx IF NOT EXISTS +FOR (p:Person) ON (p.last_name); + +// Nationality filtering (country code) +CREATE INDEX person_nationality_idx IF NOT EXISTS +FOR (p:Person) ON (p.nationality); + +// PEP (Politically Exposed Person) flag +CREATE INDEX person_pep_idx IF NOT EXISTS +FOR (p:Person) ON (p.is_pep); + +// Country of residence +CREATE INDEX person_residence_idx IF NOT EXISTS +FOR (p:Person) ON (p.country_of_residence); + +// --- Company Indexes --- +// Company name search +CREATE INDEX company_name_idx IF NOT EXISTS +FOR (c:Company) ON (c.name); + +// Jurisdiction filtering +CREATE INDEX company_jurisdiction_idx IF NOT EXISTS +FOR (c:Company) ON (c.jurisdiction_code); + +// Status filtering +CREATE INDEX company_status_idx IF NOT EXISTS +FOR (c:Company) ON (c.status); + +// Company type (Ltd, LLC, Inc, SA, etc.) +CREATE INDEX company_type_idx IF NOT EXISTS +FOR (c:Company) ON (c.company_type); + +// Registry number lookup +CREATE INDEX company_number_idx IF NOT EXISTS +FOR (c:Company) ON (c.company_number); + +// Shell company flag (for risk analysis) +CREATE INDEX company_shell_idx IF NOT EXISTS +FOR (c:Company) ON (c.is_shell_company); + +// Incorporation date (temporal queries) +CREATE INDEX company_incorporation_idx IF NOT EXISTS +FOR (c:Company) ON (c.incorporation_date); + +// --- Officer Indexes --- +// Officer name search +CREATE INDEX officer_name_idx IF NOT EXISTS +FOR (o:Officer) ON (o.name); + +// Role type filtering (Director, Secretary, Nominee, etc.) 
+CREATE INDEX officer_role_idx IF NOT EXISTS +FOR (o:Officer) ON (o.role_type); + +// Corporate officer flag (company acting as officer) +CREATE INDEX officer_corporate_idx IF NOT EXISTS +FOR (o:Officer) ON (o.is_corporate_officer); + +// Status filtering +CREATE INDEX officer_status_idx IF NOT EXISTS +FOR (o:Officer) ON (o.status); + +// --- Intermediary Indexes --- +// Intermediary name search +CREATE INDEX intermediary_name_idx IF NOT EXISTS +FOR (i:Intermediary) ON (i.name); + +// Type filtering (Law Firm, Bank, Trust Company, etc.) +CREATE INDEX intermediary_type_idx IF NOT EXISTS +FOR (i:Intermediary) ON (i.type); + +// Country filtering +CREATE INDEX intermediary_country_idx IF NOT EXISTS +FOR (i:Intermediary) ON (i.country_code); + +// Status filtering +CREATE INDEX intermediary_status_idx IF NOT EXISTS +FOR (i:Intermediary) ON (i.status); + +// --- Address Indexes --- +// Country filtering (most common address query) +CREATE INDEX address_country_idx IF NOT EXISTS +FOR (a:Address) ON (a.country_code); + +// City filtering +CREATE INDEX address_city_idx IF NOT EXISTS +FOR (a:Address) ON (a.city); + +// Postal code lookup +CREATE INDEX address_postal_idx IF NOT EXISTS +FOR (a:Address) ON (a.postal_code); + +// Nominee address flag (red flag indicator) +CREATE INDEX address_nominee_idx IF NOT EXISTS +FOR (a:Address) ON (a.is_nominee_address); + +// --- Jurisdiction Indexes --- +// Jurisdiction name lookup +CREATE INDEX jurisdiction_name_idx IF NOT EXISTS +FOR (j:Jurisdiction) ON (j.name); + +// Tax haven flag (risk filtering) +CREATE INDEX jurisdiction_haven_idx IF NOT EXISTS +FOR (j:Jurisdiction) ON (j.is_tax_haven); + +// Geographic region filtering +CREATE INDEX jurisdiction_region_idx IF NOT EXISTS +FOR (j:Jurisdiction) ON (j.region); + +// Risk level filtering +CREATE INDEX jurisdiction_risk_idx IF NOT EXISTS +FOR (j:Jurisdiction) ON (j.risk_level); + +// FATF status filtering +CREATE INDEX jurisdiction_fatf_idx IF NOT EXISTS +FOR (j:Jurisdiction) ON 
(j.fatf_status); + + +// ============================================================================ +// SECTION 4: COMPOSITE INDEXES (Multi-Property Queries) +// ============================================================================ +// These optimize queries with multiple WHERE conditions. +// Order matters: put highest-cardinality property first. + +// --- Entity Composite Indexes --- +// Jurisdiction + status (common filter combination) +CREATE INDEX entity_jurisdiction_status_idx IF NOT EXISTS +FOR (e:Entity) ON (e.jurisdiction_code, e.status); + +// Jurisdiction + type (filter by entity category in jurisdiction) +CREATE INDEX entity_jurisdiction_type_idx IF NOT EXISTS +FOR (e:Entity) ON (e.jurisdiction_code, e.entity_type); + +// Source + status (filter active entities by leak source) +CREATE INDEX entity_source_status_idx IF NOT EXISTS +FOR (e:Entity) ON (e.source, e.status); + +// --- Company Composite Indexes --- +// Jurisdiction + status + incorporation date (temporal jurisdiction analysis) +CREATE INDEX company_jurisdiction_status_date_idx IF NOT EXISTS +FOR (c:Company) ON (c.jurisdiction_code, c.status, c.incorporation_date); + +// --- Person Composite Indexes --- +// Nationality + PEP flag (high-risk person identification) +CREATE INDEX person_nationality_pep_idx IF NOT EXISTS +FOR (p:Person) ON (p.nationality, p.is_pep); + +// --- Address Composite Indexes --- +// Country + city (geographic clustering) +CREATE INDEX address_country_city_idx IF NOT EXISTS +FOR (a:Address) ON (a.country_code, a.city); + + +// ============================================================================ +// SECTION 5: FULL-TEXT INDEXES (Fuzzy Name Searching) +// ============================================================================ +// These enable CONTAINS, fuzzy matching, and relevance-scored searches. +// Essential for name matching across different spellings/transliterations. 
+ +// --- Entity Full-Text Index --- +// Search across name and original_name (non-Latin scripts) +CREATE FULLTEXT INDEX entity_name_fulltext IF NOT EXISTS +FOR (e:Entity) ON EACH [e.name, e.original_name]; + +// --- Person Full-Text Index --- +// Search across all name fields +CREATE FULLTEXT INDEX person_name_fulltext IF NOT EXISTS +FOR (p:Person) ON EACH [p.full_name, p.first_name, p.last_name]; + +// --- Company Full-Text Index --- +// Search company names +CREATE FULLTEXT INDEX company_name_fulltext IF NOT EXISTS +FOR (c:Company) ON EACH [c.name]; + +// --- Officer Full-Text Index --- +// Search officer names +CREATE FULLTEXT INDEX officer_name_fulltext IF NOT EXISTS +FOR (o:Officer) ON EACH [o.name]; + +// --- Intermediary Full-Text Index --- +// Search intermediary names +CREATE FULLTEXT INDEX intermediary_name_fulltext IF NOT EXISTS +FOR (i:Intermediary) ON EACH [i.name]; + +// --- Address Full-Text Index --- +// Search full address text +CREATE FULLTEXT INDEX address_fulltext IF NOT EXISTS +FOR (a:Address) ON EACH [a.full_address, a.city]; + + +// ============================================================================ +// SECTION 6: RELATIONSHIP PROPERTY INDEXES (Neo4j 5.x Feature) +// ============================================================================ +// These index properties on relationships for filtered traversals. +// Critical for ownership percentage filtering and temporal queries. + +// --- OWNS Relationship Indexes --- +// Filter by ownership status (Active, Historical) +CREATE INDEX owns_status_idx IF NOT EXISTS +FOR ()-[r:OWNS]-() ON (r.status); + +// Filter by ownership percentage (>25% threshold queries) +CREATE INDEX owns_percentage_idx IF NOT EXISTS +FOR ()-[r:OWNS]-() ON (r.ownership_percentage); + +// Filter by nominee ownership flag +CREATE INDEX owns_nominee_idx IF NOT EXISTS +FOR ()-[r:OWNS]-() ON (r.is_nominee); + +// --- CONTROLS Relationship Indexes --- +// Filter by control type (Board Majority, Voting Agreement, etc.) 
+CREATE INDEX controls_type_idx IF NOT EXISTS +FOR ()-[r:CONTROLS]-() ON (r.control_type); + +// Filter by control status +CREATE INDEX controls_status_idx IF NOT EXISTS +FOR ()-[r:CONTROLS]-() ON (r.status); + +// --- INVOLVED_IN Relationship Indexes --- +// Filter by role type (Director, Secretary, etc.) +CREATE INDEX involved_role_idx IF NOT EXISTS +FOR ()-[r:INVOLVED_IN]-() ON (r.role); + +// Filter by status (Active, Former) +CREATE INDEX involved_status_idx IF NOT EXISTS +FOR ()-[r:INVOLVED_IN]-() ON (r.status); + +// Filter by nominee flag +CREATE INDEX involved_nominee_idx IF NOT EXISTS +FOR ()-[r:INVOLVED_IN]-() ON (r.is_nominee); + +// --- CREATED_BY Relationship Indexes --- +// Filter by relationship status +CREATE INDEX created_status_idx IF NOT EXISTS +FOR ()-[r:CREATED_BY]-() ON (r.relationship_status); + +// --- CONNECTED_TO Relationship Indexes --- +// Filter by connection type +CREATE INDEX connected_type_idx IF NOT EXISTS +FOR ()-[r:CONNECTED_TO]-() ON (r.connection_type); + +// Filter by confidence level +CREATE INDEX connected_confidence_idx IF NOT EXISTS +FOR ()-[r:CONNECTED_TO]-() ON (r.confidence); + + +// ============================================================================ +// SECTION 7: RANGE INDEXES FOR TEMPORAL QUERIES (Neo4j 5.x) +// ============================================================================ +// Range indexes optimize date/number range queries (<, >, BETWEEN). 
+ +// --- Date Range Indexes --- +// NOTE(review): Section 3 already creates entity_registration_date_idx and company_incorporation_idx on the same properties; in Neo4j 5.x a plain CREATE INDEX presumably produces a RANGE index, so with IF NOT EXISTS the first two statements below are likely silent no-ops - confirm and drop one of each pair +CREATE RANGE INDEX entity_registration_range IF NOT EXISTS +FOR (e:Entity) ON (e.registration_date); + +CREATE RANGE INDEX company_incorporation_range IF NOT EXISTS +FOR (c:Company) ON (c.incorporation_date); + +CREATE RANGE INDEX company_dissolution_range IF NOT EXISTS +FOR (c:Company) ON (c.dissolution_date); + + +// ============================================================================ +// SECTION 8: SCHEMA VERIFICATION QUERIES +// ============================================================================ +// Run these after schema creation to verify everything was created correctly. + +// --- Show All Constraints --- +SHOW CONSTRAINTS; + +// --- Show All Indexes --- +SHOW INDEXES; + +// --- Detailed Constraint Information --- +SHOW CONSTRAINTS +YIELD name, type, entityType, labelsOrTypes, properties, ownedIndex +RETURN name, type, entityType, labelsOrTypes, properties, ownedIndex +ORDER BY entityType, name; + +// --- Detailed Index Information --- +SHOW INDEXES +YIELD name, type, entityType, labelsOrTypes, properties, state +RETURN name, type, entityType, labelsOrTypes, properties, state +ORDER BY type, name; + +// --- Count Constraints by Type --- +SHOW CONSTRAINTS +YIELD type +RETURN type, count(*) AS count +ORDER BY count DESC; + +// --- Count Indexes by Type --- +SHOW INDEXES +YIELD type +RETURN type, count(*) AS count +ORDER BY count DESC; + +// --- Verify All Indexes Are Online --- +// Returns only indexes that are NOT online; an empty result means every index is ready +SHOW INDEXES +YIELD name, state +WHERE state <> 'ONLINE' +RETURN name, state; + + +// ============================================================================ +// SECTION 9: SAMPLE DATA INSERTION (For Testing Schema) +// ============================================================================ +// These create minimal test data to verify constraints work correctly. +// Remove or comment out for production deployment. 
+ +// --- Create Sample Jurisdiction --- +MERGE (j:Jurisdiction {jurisdiction_code: 'BVI'}) +SET j.name = 'British Virgin Islands', + j.country_code = 'VGB', + j.region = 'Caribbean', + j.is_tax_haven = true, + j.risk_level = 'HIGH', + j.fatf_status = 'Monitored', + j.secrecy_score = 71; + +MERGE (j:Jurisdiction {jurisdiction_code: 'PAN'}) +SET j.name = 'Panama', + j.country_code = 'PAN', + j.region = 'Central America', + j.is_tax_haven = true, + j.risk_level = 'HIGH', + j.fatf_status = 'Grey List', + j.secrecy_score = 72; + +// --- Create Sample Entity --- +MERGE (e:Entity {entity_id: 'TEST-ENTITY-001'}) +SET e.name = 'Acme Holdings Ltd', + e.entity_type = 'Company', + e.jurisdiction_code = 'BVI', + e.status = 'Active', + e.source = 'Panama Papers', + e.registration_date = date('2005-03-15'); + +// --- Create Sample Person --- +MERGE (p:Person {person_id: 'TEST-PERSON-001'}) +SET p.full_name = 'John Smith', + p.first_name = 'John', + p.last_name = 'Smith', + p.nationality = 'USA', + p.is_pep = false, + p.source = 'Panama Papers'; + +// --- Create Sample Company --- +MERGE (c:Company {company_id: 'TEST-COMPANY-001'}) +SET c.name = 'Global Ventures Inc', + c.jurisdiction_code = 'PAN', + c.status = 'Active', + c.company_type = 'Inc', + c.incorporation_date = date('2010-07-22'), + c.is_shell_company = true; + +// --- Create Sample Intermediary --- +MERGE (i:Intermediary {intermediary_id: 'TEST-INTERMEDIARY-001'}) +SET i.name = 'Test Law Firm', + i.type = 'Law Firm', + i.country_code = 'PAN', + i.status = 'Active'; + +// --- Create Sample Address --- +MERGE (a:Address {address_id: 'TEST-ADDRESS-001'}) +SET a.full_address = '123 Offshore Avenue, Road Town, BVI', + a.city = 'Road Town', + a.country_code = 'VGB', + a.is_nominee_address = true; + +// --- Create Sample Officer --- +MERGE (o:Officer {officer_id: 'TEST-OFFICER-001'}) +SET o.name = 'Jane Doe', + o.role_type = 'Director', + o.is_corporate_officer = false, + o.status = 'Active'; + +// --- Create Sample 
Relationships --- +// Person OWNS Entity +MATCH (p:Person {person_id: 'TEST-PERSON-001'}) +MATCH (e:Entity {entity_id: 'TEST-ENTITY-001'}) +MERGE (p)-[r:OWNS]->(e) +SET r.ownership_percentage = 100.0, + r.status = 'Active', + r.is_nominee = false, + r.acquisition_date = date('2005-03-15'); + +// Entity REGISTERED_IN Jurisdiction +MATCH (e:Entity {entity_id: 'TEST-ENTITY-001'}) +MATCH (j:Jurisdiction {jurisdiction_code: 'BVI'}) +MERGE (e)-[r:REGISTERED_IN]->(j) +SET r.registration_date = date('2005-03-15'), + r.status = 'Active'; + +// Entity HAS_ADDRESS Address +MATCH (e:Entity {entity_id: 'TEST-ENTITY-001'}) +MATCH (a:Address {address_id: 'TEST-ADDRESS-001'}) +MERGE (e)-[r:HAS_ADDRESS]->(a) +SET r.address_type = 'Registered', + r.is_primary = true; + +// Officer INVOLVED_IN Entity +MATCH (o:Officer {officer_id: 'TEST-OFFICER-001'}) +MATCH (e:Entity {entity_id: 'TEST-ENTITY-001'}) +MERGE (o)-[r:INVOLVED_IN]->(e) +SET r.role = 'Director', + r.status = 'Active', + r.is_nominee = false, + r.start_date = date('2005-03-15'); + +// Entity CREATED_BY Intermediary +MATCH (e:Entity {entity_id: 'TEST-ENTITY-001'}) +MATCH (i:Intermediary {intermediary_id: 'TEST-INTERMEDIARY-001'}) +MERGE (e)-[r:CREATED_BY]->(i) +SET r.creation_date = date('2005-03-15'), + r.relationship_status = 'Active'; + + +// ============================================================================ +// SECTION 10: TEST QUERIES (Verify Schema Works) +// ============================================================================ +// These validate that indexes are being used and constraints are enforced. 
+ +// --- Test Uniqueness Constraint (Should Fail) --- +// Uncomment to test - should throw ConstraintViolation: +// CREATE (e:Entity {entity_id: 'TEST-ENTITY-001', name: 'Duplicate Test', source: 'Test'}); + +// --- Test Full-Text Search --- +CALL db.index.fulltext.queryNodes('entity_name_fulltext', 'Acme') +YIELD node, score +RETURN node.name AS name, score +ORDER BY score DESC; + +// --- Test Index Usage (Check PROFILE) --- +PROFILE +MATCH (e:Entity) +WHERE e.jurisdiction_code = 'BVI' AND e.status = 'Active' +RETURN e.name; + +// --- Test Relationship Property Index --- +PROFILE +MATCH (p:Person)-[r:OWNS]->(e:Entity) +WHERE r.ownership_percentage >= 25 +RETURN p.full_name, r.ownership_percentage, e.name; + +// --- Test Date Range Query --- +PROFILE +MATCH (c:Company) +WHERE c.incorporation_date >= date('2005-01-01') + AND c.incorporation_date <= date('2010-12-31') +RETURN c.name, c.incorporation_date +ORDER BY c.incorporation_date; + + +// ============================================================================ +// SECTION 11: CLEANUP TEST DATA (Optional) +// ============================================================================ +// Run this to remove test data after verification. +// Comment out if you want to keep sample data for development. 
+ +// --- Remove Test Nodes (DETACH DELETE also removes their relationships) --- +MATCH (n) +WHERE n.entity_id STARTS WITH 'TEST-' + OR n.person_id STARTS WITH 'TEST-' + OR n.company_id STARTS WITH 'TEST-' + OR n.officer_id STARTS WITH 'TEST-' + OR n.intermediary_id STARTS WITH 'TEST-' + OR n.address_id STARTS WITH 'TEST-' +DETACH DELETE n; + +// --- Keep Jurisdiction Reference Data --- +// (Jurisdictions are reference data, typically kept; the MATCH above does not test jurisdiction_code, so the sample Jurisdiction nodes remain) + + +// ============================================================================ +// SCHEMA CREATION COMPLETE +// ============================================================================ +// +// Summary: +// - 7 Node Labels: Entity, Person, Company, Officer, Intermediary, Address, Jurisdiction +// - 9 Relationship Types: OWNS, CONTROLS, HAS_ADDRESS, REGISTERED_IN, +// INVOLVED_IN, CREATED_BY, CONNECTED_TO, RELATED_TO, NATIONALITY +// - 15+ Uniqueness Constraints +// - 10+ Property Existence Constraints +// - 40+ Single-Property Indexes +// - 6 Composite Indexes +// - 6 Full-Text Indexes +// - 10+ Relationship Property Indexes +// - 3 Range Indexes +// +// Next Steps: +// 1. Run SHOW INDEXES to verify all indexes are ONLINE +// 2. Import data using LOAD CSV or neo4j-admin import +// 3. 
Run CALL db.resampleOutdatedIndexes() to refresh index statistics after import (Cypher has no ANALYZE statement) +// +// ============================================================================ diff --git a/scripts/gds_setup.cypher b/scripts/gds_setup.cypher new file mode 100644 index 0000000..e6a124e --- /dev/null +++ b/scripts/gds_setup.cypher @@ -0,0 +1,834 @@ +// ============================================================================ +// PANAMA PAPERS - GRAPH DATA SCIENCE (GDS) SETUP +// ============================================================================ +// +// Neo4j GDS Library Configuration for Offshore Network Analysis +// +// Purpose: +// - Create in-memory graph projections for high-performance analytics +// - Run centrality algorithms to identify influential entities +// - Detect communities and business networks +// - Identify suspicious patterns through graph algorithms +// +// Prerequisites: +// - Neo4j 5.x with GDS plugin installed (2.x recommended) +// - Sufficient memory (heap + page cache) for graph projections +// - Panama Papers schema already created and data loaded +// +// Execution Order: +// 1. Verify GDS installation +// 2. Create graph projections +// 3. Run algorithms (stream mode first for testing) +// 4. Write results back to database +// 5. 
Clean up projections when done +// +// Memory Estimation: +// - ~1GB RAM per 10M nodes + relationships +// - Run gds.graph.project.estimate() before large projections +// +// ============================================================================ + + +// ============================================================================ +// SECTION 1: GDS LIBRARY VERIFICATION +// ============================================================================ +// Verify the GDS plugin is properly installed and check version compatibility + +// Check GDS version (should be 2.x for Neo4j 5.x) +CALL gds.version() +YIELD gdsVersion +RETURN gdsVersion AS installed_version; + +// Verify GDS is properly licensed (Community vs Enterprise) +CALL gds.debug.sysInfo() +YIELD key, value +WHERE key IN ['gdsVersion', 'gdsEdition', 'availableCPUs', 'heapFree', 'heapTotal'] +RETURN key, value; + +// List the algorithms used in this script (CONTAINS is case-sensitive, so the literals match the camelCase procedure names, e.g. gds.pageRank.*) +CALL gds.list() +YIELD name, description, signature +WITH name, description, signature +WHERE name CONTAINS 'pageRank' + OR name CONTAINS 'louvain' + OR name CONTAINS 'degree' + OR name CONTAINS 'shortestPath' + OR name CONTAINS 'betweenness' +RETURN name, description +ORDER BY name; + +// Check current graph catalog (existing projections) +CALL gds.graph.list() +YIELD graphName, nodeCount, relationshipCount, creationTime, memoryUsage +RETURN graphName, nodeCount, relationshipCount, creationTime, memoryUsage; + + +// ============================================================================ +// SECTION 2: MEMORY ESTIMATION +// ============================================================================ +// Always estimate memory before creating large projections to prevent OOM errors + +// Estimate memory for ownership graph projection (Entity nodes and OWNS relationships with ownership_percentage) +CALL gds.graph.project.estimate( + 'Entity', + { + OWNS: { + type: 'OWNS', + orientation: 'NATURAL', + properties: ['ownership_percentage'] + } + } +) +YIELD requiredMemory, nodeCount, relationshipCount 
+RETURN + requiredMemory AS estimated_memory, + nodeCount AS estimated_nodes, + relationshipCount AS estimated_relationships; + +// Estimate memory for control graph (multi-label). NOTE(review): Company is not listed here although the control-graph projection includes it - confirm the estimate covers what you intend +CALL gds.graph.project.estimate( + ['Entity', 'Person'], + ['OWNS', 'CONTROLS', 'INVOLVED_IN'] +) +YIELD requiredMemory, nodeCount, relationshipCount +RETURN requiredMemory, nodeCount, relationshipCount; + + +// ============================================================================ +// SECTION 3: GRAPH PROJECTIONS +// ============================================================================ +// Create in-memory graph representations optimized for specific analyses +// These projections are stored in the GDS graph catalog + +// ---------------------------------------------------------------------------- +// PROJECTION 1: OWNERSHIP GRAPH +// ---------------------------------------------------------------------------- +// Purpose: Analyze ownership influence, find controlling entities +// Use cases: PageRank, betweenness centrality, path analysis +// Direction: NATURAL (follows OWNS direction: owner -> owned) + +// Drop if exists (for re-runs); the second argument (false) means a missing graph does not raise an error +CALL gds.graph.drop('ownership-graph', false) +YIELD graphName +RETURN graphName AS dropped; + +// Create ownership graph projection (Entity and Company nodes, OWNS relationships). NOTE(review): GDS native projections load only numeric (or numeric-array) properties; the string/boolean properties and defaults below will likely be rejected - confirm against the installed GDS version. Person nodes are not projected, so OWNS relationships that start at a Person are presumably left out - verify this is intended +CALL gds.graph.project( + 'ownership-graph', // Graph name + { + Entity: { // Node projection + label: 'Entity', + properties: { + entity_type: { + property: 'entity_type', + defaultValue: 'Unknown' + }, + jurisdiction_code: { + property: 'jurisdiction_code', + defaultValue: 'UNK' + }, + status: { + property: 'status', + defaultValue: 'Unknown' + } + } + }, + Company: { + label: 'Company', + properties: { + jurisdiction_code: { + property: 'jurisdiction_code', + defaultValue: 'UNK' + }, + is_shell_company: { + property: 'is_shell_company', + defaultValue: false + } + } + } + }, + { + OWNS: { // Relationship projection + type: 'OWNS', + orientation: 'NATURAL', // Directed: owner -> owned + properties: { + 
ownership_percentage: { + property: 'ownership_percentage', + defaultValue: 0.0, + aggregation: 'MAX' // Use max if multiple edges + } + } + } + } +) +YIELD + graphName, + nodeCount, + relationshipCount, + projectMillis, + configuration +RETURN + graphName, + nodeCount AS nodes, + relationshipCount AS relationships, + projectMillis AS creation_time_ms; + + +// ---------------------------------------------------------------------------- +// PROJECTION 2: CONTROL GRAPH (Undirected for Community Detection) +// ---------------------------------------------------------------------------- +// Purpose: Detect business networks and communities +// Use cases: Louvain clustering, connected components, triangle counting +// Direction: UNDIRECTED (treats relationships as bidirectional). NOTE(review): most properties listed below are strings or booleans (name, entity_type, nationality, is_pep, control_type, role, ...); GDS native projections load only numeric properties, so this call will likely fail as written - confirm against the installed GDS version + +CALL gds.graph.drop('control-graph', false) +YIELD graphName +RETURN graphName AS dropped; + +CALL gds.graph.project( + 'control-graph', + { + Entity: { + label: 'Entity', + properties: ['name', 'entity_type', 'jurisdiction_code'] + }, + Person: { + label: 'Person', + properties: ['full_name', 'nationality', 'is_pep'] + }, + Company: { + label: 'Company', + properties: ['name', 'jurisdiction_code'] + } + }, + { + OWNS: { + type: 'OWNS', + orientation: 'UNDIRECTED', // Bidirectional for clustering + properties: ['ownership_percentage'] + }, + CONTROLS: { + type: 'CONTROLS', + orientation: 'UNDIRECTED', + properties: ['control_type'] + }, + INVOLVED_IN: { + type: 'INVOLVED_IN', + orientation: 'UNDIRECTED', + properties: ['role'] + } + } +) +YIELD graphName, nodeCount, relationshipCount, projectMillis +RETURN graphName, nodeCount, relationshipCount, projectMillis; + + +// ---------------------------------------------------------------------------- +// PROJECTION 3: JURISDICTION GRAPH +// ---------------------------------------------------------------------------- +// Purpose: Geographic risk analysis, jurisdiction hopping detection +// Use cases: Path analysis, jurisdiction clustering + +CALL 
gds.graph.drop('jurisdiction-graph', false) +YIELD graphName +RETURN graphName AS dropped; + +CALL gds.graph.project( + 'jurisdiction-graph', // NOTE(review): name, entity_type, risk_level, country_code and city are strings and is_tax_haven is boolean; GDS native projections load only numeric properties - confirm this call succeeds + { + Entity: { + label: 'Entity', + properties: ['name', 'entity_type'] + }, + Jurisdiction: { + label: 'Jurisdiction', + properties: { + is_tax_haven: { + property: 'is_tax_haven', + defaultValue: false + }, + risk_level: { + property: 'risk_level', + defaultValue: 'UNKNOWN' + }, + secrecy_score: { + property: 'secrecy_score', + defaultValue: 50 + } + } + }, + Address: { + label: 'Address', + properties: ['country_code', 'city'] + } + }, + { + REGISTERED_IN: { + type: 'REGISTERED_IN', + orientation: 'NATURAL' + }, + HAS_ADDRESS: { + type: 'HAS_ADDRESS', + orientation: 'NATURAL' + } + } +) +YIELD graphName, nodeCount, relationshipCount +RETURN graphName, nodeCount, relationshipCount; + + +// ---------------------------------------------------------------------------- +// PROJECTION 4: INTERMEDIARY NETWORK +// ---------------------------------------------------------------------------- +// Purpose: Identify influential intermediaries (law firms, banks) +// Use cases: PageRank, degree centrality on service providers + +CALL gds.graph.drop('intermediary-graph', false) +YIELD graphName +RETURN graphName AS dropped; + +CALL gds.graph.project( + 'intermediary-graph', // NOTE(review): name, type, country_code and jurisdiction_code are strings; GDS native projections load only numeric properties - confirm this call succeeds + { + Intermediary: { + label: 'Intermediary', + properties: ['name', 'type', 'country_code'] + }, + Entity: { + label: 'Entity', + properties: ['name', 'jurisdiction_code'] + } + }, + { + CREATED_BY: { + type: 'CREATED_BY', + orientation: 'REVERSE' // Stored as Entity -> Intermediary; REVERSE projects it as Intermediary -> Entity. NOTE(review): PageRank would then pass score toward Entities rather than Intermediaries - confirm the direction suits the intended algorithm + } + } +) +YIELD graphName, nodeCount, relationshipCount +RETURN graphName, nodeCount, relationshipCount; + + +// ============================================================================ +// SECTION 4: CENTRALITY ALGORITHMS +// ============================================================================ +// Identify influential nodes in the offshore network + +// 
---------------------------------------------------------------------------- +// ALGORITHM 1: PageRank - Ownership Influence +// ---------------------------------------------------------------------------- +// Finds entities that are "owned by" many other important entities +// High PageRank = influential/important in ownership network + +// Stream mode (returns results without writing) +CALL gds.pageRank.stream( + 'ownership-graph', + { + maxIterations: 20, + dampingFactor: 0.85, + tolerance: 0.0001, + relationshipWeightProperty: 'ownership_percentage', + scaler: 'MEAN' // Normalize scores + } +) +YIELD nodeId, score +WITH gds.util.asNode(nodeId) AS entity, score +WHERE score > 0.001 // Filter low-influence nodes +RETURN + entity.entity_id AS entity_id, + entity.name AS entity_name, + entity.jurisdiction_code AS jurisdiction, + entity.entity_type AS type, + round(score * 1000) / 1000 AS pagerank_score +ORDER BY pagerank_score DESC +LIMIT 25; + + +// ---------------------------------------------------------------------------- +// ALGORITHM 2: Betweenness Centrality - Bridge Entities +// ---------------------------------------------------------------------------- +// Identifies entities that act as bridges between different parts of the network +// High betweenness = critical intermediary in ownership chains + +CALL gds.betweenness.stream( + 'ownership-graph', + { + samplingSize: 10000, // Sample for large graphs + samplingSeed: 42 + } +) +YIELD nodeId, score +WITH gds.util.asNode(nodeId) AS entity, score +WHERE score > 0 +RETURN + entity.name AS entity_name, + entity.jurisdiction_code AS jurisdiction, + round(score) AS betweenness_score +ORDER BY betweenness_score DESC +LIMIT 20; + + +// ---------------------------------------------------------------------------- +// ALGORITHM 3: Degree Centrality - Network Hubs +// ---------------------------------------------------------------------------- +// Simple count of connections - finds most connected entities + +CALL 
gds.degree.stream( + 'control-graph', + { + orientation: 'UNDIRECTED' + } +) +YIELD nodeId, score +WITH gds.util.asNode(nodeId) AS node, score +WHERE score >= 5 // At least 5 connections +RETURN + labels(node)[0] AS node_type, + COALESCE(node.name, node.full_name) AS name, + toInteger(score) AS connection_count +ORDER BY connection_count DESC +LIMIT 30; + + +// ---------------------------------------------------------------------------- +// ALGORITHM 4: Eigenvector Centrality - Connected to Important Nodes +// ---------------------------------------------------------------------------- +// Similar to PageRank but emphasizes connections to high-scoring nodes + +CALL gds.eigenvector.stream( + 'ownership-graph', + { + maxIterations: 100, + tolerance: 0.0001 + } +) +YIELD nodeId, score +WITH gds.util.asNode(nodeId) AS entity, score +WHERE score > 0.01 +RETURN + entity.name AS entity_name, + entity.jurisdiction_code AS jurisdiction, + round(score * 1000) / 1000 AS eigenvector_score +ORDER BY eigenvector_score DESC +LIMIT 20; + + +// ============================================================================ +// SECTION 5: COMMUNITY DETECTION +// ============================================================================ +// Identify clusters of related entities (business networks, family holdings) + +// ---------------------------------------------------------------------------- +// ALGORITHM 5: Louvain Community Detection +// ---------------------------------------------------------------------------- +// Detects dense communities/clusters in the network +// Useful for identifying business groups operating together + +CALL gds.louvain.stream( + 'control-graph', + { + maxLevels: 10, + maxIterations: 10, + tolerance: 0.0001, + includeIntermediateCommunities: false, + consecutiveIds: true, + seedProperty: null, + relationshipWeightProperty: null + } +) +YIELD nodeId, communityId, intermediateCommunityIds +WITH communityId, collect(gds.util.asNode(nodeId)) AS members 
+WITH + communityId, + size(members) AS community_size, + [m IN members | COALESCE(m.name, m.full_name)][0..10] AS sample_members, + [m IN members WHERE m:Person | m.full_name][0..5] AS persons_in_community, + [m IN members WHERE m:Entity | m.jurisdiction_code][0..5] AS jurisdictions +WHERE community_size >= 3 // Minimum cluster size +RETURN + communityId AS community_id, + community_size, + sample_members, + persons_in_community, + jurisdictions +ORDER BY community_size DESC +LIMIT 25; + + +// ---------------------------------------------------------------------------- +// ALGORITHM 6: Weakly Connected Components +// ---------------------------------------------------------------------------- +// Finds isolated subgraphs (disconnected business networks) + +CALL gds.wcc.stream('control-graph') +YIELD nodeId, componentId +WITH componentId, collect(gds.util.asNode(nodeId)) AS nodes +WITH + componentId, + size(nodes) AS component_size, + [n IN nodes | COALESCE(n.name, n.full_name)][0..5] AS sample_nodes +WHERE component_size >= 2 AND component_size <= 100 // Mid-sized components +RETURN + componentId AS component_id, + component_size, + sample_nodes +ORDER BY component_size DESC +LIMIT 20; + + +// ---------------------------------------------------------------------------- +// ALGORITHM 7: Label Propagation (Fast Community Detection) +// ---------------------------------------------------------------------------- +// Faster alternative to Louvain for very large graphs + +CALL gds.labelPropagation.stream( + 'control-graph', + { + maxIterations: 10, + nodeWeightProperty: null, + relationshipWeightProperty: null + } +) +YIELD nodeId, communityId +WITH communityId, count(*) AS size +WHERE size >= 5 +RETURN communityId, size +ORDER BY size DESC +LIMIT 20; + + +// ============================================================================ +// SECTION 6: PATH ANALYSIS ALGORITHMS +// ============================================================================ +// Analyze 
ownership chains and connections + +// ---------------------------------------------------------------------------- +// ALGORITHM 8: All Pairs Shortest Path (Sample) +// ---------------------------------------------------------------------------- +// Find shortest ownership paths between entities +// WARNING: Expensive on large graphs - use with filters + +CALL gds.allShortestPaths.stream( + 'ownership-graph', + { + sourceNode: null, // All pairs + relationshipWeightProperty: null + } +) +YIELD sourceNodeId, targetNodeId, distance +WITH + gds.util.asNode(sourceNodeId) AS source, + gds.util.asNode(targetNodeId) AS target, + distance +WHERE distance > 0 AND distance <= 4 // 1-4 hop paths + AND source.entity_type = 'Company' + AND target.entity_type = 'Company' +RETURN + source.name AS from_entity, + target.name AS to_entity, + toInteger(distance) AS path_length +ORDER BY path_length DESC +LIMIT 20; + + +// ============================================================================ +// SECTION 7: WRITE RESULTS BACK TO DATABASE +// ============================================================================ +// Persist algorithm results as node properties for future queries + +// ---------------------------------------------------------------------------- +// Write PageRank scores to Entity nodes +// ---------------------------------------------------------------------------- +CALL gds.pageRank.write( + 'ownership-graph', + { + maxIterations: 20, + dampingFactor: 0.85, + writeProperty: 'pagerank_score' + } +) +YIELD + nodePropertiesWritten, + ranIterations, + didConverge, + preProcessingMillis, + computeMillis, + writeMillis, + centralityDistribution +RETURN + nodePropertiesWritten AS nodes_updated, + ranIterations AS iterations, + didConverge AS converged, + computeMillis AS compute_time_ms, + writeMillis AS write_time_ms, + centralityDistribution.mean AS mean_score, + centralityDistribution.max AS max_score; + + +// 
---------------------------------------------------------------------------- +// Write Louvain community IDs to nodes +// ---------------------------------------------------------------------------- +CALL gds.louvain.write( + 'control-graph', + { + writeProperty: 'community_id', + maxLevels: 10, + maxIterations: 10 + } +) +YIELD + communityCount, + nodePropertiesWritten, + modularity, + computeMillis, + writeMillis +RETURN + communityCount AS communities_found, + nodePropertiesWritten AS nodes_updated, + round(modularity * 1000) / 1000 AS modularity_score, + computeMillis + writeMillis AS total_time_ms; + + +// ---------------------------------------------------------------------------- +// Write Degree Centrality scores +// ---------------------------------------------------------------------------- +CALL gds.degree.write( + 'control-graph', + { + writeProperty: 'degree_centrality', + orientation: 'UNDIRECTED' + } +) +YIELD + nodePropertiesWritten, + centralityDistribution +RETURN + nodePropertiesWritten AS nodes_updated, + centralityDistribution.mean AS mean_degree, + centralityDistribution.max AS max_degree; + + +// ---------------------------------------------------------------------------- +// Write Betweenness Centrality scores (sampling for performance) +// ---------------------------------------------------------------------------- +CALL gds.betweenness.write( + 'ownership-graph', + { + writeProperty: 'betweenness_score', + samplingSize: 5000, + samplingSeed: 42 + } +) +YIELD + nodePropertiesWritten, + centralityDistribution +RETURN + nodePropertiesWritten AS nodes_updated, + centralityDistribution.max AS max_betweenness; + + +// ============================================================================ +// SECTION 8: VERIFICATION QUERIES +// ============================================================================ +// Verify algorithm results were written correctly + +// Check PageRank scores were written +MATCH (e:Entity) +WHERE e.pagerank_score IS 
NOT NULL +RETURN + 'PageRank' AS algorithm, + count(e) AS nodes_with_scores, + avg(e.pagerank_score) AS avg_score, + max(e.pagerank_score) AS max_score; + +// Check community assignments +MATCH (n) +WHERE n.community_id IS NOT NULL +RETURN + 'Louvain' AS algorithm, + count(DISTINCT n.community_id) AS total_communities, + count(n) AS nodes_assigned; + +// Check degree centrality +MATCH (n) +WHERE n.degree_centrality IS NOT NULL +WITH n +ORDER BY n.degree_centrality DESC +LIMIT 10 +RETURN + labels(n)[0] AS node_type, + COALESCE(n.name, n.full_name) AS name, + n.degree_centrality AS degree, + n.pagerank_score AS pagerank, + n.community_id AS community; + + +// ============================================================================ +// SECTION 9: ANALYTICAL QUERIES USING GDS RESULTS +// ============================================================================ +// Use the written properties for business analysis + +// Find high-influence entities in each jurisdiction +MATCH (e:Entity) +WHERE e.pagerank_score IS NOT NULL +WITH e.jurisdiction_code AS jurisdiction, e +ORDER BY e.pagerank_score DESC +WITH jurisdiction, collect(e)[0..3] AS top_entities +UNWIND top_entities AS entity +RETURN + jurisdiction, + entity.name AS entity_name, + round(entity.pagerank_score * 1000) / 1000 AS influence_score +ORDER BY jurisdiction, influence_score DESC; + +// Find communities spanning multiple jurisdictions (suspicious pattern) +MATCH (e:Entity) +WHERE e.community_id IS NOT NULL AND e.jurisdiction_code IS NOT NULL +WITH e.community_id AS community, collect(DISTINCT e.jurisdiction_code) AS jurisdictions +WHERE size(jurisdictions) >= 3 // 3+ jurisdictions +RETURN + community AS community_id, + size(jurisdictions) AS jurisdiction_count, + jurisdictions AS jurisdictions_list +ORDER BY jurisdiction_count DESC +LIMIT 15; + +// Find highly connected entities in same community as PEPs +MATCH (pep:Person {is_pep: true}) +WHERE pep.community_id IS NOT NULL +WITH pep.community_id AS 
pep_community, pep +MATCH (e:Entity {community_id: pep_community}) +WHERE e.degree_centrality > 5 +RETURN + pep.full_name AS pep_name, + e.name AS connected_entity, + e.jurisdiction_code AS jurisdiction, + e.degree_centrality AS connections, + e.pagerank_score AS influence +ORDER BY e.pagerank_score DESC +LIMIT 20; + + +// ============================================================================ +// SECTION 10: GRAPH CATALOG MANAGEMENT +// ============================================================================ +// Manage in-memory graph projections + +// List all current projections with details +CALL gds.graph.list() +YIELD + graphName, + database, + nodeCount, + relationshipCount, + density, + creationTime, + modificationTime, + memoryUsage +RETURN + graphName, + nodeCount AS nodes, + relationshipCount AS relationships, + round(density * 10000) / 10000 AS density, + memoryUsage AS memory, + creationTime +ORDER BY nodeCount DESC; + +// Get detailed stats for a specific projection +CALL gds.graph.nodeProperties.stream('ownership-graph', 'entity_type') +YIELD nodeId, propertyValue +WITH propertyValue, count(*) AS count +RETURN propertyValue AS entity_type, count +ORDER BY count DESC; + + +// ============================================================================ +// SECTION 11: CLEANUP - DROP GRAPH PROJECTIONS +// ============================================================================ +// Release memory by dropping projections when analysis is complete +// Uncomment these lines when you want to clean up + +// Drop ownership graph +// CALL gds.graph.drop('ownership-graph', false) +// YIELD graphName +// RETURN 'Dropped: ' + graphName AS status; + +// Drop control graph +// CALL gds.graph.drop('control-graph', false) +// YIELD graphName +// RETURN 'Dropped: ' + graphName AS status; + +// Drop jurisdiction graph +// CALL gds.graph.drop('jurisdiction-graph', false) +// YIELD graphName +// RETURN 'Dropped: ' + graphName AS status; + +// Drop 
intermediary graph +// CALL gds.graph.drop('intermediary-graph', false) +// YIELD graphName +// RETURN 'Dropped: ' + graphName AS status; + +// Drop ALL projections (use with caution) +// CALL gds.graph.list() YIELD graphName +// CALL gds.graph.drop(graphName, false) YIELD graphName AS dropped +// RETURN dropped; + + +// ============================================================================ +// SECTION 12: INDEX RECOMMENDATIONS FOR GDS PROPERTIES +// ============================================================================ +// Create indexes on algorithm-generated properties for faster queries + +// Index for PageRank score lookups +CREATE INDEX entity_pagerank_idx IF NOT EXISTS +FOR (e:Entity) ON (e.pagerank_score); + +// Index for community-based queries +CREATE INDEX entity_community_idx IF NOT EXISTS +FOR (e:Entity) ON (e.community_id); + +CREATE INDEX person_community_idx IF NOT EXISTS +FOR (p:Person) ON (p.community_id); + +// Index for degree centrality filtering +CREATE INDEX entity_degree_idx IF NOT EXISTS +FOR (e:Entity) ON (e.degree_centrality); + +// Composite index for influence + jurisdiction analysis +CREATE INDEX entity_pagerank_jurisdiction_idx IF NOT EXISTS +FOR (e:Entity) ON (e.jurisdiction_code, e.pagerank_score); + + +// ============================================================================ +// GDS SETUP COMPLETE +// ============================================================================ +// +// Summary of Projections Created: +// 1. ownership-graph - Directed OWNS network for centrality +// 2. control-graph - Undirected multi-relationship for clustering +// 3. jurisdiction-graph - Entity-Jurisdiction connections +// 4. 
intermediary-graph - Service provider network +// +// Algorithms Executed: +// - PageRank (ownership influence) +// - Betweenness Centrality (bridge detection) +// - Degree Centrality (network hubs) +// - Eigenvector Centrality (connected to important nodes) +// - Louvain (community detection) +// - WCC (connected components) +// - Label Propagation (fast clustering) +// +// Properties Written: +// - pagerank_score (Entity nodes) +// - community_id (Entity, Person nodes) +// - degree_centrality (Entity, Person nodes) +// - betweenness_score (Entity nodes) +// +// Next Steps: +// 1. Run analytical queries in Section 9 +// 2. Build visualizations on community_id groupings +// 3. Create alerts for high-influence + tax haven combinations +// 4. Export results for external analysis tools +// +// ============================================================================ diff --git a/scripts/seed_data.py b/scripts/seed_data.py deleted file mode 100644 index fb478e7..0000000 --- a/scripts/seed_data.py +++ /dev/null @@ -1,266 +0,0 @@ -"""ETL script to load ICIJ CSV files into Neo4j.""" - -import os -import sys -from pathlib import Path -from typing import Dict, List - -import pandas as pd - -from app.database import get_database -from app.config import settings - -# Chunk size for reading large CSVs -CHUNK_SIZE = 10000 - - -def load_entities(tx, dataframe): - """ - Load entities and create Jurisdiction nodes in a single transaction. - Uses the 'secret sauce' pattern for Entity-Jurisdiction relationship. - """ - query = """ - UNWIND $rows AS row - // 1. Create the Entity - MERGE (e:Entity {id: row.node_id}) - SET e.name = row.name, - e.source = row.sourceID, - e.node_type = 'Entity' - - // 2. 
Extract and link Jurisdiction - WITH e, row - WHERE row.jurisdiction_description IS NOT NULL AND row.jurisdiction_description <> '' - MERGE (j:Jurisdiction {name: row.jurisdiction_description}) - MERGE (e)-[:REGISTERED_IN]->(j) - """ - - # Convert DataFrame to list of dicts for Neo4j - # Handle NaN values by converting to None - records = dataframe.where(pd.notna(dataframe), None).to_dict('records') - tx.run(query, rows=records) - - -def load_officers(tx, dataframe): - """Load Officer nodes.""" - query = """ - UNWIND $rows AS row - MERGE (o:Officer {id: row.node_id}) - SET o.name = row.name, - o.node_type = 'Officer', - o.source = row.sourceID - """ - - records = dataframe.where(pd.notna(dataframe), None).to_dict('records') - tx.run(query, rows=records) - - -def load_intermediaries(tx, dataframe): - """Load Intermediary nodes.""" - query = """ - UNWIND $rows AS row - MERGE (i:Intermediary {id: row.node_id}) - SET i.name = row.name, - i.node_type = 'Intermediary', - i.source = row.sourceID - """ - - records = dataframe.where(pd.notna(dataframe), None).to_dict('records') - tx.run(query, rows=records) - - -def load_addresses(tx, dataframe): - """Load Address nodes.""" - query = """ - UNWIND $rows AS row - MERGE (a:Address {id: row.node_id}) - SET a.address = row.address, - a.node_type = 'Address', - a.source = row.sourceID - """ - - records = dataframe.where(pd.notna(dataframe), None).to_dict('records') - tx.run(query, rows=records) - - -def load_relationships(tx, dataframe): - """Load relationships based on rel_type.""" - # Filter by relationship type and create appropriate relationships - # Handle NaN values by filling with empty string - rel_type_series = dataframe['rel_type'].fillna('').astype(str).str.lower().str.strip() - - officer_rels = dataframe[rel_type_series == 'officer_of'] - intermediary_rels = dataframe[rel_type_series == 'intermediary_of'] - address_rels = dataframe[rel_type_series == 'registered_address'] - - # Officer relationships - if not 
officer_rels.empty: - officer_query = """ - UNWIND $rows AS row - MATCH (o:Officer {id: row.node_id_start}) - MATCH (e:Entity {id: row.node_id_end}) - MERGE (o)-[:OFFICER_OF]->(e) - """ - records = officer_rels.where(pd.notna(officer_rels), None).to_dict('records') - tx.run(officer_query, rows=records) - - # Intermediary relationships - if not intermediary_rels.empty: - intermediary_query = """ - UNWIND $rows AS row - MATCH (i:Intermediary {id: row.node_id_start}) - MATCH (e:Entity {id: row.node_id_end}) - MERGE (i)-[:INTERMEDIARY_OF]->(e) - """ - records = intermediary_rels.where(pd.notna(intermediary_rels), None).to_dict('records') - tx.run(intermediary_query, rows=records) - - # Address relationships - if not address_rels.empty: - address_query = """ - UNWIND $rows AS row - MATCH (e:Entity {id: row.node_id_start}) - MATCH (a:Address {id: row.node_id_end}) - MERGE (e)-[:REGISTERED_ADDRESS]->(a) - """ - records = address_rels.where(pd.notna(address_rels), None).to_dict('records') - tx.run(address_query, rows=records) - - -def process_entities(db, data_dir: Path): - """Process entities CSV with chunking.""" - entities_file = data_dir / "nodes-entities.csv" - - if not entities_file.exists(): - print(f"Warning: {entities_file} not found. Skipping...") - return 0 - - print("Processing entities and jurisdictions...") - total_processed = 0 - - with db.get_session() as session: - for chunk_num, chunk in enumerate(pd.read_csv(entities_file, chunksize=CHUNK_SIZE, low_memory=False), 1): - session.execute_write(load_entities, chunk) - total_processed += len(chunk) - - if chunk_num % 10 == 0: - print(f" Processed {total_processed} entities...") - - print(f"โœ“ Processed {total_processed} entities with jurisdictions") - return total_processed - - -def process_nodes(db, data_dir: Path, filename: str, load_func, node_type: str): - """Process node CSV files with chunking.""" - file_path = data_dir / filename - - if not file_path.exists(): - print(f"Warning: {file_path} not found. 
def process_relationships(db, data_dir: Path):
    """Import ``relationships.csv`` into Neo4j chunk by chunk.

    The CSV is read with ``chunksize=CHUNK_SIZE`` and every chunk is passed to
    ``load_relationships`` through ``session.execute_write``.

    Args:
        db: Database wrapper exposing ``get_session()`` as a context manager.
        data_dir: Directory expected to contain ``relationships.csv``.

    Returns:
        Number of CSV rows processed, or 0 when the file does not exist.
        (Previously this function returned ``None``, unlike the other
        ``process_*`` helpers; callers that ignore the result are unaffected.)
    """
    relationships_file = data_dir / "relationships.csv"

    if not relationships_file.exists():
        print(f"Warning: {relationships_file} not found. Skipping...")
        return 0

    print("Processing relationships...")
    total_processed = 0

    with db.get_session() as session:
        chunks = pd.read_csv(relationships_file, chunksize=CHUNK_SIZE, low_memory=False)
        for chunk_num, chunk in enumerate(chunks, 1):
            session.execute_write(load_relationships, chunk)
            total_processed += len(chunk)

            # Report progress only on every tenth chunk.
            if chunk_num % 10 == 0:
                print(f" Processed {total_processed} relationships...")

    print(f"✓ Processed {total_processed} relationships")
    return total_processed
Please ensure CSV files are in ./data/ or ./Data/") - sys.exit(1) - - print(f"Using data directory: {data_dir}\n") - - # Process entities first (creates jurisdictions too using the secret sauce pattern) - process_entities(db, data_dir) - - # Process other node types - print() - process_nodes(db, data_dir, "nodes-officers.csv", load_officers, "Officer") - process_nodes(db, data_dir, "nodes-intermediaries.csv", load_intermediaries, "Intermediary") - process_nodes(db, data_dir, "nodes-addresses.csv", load_addresses, "Address") - - # Process relationships - print() - process_relationships(db, data_dir) - - print("\nโœ“ Data ingestion complete!") - - # Print summary statistics - with db.get_session() as session: - result = session.run( - """ - MATCH (n) - RETURN labels(n)[0] as label, count(n) as count - ORDER BY label - """ - ) - print("\nNode counts:") - for record in result: - print(f" {record['label']}: {record['count']}") - - result = session.run( - """ - MATCH ()-[r]->() - RETURN type(r) as rel_type, count(r) as count - ORDER BY rel_type - """ - ) - print("\nRelationship counts:") - for record in result: - print(f" {record['rel_type']}: {record['count']}") - - -if __name__ == "__main__": - main() diff --git a/scripts/seeddata.py b/scripts/seeddata.py new file mode 100644 index 0000000..b3d8c97 --- /dev/null +++ b/scripts/seeddata.py @@ -0,0 +1,405 @@ +#!/usr/bin/env python3 +""" +Panama Papers Neo4j Data Import Script (Simplified) +==================================================== +Imports ICIJ Offshore Leaks CSV data into Neo4j. 
def connect():
    """Open a Neo4j driver and verify that the server is reachable.

    Reads ``NEO4J_URI``, ``NEO4J_USER`` and ``NEO4J_PASSWORD`` from module
    configuration.

    Returns:
        A driver whose connectivity has been verified. The caller is
        responsible for closing it.

    Exits the process with status 1 when ``NEO4J_PASSWORD`` is unset or the
    connection cannot be established.
    """
    if not NEO4J_PASSWORD:
        print("[ERROR] NEO4J_PASSWORD not set in .env file")
        sys.exit(1)

    print(f"[INFO] Connecting to Neo4j at {NEO4J_URI}...")

    driver = None
    try:
        driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))
        driver.verify_connectivity()
    except Exception as e:
        # The driver object may already exist when the connectivity check
        # fails; close it so it is not leaked on the exit path.
        if driver is not None:
            driver.close()
        print(f"[ERROR] Failed to connect: {e}")
        sys.exit(1)

    print("[INFO] ✓ Connected to Neo4j")
    return driver
def load_officers(driver):
    """Load Officer nodes from the officers CSV into Neo4j.

    Rows are upserted with ``MERGE`` on ``officer_id`` in batches of
    ``BATCH_SIZE``. Rows without an identifier are skipped instead of being
    merged into a single bogus node whose id is the string ``"None"``.

    Args:
        driver: Neo4j driver used to open a session on ``NEO4J_DATABASE``.

    Returns:
        Number of officer rows written (0 when the CSV file is missing).
    """
    csv_path = DATA_DIR / CSV_FILES["officers"]
    if not csv_path.exists():
        print(f"[WARN] {csv_path} not found, skipping officers")
        return 0

    print(f"[INFO] Loading officers from {csv_path}...")
    df = pd.read_csv(csv_path, low_memory=False)
    # Cast to object before masking: on numeric columns pandas coerces the
    # replacement None straight back to NaN, which would reach Neo4j as a float.
    df = df.astype(object).where(pd.notnull(df), None)

    # Determine ID column name
    if "node_id" in df.columns:
        id_col = "node_id"
    elif "officer_id" in df.columns:
        id_col = "officer_id"
    else:
        id_col = df.columns[0]

    query = """
    UNWIND $batch AS row
    MERGE (o:Officer {officer_id: row.id})
    SET o.name = row.name,
        o.country_codes = row.country_codes,
        o.source = row.sourceID
    """

    # Plain dicts are far cheaper to walk than DataFrame.iterrows().
    rows = df.to_dict("records")
    count = 0
    skipped = 0
    with driver.session(database=NEO4J_DATABASE) as session:
        for offset in range(0, len(rows), BATCH_SIZE):
            records = []
            for row in rows[offset:offset + BATCH_SIZE]:
                node_id = row.get(id_col)
                if node_id is None:
                    skipped += 1
                    continue
                records.append({
                    "id": str(node_id),
                    "name": row.get("name"),
                    "country_codes": row.get("country_codes") or row.get("countries"),
                    "sourceID": row.get("sourceID") or row.get("source"),
                })
            if not records:
                continue
            # consume() forces the statement to finish so that any server-side
            # error is raised here rather than on a later call.
            session.run(query, batch=records).consume()
            count += len(records)
            print(f"[INFO] Processed {count:,} officers...", end="\r")

    if skipped:
        print(f"[WARN] Skipped {skipped:,} officer rows without an id")
    print(f"[INFO] ✓ Loaded {count:,} officers ")
    return count
"name": row.get("name"), + "country_codes": row.get("country_codes") or row.get("countries"), + "sourceID": row.get("sourceID") or row.get("source"), + }) + session.run(query, batch=records) + count += len(records) + print(f"[INFO] Processed {count:,} intermediaries...", end="\r") + + print(f"[INFO] โœ“ Loaded {count:,} intermediaries ") + return count + + +def load_addresses(driver): + """Load Address nodes.""" + csv_path = DATA_DIR / CSV_FILES["addresses"] + if not csv_path.exists(): + print(f"[WARN] {csv_path} not found, skipping addresses") + return 0 + + print(f"[INFO] Loading addresses from {csv_path}...") + df = pd.read_csv(csv_path, low_memory=False) + df = df.where(pd.notnull(df), None) + + id_col = "node_id" if "node_id" in df.columns else "address_id" if "address_id" in df.columns else df.columns[0] + + query = """ + UNWIND $batch AS row + MERGE (a:Address {address_id: row.id}) + SET a.address = row.address, + a.country_codes = row.country_codes, + a.source = row.sourceID + """ + + count = 0 + with driver.session(database=NEO4J_DATABASE) as session: + for i in range(0, len(df), BATCH_SIZE): + batch = df.iloc[i:i+BATCH_SIZE] + records = [] + for _, row in batch.iterrows(): + records.append({ + "id": str(row.get(id_col, "")), + "address": row.get("address") or row.get("name"), + "country_codes": row.get("country_codes") or row.get("countries"), + "sourceID": row.get("sourceID") or row.get("source"), + }) + session.run(query, batch=records) + count += len(records) + print(f"[INFO] Processed {count:,} addresses...", end="\r") + + print(f"[INFO] โœ“ Loaded {count:,} addresses ") + return count + + +def load_relationships(driver): + """Load relationships between nodes.""" + csv_path = DATA_DIR / CSV_FILES["relationships"] + if not csv_path.exists(): + print(f"[WARN] {csv_path} not found, skipping relationships") + return 0 + + print(f"[INFO] Loading relationships from {csv_path}...") + df = pd.read_csv(csv_path, low_memory=False) + df = 
df.where(pd.notnull(df), None) + + # ICIJ uses START_ID, END_ID, TYPE columns + start_col = "START_ID" if "START_ID" in df.columns else "start_id" if "start_id" in df.columns else "node_id_start" + end_col = "END_ID" if "END_ID" in df.columns else "end_id" if "end_id" in df.columns else "node_id_end" + type_col = "TYPE" if "TYPE" in df.columns else "rel_type" if "rel_type" in df.columns else "type" + + query = """ + UNWIND $batch AS row + MATCH (start) WHERE start.entity_id = row.start_id + OR start.officer_id = row.start_id + OR start.intermediary_id = row.start_id + OR start.address_id = row.start_id + MATCH (end) WHERE end.entity_id = row.end_id + OR end.officer_id = row.end_id + OR end.intermediary_id = row.end_id + OR end.address_id = row.end_id + CALL apoc.merge.relationship(start, row.rel_type, {}, {}, end, {}) YIELD rel + RETURN count(rel) + """ + + # Fallback query without APOC + fallback_query = """ + UNWIND $batch AS row + MATCH (start) WHERE start.entity_id = row.start_id + OR start.officer_id = row.start_id + OR start.intermediary_id = row.start_id + OR start.address_id = row.start_id + MATCH (end) WHERE end.entity_id = row.end_id + OR end.officer_id = row.end_id + OR end.intermediary_id = row.end_id + OR end.address_id = row.end_id + MERGE (start)-[r:CONNECTED_TO]->(end) + SET r.type = row.rel_type + RETURN count(r) + """ + + count = 0 + with driver.session(database=NEO4J_DATABASE) as session: + # Test if APOC is available + try: + session.run("RETURN apoc.version()").single() + use_apoc = True + print("[INFO] Using APOC for dynamic relationship types") + except: + use_apoc = False + print("[INFO] APOC not available, using generic CONNECTED_TO relationships") + + active_query = query if use_apoc else fallback_query + + for i in range(0, len(df), BATCH_SIZE): + batch = df.iloc[i:i+BATCH_SIZE] + records = [] + for _, row in batch.iterrows(): + records.append({ + "start_id": str(row.get(start_col, "")), + "end_id": str(row.get(end_col, "")), + 
"rel_type": row.get(type_col) or "CONNECTED_TO", + }) + try: + session.run(active_query, batch=records) + except Exception as e: + if "apoc" in str(e).lower(): + session.run(fallback_query, batch=records) + count += len(records) + print(f"[INFO] Processed {count:,} relationships...", end="\r") + + print(f"[INFO] โœ“ Loaded {count:,} relationships ") + return count + + +def verify_import(driver): + """Verify the import by counting nodes.""" + print("\n[INFO] Verifying import...") + + queries = [ + ("Entity", "MATCH (n:Entity) RETURN count(n) AS count"), + ("Officer", "MATCH (n:Officer) RETURN count(n) AS count"), + ("Intermediary", "MATCH (n:Intermediary) RETURN count(n) AS count"), + ("Address", "MATCH (n:Address) RETURN count(n) AS count"), + ("Relationships", "MATCH ()-[r]->() RETURN count(r) AS count"), + ] + + with driver.session(database=NEO4J_DATABASE) as session: + print("\n" + "=" * 40) + print(" IMPORT SUMMARY") + print("=" * 40) + for label, query in queries: + result = session.run(query).single() + count = result["count"] if result else 0 + print(f" {label:15} : {count:>10,}") + print("=" * 40) + + +def main(): + """Main entry point.""" + print("=" * 60) + print(" PANAMA PAPERS DATA IMPORT") + print("=" * 60) + print(f" Data directory: {DATA_DIR}") + print(f" Batch size: {BATCH_SIZE:,}") + print("=" * 60) + + # Check data directory + if not DATA_DIR.exists(): + print(f"[ERROR] Data directory not found: {DATA_DIR}") + sys.exit(1) + + # Connect + driver = connect() + + try: + # Create constraints + create_constraints(driver) + + # Load nodes + load_entities(driver) + load_officers(driver) + load_intermediaries(driver) + load_addresses(driver) + + # Load relationships + load_relationships(driver) + + # Verify + verify_import(driver) + + print("\n[INFO] โœ“ Import completed successfully!") + + except Exception as e: + print(f"\n[ERROR] Import failed: {e}") + import traceback + traceback.print_exc() + sys.exit(1) + + finally: + driver.close() + + +if __name__ 
== "__main__": + main() diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index dcaf535..0000000 --- a/tests/__init__.py +++ /dev/null @@ -1,2 +0,0 @@ -"""Test suite for Graph-Backed Analysis API.""" - diff --git a/tests/conftest.py b/tests/conftest.py index 7f36882..e0bb81e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,30 +1,909 @@ -"""Pytest configuration and shared fixtures.""" +""" +Panama Papers API - Pytest Configuration +========================================== + +Pytest fixtures for testing the Panama Papers FastAPI application. + +Fixtures Provided: + - event_loop: Async event loop for tests + - neo4j_driver: Test Neo4j driver connection + - async_client: FastAPI AsyncClient for API testing + - sample_entity: Single test entity + - sample_entities: Multiple test entities + - sample_person: Single test person + - sample_relationships: Test ownership relationships + - sample_ownership_chain: Multi-hop ownership chain + +Configuration: + - Uses pytest-asyncio for async test support + - Automatic database cleanup before/after each test + - Separate test database configuration via .env.test + +Usage: + pytest tests/ -v --asyncio-mode=auto + pytest tests/test_entities.py -v -k "test_get_entity" + +Environment Variables (via .env.test): + TEST_NEO4J_URI: Test Neo4j URI (default: bolt://localhost:7687) + TEST_NEO4J_USER: Test database user (default: neo4j) + TEST_NEO4J_PASSWORD: Test database password (required) + TEST_NEO4J_DATABASE: Test database name (default: neo4j) +""" + +from __future__ import annotations + +import asyncio +import os +import sys +from datetime import date, datetime +from typing import Any, AsyncGenerator, Generator +from unittest.mock import AsyncMock, MagicMock, patch import pytest -from unittest.mock import Mock, patch -from fastapi.testclient import TestClient +import pytest_asyncio +from dotenv import load_dotenv +from httpx import ASGITransport, AsyncClient +from neo4j import 
def pytest_configure(config):
    """Register the custom markers used by this test suite.

    Adds one ``markers`` ini line each for ``integration``, ``slow`` and
    ``unit``.
    """
    marker_lines = (
        "integration: marks tests as integration tests (requires Neo4j)",
        "slow: marks tests as slow running",
        "unit: marks tests as unit tests (no external dependencies)",
    )
    for marker_line in marker_lines:
        config.addinivalue_line("markers", marker_line)
@pytest.fixture(scope="session")
def event_loop(event_loop_policy) -> Generator[asyncio.AbstractEventLoop, None, None]:
    """
    Provide one event loop shared by session-scoped async fixtures.

    The loop is created from ``event_loop_policy`` and installed as the
    current loop. On teardown every task still registered on the loop is
    cancelled and awaited before the loop is closed.

    Scope: session (shared across all tests)
    """
    loop = event_loop_policy.new_event_loop()
    asyncio.set_event_loop(loop)

    yield loop

    # Cancel whatever is still scheduled, then let the cancellations settle.
    leftovers = asyncio.all_tasks(loop)
    for leftover in leftovers:
        leftover.cancel()

    drain = asyncio.gather(*leftovers, return_exceptions=True)
    loop.run_until_complete(drain)
    loop.close()
@pytest_asyncio.fixture
async def neo4j_driver(
    neo4j_driver_session: AsyncDriver,
) -> AsyncGenerator[AsyncDriver, None]:
    """
    Per-test view of the session-scoped driver with data cleanup.

    Yields the shared driver unchanged. Unless ``USE_MOCK_DB`` is set, nodes
    whose ``entity_id``, ``person_id``, ``intermediary_id`` or ``address_id``
    starts with ``TEST-`` are deleted both before and after the test.

    Scope: function (per test)
    """
    shared_driver = neo4j_driver_session

    async def _purge_test_nodes() -> None:
        # One DETACH DELETE per id property, in the same order every time.
        async with shared_driver.session(database=TEST_NEO4J_DATABASE) as session:
            for id_property in ("entity_id", "person_id", "intermediary_id", "address_id"):
                await session.run(
                    f"MATCH (n) WHERE n.{id_property} STARTS WITH 'TEST-' DETACH DELETE n"
                )

    if not USE_MOCK_DB:
        # Pre-test cleanup: remove any leftover test data
        await _purge_test_nodes()
        yield shared_driver
        # Post-test cleanup
        await _purge_test_nodes()
    else:
        yield shared_driver
@pytest_asyncio.fixture
async def async_client(
    app_instance,
    neo4j_driver: AsyncDriver,
) -> AsyncGenerator[AsyncClient, None]:
    """
    Yield an ``httpx.AsyncClient`` bound to the FastAPI app under test.

    ``Neo4jDatabase`` is initialised with the test connection settings when it
    is not initialised yet, and closed again after the test if it is
    initialised at that point.

    Scope: function

    Example:
        async def test_get_entity(async_client):
            response = await async_client.get("/entities/TEST-001")
            assert response.status_code == 200
    """
    # Imported lazily so the database module is only loaded when a test
    # actually requests this fixture.
    from database import Neo4jConfig, Neo4jDatabase

    # Point the application at the test database.
    test_settings = dict(
        uri=TEST_NEO4J_URI,
        user=TEST_NEO4J_USER,
        password=TEST_NEO4J_PASSWORD,
        database=TEST_NEO4J_DATABASE,
    )
    test_config = Neo4jConfig(**test_settings)

    already_running = Neo4jDatabase.is_initialized()
    if not already_running:
        await Neo4jDatabase.init(config=test_config)

    # Requests go through the ASGI transport, so no real server is started.
    asgi_transport = ASGITransport(app=app_instance)
    async with AsyncClient(transport=asgi_transport, base_url="http://test") as http_client:
        yield http_client

    # Cleanup
    if Neo4jDatabase.is_initialized():
        await Neo4jDatabase.close()
@pytest_asyncio.fixture
async def sample_entity(neo4j_driver: AsyncDriver) -> dict[str, Any]:
    """
    Insert one ``Entity`` node and return its properties.

    Returns:
        Property map of the created node, or an empty dict when the query
        yields no record.

    Entity Properties:
        - entity_id: TEST-ENTITY-001
        - name: Test Holdings Ltd
        - jurisdiction_code: BVI
        - entity_type: Company
        - status: Active
    """
    cypher = """
    CREATE (e:Entity {
        entity_id: 'TEST-ENTITY-001',
        name: 'Test Holdings Ltd',
        jurisdiction_code: 'BVI',
        entity_type: 'Company',
        status: 'Active',
        incorporation_date: date('2015-03-15'),
        source: 'Test Data',
        pagerank_score: 0.125,
        community_id: 1,
        degree_centrality: 5
    })
    RETURN e {.*} AS entity
    """
    async with neo4j_driver.session(database=TEST_NEO4J_DATABASE) as db_session:
        cursor = await db_session.run(cypher)
        row = await cursor.single()
        if not row:
            return {}
        return dict(row["entity"])
@pytest_asyncio.fixture
async def sample_entities(neo4j_driver: AsyncDriver) -> list[dict[str, Any]]:
    """
    Insert three ``Entity`` nodes and return their property maps.

    Creates:
        - TEST-ENTITY-001: Company in BVI, Active, pagerank 0.250
        - TEST-ENTITY-002: Trust in PAN, Active, pagerank 0.150
        - TEST-ENTITY-003: Foundation in CYM, Dissolved, pagerank 0.050

    Returns:
        The three property maps in creation order, or an empty list when the
        query yields no record.
    """
    cypher = """
    CREATE
        (e1:Entity {
            entity_id: 'TEST-ENTITY-001',
            name: 'Test Holdings Ltd',
            jurisdiction_code: 'BVI',
            entity_type: 'Company',
            status: 'Active',
            incorporation_date: date('2015-03-15'),
            pagerank_score: 0.250,
            community_id: 1
        }),
        (e2:Entity {
            entity_id: 'TEST-ENTITY-002',
            name: 'Global Ventures Trust',
            jurisdiction_code: 'PAN',
            entity_type: 'Trust',
            status: 'Active',
            incorporation_date: date('2012-07-22'),
            pagerank_score: 0.150,
            community_id: 1
        }),
        (e3:Entity {
            entity_id: 'TEST-ENTITY-003',
            name: 'Offshore Foundation',
            jurisdiction_code: 'CYM',
            entity_type: 'Foundation',
            status: 'Dissolved',
            incorporation_date: date('2010-01-10'),
            inactivation_date: date('2020-06-30'),
            pagerank_score: 0.050,
            community_id: 2
        })
    RETURN [e1 {.*}, e2 {.*}, e3 {.*}] AS entities
    """
    async with neo4j_driver.session(database=TEST_NEO4J_DATABASE) as db_session:
        cursor = await db_session.run(cypher)
        row = await cursor.single()
        if not row:
            return []
        created = []
        for entity in row["entities"]:
            created.append(dict(entity))
        return created
@pytest_asyncio.fixture
async def sample_person(neo4j_driver: AsyncDriver) -> dict[str, Any]:
    """
    Insert one non-PEP ``Person`` node (``TEST-PERSON-001``).

    Returns:
        Property map of the created node, or an empty dict when the query
        yields no record.
    """
    cypher = """
    CREATE (p:Person {
        person_id: 'TEST-PERSON-001',
        full_name: 'John Smith',
        first_name: 'John',
        last_name: 'Smith',
        nationality: 'USA',
        country_of_residence: 'USA',
        is_pep: false,
        source: 'Test Data'
    })
    RETURN p {.*} AS person
    """
    async with neo4j_driver.session(database=TEST_NEO4J_DATABASE) as db_session:
        cursor = await db_session.run(cypher)
        row = await cursor.single()
        if not row:
            return {}
        return dict(row["person"])
@pytest_asyncio.fixture
async def sample_ownership(neo4j_driver: AsyncDriver) -> dict[str, Any]:
    """
    Insert a ``Person`` that fully owns an ``Entity``.

    Creates ``TEST-PERSON-001`` -[:OWNS 100%]-> ``TEST-ENTITY-001``.

    Returns:
        Dict with ``person``, ``entity`` and ``relationship`` keys, or an
        empty dict when the query yields no record.
    """
    cypher = """
    CREATE (p:Person {
        person_id: 'TEST-PERSON-001',
        full_name: 'John Smith',
        nationality: 'USA',
        is_pep: false
    })
    CREATE (e:Entity {
        entity_id: 'TEST-ENTITY-001',
        name: 'Test Holdings Ltd',
        jurisdiction_code: 'BVI',
        entity_type: 'Company',
        status: 'Active'
    })
    CREATE (p)-[r:OWNS {
        ownership_percentage: 100.0,
        status: 'Active',
        is_nominee: false,
        acquisition_date: date('2015-03-15')
    }]->(e)
    RETURN {
        person: p {.*},
        entity: e {.*},
        relationship: {
            type: type(r),
            ownership_percentage: r.ownership_percentage,
            status: r.status
        }
    } AS data
    """
    async with neo4j_driver.session(database=TEST_NEO4J_DATABASE) as db_session:
        cursor = await db_session.run(cypher)
        row = await cursor.single()
        if not row:
            return {}
        return dict(row["data"])
@pytest_asyncio.fixture
async def sample_complex_network(neo4j_driver: AsyncDriver) -> dict[str, Any]:
    """
    Build a small ownership network for network-analysis tests.

    Creates:
        - 2 Jurisdiction nodes (BVI, PAN), merged rather than created
        - 2 Persons (one of them a PEP)
        - 4 Entities, two in BVI and two in PAN
        - 6 OWNS relationships, 4 REGISTERED_IN relationships
        - 1 Address shared by two entities via HAS_ADDRESS

    Returns:
        Dict with the created persons, entities and address plus summary
        values; ``relationship_count`` is the literal 6 returned by the query.
        An empty dict is returned when the query yields no record.
    """
    cypher = """
    // Create jurisdictions
    MERGE (j_bvi:Jurisdiction {jurisdiction_code: 'BVI', name: 'British Virgin Islands', is_tax_haven: true})
    MERGE (j_pan:Jurisdiction {jurisdiction_code: 'PAN', name: 'Panama', is_tax_haven: true})

    // Create shared address (red flag)
    CREATE (addr:Address {
        address_id: 'TEST-ADDR-001',
        full_address: '123 Offshore Plaza, Road Town, BVI',
        city: 'Road Town',
        country_code: 'VGB',
        is_nominee_address: true
    })

    // Create persons
    CREATE (p1:Person {
        person_id: 'TEST-NET-PERSON-001',
        full_name: 'Regular Investor',
        nationality: 'USA',
        is_pep: false
    })
    CREATE (p2:Person {
        person_id: 'TEST-NET-PEP-001',
        full_name: 'Political Figure',
        nationality: 'RUS',
        is_pep: true,
        pep_details: 'Government Official'
    })

    // Create entities
    CREATE (e1:Entity {
        entity_id: 'TEST-NET-001',
        name: 'Alpha Holdings',
        jurisdiction_code: 'BVI',
        entity_type: 'Company',
        status: 'Active',
        pagerank_score: 0.35,
        community_id: 1
    })
    CREATE (e2:Entity {
        entity_id: 'TEST-NET-002',
        name: 'Beta Investments',
        jurisdiction_code: 'BVI',
        entity_type: 'Company',
        status: 'Active',
        pagerank_score: 0.25,
        community_id: 1
    })
    CREATE (e3:Entity {
        entity_id: 'TEST-NET-003',
        name: 'Gamma Trust',
        jurisdiction_code: 'PAN',
        entity_type: 'Trust',
        status: 'Active',
        pagerank_score: 0.15,
        community_id: 1
    })
    CREATE (e4:Entity {
        entity_id: 'TEST-NET-004',
        name: 'Delta Corp',
        jurisdiction_code: 'PAN',
        entity_type: 'Company',
        status: 'Active',
        pagerank_score: 0.10,
        community_id: 2
    })

    // Create relationships
    CREATE (p1)-[:OWNS {ownership_percentage: 60.0, status: 'Active'}]->(e1)
    CREATE (p2)-[:OWNS {ownership_percentage: 40.0, status: 'Active'}]->(e1)
    CREATE (e1)-[:OWNS {ownership_percentage: 100.0, status: 'Active'}]->(e2)
    CREATE (e1)-[:OWNS {ownership_percentage: 75.0, status: 'Active'}]->(e3)
    CREATE (e2)-[:OWNS {ownership_percentage: 50.0, status: 'Active'}]->(e4)
    CREATE (e3)-[:OWNS {ownership_percentage: 50.0, status: 'Active'}]->(e4)

    // Create jurisdiction relationships
    CREATE (e1)-[:REGISTERED_IN]->(j_bvi)
    CREATE (e2)-[:REGISTERED_IN]->(j_bvi)
    CREATE (e3)-[:REGISTERED_IN]->(j_pan)
    CREATE (e4)-[:REGISTERED_IN]->(j_pan)

    // Create address relationships (shared address = red flag)
    CREATE (e1)-[:HAS_ADDRESS {address_type: 'Registered', is_primary: true}]->(addr)
    CREATE (e2)-[:HAS_ADDRESS {address_type: 'Registered', is_primary: true}]->(addr)

    RETURN {
        persons: [p1 {.*}, p2 {.*}],
        entities: [e1 {.*}, e2 {.*}, e3 {.*}, e4 {.*}],
        address: addr {.*},
        entity_count: 4,
        person_count: 2,
        relationship_count: 6,
        pep_involved: true
    } AS data
    """
    async with neo4j_driver.session(database=TEST_NEO4J_DATABASE) as db_session:
        cursor = await db_session.run(cypher)
        row = await cursor.single()
        if not row:
            return {}
        return dict(row["data"])
+ """ + async with neo4j_driver.session(database=TEST_NEO4J_DATABASE) as session: + query = """ + CREATE (i:Intermediary { + intermediary_id: 'TEST-INTER-001', + name: 'Test Law Firm LLP', + type: 'Law Firm', + country_code: 'PAN', + status: 'Active' + }) + CREATE (e:Entity { + entity_id: 'TEST-INTER-ENTITY-001', + name: 'Client Company', + jurisdiction_code: 'BVI', + entity_type: 'Company', + status: 'Active' + }) + CREATE (e)-[:CREATED_BY { + creation_date: date('2015-01-01'), + relationship_status: 'Active' + }]->(i) + RETURN { + intermediary: i {.*}, + entity: e {.*} + } AS data + """ + result = await session.run(query) + record = await result.single() + return dict(record["data"]) if record else {} + + +# ============================================================================ +# DATABASE SCHEMA FIXTURES +# ============================================================================ + +@pytest_asyncio.fixture +async def setup_schema(neo4j_driver: AsyncDriver) -> None: + """ + Set up database schema (constraints and indexes) for testing. + + This fixture should be used when testing schema-dependent functionality. 
+ """ + async with neo4j_driver.session(database=TEST_NEO4J_DATABASE) as session: + # Create constraints (idempotent with IF NOT EXISTS) + constraints = [ + "CREATE CONSTRAINT test_entity_id IF NOT EXISTS FOR (e:Entity) REQUIRE e.entity_id IS UNIQUE", + "CREATE CONSTRAINT test_person_id IF NOT EXISTS FOR (p:Person) REQUIRE p.person_id IS UNIQUE", + "CREATE CONSTRAINT test_intermediary_id IF NOT EXISTS FOR (i:Intermediary) REQUIRE i.intermediary_id IS UNIQUE", + ] + + for constraint in constraints: + try: + await session.run(constraint) + except Exception: + pass # Constraint may already exist + + # Create indexes + indexes = [ + "CREATE INDEX test_entity_name IF NOT EXISTS FOR (e:Entity) ON (e.name)", + "CREATE INDEX test_entity_jurisdiction IF NOT EXISTS FOR (e:Entity) ON (e.jurisdiction_code)", + "CREATE INDEX test_person_name IF NOT EXISTS FOR (p:Person) ON (p.full_name)", + ] + + for index in indexes: + try: + await session.run(index) + except Exception: + pass + + +# ============================================================================ +# UTILITY FIXTURES +# ============================================================================ + +@pytest_asyncio.fixture +async def clear_test_data(neo4j_driver: AsyncDriver) -> AsyncGenerator[None, None]: + """ + Fixture that clears test data before and after the test. + + Use this explicitly when you need guaranteed clean state. 
+ """ + # Pre-test cleanup + async with neo4j_driver.session(database=TEST_NEO4J_DATABASE) as session: + await session.run("MATCH (n) WHERE n.entity_id STARTS WITH 'TEST-' DETACH DELETE n") + await session.run("MATCH (n) WHERE n.person_id STARTS WITH 'TEST-' DETACH DELETE n") + + yield + + # Post-test cleanup + async with neo4j_driver.session(database=TEST_NEO4J_DATABASE) as session: + await session.run("MATCH (n) WHERE n.entity_id STARTS WITH 'TEST-' DETACH DELETE n") + await session.run("MATCH (n) WHERE n.person_id STARTS WITH 'TEST-' DETACH DELETE n") @pytest.fixture -def client(): - """Create a test client for the FastAPI application.""" - return TestClient(app) +def mock_neo4j_session() -> MagicMock: + """ + Create a mock Neo4j session for unit testing. + + Returns a MagicMock configured to behave like AsyncSession. + """ + mock_session = MagicMock(spec=AsyncSession) + + # Configure async methods + mock_result = AsyncMock() + mock_result.single = AsyncMock(return_value=None) + mock_result.fetch = AsyncMock(return_value=[]) + mock_result.data = AsyncMock(return_value=[]) + + mock_session.run = AsyncMock(return_value=mock_result) + mock_session.close = AsyncMock() + mock_session.__aenter__ = AsyncMock(return_value=mock_session) + mock_session.__aexit__ = AsyncMock(return_value=None) + + return mock_session + +# ============================================================================ +# TEST DATA GENERATORS +# ============================================================================ @pytest.fixture -def mock_graph_service(): - """Create a mocked GraphService instance.""" - mock_service = Mock(spec=GraphService) - return mock_service +def entity_data_factory(): + """ + Factory fixture for generating entity test data. 
+ + Usage: + def test_something(entity_data_factory): + entity = entity_data_factory(name="Custom Name") + """ + def _factory( + entity_id: str = "TEST-FACTORY-001", + name: str = "Factory Entity", + jurisdiction_code: str = "BVI", + entity_type: str = "Company", + status: str = "Active", + **kwargs, + ) -> dict[str, Any]: + return { + "entity_id": entity_id, + "name": name, + "jurisdiction_code": jurisdiction_code, + "entity_type": entity_type, + "status": status, + **kwargs, + } + + return _factory @pytest.fixture -def patched_graph_service(mock_graph_service): - """Patch GraphService in routers to use the mock.""" - with patch('app.routers.entities.graph_service', mock_graph_service), \ - patch('app.routers.networks.graph_service', mock_graph_service): - yield mock_graph_service +def person_data_factory(): + """ + Factory fixture for generating person test data. + """ + def _factory( + person_id: str = "TEST-PERSON-FACTORY-001", + full_name: str = "Test Person", + nationality: str = "USA", + is_pep: bool = False, + **kwargs, + ) -> dict[str, Any]: + return { + "person_id": person_id, + "full_name": full_name, + "nationality": nationality, + "is_pep": is_pep, + **kwargs, + } + + return _factory + + +# ============================================================================ +# EXPORTED FIXTURES +# ============================================================================ +__all__ = [ + # Event loop + "event_loop", + "event_loop_policy", + + # Database + "neo4j_driver", + "neo4j_driver_session", + "neo4j_session", + "setup_schema", + "clear_test_data", + + # Application + "app_instance", + "async_client", + "async_client_no_db", + + # Sample entities + "sample_entity", + "sample_entities", + "sample_entity_with_jurisdiction", + + # Sample persons + "sample_person", + "sample_pep", + + # Sample relationships + "sample_ownership", + "sample_ownership_chain", + "sample_complex_network", + + # Sample intermediaries + "sample_intermediary", + + # Utilities + 
"mock_neo4j_session", + "entity_data_factory", + "person_data_factory", +] diff --git a/tests/test_api.py b/tests/test_api.py deleted file mode 100644 index e4337ac..0000000 --- a/tests/test_api.py +++ /dev/null @@ -1,318 +0,0 @@ -"""Comprehensive API endpoint tests with mocked GraphService.""" - -import pytest -from fastapi.testclient import TestClient - - -class TestRootEndpoints: - """Tests for root-level endpoints.""" - - def test_root_endpoint(self, client: TestClient): - """Test root endpoint returns welcome message.""" - response = client.get("/") - assert response.status_code == 200 - data = response.json() - assert "message" in data - assert data["message"] == "Graph-Backed Analysis API" - assert "version" in data - assert "docs" in data - - def test_health_endpoint_healthy(self, client: TestClient): - """Test health endpoint when Neo4j is available.""" - # Note: This may fail if Neo4j is not running, which is acceptable - response = client.get("/health") - assert response.status_code == 200 - data = response.json() - assert "status" in data - assert "neo4j" in data - - -class TestEntityEndpoints: - """Tests for entity-related endpoints.""" - - def test_get_ownership_paths_success(self, client: TestClient, patched_graph_service): - """Test successful ownership paths retrieval.""" - # Mock the service response - mock_paths = [ - { - "nodes": [ - { - "id": "12000001", - "labels": ["Officer"], - "properties": {"name": "John Doe", "id": "12000001"} - }, - { - "id": "10000001", - "labels": ["Entity"], - "properties": {"name": "Test Entity", "id": "10000001"} - } - ], - "relationships": [ - { - "type": "OFFICER_OF", - "start_node": "12000001", - "end_node": "10000001", - "properties": {} - } - ], - "length": 1 - } - ] - patched_graph_service.get_entity_ownership_paths.return_value = mock_paths - - response = client.get("/api/entities/10000001/ownership/paths?max_length=5") - assert response.status_code == 200 - data = response.json() - assert isinstance(data, 
list) - assert len(data) == 1 - assert "nodes" in data[0] - assert "relationships" in data[0] - assert "length" in data[0] - patched_graph_service.get_entity_ownership_paths.assert_called_once_with( - "10000001", max_depth=5 - ) - - def test_get_ownership_paths_with_default_max_length(self, client: TestClient, patched_graph_service): - """Test ownership paths with default max_length parameter.""" - patched_graph_service.get_entity_ownership_paths.return_value = [] - - response = client.get("/api/entities/10000001/ownership/paths") - assert response.status_code == 200 - assert response.json() == [] - patched_graph_service.get_entity_ownership_paths.assert_called_once_with( - "10000001", max_depth=5 - ) - - def test_get_ownership_paths_invalid_max_length(self, client: TestClient): - """Test ownership paths with invalid max_length parameter.""" - response = client.get("/api/entities/10000001/ownership/paths?max_length=0") - assert response.status_code == 422 # Validation error - - response = client.get("/api/entities/10000001/ownership/paths?max_length=11") - assert response.status_code == 422 # Validation error - - def test_get_ownership_paths_service_error(self, client: TestClient, patched_graph_service): - """Test ownership paths endpoint handles service errors.""" - patched_graph_service.get_entity_ownership_paths.side_effect = Exception("Database error") - - response = client.get("/api/entities/10000001/ownership/paths") - assert response.status_code == 500 - assert "Error finding ownership paths" in response.json()["detail"] - - -class TestNetworkEndpoints: - """Tests for network analysis endpoints.""" - - def test_get_top_intermediaries_success(self, client: TestClient, patched_graph_service): - """Test successful top intermediaries retrieval.""" - mock_intermediaries = [ - { - "intermediary_id": "11000001", - "name": "Test Intermediary", - "entity_count": 100 - } - ] - patched_graph_service.get_top_intermediaries.return_value = mock_intermediaries - - response 
= client.get("/api/networks/intermediaries/top?limit=10") - assert response.status_code == 200 - data = response.json() - assert isinstance(data, list) - assert len(data) == 1 - assert data[0]["intermediary_id"] == "11000001" - assert data[0]["intermediary_name"] == "Test Intermediary" - assert data[0]["entity_count"] == 100 - patched_graph_service.get_top_intermediaries.assert_called_once_with(limit=10) - - def test_get_top_intermediaries_default_limit(self, client: TestClient, patched_graph_service): - """Test top intermediaries with default limit.""" - patched_graph_service.get_top_intermediaries.return_value = [] - - response = client.get("/api/networks/intermediaries/top") - assert response.status_code == 200 - patched_graph_service.get_top_intermediaries.assert_called_once_with(limit=10) - - def test_get_top_intermediaries_invalid_limit(self, client: TestClient): - """Test top intermediaries with invalid limit.""" - response = client.get("/api/networks/intermediaries/top?limit=0") - assert response.status_code == 422 - - response = client.get("/api/networks/intermediaries/top?limit=101") - assert response.status_code == 422 - - def test_get_top_intermediaries_service_error(self, client: TestClient, patched_graph_service): - """Test top intermediaries endpoint handles service errors.""" - patched_graph_service.get_top_intermediaries.side_effect = Exception("Database error") - - response = client.get("/api/networks/intermediaries/top") - assert response.status_code == 500 - assert "Error finding top intermediaries" in response.json()["detail"] - - def test_get_red_flags_success(self, client: TestClient, patched_graph_service): - """Test successful red flags retrieval.""" - mock_red_flags = [ - { - "address_id": "24000001", - "address": "Test Address", - "entity_count": 50, - "sample_entities": ["Entity 1", "Entity 2", "Entity 3"] - } - ] - patched_graph_service.detect_red_flags.return_value = mock_red_flags - - response = 
client.get("/api/networks/redflags?min_entities=10&limit=50") - assert response.status_code == 200 - data = response.json() - assert isinstance(data, list) - assert len(data) == 1 - assert data[0]["address_id"] == "24000001" - assert data[0]["address"] == "Test Address" - assert data[0]["entity_count"] == 50 - assert len(data[0]["entities"]) == 3 - patched_graph_service.detect_red_flags.assert_called_once_with( - min_entities=10, limit=50 - ) - - def test_get_red_flags_default_params(self, client: TestClient, patched_graph_service): - """Test red flags with default parameters.""" - patched_graph_service.detect_red_flags.return_value = [] - - response = client.get("/api/networks/redflags") - assert response.status_code == 200 - patched_graph_service.detect_red_flags.assert_called_once_with( - min_entities=2, limit=50 - ) - - def test_get_red_flags_invalid_params(self, client: TestClient): - """Test red flags with invalid parameters.""" - response = client.get("/api/networks/redflags?min_entities=1") - assert response.status_code == 422 # min_entities must be >= 2 - - response = client.get("/api/networks/redflags?limit=0") - assert response.status_code == 422 - - def test_get_red_flags_service_error(self, client: TestClient, patched_graph_service): - """Test red flags endpoint handles service errors.""" - patched_graph_service.detect_red_flags.side_effect = Exception("Database error") - - response = client.get("/api/networks/redflags") - assert response.status_code == 500 - assert "Error finding red flags" in response.json()["detail"] - - def test_get_shortest_path_success(self, client: TestClient, patched_graph_service): - """Test successful shortest path retrieval.""" - mock_path = { - "nodes": [ - { - "id": "10000001", - "labels": ["Entity"], - "properties": {"name": "Start Entity", "id": "10000001"} - }, - { - "id": "10000002", - "labels": ["Entity"], - "properties": {"name": "End Entity", "id": "10000002"} - } - ], - "relationships": [ - { - "type": 
"OFFICER_OF", - "start_node": "10000001", - "end_node": "10000002", - "properties": {} - } - ], - "length": 1 - } - patched_graph_service.get_shortest_path.return_value = mock_path - - response = client.get( - "/api/networks/path/shortest?start_node_id=10000001&end_node_id=10000002" - ) - assert response.status_code == 200 - data = response.json() - assert "nodes" in data - assert "relationships" in data - assert "length" in data - assert len(data["nodes"]) == 2 - patched_graph_service.get_shortest_path.assert_called_once_with( - "10000001", "10000002" - ) - - def test_get_shortest_path_no_path(self, client: TestClient, patched_graph_service): - """Test shortest path when no path exists.""" - patched_graph_service.get_shortest_path.return_value = None - - response = client.get( - "/api/networks/path/shortest?start_node_id=10000001&end_node_id=99999999" - ) - assert response.status_code == 200 - assert response.json() is None - - def test_get_shortest_path_missing_params(self, client: TestClient): - """Test shortest path with missing required parameters.""" - response = client.get("/api/networks/path/shortest?start_node_id=10000001") - assert response.status_code == 422 # Missing end_node_id - - response = client.get("/api/networks/path/shortest?end_node_id=10000002") - assert response.status_code == 422 # Missing start_node_id - - def test_get_shortest_path_service_error(self, client: TestClient, patched_graph_service): - """Test shortest path endpoint handles service errors.""" - patched_graph_service.get_shortest_path.side_effect = Exception("Database error") - - response = client.get( - "/api/networks/path/shortest?start_node_id=10000001&end_node_id=10000002" - ) - assert response.status_code == 500 - assert "Error finding shortest path" in response.json()["detail"] - - def test_get_most_connected_officers_success(self, client: TestClient, patched_graph_service): - """Test successful most connected officers retrieval.""" - mock_officers = [ - { - "officer_id": 
"12000001", - "name": "John Doe", - "degree": 50 - }, - { - "officer_id": "12000002", - "name": "Jane Smith", - "degree": 45 - } - ] - patched_graph_service.get_most_connected_officers.return_value = mock_officers - - response = client.get("/api/networks/stats/centrality?limit=10") - assert response.status_code == 200 - data = response.json() - assert isinstance(data, list) - assert len(data) == 2 - assert data[0]["officer_id"] == "12000001" - assert data[0]["name"] == "John Doe" - assert data[0]["degree"] == 50 - patched_graph_service.get_most_connected_officers.assert_called_once_with(limit=10) - - def test_get_most_connected_officers_default_limit(self, client: TestClient, patched_graph_service): - """Test most connected officers with default limit.""" - patched_graph_service.get_most_connected_officers.return_value = [] - - response = client.get("/api/networks/stats/centrality") - assert response.status_code == 200 - patched_graph_service.get_most_connected_officers.assert_called_once_with(limit=10) - - def test_get_most_connected_officers_invalid_limit(self, client: TestClient): - """Test most connected officers with invalid limit.""" - response = client.get("/api/networks/stats/centrality?limit=0") - assert response.status_code == 422 - - response = client.get("/api/networks/stats/centrality?limit=101") - assert response.status_code == 422 - - def test_get_most_connected_officers_service_error(self, client: TestClient, patched_graph_service): - """Test most connected officers endpoint handles service errors.""" - patched_graph_service.get_most_connected_officers.side_effect = Exception("Database error") - - response = client.get("/api/networks/stats/centrality") - assert response.status_code == 500 - assert "Error finding most connected officers" in response.json()["detail"] diff --git a/tests/test_entities.py b/tests/test_entities.py new file mode 100644 index 0000000..f9c31d5 --- /dev/null +++ b/tests/test_entities.py @@ -0,0 +1,897 @@ +""" +Panama Papers 
API - Entity Endpoint Unit Tests +================================================ + +Unit tests for entity-related API endpoints. + +Test Coverage: + - GET /entities/{entity_id} - Entity lookup (success + not found) + - GET /entities/search - Entity search (success + empty + pagination) + - GET /entities/{entity_id}/ownership-path - Ownership tracing + - GET /entities/top/influential - PageRank ranking + - GET /health - Health check + - GET / - Root endpoint + +Usage: + pytest tests/test_entities.py -v --asyncio-mode=auto + pytest tests/test_entities.py -v -k "test_get_entity" + pytest tests/test_entities.py -v -m "not slow" + +Fixtures Required (from conftest.py): + - async_client: FastAPI test client + - sample_entity: Single test entity + - sample_entities: Multiple test entities + - sample_ownership_chain: Multi-hop ownership +""" + +from __future__ import annotations + +import pytest +from httpx import AsyncClient + +# Mark all tests in this module as async +pytestmark = pytest.mark.asyncio + + +# ============================================================================ +# TEST CLASS: GET ENTITY BY ID +# ============================================================================ + +class TestGetEntity: + """Tests for GET /entities/{entity_id} endpoint.""" + + async def test_get_entity_success( + self, + async_client: AsyncClient, + sample_entity: dict, + ): + """ + Test retrieving an existing entity. 
+ + Setup: Create entity in database via sample_entity fixture + Call: GET /entities/{entity_id} + Assert: 200 status, correct entity data returned + Verify: All fields present (name, jurisdiction, type) + """ + entity_id = sample_entity["entity_id"] + + response = await async_client.get(f"/entities/{entity_id}") + + assert response.status_code == 200, f"Expected 200, got {response.status_code}: {response.text}" + + data = response.json() + + # Verify all required fields are present + assert "entity_id" in data + assert "name" in data + assert "jurisdiction_code" in data or "jurisdiction" in data + assert "entity_type" in data or "type" in data + assert "status" in data + + # Verify data matches sample entity + assert data["entity_id"] == sample_entity["entity_id"] + assert data["name"] == sample_entity["name"] + + # Check jurisdiction (handle both field names) + jurisdiction = data.get("jurisdiction_code") or data.get("jurisdiction") + expected_jurisdiction = sample_entity.get("jurisdiction_code") or sample_entity.get("jurisdiction") + assert jurisdiction == expected_jurisdiction + + # Check entity type (handle both field names) + entity_type = data.get("entity_type") or data.get("type") + expected_type = sample_entity.get("entity_type") or sample_entity.get("type") + assert entity_type == expected_type + + async def test_get_entity_not_found( + self, + async_client: AsyncClient, + ): + """ + Test 404 response when entity doesn't exist. + + Call: GET /entities/NONEXISTENT + Assert: 404 status, error message + Verify: "not found" in response detail + """ + response = await async_client.get("/entities/NONEXISTENT-ENTITY-99999") + + assert response.status_code == 404 + + data = response.json() + assert "detail" in data + assert "not found" in data["detail"].lower() + + async def test_get_entity_with_analytics( + self, + async_client: AsyncClient, + sample_entity: dict, + ): + """ + Test retrieving entity with analytics data. 
+ + Setup: Entity with pagerank_score exists + Call: GET /entities/{entity_id}?include_analytics=true + Assert: Response includes pagerank_score, community_id + """ + entity_id = sample_entity["entity_id"] + + response = await async_client.get( + f"/entities/{entity_id}", + params={"include_analytics": True}, + ) + + assert response.status_code == 200 + + data = response.json() + + # Analytics fields should be present (may be null) + assert "pagerank_score" in data + assert "community_id" in data + + async def test_get_entity_invalid_id( + self, + async_client: AsyncClient, + ): + """ + Test handling of invalid entity ID format. + + Call: GET /entities/ (empty) + Assert: 404 or 405 status + """ + response = await async_client.get("/entities/") + + # Empty ID should not match route + assert response.status_code in [404, 405, 307] + + +# ============================================================================ +# TEST CLASS: SEARCH ENTITIES +# ============================================================================ + +class TestSearchEntities: + """Tests for GET /entities/search endpoint.""" + + async def test_search_entities_success( + self, + async_client: AsyncClient, + sample_entities: list, + ): + """ + Test searching entities by name. 
+ + Setup: Create 3+ entities with different names + Call: GET /entities/search?q=Holdings + Assert: 200 status, matching entities returned + Verify: count >= 1, all names contain query + """ + response = await async_client.get( + "/entities/search", + params={"q": "Holdings"}, + ) + + assert response.status_code == 200 + + data = response.json() + + # Response should have results + assert "results" in data + assert "total_results" in data + + # Should find at least one match + assert data["total_results"] >= 1 + assert len(data["results"]) >= 1 + + # All results should contain search term (case-insensitive) + for result in data["results"]: + assert "holdings" in result["name"].lower(), f"Result '{result['name']}' doesn't contain 'Holdings'" + + async def test_search_entities_empty_result( + self, + async_client: AsyncClient, + ): + """ + Test search with no matching results. + + Call: GET /entities/search?q=NONEXISTENT123 + Assert: 200 status, empty list returned + """ + response = await async_client.get( + "/entities/search", + params={"q": "XYZNONEXISTENT123456789"}, + ) + + assert response.status_code == 200 + + data = response.json() + + # Should return empty results + assert "results" in data + assert data["total_results"] == 0 + assert len(data["results"]) == 0 + + async def test_search_entities_pagination( + self, + async_client: AsyncClient, + sample_entities: list, + ): + """ + Test pagination with limit and offset. 
+ + Setup: Create multiple entities + Call: GET /entities/search?q=Test&limit=1&offset=0 + Assert: Exactly 1 result returned + Call: GET /entities/search?q=Test&limit=1&offset=1 + Assert: Next result returned (different from first) + """ + # First page + response1 = await async_client.get( + "/entities/search", + params={"q": "Test", "limit": 1, "offset": 0}, + ) + + assert response1.status_code == 200 + data1 = response1.json() + + # Should respect limit + assert len(data1["results"]) <= 1 + assert data1["limit"] == 1 + assert data1["offset"] == 0 + + # Second page + response2 = await async_client.get( + "/entities/search", + params={"q": "Test", "limit": 1, "offset": 1}, + ) + + assert response2.status_code == 200 + data2 = response2.json() + + assert data2["offset"] == 1 + + # If there are enough results, pages should be different + if data1["total_results"] > 1 and len(data1["results"]) > 0 and len(data2["results"]) > 0: + assert data1["results"][0]["node_id"] != data2["results"][0]["node_id"] + + async def test_search_entities_with_filters( + self, + async_client: AsyncClient, + sample_entities: list, + ): + """ + Test search with jurisdiction filter. + + Setup: Entities in different jurisdictions + Call: GET /entities/search?q=Test&jurisdiction=BVI + Assert: Only BVI entities returned + """ + response = await async_client.get( + "/entities/search", + params={"q": "Test", "jurisdiction": "BVI"}, + ) + + assert response.status_code == 200 + + data = response.json() + + # All results should be in BVI + for result in data["results"]: + if result.get("jurisdiction_code"): + assert result["jurisdiction_code"] == "BVI" + + async def test_search_entities_query_validation( + self, + async_client: AsyncClient, + ): + """ + Test search query validation. 
+ + Call: GET /entities/search?q=A (too short) + Assert: 422 validation error + """ + response = await async_client.get( + "/entities/search", + params={"q": "A"}, # min_length=2 + ) + + assert response.status_code == 422 + + +# ============================================================================ +# TEST CLASS: OWNERSHIP PATH +# ============================================================================ + +class TestOwnershipPath: + """Tests for GET /entities/{entity_id}/ownership-path endpoint.""" + + async def test_ownership_path_success( + self, + async_client: AsyncClient, + sample_ownership_chain: dict, + ): + """ + Test retrieving ownership path. + + Setup: Create beneficial ownership chain (Person -> E1 -> E2 -> E3) + Call: GET /entities/{entity_id}/ownership-path + Assert: 200 status, path data returned + Verify: entities list, relationships list, depth + """ + # Target is the end of the chain + target_entity_id = sample_ownership_chain["entities"][2]["entity_id"] + + response = await async_client.get( + f"/entities/{target_entity_id}/ownership-path", + params={"max_depth": 4}, + ) + + assert response.status_code == 200, f"Expected 200, got {response.status_code}: {response.text}" + + data = response.json() + + # Verify response structure + assert "path_count" in data + assert "paths" in data + assert "average_depth" in data + + # Should have at least one path + assert data["path_count"] >= 1 + assert isinstance(data["paths"], list) + assert len(data["paths"]) >= 1 + + # Verify path structure + path = data["paths"][0] + assert "nodes" in path + assert "edges" in path + assert "depth" in path + assert isinstance(path["nodes"], list) + assert isinstance(path["edges"], list) + + async def test_ownership_path_not_found( + self, + async_client: AsyncClient, + sample_entity: dict, + ): + """ + Test 404 when no ownership paths exist. 
+ + Setup: Create isolated entity (no ownership relationships) + Call: GET /entities/{entity_id}/ownership-path + Assert: 404 status + """ + # sample_entity is standalone, no ownership relationships + entity_id = sample_entity["entity_id"] + + response = await async_client.get( + f"/entities/{entity_id}/ownership-path", + params={"max_depth": 4}, + ) + + # Should be 404 (no paths found) + assert response.status_code == 404 + + data = response.json() + assert "detail" in data + + async def test_ownership_path_entity_not_found( + self, + async_client: AsyncClient, + ): + """ + Test 404 when entity doesn't exist. + + Call: GET /entities/NONEXISTENT/ownership-path + Assert: 404 status + """ + response = await async_client.get( + "/entities/NONEXISTENT-12345/ownership-path", + params={"max_depth": 4}, + ) + + assert response.status_code == 404 + + async def test_ownership_path_depth_validation( + self, + async_client: AsyncClient, + sample_entity: dict, + ): + """ + Test max_depth parameter validation. + + Call: GET /entities/{id}/ownership-path?max_depth=10 + Assert: 422 validation error (max is 6) + """ + entity_id = sample_entity["entity_id"] + + response = await async_client.get( + f"/entities/{entity_id}/ownership-path", + params={"max_depth": 10}, # Exceeds max (6) + ) + + assert response.status_code == 422 + + async def test_ownership_path_effective_ownership( + self, + async_client: AsyncClient, + sample_ownership_chain: dict, + ): + """ + Test effective ownership calculation. 
+ + Setup: Chain with 75% -> 50% -> 100% ownership + Assert: Effective ownership ~ 37.5% + """ + target_entity_id = sample_ownership_chain["entities"][2]["entity_id"] + + response = await async_client.get( + f"/entities/{target_entity_id}/ownership-path", + params={"max_depth": 4}, + ) + + assert response.status_code == 200 + + data = response.json() + + if data["path_count"] > 0: + path = data["paths"][0] + + # Effective ownership should be calculated + if path.get("effective_ownership") is not None: + # Should be approximately 37.5% (75% * 50% * 100%) + assert 30 <= path["effective_ownership"] <= 45 + + +# ============================================================================ +# TEST CLASS: ENTITY NETWORK +# ============================================================================ + +class TestEntityNetwork: + """Tests for GET /entities/{entity_id}/network endpoint.""" + + async def test_entity_network_success( + self, + async_client: AsyncClient, + sample_complex_network: dict, + ): + """ + Test retrieving network neighbors. + + Setup: Entity with connections + Call: GET /entities/{entity_id}/network + Assert: 200 status, connected entities returned + """ + entity_id = sample_complex_network["entities"][0]["entity_id"] + + response = await async_client.get( + f"/entities/{entity_id}/network", + params={"depth": 1, "direction": "both"}, + ) + + assert response.status_code == 200 + + data = response.json() + + # Should return list of relationships + assert isinstance(data, list) + + async def test_entity_network_empty( + self, + async_client: AsyncClient, + sample_entity: dict, + ): + """ + Test network for isolated entity. 
+ + Setup: Entity with no relationships + Call: GET /entities/{entity_id}/network + Assert: 200 status, empty list + """ + entity_id = sample_entity["entity_id"] + + response = await async_client.get( + f"/entities/{entity_id}/network", + params={"depth": 1}, + ) + + assert response.status_code == 200 + + data = response.json() + assert isinstance(data, list) + assert len(data) == 0 + + async def test_entity_network_not_found( + self, + async_client: AsyncClient, + ): + """ + Test 404 when entity doesn't exist. + """ + response = await async_client.get( + "/entities/NONEXISTENT-12345/network", + ) + + assert response.status_code == 404 + + +# ============================================================================ +# TEST CLASS: INFLUENTIAL ENTITIES +# ============================================================================ + +class TestInfluentialEntities: + """Tests for GET /entities/top/influential endpoint.""" + + async def test_influential_entities_success( + self, + async_client: AsyncClient, + sample_complex_network: dict, + ): + """ + Test retrieving top influential entities. + + Setup: Create entities with pagerank_score + Call: GET /entities/top/influential + Assert: 200 status, sorted by score DESC + Verify: Returned list sorted correctly + """ + response = await async_client.get( + "/entities/top/influential", + params={"limit": 10}, + ) + + assert response.status_code == 200 + + data = response.json() + + # Should return list + assert isinstance(data, list) + + # If results exist, verify sorting (descending by PageRank) + if len(data) >= 2: + scores = [e["pagerank_score"] for e in data] + assert scores == sorted(scores, reverse=True), "Results should be sorted by PageRank descending" + + async def test_influential_entities_limit( + self, + async_client: AsyncClient, + sample_complex_network: dict, + ): + """ + Test limit parameter for influential entities. 
+ + Call: GET /entities/top/influential?limit=2 + Assert: At most 2 results returned + """ + response = await async_client.get( + "/entities/top/influential", + params={"limit": 2}, + ) + + assert response.status_code == 200 + + data = response.json() + assert len(data) <= 2 + + async def test_influential_entities_jurisdiction_filter( + self, + async_client: AsyncClient, + sample_complex_network: dict, + ): + """ + Test jurisdiction filter for influential entities. + + Call: GET /entities/top/influential?jurisdiction=BVI + Assert: Only BVI entities returned + """ + response = await async_client.get( + "/entities/top/influential", + params={"jurisdiction": "BVI", "limit": 10}, + ) + + assert response.status_code == 200 + + data = response.json() + + # All results should be in BVI + for entity in data: + if entity.get("jurisdiction_code"): + assert entity["jurisdiction_code"] == "BVI" + + async def test_influential_entities_empty( + self, + async_client: AsyncClient, + ): + """ + Test when no entities have PageRank scores. + + Call: GET /entities/top/influential?min_score=999999 + Assert: 200 status, empty list + """ + response = await async_client.get( + "/entities/top/influential", + params={"min_score": 999999}, # Very high threshold + ) + + assert response.status_code == 200 + + data = response.json() + assert isinstance(data, list) + + async def test_influential_entities_response_structure( + self, + async_client: AsyncClient, + sample_complex_network: dict, + ): + """ + Test response structure for influential entities. 
+ + Verify: Required fields present + """ + response = await async_client.get( + "/entities/top/influential", + params={"limit": 5}, + ) + + assert response.status_code == 200 + + data = response.json() + + if len(data) > 0: + entity = data[0] + # Verify required fields + assert "entity_id" in entity + assert "name" in entity + assert "pagerank_score" in entity + assert "rank" in entity + + +# ============================================================================ +# TEST CLASS: CONNECTED ENTITIES +# ============================================================================ + +class TestConnectedEntities: + """Tests for GET /entities/top/connected endpoint.""" + + async def test_connected_entities_success( + self, + async_client: AsyncClient, + sample_complex_network: dict, + ): + """ + Test retrieving most connected entities. + + Call: GET /entities/top/connected + Assert: 200 status, list returned + """ + response = await async_client.get( + "/entities/top/connected", + params={"limit": 10}, + ) + + assert response.status_code == 200 + + data = response.json() + assert isinstance(data, list) + + +# ============================================================================ +# TEST CLASS: ENTITIES BY JURISDICTION +# ============================================================================ + +class TestEntitiesByJurisdiction: + """Tests for GET /entities/by-jurisdiction/{code} endpoint.""" + + async def test_entities_by_jurisdiction_success( + self, + async_client: AsyncClient, + sample_entities: list, + ): + """ + Test retrieving entities by jurisdiction. 
+ + Call: GET /entities/by-jurisdiction/BVI + Assert: 200 status, all results in BVI + """ + response = await async_client.get("/entities/by-jurisdiction/BVI") + + assert response.status_code == 200 + + data = response.json() + assert isinstance(data, list) + + # All results should be in BVI + for entity in data: + assert entity["jurisdiction_code"] == "BVI" + + async def test_entities_by_jurisdiction_empty( + self, + async_client: AsyncClient, + ): + """ + Test when no entities in jurisdiction. + + Call: GET /entities/by-jurisdiction/XYZ + Assert: 200 status, empty list + """ + response = await async_client.get("/entities/by-jurisdiction/XYZ") + + assert response.status_code == 200 + + data = response.json() + assert isinstance(data, list) + assert len(data) == 0 + + +# ============================================================================ +# TEST CLASS: ENTITY RISK ANALYSIS +# ============================================================================ + +class TestEntityRiskAnalysis: + """Tests for GET /entities/{entity_id}/risk endpoint.""" + + async def test_risk_analysis_success( + self, + async_client: AsyncClient, + sample_complex_network: dict, + ): + """ + Test risk analysis retrieval. 
+ + Setup: Entity with risk factors + Call: GET /entities/{entity_id}/risk + Assert: 200 status, risk data returned + """ + entity_id = sample_complex_network["entities"][0]["entity_id"] + + response = await async_client.get(f"/entities/{entity_id}/risk") + + assert response.status_code == 200 + + data = response.json() + + # Verify response structure + assert "entity_id" in data + assert "overall_risk_score" in data + assert "overall_risk_level" in data + assert "red_flags" in data + + # Risk score should be 0-100 + assert 0 <= data["overall_risk_score"] <= 100 + + # Risk level should be valid + assert data["overall_risk_level"] in ["LOW", "MEDIUM", "HIGH", "CRITICAL", "UNKNOWN"] + + async def test_risk_analysis_not_found( + self, + async_client: AsyncClient, + ): + """ + Test 404 when entity doesn't exist. + """ + response = await async_client.get("/entities/NONEXISTENT-12345/risk") + + assert response.status_code == 404 + + +# ============================================================================ +# TEST CLASS: HEALTH CHECK +# ============================================================================ + +class TestHealthCheck: + """Tests for health endpoints.""" + + async def test_health_check( + self, + async_client: AsyncClient, + ): + """ + Test /health endpoint. + + Call: GET /health + Assert: 200 status (or 503 if db unavailable) + Verify: status field present + """ + response = await async_client.get("/health") + + assert response.status_code in [200, 503] + + data = response.json() + assert "status" in data + assert data["status"] in ["healthy", "degraded", "unhealthy"] + + async def test_root_endpoint( + self, + async_client: AsyncClient, + ): + """ + Test / root endpoint. 
+ + Call: GET / + Assert: 200 status, API info returned + """ + response = await async_client.get("/") + + assert response.status_code == 200 + + data = response.json() + assert "version" in data + assert "service" in data or "message" in data + + async def test_readiness_check( + self, + async_client: AsyncClient, + ): + """ + Test /ready endpoint. + + Call: GET /ready + Assert: 200 if ready, 503 if not + """ + response = await async_client.get("/ready") + + assert response.status_code in [200, 503] + + data = response.json() + assert "ready" in data + + async def test_liveness_check( + self, + async_client: AsyncClient, + ): + """ + Test /live endpoint. + + Call: GET /live + Assert: Always 200 if process alive + """ + response = await async_client.get("/live") + + assert response.status_code == 200 + + data = response.json() + assert data["alive"] is True + + +# ============================================================================ +# TEST CLASS: ERROR HANDLING +# ============================================================================ + +class TestErrorHandling: + """Tests for error handling.""" + + async def test_invalid_endpoint( + self, + async_client: AsyncClient, + ): + """ + Test 404 for non-existent endpoint. + """ + response = await async_client.get("/nonexistent/endpoint") + + assert response.status_code == 404 + + async def test_method_not_allowed( + self, + async_client: AsyncClient, + ): + """ + Test 405 for unsupported HTTP method. + """ + response = await async_client.post("/entities/TEST-001") + + assert response.status_code == 405 + + async def test_validation_error_format( + self, + async_client: AsyncClient, + ): + """ + Test validation error response format. + """ + response = await async_client.get( + "/entities/search", + params={"q": "A"}, # Too short + ) + + assert response.status_code == 422 + + data = response.json() + assert "detail" in data or "errors" in data