diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..e6de5ac --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,73 @@ +name: Tests + +on: + push: + branches: [main] + pull_request: + +jobs: + unit-and-sqlite-tests: + name: Unit & SQLite tests (Python ${{ matrix.python-version }}) + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.12", "3.13"] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: pip install -e ".[dev]" + + - name: Run tests (excluding postgres) + run: pytest -m "not postgres" -q + + postgres-integration-tests: + name: PostgreSQL integration tests (Python ${{ matrix.python-version }}) + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ["3.12", "3.13"] + + services: + postgres: + image: postgres:16 + env: + POSTGRES_USER: test + POSTGRES_PASSWORD: test + POSTGRES_DB: test_db + ports: + - 55432:5432 + options: >- + --health-cmd "pg_isready -U test -d test_db" + --health-interval 2s + --health-timeout 5s + --health-retries 10 + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies (including postgres extra) + run: pip install -e ".[dev,postgres]" + + - name: Run postgres integration tests + run: pytest -m postgres -v + env: + PGHOST: localhost + PGPORT: 55432 + PGUSER: test + PGPASSWORD: test + PGDATABASE: test_db + ENGINE_CDM: postgresql+psycopg://test:test@localhost:55432/test_db diff --git a/.gitignore b/.gitignore index 3b77ef3..92ea8da 100644 --- a/.gitignore +++ b/.gitignore @@ -52,6 +52,9 @@ RELATIONSHIP.csv DOMAIN.csv CONCEPT_ANCESTOR.csv CONCEPT_SYNONYM.csv +# Allow committed test fixtures 
(minimal CSVs, not real Athena downloads) +!tests/fixtures/athena_source/ +!tests/fixtures/athena_source/*.csv data/ *.db-journal vocabulary_files/ @@ -66,4 +69,7 @@ logging/ _temp/ temp/ *.dump -*.bak \ No newline at end of file +*.bak +notebooks/ +.dockerignore +docker/ \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 3c29534..c43b5ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -91,4 +91,11 @@ - set minimum versions per dependabot (dev and required deps) ## 0.6.2 -- capped maximum `orm-loader` version to avoid pulling in future breaking changes \ No newline at end of file +- capped maximum `orm-loader` version to avoid pulling in future breaking changes + +## 0.6.3 +- fix CSV quote mode for Athena vocabulary loading: switch from `literal` to `auto` to prevent quoted concept names from overflowing `VARCHAR(255)` database columns +- make `chunksize=100_000` the default for `load-vocab-source` (was `None`/disabled); pass `--chunksize 0` to disable chunking explicitly +- **breaking:** `load-vocab-source` CLI now defaults `--merge-strategy` to `replace` (was `upsert`) to match the Python API default and ensure retired concepts are purged on vocabulary refresh; pass `--merge-strategy upsert` to restore the previous behaviour +- **breaking:** CLI entry point renamed from `omop-maint` to `omop-alchemy`; update any scripts or aliases accordingly (saved `.omop-maint.toml` defaults files are unaffected) +- remove stale notebooks from repository diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml deleted file mode 100644 index bfc1c9b..0000000 --- a/docker/docker-compose.yml +++ /dev/null @@ -1,94 +0,0 @@ -volumes: - postgres-data: - name: postgres-data - pgadmin-data: - name: pgadmin-data - -networks: - cava-network: - name: cava-network - driver: bridge - -services: - pgadmin: - profiles: [ "pgadmin"] - image: dpage/pgadmin4:latest - restart: unless-stopped - networks: - - cava-network - environment: - PGADMIN_DEFAULT_EMAIL: 
a@b.c - PGADMIN_DEFAULT_PASSWORD: pwd - SCRIPT_NAME: /pgadmin4 - volumes: - - pgadmin-data:/var/lib/pgadmin - ports: - - "5050:80" - python: - build: ./python - restart: unless-stopped - networks: - - cava-network - environment: - POSTGRES_USER: ${POSTGRES_USER} - POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} - POSTGRES_DB: ${POSTGRES_DB} - ENGINE_CDM: postgresql+psycopg2://${POSTGRES_USER}:${POSTGRES_PASSWORD}@postgres:5432/${POSTGRES_DB} - env_file: - - .env - depends_on: - postgres: - condition: service_healthy - volumes: - - ..:/workspace:rw - command: tail -f /dev/null - postgres: - build: ./postgres - networks: - - cava-network - environment: - POSTGRES_USER: ${POSTGRES_USER} - POSTGRES_PASSWORD: ${POSTGRES_PASSWORD} - POSTGRES_DB: ${POSTGRES_DB} - env_file: - - .env - restart: unless-stopped - volumes: - - ./data:/home/data:rw - - postgres-data:/var/lib/postgresql - - ./custom.conf:/etc/postgresql/custom.conf - healthcheck: - test: ["CMD-SHELL", "pg_isready -U postgres"] - interval: 5s - timeout: 5s - retries: 10 - command: - - postgres - - -c - - include_if_exists=/etc/postgresql/custom.conf - cava-jupyter-notebook: - profiles: [ "jupyter"] - build: ./jupyter - restart: unless-stopped - depends_on: - postgres: - condition: service_healthy - networks: - - cava-network - environment: - JUPYTERHUB_SERVICE_PREFIX: /jupyter/ - JUPYTERHUB_BASE_URL: ${HTTP_TYPE}://${HOST} - env_file: - - .env - volumes: - - ./work:/home/jovyan/work:rw - command: - - jupyter-lab - - --ip=* - - --NotebookApp.token= - - --NotebookApp.password= - - --NotebookApp.base_url=/jupyter - ports: - - "8888:8888" - mem_limit: 12g - shm_size: 4g \ No newline at end of file diff --git a/docker/jupyter/Dockerfile b/docker/jupyter/Dockerfile deleted file mode 100644 index 7a6abc6..0000000 --- a/docker/jupyter/Dockerfile +++ /dev/null @@ -1,27 +0,0 @@ -FROM quay.io/jupyter/minimal-notebook:python-3.13 - -USER root - -# Force uv install location -ENV HOME=/root -ENV PATH="/root/.local/bin:${PATH}" - -# 
Install uv -RUN curl -LsSf https://astral.sh/uv/install.sh | sh - -# Create uv venv -RUN uv venv /opt/venv -ENV VIRTUAL_ENV=/opt/venv -ENV PATH="/opt/venv/bin:${PATH}" - -# Install Python deps -RUN uv pip install omop-alchemy psycopg2-binary pip omop-graph -RUN /opt/venv/bin/python -m pip install ipykernel && \ - /opt/venv/bin/python -m ipykernel install \ - --name uv-venv \ - --display-name "Python (uv venv)" -# Switch back to notebook user -USER jovyan -ENV HOME=/home/jovyan -COPY ./.env /home/jovyan/work/.env -WORKDIR /home/jovyan/work \ No newline at end of file diff --git a/docker/postgres/Dockerfile b/docker/postgres/Dockerfile deleted file mode 100644 index 93f0fba..0000000 --- a/docker/postgres/Dockerfile +++ /dev/null @@ -1,8 +0,0 @@ -#FROM timescale/timescaledb-ha:pg18 -FROM postgres:18 - -# Optional: timezone / locale tweaks -ENV TZ=UTC - -# Expose is informational only -EXPOSE 5432 \ No newline at end of file diff --git a/docker/postgres/custom.conf b/docker/postgres/custom.conf deleted file mode 100644 index 9927308..0000000 --- a/docker/postgres/custom.conf +++ /dev/null @@ -1,10 +0,0 @@ -# Performance tuning for bulk loads -max_wal_size = '20GB' -checkpoint_timeout = '30min' -wal_compression = on - -# Memory -shared_buffers = '6GB' -work_mem = '256MB' -maintenance_work_mem = '2GB' -effective_cache_size = '16GB' \ No newline at end of file diff --git a/docker/python/.dockerignore b/docker/python/.dockerignore deleted file mode 100644 index 4a27e3c..0000000 --- a/docker/python/.dockerignore +++ /dev/null @@ -1,6 +0,0 @@ -.venv -__pycache__ -.git -.gitignore -.env -data \ No newline at end of file diff --git a/docker/python/Dockerfile b/docker/python/Dockerfile deleted file mode 100644 index 6a54075..0000000 --- a/docker/python/Dockerfile +++ /dev/null @@ -1,47 +0,0 @@ -# ---- Stage 1: postgres tools ---- -FROM postgres:18 AS pgtools - -# ---- Stage 2: python ---- -FROM python:3.13 - -ENV PYTHONPYCACHEPREFIX=/tmp/pycache \ - PYTHONUNBUFFERED=1 \ - 
UV_PROJECT_ENVIRONMENT=/home/vscode/.venv \ - UV_CACHE_DIR=/home/vscode/.cache/uv \ - PATH="/usr/local/bin:/home/vscode/.venv/bin:$PATH" \ - LANG=C.UTF-8 \ - LC_ALL=C.UTF-8 - -# system deps -RUN apt-get update && apt-get install -y --no-install-recommends \ - git \ - curl \ - bash \ - bash-completion \ - less \ - vim \ - && rm -rf /var/lib/apt/lists/* - -# copy binaries from pgtools stage -COPY --from=pgtools /usr/lib/postgresql /usr/lib/postgresql -COPY --from=pgtools /usr/lib/aarch64-linux-gnu/libpq* /usr/lib/aarch64-linux-gnu/ - -RUN ln -s /usr/lib/postgresql/18/bin/psql /usr/local/bin/psql \ - && ln -s /usr/lib/postgresql/18/bin/pg_dump /usr/local/bin/pg_dump \ - && ln -s /usr/lib/postgresql/18/bin/pg_restore /usr/local/bin/pg_restore - -# ---- uv install ---- -COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv - -# ---- User setup ---- -RUN useradd -m -s /bin/bash vscode - -WORKDIR /workspace - -# ---- Auto-activate venv ---- -RUN printf '\nif [ -f /home/vscode/.venv/bin/activate ] && [ -z "$VIRTUAL_ENV" ]; then\n . 
/home/vscode/.venv/bin/activate\nfi\n' >> /home/vscode/.bashrc \ - && chown vscode:vscode /home/vscode/.bashrc - -USER vscode - -CMD ["sleep", "infinity"] \ No newline at end of file diff --git a/docs/advanced/fulltext.md b/docs/advanced/fulltext.md index ab1c531..6cadc08 100644 --- a/docs/advanced/fulltext.md +++ b/docs/advanced/fulltext.md @@ -58,8 +58,8 @@ SELECT 'a fat cat sat on a mat and ate a fat rat'::tsvector; To enable the optional full-text sidecars in a PostgreSQL environment: ```bash -omop-maint fulltext install -omop-maint fulltext populate +omop-alchemy fulltext install +omop-alchemy fulltext populate ``` If your running Python process should use the stored sidecar columns through ORM @@ -164,28 +164,28 @@ This is the mode you want when: The maintenance CLI manages the full-text sidecars through: ```bash -omop-maint fulltext install -omop-maint fulltext populate -omop-maint fulltext drop +omop-alchemy fulltext install +omop-alchemy fulltext populate +omop-alchemy fulltext drop ``` Typical workflow: ```bash -omop-maint fulltext install -omop-maint fulltext populate +omop-alchemy fulltext install +omop-alchemy fulltext populate ``` If you later reload or update vocabulary data, refresh the stored vectors with: ```bash -omop-maint fulltext populate +omop-alchemy fulltext populate ``` If you want to remove the feature completely: ```bash -omop-maint fulltext drop +omop-alchemy fulltext drop ``` --- @@ -280,7 +280,7 @@ drop lifecycle is only meaningful on PostgreSQL. 
## Operational Gotchas - treat the sidecar columns as **derived search state**, not source-of-truth data -- if you bulk-load new vocabulary rows, rerun `omop-maint fulltext populate` +- if you bulk-load new vocabulary rows, rerun `omop-alchemy fulltext populate` - if you use `reconcile-schema`, the sidecar columns and indexes are intentional database additions outside the core OMOP schema - GIN indexes can be expensive to build on large vocabularies, so plan that as a real diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md index ff0f3ea..d8b7a47 100644 --- a/docs/getting-started/installation.md +++ b/docs/getting-started/installation.md @@ -180,14 +180,14 @@ At the database level: Typical maintenance workflow: ```bash -omop-maint fulltext install -omop-maint fulltext populate +omop-alchemy fulltext install +omop-alchemy fulltext populate ``` If you later reload vocabulary data, rerun: ```bash -omop-maint fulltext populate +omop-alchemy fulltext populate ``` For the full design and query patterns, see: diff --git a/docs/getting-started/maintenance.md b/docs/getting-started/maintenance.md index 98b4114..4f8f09a 100644 --- a/docs/getting-started/maintenance.md +++ b/docs/getting-started/maintenance.md @@ -11,7 +11,7 @@ database. ## Entrypoint ```bash -omop-maint --help +omop-alchemy --help python -m omop_alchemy.maintenance.cli --help ``` @@ -33,27 +33,27 @@ Common flags used by many commands: !!! info "Defaults file discovery" - Project-local defaults are stored in `.omop-maint.toml`. + Project-local defaults are stored in `.omop-maint.toml`. 
- the CLI looks for the nearest ancestor directory containing `pyproject.toml` - and uses `/.omop-maint.toml` - - if no ancestor project marker is found, it falls back to `./.omop-maint.toml` + and uses `/.omop-maint.toml` + - if no ancestor project marker is found, it falls back to `./.omop-maint.toml` in the current working directory - to force a fixed path, set `OMOP_MAINT_DEFAULTS_FILE` - - running `omop-maint` from outside your intended project tree may use a different + - running `omop-alchemy` from outside your intended project tree may use a different defaults file than expected ```bash -omop-maint config show -omop-maint config set-overrides --dotenv .env --engine-schema cdm --db-schema public --athena-source ./athena_source -omop-maint config clear-overrides -omop-maint config clear-overrides --db-schema +omop-alchemy config show +omop-alchemy config set-overrides --dotenv .env --engine-schema cdm --db-schema public --athena-source ./athena_source +omop-alchemy config clear-overrides +omop-alchemy config clear-overrides --db-schema ``` Resolution order: 1. explicit CLI flag -2. saved `.omop-maint.toml` default +2. saved `.omop-maint.toml` default 3. command fallback `engine_schema` selects the configured engine URL (`ENGINE_` or `ENGINE`). @@ -99,49 +99,49 @@ user-facing error. 
### Inspect ```bash -omop-maint info -omop-maint doctor -omop-maint doctor --deep +omop-alchemy info +omop-alchemy doctor +omop-alchemy doctor --deep ``` ### Schema ```bash -omop-maint reconcile-schema -omop-maint create-missing-tables --dry-run -omop-maint create-missing-tables +omop-alchemy reconcile-schema +omop-alchemy create-missing-tables --dry-run +omop-alchemy create-missing-tables ``` ### Vocabulary ```bash -omop-maint load-vocab-source -omop-maint load-vocab-source --athena-source ./athena_source --dry-run +omop-alchemy load-vocab-source +omop-alchemy load-vocab-source --athena-source ./athena_source --dry-run ``` ### Bulk reload helpers ```bash -omop-maint foreign-keys disable -omop-maint indexes disable -omop-maint truncate-tables --scope clinical --restart-identities --yes +omop-alchemy foreign-keys disable +omop-alchemy indexes disable +omop-alchemy truncate-tables --scope clinical --restart-identities --yes ``` After ETL: ```bash -omop-maint reset-sequences -omop-maint indexes enable -omop-maint foreign-keys enable --strict -omop-maint analyze-tables --scope clinical +omop-alchemy reset-sequences +omop-alchemy indexes enable +omop-alchemy foreign-keys enable --strict +omop-alchemy analyze-tables --scope clinical ``` ### Full-text sidecars ```bash -omop-maint fulltext install -omop-maint fulltext populate -omop-maint fulltext drop +omop-alchemy fulltext install +omop-alchemy fulltext populate +omop-alchemy fulltext drop ``` For query-side usage and optional ORM metadata registration, see @@ -150,8 +150,8 @@ For query-side usage and optional ORM metadata registration, see ### Backup and restore ```bash -omop-maint backup-database --engine-schema source --output-path ./cdm.dump -omop-maint restore-database ./cdm.dump --format custom --engine-schema target +omop-alchemy backup-database --engine-schema source --output-path ./cdm.dump +omop-alchemy restore-database ./cdm.dump --format custom --engine-schema target ``` --- @@ -176,8 +176,8 @@ omop-maint 
restore-database ./cdm.dump --format custom --engine-schema target ## Help ```bash -omop-maint --help -omop-maint doctor --help -omop-maint fulltext --help -omop-maint config --help +omop-alchemy --help +omop-alchemy doctor --help +omop-alchemy fulltext --help +omop-alchemy config --help ``` diff --git a/docs/getting-started/quickstart.md b/docs/getting-started/quickstart.md index c2b67e0..03a8036 100644 --- a/docs/getting-started/quickstart.md +++ b/docs/getting-started/quickstart.md @@ -15,13 +15,16 @@ The goal is to provide a fast, reproducible environment for: When started with the appropriate profile, this stack runs: -- **PostgreSQL** (`cava-database`) - - Custom-built image (see `docker/postgres/Dockerfile`) +- **PostgreSQL** (`postgres`) + - Official `postgres:18` image with bulk-load-oriented runtime tuning in compose - Persistent storage via Docker volumes +- **Python workspace** (`python`) + - Local OMOP Alchemy source installed into a reusable container image + - PostgreSQL client tools included for direct `psql` / `pg_dump` access - **pgAdmin** (`pgadmin`) - - Web UI for inspecting and querying PostgreSQL + - Web UI for inspecting and querying PostgreSQL (optional) - **JupyterLab** (`cava-jupyter-notebook`, optional) - - Notebook environment wired to the same database + - Notebook environment built from the local repo and wired to the same database All services communicate on a dedicated Docker bridge network (`cava-network`). @@ -48,23 +51,27 @@ POSTGRES_DB=cava HOST=localhost HTTP_TYPE=http - -PYTHON_BIND_MOUNT=/absolute/path/to/your/code_or_data ``` These credentials are not secure and are intentionally simple for local use. ### Starting the stack -From the `docker` directory +From the `docker/` directory. 
+ +#### Database + Python workspace + +``` +docker compose up -d +``` -#### Database + pgAdmin only +#### Database + Python workspace + pgAdmin ``` -docker compose --profile default up -d +docker compose --profile pgadmin up -d ``` -#### Database + pgAdmin + Jupyter +#### Database + Python workspace + Jupyter ``` docker compose --profile jupyter up -d diff --git a/notebooks/00_select_test_fixtures.ipynb b/notebooks/00_select_test_fixtures.ipynb deleted file mode 100644 index 8385b37..0000000 --- a/notebooks/00_select_test_fixtures.ipynb +++ /dev/null @@ -1,293 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "7113aac3", - "metadata": {}, - "outputs": [], - "source": [ - "from omop_alchemy import get_engine_name, load_environment, TEST_PATH, ROOT_PATH\n", - "from orm_loader.helpers import get_logger\n", - "from dotenv import load_dotenv\n", - "from pathlib import Path\n", - "import os\n", - "import pandas as pd\n", - "# old enumerator classes from monolithic version of omop_alchemy - selection of cancer-relevant codes\n", - "import concept_enums\n", - "\n", - "base_path = TEST_PATH / \"fixtures\" / \"athena_source\"\n", - "load_dotenv()\n", - "source_path = Path(os.getenv('SOURCE_PATH', 'update/path/to/athena/source/as/required'))" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "d7b63035", - "metadata": {}, - "outputs": [], - "source": [ - "concept = pd.read_csv(source_path / 'CONCEPT.csv', delimiter='\\t', low_memory=False)\n", - "concept_class = pd.read_csv(source_path / 'CONCEPT_CLASS.csv', delimiter='\\t')\n", - "relationship = pd.read_csv(source_path / 'RELATIONSHIP.csv', delimiter='\\t')\n", - "domain = pd.read_csv(source_path / 'DOMAIN.csv', delimiter='\\t')\n", - "vocabulary = pd.read_csv(source_path / 'VOCABULARY.csv', delimiter='\\t')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bff8c220", - "metadata": {}, - "outputs": [], - "source": [ - "required_concepts = 
set(concept_class.concept_class_concept_id) | set(relationship.relationship_concept_id) | set(domain.domain_concept_id) | set(vocabulary.vocabulary_concept_id)\n", - "required_concepts_df = concept[concept.concept_id.isin(required_concepts)]\n", - "\n", - "selected = []\n", - "for d in set(domain.domain_id):\n", - " try:\n", - " c = concept[(concept.domain_id == d) & (concept.standard_concept == 'S')]\n", - " selected.append(c.sample(min(50, len(c)), random_state=1))\n", - " except ValueError:\n", - " print(f\"Not enough standard concepts in domain {d}\")\n", - " pass" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d4b273fa", - "metadata": {}, - "outputs": [], - "source": [ - "standard_concept_by_domain_df = pd.concat(selected)\n", - "\n", - "additional_test_concepts = set([x for y in \n", - " [concept_enums.__dict__[cls].member_values() \n", - " for cls in dir(concept_enums) \n", - " if hasattr(concept_enums.__dict__[cls], 'member_values')\n", - " ] \n", - " for x in y])\n", - "\n", - "additional_test_concept_df = concept[concept.concept_id.isin(additional_test_concepts)]\n", - "\n", - "metadata = concept[concept.domain_id == 'Metadata']\n", - "language = concept[concept.domain_id == 'Language']\n", - "locations = concept[(concept.concept_class_id=='Location') & (concept.standard_concept.notna())].sample(frac=0.1, replace=False)\n", - "\n", - "additional_cancer_ones = []\n", - "\n", - "for vocab, frac in {'Cancer Modifier': 1.0, 'HemOnc': 0.1, 'ICDO3': 0.05}.items():\n", - " additional_cancer_ones.append(concept[(concept.vocabulary_id == vocab) & concept.standard_concept.notna()].sample(frac=frac, replace=False))\n", - "\n", - "cancer_specific_df = pd.concat(additional_cancer_ones)\n", - "\n", - "selected_concept_df = pd.concat(\n", - " [\n", - " standard_concept_by_domain_df,\n", - " required_concepts_df,\n", - " additional_test_concept_df,\n", - " cancer_specific_df,\n", - " locations,\n", - " metadata,\n", - " language\n", - " ]\n", - 
").drop_duplicates()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d40f0ebd", - "metadata": {}, - "outputs": [], - "source": [ - "selected_relationships = []\n", - "\n", - "for concept_rel in pd.read_csv(source_path / 'CONCEPT_RELATIONSHIP.csv', delimiter='\\t', low_memory=False, chunksize=100000):\n", - " filtered = concept_rel[\n", - " (concept_rel.concept_id_1.isin(selected_concept_df.concept_id)) &\n", - " (concept_rel.concept_id_2.isin(selected_concept_df.concept_id))\n", - " ]\n", - " if not filtered.empty:\n", - " selected_relationships.append(filtered)\n", - "\n", - "selected_ancestry = []\n", - "\n", - "for concept_anc in pd.read_csv(source_path / 'CONCEPT_ANCESTOR.csv', delimiter='\\t', low_memory=False, chunksize=100000):\n", - " filtered = concept_anc[\n", - " (concept_anc.ancestor_concept_id.isin(selected_concept_df.concept_id)) &\n", - " (concept_anc.descendant_concept_id.isin(selected_concept_df.concept_id))\n", - " ]\n", - " if not filtered.empty:\n", - " selected_ancestry.append(filtered)\n", - "\n", - "selected_synonyms = []\n", - "\n", - "for concept_syn in pd.read_csv(source_path / 'CONCEPT_SYNONYM.csv', delimiter='\\t', low_memory=False, chunksize=100000):\n", - " filtered = concept_syn[\n", - " (concept_syn.concept_id.isin(selected_concept_df.concept_id))\n", - " ]\n", - " if not filtered.empty:\n", - " selected_synonyms.append(filtered)\n", - "\n", - "\n", - "selected_relationship_df = pd.concat(selected_relationships)\n", - "selected_ancestry_df = pd.concat(selected_ancestry)\n", - "selected_synonyms_df = pd.concat(selected_synonyms)\n", - "\n", - "\n", - "selected_relationship_df.to_csv(base_path / 'CONCEPT_RELATIONSHIP.csv', sep='\\t', index=False)\n", - "selected_synonyms_df.to_csv(base_path / 'CONCEPT_SYNONYM.csv', sep='\\t', index=False)\n", - "selected_ancestry_df.to_csv(base_path / 'CONCEPT_ANCESTOR.csv', sep='\\t', index=False)\n", - "selected_concept_df.to_csv(base_path / 'CONCEPT.csv', sep='\\t', 
index=False)\n", - "domain.to_csv(base_path / 'DOMAIN.csv', sep='\\t', index=False)\n", - "vocabulary.to_csv(base_path / 'VOCABULARY.csv', sep='\\t', index=False)\n", - "relationship.to_csv(base_path / 'RELATIONSHIP.csv', sep='\\t', index=False)\n", - "concept_class.to_csv(base_path / 'CONCEPT_CLASS.csv', sep='\\t', index=False)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9c4c1353", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "796f5be8", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c9c5b8b3", - "metadata": {}, - "outputs": [], - "source": [ - "for f in [domain, vocabulary, relationship, concept_class, selected_relationship_df, selected_ancestry_df, selected_synonyms_df]:\n", - " for col in f.columns:\n", - " if 'concept_id' in col:\n", - " if len(f[~f[col].isin(selected_concept_df.concept_id)]) > 0:\n", - " raise ValueError(f\"Found concept_id in {col} not in selected concepts\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b465bc6c", - "metadata": {}, - "outputs": [], - "source": [ - "assert len(selected_relationship_df[~selected_relationship_df.relationship_id.isin(relationship.relationship_id.unique())]) == 0, \"Found relationship_id not in selected relationships\"\n", - "assert len(concept[~concept.concept_class_id.isin(concept_class.concept_class_id.unique())]) == 0, \"Found concept_class_id not in selected concepts\"\n", - "assert len(concept[~concept.domain_id.isin(domain.domain_id.unique())]) == 0, \"Found domain_id not in selected domains\"\n", - "assert len(concept[~concept.vocabulary_id.isin(vocabulary.vocabulary_id.unique())]) == 0, \"Found vocabulary_id not in selected vocabularies\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f65cc24f", - "metadata": {}, - "outputs": [], - "source": [ - "for f in [selected_concept_df, 
domain, vocabulary, relationship, concept_class, selected_relationship_df, selected_ancestry_df]:\n", - " assert(len(f[f.duplicated()]) == 0), f\"Found duplicated rows in {f}\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "97014890", - "metadata": {}, - "outputs": [], - "source": [ - "# this is the import issue...TODO: add pk null normalisation on load\n", - "vocabulary.loc[vocabulary.vocabulary_id.isna(), 'vocabulary_id'] = 'Unknown_Vocabulary'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "322e679f", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4ff54924", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8326f2a3", - "metadata": {}, - "outputs": [], - "source": [ - "metadata[metadata.concept_id==1147138]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dc803944", - "metadata": {}, - "outputs": [], - "source": [ - "len(selected_concept_df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "acb592e2", - "metadata": {}, - "outputs": [], - "source": [ - "os.environ.get('SOURCE_PATH')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c6b7cfd3", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "omop-alchemy (3.13.3)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/01_validate_model.ipynb b/notebooks/01_validate_model.ipynb deleted file mode 100644 index b18e149..0000000 --- a/notebooks/01_validate_model.ipynb +++ /dev/null @@ 
-1,255 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "3175451e", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2026-01-22 15:26:50,588 | INFO | sql_loader.omop_alchemy.config | Environment variables loaded from .env file\n", - "2026-01-22 15:26:50,589 | INFO | sql_loader.omop_alchemy.config | Default database engine configured\n" - ] - } - ], - "source": [ - "from orm_loader.registry import ModelRegistry, ValidationRunner, always_on_validators\n", - "from orm_loader.helpers import configure_logging, bootstrap\n", - "from omop_alchemy.cdm.specification import TABLE_LEVEL_CSV, FIELD_LEVEL_CSV\n", - "from omop_alchemy import get_engine_name, load_environment, TEST_PATH, ROOT_PATH\n", - "import sqlalchemy as sa\n", - "from sqlalchemy.orm import sessionmaker\n", - "\n", - "configure_logging()\n", - "load_environment()\n", - "\n", - "engine_string = get_engine_name()\n", - "engine = sa.create_engine(engine_string, future=True, echo=False)\n", - "registry = ModelRegistry(model_name='CDM', model_version=\"5.4\")\n", - "\n", - "registry.load_table_specs(\n", - " table_csv=TABLE_LEVEL_CSV,\n", - " field_csv=FIELD_LEVEL_CSV,\n", - ")\n", - "\n", - "registry.discover_models(\"omop_alchemy.cdm.model\")\n", - "bootstrap(engine, create=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "9875dc2f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['concept_synonym',\n", - " 'observation_period',\n", - " 'observation',\n", - " 'payer_plan_period',\n", - " 'dose_era']" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "list(registry.known_tables())[:5]" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "4e144e8a", - "metadata": {}, - "outputs": [], - "source": [ - "validators = always_on_validators()\n", - "runner = ValidationRunner(\n", - " validators=validators,\n", - " 
fail_fast=False,\n", - ")\n", - "report = runner.run(registry)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "75a09c70", - "metadata": {}, - "outputs": [], - "source": [ - "# report = registry.validate(engine=engine, check_domain_semantics=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "9cfa9046", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "MODEL v5.4: 0 error(s), 28 warning(s), 8 info\n" - ] - } - ], - "source": [ - "print(report.summary())" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "a8fea713", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "📦 cdm_source\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: cdm_source_name) Hint: ORM primary key not marked as primary key in specification\n", - "\n", - "📦 cohort\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: cohort_definition_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: subject_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ COMPOSITE_PRIMARY_KEY Hint: Composite primary key detected\n", - "\n", - "📦 cohort_definition\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: cohort_definition_id) Hint: ORM primary key not marked as primary key in specification\n", - "\n", - "📦 concept_ancestor\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: ancestor_concept_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: descendant_concept_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ COMPOSITE_PRIMARY_KEY Hint: Composite primary key detected\n", - "\n", - "📦 concept_relationship\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: concept_id_1) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ 
PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: concept_id_2) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: relationship_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ COMPOSITE_PRIMARY_KEY Hint: Composite primary key detected\n", - "\n", - "📦 concept_synonym\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: concept_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: concept_synonym_name) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: language_concept_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ COMPOSITE_PRIMARY_KEY Hint: Composite primary key detected\n", - "\n", - "📦 death\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: person_id) Hint: ORM primary key not marked as primary key in specification\n", - "\n", - "📦 drug_strength\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: drug_concept_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: ingredient_concept_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ COMPOSITE_PRIMARY_KEY Hint: Composite primary key detected\n", - "\n", - "📦 episode\n", - " ⚠️ FOREIGN_KEY_NOT_IN_SPEC (field: episode_parent_id) Hint: ORM defines FK but specification does not\n", - "\n", - "📦 episode_event\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: episode_event_field_concept_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: episode_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: event_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ COMPOSITE_PRIMARY_KEY Hint: Composite primary key 
detected\n", - "\n", - "📦 fact_relationship\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: domain_concept_id_1) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: domain_concept_id_2) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: fact_id_1) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: fact_id_2) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: relationship_concept_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ COMPOSITE_PRIMARY_KEY Hint: Composite primary key detected\n", - "\n", - "📦 relationship\n", - " ⚠️ FOREIGN_KEY_NOT_IN_SPEC (field: reverse_relationship_id) Hint: ORM defines FK but specification does not\n", - "\n", - "📦 source_to_concept_map\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: source_code) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: source_concept_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ PRIMARY_KEY_NOT_DECLARED_IN_SPEC (field: source_vocabulary_id) Hint: ORM primary key not marked as primary key in specification\n", - " ⚠️ COMPOSITE_PRIMARY_KEY Hint: Composite primary key detected\n" - ] - } - ], - "source": [ - "if not report.is_valid():\n", - " print(report.render_text_report())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6086ccff", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c827c762", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3551f2f9", - "metadata": {}, - "outputs": [], - "source": [ - "for table, spec in registry._table_specs.items():\n", - 
" print(f\"{table}: {spec.is_required}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9585d76b", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2be13a79", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "omop-alchemy (3.13.3)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/02_test_load.ipynb b/notebooks/02_test_load.ipynb deleted file mode 100644 index dadd78f..0000000 --- a/notebooks/02_test_load.ipynb +++ /dev/null @@ -1,476 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "67fe4629", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2026-01-23 17:36:30,283 | INFO | sql_loader.omop_alchemy.config | Environment variables loaded from .env file\n", - "2026-01-23 17:36:30,283 | INFO | sql_loader.omop_alchemy.config | Default database engine configured\n" - ] - } - ], - "source": [ - "import sqlalchemy as sa\n", - "import pandas as pd\n", - "\n", - "from orm_loader.helpers import configure_logging, bootstrap, explain_sqlite_fk_error, bulk_load_context, configure_logging\n", - "from sqlalchemy.orm import sessionmaker\n", - "from sqlalchemy.exc import IntegrityError\n", - "\n", - "from random import randint, choice\n", - "import numpy as np\n", - "from orm_loader.loaders.loader_interface import ParquetLoader, PandasLoader\n", - "\n", - "from sqlalchemy.orm import Session\n", - "from omop_alchemy.cdm.model.health_system import Location, Care_Site, Provider, Visit_Detail, Visit_Occurrence\n", - "from 
omop_alchemy.cdm.model.clinical import Person, Condition_Occurrence, Procedure_Occurrence, Death, Specimen, Drug_Exposure, Measurement, Observation\n", - "from omop_alchemy.cdm.model.structural import Episode, Episode_Event\n", - "from omop_alchemy.cdm.model.derived import Observation_Period\n", - "from datetime import date, timedelta\n", - "from omop_alchemy import get_engine_name, load_environment, TEST_PATH, ROOT_PATH\n", - "\n", - "from omop_alchemy.cdm.model.vocabulary import (\n", - " Domain,\n", - " Vocabulary,\n", - " Concept_Class,\n", - " Relationship,\n", - " Concept,\n", - " Concept_Ancestor,\n", - " Concept_Relationship,\n", - " Concept_Synonym,\n", - " Concept_Synonym,\n", - ")\n", - "\n", - "ATHENA_INITIAL_LOAD = [\n", - " Domain,\n", - " Vocabulary,\n", - " Concept_Class,\n", - " Relationship,\n", - " Concept\n", - "]\n", - "\n", - "ATHENA_SUBSEQUENT_LOAD = [\n", - " Concept_Ancestor,\n", - " Concept_Relationship,\n", - " Concept_Synonym\n", - "]\n", - "\n", - "configure_logging()\n", - "load_environment()\n", - "\n", - "engine_string = get_engine_name()\n", - "engine = sa.create_engine(engine_string, future=True, echo=False)\n", - "bootstrap(engine, create=True)\n", - "Session = sessionmaker(bind=engine, future=True)\n", - "session = Session()\n", - "p = PandasLoader()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "433ced72", - "metadata": {}, - "outputs": [], - "source": [ - "base_path = TEST_PATH / \"fixtures\" / \"athena_source\"\n", - "\n", - "# uncomment this line if you want to load the full athena source from env var\n", - "# instead of the minimal test fixture set for rapid access\n", - "\n", - "# base_path = Path(os.environ['SOURCE_PATH'])" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "82601899", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2026-01-23 17:36:33,728 | INFO | sql_loader.orm_loader.helpers.bulk | Disabled foreign key checks for 
bulk load\n", - "Staging table _staging_vocabulary does not exist; recreating\n", - "Staging table _staging_concept_class does not exist; recreating\n", - "Staging table _staging_relationship does not exist; recreating\n", - "Staging table _staging_concept does not exist; recreating\n", - "Found 1 rows with unexpected nulls in concept.vocabulary_id\n", - "2026-01-23 17:36:34,375 | INFO | sql_loader.orm_loader.helpers.bulk | Re-enabled foreign key checks after bulk load\n" - ] - } - ], - "source": [ - "# Initial load of core vocabulary tables - use bulk load to ensure mutual FK constraints are handled (trusted sources only)\n", - "\n", - "with bulk_load_context(session):\n", - " for model in ATHENA_INITIAL_LOAD:\n", - " _ = model.load_csv(\n", - " session,\n", - " base_path / f\"{model.__tablename__.upper()}.csv\",\n", - " dedupe=True,\n", - " merge_strategy=\"upsert\",\n", - " loader=p,\n", - " )\n", - " session.commit()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "dcf65010", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2026-01-23 17:36:37,680 | INFO | sql_loader.orm_loader.helpers.bulk | Disabled foreign key checks for bulk load\n", - "Staging table _staging_concept_ancestor does not exist; recreating\n", - "Staging table _staging_concept_relationship does not exist; recreating\n", - "Staging table _staging_concept_synonym does not exist; recreating\n", - "2026-01-23 17:36:39,350 | INFO | sql_loader.orm_loader.helpers.bulk | Re-enabled foreign key checks after bulk load\n" - ] - } - ], - "source": [ - "# can still turn off FK checks for speed but mutual dependency is not an issue for this one \n", - "# has been updated to use merge strategy to handle duplicates\n", - "\n", - "with bulk_load_context(session):\n", - " for model in ATHENA_SUBSEQUENT_LOAD:\n", - " _ = model.load_csv(\n", - " session,\n", - " base_path / f\"{model.__tablename__.upper()}.csv\",\n", - " dedupe=True,\n", - " 
chunksize=5000,\n", - " merge_strategy=\"upsert\",\n", - " )\n", - " session.commit()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eac7991f", - "metadata": {}, - "outputs": [], - "source": [ - "concept_by_domain = pd.DataFrame(\n", - " session.query(\n", - " *Concept.__table__.columns\n", - " )\n", - " .filter(\n", - " sa.or_(\n", - " Concept.domain_id.in_(['Gender', 'Ethnicity', 'Race', 'Visit', 'Geography', 'Provider', 'Type Concept']),\n", - " sa.and_(\n", - " Concept.domain_id == 'Condition',\n", - " Concept.vocabulary_id == 'ICDO3'\n", - " )\n", - " )\n", - " )\n", - ")\n", - "\n", - "avail_gender = list(concept_by_domain[concept_by_domain.domain_id=='Gender'].concept_id)\n", - "avail_ethnicity = list(concept_by_domain[concept_by_domain.domain_id=='Ethnicity'].concept_id)\n", - "avail_race = list(concept_by_domain[concept_by_domain.domain_id=='Race'].concept_id)\n", - "avail_place_of_service = list(concept_by_domain[concept_by_domain.domain_id=='Visit'].concept_id)\n", - "avail_country = list(concept_by_domain[concept_by_domain.concept_class_id=='Location'].concept_id)\n", - "avail_provider = list(concept_by_domain[concept_by_domain.domain_id=='Provider'].concept_id)\n", - "avail_types = list(concept_by_domain[concept_by_domain.domain_id=='Type Concept'].concept_id)\n", - "\n", - "cancers = list(concept_by_domain[(concept_by_domain.domain_id=='Condition')&(concept_by_domain.vocabulary_id=='ICDO3') & (concept_by_domain.concept_code.str.contains('/3'))].concept_id)\n", - "\n", - "staging_parents = pd.DataFrame(\n", - " session.query(\n", - " *Concept.__table__.columns\n", - " )\n", - " .join(Concept_Ancestor, Concept.concept_id==Concept_Ancestor.descendant_concept_id)\n", - " .filter(Concept_Ancestor.ancestor_concept_id==734320)\n", - " .filter(Concept_Ancestor.max_levels_of_separation==1)\n", - ")\n", - "\n", - "staging_sets = {}\n", - "\n", - "for axis in ['T', 'N', 'M', 'Stage']:\n", - " parents = 
list(staging_parents[staging_parents.concept_name.str.contains(axis)].concept_id)\n", - " s = pd.DataFrame(\n", - " session.query(\n", - " *Concept.__table__.columns\n", - " )\n", - " .join(Concept_Ancestor, Concept.concept_id==Concept_Ancestor.descendant_concept_id)\n", - " .filter(Concept_Ancestor.ancestor_concept_id.in_(parents))\n", - " .filter(Concept.concept_code.ilike('%8th%'))\n", - " .filter(~Concept.concept_code.ilike('%yp%'))\n", - " )\n", - " staging_sets[axis] = s" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "41e86f41", - "metadata": {}, - "outputs": [], - "source": [ - "# confirming string hack to identify staging axes does work as expected\n", - "# staging_sets['Stage'].concept_code.map(lambda x: x.split('-')[-1]).value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "dc70fc6b", - "metadata": {}, - "outputs": [], - "source": [ - "# these are super-naive and brute-force ways to populate very basic test data - good enough for now - better content coming\n", - "\n", - "def populate_reference_data(session):\n", - " \n", - " loc_ids = Location.allocator(session)\n", - " cs_ids = Care_Site.allocator(session)\n", - " pro_ids = Provider.allocator(session)\n", - " \n", - " location_data = [{'location_id': loc_ids.next(), 'country_concept_id': choice(avail_country), 'city': f'City {idx}'} for idx in range(10)]\n", - " locations = [Location(**row) for row in location_data]\n", - " care_site_data = [{'care_site_id': cs_ids.next(), 'care_site_name': f'Care Site {idx}', 'location_id': choice(locations).location_id, 'place_of_service_concept_id': choice(avail_place_of_service)} for idx in range(30)]\n", - " care_sites = [Care_Site(**row) for row in care_site_data]\n", - " provider_data = [{'provider_id': pro_ids.next(), 'specialty_concept_id': choice(avail_provider), 'gender_concept_id': choice(avail_gender), 'care_site_id': choice(care_sites).care_site_id} for _ in range(50)]\n", - " providers = 
[Provider(**row) for row in provider_data]\n", - "\n", - " session.add_all(locations)\n", - " session.add_all(care_sites)\n", - " session.add_all(providers)\n", - " session.commit()\n", - "\n", - " return locations, care_sites, providers\n", - "\n", - "def populate_people_and_visits(session, care_sites):\n", - " \n", - " person_ids = Person.allocator(session)\n", - " visit_ids = Visit_Occurrence.allocator(session)\n", - " \n", - " person_data = [{'person_id': person_ids.next(), 'year_of_birth': randint(1950, 2020), 'month_of_birth': randint(1, 12), 'gender_concept_id':choice(avail_gender), 'race_concept_id':choice(avail_race), 'ethnicity_concept_id':choice(avail_ethnicity)} for idx in range(1000)]\n", - " people = [Person(**row) for row in person_data]\n", - "\n", - " visits = []\n", - " for person in people:\n", - " cs = choice(care_sites)\n", - " visit_num = randint(1, 3)\n", - " for v in range(visit_num):\n", - " days_delay = randint(0, 365)\n", - " visit_date = date(2020, 1, 1) + timedelta(days_delay)\n", - " visit = Visit_Occurrence(\n", - " visit_occurrence_id=visit_ids.next(),\n", - " person_id=person.person_id,\n", - " care_site_id=cs.care_site_id,\n", - " visit_concept_id=choice(avail_place_of_service),\n", - " visit_start_date=visit_date,\n", - " visit_end_date=visit_date,\n", - " )\n", - " visits.append(visit)\n", - " session.add_all(people)\n", - " session.add_all(visits)\n", - " session.commit()\n", - " return people, visits\n", - "\n", - "def populate_observation_periods(session):\n", - " op_ids = Observation_Period.allocator(session)\n", - " deaths = []\n", - " rows = (\n", - " session.query(\n", - " Visit_Occurrence.person_id,\n", - " sa.func.min(Visit_Occurrence.visit_start_date).label(\"start\"),\n", - " sa.func.max(Visit_Occurrence.visit_end_date).label(\"end\"),\n", - " Death.death_date,\n", - " Observation_Period.observation_period_id\n", - " )\n", - " .join(Death, Death.person_id==Visit_Occurrence.person_id, isouter=True)\n", - " 
.join(Observation_Period, Observation_Period.person_id==Visit_Occurrence.person_id, isouter=True)\n", - " .filter(Observation_Period.observation_period_id==None)\n", - " .group_by(Visit_Occurrence.person_id)\n", - " .all()\n", - " )\n", - " obs = []\n", - " for idx, r in enumerate(rows):\n", - " deceased = np.random.choice([True, False], p=[0.05, 0.95])\n", - " if deceased:\n", - " death_date = r.end + timedelta(days=randint(1, 365))\n", - " deaths.append(\n", - " Death(\n", - " person_id=r.person_id,\n", - " death_date=death_date,\n", - " death_type_concept_id=choice(avail_types),\n", - " )\n", - " )\n", - " obs_end = death_date\n", - " else:\n", - " obs_end = r.end\n", - " obs.append(\n", - " Observation_Period(\n", - " observation_period_id=op_ids.next(),\n", - " person_id=r.person_id,\n", - " observation_period_start_date=r.start,\n", - " observation_period_end_date=obs_end,\n", - " period_type_concept_id=choice(avail_types),\n", - " )\n", - " )\n", - " session.add_all(deaths)\n", - " session.add_all(obs)\n", - " session.commit()\n", - " return obs\n", - "\n", - "def populate_conditions_and_modifiers(session):\n", - " cond_ids = Condition_Occurrence.allocator(session)\n", - " meas_ids = Measurement.allocator(session)\n", - " ep_ids = Episode.allocator(session)\n", - " rows = (\n", - " session.query(\n", - " Observation_Period, Death, Condition_Occurrence\n", - " )\n", - " .join(Death, Observation_Period.person_id==Death.person_id, isouter=True)\n", - " .join(Condition_Occurrence, Observation_Period.person_id==Condition_Occurrence.person_id, isouter=True)\n", - " .all()\n", - " )\n", - " conditions = []\n", - " measurements = []\n", - " episodes = []\n", - " episode_events = []\n", - " for obs, death, condition in rows:\n", - " if condition:\n", - " continue\n", - " t = choice(list(staging_sets['T'].concept_id))\n", - " n = choice(list(staging_sets['N'].concept_id))\n", - " m = choice(list(staging_sets['M'].concept_id))\n", - " # don't worry abt overall stage 
for now as it should be calculated\n", - " condition_concept = choice(cancers)\n", - " condition = Condition_Occurrence(\n", - " condition_occurrence_id=cond_ids.next(),\n", - " condition_concept_id = condition_concept,\n", - " condition_start_date = obs.observation_period_start_date,\n", - " condition_type_concept_id = choice(avail_types),\n", - " person_id = obs.person_id,\n", - " condition_status_concept_id = 32902\n", - " )\n", - " conditions.append(condition)\n", - " episode = Episode(\n", - " episode_id=ep_ids.next(),\n", - " person_id=obs.person_id,\n", - " episode_concept_id=32533, # Episode of care\n", - " episode_object_concept_id=condition.condition_concept_id,\n", - " episode_start_date=condition.condition_start_date,\n", - " episode_end_date=(\n", - " death.death_date if death else obs.observation_period_end_date\n", - " ),\n", - " episode_type_concept_id=choice(avail_types), # EHR / registry / derived\n", - " )\n", - " episodes.append(episode)\n", - "\n", - " for stage in [t, n, m]:\n", - " measurement = Measurement(\n", - " person_id = obs.person_id,\n", - " measurement_id = meas_ids.next(),\n", - " measurement_concept_id = stage,\n", - " measurement_event_id = condition.condition_occurrence_id,\n", - " meas_event_field_concept_id = 1147127, # condition_occurrence.condition_occurrence_id\n", - " measurement_date = condition.condition_start_date,\n", - " measurement_type_concept_id = choice(avail_types),\n", - " value_as_number = 1\n", - " )\n", - " measurements.append(measurement)\n", - " episode_events.append(\n", - " Episode_Event(\n", - " episode_id=episode.episode_id,\n", - " event_id=measurement.measurement_id,\n", - " episode_event_field_concept_id=1147138, # measurement.measurement_id\n", - " )\n", - " )\n", - " episode_events.append(\n", - " Episode_Event(\n", - " episode_id=episode.episode_id,\n", - " event_id=condition.condition_occurrence_id,\n", - " episode_event_field_concept_id=1147127, # condition_occurrence.condition_occurrence_id\n", 
- " )\n", - " )\n", - " session.add_all(conditions)\n", - " session.add_all(measurements)\n", - " session.add_all(episodes)\n", - " session.add_all(episode_events)\n", - " session.commit()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b7ccb46a", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "97d76a3f", - "metadata": {}, - "outputs": [], - "source": [ - "with Session() as sess:\n", - " populate_reference_data(sess)\n", - " sess.commit()\n", - " care_sites = sess.query(Care_Site).all()\n", - "\n", - "with Session() as sess:\n", - " populate_people_and_visits(sess, care_sites)\n", - " populate_observation_periods(sess)\n", - "\n", - "with Session() as sess:\n", - " populate_conditions_and_modifiers(sess)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e57318e0", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a241ac28", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "omop-alchemy (3.13.3)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/03_basic_model_query_demo.ipynb b/notebooks/03_basic_model_query_demo.ipynb deleted file mode 100644 index caec0f8..0000000 --- a/notebooks/03_basic_model_query_demo.ipynb +++ /dev/null @@ -1,1205 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "766a9e4a", - "metadata": {}, - "source": [ - "This notebook is a simple demo to introduce some of the fundamental design patterns from the OMOP_Alchemy library " - ] - }, - { - "cell_type": "code", - 
"execution_count": 1, - "id": "634ae11f", - "metadata": {}, - "outputs": [], - "source": [ - "import sqlalchemy as sa\n", - "from sqlalchemy.orm import sessionmaker\n", - "from omop_alchemy.cdm.model.vocabulary import Concept, ConceptView, Domain, Vocabulary, Concept_Class\n", - "from orm_loader.helpers import configure_logging, bootstrap, bulk_load_context\n", - "from omop_alchemy import get_engine_name, load_environment, TEST_PATH, ROOT_PATH\n", - "from omop_alchemy.cdm.model.clinical import Condition_Occurrence, Condition_OccurrenceView\n", - "from omop_alchemy.cdm.model.structural import EpisodeView, Episode_EventView" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "5c3184bb", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2026-01-22 15:27:38,567 | INFO | sql_loader.omop_alchemy.config | Environment variables loaded from .env file\n", - "2026-01-22 15:27:38,568 | INFO | sql_loader.omop_alchemy.config | Default database engine configured\n" - ] - } - ], - "source": [ - "# this demo assumes that you have created a .env file in the ROOT_PATH with your database connection string - see .example_dotenv for details\n", - "\n", - "configure_logging()\n", - "load_environment()\n", - "engine_string = get_engine_name()\n", - "\n", - "engine = sa.create_engine(engine_string, future=True, echo=False)\n", - "bootstrap(engine, create=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "fe73295d", - "metadata": {}, - "outputs": [], - "source": [ - "Session = sessionmaker(bind=engine, future=True)\n", - "session = Session()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "8943cd87", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "c = session.query(Concept).first()\n", - "c" - ] - }, - { - "cell_type": "code", - "execution_count": 
5, - "id": "7e2c50e9", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'concept_id': 1,\n", - " 'concept_name': 'Domain',\n", - " 'domain_id': 'Metadata',\n", - " 'vocabulary_id': 'Domain',\n", - " 'concept_class_id': 'Domain',\n", - " 'concept_code': 'OMOP generated',\n", - " 'valid_start_date': datetime.date(1970, 1, 1),\n", - " 'valid_end_date': datetime.date(2099, 12, 31)}" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "c.to_dict()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "e0939c75", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'{\"concept_class_id\": \"Domain\", \"concept_code\": \"OMOP generated\", \"concept_id\": 1, \"concept_name\": \"Domain\", \"domain_id\": \"Metadata\", \"valid_end_date\": \"2099-12-31\", \"valid_start_date\": \"1970-01-01\", \"vocabulary_id\": \"Domain\"}'" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "c.to_json()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "dcc041a4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[(22274, 'Neoplasm of uncertain behavior of larynx', 'S'),\n", - " (22281, 'Sickle cell-hemoglobin SS disease', 'S'),\n", - " (22288, 'Hereditary elliptocytosis', 'S'),\n", - " (22340, 'Esophageal varices without bleeding', 'S'),\n", - " (22350, 'Edema of larynx', 'S')]" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "standard_conditions = (\n", - " session.query(Concept)\n", - " .filter(\n", - " Concept.domain_id == \"Condition\",\n", - " Concept.standard_concept == \"S\",\n", - " )\n", - " .limit(5)\n", - " .all()\n", - ")\n", - "\n", - "[(c.concept_id, c.concept_name, c.standard_concept) for c in standard_conditions]\n" - ] - }, - { - "cell_type": "markdown", - "id": "b524d61d", - "metadata": {}, - "source": [ - 
"`Concept` is the basic class that you should be using for most ETL steps, but for introspection of relationships (including the triggering of lazy loads), `ConceptView` offers much richer expressions.\n", - "\n", - "This is separated to ensure speed of base class is maintained, while optimising the potential benefits of fully-described object relationships" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "4ae51dea", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cv = session.query(ConceptView).first()\n", - "cv" - ] - }, - { - "cell_type": "markdown", - "id": "3df3e3fb", - "metadata": {}, - "source": [ - "`domain_id` is the actual string content of the column that was returned from the query already performed, where `cv.domain` returns a related Domain object" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "3211247e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "('Metadata',\n", - " str,\n", - " ,\n", - " omop_alchemy.cdm.model.vocabulary.domain.Domain,\n", - " ,\n", - " omop_alchemy.cdm.model.vocabulary.vocabulary.Vocabulary)" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cv.domain_id, type(cv.domain_id), cv.domain, type(cv.domain), cv.vocabulary, type(cv.vocabulary)" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "b51388fe", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Hospital admission'" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# because concept ancestor and concept relationship are very large tables, ConceptView relationships have \n", - "# been set to lazy='select', these relationships will not load until accessed\n", - "\n", - "concepts = (\n", - " session.query(ConceptView)\n", 
- " .filter(ConceptView.vocabulary_id == 'SNOMED')\n", - " .filter(ConceptView.standard_concept == 'S')\n", - " .limit(30)\n", - ")\n", - "\n", - "concepts[0].concept_name" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "5a36bca3", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "8715 Hospital admission 5 219 361 361\n", - "9173 Inactive 5 1 7 7\n" - ] - } - ], - "source": [ - "# get details about concept dynamically - ancestors, descendants, relationships\n", - "\n", - "# because of the deferred loading strategy, these relationships will now be querying \n", - "# those tables once for every print statement in the below loop - very efficient for\n", - "# single concepts, not for sets of concepts\n", - "\n", - "for concept in concepts[:2]:\n", - " print(\n", - " concept.concept_id,\n", - " concept.concept_name,\n", - " len(concept.ancestors),\n", - " len(concept.descendants),\n", - " len(concept.incoming_relationships),\n", - " len(concept.outgoing_relationships),\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "5a6d3413", - "metadata": {}, - "outputs": [], - "source": [ - "# when known in advance that these relationships will be needed, use joined loading to\n", - "# load them in the original query and only hit the big table once\n", - "\n", - "from sqlalchemy.orm import selectinload\n", - "\n", - "def concept_hierarchy_bundle():\n", - " return (\n", - " selectinload(ConceptView.ancestors),\n", - " selectinload(ConceptView.descendants),\n", - " )\n", - "\n", - "def concept_relationship_bundle():\n", - " return (\n", - " selectinload(ConceptView.incoming_relationships),\n", - " selectinload(ConceptView.outgoing_relationships),\n", - " )\n", - "\n", - "concepts = (\n", - " session.query(ConceptView)\n", - " .filter(ConceptView.vocabulary_id == 'SNOMED')\n", - " .filter(ConceptView.standard_concept == 'S')\n", - " .options(\n", - " *concept_hierarchy_bundle(),\n", - 
" *concept_relationship_bundle()\n", - " )\n", - " .limit(30)\n", - " .all()\n", - ")\n" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "55633a75", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "8715 Hospital admission 5 219 361 361\n", - "9173 Inactive 5 1 7 7\n", - "9174 Obsolete 5 1 7 7\n", - "9176 Patient status determination, deceased 4 7 12 12\n", - "9177 Other 5 1 9 9\n", - "9181 Active 5 1 7 7\n", - "9189 Negative 4 1 184 184\n", - "9190 Not detected 4 3 213 213\n", - "9191 Positive 7 6 231 231\n", - "9192 Trace 6 1 20 20\n", - "22274 Neoplasm of uncertain behavior of larynx 36 45 49 49\n", - "22281 Sickle cell-hemoglobin SS disease 35 12 74 74\n", - "22288 Hereditary elliptocytosis 44 10 49 49\n", - "22340 Esophageal varices without bleeding 29 1 30 30\n", - "22350 Edema of larynx 16 9 39 39\n", - "22426 Congenital macrostomia 30 5 35 35\n", - "22492 Foreign body in pharynx 26 13 60 60\n", - "22557 Malignant tumor of submandibular gland 49 182 18 18\n", - "22665 Chronic peptic ulcer with hemorrhage AND with perforation but without obstruction 33 1 17 17\n", - "22666 Vomiting after gastrointestinal tract surgery 18 3 21 21\n", - "22722 Accessory salivary gland 33 2 17 17\n", - "22820 Tuberculosis of esophagus 36 1 26 26\n", - "22839 Overlapping malignant neoplasm of larynx 38 1 23 23\n", - "22856 Polyglandular dysfunction 6 21 65 65\n", - "22871 Neoplasm of uncertain behavior of pineal gland 44 11 36 36\n", - "22945 Horizontal overbite 22 1 20 20\n", - "22955 Perforation of esophagus 22 3 28 28\n", - "23034 Neonatal hypoglycemia 14 7 35 35\n", - "23137 Chlamydial pharyngitis 44 1 28 28\n", - "23164 Disorder of anterior pituitary 13 149 57 57\n" - ] - } - ], - "source": [ - "for concept in concepts:\n", - " print(\n", - " concept.concept_id,\n", - " concept.concept_name,\n", - " len(concept.ancestors),\n", - " len(concept.descendants),\n", - " len(concept.incoming_relationships),\n", - 
" len(concept.outgoing_relationships),\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "a53f0b85", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(36402497, 'Round cell liposarcoma of unknown primary site')" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "row = (\n", - " session.query(Condition_Occurrence, Concept)\n", - " .join(Concept, Condition_Occurrence.condition_concept_id == Concept.concept_id)\n", - " .first()\n", - ")\n", - "\n", - "row[0].condition_concept_id, row[1].concept_name" - ] - }, - { - "cell_type": "markdown", - "id": "2954093f", - "metadata": {}, - "source": [ - "we don't want to be needing to define joins every time, but equally we don't want to force the loading of relationships that are not required for simple queries.\n", - "this is why they are separated out into View classes, but they can be very useful for exploration, as well as for serialisation to downstream apis" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "19cad800", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(36402497, 'Round cell liposarcoma of unknown primary site')" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "row = (\n", - " session.query(Condition_OccurrenceView)\n", - " .first()\n", - ")\n", - "\n", - "row.condition_concept_id, row.condition_concept.concept_name" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "9370cbc3", - "metadata": {}, - "outputs": [], - "source": [ - "from omop_alchemy.cdm.model.clinical import Person, PersonView\n", - "from omop_alchemy.cdm.model.health_system import Location, Provider, Care_Site" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "3b1f85f4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 17, - "metadata": {}, 
- "output_type": "execute_result" - } - ], - "source": [ - "p = session.query(Person).first()\n", - "p" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "c44f77ac", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'_sa_instance_state': ,\n", - " 'ethnicity_concept_id': 38003564,\n", - " 'gender_source_value': None,\n", - " 'year_of_birth': 1976,\n", - " 'gender_source_concept_id': None,\n", - " 'race_source_value': None,\n", - " 'person_id': 1,\n", - " 'race_source_concept_id': None,\n", - " 'ethnicity_source_value': None,\n", - " 'month_of_birth': 12,\n", - " 'ethnicity_source_concept_id': None,\n", - " 'visit_occurrence_id': None,\n", - " 'day_of_birth': None,\n", - " 'location_id': None,\n", - " 'visit_detail_id': None,\n", - " 'birth_datetime': None,\n", - " 'provider_id': None,\n", - " 'gender_concept_id': 45518388,\n", - " 'care_site_id': None,\n", - " 'race_concept_id': 45456238,\n", - " 'person_source_value': None}" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# simple person class that just has the raw column data - flat, predictable, and cheap to load - no joins and no lazy relationships\n", - "p.__dict__" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "e9910b9c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# subtle in this example, but personview has actually loaded the gender concept relationship to print the label instead of the raw concept_id\n", - "pv = session.query(PersonView).first()\n", - "pv" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "b0fd6101", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "('Gender unknown', 'Ethnic category - 2001 census', 'Not Hispanic or Latino')" - ] - }, - "execution_count": 20, - "metadata": {}, - 
"output_type": "execute_result" - } - ], - "source": [ - "pv.gender.concept_name, pv.race.concept_name, pv.ethnicity.concept_name" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "9d8e2932", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'gender_concept_id': ,\n", - " 'race_concept_id': ,\n", - " 'ethnicity_concept_id': }" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "PersonView.__expected_domains__" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "4f33223a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "p = session.query(PersonView).first()\n", - "p" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "9c059b4b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "p.domain_violations" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "8580aa91", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "wrong_concept = (\n", - " session.query(Concept)\n", - " .filter(Concept.domain_id == \"Condition\")\n", - " .first()\n", - ")\n", - "wrong_concept" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "930f8d2e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[DomainRule(table='person', field='gender_concept_id', allowed_domains={'Gender'}, allowed_classes=None),\n", - " DomainRule(table='person', field='race_concept_id', allowed_domains={'Race'}, allowed_classes=None),\n", - " DomainRule(table='person', field='ethnicity_concept_id', allowed_domains={'Ethnicity'}, 
allowed_classes=None)]" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "PersonView.collect_domain_rules()" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "192eb5ba", - "metadata": {}, - "outputs": [], - "source": [ - "p.gender_concept_id = wrong_concept.concept_id" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "2ee06bb4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "False" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "p.is_domain_valid" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "feb164dd", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[\"gender_concept_id not in domain(s): ['Gender']\"]" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# we can do application-side validation of domain rules \n", - "# tbc if this can be made more efficient at scale to truly support ETL \n", - "# so that we can move it to the base class?\n", - "p.domain_violations" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "a5a313da", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "50" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# age as a hybrid property\n", - "from datetime import date\n", - "pv.age" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "85046519", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "44" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pv.age_at(date(2020, 1, 1))" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "efbe1fc7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[,\n", - " ,\n", - " ,\n", - " ,\n", 
- " ]" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# because we are using a hybrid property, we can filter on it in queries - same logic but two execution modes\n", - "(\n", - " session.query(PersonView)\n", - " .filter(PersonView.age_at(date(2020, 1, 1)) >= 65)\n", - " .limit(5)\n", - " .all()\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "b7de12c1", - "metadata": {}, - "outputs": [], - "source": [ - "# if using the base Person class, we would need to do the age calculation in the query itself\n", - "from sqlalchemy import func\n", - "on = date(2020, 1, 1)\n", - "q = (\n", - " session.query(Person)\n", - " .filter((sa.func.extract(\"year\", sa.literal(on)) - Person.year_of_birth) >= 65)\n", - " .limit(5)\n", - " .all()\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "bc2374f3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[, , , , ]" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# this is a trivial example in this case but in the instance of joined elements it can make a big difference in expressiveness / formalism of complex definitions\n", - "q" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "54c9ec02", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[,\n", - " ,\n", - " ,\n", - " ,\n", - " ]" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "session.query(PersonView).filter(PersonView.under_observation_on(date(2020, 6, 1))).all()[:5]" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "a0b86693", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " ,\n", - " ]" - ] - }, - "execution_count": 35, - "metadata": {}, - 
"output_type": "execute_result" - } - ], - "source": [ - "cohort = (\n", - " session.query(PersonView)\n", - " .filter(\n", - " PersonView.age_at(date(2020, 1, 1)) >= 18,\n", - " PersonView.is_deceased == True,\n", - " )\n", - " .limit(10)\n", - " .all()\n", - ")\n", - "\n", - "cohort" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "4f77674c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'person_id': 1,\n", - " 'year_of_birth': 1976,\n", - " 'month_of_birth': 12,\n", - " 'gender_concept_id': 8689,\n", - " 'race_concept_id': 45456238,\n", - " 'ethnicity_concept_id': 38003564}" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cohort[0].to_dict()" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "69fff20b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "cohort[0].death" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "00c0f530", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[]" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pv.observation_periods" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "61cbed1a", - "metadata": {}, - "outputs": [], - "source": [ - "q = (\n", - " session.query(PersonView)\n", - " .filter(PersonView.first_observation_date >= date(2020, 10, 1))\n", - " .filter(PersonView.last_observation_date <= date(2021, 10, 31))\n", - ").all()\n" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "id": "07d6911c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "96" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(q)" - ] - }, - { - "cell_type": "code", - 
"execution_count": 41, - "id": "50ada151", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ep = session.query(EpisodeView).first()\n", - "ep" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "id": "46f0b554", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "('Disease Episode', 'Round cell liposarcoma of unknown primary site')" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ep.episode_concept.concept_name, ep.episode_object_concept.concept_name" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "34dfe21a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[,\n", - " ,\n", - " ,\n", - " ]" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ep.events" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "ad088151", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[,\n", - " ,\n", - " ,\n", - " ]" - ] - }, - "execution_count": 44, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "events = (\n", - " session.query(Episode_EventView)\n", - " .filter(Episode_EventView.episode_id == ep.episode_id)\n", - " .all()\n", - ")\n", - "\n", - "# polymorphic relationship to clinical fact tables can be context aware and resolved dynamically\n", - "events" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "id": "87193c76", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'condition_occurrence'" - ] - }, - "execution_count": 45, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "events[0].event_table" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "851aa001", - "metadata": {}, - "outputs": [ - { - "name": 
"stdout", - "output_type": "stream", - "text": [ - "SELECT episode_event.episode_id, episode_event.event_id, episode_event.episode_event_field_concept_id \n", - "FROM episode_event \n", - "WHERE episode_event.episode_id = 1\n" - ] - } - ], - "source": [ - "q = session.query(Episode_EventView).filter(Episode_EventView.episode_id == ep.episode_id)\n", - "\n", - "print(q.statement.compile(compile_kwargs={\"literal_binds\": True}))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "201386d6", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e828901e", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "omop-alchemy (3.13.3)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/04_timeline.ipynb b/notebooks/04_timeline.ipynb deleted file mode 100644 index 59e747f..0000000 --- a/notebooks/04_timeline.ipynb +++ /dev/null @@ -1,142 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "8deb60a9", - "metadata": {}, - "outputs": [], - "source": [ - "import sqlalchemy as sa\n", - "from sqlalchemy.orm import sessionmaker\n", - "from omop_alchemy.cdm.model.vocabulary import Concept, ConceptView, Domain, Vocabulary, Concept_Class\n", - "from orm_loader.helpers import configure_logging, bootstrap, bulk_load_context\n", - "from omop_alchemy import get_engine_name, load_environment, TEST_PATH, ROOT_PATH\n", - "from omop_alchemy.cdm.model.extended import Person_Timeline" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "deea8749", - "metadata": {}, - "outputs": [ - { - 
"name": "stderr", - "output_type": "stream", - "text": [ - "2026-01-22 15:30:52,347 | INFO | sql_loader.omop_alchemy.config | Environment variables loaded from .env file\n", - "2026-01-22 15:30:52,348 | INFO | sql_loader.omop_alchemy.config | Default database engine configured\n" - ] - } - ], - "source": [ - "\n", - "configure_logging()\n", - "load_environment()\n", - "engine_string = get_engine_name()\n", - "\n", - "engine = sa.create_engine(engine_string, future=True, echo=False)\n", - "bootstrap(engine, create=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "b3e61002", - "metadata": {}, - "outputs": [], - "source": [ - "Session = sessionmaker(bind=engine, future=True)\n", - "session = Session()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "b2e732c1", - "metadata": {}, - "outputs": [], - "source": [ - "people = session.query(Person_Timeline).limit(5).all()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "7446ea16", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[,\n", - " ,\n", - " ,\n", - " ]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "people[0].timeline" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "99c17c10", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['{\"person_id\": 2, \"concept_id\": 1635163, \"event_start\": \"2020-01-03T00:00:00\", \"event_end\": null, \"value\": {\"type\": \"numeric\", \"value\": 1.0}, \"metadata\": {\"unit_concept_id\": null}}',\n", - " '{\"person_id\": 2, \"concept_id\": 1633674, \"event_start\": \"2020-01-03T00:00:00\", \"event_end\": null, \"value\": {\"type\": \"numeric\", \"value\": 1.0}, \"metadata\": {\"unit_concept_id\": null}}',\n", - " '{\"person_id\": 2, \"concept_id\": 1634891, \"event_start\": \"2020-01-03T00:00:00\", \"event_end\": null, \"value\": {\"type\": \"numeric\", \"value\": 1.0}, \"metadata\": 
{\"unit_concept_id\": null}}',\n", - " '{\"condition_concept_id\": 36535612, \"condition_occurrence_id\": 2, \"condition_start_date\": \"2020-01-03\", \"condition_status_concept_id\": 32902, \"condition_type_concept_id\": 3564487, \"person_id\": 2}']" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "people[1].to_json()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eb3b9d11", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "omop-alchemy (3.13.3)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/05_concept_resolver.ipynb b/notebooks/05_concept_resolver.ipynb deleted file mode 100644 index 80da2c0..0000000 --- a/notebooks/05_concept_resolver.ipynb +++ /dev/null @@ -1,308 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "5ebb19b4", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2026-01-26 21:26:57,912 | INFO | sql_loader.omop_alchemy.config | Environment variables loaded from .env file\n", - "2026-01-26 21:26:57,912 | INFO | sql_loader.omop_alchemy.config | Default database engine configured\n" - ] - } - ], - "source": [ - "from orm_loader.helpers import configure_logging, bootstrap\n", - "from omop_alchemy import get_engine_name, load_environment\n", - "import sqlalchemy as sa\n", - "\n", - "configure_logging()\n", - "load_environment()\n", - "\n", - "engine_string = get_engine_name('cdm')\n", - "engine = sa.create_engine(engine_string, future=True, echo=False)\n", - "\n", - "bootstrap(engine, create=True)" - ] - 
}, - { - "cell_type": "code", - "execution_count": 2, - "id": "35e8b1b7", - "metadata": {}, - "outputs": [], - "source": [ - "from omop_alchemy.cdm.model.vocabulary import Concept, Concept_Relationship\n", - "from omop_alchemy.cdm.model.clinical import Condition_Occurrence\n", - "from sqlalchemy.orm import sessionmaker" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "5921d6ac", - "metadata": {}, - "outputs": [], - "source": [ - "Session = sessionmaker(bind=engine, future=True)\n", - "session = Session()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "c5154ea0", - "metadata": {}, - "outputs": [], - "source": [ - "from omop_alchemy.cdm.model.extended.concept_resolver import OMOPConceptResolver, ConceptValidationMixin\n", - "from orm_loader.helpers import Base\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "515d57fe", - "metadata": {}, - "outputs": [], - "source": [ - "related_concept = sa.alias(Concept, name='related_concept')\n", - "\n", - "q = (\n", - " sa.select(\n", - " Concept.concept_id,\n", - " Concept.standard_concept,\n", - " Concept_Relationship.relationship_id,\n", - " related_concept.c.concept_id.label('related_concept_id'),\n", - " related_concept.c.standard_concept.label('related_standard_concept'),\n", - " ).join(\n", - " Concept_Relationship, Concept.concept_id == Concept_Relationship.concept_id_1\n", - " ).join(\n", - " related_concept, Concept_Relationship.concept_id_2 == related_concept.c.concept_id\n", - " ).where(\n", - " Concept_Relationship.relationship_id == 'Subsumes'\n", - " )\n", - ").subquery()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "1372d0dc", - "metadata": {}, - "outputs": [], - "source": [ - "class TestMapper(OMOPConceptResolver, ConceptValidationMixin, Base):\n", - " __table__ = q\n", - "\n", - " concept_id = q.c.concept_id\n", - " standard_concept = q.c.standard_concept\n", - " relationship_id = q.c.relationship_id\n", - " 
related_concept_id = q.c.related_concept_id\n", - " related_standard_concept = q.c.related_standard_concept" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "dfbdb85f", - "metadata": {}, - "outputs": [], - "source": [ - "table = TestMapper.get_queryable_table(session)\n", - "cols = TestMapper.concept_id_columns()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "d62c7f03", - "metadata": {}, - "outputs": [], - "source": [ - "violations = TestMapper.referenced_concept_violations(session)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "52f3fdde", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "b0f313cb", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Invalid Related Concept IDs
037109760
137109761
237109762
342598409
43170326
......
318137109755
318237109756
318337109757
318437109758
318537109759
\n", - "

3186 rows × 1 columns

\n", - "
" - ], - "text/plain": [ - " Invalid Related Concept IDs\n", - "0 37109760\n", - "1 37109761\n", - "2 37109762\n", - "3 42598409\n", - "4 3170326\n", - "... ...\n", - "3181 37109755\n", - "3182 37109756\n", - "3183 37109757\n", - "3184 37109758\n", - "3185 37109759\n", - "\n", - "[3186 rows x 1 columns]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import pandas as pd\n", - "\n", - "pd.DataFrame(violations['related_concept_id'], columns=['Invalid Related Concept IDs'])" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "0da24094", - "metadata": {}, - "outputs": [], - "source": [ - "class CoT(Condition_Occurrence, OMOPConceptResolver, ConceptValidationMixin):\n", - " pass" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "a599d50b", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'condition_type_concept_id': {32544, 32545, 42539609, 45754907}}" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "CoT.referenced_concept_violations(session)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fce09d89", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "omop-alchemy (3.13.3)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/ORMforResearchReadyData_APAC2023.pdf b/notebooks/ORMforResearchReadyData_APAC2023.pdf deleted file mode 100644 index 54d5557..0000000 Binary files a/notebooks/ORMforResearchReadyData_APAC2023.pdf and /dev/null differ diff --git a/notebooks/concept_enums.py 
b/notebooks/concept_enums.py deleted file mode 100644 index da0f4f0..0000000 --- a/notebooks/concept_enums.py +++ /dev/null @@ -1,207 +0,0 @@ -import enum - -class ConceptEnum(enum.Enum): - - @classmethod - def member_values(cls): - return [s.value for s in cls] - - @classmethod - def is_member(cls, val): - return not val or val in [s.value for s in cls] - - @classmethod - def labels(cls): - return [s.name for s in cls] - - @classmethod - def get_name(cls, val): - try: - return cls(val).name - except: - return '' - -class ModifierFields(ConceptEnum): - condition_occurrence_id = 1147127 - drug_exposure_id = 1147707 - procedure_occurrence_id = 1147082 - episode_id = 756290 - -class ModifierTables(ConceptEnum): - drug_exposure = 1147339 - episode = 35225440 - observation = 1147304 - -class TreatmentEpisode(ConceptEnum): - treatment_regimen = 32531 # Assignment to or derivation of chemo treatment regimen - treatment_cycle = 32532 # Assignment to or derivation of chemo treatment cycle - cancer_surgery = 32939 # Surgical treatment episode - radiotherapy = 32940 # Radiotherapy treatment episode - -class Modality(ConceptEnum): - chemotherapy = 35803401 - radiotherapy = 35803411 - -class DiseaseEpisodeConcepts(ConceptEnum): - episode_of_care = 32533 # Overarching disease episode - - confined = 32528 # Confined disease extent - invasive = 32677 # Invasive disease extent - metastatic = 32944 # Invasive disease extent - - stable_disease = 32948 # Stable disease dynamic - disease_progression = 32949 # Progression disease dynamic - partial_response = 32947 # Partial response disease dynamic - complete_response = 32947 # Complete response disease dynamic - -class EpisodeTypes(ConceptEnum): - ehr_defined = 32544 # Episode defined in EHR - ehr_derived = 32545 # Episode derived algorithmically from EHR - ehr_prescription = 32838 # EHR prescription - ehr_planned_dispensing = 32837 # EHR planned dispensation - ehr_encounter_record = 32827 # EHR encounter - ehr_admin_record = 32818 # 
EHR administration record - ehr_outpatient_note = 32834 # EHR outpatient note - rt_care_plan = 42539609 # RT care plan - -class DocumentType(ConceptEnum): - oncology_note = 706266 - -class DocumentEncoding(ConceptEnum): - UTF8 = 32678 - -class Language(ConceptEnum): - english = 4180186 - -class ConditionModifiers(ConceptEnum): - # for measurement_concept_id grouping - init_diag = 734306 # Cancer Modifier - Initial Diagnosis - tnm = 734320 # Cancer Modifier - Parent AJCC/UICC concept - mets = 36769180 # Cancer Modifier - Parent metastasis hierarchy parent - -class TreatmentModifiers(ConceptEnum): - rt_parameter = 4036397 # Radiotherapy parameter parent - rt_projection = 4124464 # Radiotherapy projection parent - rt_site = 4240671 # Radiotherapy anatomical site parent - -class TreatmentIntent(ConceptEnum): - neoadjuvant = 4161587 - adjuvant = 4191637 - curative = 4162591 - palliative = 4179711 - -class CancerProcedureTypes(ConceptEnum): - surgical_procedure = 4301351 - historical_procedure = 1340204 - rt_procedure = 1242725 # Radiotherapy procedure parent - rn_procedure = 4161415 # Radionuclide parent - rt_externalbeam = 4141448 # ebrt parent - rt_course = 37163499 # overall RT course as a procedure - used to hold intent modifier, as well as to compare intended vs. delivered treatment events - -class ProceduresByLocation(ConceptEnum): - procedure_on_lung = 4040549 - operation_on_lung = 4301352 - -class TStageConcepts(ConceptEnum): - # used to group tnm mappings into their relevant subtypes - # preferably create a concept that is the parent of all these T concepts, but for now... - t0 = 1634213 - t1 = 1635564 - t2 = 1635562 - t3 = 1634376 - t4 = 1634654 - ta = 1635114 - tx = 1635682 - tis = 1634530 - -class NStageConcepts(ConceptEnum): - # as above for n... - n0 = 1633440 - n1 = 1634434 - n2 = 1634119 - n3 = 1635320 - n4 = 1635445 - nx = 1633885 - -class MStageConcepts(ConceptEnum): - # and m... 
- m0 = 1635624 - m1 = 1635142 - mx = 1633547 - -class GroupStageConcepts(ConceptEnum): - # there's a pattern here - stage0 = 1633754 - stageI = 1633306 - stageII = 1634209 - stageIII = 1633650 - stageIV = 1633308 - -class ConditionConcepts(ConceptEnum): - ehr_problem_list = 32840 - resolved_condition = 32906 - confirmed_diagnosis = 32893 - - -class StageEdition(ConceptEnum): - _6th = 1634647 - _7th = 1633496 - _8th = 1634449 - -class ModifierConcepts(ConceptEnum): - grade = 35918328 - laterality = 35918306 - derived_value = 45754907 - tumor_size = 4139794 - primary_tumor = 36768229 - - -class DrugExposureConcepts(ConceptEnum): - drug_dose = 4162374 - ehr_drug_admin = 32818 - placebo = 1379408 - -class DemographyConcepts(ConceptEnum): - cob = 4155450 - language_spoken = 4052785 - postcode = 4083591 - - -class GenomicValue(ConceptEnum): - positive = 9191 - negative = 9189 - equivocal = 4172976 - -class CancerConsultTypes(ConceptEnum): - medonc = 4147722 - clinonc = 4139715 # there is no suitable radonc code? only radiotherapist? 
- oncology_referral = 4084352 - pall_care_referral = 4127745 - -class ProviderSpecialty(ConceptEnum): - radonc = 35621987 - medonc = 4151173 - pall_care = 4202942 - dietetitian = 4220638 - occupational_therapist = 4213188 - speech_therapist = 4010130 - haematologist = 4221826 - geneticist = 4009808 - gynaecologist = 17036 - radiation_therapist = 4143746 - medical_doctor = 4010577 - - -class WeightConcepts(ConceptEnum): - weight = 4099154 - height = 607590 - bsa = 4201235 - weight_change = 4086522 - -class WeightUnits(ConceptEnum): - lb = 8739 - pct = 4041099 - kg = 9529 - cm = 8582 - inch = 9327 - m2 = 8617 \ No newline at end of file diff --git a/omop_alchemy/config.py b/omop_alchemy/config.py index eafadb6..1cbd66f 100644 --- a/omop_alchemy/config.py +++ b/omop_alchemy/config.py @@ -10,10 +10,12 @@ logger = get_logger(__name__) +# from orm-loader 0.4.0 onwards, implicit psycopg2 dependency has been removed in favor of explicit driver modules. +# This mapping is used to provide clearer error messages when a required driver is missing. 
POSTGRES_DRIVER_MODULES: Mapping[str, str] = { - "postgresql": "psycopg2", - "postgresql+psycopg2": "psycopg2", + "postgresql": "psycopg", # bare URL aliased to psycopg "postgresql+psycopg": "psycopg", + "postgresql+psycopg2": "psycopg2", # retained so missing-driver message is clear } def load_environment(dotenv: str = '') -> None: diff --git a/omop_alchemy/maintenance/backup.py b/omop_alchemy/maintenance/backup.py index 6f32eee..a277e78 100644 --- a/omop_alchemy/maintenance/backup.py +++ b/omop_alchemy/maintenance/backup.py @@ -98,7 +98,7 @@ def _psql_path() -> str: def _default_output_path(format: BackupFormat) -> Path: timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") - return Path.cwd() / f"omop-maint-backup-{timestamp}{FORMAT_SUFFIXES[format]}" + return Path.cwd() / f"omop-alchemy-backup-{timestamp}{FORMAT_SUFFIXES[format]}" def _libpq_connection_uri(url: sa.engine.URL) -> str: diff --git a/omop_alchemy/maintenance/cli.py b/omop_alchemy/maintenance/cli.py index d7f640f..e331e1f 100644 --- a/omop_alchemy/maintenance/cli.py +++ b/omop_alchemy/maintenance/cli.py @@ -140,7 +140,7 @@ def _configure_cli_logging() -> None: ) if mode == "file": - log_path = defaults_path().parent / "logging" / "omop-maint.log" + log_path = defaults_path().parent / "logging" / "omop-alchemy.log" log_path.parent.mkdir(parents=True, exist_ok=True) handler: logging.Handler = logging.FileHandler(log_path, encoding="utf-8") else: @@ -822,12 +822,17 @@ def load_vocab_source_command( engine_schema: str | None = typer.Option(None, help="Engine schema selector."), db_schema: str | None = typer.Option(None, help="Database schema override. PostgreSQL only; uses search_path for ORM CSV loading."), merge_strategy: str = typer.Option( - "upsert", - help="CSV merge strategy passed to the ORM loader. Defaults to non-destructive `upsert`; use `replace` to overwrite matching primary keys.", + "replace", + help="CSV merge strategy passed to the ORM loader. 
Defaults to `replace` to keep the database in sync with the Athena source; use `upsert` for incremental updates.", ), chunksize: int | None = typer.Option( - None, - help="Chunk size for fallback ORM CSV loading to reduce memory usage on large Athena files.", + 100_000, + help="Chunk size for fallback ORM CSV loading. Defaults to 100 000 rows; pass 0 to disable chunking.", + ), + initial_load: bool = typer.Option( + False, + "--initial-load", + help="Assume target vocabulary tables are empty and use the first-load fast path for a fresh Athena vocabulary load.", ), dry_run: bool = typer.Option(False, "--dry-run"), ) -> None: @@ -851,12 +856,20 @@ def load_vocab_source_command( console.print( render_error( "No Athena vocabulary source path is configured. " - "Set it with `omop-maint config set-overrides --athena-source ` " + "Set it with `omop-alchemy config set-overrides --athena-source ` " "or pass `--athena-source`." ) ) raise typer.Exit(code=1) + if initial_load and merge_strategy != "replace": + console.print( + render_error( + "`--initial-load` cannot be combined with `--merge-strategy` values other than `replace`." 
+ ) + ) + raise typer.Exit(code=1) + try: engine = _build_engine( dotenv=connection_defaults.dotenv, @@ -895,25 +908,16 @@ def _update_progress(event: VocabularyLoadProgress) -> None: ) ) - if chunksize is None: - report = load_vocab_source( - engine, - source_path=connection_defaults.athena_source, - db_schema=connection_defaults.db_schema, - dry_run=dry_run, - merge_strategy=merge_strategy, - progress_callback=_update_progress, - ) - else: - report = load_vocab_source( - engine, - source_path=connection_defaults.athena_source, - db_schema=connection_defaults.db_schema, - dry_run=dry_run, - merge_strategy=merge_strategy, - chunksize=chunksize, - progress_callback=_update_progress, - ) + report = load_vocab_source( + engine, + source_path=connection_defaults.athena_source, + db_schema=connection_defaults.db_schema, + dry_run=dry_run, + merge_strategy=merge_strategy, + initial_load=initial_load, + chunksize=None if chunksize == 0 else chunksize, + progress_callback=_update_progress, + ) progress.update( task_id, completed=100.0, diff --git a/omop_alchemy/maintenance/doctor.py b/omop_alchemy/maintenance/doctor.py index bc1a881..91b1cbd 100644 --- a/omop_alchemy/maintenance/doctor.py +++ b/omop_alchemy/maintenance/doctor.py @@ -63,7 +63,7 @@ def _build_recommendations( DoctorRecommendation( status="warning", summary=f"{info.missing_table_count} ORM-managed table(s) are missing from the target database.", - action="Run `omop-maint create-missing-tables` before attempting bulk operations.", + action="Run `omop-alchemy create-missing-tables` before attempting bulk operations.", ) ) @@ -72,7 +72,7 @@ def _build_recommendations( DoctorRecommendation( status="warning", summary=f"Schema reconciliation found {len(reconciliation.issues)} difference(s) against ORM metadata.", - action="Review `omop-maint reconcile-schema` output before continuing with ETL or maintenance work.", + action="Review `omop-alchemy reconcile-schema` output before continuing with ETL or maintenance 
work.", ) ) @@ -84,7 +84,7 @@ def _build_recommendations( DoctorRecommendation( status="warning", summary="Some PostgreSQL RI triggers are currently disabled.", - action="If loading is complete, run `omop-maint foreign-keys validate` and then `omop-maint foreign-keys enable --strict`.", + action="If loading is complete, run `omop-alchemy foreign-keys validate` and then `omop-alchemy foreign-keys enable --strict`.", ) ) @@ -96,7 +96,7 @@ def _build_recommendations( DoctorRecommendation( status="failed", summary="Foreign key validation found violating rows.", - action="Fix the reported rows, then rerun `omop-maint foreign-keys enable --strict`.", + action="Fix the reported rows, then rerun `omop-alchemy foreign-keys enable --strict`.", ) ) @@ -105,7 +105,7 @@ def _build_recommendations( DoctorRecommendation( status="warning", summary="`pg_dump` is not on PATH, so backup-database is unavailable from this machine.", - action="Install PostgreSQL client tools on the machine running `omop-maint`.", + action="Install PostgreSQL client tools on the machine running `omop-alchemy`.", ) ) @@ -118,7 +118,7 @@ def _build_recommendations( DoctorRecommendation( status="warning", summary="Neither `pg_restore` nor `psql` is on PATH, so restore-database is unavailable from this machine.", - action="Install PostgreSQL client tools on the machine running `omop-maint`.", + action="Install PostgreSQL client tools on the machine running `omop-alchemy`.", ) ) @@ -207,7 +207,7 @@ def collect_doctor_report( DoctorCheck( name="schema drift", status="skipped", - detail="Run `omop-maint doctor --deep` to reconcile ORM metadata against the target database.", + detail="Run `omop-alchemy doctor --deep` to reconcile ORM metadata against the target database.", ) ) @@ -261,7 +261,7 @@ def collect_doctor_report( DoctorCheck( name="foreign key validation", status="skipped", - detail="Run `omop-maint doctor --deep` to validate selected foreign key relationships.", + detail="Run `omop-alchemy doctor 
--deep` to validate selected foreign key relationships.", ) ) else: diff --git a/omop_alchemy/maintenance/info.py b/omop_alchemy/maintenance/info.py index 4ca7003..aabd11c 100644 --- a/omop_alchemy/maintenance/info.py +++ b/omop_alchemy/maintenance/info.py @@ -315,7 +315,7 @@ def collect_maintenance_info( managed_tables = select_maintenance_tables( exclude_categories=(() if vocabulary_included else (TableCategory.VOCABULARY,)) ) - cli_path = shutil.which("omop-maint") + cli_path = shutil.which("omop-alchemy") dotenv_exists = None if dotenv is None else os.path.exists(dotenv) engine_name: str | None = None diff --git a/omop_alchemy/maintenance/load_vocab.py b/omop_alchemy/maintenance/load_vocab.py index a0ea2af..f88e2a6 100644 --- a/omop_alchemy/maintenance/load_vocab.py +++ b/omop_alchemy/maintenance/load_vocab.py @@ -150,7 +150,7 @@ def _load_vocab_model_csv( model: VocabularyModel, csv_path: Path, merge_strategy: str, - quote_mode: str = "csv", + quote_mode: str = "auto", chunksize: int | None = None, ) -> int: load_kwargs: dict[str, object] = { @@ -262,7 +262,8 @@ def _configure_loader_connection( "SQLite uses the default database namespace." 
) - connection.exec_driver_sql(f"SET search_path TO {db_schema}") + quoted_schema = '"' + db_schema.replace('"', '""') + '"' + connection.exec_driver_sql(f"SET search_path TO {quoted_schema}") def load_vocab_source( engine: sa.Engine, @@ -271,11 +272,19 @@ def load_vocab_source( db_schema: str | None = None, dry_run: bool = False, merge_strategy: str = "replace", - chunksize: int | None = None, + initial_load: bool = False, + chunksize: int | None = 100_000, progress_callback: VocabularyLoadProgressCallback | None = None, ) -> VocabularyLoadReport: _ensure_supported_backend(engine) + if initial_load and merge_strategy != "replace": + raise ValueError( + "initial_load=True cannot be combined with merge_strategy values other than 'replace'" + ) + + effective_merge_strategy = "insert_if_empty" if initial_load else merge_strategy + resolved_source_path = Path(source_path).expanduser().resolve() if not resolved_source_path.exists() or not resolved_source_path.is_dir(): raise RuntimeError( @@ -361,6 +370,7 @@ def load_vocab_source( for table_index, item in enumerate(load_items, start=1): model = item.model csv_path = item.csv_path + required = item.required current_model_name = model.__tablename__ current_csv_path = str(csv_path) if dry_run: @@ -396,7 +406,7 @@ def load_vocab_source( row_count=None, csv_path=str(csv_path), required=required, - detail="Athena CSV would be loaded via staged ORM CSV loader using tab-delimited input and literal quote mode", + detail="Athena CSV would be loaded via staged ORM CSV loader using tab-delimited input and auto-detected quote mode", ) ) continue @@ -404,8 +414,8 @@ def load_vocab_source( loader_kwargs: dict[str, object] = { "model": model, "csv_path": csv_path, - "merge_strategy": merge_strategy, - "quote_mode": "literal", + "merge_strategy": effective_merge_strategy, + "quote_mode": "auto", } if chunksize is not None: loader_kwargs["chunksize"] = chunksize @@ -465,7 +475,7 @@ def load_vocab_source( row_count=row_count, 
csv_path=str(csv_path), required=required, - detail="Athena CSV loaded via staged ORM CSV loader using tab-delimited input and literal quote mode", + detail="Athena CSV loaded via staged ORM CSV loader using tab-delimited input and auto-detected quote mode", ) ) if not dry_run: @@ -487,7 +497,7 @@ def load_vocab_source( raise VocabularyLoadError( "Athena vocabulary load failed for " f"table `{current_model_name or 'unknown'}` from `{current_csv_path or '-'}` " - f"using merge strategy `{merge_strategy}` on backend `{engine.dialect.name}`. " + f"using merge strategy `{effective_merge_strategy}` on backend `{engine.dialect.name}`. " f"Underlying error: {exc.__class__.__name__}: {exc}" ) from exc finally: @@ -511,7 +521,7 @@ def load_vocab_source( source_path=str(resolved_source_path), backend=engine.dialect.name, db_schema=db_schema, - merge_strategy=merge_strategy, + merge_strategy=effective_merge_strategy, created_table_count=created_table_count, sequence_reset_count=sequence_reset_count, results=tuple(results), diff --git a/omop_alchemy/maintenance/ui.py b/omop_alchemy/maintenance/ui.py index 6e4a7a1..3bcd4e9 100644 --- a/omop_alchemy/maintenance/ui.py +++ b/omop_alchemy/maintenance/ui.py @@ -825,7 +825,7 @@ def render_foreign_key_validation_summary( ( "All selected foreign key relationships passed validation." if not failed_tables - else "Fix the violating rows, then rerun `omop-maint foreign-keys enable --strict`." + else "Fix the violating rows, then rerun `omop-alchemy foreign-keys enable --strict`." 
+ ), ) return Panel.fit( diff --git a/pyproject.toml b/pyproject.toml index f8f6890..72ca011 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "omop-alchemy" -version = "0.6.2" +version = "0.6.3" description = "SQLAlchemy-based models, validation, and utilities for the OHDSI OMOP Common Data Model" readme = "README.md" requires-python = ">=3.12" @@ -36,13 +36,12 @@ dependencies = [ "python-dotenv>=1.2.2", "typer>=0.12", "rich>=13.0", - "orm-loader>=0.3.27,<0.4.0", + "orm-loader>=0.4.1", ] [project.optional-dependencies] postgres = [ "psycopg[binary]>=3.2", - "psycopg2-binary>=2.9", ] dev = [ @@ -69,13 +68,18 @@ Repository = "https://github.com/AustralianCancerDataNetwork/OMOP_Alchemy" Issues = "https://github.com/AustralianCancerDataNetwork/OMOP_Alchemy/issues" [project.scripts] -omop-maint = "omop_alchemy.maintenance.cli:main" +omop-alchemy = "omop_alchemy.maintenance.cli:main" [build-system] requires = ["setuptools>=68", "wheel"] build-backend = "setuptools.build_meta" +[tool.pytest.ini_options] +markers = [ + "postgres: marks tests that require a running PostgreSQL instance (deselect with '-m not postgres')", +] + [tool.setuptools] include-package-data = true diff --git a/tests/README.md b/tests/README.md index f2e6ec4..0f70491 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,8 +1,41 @@ -# OMOP_Alchemy Tests +# Running the test suite -## Running Tests +## Quick start ```bash -py.test omop_alchemy # run all tests -py.test omop_alchemy test_config_and_setup.py # run specific test battery -``` \ No newline at end of file +# Unit and SQLite tests — no database required +uv run --extra dev pytest -m "not postgres" + +# PostgreSQL integration tests — requires the Docker container below +docker compose -f tests/example-docker-compose.yaml up -d +uv run --extra dev --extra postgres pytest -m postgres -v +``` + +## PostgreSQL integration tests + +The `postgres`-marked tests connect to a local PostgreSQL 16 container on +port **55432**.
+ +```bash +# Start +docker compose -f tests/example-docker-compose.yaml up -d + +# Run (this will run all tests) +uv run --extra dev --extra postgres pytest -m "postgres or not postgres" -v + +# Stop +docker compose -f tests/example-docker-compose.yaml down +``` + +## Test markers + +| Marker | Meaning | +|--------|---------| +| *(none)* | Runs on SQLite, no external dependencies | +| `postgres` | Requires the Docker container on port 55432 | + +## Fixture data + +`tests/fixtures/athena_source/` contains a minimal set of Athena vocabulary +CSVs (7 concepts) used to seed the SQLite test database. These are committed +to the repo and are sufficient for all non-postgres tests. diff --git a/tests/conftest.py b/tests/conftest.py index 3443879..897c383 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,7 @@ +import time from datetime import date from pathlib import Path - +import os import pytest import sqlalchemy as sa from orm_loader.helpers import bootstrap @@ -52,7 +53,7 @@ def _load_fixture_vocabulary(engine: sa.Engine) -> None: model=model, csv_path=csv_path, merge_strategy="upsert", - quote_mode="literal", + quote_mode="auto", ) session.commit() connection.commit() @@ -200,6 +201,62 @@ def engine(tmp_path_factory: pytest.TempPathFactory): engine.dispose() +@pytest.fixture(scope="session") +def pg_engine(): + """ + Session-scoped engine connecting to a local PostgreSQL container. + + Start the container with: + docker compose -f tests/docker-compose.yaml up -d + + The fixture retries for up to 20 seconds to allow the container to become ready. + """ + _PG_URL = os.getenv("ENGINE_CDM") + if not _PG_URL: + pytest.skip("No PostgreSQL engine configured.
Set ENGINE_CDM environment variable.") + engine = sa.create_engine(_PG_URL, future=True) + for attempt in range(20): + try: + with engine.connect() as conn: + conn.execute(sa.text("SELECT 1")) + break + except Exception: + if attempt == 19: + engine.dispose() + pytest.fail( + "PostgreSQL container not available after 20 attempts. " + "Run: docker compose -f tests/docker-compose.yaml up -d" + ) + time.sleep(1) + try: + yield engine + finally: + engine.dispose() + + +@pytest.fixture +def pg_session(pg_engine): + """ + Function-scoped PostgreSQL session with a clean schema for each test. + + Drops and recreates the public schema before each test to ensure full isolation. + """ + with pg_engine.connect() as conn: + conn.execute(sa.text("DROP SCHEMA public CASCADE")) + conn.execute(sa.text("CREATE SCHEMA public")) + conn.commit() + + bootstrap(pg_engine, create=True) + + SessionLocal = sessionmaker(bind=pg_engine, future=True, expire_on_commit=False) + session = SessionLocal() + try: + yield session + finally: + session.rollback() + session.close() + + @pytest.fixture(scope="function") def session(engine) -> Session: # type: ignore """ diff --git a/tests/example-docker-compose.yaml b/tests/example-docker-compose.yaml new file mode 100644 index 0000000..9510ef3 --- /dev/null +++ b/tests/example-docker-compose.yaml @@ -0,0 +1,16 @@ +# Example docker-compose file for local testing purposes. 
+services: + postgres: + image: postgres:16 + environment: + POSTGRES_USER: test + POSTGRES_PASSWORD: test + POSTGRES_DB: test_db + ENGINE_CDM: postgresql+psycopg://test:test@localhost:55432/test_db + ports: + - "55432:5432" + healthcheck: + test: ["CMD-SHELL", "pg_isready -U test -d test_db"] + interval: 2s + timeout: 5s + retries: 10 diff --git a/tests/fixtures/athena_source/CONCEPT.csv b/tests/fixtures/athena_source/CONCEPT.csv new file mode 100644 index 0000000..db1e8c5 --- /dev/null +++ b/tests/fixtures/athena_source/CONCEPT.csv @@ -0,0 +1,8 @@ +concept_id concept_name domain_id vocabulary_id concept_class_id standard_concept concept_code valid_start_date valid_end_date invalid_reason +8507 MALE Gender Gender Gender S M 19700101 20991231 +8527 White Race Race Race S White 19700101 20991231 +38003564 Not Hispanic or Latino Ethnicity Ethnicity Ethnicity S Not Hispanic or Latino 19700101 20991231 +32817 EHR Type Concept Type Concept Type Concept S EHR 19700101 20991231 +201826 Type 2 diabetes mellitus Condition SNOMED Clinical Finding S 44054006 19700101 20991231 +32546 Disease Episode Episode Episode Episode S Disease Episode 19700101 20991231 +1147127 condition_occurrence.condition_occurrence_id Metadata CDM Field S condition_occurrence.condition_occurrence_id 19700101 20991231 diff --git a/tests/fixtures/athena_source/CONCEPT_ANCESTOR.csv b/tests/fixtures/athena_source/CONCEPT_ANCESTOR.csv new file mode 100644 index 0000000..4e7b1b2 --- /dev/null +++ b/tests/fixtures/athena_source/CONCEPT_ANCESTOR.csv @@ -0,0 +1 @@ +ancestor_concept_id descendant_concept_id min_levels_of_separation max_levels_of_separation diff --git a/tests/fixtures/athena_source/CONCEPT_CLASS.csv b/tests/fixtures/athena_source/CONCEPT_CLASS.csv new file mode 100644 index 0000000..0c128ae --- /dev/null +++ b/tests/fixtures/athena_source/CONCEPT_CLASS.csv @@ -0,0 +1,8 @@ +concept_class_id concept_class_name concept_class_concept_id +Clinical Finding Clinical Finding 0 +Episode Episode 0 
+Ethnicity Ethnicity 0 +Field Field 0 +Gender Gender 0 +Race Race 0 +Type Concept Type Concept 0 diff --git a/tests/fixtures/athena_source/CONCEPT_RELATIONSHIP.csv b/tests/fixtures/athena_source/CONCEPT_RELATIONSHIP.csv new file mode 100644 index 0000000..89cfde0 --- /dev/null +++ b/tests/fixtures/athena_source/CONCEPT_RELATIONSHIP.csv @@ -0,0 +1 @@ +concept_id_1 concept_id_2 relationship_id valid_start_date valid_end_date invalid_reason diff --git a/tests/fixtures/athena_source/CONCEPT_SYNONYM.csv b/tests/fixtures/athena_source/CONCEPT_SYNONYM.csv new file mode 100644 index 0000000..e906770 --- /dev/null +++ b/tests/fixtures/athena_source/CONCEPT_SYNONYM.csv @@ -0,0 +1 @@ +concept_id concept_synonym_name language_concept_id diff --git a/tests/fixtures/athena_source/DOMAIN.csv b/tests/fixtures/athena_source/DOMAIN.csv new file mode 100644 index 0000000..2df5ad4 --- /dev/null +++ b/tests/fixtures/athena_source/DOMAIN.csv @@ -0,0 +1,8 @@ +domain_id domain_name domain_concept_id +Condition Condition 0 +Episode Episode 0 +Ethnicity Ethnicity 0 +Gender Gender 0 +Metadata Metadata 0 +Race Race 0 +Type Concept Type Concept 0 diff --git a/tests/fixtures/athena_source/RELATIONSHIP.csv b/tests/fixtures/athena_source/RELATIONSHIP.csv new file mode 100644 index 0000000..aa9cf9b --- /dev/null +++ b/tests/fixtures/athena_source/RELATIONSHIP.csv @@ -0,0 +1,3 @@ +relationship_id relationship_name is_hierarchical defines_ancestry reverse_relationship_id relationship_concept_id +Is a Is a 1 1 Subsumes 0 +Subsumes Subsumes 1 0 Is a 0 diff --git a/tests/fixtures/athena_source/VOCABULARY.csv b/tests/fixtures/athena_source/VOCABULARY.csv new file mode 100644 index 0000000..a51f62a --- /dev/null +++ b/tests/fixtures/athena_source/VOCABULARY.csv @@ -0,0 +1,8 @@ +vocabulary_id vocabulary_name vocabulary_reference vocabulary_version vocabulary_concept_id +CDM Common Data Model OHDSI v5.4 0 +Episode OMOP Episode OHDSI v1.0 0 +Ethnicity OMOP Ethnicity OHDSI v1.0 0 +Gender OMOP Gender OHDSI 
v1.0 0 +Race OMOP Race OHDSI v1.0 0 +SNOMED SNOMED-CT IHTSDO SNOMED CT 2023 0 +Type Concept OMOP Type Concept OHDSI v1.0 0 diff --git a/tests/test_config_driver.py b/tests/test_config_driver.py new file mode 100644 index 0000000..7d3522b --- /dev/null +++ b/tests/test_config_driver.py @@ -0,0 +1,108 @@ +""" +Tests for omop_alchemy.config driver-selection logic. + +These tests do not require a database; they exercise the driver-mapping +constants, _missing_driver_message(), and create_engine_with_dependencies() +using mock exceptions to simulate missing packages. +""" +import pytest + +from omop_alchemy.config import ( + POSTGRES_DRIVER_MODULES, + _missing_driver_message, + create_engine_with_dependencies, +) + + +def _make_module_not_found(module_name: str) -> ModuleNotFoundError: + exc = ModuleNotFoundError(f"No module named '{module_name}'") + exc.name = module_name + return exc + + +# --------------------------------------------------------------------------- +# Driver-mapping constants +# --------------------------------------------------------------------------- + +def test_bare_postgresql_url_aliases_to_psycopg(): + """Bare postgresql:// now resolves to psycopg, not psycopg2.""" + assert POSTGRES_DRIVER_MODULES["postgresql"] == "psycopg" + + +def test_psycopg_driver_mapping(): + assert POSTGRES_DRIVER_MODULES["postgresql+psycopg"] == "psycopg" + + +def test_psycopg2_driver_mapping_retained_for_error_quality(): + """psycopg2 entry is kept so users get a clear error message.""" + assert POSTGRES_DRIVER_MODULES["postgresql+psycopg2"] == "psycopg2" + + +# --------------------------------------------------------------------------- +# _missing_driver_message() +# --------------------------------------------------------------------------- + +def test_missing_driver_message_for_psycopg(): + exc = _make_module_not_found("psycopg") + msg = _missing_driver_message("postgresql+psycopg://host/db", exc) + + assert msg is not None + assert "psycopg" in msg + assert 
"postgres" in msg.lower() + + +def test_missing_driver_message_for_bare_postgresql_url(): + """Bare postgresql:// is now aliased to psycopg; missing psycopg gives a helpful error.""" + exc = _make_module_not_found("psycopg") + msg = _missing_driver_message("postgresql://host/db", exc) + + assert msg is not None + assert "psycopg" in msg + + +def test_missing_driver_message_for_psycopg2(): + exc = _make_module_not_found("psycopg2") + msg = _missing_driver_message("postgresql+psycopg2://host/db", exc) + + assert msg is not None + assert "psycopg2" in msg + + +def test_missing_driver_message_returns_none_for_unrelated_module(): + """A ModuleNotFoundError for an unrelated package should not be intercepted.""" + exc = _make_module_not_found("pandas") + msg = _missing_driver_message("postgresql+psycopg://host/db", exc) + + assert msg is None + + +def test_missing_driver_message_returns_none_for_sqlite_url(): + exc = _make_module_not_found("psycopg") + msg = _missing_driver_message("sqlite:///test.db", exc) + + assert msg is None + + +# --------------------------------------------------------------------------- +# create_engine_with_dependencies() +# --------------------------------------------------------------------------- + +def test_sqlite_url_not_intercepted(): + """create_engine_with_dependencies should work for sqlite without wrapping errors.""" + engine = create_engine_with_dependencies("sqlite:///:memory:", future=True) + engine.dispose() + + +def test_create_engine_raises_runtime_for_missing_postgres_driver(monkeypatch): + """When psycopg is missing, create_engine_with_dependencies raises RuntimeError with install hint.""" + import sqlalchemy as sa + + exc = _make_module_not_found("psycopg") + + def raising_create_engine(url, **kwargs): + raise exc + + monkeypatch.setattr(sa, "create_engine", raising_create_engine) + + with pytest.raises(RuntimeError, match="psycopg"): + create_engine_with_dependencies("postgresql+psycopg://host/db") diff --git 
a/tests/test_load_vocab.py b/tests/test_load_vocab.py index 735c8a8..6d9b65e 100644 --- a/tests/test_load_vocab.py +++ b/tests/test_load_vocab.py @@ -60,8 +60,8 @@ def db_session(connection): @pytest.fixture(scope="session") def athena_vocab(connection): """ - Load a minimal, internally consistent Athena vocabulary - using the real ORM CSV loader. + Load the minimal Athena vocabulary fixture using the real ORM CSV loader. + Files follow the Athena convention: UPPERCASE table names with .csv extension. """ Session = sessionmaker(bind=connection, future=True) session = Session() @@ -73,7 +73,7 @@ def athena_vocab(connection): ) for model in ATHENA_LOAD_ORDER: - csv_path = base_path / f"{model.__tablename__}.csv" + csv_path = base_path / f"{model.__tablename__.upper()}.csv" if not csv_path.exists(): raise RuntimeError(f"Missing vocab CSV: {csv_path}") @@ -84,26 +84,22 @@ def athena_vocab(connection): yield + def test_concept_loaded(db_session, athena_vocab): - """Test concept loaded.""" - concept = db_session.get(Concept, 1) + """Test that vocabulary concepts load and are accessible by primary key.""" + # MALE (concept_id=8507) is a known row in the minimal fixture. 
+ concept = db_session.get(Concept, 8507) assert concept is not None - assert concept.concept_name == "Domain" - assert concept.domain_id == "Metadata" + assert concept.concept_name == "MALE" + assert concept.domain_id == "Gender" + def test_concept_ancestor(db_session, athena_vocab): - """Test concept ancestor.""" - ancestors = ( - # running tests with metadata concepts so that they are definitely present - # assuming the logic to produce test db is stable - db_session.query(Concept_Ancestor) - .filter_by(descendant_concept_id=1147371) - .all() - ) - assert len(ancestors) == 2 - a = [a.ancestor_concept_id for a in ancestors] - assert 1147371 in a - assert 1147423 in a + """Test that the concept_ancestor table loads without error.""" + # Minimal fixtures have no ancestor rows; table must be accessible and empty. + count = db_session.query(Concept_Ancestor).count() + assert count == 0 + def test_all_concepts_reference_valid_domain(db_session, athena_vocab): """Test all concepts reference valid domain.""" @@ -116,15 +112,17 @@ def test_all_concepts_reference_valid_domain(db_session, athena_vocab): assert invalid == 0 + def test_relationship_vocab_loaded(db_session, athena_vocab): """Test relationship vocab loaded.""" rel = ( db_session.query(Relationship) - .filter_by(relationship_id="Has type") + .filter_by(relationship_id="Is a") .one() ) - assert rel.reverse_relationship_id == "Type of" + assert rel.reverse_relationship_id == "Subsumes" + def test_expected_domains_exist(db_session, athena_vocab): """Test expected domains exist.""" @@ -134,31 +132,34 @@ def test_expected_domains_exist(db_session, athena_vocab): } assert "Condition" in domains - assert "Procedure" in domains - assert "Drug" in domains + assert "Gender" in domains + assert "Race" in domains + def test_domains_are_consistent(db_session, athena_vocab): - """Test domains are consistent.""" + """Test concepts reference domains that exist in the domain table.""" concepts = ( db_session.query(Concept) - 
.filter(Concept.domain_id.in_(["Condition", "Procedure"])) + .filter(Concept.domain_id.in_(["Condition", "Gender"])) .all() ) - assert concepts + assert concepts for c in concepts: - assert c.domain_id in {"Condition", "Procedure"} + assert c.domain_id in {"Condition", "Gender"} -def test_procedure_concepts_exist(db_session, athena_vocab): - """Test procedure concepts exist.""" + +def test_condition_concepts_exist(db_session, athena_vocab): + """Test condition concepts exist.""" assert ( db_session.query(Concept) - .filter(Concept.domain_id == "Procedure") + .filter(Concept.domain_id == "Condition") .count() > 0 ) + def test_relationships_reference_valid_concepts(db_session, athena_vocab): """Test relationships reference valid concepts.""" rels = db_session.query(Concept_Relationship).limit(50).all() diff --git a/tests/test_load_vocab_postgres.py b/tests/test_load_vocab_postgres.py new file mode 100644 index 0000000..4a3ead1 --- /dev/null +++ b/tests/test_load_vocab_postgres.py @@ -0,0 +1,291 @@ +""" +PostgreSQL integration tests for OMOP_Alchemy vocabulary loading. + +These tests require a running PostgreSQL container. 
Start one with: + docker compose -f tests/docker-compose.yaml up -d + +Then run: + pytest -m postgres +""" +import shutil +from pathlib import Path + +import pytest +import sqlalchemy as sa + +from omop_alchemy.cdm.model.vocabulary import Concept +from omop_alchemy.maintenance.load_vocab import ( + REQUIRED_VOCAB_MODELS, + _load_vocab_model_csv, + load_vocab_source, +) + +_FIXTURE_SOURCE = Path(__file__).parent / "fixtures" / "athena_source" + + +def _copy_fixture_source(base_dir: Path) -> Path: + """Copy the shared Athena fixture set into an isolated per-test source dir.""" + source_path = base_dir / "athena_source" + shutil.copytree(_FIXTURE_SOURCE, source_path) + return source_path + + +def _make_concept_source( + base_dir: Path, + *, + concept_id: int, + concept_name: str, +) -> Path: + """ + Build a minimal vocabulary source where CONCEPT.csv contains exactly one + test concept with a Gender domain reference, and all other required tables + are copied from the shared fixture (which has the Gender domain row). 
+ """ + source_path = base_dir / "athena_source" + source_path.mkdir(parents=True) + + for fname in ( + "DOMAIN.csv", + "VOCABULARY.csv", + "CONCEPT_CLASS.csv", + "RELATIONSHIP.csv", + "CONCEPT_ANCESTOR.csv", + "CONCEPT_RELATIONSHIP.csv", + "CONCEPT_SYNONYM.csv", + ): + shutil.copy(_FIXTURE_SOURCE / fname, source_path / fname) + + (source_path / "CONCEPT.csv").write_text( + "concept_id\tconcept_name\tdomain_id\tvocabulary_id\tconcept_class_id\t" + "standard_concept\tconcept_code\tvalid_start_date\tvalid_end_date\tinvalid_reason\n" + f"{concept_id}\t{concept_name}\tGender\tGender\tGender\tS\tTEST\t19700101\t20991231\t\n", + encoding="utf-8", + ) + return source_path + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + +@pytest.mark.postgres +def test_end_to_end_vocab_load_on_postgres(pg_session, pg_engine, tmp_path): + """load_vocab_source() completes end-to-end on real Postgres via orm-loader>=0.4.0.""" + source_path = _copy_fixture_source(tmp_path) + report = load_vocab_source(pg_engine, source_path=source_path) + + assert report.merge_strategy == "replace" + assert all(r.status == "loaded" for r in report.results if r.required) + assert all(r.status == "skipped" for r in report.results if not r.required) + + count = pg_session.execute(sa.text("SELECT COUNT(*) FROM concept")).scalar() + assert count == 7 + + +@pytest.mark.postgres +def test_initial_load_uses_insert_if_empty_on_postgres(pg_session, pg_engine, tmp_path): + """initial_load=True uses the empty-target insert fast path on a fresh Postgres load.""" + source_path = _copy_fixture_source(tmp_path) + report = load_vocab_source(pg_engine, source_path=source_path, initial_load=True) + + assert report.merge_strategy == "insert_if_empty" + count = pg_session.execute(sa.text("SELECT COUNT(*) FROM concept")).scalar() + assert count == 7 + + +@pytest.mark.postgres +def 
test_quote_mode_auto_regression_on_postgres(pg_session, pg_engine, tmp_path): + """ + quote_mode='auto' strips RFC-4180 double-quotes via PostgreSQL COPY. + + Under the old quote_mode='literal' a concept_name of exactly 255 chars + wrapped in double-quotes would be stored as 257 chars and violate the + VARCHAR(255) constraint. This test would fail under literal mode. + """ + source_path = tmp_path / "athena_source" + source_path.mkdir() + + long_name = "A" * 255 # exactly at VARCHAR(255) limit when unquoted + + for model in REQUIRED_VOCAB_MODELS: + table_name = model.__tablename__.upper() + csv_path = source_path / f"{table_name}.csv" + if table_name == "CONCEPT": + # Wrap the 255-char name in double-quotes so it's 257 chars raw. + csv_path.write_text( + "concept_id\tconcept_name\tdomain_id\tvocabulary_id\t" + "concept_class_id\tstandard_concept\tconcept_code\t" + "valid_start_date\tvalid_end_date\tinvalid_reason\n" + f'1\t"{long_name}"\tGender\tGender\tGender\tS\tTEST\t19700101\t20991231\t\n', + encoding="utf-8", + ) + elif table_name == "DOMAIN": + csv_path.write_text( + "domain_id\tdomain_name\tdomain_concept_id\nGender\tGender\t0\n", + encoding="utf-8", + ) + elif table_name == "VOCABULARY": + csv_path.write_text( + "vocabulary_id\tvocabulary_name\tvocabulary_reference\t" + "vocabulary_version\tvocabulary_concept_id\n" + "Gender\tOMOP Gender\tOHDSI\tv1.0\t0\n", + encoding="utf-8", + ) + elif table_name == "CONCEPT_CLASS": + csv_path.write_text( + "concept_class_id\tconcept_class_name\tconcept_class_concept_id\n" + "Gender\tGender\t0\n", + encoding="utf-8", + ) + else: + shutil.copy(_FIXTURE_SOURCE / f"{table_name}.csv", csv_path) + + # Should not raise: literal mode would produce a 257-char value and fail. 
+ load_vocab_source(pg_engine, source_path=source_path) + + concept_name = pg_session.execute( + sa.text("SELECT concept_name FROM concept WHERE concept_id = 1") + ).scalar() + assert concept_name is not None + assert len(concept_name) == 255, ( + f"Expected 255-char name; got {len(concept_name)}: {concept_name!r}" + ) + assert not concept_name.startswith('"'), "Surrounding quotes were not stripped" + + +@pytest.mark.postgres +def test_load_vocab_model_csv_on_postgres(pg_session, tmp_path): + """ + _load_vocab_model_csv loads data correctly on a real PostgreSQL session. + + orm-loader>=0.4.0 handles staging-table creation internally, so we test + the end-to-end path: CSV → staging → concept table on real Postgres. + """ + source_path = _copy_fixture_source(tmp_path) + csv_path = source_path / "CONCEPT.csv" + + row_count = _load_vocab_model_csv( + pg_session, + model=Concept, + csv_path=csv_path, + merge_strategy="replace", + ) + pg_session.commit() + + assert row_count == 7 + count = pg_session.execute(sa.text("SELECT COUNT(*) FROM concept")).scalar() + assert count == 7 + + +@pytest.mark.postgres +def test_replace_strategy_overwrites_existing_rows(pg_session, pg_engine, tmp_path): + """merge_strategy='replace' fully replaces rows with the same PKs on second load.""" + concept_id = 99999 + source_v1 = _make_concept_source( + tmp_path / "v1", concept_id=concept_id, concept_name="name_v1" + ) + source_v2 = _make_concept_source( + tmp_path / "v2", concept_id=concept_id, concept_name="name_v2" + ) + + load_vocab_source(pg_engine, source_path=source_v1, merge_strategy="replace") + load_vocab_source(pg_engine, source_path=source_v2, merge_strategy="replace") + + name = pg_session.execute( + sa.text("SELECT concept_name FROM concept WHERE concept_id = :cid"), + {"cid": concept_id}, + ).scalar() + assert name == "name_v2", f"Expected 'name_v2' after replace, got {name!r}" + + +@pytest.mark.postgres +def test_upsert_strategy_is_non_destructive(pg_session, pg_engine, 
tmp_path): + """merge_strategy='upsert' preserves existing rows on second load with same PKs.""" + concept_id = 99998 + source_v1 = _make_concept_source( + tmp_path / "v1", concept_id=concept_id, concept_name="name_v1" + ) + source_v2 = _make_concept_source( + tmp_path / "v2", concept_id=concept_id, concept_name="name_v2" + ) + + load_vocab_source(pg_engine, source_path=source_v1, merge_strategy="upsert") + load_vocab_source(pg_engine, source_path=source_v2, merge_strategy="upsert") + + name = pg_session.execute( + sa.text("SELECT concept_name FROM concept WHERE concept_id = :cid"), + {"cid": concept_id}, + ).scalar() + assert name == "name_v1", ( + f"Expected 'name_v1' after upsert (existing row preserved), got {name!r}" + ) + + +@pytest.mark.postgres +def test_chunksize_forwarded_to_loader(pg_session, pg_engine, monkeypatch, tmp_path): + """chunksize is forwarded from load_vocab_source through to _load_vocab_model_csv.""" + from omop_alchemy.maintenance import load_vocab as _lv_module + + source_path = _copy_fixture_source(tmp_path) + received_chunksizes: list[int | None] = [] + original = _lv_module._load_vocab_model_csv + + def tracking_load(session, *, model, csv_path, merge_strategy, quote_mode="auto", chunksize=None): + received_chunksizes.append(chunksize) + return original( + session, + model=model, + csv_path=csv_path, + merge_strategy=merge_strategy, + quote_mode=quote_mode, + chunksize=chunksize, + ) + + monkeypatch.setattr(_lv_module, "_load_vocab_model_csv", tracking_load) + + load_vocab_source(pg_engine, source_path=source_path, chunksize=500) + + assert received_chunksizes, "Expected at least one table to be loaded" + assert all(c == 500 for c in received_chunksizes), ( + f"Expected chunksize=500 for all tables, got: {received_chunksizes}" + ) + + +@pytest.mark.postgres +def test_db_schema_search_path_on_postgres(pg_engine, tmp_path): + """ + load_vocab_source with db_schema creates vocabulary tables in the requested + PostgreSQL schema and loads 
data into them correctly. + """ + schema = 'VocabTest' + source_path = _copy_fixture_source(tmp_path) + quoted_schema = '"' + schema.replace('"', '""') + '"' + + with pg_engine.connect() as conn: + conn.execute(sa.text(f"DROP SCHEMA IF EXISTS {quoted_schema} CASCADE")) + conn.execute(sa.text(f"CREATE SCHEMA {quoted_schema}")) + conn.commit() + + try: + report = load_vocab_source( + pg_engine, + source_path=source_path, + db_schema=schema, + ) + + assert any(r.status == "loaded" for r in report.results if r.required) + + inspector = sa.inspect(pg_engine) + assert inspector.has_table("concept", schema=schema), ( + f"Expected concept table in schema '{schema}'" + ) + + with pg_engine.connect() as conn: + count = conn.execute( + sa.text(f"SELECT COUNT(*) FROM {quoted_schema}.concept") + ).scalar() + assert count == 7 + finally: + with pg_engine.connect() as conn: + conn.execute(sa.text(f"DROP SCHEMA IF EXISTS {quoted_schema} CASCADE")) + conn.commit() diff --git a/tests/test_load_vocab_source.py b/tests/test_load_vocab_source.py index a6fa1bc..9450b96 100644 --- a/tests/test_load_vocab_source.py +++ b/tests/test_load_vocab_source.py @@ -67,7 +67,8 @@ def fake_load_vocab_model_csv( model, csv_path, merge_strategy, - quote_mode="csv", + quote_mode="auto", + chunksize=None, ) -> int: loaded_tables.append((model.__tablename__, merge_strategy, quote_mode, csv_path)) return 1 @@ -88,7 +89,7 @@ def fake_load_vocab_model_csv( assert all(result_by_name[model.__tablename__].status == "loaded" for model in REQUIRED_VOCAB_MODELS) assert all(result_by_name[model.__tablename__].status == "skipped" for model in OPTIONAL_VOCAB_MODELS) assert all(merge_strategy == "replace" for _, merge_strategy, _, _ in loaded_tables) - assert all(quote_mode == "literal" for _, _, quote_mode, _ in loaded_tables) + assert all(quote_mode == "auto" for _, _, quote_mode, _ in loaded_tables) assert {table_name for table_name, _, _, _ in loaded_tables} == { model.__tablename__ for model in 
REQUIRED_VOCAB_MODELS @@ -102,10 +103,15 @@ def test_load_vocab_source_requires_full_required_athena_fixture(tmp_path): """Test load vocab source requires full required athena fixture.""" engine = sa.create_engine(f"sqlite:///{tmp_path / 'load_vocab_source_missing_required.db'}", future=True) + # Build a source with only a subset of required models to trigger the missing-files error. + partial_source = tmp_path / "partial_athena" + partial_source.mkdir() + _write_athena_csv(partial_source, REQUIRED_VOCAB_MODELS[0].__tablename__) + with pytest.raises(RuntimeError) as exc_info: load_vocab_source( engine, - source_path=_athena_source_path(), + source_path=partial_source, ) assert "Missing required Athena vocabulary CSV files" in str(exc_info.value) @@ -162,7 +168,9 @@ def fake_load_vocab_source( source_path: str | Path, db_schema: str | None = None, dry_run: bool = False, - merge_strategy: str = "upsert", + merge_strategy: str = "replace", + initial_load: bool = False, + chunksize: int | None = None, progress_callback=None, ): from omop_alchemy.maintenance.load_vocab import VocabularyLoadReport, VocabularyLoadResult @@ -172,6 +180,7 @@ def fake_load_vocab_source( calls["db_schema"] = db_schema calls["dry_run"] = dry_run calls["merge_strategy"] = merge_strategy + calls["initial_load"] = initial_load return VocabularyLoadReport( source_path=str(source_path), backend="sqlite", @@ -233,11 +242,101 @@ def fake_load_vocab_source( assert result.exit_code == 0 assert calls["engine"] == "ENGINE" assert calls["source_path"] == expected_source_path - assert calls["merge_strategy"] == "upsert" + assert calls["merge_strategy"] == "replace" + assert calls["initial_load"] is False assert "load-vocab-source" in result.stdout assert "concept" in result.stdout +def test_load_vocab_source_cli_initial_load_uses_first_load_fast_path(monkeypatch): + """CLI --initial-load forwards the fresh-load intent to load_vocab_source().""" + calls: dict[str, object] = {} + + def fake_build_engine(*, 
dotenv: str | None, engine_schema: str | None): + return "ENGINE" + + def fake_load_vocab_source( + engine: object, + *, + source_path: str | Path, + db_schema: str | None = None, + dry_run: bool = False, + merge_strategy: str = "replace", + initial_load: bool = False, + chunksize: int | None = None, + progress_callback=None, + ): + from omop_alchemy.maintenance.load_vocab import VocabularyLoadReport, VocabularyLoadResult + + calls["engine"] = engine + calls["merge_strategy"] = merge_strategy + calls["initial_load"] = initial_load + effective_merge_strategy = "insert_if_empty" if initial_load else merge_strategy + return VocabularyLoadReport( + source_path=str(source_path), + backend="sqlite", + db_schema=db_schema, + merge_strategy=effective_merge_strategy, + created_table_count=0, + sequence_reset_count=0, + results=( + VocabularyLoadResult( + table_name="concept", + status="planned", + row_count=None, + csv_path=str(Path(source_path) / "CONCEPT.csv"), + required=True, + detail="Athena CSV would be loaded via staged ORM CSV loader", + ), + ), + ) + + monkeypatch.setattr( + "omop_alchemy.maintenance.cli._build_engine", + fake_build_engine, + ) + monkeypatch.setattr( + "omop_alchemy.maintenance.cli.load_vocab_source", + fake_load_vocab_source, + ) + + result = runner.invoke( + app, + [ + "load-vocab-source", + "--athena-source", + str(_athena_source_path()), + "--initial-load", + "--dry-run", + ], + ) + + assert result.exit_code == 0 + assert calls["engine"] == "ENGINE" + assert calls["merge_strategy"] == "replace" + assert calls["initial_load"] is True + + +def test_load_vocab_source_cli_rejects_initial_load_with_non_replace_strategy(): + """CLI should reject combining --initial-load with a conflicting merge strategy.""" + result = runner.invoke( + app, + [ + "load-vocab-source", + "--athena-source", + str(_athena_source_path()), + "--initial-load", + "--merge-strategy", + "upsert", + "--dry-run", + ], + ) + + assert result.exit_code == 1 + assert "--initial-load" 
in result.stdout + assert "replace" in result.stdout + + def test_load_vocab_model_csv_passes_quote_mode(monkeypatch, tmp_path): """Test load vocab model csv passes quote mode.""" engine = sa.create_engine(f"sqlite:///{tmp_path / 'load_vocab_source_quote_mode.db'}", future=True) @@ -302,7 +401,8 @@ def fake_load_vocab_model_csv( model, csv_path, merge_strategy, - quote_mode="csv", + quote_mode="auto", + chunksize=None, ) -> int: loaded_order.append(model.__tablename__) return 1 @@ -317,6 +417,51 @@ def fake_load_vocab_model_csv( assert loaded_order[:3] == ["domain", "concept_class", "vocabulary"] +def test_load_vocab_source_initial_load_maps_to_insert_if_empty(monkeypatch, tmp_path): + """initial_load=True maps the vocab loader onto orm-loader's insert-if-empty path.""" + engine = sa.create_engine(f"sqlite:///{tmp_path / 'load_vocab_source_initial_load.db'}", future=True) + source_path = _build_required_athena_source(tmp_path) + + received_merge_strategies: list[str] = [] + + def fake_load_vocab_model_csv( + session, + *, + model, + csv_path, + merge_strategy, + quote_mode="auto", + chunksize=None, + ) -> int: + received_merge_strategies.append(merge_strategy) + return 1 + + monkeypatch.setattr( + "omop_alchemy.maintenance.load_vocab._load_vocab_model_csv", + fake_load_vocab_model_csv, + ) + + report = load_vocab_source(engine, source_path=source_path, initial_load=True) + + assert report.merge_strategy == "insert_if_empty" + assert received_merge_strategies + assert all(strategy == "insert_if_empty" for strategy in received_merge_strategies) + + +def test_load_vocab_source_rejects_initial_load_with_non_replace_strategy(tmp_path): + """initial_load=True cannot be combined with a conflicting merge strategy.""" + engine = sa.create_engine(f"sqlite:///{tmp_path / 'load_vocab_source_initial_load_error.db'}", future=True) + source_path = _build_required_athena_source(tmp_path) + + with pytest.raises(ValueError, match="initial_load=True"): + load_vocab_source( + engine, 
+ source_path=source_path, + initial_load=True, + merge_strategy="upsert", + ) + + def test_load_vocab_source_reports_weighted_progress(monkeypatch, tmp_path): """Test load vocab source reports weighted progress.""" engine = sa.create_engine(f"sqlite:///{tmp_path / 'load_vocab_source_progress.db'}", future=True) @@ -333,7 +478,8 @@ def fake_load_vocab_model_csv( model, csv_path, merge_strategy, - quote_mode="csv", + quote_mode="auto", + chunksize=None, ) -> int: return 1 @@ -360,7 +506,7 @@ def test_load_vocab_source_wraps_failed_table_load(monkeypatch, tmp_path): engine = sa.create_engine(f"sqlite:///{tmp_path / 'load_vocab_source_error.db'}", future=True) source_path = _build_required_athena_source(tmp_path) - def fake_load_vocab_model_csv(session, *, model, csv_path, merge_strategy, quote_mode="csv"): + def fake_load_vocab_model_csv(session, *, model, csv_path, merge_strategy, quote_mode="auto", chunksize=None): if model.__tablename__ == "domain": raise sa.exc.ProgrammingError( "COPY domain FROM STDIN", @@ -470,3 +616,57 @@ def fail_load_vocab_source(*args, **kwargs): assert result.exit_code == 1 assert "Database operation failed: ProgrammingError." in result.stdout assert "value too long for type character varying(255)" in result.stdout + + +def test_load_vocab_source_uses_auto_not_literal_quote_mode(monkeypatch, tmp_path): + """Regression: Athena load must use auto quote mode so that quoted concept_name + values are not padded with surrounding double-quote characters, which would + cause 'value too long for type character varying(255)' on CONCEPT.csv.""" + engine = sa.create_engine(f"sqlite:///{tmp_path / 'quote_mode_regression.db'}", future=True) + + # Build a tab-delimited CSV where concept_name is exactly 255 chars when + # unquoted, but would be 257 chars if the surrounding CSV quotes were kept + # as literal characters (the literal-mode bug). 
+ source_path = tmp_path / "athena_source" + source_path.mkdir() + + long_name = "A" * 255 + for model in REQUIRED_VOCAB_MODELS: + table_name = model.__tablename__.upper() + csv_path = source_path / f"{table_name}.csv" + if table_name == "CONCEPT": + csv_path.write_text( + "concept_id\tconcept_name\tdomain_id\tvocabulary_id\t" + "concept_class_id\tstandard_concept\tconcept_code\t" + "valid_start_date\tvalid_end_date\tinvalid_reason\n" + f'4715176\t"{long_name}"\t...\t...\t...\t\t...\t20000101\t20991231\t\n', + encoding="utf-8", + ) + else: + csv_path.write_text("stub\n", encoding="utf-8") + + received_quote_modes: list[str] = [] + + def fake_load_vocab_model_csv( + session, + *, + model, + csv_path, + merge_strategy, + quote_mode="auto", + chunksize=None, + ) -> int: + received_quote_modes.append(quote_mode) + return 1 + + monkeypatch.setattr( + "omop_alchemy.maintenance.load_vocab._load_vocab_model_csv", + fake_load_vocab_model_csv, + ) + + load_vocab_source(engine, source_path=source_path) + + assert all(mode == "auto" for mode in received_quote_modes), ( + f"Expected all tables to use quote_mode='auto', got: {received_quote_modes}" + ) + assert "literal" not in received_quote_modes diff --git a/uv.lock b/uv.lock index 861910c..1f9887b 100644 --- a/uv.lock +++ b/uv.lock @@ -862,7 +862,7 @@ wheels = [ [[package]] name = "omop-alchemy" -version = "0.6.2" +version = "0.6.3" source = { editable = "." 
} dependencies = [ { name = "orm-loader" }, @@ -893,7 +893,6 @@ docs = [ ] postgres = [ { name = "psycopg", extra = ["binary"] }, - { name = "psycopg2-binary" }, ] [package.metadata] @@ -905,10 +904,9 @@ requires-dist = [ { name = "mkdocstrings-python", marker = "extra == 'dev'", specifier = ">=2.0.1" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.8" }, { name = "myst-parser", marker = "extra == 'docs'" }, - { name = "orm-loader", specifier = ">=0.3.27,<4.0" }, + { name = "orm-loader", specifier = ">=0.4.1" }, { name = "pandas", specifier = ">=2.0" }, { name = "psycopg", extras = ["binary"], marker = "extra == 'postgres'", specifier = ">=3.2" }, - { name = "psycopg2-binary", marker = "extra == 'postgres'", specifier = ">=2.9" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=9.0.3" }, { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=4.0" }, { name = "python-dotenv", specifier = ">=1.2.2" }, @@ -924,7 +922,7 @@ provides-extras = ["postgres", "dev", "docs"] [[package]] name = "orm-loader" -version = "0.3.27" +version = "0.4.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "chardet" }, @@ -932,9 +930,9 @@ dependencies = [ { name = "pyarrow" }, { name = "sqlalchemy" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ca/72/f5ae8aafb2868301da88c71f6ee095cac14bf4405648c935b533cf1550b6/orm_loader-0.3.27.tar.gz", hash = "sha256:51de60177bb45572329899d883414ba47ed42034a782d49bf05d0dc5d1e9f58c", size = 33014, upload-time = "2026-05-06T07:04:59.088Z" } +sdist = { url = "https://files.pythonhosted.org/packages/2a/6a/007e6eef497753702d5a53444842ee6cc38bcbf7c5c422857c0671bfc727/orm_loader-0.4.1.tar.gz", hash = "sha256:434b6c3436c05bf3ad43774b46476e7f324db05a18bf34ad9f9692e4f02bcb7e", size = 39449, upload-time = "2026-05-19T12:56:29.572Z" } wheels = [ - { url = 
"https://files.pythonhosted.org/packages/83/f8/8f16b0123ea3438a084125d7450ef1250e4780edf0934f79e14a924578bc/orm_loader-0.3.27-py3-none-any.whl", hash = "sha256:7e2bbd7f6935aff1710a99d9d8f550d691307c446e75c04cb59cd67f1e64b16d", size = 44815, upload-time = "2026-05-06T07:04:57.509Z" }, + { url = "https://files.pythonhosted.org/packages/98/d7/37f82f8748a91fdb14d41f314ddc829806f596dec409196c037e59d3a5a7/orm_loader-0.4.1-py3-none-any.whl", hash = "sha256:03131b5d4b7b787ea446e110684b7256b5690313503626939b83984953174825", size = 54472, upload-time = "2026-05-19T12:56:27.959Z" }, ] [[package]] @@ -1120,47 +1118,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/98/5a/291d89f44d3820fffb7a04ebc8f3ef5dda4f542f44a5daea0c55a84abf45/psycopg_binary-3.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:165f22ab5a9513a3d7425ffb7fcc7955ed8ccaeef6d37e369d6cc1dff1582383", size = 3652796, upload-time = "2026-02-18T16:52:14.02Z" }, ] -[[package]] -name = "psycopg2-binary" -version = "2.9.11" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ac/6c/8767aaa597ba424643dc87348c6f1754dd9f48e80fdc1b9f7ca5c3a7c213/psycopg2-binary-2.9.11.tar.gz", hash = "sha256:b6aed9e096bf63f9e75edf2581aa9a7e7186d97ab5c177aa6c87797cd591236c", size = 379620, upload-time = "2025-10-10T11:14:48.041Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/d8/91/f870a02f51be4a65987b45a7de4c2e1897dd0d01051e2b559a38fa634e3e/psycopg2_binary-2.9.11-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:be9b840ac0525a283a96b556616f5b4820e0526addb8dcf6525a0fa162730be4", size = 3756603, upload-time = "2025-10-10T11:11:52.213Z" }, - { url = "https://files.pythonhosted.org/packages/27/fa/cae40e06849b6c9a95eb5c04d419942f00d9eaac8d81626107461e268821/psycopg2_binary-2.9.11-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f090b7ddd13ca842ebfe301cd587a76a4cf0913b1e429eb92c1be5dbeb1a19bc", size = 3864509, upload-time = "2025-10-10T11:11:56.452Z" }, - { 
url = "https://files.pythonhosted.org/packages/2d/75/364847b879eb630b3ac8293798e380e441a957c53657995053c5ec39a316/psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ab8905b5dcb05bf3fb22e0cf90e10f469563486ffb6a96569e51f897c750a76a", size = 4411159, upload-time = "2025-10-10T11:12:00.49Z" }, - { url = "https://files.pythonhosted.org/packages/6f/a0/567f7ea38b6e1c62aafd58375665a547c00c608a471620c0edc364733e13/psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:bf940cd7e7fec19181fdbc29d76911741153d51cab52e5c21165f3262125685e", size = 4468234, upload-time = "2025-10-10T11:12:04.892Z" }, - { url = "https://files.pythonhosted.org/packages/30/da/4e42788fb811bbbfd7b7f045570c062f49e350e1d1f3df056c3fb5763353/psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fa0f693d3c68ae925966f0b14b8edda71696608039f4ed61b1fe9ffa468d16db", size = 4166236, upload-time = "2025-10-10T11:12:11.674Z" }, - { url = "https://files.pythonhosted.org/packages/3c/94/c1777c355bc560992af848d98216148be5f1be001af06e06fc49cbded578/psycopg2_binary-2.9.11-cp312-cp312-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a1cf393f1cdaf6a9b57c0a719a1068ba1069f022a59b8b1fe44b006745b59757", size = 3983083, upload-time = "2025-10-30T02:55:15.73Z" }, - { url = "https://files.pythonhosted.org/packages/bd/42/c9a21edf0e3daa7825ed04a4a8588686c6c14904344344a039556d78aa58/psycopg2_binary-2.9.11-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:ef7a6beb4beaa62f88592ccc65df20328029d721db309cb3250b0aae0fa146c3", size = 3652281, upload-time = "2025-10-10T11:12:17.713Z" }, - { url = "https://files.pythonhosted.org/packages/12/22/dedfbcfa97917982301496b6b5e5e6c5531d1f35dd2b488b08d1ebc52482/psycopg2_binary-2.9.11-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:31b32c457a6025e74d233957cc9736742ac5a6cb196c6b68499f6bb51390bd6a", size = 3298010, upload-time = 
"2025-10-10T11:12:22.671Z" }, - { url = "https://files.pythonhosted.org/packages/66/ea/d3390e6696276078bd01b2ece417deac954dfdd552d2edc3d03204416c0c/psycopg2_binary-2.9.11-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:edcb3aeb11cb4bf13a2af3c53a15b3d612edeb6409047ea0b5d6a21a9d744b34", size = 3044641, upload-time = "2025-10-30T02:55:19.929Z" }, - { url = "https://files.pythonhosted.org/packages/12/9a/0402ded6cbd321da0c0ba7d34dc12b29b14f5764c2fc10750daa38e825fc/psycopg2_binary-2.9.11-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:62b6d93d7c0b61a1dd6197d208ab613eb7dcfdcca0a49c42ceb082257991de9d", size = 3347940, upload-time = "2025-10-10T11:12:26.529Z" }, - { url = "https://files.pythonhosted.org/packages/b1/d2/99b55e85832ccde77b211738ff3925a5d73ad183c0b37bcbbe5a8ff04978/psycopg2_binary-2.9.11-cp312-cp312-win_amd64.whl", hash = "sha256:b33fabeb1fde21180479b2d4667e994de7bbf0eec22832ba5d9b5e4cf65b6c6d", size = 2714147, upload-time = "2025-10-10T11:12:29.535Z" }, - { url = "https://files.pythonhosted.org/packages/ff/a8/a2709681b3ac11b0b1786def10006b8995125ba268c9a54bea6f5ae8bd3e/psycopg2_binary-2.9.11-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:b8fb3db325435d34235b044b199e56cdf9ff41223a4b9752e8576465170bb38c", size = 3756572, upload-time = "2025-10-10T11:12:32.873Z" }, - { url = "https://files.pythonhosted.org/packages/62/e1/c2b38d256d0dafd32713e9f31982a5b028f4a3651f446be70785f484f472/psycopg2_binary-2.9.11-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:366df99e710a2acd90efed3764bb1e28df6c675d33a7fb40df9b7281694432ee", size = 3864529, upload-time = "2025-10-10T11:12:36.791Z" }, - { url = "https://files.pythonhosted.org/packages/11/32/b2ffe8f3853c181e88f0a157c5fb4e383102238d73c52ac6d93a5c8bffe6/psycopg2_binary-2.9.11-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8c55b385daa2f92cb64b12ec4536c66954ac53654c7f15a203578da4e78105c0", size = 4411242, upload-time = "2025-10-10T11:12:42.388Z" }, - { url = 
"https://files.pythonhosted.org/packages/10/04/6ca7477e6160ae258dc96f67c371157776564679aefd247b66f4661501a2/psycopg2_binary-2.9.11-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:c0377174bf1dd416993d16edc15357f6eb17ac998244cca19bc67cdc0e2e5766", size = 4468258, upload-time = "2025-10-10T11:12:48.654Z" }, - { url = "https://files.pythonhosted.org/packages/3c/7e/6a1a38f86412df101435809f225d57c1a021307dd0689f7a5e7fe83588b1/psycopg2_binary-2.9.11-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5c6ff3335ce08c75afaed19e08699e8aacf95d4a260b495a4a8545244fe2ceb3", size = 4166295, upload-time = "2025-10-10T11:12:52.525Z" }, - { url = "https://files.pythonhosted.org/packages/f2/7d/c07374c501b45f3579a9eb761cbf2604ddef3d96ad48679112c2c5aa9c25/psycopg2_binary-2.9.11-cp313-cp313-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:84011ba3109e06ac412f95399b704d3d6950e386b7994475b231cf61eec2fc1f", size = 3983133, upload-time = "2025-10-30T02:55:24.329Z" }, - { url = "https://files.pythonhosted.org/packages/82/56/993b7104cb8345ad7d4516538ccf8f0d0ac640b1ebd8c754a7b024e76878/psycopg2_binary-2.9.11-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:ba34475ceb08cccbdd98f6b46916917ae6eeb92b5ae111df10b544c3a4621dc4", size = 3652383, upload-time = "2025-10-10T11:12:56.387Z" }, - { url = "https://files.pythonhosted.org/packages/2d/ac/eaeb6029362fd8d454a27374d84c6866c82c33bfc24587b4face5a8e43ef/psycopg2_binary-2.9.11-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:b31e90fdd0f968c2de3b26ab014314fe814225b6c324f770952f7d38abf17e3c", size = 3298168, upload-time = "2025-10-10T11:13:00.403Z" }, - { url = "https://files.pythonhosted.org/packages/2b/39/50c3facc66bded9ada5cbc0de867499a703dc6bca6be03070b4e3b65da6c/psycopg2_binary-2.9.11-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:d526864e0f67f74937a8fce859bd56c979f5e2ec57ca7c627f5f1071ef7fee60", size = 3044712, upload-time = "2025-10-30T02:55:27.975Z" }, - { 
url = "https://files.pythonhosted.org/packages/9c/8e/b7de019a1f562f72ada81081a12823d3c1590bedc48d7d2559410a2763fe/psycopg2_binary-2.9.11-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:04195548662fa544626c8ea0f06561eb6203f1984ba5b4562764fbeb4c3d14b1", size = 3347549, upload-time = "2025-10-10T11:13:03.971Z" }, - { url = "https://files.pythonhosted.org/packages/80/2d/1bb683f64737bbb1f86c82b7359db1eb2be4e2c0c13b947f80efefa7d3e5/psycopg2_binary-2.9.11-cp313-cp313-win_amd64.whl", hash = "sha256:efff12b432179443f54e230fdf60de1f6cc726b6c832db8701227d089310e8aa", size = 2714215, upload-time = "2025-10-10T11:13:07.14Z" }, - { url = "https://files.pythonhosted.org/packages/64/12/93ef0098590cf51d9732b4f139533732565704f45bdc1ffa741b7c95fb54/psycopg2_binary-2.9.11-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:92e3b669236327083a2e33ccfa0d320dd01b9803b3e14dd986a4fc54aa00f4e1", size = 3756567, upload-time = "2025-10-10T11:13:11.885Z" }, - { url = "https://files.pythonhosted.org/packages/7c/a9/9d55c614a891288f15ca4b5209b09f0f01e3124056924e17b81b9fa054cc/psycopg2_binary-2.9.11-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:e0deeb03da539fa3577fcb0b3f2554a97f7e5477c246098dbb18091a4a01c16f", size = 3864755, upload-time = "2025-10-10T11:13:17.727Z" }, - { url = "https://files.pythonhosted.org/packages/13/1e/98874ce72fd29cbde93209977b196a2edae03f8490d1bd8158e7f1daf3a0/psycopg2_binary-2.9.11-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9b52a3f9bb540a3e4ec0f6ba6d31339727b2950c9772850d6545b7eae0b9d7c5", size = 4411646, upload-time = "2025-10-10T11:13:24.432Z" }, - { url = "https://files.pythonhosted.org/packages/5a/bd/a335ce6645334fb8d758cc358810defca14a1d19ffbc8a10bd38a2328565/psycopg2_binary-2.9.11-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:db4fd476874ccfdbb630a54426964959e58da4c61c9feba73e6094d51303d7d8", size = 4468701, upload-time = "2025-10-10T11:13:29.266Z" }, - { url = 
"https://files.pythonhosted.org/packages/44/d6/c8b4f53f34e295e45709b7568bf9b9407a612ea30387d35eb9fa84f269b4/psycopg2_binary-2.9.11-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:47f212c1d3be608a12937cc131bd85502954398aaa1320cb4c14421a0ffccf4c", size = 4166293, upload-time = "2025-10-10T11:13:33.336Z" }, - { url = "https://files.pythonhosted.org/packages/4b/e0/f8cc36eadd1b716ab36bb290618a3292e009867e5c97ce4aba908cb99644/psycopg2_binary-2.9.11-cp314-cp314-manylinux_2_38_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:e35b7abae2b0adab776add56111df1735ccc71406e56203515e228a8dc07089f", size = 3983184, upload-time = "2025-10-30T02:55:32.483Z" }, - { url = "https://files.pythonhosted.org/packages/53/3e/2a8fe18a4e61cfb3417da67b6318e12691772c0696d79434184a511906dc/psycopg2_binary-2.9.11-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:fcf21be3ce5f5659daefd2b3b3b6e4727b028221ddc94e6c1523425579664747", size = 3652650, upload-time = "2025-10-10T11:13:38.181Z" }, - { url = "https://files.pythonhosted.org/packages/76/36/03801461b31b29fe58d228c24388f999fe814dfc302856e0d17f97d7c54d/psycopg2_binary-2.9.11-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:9bd81e64e8de111237737b29d68039b9c813bdf520156af36d26819c9a979e5f", size = 3298663, upload-time = "2025-10-10T11:13:44.878Z" }, - { url = "https://files.pythonhosted.org/packages/97/77/21b0ea2e1a73aa5fa9222b2a6b8ba325c43c3a8d54272839c991f2345656/psycopg2_binary-2.9.11-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:32770a4d666fbdafab017086655bcddab791d7cb260a16679cc5a7338b64343b", size = 3044737, upload-time = "2025-10-30T02:55:35.69Z" }, - { url = "https://files.pythonhosted.org/packages/67/69/f36abe5f118c1dca6d3726ceae164b9356985805480731ac6712a63f24f0/psycopg2_binary-2.9.11-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:c3cb3a676873d7506825221045bd70e0427c905b9c8ee8d6acd70cfcbd6e576d", size = 3347643, upload-time = "2025-10-10T11:13:53.499Z" }, - { url = 
"https://files.pythonhosted.org/packages/e1/36/9c0c326fe3a4227953dfb29f5d0c8ae3b8eb8c1cd2967aa569f50cb3c61f/psycopg2_binary-2.9.11-cp314-cp314-win_amd64.whl", hash = "sha256:4012c9c954dfaccd28f94e84ab9f94e12df76b4afb22331b1f0d3154893a6316", size = 2803913, upload-time = "2025-10-10T11:13:57.058Z" }, -] - [[package]] name = "ptyprocess" version = "0.7.0"