From 7aad433d5b7c28ab8b1c4f93a3afe99dfaf0c62b Mon Sep 17 00:00:00 2001
From: Lawrence Babb <lbabb@broadinstitute.org>
Date: Sun, 22 Mar 2026 15:54:44 -0400
Subject: [PATCH] Update documentation, add setup files, and improve gitignore

Significantly expand README with UTA troubleshooting, port conflict guidance,
dummy AWS credentials documentation, and release processing script usage.
Add uta-setup.sql required by the compose file and CLAUDE.md for AI-assisted
development. Fix SeqRepo volume mount path in compose file.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .gitignore                        |  3 +
 CLAUDE.md                         | 99 +++++++++++++++++++++++++++++++
 README.md                         | 95 +++++++++++++++++++++++------
 uta-setup.sql                     | 27 +++++++++
 variation-normalizer-compose.yaml |  4 +-
 5 files changed, 209 insertions(+), 19 deletions(-)
 create mode 100644 CLAUDE.md
 create mode 100644 uta-setup.sql

diff --git a/.gitignore b/.gitignore
index d5526bb..4d0840e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -164,3 +164,6 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/
+buckets/
+uv.lock
+.claude/settings.local.json
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..b176030
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,99 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Project Overview
+
+This is a Python project that converts ClinVar variant-identity data into GA4GH GKS (Genomic Knowledge Standards) forms. The project processes NDJSON files containing ClinVar variants and converts them to VRS (Variation Representation Specification) format using the variation-normalization library.
+
+## Common Development Commands
+
+### Installation
+```bash
+pip install -e '.[dev]'
+```
+
+### Testing
+```bash
+pytest
+pytest test/test_cli.py::test_parse_args  # Run specific test
+```
+
+### Code Quality
+```bash
+./lint.sh                # Check code quality (black, isort, ruff, pylint)
+./lint.sh apply          # Apply automatic fixes
+```
+
+### Running the Application
+```bash
+# Process a local file
+clinvar-gk-pilot --filename sample-input.ndjson.gz --parallelism 4
+
+# Process a file from Google Cloud Storage
+clinvar-gk-pilot --filename gs://clinvar-gks/2025-07-06/dev/vi.json.gz --parallelism 4
+
+# Enable liftover for genomic coordinate conversion
+clinvar-gk-pilot --filename input.ndjson.gz --parallelism 2 --liftover
+```
+
+## Architecture
+
+### Core Processing Pipeline
+- **Input**: GZIP-compressed NDJSON files with ClinVar variant data
+- **Processing**: Converts variants to VRS format using variation-normalization library
+- **Output**: GZIP-compressed NDJSON files with input/output pairs
+
+### Key Components
+- `clinvar_gk_pilot/main.py`: Core processing logic with multiprocessing support
+- `clinvar_gk_pilot/cli.py`: Command-line argument parsing
+- `clinvar_gk_pilot/gcs.py`: Google Cloud Storage download utilities
+- Uses variation-normalization library for VRS conversion
+- Supports three variant types: Allele, CopyNumberChange, CopyNumberCount
+
+### Parallel Processing
+- Uses Python multiprocessing with configurable worker count
+- Files are partitioned by line count across workers
+- Each worker runs with timeout protection (10 seconds per variant)
+- Workers use persistent async event loops for variation-normalization queries
+
+## Required Environment Variables
+
+```bash
+# SeqRepo configuration
+export SEQREPO_ROOT_DIR=/usr/local/share/seqrepo/2024-12-20
+export SEQREPO_DATAPROXY_URL=seqrepo+file://${SEQREPO_ROOT_DIR}
+
+# Database URLs (from Docker compose services)
+export UTA_DB_URL=postgresql://anonymous:anonymous@localhost:5434/uta/uta_20241220
+export GENE_NORM_DB_URL=http://localhost:8000
+```
+
+## External Dependencies
+
+### Required Services
+The project requires the Docker services defined in the `variation-normalizer-compose.yaml` included in this repository:
+```bash
+docker volume create uta_vol
+docker compose -f variation-normalizer-compose.yaml up -d
+```
+
+This starts:
+- UTA database (port 5434): Universal Transcript Archive
+- Gene Normalizer database (port 8000): Gene normalization service
+- Variation Normalizer API (port 8001): Not required by this project, but started by the compose file
+
+### Memory Considerations
+When using `--liftover` with high parallelism, increase Docker shared memory:
+```yaml
+services:
+  uta:
+    shm_size: 256m
+```
+
+## File Structure Notes
+
+- Input files: Expected to be GZIP-compressed NDJSON format
+- Output location: Files written to `output/` directory with same path structure
+- GCS files: Auto-downloaded to `buckets/` directory with bucket name preserved
+- Logs: Created per-worker as `{input_file}.log`
\ No newline at end of file
diff --git a/README.md b/README.md
index 55be25e..daf7fea 100644
--- a/README.md
+++ b/README.md
@@ -6,10 +6,10 @@ Project for reading and normalizing ClinVar variants into GA4GH GKS forms.
 
 ### Prerequisites
 
-1. **Docker** (or podman) - Required to run the variation-normalization services
-2. **Python 3.11+** - Required for the main application
-3. **SeqRepo database** - Local sequence repository
-4. **UTA database** - Local Universal Transcript Archive (only needed for liftover)
+1. **Docker** (or podman) - Required to run the UTA and Gene Normalizer database services
+1. **Python 3.11+** - Required for the main application
+1. **SeqRepo database** - Local sequence repository
+1. **UTA database** - Universal Transcript Archive (required; also used for liftover)
 
 ## Installation
 
@@ -27,39 +27,77 @@ pip install -e '.[dev]'
 
 ### Database Services Setup
 
-This project requires several database services that can be easily set up using the Docker compose configuration from the variation-normalization project.
+This project requires two database services (UTA and Gene Normalizer) that can be set up using the `variation-normalizer-compose.yaml` included in this repository. The compose file also includes a Variation Normalizer API service, but this project uses the variation-normalization Python library directly and does not require the API container.
 
-1. Download the compose.yaml file from variation-normalization v0.15.0 (matching the version in pyproject.toml):
+Before starting, update the SeqRepo volume mount in `variation-normalizer-compose.yaml` to point to your local SeqRepo installation. The `uta-setup.sql` file referenced by the compose file is also included in this repository.
+
+1. Create the external volume required by the UTA service:
 
 ```bash
-curl -o variation-normalizer-compose.yaml https://raw.githubusercontent.com/cancervariants/variation-normalization/0.15.0/compose.yaml
+docker volume create uta_vol
 ```
+(*or `podman volume create uta_vol` for podman*)
 
-2. Start the required services:
+2. Start the required services:
 
 ```bash
 docker compose -f variation-normalizer-compose.yaml up -d
 ```
-(*or `uvx podman-compose` for podman*)
+(*or `uvx podman-compose -f variation-normalizer-compose.yaml up -d` for podman*)
 
 This will start:
-- **UTA database** (port 5432): Universal Transcript Archive for transcript mapping
+- **UTA database** (port 5434): Universal Transcript Archive for transcript mapping
 - **Gene Normalizer database** (port 8000): Gene normalization service
-- **Variation Normalizer API** (port 8001): Variation normalization service
+- **Variation Normalizer API** (port 8001): Not required by this project, but started by the compose file
+
+#### Known Issue: UTA Data Download Failure
+
+The UTA container downloads a large database dump (~344MB) from `dl.biocommons.org` on first startup. This download may fail due to a human-verification gate on the biocommons download server, resulting in the UTA schema not being loaded. You can check by running:
+
+```bash
+psql -XAt postgres://anonymous@localhost:5434/uta -c 'select count(*) from uta_20241220.transcript'
+# Expected output: 329090
+```
+
+If the schema is missing, you'll need to download the dump manually and restore it:
+
+```bash
+# Download the dump (you may need to open this URL in a browser first to pass verification)
+curl -L -o /tmp/uta_20241220.pgd.gz https://dl.biocommons.org/uta/uta_20241220.pgd.gz
+
+# Verify it's a valid gzip file (should say "gzip compressed data", not "HTML document")
+file /tmp/uta_20241220.pgd.gz
+
+# Copy into the container and restore
+docker cp /tmp/uta_20241220.pgd.gz <uta_container_name>:/tmp/uta_20241220.pgd.gz
+docker exec <uta_container_name> bash -c \
+  'gzip -cdq /tmp/uta_20241220.pgd.gz | psql -1e -U uta_admin -d uta -v ON_ERROR_STOP=1'
+```
 
-**Note on Port Conflicts**: If you already have services running on these ports, you can modify the port mappings in `variation-normalizer-compose.yaml`:
-- For UTA database: Change `5432:5432` to `5433:5432` (or another available port)
+The restore takes several minutes (longer under architecture emulation, e.g. amd64 images on Apple Silicon).
+
+#### Port Conflicts
+
+Before starting the services, check for existing containers using the same ports:
+
+```bash
+docker ps -a | grep -E '5434|8000|8001'
+```
+
+If you have conflicts, you can modify the port mappings in `variation-normalizer-compose.yaml`:
+
+- For UTA database: Change `5434:5432` to another available port (e.g., `5433:5432`)
 - For Gene Normalizer: Change `8000:8000` to `8002:8000` (or another available port)
 - For Variation Normalizer API: Change `8001:80` to `8003:80` (or another available port)
 
-Verify containers are running on the desired ports, e.g. the UTA postgres is running on host port 5433 and the gene normalizer db is on port 8000:
-```
+Verify the required containers are running:
+```bash
 docker ps -a | grep 'uta\|gene-norm'
 ```
 
 ### Environment Configuration
 
-Set up the required environment variables. You can use the provided `env.sh` as a reference:
+Set up the required environment variables:
 
 ```bash
 # SeqRepo configuration - Update path to your local SeqRepo installation
@@ -67,12 +105,19 @@ export SEQREPO_ROOT_DIR=/usr/local/share/seqrepo/2024-12-20
 export SEQREPO_DATAPROXY_URL=seqrepo+file://${SEQREPO_ROOT_DIR}
 
 # Database URLs (using the Docker compose services)
-export UTA_DB_URL=postgresql://anonymous:anonymous@localhost:5432/uta/uta_20241220
+export UTA_DB_URL=postgresql://anonymous:anonymous@localhost:5434/uta/uta_20241220
 export GENE_NORM_DB_URL=http://localhost:8000
+
+# Dummy AWS credentials required by the Gene Normalizer local DynamoDB
+export AWS_ACCESS_KEY_ID=DUMMYIDEXAMPLE
+export AWS_SECRET_ACCESS_KEY=DUMMYEXAMPLEKEY
+export AWS_DEFAULT_REGION=us-east-2
 ```
 
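-**Important**: If you modified the ports in the compose file, update the corresponding environment variables accordingly (e.g., change `5432` to `5433` in `UTA_DB_URL` if you changed the UTA port).
+**Important**: If you modified the ports in the compose file, update the corresponding environment variables accordingly (e.g., change `5434` to `5433` in `UTA_DB_URL` if you changed the UTA port).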
 
+The AWS credentials are not real credentials — they are dummy values required by the local DynamoDB instance used by the Gene Normalizer. Without them, the application will fail with `NoCredentialsError` or a `302` error when connecting to the Gene Normalizer database.
+
 ### Python Installation
 
 Install the project and its dependencies:
@@ -114,6 +159,22 @@ Process a file from Google Cloud Storage:
 clinvar-gk-pilot --filename gs://clinvar-gks/2025-07-06/dev/vi.json.gz --parallelism 4
 ```
 
+### Release Processing Script
+
+The `misc/clinvar-vrsification` script is a convenience wrapper for processing a ClinVar release and uploading the results back to GCS. It takes a release date as its only argument:
+
+```bash
+./misc/clinvar-vrsification 2025-07-06
+```
+
+This will:
+
+1. Download and normalize `gs://clinvar-gks/<date>/dev/vi.jsonl.gz` with parallelism 2 (no liftover)
+1. Log output to `<date>-noliftover.log`
+1. Upload the result to `gs://clinvar-gks/<date>/dev/vi-normalized-no-liftover.jsonl.gz`
+
+Requires `gcloud` CLI configured with write access to the `clinvar-gks` bucket.
+
 ### Parallelism
 
 Parallelism is configurable and uses python multiprocessing and multiprocessing queues. Some parallelism is significantly beneficial but since there is interprocess communication overhead and they are hitting the same filesystem there can be diminishing returns. On a Macbook Pro with 16 cores, setting parallelism to 4-6 provides clear benefit, but exceeding 10 saturates the machine and may be counterproductive. The code will partition the input file into `<parallelism>` number of files and each worker will process one, and then the outputs will be combined.
diff --git a/uta-setup.sql b/uta-setup.sql
new file mode 100644
index 0000000..a902f7b
--- /dev/null
+++ b/uta-setup.sql
@@ -0,0 +1,27 @@
+\c uta;
+-- Pair each exon of a transcript's 'transcript' exon set (te) with the same-ordinal exon (ae)
+-- of its other alignments. IF NOT EXISTS lets the script be re-run without "already exists" errors.
+CREATE TABLE IF NOT EXISTS uta_20241220.genomic AS
+SELECT t.hgnc, aes.alt_ac, aes.alt_aln_method, aes.alt_strand,
+    ae.start_i AS alt_start_i, ae.end_i AS alt_end_i
+FROM uta_20241220.transcript AS t
+INNER JOIN uta_20241220.exon_set AS tes
+    ON t.ac = tes.tx_ac AND tes.alt_aln_method = 'transcript'
+INNER JOIN uta_20241220.exon_set AS aes
+    ON t.ac = aes.tx_ac AND aes.alt_aln_method <> 'transcript'
+INNER JOIN uta_20241220.exon AS te
+    ON tes.exon_set_id = te.exon_set_id
+INNER JOIN uta_20241220.exon AS ae
+    ON aes.exon_set_id = ae.exon_set_id AND te.ord = ae.ord
+LEFT JOIN uta_20241220.exon_aln AS ea  -- no ea column is selected
+    ON te.exon_id = ea.tx_exon_id AND ae.exon_id = ea.alt_exon_id;
+
+CREATE INDEX IF NOT EXISTS alt_pos_index ON uta_20241220.genomic (alt_ac, alt_start_i, alt_end_i);
+CREATE INDEX IF NOT EXISTS gene_alt_index ON uta_20241220.genomic (hgnc, alt_ac);
+CREATE INDEX IF NOT EXISTS alt_ac_index ON uta_20241220.genomic (alt_ac);
+
+-- Give the anonymous role access to, and ownership of, the uta database and schema.
+GRANT CONNECT ON DATABASE uta TO anonymous;
+GRANT SELECT, INSERT, UPDATE, DELETE ON ALL TABLES IN SCHEMA uta_20241220 TO anonymous;
+ALTER DATABASE uta OWNER TO anonymous;
+ALTER SCHEMA uta_20241220 OWNER TO anonymous;
diff --git a/variation-normalizer-compose.yaml b/variation-normalizer-compose.yaml
index 9420f67..b69b07f 100644
--- a/variation-normalizer-compose.yaml
+++ b/variation-normalizer-compose.yaml
@@ -21,8 +21,8 @@ services:
 
     volumes:
       - uta_vol:/var/lib/postgresql/data
-      # - /usr/local/share/seqrepo:/usr/local/share/seqrepo
-      - /Users/kferrite/dev/data/seqrepo:/usr/local/share/seqrepo:ro
+      # Update this path to your local SeqRepo installation
+      - /usr/local/share/seqrepo:/usr/local/share/seqrepo:ro
 
   uta:
     # Test: