Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# The DPGT sources are tracked as a git submodule checked out at dpgt/DPGT.
[submodule "dpgt/DPGT"]
path = dpgt/DPGT
url = https://github.com/nttg8100/DPGT.git
4 changes: 2 additions & 2 deletions Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@ Genomics Kit integrates the utilities for bioinformatics analysis

1. [igv-report](./igv-report): Generate the IGV report html format
2. [spark-on-slurm](./spark-on-slurm/): Spark on SLURM cluster configuration, supported
- [Hail](https://hail.is/): Powering genomic analysis, at every scale
3. [glnexus](https://github.com/dnanexus-rnd/GLnexus): The joint variant calling for cohort vcf for deepvariant gvcf
3. [GLnexus](https://github.com/dnanexus-rnd/GLnexus): The joint variant calling for cohort vcf for deepvariant gvcf
4. [DPGT](https://github.com/BGI-flexlab/DPGT): DPGT is a distributed population genetics analysis tool which enabled joint calling on millions of WGS(whole genome sequencing) samples.
3 changes: 3 additions & 0 deletions common.mk
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Shared rules included by the per-tool Makefiles.
.PHONY: test

# Install pixi when the binary is not present at ${HOME}/.pixi/bin/pixi.
# -f makes curl exit non-zero on an HTTP error instead of piping an error page
# into sh. Because the exit status of a pipeline is that of `sh`, a failed
# download would otherwise still look like success, so the last line fails the
# rule unless the installer really produced the binary this target promises.
${HOME}/.pixi/bin/pixi:
	curl -fsSL https://pixi.sh/install.sh | sh
	test -x "$@"
3 changes: 3 additions & 0 deletions dpgt/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Log files
*.log
# Reference genome directory created by scripts/download.sh
reference
# Native build directory created by the Makefile and scripts/build_cpp.sh
build
1 change: 1 addition & 0 deletions dpgt/DPGT
Submodule DPGT added at bc8806
29 changes: 29 additions & 0 deletions dpgt/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
include ../common.mk

# Pixi binary used by every recipe; the rule that installs it is in ../common.mk.
PIXI := ${HOME}/.pixi/bin/pixi

.DEFAULT_GOAL := help
.PHONY: help clone build build-cpp build-java clean

# Default goal: list the targets this Makefile provides.
help:
	@echo "Targets:"
	@echo "  clone       Initialise the DPGT submodule (scripts/clone_DPGT.sh)"
	@echo "  build       clone + build-cpp + build-java"
	@echo "  build-cpp   Build the C++ native libraries into ./build"
	@echo "  build-java  Build the DPGT JAR with Maven (tests skipped)"
	@echo "  test        build, download inputs, run scripts/run_dpgt.sh"
	@echo "  clean       Remove build artifacts and results* outputs"

# Fetch the DPGT sources (submodule plus nested submodules).
clone:
	bash scripts/clone_DPGT.sh

# Full build. The clone -> build-cpp -> build-java ordering is carried by the
# prerequisites of the individual targets, so it also holds under `make -j`.
build: clone build-cpp build-java
	@echo "DPGT fully built!"

# The native build needs the DPGT sources, hence the dependency on clone.
build-cpp: clone $(PIXI)
	mkdir -p build
	@echo "Building C++ native libraries..."
	$(PIXI) run build-cpp

# Runs after build-cpp, mirroring the build-java task in pixi.toml.
build-java: build-cpp $(PIXI)
	mkdir -p build
	@echo "Building Java JAR package..."
	cd DPGT && $(PIXI) run mvn package -q -DskipTests
	@echo "Java build complete!"

# End-to-end smoke test: build, download the inputs, run joint genotyping.
# `test` is declared phony in ../common.mk.
test: build $(PIXI)
	$(PIXI) run bash scripts/download.sh
	$(PIXI) run bash scripts/run_dpgt.sh

# results* also covers the former results_cohort entry.
clean:
	@echo "Cleaning build artifacts..."
	rm -rf build DPGT/target vcf_input.list results*
	@echo "Cleaned!"
36 changes: 36 additions & 0 deletions dpgt/Readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# DPGT Cohort VCF (Pixi)

Build and run DPGT locally with Pixi, then generate a joint-genotyped cohort VCF from 3 samples.

## Quick Start

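```bash
pixi install
make build
pixi run bash scripts/download.sh
pixi run bash scripts/run_dpgt.sh
```

`make test` runs the build, download and run steps in one go.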

## What `run_dpgt.sh` does

- checks DPGT jar and native library
- runs joint genotyping in local mode
- writes cohort VCF to an output directory

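By default the output directory is `cohort_vcf/1KGP/results/`, and the script lists the `result*.vcf.gz` files it finds there. Example output from a run over `chr12:111760000-111765000`:

- `results_verify_1kgp_run_dpgt/result.chr12_111760000_111765000.0.vcf.gz`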

## Optional overrides

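```bash
INPUT_LIST=cohort_vcf/1KGP/gvcf_input.list \
REFERENCE_FASTA=reference/Homo_sapiens_assembly38.fasta \
OUTPUT_DIR=results/my_run \
TARGET_REGION=chr12:111760000-111765000 \
JOBS=4 \
bash scripts/run_dpgt.sh
```

If `OUTPUT_DIR` already exists and is not empty, the script stops unless `ALLOW_OVERWRITE=1` is set.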

## Notes

- 1KGP 3-sample run is validated and produces cohort VCF.
- GIAB inputs may fail due to `<NON_REF>` compatibility in genotyping.
6 changes: 6 additions & 0 deletions dpgt/cohort_vcf/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Compressed data files and their .tbi indexes
*.gz
*gz.tbi
# NOTE(review): GLnexus.DB looks carried over from a GLnexus setup — confirm it is needed here
GLnexus.DB
*.bcf
*.bed
# Output directories (scripts/run_dpgt.sh defaults to cohort_vcf/1KGP/results)
results
Empty file.
3 changes: 3 additions & 0 deletions dpgt/cohort_vcf/1KGP/gvcf_input.list
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
cohort_vcf/1KGP/data/NA21142.final.hard-filtered.gvcf.gz
cohort_vcf/1KGP/data/NA21143.final.hard-filtered.gvcf.gz
cohort_vcf/1KGP/data/NA21144.final.hard-filtered.gvcf.gz
Empty file.
3 changes: 3 additions & 0 deletions dpgt/cohort_vcf/GIAB/gvcf_input.list
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
cohort_vcf/GIAB/data/HG002.child.g.vcf.gz
cohort_vcf/GIAB/data/HG003.parent1.g.vcf.gz
cohort_vcf/GIAB/data/HG004.parent2.g.vcf.gz
2,769 changes: 2,769 additions & 0 deletions dpgt/pixi.lock

Large diffs are not rendered by default.

55 changes: 55 additions & 0 deletions dpgt/pixi.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Pixi workspace for building and running DPGT (linux-64 only, conda-forge channel).
[workspace]
name = "dpgt"
version = "1.3.2.0"
description = "DPGT: A Distributed Population Genetics analysis Tool with cohort VCF processing"
authors = ["BGI-flexlab"]
channels = ["conda-forge"]
platforms = ["linux-64"]

[dependencies]
# C++ build toolchain and libraries
cmake = ">=3.15"
make = "*"
compilers = "*"
boost = ">=1.74"
jemalloc = "*"

# Java (scripts/build_cpp.sh passes ${JAVA_HOME}/include to CMake)
openjdk = "8.*"

# htslib dependencies
zlib = "*"
bzip2 = "*"
xz = "*"
curl = "*"

# Build tools (pkg-config, maven), followed by runtime and data-download tools:
# scripts/download.sh uses `aws s3 cp`; gsutil only appears in its commented-out
# GIAB section. NOTE(review): pyspark is presumably needed by the Spark-based
# DPGT entry point — confirm.
pkg-config = "*"
maven = "*"
pyspark = "*"
awscliv2 = ">=2.3.1,<3"
google-cloud-storage = ">=3.10.1,<4"
gsutil = ">=5.36,<6"

[tasks]
# Build C++ libraries (delegates to scripts/build_cpp.sh)
build-cpp = { cmd = "bash scripts/build_cpp.sh", depends-on = [], description = "Build C++ native libraries" }

# Build Java package after the native libraries.
# Unlike the Makefile's build-java target, this does not pass -DskipTests.
build-java = { cmd = "cd DPGT && mvn package -q", depends-on = ["build-cpp"], description = "Build Java JAR package" }

# Full build: the work is done by the dependencies; the command itself only prints a banner.
build = { cmd = "echo 'Building DPGT...' && sleep 1", depends-on = ["build-cpp", "build-java"], description = "Full build (C++ + Java)" }

# Create input VCF list file
# NOTE(review): writes an absolute /scratch/... path, which looks specific to one machine — confirm.
prep-input = { cmd = "echo '/scratch/data/gkit/dpgt/cohort_vcf/1KGP/data/1KGP_ALDH2_5kb.bcf' > vcf_input.list && echo 'Created vcf_input.list with cohort VCF' && cat vcf_input.list", description = "Prepare input VCF list for DPGT" }

# Run DPGT on cohort VCF (local mode) - verify build
# Only invokes JointCallingSpark --help with build/lib on LD_LIBRARY_PATH; the
# trailing `|| echo` prints a hint when the preceding command fails.
run-local = { cmd = "export LD_LIBRARY_PATH=$(pwd)/build/lib:${LD_LIBRARY_PATH} && java -cp $(find DPGT/target -name 'dpgt*.jar' | head -1) org.bgi.flexlab.dpgt.jointcalling.JointCallingSpark --help || echo 'Build first with: pixi run build'", depends-on = ["build"], description = "Run DPGT help (verify build)" }

# Clean build artifacts
# Unlike the Makefile's clean target, this does not remove results* directories.
clean = { cmd = "rm -rf build DPGT/target vcf_input.list results_cohort", description = "Clean build artifacts and outputs" }

# Print setup information and the list of available tasks (does not change any state)
setup = { cmd = "echo '=== DPGT Setup ===' && echo '' && echo 'Cohort VCF available at: /scratch/data/gkit/dpgt/cohort_vcf/1KGP/' && echo '' && echo 'Available tasks:' && echo ' pixi run build - Build DPGT (C++ + Java)' && echo ' pixi run prep-input - Create VCF input list' && echo ' pixi run run-local - Verify build (show DPGT help)' && echo ' pixi run clean - Clean artifacts' && echo '' && echo 'Full workflow:' && echo ' pixi run build && pixi run prep-input && pixi run run-local'", description = "Display setup information" }

9 changes: 9 additions & 0 deletions dpgt/scripts/build_cpp.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
# Configure and compile the DPGT native (C++) libraries into ./build.
# Run from the dpgt/ directory (paths below are relative to it).
#
# Environment:
#   JAVA_HOME     JDK root; ${JAVA_HOME}/include is passed to CMake (required)
#   CONDA_PREFIX  passed to CMake as CMAKE_PREFIX_PATH (may be empty)
#   BUILD_JOBS    number of parallel make jobs (default: 8)
set -euo pipefail

# Without JAVA_HOME the JNI include path would silently become "/include".
: "${JAVA_HOME:?JAVA_HOME is not set; run this script through 'pixi run build-cpp'}"

echo "Building DPGT C++ libraries..."
mkdir -p build
cd build
cmake -DCMAKE_PREFIX_PATH="${CONDA_PREFIX:-}" -DJAVA_INCLUDE_PATH="${JAVA_HOME}/include" ../DPGT/src/main/native/
make -j "${BUILD_JOBS:-8}"
echo "C++ build completed successfully!"
46 changes: 46 additions & 0 deletions dpgt/scripts/clone_DPGT.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/env bash
# Initialise the DPGT submodule and its nested submodules over HTTPS.
#
# Environment:
#   DPGT_FORK_URL        URL to clone DPGT from
#                        (default: https://github.com/nttg8100/DPGT.git)
#   DPGT_SUBMODULE_NAME  name of the submodule as declared in the repository's
#                        .gitmodules (default: dpgt/DPGT)
set -euo pipefail

PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
DPGT_DIR="${PROJECT_ROOT}/DPGT"
FORK_URL="${DPGT_FORK_URL:-https://github.com/nttg8100/DPGT.git}"
# The config key is submodule.<name>.url, where <name> is the quoted name in
# .gitmodules ("dpgt/DPGT"), not the directory basename.
SUBMODULE_NAME="${DPGT_SUBMODULE_NAME:-dpgt/DPGT}"

echo "[clone] project root: ${PROJECT_ROOT}"
echo "[clone] fork url: ${FORK_URL}"

if ! git -C "${PROJECT_ROOT}" rev-parse --is-inside-work-tree >/dev/null 2>&1; then
    echo "[clone] ERROR: ${PROJECT_ROOT} is not a git repository"
    exit 1
fi

echo "[clone] init top-level DPGT submodule"
# Sync first: it rewrites an existing submodule URL from .gitmodules, so the
# fork override has to be written afterwards to take effect.
git -C "${PROJECT_ROOT}" submodule sync --recursive DPGT
git -C "${PROJECT_ROOT}" config "submodule.${SUBMODULE_NAME}.url" "${FORK_URL}"
# Not recursive on purpose: nested submodules are initialised further down,
# after their URLs have been rewritten to HTTPS.
git -C "${PROJECT_ROOT}" submodule update --init DPGT

if [ ! -d "${DPGT_DIR}" ]; then
    echo "[clone] ERROR: DPGT directory not found after init"
    exit 1
fi

if [ -f "${DPGT_DIR}/.gitmodules" ]; then
    echo "[clone] set DPGT origin to fork"
    git -C "${DPGT_DIR}" remote set-url origin "${FORK_URL}" || true

    echo "[clone] rewrite nested submodule URLs to HTTPS"
    # Edits the checked-out .gitmodules in place (the DPGT work tree becomes dirty).
    sed -i 's#git@github.com:#https://github.com/#g' "${DPGT_DIR}/.gitmodules"
    sed -i 's#git@gitlab.com:#https://gitlab.com/#g' "${DPGT_DIR}/.gitmodules"

    echo "[clone] sync nested submodule config"
    git -C "${DPGT_DIR}" submodule sync --recursive

    echo "[clone] clear cached nested submodule metadata"
    git -C "${DPGT_DIR}" submodule deinit -f --all || true
    # NOTE(review): when DPGT is checked out as a submodule, ${DPGT_DIR}/.git is
    # normally a gitfile rather than a directory, so this path likely does not
    # exist and the rm is a no-op — confirm whether the cache should be cleared.
    rm -rf "${DPGT_DIR}/.git/modules"

    echo "[clone] init nested DPGT submodules"
    git -C "${DPGT_DIR}" submodule update --init --recursive
fi

echo "[clone] done. nested submodule status:"
git -C "${DPGT_DIR}" submodule status --recursive || true
30 changes: 30 additions & 0 deletions dpgt/scripts/download.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env bash
# Download the inputs for the DPGT smoke test:
#   - the GRCh38 reference (fasta, .fai, .dict) into <dpgt>/reference
#   - three 1KGP gVCFs plus their .tbi indexes into <dpgt>/cohort_vcf/1KGP/data
# Requires the `aws` CLI; all buckets are read with --no-sign-request.
set -euo pipefail

# Resolve paths from the script location (as run_dpgt.sh does) so the files
# land where run_dpgt.sh looks for them regardless of the caller's cwd.
PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"

################ Download data ##############################
# Download reference
REFERENCE_DIR="${PROJECT_ROOT}/reference"
REFERENCE_S3="s3://ngi-igenomes/igenomes/Homo_sapiens/GATK/GRCh38/Sequence/WholeGenomeFasta"
mkdir -p "${REFERENCE_DIR}"
for ref_file in \
    Homo_sapiens_assembly38.fasta \
    Homo_sapiens_assembly38.fasta.fai \
    Homo_sapiens_assembly38.dict
do
    aws s3 cp --no-sign-request "${REFERENCE_S3}/${ref_file}" "${REFERENCE_DIR}/"
done

# # GIAB: has error due to gvcf format
# cd "${PROJECT_ROOT}/cohort_vcf/GIAB/data"
# gsutil cp gs://deepvariant/case-study-outputs/1.10.0/deeptrio/wgs/HG002.child.g.vcf.gz .
# gsutil cp gs://deepvariant/case-study-outputs/1.10.0/deeptrio/wgs/HG002.child.g.vcf.gz.tbi .

# gsutil cp gs://deepvariant/case-study-outputs/1.10.0/deeptrio/wgs/HG003.parent1.g.vcf.gz .
# gsutil cp gs://deepvariant/case-study-outputs/1.10.0/deeptrio/wgs/HG003.parent1.g.vcf.gz.tbi .

# gsutil cp gs://deepvariant/case-study-outputs/1.10.0/deeptrio/wgs/HG004.parent2.g.vcf.gz .
# gsutil cp gs://deepvariant/case-study-outputs/1.10.0/deeptrio/wgs/HG004.parent2.g.vcf.gz.tbi .


# 1KGP
KGP_DIR="${PROJECT_ROOT}/cohort_vcf/1KGP/data"
KGP_S3="s3://1000genomes-dragen-v4-2-7/data/individuals/hg38_alt_masked_graph_v3"
mkdir -p "${KGP_DIR}"
for sample in NA21144 NA21143 NA21142; do
    gvcf="${sample}.final.hard-filtered.gvcf.gz"
    aws s3 cp --no-sign-request "${KGP_S3}/${sample}/${gvcf}" "${KGP_DIR}/"
    aws s3 cp --no-sign-request "${KGP_S3}/${sample}/${gvcf}.tbi" "${KGP_DIR}/"
done
92 changes: 92 additions & 0 deletions dpgt/scripts/run_dpgt.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#!/bin/bash
# DPGT runner for joint genotyping cohort VCF
#
# Checks that the DPGT jar, the native library, the input list and the
# reference exist, then runs JointCallingSpark in local mode.
#
# Environment overrides (defaults in parentheses, relative to the dpgt/ directory):
#   DPGT_JAR         DPGT jar to run (DPGT/target/dpgt-1.3.2.0.jar)
#   INPUT_LIST       file with one gVCF path per line (cohort_vcf/1KGP/gvcf_input.list)
#   REFERENCE_FASTA  reference fasta (reference/Homo_sapiens_assembly38.fasta)
#   OUTPUT_DIR       output directory (cohort_vcf/1KGP/results)
#   TARGET_REGION    region passed to -l (chr12:111760000-111763759)
#   JOBS             value passed to -j (4)
#   ALLOW_OVERWRITE  set to 1 to delete a non-empty OUTPUT_DIR before running (0)
#
# NOTE(review): the default input list holds paths relative to the dpgt/
# directory, so the script presumably has to be started from there — confirm.

set -euo pipefail

PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
BUILD_LIB_PATH="${PROJECT_ROOT}/build/lib"

# Defaults (override with env vars if needed)
DPGT_JAR="${DPGT_JAR:-${PROJECT_ROOT}/DPGT/target/dpgt-1.3.2.0.jar}"
INPUT_LIST="${INPUT_LIST:-${PROJECT_ROOT}/cohort_vcf/1KGP/gvcf_input.list}"
REFERENCE_FASTA="${REFERENCE_FASTA:-${PROJECT_ROOT}/reference/Homo_sapiens_assembly38.fasta}"
OUTPUT_DIR="${OUTPUT_DIR:-${PROJECT_ROOT}/cohort_vcf/1KGP/results}"
TARGET_REGION="${TARGET_REGION:-chr12:111760000-111763759}"
JOBS="${JOBS:-4}"
ALLOW_OVERWRITE="${ALLOW_OVERWRITE:-0}"

echo "================================"
echo "DPGT Cohort VCF Runner"
echo "================================"
echo ""

# Check prerequisites
echo "Checking prerequisites..."
if [ ! -f "$DPGT_JAR" ]; then
    echo "ERROR: DPGT JAR not found at $DPGT_JAR"
    echo "Run 'make build' to compile DPGT first"
    exit 1
fi

if [ ! -f "$BUILD_LIB_PATH/libcdpgt.so" ]; then
    echo "ERROR: libcdpgt.so not found at $BUILD_LIB_PATH"
    echo "Run 'make build-cpp' to compile C++ libraries"
    exit 1
fi

if [ ! -f "$INPUT_LIST" ]; then
    echo "ERROR: input list not found: $INPUT_LIST"
    echo "Create it with one gVCF path per line (3-sample trio supported)."
    # Point at the list that actually ships with the project.
    echo "Example existing list: ${PROJECT_ROOT}/cohort_vcf/1KGP/gvcf_input.list"
    exit 1
fi

if [ ! -f "$REFERENCE_FASTA" ]; then
    echo "ERROR: reference fasta not found: $REFERENCE_FASTA"
    exit 1
fi

# Refuse to write into a non-empty output directory unless explicitly allowed.
if [ -d "$OUTPUT_DIR" ] && [ "$(ls -A "$OUTPUT_DIR" 2>/dev/null || true)" != "" ]; then
    if [ "$ALLOW_OVERWRITE" = "1" ]; then
        echo "Output exists. Removing: $OUTPUT_DIR"
        rm -rf "$OUTPUT_DIR"
    else
        echo "ERROR: output directory exists and is not empty: $OUTPUT_DIR"
        echo "Set ALLOW_OVERWRITE=1 or choose another OUTPUT_DIR"
        exit 1
    fi
fi

echo "DPGT JAR: $DPGT_JAR"
echo "C++ Library: $BUILD_LIB_PATH/libcdpgt.so"
echo "Input List: $INPUT_LIST"
echo "Reference: $REFERENCE_FASTA"
echo "Output Dir: $OUTPUT_DIR"
echo "Region: $TARGET_REGION"
echo ""
echo "Note: default region is a small smoke-test interval."
echo ""

# Runtime environment: put the native library directory on the loader path
export LD_LIBRARY_PATH="$BUILD_LIB_PATH:${LD_LIBRARY_PATH:-}"

echo "Running DPGT joint genotyping..."
echo ""

# Some environments need explicit local filesystem implementations for Spark/Hadoop
java \
    -Dspark.hadoop.fs.file.impl=org.apache.hadoop.fs.LocalFileSystem \
    -Dspark.hadoop.fs.AbstractFileSystem.file.impl=org.apache.hadoop.fs.local.LocalFs \
    -cp "$DPGT_JAR" \
    org.bgi.flexlab.dpgt.jointcalling.JointCallingSpark \
    -i "$INPUT_LIST" \
    -r "$REFERENCE_FASTA" \
    -o "$OUTPUT_DIR" \
    -j "$JOBS" \
    -l "$TARGET_REGION" \
    --local

echo ""
echo "Run complete."
echo "Output files in: $OUTPUT_DIR"
# Listing the results is informational only; never fail the run on it.
find "$OUTPUT_DIR" -maxdepth 1 -type f -name "result*.vcf.gz" -print || true
Loading