Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions divref/divref/haplotype.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,33 @@ def get_range(i: hl.Expression) -> hl.Expression:
gnomad_freqs=ht.haplotype_indices.map(lambda i: ht.gnomad_freqs[i]),
)
return ht.drop("haplotype_indices")


def haplo_coordinates(
window_size: int,
variants: hl.Expression,
) -> hl.Expression:
"""
Compute the 0-based half-open reference genome coordinates of a haplotype sequence window.

The variant position coordinate is 1-based.

The window spans from `window_size` bases before the first variant to `window_size` bases
after the end of the last variant's reference allele — matching the flanking context added
by get_haplo_sequence.

Args:
window_size: Number of flanking reference bases on each side (same value passed to
get_haplo_sequence).
variants: Hail array expression of variant structs with locus and alleles fields.

Returns:
Hail struct expression with int32 fields `start` (inclusive) and `end` (exclusive).
"""
sorted_variants = hl.sorted(variants, key=lambda x: x.locus.position)
min_variant = sorted_variants[0]
max_variant = sorted_variants[-1]
return hl.struct(
start=min_variant.locus.position - 1 - window_size,
end=max_variant.locus.position - 1 + hl.len(max_variant.alleles[0]) + window_size,
)
6 changes: 4 additions & 2 deletions divref/divref/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
from divref.tools.compute_haplotype_statistics import compute_haplotype_statistics
from divref.tools.compute_haplotypes import compute_haplotypes
from divref.tools.compute_variation_ratios import compute_variation_ratios
from divref.tools.create_fasta_and_index import create_fasta_and_index
from divref.tools.create_divref_fasta import create_divref_fasta
from divref.tools.create_duckdb_index import create_duckdb_index
from divref.tools.create_gnomad_sites_vcf import create_gnomad_sites_vcf
from divref.tools.extract_gnomad_afs import extract_gnomad_afs
from divref.tools.extract_gnomad_single_afs import extract_gnomad_single_afs
Expand All @@ -20,7 +21,8 @@
compute_haplotypes,
compute_haplotype_statistics,
compute_variation_ratios,
create_fasta_and_index,
create_duckdb_index,
create_divref_fasta,
create_gnomad_sites_vcf,
extract_gnomad_afs,
extract_gnomad_single_afs,
Expand Down
51 changes: 51 additions & 0 deletions divref/divref/tools/create_divref_fasta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""Tool to write per-chromosome FASTA files from a DivRef DuckDB index."""

import logging
from pathlib import Path

import duckdb
import polars
from fgpyo.io import assert_path_is_readable
from fgpyo.io import assert_path_is_writable

logger = logging.getLogger(__name__)


def _write_fasta_files(df: polars.DataFrame, output_base: Path) -> None:
"""
Write one FASTA file per chromosome to {output_base}.{chrom}.fasta.

Args:
df: DataFrame with sequence_id, sequence, and variants columns.
output_base: Base path; chromosome name is appended as a suffix.
"""
for chrom in sorted(df["contig"].unique().to_list()):
logger.info("Creating FASTA for chromosome %s", chrom)
df_chrom = df.filter(df["contig"] == chrom)
out_path = Path(f"{output_base}.{chrom}.fasta")
with open(out_path, "w") as fasta_out:
for sequence_id, sequence in df_chrom.select("sequence_id", "sequence").iter_rows():
fasta_out.write(f">{sequence_id}\n{sequence}\n")
Comment on lines +22 to +28
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Don’t skip configured contigs that end up with zero sequences.

This loop only writes FASTAs for contigs present in df["contig"], but workflows/generate_divref.smk Lines 271-274 declare one output per configured chromosome. If a chromosome is filtered down to zero rows, no file gets created and the workflow fails on missing outputs. Pass the expected contig list into this tool and emit empty FASTAs for absent contigs.

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@divref/divref/tools/create_divref_fasta.py` around lines 22 - 28, The current
loop only iterates over contigs present in df and omits creating files for
configured contigs with zero rows; change the function to accept the expected
contig list (e.g., contigs or configured_contigs) and iterate over that list
instead of sorted(df["contig"].unique().to_list()), then for each contig build
df_chrom = df.filter(df["contig"] == chrom) and always create out_path =
Path(f"{output_base}.{chrom}.fasta"); if df_chrom has rows write the sequences
as before, otherwise write an empty FASTA (or a comment/header line) so the file
exists; update logging (logger.info) to show when an empty FASTA is emitted.



def create_divref_fasta(
*,
duckdb_path: Path,
output_base: Path,
) -> None:
"""
Write per-chromosome FASTA files from a DivRef DuckDB index.

Reads sequence_id, sequence, and variants from the sequences table and writes one FASTA
file per chromosome to {output_base}.{chrom}.fasta.

Args:
duckdb_path: Path to an existing DivRef DuckDB index.
output_base: Base path for output FASTA files; chromosome name is appended as a suffix.
"""
assert_path_is_readable(duckdb_path)
assert_path_is_writable(output_base)
con = duckdb.connect(str(duckdb_path), read_only=True)
df = con.execute("SELECT sequence_id, sequence, contig FROM sequences").pl()
con.close()
_write_fasta_files(df, output_base)
Loading
Loading