fg-labs · ameynert · Apr 29, 2026 · Apr 23, 2026 · Apr 23, 2026 · Apr 23, 2026
diff --git a/divref/divref/haplotype.py b/divref/divref/haplotype.py
@@ -158,3 +158,33 @@ def get_range(i: hl.Expression) -> hl.Expression:
         gnomad_freqs=ht.haplotype_indices.map(lambda i: ht.gnomad_freqs[i]),
     )
     return ht.drop("haplotype_indices")
+
+
+def haplo_coordinates(
+    window_size: int,
+    variants: hl.Expression,
+) -> hl.Expression:
+    """
+    Compute the 0-based half-open reference genome coordinates of a haplotype sequence window.
+
+    The variant position coordinate is 1-based.
+
+    The window spans from `window_size` bases before the first variant to `window_size` bases
+    after the end of the last variant's reference allele — matching the flanking context added
+    by get_haplo_sequence.
+
+    Args:
+        window_size: Number of flanking reference bases on each side (same value passed to
+            get_haplo_sequence).
+        variants: Hail array expression of variant structs with locus and alleles fields.
+
+    Returns:
+        Hail struct expression with int32 fields `start` (inclusive) and `end` (exclusive).
+    """
+    sorted_variants = hl.sorted(variants, key=lambda x: x.locus.position)
+    min_variant = sorted_variants[0]
+    max_variant = sorted_variants[-1]
+    return hl.struct(
+        start=min_variant.locus.position - 1 - window_size,
+        end=max_variant.locus.position - 1 + hl.len(max_variant.alleles[0]) + window_size,
+    )
diff --git a/divref/divref/main.py b/divref/divref/main.py
@@ -8,7 +8,8 @@
 from divref.tools.compute_haplotype_statistics import compute_haplotype_statistics
 from divref.tools.compute_haplotypes import compute_haplotypes
 from divref.tools.compute_variation_ratios import compute_variation_ratios
-from divref.tools.create_fasta_and_index import create_fasta_and_index
+from divref.tools.create_divref_fasta import create_divref_fasta
+from divref.tools.create_duckdb_index import create_duckdb_index
 from divref.tools.create_gnomad_sites_vcf import create_gnomad_sites_vcf
 from divref.tools.extract_gnomad_afs import extract_gnomad_afs
 from divref.tools.extract_gnomad_single_afs import extract_gnomad_single_afs
@@ -20,7 +21,8 @@
     compute_haplotypes,
     compute_haplotype_statistics,
     compute_variation_ratios,
-    create_fasta_and_index,
+    create_duckdb_index,
+    create_divref_fasta,
     create_gnomad_sites_vcf,
     extract_gnomad_afs,
     extract_gnomad_single_afs,

diff --git a/divref/divref/tools/create_divref_fasta.py b/divref/divref/tools/create_divref_fasta.py
@@ -0,0 +1,51 @@
+"""Tool to write per-chromosome FASTA files from a DivRef DuckDB index."""
+
+import logging
+from pathlib import Path
+
+import duckdb
+import polars
+from fgpyo.io import assert_path_is_readable
+from fgpyo.io import assert_path_is_writable
+
+logger = logging.getLogger(__name__)
+
+
+def _write_fasta_files(df: polars.DataFrame, output_base: Path) -> None:
+    """
+    Write one FASTA file per chromosome to {output_base}.{chrom}.fasta.
+
+    Args:
+        df: DataFrame with sequence_id, sequence, and variants columns.
+        output_base: Base path; chromosome name is appended as a suffix.
+    """
+    for chrom in sorted(df["contig"].unique().to_list()):
+        logger.info("Creating FASTA for chromosome %s", chrom)
+        df_chrom = df.filter(df["contig"] == chrom)
+        out_path = Path(f"{output_base}.{chrom}.fasta")
+        with open(out_path, "w") as fasta_out:
+            for sequence_id, sequence in df_chrom.select("sequence_id", "sequence").iter_rows():
+                fasta_out.write(f">{sequence_id}\n{sequence}\n")
+
+
+def create_divref_fasta(
+    *,
+    duckdb_path: Path,
+    output_base: Path,
+) -> None:
+    """
+    Write per-chromosome FASTA files from a DivRef DuckDB index.
+
+    Reads sequence_id, sequence, and variants from the sequences table and writes one FASTA
+    file per chromosome to {output_base}.{chrom}.fasta.
+
+    Args:
+        duckdb_path: Path to an existing DivRef DuckDB index.
+        output_base: Base path for output FASTA files; chromosome name is appended as a suffix.
+    """
+    assert_path_is_readable(duckdb_path)
+    assert_path_is_writable(output_base)
+    con = duckdb.connect(str(duckdb_path), read_only=True)
+    df = con.execute("SELECT sequence_id, sequence, contig FROM sequences").pl()
+    con.close()
+    _write_fasta_files(df, output_base)