deepgenomics · ovesh · Apr 24, 2026 · Apr 28, 2026 · gkaur · Apr 29, 2026
diff --git a/data-src/hg38.p2/NCBI/v107/add_gene_biotype_to_gff3.py b/data-src/hg38.p2/NCBI/v107/add_gene_biotype_to_gff3.py
@@ -0,0 +1,165 @@
+#!/usr/bin/env python3
+"""
+Add gene_biotype attribute to older NCBI RefSeq GFF3 files that lack it (e.g. v107).
+
+The biotype is inferred from:
+  - pseudo=true on gene lines -> "pseudogene"
+  - gbkey on transcript lines -> mRNA means "protein_coding",
+    ncRNA uses ncrna_class (lncRNA, miRNA, etc.), precursor_RNA -> "pre_miRNA",
+    misc_RNA -> "misc_RNA"
+  - If a gene has no transcripts, falls back to "misc_RNA"
+
+Two-pass approach:
+  1. First pass: collect transcript gbkey/ncrna_class per parent gene ID.
+  2. Second pass: write out the file, injecting gene_biotype on gene lines.
+
+Usage:
+    python add_gene_biotype_to_gff3.py input.gff3.gz output.gff3.gz
+    python add_gene_biotype_to_gff3.py input.gff3 output.gff3
+"""
+
+import argparse
+import gzip
+import sys
+from collections import defaultdict
+
+
+def open_maybe_gzip(path, mode="rt"):
+    if path.endswith(".gz"):
+        return gzip.open(path, mode)
+    return open(path, mode)
+
+
+def parse_attributes(attr_str):
+    """Parse GFF3 attribute string into a dict."""
+    attrs = {}
+    for item in attr_str.split(";"):
+        item = item.strip()
+        if not item:
+            continue
+        if "=" in item:
+            key, value = item.split("=", 1)
+            attrs[key] = value
+    return attrs
+
+
+def infer_biotype_from_transcript(gbkey, ncrna_class):
+    """Infer a gene_biotype string from transcript-level attributes."""
+    if gbkey == "mRNA":
+        return "protein_coding"
+    if gbkey == "ncRNA":
+        if ncrna_class:
+            # ncrna_class values like lncRNA, miRNA, etc. map directly
+            # to biotypes that GenomeKit knows (via as_biotype_compatible)
+            return ncrna_class
+        return "ncRNA"
+    if gbkey == "precursor_RNA":
+        return "pre_miRNA"
+    if gbkey == "rRNA":
+        return "rRNA"
+    if gbkey == "tRNA":
+        return "tRNA"
+    # misc_RNA, other
+    return "misc_RNA"
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Add gene_biotype attribute to NCBI RefSeq GFF3 files that lack it."
+    )
+    parser.add_argument("input", help="Input GFF3 file (optionally gzipped)")
+    parser.add_argument("output", help="Output GFF3 file (gzipped if .gz)")
+    args = parser.parse_args()
+
+    # --- Pass 1: collect transcript info per gene ---
+    # gene_id -> list of (gbkey, ncrna_class)
+    tran_info_by_gene = defaultdict(list)
+    # gene_id -> bool (has pseudo=true)
+    gene_is_pseudo = {}
+    # gene_id -> bool (already has gene_biotype)
+    gene_has_biotype = {}
+    # Track which gene IDs exist
+    gene_ids = set()
+
+    print(f"Pass 1: scanning {args.input} ...", file=sys.stderr)
+    with open_maybe_gzip(args.input) as f:
+        for line in f:
+            line = line.rstrip("\n")
+            if not line or line.startswith("#"):
+                continue
+
+            cols = line.split("\t")
+            if len(cols) != 9:
+                continue
+
+            attrs = parse_attributes(cols[8])
+            gff_id = attrs.get("ID", "")
+
+            # Detect gene lines
+            if cols[2] == "gene" or cols[2] == "pseudogene" or gff_id.startswith("gene"):
+                gene_ids.add(gff_id)
+                gene_is_pseudo[gff_id] = (
+                    attrs.get("pseudo", "") == "true" or cols[2] == "pseudogene"
+                )
+                gene_has_biotype[gff_id] = "gene_biotype" in attrs
+
+            # Detect transcript lines (rna IDs with Parent pointing to a gene)
+            elif gff_id.startswith("rna"):
+                parent = attrs.get("Parent", "")
+                if parent in gene_ids or parent.startswith("gene"):
+                    gbkey = attrs.get("gbkey", "")
+                    ncrna_class = attrs.get("ncrna_class", "")
+                    tran_info_by_gene[parent].append((gbkey, ncrna_class))
+
+    # --- Compute biotypes ---
+    gene_biotype = {}
+    for gid in gene_ids:
+        if gene_has_biotype[gid]:
+            continue  # already present, don't touch
+        if gene_is_pseudo[gid]:
+            gene_biotype[gid] = "pseudogene"
+        elif tran_info_by_gene[gid]:
+            # Use the first transcript to infer
+            gbkey, ncrna_class = tran_info_by_gene[gid][0]
+            gene_biotype[gid] = infer_biotype_from_transcript(gbkey, ncrna_class)
+        else:
+            gene_biotype[gid] = "misc_RNA"
+
+    n_added = len(gene_biotype)
+    n_already = sum(1 for v in gene_has_biotype.values() if v)
+    print(
+        f"  {len(gene_ids)} genes found, {n_already} already have gene_biotype, "
+        f"{n_added} will be added.",
+        file=sys.stderr,
+    )
+
+    # --- Pass 2: write output with gene_biotype injected ---
+    print(f"Pass 2: writing {args.output} ...", file=sys.stderr)
+    with open_maybe_gzip(args.input) as fin, open_maybe_gzip(args.output, "wt") as fout:
+        for line in fin:
+            line = line.rstrip("\n")
+            if not line or line.startswith("#"):
+                fout.write(line + "\n")
+                continue
+
+            cols = line.split("\t")
+            if len(cols) != 9:
+                fout.write(line + "\n")
+                continue
+
+            attrs = parse_attributes(cols[8])
+            gff_id = attrs.get("ID", "")
+
+            if gff_id in gene_biotype:
+                # Append gene_biotype to the attributes column
+                cols[8] = cols[8].rstrip(";") + f";gene_biotype={gene_biotype[gff_id]}"
+                fout.write("\t".join(cols) + "\n")
+            else:
+                fout.write(line + "\n")
+
+    print("Done.", file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
+
diff --git a/data-src/hg38.p2/NCBI/v107/fetch.sh b/data-src/hg38.p2/NCBI/v107/fetch.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+
+wget -O v107.src.gff3.gz https://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.107/GFF/ref_GRCh38.p2_top_level.gff3.gz
+# the gff is missing gene_biotype ; add it heuristically rather than supporting this in the builder
+python add_gene_biotype_to_gff3.py v107.src.gff3.gz v107.tmp.gff3.gz
+
+# NT_187507.1 isn't in hg38.p12.chromAlias.txt, which we are using for hg38.p2.
+# ok to remove it because NCBI says:
+# > Record suppressed. This RefSeq record was removed because the sequence was determined to have originated from Chinese hamster.
+# https://www.ncbi.nlm.nih.gov/nuccore/NT_187507.1?report=genbank
+
+# doing extra work of unzipping and gzipping again but saves code duplication and only takes a few seconds longer
+zgrep -v NT_187507.1 v107.tmp.gff3.gz | gzip > v107.gff3.gz
+rm v107.tmp.gff3.gz
diff --git a/data-src/hg38.p2/assembly/fetch.sh b/data-src/hg38.p2/assembly/fetch.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_22/GRCh38.p2.genome.fa.gz
+faToTwoBit GRCh38.p2.genome.fa.gz hg38.p2.2bit
+twoBitInfo hg38.p2.2bit stdout | sort -k2rn > hg38.p2.chrom.sizes
+
+# need chromAlias for ncbi, using hg38.p12.chromAlias.txt
+wget -O hg38.p2.chromAlias.txt https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/p12/hg38.p12.chromAlias.txt