Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
165 changes: 165 additions & 0 deletions data-src/hg38.p2/NCBI/v107/add_gene_biotype_to_gff3.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
#!/usr/bin/env python3
"""
Add gene_biotype attribute to older NCBI RefSeq GFF3 files that lack it (e.g. v107).

The biotype is inferred from:
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As specified here, this script produces inferred biotypes from GFF structure, whereas NCBI assigns biotypes using evidence-driven classification (Gnomon/EGAP). So results will only partially overlap with true RefSeq classifications.

- pseudo=true on gene lines -> "pseudogene"
- gbkey on transcript lines -> mRNA means "protein_coding",
ncRNA uses ncrna_class (lncRNA, miRNA, etc.), precursor_RNA -> "pre_miRNA",
misc_RNA -> "misc_RNA"
- If a gene has no transcripts, falls back to "misc_RNA"
Comment on lines +6 to +10
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't remember this stuff anymore, can maybe Gagan or someone review it as well?


Two-pass approach:
1. First pass: collect transcript gbkey/ncrna_class per parent gene ID.
2. Second pass: write out the file, injecting gene_biotype on gene lines.

Usage:
python add_gene_biotype_to_gff3.py input.gff3.gz output.gff3.gz
python add_gene_biotype_to_gff3.py input.gff3 output.gff3
"""

import argparse
import gzip
import sys
from collections import defaultdict


def open_maybe_gzip(path, mode="rt"):
if path.endswith(".gz"):
return gzip.open(path, mode)
return open(path, mode)


def parse_attributes(attr_str):
"""Parse GFF3 attribute string into a dict."""
attrs = {}
for item in attr_str.split(";"):
item = item.strip()
if not item:
continue
if "=" in item:
key, value = item.split("=", 1)
attrs[key] = value
return attrs


def infer_biotype_from_transcript(gbkey, ncrna_class):
"""Infer a gene_biotype string from transcript-level attributes."""
if gbkey == "mRNA":
return "protein_coding"
Comment on lines +48 to +49
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This assumes all mRNA entries are protein-coding. NCBI classification also considers CDS integrity and supporting evidence, so this may overcall protein-coding genes.

It would be great to add a check for CDS entries for the transcript

if gbkey == "ncRNA":
if ncrna_class:
# ncrna_class values like lncRNA, miRNA, etc. map directly
# to biotypes that GenomeKit knows (via as_biotype_compatible)
return ncrna_class
return "ncRNA"
if gbkey == "precursor_RNA":
return "pre_miRNA"
if gbkey == "rRNA":
return "rRNA"
if gbkey == "tRNA":
return "tRNA"
# misc_RNA, other
return "misc_RNA"
Comment on lines +62 to +63
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Defaulting to misc_RNA may be misleading, could we use unknown?



def main():
parser = argparse.ArgumentParser(
description="Add gene_biotype attribute to NCBI RefSeq GFF3 files that lack it."
)
parser.add_argument("input", help="Input GFF3 file (optionally gzipped)")
parser.add_argument("output", help="Output GFF3 file (gzipped if .gz)")
args = parser.parse_args()

# --- Pass 1: collect transcript info per gene ---
# gene_id -> list of (gbkey, ncrna_class)
tran_info_by_gene = defaultdict(list)
# gene_id -> bool (has pseudo=true)
gene_is_pseudo = {}
# gene_id -> bool (already has gene_biotype)
gene_has_biotype = {}
# Track which gene IDs exist
gene_ids = set()

print(f"Pass 1: scanning {args.input} ...", file=sys.stderr)
with open_maybe_gzip(args.input) as f:
for line in f:
line = line.rstrip("\n")
if not line or line.startswith("#"):
continue

cols = line.split("\t")
if len(cols) != 9:
continue

attrs = parse_attributes(cols[8])
gff_id = attrs.get("ID", "")

# Detect gene lines
if cols[2] == "gene" or cols[2] == "pseudogene" or gff_id.startswith("gene"):
gene_ids.add(gff_id)
gene_is_pseudo[gff_id] = (
attrs.get("pseudo", "") == "true" or cols[2] == "pseudogene"
)
gene_has_biotype[gff_id] = "gene_biotype" in attrs

# Detect transcript lines (rna IDs with Parent pointing to a gene)
elif gff_id.startswith("rna"):
parent = attrs.get("Parent", "")
if parent in gene_ids or parent.startswith("gene"):
gbkey = attrs.get("gbkey", "")
ncrna_class = attrs.get("ncrna_class", "")
tran_info_by_gene[parent].append((gbkey, ncrna_class))

# --- Compute biotypes ---
gene_biotype = {}
for gid in gene_ids:
if gene_has_biotype[gid]:
continue # already present, don't touch
if gene_is_pseudo[gid]:
gene_biotype[gid] = "pseudogene"
elif tran_info_by_gene[gid]:
# Use the first transcript to infer
gbkey, ncrna_class = tran_info_by_gene[gid][0]
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Uses only the first transcript to infer gene biotype. Should we consider aggregating across all transcripts and resolving conflicts (eg: prioritizing protein_coding if any transcript is coding)?

gene_biotype[gid] = infer_biotype_from_transcript(gbkey, ncrna_class)
else:
gene_biotype[gid] = "misc_RNA"

n_added = len(gene_biotype)
n_already = sum(1 for v in gene_has_biotype.values() if v)
print(
f" {len(gene_ids)} genes found, {n_already} already have gene_biotype, "
f"{n_added} will be added.",
file=sys.stderr,
)

# --- Pass 2: write output with gene_biotype injected ---
print(f"Pass 2: writing {args.output} ...", file=sys.stderr)
with open_maybe_gzip(args.input) as fin, open_maybe_gzip(args.output, "wt") as fout:
for line in fin:
line = line.rstrip("\n")
if not line or line.startswith("#"):
fout.write(line + "\n")
continue

cols = line.split("\t")
if len(cols) != 9:
fout.write(line + "\n")
continue

attrs = parse_attributes(cols[8])
gff_id = attrs.get("ID", "")

if gff_id in gene_biotype:
# Append gene_biotype to the attributes column
cols[8] = cols[8].rstrip(";") + f";gene_biotype={gene_biotype[gff_id]}"
fout.write("\t".join(cols) + "\n")
else:
fout.write(line + "\n")

print("Done.", file=sys.stderr)


if __name__ == "__main__":
main()

14 changes: 14 additions & 0 deletions data-src/hg38.p2/NCBI/v107/fetch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/usr/bin/env bash

wget -O v107.src.gff3.gz https://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.107/GFF/ref_GRCh38.p2_top_level.gff3.gz
# the gff is missing gene_biotype ; add it heuristically rather than supporting this in the builder
python add_gene_biotype_to_gff3.py v107.src.gff3.gz v107.tmp.gff3.gz

# NT_187507.1 isn't in hg38.p12.chromAlias.txt, which we are using for hg38.p2.
# ok to remove it because NCBI says:
# > Record suppressed. This RefSeq record was removed because the sequence was determined to have originated from Chinese hamster.
# https://www.ncbi.nlm.nih.gov/nuccore/NT_187507.1?report=genbank

# doing extra work of unzipping and gzipping again but saves code duplication and only takes a few seconds longer
zgrep -v NT_187507.1 v107.tmp.gff3.gz | gzip > v107.gff3.gz
rm v107.tmp.gff3.gz
8 changes: 8 additions & 0 deletions data-src/hg38.p2/assembly/fetch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/usr/bin/env bash

wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_22/GRCh38.p2.genome.fa.gz
faToTwoBit GRCh38.p2.genome.fa.gz hg38.p2.2bit
twoBitInfo hg38.p2.2bit stdout | sort -k2rn > hg38.p2.chrom.sizes

# need chromAlias for ncbi, using hg38.p12.chromAlias.txt
wget -O hg38.p2.chromAlias.txt https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/p12/hg38.p12.chromAlias.txt