diff --git a/data-src/hg38.p2/NCBI/v107/add_gene_biotype_to_gff3.py b/data-src/hg38.p2/NCBI/v107/add_gene_biotype_to_gff3.py new file mode 100644 index 00000000..9ad2641e --- /dev/null +++ b/data-src/hg38.p2/NCBI/v107/add_gene_biotype_to_gff3.py @@ -0,0 +1,165 @@ +#!/usr/bin/env python3 +""" +Add gene_biotype attribute to older NCBI RefSeq GFF3 files that lack it (e.g. v107). + +The biotype is inferred from: + - pseudo=true on gene lines -> "pseudogene" + - gbkey on transcript lines -> mRNA means "protein_coding", + ncRNA uses ncrna_class (lncRNA, miRNA, etc.), precursor_RNA -> "pre_miRNA", + misc_RNA -> "misc_RNA" + - If a gene has no transcripts, falls back to "misc_RNA" + +Two-pass approach: + 1. First pass: collect transcript gbkey/ncrna_class per parent gene ID. + 2. Second pass: write out the file, injecting gene_biotype on gene lines. + +Usage: + python add_gene_biotype_to_gff3.py input.gff3.gz output.gff3.gz + python add_gene_biotype_to_gff3.py input.gff3 output.gff3 +""" + +import argparse +import gzip +import sys +from collections import defaultdict + + +def open_maybe_gzip(path, mode="rt"): + if path.endswith(".gz"): + return gzip.open(path, mode) + return open(path, mode) + + +def parse_attributes(attr_str): + """Parse GFF3 attribute string into a dict.""" + attrs = {} + for item in attr_str.split(";"): + item = item.strip() + if not item: + continue + if "=" in item: + key, value = item.split("=", 1) + attrs[key] = value + return attrs + + +def infer_biotype_from_transcript(gbkey, ncrna_class): + """Infer a gene_biotype string from transcript-level attributes.""" + if gbkey == "mRNA": + return "protein_coding" + if gbkey == "ncRNA": + if ncrna_class: + # ncrna_class values like lncRNA, miRNA, etc. map directly + # to biotypes that GenomeKit knows (via as_biotype_compatible) + return ncrna_class + return "ncRNA" + if gbkey == "precursor_RNA": + return "pre_miRNA" + if gbkey == "rRNA": + return "rRNA" + if gbkey == "tRNA": + return "tRNA" + # misc_RNA, other + return "misc_RNA" + + +def main(): + parser = argparse.ArgumentParser( + description="Add gene_biotype attribute to NCBI RefSeq GFF3 files that lack it." + ) + parser.add_argument("input", help="Input GFF3 file (optionally gzipped)") + parser.add_argument("output", help="Output GFF3 file (gzipped if .gz)") + args = parser.parse_args() + + # --- Pass 1: collect transcript info per gene --- + # gene_id -> list of (gbkey, ncrna_class) + tran_info_by_gene = defaultdict(list) + # gene_id -> bool (has pseudo=true) + gene_is_pseudo = {} + # gene_id -> bool (already has gene_biotype) + gene_has_biotype = {} + # Track which gene IDs exist + gene_ids = set() + + print(f"Pass 1: scanning {args.input} ...", file=sys.stderr) + with open_maybe_gzip(args.input) as f: + for line in f: + line = line.rstrip("\n") + if not line or line.startswith("#"): + continue + + cols = line.split("\t") + if len(cols) != 9: + continue + + attrs = parse_attributes(cols[8]) + gff_id = attrs.get("ID", "") + + # Detect gene lines + if cols[2] == "gene" or cols[2] == "pseudogene" or gff_id.startswith("gene"): + gene_ids.add(gff_id) + gene_is_pseudo[gff_id] = ( + attrs.get("pseudo", "") == "true" or cols[2] == "pseudogene" + ) + gene_has_biotype[gff_id] = "gene_biotype" in attrs + + # Detect transcript lines (rna IDs with Parent pointing to a gene) + elif gff_id.startswith("rna"): + parent = attrs.get("Parent", "") + if parent in gene_ids or parent.startswith("gene"): + gbkey = attrs.get("gbkey", "") + ncrna_class = attrs.get("ncrna_class", "") + tran_info_by_gene[parent].append((gbkey, ncrna_class)) + + # --- Compute biotypes --- + gene_biotype = {} + for gid in gene_ids: + if gene_has_biotype[gid]: + continue # already present, don't touch + if gene_is_pseudo[gid]: + gene_biotype[gid] = "pseudogene" + elif tran_info_by_gene[gid]: + # Use the first transcript to infer + gbkey, ncrna_class = tran_info_by_gene[gid][0] + gene_biotype[gid] = infer_biotype_from_transcript(gbkey, ncrna_class) + else: + gene_biotype[gid] = "misc_RNA" + + n_added = len(gene_biotype) + n_already = sum(1 for v in gene_has_biotype.values() if v) + print( + f" {len(gene_ids)} genes found, {n_already} already have gene_biotype, " + f"{n_added} will be added.", + file=sys.stderr, + ) + + # --- Pass 2: write output with gene_biotype injected --- + print(f"Pass 2: writing {args.output} ...", file=sys.stderr) + with open_maybe_gzip(args.input) as fin, open_maybe_gzip(args.output, "wt") as fout: + for line in fin: + line = line.rstrip("\n") + if not line or line.startswith("#"): + fout.write(line + "\n") + continue + + cols = line.split("\t") + if len(cols) != 9: + fout.write(line + "\n") + continue + + attrs = parse_attributes(cols[8]) + gff_id = attrs.get("ID", "") + + if gff_id in gene_biotype: + # Append gene_biotype to the attributes column + cols[8] = cols[8].rstrip(";") + f";gene_biotype={gene_biotype[gff_id]}" + fout.write("\t".join(cols) + "\n") + else: + fout.write(line + "\n") + + print("Done.", file=sys.stderr) + + +if __name__ == "__main__": + main() + diff --git a/data-src/hg38.p2/NCBI/v107/fetch.sh b/data-src/hg38.p2/NCBI/v107/fetch.sh new file mode 100644 index 00000000..8bdaa4f9 --- /dev/null +++ b/data-src/hg38.p2/NCBI/v107/fetch.sh @@ -0,0 +1,14 @@ +#!/usr/bin/env bash + +wget -O v107.src.gff3.gz https://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.107/GFF/ref_GRCh38.p2_top_level.gff3.gz +# the gff is missing gene_biotype ; add it heuristically rather than supporting this in the builder +python add_gene_biotype_to_gff3.py v107.src.gff3.gz v107.tmp.gff3.gz + +# NT_187507.1 isn't in hg38.p12.chromAlias.txt, which we are using for hg38.p2. +# ok to remove it because NCBI says: +# > Record suppressed. This RefSeq record was removed because the sequence was determined to have originated from Chinese hamster. +# https://www.ncbi.nlm.nih.gov/nuccore/NT_187507.1?report=genbank + +# doing extra work of unzipping and gzipping again but saves code duplication and only takes a few seconds longer +zgrep -v NT_187507.1 v107.tmp.gff3.gz | gzip > v107.gff3.gz +rm v107.tmp.gff3.gz diff --git a/data-src/hg38.p2/assembly/fetch.sh b/data-src/hg38.p2/assembly/fetch.sh new file mode 100644 index 00000000..53a7c170 --- /dev/null +++ b/data-src/hg38.p2/assembly/fetch.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_22/GRCh38.p2.genome.fa.gz +faToTwoBit GRCh38.p2.genome.fa.gz hg38.p2.2bit +twoBitInfo hg38.p2.2bit stdout | sort -k2rn > hg38.p2.chrom.sizes + +# need chromAlias for ncbi, using hg38.p12.chromAlias.txt +wget -O hg38.p2.chromAlias.txt https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/p12/hg38.p12.chromAlias.txt