-
Notifications
You must be signed in to change notification settings - Fork 7
chore: add ncbi_refseq.v107 and associated assembly hg38.2 #214
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,165 @@ | ||
| #!/usr/bin/env python3 | ||
| """ | ||
| Add gene_biotype attribute to older NCBI RefSeq GFF3 files that lack it (e.g. v107). | ||
|
|
||
| The biotype is inferred from: | ||
| - pseudo=true on gene lines -> "pseudogene" | ||
| - gbkey on transcript lines -> mRNA means "protein_coding", | ||
| ncRNA uses ncrna_class (lncRNA, miRNA, etc.), precursor_RNA -> "pre_miRNA", | ||
| misc_RNA -> "misc_RNA" | ||
| - If a gene has no transcripts, falls back to "misc_RNA" | ||
|
Comment on lines
+6
to
+10
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't remember this stuff anymore, can maybe Gagan or someone review it as well? |
||
|
|
||
| Two-pass approach: | ||
| 1. First pass: collect transcript gbkey/ncrna_class per parent gene ID. | ||
| 2. Second pass: write out the file, injecting gene_biotype on gene lines. | ||
|
|
||
| Usage: | ||
| python add_gene_biotype_to_gff3.py input.gff3.gz output.gff3.gz | ||
| python add_gene_biotype_to_gff3.py input.gff3 output.gff3 | ||
| """ | ||
|
|
||
| import argparse | ||
| import gzip | ||
| import sys | ||
| from collections import defaultdict | ||
|
|
||
|
|
||
| def open_maybe_gzip(path, mode="rt"): | ||
| if path.endswith(".gz"): | ||
| return gzip.open(path, mode) | ||
| return open(path, mode) | ||
|
|
||
|
|
||
| def parse_attributes(attr_str): | ||
| """Parse GFF3 attribute string into a dict.""" | ||
| attrs = {} | ||
| for item in attr_str.split(";"): | ||
| item = item.strip() | ||
| if not item: | ||
| continue | ||
| if "=" in item: | ||
| key, value = item.split("=", 1) | ||
| attrs[key] = value | ||
| return attrs | ||
|
|
||
|
|
||
| def infer_biotype_from_transcript(gbkey, ncrna_class): | ||
| """Infer a gene_biotype string from transcript-level attributes.""" | ||
| if gbkey == "mRNA": | ||
| return "protein_coding" | ||
|
Comment on lines
+48
to
+49
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This assumes all mRNA entries are protein-coding. NCBI classification also considers CDS integrity and supporting evidence, so this may overcall protein-coding genes. It would be great to add a check for CDS entries for the transcript |
||
| if gbkey == "ncRNA": | ||
| if ncrna_class: | ||
| # ncrna_class values like lncRNA, miRNA, etc. map directly | ||
| # to biotypes that GenomeKit knows (via as_biotype_compatible) | ||
| return ncrna_class | ||
| return "ncRNA" | ||
| if gbkey == "precursor_RNA": | ||
| return "pre_miRNA" | ||
| if gbkey == "rRNA": | ||
| return "rRNA" | ||
| if gbkey == "tRNA": | ||
| return "tRNA" | ||
| # misc_RNA, other | ||
| return "misc_RNA" | ||
|
Comment on lines
+62
to
+63
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Defaulting to |
||
|
|
||
|
|
||
| def main(): | ||
| parser = argparse.ArgumentParser( | ||
| description="Add gene_biotype attribute to NCBI RefSeq GFF3 files that lack it." | ||
| ) | ||
| parser.add_argument("input", help="Input GFF3 file (optionally gzipped)") | ||
| parser.add_argument("output", help="Output GFF3 file (gzipped if .gz)") | ||
| args = parser.parse_args() | ||
|
|
||
| # --- Pass 1: collect transcript info per gene --- | ||
| # gene_id -> list of (gbkey, ncrna_class) | ||
| tran_info_by_gene = defaultdict(list) | ||
| # gene_id -> bool (has pseudo=true) | ||
| gene_is_pseudo = {} | ||
| # gene_id -> bool (already has gene_biotype) | ||
| gene_has_biotype = {} | ||
| # Track which gene IDs exist | ||
| gene_ids = set() | ||
|
|
||
| print(f"Pass 1: scanning {args.input} ...", file=sys.stderr) | ||
| with open_maybe_gzip(args.input) as f: | ||
| for line in f: | ||
| line = line.rstrip("\n") | ||
| if not line or line.startswith("#"): | ||
| continue | ||
|
|
||
| cols = line.split("\t") | ||
| if len(cols) != 9: | ||
| continue | ||
|
|
||
| attrs = parse_attributes(cols[8]) | ||
| gff_id = attrs.get("ID", "") | ||
|
|
||
| # Detect gene lines | ||
| if cols[2] == "gene" or cols[2] == "pseudogene" or gff_id.startswith("gene"): | ||
| gene_ids.add(gff_id) | ||
| gene_is_pseudo[gff_id] = ( | ||
| attrs.get("pseudo", "") == "true" or cols[2] == "pseudogene" | ||
| ) | ||
| gene_has_biotype[gff_id] = "gene_biotype" in attrs | ||
|
|
||
| # Detect transcript lines (rna IDs with Parent pointing to a gene) | ||
| elif gff_id.startswith("rna"): | ||
| parent = attrs.get("Parent", "") | ||
| if parent in gene_ids or parent.startswith("gene"): | ||
| gbkey = attrs.get("gbkey", "") | ||
| ncrna_class = attrs.get("ncrna_class", "") | ||
| tran_info_by_gene[parent].append((gbkey, ncrna_class)) | ||
|
|
||
| # --- Compute biotypes --- | ||
| gene_biotype = {} | ||
| for gid in gene_ids: | ||
| if gene_has_biotype[gid]: | ||
| continue # already present, don't touch | ||
| if gene_is_pseudo[gid]: | ||
| gene_biotype[gid] = "pseudogene" | ||
| elif tran_info_by_gene[gid]: | ||
| # Use the first transcript to infer | ||
| gbkey, ncrna_class = tran_info_by_gene[gid][0] | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Uses only the first transcript to infer gene biotype. Should we consider aggregating across all transcripts and resolving conflicts (eg: prioritizing |
||
| gene_biotype[gid] = infer_biotype_from_transcript(gbkey, ncrna_class) | ||
| else: | ||
| gene_biotype[gid] = "misc_RNA" | ||
|
|
||
| n_added = len(gene_biotype) | ||
| n_already = sum(1 for v in gene_has_biotype.values() if v) | ||
| print( | ||
| f" {len(gene_ids)} genes found, {n_already} already have gene_biotype, " | ||
| f"{n_added} will be added.", | ||
| file=sys.stderr, | ||
| ) | ||
|
|
||
| # --- Pass 2: write output with gene_biotype injected --- | ||
| print(f"Pass 2: writing {args.output} ...", file=sys.stderr) | ||
| with open_maybe_gzip(args.input) as fin, open_maybe_gzip(args.output, "wt") as fout: | ||
| for line in fin: | ||
| line = line.rstrip("\n") | ||
| if not line or line.startswith("#"): | ||
| fout.write(line + "\n") | ||
| continue | ||
|
|
||
| cols = line.split("\t") | ||
| if len(cols) != 9: | ||
| fout.write(line + "\n") | ||
| continue | ||
|
|
||
| attrs = parse_attributes(cols[8]) | ||
| gff_id = attrs.get("ID", "") | ||
|
|
||
| if gff_id in gene_biotype: | ||
| # Append gene_biotype to the attributes column | ||
| cols[8] = cols[8].rstrip(";") + f";gene_biotype={gene_biotype[gff_id]}" | ||
| fout.write("\t".join(cols) + "\n") | ||
| else: | ||
| fout.write(line + "\n") | ||
|
|
||
| print("Done.", file=sys.stderr) | ||
|
|
||
|
|
||
| if __name__ == "__main__": | ||
| main() | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,14 @@ | ||
| #!/usr/bin/env bash | ||
|
|
||
| wget -O v107.src.gff3.gz https://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.107/GFF/ref_GRCh38.p2_top_level.gff3.gz | ||
| # the gff is missing gene_biotype ; add it heuristically rather than supporting this in the builder | ||
| python add_gene_biotype_to_gff3.py v107.src.gff3.gz v107.tmp.gff3.gz | ||
|
|
||
| # NT_187507.1 isn't in hg38.p12.chromAlias.txt, which we are using for hg38.p2. | ||
| # ok to remove it because NCBI says: | ||
| # > Record suppressed. This RefSeq record was removed because the sequence was determined to have originated from Chinese hamster. | ||
| # https://www.ncbi.nlm.nih.gov/nuccore/NT_187507.1?report=genbank | ||
|
|
||
| # doing extra work of unzipping and gzipping again but saves code duplication and only takes a few seconds longer | ||
| zgrep -v NT_187507.1 v107.tmp.gff3.gz | gzip > v107.gff3.gz | ||
| rm v107.tmp.gff3.gz |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,8 @@ | ||
| #!/usr/bin/env bash | ||
|
|
||
| wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_22/GRCh38.p2.genome.fa.gz | ||
| faToTwoBit GRCh38.p2.genome.fa.gz hg38.p2.2bit | ||
| twoBitInfo hg38.p2.2bit stdout | sort -k2rn > hg38.p2.chrom.sizes | ||
|
|
||
| # need chromAlias for ncbi, using hg38.p12.chromAlias.txt | ||
| wget -O hg38.p2.chromAlias.txt https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/p12/hg38.p12.chromAlias.txt |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As specified here, this script produces inferred biotypes from GFF structure, whereas NCBI assigns biotypes using evidence-driven classification (Gnomon/EGAP). So results will only partially overlap with true RefSeq classifications.