From 16286046ad29ba98796287dbba6d2b8e3d38ccc3 Mon Sep 17 00:00:00 2001 From: Reece Hart Date: Sat, 6 Dec 2025 22:12:51 -0800 Subject: [PATCH 1/2] update dbsnp facade to new XML structure --- .../eutils/_internal/xmlfacades/dbsnp.py | 24 ++- tests/data/cassettes/test_two_snps | 152 ++++++++++++++++++ ...snp.py => test_eutils_xmlfacades_dbsnp.py} | 0 3 files changed, 168 insertions(+), 8 deletions(-) create mode 100644 tests/data/cassettes/test_two_snps rename tests/{x-test_eutils_xmlfacades_dbsnp.py => test_eutils_xmlfacades_dbsnp.py} (100%) diff --git a/src/biocommons/eutils/_internal/xmlfacades/dbsnp.py b/src/biocommons/eutils/_internal/xmlfacades/dbsnp.py index bee58d7..f9b3b8f 100644 --- a/src/biocommons/eutils/_internal/xmlfacades/dbsnp.py +++ b/src/biocommons/eutils/_internal/xmlfacades/dbsnp.py @@ -14,21 +14,23 @@ def __iter__(self): return ( Rs(n) for n in self._xml_root.iterfind( - "docsum:Rs", namespaces={"docsum": self._xml_root.nsmap[None]} + "docsum:DocumentSummary", namespaces={"docsum": self._xml_root.nsmap[None]} ) ) def __len__(self): return len( - self._xml_root.findall("docsum:Rs", namespaces={"docsum": self._xml_root.nsmap[None]}) + self._xml_root.findall( + "docsum:DocumentSummary", namespaces={"docsum": self._xml_root.nsmap[None]} + ) ) class Rs: - _root_tag = "Rs" + _root_tag = "DocumentSummary" def __init__(self, rs_node): - assert rs_node.tag == "{https://www.ncbi.nlm.nih.gov/SNP/docsum}Rs" # noqa: S101 + assert rs_node.tag == "{https://www.ncbi.nlm.nih.gov/SNP/docsum}DocumentSummary" # noqa: S101 self._n = rs_node # def __str__(self): @@ -36,19 +38,25 @@ def __init__(self, rs_node): @property def rs_id(self): - return "rs" + self._n.get("rsId") + ns = {"docsum": self._n.nsmap[None]} + snp_id = self._n.findtext("docsum:SNP_ID", namespaces=ns) + return "rs" + snp_id @property def withdrawn(self): - return "notwithdrawn" not in self._n.get("snpType") + # snpType is no longer available in DocumentSummary format + # This would need to be determined from other fields if available + return False @property def orient(self): - return self._n.get("orient") + # orient attribute is no longer available in DocumentSummary format + return None @property def strand(self): - return self._n.get("strand") + # strand attribute is no longer available in DocumentSummary format + return None @property def hgvs_tags(self): diff --git a/tests/data/cassettes/test_two_snps b/tests/data/cassettes/test_two_snps new file mode 100644 index 0000000..80ea09f --- /dev/null +++ b/tests/data/cassettes/test_two_snps @@ -0,0 +1,152 @@ +interactions: +- request: + body: tool=biocommons.eutils._internal&email=biocommons-dev%40googlegroups.com&retmode=xml&usehistory=y&retmax=250&db=snp&id=%5B2031%2C+14181%5D + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate, zstd + Connection: + - keep-alive + Content-Length: + - '138' + Content-Type: + - application/x-www-form-urlencoded + User-Agent: + - python-requests/2.32.5 + method: POST + uri: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi + response: + body: + string: ' + + 20311000GenomesT=0./01000Genomes_30XT=0.000156/1ALSPACT=0.003373/13EstonianT=0.002902/13GnomAD_exomesT=0.01162/172GnomAD_genomesT=0.002325/347GoNLT=0.002004/2NorthernSwedenT=0.015/9TWINSUKT=0.003506/13ALFAT=0.001392/490ZCCHC385364NC_000020.1120ACPOP,WIAF,HUMAN_LONGEVITY,GNOMAD,EVA,1000GENOMES,1000G_HIGH_COVERAGE,EVA_UK10K_ALSPAC,EVA_DECODE,TOPMED,HUGCELL_USP,EVA-GONL,EGCUT_WGS,SWEGEN,EVA_UK10K_TWINSUKNC_000020.11:300164:C:G,NC_000020.11:300164:C:T3_prime_UTR_variantby-frequency,by-alfa,by-clusterHGVS=NC_000020.11:g.300165C>G,NC_000020.11:g.300165C>T,NC_000020.10:g.280809C>G,NC_000020.10:g.280809C>T,NM_033089.7:c.*1367C>G,NM_033089.7:c.*1367C>T,NM_033089.6:c.*1367C>G,NM_033089.6:c.*1367C>T|SEQ=[C/G/T]|LEN=1|GENE=ZCCHC3:853649606361572000/09/19 + 17:022024/11/02 07:152057,994458752,1363772279,1638252636,1681246669,1698512275,2240725825,2964620177,3017746807,3684459275,3706408864,3743180751,6278874430,6319911214,6468101968,8079786236,8079786237,8435792673,8500251098,8613785173,8922377993,8957610598,10055927451Bsnv20:30016520:2808090000002031000003001650 + + 141811000GenomesT=0.0024/121000Genomes_30XT=0.0025/16SGDP_PRJC=0.5/1ALFAT=0.00075/310ZCCHC385364NC_000020.1120SGDP_PRJ,HUMAN_LONGEVITY,GNOMAD,CGAP-GAI,ILLUMINA,1000GENOMES,EVA,1000G_HIGH_COVERAGE,TOPMED,SANFORD_IMAGENETICS,HUGCELL_USPNC_000020.11:300026:C:A,NC_000020.11:300026:C:T3_prime_UTR_variantby-frequency,by-alfa,by-clusterHGVS=NC_000020.11:g.300027C>A,NC_000020.11:g.300027C>T,NC_000020.10:g.280671C>A,NC_000020.10:g.280671C>T,NM_033089.7:c.*1229C>A,NM_033089.7:c.*1229C>T,NM_033089.6:c.*1229C>A,NM_033089.6:c.*1229C>T|SEQ=[C/A/T]|LEN=1|GENE=ZCCHC3:853649606521572000/09/19 + 17:022024/11/02 07:1516555,228211653,481859551,484172829,534416668,779313025,781587078,834780474,1363772274,2240725819,2633765626,2964620167,3628331386,3631724045,3888574171,6278874425,6319911212,8079786196,8079786197,8307826362,8435792665,8500251094,8613785168,8662714871,8922377988,8957610594,10055927422,10055927423Hsnv20:30002720:2806710000014181000003000270 + + ' + headers: + Access-Control-Allow-Origin: + - '*' + Access-Control-Expose-Headers: + - X-RateLimit-Limit,X-RateLimit-Remaining + Cache-Control: + - private + Connection: + - Keep-Alive + Content-Security-Policy: + - upgrade-insecure-requests + Content-Type: + - text/xml; charset=UTF-8 + Date: + - Sun, 07 Dec 2025 05:45:50 GMT + Keep-Alive: + - timeout=4, max=40 + NCBI-PHID: + - 1D33B2963ACDB6A50000570B4D24D7EE.1.1.m_7 + NCBI-SID: + - 41B80D6CA196BF68_4C89SID + Referrer-Policy: + - origin-when-cross-origin + Server: + - Finatra + Set-Cookie: + - ncbi_sid=41B80D6CA196BF68_4C89SID; domain=.nih.gov; path=/; expires=Mon, 07 + Dec 2026 05:45:51 GMT + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-RateLimit-Limit: + - '10' + X-RateLimit-Remaining: + - '9' + X-UA-Compatible: + - IE=Edge + X-XSS-Protection: + - 1; mode=block + content-encoding: + - gzip + status: + code: 200 + message: OK +- request: + body: tool=biocommons.eutils._internal&email=biocommons-dev%40googlegroups.com&retmode=xml&usehistory=y&retmax=250&db=snp&id=%5B2031%2C+14181%5D + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate, zstd + Connection: + - keep-alive + Content-Length: + - '138' + Content-Type: + - application/x-www-form-urlencoded + User-Agent: + - python-requests/2.32.5 + method: POST + uri: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi + response: + body: + string: ' + + 20311000GenomesT=0./01000Genomes_30XT=0.000156/1ALSPACT=0.003373/13EstonianT=0.002902/13GnomAD_exomesT=0.01162/172GnomAD_genomesT=0.002325/347GoNLT=0.002004/2NorthernSwedenT=0.015/9TWINSUKT=0.003506/13ALFAT=0.001392/490ZCCHC385364NC_000020.1120ACPOP,WIAF,HUMAN_LONGEVITY,GNOMAD,EVA,1000GENOMES,1000G_HIGH_COVERAGE,EVA_UK10K_ALSPAC,EVA_DECODE,TOPMED,HUGCELL_USP,EVA-GONL,EGCUT_WGS,SWEGEN,EVA_UK10K_TWINSUKNC_000020.11:300164:C:G,NC_000020.11:300164:C:T3_prime_UTR_variantby-frequency,by-alfa,by-clusterHGVS=NC_000020.11:g.300165C>G,NC_000020.11:g.300165C>T,NC_000020.10:g.280809C>G,NC_000020.10:g.280809C>T,NM_033089.7:c.*1367C>G,NM_033089.7:c.*1367C>T,NM_033089.6:c.*1367C>G,NM_033089.6:c.*1367C>T|SEQ=[C/G/T]|LEN=1|GENE=ZCCHC3:853649606361572000/09/19 + 17:022024/11/02 07:152057,994458752,1363772279,1638252636,1681246669,1698512275,2240725825,2964620177,3017746807,3684459275,3706408864,3743180751,6278874430,6319911214,6468101968,8079786236,8079786237,8435792673,8500251098,8613785173,8922377993,8957610598,10055927451Bsnv20:30016520:2808090000002031000003001650 + + 141811000GenomesT=0.0024/121000Genomes_30XT=0.0025/16SGDP_PRJC=0.5/1ALFAT=0.00075/310ZCCHC385364NC_000020.1120SGDP_PRJ,HUMAN_LONGEVITY,GNOMAD,CGAP-GAI,ILLUMINA,1000GENOMES,EVA,1000G_HIGH_COVERAGE,TOPMED,SANFORD_IMAGENETICS,HUGCELL_USPNC_000020.11:300026:C:A,NC_000020.11:300026:C:T3_prime_UTR_variantby-frequency,by-alfa,by-clusterHGVS=NC_000020.11:g.300027C>A,NC_000020.11:g.300027C>T,NC_000020.10:g.280671C>A,NC_000020.10:g.280671C>T,NM_033089.7:c.*1229C>A,NM_033089.7:c.*1229C>T,NM_033089.6:c.*1229C>A,NM_033089.6:c.*1229C>T|SEQ=[C/A/T]|LEN=1|GENE=ZCCHC3:853649606521572000/09/19 + 17:022024/11/02 07:1516555,228211653,481859551,484172829,534416668,779313025,781587078,834780474,1363772274,2240725819,2633765626,2964620167,3628331386,3631724045,3888574171,6278874425,6319911212,8079786196,8079786197,8307826362,8435792665,8500251094,8613785168,8662714871,8922377988,8957610594,10055927422,10055927423Hsnv20:30002720:2806710000014181000003000270 + + ' + headers: + Access-Control-Allow-Origin: + - '*' + Access-Control-Expose-Headers: + - X-RateLimit-Limit,X-RateLimit-Remaining + Cache-Control: + - private + Connection: + - Keep-Alive + Content-Security-Policy: + - upgrade-insecure-requests + Content-Type: + - text/xml; charset=UTF-8 + Date: + - Sun, 07 Dec 2025 06:11:44 GMT + Keep-Alive: + - timeout=4, max=40 + NCBI-PHID: + - 1D33B2963ACDB6A500004612D7C7EAF2.1.1.m_7 + NCBI-SID: + - 0C36C7C25205FB1A_A2F6SID + Referrer-Policy: + - origin-when-cross-origin + Server: + - Finatra + Set-Cookie: + - ncbi_sid=0C36C7C25205FB1A_A2F6SID; domain=.nih.gov; path=/; expires=Mon, 07 + Dec 2026 06:11:45 GMT + Strict-Transport-Security: + - max-age=31536000; includeSubDomains; preload + Transfer-Encoding: + - chunked + X-RateLimit-Limit: + - '10' + X-RateLimit-Remaining: + - '9' + X-UA-Compatible: + - IE=Edge + X-XSS-Protection: + - 1; mode=block + content-encoding: + - gzip + status: + code: 200 + message: OK +version: 1 diff --git a/tests/x-test_eutils_xmlfacades_dbsnp.py b/tests/test_eutils_xmlfacades_dbsnp.py similarity index 100% rename from tests/x-test_eutils_xmlfacades_dbsnp.py rename to tests/test_eutils_xmlfacades_dbsnp.py From 29ee154039d015a97138b52b61e57c47a2dfdbd7 Mon Sep 17 00:00:00 2001 From: Reece Hart Date: Sun, 7 Dec 2025 14:24:08 -0800 Subject: [PATCH 2/2] Restore gbseq tests, but comment out broken functionality --- archive/eutils-fetch-biopython | 138 ++++++++++++++++++ .../eutils/_internal/xmlfacades/gbseq.py | 6 +- ...des_gi.py => test_eutils_xmlfacades_gi.py} | 13 +- 3 files changed, 148 insertions(+), 9 deletions(-) create mode 100755 archive/eutils-fetch-biopython rename tests/{x-test_eutils_xmlfacades_gi.py => test_eutils_xmlfacades_gi.py} (80%) diff --git a/archive/eutils-fetch-biopython b/archive/eutils-fetch-biopython new file mode 100755 index 0000000..28ee431 --- /dev/null +++ b/archive/eutils-fetch-biopython @@ -0,0 +1,138 @@ +#!/usr/bin/env python3 +""" +Fetch and display key information from a RefSeq accession using Biopython. +Usage: + python refseq_info.py NM_152783.5 + efetch -db nucleotide -id NM_152783.5 -format gb | ./archive/eutils-fetch-biopython + +» efetch -db nucleotide -id NM_152783.5 -format gb | ./archive/eutils-fetch-biopython +Reading XML from stdin... + +# Example usage: +» efetch -db nucleotide -id NM_152783.5 -format gb | ./archive/eutils-fetch-biopython +============================================================ +Accession.Version: NM_152783.5 +Length: 2566 bp +Molecule Type: mRNA +Species: Homo sapiens +Locus: NM_152783 +Gene Name(s): D2HGDH + +----------------------CDS Information:---------------------- +CDS Start: 159 +CDS End: 1724 + +------------------------Exon Table:------------------------- +Exon Start End +------------------------------ +Exon 1 1 66 +Exon 2 67 450 +Exon 3 451 508 +Exon 4 509 648 +Exon 5 649 842 +Exon 6 843 1011 +Exon 7 1012 1155 +Exon 8 1156 1298 +Exon 9 1299 1464 +Exon 10 1465 2566 +============================================================ + +""" + +import sys + +from Bio import Entrez, SeqIO + +# Always provide your email to NCBI +Entrez.email = "user@example.com" + + +def get_refseq_info(accession): + """Fetch and parse RefSeq record information from NCBI.""" + + # Fetch the record from NCBI + print(f"Fetching {accession} from NCBI...") + handle = Entrez.efetch(db="nucleotide", id=accession, rettype="gb", retmode="text") + record = SeqIO.read(handle, "genbank") + handle.close() + + return record + + +def display_record_info(record): + """Display information from a SeqRecord.""" + + # Extract basic information + print(f"\n{'=' * 60}") + print(f"Accession.Version: {record.id}") + print(f"Length: {len(record.seq)} bp") + print(f"Molecule Type: {record.annotations.get('molecule_type', 'N/A')}") + print(f"Species: {record.annotations.get('organism', 'N/A')}") + print(f"Locus: {record.name}") + + # Extract gene names + gene_names = set() + for feature in record.features: + if feature.type == "gene" and "gene" in feature.qualifiers: + gene_names.update(feature.qualifiers["gene"]) + + if gene_names: + print(f"Gene Name(s): {', '.join(sorted(gene_names))}") + else: + print("Gene Name(s): N/A") + + # Find CDS coordinates + print(f"\n{'CDS Information:':-^60}") + cds_found = False + for feature in record.features: + if feature.type == "CDS": + cds_found = True + # Handle compound locations (spliced CDS) + start = int(feature.location.start) + 1 # Convert to 1-based + end = int(feature.location.end) + print(f"CDS Start: {start}") + print(f"CDS End: {end}") + break + + if not cds_found: + print("CDS: Not found") + + # Extract exon information + print(f"\n{'Exon Table:':-^60}") + print(f"{'Exon':<10} {'Start':<10} {'End':<10}") + print("-" * 30) + + exon_count = 0 + for feature in record.features: + if feature.type == "exon": + exon_count += 1 + start = int(feature.location.start) + 1 # Convert to 1-based + end = int(feature.location.end) + print(f"Exon {exon_count:<4} {start:<10} {end:<10}") + + if exon_count == 0: + print("No exon features found in record") + + print(f"{'=' * 60}\n") + + +if __name__ == "__main__": + try: + if len(sys.argv) == 2: # noqa: PLR2004 + # Accession provided as argument + accession = sys.argv[1] + record = get_refseq_info(accession) + elif len(sys.argv) == 1: + # No argument - read XML from stdin + print("Reading XML from stdin...") + record = SeqIO.read(sys.stdin, "genbank") + else: + print("Usage: python refseq_info.py ", file=sys.stderr) + print(" or: cat record.xml | python refseq_info.py", file=sys.stderr) + sys.exit(1) + + display_record_info(record) + + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) diff --git a/src/biocommons/eutils/_internal/xmlfacades/gbseq.py b/src/biocommons/eutils/_internal/xmlfacades/gbseq.py index 5ce5c62..048b119 100644 --- a/src/biocommons/eutils/_internal/xmlfacades/gbseq.py +++ b/src/biocommons/eutils/_internal/xmlfacades/gbseq.py @@ -36,7 +36,11 @@ def definition(self): @property def exons(self): - return [(f.start_i, f.end_i) for f in self.features.exons] + """This function is temporarily disabled because the XML schema changed + and this method no longer works.""" + msg = "The exons property is no longer supported" + raise NotImplementedError(msg) + # return [(f.start_i, f.end_i) for f in self.features.exons] @property def features(self): diff --git a/tests/x-test_eutils_xmlfacades_gi.py b/tests/test_eutils_xmlfacades_gi.py similarity index 80% rename from tests/x-test_eutils_xmlfacades_gi.py rename to tests/test_eutils_xmlfacades_gi.py index a362c4e..b88039b 100644 --- a/tests/x-test_eutils_xmlfacades_gi.py +++ b/tests/test_eutils_xmlfacades_gi.py @@ -3,7 +3,7 @@ @vcr.use_cassette def test_esearchresult(client): - r = next(iter(client.efetch(db="nuccore", id="NM_152783.3"))) + r = next(iter(client.efetch(db="nuccore", id="NM_152783.5"))) # in #150 fails with AttributeError: 'GBSeq' object has no attribute 'seqids' assert r.gi == 119964727 @@ -28,10 +28,7 @@ def test_esearchresult(client): ) assert prot == r.features.cds.translation - # this returns the ranges - exons = r.exons - assert len(exons) == 10 - - # this returns GBFeatureExon objects - exon = next(iter(r.features.exons)) - assert exon.inference == "alignment:Splign:1.39.8" + # exons (and perhaps other methods) are currently broken + # See eutils/_internal/xmlfacades/gbseq.py#L38 + # exons = r.exons + # assert len(exons) == 10