From 47234099299806b6cf0c8484554f1945f2b0b489 Mon Sep 17 00:00:00 2001
From: Lucian Fialho
Date: Fri, 24 Apr 2026 21:31:17 -0300
Subject: [PATCH] feat(papers): add pubmed_search, fetch_pubmed, fetch_doi operations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extends hf_papers tool with three new operations covering biomedical and
DOI-based literature beyond arXiv:

- pubmed_search: keyword search via NCBI E-utilities (esearch + esummary)
- fetch_pubmed: fetch abstract for a PMID via efetch XML
- fetch_doi: fetch metadata + abstract for any DOI via Crossref API
  (covers bioRxiv, medRxiv, PsyArXiv, journal articles)

No new dependencies — uses httpx (already a dep) and stdlib xml.etree.
All three operations follow the existing ToolResult pattern and are
registered in _OPERATIONS / HF_PAPERS_TOOL_SPEC.

Closes #93
---
NOTE(review): this copy of the patch was rebuilt from a whitespace-collapsed
export. Indentation of context lines and runs of spaces inside string
literals are reconstructed, not verified; if a hunk fails to apply, use
`git apply --ignore-whitespace` or regenerate the patch from the branch.
The blob ids on the `index` line predate the review changes below.

 agent/tools/papers_tool.py | 232 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 230 insertions(+), 2 deletions(-)

diff --git a/agent/tools/papers_tool.py b/agent/tools/papers_tool.py
index 4032a770..628ed4a7 100644
--- a/agent/tools/papers_tool.py
+++ b/agent/tools/papers_tool.py
@@ -3,7 +3,8 @@
 
 Operations: trending, search, paper_details, read_paper,
   find_datasets, find_models, find_collections, find_all_resources,
-  citation_graph, snippet_search, recommend
+  citation_graph, snippet_search, recommend,
+  pubmed_search, fetch_pubmed, fetch_doi
 """
 
 import asyncio
@@ -11,6 +12,7 @@
 import re
 import time
 from typing import Any
+from xml.etree import ElementTree
 
 import httpx
 from bs4 import BeautifulSoup, Tag
@@ -21,6 +23,11 @@
 ARXIV_HTML = "https://arxiv.org/html"
 AR5IV_HTML = "https://ar5iv.labs.arxiv.org/html"
 
+PUBMED_ESEARCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
+PUBMED_ESUMMARY = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
+PUBMED_EFETCH = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
+CROSSREF_API = "https://api.crossref.org/works"
+
 DEFAULT_LIMIT = 10
 MAX_LIMIT = 50
 MAX_SUMMARY_LEN = 300
@@ -1139,6 +1146,213 @@ async def _op_recommend(args: dict[str, Any], limit: int) -> ToolResult:
     }
 
 
+# ---------------------------------------------------------------------------
+# PubMed operations
+# ---------------------------------------------------------------------------
+
+async def _op_pubmed_search(args: dict[str, Any], limit: int) -> ToolResult:
+    """Search PubMed via NCBI E-utilities (biomedical, clinical, pharmacological).
+
+    Calls esearch for up to ``limit`` PMIDs matching ``args["query"]``, then
+    esummary once for their title, authors, journal and date. Returns
+    ``_error(...)`` when the query is empty or either request fails.
+    """
+    # `or ""` keeps an explicit "query": None from crashing on .strip().
+    query = str(args.get("query") or "").strip()
+    if not query:
+        return _error("'query' is required for pubmed_search.")
+
+    params = {
+        "db": "pubmed",
+        "term": query,
+        "retmax": limit,
+        "retmode": "json",
+        "tool": "ml-intern",
+    }
+    async with httpx.AsyncClient(timeout=15) as client:
+        try:
+            resp = await client.get(PUBMED_ESEARCH, params=params)
+            resp.raise_for_status()
+            ids = resp.json().get("esearchresult", {}).get("idlist", [])
+        except Exception as exc:
+            return _error(f"PubMed search failed: {exc}")
+
+        if not ids:
+            return {"formatted": f"No PubMed results for: {query}", "totalResults": 0, "resultsShared": 0}
+
+        try:
+            sum_resp = await client.get(
+                PUBMED_ESUMMARY,
+                params={"db": "pubmed", "id": ",".join(ids), "retmode": "json", "tool": "ml-intern"},
+            )
+            sum_resp.raise_for_status()
+            result_data = sum_resp.json().get("result", {})
+        except Exception as exc:
+            return _error(f"PubMed summary failed: {exc}")
+
+    lines = [f"# PubMed results for: {query}\n"]
+    for pmid in ids:
+        item = result_data.get(pmid, {})
+        title = item.get("title", "(no title)")
+        authors = ", ".join(a.get("name", "") for a in item.get("authors", [])[:3])
+        if len(item.get("authors", [])) > 3:
+            authors += " et al."
+        source = item.get("source", "")
+        pubdate = item.get("pubdate", "")
+        lines.append(f"**pmid:{pmid}** — {title}")
+        lines.append(f" {authors} · {source} · {pubdate}")
+        lines.append(f" → fetch with: fetch_pubmed pmid={pmid}\n")
+
+    return {"formatted": "\n".join(lines), "totalResults": len(ids), "resultsShared": len(ids)}
+
+
+async def _op_fetch_pubmed(args: dict[str, Any], limit: int) -> ToolResult:
+    """Fetch a PubMed abstract by PMID.
+
+    ``args["pmid"]`` may be a bare ID or carry a ``pmid:`` prefix. ``limit`` is
+    unused here. Returns ``_error(...)`` for a missing or non-numeric PMID, a
+    failed request, unparseable XML, or a response with no PubmedArticle.
+    """
+    pmid = str(args.get("pmid") or "").strip().removeprefix("pmid:").strip()
+    if not pmid:
+        return _error("'pmid' is required for fetch_pubmed.")
+    if not pmid.isdigit():
+        return _error(f"Invalid PMID '{pmid}': expected digits only.")
+
+    params = {"db": "pubmed", "id": pmid, "rettype": "abstract", "retmode": "xml", "tool": "ml-intern"}
+    async with httpx.AsyncClient(timeout=15) as client:
+        try:
+            resp = await client.get(PUBMED_EFETCH, params=params)
+            resp.raise_for_status()
+        except Exception as exc:
+            return _error(f"PubMed fetch failed for pmid:{pmid}: {exc}")
+
+    try:
+        root = ElementTree.fromstring(resp.text)
+    except ElementTree.ParseError as exc:
+        return _error(f"Failed to parse PubMed XML for pmid:{pmid}: {exc}")
+
+    article = root.find(".//PubmedArticle")
+    if article is None:
+        return _error(f"No article found for pmid:{pmid}")
+
+    # itertext() keeps text nested in inline markup (<i>, <sub>, <sup>);
+    # Element.text alone stops at the first child element and truncates.
+    title_el = article.find(".//ArticleTitle")
+    title = ("".join(title_el.itertext()).strip() if title_el is not None else "") or "(no title)"
+    abstract = " ".join(
+        (f"**{p.get('Label')}:** " if p.get("Label") else "") + "".join(p.itertext()).strip()
+        for p in article.findall(".//AbstractText")
+    ).strip()
+    all_authors = article.findall(".//Author")
+    authors = []
+    for author in all_authors[:5]:
+        last = author.findtext("LastName") or ""
+        fore = author.findtext("ForeName") or ""
+        if last:
+            authors.append(f"{fore} {last}".strip())
+    journal = article.findtext(".//Journal/Title") or article.findtext(".//MedlineTA") or ""
+    # Some records carry a free-text MedlineDate instead of a Year element.
+    pub_year = article.findtext(".//PubDate/Year") or article.findtext(".//PubDate/MedlineDate") or ""
+    # Read only the article's own ID list: ".//ArticleId" would also match the
+    # IDs of cited works under ReferenceList and could report a reference's DOI.
+    doi = next(
+        (
+            id_el.text
+            for id_el in article.findall("PubmedData/ArticleIdList/ArticleId")
+            if id_el.get("IdType") == "doi" and id_el.text
+        ),
+        None,
+    )
+
+    lines = [f"# {title}"]
+    lines.append(f"**PMID:** {pmid} | **URL:** https://pubmed.ncbi.nlm.nih.gov/{pmid}/")
+    if doi:
+        lines.append(f"**DOI:** https://doi.org/{doi}")
+    if authors:
+        suffix = " et al." if len(all_authors) > 5 else ""
+        lines.append(f"**Authors:** {', '.join(authors)}{suffix}")
+    lines.append(f"**Journal:** {journal} | **Year:** {pub_year}")
+    lines.append("")
+    lines.append("## Abstract")
+    lines.append(abstract or "(no abstract available)")
+    return {"formatted": "\n".join(lines), "totalResults": 1, "resultsShared": 1}
+
+
+# ---------------------------------------------------------------------------
+# DOI fetch via Crossref
+# ---------------------------------------------------------------------------
+
+async def _op_fetch_doi(args: dict[str, Any], limit: int) -> ToolResult:
+    """Fetch metadata and abstract for any DOI via Crossref (bioRxiv, medRxiv, journals).
+
+    ``args["doi"]`` may be a bare DOI, carry a ``doi:`` prefix, or be a
+    doi.org / dx.doi.org URL. ``limit`` is unused here. Returns ``_error(...)``
+    for a missing DOI or a failed Crossref request.
+    """
+    # Function-scope imports: only this operation needs them.
+    from html import unescape
+    from urllib.parse import quote, unquote
+
+    doi = str(args.get("doi") or "").strip()
+    for prefix in ("doi:", "https://doi.org/", "http://doi.org/", "https://dx.doi.org/", "http://dx.doi.org/"):
+        doi = doi.removeprefix(prefix)
+    doi = doi.strip()
+    if not doi:
+        return _error("'doi' is required for fetch_doi.")
+
+    # Percent-encode so characters that are legal in DOIs but special in URLs
+    # ('#', '?', '<', spaces) are not read as URL syntax; unquote first so an
+    # already-encoded DOI is not double-encoded. '/' stays literal.
+    url = f"{CROSSREF_API}/{quote(unquote(doi), safe='/')}"
+    headers = {"User-Agent": "ml-intern/1.0 (mailto:ml-intern@huggingface.co)"}
+    async with httpx.AsyncClient(timeout=15, follow_redirects=True) as client:
+        try:
+            resp = await client.get(url, headers=headers)
+            resp.raise_for_status()
+            work = resp.json().get("message", {})
+        except httpx.HTTPStatusError as exc:
+            return _error(f"Crossref returned {exc.response.status_code} for doi:{doi}")
+        except Exception as exc:
+            return _error(f"Crossref fetch failed for doi:{doi}: {exc}")
+
+    title_parts = work.get("title") or []
+    title = title_parts[0] if title_parts else "(no title)"
+    all_authors = work.get("author") or []
+    authors = []
+    for a in all_authors[:5]:
+        given = a.get("given", "")
+        family = a.get("family", "")
+        if family:
+            authors.append(f"{given} {family}".strip())
+    container = (work.get("container-title") or [""])[0]
+    published = work.get("published") or work.get("published-print") or work.get("published-online") or {}
+    date_parts = published.get("date-parts") or [[]]
+    pub_date = "-".join(str(p) for p in (date_parts[0] or []) if p is not None)
+    # Crossref abstracts are JATS markup: turn block-closing tags into spaces so
+    # adjacent elements do not fuse, drop remaining tags, then decode entities.
+    abstract_raw = work.get("abstract") or ""
+    spaced = re.sub(r"</(?:jats:)?(?:p|title|sec)>", " ", abstract_raw)
+    abstract = " ".join(unescape(re.sub(r"<[^>]+>", "", spaced)).split())
+    full_text_url = f"https://doi.org/{doi}"
+
+    lines = [f"# {title}"]
+    lines.append(f"**DOI:** https://doi.org/{doi}")
+    lines.append(f"**Source:** {container} | **Published:** {pub_date}")
+    if authors:
+        suffix = " et al." if len(all_authors) > 5 else ""
+        lines.append(f"**Authors:** {', '.join(authors)}{suffix}")
+    lines.append(f"**Full text:** {full_text_url}")
+    lines.append("")
+    if abstract:
+        lines.append("## Abstract")
+        lines.append(abstract)
+    else:
+        lines.append("*(Abstract not available via Crossref for this DOI)*")
+
+    return {"formatted": "\n".join(lines), "totalResults": 1, "resultsShared": 1}
+
+
 # ---------------------------------------------------------------------------
 # Operation dispatch
 # ---------------------------------------------------------------------------
@@ -1155,6 +1369,9 @@ async def _op_recommend(args: dict[str, Any], limit: int) -> ToolResult:
     "find_models": _op_find_models,
     "find_collections": _op_find_collections,
     "find_all_resources": _op_find_all_resources,
+    "pubmed_search": _op_pubmed_search,
+    "fetch_pubmed": _op_fetch_pubmed,
+    "fetch_doi": _op_fetch_doi,
 }
 
 
@@ -1183,7 +1400,10 @@ async def _op_recommend(args: dict[str, Any], limit: int) -> ToolResult:
         "- find_datasets: Find datasets linked to a paper\n"
         "- find_models: Find models linked to a paper\n"
         "- find_collections: Find collections that include a paper\n"
-        "- find_all_resources: Parallel fetch of datasets + models + collections for a paper"
+        "- find_all_resources: Parallel fetch of datasets + models + collections for a paper\n"
+        "- pubmed_search: Search PubMed (biomedical, clinical, pharmacological literature)\n"
+        "- fetch_pubmed: Fetch abstract for a PubMed paper by PMID (e.g. pmid=38903003)\n"
+        "- fetch_doi: Fetch metadata + abstract for any DOI via Crossref (bioRxiv, medRxiv, journals)"
     ),
     "parameters": {
         "type": "object",
@@ -1265,6 +1485,14 @@ async def _op_recommend(args: dict[str, Any], limit: int) -> ToolResult:
                 "type": "integer",
                 "description": "Maximum results to return (default: 10, max: 50).",
             },
+            "pmid": {
+                "type": "string",
+                "description": "PubMed ID (e.g. '38903003' or 'pmid:38903003'). Required for: fetch_pubmed.",
+            },
+            "doi": {
+                "type": "string",
+                "description": "DOI string (e.g. '10.1101/2023.12.15.571821' or 'doi:10.1101/...'). Required for: fetch_doi.",
+            },
         },
         "required": ["operation"],
     },