From 6f04dd6554b0549574b293fb8bf7ad6a9bd0684d Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Thu, 12 Mar 2026 11:21:03 +0000 Subject: [PATCH 01/36] Added submit_study script from private repo --- .gitignore | 2 + bin/submit_study.py | 825 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 827 insertions(+) create mode 100644 bin/submit_study.py diff --git a/.gitignore b/.gitignore index 1c11923..d8c4dbb 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,5 @@ null/ .nf-test* .idea/ test_data +.claude/ +CLAUDE.md diff --git a/bin/submit_study.py b/bin/submit_study.py new file mode 100644 index 0000000..9ec012a --- /dev/null +++ b/bin/submit_study.py @@ -0,0 +1,825 @@ +#!/usr/bin/env python3 +"""Submit studies to ENA via the Webin REST API v2. + +Read a DataHarmonizer export containing study metadata, +validate it against a LinkML schema and an XSD schema, +check for duplicate studies already registered under the +Webin account, construct an XML submission document, and +submit new studies to ENA. 
+ +Credentials are read from environment variables to avoid +secrets appearing in shell history or process listings:: + + export ENA_USERNAME=Webin-XXXXX + export ENA_PASSWORD=SECRET + +Usage:: + + python scripts/submit_study.py \\ + --input studies.json \\ + --linkml schemas/SRA_study.yaml \\ + --xsd assets/ena_schema \\ + --test + + # With hold date (max 2 years): + python scripts/submit_study.py \\ + --input studies.json \\ + --linkml schemas/SRA_study.yaml \\ + --xsd assets/ena_schema \\ + --hold-until 2028-01-01 + + # Log to file: + python scripts/submit_study.py \\ + --input studies.json \\ + --linkml schemas/SRA_study.yaml \\ + --xsd assets/ena_schema \\ + --test --log submission.log +""" + +from __future__ import annotations + +import logging +import sys +import xml.etree.ElementTree as ET +from pathlib import Path +from typing import Any, Final + +import pendulum +import requests +import typer +from requests.auth import HTTPBasicAuth + +import ena_common as common + +app = typer.Typer( + help="Submit studies to ENA via the Webin REST API v2.", +) + +logger = logging.getLogger("ena_submit.study") + + +# ----------------------------------------------------------- +# Reports API (study-specific) +# ----------------------------------------------------------- + +_PROD_REPORTS_URL: Final = ( + "https://www.ebi.ac.uk/ena/submit/report/projects" +) +_TEST_REPORTS_URL: Final = ( + "https://wwwdev.ebi.ac.uk/ena/submit/report/projects" +) + + +def _normalize_study_report( + report: dict[str, Any], +) -> dict[str, str]: + """Normalise a raw study report dict.""" + return { + "title": ( + report.get("title") + or report.get("studyTitle") + or report.get("STUDY_TITLE", "") + ), + "alias": ( + report.get("alias") + or report.get("studyAlias") + or "" + ), + "accession": ( + report.get("accession") + or report.get("studyAccession") + or report.get("report", {}).get("id", "") + ), + "secondary_accession": ( + report.get("secondaryAccession") + or report.get("secondaryId", 
"") + ), + "status": report.get( + "releaseStatus", "UNKNOWN" + ), + } + + +def fetch_account_studies( + auth: HTTPBasicAuth, + use_test: bool = False, + max_results: int = 5000, +) -> list[dict[str, str]]: + """Fetch all projects from the Webin Reports API. + + Args: + auth: HTTP basic-auth credentials. + use_test: Try the test endpoint before production. + max_results: Maximum number of results to request. + + Returns: + List of normalised study dicts. + """ + return common.fetch_account_records( + auth, + use_test=use_test, + prod_url=_PROD_REPORTS_URL, + test_url=_TEST_REPORTS_URL, + normalizer=_normalize_study_report, + entity_label="studies", + max_results=max_results, + ) + + +def find_duplicate_studies( + new_studies: list[dict[str, Any]], + account_studies: list[dict[str, str]], +) -> dict[int, dict[str, str]]: + """Check new studies against existing account studies. + + Args: + new_studies: Studies the user wants to submit. + account_studies: Existing studies in the account. + + Returns: + Mapping of index to matching study info. + """ + return common.find_duplicates_by_alias_title( + new_studies, account_studies, + title_field="STUDY_TITLE", + entity_label="studies", + ) + + +# ----------------------------------------------------------- +# XML construction +# ----------------------------------------------------------- + + +def build_submission_xml( + studies: list[dict[str, Any]], + hold_until: str | None = None, + action: str = "ADD", +) -> ET.Element: + """Build a WEBIN XML document for submitting studies. + + Each study in the input list is converted to a PROJECT + element. + + Args: + studies: Study metadata dicts. + hold_until: Optional hold-until date string + (``YYYY-MM-DD``). + action: Submission action — ``"ADD"`` for new studies + or ``"MODIFY"`` to update existing ones. + + Returns: + Root ```` element. 
+ """ + webin = ET.Element("WEBIN") + + # SUBMISSION_SET + submission_set = ET.SubElement(webin, "SUBMISSION_SET") + submission = ET.SubElement( + submission_set, "SUBMISSION", + ) + sub_alias = ( + "study-submission-" + + pendulum.now().format("YYYYMMDD-HHmmss") + ) + submission.set("alias", sub_alias) + actions = ET.SubElement(submission, "ACTIONS") + main_action = ET.SubElement(actions, "ACTION") + ET.SubElement(main_action, action.upper()) + if hold_until: + hold_action = ET.SubElement(actions, "ACTION") + hold_el = ET.SubElement(hold_action, "HOLD") + hold_el.set("HoldUntilDate", hold_until) + + # PROJECT_SET + project_set = ET.SubElement(webin, "PROJECT_SET") + for study in studies: + _add_project_element(project_set, study) + + return webin + + +def _add_project_element( + project_set: ET.Element, + study: dict[str, Any], +) -> None: + """Append a ```` element to *project_set*.""" + alias = study.get( + "alias", + study.get("STUDY_TITLE", "").replace(" ", "_")[:50], + ) + project = ET.SubElement(project_set, "PROJECT") + project.set("alias", alias) + + name_text = study.get("CENTER_PROJECT_NAME", alias) + if name_text: + name_el = ET.SubElement(project, "NAME") + name_el.text = name_text + + title_el = ET.SubElement(project, "TITLE") + title_el.text = study.get("STUDY_TITLE", "") + + desc_text = ( + study.get("STUDY_ABSTRACT") + or study.get("STUDY_DESCRIPTION", "") + ) + if desc_text: + desc_el = ET.SubElement(project, "DESCRIPTION") + desc_el.text = desc_text + + sp = ET.SubElement(project, "SUBMISSION_PROJECT") + ET.SubElement(sp, "SEQUENCING_PROJECT") + + study_type = study.get("existing_study_type") + if study_type: + attrs = ET.SubElement( + project, "PROJECT_ATTRIBUTES", + ) + _add_project_attribute( + attrs, "existing_study_type", study_type, + ) + new_type = study.get("new_study_type") + if new_type and study_type == "Other": + _add_project_attribute( + attrs, "new_study_type", new_type, + ) + + +def _add_project_attribute( + parent: ET.Element, + 
tag_text: str, + value_text: str, +) -> None: + """Append a ```` to *parent*.""" + attr = ET.SubElement(parent, "PROJECT_ATTRIBUTE") + tag_el = ET.SubElement(attr, "TAG") + tag_el.text = tag_text + val_el = ET.SubElement(attr, "VALUE") + val_el.text = value_text + + +# ----------------------------------------------------------- +# XSD validation (study-specific fallback) +# ----------------------------------------------------------- + + +def _validate_study_xml_structure( + xml_bytes: bytes, + messages: list[str], +) -> tuple[bool, list[str]]: + """Fallback structural check for study XML.""" + try: + tree = ET.fromstring(xml_bytes) + except ET.ParseError as exc: + messages.append( + f"ERROR: XML is not well-formed: {exc}" + ) + return False, messages + + messages.append( + "XML is well-formed (basic check passed)" + ) + + project_set = tree.find("PROJECT_SET") + if project_set is None: + messages.append( + "ERROR: Missing PROJECT_SET element" + ) + return False, messages + + projects = project_set.findall("PROJECT") + if not projects: + messages.append("ERROR: No PROJECT elements found") + return False, messages + + for proj in projects: + alias = proj.get("alias", "") + title = proj.find("TITLE") + if title is None or not title.text: + messages.append( + f"ERROR: PROJECT '{alias}' missing TITLE" + ) + return False, messages + sp = proj.find("SUBMISSION_PROJECT") + if sp is None: + messages.append( + f"ERROR: PROJECT '{alias}'" + " missing SUBMISSION_PROJECT" + ) + return False, messages + messages.append( + f"OK: PROJECT '{alias}' has required elements" + ) + + return True, messages + + +def validate_against_xsd( + xml_bytes: bytes, + xsd_dir: str | Path, +) -> tuple[bool, list[str]]: + """Validate study XML against ENA.project.xsd. + + Args: + xml_bytes: Serialised XML document. + xsd_dir: Directory containing ``ENA.project.xsd`` + and ``SRA.common.xsd``. + + Returns: + Tuple of (*is_valid*, *messages*). 
+ """ + return common.validate_xml_against_xsd( + xml_bytes, xsd_dir, + xsd_filename="ENA.project.xsd", + fragment_tag="PROJECT_SET", + fallback_checker=_validate_study_xml_structure, + ) + + +# ----------------------------------------------------------- +# Receipt parsing +# ----------------------------------------------------------- + + +def parse_xml_receipt( + receipt_root: ET.Element, +) -> tuple[bool, list[dict[str, str]], list[str]]: + """Parse an ENA XML receipt for study submissions. + + Args: + receipt_root: Root element of the receipt XML. + + Returns: + Tuple of (*success*, *accessions*, *messages*). + """ + success = ( + receipt_root.get("success", "false").lower() + == "true" + ) + accessions: list[dict[str, str]] = [] + messages: list[str] = [] + + msgs_el = receipt_root.find("MESSAGES") + if msgs_el is not None: + for info in msgs_el.findall("INFO"): + messages.append(f"INFO: {info.text}") + for err in msgs_el.findall("ERROR"): + messages.append(f"ERROR: {err.text}") + + for proj in receipt_root.findall("PROJECT"): + acc_info: dict[str, str] = { + "alias": proj.get("alias", ""), + "accession": proj.get("accession", ""), + "status": proj.get("status", ""), + "holdUntilDate": proj.get( + "holdUntilDate", "" + ), + } + ext = proj.find("EXT_ID") + if ext is not None: + acc_info["external_accession"] = ext.get( + "accession", "" + ) + acc_info["external_type"] = ext.get( + "type", "" + ) + accessions.append(acc_info) + + # Some receipts use STUDY instead of PROJECT. 
+ for study in receipt_root.findall("STUDY"): + accessions.append({ + "alias": study.get("alias", ""), + "accession": study.get("accession", ""), + "status": study.get("status", ""), + }) + + return success, accessions, messages + + +# ----------------------------------------------------------- +# Submission helper +# ----------------------------------------------------------- + + +def _do_submission( + base_url: str, + auth: Any, + xml_bytes: bytes, + xsd: Path, + action: str, + results: dict[str, list[dict[str, Any]]], + result_key: str, + env_label: str, + dry_run: bool, +) -> bool: + """Validate, optionally submit, and parse one batch. + + Args: + base_url: ENA Webin v2 submission base URL. + auth: HTTP basic-auth credentials. + xml_bytes: Serialised XML submission document. + xsd: Directory containing the XSD files. + action: Label for log messages (``"ADD"`` or + ``"MODIFY"``). + results: Results dict to accumulate into. + result_key: Key under which successes are stored. + env_label: ``"TEST"`` or ``"PRODUCTION"``. + dry_run: If ``True``, skip the actual submission. + + Returns: + ``True`` if the batch succeeded (or dry run). 
+ """ + xsd_valid, xsd_messages = validate_against_xsd( + xml_bytes, xsd, + ) + for msg in xsd_messages: + logger.info(" %s", msg) + if not xsd_valid: + logger.error( + "XSD validation FAILED (%s)" + " — aborting submission", action, + ) + return False + + logger.info("XSD validation PASSED (%s)", action) + + if dry_run: + logger.info( + "DRY RUN — skipping %s submission", action, + ) + logger.info( + "Generated XML:\n%s", + xml_bytes.decode("utf-8"), + ) + return True + + logger.info( + "Submitting %s to ENA (%s)...", action, env_label, + ) + try: + receipt_root = common.submit_xml( + base_url, auth, xml_bytes, + ) + except requests.exceptions.HTTPError as exc: + logger.error( + "HTTP error during %s submission: %s", + action, exc, + ) + if exc.response is not None: + logger.error( + "Response body: %s", exc.response.text, + ) + return False + + success, accessions, receipt_messages = ( + parse_xml_receipt(receipt_root) + ) + for msg in receipt_messages: + logger.info(" Receipt: %s", msg) + + if success: + logger.info("%s SUCCESSFUL", action) + for acc in accessions: + ext = acc.get("external_accession", "") + ext_suffix = ( + f" (study: {ext})" if ext else "" + ) + logger.info( + " %s: alias=%s accession=%s" + " status=%s%s", + action, acc["alias"], acc["accession"], + acc["status"], ext_suffix, + ) + results[result_key].append(acc) + else: + logger.error("%s FAILED", action) + receipt_xml_str = ET.tostring( + receipt_root, encoding="unicode", + ) + logger.error("Receipt XML: %s", receipt_xml_str) + results["failed"].extend(accessions) + + return success + + +# ----------------------------------------------------------- +# Main +# ----------------------------------------------------------- + +_JSON_RECORD_KEYS: Final = ("studies", "data") + + +@app.command() +def main( + input_file: Path = typer.Option( + ..., "--input", exists=True, + help="Path to study metadata file" + " (JSON, CSV, TSV, XLS, or XLSX)", + ), + linkml: Path = typer.Option( + ..., exists=True, + 
help="Path to LinkML YAML schema" + " (e.g. schemas/SRA_study.yaml)", + ), + xsd: Path = typer.Option( + ..., exists=True, + file_okay=False, resolve_path=True, + help="Directory containing ENA.project.xsd" + " and SRA.common.xsd", + ), + test: bool = typer.Option( + False, "--test", + help="Use the ENA test service" + " (submissions are discarded daily)", + ), + hold_until: str | None = typer.Option( + None, "--hold-until", + help="Hold studies private until this date" + " (YYYY-MM-DD, max 2 years from now)", + ), + log: Path | None = typer.Option( + None, help="Path to log file", + ), + output: Path | None = typer.Option( + None, + help="Path to write JSON accession results" + " (default: stdout)", + ), + max_results: int = typer.Option( + 5000, "--max-results", + help="Maximum number of projects to fetch" + " from the Reports API for duplicate" + " checking", + ), + dry_run: bool = typer.Option( + False, "--dry-run", + help="Validate and build XML but do not" + " submit to ENA", + ), + automated: bool = typer.Option( + False, "--automated", + help="Skip duplicate detection against the" + " Webin Reports API (for automated pipelines)", + ), + force: bool = typer.Option( + False, "--force", + help="Submit duplicate studies using the MODIFY" + " action to overwrite existing ENA records," + " instead of skipping them", + ), +) -> None: + """Submit studies to ENA via the Webin REST API v2.""" + common.setup_logging(log) + username, password = common.get_credentials() + + env_label = "TEST" if test else "PRODUCTION" + logger.info( + "ENA Study Submission — environment: %s", + env_label, + ) + base_url = common.get_base_url(test) + auth = HTTPBasicAuth(username, password) + logger.debug("Auth username: %s", username) + + if hold_until: + common.validate_hold_until(hold_until) + + # -- Step 1: Load input file ------------------------- + logger.info("Loading input: %s", input_file) + studies = common.load_input_file( + input_file, json_record_keys=_JSON_RECORD_KEYS, + ) 
+ if studies is None: + logger.error( + "Unsupported file format." + " Supported: .json, .csv, .tsv, .xlsx, .xls", + ) + sys.exit(1) + + logger.info( + "Loaded %d study/studies from input", + len(studies), + ) + + # -- Step 2: Check for duplicates -------------------- + if automated: + logger.info( + "Automated mode: skipping duplicate detection", + ) + duplicates: dict[int, dict[str, Any]] = {} + else: + account_studies = fetch_account_studies( + auth, use_test=test, + max_results=max_results, + ) + for ps in account_studies: + logger.info( + " Account study: %s | alias=%s" + " | title=%s | status=%s", + ps["accession"], ps["alias"], + ps["title"], ps["status"], + ) + duplicates = find_duplicate_studies( + studies, account_studies, + ) + + results: dict[str, list[dict[str, Any]]] = { + "duplicates": [], + "submitted": [], + "modified": [], + "failed": [], + } + + studies_to_modify: list[dict[str, Any]] = [] + if duplicates: + action_label = ( + "will be re-submitted with MODIFY" + if force else "will NOT be submitted" + ) + logger.warning( + "Found %d duplicate(s) — %s:", + len(duplicates), action_label, + ) + for idx, dup_info in duplicates.items(): + study_title = studies[idx].get( + "STUDY_TITLE", f"study[{idx}]", + ) + logger.warning( + " DUPLICATE: '%s' matches existing %s" + " (accession: %s)", + study_title, + dup_info["match_reason"], + dup_info["accession"], + ) + results["duplicates"].append({ + "input_index": idx, + "title": study_title, + "alias": studies[idx].get("alias", ""), + "existing_accession": ( + dup_info["accession"] + ), + "existing_secondary_accession": ( + dup_info.get( + "secondary_accession", "" + ) + ), + "match_reason": dup_info["match_reason"], + }) + if force: + study_copy = dict(studies[idx]) + existing_alias = dup_info.get("alias", "") + if existing_alias: + study_copy["alias"] = existing_alias + studies_to_modify.append(study_copy) + + studies_to_submit = [ + s for i, s in enumerate(studies) + if i not in duplicates + ] + + if not 
studies_to_submit and not studies_to_modify: + logger.info( + "No studies to submit" + " (all are duplicates or input is empty)", + ) + common.write_results(results, output) + return + + logger.info( + "%d new study/studies to ADD," + " %d duplicate(s) to MODIFY", + len(studies_to_submit), len(studies_to_modify), + ) + + # -- Step 3: Validate against LinkML ----------------- + logger.info("Loading LinkML schema: %s", linkml) + schema = common.load_linkml_schema(linkml) + + logger.info( + "Validating input against LinkML schema...", + ) + linkml_valid, linkml_messages = ( + common.validate_against_linkml( + studies_to_submit + studies_to_modify, schema, + label_fields=["STUDY_TITLE", "alias"], + entity_name="study", + unknown_field_note="will be ignored", + ) + ) + for msg in linkml_messages: + logger.info(" %s", msg) + + if not linkml_valid: + logger.error( + "LinkML validation FAILED" + " — aborting submission", + ) + sys.exit(1) + + logger.info("LinkML validation PASSED") + + overall_ok = True + + # -- Steps 4-7: ADD new studies ---------------------- + if studies_to_submit: + logger.info( + "Building ADD XML for %d new study/studies...", + len(studies_to_submit), + ) + xml_root = build_submission_xml( + studies_to_submit, hold_until=hold_until, + action="ADD", + ) + xml_bytes = common.xml_to_bytes(xml_root) + logger.debug( + "Generated XML (ADD):\n%s", + xml_bytes.decode("utf-8"), + ) + logger.info( + "XML document size (ADD): %d bytes", + len(xml_bytes), + ) + ok = _do_submission( + base_url, auth, xml_bytes, xsd, + action="ADD", + results=results, + result_key="submitted", + env_label=env_label, + dry_run=dry_run, + ) + overall_ok = overall_ok and ok + + # -- Steps 4-7: MODIFY duplicate studies (--force) --- + if studies_to_modify: + logger.info( + "Building MODIFY XML for %d duplicate(s)...", + len(studies_to_modify), + ) + xml_root = build_submission_xml( + studies_to_modify, hold_until=hold_until, + action="MODIFY", + ) + xml_bytes = 
common.xml_to_bytes(xml_root) + logger.debug( + "Generated XML (MODIFY):\n%s", + xml_bytes.decode("utf-8"), + ) + logger.info( + "XML document size (MODIFY): %d bytes", + len(xml_bytes), + ) + ok = _do_submission( + base_url, auth, xml_bytes, xsd, + action="MODIFY", + results=results, + result_key="modified", + env_label=env_label, + dry_run=dry_run, + ) + overall_ok = overall_ok and ok + + if not overall_ok: + sys.exit(1) + + # -- Step 8: Output results -------------------------- + common.write_results(results, output) + + logger.info("=" * 60) + logger.info("SUBMISSION SUMMARY") + logger.info( + " Duplicates skipped: %d", + len(results["duplicates"]) + - len(results["modified"]), + ) + for d in results["duplicates"]: + logger.info( + " %s -> %s", + d["title"], d["existing_accession"], + ) + logger.info( + " Newly submitted (ADD): %d", + len(results["submitted"]), + ) + for s in results["submitted"]: + ext = s.get("external_accession", "") + ext_suffix = f" ({ext})" if ext else "" + logger.info( + " %s -> %s%s", + s["alias"], s["accession"], ext_suffix, + ) + logger.info( + " Modified (MODIFY): %d", + len(results["modified"]), + ) + for m in results["modified"]: + ext = m.get("external_accession", "") + ext_suffix = f" ({ext})" if ext else "" + logger.info( + " %s -> %s%s", + m["alias"], m["accession"], ext_suffix, + ) + logger.info("=" * 60) + + +if __name__ == "__main__": + app() From c6d80fbb5d5e397d4668e7d6aea992282261dd3f Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Thu, 12 Mar 2026 11:38:02 +0000 Subject: [PATCH 02/36] Modified study-submit script to work in mgnify-pipelines-toolkit container. Removed linkml validation which is more useful for interactive submission. 
--- assets/test-fixtures/mimicc_study.csv | 3 + assets/test-fixtures/mimicc_study.json | 15 + assets/test-fixtures/mimicc_study.tsv | 3 + bin/ena_common.py | 726 +++++++++++++++++++++++++ bin/submit_study.py | 232 ++++---- bin/test_submit_study.py | 430 +++++++++++++++ 6 files changed, 1275 insertions(+), 134 deletions(-) create mode 100644 assets/test-fixtures/mimicc_study.csv create mode 100644 assets/test-fixtures/mimicc_study.json create mode 100644 assets/test-fixtures/mimicc_study.tsv create mode 100644 bin/ena_common.py create mode 100644 bin/test_submit_study.py diff --git a/assets/test-fixtures/mimicc_study.csv b/assets/test-fixtures/mimicc_study.csv new file mode 100644 index 0000000..2b68cc1 --- /dev/null +++ b/assets/test-fixtures/mimicc_study.csv @@ -0,0 +1,3 @@ +Generic,,,,,,,, +IS_PRIMARY,STUDY_TITLE,existing_study_type,new_study_type,STUDY_ABSTRACT,CENTER_NAME,CENTER_PROJECT_NAME,PROJECT_ID,STUDY_DESCRIPTION +YES,MIMICC,Metagenomics,,,,,, \ No newline at end of file diff --git a/assets/test-fixtures/mimicc_study.json b/assets/test-fixtures/mimicc_study.json new file mode 100644 index 0000000..cd9af28 --- /dev/null +++ b/assets/test-fixtures/mimicc_study.json @@ -0,0 +1,15 @@ +{ + "schema": "https://github.com/timrozday/ena-submission-dataharmonizer/SRA_study", + "location": "/templates/sra_study", + "version": "1.0.0", + "in_language": "en", + "Container": { + "SRA_studys": [ + { + "IS_PRIMARY": "YES", + "STUDY_TITLE": "MIMICC", + "existing_study_type": "Metagenomics" + } + ] + } +} \ No newline at end of file diff --git a/assets/test-fixtures/mimicc_study.tsv b/assets/test-fixtures/mimicc_study.tsv new file mode 100644 index 0000000..4682df1 --- /dev/null +++ b/assets/test-fixtures/mimicc_study.tsv @@ -0,0 +1,3 @@ +Generic +IS_PRIMARY STUDY_TITLE existing_study_type new_study_type STUDY_ABSTRACT CENTER_NAME CENTER_PROJECT_NAME PROJECT_ID STUDY_DESCRIPTION +YES MIMICC Metagenomics \ No newline at end of file diff --git a/bin/ena_common.py 
b/bin/ena_common.py new file mode 100644 index 0000000..de08c48 --- /dev/null +++ b/bin/ena_common.py @@ -0,0 +1,726 @@ +"""Shared utilities for ENA submission scripts. + +Provide logging, credential management, file loading, +XSD structural validation, Reports API access, duplicate +detection, XML serialisation, and result output used by +``submit_study.py``, ``submit_sample.py``, and +``submit_reads.py``. +""" + +from __future__ import annotations + +import csv +import datetime +import json +import logging +import os +import sys +import xml.etree.ElementTree as ET +from collections.abc import Callable, Sequence +from io import BytesIO +from pathlib import Path +from typing import Any, Final + +import click +import requests +from requests.auth import HTTPBasicAuth + +# All loggers in the ENA submission scripts are children of +# this root, so configuring it once propagates to all. +_LOGGER_NAME: Final = "ena_submit" + +logger = logging.getLogger(_LOGGER_NAME) + + +# ----------------------------------------------------------- +# Constants +# ----------------------------------------------------------- + +PROD_URL: Final = ( + "https://www.ebi.ac.uk/ena/submit/webin-v2" +) +TEST_URL: Final = ( + "https://wwwdev.ebi.ac.uk/ena/submit/webin-v2" +) + +_MAX_HOLD_YEARS: Final = 2 + + +# ----------------------------------------------------------- +# Logging +# ----------------------------------------------------------- + + +def setup_logging(log_file: Path | None = None) -> None: + """Configure stderr and optional file logging. + + Attach handlers to the ``ena_submit`` parent logger. + Child loggers (e.g. ``ena_submit.study``) propagate + their messages to these handlers automatically. + + Args: + log_file: Path to a log file. If provided, + debug-level messages are written there in + addition to stderr. + """ + root = logging.getLogger(_LOGGER_NAME) + + # Avoid duplicate handlers on repeated calls. 
+ if root.handlers: + return + + fmt = logging.Formatter( + "%(asctime)s [%(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + root.setLevel(logging.DEBUG) + + stderr_handler = logging.StreamHandler(sys.stderr) + stderr_handler.setLevel(logging.INFO) + stderr_handler.setFormatter(fmt) + root.addHandler(stderr_handler) + + if log_file: + file_handler = logging.FileHandler(log_file) + file_handler.setLevel(logging.DEBUG) + file_handler.setFormatter(fmt) + root.addHandler(file_handler) + + +# ----------------------------------------------------------- +# Credentials +# ----------------------------------------------------------- + + +def get_credentials() -> tuple[str, str]: + """Read ENA credentials from environment variables. + + Returns: + Tuple of (*username*, *password*). + + Raises: + SystemExit: If either variable is unset or empty. + """ + username = os.environ.get("ENA_USERNAME", "").strip() + password = os.environ.get("ENA_PASSWORD", "").strip() + if not username or not password: + logger.error( + "ENA_USERNAME and ENA_PASSWORD environment" + " variables must be set", + ) + sys.exit(1) + return username, password + + +# ----------------------------------------------------------- +# ENA API helpers +# ----------------------------------------------------------- + + +def get_base_url(use_test: bool) -> str: + """Return the ENA Webin v2 submission base URL.""" + return TEST_URL if use_test else PROD_URL + + +def submit_xml( + base_url: str, + auth: HTTPBasicAuth, + xml_bytes: bytes, +) -> ET.Element: + """Submit an XML document to ENA via Webin v2. + + Args: + base_url: ENA submission service base URL. + auth: HTTP basic-auth credentials. + xml_bytes: Serialised XML submission document. + + Returns: + Parsed receipt XML element tree root. 
+ """ + url = f"{base_url}/submit" + headers = { + "Content-Type": "application/xml", + "Accept": "application/xml", + } + resp = requests.post( + url, data=xml_bytes, + headers=headers, auth=auth, timeout=120, + ) + resp.raise_for_status() + return ET.fromstring(resp.content) + + +# ----------------------------------------------------------- +# XML utilities +# ----------------------------------------------------------- + + +def xml_to_bytes(root: ET.Element) -> bytes: + """Serialise an ElementTree element to UTF-8 bytes.""" + tree = ET.ElementTree(root) + buf = BytesIO() + tree.write(buf, encoding="UTF-8", xml_declaration=True) + return buf.getvalue() + + +# ----------------------------------------------------------- +# Hold-until date validation +# ----------------------------------------------------------- + + +def validate_hold_until(hold_until: str) -> datetime.date: + """Parse and validate a hold-until date string. + + Args: + hold_until: Date string in ``YYYY-MM-DD`` format. + + Returns: + Parsed date. + + Raises: + click.BadParameter: If the date format is invalid, + in the past, or more than 2 years from today. + """ + try: + hold_date = datetime.date.fromisoformat(hold_until) + except ValueError: + raise click.BadParameter( + f"Invalid date format: {hold_until!r}." + " Expected YYYY-MM-DD." + ) from None + + today = datetime.date.today() + max_date = today.replace(year=today.year + _MAX_HOLD_YEARS) + + if hold_date > max_date: + raise click.BadParameter( + f"Hold date {hold_until} is more than" + f" {_MAX_HOLD_YEARS} years from today" + f" ({today}). Maximum allowed: {max_date}." + ) + + if hold_date <= today: + raise click.BadParameter( + f"Hold date {hold_until} is not in the" + f" future (today is {today})." 
+ ) + + return hold_date + + +# ----------------------------------------------------------- +# ENA checklist XML parsing +# ----------------------------------------------------------- + + +def parse_checklist_units( + xml_path: str | Path, +) -> dict[str, str]: + """Parse an ENA checklist XML and return field units. + + Reads the ```` elements from an ENA checklist XML + file (e.g. ``ERC000015.xml``) and returns a mapping from + slot name to unit string for every field that declares a + ```` element. + + Args: + xml_path: Path to the ENA checklist XML file. + + Returns: + Dict mapping slot name to unit string. + Fields without units are absent from the dict. + """ + units: dict[str, str] = {} + try: + tree = ET.parse(str(xml_path)) + except ET.ParseError as exc: + logger.warning( + "Could not parse checklist XML %s: %s", + xml_path, exc, + ) + return units + + for field in tree.iter("FIELD"): + name_el = field.find("NAME") + if name_el is None or not name_el.text: + continue + units_el = field.find("UNITS") + if units_el is None: + continue + unit_el = units_el.find("UNIT") + if unit_el is None or not unit_el.text: + continue + units[name_el.text.strip()] = unit_el.text.strip() + + return units + + +# ----------------------------------------------------------- +# XSD validation (structural fallback only) +# ----------------------------------------------------------- + + +def validate_xml_against_xsd( + xml_bytes: bytes, + fragment_tag: str | None = None, + fallback_checker: Callable[ + [bytes, list[str]], tuple[bool, list[str]] + ] | None = None, +) -> tuple[bool, list[str]]: + """Validate XML bytes using a structural check. + + Full XSD validation via lxml is not available in this + container. Uses *fallback_checker* if provided, + otherwise checks that the document is well-formed XML. + + Args: + xml_bytes: Serialised XML document. + fragment_tag: Unused; kept for API compatibility. 
+ fallback_checker: Optional function called with + (*xml_bytes*, *messages*) that returns + (*is_valid*, *messages*). + + Returns: + Tuple of (*is_valid*, *messages*). + """ + messages: list[str] = [] + + if fallback_checker is not None: + return fallback_checker(xml_bytes, messages) + + try: + ET.fromstring(xml_bytes) + except ET.ParseError as exc: + messages.append( + f"ERROR: XML is not well-formed: {exc}" + ) + return False, messages + + messages.append( + "XML is well-formed (basic check passed)" + ) + return True, messages + + +# ----------------------------------------------------------- +# File loading (JSON, CSV, TSV) +# ----------------------------------------------------------- + + +def _is_metadata_row(row: Sequence[object]) -> bool: + """Check whether *row* is a DataHarmonizer label row. + + These rows have at most one non-empty cell. + """ + non_empty = sum( + 1 for c in row + if c is not None and str(c).strip() + ) + return non_empty <= 1 + + +def extract_records_from_tabular( + filepath: str | Path, + delimiter: str = ",", +) -> list[dict[str, str]]: + """Extract record dicts from a CSV or TSV file. + + Skip an optional DataHarmonizer metadata row if + detected. + + Args: + filepath: Path to the tabular file. + delimiter: Column delimiter character. + + Returns: + List of record dicts. 
+ """ + with open(filepath, newline="", encoding="utf-8") as fh: + rows = list(csv.reader(fh, delimiter=delimiter)) + + if not rows: + return [] + + idx = 0 + if _is_metadata_row(rows[idx]): + idx += 1 + if idx >= len(rows): + return [] + + headers = rows[idx] + idx += 1 + + records: list[dict[str, str]] = [] + for row in rows[idx:]: + record: dict[str, str] = {} + for col, val in zip(headers, row): + col = col.strip() + if col and val is not None and val.strip(): + record[col] = val.strip() + if record: + records.append(record) + + return records + + +def extract_records_from_json( + input_data: object, + record_keys: Sequence[str] = ("data",), +) -> list[dict[str, Any]] | None: + """Extract record dicts from a DataHarmonizer JSON export. + + Handle several JSON shapes: + + * DataHarmonizer Container format:: + + {"Container": {"s": [{...}, ...]}} + + * Plain list of dicts. + * Dict with an entity-specific key or ``data`` key. + * Single record object (no wrapper). + + Args: + input_data: Parsed JSON data (any shape). + record_keys: Dict keys to check for record lists + (e.g. ``["studies", "data"]``). + + Returns: + List of record dicts, or ``None`` if unrecognised. + """ + if isinstance(input_data, list): + return input_data + + if isinstance(input_data, dict): + container = input_data.get("Container") + if isinstance(container, dict): + for key, val in container.items(): + if isinstance(val, list): + logger.info( + "Extracted records from" + " Container.%s", + key, + ) + return val + + for key in record_keys: + if key in input_data: + return input_data[key] + + return [input_data] + + return None + + +def load_input_file( + filepath: str | Path, + json_record_keys: Sequence[str] = ("data",), +) -> list[dict[str, Any]] | None: + """Load records from a supported file format. + + Supported formats: JSON, CSV, TSV. + + Args: + filepath: Path to the input file. + json_record_keys: Dict keys to check when parsing + JSON (e.g. ``["studies", "data"]``). 
+ + Returns: + List of record dicts, or ``None`` if the format is + unrecognised. + """ + ext = Path(filepath).suffix.lower() + if ext == ".json": + with open(filepath) as fh: + input_data = json.load(fh) + return extract_records_from_json( + input_data, json_record_keys, + ) + if ext == ".csv": + return extract_records_from_tabular( + filepath, delimiter=",", + ) + if ext == ".tsv": + return extract_records_from_tabular( + filepath, delimiter="\t", + ) + return None + + +# ----------------------------------------------------------- +# Reports API +# ----------------------------------------------------------- + + +def fetch_from_reports_endpoint( + url: str, + auth: HTTPBasicAuth, + max_results: int = 5000, +) -> list[dict[str, Any]] | None: + """Fetch records from a single Webin Reports endpoint. + + Args: + url: Full URL of the reports endpoint. + auth: HTTP basic-auth credentials. + max_results: Maximum number of results to request. + + Returns: + List of raw report dicts, or ``None`` on error. 
+ """ + params = { + "format": "json", + "max-results": max_results, + } + + req = requests.Request( + "GET", url, params=params, auth=auth, + ) + prepared = req.prepare() + logger.debug( + 'curl -u %s:*** "%s"', + auth.username, prepared.url, + ) + + try: + resp = requests.get( + url, params=params, auth=auth, timeout=60, + ) + logger.info( + "Reports API at %s returned %s", + url, resp.status_code, + ) + resp.raise_for_status() + return resp.json() + + except requests.exceptions.HTTPError as exc: + status = ( + exc.response.status_code + if exc.response is not None + else "unknown" + ) + if status == 404: + logger.info( + "Reports API at %s returned 404" + " — no records yet", + url, + ) + return [] + if status in (401, 403): + logger.warning( + "Reports API at %s returned %s" + " — endpoint may not be available" + " or credentials may differ", + url, status, + ) + return None + logger.warning( + "Reports API at %s returned HTTP %s", + url, status, + ) + return None + + except requests.exceptions.RequestException as exc: + logger.warning( + "Reports API at %s failed: %s", url, exc, + ) + return None + + +def fetch_account_records( + auth: HTTPBasicAuth, + use_test: bool, + prod_url: str, + test_url: str, + normalizer: Callable[ + [dict[str, Any]], dict[str, str] | None + ], + entity_label: str, + max_results: int = 5000, +) -> list[dict[str, str]]: + """Fetch and normalise records from the Reports API. + + Try test endpoint first (if *use_test*), then fall back + to production. + + Args: + auth: HTTP basic-auth credentials. + use_test: Try the test endpoint first. + prod_url: Production reports endpoint URL. + test_url: Test reports endpoint URL. + normalizer: Callable that maps a raw report dict to + a normalised dict, or ``None`` to skip. + entity_label: Label for log messages (e.g. + ``"studies"``). + max_results: Maximum number of results to request. + + Returns: + List of normalised record dicts. 
+ """ + urls = ( + [test_url, prod_url] if use_test + else [prod_url] + ) + + for url in urls: + logger.info( + "Fetching account %s from: %s", + entity_label, url, + ) + raw = fetch_from_reports_endpoint( + url, auth, max_results, + ) + if raw is None: + continue + + records: list[dict[str, str]] = [] + for entry in raw: + report = entry.get("report") + if report is None: + continue + normalized = normalizer(report) + if normalized is not None: + records.append(normalized) + + logger.info( + "Found %d %s in account", + len(records), entity_label, + ) + return records + + logger.warning( + "Could not reach any Webin reports endpoint." + " Duplicate checking for %s will be skipped.", + entity_label, + ) + return [] + + +# ----------------------------------------------------------- +# Duplicate detection (alias + title matching) +# ----------------------------------------------------------- + + +def find_duplicates_by_alias_title( + new_records: Sequence[dict[str, Any]], + account_records: Sequence[dict[str, str]], + title_field: str, + entity_label: str, +) -> dict[int, dict[str, str]]: + """Check new records against account records. + + Match by ``alias`` (preferred) or by the entity-specific + title field against the pre-fetched account records from + the Webin Reports API. + + Args: + new_records: Records the user wants to submit. + account_records: Existing records already registered + under the Webin account. + title_field: Field name for the title in new records + (e.g. ``"STUDY_TITLE"`` or ``"SAMPLE_TITLE"``). + entity_label: Label for log messages. + + Returns: + Mapping of index in *new_records* to matching + existing record info. 
+ """ + duplicates: dict[int, dict[str, str]] = {} + total = len(new_records) + + if not account_records: + return duplicates + + by_title: dict[str, dict[str, str]] = {} + by_alias: dict[str, dict[str, str]] = {} + for rec in account_records: + title = (rec.get("title") or "").strip() + alias = (rec.get("alias") or "").strip() + if title: + by_title[title] = rec + if alias: + by_alias[alias] = rec + + logger.info( + "Checking %d new %s against" + " %d existing account %s...", + total, entity_label, + len(account_records), entity_label, + ) + + for i, record in enumerate(new_records): + new_title = ( + record.get(title_field) or "" + ).strip() + new_alias = (record.get("alias") or "").strip() + + if not new_title and not new_alias: + continue + + match = _match_by_alias_title( + new_alias, new_title, by_alias, by_title, + ) + if match is not None: + duplicates[i] = match + logger.info( + " Duplicate: '%s' matches %s -> %s (%s)", + new_title or new_alias, + match["match_reason"], + match["accession"], + match["status"], + ) + + if len(duplicates) == total: + logger.info( + "All %s are duplicates" + " — skipping further checks", + entity_label, + ) + return duplicates + + return duplicates + + +def _match_by_alias_title( + new_alias: str, + new_title: str, + by_alias: dict[str, dict[str, str]], + by_title: dict[str, dict[str, str]], +) -> dict[str, str] | None: + """Return matching record info or ``None``.""" + if new_alias and new_alias in by_alias: + rec = by_alias[new_alias] + reason = f"alias '{new_alias}'" + elif new_title and new_title in by_title: + rec = by_title[new_title] + reason = f"title '{new_title}'" + else: + return None + + return { + "accession": rec.get("accession", ""), + "secondary_accession": rec.get( + "secondary_accession", "" + ), + "alias": rec.get("alias", ""), + "title": rec.get("title", ""), + "status": rec.get("status", "UNKNOWN"), + "match_reason": reason, + } + + +# ----------------------------------------------------------- +# Result 
output +# ----------------------------------------------------------- + + +def write_results( + results: dict[str, list[dict[str, Any]]], + output_path: Path | None, +) -> None: + """Write JSON results to file or stdout.""" + json_str = json.dumps(results, indent=2) + if output_path: + with open(output_path, "w") as fh: + fh.write(json_str + "\n") + logger.info("Results written to %s", output_path) + else: + print(json_str) diff --git a/bin/submit_study.py b/bin/submit_study.py index 9ec012a..656f746 100644 --- a/bin/submit_study.py +++ b/bin/submit_study.py @@ -2,7 +2,6 @@ """Submit studies to ENA via the Webin REST API v2. Read a DataHarmonizer export containing study metadata, -validate it against a LinkML schema and an XSD schema, check for duplicate studies already registered under the Webin account, construct an XML submission document, and submit new studies to ENA. @@ -15,46 +14,36 @@ Usage:: - python scripts/submit_study.py \\ - --input studies.json \\ - --linkml schemas/SRA_study.yaml \\ - --xsd assets/ena_schema \\ + python scripts/submit_study.py \ + --input studies.json \ --test # With hold date (max 2 years): - python scripts/submit_study.py \\ - --input studies.json \\ - --linkml schemas/SRA_study.yaml \\ - --xsd assets/ena_schema \\ + python scripts/submit_study.py \ + --input studies.json \ --hold-until 2028-01-01 # Log to file: - python scripts/submit_study.py \\ - --input studies.json \\ - --linkml schemas/SRA_study.yaml \\ - --xsd assets/ena_schema \\ + python scripts/submit_study.py \ + --input studies.json \ --test --log submission.log """ from __future__ import annotations +import datetime import logging import sys import xml.etree.ElementTree as ET from pathlib import Path from typing import Any, Final -import pendulum +import click import requests -import typer from requests.auth import HTTPBasicAuth import ena_common as common -app = typer.Typer( - help="Submit studies to ENA via the Webin REST API v2.", -) - logger = 
logging.getLogger("ena_submit.study") @@ -180,7 +169,7 @@ def build_submission_xml( ) sub_alias = ( "study-submission-" - + pendulum.now().format("YYYYMMDD-HHmmss") + + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") ) submission.set("alias", sub_alias) actions = ET.SubElement(submission, "ACTIONS") @@ -259,7 +248,7 @@ def _add_project_attribute( # ----------------------------------------------------------- -# XSD validation (study-specific fallback) +# Structural XML validation (study-specific) # ----------------------------------------------------------- @@ -267,7 +256,7 @@ def _validate_study_xml_structure( xml_bytes: bytes, messages: list[str], ) -> tuple[bool, list[str]]: - """Fallback structural check for study XML.""" + """Structural check for study XML.""" try: tree = ET.fromstring(xml_bytes) except ET.ParseError as exc: @@ -314,24 +303,19 @@ def _validate_study_xml_structure( return True, messages -def validate_against_xsd( +def validate_study_xml( xml_bytes: bytes, - xsd_dir: str | Path, ) -> tuple[bool, list[str]]: - """Validate study XML against ENA.project.xsd. + """Validate study XML structure. Args: xml_bytes: Serialised XML document. - xsd_dir: Directory containing ``ENA.project.xsd`` - and ``SRA.common.xsd``. Returns: Tuple of (*is_valid*, *messages*). """ return common.validate_xml_against_xsd( - xml_bytes, xsd_dir, - xsd_filename="ENA.project.xsd", - fragment_tag="PROJECT_SET", + xml_bytes, fallback_checker=_validate_study_xml_structure, ) @@ -405,7 +389,6 @@ def _do_submission( base_url: str, auth: Any, xml_bytes: bytes, - xsd: Path, action: str, results: dict[str, list[dict[str, Any]]], result_key: str, @@ -418,7 +401,6 @@ def _do_submission( base_url: ENA Webin v2 submission base URL. auth: HTTP basic-auth credentials. xml_bytes: Serialised XML submission document. - xsd: Directory containing the XSD files. action: Label for log messages (``"ADD"`` or ``"MODIFY"``). results: Results dict to accumulate into. 
@@ -429,19 +411,17 @@ def _do_submission( Returns: ``True`` if the batch succeeded (or dry run). """ - xsd_valid, xsd_messages = validate_against_xsd( - xml_bytes, xsd, - ) - for msg in xsd_messages: + xml_valid, xml_messages = validate_study_xml(xml_bytes) + for msg in xml_messages: logger.info(" %s", msg) - if not xsd_valid: + if not xml_valid: logger.error( - "XSD validation FAILED (%s)" + "XML validation FAILED (%s)" " — aborting submission", action, ) return False - logger.info("XSD validation PASSED (%s)", action) + logger.info("XML validation PASSED (%s)", action) if dry_run: logger.info( @@ -509,75 +489,86 @@ def _do_submission( _JSON_RECORD_KEYS: Final = ("studies", "data") -@app.command() +@click.command( + help="Submit studies to ENA via the Webin REST API v2.", +) +@click.option( + "--input", "input_file", + required=True, + type=click.Path(exists=True, path_type=Path), + help="Path to study metadata file (JSON, CSV, or TSV)", +) +@click.option( + "--test", "use_test", + is_flag=True, default=False, + help="Use the ENA test service" + " (submissions are discarded daily)", +) +@click.option( + "--hold-until", + default=None, + help="Hold studies private until this date" + " (YYYY-MM-DD, max 2 years from now)", +) +@click.option( + "--log", "log_file", + type=click.Path(path_type=Path), + default=None, + help="Path to log file", +) +@click.option( + "--output", + type=click.Path(path_type=Path), + default=None, + help="Path to write JSON accession results" + " (default: stdout)", +) +@click.option( + "--max-results", + default=5000, + help="Maximum number of projects to fetch" + " from the Reports API for duplicate checking", +) +@click.option( + "--dry-run", + is_flag=True, default=False, + help="Validate and build XML but do not" + " submit to ENA", +) +@click.option( + "--automated", + is_flag=True, default=False, + help="Skip duplicate detection against the" + " Webin Reports API (for automated pipelines)", +) +@click.option( + "--force", + 
is_flag=True, default=False, + help="Submit duplicate studies using the MODIFY" + " action to overwrite existing ENA records," + " instead of skipping them", +) def main( - input_file: Path = typer.Option( - ..., "--input", exists=True, - help="Path to study metadata file" - " (JSON, CSV, TSV, XLS, or XLSX)", - ), - linkml: Path = typer.Option( - ..., exists=True, - help="Path to LinkML YAML schema" - " (e.g. schemas/SRA_study.yaml)", - ), - xsd: Path = typer.Option( - ..., exists=True, - file_okay=False, resolve_path=True, - help="Directory containing ENA.project.xsd" - " and SRA.common.xsd", - ), - test: bool = typer.Option( - False, "--test", - help="Use the ENA test service" - " (submissions are discarded daily)", - ), - hold_until: str | None = typer.Option( - None, "--hold-until", - help="Hold studies private until this date" - " (YYYY-MM-DD, max 2 years from now)", - ), - log: Path | None = typer.Option( - None, help="Path to log file", - ), - output: Path | None = typer.Option( - None, - help="Path to write JSON accession results" - " (default: stdout)", - ), - max_results: int = typer.Option( - 5000, "--max-results", - help="Maximum number of projects to fetch" - " from the Reports API for duplicate" - " checking", - ), - dry_run: bool = typer.Option( - False, "--dry-run", - help="Validate and build XML but do not" - " submit to ENA", - ), - automated: bool = typer.Option( - False, "--automated", - help="Skip duplicate detection against the" - " Webin Reports API (for automated pipelines)", - ), - force: bool = typer.Option( - False, "--force", - help="Submit duplicate studies using the MODIFY" - " action to overwrite existing ENA records," - " instead of skipping them", - ), + input_file: Path, + use_test: bool, + hold_until: str | None, + log_file: Path | None, + output: Path | None, + max_results: int, + dry_run: bool, + automated: bool, + force: bool, ) -> None: """Submit studies to ENA via the Webin REST API v2.""" - common.setup_logging(log) + 
common.setup_logging(log_file) username, password = common.get_credentials() - env_label = "TEST" if test else "PRODUCTION" + env_label = "TEST" if use_test else "PRODUCTION" logger.info( "ENA Study Submission — environment: %s", env_label, ) - base_url = common.get_base_url(test) + base_url = common.get_base_url(use_test) auth = HTTPBasicAuth(username, password) logger.debug("Auth username: %s", username) @@ -592,7 +583,7 @@ def main( if studies is None: logger.error( "Unsupported file format." - " Supported: .json, .csv, .tsv, .xlsx, .xls", + " Supported: .json, .csv, .tsv", ) sys.exit(1) @@ -609,7 +600,7 @@ def main( duplicates: dict[int, dict[str, Any]] = {} else: account_studies = fetch_account_studies( - auth, use_test=test, + auth, use_test=use_test, max_results=max_results, ) for ps in account_studies: @@ -691,36 +682,9 @@ def main( len(studies_to_submit), len(studies_to_modify), ) - # -- Step 3: Validate against LinkML ----------------- - logger.info("Loading LinkML schema: %s", linkml) - schema = common.load_linkml_schema(linkml) - - logger.info( - "Validating input against LinkML schema...", - ) - linkml_valid, linkml_messages = ( - common.validate_against_linkml( - studies_to_submit + studies_to_modify, schema, - label_fields=["STUDY_TITLE", "alias"], - entity_name="study", - unknown_field_note="will be ignored", - ) - ) - for msg in linkml_messages: - logger.info(" %s", msg) - - if not linkml_valid: - logger.error( - "LinkML validation FAILED" - " — aborting submission", - ) - sys.exit(1) - - logger.info("LinkML validation PASSED") - overall_ok = True - # -- Steps 4-7: ADD new studies ---------------------- + # -- Step 3: ADD new studies ------------------------- if studies_to_submit: logger.info( "Building ADD XML for %d new study/studies...", @@ -740,7 +704,7 @@ def main( len(xml_bytes), ) ok = _do_submission( - base_url, auth, xml_bytes, xsd, + base_url, auth, xml_bytes, action="ADD", results=results, result_key="submitted", @@ -749,7 +713,7 @@ def 
main( ) overall_ok = overall_ok and ok - # -- Steps 4-7: MODIFY duplicate studies (--force) --- + # -- Step 4: MODIFY duplicate studies (--force) ------ if studies_to_modify: logger.info( "Building MODIFY XML for %d duplicate(s)...", @@ -769,7 +733,7 @@ def main( len(xml_bytes), ) ok = _do_submission( - base_url, auth, xml_bytes, xsd, + base_url, auth, xml_bytes, action="MODIFY", results=results, result_key="modified", @@ -781,7 +745,7 @@ def main( if not overall_ok: sys.exit(1) - # -- Step 8: Output results -------------------------- + # -- Step 5: Output results -------------------------- common.write_results(results, output) logger.info("=" * 60) @@ -822,4 +786,4 @@ def main( if __name__ == "__main__": - app() + main() diff --git a/bin/test_submit_study.py b/bin/test_submit_study.py new file mode 100644 index 0000000..5944207 --- /dev/null +++ b/bin/test_submit_study.py @@ -0,0 +1,430 @@ +#!/usr/bin/env python3 +"""Tests for submit_study.py and ena_common.py — study submission pipeline. 
+ +Usage: + pytest bin/test_submit_study.py -v +""" + +from __future__ import annotations + +import json +import os +import sys + +import pytest + +# Ensure the scripts directory is importable +sys.path.insert(0, os.path.dirname(__file__)) + +import ena_common as common +from submit_study import ( + build_submission_xml, + find_duplicate_studies, + validate_study_xml, +) + +# --------------------------------------------------------------------------- +# Paths +# --------------------------------------------------------------------------- + +FIXTURES_DIR = os.path.join( + os.path.dirname(__file__), "..", "assets", "test-fixtures", +) +MIMICC_JSON = os.path.join(FIXTURES_DIR, "mimicc_study.json") +MIMICC_CSV = os.path.join(FIXTURES_DIR, "mimicc_study.csv") +MIMICC_TSV = os.path.join(FIXTURES_DIR, "mimicc_study.tsv") + +_FIXTURES_PRESENT = os.path.isfile(MIMICC_JSON) +requires_fixtures = pytest.mark.skipif( + not _FIXTURES_PRESENT, + reason="mimicc test fixtures not present in assets/test-fixtures/", +) + +_JSON_RECORD_KEYS = ("studies", "data") + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def mimicc_json(): + """Load the MIMICC study JSON fixture.""" + with open(MIMICC_JSON) as f: + return json.load(f) + + +# --------------------------------------------------------------------------- +# extract_records_from_json tests +# --------------------------------------------------------------------------- + + +class TestExtractRecordsFromJson: + """Tests for extracting study rows from various JSON formats.""" + + @requires_fixtures + def test_dataharmonizer_container_format(self, mimicc_json): + """The mimicc_study.json fixture uses DataHarmonizer Container format.""" + studies = common.extract_records_from_json( + mimicc_json, record_keys=_JSON_RECORD_KEYS, + ) + assert studies is not None + assert len(studies) == 1 + assert 
studies[0]["STUDY_TITLE"] == "MIMICC" + assert studies[0]["existing_study_type"] == "Metagenomics" + assert studies[0]["IS_PRIMARY"] == "YES" + + def test_plain_list(self): + """Plain list input returns the list as-is.""" + data = [{"STUDY_TITLE": "Test Study", "IS_PRIMARY": "YES"}] + studies = common.extract_records_from_json( + data, record_keys=_JSON_RECORD_KEYS, + ) + assert studies == data + + def test_dict_with_studies_key(self): + """Dict with 'studies' key extracts the list.""" + data = {"studies": [{"STUDY_TITLE": "A"}, {"STUDY_TITLE": "B"}]} + studies = common.extract_records_from_json( + data, record_keys=_JSON_RECORD_KEYS, + ) + assert len(studies) == 2 + + def test_dict_with_data_key(self): + """Dict with 'data' key extracts the list.""" + data = {"data": [{"STUDY_TITLE": "C"}]} + studies = common.extract_records_from_json( + data, record_keys=_JSON_RECORD_KEYS, + ) + assert len(studies) == 1 + + def test_single_study_object(self): + """Single dict input is wrapped in a list.""" + data = {"STUDY_TITLE": "Single"} + studies = common.extract_records_from_json( + data, record_keys=_JSON_RECORD_KEYS, + ) + assert len(studies) == 1 + assert studies[0]["STUDY_TITLE"] == "Single" + + def test_invalid_input(self): + """Non-dict/list input returns None.""" + result = common.extract_records_from_json( + "not a dict or list", record_keys=_JSON_RECORD_KEYS, + ) + assert result is None + + def test_container_with_multiple_studies(self): + """Container format with multiple studies extracts all.""" + data = { + "Container": { + "SRA_studys": [ + {"STUDY_TITLE": "Study A"}, + {"STUDY_TITLE": "Study B"}, + ], + }, + } + studies = common.extract_records_from_json( + data, record_keys=_JSON_RECORD_KEYS, + ) + assert len(studies) == 2 + + +# --------------------------------------------------------------------------- +# XML building tests +# --------------------------------------------------------------------------- + + +class TestBuildSubmissionXml: + """Tests for 
building ENA study submission XML."""
+
+    def test_basic_xml_structure(self):
+        """Built XML contains expected elements and attributes."""
+        studies = [
+            {
+                "alias": "test-study",
+                "STUDY_TITLE": "Test Study",
+                "STUDY_ABSTRACT": "Abstract text.",
+                "existing_study_type": "Metagenomics",
+            },
+        ]
+        root = build_submission_xml(studies)
+        xml_bytes = common.xml_to_bytes(root)
+        xml_str = xml_bytes.decode("utf-8")
+        # NOTE(review): element name reconstructed from garbled export —
+        # verify against the actual build_submission_xml output
+        assert "<PROJECT_SET>" in xml_str
+        assert 'alias="test-study"' in xml_str
+        assert "Test Study" in xml_str
+        assert "Abstract text." in xml_str
+
+
+# ---------------------------------------------------------------------------
+# Duplicate detection tests
+# ---------------------------------------------------------------------------
+
+
+class TestFindDuplicateStudies:
+    """Tests for duplicate detection by alias and title."""
+
+    def _make_account_study(
+        self,
+        title: str = "",
+        alias: str = "",
+        accession: str = "",
+        secondary_accession: str = "",
+        status: str = "PRIVATE",
+    ) -> dict[str, str]:
+        """Build a normalised account study dict."""
+        return {
+            "title": title,
+            "alias": alias,
+            "accession": accession,
+            "secondary_accession": secondary_accession,
+            "status": status,
+        }
+
+    def test_no_duplicates(self):
+        """No match when titles and aliases differ."""
+        new = [{"STUDY_TITLE": "New Study", "alias": "new-1"}]
+        account = [
+            self._make_account_study(
+                title="Other Study", alias="other-1",
+            ),
+        ]
+        dups = find_duplicate_studies(new, account)
+        assert len(dups) == 0
+
+    def test_duplicate_by_title(self):
+        """Exact title match flags a duplicate."""
+        new = [{"STUDY_TITLE": "Existing Study"}]
+        account = [
+            self._make_account_study(
+                title="Existing Study",
+                accession="PRJEB99",
+                status="PRIVATE",
+            ),
+        ]
+        dups = find_duplicate_studies(new, account)
+        assert 0 in dups
+        assert dups[0]["accession"] == "PRJEB99"
+
+    def test_duplicate_by_alias(self):
+        """Alias match flags a duplicate even with different title."""
+        new = [{"STUDY_TITLE": "New Title", "alias": "my-alias"}]
+        account = [
+            self._make_account_study(
+                title="Different Title",
+                alias="my-alias",
+                accession="PRJEB60",
+            ),
+        ]
+        dups = find_duplicate_studies(new, account)
+        assert 0 in dups
+        assert dups[0]["accession"] == "PRJEB60"
+        assert "alias" in dups[0]["match_reason"]
+
+    def test_alias_takes_precedence_over_title(self):
+        """When alias matches, it is reported as the match reason."""
+        new = 
[{"STUDY_TITLE": "Same Title", "alias": "same-alias"}] + account = [ + self._make_account_study( + title="Same Title", + alias="same-alias", + accession="PRJEB70", + ), + ] + dups = find_duplicate_studies(new, account) + assert 0 in dups + assert "alias" in dups[0]["match_reason"] + + def test_partial_title_not_duplicate(self): + """Partial title match does not count as a duplicate.""" + new = [{"STUDY_TITLE": "My Study"}] + account = [ + self._make_account_study( + title="My Study Extended Title", + ), + ] + dups = find_duplicate_studies(new, account) + assert len(dups) == 0 + + def test_empty_account_no_duplicates(self): + """Empty account list produces no duplicates.""" + new = [{"STUDY_TITLE": "Test", "alias": "t"}] + dups = find_duplicate_studies(new, []) + assert len(dups) == 0 + + def test_empty_input_no_duplicates(self): + """Empty input list produces no duplicates.""" + account = [ + self._make_account_study(title="Existing"), + ] + dups = find_duplicate_studies([], account) + assert len(dups) == 0 + + def test_study_without_title_or_alias_skipped(self): + """Studies with no title or alias are not flagged.""" + new = [{}] + account = [ + self._make_account_study(title="Something"), + ] + dups = find_duplicate_studies(new, account) + assert len(dups) == 0 + + def test_mixed_duplicates_and_new(self): + """Mix of duplicate and new studies.""" + account = [ + self._make_account_study( + title="Dup By Title", + alias="dup-title", + accession="PRJEB10", + ), + self._make_account_study( + title="Other", + alias="dup-alias", + accession="PRJEB20", + ), + ] + new = [ + {"STUDY_TITLE": "Dup By Title", "alias": "new-alias"}, + {"STUDY_TITLE": "New Title", "alias": "dup-alias"}, + {"STUDY_TITLE": "Brand New", "alias": "brand-new"}, + ] + dups = find_duplicate_studies(new, account) + assert 0 in dups # title match + assert 1 in dups # alias match + assert 2 not in dups # new + + def test_all_duplicates_early_exit(self): + """All studies being duplicates terminates 
early.""" + account = [ + self._make_account_study( + title="A", accession="PRJEB1", + ), + self._make_account_study( + title="B", accession="PRJEB2", + ), + ] + new = [ + {"STUDY_TITLE": "A"}, + {"STUDY_TITLE": "B"}, + ] + dups = find_duplicate_studies(new, account) + assert len(dups) == 2 + + +# --------------------------------------------------------------------------- +# File loading tests (JSON, CSV, TSV) +# --------------------------------------------------------------------------- + +# The expected study data shared by all supported fixtures +EXPECTED_STUDY = { + "IS_PRIMARY": "YES", + "STUDY_TITLE": "MIMICC", + "existing_study_type": "Metagenomics", +} + + +@requires_fixtures +class TestLoadInputFile: + """Tests for loading study data from JSON, CSV, and TSV files.""" + + def test_load_csv(self): + """CSV file loads correctly.""" + studies = common.load_input_file( + MIMICC_CSV, json_record_keys=_JSON_RECORD_KEYS, + ) + assert studies is not None + assert len(studies) == 1 + for key, val in EXPECTED_STUDY.items(): + assert studies[0][key] == val + + def test_load_tsv(self): + """TSV file loads correctly.""" + studies = common.load_input_file( + MIMICC_TSV, json_record_keys=_JSON_RECORD_KEYS, + ) + assert studies is not None + assert len(studies) == 1 + for key, val in EXPECTED_STUDY.items(): + assert studies[0][key] == val + + def test_load_json(self): + """JSON file loads correctly.""" + studies = common.load_input_file( + MIMICC_JSON, json_record_keys=_JSON_RECORD_KEYS, + ) + assert studies is not None + assert len(studies) == 1 + for key, val in EXPECTED_STUDY.items(): + assert studies[0][key] == val + + def test_all_formats_produce_same_data(self): + """All supported formats should produce the same core study fields.""" + all_studies = [ + common.load_input_file( + path, json_record_keys=_JSON_RECORD_KEYS, + ) + for path in [MIMICC_JSON, MIMICC_CSV, MIMICC_TSV] + ] + for studies in all_studies: + assert len(studies) == 1 + for key, val in 
EXPECTED_STUDY.items(): + assert studies[0][key] == val + + def test_unknown_extension_returns_none(self, tmp_path): + """Unsupported file extension returns None.""" + unknown = tmp_path / "data.parquet" + unknown.write_text("dummy") + result = common.load_input_file( + str(unknown), json_record_keys=_JSON_RECORD_KEYS, + ) + assert result is None + + def test_csv_without_metadata_row(self, tmp_path): + """A CSV with no metadata row should still work.""" + csvfile = tmp_path / "no_meta.csv" + csvfile.write_text("STUDY_TITLE,IS_PRIMARY\nTest,YES\n") + studies = common.load_input_file( + str(csvfile), json_record_keys=_JSON_RECORD_KEYS, + ) + assert len(studies) == 1 + assert studies[0]["STUDY_TITLE"] == "Test" + assert studies[0]["IS_PRIMARY"] == "YES" + + def test_tabular_empty_values_omitted(self, tmp_path): + """Empty cells in tabular files should be omitted.""" + csvfile = tmp_path / "sparse.csv" + csvfile.write_text( + "STUDY_TITLE,STUDY_ABSTRACT,IS_PRIMARY\nTest,,YES\n", + ) + studies = common.load_input_file( + str(csvfile), json_record_keys=_JSON_RECORD_KEYS, + ) + assert len(studies) == 1 + assert "STUDY_ABSTRACT" not in studies[0] + assert studies[0]["STUDY_TITLE"] == "Test" From 55c3ca332633ae3519d435b0aac585b3f65bdf7e Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Thu, 12 Mar 2026 11:52:08 +0000 Subject: [PATCH 03/36] Wrote submit_study module based on submit_study python script. 
--- bin/submit_study.py | 0 modules/local/submit_study/environment.yml | 10 +++ modules/local/submit_study/main.nf | 47 +++++++++++++ modules/local/submit_study/meta.yml | 68 +++++++++++++++++++ modules/local/submit_study/tests/main.nf.test | 56 +++++++++++++++ .../submit_study/tests/main.nf.test.snap | 35 ++++++++++ .../local/submit_study/tests/nextflow.config | 18 +++++ nextflow.config | 10 +++ 8 files changed, 244 insertions(+) mode change 100644 => 100755 bin/submit_study.py create mode 100644 modules/local/submit_study/environment.yml create mode 100644 modules/local/submit_study/main.nf create mode 100644 modules/local/submit_study/meta.yml create mode 100644 modules/local/submit_study/tests/main.nf.test create mode 100644 modules/local/submit_study/tests/main.nf.test.snap create mode 100644 modules/local/submit_study/tests/nextflow.config diff --git a/bin/submit_study.py b/bin/submit_study.py old mode 100644 new mode 100755 diff --git a/modules/local/submit_study/environment.yml b/modules/local/submit_study/environment.yml new file mode 100644 index 0000000..6ee92a8 --- /dev/null +++ b/modules/local/submit_study/environment.yml @@ -0,0 +1,10 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python>=3.12 + - conda-forge::pip + - pip: + - mgnify-pipelines-toolkit==1.4.17 diff --git a/modules/local/submit_study/main.nf b/modules/local/submit_study/main.nf new file mode 100644 index 0000000..47a9d88 --- /dev/null +++ b/modules/local/submit_study/main.nf @@ -0,0 +1,47 @@ +process SUBMIT_STUDY { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "quay.io/microbiome-informatics/mgnify-pipelines-toolkit:1.4.17" + + // ENA_USERNAME and ENA_PASSWORD must be set in the process environment. 
+ // In the pipeline, map Nextflow secrets via conf/modules.config or nextflow.config: + // env { ENA_USERNAME = secrets.WEBIN_ACCOUNT; ENA_PASSWORD = secrets.WEBIN_PASSWORD } + + input: + tuple val(meta), path(study_metadata) + + output: + tuple val(meta), path("*_accessions.json"), emit: accessions + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + submit_study.py \\ + --input ${study_metadata} \\ + --output ${prefix}_accessions.json \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mgnify-pipelines-toolkit: \$(python -c "import importlib.metadata; print(importlib.metadata.version('mgnify-pipelines-toolkit'))") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + echo '{"submitted":[],"duplicates":[],"modified":[],"failed":[]}' > ${prefix}_accessions.json + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mgnify-pipelines-toolkit: \$(python -c "import importlib.metadata; print(importlib.metadata.version('mgnify-pipelines-toolkit'))") + END_VERSIONS + """ +} diff --git a/modules/local/submit_study/meta.yml b/modules/local/submit_study/meta.yml new file mode 100644 index 0000000..e09d150 --- /dev/null +++ b/modules/local/submit_study/meta.yml @@ -0,0 +1,68 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "submit_study" +description: | + Submit a new study to ENA via the Webin REST API v2. + Reads study metadata from a JSON, CSV, or TSV file, checks for + duplicate studies already registered under the Webin account, + builds a PROJECT XML submission document, and submits to ENA. + Credentials are read from the WEBIN_ACCOUNT and WEBIN_PASSWORD + Nextflow secrets, which are mapped to ENA_USERNAME and ENA_PASSWORD + inside the process. 
+keywords: + - ena + - submission + - study + - project + - webin +tools: + - mgnify-pipelines-toolkit: + description: | + A toolkit of utilities used in MGnify metagenomics pipelines, + including click, requests, and other dependencies required by + the ENA submission scripts. + homepage: https://github.com/EBI-Metagenomics/mgnify-pipelines-toolkit + documentation: https://github.com/EBI-Metagenomics/mgnify-pipelines-toolkit + tool_dev_url: https://github.com/EBI-Metagenomics/mgnify-pipelines-toolkit + doi: "" + licence: ["Apache-2.0"] + identifier: null + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information. + e.g. `[ id:'sample1' ]` + - study_metadata: + type: file + description: | + Study metadata file in JSON, CSV, or TSV format. + JSON may follow the DataHarmonizer Container export format or be + a plain list/dict of study records. + Required fields per record: STUDY_TITLE, existing_study_type. + pattern: "*.{json,csv,tsv}" + +output: + - accessions: + - meta: + type: map + description: | + Groovy Map containing sample information. + e.g. `[ id:'sample1' ]` + - "*_accessions.json": + type: file + description: | + JSON file containing the submission results with keys: + submitted (newly created accessions), duplicates (skipped), + modified (force-updated), and failed. 
+ pattern: "*_accessions.json" + - versions: + - "versions.yml": + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@timrozday" +maintainers: + - "@timrozday" diff --git a/modules/local/submit_study/tests/main.nf.test b/modules/local/submit_study/tests/main.nf.test new file mode 100644 index 0000000..e37cccc --- /dev/null +++ b/modules/local/submit_study/tests/main.nf.test @@ -0,0 +1,56 @@ +nextflow_process { + name "Test Process SUBMIT_STUDY" + script "../main.nf" + config "./nextflow.config" + process "SUBMIT_STUDY" + + tag "modules" + tag "submit_study" + + test("submit_study - stub") { + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'mimicc' ], + file("${projectDir}/assets/test-fixtures/mimicc_study.json", checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("submit_study - dry run against ENA test server") { + // Requires WEBIN_ACCOUNT and WEBIN_PASSWORD Nextflow secrets. + // Validates and builds the submission XML but does not submit to ENA. 
+ + when { + process { + """ + input[0] = [ + [ id:'mimicc' ], + file("${projectDir}/assets/test-fixtures/mimicc_study.json", checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.accessions[0][1]).exists() }, + { assert path(process.out.accessions[0][1]).json.submitted instanceof List }, + { assert path(process.out.accessions[0][1]).json.failed.size() == 0 } + ) + } + } +} diff --git a/modules/local/submit_study/tests/main.nf.test.snap b/modules/local/submit_study/tests/main.nf.test.snap new file mode 100644 index 0000000..dd56c7c --- /dev/null +++ b/modules/local/submit_study/tests/main.nf.test.snap @@ -0,0 +1,35 @@ +{ + "submit_study - stub": { + "content": [ + { + "0": [ + [ + { + "id": "mimicc" + }, + "mimicc_accessions.json:md5,e43f257b8a1ffb551bf993867c5f1d7f" + ] + ], + "1": [ + "versions.yml:md5,1ffe6cc50bd36f7110413723e0796dd4" + ], + "accessions": [ + [ + { + "id": "mimicc" + }, + "mimicc_accessions.json:md5,e43f257b8a1ffb551bf993867c5f1d7f" + ] + ], + "versions": [ + "versions.yml:md5,1ffe6cc50bd36f7110413723e0796dd4" + ] + } + ], + "timestamp": "2026-03-12T11:51:02.565164", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/modules/local/submit_study/tests/nextflow.config b/modules/local/submit_study/tests/nextflow.config new file mode 100644 index 0000000..3611907 --- /dev/null +++ b/modules/local/submit_study/tests/nextflow.config @@ -0,0 +1,18 @@ +// Test configuration for SUBMIT_STUDY module. +// --test : use the ENA dev server (submissions are discarded daily) +// --automated : skip the Webin Reports duplicate-checking API call +// --dry-run : validate and build XML but do not submit to ENA +// +// Dummy credentials are sufficient for --dry-run --automated mode since +// no HTTP calls are made. 
For real submission tests, replace with secrets: +// env { ENA_USERNAME = secrets.WEBIN_ACCOUNT; ENA_PASSWORD = secrets.WEBIN_PASSWORD } +process { + withName: SUBMIT_STUDY { + ext.args = '--test --automated --dry-run' + } +} + +env { + ENA_USERNAME = 'Webin-000000' + ENA_PASSWORD = 'dummy-password' +} diff --git a/nextflow.config b/nextflow.config index 00d8b79..a6f7ae2 100644 --- a/nextflow.config +++ b/nextflow.config @@ -179,6 +179,16 @@ profiles { } // TODO: figure out how to better orginise tests for different workflow types (bins, mags, metagenomic_assemblies) // test { includeConfig 'conf/test.config' } + test { + docker.enabled = true + conda.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false + docker.runOptions = '-u $(id -u):$(id -g)' + } test_genome { includeConfig 'conf/test_genome.config' } test_assembly { includeConfig 'conf/test_assembly.config' } test_full { includeConfig 'conf/test_full.config' } From 632ef0a4f27525061ebb2a876b371531d4cc6f47 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Thu, 12 Mar 2026 11:59:21 +0000 Subject: [PATCH 04/36] Renamed files and parameters to align to existing registerstudy module, distinguish from it sing submit_study is for raw-reads rather than assemblies, and remove references to mimicc. 
--- .../{mimicc_study.csv => example_study.csv} | 0 .../{mimicc_study.json => example_study.json} | 0 .../{mimicc_study.tsv => example_study.tsv} | 0 bin/ena_common.py | 6 ++-- ...bmit_study.py => submit_rawreads_study.py} | 16 ++++----- ...study.py => test_submit_rawreads_study.py} | 36 +++++++++---------- .../environment.yml | 0 .../main.nf | 8 ++--- .../meta.yml | 4 +-- .../tests/main.nf.test | 20 +++++------ .../tests/main.nf.test.snap | 16 ++++----- .../tests/nextflow.config | 10 +++--- 12 files changed, 58 insertions(+), 58 deletions(-) rename assets/test-fixtures/{mimicc_study.csv => example_study.csv} (100%) rename assets/test-fixtures/{mimicc_study.json => example_study.json} (100%) rename assets/test-fixtures/{mimicc_study.tsv => example_study.tsv} (100%) rename bin/{submit_study.py => submit_rawreads_study.py} (98%) rename bin/{test_submit_study.py => test_submit_rawreads_study.py} (92%) rename modules/local/{submit_study => submit_rawreads_study}/environment.yml (100%) rename modules/local/{submit_study => submit_rawreads_study}/main.nf (84%) rename modules/local/{submit_study => submit_rawreads_study}/meta.yml (95%) rename modules/local/{submit_study => submit_rawreads_study}/tests/main.nf.test (62%) rename modules/local/{submit_study => submit_rawreads_study}/tests/main.nf.test.snap (50%) rename modules/local/{submit_study => submit_rawreads_study}/tests/nextflow.config (62%) diff --git a/assets/test-fixtures/mimicc_study.csv b/assets/test-fixtures/example_study.csv similarity index 100% rename from assets/test-fixtures/mimicc_study.csv rename to assets/test-fixtures/example_study.csv diff --git a/assets/test-fixtures/mimicc_study.json b/assets/test-fixtures/example_study.json similarity index 100% rename from assets/test-fixtures/mimicc_study.json rename to assets/test-fixtures/example_study.json diff --git a/assets/test-fixtures/mimicc_study.tsv b/assets/test-fixtures/example_study.tsv similarity index 100% rename from 
assets/test-fixtures/mimicc_study.tsv rename to assets/test-fixtures/example_study.tsv diff --git a/bin/ena_common.py b/bin/ena_common.py index de08c48..c782140 100644 --- a/bin/ena_common.py +++ b/bin/ena_common.py @@ -101,11 +101,11 @@ def get_credentials() -> tuple[str, str]: Raises: SystemExit: If either variable is unset or empty. """ - username = os.environ.get("ENA_USERNAME", "").strip() - password = os.environ.get("ENA_PASSWORD", "").strip() + username = os.environ.get("ENA_WEBIN", "").strip() + password = os.environ.get("ENA_WEBIN_PASSWORD", "").strip() if not username or not password: logger.error( - "ENA_USERNAME and ENA_PASSWORD environment" + "ENA_WEBIN and ENA_WEBIN_PASSWORD environment" " variables must be set", ) sys.exit(1) diff --git a/bin/submit_study.py b/bin/submit_rawreads_study.py similarity index 98% rename from bin/submit_study.py rename to bin/submit_rawreads_study.py index 656f746..f1850d7 100755 --- a/bin/submit_study.py +++ b/bin/submit_rawreads_study.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Submit studies to ENA via the Webin REST API v2. +"""Submit raw-reads studies to ENA via the Webin REST API v2. 
Read a DataHarmonizer export containing study metadata, check for duplicate studies already registered under the @@ -9,22 +9,22 @@ Credentials are read from environment variables to avoid secrets appearing in shell history or process listings:: - export ENA_USERNAME=Webin-XXXXX - export ENA_PASSWORD=SECRET + export ENA_WEBIN=Webin-XXXXX + export ENA_WEBIN_PASSWORD=SECRET Usage:: - python scripts/submit_study.py \ + python bin/submit_rawreads_study.py \ --input studies.json \ --test # With hold date (max 2 years): - python scripts/submit_study.py \ + python bin/submit_rawreads_study.py \ --input studies.json \ --hold-until 2028-01-01 # Log to file: - python scripts/submit_study.py \ + python bin/submit_rawreads_study.py \ --input studies.json \ --test --log submission.log """ @@ -44,7 +44,7 @@ import ena_common as common -logger = logging.getLogger("ena_submit.study") +logger = logging.getLogger("ena_submit.rawreads_study") # ----------------------------------------------------------- @@ -490,7 +490,7 @@ def _do_submission( @click.command( - help="Submit studies to ENA via the Webin REST API v2.", + help="Submit raw-reads studies to ENA via the Webin REST API v2.", ) @click.option( "--input", "input_file", diff --git a/bin/test_submit_study.py b/bin/test_submit_rawreads_study.py similarity index 92% rename from bin/test_submit_study.py rename to bin/test_submit_rawreads_study.py index 5944207..b7d3dcb 100644 --- a/bin/test_submit_study.py +++ b/bin/test_submit_rawreads_study.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 -"""Tests for submit_study.py and ena_common.py — study submission pipeline. +"""Tests for submit_rawreads_study.py and ena_common.py — study submission pipeline. 
Usage: - pytest bin/test_submit_study.py -v + pytest bin/test_submit_rawreads_study.py -v """ from __future__ import annotations @@ -17,7 +17,7 @@ sys.path.insert(0, os.path.dirname(__file__)) import ena_common as common -from submit_study import ( +from submit_rawreads_study import ( build_submission_xml, find_duplicate_studies, validate_study_xml, @@ -30,14 +30,14 @@ FIXTURES_DIR = os.path.join( os.path.dirname(__file__), "..", "assets", "test-fixtures", ) -MIMICC_JSON = os.path.join(FIXTURES_DIR, "mimicc_study.json") -MIMICC_CSV = os.path.join(FIXTURES_DIR, "mimicc_study.csv") -MIMICC_TSV = os.path.join(FIXTURES_DIR, "mimicc_study.tsv") +EXAMPLE_STUDY_JSON = os.path.join(FIXTURES_DIR, "example_study.json") +EXAMPLE_STUDY_CSV = os.path.join(FIXTURES_DIR, "example_study.csv") +EXAMPLE_STUDY_TSV = os.path.join(FIXTURES_DIR, "example_study.tsv") -_FIXTURES_PRESENT = os.path.isfile(MIMICC_JSON) +_FIXTURES_PRESENT = os.path.isfile(EXAMPLE_STUDY_JSON) requires_fixtures = pytest.mark.skipif( not _FIXTURES_PRESENT, - reason="mimicc test fixtures not present in assets/test-fixtures/", + reason="example study fixtures not present in assets/test-fixtures/", ) _JSON_RECORD_KEYS = ("studies", "data") @@ -48,9 +48,9 @@ @pytest.fixture -def mimicc_json(): - """Load the MIMICC study JSON fixture.""" - with open(MIMICC_JSON) as f: +def example_study_json(): + """Load the example study JSON fixture.""" + with open(EXAMPLE_STUDY_JSON) as f: return json.load(f) @@ -63,10 +63,10 @@ class TestExtractRecordsFromJson: """Tests for extracting study rows from various JSON formats.""" @requires_fixtures - def test_dataharmonizer_container_format(self, mimicc_json): - """The mimicc_study.json fixture uses DataHarmonizer Container format.""" + def test_dataharmonizer_container_format(self, example_study_json): + """The example_study.json fixture uses DataHarmonizer Container format.""" studies = common.extract_records_from_json( - mimicc_json, record_keys=_JSON_RECORD_KEYS, + 
example_study_json, record_keys=_JSON_RECORD_KEYS, ) assert studies is not None assert len(studies) == 1 @@ -356,7 +356,7 @@ class TestLoadInputFile: def test_load_csv(self): """CSV file loads correctly.""" studies = common.load_input_file( - MIMICC_CSV, json_record_keys=_JSON_RECORD_KEYS, + EXAMPLE_STUDY_CSV, json_record_keys=_JSON_RECORD_KEYS, ) assert studies is not None assert len(studies) == 1 @@ -366,7 +366,7 @@ def test_load_csv(self): def test_load_tsv(self): """TSV file loads correctly.""" studies = common.load_input_file( - MIMICC_TSV, json_record_keys=_JSON_RECORD_KEYS, + EXAMPLE_STUDY_TSV, json_record_keys=_JSON_RECORD_KEYS, ) assert studies is not None assert len(studies) == 1 @@ -376,7 +376,7 @@ def test_load_tsv(self): def test_load_json(self): """JSON file loads correctly.""" studies = common.load_input_file( - MIMICC_JSON, json_record_keys=_JSON_RECORD_KEYS, + EXAMPLE_STUDY_JSON, json_record_keys=_JSON_RECORD_KEYS, ) assert studies is not None assert len(studies) == 1 @@ -389,7 +389,7 @@ def test_all_formats_produce_same_data(self): common.load_input_file( path, json_record_keys=_JSON_RECORD_KEYS, ) - for path in [MIMICC_JSON, MIMICC_CSV, MIMICC_TSV] + for path in [EXAMPLE_STUDY_JSON, EXAMPLE_STUDY_CSV, EXAMPLE_STUDY_TSV] ] for studies in all_studies: assert len(studies) == 1 diff --git a/modules/local/submit_study/environment.yml b/modules/local/submit_rawreads_study/environment.yml similarity index 100% rename from modules/local/submit_study/environment.yml rename to modules/local/submit_rawreads_study/environment.yml diff --git a/modules/local/submit_study/main.nf b/modules/local/submit_rawreads_study/main.nf similarity index 84% rename from modules/local/submit_study/main.nf rename to modules/local/submit_rawreads_study/main.nf index 47a9d88..51bc062 100644 --- a/modules/local/submit_study/main.nf +++ b/modules/local/submit_rawreads_study/main.nf @@ -1,13 +1,13 @@ -process SUBMIT_STUDY { +process SUBMIT_RAWREADS_STUDY { tag "$meta.id" label 
'process_single' conda "${moduleDir}/environment.yml" container "quay.io/microbiome-informatics/mgnify-pipelines-toolkit:1.4.17" - // ENA_USERNAME and ENA_PASSWORD must be set in the process environment. + // ENA_WEBIN and ENA_WEBIN_PASSWORD must be set in the process environment. // In the pipeline, map Nextflow secrets via conf/modules.config or nextflow.config: - // env { ENA_USERNAME = secrets.WEBIN_ACCOUNT; ENA_PASSWORD = secrets.WEBIN_PASSWORD } + // env { ENA_WEBIN = secrets.WEBIN_ACCOUNT; ENA_WEBIN_PASSWORD = secrets.WEBIN_PASSWORD } input: tuple val(meta), path(study_metadata) @@ -23,7 +23,7 @@ process SUBMIT_STUDY { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" """ - submit_study.py \\ + submit_rawreads_study.py \\ --input ${study_metadata} \\ --output ${prefix}_accessions.json \\ ${args} diff --git a/modules/local/submit_study/meta.yml b/modules/local/submit_rawreads_study/meta.yml similarity index 95% rename from modules/local/submit_study/meta.yml rename to modules/local/submit_rawreads_study/meta.yml index e09d150..629512f 100644 --- a/modules/local/submit_study/meta.yml +++ b/modules/local/submit_rawreads_study/meta.yml @@ -1,12 +1,12 @@ # yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json -name: "submit_study" +name: "submit_rawreads_study" description: | Submit a new study to ENA via the Webin REST API v2. Reads study metadata from a JSON, CSV, or TSV file, checks for duplicate studies already registered under the Webin account, builds a PROJECT XML submission document, and submits to ENA. Credentials are read from the WEBIN_ACCOUNT and WEBIN_PASSWORD - Nextflow secrets, which are mapped to ENA_USERNAME and ENA_PASSWORD + Nextflow secrets, which are mapped to ENA_WEBIN and ENA_WEBIN_PASSWORD inside the process. 
keywords: - ena diff --git a/modules/local/submit_study/tests/main.nf.test b/modules/local/submit_rawreads_study/tests/main.nf.test similarity index 62% rename from modules/local/submit_study/tests/main.nf.test rename to modules/local/submit_rawreads_study/tests/main.nf.test index e37cccc..a0cb4fd 100644 --- a/modules/local/submit_study/tests/main.nf.test +++ b/modules/local/submit_rawreads_study/tests/main.nf.test @@ -1,21 +1,21 @@ nextflow_process { - name "Test Process SUBMIT_STUDY" + name "Test Process SUBMIT_RAWREADS_STUDY" script "../main.nf" config "./nextflow.config" - process "SUBMIT_STUDY" + process "SUBMIT_RAWREADS_STUDY" tag "modules" - tag "submit_study" + tag "submit_rawreads_study" - test("submit_study - stub") { + test("submit_rawreads_study - stub") { options "-stub" when { process { """ input[0] = [ - [ id:'mimicc' ], - file("${projectDir}/assets/test-fixtures/mimicc_study.json", checkIfExists: true) + [ id:'example_study' ], + file("${projectDir}/assets/test-fixtures/example_study.json", checkIfExists: true) ] """ } @@ -29,16 +29,16 @@ nextflow_process { } } - test("submit_study - dry run against ENA test server") { - // Requires WEBIN_ACCOUNT and WEBIN_PASSWORD Nextflow secrets. + test("submit_rawreads_study - dry run against ENA test server") { // Validates and builds the submission XML but does not submit to ENA. + // Dummy credentials in tests/nextflow.config are sufficient for dry-run mode. 
when { process { """ input[0] = [ - [ id:'mimicc' ], - file("${projectDir}/assets/test-fixtures/mimicc_study.json", checkIfExists: true) + [ id:'example_study' ], + file("${projectDir}/assets/test-fixtures/example_study.json", checkIfExists: true) ] """ } diff --git a/modules/local/submit_study/tests/main.nf.test.snap b/modules/local/submit_rawreads_study/tests/main.nf.test.snap similarity index 50% rename from modules/local/submit_study/tests/main.nf.test.snap rename to modules/local/submit_rawreads_study/tests/main.nf.test.snap index dd56c7c..08f7fdb 100644 --- a/modules/local/submit_study/tests/main.nf.test.snap +++ b/modules/local/submit_rawreads_study/tests/main.nf.test.snap @@ -1,32 +1,32 @@ { - "submit_study - stub": { + "submit_rawreads_study - stub": { "content": [ { "0": [ [ { - "id": "mimicc" + "id": "example_study" }, - "mimicc_accessions.json:md5,e43f257b8a1ffb551bf993867c5f1d7f" + "example_study_accessions.json:md5,e43f257b8a1ffb551bf993867c5f1d7f" ] ], "1": [ - "versions.yml:md5,1ffe6cc50bd36f7110413723e0796dd4" + "versions.yml:md5,d7080ded74f0381019a674b865daa329" ], "accessions": [ [ { - "id": "mimicc" + "id": "example_study" }, - "mimicc_accessions.json:md5,e43f257b8a1ffb551bf993867c5f1d7f" + "example_study_accessions.json:md5,e43f257b8a1ffb551bf993867c5f1d7f" ] ], "versions": [ - "versions.yml:md5,1ffe6cc50bd36f7110413723e0796dd4" + "versions.yml:md5,d7080ded74f0381019a674b865daa329" ] } ], - "timestamp": "2026-03-12T11:51:02.565164", + "timestamp": "2026-03-12T11:57:10.234715", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.4" diff --git a/modules/local/submit_study/tests/nextflow.config b/modules/local/submit_rawreads_study/tests/nextflow.config similarity index 62% rename from modules/local/submit_study/tests/nextflow.config rename to modules/local/submit_rawreads_study/tests/nextflow.config index 3611907..c4633fa 100644 --- a/modules/local/submit_study/tests/nextflow.config +++ b/modules/local/submit_rawreads_study/tests/nextflow.config @@ 
-1,18 +1,18 @@ -// Test configuration for SUBMIT_STUDY module. +// Test configuration for SUBMIT_RAWREADS_STUDY module. // --test : use the ENA dev server (submissions are discarded daily) // --automated : skip the Webin Reports duplicate-checking API call // --dry-run : validate and build XML but do not submit to ENA // // Dummy credentials are sufficient for --dry-run --automated mode since // no HTTP calls are made. For real submission tests, replace with secrets: -// env { ENA_USERNAME = secrets.WEBIN_ACCOUNT; ENA_PASSWORD = secrets.WEBIN_PASSWORD } +// env { ENA_WEBIN = secrets.WEBIN_ACCOUNT; ENA_WEBIN_PASSWORD = secrets.WEBIN_PASSWORD } process { - withName: SUBMIT_STUDY { + withName: SUBMIT_RAWREADS_STUDY { ext.args = '--test --automated --dry-run' } } env { - ENA_USERNAME = 'Webin-000000' - ENA_PASSWORD = 'dummy-password' + ENA_WEBIN = 'Webin-000000' + ENA_WEBIN_PASSWORD = 'dummy-password' } From 38e74cf16be0112aa449a6746356c184d7b36200 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Thu, 12 Mar 2026 12:24:55 +0000 Subject: [PATCH 05/36] Tidied up code --- bin/ena_common.py | 97 ++++--------- bin/submit_rawreads_study.py | 262 +++++++++-------------------------- 2 files changed, 86 insertions(+), 273 deletions(-) diff --git a/bin/ena_common.py b/bin/ena_common.py index c782140..89e41ab 100644 --- a/bin/ena_common.py +++ b/bin/ena_common.py @@ -36,12 +36,8 @@ # Constants # ----------------------------------------------------------- -PROD_URL: Final = ( - "https://www.ebi.ac.uk/ena/submit/webin-v2" -) -TEST_URL: Final = ( - "https://wwwdev.ebi.ac.uk/ena/submit/webin-v2" -) +PROD_URL: Final = "https://www.ebi.ac.uk/ena/submit/webin-v2" +TEST_URL: Final = "https://wwwdev.ebi.ac.uk/ena/submit/webin-v2" _MAX_HOLD_YEARS: Final = 2 @@ -104,10 +100,7 @@ def get_credentials() -> tuple[str, str]: username = os.environ.get("ENA_WEBIN", "").strip() password = os.environ.get("ENA_WEBIN_PASSWORD", "").strip() if not username or not password: - logger.error( - "ENA_WEBIN 
and ENA_WEBIN_PASSWORD environment" - " variables must be set", - ) + logger.error("ENA_WEBIN and ENA_WEBIN_PASSWORD environment variables must be set") sys.exit(1) return username, password @@ -185,8 +178,7 @@ def validate_hold_until(hold_until: str) -> datetime.date: hold_date = datetime.date.fromisoformat(hold_until) except ValueError: raise click.BadParameter( - f"Invalid date format: {hold_until!r}." - " Expected YYYY-MM-DD." + f"Invalid date format: {hold_until!r}. Expected YYYY-MM-DD." ) from None today = datetime.date.today() @@ -194,15 +186,13 @@ def validate_hold_until(hold_until: str) -> datetime.date: if hold_date > max_date: raise click.BadParameter( - f"Hold date {hold_until} is more than" - f" {_MAX_HOLD_YEARS} years from today" + f"Hold date {hold_until} is more than {_MAX_HOLD_YEARS} years from today" f" ({today}). Maximum allowed: {max_date}." ) if hold_date <= today: raise click.BadParameter( - f"Hold date {hold_until} is not in the" - f" future (today is {today})." + f"Hold date {hold_until} is not in the future (today is {today})." ) return hold_date @@ -262,7 +252,7 @@ def parse_checklist_units( def validate_xml_against_xsd( xml_bytes: bytes, - fragment_tag: str | None = None, + _fragment_tag: str | None = None, # unused; kept for API compatibility fallback_checker: Callable[ [bytes, list[str]], tuple[bool, list[str]] ] | None = None, @@ -275,7 +265,7 @@ def validate_xml_against_xsd( Args: xml_bytes: Serialised XML document. - fragment_tag: Unused; kept for API compatibility. + _fragment_tag: Unused; kept for API compatibility. fallback_checker: Optional function called with (*xml_bytes*, *messages*) that returns (*is_valid*, *messages*). 
@@ -395,11 +385,7 @@ def extract_records_from_json( if isinstance(container, dict): for key, val in container.items(): if isinstance(val, list): - logger.info( - "Extracted records from" - " Container.%s", - key, - ) + logger.info("Extracted records from Container.%s", key) return val for key in record_keys: @@ -471,23 +457,13 @@ def fetch_from_reports_endpoint( "max-results": max_results, } - req = requests.Request( - "GET", url, params=params, auth=auth, - ) + req = requests.Request("GET", url, params=params, auth=auth) prepared = req.prepare() - logger.debug( - 'curl -u %s:*** "%s"', - auth.username, prepared.url, - ) + logger.debug('curl -u %s:*** "%s"', auth.username, prepared.url) try: - resp = requests.get( - url, params=params, auth=auth, timeout=60, - ) - logger.info( - "Reports API at %s returned %s", - url, resp.status_code, - ) + resp = requests.get(url, params=params, auth=auth, timeout=60) + logger.info("Reports API at %s returned %s", url, resp.status_code) resp.raise_for_status() return resp.json() @@ -498,30 +474,20 @@ def fetch_from_reports_endpoint( else "unknown" ) if status == 404: - logger.info( - "Reports API at %s returned 404" - " — no records yet", - url, - ) + logger.info("Reports API at %s returned 404 — no records yet", url) return [] if status in (401, 403): logger.warning( - "Reports API at %s returned %s" - " — endpoint may not be available" + "Reports API at %s returned %s — endpoint may not be available" " or credentials may differ", url, status, ) return None - logger.warning( - "Reports API at %s returned HTTP %s", - url, status, - ) + logger.warning("Reports API at %s returned HTTP %s", url, status) return None except requests.exceptions.RequestException as exc: - logger.warning( - "Reports API at %s failed: %s", url, exc, - ) + logger.warning("Reports API at %s failed: %s", url, exc) return None @@ -561,13 +527,8 @@ def fetch_account_records( ) for url in urls: - logger.info( - "Fetching account %s from: %s", - entity_label, 
url, - ) - raw = fetch_from_reports_endpoint( - url, auth, max_results, - ) + logger.info("Fetching account %s from: %s", entity_label, url) + raw = fetch_from_reports_endpoint(url, auth, max_results) if raw is None: continue @@ -580,15 +541,11 @@ def fetch_account_records( if normalized is not None: records.append(normalized) - logger.info( - "Found %d %s in account", - len(records), entity_label, - ) + logger.info("Found %d %s in account", len(records), entity_label) return records logger.warning( - "Could not reach any Webin reports endpoint." - " Duplicate checking for %s will be skipped.", + "Could not reach any Webin reports endpoint. Duplicate checking for %s will be skipped.", entity_label, ) return [] @@ -640,10 +597,8 @@ def find_duplicates_by_alias_title( by_alias[alias] = rec logger.info( - "Checking %d new %s against" - " %d existing account %s...", - total, entity_label, - len(account_records), entity_label, + "Checking %d new %s against %d existing account %s...", + total, entity_label, len(account_records), entity_label, ) for i, record in enumerate(new_records): @@ -669,11 +624,7 @@ def find_duplicates_by_alias_title( ) if len(duplicates) == total: - logger.info( - "All %s are duplicates" - " — skipping further checks", - entity_label, - ) + logger.info("All %s are duplicates — skipping further checks", entity_label) return duplicates return duplicates diff --git a/bin/submit_rawreads_study.py b/bin/submit_rawreads_study.py index f1850d7..1664a16 100755 --- a/bin/submit_rawreads_study.py +++ b/bin/submit_rawreads_study.py @@ -51,12 +51,8 @@ # Reports API (study-specific) # ----------------------------------------------------------- -_PROD_REPORTS_URL: Final = ( - "https://www.ebi.ac.uk/ena/submit/report/projects" -) -_TEST_REPORTS_URL: Final = ( - "https://wwwdev.ebi.ac.uk/ena/submit/report/projects" -) +_PROD_REPORTS_URL: Final = "https://www.ebi.ac.uk/ena/submit/report/projects" +_TEST_REPORTS_URL: Final = 
"https://wwwdev.ebi.ac.uk/ena/submit/report/projects" def _normalize_study_report( @@ -65,27 +61,16 @@ def _normalize_study_report( """Normalise a raw study report dict.""" return { "title": ( - report.get("title") - or report.get("studyTitle") - or report.get("STUDY_TITLE", "") - ), - "alias": ( - report.get("alias") - or report.get("studyAlias") - or "" + report.get("title") or report.get("studyTitle") or report.get("STUDY_TITLE", "") ), + "alias": report.get("alias") or report.get("studyAlias") or "", "accession": ( report.get("accession") or report.get("studyAccession") or report.get("report", {}).get("id", "") ), - "secondary_accession": ( - report.get("secondaryAccession") - or report.get("secondaryId", "") - ), - "status": report.get( - "releaseStatus", "UNKNOWN" - ), + "secondary_accession": report.get("secondaryAccession") or report.get("secondaryId", ""), + "status": report.get("releaseStatus", "UNKNOWN"), } @@ -167,10 +152,7 @@ def build_submission_xml( submission = ET.SubElement( submission_set, "SUBMISSION", ) - sub_alias = ( - "study-submission-" - + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - ) + sub_alias = f"study-submission-{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}" submission.set("alias", sub_alias) actions = ET.SubElement(submission, "ACTIONS") main_action = ET.SubElement(actions, "ACTION") @@ -271,9 +253,7 @@ def _validate_study_xml_structure( project_set = tree.find("PROJECT_SET") if project_set is None: - messages.append( - "ERROR: Missing PROJECT_SET element" - ) + messages.append("ERROR: Missing PROJECT_SET element") return False, messages projects = project_set.findall("PROJECT") @@ -285,20 +265,13 @@ def _validate_study_xml_structure( alias = proj.get("alias", "") title = proj.find("TITLE") if title is None or not title.text: - messages.append( - f"ERROR: PROJECT '{alias}' missing TITLE" - ) + messages.append(f"ERROR: PROJECT '{alias}' missing TITLE") return False, messages sp = proj.find("SUBMISSION_PROJECT") if sp is 
None: - messages.append( - f"ERROR: PROJECT '{alias}'" - " missing SUBMISSION_PROJECT" - ) + messages.append(f"ERROR: PROJECT '{alias}' missing SUBMISSION_PROJECT") return False, messages - messages.append( - f"OK: PROJECT '{alias}' has required elements" - ) + messages.append(f"OK: PROJECT '{alias}' has required elements") return True, messages @@ -336,10 +309,7 @@ def parse_xml_receipt( Returns: Tuple of (*success*, *accessions*, *messages*). """ - success = ( - receipt_root.get("success", "false").lower() - == "true" - ) + success = receipt_root.get("success", "false").lower() == "true" accessions: list[dict[str, str]] = [] messages: list[str] = [] @@ -355,18 +325,12 @@ def parse_xml_receipt( "alias": proj.get("alias", ""), "accession": proj.get("accession", ""), "status": proj.get("status", ""), - "holdUntilDate": proj.get( - "holdUntilDate", "" - ), + "holdUntilDate": proj.get("holdUntilDate", ""), } ext = proj.find("EXT_ID") if ext is not None: - acc_info["external_accession"] = ext.get( - "accession", "" - ) - acc_info["external_type"] = ext.get( - "type", "" - ) + acc_info["external_accession"] = ext.get("accession", "") + acc_info["external_type"] = ext.get("type", "") accessions.append(acc_info) # Some receipts use STUDY instead of PROJECT. 
@@ -415,45 +379,26 @@ def _do_submission( for msg in xml_messages: logger.info(" %s", msg) if not xml_valid: - logger.error( - "XML validation FAILED (%s)" - " — aborting submission", action, - ) + logger.error("XML validation FAILED (%s) — aborting submission", action) return False logger.info("XML validation PASSED (%s)", action) if dry_run: - logger.info( - "DRY RUN — skipping %s submission", action, - ) - logger.info( - "Generated XML:\n%s", - xml_bytes.decode("utf-8"), - ) + logger.info("DRY RUN — skipping %s submission", action) + logger.info("Generated XML:\n%s", xml_bytes.decode("utf-8")) return True - logger.info( - "Submitting %s to ENA (%s)...", action, env_label, - ) + logger.info("Submitting %s to ENA (%s)...", action, env_label) try: - receipt_root = common.submit_xml( - base_url, auth, xml_bytes, - ) + receipt_root = common.submit_xml(base_url, auth, xml_bytes) except requests.exceptions.HTTPError as exc: - logger.error( - "HTTP error during %s submission: %s", - action, exc, - ) + logger.error("HTTP error during %s submission: %s", action, exc) if exc.response is not None: - logger.error( - "Response body: %s", exc.response.text, - ) + logger.error("Response body: %s", exc.response.text) return False - success, accessions, receipt_messages = ( - parse_xml_receipt(receipt_root) - ) + success, accessions, receipt_messages = parse_xml_receipt(receipt_root) for msg in receipt_messages: logger.info(" Receipt: %s", msg) @@ -461,14 +406,10 @@ def _do_submission( logger.info("%s SUCCESSFUL", action) for acc in accessions: ext = acc.get("external_accession", "") - ext_suffix = ( - f" (study: {ext})" if ext else "" - ) + ext_suffix = f" (study: {ext})" if ext else "" logger.info( - " %s: alias=%s accession=%s" - " status=%s%s", - action, acc["alias"], acc["accession"], - acc["status"], ext_suffix, + " %s: alias=%s accession=%s status=%s%s", + action, acc["alias"], acc["accession"], acc["status"], ext_suffix, ) results[result_key].append(acc) else: @@ -501,14 
+442,12 @@ def _do_submission( @click.option( "--test", "use_test", is_flag=True, default=False, - help="Use the ENA test service" - " (submissions are discarded daily)", + help="Use the ENA test service (submissions are discarded daily)", ) @click.option( "--hold-until", default=None, - help="Hold studies private until this date" - " (YYYY-MM-DD, max 2 years from now)", + help="Hold studies private until this date (YYYY-MM-DD, max 2 years from now)", ) @click.option( "--log", "log_file", @@ -520,32 +459,27 @@ def _do_submission( "--output", type=click.Path(path_type=Path), default=None, - help="Path to write JSON accession results" - " (default: stdout)", + help="Path to write JSON accession results (default: stdout)", ) @click.option( "--max-results", default=5000, - help="Maximum number of projects to fetch" - " from the Reports API for duplicate checking", + help="Maximum number of projects to fetch from the Reports API for duplicate checking", ) @click.option( "--dry-run", is_flag=True, default=False, - help="Validate and build XML but do not" - " submit to ENA", + help="Validate and build XML but do not submit to ENA", ) @click.option( "--automated", is_flag=True, default=False, - help="Skip duplicate detection against the" - " Webin Reports API (for automated pipelines)", + help="Skip duplicate detection against the Webin Reports API (for automated pipelines)", ) @click.option( "--force", is_flag=True, default=False, - help="Submit duplicate studies using the MODIFY" - " action to overwrite existing ENA records," + help="Submit duplicate studies using the MODIFY action to overwrite existing ENA records," " instead of skipping them", ) def main( @@ -564,10 +498,7 @@ def main( username, password = common.get_credentials() env_label = "TEST" if use_test else "PRODUCTION" - logger.info( - "ENA Study Submission — environment: %s", - env_label, - ) + logger.info("ENA Study Submission — environment: %s", env_label) base_url = common.get_base_url(use_test) auth = 
HTTPBasicAuth(username, password) logger.debug("Auth username: %s", username) @@ -581,22 +512,14 @@ def main( input_file, json_record_keys=_JSON_RECORD_KEYS, ) if studies is None: - logger.error( - "Unsupported file format." - " Supported: .json, .csv, .tsv", - ) + logger.error("Unsupported file format. Supported: .json, .csv, .tsv") sys.exit(1) - logger.info( - "Loaded %d study/studies from input", - len(studies), - ) + logger.info("Loaded %d study/studies from input", len(studies)) # -- Step 2: Check for duplicates -------------------- if automated: - logger.info( - "Automated mode: skipping duplicate detection", - ) + logger.info("Automated mode: skipping duplicate detection") duplicates: dict[int, dict[str, Any]] = {} else: account_studies = fetch_account_studies( @@ -605,10 +528,8 @@ def main( ) for ps in account_studies: logger.info( - " Account study: %s | alias=%s" - " | title=%s | status=%s", - ps["accession"], ps["alias"], - ps["title"], ps["status"], + " Account study: %s | alias=%s | title=%s | status=%s", + ps["accession"], ps["alias"], ps["title"], ps["status"], ) duplicates = find_duplicate_studies( studies, account_studies, @@ -623,37 +544,23 @@ def main( studies_to_modify: list[dict[str, Any]] = [] if duplicates: - action_label = ( - "will be re-submitted with MODIFY" - if force else "will NOT be submitted" - ) + action_label = "will be re-submitted with MODIFY" if force else "will NOT be submitted" logger.warning( "Found %d duplicate(s) — %s:", len(duplicates), action_label, ) for idx, dup_info in duplicates.items(): - study_title = studies[idx].get( - "STUDY_TITLE", f"study[{idx}]", - ) + study_title = studies[idx].get("STUDY_TITLE", f"study[{idx}]") logger.warning( - " DUPLICATE: '%s' matches existing %s" - " (accession: %s)", - study_title, - dup_info["match_reason"], - dup_info["accession"], + " DUPLICATE: '%s' matches existing %s (accession: %s)", + study_title, dup_info["match_reason"], dup_info["accession"], ) results["duplicates"].append({ 
"input_index": idx, "title": study_title, "alias": studies[idx].get("alias", ""), - "existing_accession": ( - dup_info["accession"] - ), - "existing_secondary_accession": ( - dup_info.get( - "secondary_accession", "" - ) - ), + "existing_accession": dup_info["accession"], + "existing_secondary_accession": dup_info.get("secondary_accession", ""), "match_reason": dup_info["match_reason"], }) if force: @@ -669,16 +576,12 @@ def main( ] if not studies_to_submit and not studies_to_modify: - logger.info( - "No studies to submit" - " (all are duplicates or input is empty)", - ) + logger.info("No studies to submit (all are duplicates or input is empty)") common.write_results(results, output) return logger.info( - "%d new study/studies to ADD," - " %d duplicate(s) to MODIFY", + "%d new study/studies to ADD, %d duplicate(s) to MODIFY", len(studies_to_submit), len(studies_to_modify), ) @@ -686,23 +589,11 @@ def main( # -- Step 3: ADD new studies ------------------------- if studies_to_submit: - logger.info( - "Building ADD XML for %d new study/studies...", - len(studies_to_submit), - ) - xml_root = build_submission_xml( - studies_to_submit, hold_until=hold_until, - action="ADD", - ) + logger.info("Building ADD XML for %d new study/studies...", len(studies_to_submit)) + xml_root = build_submission_xml(studies_to_submit, hold_until=hold_until, action="ADD") xml_bytes = common.xml_to_bytes(xml_root) - logger.debug( - "Generated XML (ADD):\n%s", - xml_bytes.decode("utf-8"), - ) - logger.info( - "XML document size (ADD): %d bytes", - len(xml_bytes), - ) + logger.debug("Generated XML (ADD):\n%s", xml_bytes.decode("utf-8")) + logger.info("XML document size (ADD): %d bytes", len(xml_bytes)) ok = _do_submission( base_url, auth, xml_bytes, action="ADD", @@ -715,23 +606,11 @@ def main( # -- Step 4: MODIFY duplicate studies (--force) ------ if studies_to_modify: - logger.info( - "Building MODIFY XML for %d duplicate(s)...", - len(studies_to_modify), - ) - xml_root = build_submission_xml( 
- studies_to_modify, hold_until=hold_until, - action="MODIFY", - ) + logger.info("Building MODIFY XML for %d duplicate(s)...", len(studies_to_modify)) + xml_root = build_submission_xml(studies_to_modify, hold_until=hold_until, action="MODIFY") xml_bytes = common.xml_to_bytes(xml_root) - logger.debug( - "Generated XML (MODIFY):\n%s", - xml_bytes.decode("utf-8"), - ) - logger.info( - "XML document size (MODIFY): %d bytes", - len(xml_bytes), - ) + logger.debug("Generated XML (MODIFY):\n%s", xml_bytes.decode("utf-8")) + logger.info("XML document size (MODIFY): %d bytes", len(xml_bytes)) ok = _do_submission( base_url, auth, xml_bytes, action="MODIFY", @@ -751,39 +630,22 @@ def main( logger.info("=" * 60) logger.info("SUBMISSION SUMMARY") logger.info( - " Duplicates skipped: %d", - len(results["duplicates"]) - - len(results["modified"]), + " Duplicates skipped: %d", len(results["duplicates"]) - len(results["modified"]), ) for d in results["duplicates"]: - logger.info( - " %s -> %s", - d["title"], d["existing_accession"], - ) - logger.info( - " Newly submitted (ADD): %d", - len(results["submitted"]), - ) + logger.info(" %s -> %s", d["title"], d["existing_accession"]) + logger.info(" Newly submitted (ADD): %d", len(results["submitted"])) for s in results["submitted"]: ext = s.get("external_accession", "") ext_suffix = f" ({ext})" if ext else "" - logger.info( - " %s -> %s%s", - s["alias"], s["accession"], ext_suffix, - ) - logger.info( - " Modified (MODIFY): %d", - len(results["modified"]), - ) + logger.info(" %s -> %s%s", s["alias"], s["accession"], ext_suffix) + logger.info(" Modified (MODIFY): %d", len(results["modified"])) for m in results["modified"]: ext = m.get("external_accession", "") ext_suffix = f" ({ext})" if ext else "" - logger.info( - " %s -> %s%s", - m["alias"], m["accession"], ext_suffix, - ) + logger.info(" %s -> %s%s", m["alias"], m["accession"], ext_suffix) logger.info("=" * 60) if __name__ == "__main__": - main() + main() # type: ignore[call-arg] From 
3798fa4c66c6e808361997a1aa3991af56cded93 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Thu, 12 Mar 2026 12:44:28 +0000 Subject: [PATCH 06/36] Rename files --- bin/{ena_common.py => ena_submit_common.py} | 0 bin/submit_rawreads_study.py | 2 +- bin/test_submit_rawreads_study.py | 4 ++-- 3 files changed, 3 insertions(+), 3 deletions(-) rename bin/{ena_common.py => ena_submit_common.py} (100%) diff --git a/bin/ena_common.py b/bin/ena_submit_common.py similarity index 100% rename from bin/ena_common.py rename to bin/ena_submit_common.py diff --git a/bin/submit_rawreads_study.py b/bin/submit_rawreads_study.py index 1664a16..c00ee6d 100755 --- a/bin/submit_rawreads_study.py +++ b/bin/submit_rawreads_study.py @@ -42,7 +42,7 @@ import requests from requests.auth import HTTPBasicAuth -import ena_common as common +import ena_submit_common as common logger = logging.getLogger("ena_submit.rawreads_study") diff --git a/bin/test_submit_rawreads_study.py b/bin/test_submit_rawreads_study.py index b7d3dcb..0612f43 100644 --- a/bin/test_submit_rawreads_study.py +++ b/bin/test_submit_rawreads_study.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Tests for submit_rawreads_study.py and ena_common.py — study submission pipeline. +"""Tests for submit_rawreads_study.py and ena_submit_common.py — study submission pipeline. 
Usage: pytest bin/test_submit_rawreads_study.py -v @@ -16,7 +16,7 @@ # Ensure the scripts directory is importable sys.path.insert(0, os.path.dirname(__file__)) -import ena_common as common +import ena_submit_common as common from submit_rawreads_study import ( build_submission_xml, find_duplicate_studies, From ab5f92d481ec0e9bd0dda759c979faa01e3a0384 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Thu, 12 Mar 2026 13:37:24 +0000 Subject: [PATCH 07/36] Update tests for study_submit --- bin/test_submit_rawreads_study.py | 1650 +++++++++++++++++++++------ conf/modules.config | 6 +- conf/test_assembly.config | 2 - modules/local/ena_webin_cli/main.nf | 12 + modules/local/genome_upload/main.nf | 3 +- nextflow.config | 5 +- nextflow_schema.json | 15 +- tests/default.nf.test | 67 +- tests/default.nf.test.snap | 58 + tests/nextflow.config | 15 + workflows/assemblysubmit.nf | 20 +- workflows/genomesubmit.nf | 20 +- 12 files changed, 1512 insertions(+), 361 deletions(-) create mode 100644 tests/default.nf.test.snap diff --git a/bin/test_submit_rawreads_study.py b/bin/test_submit_rawreads_study.py index 0612f43..f07f85e 100644 --- a/bin/test_submit_rawreads_study.py +++ b/bin/test_submit_rawreads_study.py @@ -1,8 +1,19 @@ #!/usr/bin/env python3 -"""Tests for submit_rawreads_study.py and ena_submit_common.py — study submission pipeline. +"""Tests for submit_rawreads_study.py — raw-reads study submission pipeline. + +Covers: + A. Unit tests for build_submission_xml and _add_project_element + B. Unit tests for validate_study_xml + C. Unit tests for parse_xml_receipt + D. Unit tests for find_duplicate_studies and fetch_account_studies + E. CLI integration tests for main() using click.testing.CliRunner Usage: pytest bin/test_submit_rawreads_study.py -v + +All external I/O (HTTP requests, ENA reports API) is mocked. Tests do NOT +import from ena_submit_common directly — all assertions go through the public +API of submit_rawreads_study. 
""" from __future__ import annotations @@ -10,421 +21,1400 @@ import json import os import sys +import xml.etree.ElementTree as ET +from pathlib import Path +from textwrap import dedent +from typing import Any +from unittest.mock import MagicMock, patch import pytest +from click.testing import CliRunner +from requests.auth import HTTPBasicAuth -# Ensure the scripts directory is importable +# Ensure the scripts directory is on the path before importing the module. sys.path.insert(0, os.path.dirname(__file__)) -import ena_submit_common as common -from submit_rawreads_study import ( +from submit_rawreads_study import ( # noqa: E402 + _normalize_study_report, build_submission_xml, + fetch_account_studies, find_duplicate_studies, + main, + parse_xml_receipt, validate_study_xml, ) # --------------------------------------------------------------------------- -# Paths +# Constants shared across test groups # --------------------------------------------------------------------------- -FIXTURES_DIR = os.path.join( - os.path.dirname(__file__), "..", "assets", "test-fixtures", -) -EXAMPLE_STUDY_JSON = os.path.join(FIXTURES_DIR, "example_study.json") -EXAMPLE_STUDY_CSV = os.path.join(FIXTURES_DIR, "example_study.csv") -EXAMPLE_STUDY_TSV = os.path.join(FIXTURES_DIR, "example_study.tsv") - -_FIXTURES_PRESENT = os.path.isfile(EXAMPLE_STUDY_JSON) -requires_fixtures = pytest.mark.skipif( - not _FIXTURES_PRESENT, - reason="example study fixtures not present in assets/test-fixtures/", -) - -_JSON_RECORD_KEYS = ("studies", "data") +_PROD_REPORTS_URL = "https://www.ebi.ac.uk/ena/submit/report/projects" +_TEST_REPORTS_URL = "https://wwwdev.ebi.ac.uk/ena/submit/report/projects" # --------------------------------------------------------------------------- -# Fixtures +# Shared fixtures # --------------------------------------------------------------------------- @pytest.fixture -def example_study_json(): - """Load the example study JSON fixture.""" - with open(EXAMPLE_STUDY_JSON) as f: - 
return json.load(f) +def basic_study() -> dict[str, Any]: + """Return a minimal valid study metadata dict.""" + return { + "alias": "test-study-001", + "STUDY_TITLE": "A Basic Test Study", + "STUDY_ABSTRACT": "An abstract for the test study.", + "CENTER_PROJECT_NAME": "My Centre Project", + "existing_study_type": "Metagenomics", + } -# --------------------------------------------------------------------------- -# extract_records_from_json tests -# --------------------------------------------------------------------------- +@pytest.fixture +def metagenomics_assembly_study() -> dict[str, Any]: + """Return a study dict representing a metagenomics assembly submission.""" + return { + "alias": "metagenome-assembly-001", + "STUDY_TITLE": "Primary Metagenome Assembly of Soil Sample", + "STUDY_ABSTRACT": "Assembly of contigs from metagenome sequencing of soil.", + "CENTER_PROJECT_NAME": "Soil Metagenome Project", + "existing_study_type": "Metagenomics", + } -class TestExtractRecordsFromJson: - """Tests for extracting study rows from various JSON formats.""" +@pytest.fixture +def mag_genome_study() -> dict[str, Any]: + """Return a study dict representing a MAG/genome submission.""" + return { + "alias": "mag-genome-001", + "STUDY_TITLE": "Metagenome-Assembled Genome from Soil Microbiome", + "STUDY_ABSTRACT": "A high-quality MAG reconstructed from binned metagenome data.", + "existing_study_type": "Other", + "new_study_type": "Genome Sequencing", + } - @requires_fixtures - def test_dataharmonizer_container_format(self, example_study_json): - """The example_study.json fixture uses DataHarmonizer Container format.""" - studies = common.extract_records_from_json( - example_study_json, record_keys=_JSON_RECORD_KEYS, - ) - assert studies is not None - assert len(studies) == 1 - assert studies[0]["STUDY_TITLE"] == "MIMICC" - assert studies[0]["existing_study_type"] == "Metagenomics" - assert studies[0]["IS_PRIMARY"] == "YES" - - def test_plain_list(self): - """Plain list input 
returns the list as-is.""" - data = [{"STUDY_TITLE": "Test Study", "IS_PRIMARY": "YES"}] - studies = common.extract_records_from_json( - data, record_keys=_JSON_RECORD_KEYS, - ) - assert studies == data - def test_dict_with_studies_key(self): - """Dict with 'studies' key extracts the list.""" - data = {"studies": [{"STUDY_TITLE": "A"}, {"STUDY_TITLE": "B"}]} - studies = common.extract_records_from_json( - data, record_keys=_JSON_RECORD_KEYS, - ) - assert len(studies) == 2 +@pytest.fixture +def mock_credentials() -> tuple[str, str]: + """Return mock ENA credentials.""" + return ("Webin-12345", "pass") - def test_dict_with_data_key(self): - """Dict with 'data' key extracts the list.""" - data = {"data": [{"STUDY_TITLE": "C"}]} - studies = common.extract_records_from_json( - data, record_keys=_JSON_RECORD_KEYS, - ) - assert len(studies) == 1 - def test_single_study_object(self): - """Single dict input is wrapped in a list.""" - data = {"STUDY_TITLE": "Single"} - studies = common.extract_records_from_json( - data, record_keys=_JSON_RECORD_KEYS, - ) - assert len(studies) == 1 - assert studies[0]["STUDY_TITLE"] == "Single" +@pytest.fixture +def auth(mock_credentials: tuple[str, str]) -> HTTPBasicAuth: + """Return mock HTTPBasicAuth built from mock credentials.""" + return HTTPBasicAuth(*mock_credentials) - def test_invalid_input(self): - """Non-dict/list input returns None.""" - result = common.extract_records_from_json( - "not a dict or list", record_keys=_JSON_RECORD_KEYS, - ) - assert result is None - - def test_container_with_multiple_studies(self): - """Container format with multiple studies extracts all.""" - data = { - "Container": { - "SRA_studys": [ - {"STUDY_TITLE": "Study A"}, - {"STUDY_TITLE": "Study B"}, - ], - }, - } - studies = common.extract_records_from_json( - data, record_keys=_JSON_RECORD_KEYS, - ) - assert len(studies) == 2 + +@pytest.fixture +def account_study_record() -> dict[str, str]: + """Return a normalised account study record as returned by 
the Reports API.""" + return { + "title": "Existing Study Title", + "alias": "existing-study-alias", + "accession": "PRJEB99001", + "secondary_accession": "ERP099001", + "status": "PRIVATE", + } # --------------------------------------------------------------------------- -# XML building tests +# A. Unit tests for build_submission_xml and _add_project_element # --------------------------------------------------------------------------- class TestBuildSubmissionXml: - """Tests for building ENA study submission XML.""" - - def test_basic_xml_structure(self): - """Built XML contains expected elements and attributes.""" - studies = [ - { - "alias": "test-study", - "STUDY_TITLE": "Test Study", - "STUDY_ABSTRACT": "Abstract text.", - "existing_study_type": "Metagenomics", - }, + """Unit tests for build_submission_xml and _add_project_element.""" + + # ---- helper ------------------------------------------------------- + + @staticmethod + def _to_str(root: ET.Element) -> str: + """Serialise an ElementTree element to a UTF-8 string.""" + return ET.tostring(root, encoding="unicode") + + # ---- A1: Basic study fields ------------------------------------------- + + def test_study_title_round_trips(self, basic_study: dict[str, Any]) -> None: + """STUDY_TITLE is written as the TITLE element text.""" + root = build_submission_xml([basic_study]) + title_el = root.find(".//TITLE") + assert title_el is not None + assert title_el.text == basic_study["STUDY_TITLE"] + + def test_study_abstract_round_trips(self, basic_study: dict[str, Any]) -> None: + """STUDY_ABSTRACT is written as the DESCRIPTION element text.""" + root = build_submission_xml([basic_study]) + desc_el = root.find(".//DESCRIPTION") + assert desc_el is not None + assert desc_el.text == basic_study["STUDY_ABSTRACT"] + + def test_alias_round_trips(self, basic_study: dict[str, Any]) -> None: + """The alias attribute on PROJECT matches the input alias.""" + root = build_submission_xml([basic_study]) + project_el = 
root.find(".//PROJECT") + assert project_el is not None + assert project_el.get("alias") == basic_study["alias"] + + def test_center_project_name_round_trips(self, basic_study: dict[str, Any]) -> None: + """CENTER_PROJECT_NAME is written as the NAME element text.""" + root = build_submission_xml([basic_study]) + name_el = root.find(".//NAME") + assert name_el is not None + assert name_el.text == basic_study["CENTER_PROJECT_NAME"] + + def test_submission_project_present(self, basic_study: dict[str, Any]) -> None: + """SUBMISSION_PROJECT with SEQUENCING_PROJECT is always present.""" + root = build_submission_xml([basic_study]) + sp_el = root.find(".//SUBMISSION_PROJECT") + assert sp_el is not None + seq_el = sp_el.find("SEQUENCING_PROJECT") + assert seq_el is not None + + # ---- A2: Study type PROJECT_ATTRIBUTEs -------------------------------- + + def test_existing_study_type_emitted_as_project_attribute( + self, basic_study: dict[str, Any] + ) -> None: + """existing_study_type is emitted as a PROJECT_ATTRIBUTE TAG/VALUE pair.""" + root = build_submission_xml([basic_study]) + xml_str = self._to_str(root) + assert "existing_study_type" in xml_str + assert basic_study["existing_study_type"] in xml_str + + def test_new_study_type_absent_when_not_other(self, basic_study: dict[str, Any]) -> None: + """new_study_type is NOT emitted when existing_study_type != 'Other'.""" + study = dict(basic_study) + study["new_study_type"] = "Genome Sequencing" + root = build_submission_xml([study]) + xml_str = self._to_str(root) + assert "new_study_type" not in xml_str + + def test_new_study_type_present_when_existing_is_other( + self, mag_genome_study: dict[str, Any] + ) -> None: + """new_study_type appears as a PROJECT_ATTRIBUTE when existing_study_type == 'Other'.""" + root = build_submission_xml([mag_genome_study]) + tags = [ + el.text + for el in root.findall(".//PROJECT_ATTRIBUTE/TAG") + if el.text is not None + ] + values = [ + el.text + for el in 
root.findall(".//PROJECT_ATTRIBUTE/VALUE") + if el.text is not None + ] + assert "existing_study_type" in tags + assert "new_study_type" in tags + assert "Other" in values + assert "Genome Sequencing" in values + + def test_no_project_attributes_when_no_study_type(self) -> None: + """No PROJECT_ATTRIBUTES element when existing_study_type is absent.""" + study = { + "alias": "no-type", + "STUDY_TITLE": "No Type Study", + } + root = build_submission_xml([study]) + attrs_el = root.find(".//PROJECT_ATTRIBUTES") + assert attrs_el is None + + # ---- A3: Hold date ---------------------------------------------------- + + def test_hold_until_present_in_submission(self, basic_study: dict[str, Any]) -> None: + """When hold_until is given, HOLD element with HoldUntilDate appears in SUBMISSION.""" + root = build_submission_xml([basic_study], hold_until="2028-06-15") + hold_el = root.find(".//HOLD") + assert hold_el is not None + assert hold_el.get("HoldUntilDate") == "2028-06-15" + + def test_hold_until_absent_when_not_provided(self, basic_study: dict[str, Any]) -> None: + """When hold_until is not given, no HOLD element appears.""" + root = build_submission_xml([basic_study]) + hold_el = root.find(".//HOLD") + assert hold_el is None + + # ---- A4: MODIFY action ------------------------------------------------ + + def test_modify_action_produces_modify_element(self, basic_study: dict[str, Any]) -> None: + """Using action='MODIFY' produces a MODIFY element instead of ADD.""" + root = build_submission_xml([basic_study], action="MODIFY") + xml_str = self._to_str(root) + assert "" in xml_str + + def test_add_action_produces_add_element(self, basic_study: dict[str, Any]) -> None: + """Default action='ADD' produces an ADD element.""" + root = build_submission_xml([basic_study]) + xml_str = self._to_str(root) + assert "" in xml_str + + def test_modify_action_does_not_produce_add(self, basic_study: dict[str, Any]) -> None: + """MODIFY action does not produce an ADD element.""" + root = 
build_submission_xml([basic_study], action="MODIFY") + xml_str = self._to_str(root) + # Strip the XML preamble to avoid false positives in attributes + assert "" not in xml_str + + # ---- A5: Assembly/metagenomics study ---------------------------------- + + def test_metagenomics_assembly_study_round_trips( + self, metagenomics_assembly_study: dict[str, Any] + ) -> None: + """Metagenomics assembly study dict round-trips correctly into XML.""" + root = build_submission_xml([metagenomics_assembly_study]) + project_el = root.find(".//PROJECT") + assert project_el is not None + assert project_el.get("alias") == metagenomics_assembly_study["alias"] + + title_el = root.find(".//TITLE") + assert title_el is not None + assert title_el.text == metagenomics_assembly_study["STUDY_TITLE"] + + tags = [ + el.text for el in root.findall(".//PROJECT_ATTRIBUTE/TAG") if el.text ] - root = build_submission_xml(studies) - xml_bytes = common.xml_to_bytes(root) - xml_str = xml_bytes.decode("utf-8") - assert "" in xml_str - assert 'alias="test-study"' in xml_str - assert "Test Study" in xml_str - assert "Abstract text." 
in xml_str - assert " None: + """MAG/genome study with existing_study_type=Other produces both PROJECT_ATTRIBUTEs.""" + root = build_submission_xml([mag_genome_study]) + attr_els = root.findall(".//PROJECT_ATTRIBUTE") + assert len(attr_els) == 2 + + pairs: dict[str, str] = {} + for attr_el in attr_els: + tag_el = attr_el.find("TAG") + val_el = attr_el.find("VALUE") + if tag_el is not None and val_el is not None: + pairs[tag_el.text or ""] = val_el.text or "" + + assert pairs.get("existing_study_type") == "Other" + assert pairs.get("new_study_type") == "Genome Sequencing" + + # ---- Multiple studies in one call ------------------------------------- + + def test_multiple_studies_produce_multiple_project_elements( + self, + basic_study: dict[str, Any], + metagenomics_assembly_study: dict[str, Any], + ) -> None: + """Multiple studies in input produce multiple PROJECT elements.""" + root = build_submission_xml([basic_study, metagenomics_assembly_study]) + projects = root.findall(".//PROJECT") + assert len(projects) == 2 + + # ---- Alias auto-derived from title when absent ------------------------ + + def test_alias_derived_from_title_when_absent(self) -> None: + """When no alias is provided, alias is derived from STUDY_TITLE (spaces→underscores).""" + study = {"STUDY_TITLE": "My Derived Title"} + root = build_submission_xml([study]) + project_el = root.find(".//PROJECT") + assert project_el is not None + alias = project_el.get("alias", "") + assert "_" in alias or alias == "My_Derived_Title"[:50] + + +# --------------------------------------------------------------------------- +# B. Unit tests for validate_study_xml +# --------------------------------------------------------------------------- + + +class TestValidateStudyXml: + """Unit tests for validate_study_xml.""" + + @staticmethod + def _build_valid_xml_bytes(alias: str = "study-1", title: str = "Test Study") -> bytes: + """Build a minimal valid study XML document as bytes. 
+ + Args: + alias: The PROJECT alias attribute value. + title: The TITLE element text. + + Returns: + UTF-8 encoded XML bytes. + """ + xml_str = dedent(f"""\ + + + + + {title} + + + + + + + """) + return xml_str.encode("utf-8") + + # ---- B7: Valid XML passes --------------------------------------------- + + def test_valid_assembly_study_xml_passes(self) -> None: + """A valid assembly study XML passes validation without errors.""" + xml_bytes = self._build_valid_xml_bytes( + alias="assembly-study", title="Assembly Study Title" + ) + is_valid, messages = validate_study_xml(xml_bytes) + assert is_valid, f"Expected valid; messages: {messages}" + + def test_valid_metagenomics_xml_passes(self) -> None: + """Well-formed XML with required elements passes validation.""" + study = { + "alias": "meta-study", + "STUDY_TITLE": "Metagenomics Study", + "existing_study_type": "Metagenomics", + } + import ena_submit_common as _common # local import; only for xml_to_bytes helper + + root = build_submission_xml([study]) + xml_bytes = _common.xml_to_bytes(root) is_valid, messages = validate_study_xml(xml_bytes) - for msg in messages: - print(msg) - assert is_valid + assert is_valid, f"Expected valid; messages: {messages}" + + # ---- B8: Missing TITLE ------------------------------------------------ + + def test_missing_title_fails_with_title_in_message(self) -> None: + """A PROJECT without a TITLE element fails validation with 'TITLE' in the message.""" + xml_str = dedent("""\ + + + + + + + + + """) + is_valid, messages = validate_study_xml(xml_str.encode("utf-8")) + assert not is_valid + combined = " ".join(messages) + assert "TITLE" in combined + + # ---- B9: Missing SUBMISSION_PROJECT ----------------------------------- + + def test_missing_submission_project_fails(self) -> None: + """A PROJECT without SUBMISSION_PROJECT fails with 'SUBMISSION_PROJECT' in message.""" + xml_str = dedent("""\ + + + + + Some Title + + + + """) + is_valid, messages = 
validate_study_xml(xml_str.encode("utf-8")) + assert not is_valid + combined = " ".join(messages) + assert "SUBMISSION_PROJECT" in combined + + # ---- B10: Malformed XML ----------------------------------------------- + + def test_malformed_xml_fails_with_not_well_formed_message(self) -> None: + """Malformed XML fails validation with 'not well-formed' in the message.""" + bad_xml = b"Unclosed" + is_valid, messages = validate_study_xml(bad_xml) + assert not is_valid + combined = " ".join(messages).lower() + assert "not well-formed" in combined or "well-formed" in combined + + # ---- Extra structural checks ----------------------------------------- + + def test_empty_title_fails_validation(self) -> None: + """A PROJECT with an empty TITLE element fails validation.""" + xml_str = dedent("""\ + <?xml version='1.0' encoding='UTF-8'?> + <WEBIN> + <PROJECT_SET> + <PROJECT alias="empty-title"> + <TITLE> + + + + + """) + is_valid, messages = validate_study_xml(xml_str.encode("utf-8")) + assert not is_valid + + def test_missing_project_set_fails_validation(self) -> None: + """XML without a PROJECT_SET element fails validation.""" + xml_str = b"" + is_valid, messages = validate_study_xml(xml_str) + assert not is_valid + + def test_validation_returns_tuple_of_bool_and_list(self) -> None: + """validate_study_xml always returns (bool, list).""" + xml_bytes = self._build_valid_xml_bytes() + result = validate_study_xml(xml_bytes) + assert isinstance(result, tuple) + assert len(result) == 2 + is_valid, messages = result + assert isinstance(is_valid, bool) + assert isinstance(messages, list) # --------------------------------------------------------------------------- -# Duplicate detection tests +# C. 
Unit tests for parse_xml_receipt +# --------------------------------------------------------------------------- + + +class TestParseXmlReceipt: + """Unit tests for parse_xml_receipt.""" + + @staticmethod + def _parse(xml_str: str) -> tuple[bool, list[dict[str, str]], list[str]]: + """Parse an XML receipt string via parse_xml_receipt. + + Args: + xml_str: Raw XML receipt string. + + Returns: + Tuple of (success, accessions, messages). + """ + root = ET.fromstring(xml_str) + return parse_xml_receipt(root) + + # ---- C11: Successful PROJECT receipt ---------------------------------- + + def test_successful_project_receipt_returns_true(self) -> None: + """A success='true' receipt returns success=True.""" + xml_str = dedent("""\ + + + + + + """) + success, accessions, messages = self._parse(xml_str) + assert success is True + + def test_successful_project_receipt_accession_round_trips(self) -> None: + """PROJECT accession, alias, status, holdUntilDate, and external_accession round-trip.""" + xml_str = dedent("""\ + + + + + + """) + success, accessions, messages = self._parse(xml_str) + assert len(accessions) == 1 + acc = accessions[0] + assert acc["accession"] == "PRJEB12345" + assert acc["alias"] == "my-study" + assert acc["status"] == "PRIVATE" + assert acc["holdUntilDate"] == "2025-01-15" + assert acc["external_accession"] == "ERP012345" + assert acc["external_type"] == "study" + + # ---- C12: Failed receipt ---------------------------------------------- + + def test_failed_receipt_returns_false(self) -> None: + """A success='false' receipt returns success=False.""" + xml_str = dedent("""\ + + + Center name "Unknown" is not permitted to submit in Webin-12345. + + + """) + success, accessions, messages = self._parse(xml_str) + assert success is False + + def test_failed_receipt_captures_error_message(self) -> None: + """Error text from MESSAGES/ERROR is captured in the messages list.""" + xml_str = dedent("""\ + + + Submission failed due to duplicate alias. 
+ + + """) + _, _, messages = self._parse(xml_str) + assert any("Submission failed due to duplicate alias" in m for m in messages) + + # ---- C13: STUDY tag (alternate ENA format) ---------------------------- + + def test_study_tag_receipt_extracts_accession_and_alias(self) -> None: + """Receipts using STUDY instead of PROJECT still extract accession and alias.""" + xml_str = dedent("""\ + + + + """) + success, accessions, messages = self._parse(xml_str) + assert success is True + assert len(accessions) == 1 + assert accessions[0]["accession"] == "ERP099999" + assert accessions[0]["alias"] == "study-alias-1" + + # ---- C14: MESSAGES with INFO and ERROR -------------------------------- + + def test_receipt_with_info_messages_captured(self) -> None: + """INFO elements in MESSAGES are captured in the messages list.""" + xml_str = dedent("""\ + + + + Submission processed successfully. + + + """) + _, _, messages = self._parse(xml_str) + assert any("Submission processed successfully" in m for m in messages) + assert any(m.startswith("INFO:") for m in messages) + + def test_receipt_with_multiple_error_messages(self) -> None: + """Multiple ERROR elements are all captured.""" + xml_str = dedent("""\ + + + First error. + Second error. + + + """) + _, _, messages = self._parse(xml_str) + error_msgs = [m for m in messages if m.startswith("ERROR:")] + assert len(error_msgs) == 2 + + def test_receipt_both_info_and_error_captured(self) -> None: + """Both INFO and ERROR elements are captured in messages.""" + xml_str = dedent("""\ + + + Partial success. + Some records failed. 
+ + + """) + _, _, messages = self._parse(xml_str) + assert any(m.startswith("INFO:") for m in messages) + assert any(m.startswith("ERROR:") for m in messages) + + def test_receipt_no_messages_element_returns_empty_list(self) -> None: + """A receipt without a MESSAGES element returns an empty messages list.""" + xml_str = dedent("""\ + + + + """) + _, _, messages = self._parse(xml_str) + assert messages == [] + + def test_receipt_success_false_string(self) -> None: + """Receipts with success='false' (string) correctly parse to False.""" + xml_str = "" + success, _, _ = self._parse(xml_str) + assert success is False + + def test_receipt_missing_success_defaults_to_false(self) -> None: + """A receipt without a success attribute defaults to False.""" + xml_str = "" + success, _, _ = self._parse(xml_str) + assert success is False + + +# --------------------------------------------------------------------------- +# D. Unit tests for find_duplicate_studies and fetch_account_studies # --------------------------------------------------------------------------- class TestFindDuplicateStudies: - """Tests for alias/title-based duplicate detection.""" + """Unit tests for find_duplicate_studies.""" - def _make_account_study( - self, + @staticmethod + def _account_record( title: str = "", alias: str = "", - accession: str = "PRJEB99", - secondary_accession: str = "", + accession: str = "PRJEB00001", status: str = "PRIVATE", ) -> dict[str, str]: - """Build a normalised account study dict.""" + """Build a normalised account study record. + + Args: + title: Study title (as returned by Reports API normalizer). + alias: Study alias. + accession: ENA project accession. + status: Release status. + + Returns: + Normalised study dict. 
+ """ return { "title": title, "alias": alias, "accession": accession, - "secondary_accession": secondary_accession, + "secondary_accession": "", "status": status, } - def test_no_duplicates(self): - """No match when titles and aliases differ.""" - new = [{"STUDY_TITLE": "New Study", "alias": "new-1"}] - account = [ - self._make_account_study( - title="Other Study", alias="other-1", - ), - ] - dups = find_duplicate_studies(new, account) - assert len(dups) == 0 + # ---- D15: Exact alias match ------------------------------------------ - def test_duplicate_by_title(self): - """Exact title match flags a duplicate.""" - new = [{"STUDY_TITLE": "Existing Study"}] - account = [ - self._make_account_study( - title="Existing Study", - accession="PRJEB99", - status="PRIVATE", - ), - ] - dups = find_duplicate_studies(new, account) - assert 0 in dups - assert dups[0]["accession"] == "PRJEB99" - - def test_duplicate_by_alias(self): - """Alias match flags a duplicate even with different title.""" - new = [{"STUDY_TITLE": "New Title", "alias": "my-alias"}] - account = [ - self._make_account_study( - title="Different Title", - alias="my-alias", - accession="PRJEB60", - ), - ] - dups = find_duplicate_studies(new, account) + def test_exact_alias_match_detected_as_duplicate(self) -> None: + """An exact alias match is detected as a duplicate.""" + new_studies = [{"STUDY_TITLE": "Different Title", "alias": "my-alias-x"}] + account = [self._account_record(title="Other", alias="my-alias-x", accession="PRJEB10")] + dups = find_duplicate_studies(new_studies, account) assert 0 in dups - assert dups[0]["accession"] == "PRJEB60" + assert dups[0]["accession"] == "PRJEB10" assert "alias" in dups[0]["match_reason"] - def test_alias_takes_precedence_over_title(self): - """When alias matches, it is reported as the match reason.""" - new = [{"STUDY_TITLE": "Same Title", "alias": "same-alias"}] + # ---- D16: Exact title match ------------------------------------------ + + def 
test_exact_title_match_detected_as_duplicate(self) -> None: + """An exact STUDY_TITLE match is detected as a duplicate.""" + new_studies = [{"STUDY_TITLE": "My Metagenomics Study"}] account = [ - self._make_account_study( - title="Same Title", - alias="same-alias", - accession="PRJEB70", - ), + self._account_record(title="My Metagenomics Study", accession="PRJEB20") ] - dups = find_duplicate_studies(new, account) + dups = find_duplicate_studies(new_studies, account) assert 0 in dups - assert "alias" in dups[0]["match_reason"] + assert dups[0]["accession"] == "PRJEB20" + assert "title" in dups[0]["match_reason"] - def test_partial_title_not_duplicate(self): - """Partial title match does not count as a duplicate.""" - new = [{"STUDY_TITLE": "My Study"}] - account = [ - self._make_account_study( - title="My Study Extended Title", - ), - ] - dups = find_duplicate_studies(new, account) - assert len(dups) == 0 + # ---- D17: No match returns empty dict -------------------------------- - def test_empty_account_no_duplicates(self): - """Empty account list produces no duplicates.""" - new = [{"STUDY_TITLE": "Test", "alias": "t"}] - dups = find_duplicate_studies(new, []) - assert len(dups) == 0 + def test_no_match_returns_empty_dict(self) -> None: + """When neither alias nor title matches, an empty dict is returned.""" + new_studies = [{"STUDY_TITLE": "Completely Novel Study", "alias": "novel-alias"}] + account = [self._account_record(title="Existing Study", alias="existing-alias")] + dups = find_duplicate_studies(new_studies, account) + assert dups == {} - def test_empty_input_no_duplicates(self): - """Empty input list produces no duplicates.""" - account = [ - self._make_account_study(title="Existing"), - ] + def test_empty_account_returns_empty_dict(self) -> None: + """Empty account list results in no duplicates.""" + new_studies = [{"STUDY_TITLE": "Any Study"}] + dups = find_duplicate_studies(new_studies, []) + assert dups == {} + + def 
test_empty_new_studies_returns_empty_dict(self) -> None: + """Empty new studies list results in no duplicates.""" + account = [self._account_record(title="Existing")] dups = find_duplicate_studies([], account) - assert len(dups) == 0 + assert dups == {} - def test_study_without_title_or_alias_skipped(self): - """Studies with no title or alias are not flagged.""" - new = [{}] - account = [ - self._make_account_study(title="Something"), - ] - dups = find_duplicate_studies(new, account) - assert len(dups) == 0 + def test_study_without_title_or_alias_not_flagged(self) -> None: + """A study dict with neither title nor alias is not flagged as duplicate.""" + new_studies = [{"IS_PRIMARY": "YES"}] # no STUDY_TITLE, no alias + account = [self._account_record(title="Existing")] + dups = find_duplicate_studies(new_studies, account) + assert dups == {} - def test_mixed_duplicates_and_new(self): - """Mix of duplicate and new studies.""" - account = [ - self._make_account_study( - title="Dup By Title", - alias="dup-title", - accession="PRJEB10", - ), - self._make_account_study( - title="Other", - alias="dup-alias", - accession="PRJEB20", - ), - ] - new = [ - {"STUDY_TITLE": "Dup By Title", "alias": "new-alias"}, - {"STUDY_TITLE": "New Title", "alias": "dup-alias"}, - {"STUDY_TITLE": "Brand New", "alias": "brand-new"}, - ] - dups = find_duplicate_studies(new, account) - assert 0 in dups # title match - assert 1 in dups # alias match - assert 2 not in dups # new + def test_partial_title_not_a_duplicate(self) -> None: + """A partial title match does not count as a duplicate (exact match only).""" + new_studies = [{"STUDY_TITLE": "Metagenomics"}] + account = [self._account_record(title="Metagenomics Assembly Study")] + dups = find_duplicate_studies(new_studies, account) + assert dups == {} - def test_all_duplicates_early_exit(self): - """All studies being duplicates terminates early.""" - account = [ - self._make_account_study( - title="A", accession="PRJEB1", - ), - 
self._make_account_study( - title="B", accession="PRJEB2", - ), + def test_multiple_studies_only_matching_flagged(self) -> None: + """Only the matching study is flagged when multiple new studies are submitted.""" + account = [self._account_record(title="Old Study", alias="old-alias", accession="PRJEB50")] + new_studies = [ + {"STUDY_TITLE": "Old Study"}, + {"STUDY_TITLE": "New Study"}, ] - new = [ - {"STUDY_TITLE": "A"}, - {"STUDY_TITLE": "B"}, + dups = find_duplicate_studies(new_studies, account) + assert 0 in dups + assert 1 not in dups + + def test_duplicate_index_corresponds_to_new_studies_list(self) -> None: + """The index in the duplicates dict matches the position in new_studies.""" + account = [self._account_record(title="Study C", accession="PRJEB33")] + new_studies = [ + {"STUDY_TITLE": "Study A"}, + {"STUDY_TITLE": "Study B"}, + {"STUDY_TITLE": "Study C"}, ] - dups = find_duplicate_studies(new, account) - assert len(dups) == 2 + dups = find_duplicate_studies(new_studies, account) + assert 2 in dups + assert dups[2]["accession"] == "PRJEB33" # --------------------------------------------------------------------------- -# File loading tests (JSON, CSV, TSV) +# D18: _normalize_study_report and fetch_account_studies # --------------------------------------------------------------------------- -# The expected study data shared by all supported fixtures -EXPECTED_STUDY = { - "IS_PRIMARY": "YES", - "STUDY_TITLE": "MIMICC", - "existing_study_type": "Metagenomics", -} + +class TestNormalizeStudyReport: + """Unit tests for _normalize_study_report field normalisation.""" + + def test_title_field_normalised(self) -> None: + """The 'title' field is extracted from the raw report dict.""" + report = {"title": "My Title", "alias": "my-alias", "accession": "PRJEB1"} + result = _normalize_study_report(report) + assert result["title"] == "My Title" + + def test_study_title_fallback(self) -> None: + """studyTitle is used when 'title' is absent.""" + report = {"studyTitle": 
"Study Title Fallback", "alias": "a", "accession": "PRJEB2"} + result = _normalize_study_report(report) + assert result["title"] == "Study Title Fallback" + + def test_alias_field_normalised(self) -> None: + """The 'alias' field is extracted.""" + report = {"title": "T", "alias": "direct-alias", "accession": "PRJEB3"} + result = _normalize_study_report(report) + assert result["alias"] == "direct-alias" + + def test_study_alias_fallback(self) -> None: + """studyAlias is used when 'alias' is absent.""" + report = {"title": "T", "studyAlias": "study-alias-fallback", "accession": "PRJEB4"} + result = _normalize_study_report(report) + assert result["alias"] == "study-alias-fallback" + + def test_accession_field_normalised(self) -> None: + """The 'accession' field is extracted.""" + report = {"title": "T", "alias": "a", "accession": "PRJEB5"} + result = _normalize_study_report(report) + assert result["accession"] == "PRJEB5" + + def test_study_accession_fallback(self) -> None: + """studyAccession is used when 'accession' is absent.""" + report = {"title": "T", "alias": "a", "studyAccession": "PRJEB99"} + result = _normalize_study_report(report) + assert result["accession"] == "PRJEB99" + + def test_missing_fields_default_to_empty_string(self) -> None: + """Missing fields default to empty string without raising.""" + report = {} + result = _normalize_study_report(report) + assert result["title"] == "" + assert result["alias"] == "" + assert result["accession"] == "" + + def test_status_field_defaults_to_unknown(self) -> None: + """The status field defaults to 'UNKNOWN' when absent.""" + report = {"title": "T", "alias": "a", "accession": "PRJEB6"} + result = _normalize_study_report(report) + assert result["status"] == "UNKNOWN" + + def test_release_status_used_for_status(self) -> None: + """releaseStatus is mapped to the 'status' key.""" + report = {"title": "T", "alias": "a", "accession": "PRJEB7", "releaseStatus": "PUBLIC"} + result = _normalize_study_report(report) + 
assert result["status"] == "PUBLIC" -@requires_fixtures -class TestLoadInputFile: - """Tests for loading study data from JSON, CSV, and TSV files.""" +class TestFetchAccountStudies: + """Unit tests for fetch_account_studies calling common.fetch_account_records.""" - def test_load_csv(self): - """CSV file loads correctly.""" - studies = common.load_input_file( - EXAMPLE_STUDY_CSV, json_record_keys=_JSON_RECORD_KEYS, + def test_fetch_calls_fetch_account_records_with_correct_urls( + self, auth: HTTPBasicAuth + ) -> None: + """fetch_account_studies calls common.fetch_account_records with prod/test URLs.""" + target = "submit_rawreads_study.common.fetch_account_records" + with patch(target, return_value=[]) as mock_fetch: + fetch_account_studies(auth, use_test=False) + mock_fetch.assert_called_once() + call_kwargs = mock_fetch.call_args + assert call_kwargs.kwargs.get("prod_url") == _PROD_REPORTS_URL + assert call_kwargs.kwargs.get("test_url") == _TEST_REPORTS_URL + + def test_fetch_passes_normalizer_callable(self, auth: HTTPBasicAuth) -> None: + """fetch_account_studies passes a callable normalizer to fetch_account_records.""" + target = "submit_rawreads_study.common.fetch_account_records" + with patch(target, return_value=[]) as mock_fetch: + fetch_account_studies(auth, use_test=False) + call_kwargs = mock_fetch.call_args + normalizer = call_kwargs.kwargs.get("normalizer") + assert callable(normalizer) + + def test_fetch_normalizer_handles_title_variant(self, auth: HTTPBasicAuth) -> None: + """The normalizer passed to fetch_account_records handles title/studyTitle variants.""" + target = "submit_rawreads_study.common.fetch_account_records" + captured_normalizer = None + + def capture_normalizer(*args: Any, **kwargs: Any) -> list[dict[str, str]]: + nonlocal captured_normalizer + captured_normalizer = kwargs.get("normalizer") + return [] + + with patch(target, side_effect=capture_normalizer): + fetch_account_studies(auth, use_test=False) + + assert captured_normalizer 
is not None + result_title = captured_normalizer({"title": "Direct Title", "accession": "PRJEB1"}) + assert result_title["title"] == "Direct Title" + + result_study_title = captured_normalizer( + {"studyTitle": "Fallback Title", "accession": "PRJEB2"} ) - assert studies is not None - assert len(studies) == 1 - for key, val in EXPECTED_STUDY.items(): - assert studies[0][key] == val - - def test_load_tsv(self): - """TSV file loads correctly.""" - studies = common.load_input_file( - EXAMPLE_STUDY_TSV, json_record_keys=_JSON_RECORD_KEYS, + assert result_study_title["title"] == "Fallback Title" + + def test_fetch_normalizer_handles_alias_variant(self, auth: HTTPBasicAuth) -> None: + """The normalizer handles alias/studyAlias field variants.""" + target = "submit_rawreads_study.common.fetch_account_records" + captured_normalizer = None + + def capture_normalizer(*args: Any, **kwargs: Any) -> list[dict[str, str]]: + nonlocal captured_normalizer + captured_normalizer = kwargs.get("normalizer") + return [] + + with patch(target, side_effect=capture_normalizer): + fetch_account_studies(auth, use_test=False) + + assert captured_normalizer is not None + result = captured_normalizer({"alias": "direct-alias", "accession": "PRJEB3"}) + assert result["alias"] == "direct-alias" + + result_fallback = captured_normalizer( + {"studyAlias": "study-alias-fallback", "accession": "PRJEB4"} ) - assert studies is not None - assert len(studies) == 1 - for key, val in EXPECTED_STUDY.items(): - assert studies[0][key] == val - - def test_load_json(self): - """JSON file loads correctly.""" - studies = common.load_input_file( - EXAMPLE_STUDY_JSON, json_record_keys=_JSON_RECORD_KEYS, + assert result_fallback["alias"] == "study-alias-fallback" + + def test_fetch_normalizer_handles_accession_variant(self, auth: HTTPBasicAuth) -> None: + """The normalizer handles accession/studyAccession field variants.""" + target = "submit_rawreads_study.common.fetch_account_records" + captured_normalizer = None + 
+ def capture_normalizer(*args: Any, **kwargs: Any) -> list[dict[str, str]]: + nonlocal captured_normalizer + captured_normalizer = kwargs.get("normalizer") + return [] + + with patch(target, side_effect=capture_normalizer): + fetch_account_studies(auth, use_test=False) + + assert captured_normalizer is not None + result = captured_normalizer( + {"title": "T", "studyAccession": "PRJEB99", "accession": ""} ) - assert studies is not None - assert len(studies) == 1 - for key, val in EXPECTED_STUDY.items(): - assert studies[0][key] == val - - def test_all_formats_produce_same_data(self): - """All supported formats should produce the same core study fields.""" - all_studies = [ - common.load_input_file( - path, json_record_keys=_JSON_RECORD_KEYS, + # studyAccession falls back when 'accession' is falsy + assert result["accession"] == "PRJEB99" + + +# --------------------------------------------------------------------------- +# E. CLI integration tests for main() using click.testing.CliRunner +# --------------------------------------------------------------------------- + + +def _extract_json_from_output(output: str) -> dict[str, Any]: + """Extract the JSON results dict from mixed CLI output. + + The CLI writes JSON results via ``print()`` to stdout, but logging + also emits to stderr which CliRunner captures in ``result.output``. + This helper finds the last top-level JSON object in the output. + + Args: + output: The full ``result.output`` string from CliRunner. + + Returns: + Parsed JSON dict. + + Raises: + ValueError: If no valid JSON object is found. + """ + # Walk backwards through the output looking for a complete JSON block. + # The results JSON always starts with "{\n " and ends with "\n}". 
+ depth = 0 + end = -1 + start = -1 + for i in range(len(output) - 1, -1, -1): + ch = output[i] + if ch == "}": + if depth == 0: + end = i + depth += 1 + elif ch == "{": + depth -= 1 + if depth == 0: + start = i + break + if start == -1 or end == -1: + raise ValueError(f"No JSON object found in output: {output[:200]!r}") + return json.loads(output[start : end + 1]) + + +def _make_study_json(study: dict[str, Any]) -> str: + """Serialise a study dict into a JSON string using the Container format. + + Args: + study: Study metadata dict. + + Returns: + JSON string in DataHarmonizer Container format. + """ + return json.dumps({ + "Container": { + "SRA_studys": [study], + } + }) + + +def _make_study_csv(study: dict[str, Any]) -> str: + """Serialise a study dict into a minimal CSV string. + + Args: + study: Study metadata dict. + + Returns: + CSV string with header and one data row. + """ + headers = list(study.keys()) + values = [str(study[h]) for h in headers] + return ",".join(headers) + "\n" + ",".join(values) + "\n" + + +def _make_study_tsv(study: dict[str, Any]) -> str: + """Serialise a study dict into a minimal TSV string. + + Args: + study: Study metadata dict. + + Returns: + TSV string with header and one data row. 
+ """ + headers = list(study.keys()) + values = [str(study[h]) for h in headers] + return "\t".join(headers) + "\n" + "\t".join(values) + "\n" + + +@pytest.fixture +def runner() -> CliRunner: + """Return a Click test runner with isolated filesystem.""" + return CliRunner() + + +@pytest.fixture +def minimal_metagenomics_study() -> dict[str, Any]: + """Return a minimal metagenomics study for CLI tests.""" + return { + "alias": "cli-metagenomics-001", + "STUDY_TITLE": "CLI Metagenomics Test Study", + "STUDY_ABSTRACT": "Abstract for CLI test.", + "existing_study_type": "Metagenomics", + } + + +class TestMainCli: + """CLI integration tests for main() using CliRunner.""" + + _CRED_TARGET = "submit_rawreads_study.common.get_credentials" + _SUBMIT_TARGET = "submit_rawreads_study.common.submit_xml" + + def _invoke( + self, + runner: CliRunner, + args: list[str], + input_filename: str, + input_content: str, + ) -> Any: + """Write input file and invoke the CLI. + + Args: + runner: Click CliRunner instance. + args: CLI arguments (excluding --input, which is added automatically). + input_filename: Filename for the temporary input file. + input_content: Content to write to the input file. + + Returns: + Click Result object. 
+ """ + with runner.isolated_filesystem(): + Path(input_filename).write_text(input_content) + result = runner.invoke( + main, + ["--input", input_filename] + args, + catch_exceptions=False, ) - for path in [EXAMPLE_STUDY_JSON, EXAMPLE_STUDY_CSV, EXAMPLE_STUDY_TSV] - ] - for studies in all_studies: - assert len(studies) == 1 - for key, val in EXPECTED_STUDY.items(): - assert studies[0][key] == val - - def test_unknown_extension_returns_none(self, tmp_path): - """Unsupported file extension returns None.""" - unknown = tmp_path / "data.parquet" - unknown.write_text("dummy") - result = common.load_input_file( - str(unknown), json_record_keys=_JSON_RECORD_KEYS, + return result + + # ---- E19: JSON input, automated mode, dry-run ------------------------- + + def test_json_input_automated_dry_run_exits_0( + self, + runner: CliRunner, + minimal_metagenomics_study: dict[str, Any], + ) -> None: + """JSON input with --automated --dry-run exits 0 and output has 'submitted' key.""" + content = _make_study_json(minimal_metagenomics_study) + with patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")): + result = self._invoke( + runner, ["--automated", "--dry-run"], "studies.json", content + ) + assert result.exit_code == 0, f"stdout: {result.output}" + data = _extract_json_from_output(result.output) + assert "submitted" in data + + # ---- E20: CSV input --------------------------------------------------- + + def test_csv_input_automated_dry_run_exits_0( + self, + runner: CliRunner, + minimal_metagenomics_study: dict[str, Any], + ) -> None: + """CSV input with --automated --dry-run exits 0 and output has 'submitted' key.""" + content = _make_study_csv(minimal_metagenomics_study) + with patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")): + result = self._invoke( + runner, ["--automated", "--dry-run"], "studies.csv", content + ) + assert result.exit_code == 0, f"stdout: {result.output}" + data = _extract_json_from_output(result.output) + assert "submitted" in 
data + + # ---- E21: TSV input --------------------------------------------------- + + def test_tsv_input_automated_dry_run_exits_0( + self, + runner: CliRunner, + minimal_metagenomics_study: dict[str, Any], + ) -> None: + """TSV input with --automated --dry-run exits 0 and output has 'submitted' key.""" + content = _make_study_tsv(minimal_metagenomics_study) + with patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")): + result = self._invoke( + runner, ["--automated", "--dry-run"], "studies.tsv", content + ) + assert result.exit_code == 0, f"stdout: {result.output}" + data = _extract_json_from_output(result.output) + assert "submitted" in data + + # ---- E22: Duplicate detection ----------------------------------------- + + def test_duplicate_detection_records_duplicate_and_skips_submission( + self, + runner: CliRunner, + minimal_metagenomics_study: dict[str, Any], + ) -> None: + """When account already has a matching study, duplicate is recorded; nothing submitted.""" + existing = { + "title": minimal_metagenomics_study["STUDY_TITLE"], + "alias": minimal_metagenomics_study["alias"], + "accession": "PRJEB55555", + "secondary_accession": "ERP055555", + "status": "PRIVATE", + } + content = _make_study_json(minimal_metagenomics_study) + with runner.isolated_filesystem(): + Path("studies.json").write_text(content) + with ( + patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")), + patch( + "submit_rawreads_study.fetch_account_studies", + return_value=[existing], + ), + ): + result = runner.invoke( + main, + ["--input", "studies.json"], + catch_exceptions=False, + ) + assert result.exit_code == 0, f"stdout: {result.output}" + data = _extract_json_from_output(result.output) + assert len(data["duplicates"]) == 1 + assert data["duplicates"][0]["existing_accession"] == "PRJEB55555" + assert data["submitted"] == [] + + # ---- E23: --force with duplicate triggers MODIFY ---------------------- + + def test_force_flag_with_duplicate_triggers_modify( + self, 
+ runner: CliRunner, + minimal_metagenomics_study: dict[str, Any], + ) -> None: + """--force with a detected duplicate triggers MODIFY and study appears in 'modified'.""" + existing = { + "title": minimal_metagenomics_study["STUDY_TITLE"], + "alias": minimal_metagenomics_study["alias"], + "accession": "PRJEB66666", + "secondary_accession": "ERP066666", + "status": "PRIVATE", + } + receipt_xml = ET.fromstring( + '' + '' + "" ) - assert result is None - - def test_csv_without_metadata_row(self, tmp_path): - """A CSV with no metadata row should still work.""" - csvfile = tmp_path / "no_meta.csv" - csvfile.write_text("STUDY_TITLE,IS_PRIMARY\nTest,YES\n") - studies = common.load_input_file( - str(csvfile), json_record_keys=_JSON_RECORD_KEYS, + content = _make_study_json(minimal_metagenomics_study) + with runner.isolated_filesystem(): + Path("studies.json").write_text(content) + with ( + patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")), + patch( + "submit_rawreads_study.fetch_account_studies", + return_value=[existing], + ), + patch(self._SUBMIT_TARGET, return_value=receipt_xml), + ): + result = runner.invoke( + main, + ["--input", "studies.json", "--force"], + catch_exceptions=False, + ) + assert result.exit_code == 0, f"stdout: {result.output}" + data = _extract_json_from_output(result.output) + assert len(data["modified"]) == 1 + assert data["modified"][0]["accession"] == "PRJEB66666" + + # ---- E24: Failed submission exits 1 ----------------------------------- + + def test_failed_submission_exits_1( + self, + runner: CliRunner, + minimal_metagenomics_study: dict[str, Any], + ) -> None: + """When common.submit_xml raises HTTPError, the CLI exits with code 1.""" + import requests + + content = _make_study_json(minimal_metagenomics_study) + http_error = requests.exceptions.HTTPError(response=MagicMock(status_code=500, text="err")) + with runner.isolated_filesystem(): + Path("studies.json").write_text(content) + with ( + patch(self._CRED_TARGET, 
return_value=("Webin-12345", "pass")), + patch(self._SUBMIT_TARGET, side_effect=http_error), + ): + result = runner.invoke( + main, + ["--input", "studies.json", "--automated"], + catch_exceptions=False, + ) + assert result.exit_code == 1 + + # ---- E25: MAG/genome study dry-run XML contains both PROJECT_ATTRIBUTEs --- + + def test_mag_genome_study_dry_run_xml_has_both_attributes( + self, + runner: CliRunner, + ) -> None: + """MAG/genome study with existing_study_type=Other produces both PROJECT_ATTRIBUTEs.""" + study = { + "alias": "mag-001", + "STUDY_TITLE": "MAG Genome Study", + "existing_study_type": "Other", + "new_study_type": "Genome Sequencing", + } + content = _make_study_json(study) + with runner.isolated_filesystem(): + Path("studies.json").write_text(content) + with patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")): + result = runner.invoke( + main, + ["--input", "studies.json", "--automated", "--dry-run"], + catch_exceptions=False, + ) + assert result.exit_code == 0, f"output: {result.output}" + data = _extract_json_from_output(result.output) + assert "submitted" in data + # Also verify the XML would contain both attributes by building it directly + root = build_submission_xml([study]) + tags = [el.text for el in root.findall(".//PROJECT_ATTRIBUTE/TAG") if el.text] + assert "existing_study_type" in tags + assert "new_study_type" in tags + + # ---- E26: --hold-until date present in XML ---------------------------- + + def test_hold_until_date_appears_in_submission_xml( + self, + runner: CliRunner, + minimal_metagenomics_study: dict[str, Any], + ) -> None: + """--hold-until date is present in the HOLD element of the generated XML.""" + study = dict(minimal_metagenomics_study) + root = build_submission_xml([study], hold_until="2027-12-31") + hold_el = root.find(".//HOLD") + assert hold_el is not None + assert hold_el.get("HoldUntilDate") == "2027-12-31" + + def test_hold_until_cli_flag_passes_validation( + self, + runner: CliRunner, + 
minimal_metagenomics_study: dict[str, Any], + ) -> None: + """CLI --hold-until with a valid future date exits 0 in dry-run mode.""" + content = _make_study_json(minimal_metagenomics_study) + with patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")): + result = self._invoke( + runner, + ["--automated", "--dry-run", "--hold-until", "2027-06-01"], + "studies.json", + content, + ) + assert result.exit_code == 0, f"output: {result.output}" + + # ---- E27: --output writes results to file ----------------------------- + + def test_output_flag_writes_results_to_file( + self, + runner: CliRunner, + minimal_metagenomics_study: dict[str, Any], + ) -> None: + """--output flag writes JSON results to a file rather than stdout.""" + content = _make_study_json(minimal_metagenomics_study) + with runner.isolated_filesystem(): + Path("studies.json").write_text(content) + with patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")): + result = runner.invoke( + main, + ["--input", "studies.json", "--automated", "--dry-run", + "--output", "results.json"], + catch_exceptions=False, + ) + assert result.exit_code == 0, f"stdout: {result.output}" + # With --output, the JSON results go to file, not stdout (stdout has only logging). 
+ results_path = Path("results.json") + assert results_path.exists(), "results.json was not created" + data = json.loads(results_path.read_text()) + assert "submitted" in data + + # ---- E28: --test flag routes to test base URL ------------------------- + + def test_test_flag_uses_test_base_url( + self, + runner: CliRunner, + minimal_metagenomics_study: dict[str, Any], + ) -> None: + """--test flag results in the test base URL being used for submission.""" + receipt_xml = ET.fromstring( + '' + '' + "" ) - assert len(studies) == 1 - assert studies[0]["STUDY_TITLE"] == "Test" - assert studies[0]["IS_PRIMARY"] == "YES" - - def test_tabular_empty_values_omitted(self, tmp_path): - """Empty cells in tabular files should be omitted.""" - csvfile = tmp_path / "sparse.csv" - csvfile.write_text( - "STUDY_TITLE,STUDY_ABSTRACT,IS_PRIMARY\nTest,,YES\n", + content = _make_study_json(minimal_metagenomics_study) + with runner.isolated_filesystem(): + Path("studies.json").write_text(content) + with ( + patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")), + patch(self._SUBMIT_TARGET, return_value=receipt_xml) as mock_submit, + ): + result = runner.invoke( + main, + ["--input", "studies.json", "--automated", "--test"], + catch_exceptions=False, + ) + assert result.exit_code == 0, f"stdout: {result.output}" + assert mock_submit.called + called_url = mock_submit.call_args[0][0] + assert "wwwdev" in called_url, f"Expected test URL; got {called_url}" + + def test_no_test_flag_uses_production_base_url( + self, + runner: CliRunner, + minimal_metagenomics_study: dict[str, Any], + ) -> None: + """Without --test flag, the production base URL is used.""" + receipt_xml = ET.fromstring( + '' + '' + "" ) - studies = common.load_input_file( - str(csvfile), json_record_keys=_JSON_RECORD_KEYS, + content = _make_study_json(minimal_metagenomics_study) + with runner.isolated_filesystem(): + Path("studies.json").write_text(content) + with ( + patch(self._CRED_TARGET, 
return_value=("Webin-12345", "pass")), + patch(self._SUBMIT_TARGET, return_value=receipt_xml) as mock_submit, + ): + result = runner.invoke( + main, + ["--input", "studies.json", "--automated"], + catch_exceptions=False, + ) + assert result.exit_code == 0, f"stdout: {result.output}" + assert mock_submit.called + called_url = mock_submit.call_args[0][0] + assert "wwwdev" not in called_url, f"Expected prod URL; got {called_url}" + + +# --------------------------------------------------------------------------- +# Parametrized study-type cases +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "study_type,new_type,expect_new_type", + [ + ("Metagenomics", None, False), + ("RNASeq", None, False), + ("Population Genomics", None, False), + ("Other", "Genome Sequencing", True), + ("Other", "Transcriptome Analysis", True), + ("Other", None, False), + ], +) +def test_project_attribute_new_study_type_conditional( + study_type: str, + new_type: str | None, + expect_new_type: bool, +) -> None: + """new_study_type attribute appears iff existing_study_type=='Other' and new_type is set. + + Args: + study_type: Value for existing_study_type. + new_type: Value for new_study_type (or None). + expect_new_type: Whether new_study_type should appear in the XML. 
+ """ + study: dict[str, Any] = { + "alias": "param-test", + "STUDY_TITLE": "Parametrized Study", + "existing_study_type": study_type, + } + if new_type is not None: + study["new_study_type"] = new_type + + root = build_submission_xml([study]) + tags = [el.text for el in root.findall(".//PROJECT_ATTRIBUTE/TAG") if el.text] + if expect_new_type: + assert "new_study_type" in tags, ( + f"Expected new_study_type in tags for {study_type!r} / {new_type!r}" + ) + else: + assert "new_study_type" not in tags, ( + f"Did not expect new_study_type in tags for {study_type!r} / {new_type!r}" ) - assert len(studies) == 1 - assert "STUDY_ABSTRACT" not in studies[0] - assert studies[0]["STUDY_TITLE"] == "Test" + + +@pytest.mark.parametrize( + "hold_until,expect_hold", + [ + ("2027-03-01", True), + ("2028-12-31", True), + (None, False), + ], +) +def test_hold_until_element_conditional(hold_until: str | None, expect_hold: bool) -> None: + """HOLD element appears iff hold_until is provided. + + Args: + hold_until: The hold-until date string, or None. + expect_hold: Whether the HOLD element should appear. + """ + study = {"alias": "hold-test", "STUDY_TITLE": "Hold Date Test"} + root = build_submission_xml([study], hold_until=hold_until) + hold_el = root.find(".//HOLD") + if expect_hold: + assert hold_el is not None + assert hold_el.get("HoldUntilDate") == hold_until + else: + assert hold_el is None + + +@pytest.mark.parametrize("action", ["ADD", "MODIFY"]) +def test_submission_action_element_present(action: str) -> None: + """The correct action element (ADD or MODIFY) appears in the SUBMISSION. + + Args: + action: The submission action string. 
+ """ + study = {"alias": "action-test", "STUDY_TITLE": "Action Test"} + root = build_submission_xml([study], action=action) + xml_str = ET.tostring(root, encoding="unicode") + assert f"<{action}" in xml_str or f"<{action}/>" in xml_str + opposite = "MODIFY" if action == "ADD" else "ADD" + assert f"<{opposite}" not in xml_str diff --git a/conf/modules.config b/conf/modules.config index 1eadfb0..b55d4f9 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -43,7 +43,11 @@ process { ] } - withName: 'GENERATE_ASSEMBLY_MANIFEST|ENA_WEBIN_CLI|REGISTERSTUDY' { + withName: 'GENERATE_ASSEMBLY_MANIFEST|ENA_WEBIN_CLI' { ext.args = { params.test_upload ? "--test" : "" } } + + withName: 'SUBMIT_RAWREADS_STUDY' { + ext.args = { [params.test_upload ? "--test" : "", "--automated"].findAll().join(" ") } + } } diff --git a/conf/test_assembly.config b/conf/test_assembly.config index d94b5bc..389e102 100644 --- a/conf/test_assembly.config +++ b/conf/test_assembly.config @@ -30,8 +30,6 @@ params { mode = "metagenomic_assemblies" submission_study = "PRJEB98843" - ena_raw_reads_study_accession = "PRJEB65995" - library = "metagenome" centre_name = "TEST_CENTER" } diff --git a/modules/local/ena_webin_cli/main.nf b/modules/local/ena_webin_cli/main.nf index 25b12f4..e5f878e 100644 --- a/modules/local/ena_webin_cli/main.nf +++ b/modules/local/ena_webin_cli/main.nf @@ -58,4 +58,16 @@ process ENA_WEBIN_CLI { false fi """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_webin-cli.report + export STATUS="success" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ena-webin-cli: 0.0.0 + END_VERSIONS + """ } diff --git a/modules/local/genome_upload/main.nf b/modules/local/genome_upload/main.nf index f8bf1a5..3c5d348 100644 --- a/modules/local/genome_upload/main.nf +++ b/modules/local/genome_upload/main.nf @@ -11,6 +11,7 @@ process GENOME_UPLOAD { path(mags) path(table_for_upload) val(mags_or_bins_flag) + val(submission_study) output: path 
"results/{MAG,bin}_upload/manifests*/*.manifest" , emit: manifests @@ -34,7 +35,7 @@ process GENOME_UPLOAD { export ENA_WEBIN_PASSWORD=\$WEBIN_PASSWORD genome_upload \\ - -u $params.submission_study \\ + -u $submission_study \\ --genome_info ${table_for_upload} \\ --centre_name $params.centre_name \\ --${mags_or_bins_flag} \\ diff --git a/nextflow.config b/nextflow.config index a6f7ae2..dba0973 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,10 +13,7 @@ params { input = null mode = null // {mags, bins, metagenomic_assemblies} - // TODO rewrite register_study script to remove this unnecessary parameters - ena_raw_reads_study_accession = null - library = null - + study_metadata = null submission_study = null centre_name = null upload_tpa = false diff --git a/nextflow_schema.json b/nextflow_schema.json index d31596a..dedf312 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -261,16 +261,13 @@ "description": "ENA study accession (PRJ/ERP) to submit the data to", "help_text": "Current implementation of pipeline requires to pre-register ENA project (PRJ/ERP) where you want to upload data to. Documentation how to register study: https://ena-docs.readthedocs.io/en/latest/submit/study.html" }, - "library": { + "study_metadata": { "type": "string", - "enum": ["metagenome", "metatranscriptome"], - "description": "Type of library for the submission. Required for creation of the new submission study.", - "help_text": "Uses script register_study from assembly_uploader package that requires this parameter to compose study title." - }, - "ena_raw_reads_study_accession": { - "type": "string", - "description": "ENA study accession (PRJ/ERP) of the raw reads study associated with the assembly submission. Required for creation of the new submission study.", - "help_text": "Uses script register_study from assembly_uploader package that requires this parameter to compose study title and description." 
+ "format": "file-path", + "exists": true, + "description": "Path to study metadata file (JSON, CSV, or TSV) for registering a new ENA study. Required when submission_study is not provided.", + "help_text": "File containing study metadata fields (STUDY_TITLE, STUDY_ABSTRACT, existing_study_type, alias, etc.). Used by SUBMIT_RAWREADS_STUDY to create a new study in ENA when no existing submission_study accession is given.", + "fa_icon": "fas fa-file-alt" }, "webincli_submit": { "type": "boolean", diff --git a/tests/default.nf.test b/tests/default.nf.test index 44f2465..919645d 100644 --- a/tests/default.nf.test +++ b/tests/default.nf.test @@ -4,30 +4,81 @@ nextflow_pipeline { script "../main.nf" tag "pipeline" - test("-profile test") { + test("metagenomic_assemblies mode — submission_study provided (no study registration)") { + // Exercises the assembly submission path using a pre-registered study (stub mode). + // SUBMIT_RAWREADS_STUDY is NOT called here; the module-level nf-test covers it. + // + // A samplesheet is generated on the fly with absolute paths so that nf-schema + // validation succeeds regardless of the nf-test launchDir. 
+ options "-stub" when { params { - outdir = "$outputDir" + def csv = new File("${outputDir}/samplesheet_assembly.csv") + csv.parentFile.mkdirs() + csv.text = [ + "sample,fasta,fastq_1,fastq_2,coverage,run_accession,assembler,assembler_version", + "sample1,${projectDir}/tests/data/contigs.fasta.gz,${projectDir}/tests/data/fastq_1.fastq,${projectDir}/tests/data/fastq_2.fastq,,ERR000001,SPAdes,3.15", + "sample2,${projectDir}/tests/data/invalid_assembly.fasta.gz,,,45,ERR000002,Velvet,1.2.10", + "sample3,${projectDir}/tests/data/contigs.fasta.gz,,,30,ERR000003,MEGAHIT,1.2.9" + ].join("\n") + + outdir = "$outputDir" + input = csv.absolutePath + mode = "metagenomic_assemblies" + submission_study = "PRJEB98843" + centre_name = "TEST_CENTER" } } then { - // stable_name: All files + folders in ${params.outdir}/ with a stable name def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) - // stable_path: All files in ${params.outdir}/ with stable content def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') assertAll( - { assert workflow.success}, + { assert workflow.success }, { assert snapshot( - // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions removeNextflowVersion("$outputDir/pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml"), - // All stable path name, with a relative path stable_name, - // All files with stable contents stable_path ).match() } ) } } + + test("metagenomic_assemblies mode — study_metadata provided (SUBMIT_RAWREADS_STUDY registers study)") { + // Tests the study-registration path in stub mode. SUBMIT_RAWREADS_STUDY stub + // outputs an empty accessions JSON, so this test validates the plumbing rather + // than the end-to-end submission output. 
+ options "-stub" + + when { + params { + def csv = new File("${outputDir}/samplesheet_assembly.csv") + csv.parentFile.mkdirs() + csv.text = [ + "sample,fasta,fastq_1,fastq_2,coverage,run_accession,assembler,assembler_version", + "sample1,${projectDir}/tests/data/contigs.fasta.gz,${projectDir}/tests/data/fastq_1.fastq,${projectDir}/tests/data/fastq_2.fastq,,ERR000001,SPAdes,3.15", + "sample2,${projectDir}/tests/data/invalid_assembly.fasta.gz,,,45,ERR000002,Velvet,1.2.10", + "sample3,${projectDir}/tests/data/contigs.fasta.gz,,,30,ERR000003,MEGAHIT,1.2.9" + ].join("\n") + + outdir = "$outputDir" + input = csv.absolutePath + mode = "metagenomic_assemblies" + study_metadata = "${projectDir}/assets/test-fixtures/example_study.json" + centre_name = "TEST_CENTER" + } + } + + then { + assertAll( + { assert workflow.success }, + { assert workflow.trace.succeeded().any { it.name.contains("SUBMIT_RAWREADS_STUDY") } } + ) + } + } + + // NOTE: The MAGs/bins test requires remote genome files from nf-core/test-datasets + // (https://github.com/nf-core/test-datasets/tree/seqsubmit) and cannot run offline. 
+ // Run it manually with: nf-test test tests/default.nf.test --filter "mags" --profile test_genome,docker } diff --git a/tests/default.nf.test.snap b/tests/default.nf.test.snap new file mode 100644 index 0000000..71a254e --- /dev/null +++ b/tests/default.nf.test.snap @@ -0,0 +1,58 @@ +{ + "metagenomic_assemblies mode \u2014 submission_study provided (no study registration)": { + "content": [ + { + "Workflow": { + "nf-core/seqsubmit": "v1.0.0dev" + } + }, + [ + "coverm", + "coverm/sample1.depth.txt", + "fastavalidator", + "fastavalidator/sample1.success.log", + "fastavalidator/sample2.success.log", + "fastavalidator/sample3.success.log", + "generate", + "generate/PRJEB98843_upload", + "generate/PRJEB98843_upload/test.manifest", + "metagenomic_assemblies", + "metagenomic_assemblies/multiqc", + "metagenomic_assemblies/multiqc/multiqc_data", + "metagenomic_assemblies/multiqc/multiqc_plots", + "metagenomic_assemblies/multiqc/multiqc_report.html", + "metagenomic_assemblies/sample1_assembly_metadata.csv", + "metagenomic_assemblies/sample2_assembly_metadata.csv", + "metagenomic_assemblies/sample3_assembly_metadata.csv", + "metagenomic_assemblies/upload", + "metagenomic_assemblies/upload/webin_cli", + "metagenomic_assemblies/upload/webin_cli/sample1_webin-cli.report", + "metagenomic_assemblies/upload/webin_cli/sample2_webin-cli.report", + "metagenomic_assemblies/upload/webin_cli/sample3_webin-cli.report", + "pipeline_info", + "pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml", + "samplesheet_assembly.csv" + ], + [ + "sample1.depth.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + "sample1.success.log:md5,b0b859eda1db5cd43915846e00ebc22c", + "sample2.success.log:md5,b0b859eda1db5cd43915846e00ebc22c", + "sample3.success.log:md5,b0b859eda1db5cd43915846e00ebc22c", + "test.manifest:md5,d41d8cd98f00b204e9800998ecf8427e", + "multiqc_report.html:md5,d41d8cd98f00b204e9800998ecf8427e", + "sample1_assembly_metadata.csv:md5,e1a00dc628e95c38e18dfd5161fa2ce4", + 
"sample2_assembly_metadata.csv:md5,901e55730b100224efb27f23aabf4f67", + "sample3_assembly_metadata.csv:md5,d5b1575095ece78d988395b874440bef", + "sample1_webin-cli.report:md5,d41d8cd98f00b204e9800998ecf8427e", + "sample2_webin-cli.report:md5,d41d8cd98f00b204e9800998ecf8427e", + "sample3_webin-cli.report:md5,d41d8cd98f00b204e9800998ecf8427e", + "samplesheet_assembly.csv:md5,2f74b281cb7096ad80a378b8960aabee" + ] + ], + "timestamp": "2026-03-12T13:22:15.261886", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/tests/nextflow.config b/tests/nextflow.config index 695d52b..be915f5 100644 --- a/tests/nextflow.config +++ b/tests/nextflow.config @@ -11,4 +11,19 @@ params { pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/seqsubmit' } +process { + resourceLimits = [ + cpus: 2, + memory: '8.GB', + time: '1.h' + ] +} + +// Override secrets-based env vars so nf-test runs don't require a populated keystore. +// Stub-mode tests never use the actual credentials. 
+env { + ENA_WEBIN = "test_webin_account" + ENA_WEBIN_PASSWORD = "test_webin_password" +} + aws.client.anonymous = true // fixes S3 access issues on self-hosted runners diff --git a/workflows/assemblysubmit.nf b/workflows/assemblysubmit.nf index 918e1d7..b383a6c 100644 --- a/workflows/assemblysubmit.nf +++ b/workflows/assemblysubmit.nf @@ -7,7 +7,7 @@ include { COVERM_CONTIG } from '../modules/nf-core/coverm/contig/main' include { FASTAVALIDATOR } from '../modules/nf-core/fastavalidator/main' include { GENERATE_ASSEMBLY_MANIFEST } from '../modules/local/generate_assembly_manifest/main' -include { REGISTERSTUDY } from '../modules/local/registerstudy/main' +include { SUBMIT_RAWREADS_STUDY } from '../modules/local/submit_rawreads_study/main' include { ENA_WEBIN_CLI } from '../modules/local/ena_webin_cli' include { MULTIQC } from '../modules/nf-core/multiqc/main' @@ -99,6 +99,9 @@ workflow ASSEMBLYSUBMIT { .map { meta, coverage_file -> // Read the file and calculate average def lines = coverage_file.readLines() + if (lines.size() < 2) { + return [meta, 0.0] + } def coverages = lines[1..-1].collect { line -> line.split('\t')[1] as Double } @@ -139,6 +142,7 @@ workflow ASSEMBLYSUBMIT { def content = "${header}\n${row}" def csv_file = file("${params.outdir}/${params.mode}/${meta.id}_assembly_metadata.csv") + csv_file.parent.toFile().mkdirs() csv_file.text = content [meta, csv_file] @@ -149,11 +153,17 @@ workflow ASSEMBLYSUBMIT { // Use provided study accession directly study_accession_ch = channel.of(params.submission_study) } else { - // Register a new study - REGISTERSTUDY( - [[id:"study"], params.ena_raw_reads_study_accession, params.centre_name, params.library ] + // Register a new study using the study metadata file + SUBMIT_RAWREADS_STUDY( + channel.of([[id: "study"], file(params.study_metadata)]) ) - study_accession_ch = REGISTERSTUDY.out.study_accession.map { _meta, accession -> accession } + ch_versions = ch_versions.mix(SUBMIT_RAWREADS_STUDY.out.versions) + 
study_accession_ch = SUBMIT_RAWREADS_STUDY.out.accessions + .map { _meta, json -> + def data = new groovy.json.JsonSlurper().parse(json) + data.submitted[0]?.accession + ?: data.duplicates[0]?.existing_accession + } } // Generate assembly manifest files and submit them to ENA diff --git a/workflows/genomesubmit.nf b/workflows/genomesubmit.nf index 063d56c..b34a704 100644 --- a/workflows/genomesubmit.nf +++ b/workflows/genomesubmit.nf @@ -5,6 +5,7 @@ */ include { GENOME_UPLOAD } from '../modules/local/genome_upload' include { ENA_WEBIN_CLI } from '../modules/local/ena_webin_cli' +include { SUBMIT_RAWREADS_STUDY } from '../modules/local/submit_rawreads_study/main' include { RNA_DETECTION } from '../subworkflows/local/rna_detection' @@ -109,10 +110,27 @@ workflow GENOMESUBMIT { newLine: true ) + def study_accession_ch + if (params.submission_study) { + study_accession_ch = channel.of(params.submission_study) + } else { + SUBMIT_RAWREADS_STUDY( + channel.of([[id: "study"], file(params.study_metadata)]) + ) + ch_versions = ch_versions.mix(SUBMIT_RAWREADS_STUDY.out.versions) + study_accession_ch = SUBMIT_RAWREADS_STUDY.out.accessions + .map { _meta, json -> + def data = new groovy.json.JsonSlurper().parse(json) + data.submitted[0]?.accession + ?: data.duplicates[0]?.existing_accession + } + } + GENOME_UPLOAD( genome_fasta.map{meta, fasta -> fasta}.collect(), genome_metadata_csv, - params.mode + params.mode, + study_accession_ch.first() ) ch_versions = ch_versions.mix( GENOME_UPLOAD.out.versions ) From 6f265d8985dc1580bbf59fc081c732f1dde79191 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Thu, 12 Mar 2026 13:56:05 +0000 Subject: [PATCH 08/36] Replaced REGSTERSTUDY module with new one based on submit_study.py script. 
Also renamed submit_rawreads_study to submit_study --- ...bmit_rawreads_study.py => submit_study.py} | 11 +-- ...rawreads_study.py => test_submit_study.py} | 26 +++--- conf/modules.config | 2 +- .../tests/main.nf.test.snap | 32 +++---- modules/local/registerstudy/environment.yml | 8 +- modules/local/registerstudy/main.nf | 41 ++++----- modules/local/registerstudy/meta.yml | 90 +++++++++---------- modules/local/registerstudy/nextflow.config | 9 -- .../local/registerstudy/tests/main.nf.test | 34 +++---- .../registerstudy/tests/main.nf.test.snap | 59 +++--------- .../tests/nextflow.config | 4 +- .../submit_rawreads_study/environment.yml | 10 --- modules/local/submit_rawreads_study/main.nf | 47 ---------- modules/local/submit_rawreads_study/meta.yml | 68 -------------- .../submit_rawreads_study/tests/main.nf.test | 56 ------------ .../tests/main.nf.test.snap | 35 -------- nextflow_schema.json | 2 +- tests/default.nf.test | 8 +- workflows/assemblysubmit.nf | 8 +- workflows/genomesubmit.nf | 8 +- 20 files changed, 140 insertions(+), 418 deletions(-) rename bin/{submit_rawreads_study.py => submit_study.py} (98%) rename bin/{test_submit_rawreads_study.py => test_submit_study.py} (98%) delete mode 100644 modules/local/registerstudy/nextflow.config rename modules/local/{submit_rawreads_study => registerstudy}/tests/nextflow.config (86%) delete mode 100644 modules/local/submit_rawreads_study/environment.yml delete mode 100644 modules/local/submit_rawreads_study/main.nf delete mode 100644 modules/local/submit_rawreads_study/meta.yml delete mode 100644 modules/local/submit_rawreads_study/tests/main.nf.test delete mode 100644 modules/local/submit_rawreads_study/tests/main.nf.test.snap diff --git a/bin/submit_rawreads_study.py b/bin/submit_study.py similarity index 98% rename from bin/submit_rawreads_study.py rename to bin/submit_study.py index c00ee6d..ae72d69 100755 --- a/bin/submit_rawreads_study.py +++ b/bin/submit_study.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 -"""Submit 
raw-reads studies to ENA via the Webin REST API v2. +"""Submit raw-reads, assembly and genome studies to ENA +via the Webin REST API v2. Read a DataHarmonizer export containing study metadata, check for duplicate studies already registered under the @@ -14,17 +15,17 @@ Usage:: - python bin/submit_rawreads_study.py \ + python bin/submit_study.py \ --input studies.json \ --test # With hold date (max 2 years): - python bin/submit_rawreads_study.py \ + python bin/submit_study.py \ --input studies.json \ --hold-until 2028-01-01 # Log to file: - python bin/submit_rawreads_study.py \ + python bin/submit_study.py \ --input studies.json \ --test --log submission.log """ @@ -431,7 +432,7 @@ def _do_submission( @click.command( - help="Submit raw-reads studies to ENA via the Webin REST API v2.", + help="Submit raw-reads, assembly and genome studies to ENA via the Webin REST API v2.", ) @click.option( "--input", "input_file", diff --git a/bin/test_submit_rawreads_study.py b/bin/test_submit_study.py similarity index 98% rename from bin/test_submit_rawreads_study.py rename to bin/test_submit_study.py index f07f85e..d021383 100644 --- a/bin/test_submit_rawreads_study.py +++ b/bin/test_submit_study.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Tests for submit_rawreads_study.py — raw-reads study submission pipeline. +"""Tests for submit_study.py — ENA study submission pipeline. Covers: A. Unit tests for build_submission_xml and _add_project_element @@ -9,11 +9,11 @@ E. CLI integration tests for main() using click.testing.CliRunner Usage: - pytest bin/test_submit_rawreads_study.py -v + pytest bin/test_submit_study.py -v All external I/O (HTTP requests, ENA reports API) is mocked. Tests do NOT import from ena_submit_common directly — all assertions go through the public -API of submit_rawreads_study. +API of submit_study. """ from __future__ import annotations @@ -34,7 +34,7 @@ # Ensure the scripts directory is on the path before importing the module. 
sys.path.insert(0, os.path.dirname(__file__)) -from submit_rawreads_study import ( # noqa: E402 +from bin.submit_study import ( # noqa: E402 _normalize_study_report, build_submission_xml, fetch_account_studies, @@ -816,7 +816,7 @@ def test_fetch_calls_fetch_account_records_with_correct_urls( self, auth: HTTPBasicAuth ) -> None: """fetch_account_studies calls common.fetch_account_records with prod/test URLs.""" - target = "submit_rawreads_study.common.fetch_account_records" + target = "submit_study.common.fetch_account_records" with patch(target, return_value=[]) as mock_fetch: fetch_account_studies(auth, use_test=False) mock_fetch.assert_called_once() @@ -826,7 +826,7 @@ def test_fetch_calls_fetch_account_records_with_correct_urls( def test_fetch_passes_normalizer_callable(self, auth: HTTPBasicAuth) -> None: """fetch_account_studies passes a callable normalizer to fetch_account_records.""" - target = "submit_rawreads_study.common.fetch_account_records" + target = "submit_study.common.fetch_account_records" with patch(target, return_value=[]) as mock_fetch: fetch_account_studies(auth, use_test=False) call_kwargs = mock_fetch.call_args @@ -835,7 +835,7 @@ def test_fetch_passes_normalizer_callable(self, auth: HTTPBasicAuth) -> None: def test_fetch_normalizer_handles_title_variant(self, auth: HTTPBasicAuth) -> None: """The normalizer passed to fetch_account_records handles title/studyTitle variants.""" - target = "submit_rawreads_study.common.fetch_account_records" + target = "submit_study.common.fetch_account_records" captured_normalizer = None def capture_normalizer(*args: Any, **kwargs: Any) -> list[dict[str, str]]: @@ -857,7 +857,7 @@ def capture_normalizer(*args: Any, **kwargs: Any) -> list[dict[str, str]]: def test_fetch_normalizer_handles_alias_variant(self, auth: HTTPBasicAuth) -> None: """The normalizer handles alias/studyAlias field variants.""" - target = "submit_rawreads_study.common.fetch_account_records" + target = 
"submit_study.common.fetch_account_records" captured_normalizer = None def capture_normalizer(*args: Any, **kwargs: Any) -> list[dict[str, str]]: @@ -879,7 +879,7 @@ def capture_normalizer(*args: Any, **kwargs: Any) -> list[dict[str, str]]: def test_fetch_normalizer_handles_accession_variant(self, auth: HTTPBasicAuth) -> None: """The normalizer handles accession/studyAccession field variants.""" - target = "submit_rawreads_study.common.fetch_account_records" + target = "submit_study.common.fetch_account_records" captured_normalizer = None def capture_normalizer(*args: Any, **kwargs: Any) -> list[dict[str, str]]: @@ -1004,8 +1004,8 @@ def minimal_metagenomics_study() -> dict[str, Any]: class TestMainCli: """CLI integration tests for main() using CliRunner.""" - _CRED_TARGET = "submit_rawreads_study.common.get_credentials" - _SUBMIT_TARGET = "submit_rawreads_study.common.submit_xml" + _CRED_TARGET = "submit_study.common.get_credentials" + _SUBMIT_TARGET = "submit_study.common.submit_xml" def _invoke( self, @@ -1106,7 +1106,7 @@ def test_duplicate_detection_records_duplicate_and_skips_submission( with ( patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")), patch( - "submit_rawreads_study.fetch_account_studies", + "submit_study.fetch_account_studies", return_value=[existing], ), ): @@ -1147,7 +1147,7 @@ def test_force_flag_with_duplicate_triggers_modify( with ( patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")), patch( - "submit_rawreads_study.fetch_account_studies", + "submit_study.fetch_account_studies", return_value=[existing], ), patch(self._SUBMIT_TARGET, return_value=receipt_xml), diff --git a/conf/modules.config b/conf/modules.config index b55d4f9..eaef036 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -47,7 +47,7 @@ process { ext.args = { params.test_upload ? "--test" : "" } } - withName: 'SUBMIT_RAWREADS_STUDY' { + withName: 'REGISTERSTUDY' { ext.args = { [params.test_upload ? 
"--test" : "", "--automated"].findAll().join(" ") } } } diff --git a/modules/local/generate_assembly_manifest/tests/main.nf.test.snap b/modules/local/generate_assembly_manifest/tests/main.nf.test.snap index 7fef896..f594383 100644 --- a/modules/local/generate_assembly_manifest/tests/main.nf.test.snap +++ b/modules/local/generate_assembly_manifest/tests/main.nf.test.snap @@ -11,7 +11,7 @@ ] ], "1": [ - "versions.yml:md5,32c079810bf4914d6d49aa9ad121889e" + "versions.yml:md5,0664035de44b4d88c1a70a357c1a24f2" ], "manifest": [ [ @@ -22,20 +22,20 @@ ] ], "versions": [ - "versions.yml:md5,32c079810bf4914d6d49aa9ad121889e" + "versions.yml:md5,0664035de44b4d88c1a70a357c1a24f2" ] }, { "GENERATE_ASSEMBLY_MANIFEST": { - "assembly_uploader": "assembly_uploader 1.3.3" + "assembly_uploader": "assembly_uploader 1.3.4" } } ], + "timestamp": "2026-03-12T13:52:01.267817", "meta": { - "nf-test": "0.9.0", - "nextflow": "25.04.1" - }, - "timestamp": "2025-10-30T15:10:02.229709" + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } }, "GENERATE_ASSEMBLY_MANIFEST completes with expected outputs": { "content": [ @@ -45,34 +45,34 @@ { "id": "test" }, - "233126d4c4d023f18c7836ed36395e3c.manifest:md5,3152b34ddec05a2c9937a2e03416e5e1" + "233126d4c4d0.manifest:md5,8387c0e6c123313259db613612c09dce" ] ], "1": [ - "versions.yml:md5,32c079810bf4914d6d49aa9ad121889e" + "versions.yml:md5,0664035de44b4d88c1a70a357c1a24f2" ], "manifest": [ [ { "id": "test" }, - "233126d4c4d023f18c7836ed36395e3c.manifest:md5,3152b34ddec05a2c9937a2e03416e5e1" + "233126d4c4d0.manifest:md5,8387c0e6c123313259db613612c09dce" ] ], "versions": [ - "versions.yml:md5,32c079810bf4914d6d49aa9ad121889e" + "versions.yml:md5,0664035de44b4d88c1a70a357c1a24f2" ] }, { "GENERATE_ASSEMBLY_MANIFEST": { - "assembly_uploader": "assembly_uploader 1.3.3" + "assembly_uploader": "assembly_uploader 1.3.4" } } ], + "timestamp": "2026-03-12T13:51:56.121365", "meta": { - "nf-test": "0.9.0", - "nextflow": "25.04.1" - }, - "timestamp": 
"2025-10-30T15:09:57.708757" + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } } } \ No newline at end of file diff --git a/modules/local/registerstudy/environment.yml b/modules/local/registerstudy/environment.yml index 80dd37e..6ee92a8 100644 --- a/modules/local/registerstudy/environment.yml +++ b/modules/local/registerstudy/environment.yml @@ -4,7 +4,7 @@ channels: - conda-forge - bioconda dependencies: - # TODO nf-core: List required Conda package(s). - # Software MUST be pinned to channel (i.e. "bioconda"), version (i.e. "1.10"). - # For Conda, the build (i.e. "h9402c20_2") must be EXCLUDED to support installation on different operating systems. - - "bioconda::assembly_uploader=1.3.2" + - conda-forge::python>=3.12 + - conda-forge::pip + - pip: + - mgnify-pipelines-toolkit==1.4.17 diff --git a/modules/local/registerstudy/main.nf b/modules/local/registerstudy/main.nf index 0621043..67766e0 100644 --- a/modules/local/registerstudy/main.nf +++ b/modules/local/registerstudy/main.nf @@ -3,54 +3,45 @@ process REGISTERSTUDY { label 'process_single' conda "${moduleDir}/environment.yml" - container "community.wave.seqera.io/library/pip_assembly-uploader:2a65298c0161c561" + container "quay.io/microbiome-informatics/mgnify-pipelines-toolkit:1.4.17" - input: - tuple val(meta), val(study), val(center), val(library) + // ENA_WEBIN and ENA_WEBIN_PASSWORD must be set in the process environment. 
+ // In the pipeline, map Nextflow secrets via conf/modules.config or nextflow.config: + // env { ENA_WEBIN = secrets.WEBIN_ACCOUNT; ENA_WEBIN_PASSWORD = secrets.WEBIN_PASSWORD } + input: + tuple val(meta), path(study_metadata) output: - tuple val(meta), env("STUDY_ID"), emit: study_accession - path "versions.yml" , emit: versions + tuple val(meta), path("*_accessions.json"), emit: accessions + path "versions.yml", emit: versions when: task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' - def args2 = task.ext.args2 ?: '' + def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" """ - echo "Generate study XMLs" - study_xmls \\ - $args \\ - --study ${study} \\ - --library ${library} \\ - --center ${center} \\ - - echo "Submit study to ENA" - submit_study \\ - $args2 \\ - --directory ${study}_upload \\ - --study ${study} 2>&1 | tee report.log - - STUDY_ID=\$(grep 'A new study accession has been created' report.log | grep -oE '(PRJ|ERP)[[:alnum:]_]+[[:digit:]]+') + submit_study.py \\ + --input ${study_metadata} \\ + --output ${prefix}_accessions.json \\ + ${args} cat <<-END_VERSIONS > versions.yml "${task.process}": - assembly_uploader: \$(study_xmls --version) + mgnify-pipelines-toolkit: \$(python -c "import importlib.metadata; print(importlib.metadata.version('mgnify-pipelines-toolkit'))") END_VERSIONS """ stub: - def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" """ - touch ${prefix}.report + echo '{"submitted":[],"duplicates":[],"modified":[],"failed":[]}' > ${prefix}_accessions.json cat <<-END_VERSIONS > versions.yml "${task.process}": - assembly_uploader: \$(study_xmls --version) + mgnify-pipelines-toolkit: \$(python -c "import importlib.metadata; print(importlib.metadata.version('mgnify-pipelines-toolkit'))") END_VERSIONS """ } diff --git a/modules/local/registerstudy/meta.yml b/modules/local/registerstudy/meta.yml index c459a19..549f187 100644 --- a/modules/local/registerstudy/meta.yml 
+++ b/modules/local/registerstudy/meta.yml @@ -1,18 +1,28 @@ # yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json name: "registerstudy" -description: This module registers a study and project and generates accessions that will be used for metagenomic assembly uploads in ENA. The study generated will reference reads from an already public project. +description: | + Submit a new study to ENA via the Webin REST API v2. + Reads study metadata from a JSON, CSV, or TSV file, checks for + duplicate studies already registered under the Webin account, + builds a PROJECT XML submission document, and submits to ENA. + Credentials are read from the WEBIN_ACCOUNT and WEBIN_PASSWORD + Nextflow secrets, which are mapped to ENA_WEBIN and ENA_WEBIN_PASSWORD + inside the process. keywords: - - assembly - - register + - ena + - submission - study + - project + - webin tools: - - "registerstudy": - description: "Nextflow module to register study/project to upload primary metagenome and metatranscriptome - assemblies to ENA on a per-study basis. The scripts generate xmls to register a new study and create manifests - necessary for submission of assemblies using webin-cli." - homepage: "https://github.com/EBI-Metagenomics/assembly_uploader" - documentation: "https://github.com/EBI-Metagenomics/assembly_uploader" - tool_dev_url: "None" + - mgnify-pipelines-toolkit: + description: | + A toolkit of utilities used in MGnify metagenomics pipelines, + including click, requests, and other dependencies required by + the ENA submission scripts. 
+ homepage: https://github.com/EBI-Metagenomics/mgnify-pipelines-toolkit + documentation: https://github.com/EBI-Metagenomics/mgnify-pipelines-toolkit + tool_dev_url: https://github.com/EBI-Metagenomics/mgnify-pipelines-toolkit doi: "" licence: ["Apache-2.0"] identifier: null @@ -21,50 +31,38 @@ input: - - meta: type: map description: | - Groovy Map containing sample information + Groovy Map containing sample information. e.g. `[ id:'sample1' ]` - - study: - type: value - description: | - Study accession with raw reads public in ENA. - Example: "PRJNA312520" - - - center: - type: value + - study_metadata: + type: file description: | - Name of the sequencing or submitting center. - Example: "Wellcome Sanger Institute" - - - library: - type: value - description: | - Library information associated with the study. - Example: "metagenome" - enum: - - metagenome - - metatranscriptome + Study metadata file in JSON, CSV, or TSV format. + JSON may follow the DataHarmonizer Container export format or be + a plain list/dict of study records. + Required fields per record: STUDY_TITLE, existing_study_type. + pattern: "*.{json,csv,tsv}" output: - study_accession: - - - meta: + - accessions: + - meta: type: map description: | - Groovy Map containing sample information + Groovy Map containing sample information. e.g. `[ id:'sample1' ]` - - study: - type: value + - "*_accessions.json": + type: file description: | - Study accession registered in ENA. - Example: "PRJEB312520" - versions: - - "versions.yml": - type: file - description: File containing software versions - pattern: "versions.yml" - ontologies: - - edam: "http://edamontology.org/format_3750" # YAML + JSON file containing the submission results with keys: + submitted (newly created accessions), duplicates (skipped), + modified (force-updated), and failed. 
+ pattern: "*_accessions.json" + - versions: + - "versions.yml": + type: file + description: File containing software versions + pattern: "versions.yml" authors: - - "@alisha246" + - "@timrozday" maintainers: - - "@alisha246" + - "@timrozday" diff --git a/modules/local/registerstudy/nextflow.config b/modules/local/registerstudy/nextflow.config deleted file mode 100644 index 3f71a8e..0000000 --- a/modules/local/registerstudy/nextflow.config +++ /dev/null @@ -1,9 +0,0 @@ -process { - withName: REGISTERSTUDY { - ext.args2 = '--test' - } -} -env { - ENA_WEBIN = secrets.WEBIN_ACCOUNT - ENA_WEBIN_PASSWORD = secrets.WEBIN_PASSWORD -} diff --git a/modules/local/registerstudy/tests/main.nf.test b/modules/local/registerstudy/tests/main.nf.test index d11a6d1..42f6902 100644 --- a/modules/local/registerstudy/tests/main.nf.test +++ b/modules/local/registerstudy/tests/main.nf.test @@ -1,25 +1,21 @@ -// TODO nf-core: Once you have added the required tests, please run the following command to build this file: -// nf-core modules test registerstudy nextflow_process { name "Test Process REGISTERSTUDY" script "../main.nf" - config "../nextflow.config" + config "./nextflow.config" process "REGISTERSTUDY" - tag "modules" tag "registerstudy" - test("registerstudy - should register a study on ENA test server") { + test("registerstudy - stub") { + options "-stub" when { process { """ input[0] = [ - [ id:'test', single_end:false ], // meta map - "PRJNA318468", - "EMG", - "metagenome" + [ id:'example_study' ], + file("${projectDir}/assets/test-fixtures/example_study.json", checkIfExists: true) ] """ } @@ -28,23 +24,21 @@ nextflow_process { then { assertAll( { assert process.success }, - //TODO improve assertions + { assert snapshot(process.out).match() } ) } } - test("registerstudy - stub") { - - options "-stub" + test("registerstudy - dry run against ENA test server") { + // Validates and builds the submission XML but does not submit to ENA. 
+ // Dummy credentials in tests/nextflow.config are sufficient for dry-run mode. when { process { """ input[0] = [ - [ id:'test', single_end:false ], // meta map - "PRJNA318468", - "EMG", - "metagenome" + [ id:'example_study' ], + file("${projectDir}/assets/test-fixtures/example_study.json", checkIfExists: true) ] """ } @@ -53,10 +47,10 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert snapshot(process.out).match() } - //TODO improve assertions + { assert path(process.out.accessions[0][1]).exists() }, + { assert path(process.out.accessions[0][1]).json.submitted instanceof List }, + { assert path(process.out.accessions[0][1]).json.failed.size() == 0 } ) } - } } diff --git a/modules/local/registerstudy/tests/main.nf.test.snap b/modules/local/registerstudy/tests/main.nf.test.snap index 1dd3a79..4b184e9 100644 --- a/modules/local/registerstudy/tests/main.nf.test.snap +++ b/modules/local/registerstudy/tests/main.nf.test.snap @@ -1,72 +1,35 @@ { - "registerstudy - report - stub": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - "" - ] - ], - "1": [ - "versions.yml:md5,ea872d341a2054fde3b2c8f06bbf8177" - ], - "study_accession": [ - [ - { - "id": "test", - "single_end": false - }, - "" - ] - ], - "versions": [ - "versions.yml:md5,ea872d341a2054fde3b2c8f06bbf8177" - ] - } - ], - "meta": { - "nf-test": "0.9.3", - "nextflow": "25.10.0" - }, - "timestamp": "2025-10-28T16:35:02.331026" - }, "registerstudy - stub": { "content": [ { "0": [ [ { - "id": "test", - "single_end": false + "id": "example_study" }, - "" + "example_study_accessions.json:md5,e43f257b8a1ffb551bf993867c5f1d7f" ] ], "1": [ - "versions.yml:md5,1d079512d28737f6b925e85563aa2c53" + "versions.yml:md5,ddcc758a7d28faecd4286941889ab7e1" ], - "study_accession": [ + "accessions": [ [ { - "id": "test", - "single_end": false + "id": "example_study" }, - "" + "example_study_accessions.json:md5,e43f257b8a1ffb551bf993867c5f1d7f" ] ], "versions": [ - 
"versions.yml:md5,1d079512d28737f6b925e85563aa2c53" + "versions.yml:md5,ddcc758a7d28faecd4286941889ab7e1" ] } ], + "timestamp": "2026-03-12T13:52:06.989729", "meta": { - "nf-test": "0.9.0", - "nextflow": "25.04.1" - }, - "timestamp": "2025-10-30T14:58:53.721718" + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } } } \ No newline at end of file diff --git a/modules/local/submit_rawreads_study/tests/nextflow.config b/modules/local/registerstudy/tests/nextflow.config similarity index 86% rename from modules/local/submit_rawreads_study/tests/nextflow.config rename to modules/local/registerstudy/tests/nextflow.config index c4633fa..4a84743 100644 --- a/modules/local/submit_rawreads_study/tests/nextflow.config +++ b/modules/local/registerstudy/tests/nextflow.config @@ -1,4 +1,4 @@ -// Test configuration for SUBMIT_RAWREADS_STUDY module. +// Test configuration for REGISTERSTUDY module. // --test : use the ENA dev server (submissions are discarded daily) // --automated : skip the Webin Reports duplicate-checking API call // --dry-run : validate and build XML but do not submit to ENA @@ -7,7 +7,7 @@ // no HTTP calls are made. 
For real submission tests, replace with secrets: // env { ENA_WEBIN = secrets.WEBIN_ACCOUNT; ENA_WEBIN_PASSWORD = secrets.WEBIN_PASSWORD } process { - withName: SUBMIT_RAWREADS_STUDY { + withName: REGISTERSTUDY { ext.args = '--test --automated --dry-run' } } diff --git a/modules/local/submit_rawreads_study/environment.yml b/modules/local/submit_rawreads_study/environment.yml deleted file mode 100644 index 6ee92a8..0000000 --- a/modules/local/submit_rawreads_study/environment.yml +++ /dev/null @@ -1,10 +0,0 @@ ---- -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json -channels: - - conda-forge - - bioconda -dependencies: - - conda-forge::python>=3.12 - - conda-forge::pip - - pip: - - mgnify-pipelines-toolkit==1.4.17 diff --git a/modules/local/submit_rawreads_study/main.nf b/modules/local/submit_rawreads_study/main.nf deleted file mode 100644 index 51bc062..0000000 --- a/modules/local/submit_rawreads_study/main.nf +++ /dev/null @@ -1,47 +0,0 @@ -process SUBMIT_RAWREADS_STUDY { - tag "$meta.id" - label 'process_single' - - conda "${moduleDir}/environment.yml" - container "quay.io/microbiome-informatics/mgnify-pipelines-toolkit:1.4.17" - - // ENA_WEBIN and ENA_WEBIN_PASSWORD must be set in the process environment. 
- // In the pipeline, map Nextflow secrets via conf/modules.config or nextflow.config: - // env { ENA_WEBIN = secrets.WEBIN_ACCOUNT; ENA_WEBIN_PASSWORD = secrets.WEBIN_PASSWORD } - - input: - tuple val(meta), path(study_metadata) - - output: - tuple val(meta), path("*_accessions.json"), emit: accessions - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - submit_rawreads_study.py \\ - --input ${study_metadata} \\ - --output ${prefix}_accessions.json \\ - ${args} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - mgnify-pipelines-toolkit: \$(python -c "import importlib.metadata; print(importlib.metadata.version('mgnify-pipelines-toolkit'))") - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - """ - echo '{"submitted":[],"duplicates":[],"modified":[],"failed":[]}' > ${prefix}_accessions.json - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - mgnify-pipelines-toolkit: \$(python -c "import importlib.metadata; print(importlib.metadata.version('mgnify-pipelines-toolkit'))") - END_VERSIONS - """ -} diff --git a/modules/local/submit_rawreads_study/meta.yml b/modules/local/submit_rawreads_study/meta.yml deleted file mode 100644 index 629512f..0000000 --- a/modules/local/submit_rawreads_study/meta.yml +++ /dev/null @@ -1,68 +0,0 @@ -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json -name: "submit_rawreads_study" -description: | - Submit a new study to ENA via the Webin REST API v2. - Reads study metadata from a JSON, CSV, or TSV file, checks for - duplicate studies already registered under the Webin account, - builds a PROJECT XML submission document, and submits to ENA. - Credentials are read from the WEBIN_ACCOUNT and WEBIN_PASSWORD - Nextflow secrets, which are mapped to ENA_WEBIN and ENA_WEBIN_PASSWORD - inside the process. 
-keywords: - - ena - - submission - - study - - project - - webin -tools: - - mgnify-pipelines-toolkit: - description: | - A toolkit of utilities used in MGnify metagenomics pipelines, - including click, requests, and other dependencies required by - the ENA submission scripts. - homepage: https://github.com/EBI-Metagenomics/mgnify-pipelines-toolkit - documentation: https://github.com/EBI-Metagenomics/mgnify-pipelines-toolkit - tool_dev_url: https://github.com/EBI-Metagenomics/mgnify-pipelines-toolkit - doi: "" - licence: ["Apache-2.0"] - identifier: null - -input: - - - meta: - type: map - description: | - Groovy Map containing sample information. - e.g. `[ id:'sample1' ]` - - study_metadata: - type: file - description: | - Study metadata file in JSON, CSV, or TSV format. - JSON may follow the DataHarmonizer Container export format or be - a plain list/dict of study records. - Required fields per record: STUDY_TITLE, existing_study_type. - pattern: "*.{json,csv,tsv}" - -output: - - accessions: - - meta: - type: map - description: | - Groovy Map containing sample information. - e.g. `[ id:'sample1' ]` - - "*_accessions.json": - type: file - description: | - JSON file containing the submission results with keys: - submitted (newly created accessions), duplicates (skipped), - modified (force-updated), and failed. 
- pattern: "*_accessions.json" - - versions: - - "versions.yml": - type: file - description: File containing software versions - pattern: "versions.yml" - -authors: - - "@timrozday" -maintainers: - - "@timrozday" diff --git a/modules/local/submit_rawreads_study/tests/main.nf.test b/modules/local/submit_rawreads_study/tests/main.nf.test deleted file mode 100644 index a0cb4fd..0000000 --- a/modules/local/submit_rawreads_study/tests/main.nf.test +++ /dev/null @@ -1,56 +0,0 @@ -nextflow_process { - name "Test Process SUBMIT_RAWREADS_STUDY" - script "../main.nf" - config "./nextflow.config" - process "SUBMIT_RAWREADS_STUDY" - - tag "modules" - tag "submit_rawreads_study" - - test("submit_rawreads_study - stub") { - options "-stub" - - when { - process { - """ - input[0] = [ - [ id:'example_study' ], - file("${projectDir}/assets/test-fixtures/example_study.json", checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - } - - test("submit_rawreads_study - dry run against ENA test server") { - // Validates and builds the submission XML but does not submit to ENA. - // Dummy credentials in tests/nextflow.config are sufficient for dry-run mode. 
- - when { - process { - """ - input[0] = [ - [ id:'example_study' ], - file("${projectDir}/assets/test-fixtures/example_study.json", checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert path(process.out.accessions[0][1]).exists() }, - { assert path(process.out.accessions[0][1]).json.submitted instanceof List }, - { assert path(process.out.accessions[0][1]).json.failed.size() == 0 } - ) - } - } -} diff --git a/modules/local/submit_rawreads_study/tests/main.nf.test.snap b/modules/local/submit_rawreads_study/tests/main.nf.test.snap deleted file mode 100644 index 08f7fdb..0000000 --- a/modules/local/submit_rawreads_study/tests/main.nf.test.snap +++ /dev/null @@ -1,35 +0,0 @@ -{ - "submit_rawreads_study - stub": { - "content": [ - { - "0": [ - [ - { - "id": "example_study" - }, - "example_study_accessions.json:md5,e43f257b8a1ffb551bf993867c5f1d7f" - ] - ], - "1": [ - "versions.yml:md5,d7080ded74f0381019a674b865daa329" - ], - "accessions": [ - [ - { - "id": "example_study" - }, - "example_study_accessions.json:md5,e43f257b8a1ffb551bf993867c5f1d7f" - ] - ], - "versions": [ - "versions.yml:md5,d7080ded74f0381019a674b865daa329" - ] - } - ], - "timestamp": "2026-03-12T11:57:10.234715", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.4" - } - } -} \ No newline at end of file diff --git a/nextflow_schema.json b/nextflow_schema.json index dedf312..2ee3d9c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -266,7 +266,7 @@ "format": "file-path", "exists": true, "description": "Path to study metadata file (JSON, CSV, or TSV) for registering a new ENA study. Required when submission_study is not provided.", - "help_text": "File containing study metadata fields (STUDY_TITLE, STUDY_ABSTRACT, existing_study_type, alias, etc.). 
Used by SUBMIT_RAWREADS_STUDY to create a new study in ENA when no existing submission_study accession is given.", + "help_text": "File containing study metadata fields (STUDY_TITLE, STUDY_ABSTRACT, existing_study_type, alias, etc.). Used by REGISTERSTUDY to create a new study in ENA when no existing submission_study accession is given.", "fa_icon": "fas fa-file-alt" }, "webincli_submit": { diff --git a/tests/default.nf.test b/tests/default.nf.test index 919645d..4a3b628 100644 --- a/tests/default.nf.test +++ b/tests/default.nf.test @@ -6,7 +6,7 @@ nextflow_pipeline { test("metagenomic_assemblies mode — submission_study provided (no study registration)") { // Exercises the assembly submission path using a pre-registered study (stub mode). - // SUBMIT_RAWREADS_STUDY is NOT called here; the module-level nf-test covers it. + // REGISTERSTUDY is NOT called here; the module-level nf-test covers it. // // A samplesheet is generated on the fly with absolute paths so that nf-schema // validation succeeds regardless of the nf-test launchDir. @@ -45,8 +45,8 @@ nextflow_pipeline { } } - test("metagenomic_assemblies mode — study_metadata provided (SUBMIT_RAWREADS_STUDY registers study)") { - // Tests the study-registration path in stub mode. SUBMIT_RAWREADS_STUDY stub + test("metagenomic_assemblies mode — study_metadata provided (REGISTERSTUDY registers study)") { + // Tests the study-registration path in stub mode. REGISTERSTUDY stub // outputs an empty accessions JSON, so this test validates the plumbing rather // than the end-to-end submission output. 
options "-stub" @@ -73,7 +73,7 @@ nextflow_pipeline { then { assertAll( { assert workflow.success }, - { assert workflow.trace.succeeded().any { it.name.contains("SUBMIT_RAWREADS_STUDY") } } + { assert workflow.trace.succeeded().any { it.name.contains("REGISTERSTUDY") } } ) } } diff --git a/workflows/assemblysubmit.nf b/workflows/assemblysubmit.nf index b383a6c..ec1309f 100644 --- a/workflows/assemblysubmit.nf +++ b/workflows/assemblysubmit.nf @@ -7,7 +7,7 @@ include { COVERM_CONTIG } from '../modules/nf-core/coverm/contig/main' include { FASTAVALIDATOR } from '../modules/nf-core/fastavalidator/main' include { GENERATE_ASSEMBLY_MANIFEST } from '../modules/local/generate_assembly_manifest/main' -include { SUBMIT_RAWREADS_STUDY } from '../modules/local/submit_rawreads_study/main' +include { REGISTERSTUDY } from '../modules/local/registerstudy/main' include { ENA_WEBIN_CLI } from '../modules/local/ena_webin_cli' include { MULTIQC } from '../modules/nf-core/multiqc/main' @@ -154,11 +154,11 @@ workflow ASSEMBLYSUBMIT { study_accession_ch = channel.of(params.submission_study) } else { // Register a new study using the study metadata file - SUBMIT_RAWREADS_STUDY( + REGISTERSTUDY( channel.of([[id: "study"], file(params.study_metadata)]) ) - ch_versions = ch_versions.mix(SUBMIT_RAWREADS_STUDY.out.versions) - study_accession_ch = SUBMIT_RAWREADS_STUDY.out.accessions + ch_versions = ch_versions.mix(REGISTERSTUDY.out.versions) + study_accession_ch = REGISTERSTUDY.out.accessions .map { _meta, json -> def data = new groovy.json.JsonSlurper().parse(json) data.submitted[0]?.accession diff --git a/workflows/genomesubmit.nf b/workflows/genomesubmit.nf index b34a704..e9b17bb 100644 --- a/workflows/genomesubmit.nf +++ b/workflows/genomesubmit.nf @@ -5,7 +5,7 @@ */ include { GENOME_UPLOAD } from '../modules/local/genome_upload' include { ENA_WEBIN_CLI } from '../modules/local/ena_webin_cli' -include { SUBMIT_RAWREADS_STUDY } from '../modules/local/submit_rawreads_study/main' +include { 
REGISTERSTUDY } from '../modules/local/registerstudy/main' include { RNA_DETECTION } from '../subworkflows/local/rna_detection' @@ -114,11 +114,11 @@ workflow GENOMESUBMIT { if (params.submission_study) { study_accession_ch = channel.of(params.submission_study) } else { - SUBMIT_RAWREADS_STUDY( + REGISTERSTUDY( channel.of([[id: "study"], file(params.study_metadata)]) ) - ch_versions = ch_versions.mix(SUBMIT_RAWREADS_STUDY.out.versions) - study_accession_ch = SUBMIT_RAWREADS_STUDY.out.accessions + ch_versions = ch_versions.mix(REGISTERSTUDY.out.versions) + study_accession_ch = REGISTERSTUDY.out.accessions .map { _meta, json -> def data = new groovy.json.JsonSlurper().parse(json) data.submitted[0]?.accession From cc840dedea7bc77b175b44f163ed0a4b8a45cab4 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Thu, 12 Mar 2026 14:09:30 +0000 Subject: [PATCH 09/36] Remove pytest from bin --- bin/test_submit_study.py | 1420 -------------------------------------- 1 file changed, 1420 deletions(-) delete mode 100644 bin/test_submit_study.py diff --git a/bin/test_submit_study.py b/bin/test_submit_study.py deleted file mode 100644 index d021383..0000000 --- a/bin/test_submit_study.py +++ /dev/null @@ -1,1420 +0,0 @@ -#!/usr/bin/env python3 -"""Tests for submit_study.py — ENA study submission pipeline. - -Covers: - A. Unit tests for build_submission_xml and _add_project_element - B. Unit tests for validate_study_xml - C. Unit tests for parse_xml_receipt - D. Unit tests for find_duplicate_studies and fetch_account_studies - E. CLI integration tests for main() using click.testing.CliRunner - -Usage: - pytest bin/test_submit_study.py -v - -All external I/O (HTTP requests, ENA reports API) is mocked. Tests do NOT -import from ena_submit_common directly — all assertions go through the public -API of submit_study. 
-""" - -from __future__ import annotations - -import json -import os -import sys -import xml.etree.ElementTree as ET -from pathlib import Path -from textwrap import dedent -from typing import Any -from unittest.mock import MagicMock, patch - -import pytest -from click.testing import CliRunner -from requests.auth import HTTPBasicAuth - -# Ensure the scripts directory is on the path before importing the module. -sys.path.insert(0, os.path.dirname(__file__)) - -from bin.submit_study import ( # noqa: E402 - _normalize_study_report, - build_submission_xml, - fetch_account_studies, - find_duplicate_studies, - main, - parse_xml_receipt, - validate_study_xml, -) - -# --------------------------------------------------------------------------- -# Constants shared across test groups -# --------------------------------------------------------------------------- - -_PROD_REPORTS_URL = "https://www.ebi.ac.uk/ena/submit/report/projects" -_TEST_REPORTS_URL = "https://wwwdev.ebi.ac.uk/ena/submit/report/projects" - -# --------------------------------------------------------------------------- -# Shared fixtures -# --------------------------------------------------------------------------- - - -@pytest.fixture -def basic_study() -> dict[str, Any]: - """Return a minimal valid study metadata dict.""" - return { - "alias": "test-study-001", - "STUDY_TITLE": "A Basic Test Study", - "STUDY_ABSTRACT": "An abstract for the test study.", - "CENTER_PROJECT_NAME": "My Centre Project", - "existing_study_type": "Metagenomics", - } - - -@pytest.fixture -def metagenomics_assembly_study() -> dict[str, Any]: - """Return a study dict representing a metagenomics assembly submission.""" - return { - "alias": "metagenome-assembly-001", - "STUDY_TITLE": "Primary Metagenome Assembly of Soil Sample", - "STUDY_ABSTRACT": "Assembly of contigs from metagenome sequencing of soil.", - "CENTER_PROJECT_NAME": "Soil Metagenome Project", - "existing_study_type": "Metagenomics", - } - - -@pytest.fixture -def 
mag_genome_study() -> dict[str, Any]: - """Return a study dict representing a MAG/genome submission.""" - return { - "alias": "mag-genome-001", - "STUDY_TITLE": "Metagenome-Assembled Genome from Soil Microbiome", - "STUDY_ABSTRACT": "A high-quality MAG reconstructed from binned metagenome data.", - "existing_study_type": "Other", - "new_study_type": "Genome Sequencing", - } - - -@pytest.fixture -def mock_credentials() -> tuple[str, str]: - """Return mock ENA credentials.""" - return ("Webin-12345", "pass") - - -@pytest.fixture -def auth(mock_credentials: tuple[str, str]) -> HTTPBasicAuth: - """Return mock HTTPBasicAuth built from mock credentials.""" - return HTTPBasicAuth(*mock_credentials) - - -@pytest.fixture -def account_study_record() -> dict[str, str]: - """Return a normalised account study record as returned by the Reports API.""" - return { - "title": "Existing Study Title", - "alias": "existing-study-alias", - "accession": "PRJEB99001", - "secondary_accession": "ERP099001", - "status": "PRIVATE", - } - - -# --------------------------------------------------------------------------- -# A. 
Unit tests for build_submission_xml and _add_project_element -# --------------------------------------------------------------------------- - - -class TestBuildSubmissionXml: - """Unit tests for build_submission_xml and _add_project_element.""" - - # ---- helper ------------------------------------------------------- - - @staticmethod - def _to_str(root: ET.Element) -> str: - """Serialise an ElementTree element to a UTF-8 string.""" - return ET.tostring(root, encoding="unicode") - - # ---- A1: Basic study fields ------------------------------------------- - - def test_study_title_round_trips(self, basic_study: dict[str, Any]) -> None: - """STUDY_TITLE is written as the TITLE element text.""" - root = build_submission_xml([basic_study]) - title_el = root.find(".//TITLE") - assert title_el is not None - assert title_el.text == basic_study["STUDY_TITLE"] - - def test_study_abstract_round_trips(self, basic_study: dict[str, Any]) -> None: - """STUDY_ABSTRACT is written as the DESCRIPTION element text.""" - root = build_submission_xml([basic_study]) - desc_el = root.find(".//DESCRIPTION") - assert desc_el is not None - assert desc_el.text == basic_study["STUDY_ABSTRACT"] - - def test_alias_round_trips(self, basic_study: dict[str, Any]) -> None: - """The alias attribute on PROJECT matches the input alias.""" - root = build_submission_xml([basic_study]) - project_el = root.find(".//PROJECT") - assert project_el is not None - assert project_el.get("alias") == basic_study["alias"] - - def test_center_project_name_round_trips(self, basic_study: dict[str, Any]) -> None: - """CENTER_PROJECT_NAME is written as the NAME element text.""" - root = build_submission_xml([basic_study]) - name_el = root.find(".//NAME") - assert name_el is not None - assert name_el.text == basic_study["CENTER_PROJECT_NAME"] - - def test_submission_project_present(self, basic_study: dict[str, Any]) -> None: - """SUBMISSION_PROJECT with SEQUENCING_PROJECT is always present.""" - root = 
build_submission_xml([basic_study]) - sp_el = root.find(".//SUBMISSION_PROJECT") - assert sp_el is not None - seq_el = sp_el.find("SEQUENCING_PROJECT") - assert seq_el is not None - - # ---- A2: Study type PROJECT_ATTRIBUTEs -------------------------------- - - def test_existing_study_type_emitted_as_project_attribute( - self, basic_study: dict[str, Any] - ) -> None: - """existing_study_type is emitted as a PROJECT_ATTRIBUTE TAG/VALUE pair.""" - root = build_submission_xml([basic_study]) - xml_str = self._to_str(root) - assert "existing_study_type" in xml_str - assert basic_study["existing_study_type"] in xml_str - - def test_new_study_type_absent_when_not_other(self, basic_study: dict[str, Any]) -> None: - """new_study_type is NOT emitted when existing_study_type != 'Other'.""" - study = dict(basic_study) - study["new_study_type"] = "Genome Sequencing" - root = build_submission_xml([study]) - xml_str = self._to_str(root) - assert "new_study_type" not in xml_str - - def test_new_study_type_present_when_existing_is_other( - self, mag_genome_study: dict[str, Any] - ) -> None: - """new_study_type appears as a PROJECT_ATTRIBUTE when existing_study_type == 'Other'.""" - root = build_submission_xml([mag_genome_study]) - tags = [ - el.text - for el in root.findall(".//PROJECT_ATTRIBUTE/TAG") - if el.text is not None - ] - values = [ - el.text - for el in root.findall(".//PROJECT_ATTRIBUTE/VALUE") - if el.text is not None - ] - assert "existing_study_type" in tags - assert "new_study_type" in tags - assert "Other" in values - assert "Genome Sequencing" in values - - def test_no_project_attributes_when_no_study_type(self) -> None: - """No PROJECT_ATTRIBUTES element when existing_study_type is absent.""" - study = { - "alias": "no-type", - "STUDY_TITLE": "No Type Study", - } - root = build_submission_xml([study]) - attrs_el = root.find(".//PROJECT_ATTRIBUTES") - assert attrs_el is None - - # ---- A3: Hold date ---------------------------------------------------- - - def 
test_hold_until_present_in_submission(self, basic_study: dict[str, Any]) -> None: - """When hold_until is given, HOLD element with HoldUntilDate appears in SUBMISSION.""" - root = build_submission_xml([basic_study], hold_until="2028-06-15") - hold_el = root.find(".//HOLD") - assert hold_el is not None - assert hold_el.get("HoldUntilDate") == "2028-06-15" - - def test_hold_until_absent_when_not_provided(self, basic_study: dict[str, Any]) -> None: - """When hold_until is not given, no HOLD element appears.""" - root = build_submission_xml([basic_study]) - hold_el = root.find(".//HOLD") - assert hold_el is None - - # ---- A4: MODIFY action ------------------------------------------------ - - def test_modify_action_produces_modify_element(self, basic_study: dict[str, Any]) -> None: - """Using action='MODIFY' produces a MODIFY element instead of ADD.""" - root = build_submission_xml([basic_study], action="MODIFY") - xml_str = self._to_str(root) - assert "" in xml_str - - def test_add_action_produces_add_element(self, basic_study: dict[str, Any]) -> None: - """Default action='ADD' produces an ADD element.""" - root = build_submission_xml([basic_study]) - xml_str = self._to_str(root) - assert "" in xml_str - - def test_modify_action_does_not_produce_add(self, basic_study: dict[str, Any]) -> None: - """MODIFY action does not produce an ADD element.""" - root = build_submission_xml([basic_study], action="MODIFY") - xml_str = self._to_str(root) - # Strip the XML preamble to avoid false positives in attributes - assert "" not in xml_str - - # ---- A5: Assembly/metagenomics study ---------------------------------- - - def test_metagenomics_assembly_study_round_trips( - self, metagenomics_assembly_study: dict[str, Any] - ) -> None: - """Metagenomics assembly study dict round-trips correctly into XML.""" - root = build_submission_xml([metagenomics_assembly_study]) - project_el = root.find(".//PROJECT") - assert project_el is not None - assert project_el.get("alias") == 
metagenomics_assembly_study["alias"] - - title_el = root.find(".//TITLE") - assert title_el is not None - assert title_el.text == metagenomics_assembly_study["STUDY_TITLE"] - - tags = [ - el.text for el in root.findall(".//PROJECT_ATTRIBUTE/TAG") if el.text - ] - values = [ - el.text for el in root.findall(".//PROJECT_ATTRIBUTE/VALUE") if el.text - ] - assert "existing_study_type" in tags - assert "Metagenomics" in values - - # ---- A6: MAG/genome study with Other + new_study_type ----------------- - - def test_mag_genome_study_has_both_project_attributes( - self, mag_genome_study: dict[str, Any] - ) -> None: - """MAG/genome study with existing_study_type=Other produces both PROJECT_ATTRIBUTEs.""" - root = build_submission_xml([mag_genome_study]) - attr_els = root.findall(".//PROJECT_ATTRIBUTE") - assert len(attr_els) == 2 - - pairs: dict[str, str] = {} - for attr_el in attr_els: - tag_el = attr_el.find("TAG") - val_el = attr_el.find("VALUE") - if tag_el is not None and val_el is not None: - pairs[tag_el.text or ""] = val_el.text or "" - - assert pairs.get("existing_study_type") == "Other" - assert pairs.get("new_study_type") == "Genome Sequencing" - - # ---- Multiple studies in one call ------------------------------------- - - def test_multiple_studies_produce_multiple_project_elements( - self, - basic_study: dict[str, Any], - metagenomics_assembly_study: dict[str, Any], - ) -> None: - """Multiple studies in input produce multiple PROJECT elements.""" - root = build_submission_xml([basic_study, metagenomics_assembly_study]) - projects = root.findall(".//PROJECT") - assert len(projects) == 2 - - # ---- Alias auto-derived from title when absent ------------------------ - - def test_alias_derived_from_title_when_absent(self) -> None: - """When no alias is provided, alias is derived from STUDY_TITLE (spaces→underscores).""" - study = {"STUDY_TITLE": "My Derived Title"} - root = build_submission_xml([study]) - project_el = root.find(".//PROJECT") - assert project_el 
is not None - alias = project_el.get("alias", "") - assert "_" in alias or alias == "My_Derived_Title"[:50] - - -# --------------------------------------------------------------------------- -# B. Unit tests for validate_study_xml -# --------------------------------------------------------------------------- - - -class TestValidateStudyXml: - """Unit tests for validate_study_xml.""" - - @staticmethod - def _build_valid_xml_bytes(alias: str = "study-1", title: str = "Test Study") -> bytes: - """Build a minimal valid study XML document as bytes. - - Args: - alias: The PROJECT alias attribute value. - title: The TITLE element text. - - Returns: - UTF-8 encoded XML bytes. - """ - xml_str = dedent(f"""\ - - - - - {title} - - - - - - - """) - return xml_str.encode("utf-8") - - # ---- B7: Valid XML passes --------------------------------------------- - - def test_valid_assembly_study_xml_passes(self) -> None: - """A valid assembly study XML passes validation without errors.""" - xml_bytes = self._build_valid_xml_bytes( - alias="assembly-study", title="Assembly Study Title" - ) - is_valid, messages = validate_study_xml(xml_bytes) - assert is_valid, f"Expected valid; messages: {messages}" - - def test_valid_metagenomics_xml_passes(self) -> None: - """Well-formed XML with required elements passes validation.""" - study = { - "alias": "meta-study", - "STUDY_TITLE": "Metagenomics Study", - "existing_study_type": "Metagenomics", - } - import ena_submit_common as _common # local import; only for xml_to_bytes helper - - root = build_submission_xml([study]) - xml_bytes = _common.xml_to_bytes(root) - is_valid, messages = validate_study_xml(xml_bytes) - assert is_valid, f"Expected valid; messages: {messages}" - - # ---- B8: Missing TITLE ------------------------------------------------ - - def test_missing_title_fails_with_title_in_message(self) -> None: - """A PROJECT without a TITLE element fails validation with 'TITLE' in the message.""" - xml_str = dedent("""\ - - - - - - - - - 
""") - is_valid, messages = validate_study_xml(xml_str.encode("utf-8")) - assert not is_valid - combined = " ".join(messages) - assert "TITLE" in combined - - # ---- B9: Missing SUBMISSION_PROJECT ----------------------------------- - - def test_missing_submission_project_fails(self) -> None: - """A PROJECT without SUBMISSION_PROJECT fails with 'SUBMISSION_PROJECT' in message.""" - xml_str = dedent("""\ - - - - - Some Title - - - - """) - is_valid, messages = validate_study_xml(xml_str.encode("utf-8")) - assert not is_valid - combined = " ".join(messages) - assert "SUBMISSION_PROJECT" in combined - - # ---- B10: Malformed XML ----------------------------------------------- - - def test_malformed_xml_fails_with_not_well_formed_message(self) -> None: - """Malformed XML fails validation with 'not well-formed' in the message.""" - bad_xml = b"Unclosed" - is_valid, messages = validate_study_xml(bad_xml) - assert not is_valid - combined = " ".join(messages).lower() - assert "not well-formed" in combined or "well-formed" in combined - - # ---- Extra structural checks ----------------------------------------- - - def test_empty_title_fails_validation(self) -> None: - """A PROJECT with an empty TITLE element fails validation.""" - xml_str = dedent("""\ - <?xml version='1.0' encoding='UTF-8'?> - <WEBIN> - <PROJECT_SET> - <PROJECT alias="empty-title"> - <TITLE> - - - - - """) - is_valid, messages = validate_study_xml(xml_str.encode("utf-8")) - assert not is_valid - - def test_missing_project_set_fails_validation(self) -> None: - """XML without a PROJECT_SET element fails validation.""" - xml_str = b"" - is_valid, messages = validate_study_xml(xml_str) - assert not is_valid - - def test_validation_returns_tuple_of_bool_and_list(self) -> None: - """validate_study_xml always returns (bool, list).""" - xml_bytes = self._build_valid_xml_bytes() - result = validate_study_xml(xml_bytes) - assert isinstance(result, tuple) - assert len(result) == 2 - is_valid, messages = result - 
assert isinstance(is_valid, bool) - assert isinstance(messages, list) - - -# --------------------------------------------------------------------------- -# C. Unit tests for parse_xml_receipt -# --------------------------------------------------------------------------- - - -class TestParseXmlReceipt: - """Unit tests for parse_xml_receipt.""" - - @staticmethod - def _parse(xml_str: str) -> tuple[bool, list[dict[str, str]], list[str]]: - """Parse an XML receipt string via parse_xml_receipt. - - Args: - xml_str: Raw XML receipt string. - - Returns: - Tuple of (success, accessions, messages). - """ - root = ET.fromstring(xml_str) - return parse_xml_receipt(root) - - # ---- C11: Successful PROJECT receipt ---------------------------------- - - def test_successful_project_receipt_returns_true(self) -> None: - """A success='true' receipt returns success=True.""" - xml_str = dedent("""\ - - - - - - """) - success, accessions, messages = self._parse(xml_str) - assert success is True - - def test_successful_project_receipt_accession_round_trips(self) -> None: - """PROJECT accession, alias, status, holdUntilDate, and external_accession round-trip.""" - xml_str = dedent("""\ - - - - - - """) - success, accessions, messages = self._parse(xml_str) - assert len(accessions) == 1 - acc = accessions[0] - assert acc["accession"] == "PRJEB12345" - assert acc["alias"] == "my-study" - assert acc["status"] == "PRIVATE" - assert acc["holdUntilDate"] == "2025-01-15" - assert acc["external_accession"] == "ERP012345" - assert acc["external_type"] == "study" - - # ---- C12: Failed receipt ---------------------------------------------- - - def test_failed_receipt_returns_false(self) -> None: - """A success='false' receipt returns success=False.""" - xml_str = dedent("""\ - - - Center name "Unknown" is not permitted to submit in Webin-12345. 
- - - """) - success, accessions, messages = self._parse(xml_str) - assert success is False - - def test_failed_receipt_captures_error_message(self) -> None: - """Error text from MESSAGES/ERROR is captured in the messages list.""" - xml_str = dedent("""\ - - - Submission failed due to duplicate alias. - - - """) - _, _, messages = self._parse(xml_str) - assert any("Submission failed due to duplicate alias" in m for m in messages) - - # ---- C13: STUDY tag (alternate ENA format) ---------------------------- - - def test_study_tag_receipt_extracts_accession_and_alias(self) -> None: - """Receipts using STUDY instead of PROJECT still extract accession and alias.""" - xml_str = dedent("""\ - - - - """) - success, accessions, messages = self._parse(xml_str) - assert success is True - assert len(accessions) == 1 - assert accessions[0]["accession"] == "ERP099999" - assert accessions[0]["alias"] == "study-alias-1" - - # ---- C14: MESSAGES with INFO and ERROR -------------------------------- - - def test_receipt_with_info_messages_captured(self) -> None: - """INFO elements in MESSAGES are captured in the messages list.""" - xml_str = dedent("""\ - - - - Submission processed successfully. - - - """) - _, _, messages = self._parse(xml_str) - assert any("Submission processed successfully" in m for m in messages) - assert any(m.startswith("INFO:") for m in messages) - - def test_receipt_with_multiple_error_messages(self) -> None: - """Multiple ERROR elements are all captured.""" - xml_str = dedent("""\ - - - First error. - Second error. - - - """) - _, _, messages = self._parse(xml_str) - error_msgs = [m for m in messages if m.startswith("ERROR:")] - assert len(error_msgs) == 2 - - def test_receipt_both_info_and_error_captured(self) -> None: - """Both INFO and ERROR elements are captured in messages.""" - xml_str = dedent("""\ - - - Partial success. - Some records failed. 
- - - """) - _, _, messages = self._parse(xml_str) - assert any(m.startswith("INFO:") for m in messages) - assert any(m.startswith("ERROR:") for m in messages) - - def test_receipt_no_messages_element_returns_empty_list(self) -> None: - """A receipt without a MESSAGES element returns an empty messages list.""" - xml_str = dedent("""\ - - - - """) - _, _, messages = self._parse(xml_str) - assert messages == [] - - def test_receipt_success_false_string(self) -> None: - """Receipts with success='false' (string) correctly parse to False.""" - xml_str = "" - success, _, _ = self._parse(xml_str) - assert success is False - - def test_receipt_missing_success_defaults_to_false(self) -> None: - """A receipt without a success attribute defaults to False.""" - xml_str = "" - success, _, _ = self._parse(xml_str) - assert success is False - - -# --------------------------------------------------------------------------- -# D. Unit tests for find_duplicate_studies and fetch_account_studies -# --------------------------------------------------------------------------- - - -class TestFindDuplicateStudies: - """Unit tests for find_duplicate_studies.""" - - @staticmethod - def _account_record( - title: str = "", - alias: str = "", - accession: str = "PRJEB00001", - status: str = "PRIVATE", - ) -> dict[str, str]: - """Build a normalised account study record. - - Args: - title: Study title (as returned by Reports API normalizer). - alias: Study alias. - accession: ENA project accession. - status: Release status. - - Returns: - Normalised study dict. 
- """ - return { - "title": title, - "alias": alias, - "accession": accession, - "secondary_accession": "", - "status": status, - } - - # ---- D15: Exact alias match ------------------------------------------ - - def test_exact_alias_match_detected_as_duplicate(self) -> None: - """An exact alias match is detected as a duplicate.""" - new_studies = [{"STUDY_TITLE": "Different Title", "alias": "my-alias-x"}] - account = [self._account_record(title="Other", alias="my-alias-x", accession="PRJEB10")] - dups = find_duplicate_studies(new_studies, account) - assert 0 in dups - assert dups[0]["accession"] == "PRJEB10" - assert "alias" in dups[0]["match_reason"] - - # ---- D16: Exact title match ------------------------------------------ - - def test_exact_title_match_detected_as_duplicate(self) -> None: - """An exact STUDY_TITLE match is detected as a duplicate.""" - new_studies = [{"STUDY_TITLE": "My Metagenomics Study"}] - account = [ - self._account_record(title="My Metagenomics Study", accession="PRJEB20") - ] - dups = find_duplicate_studies(new_studies, account) - assert 0 in dups - assert dups[0]["accession"] == "PRJEB20" - assert "title" in dups[0]["match_reason"] - - # ---- D17: No match returns empty dict -------------------------------- - - def test_no_match_returns_empty_dict(self) -> None: - """When neither alias nor title matches, an empty dict is returned.""" - new_studies = [{"STUDY_TITLE": "Completely Novel Study", "alias": "novel-alias"}] - account = [self._account_record(title="Existing Study", alias="existing-alias")] - dups = find_duplicate_studies(new_studies, account) - assert dups == {} - - def test_empty_account_returns_empty_dict(self) -> None: - """Empty account list results in no duplicates.""" - new_studies = [{"STUDY_TITLE": "Any Study"}] - dups = find_duplicate_studies(new_studies, []) - assert dups == {} - - def test_empty_new_studies_returns_empty_dict(self) -> None: - """Empty new studies list results in no duplicates.""" - account = 
[self._account_record(title="Existing")] - dups = find_duplicate_studies([], account) - assert dups == {} - - def test_study_without_title_or_alias_not_flagged(self) -> None: - """A study dict with neither title nor alias is not flagged as duplicate.""" - new_studies = [{"IS_PRIMARY": "YES"}] # no STUDY_TITLE, no alias - account = [self._account_record(title="Existing")] - dups = find_duplicate_studies(new_studies, account) - assert dups == {} - - def test_partial_title_not_a_duplicate(self) -> None: - """A partial title match does not count as a duplicate (exact match only).""" - new_studies = [{"STUDY_TITLE": "Metagenomics"}] - account = [self._account_record(title="Metagenomics Assembly Study")] - dups = find_duplicate_studies(new_studies, account) - assert dups == {} - - def test_multiple_studies_only_matching_flagged(self) -> None: - """Only the matching study is flagged when multiple new studies are submitted.""" - account = [self._account_record(title="Old Study", alias="old-alias", accession="PRJEB50")] - new_studies = [ - {"STUDY_TITLE": "Old Study"}, - {"STUDY_TITLE": "New Study"}, - ] - dups = find_duplicate_studies(new_studies, account) - assert 0 in dups - assert 1 not in dups - - def test_duplicate_index_corresponds_to_new_studies_list(self) -> None: - """The index in the duplicates dict matches the position in new_studies.""" - account = [self._account_record(title="Study C", accession="PRJEB33")] - new_studies = [ - {"STUDY_TITLE": "Study A"}, - {"STUDY_TITLE": "Study B"}, - {"STUDY_TITLE": "Study C"}, - ] - dups = find_duplicate_studies(new_studies, account) - assert 2 in dups - assert dups[2]["accession"] == "PRJEB33" - - -# --------------------------------------------------------------------------- -# D18: _normalize_study_report and fetch_account_studies -# --------------------------------------------------------------------------- - - -class TestNormalizeStudyReport: - """Unit tests for _normalize_study_report field normalisation.""" - - def 
test_title_field_normalised(self) -> None: - """The 'title' field is extracted from the raw report dict.""" - report = {"title": "My Title", "alias": "my-alias", "accession": "PRJEB1"} - result = _normalize_study_report(report) - assert result["title"] == "My Title" - - def test_study_title_fallback(self) -> None: - """studyTitle is used when 'title' is absent.""" - report = {"studyTitle": "Study Title Fallback", "alias": "a", "accession": "PRJEB2"} - result = _normalize_study_report(report) - assert result["title"] == "Study Title Fallback" - - def test_alias_field_normalised(self) -> None: - """The 'alias' field is extracted.""" - report = {"title": "T", "alias": "direct-alias", "accession": "PRJEB3"} - result = _normalize_study_report(report) - assert result["alias"] == "direct-alias" - - def test_study_alias_fallback(self) -> None: - """studyAlias is used when 'alias' is absent.""" - report = {"title": "T", "studyAlias": "study-alias-fallback", "accession": "PRJEB4"} - result = _normalize_study_report(report) - assert result["alias"] == "study-alias-fallback" - - def test_accession_field_normalised(self) -> None: - """The 'accession' field is extracted.""" - report = {"title": "T", "alias": "a", "accession": "PRJEB5"} - result = _normalize_study_report(report) - assert result["accession"] == "PRJEB5" - - def test_study_accession_fallback(self) -> None: - """studyAccession is used when 'accession' is absent.""" - report = {"title": "T", "alias": "a", "studyAccession": "PRJEB99"} - result = _normalize_study_report(report) - assert result["accession"] == "PRJEB99" - - def test_missing_fields_default_to_empty_string(self) -> None: - """Missing fields default to empty string without raising.""" - report = {} - result = _normalize_study_report(report) - assert result["title"] == "" - assert result["alias"] == "" - assert result["accession"] == "" - - def test_status_field_defaults_to_unknown(self) -> None: - """The status field defaults to 'UNKNOWN' when absent.""" - 
report = {"title": "T", "alias": "a", "accession": "PRJEB6"} - result = _normalize_study_report(report) - assert result["status"] == "UNKNOWN" - - def test_release_status_used_for_status(self) -> None: - """releaseStatus is mapped to the 'status' key.""" - report = {"title": "T", "alias": "a", "accession": "PRJEB7", "releaseStatus": "PUBLIC"} - result = _normalize_study_report(report) - assert result["status"] == "PUBLIC" - - -class TestFetchAccountStudies: - """Unit tests for fetch_account_studies calling common.fetch_account_records.""" - - def test_fetch_calls_fetch_account_records_with_correct_urls( - self, auth: HTTPBasicAuth - ) -> None: - """fetch_account_studies calls common.fetch_account_records with prod/test URLs.""" - target = "submit_study.common.fetch_account_records" - with patch(target, return_value=[]) as mock_fetch: - fetch_account_studies(auth, use_test=False) - mock_fetch.assert_called_once() - call_kwargs = mock_fetch.call_args - assert call_kwargs.kwargs.get("prod_url") == _PROD_REPORTS_URL - assert call_kwargs.kwargs.get("test_url") == _TEST_REPORTS_URL - - def test_fetch_passes_normalizer_callable(self, auth: HTTPBasicAuth) -> None: - """fetch_account_studies passes a callable normalizer to fetch_account_records.""" - target = "submit_study.common.fetch_account_records" - with patch(target, return_value=[]) as mock_fetch: - fetch_account_studies(auth, use_test=False) - call_kwargs = mock_fetch.call_args - normalizer = call_kwargs.kwargs.get("normalizer") - assert callable(normalizer) - - def test_fetch_normalizer_handles_title_variant(self, auth: HTTPBasicAuth) -> None: - """The normalizer passed to fetch_account_records handles title/studyTitle variants.""" - target = "submit_study.common.fetch_account_records" - captured_normalizer = None - - def capture_normalizer(*args: Any, **kwargs: Any) -> list[dict[str, str]]: - nonlocal captured_normalizer - captured_normalizer = kwargs.get("normalizer") - return [] - - with patch(target, 
side_effect=capture_normalizer): - fetch_account_studies(auth, use_test=False) - - assert captured_normalizer is not None - result_title = captured_normalizer({"title": "Direct Title", "accession": "PRJEB1"}) - assert result_title["title"] == "Direct Title" - - result_study_title = captured_normalizer( - {"studyTitle": "Fallback Title", "accession": "PRJEB2"} - ) - assert result_study_title["title"] == "Fallback Title" - - def test_fetch_normalizer_handles_alias_variant(self, auth: HTTPBasicAuth) -> None: - """The normalizer handles alias/studyAlias field variants.""" - target = "submit_study.common.fetch_account_records" - captured_normalizer = None - - def capture_normalizer(*args: Any, **kwargs: Any) -> list[dict[str, str]]: - nonlocal captured_normalizer - captured_normalizer = kwargs.get("normalizer") - return [] - - with patch(target, side_effect=capture_normalizer): - fetch_account_studies(auth, use_test=False) - - assert captured_normalizer is not None - result = captured_normalizer({"alias": "direct-alias", "accession": "PRJEB3"}) - assert result["alias"] == "direct-alias" - - result_fallback = captured_normalizer( - {"studyAlias": "study-alias-fallback", "accession": "PRJEB4"} - ) - assert result_fallback["alias"] == "study-alias-fallback" - - def test_fetch_normalizer_handles_accession_variant(self, auth: HTTPBasicAuth) -> None: - """The normalizer handles accession/studyAccession field variants.""" - target = "submit_study.common.fetch_account_records" - captured_normalizer = None - - def capture_normalizer(*args: Any, **kwargs: Any) -> list[dict[str, str]]: - nonlocal captured_normalizer - captured_normalizer = kwargs.get("normalizer") - return [] - - with patch(target, side_effect=capture_normalizer): - fetch_account_studies(auth, use_test=False) - - assert captured_normalizer is not None - result = captured_normalizer( - {"title": "T", "studyAccession": "PRJEB99", "accession": ""} - ) - # studyAccession falls back when 'accession' is falsy - assert 
result["accession"] == "PRJEB99" - - -# --------------------------------------------------------------------------- -# E. CLI integration tests for main() using click.testing.CliRunner -# --------------------------------------------------------------------------- - - -def _extract_json_from_output(output: str) -> dict[str, Any]: - """Extract the JSON results dict from mixed CLI output. - - The CLI writes JSON results via ``print()`` to stdout, but logging - also emits to stderr which CliRunner captures in ``result.output``. - This helper finds the last top-level JSON object in the output. - - Args: - output: The full ``result.output`` string from CliRunner. - - Returns: - Parsed JSON dict. - - Raises: - ValueError: If no valid JSON object is found. - """ - # Walk backwards through the output looking for a complete JSON block. - # The results JSON always starts with "{\n " and ends with "\n}". - depth = 0 - end = -1 - start = -1 - for i in range(len(output) - 1, -1, -1): - ch = output[i] - if ch == "}": - if depth == 0: - end = i - depth += 1 - elif ch == "{": - depth -= 1 - if depth == 0: - start = i - break - if start == -1 or end == -1: - raise ValueError(f"No JSON object found in output: {output[:200]!r}") - return json.loads(output[start : end + 1]) - - -def _make_study_json(study: dict[str, Any]) -> str: - """Serialise a study dict into a JSON string using the Container format. - - Args: - study: Study metadata dict. - - Returns: - JSON string in DataHarmonizer Container format. - """ - return json.dumps({ - "Container": { - "SRA_studys": [study], - } - }) - - -def _make_study_csv(study: dict[str, Any]) -> str: - """Serialise a study dict into a minimal CSV string. - - Args: - study: Study metadata dict. - - Returns: - CSV string with header and one data row. 
- """ - headers = list(study.keys()) - values = [str(study[h]) for h in headers] - return ",".join(headers) + "\n" + ",".join(values) + "\n" - - -def _make_study_tsv(study: dict[str, Any]) -> str: - """Serialise a study dict into a minimal TSV string. - - Args: - study: Study metadata dict. - - Returns: - TSV string with header and one data row. - """ - headers = list(study.keys()) - values = [str(study[h]) for h in headers] - return "\t".join(headers) + "\n" + "\t".join(values) + "\n" - - -@pytest.fixture -def runner() -> CliRunner: - """Return a Click test runner with isolated filesystem.""" - return CliRunner() - - -@pytest.fixture -def minimal_metagenomics_study() -> dict[str, Any]: - """Return a minimal metagenomics study for CLI tests.""" - return { - "alias": "cli-metagenomics-001", - "STUDY_TITLE": "CLI Metagenomics Test Study", - "STUDY_ABSTRACT": "Abstract for CLI test.", - "existing_study_type": "Metagenomics", - } - - -class TestMainCli: - """CLI integration tests for main() using CliRunner.""" - - _CRED_TARGET = "submit_study.common.get_credentials" - _SUBMIT_TARGET = "submit_study.common.submit_xml" - - def _invoke( - self, - runner: CliRunner, - args: list[str], - input_filename: str, - input_content: str, - ) -> Any: - """Write input file and invoke the CLI. - - Args: - runner: Click CliRunner instance. - args: CLI arguments (excluding --input, which is added automatically). - input_filename: Filename for the temporary input file. - input_content: Content to write to the input file. - - Returns: - Click Result object. 
- """ - with runner.isolated_filesystem(): - Path(input_filename).write_text(input_content) - result = runner.invoke( - main, - ["--input", input_filename] + args, - catch_exceptions=False, - ) - return result - - # ---- E19: JSON input, automated mode, dry-run ------------------------- - - def test_json_input_automated_dry_run_exits_0( - self, - runner: CliRunner, - minimal_metagenomics_study: dict[str, Any], - ) -> None: - """JSON input with --automated --dry-run exits 0 and output has 'submitted' key.""" - content = _make_study_json(minimal_metagenomics_study) - with patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")): - result = self._invoke( - runner, ["--automated", "--dry-run"], "studies.json", content - ) - assert result.exit_code == 0, f"stdout: {result.output}" - data = _extract_json_from_output(result.output) - assert "submitted" in data - - # ---- E20: CSV input --------------------------------------------------- - - def test_csv_input_automated_dry_run_exits_0( - self, - runner: CliRunner, - minimal_metagenomics_study: dict[str, Any], - ) -> None: - """CSV input with --automated --dry-run exits 0 and output has 'submitted' key.""" - content = _make_study_csv(minimal_metagenomics_study) - with patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")): - result = self._invoke( - runner, ["--automated", "--dry-run"], "studies.csv", content - ) - assert result.exit_code == 0, f"stdout: {result.output}" - data = _extract_json_from_output(result.output) - assert "submitted" in data - - # ---- E21: TSV input --------------------------------------------------- - - def test_tsv_input_automated_dry_run_exits_0( - self, - runner: CliRunner, - minimal_metagenomics_study: dict[str, Any], - ) -> None: - """TSV input with --automated --dry-run exits 0 and output has 'submitted' key.""" - content = _make_study_tsv(minimal_metagenomics_study) - with patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")): - result = self._invoke( - runner, 
["--automated", "--dry-run"], "studies.tsv", content - ) - assert result.exit_code == 0, f"stdout: {result.output}" - data = _extract_json_from_output(result.output) - assert "submitted" in data - - # ---- E22: Duplicate detection ----------------------------------------- - - def test_duplicate_detection_records_duplicate_and_skips_submission( - self, - runner: CliRunner, - minimal_metagenomics_study: dict[str, Any], - ) -> None: - """When account already has a matching study, duplicate is recorded; nothing submitted.""" - existing = { - "title": minimal_metagenomics_study["STUDY_TITLE"], - "alias": minimal_metagenomics_study["alias"], - "accession": "PRJEB55555", - "secondary_accession": "ERP055555", - "status": "PRIVATE", - } - content = _make_study_json(minimal_metagenomics_study) - with runner.isolated_filesystem(): - Path("studies.json").write_text(content) - with ( - patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")), - patch( - "submit_study.fetch_account_studies", - return_value=[existing], - ), - ): - result = runner.invoke( - main, - ["--input", "studies.json"], - catch_exceptions=False, - ) - assert result.exit_code == 0, f"stdout: {result.output}" - data = _extract_json_from_output(result.output) - assert len(data["duplicates"]) == 1 - assert data["duplicates"][0]["existing_accession"] == "PRJEB55555" - assert data["submitted"] == [] - - # ---- E23: --force with duplicate triggers MODIFY ---------------------- - - def test_force_flag_with_duplicate_triggers_modify( - self, - runner: CliRunner, - minimal_metagenomics_study: dict[str, Any], - ) -> None: - """--force with a detected duplicate triggers MODIFY and study appears in 'modified'.""" - existing = { - "title": minimal_metagenomics_study["STUDY_TITLE"], - "alias": minimal_metagenomics_study["alias"], - "accession": "PRJEB66666", - "secondary_accession": "ERP066666", - "status": "PRIVATE", - } - receipt_xml = ET.fromstring( - '' - '' - "" - ) - content = 
_make_study_json(minimal_metagenomics_study) - with runner.isolated_filesystem(): - Path("studies.json").write_text(content) - with ( - patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")), - patch( - "submit_study.fetch_account_studies", - return_value=[existing], - ), - patch(self._SUBMIT_TARGET, return_value=receipt_xml), - ): - result = runner.invoke( - main, - ["--input", "studies.json", "--force"], - catch_exceptions=False, - ) - assert result.exit_code == 0, f"stdout: {result.output}" - data = _extract_json_from_output(result.output) - assert len(data["modified"]) == 1 - assert data["modified"][0]["accession"] == "PRJEB66666" - - # ---- E24: Failed submission exits 1 ----------------------------------- - - def test_failed_submission_exits_1( - self, - runner: CliRunner, - minimal_metagenomics_study: dict[str, Any], - ) -> None: - """When common.submit_xml raises HTTPError, the CLI exits with code 1.""" - import requests - - content = _make_study_json(minimal_metagenomics_study) - http_error = requests.exceptions.HTTPError(response=MagicMock(status_code=500, text="err")) - with runner.isolated_filesystem(): - Path("studies.json").write_text(content) - with ( - patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")), - patch(self._SUBMIT_TARGET, side_effect=http_error), - ): - result = runner.invoke( - main, - ["--input", "studies.json", "--automated"], - catch_exceptions=False, - ) - assert result.exit_code == 1 - - # ---- E25: MAG/genome study dry-run XML contains both PROJECT_ATTRIBUTEs --- - - def test_mag_genome_study_dry_run_xml_has_both_attributes( - self, - runner: CliRunner, - ) -> None: - """MAG/genome study with existing_study_type=Other produces both PROJECT_ATTRIBUTEs.""" - study = { - "alias": "mag-001", - "STUDY_TITLE": "MAG Genome Study", - "existing_study_type": "Other", - "new_study_type": "Genome Sequencing", - } - content = _make_study_json(study) - with runner.isolated_filesystem(): - 
Path("studies.json").write_text(content) - with patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")): - result = runner.invoke( - main, - ["--input", "studies.json", "--automated", "--dry-run"], - catch_exceptions=False, - ) - assert result.exit_code == 0, f"output: {result.output}" - data = _extract_json_from_output(result.output) - assert "submitted" in data - # Also verify the XML would contain both attributes by building it directly - root = build_submission_xml([study]) - tags = [el.text for el in root.findall(".//PROJECT_ATTRIBUTE/TAG") if el.text] - assert "existing_study_type" in tags - assert "new_study_type" in tags - - # ---- E26: --hold-until date present in XML ---------------------------- - - def test_hold_until_date_appears_in_submission_xml( - self, - runner: CliRunner, - minimal_metagenomics_study: dict[str, Any], - ) -> None: - """--hold-until date is present in the HOLD element of the generated XML.""" - study = dict(minimal_metagenomics_study) - root = build_submission_xml([study], hold_until="2027-12-31") - hold_el = root.find(".//HOLD") - assert hold_el is not None - assert hold_el.get("HoldUntilDate") == "2027-12-31" - - def test_hold_until_cli_flag_passes_validation( - self, - runner: CliRunner, - minimal_metagenomics_study: dict[str, Any], - ) -> None: - """CLI --hold-until with a valid future date exits 0 in dry-run mode.""" - content = _make_study_json(minimal_metagenomics_study) - with patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")): - result = self._invoke( - runner, - ["--automated", "--dry-run", "--hold-until", "2027-06-01"], - "studies.json", - content, - ) - assert result.exit_code == 0, f"output: {result.output}" - - # ---- E27: --output writes results to file ----------------------------- - - def test_output_flag_writes_results_to_file( - self, - runner: CliRunner, - minimal_metagenomics_study: dict[str, Any], - ) -> None: - """--output flag writes JSON results to a file rather than stdout.""" - content = 
_make_study_json(minimal_metagenomics_study) - with runner.isolated_filesystem(): - Path("studies.json").write_text(content) - with patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")): - result = runner.invoke( - main, - ["--input", "studies.json", "--automated", "--dry-run", - "--output", "results.json"], - catch_exceptions=False, - ) - assert result.exit_code == 0, f"stdout: {result.output}" - # With --output, the JSON results go to file, not stdout (stdout has only logging). - results_path = Path("results.json") - assert results_path.exists(), "results.json was not created" - data = json.loads(results_path.read_text()) - assert "submitted" in data - - # ---- E28: --test flag routes to test base URL ------------------------- - - def test_test_flag_uses_test_base_url( - self, - runner: CliRunner, - minimal_metagenomics_study: dict[str, Any], - ) -> None: - """--test flag results in the test base URL being used for submission.""" - receipt_xml = ET.fromstring( - '' - '' - "" - ) - content = _make_study_json(minimal_metagenomics_study) - with runner.isolated_filesystem(): - Path("studies.json").write_text(content) - with ( - patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")), - patch(self._SUBMIT_TARGET, return_value=receipt_xml) as mock_submit, - ): - result = runner.invoke( - main, - ["--input", "studies.json", "--automated", "--test"], - catch_exceptions=False, - ) - assert result.exit_code == 0, f"stdout: {result.output}" - assert mock_submit.called - called_url = mock_submit.call_args[0][0] - assert "wwwdev" in called_url, f"Expected test URL; got {called_url}" - - def test_no_test_flag_uses_production_base_url( - self, - runner: CliRunner, - minimal_metagenomics_study: dict[str, Any], - ) -> None: - """Without --test flag, the production base URL is used.""" - receipt_xml = ET.fromstring( - '' - '' - "" - ) - content = _make_study_json(minimal_metagenomics_study) - with runner.isolated_filesystem(): - 
Path("studies.json").write_text(content) - with ( - patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")), - patch(self._SUBMIT_TARGET, return_value=receipt_xml) as mock_submit, - ): - result = runner.invoke( - main, - ["--input", "studies.json", "--automated"], - catch_exceptions=False, - ) - assert result.exit_code == 0, f"stdout: {result.output}" - assert mock_submit.called - called_url = mock_submit.call_args[0][0] - assert "wwwdev" not in called_url, f"Expected prod URL; got {called_url}" - - -# --------------------------------------------------------------------------- -# Parametrized study-type cases -# --------------------------------------------------------------------------- - - -@pytest.mark.parametrize( - "study_type,new_type,expect_new_type", - [ - ("Metagenomics", None, False), - ("RNASeq", None, False), - ("Population Genomics", None, False), - ("Other", "Genome Sequencing", True), - ("Other", "Transcriptome Analysis", True), - ("Other", None, False), - ], -) -def test_project_attribute_new_study_type_conditional( - study_type: str, - new_type: str | None, - expect_new_type: bool, -) -> None: - """new_study_type attribute appears iff existing_study_type=='Other' and new_type is set. - - Args: - study_type: Value for existing_study_type. - new_type: Value for new_study_type (or None). - expect_new_type: Whether new_study_type should appear in the XML. 
- """ - study: dict[str, Any] = { - "alias": "param-test", - "STUDY_TITLE": "Parametrized Study", - "existing_study_type": study_type, - } - if new_type is not None: - study["new_study_type"] = new_type - - root = build_submission_xml([study]) - tags = [el.text for el in root.findall(".//PROJECT_ATTRIBUTE/TAG") if el.text] - if expect_new_type: - assert "new_study_type" in tags, ( - f"Expected new_study_type in tags for {study_type!r} / {new_type!r}" - ) - else: - assert "new_study_type" not in tags, ( - f"Did not expect new_study_type in tags for {study_type!r} / {new_type!r}" - ) - - -@pytest.mark.parametrize( - "hold_until,expect_hold", - [ - ("2027-03-01", True), - ("2028-12-31", True), - (None, False), - ], -) -def test_hold_until_element_conditional(hold_until: str | None, expect_hold: bool) -> None: - """HOLD element appears iff hold_until is provided. - - Args: - hold_until: The hold-until date string, or None. - expect_hold: Whether the HOLD element should appear. - """ - study = {"alias": "hold-test", "STUDY_TITLE": "Hold Date Test"} - root = build_submission_xml([study], hold_until=hold_until) - hold_el = root.find(".//HOLD") - if expect_hold: - assert hold_el is not None - assert hold_el.get("HoldUntilDate") == hold_until - else: - assert hold_el is None - - -@pytest.mark.parametrize("action", ["ADD", "MODIFY"]) -def test_submission_action_element_present(action: str) -> None: - """The correct action element (ADD or MODIFY) appears in the SUBMISSION. - - Args: - action: The submission action string. 
- """ - study = {"alias": "action-test", "STUDY_TITLE": "Action Test"} - root = build_submission_xml([study], action=action) - xml_str = ET.tostring(root, encoding="unicode") - assert f"<{action}" in xml_str or f"<{action}/>" in xml_str - opposite = "MODIFY" if action == "ADD" else "ADD" - assert f"<{opposite}" not in xml_str From d2a78b6b60e4107d9905218bfce4ebee563de8b8 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Fri, 13 Mar 2026 12:11:59 +0000 Subject: [PATCH 10/36] In response to PR comments, merged study_submit.py python scripts into one, trialling sanitizeOutput for test snapshot. --- .gitignore | 4 +- bin/ena_submit_common.py | 677 ----------------- bin/submit_study.py | 693 +++++++++++++++++- .../local/registerstudy/tests/main.nf.test | 2 +- 4 files changed, 672 insertions(+), 704 deletions(-) delete mode 100644 bin/ena_submit_common.py diff --git a/.gitignore b/.gitignore index d8c4dbb..601993a 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,4 @@ testing* null/ .nf-test* .idea/ -test_data -.claude/ -CLAUDE.md +test_data \ No newline at end of file diff --git a/bin/ena_submit_common.py b/bin/ena_submit_common.py deleted file mode 100644 index 89e41ab..0000000 --- a/bin/ena_submit_common.py +++ /dev/null @@ -1,677 +0,0 @@ -"""Shared utilities for ENA submission scripts. - -Provide logging, credential management, file loading, -XSD structural validation, Reports API access, duplicate -detection, XML serialisation, and result output used by -``submit_study.py``, ``submit_sample.py``, and -``submit_reads.py``. 
-""" - -from __future__ import annotations - -import csv -import datetime -import json -import logging -import os -import sys -import xml.etree.ElementTree as ET -from collections.abc import Callable, Sequence -from io import BytesIO -from pathlib import Path -from typing import Any, Final - -import click -import requests -from requests.auth import HTTPBasicAuth - -# All loggers in the ENA submission scripts are children of -# this root, so configuring it once propagates to all. -_LOGGER_NAME: Final = "ena_submit" - -logger = logging.getLogger(_LOGGER_NAME) - - -# ----------------------------------------------------------- -# Constants -# ----------------------------------------------------------- - -PROD_URL: Final = "https://www.ebi.ac.uk/ena/submit/webin-v2" -TEST_URL: Final = "https://wwwdev.ebi.ac.uk/ena/submit/webin-v2" - -_MAX_HOLD_YEARS: Final = 2 - - -# ----------------------------------------------------------- -# Logging -# ----------------------------------------------------------- - - -def setup_logging(log_file: Path | None = None) -> None: - """Configure stderr and optional file logging. - - Attach handlers to the ``ena_submit`` parent logger. - Child loggers (e.g. ``ena_submit.study``) propagate - their messages to these handlers automatically. - - Args: - log_file: Path to a log file. If provided, - debug-level messages are written there in - addition to stderr. - """ - root = logging.getLogger(_LOGGER_NAME) - - # Avoid duplicate handlers on repeated calls. 
- if root.handlers: - return - - fmt = logging.Formatter( - "%(asctime)s [%(levelname)s] %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - ) - root.setLevel(logging.DEBUG) - - stderr_handler = logging.StreamHandler(sys.stderr) - stderr_handler.setLevel(logging.INFO) - stderr_handler.setFormatter(fmt) - root.addHandler(stderr_handler) - - if log_file: - file_handler = logging.FileHandler(log_file) - file_handler.setLevel(logging.DEBUG) - file_handler.setFormatter(fmt) - root.addHandler(file_handler) - - -# ----------------------------------------------------------- -# Credentials -# ----------------------------------------------------------- - - -def get_credentials() -> tuple[str, str]: - """Read ENA credentials from environment variables. - - Returns: - Tuple of (*username*, *password*). - - Raises: - SystemExit: If either variable is unset or empty. - """ - username = os.environ.get("ENA_WEBIN", "").strip() - password = os.environ.get("ENA_WEBIN_PASSWORD", "").strip() - if not username or not password: - logger.error("ENA_WEBIN and ENA_WEBIN_PASSWORD environment variables must be set") - sys.exit(1) - return username, password - - -# ----------------------------------------------------------- -# ENA API helpers -# ----------------------------------------------------------- - - -def get_base_url(use_test: bool) -> str: - """Return the ENA Webin v2 submission base URL.""" - return TEST_URL if use_test else PROD_URL - - -def submit_xml( - base_url: str, - auth: HTTPBasicAuth, - xml_bytes: bytes, -) -> ET.Element: - """Submit an XML document to ENA via Webin v2. - - Args: - base_url: ENA submission service base URL. - auth: HTTP basic-auth credentials. - xml_bytes: Serialised XML submission document. - - Returns: - Parsed receipt XML element tree root. 
- """ - url = f"{base_url}/submit" - headers = { - "Content-Type": "application/xml", - "Accept": "application/xml", - } - resp = requests.post( - url, data=xml_bytes, - headers=headers, auth=auth, timeout=120, - ) - resp.raise_for_status() - return ET.fromstring(resp.content) - - -# ----------------------------------------------------------- -# XML utilities -# ----------------------------------------------------------- - - -def xml_to_bytes(root: ET.Element) -> bytes: - """Serialise an ElementTree element to UTF-8 bytes.""" - tree = ET.ElementTree(root) - buf = BytesIO() - tree.write(buf, encoding="UTF-8", xml_declaration=True) - return buf.getvalue() - - -# ----------------------------------------------------------- -# Hold-until date validation -# ----------------------------------------------------------- - - -def validate_hold_until(hold_until: str) -> datetime.date: - """Parse and validate a hold-until date string. - - Args: - hold_until: Date string in ``YYYY-MM-DD`` format. - - Returns: - Parsed date. - - Raises: - click.BadParameter: If the date format is invalid, - in the past, or more than 2 years from today. - """ - try: - hold_date = datetime.date.fromisoformat(hold_until) - except ValueError: - raise click.BadParameter( - f"Invalid date format: {hold_until!r}. Expected YYYY-MM-DD." - ) from None - - today = datetime.date.today() - max_date = today.replace(year=today.year + _MAX_HOLD_YEARS) - - if hold_date > max_date: - raise click.BadParameter( - f"Hold date {hold_until} is more than {_MAX_HOLD_YEARS} years from today" - f" ({today}). Maximum allowed: {max_date}." - ) - - if hold_date <= today: - raise click.BadParameter( - f"Hold date {hold_until} is not in the future (today is {today})." 
- ) - - return hold_date - - -# ----------------------------------------------------------- -# ENA checklist XML parsing -# ----------------------------------------------------------- - - -def parse_checklist_units( - xml_path: str | Path, -) -> dict[str, str]: - """Parse an ENA checklist XML and return field units. - - Reads the ```` elements from an ENA checklist XML - file (e.g. ``ERC000015.xml``) and returns a mapping from - slot name to unit string for every field that declares a - ```` element. - - Args: - xml_path: Path to the ENA checklist XML file. - - Returns: - Dict mapping slot name to unit string. - Fields without units are absent from the dict. - """ - units: dict[str, str] = {} - try: - tree = ET.parse(str(xml_path)) - except ET.ParseError as exc: - logger.warning( - "Could not parse checklist XML %s: %s", - xml_path, exc, - ) - return units - - for field in tree.iter("FIELD"): - name_el = field.find("NAME") - if name_el is None or not name_el.text: - continue - units_el = field.find("UNITS") - if units_el is None: - continue - unit_el = units_el.find("UNIT") - if unit_el is None or not unit_el.text: - continue - units[name_el.text.strip()] = unit_el.text.strip() - - return units - - -# ----------------------------------------------------------- -# XSD validation (structural fallback only) -# ----------------------------------------------------------- - - -def validate_xml_against_xsd( - xml_bytes: bytes, - _fragment_tag: str | None = None, # unused; kept for API compatibility - fallback_checker: Callable[ - [bytes, list[str]], tuple[bool, list[str]] - ] | None = None, -) -> tuple[bool, list[str]]: - """Validate XML bytes using a structural check. - - Full XSD validation via lxml is not available in this - container. Uses *fallback_checker* if provided, - otherwise checks that the document is well-formed XML. - - Args: - xml_bytes: Serialised XML document. - _fragment_tag: Unused; kept for API compatibility. 
- fallback_checker: Optional function called with - (*xml_bytes*, *messages*) that returns - (*is_valid*, *messages*). - - Returns: - Tuple of (*is_valid*, *messages*). - """ - messages: list[str] = [] - - if fallback_checker is not None: - return fallback_checker(xml_bytes, messages) - - try: - ET.fromstring(xml_bytes) - except ET.ParseError as exc: - messages.append( - f"ERROR: XML is not well-formed: {exc}" - ) - return False, messages - - messages.append( - "XML is well-formed (basic check passed)" - ) - return True, messages - - -# ----------------------------------------------------------- -# File loading (JSON, CSV, TSV) -# ----------------------------------------------------------- - - -def _is_metadata_row(row: Sequence[object]) -> bool: - """Check whether *row* is a DataHarmonizer label row. - - These rows have at most one non-empty cell. - """ - non_empty = sum( - 1 for c in row - if c is not None and str(c).strip() - ) - return non_empty <= 1 - - -def extract_records_from_tabular( - filepath: str | Path, - delimiter: str = ",", -) -> list[dict[str, str]]: - """Extract record dicts from a CSV or TSV file. - - Skip an optional DataHarmonizer metadata row if - detected. - - Args: - filepath: Path to the tabular file. - delimiter: Column delimiter character. - - Returns: - List of record dicts. 
- """ - with open(filepath, newline="", encoding="utf-8") as fh: - rows = list(csv.reader(fh, delimiter=delimiter)) - - if not rows: - return [] - - idx = 0 - if _is_metadata_row(rows[idx]): - idx += 1 - if idx >= len(rows): - return [] - - headers = rows[idx] - idx += 1 - - records: list[dict[str, str]] = [] - for row in rows[idx:]: - record: dict[str, str] = {} - for col, val in zip(headers, row): - col = col.strip() - if col and val is not None and val.strip(): - record[col] = val.strip() - if record: - records.append(record) - - return records - - -def extract_records_from_json( - input_data: object, - record_keys: Sequence[str] = ("data",), -) -> list[dict[str, Any]] | None: - """Extract record dicts from a DataHarmonizer JSON export. - - Handle several JSON shapes: - - * DataHarmonizer Container format:: - - {"Container": {"s": [{...}, ...]}} - - * Plain list of dicts. - * Dict with an entity-specific key or ``data`` key. - * Single record object (no wrapper). - - Args: - input_data: Parsed JSON data (any shape). - record_keys: Dict keys to check for record lists - (e.g. ``["studies", "data"]``). - - Returns: - List of record dicts, or ``None`` if unrecognised. - """ - if isinstance(input_data, list): - return input_data - - if isinstance(input_data, dict): - container = input_data.get("Container") - if isinstance(container, dict): - for key, val in container.items(): - if isinstance(val, list): - logger.info("Extracted records from Container.%s", key) - return val - - for key in record_keys: - if key in input_data: - return input_data[key] - - return [input_data] - - return None - - -def load_input_file( - filepath: str | Path, - json_record_keys: Sequence[str] = ("data",), -) -> list[dict[str, Any]] | None: - """Load records from a supported file format. - - Supported formats: JSON, CSV, TSV. - - Args: - filepath: Path to the input file. - json_record_keys: Dict keys to check when parsing - JSON (e.g. ``["studies", "data"]``). 
- - Returns: - List of record dicts, or ``None`` if the format is - unrecognised. - """ - ext = Path(filepath).suffix.lower() - if ext == ".json": - with open(filepath) as fh: - input_data = json.load(fh) - return extract_records_from_json( - input_data, json_record_keys, - ) - if ext == ".csv": - return extract_records_from_tabular( - filepath, delimiter=",", - ) - if ext == ".tsv": - return extract_records_from_tabular( - filepath, delimiter="\t", - ) - return None - - -# ----------------------------------------------------------- -# Reports API -# ----------------------------------------------------------- - - -def fetch_from_reports_endpoint( - url: str, - auth: HTTPBasicAuth, - max_results: int = 5000, -) -> list[dict[str, Any]] | None: - """Fetch records from a single Webin Reports endpoint. - - Args: - url: Full URL of the reports endpoint. - auth: HTTP basic-auth credentials. - max_results: Maximum number of results to request. - - Returns: - List of raw report dicts, or ``None`` on error. 
- """ - params = { - "format": "json", - "max-results": max_results, - } - - req = requests.Request("GET", url, params=params, auth=auth) - prepared = req.prepare() - logger.debug('curl -u %s:*** "%s"', auth.username, prepared.url) - - try: - resp = requests.get(url, params=params, auth=auth, timeout=60) - logger.info("Reports API at %s returned %s", url, resp.status_code) - resp.raise_for_status() - return resp.json() - - except requests.exceptions.HTTPError as exc: - status = ( - exc.response.status_code - if exc.response is not None - else "unknown" - ) - if status == 404: - logger.info("Reports API at %s returned 404 — no records yet", url) - return [] - if status in (401, 403): - logger.warning( - "Reports API at %s returned %s — endpoint may not be available" - " or credentials may differ", - url, status, - ) - return None - logger.warning("Reports API at %s returned HTTP %s", url, status) - return None - - except requests.exceptions.RequestException as exc: - logger.warning("Reports API at %s failed: %s", url, exc) - return None - - -def fetch_account_records( - auth: HTTPBasicAuth, - use_test: bool, - prod_url: str, - test_url: str, - normalizer: Callable[ - [dict[str, Any]], dict[str, str] | None - ], - entity_label: str, - max_results: int = 5000, -) -> list[dict[str, str]]: - """Fetch and normalise records from the Reports API. - - Try test endpoint first (if *use_test*), then fall back - to production. - - Args: - auth: HTTP basic-auth credentials. - use_test: Try the test endpoint first. - prod_url: Production reports endpoint URL. - test_url: Test reports endpoint URL. - normalizer: Callable that maps a raw report dict to - a normalised dict, or ``None`` to skip. - entity_label: Label for log messages (e.g. - ``"studies"``). - max_results: Maximum number of results to request. - - Returns: - List of normalised record dicts. 
- """ - urls = ( - [test_url, prod_url] if use_test - else [prod_url] - ) - - for url in urls: - logger.info("Fetching account %s from: %s", entity_label, url) - raw = fetch_from_reports_endpoint(url, auth, max_results) - if raw is None: - continue - - records: list[dict[str, str]] = [] - for entry in raw: - report = entry.get("report") - if report is None: - continue - normalized = normalizer(report) - if normalized is not None: - records.append(normalized) - - logger.info("Found %d %s in account", len(records), entity_label) - return records - - logger.warning( - "Could not reach any Webin reports endpoint. Duplicate checking for %s will be skipped.", - entity_label, - ) - return [] - - -# ----------------------------------------------------------- -# Duplicate detection (alias + title matching) -# ----------------------------------------------------------- - - -def find_duplicates_by_alias_title( - new_records: Sequence[dict[str, Any]], - account_records: Sequence[dict[str, str]], - title_field: str, - entity_label: str, -) -> dict[int, dict[str, str]]: - """Check new records against account records. - - Match by ``alias`` (preferred) or by the entity-specific - title field against the pre-fetched account records from - the Webin Reports API. - - Args: - new_records: Records the user wants to submit. - account_records: Existing records already registered - under the Webin account. - title_field: Field name for the title in new records - (e.g. ``"STUDY_TITLE"`` or ``"SAMPLE_TITLE"``). - entity_label: Label for log messages. - - Returns: - Mapping of index in *new_records* to matching - existing record info. 
- """ - duplicates: dict[int, dict[str, str]] = {} - total = len(new_records) - - if not account_records: - return duplicates - - by_title: dict[str, dict[str, str]] = {} - by_alias: dict[str, dict[str, str]] = {} - for rec in account_records: - title = (rec.get("title") or "").strip() - alias = (rec.get("alias") or "").strip() - if title: - by_title[title] = rec - if alias: - by_alias[alias] = rec - - logger.info( - "Checking %d new %s against %d existing account %s...", - total, entity_label, len(account_records), entity_label, - ) - - for i, record in enumerate(new_records): - new_title = ( - record.get(title_field) or "" - ).strip() - new_alias = (record.get("alias") or "").strip() - - if not new_title and not new_alias: - continue - - match = _match_by_alias_title( - new_alias, new_title, by_alias, by_title, - ) - if match is not None: - duplicates[i] = match - logger.info( - " Duplicate: '%s' matches %s -> %s (%s)", - new_title or new_alias, - match["match_reason"], - match["accession"], - match["status"], - ) - - if len(duplicates) == total: - logger.info("All %s are duplicates — skipping further checks", entity_label) - return duplicates - - return duplicates - - -def _match_by_alias_title( - new_alias: str, - new_title: str, - by_alias: dict[str, dict[str, str]], - by_title: dict[str, dict[str, str]], -) -> dict[str, str] | None: - """Return matching record info or ``None``.""" - if new_alias and new_alias in by_alias: - rec = by_alias[new_alias] - reason = f"alias '{new_alias}'" - elif new_title and new_title in by_title: - rec = by_title[new_title] - reason = f"title '{new_title}'" - else: - return None - - return { - "accession": rec.get("accession", ""), - "secondary_accession": rec.get( - "secondary_accession", "" - ), - "alias": rec.get("alias", ""), - "title": rec.get("title", ""), - "status": rec.get("status", "UNKNOWN"), - "match_reason": reason, - } - - -# ----------------------------------------------------------- -# Result output -# 
----------------------------------------------------------- - - -def write_results( - results: dict[str, list[dict[str, Any]]], - output_path: Path | None, -) -> None: - """Write JSON results to file or stdout.""" - json_str = json.dumps(results, indent=2) - if output_path: - with open(output_path, "w") as fh: - fh.write(json_str + "\n") - logger.info("Results written to %s", output_path) - else: - print(json_str) diff --git a/bin/submit_study.py b/bin/submit_study.py index ae72d69..463318d 100755 --- a/bin/submit_study.py +++ b/bin/submit_study.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 -"""Submit raw-reads, assembly and genome studies to ENA -via the Webin REST API v2. +"""Submit raw-reads, assembly and genome studies to ENA via the Webin REST API v2. Read a DataHarmonizer export containing study metadata, check for duplicate studies already registered under the @@ -15,27 +14,32 @@ Usage:: - python bin/submit_study.py \ - --input studies.json \ + python bin/submit_study.py \\ + --input studies.json \\ --test # With hold date (max 2 years): - python bin/submit_study.py \ - --input studies.json \ + python bin/submit_study.py \\ + --input studies.json \\ --hold-until 2028-01-01 # Log to file: - python bin/submit_study.py \ - --input studies.json \ + python bin/submit_study.py \\ + --input studies.json \\ --test --log submission.log """ from __future__ import annotations +import csv import datetime +import json import logging +import os import sys import xml.etree.ElementTree as ET +from collections.abc import Callable, Sequence +from io import BytesIO from pathlib import Path from typing import Any, Final @@ -43,9 +47,652 @@ import requests from requests.auth import HTTPBasicAuth -import ena_submit_common as common -logger = logging.getLogger("ena_submit.rawreads_study") +# ----------------------------------------------------------- +# Logging +# ----------------------------------------------------------- + +# All loggers in the ENA submission scripts share this 
root, +# so configuring it once propagates to all child loggers. +_LOGGER_NAME: Final = "ena_submit" + +logger = logging.getLogger("ena_submit.study") + + +def setup_logging(log_file: Path | None = None) -> None: + """Configure stderr and optional file logging. + + Attach handlers to the ``ena_submit`` parent logger. + Child loggers (e.g. ``ena_submit.study``) propagate + their messages to these handlers automatically. + + Args: + log_file: Path to a log file. If provided, + debug-level messages are written there in + addition to stderr. + """ + root = logging.getLogger(_LOGGER_NAME) + + # Avoid duplicate handlers on repeated calls. + if root.handlers: + return + + fmt = logging.Formatter( + "%(asctime)s [%(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + root.setLevel(logging.DEBUG) + + stderr_handler = logging.StreamHandler(sys.stderr) + stderr_handler.setLevel(logging.INFO) + stderr_handler.setFormatter(fmt) + root.addHandler(stderr_handler) + + if log_file: + file_handler = logging.FileHandler(log_file) + file_handler.setLevel(logging.DEBUG) + file_handler.setFormatter(fmt) + root.addHandler(file_handler) + + +# ----------------------------------------------------------- +# Credentials +# ----------------------------------------------------------- + + +def get_credentials() -> tuple[str, str]: + """Read ENA credentials from environment variables. + + Returns: + Tuple of (*username*, *password*). + + Raises: + SystemExit: If either variable is unset or empty. 
+ """ + username = os.environ.get("ENA_WEBIN", "").strip() + password = os.environ.get("ENA_WEBIN_PASSWORD", "").strip() + if not username or not password: + logger.error("ENA_WEBIN and ENA_WEBIN_PASSWORD environment variables must be set") + sys.exit(1) + return username, password + + +# ----------------------------------------------------------- +# ENA API helpers +# ----------------------------------------------------------- + +PROD_URL: Final = "https://www.ebi.ac.uk/ena/submit/webin-v2" +TEST_URL: Final = "https://wwwdev.ebi.ac.uk/ena/submit/webin-v2" + + +def get_base_url(use_test: bool) -> str: + """Return the ENA Webin v2 submission base URL.""" + return TEST_URL if use_test else PROD_URL + + +def submit_xml( + base_url: str, + auth: HTTPBasicAuth, + xml_bytes: bytes, +) -> ET.Element: + """Submit an XML document to ENA via Webin v2. + + Args: + base_url: ENA submission service base URL. + auth: HTTP basic-auth credentials. + xml_bytes: Serialised XML submission document. + + Returns: + Parsed receipt XML element tree root. 
+ """ + url = f"{base_url}/submit" + headers = { + "Content-Type": "application/xml", + "Accept": "application/xml", + } + resp = requests.post( + url, data=xml_bytes, + headers=headers, auth=auth, timeout=120, + ) + resp.raise_for_status() + return ET.fromstring(resp.content) + + +# ----------------------------------------------------------- +# XML utilities +# ----------------------------------------------------------- + + +def xml_to_bytes(root: ET.Element) -> bytes: + """Serialise an ElementTree element to UTF-8 bytes.""" + tree = ET.ElementTree(root) + buf = BytesIO() + tree.write(buf, encoding="UTF-8", xml_declaration=True) + return buf.getvalue() + + +# ----------------------------------------------------------- +# Hold-until date validation +# ----------------------------------------------------------- + +_MAX_HOLD_YEARS: Final = 2 + + +def validate_hold_until(hold_until: str) -> datetime.date: + """Parse and validate a hold-until date string. + + Args: + hold_until: Date string in ``YYYY-MM-DD`` format. + + Returns: + Parsed date. + + Raises: + click.BadParameter: If the date format is invalid, + in the past, or more than 2 years from today. + """ + try: + hold_date = datetime.date.fromisoformat(hold_until) + except ValueError: + raise click.BadParameter( + f"Invalid date format: {hold_until!r}. Expected YYYY-MM-DD." + ) from None + + today = datetime.date.today() + max_date = today.replace(year=today.year + _MAX_HOLD_YEARS) + + if hold_date > max_date: + raise click.BadParameter( + f"Hold date {hold_until} is more than {_MAX_HOLD_YEARS} years from today" + f" ({today}). Maximum allowed: {max_date}." + ) + + if hold_date <= today: + raise click.BadParameter( + f"Hold date {hold_until} is not in the future (today is {today})." 
+ ) + + return hold_date + + +# ----------------------------------------------------------- +# ENA checklist XML parsing +# ----------------------------------------------------------- + + +def parse_checklist_units( + xml_path: str | Path, +) -> dict[str, str]: + """Parse an ENA checklist XML and return field units. + + Reads the ```` elements from an ENA checklist XML + file (e.g. ``ERC000015.xml``) and returns a mapping from + slot name to unit string for every field that declares a + ```` element. + + Args: + xml_path: Path to the ENA checklist XML file. + + Returns: + Dict mapping slot name to unit string. + Fields without units are absent from the dict. + """ + units: dict[str, str] = {} + try: + tree = ET.parse(str(xml_path)) + except ET.ParseError as exc: + logger.warning( + "Could not parse checklist XML %s: %s", + xml_path, exc, + ) + return units + + for field in tree.iter("FIELD"): + name_el = field.find("NAME") + if name_el is None or not name_el.text: + continue + units_el = field.find("UNITS") + if units_el is None: + continue + unit_el = units_el.find("UNIT") + if unit_el is None or not unit_el.text: + continue + units[name_el.text.strip()] = unit_el.text.strip() + + return units + + +# ----------------------------------------------------------- +# XSD validation (structural fallback only) +# ----------------------------------------------------------- + + +def validate_xml_against_xsd( + xml_bytes: bytes, + _fragment_tag: str | None = None, # unused; kept for API compatibility + fallback_checker: Callable[ + [bytes, list[str]], tuple[bool, list[str]] + ] | None = None, +) -> tuple[bool, list[str]]: + """Validate XML bytes using a structural check. + + Full XSD validation via lxml is not available in this + container. Uses *fallback_checker* if provided, + otherwise checks that the document is well-formed XML. + + Args: + xml_bytes: Serialised XML document. + _fragment_tag: Unused; kept for API compatibility. 
+ fallback_checker: Optional function called with + (*xml_bytes*, *messages*) that returns + (*is_valid*, *messages*). + + Returns: + Tuple of (*is_valid*, *messages*). + """ + messages: list[str] = [] + + if fallback_checker is not None: + return fallback_checker(xml_bytes, messages) + + try: + ET.fromstring(xml_bytes) + except ET.ParseError as exc: + messages.append( + f"ERROR: XML is not well-formed: {exc}" + ) + return False, messages + + messages.append( + "XML is well-formed (basic check passed)" + ) + return True, messages + + +# ----------------------------------------------------------- +# File loading (JSON, CSV, TSV) +# ----------------------------------------------------------- + + +def _is_metadata_row(row: Sequence[object]) -> bool: + """Check whether *row* is a DataHarmonizer label row. + + These rows have at most one non-empty cell. + """ + non_empty = sum( + 1 for c in row + if c is not None and str(c).strip() + ) + return non_empty <= 1 + + +def extract_records_from_tabular( + filepath: str | Path, + delimiter: str = ",", +) -> list[dict[str, str]]: + """Extract record dicts from a CSV or TSV file. + + Skip an optional DataHarmonizer metadata row if + detected. + + Args: + filepath: Path to the tabular file. + delimiter: Column delimiter character. + + Returns: + List of record dicts. 
+ """ + with open(filepath, newline="", encoding="utf-8") as fh: + rows = list(csv.reader(fh, delimiter=delimiter)) + + if not rows: + return [] + + idx = 0 + if _is_metadata_row(rows[idx]): + idx += 1 + if idx >= len(rows): + return [] + + headers = rows[idx] + idx += 1 + + records: list[dict[str, str]] = [] + for row in rows[idx:]: + record: dict[str, str] = {} + for col, val in zip(headers, row): + col = col.strip() + if col and val is not None and val.strip(): + record[col] = val.strip() + if record: + records.append(record) + + return records + + +def extract_records_from_json( + input_data: object, + record_keys: Sequence[str] = ("data",), +) -> list[dict[str, Any]] | None: + """Extract record dicts from a DataHarmonizer JSON export. + + Handle several JSON shapes: + + * DataHarmonizer Container format:: + + {"Container": {"s": [{...}, ...]}} + + * Plain list of dicts. + * Dict with an entity-specific key or ``data`` key. + * Single record object (no wrapper). + + Args: + input_data: Parsed JSON data (any shape). + record_keys: Dict keys to check for record lists + (e.g. ``["studies", "data"]``). + + Returns: + List of record dicts, or ``None`` if unrecognised. + """ + if isinstance(input_data, list): + return input_data + + if isinstance(input_data, dict): + container = input_data.get("Container") + if isinstance(container, dict): + for key, val in container.items(): + if isinstance(val, list): + logger.info("Extracted records from Container.%s", key) + return val + + for key in record_keys: + if key in input_data: + return input_data[key] + + return [input_data] + + return None + + +def load_input_file( + filepath: str | Path, + json_record_keys: Sequence[str] = ("data",), +) -> list[dict[str, Any]] | None: + """Load records from a supported file format. + + Supported formats: JSON, CSV, TSV. + + Args: + filepath: Path to the input file. + json_record_keys: Dict keys to check when parsing + JSON (e.g. ``["studies", "data"]``). 
+ + Returns: + List of record dicts, or ``None`` if the format is + unrecognised. + """ + ext = Path(filepath).suffix.lower() + if ext == ".json": + with open(filepath) as fh: + input_data = json.load(fh) + return extract_records_from_json( + input_data, json_record_keys, + ) + if ext == ".csv": + return extract_records_from_tabular( + filepath, delimiter=",", + ) + if ext == ".tsv": + return extract_records_from_tabular( + filepath, delimiter="\t", + ) + return None + + +# ----------------------------------------------------------- +# Reports API +# ----------------------------------------------------------- + + +def fetch_from_reports_endpoint( + url: str, + auth: HTTPBasicAuth, + max_results: int = 5000, +) -> list[dict[str, Any]] | None: + """Fetch records from a single Webin Reports endpoint. + + Args: + url: Full URL of the reports endpoint. + auth: HTTP basic-auth credentials. + max_results: Maximum number of results to request. + + Returns: + List of raw report dicts, or ``None`` on error. 
+ """ + params = { + "format": "json", + "max-results": max_results, + } + + req = requests.Request("GET", url, params=params, auth=auth) + prepared = req.prepare() + logger.debug('curl -u %s:*** "%s"', auth.username, prepared.url) + + try: + resp = requests.get(url, params=params, auth=auth, timeout=60) + logger.info("Reports API at %s returned %s", url, resp.status_code) + resp.raise_for_status() + return resp.json() + + except requests.exceptions.HTTPError as exc: + status = ( + exc.response.status_code + if exc.response is not None + else "unknown" + ) + if status == 404: + logger.info("Reports API at %s returned 404 — no records yet", url) + return [] + if status in (401, 403): + logger.warning( + "Reports API at %s returned %s — endpoint may not be available" + " or credentials may differ", + url, status, + ) + return None + logger.warning("Reports API at %s returned HTTP %s", url, status) + return None + + except requests.exceptions.RequestException as exc: + logger.warning("Reports API at %s failed: %s", url, exc) + return None + + +def fetch_account_records( + auth: HTTPBasicAuth, + use_test: bool, + prod_url: str, + test_url: str, + normalizer: Callable[ + [dict[str, Any]], dict[str, str] | None + ], + entity_label: str, + max_results: int = 5000, +) -> list[dict[str, str]]: + """Fetch and normalise records from the Reports API. + + Try test endpoint first (if *use_test*), then fall back + to production. + + Args: + auth: HTTP basic-auth credentials. + use_test: Try the test endpoint first. + prod_url: Production reports endpoint URL. + test_url: Test reports endpoint URL. + normalizer: Callable that maps a raw report dict to + a normalised dict, or ``None`` to skip. + entity_label: Label for log messages (e.g. + ``"studies"``). + max_results: Maximum number of results to request. + + Returns: + List of normalised record dicts. 
+ """ + urls = ( + [test_url, prod_url] if use_test + else [prod_url] + ) + + for url in urls: + logger.info("Fetching account %s from: %s", entity_label, url) + raw = fetch_from_reports_endpoint(url, auth, max_results) + if raw is None: + continue + + records: list[dict[str, str]] = [] + for entry in raw: + report = entry.get("report") + if report is None: + continue + normalized = normalizer(report) + if normalized is not None: + records.append(normalized) + + logger.info("Found %d %s in account", len(records), entity_label) + return records + + logger.warning( + "Could not reach any Webin reports endpoint." + " Duplicate checking for %s will be skipped.", + entity_label, + ) + return [] + + +# ----------------------------------------------------------- +# Duplicate detection (alias + title matching) +# ----------------------------------------------------------- + + +def find_duplicates_by_alias_title( + new_records: Sequence[dict[str, Any]], + account_records: Sequence[dict[str, str]], + title_field: str, + entity_label: str, +) -> dict[int, dict[str, str]]: + """Check new records against account records. + + Match by ``alias`` (preferred) or by the entity-specific + title field against the pre-fetched account records from + the Webin Reports API. + + Args: + new_records: Records the user wants to submit. + account_records: Existing records already registered + under the Webin account. + title_field: Field name for the title in new records + (e.g. ``"STUDY_TITLE"`` or ``"SAMPLE_TITLE"``). + entity_label: Label for log messages. + + Returns: + Mapping of index in *new_records* to matching + existing record info. 
+ """ + duplicates: dict[int, dict[str, str]] = {} + total = len(new_records) + + if not account_records: + return duplicates + + by_title: dict[str, dict[str, str]] = {} + by_alias: dict[str, dict[str, str]] = {} + for rec in account_records: + title = (rec.get("title") or "").strip() + alias = (rec.get("alias") or "").strip() + if title: + by_title[title] = rec + if alias: + by_alias[alias] = rec + + logger.info( + "Checking %d new %s against %d existing account %s...", + total, entity_label, len(account_records), entity_label, + ) + + for i, record in enumerate(new_records): + new_title = ( + record.get(title_field) or "" + ).strip() + new_alias = (record.get("alias") or "").strip() + + if not new_title and not new_alias: + continue + + match = _match_by_alias_title( + new_alias, new_title, by_alias, by_title, + ) + if match is not None: + duplicates[i] = match + logger.info( + " Duplicate: '%s' matches %s -> %s (%s)", + new_title or new_alias, + match["match_reason"], + match["accession"], + match["status"], + ) + + if len(duplicates) == total: + logger.info("All %s are duplicates — skipping further checks", entity_label) + return duplicates + + return duplicates + + +def _match_by_alias_title( + new_alias: str, + new_title: str, + by_alias: dict[str, dict[str, str]], + by_title: dict[str, dict[str, str]], +) -> dict[str, str] | None: + """Return matching record info or ``None``.""" + if new_alias and new_alias in by_alias: + rec = by_alias[new_alias] + reason = f"alias '{new_alias}'" + elif new_title and new_title in by_title: + rec = by_title[new_title] + reason = f"title '{new_title}'" + else: + return None + + return { + "accession": rec.get("accession", ""), + "secondary_accession": rec.get( + "secondary_accession", "" + ), + "alias": rec.get("alias", ""), + "title": rec.get("title", ""), + "status": rec.get("status", "UNKNOWN"), + "match_reason": reason, + } + + +# ----------------------------------------------------------- +# Result output +# 
----------------------------------------------------------- + + +def write_results( + results: dict[str, list[dict[str, Any]]], + output_path: Path | None, +) -> None: + """Write JSON results to file or stdout.""" + json_str = json.dumps(results, indent=2) + if output_path: + with open(output_path, "w") as fh: + fh.write(json_str + "\n") + logger.info("Results written to %s", output_path) + else: + print(json_str) # ----------------------------------------------------------- @@ -90,7 +737,7 @@ def fetch_account_studies( Returns: List of normalised study dicts. """ - return common.fetch_account_records( + return fetch_account_records( auth, use_test=use_test, prod_url=_PROD_REPORTS_URL, @@ -114,7 +761,7 @@ def find_duplicate_studies( Returns: Mapping of index to matching study info. """ - return common.find_duplicates_by_alias_title( + return find_duplicates_by_alias_title( new_studies, account_studies, title_field="STUDY_TITLE", entity_label="studies", @@ -288,7 +935,7 @@ def validate_study_xml( Returns: Tuple of (*is_valid*, *messages*). 
""" - return common.validate_xml_against_xsd( + return validate_xml_against_xsd( xml_bytes, fallback_checker=_validate_study_xml_structure, ) @@ -392,7 +1039,7 @@ def _do_submission( logger.info("Submitting %s to ENA (%s)...", action, env_label) try: - receipt_root = common.submit_xml(base_url, auth, xml_bytes) + receipt_root = submit_xml(base_url, auth, xml_bytes) except requests.exceptions.HTTPError as exc: logger.error("HTTP error during %s submission: %s", action, exc) if exc.response is not None: @@ -495,21 +1142,21 @@ def main( force: bool, ) -> None: """Submit studies to ENA via the Webin REST API v2.""" - common.setup_logging(log_file) - username, password = common.get_credentials() + setup_logging(log_file) + username, password = get_credentials() env_label = "TEST" if use_test else "PRODUCTION" logger.info("ENA Study Submission — environment: %s", env_label) - base_url = common.get_base_url(use_test) + base_url = get_base_url(use_test) auth = HTTPBasicAuth(username, password) logger.debug("Auth username: %s", username) if hold_until: - common.validate_hold_until(hold_until) + validate_hold_until(hold_until) # -- Step 1: Load input file ------------------------- logger.info("Loading input: %s", input_file) - studies = common.load_input_file( + studies = load_input_file( input_file, json_record_keys=_JSON_RECORD_KEYS, ) if studies is None: @@ -578,7 +1225,7 @@ def main( if not studies_to_submit and not studies_to_modify: logger.info("No studies to submit (all are duplicates or input is empty)") - common.write_results(results, output) + write_results(results, output) return logger.info( @@ -592,7 +1239,7 @@ def main( if studies_to_submit: logger.info("Building ADD XML for %d new study/studies...", len(studies_to_submit)) xml_root = build_submission_xml(studies_to_submit, hold_until=hold_until, action="ADD") - xml_bytes = common.xml_to_bytes(xml_root) + xml_bytes = xml_to_bytes(xml_root) logger.debug("Generated XML (ADD):\n%s", xml_bytes.decode("utf-8")) 
logger.info("XML document size (ADD): %d bytes", len(xml_bytes)) ok = _do_submission( @@ -609,7 +1256,7 @@ def main( if studies_to_modify: logger.info("Building MODIFY XML for %d duplicate(s)...", len(studies_to_modify)) xml_root = build_submission_xml(studies_to_modify, hold_until=hold_until, action="MODIFY") - xml_bytes = common.xml_to_bytes(xml_root) + xml_bytes = xml_to_bytes(xml_root) logger.debug("Generated XML (MODIFY):\n%s", xml_bytes.decode("utf-8")) logger.info("XML document size (MODIFY): %d bytes", len(xml_bytes)) ok = _do_submission( @@ -626,7 +1273,7 @@ def main( sys.exit(1) # -- Step 5: Output results -------------------------- - common.write_results(results, output) + write_results(results, output) logger.info("=" * 60) logger.info("SUBMISSION SUMMARY") diff --git a/modules/local/registerstudy/tests/main.nf.test b/modules/local/registerstudy/tests/main.nf.test index 42f6902..43c72eb 100644 --- a/modules/local/registerstudy/tests/main.nf.test +++ b/modules/local/registerstudy/tests/main.nf.test @@ -24,7 +24,7 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert snapshot(process.out).match() } + { assert snapshot(sanitizeOutput(process.out)).match() } ) } } From 15a0f7efccd2cef1189e5cbf59c944bc78d3fb44 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Fri, 13 Mar 2026 12:15:10 +0000 Subject: [PATCH 11/36] Update nft-utils to get sanatizeOutputs to work --- modules/local/registerstudy/tests/main.nf.test.snap | 13 +------------ nf-test.config | 2 +- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/modules/local/registerstudy/tests/main.nf.test.snap b/modules/local/registerstudy/tests/main.nf.test.snap index 4b184e9..5e2fde1 100644 --- a/modules/local/registerstudy/tests/main.nf.test.snap +++ b/modules/local/registerstudy/tests/main.nf.test.snap @@ -2,17 +2,6 @@ "registerstudy - stub": { "content": [ { - "0": [ - [ - { - "id": "example_study" - }, - "example_study_accessions.json:md5,e43f257b8a1ffb551bf993867c5f1d7f" 
- ] - ], - "1": [ - "versions.yml:md5,ddcc758a7d28faecd4286941889ab7e1" - ], "accessions": [ [ { @@ -26,7 +15,7 @@ ] } ], - "timestamp": "2026-03-12T13:52:06.989729", + "timestamp": "2026-03-13T12:14:02.650852", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.4" diff --git a/nf-test.config b/nf-test.config index 3525ead..613fc05 100644 --- a/nf-test.config +++ b/nf-test.config @@ -19,6 +19,6 @@ config { // load the necessary plugins plugins { - load "nft-utils@0.0.3" + load "nft-utils@0.0.9" } } From 7403a3e62611634307ccae6e0844479fa5b72cfb Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Fri, 13 Mar 2026 12:17:50 +0000 Subject: [PATCH 12/36] Revert test config --- nextflow.config | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/nextflow.config b/nextflow.config index dba0973..dd678eb 100644 --- a/nextflow.config +++ b/nextflow.config @@ -175,17 +175,7 @@ profiles { singularity.runOptions = '--nv' } // TODO: figure out how to better orginise tests for different workflow types (bins, mags, metagenomic_assemblies) - // test { includeConfig 'conf/test.config' } - test { - docker.enabled = true - conda.enabled = false - singularity.enabled = false - podman.enabled = false - shifter.enabled = false - charliecloud.enabled = false - apptainer.enabled = false - docker.runOptions = '-u $(id -u):$(id -g)' - } + test { includeConfig 'conf/test.config' } test_genome { includeConfig 'conf/test_genome.config' } test_assembly { includeConfig 'conf/test_assembly.config' } test_full { includeConfig 'conf/test_full.config' } From 416fa7568b0bc5005b351bacb5950a3f4b4e0f59 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Fri, 13 Mar 2026 12:56:21 +0000 Subject: [PATCH 13/36] Updated tests to run locally --- .../tests/main.nf.test | 2 +- .../tests/main.nf.test.snap | 17 +++-------------- .../local/registerstudy/tests/nextflow.config | 1 + 3 files changed, 5 insertions(+), 15 deletions(-) diff --git 
a/modules/local/generate_assembly_manifest/tests/main.nf.test b/modules/local/generate_assembly_manifest/tests/main.nf.test index 897744a..790d402 100644 --- a/modules/local/generate_assembly_manifest/tests/main.nf.test +++ b/modules/local/generate_assembly_manifest/tests/main.nf.test @@ -28,7 +28,7 @@ nextflow_process { assert process.success assertAll( { assert snapshot( - process.out, + sanitizeOutput(process.out), path(process.out.versions[0]).yaml ).match() }, { assert process.out.manifest.size() == 1 }, diff --git a/modules/local/generate_assembly_manifest/tests/main.nf.test.snap b/modules/local/generate_assembly_manifest/tests/main.nf.test.snap index f594383..5f5b1d7 100644 --- a/modules/local/generate_assembly_manifest/tests/main.nf.test.snap +++ b/modules/local/generate_assembly_manifest/tests/main.nf.test.snap @@ -31,7 +31,7 @@ } } ], - "timestamp": "2026-03-12T13:52:01.267817", + "timestamp": "2026-03-13T12:32:28.183967", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.4" @@ -40,23 +40,12 @@ "GENERATE_ASSEMBLY_MANIFEST completes with expected outputs": { "content": [ { - "0": [ - [ - { - "id": "test" - }, - "233126d4c4d0.manifest:md5,8387c0e6c123313259db613612c09dce" - ] - ], - "1": [ - "versions.yml:md5,0664035de44b4d88c1a70a357c1a24f2" - ], "manifest": [ [ { "id": "test" }, - "233126d4c4d0.manifest:md5,8387c0e6c123313259db613612c09dce" + "233126d4c4d0.manifest:md5,cacedcfcce220081e7aa2f98c2f4ffd6" ] ], "versions": [ @@ -69,7 +58,7 @@ } } ], - "timestamp": "2026-03-12T13:51:56.121365", + "timestamp": "2026-03-13T12:32:23.722449", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.4" diff --git a/modules/local/registerstudy/tests/nextflow.config b/modules/local/registerstudy/tests/nextflow.config index 4a84743..f22b24f 100644 --- a/modules/local/registerstudy/tests/nextflow.config +++ b/modules/local/registerstudy/tests/nextflow.config @@ -6,6 +6,7 @@ // Dummy credentials are sufficient for --dry-run --automated mode since // no HTTP calls are made. 
For real submission tests, replace with secrets: // env { ENA_WEBIN = secrets.WEBIN_ACCOUNT; ENA_WEBIN_PASSWORD = secrets.WEBIN_PASSWORD } + process { withName: REGISTERSTUDY { ext.args = '--test --automated --dry-run' From 7e2b7b8a3ead5d0e8ac9e13b4d5a3f09fa738904 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Fri, 13 Mar 2026 13:29:11 +0000 Subject: [PATCH 14/36] Remove duplicate detection functionality from study submit --- bin/submit_study.py | 496 +++----------------------------------------- 1 file changed, 27 insertions(+), 469 deletions(-) diff --git a/bin/submit_study.py b/bin/submit_study.py index 463318d..c2b165e 100755 --- a/bin/submit_study.py +++ b/bin/submit_study.py @@ -54,47 +54,14 @@ # All loggers in the ENA submission scripts share this root, # so configuring it once propagates to all child loggers. -_LOGGER_NAME: Final = "ena_submit" - +logging.basicConfig( + format="%(levelname)s: %(message)s", + level=logging.INFO, + stream=sys.stderr, +) logger = logging.getLogger("ena_submit.study") -def setup_logging(log_file: Path | None = None) -> None: - """Configure stderr and optional file logging. - - Attach handlers to the ``ena_submit`` parent logger. - Child loggers (e.g. ``ena_submit.study``) propagate - their messages to these handlers automatically. - - Args: - log_file: Path to a log file. If provided, - debug-level messages are written there in - addition to stderr. - """ - root = logging.getLogger(_LOGGER_NAME) - - # Avoid duplicate handlers on repeated calls. 
- if root.handlers: - return - - fmt = logging.Formatter( - "%(asctime)s [%(levelname)s] %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - ) - root.setLevel(logging.DEBUG) - - stderr_handler = logging.StreamHandler(sys.stderr) - stderr_handler.setLevel(logging.INFO) - stderr_handler.setFormatter(fmt) - root.addHandler(stderr_handler) - - if log_file: - file_handler = logging.FileHandler(log_file) - file_handler.setLevel(logging.DEBUG) - file_handler.setFormatter(fmt) - root.addHandler(file_handler) - - # ----------------------------------------------------------- # Credentials # ----------------------------------------------------------- @@ -449,233 +416,6 @@ def load_input_file( return None -# ----------------------------------------------------------- -# Reports API -# ----------------------------------------------------------- - - -def fetch_from_reports_endpoint( - url: str, - auth: HTTPBasicAuth, - max_results: int = 5000, -) -> list[dict[str, Any]] | None: - """Fetch records from a single Webin Reports endpoint. - - Args: - url: Full URL of the reports endpoint. - auth: HTTP basic-auth credentials. - max_results: Maximum number of results to request. - - Returns: - List of raw report dicts, or ``None`` on error. 
- """ - params = { - "format": "json", - "max-results": max_results, - } - - req = requests.Request("GET", url, params=params, auth=auth) - prepared = req.prepare() - logger.debug('curl -u %s:*** "%s"', auth.username, prepared.url) - - try: - resp = requests.get(url, params=params, auth=auth, timeout=60) - logger.info("Reports API at %s returned %s", url, resp.status_code) - resp.raise_for_status() - return resp.json() - - except requests.exceptions.HTTPError as exc: - status = ( - exc.response.status_code - if exc.response is not None - else "unknown" - ) - if status == 404: - logger.info("Reports API at %s returned 404 — no records yet", url) - return [] - if status in (401, 403): - logger.warning( - "Reports API at %s returned %s — endpoint may not be available" - " or credentials may differ", - url, status, - ) - return None - logger.warning("Reports API at %s returned HTTP %s", url, status) - return None - - except requests.exceptions.RequestException as exc: - logger.warning("Reports API at %s failed: %s", url, exc) - return None - - -def fetch_account_records( - auth: HTTPBasicAuth, - use_test: bool, - prod_url: str, - test_url: str, - normalizer: Callable[ - [dict[str, Any]], dict[str, str] | None - ], - entity_label: str, - max_results: int = 5000, -) -> list[dict[str, str]]: - """Fetch and normalise records from the Reports API. - - Try test endpoint first (if *use_test*), then fall back - to production. - - Args: - auth: HTTP basic-auth credentials. - use_test: Try the test endpoint first. - prod_url: Production reports endpoint URL. - test_url: Test reports endpoint URL. - normalizer: Callable that maps a raw report dict to - a normalised dict, or ``None`` to skip. - entity_label: Label for log messages (e.g. - ``"studies"``). - max_results: Maximum number of results to request. - - Returns: - List of normalised record dicts. 
- """ - urls = ( - [test_url, prod_url] if use_test - else [prod_url] - ) - - for url in urls: - logger.info("Fetching account %s from: %s", entity_label, url) - raw = fetch_from_reports_endpoint(url, auth, max_results) - if raw is None: - continue - - records: list[dict[str, str]] = [] - for entry in raw: - report = entry.get("report") - if report is None: - continue - normalized = normalizer(report) - if normalized is not None: - records.append(normalized) - - logger.info("Found %d %s in account", len(records), entity_label) - return records - - logger.warning( - "Could not reach any Webin reports endpoint." - " Duplicate checking for %s will be skipped.", - entity_label, - ) - return [] - - -# ----------------------------------------------------------- -# Duplicate detection (alias + title matching) -# ----------------------------------------------------------- - - -def find_duplicates_by_alias_title( - new_records: Sequence[dict[str, Any]], - account_records: Sequence[dict[str, str]], - title_field: str, - entity_label: str, -) -> dict[int, dict[str, str]]: - """Check new records against account records. - - Match by ``alias`` (preferred) or by the entity-specific - title field against the pre-fetched account records from - the Webin Reports API. - - Args: - new_records: Records the user wants to submit. - account_records: Existing records already registered - under the Webin account. - title_field: Field name for the title in new records - (e.g. ``"STUDY_TITLE"`` or ``"SAMPLE_TITLE"``). - entity_label: Label for log messages. - - Returns: - Mapping of index in *new_records* to matching - existing record info. 
- """ - duplicates: dict[int, dict[str, str]] = {} - total = len(new_records) - - if not account_records: - return duplicates - - by_title: dict[str, dict[str, str]] = {} - by_alias: dict[str, dict[str, str]] = {} - for rec in account_records: - title = (rec.get("title") or "").strip() - alias = (rec.get("alias") or "").strip() - if title: - by_title[title] = rec - if alias: - by_alias[alias] = rec - - logger.info( - "Checking %d new %s against %d existing account %s...", - total, entity_label, len(account_records), entity_label, - ) - - for i, record in enumerate(new_records): - new_title = ( - record.get(title_field) or "" - ).strip() - new_alias = (record.get("alias") or "").strip() - - if not new_title and not new_alias: - continue - - match = _match_by_alias_title( - new_alias, new_title, by_alias, by_title, - ) - if match is not None: - duplicates[i] = match - logger.info( - " Duplicate: '%s' matches %s -> %s (%s)", - new_title or new_alias, - match["match_reason"], - match["accession"], - match["status"], - ) - - if len(duplicates) == total: - logger.info("All %s are duplicates — skipping further checks", entity_label) - return duplicates - - return duplicates - - -def _match_by_alias_title( - new_alias: str, - new_title: str, - by_alias: dict[str, dict[str, str]], - by_title: dict[str, dict[str, str]], -) -> dict[str, str] | None: - """Return matching record info or ``None``.""" - if new_alias and new_alias in by_alias: - rec = by_alias[new_alias] - reason = f"alias '{new_alias}'" - elif new_title and new_title in by_title: - rec = by_title[new_title] - reason = f"title '{new_title}'" - else: - return None - - return { - "accession": rec.get("accession", ""), - "secondary_accession": rec.get( - "secondary_accession", "" - ), - "alias": rec.get("alias", ""), - "title": rec.get("title", ""), - "status": rec.get("status", "UNKNOWN"), - "match_reason": reason, - } - - # ----------------------------------------------------------- # Result output # 
----------------------------------------------------------- @@ -695,79 +435,6 @@ def write_results( print(json_str) -# ----------------------------------------------------------- -# Reports API (study-specific) -# ----------------------------------------------------------- - -_PROD_REPORTS_URL: Final = "https://www.ebi.ac.uk/ena/submit/report/projects" -_TEST_REPORTS_URL: Final = "https://wwwdev.ebi.ac.uk/ena/submit/report/projects" - - -def _normalize_study_report( - report: dict[str, Any], -) -> dict[str, str]: - """Normalise a raw study report dict.""" - return { - "title": ( - report.get("title") or report.get("studyTitle") or report.get("STUDY_TITLE", "") - ), - "alias": report.get("alias") or report.get("studyAlias") or "", - "accession": ( - report.get("accession") - or report.get("studyAccession") - or report.get("report", {}).get("id", "") - ), - "secondary_accession": report.get("secondaryAccession") or report.get("secondaryId", ""), - "status": report.get("releaseStatus", "UNKNOWN"), - } - - -def fetch_account_studies( - auth: HTTPBasicAuth, - use_test: bool = False, - max_results: int = 5000, -) -> list[dict[str, str]]: - """Fetch all projects from the Webin Reports API. - - Args: - auth: HTTP basic-auth credentials. - use_test: Try the test endpoint before production. - max_results: Maximum number of results to request. - - Returns: - List of normalised study dicts. - """ - return fetch_account_records( - auth, - use_test=use_test, - prod_url=_PROD_REPORTS_URL, - test_url=_TEST_REPORTS_URL, - normalizer=_normalize_study_report, - entity_label="studies", - max_results=max_results, - ) - - -def find_duplicate_studies( - new_studies: list[dict[str, Any]], - account_studies: list[dict[str, str]], -) -> dict[int, dict[str, str]]: - """Check new studies against existing account studies. - - Args: - new_studies: Studies the user wants to submit. - account_studies: Existing studies in the account. - - Returns: - Mapping of index to matching study info. 
- """ - return find_duplicates_by_alias_title( - new_studies, account_studies, - title_field="STUDY_TITLE", - entity_label="studies", - ) - - # ----------------------------------------------------------- # XML construction # ----------------------------------------------------------- @@ -1110,39 +777,19 @@ def _do_submission( help="Path to write JSON accession results (default: stdout)", ) @click.option( - "--max-results", - default=5000, - help="Maximum number of projects to fetch from the Reports API for duplicate checking", -) -@click.option( - "--dry-run", + "--validate", is_flag=True, default=False, help="Validate and build XML but do not submit to ENA", ) -@click.option( - "--automated", - is_flag=True, default=False, - help="Skip duplicate detection against the Webin Reports API (for automated pipelines)", -) -@click.option( - "--force", - is_flag=True, default=False, - help="Submit duplicate studies using the MODIFY action to overwrite existing ENA records," - " instead of skipping them", -) def main( input_file: Path, use_test: bool, hold_until: str | None, log_file: Path | None, output: Path | None, - max_results: int, - dry_run: bool, - automated: bool, - force: bool, + validate: bool, ) -> None: """Submit studies to ENA via the Webin REST API v2.""" - setup_logging(log_file) username, password = get_credentials() env_label = "TEST" if use_test else "PRODUCTION" @@ -1165,133 +812,44 @@ def main( logger.info("Loaded %d study/studies from input", len(studies)) - # -- Step 2: Check for duplicates -------------------- - if automated: - logger.info("Automated mode: skipping duplicate detection") - duplicates: dict[int, dict[str, Any]] = {} - else: - account_studies = fetch_account_studies( - auth, use_test=use_test, - max_results=max_results, - ) - for ps in account_studies: - logger.info( - " Account study: %s | alias=%s | title=%s | status=%s", - ps["accession"], ps["alias"], ps["title"], ps["status"], - ) - duplicates = find_duplicate_studies( - studies, 
account_studies, - ) + if not studies: + logger.info("No studies to submit") + write_results({"submitted": [], "failed": []}, output) + return results: dict[str, list[dict[str, Any]]] = { - "duplicates": [], "submitted": [], - "modified": [], "failed": [], } - studies_to_modify: list[dict[str, Any]] = [] - if duplicates: - action_label = "will be re-submitted with MODIFY" if force else "will NOT be submitted" - logger.warning( - "Found %d duplicate(s) — %s:", - len(duplicates), action_label, - ) - for idx, dup_info in duplicates.items(): - study_title = studies[idx].get("STUDY_TITLE", f"study[{idx}]") - logger.warning( - " DUPLICATE: '%s' matches existing %s (accession: %s)", - study_title, dup_info["match_reason"], dup_info["accession"], - ) - results["duplicates"].append({ - "input_index": idx, - "title": study_title, - "alias": studies[idx].get("alias", ""), - "existing_accession": dup_info["accession"], - "existing_secondary_accession": dup_info.get("secondary_accession", ""), - "match_reason": dup_info["match_reason"], - }) - if force: - study_copy = dict(studies[idx]) - existing_alias = dup_info.get("alias", "") - if existing_alias: - study_copy["alias"] = existing_alias - studies_to_modify.append(study_copy) - - studies_to_submit = [ - s for i, s in enumerate(studies) - if i not in duplicates - ] - - if not studies_to_submit and not studies_to_modify: - logger.info("No studies to submit (all are duplicates or input is empty)") - write_results(results, output) - return - - logger.info( - "%d new study/studies to ADD, %d duplicate(s) to MODIFY", - len(studies_to_submit), len(studies_to_modify), + # -- Step 2: Build and submit XML -------------------- + logger.info("Building ADD XML for %d study/studies...", len(studies)) + xml_root = build_submission_xml(studies, hold_until=hold_until, action="ADD") + xml_bytes = xml_to_bytes(xml_root) + logger.debug("Generated XML:\n%s", xml_bytes.decode("utf-8")) + logger.info("XML document size: %d bytes", len(xml_bytes)) + 
ok = _do_submission( + base_url, auth, xml_bytes, + action="ADD", + results=results, + result_key="submitted", + env_label=env_label, + dry_run=validate, ) - overall_ok = True - - # -- Step 3: ADD new studies ------------------------- - if studies_to_submit: - logger.info("Building ADD XML for %d new study/studies...", len(studies_to_submit)) - xml_root = build_submission_xml(studies_to_submit, hold_until=hold_until, action="ADD") - xml_bytes = xml_to_bytes(xml_root) - logger.debug("Generated XML (ADD):\n%s", xml_bytes.decode("utf-8")) - logger.info("XML document size (ADD): %d bytes", len(xml_bytes)) - ok = _do_submission( - base_url, auth, xml_bytes, - action="ADD", - results=results, - result_key="submitted", - env_label=env_label, - dry_run=dry_run, - ) - overall_ok = overall_ok and ok - - # -- Step 4: MODIFY duplicate studies (--force) ------ - if studies_to_modify: - logger.info("Building MODIFY XML for %d duplicate(s)...", len(studies_to_modify)) - xml_root = build_submission_xml(studies_to_modify, hold_until=hold_until, action="MODIFY") - xml_bytes = xml_to_bytes(xml_root) - logger.debug("Generated XML (MODIFY):\n%s", xml_bytes.decode("utf-8")) - logger.info("XML document size (MODIFY): %d bytes", len(xml_bytes)) - ok = _do_submission( - base_url, auth, xml_bytes, - action="MODIFY", - results=results, - result_key="modified", - env_label=env_label, - dry_run=dry_run, - ) - overall_ok = overall_ok and ok - - if not overall_ok: + if not ok: sys.exit(1) - # -- Step 5: Output results -------------------------- + # -- Step 3: Output results -------------------------- write_results(results, output) logger.info("=" * 60) logger.info("SUBMISSION SUMMARY") - logger.info( - " Duplicates skipped: %d", len(results["duplicates"]) - len(results["modified"]), - ) - for d in results["duplicates"]: - logger.info(" %s -> %s", d["title"], d["existing_accession"]) - logger.info(" Newly submitted (ADD): %d", len(results["submitted"])) + logger.info(" Submitted (ADD): %d", 
len(results["submitted"])) for s in results["submitted"]: ext = s.get("external_accession", "") ext_suffix = f" ({ext})" if ext else "" logger.info(" %s -> %s%s", s["alias"], s["accession"], ext_suffix) - logger.info(" Modified (MODIFY): %d", len(results["modified"])) - for m in results["modified"]: - ext = m.get("external_accession", "") - ext_suffix = f" ({ext})" if ext else "" - logger.info(" %s -> %s%s", m["alias"], m["accession"], ext_suffix) logger.info("=" * 60) From 1e87af775a500e8de78ee7c6a7a1c572539fc1f1 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Fri, 13 Mar 2026 13:33:52 +0000 Subject: [PATCH 15/36] Remove unused code --- bin/submit_study.py | 49 +-------------------------------------------- 1 file changed, 1 insertion(+), 48 deletions(-) diff --git a/bin/submit_study.py b/bin/submit_study.py index c2b165e..57edb92 100755 --- a/bin/submit_study.py +++ b/bin/submit_study.py @@ -182,53 +182,6 @@ def validate_hold_until(hold_until: str) -> datetime.date: return hold_date -# ----------------------------------------------------------- -# ENA checklist XML parsing -# ----------------------------------------------------------- - - -def parse_checklist_units( - xml_path: str | Path, -) -> dict[str, str]: - """Parse an ENA checklist XML and return field units. - - Reads the ```` elements from an ENA checklist XML - file (e.g. ``ERC000015.xml``) and returns a mapping from - slot name to unit string for every field that declares a - ```` element. - - Args: - xml_path: Path to the ENA checklist XML file. - - Returns: - Dict mapping slot name to unit string. - Fields without units are absent from the dict. 
- """ - units: dict[str, str] = {} - try: - tree = ET.parse(str(xml_path)) - except ET.ParseError as exc: - logger.warning( - "Could not parse checklist XML %s: %s", - xml_path, exc, - ) - return units - - for field in tree.iter("FIELD"): - name_el = field.find("NAME") - if name_el is None or not name_el.text: - continue - units_el = field.find("UNITS") - if units_el is None: - continue - unit_el = units_el.find("UNIT") - if unit_el is None or not unit_el.text: - continue - units[name_el.text.strip()] = unit_el.text.strip() - - return units - - # ----------------------------------------------------------- # XSD validation (structural fallback only) # ----------------------------------------------------------- @@ -746,7 +699,7 @@ def _do_submission( @click.command( - help="Submit raw-reads, assembly and genome studies to ENA via the Webin REST API v2.", + help="Submit studies to ENA via the Webin REST API v2.", ) @click.option( "--input", "input_file", From ffc4e904a712fd53c0ab66620a3aa6b5c70b7fbb Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Fri, 13 Mar 2026 13:40:18 +0000 Subject: [PATCH 16/36] Move test fixtures to nf-croe test-datasets --- assets/test-fixtures/example_study.csv | 3 --- assets/test-fixtures/example_study.json | 15 --------------- assets/test-fixtures/example_study.tsv | 3 --- modules/local/registerstudy/tests/main.nf.test | 4 ++-- tests/default.nf.test | 2 +- 5 files changed, 3 insertions(+), 24 deletions(-) delete mode 100644 assets/test-fixtures/example_study.csv delete mode 100644 assets/test-fixtures/example_study.json delete mode 100644 assets/test-fixtures/example_study.tsv diff --git a/assets/test-fixtures/example_study.csv b/assets/test-fixtures/example_study.csv deleted file mode 100644 index 2b68cc1..0000000 --- a/assets/test-fixtures/example_study.csv +++ /dev/null @@ -1,3 +0,0 @@ -Generic,,,,,,,, -IS_PRIMARY,STUDY_TITLE,existing_study_type,new_study_type,STUDY_ABSTRACT,CENTER_NAME,CENTER_PROJECT_NAME,PROJECT_ID,STUDY_DESCRIPTION 
-YES,MIMICC,Metagenomics,,,,,, \ No newline at end of file diff --git a/assets/test-fixtures/example_study.json b/assets/test-fixtures/example_study.json deleted file mode 100644 index cd9af28..0000000 --- a/assets/test-fixtures/example_study.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "schema": "https://github.com/timrozday/ena-submission-dataharmonizer/SRA_study", - "location": "/templates/sra_study", - "version": "1.0.0", - "in_language": "en", - "Container": { - "SRA_studys": [ - { - "IS_PRIMARY": "YES", - "STUDY_TITLE": "MIMICC", - "existing_study_type": "Metagenomics" - } - ] - } -} \ No newline at end of file diff --git a/assets/test-fixtures/example_study.tsv b/assets/test-fixtures/example_study.tsv deleted file mode 100644 index 4682df1..0000000 --- a/assets/test-fixtures/example_study.tsv +++ /dev/null @@ -1,3 +0,0 @@ -Generic -IS_PRIMARY STUDY_TITLE existing_study_type new_study_type STUDY_ABSTRACT CENTER_NAME CENTER_PROJECT_NAME PROJECT_ID STUDY_DESCRIPTION -YES MIMICC Metagenomics \ No newline at end of file diff --git a/modules/local/registerstudy/tests/main.nf.test b/modules/local/registerstudy/tests/main.nf.test index 43c72eb..5d61673 100644 --- a/modules/local/registerstudy/tests/main.nf.test +++ b/modules/local/registerstudy/tests/main.nf.test @@ -15,7 +15,7 @@ nextflow_process { """ input[0] = [ [ id:'example_study' ], - file("${projectDir}/assets/test-fixtures/example_study.json", checkIfExists: true) + file(params.pipelines_testdata_base_path + "/test_data/study_metadata/example_study.json", checkIfExists: true) ] """ } @@ -38,7 +38,7 @@ nextflow_process { """ input[0] = [ [ id:'example_study' ], - file("${projectDir}/assets/test-fixtures/example_study.json", checkIfExists: true) + file(params.pipelines_testdata_base_path + "/test_data/study_metadata/example_study.json", checkIfExists: true) ] """ } diff --git a/tests/default.nf.test b/tests/default.nf.test index 4a3b628..9ed7563 100644 --- a/tests/default.nf.test +++ b/tests/default.nf.test @@ 
-65,7 +65,7 @@ nextflow_pipeline { outdir = "$outputDir" input = csv.absolutePath mode = "metagenomic_assemblies" - study_metadata = "${projectDir}/assets/test-fixtures/example_study.json" + study_metadata = params.pipelines_testdata_base_path + "/test_data/study_metadata/example_study.json" centre_name = "TEST_CENTER" } } From 3824311ce3f94ea762aa0a40de40f846b447d819 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Fri, 13 Mar 2026 14:04:50 +0000 Subject: [PATCH 17/36] Fix the tests using nf-core test-datasets and removing inputs and outputs that have been removed previously --- .../tests/main.nf.test.snap | 8 ++++---- modules/local/registerstudy/main.nf | 2 +- .../local/registerstudy/tests/main.nf.test.snap | 4 ++-- modules/local/registerstudy/tests/nextflow.config | 14 +++++++++----- tests/default.nf.test | 2 +- workflows/assemblysubmit.nf | 1 - 6 files changed, 17 insertions(+), 14 deletions(-) diff --git a/modules/local/generate_assembly_manifest/tests/main.nf.test.snap b/modules/local/generate_assembly_manifest/tests/main.nf.test.snap index 5f5b1d7..cf8a9e1 100644 --- a/modules/local/generate_assembly_manifest/tests/main.nf.test.snap +++ b/modules/local/generate_assembly_manifest/tests/main.nf.test.snap @@ -11,7 +11,7 @@ ] ], "1": [ - "versions.yml:md5,0664035de44b4d88c1a70a357c1a24f2" + "versions.yml:md5,4711ed8f2fd35e895aefafebd29f0333" ], "manifest": [ [ @@ -22,16 +22,16 @@ ] ], "versions": [ - "versions.yml:md5,0664035de44b4d88c1a70a357c1a24f2" + "versions.yml:md5,4711ed8f2fd35e895aefafebd29f0333" ] }, { "GENERATE_ASSEMBLY_MANIFEST": { - "assembly_uploader": "assembly_uploader 1.3.4" + "assembly_uploader": null } } ], - "timestamp": "2026-03-13T12:32:28.183967", + "timestamp": "2026-03-13T14:02:14.937082", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.4" diff --git a/modules/local/registerstudy/main.nf b/modules/local/registerstudy/main.nf index 67766e0..99533da 100644 --- a/modules/local/registerstudy/main.nf +++ b/modules/local/registerstudy/main.nf 
@@ -37,7 +37,7 @@ process REGISTERSTUDY { stub: def prefix = task.ext.prefix ?: "${meta.id}" """ - echo '{"submitted":[],"duplicates":[],"modified":[],"failed":[]}' > ${prefix}_accessions.json + echo '{"submitted":[],"failed":[]}' > ${prefix}_accessions.json cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/registerstudy/tests/main.nf.test.snap b/modules/local/registerstudy/tests/main.nf.test.snap index 5e2fde1..385b735 100644 --- a/modules/local/registerstudy/tests/main.nf.test.snap +++ b/modules/local/registerstudy/tests/main.nf.test.snap @@ -7,7 +7,7 @@ { "id": "example_study" }, - "example_study_accessions.json:md5,e43f257b8a1ffb551bf993867c5f1d7f" + "example_study_accessions.json:md5,83600b2fb33a560c25351dbd4a9bdba2" ] ], "versions": [ @@ -15,7 +15,7 @@ ] } ], - "timestamp": "2026-03-13T12:14:02.650852", + "timestamp": "2026-03-13T14:02:21.161445", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.4" diff --git a/modules/local/registerstudy/tests/nextflow.config b/modules/local/registerstudy/tests/nextflow.config index f22b24f..aaf8385 100644 --- a/modules/local/registerstudy/tests/nextflow.config +++ b/modules/local/registerstudy/tests/nextflow.config @@ -1,15 +1,14 @@ // Test configuration for REGISTERSTUDY module. -// --test : use the ENA dev server (submissions are discarded daily) -// --automated : skip the Webin Reports duplicate-checking API call -// --dry-run : validate and build XML but do not submit to ENA +// --test : use the ENA dev server (submissions are discarded daily) +// --validate : validate and build XML but do not submit to ENA // -// Dummy credentials are sufficient for --dry-run --automated mode since +// Dummy credentials are sufficient for --validate mode since // no HTTP calls are made. 
For real submission tests, replace with secrets: // env { ENA_WEBIN = secrets.WEBIN_ACCOUNT; ENA_WEBIN_PASSWORD = secrets.WEBIN_PASSWORD } process { withName: REGISTERSTUDY { - ext.args = '--test --automated --dry-run' + ext.args = '--test --validate' } } @@ -17,3 +16,8 @@ env { ENA_WEBIN = 'Webin-000000' ENA_WEBIN_PASSWORD = 'dummy-password' } + +docker { + enabled = true + runOptions = '-u $(id -u):$(id -g)' +} diff --git a/tests/default.nf.test b/tests/default.nf.test index 9ed7563..b436ff9 100644 --- a/tests/default.nf.test +++ b/tests/default.nf.test @@ -65,7 +65,7 @@ nextflow_pipeline { outdir = "$outputDir" input = csv.absolutePath mode = "metagenomic_assemblies" - study_metadata = params.pipelines_testdata_base_path + "/test_data/study_metadata/example_study.json" + study_metadata = "https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/seqsubmit/test_data/study_metadata/example_study.json" centre_name = "TEST_CENTER" } } diff --git a/workflows/assemblysubmit.nf b/workflows/assemblysubmit.nf index ec1309f..a7897ba 100644 --- a/workflows/assemblysubmit.nf +++ b/workflows/assemblysubmit.nf @@ -162,7 +162,6 @@ workflow ASSEMBLYSUBMIT { .map { _meta, json -> def data = new groovy.json.JsonSlurper().parse(json) data.submitted[0]?.accession - ?: data.duplicates[0]?.existing_accession } } From 5d0c83ee25fa6d744574ea21a2c96a4a14040a70 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Fri, 13 Mar 2026 14:07:47 +0000 Subject: [PATCH 18/36] Remove references to dataharmonizer --- bin/submit_study.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/bin/submit_study.py b/bin/submit_study.py index 57edb92..04fff2a 100755 --- a/bin/submit_study.py +++ b/bin/submit_study.py @@ -1,10 +1,8 @@ #!/usr/bin/env python3 """Submit raw-reads, assembly and genome studies to ENA via the Webin REST API v2. 
-Read a DataHarmonizer export containing study metadata, -check for duplicate studies already registered under the -Webin account, construct an XML submission document, and -submit new studies to ENA. +Read a study metadata file (JSON, CSV, or TSV), construct an +XML submission document, and submit new studies to ENA. Credentials are read from environment variables to avoid secrets appearing in shell history or process listings:: @@ -235,9 +233,10 @@ def validate_xml_against_xsd( def _is_metadata_row(row: Sequence[object]) -> bool: - """Check whether *row* is a DataHarmonizer label row. + """Check whether *row* is a non-data header/metadata row. - These rows have at most one non-empty cell. + Such rows have at most one non-empty cell and are skipped + during record extraction. """ non_empty = sum( 1 for c in row @@ -252,8 +251,8 @@ def extract_records_from_tabular( ) -> list[dict[str, str]]: """Extract record dicts from a CSV or TSV file. - Skip an optional DataHarmonizer metadata row if - detected. + Skip an optional leading metadata/label row if detected + (a row with at most one non-empty cell). Args: filepath: Path to the tabular file. @@ -294,11 +293,11 @@ def extract_records_from_json( input_data: object, record_keys: Sequence[str] = ("data",), ) -> list[dict[str, Any]] | None: - """Extract record dicts from a DataHarmonizer JSON export. + """Extract record dicts from a JSON input. Handle several JSON shapes: - * DataHarmonizer Container format:: + * Container format (e.g. 
DataHarmonizer exports):: {"Container": {"s": [{...}, ...]}} From fc14c4b8c406ad39f9d07631171f75d9c9cebd3d Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Fri, 13 Mar 2026 14:10:03 +0000 Subject: [PATCH 19/36] Remove xml validation from submit study script --- bin/submit_study.py | 73 --------------------------------------------- 1 file changed, 73 deletions(-) diff --git a/bin/submit_study.py b/bin/submit_study.py index 04fff2a..cc9d698 100755 --- a/bin/submit_study.py +++ b/bin/submit_study.py @@ -180,53 +180,6 @@ def validate_hold_until(hold_until: str) -> datetime.date: return hold_date -# ----------------------------------------------------------- -# XSD validation (structural fallback only) -# ----------------------------------------------------------- - - -def validate_xml_against_xsd( - xml_bytes: bytes, - _fragment_tag: str | None = None, # unused; kept for API compatibility - fallback_checker: Callable[ - [bytes, list[str]], tuple[bool, list[str]] - ] | None = None, -) -> tuple[bool, list[str]]: - """Validate XML bytes using a structural check. - - Full XSD validation via lxml is not available in this - container. Uses *fallback_checker* if provided, - otherwise checks that the document is well-formed XML. - - Args: - xml_bytes: Serialised XML document. - _fragment_tag: Unused; kept for API compatibility. - fallback_checker: Optional function called with - (*xml_bytes*, *messages*) that returns - (*is_valid*, *messages*). - - Returns: - Tuple of (*is_valid*, *messages*). 
- """ - messages: list[str] = [] - - if fallback_checker is not None: - return fallback_checker(xml_bytes, messages) - - try: - ET.fromstring(xml_bytes) - except ET.ParseError as exc: - messages.append( - f"ERROR: XML is not well-formed: {exc}" - ) - return False, messages - - messages.append( - "XML is well-formed (basic check passed)" - ) - return True, messages - - # ----------------------------------------------------------- # File loading (JSON, CSV, TSV) # ----------------------------------------------------------- @@ -543,23 +496,6 @@ def _validate_study_xml_structure( return True, messages -def validate_study_xml( - xml_bytes: bytes, -) -> tuple[bool, list[str]]: - """Validate study XML structure. - - Args: - xml_bytes: Serialised XML document. - - Returns: - Tuple of (*is_valid*, *messages*). - """ - return validate_xml_against_xsd( - xml_bytes, - fallback_checker=_validate_study_xml_structure, - ) - - # ----------------------------------------------------------- # Receipt parsing # ----------------------------------------------------------- @@ -642,15 +578,6 @@ def _do_submission( Returns: ``True`` if the batch succeeded (or dry run). 
""" - xml_valid, xml_messages = validate_study_xml(xml_bytes) - for msg in xml_messages: - logger.info(" %s", msg) - if not xml_valid: - logger.error("XML validation FAILED (%s) — aborting submission", action) - return False - - logger.info("XML validation PASSED (%s)", action) - if dry_run: logger.info("DRY RUN — skipping %s submission", action) logger.info("Generated XML:\n%s", xml_bytes.decode("utf-8")) From 7e586c44ed7e3953a6a69da2fb67df4e807a3fb2 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Fri, 13 Mar 2026 14:13:49 +0000 Subject: [PATCH 20/36] Split e2e tests into seperate files per workflow --- tests/default.nf.test | 84 -------------------------------------- tests/default.nf.test.snap | 58 -------------------------- 2 files changed, 142 deletions(-) delete mode 100644 tests/default.nf.test delete mode 100644 tests/default.nf.test.snap diff --git a/tests/default.nf.test b/tests/default.nf.test deleted file mode 100644 index b436ff9..0000000 --- a/tests/default.nf.test +++ /dev/null @@ -1,84 +0,0 @@ -nextflow_pipeline { - - name "Test pipeline" - script "../main.nf" - tag "pipeline" - - test("metagenomic_assemblies mode — submission_study provided (no study registration)") { - // Exercises the assembly submission path using a pre-registered study (stub mode). - // REGISTERSTUDY is NOT called here; the module-level nf-test covers it. - // - // A samplesheet is generated on the fly with absolute paths so that nf-schema - // validation succeeds regardless of the nf-test launchDir. 
- options "-stub" - - when { - params { - def csv = new File("${outputDir}/samplesheet_assembly.csv") - csv.parentFile.mkdirs() - csv.text = [ - "sample,fasta,fastq_1,fastq_2,coverage,run_accession,assembler,assembler_version", - "sample1,${projectDir}/tests/data/contigs.fasta.gz,${projectDir}/tests/data/fastq_1.fastq,${projectDir}/tests/data/fastq_2.fastq,,ERR000001,SPAdes,3.15", - "sample2,${projectDir}/tests/data/invalid_assembly.fasta.gz,,,45,ERR000002,Velvet,1.2.10", - "sample3,${projectDir}/tests/data/contigs.fasta.gz,,,30,ERR000003,MEGAHIT,1.2.9" - ].join("\n") - - outdir = "$outputDir" - input = csv.absolutePath - mode = "metagenomic_assemblies" - submission_study = "PRJEB98843" - centre_name = "TEST_CENTER" - } - } - - then { - def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) - def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') - assertAll( - { assert workflow.success }, - { assert snapshot( - removeNextflowVersion("$outputDir/pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml"), - stable_name, - stable_path - ).match() } - ) - } - } - - test("metagenomic_assemblies mode — study_metadata provided (REGISTERSTUDY registers study)") { - // Tests the study-registration path in stub mode. REGISTERSTUDY stub - // outputs an empty accessions JSON, so this test validates the plumbing rather - // than the end-to-end submission output. 
- options "-stub" - - when { - params { - def csv = new File("${outputDir}/samplesheet_assembly.csv") - csv.parentFile.mkdirs() - csv.text = [ - "sample,fasta,fastq_1,fastq_2,coverage,run_accession,assembler,assembler_version", - "sample1,${projectDir}/tests/data/contigs.fasta.gz,${projectDir}/tests/data/fastq_1.fastq,${projectDir}/tests/data/fastq_2.fastq,,ERR000001,SPAdes,3.15", - "sample2,${projectDir}/tests/data/invalid_assembly.fasta.gz,,,45,ERR000002,Velvet,1.2.10", - "sample3,${projectDir}/tests/data/contigs.fasta.gz,,,30,ERR000003,MEGAHIT,1.2.9" - ].join("\n") - - outdir = "$outputDir" - input = csv.absolutePath - mode = "metagenomic_assemblies" - study_metadata = "https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/seqsubmit/test_data/study_metadata/example_study.json" - centre_name = "TEST_CENTER" - } - } - - then { - assertAll( - { assert workflow.success }, - { assert workflow.trace.succeeded().any { it.name.contains("REGISTERSTUDY") } } - ) - } - } - - // NOTE: The MAGs/bins test requires remote genome files from nf-core/test-datasets - // (https://github.com/nf-core/test-datasets/tree/seqsubmit) and cannot run offline. 
- // Run it manually with: nf-test test tests/default.nf.test --filter "mags" --profile test_genome,docker -} diff --git a/tests/default.nf.test.snap b/tests/default.nf.test.snap deleted file mode 100644 index 71a254e..0000000 --- a/tests/default.nf.test.snap +++ /dev/null @@ -1,58 +0,0 @@ -{ - "metagenomic_assemblies mode \u2014 submission_study provided (no study registration)": { - "content": [ - { - "Workflow": { - "nf-core/seqsubmit": "v1.0.0dev" - } - }, - [ - "coverm", - "coverm/sample1.depth.txt", - "fastavalidator", - "fastavalidator/sample1.success.log", - "fastavalidator/sample2.success.log", - "fastavalidator/sample3.success.log", - "generate", - "generate/PRJEB98843_upload", - "generate/PRJEB98843_upload/test.manifest", - "metagenomic_assemblies", - "metagenomic_assemblies/multiqc", - "metagenomic_assemblies/multiqc/multiqc_data", - "metagenomic_assemblies/multiqc/multiqc_plots", - "metagenomic_assemblies/multiqc/multiqc_report.html", - "metagenomic_assemblies/sample1_assembly_metadata.csv", - "metagenomic_assemblies/sample2_assembly_metadata.csv", - "metagenomic_assemblies/sample3_assembly_metadata.csv", - "metagenomic_assemblies/upload", - "metagenomic_assemblies/upload/webin_cli", - "metagenomic_assemblies/upload/webin_cli/sample1_webin-cli.report", - "metagenomic_assemblies/upload/webin_cli/sample2_webin-cli.report", - "metagenomic_assemblies/upload/webin_cli/sample3_webin-cli.report", - "pipeline_info", - "pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml", - "samplesheet_assembly.csv" - ], - [ - "sample1.depth.txt:md5,d41d8cd98f00b204e9800998ecf8427e", - "sample1.success.log:md5,b0b859eda1db5cd43915846e00ebc22c", - "sample2.success.log:md5,b0b859eda1db5cd43915846e00ebc22c", - "sample3.success.log:md5,b0b859eda1db5cd43915846e00ebc22c", - "test.manifest:md5,d41d8cd98f00b204e9800998ecf8427e", - "multiqc_report.html:md5,d41d8cd98f00b204e9800998ecf8427e", - "sample1_assembly_metadata.csv:md5,e1a00dc628e95c38e18dfd5161fa2ce4", - 
"sample2_assembly_metadata.csv:md5,901e55730b100224efb27f23aabf4f67", - "sample3_assembly_metadata.csv:md5,d5b1575095ece78d988395b874440bef", - "sample1_webin-cli.report:md5,d41d8cd98f00b204e9800998ecf8427e", - "sample2_webin-cli.report:md5,d41d8cd98f00b204e9800998ecf8427e", - "sample3_webin-cli.report:md5,d41d8cd98f00b204e9800998ecf8427e", - "samplesheet_assembly.csv:md5,2f74b281cb7096ad80a378b8960aabee" - ] - ], - "timestamp": "2026-03-12T13:22:15.261886", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.4" - } - } -} \ No newline at end of file From d80420be2a86248360dd4bec9995b88d77985585 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Fri, 13 Mar 2026 14:24:18 +0000 Subject: [PATCH 21/36] nf-core linting --- ro-crate-metadata.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index befa2db..3ff34eb 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -23,7 +23,7 @@ "@type": "Dataset", "creativeWorkStatus": "InProgress", "datePublished": "2025-11-20T09:32:34+00:00", - "description": "

\n \n \n \"nf-core/seqsubmit\"\n \n

\n\n[![Open in GitHub Codespaces](https://img.shields.io/badge/Open_In_GitHub_Codespaces-black?labelColor=grey&logo=github)](https://github.com/codespaces/new/nf-core/seqsubmit)\n[![GitHub Actions CI Status](https://github.com/nf-core/seqsubmit/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/seqsubmit/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/seqsubmit/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/seqsubmit/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/seqsubmit/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.04.0-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.5.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.5.1)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/seqsubmit)\n\n[![Get help on 
Slack](http://img.shields.io/badge/slack-nf--core%20%23seqsubmit-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/seqsubmit)[![Follow on Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/seqsubmit** is a Nextflow pipeline for submitting sequence data to [ENA](https://www.ebi.ac.uk/ena/browser/home).\nCurrently, the pipeline supports three submission modes, each routed to a dedicated workflow and requiring its own input samplesheet structure:\n\n- `mags` for Metagenome Assembled Genomes (MAGs) submission with `GENOMESUBMIT` workflow\n- `bins` for bins submission with `GENOMESUBMIT` workflow\n- `metagenomic_assemblies` for assembly submission with `ASSEMBLYSUBMIT` workflow\n\n![seqsubmit workflow diagram](assets/seqsubmit_schema.png)\n\n## Requirements\n\n- [Nextflow](https://www.nextflow.io/) `>=25.04.0`\n- Webin account registered at https://www.ebi.ac.uk/ena/submit/webin/login\n- Raw reads used to assemble contigs submitted to [INSDC](https://www.insdc.org/) and associated accessions available\n\nSetup your environment secrets before running the pipeline:\n\n`nextflow secrets set WEBIN_ACCOUNT \"Webin-XXX\"`\n\n`nextflow secrets set WEBIN_PASSWORD \"XXX\"`\n\nMake sure you update commands above with your authorised credentials.\n\n## Input samplesheets\n\n### `mags` and `bins` modes (`GENOMESUBMIT`)\n\nThe input must follow `assets/schema_input_genome.json`.\n\nRequired columns:\n\n- `sample`\n- `fasta` (must end with `.fa.gz` or `.fasta.gz`)\n- `accession`\n- `assembly_software`\n- `binning_software`\n- `binning_parameters`\n- 
`stats_generation_software`\n- `metagenome`\n- `environmental_medium`\n- `broad_environment`\n- `local_environment`\n- `co-assembly`\n\nColumns that required for now, but will be optional in the nearest future:\n\n- `completeness`\n- `contamination`\n- `genome_coverage`\n- `rRNA_presence`\n- `NCBI_lineage`\n\nThose fields are metadata required for [genome_uploader](https://github.com/EBI-Metagenomics/genome_uploader) package. They are described in [docs](https://github.com/EBI-Metagenomics/genome_uploader/blob/main/README.md#input-tsv-and-fields).\n\nExample `samplesheet_genome.csv`:\n\n```csv\nsample,fasta,accession,assembly_software,binning_software,binning_parameters,stats_generation_software,completeness,contamination,genome_coverage,metagenome,co-assembly,broad_environment,local_environment,environmental_medium,rRNA_presence,NCBI_lineage\nlachnospira_eligens,data/bin_lachnospira_eligens.fa.gz,SRR24458089,spades_v3.15.5,metabat2_v2.6,default,CheckM2_v1.0.1,61.0,0.21,32.07,sediment metagenome,false,marine,cable_bacteria,marine_sediment,false,d__Bacteria;p__Proteobacteria;s_unclassified_Proteobacteria\n```\n\n### `metagenomic_assemblies` mode (`ASSEMBLYSUBMIT`)\n\nThe input must follow `assets/schema_input_assembly.json`.\n\nRequired columns:\n\n- `sample`\n- `fasta` (must end with `.fa.gz` or `.fasta.gz`)\n- `run_accession`\n- `assembler`\n- `assembler_version`\n\nAt least one of the following must be provided per row:\n\n- reads (`fastq_1`, optional `fastq_2` for paired-end)\n- `coverage`\n\nIf `coverage` is missing and reads are provided, the workflow calculates average coverage with `coverm`.\n\nExample `samplesheet_assembly.csv`:\n\n```csv\nsample,fasta,fastq_1,fastq_2,coverage,run_accession,assembler,assembler_version\nassembly_1,data/contigs_1.fasta.gz,data/reads_1.fastq.gz,data/reads_2.fastq.gz,,ERR011322,SPAdes,3.15.5\nassembly_2,data/contigs_2.fasta.gz,,,42.7,ERR011323,MEGAHIT,1.2.9\n```\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and 
nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\n### Required parameters:\n\n| Parameter | Description |\n| -------------------- | --------------------------------------------------------------------------------- |\n| `--mode` | Type of the data to be submitted. Options: `[mags, bins, metagenomic_assemblies]` |\n| `--input` | Path to the samplesheet describing the data to be submitted |\n| `--outdir` | Path to the output directory for pipeline results |\n| `--submission_study` | ENA study accession (PRJ/ERP) to submit the data to |\n| `--centre_name` | Name of the submitter's organisation |\n\n### Optional parameters:\n\n| Parameter | Description |\n| ------------------- | ---------------------------------------------------------------------------------------- |\n| `--upload_tpa` | Flag to control the type of assembly study (third party assembly or not). Default: false |\n| `--test_upload` | Upload to TEST ENA server instead of LIVE. Default: false |\n| `--webincli_submit` | If set to false, submissions will be validated, but not submitted. 
Default: true |\n\nGeneral command template:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile \\\n --mode \\\n --input \\\n --centre_name \\\n --submission_study \\\n --outdir \n```\n\nValidation run (submission to the ENA TEST server) in `mags` mode:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile docker \\\n --mode mags \\\n --input assets/samplesheet_genomes.csv \\\n --submission_study \\\n --centre_name TEST_CENTER \\\n --webincli_submit true \\\n --test_upload true \\\n --outdir results/validate_mags\n```\n\nValidation run (submission to the ENA TEST server) in `metagenomic_assemblies` mode:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile docker \\\n --mode metagenomic_assemblies \\\n --input assets/samplesheet_assembly.csv \\\n --submission_study \\\n --centre_name TEST_CENTER \\\n --webincli_submit true \\\n --test_upload true \\\n --outdir results/validate_assemblies\n```\n\nLive submission example:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile docker \\\n --mode metagenomic_assemblies \\\n --input assets/samplesheet_assembly.csv \\\n --submission_study PRJEB98843 \\\n --test_upload false \\\n --webincli_submit true \\\n --outdir results/live_assembly\n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/seqsubmit/usage) and the [parameter documentation](https://nf-co.re/seqsubmit/parameters).\n\n## Pipeline output\n\nKey output locations in `--outdir`:\n\n- `upload/manifests/`: generated manifest files for submission\n- `upload/webin_cli/`: ENA Webin CLI reports\n- `multiqc/`: MultiQC summary report\n- `pipeline_info/`: execution reports, trace, DAG, and software versions\n\nFor full details, see the [output documentation](https://nf-co.re/seqsubmit/output).\n\n## Credits\n\nnf-core/seqsubmit was originally written by [Martin Beracochea](https://github.com/mberacochea), [Ekaterina Sakharova](https://github.com/KateSakharova), [Sofiia Ochkalova](https://github.com/ochkalova), [Evangelos Karatzas](https://github.com/vagkaratzas).\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#seqsubmit` channel](https://nfcore.slack.com/channels/seqsubmit) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\n\n\n\n\n\n\nIf you use this pipeline please make sure to cite all used software.\nThis pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/main/LICENSE).\n\n> **MGnify: the microbiome sequence data analysis resource in 2023**\n>\n> Richardson L, Allen B, Baldi G, Beracochea M, Bileschi ML, Burdett T, et al.\n>\n> Vol. 51, Nucleic Acids Research. Oxford University Press (OUP); 2022. p. 
D753\u20139. Available from: http://dx.doi.org/10.1093/nar/gkac1080\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", + "description": "

\n \n \n \"nf-core/seqsubmit\"\n \n

\n\n[![Open in GitHub Codespaces](https://img.shields.io/badge/Open_In_GitHub_Codespaces-black?labelColor=grey&logo=github)](https://github.com/codespaces/new/nf-core/seqsubmit)\n[![GitHub Actions CI Status](https://github.com/nf-core/seqsubmit/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/seqsubmit/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/seqsubmit/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/seqsubmit/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/seqsubmit/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.04.0-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.5.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.5.1)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/seqsubmit)\n\n[![Get help on 
Slack](http://img.shields.io/badge/slack-nf--core%20%23seqsubmit-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/seqsubmit)[![Follow on Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/seqsubmit** is a Nextflow pipeline for submitting sequence data to [ENA](https://www.ebi.ac.uk/ena/browser/home).\nCurrently, the pipeline supports three submission modes, each routed to a dedicated workflow and requiring its own input samplesheet structure:\n\n- `mags` for Metagenome Assembled Genomes (MAGs) submission with `GENOMESUBMIT` workflow\n- `bins` for bins submission with `GENOMESUBMIT` workflow\n- `metagenomic_assemblies` for assembly submission with `ASSEMBLYSUBMIT` workflow\n\n![seqsubmit workflow diagram](assets/seqsubmit_schema.png)\n\n## Requirements\n\n- [Nextflow](https://www.nextflow.io/) `>=25.04.0`\n- Webin account registered at https://www.ebi.ac.uk/ena/submit/webin/login\n- Raw reads used to assemble contigs submitted to [INSDC](https://www.insdc.org/) and associated accessions available\n\nSetup your environment secrets before running the pipeline:\n\n`nextflow secrets set WEBIN_ACCOUNT \"Webin-XXX\"`\n\n`nextflow secrets set WEBIN_PASSWORD \"XXX\"`\n\nMake sure you update commands above with your authorised credentials.\n\n## Input samplesheets\n\nFor detailed descriptions of all samplesheet columns, see the [usage documentation](docs/usage.md#samplesheet-input).\n\n### `mags` and `bins` modes (`GENOMESUBMIT`)\n\nThe input must follow `assets/schema_input_genome.json`.\n\nRequired columns:\n\n- `sample`\n- `fasta` (must end with `.fa.gz` or 
`.fasta.gz`)\n- `accession`\n- `assembly_software`\n- `binning_software`\n- `binning_parameters`\n- `stats_generation_software`\n- `metagenome`\n- `environmental_medium`\n- `broad_environment`\n- `local_environment`\n- `co-assembly`\n\nColumns that required for now, but will be optional in the nearest future:\n\n- `completeness`\n- `contamination`\n- `genome_coverage`\n- `RNA_presence`\n- `NCBI_lineage`\n\nThose fields are metadata required for [genome_uploader](https://github.com/EBI-Metagenomics/genome_uploader) package.\n\nExample `samplesheet_genome.csv`:\n\n```csv\nsample,fasta,accession,assembly_software,binning_software,binning_parameters,stats_generation_software,completeness,contamination,genome_coverage,metagenome,co-assembly,broad_environment,local_environment,environmental_medium,RNA_presence,NCBI_lineage\nlachnospira_eligens,data/bin_lachnospira_eligens.fa.gz,SRR24458089,spades_v3.15.5,metabat2_v2.6,default,CheckM2_v1.0.1,61.0,0.21,32.07,sediment metagenome,No,marine,cable_bacteria,marine_sediment,No,d__Bacteria;p__Proteobacteria;s_unclassified_Proteobacteria\n```\n\n### `metagenomic_assemblies` mode (`ASSEMBLYSUBMIT`)\n\nThe input must follow `assets/schema_input_assembly.json`.\n\nRequired columns:\n\n- `sample`\n- `fasta` (must end with `.fa.gz` or `.fasta.gz`)\n- `run_accession`\n- `assembler`\n- `assembler_version`\n\nAt least one of the following must be provided per row:\n\n- reads (`fastq_1`, optional `fastq_2` for paired-end)\n- `coverage`\n\nIf `coverage` is missing and reads are provided, the workflow calculates average coverage with `coverm`.\n\nExample `samplesheet_assembly.csv`:\n\n```csv\nsample,fasta,fastq_1,fastq_2,coverage,run_accession,assembler,assembler_version\nassembly_1,data/contigs_1.fasta.gz,data/reads_1.fastq.gz,data/reads_2.fastq.gz,,ERR011322,SPAdes,3.15.5\nassembly_2,data/contigs_2.fasta.gz,,,42.7,ERR011323,MEGAHIT,1.2.9\n```\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this 
page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\n### Required parameters:\n\n| Parameter | Description |\n| -------------------- | --------------------------------------------------------------------------------- |\n| `--mode` | Type of the data to be submitted. Options: `[mags, bins, metagenomic_assemblies]` |\n| `--input` | Path to the samplesheet describing the data to be submitted |\n| `--outdir` | Path to the output directory for pipeline results |\n| `--submission_study` | ENA study accession (PRJ/ERP) to submit the data to |\n| `--centre_name` | Name of the submitter's organisation |\n\n### Optional parameters:\n\n| Parameter | Description |\n| ------------------- | ---------------------------------------------------------------------------------------- |\n| `--upload_tpa` | Flag to control the type of assembly study (third party assembly or not). Default: false |\n| `--test_upload` | Upload to TEST ENA server instead of LIVE. Default: false |\n| `--webincli_submit` | If set to false, submissions will be validated, but not submitted. 
Default: true |\n\nGeneral command template:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile \\\n --mode \\\n --input \\\n --centre_name \\\n --submission_study \\\n --outdir \n```\n\nValidation run (submission to the ENA TEST server) in `mags` mode:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile docker \\\n --mode mags \\\n --input assets/samplesheet_genomes.csv \\\n --submission_study \\\n --centre_name TEST_CENTER \\\n --webincli_submit true \\\n --test_upload true \\\n --outdir results/validate_mags\n```\n\nValidation run (submission to the ENA TEST server) in `metagenomic_assemblies` mode:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile docker \\\n --mode metagenomic_assemblies \\\n --input assets/samplesheet_assembly.csv \\\n --submission_study \\\n --centre_name TEST_CENTER \\\n --webincli_submit true \\\n --test_upload true \\\n --outdir results/validate_assemblies\n```\n\nLive submission example:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile docker \\\n --mode metagenomic_assemblies \\\n --input assets/samplesheet_assembly.csv \\\n --submission_study PRJEB98843 \\\n --test_upload false \\\n --webincli_submit true \\\n --outdir results/live_assembly\n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/seqsubmit/usage) and the [parameter documentation](https://nf-co.re/seqsubmit/parameters).\n\n## Pipeline output\n\nKey output locations in `--outdir`:\n\n- `upload/manifests/`: generated manifest files for submission\n- `upload/webin_cli/`: ENA Webin CLI reports\n- `multiqc/`: MultiQC summary report\n- `pipeline_info/`: execution reports, trace, DAG, and software versions\n\nFor full details, see the [output documentation](https://nf-co.re/seqsubmit/output).\n\n## Credits\n\nnf-core/seqsubmit was originally written by [Martin Beracochea](https://github.com/mberacochea), [Ekaterina Sakharova](https://github.com/KateSakharova), [Sofiia Ochkalova](https://github.com/ochkalova), [Evangelos Karatzas](https://github.com/vagkaratzas).\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#seqsubmit` channel](https://nfcore.slack.com/channels/seqsubmit) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\n\n\n\n\n\n\nIf you use this pipeline please make sure to cite all used software.\nThis pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/main/LICENSE).\n\n> **MGnify: the microbiome sequence data analysis resource in 2023**\n>\n> Richardson L, Allen B, Baldi G, Beracochea M, Bileschi ML, Burdett T, et al.\n>\n> Vol. 51, Nucleic Acids Research. Oxford University Press (OUP); 2022. p. 
D753\u20139. Available from: http://dx.doi.org/10.1093/nar/gkac1080\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", "hasPart": [ { "@id": "main.nf" From 0d743f25a9417978d94195146fa9eec6c767d860 Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova Date: Mon, 23 Mar 2026 14:48:13 +0000 Subject: [PATCH 22/36] remove dead code --- bin/submit_study.py | 59 +++------------------------------------------ conf/modules.config | 2 +- 2 files changed, 4 insertions(+), 57 deletions(-) diff --git a/bin/submit_study.py b/bin/submit_study.py index cc9d698..b644ca3 100755 --- a/bin/submit_study.py +++ b/bin/submit_study.py @@ -8,10 +8,11 @@ secrets appearing in shell history or process listings:: export ENA_WEBIN=Webin-XXXXX - export ENA_WEBIN_PASSWORD=SECRET + export ENA_WEBIN_PASSWORD=XXXXX Usage:: + # Submission to TEST server (submissions are discarded daily): python bin/submit_study.py \\ --input studies.json \\ --test @@ -36,7 +37,7 @@ import os import sys import xml.etree.ElementTree as ET -from collections.abc import Callable, Sequence +from collections.abc import Sequence from io import BytesIO from pathlib import Path from typing import Any, Final @@ -449,53 +450,6 @@ def _add_project_attribute( val_el.text = value_text -# ----------------------------------------------------------- -# Structural XML validation (study-specific) -# ----------------------------------------------------------- - - -def _validate_study_xml_structure( - xml_bytes: bytes, - messages: list[str], -) -> tuple[bool, 
list[str]]: - """Structural check for study XML.""" - try: - tree = ET.fromstring(xml_bytes) - except ET.ParseError as exc: - messages.append( - f"ERROR: XML is not well-formed: {exc}" - ) - return False, messages - - messages.append( - "XML is well-formed (basic check passed)" - ) - - project_set = tree.find("PROJECT_SET") - if project_set is None: - messages.append("ERROR: Missing PROJECT_SET element") - return False, messages - - projects = project_set.findall("PROJECT") - if not projects: - messages.append("ERROR: No PROJECT elements found") - return False, messages - - for proj in projects: - alias = proj.get("alias", "") - title = proj.find("TITLE") - if title is None or not title.text: - messages.append(f"ERROR: PROJECT '{alias}' missing TITLE") - return False, messages - sp = proj.find("SUBMISSION_PROJECT") - if sp is None: - messages.append(f"ERROR: PROJECT '{alias}' missing SUBMISSION_PROJECT") - return False, messages - messages.append(f"OK: PROJECT '{alias}' has required elements") - - return True, messages - - # ----------------------------------------------------------- # Receipt parsing # ----------------------------------------------------------- @@ -643,12 +597,6 @@ def _do_submission( default=None, help="Hold studies private until this date (YYYY-MM-DD, max 2 years from now)", ) -@click.option( - "--log", "log_file", - type=click.Path(path_type=Path), - default=None, - help="Path to log file", -) @click.option( "--output", type=click.Path(path_type=Path), @@ -664,7 +612,6 @@ def main( input_file: Path, use_test: bool, hold_until: str | None, - log_file: Path | None, output: Path | None, validate: bool, ) -> None: diff --git a/conf/modules.config b/conf/modules.config index 94f2e94..4497a5f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -57,6 +57,6 @@ process { } withName: 'REGISTERSTUDY' { - ext.args = { [params.test_upload ? "--test" : "", "--automated"].findAll().join(" ") } + ext.args = { params.test_upload ? 
"--test" : "" } } } From 763caeadcf75aff130d1d31d73b25c88d755f5f7 Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova Date: Mon, 23 Mar 2026 17:50:16 +0000 Subject: [PATCH 23/36] add input format examples, do minor simplifications of the code --- bin/submit_study.py | 102 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 89 insertions(+), 13 deletions(-) diff --git a/bin/submit_study.py b/bin/submit_study.py index b644ca3..0fed37e 100755 --- a/bin/submit_study.py +++ b/bin/submit_study.py @@ -1,9 +1,91 @@ #!/usr/bin/env python3 -"""Submit raw-reads, assembly and genome studies to ENA via the Webin REST API v2. +"""Submit studies to ENA via the Webin REST API v2. Read a study metadata file (JSON, CSV, or TSV), construct an XML submission document, and submit new studies to ENA. +# TODO: Currently script supports multiple input format that might be unnecessary. +# TODO: Consider standardising on a single format (e.g. JSON and/or TSV) and deprecating the others. +Input formats accepted (``--input``): + +* ``.json`` +* ``.csv`` +* ``.tsv`` + +Example JSON inputs accepted:: + + { + "alias": "study-gut-2026", + "STUDY_TITLE": "Gut microbiome study", + "STUDY_ABSTRACT": "Characterisation of gut microbial communities", + "existing_study_type": "Metagenomics" + } + + [ + { + "alias": "study-gut-2026", + "STUDY_TITLE": "Gut microbiome study", + "STUDY_ABSTRACT": "Characterisation of gut microbial communities", + "existing_study_type": "Metagenomics" + }, + ... 
+ ] + + { + "studies": [ + { + "alias": "study-soil-2026", + "STUDY_TITLE": "Soil microbiome study", + "existing_study_type": "Other", + "new_study_type": "Environmental microbiome" + } + ] + } + + { + "data": [ + { + "alias": "study-soil-2026", + "STUDY_TITLE": "Soil microbiome study", + } + ] + } + + { + "Container": { + "Studies": [ + { + "STUDY_TITLE": "Marine metagenome study", + "STUDY_ABSTRACT": "Shotgun metagenomics from seawater" + } + ] + } + } + +Example CSV input accepted:: + + alias,STUDY_TITLE,STUDY_ABSTRACT,existing_study_type + study-gut-2026,Gut microbiome study,Characterisation of gut microbial communities,Metagenomics + +Example TSV input accepted:: + + alias\tSTUDY_TITLE\tSTUDY_ABSTRACT\texisting_study_type + study-soil-2026\tSoil microbiome study\tSurvey of soil microbiota\tMetagenomics + +Study metadata fields: + +Mandatory: + +* ``STUDY_TITLE`` — study title used in ````. + +Optional: + +* ``alias`` — project alias; if missing, derived from ``STUDY_TITLE`` (first 50 characters). +* ``CENTER_PROJECT_NAME`` — written to ``<NAME>``; defaults to alias. +* ``STUDY_ABSTRACT`` or ``STUDY_DESCRIPTION`` — written to ``<DESCRIPTION>``. +* ``existing_study_type`` — included as PROJECT_ATTRIBUTE. +* ``new_study_type`` — included only when ``existing_study_type == "Other"``. 
+ Credentials are read from environment variables to avoid secrets appearing in shell history or process listings:: @@ -58,7 +140,7 @@ level=logging.INFO, stream=sys.stderr, ) -logger = logging.getLogger("ena_submit.study") +logger = logging.getLogger() # ----------------------------------------------------------- @@ -91,11 +173,6 @@ def get_credentials() -> tuple[str, str]: TEST_URL: Final = "https://wwwdev.ebi.ac.uk/ena/submit/webin-v2" -def get_base_url(use_test: bool) -> str: - """Return the ENA Webin v2 submission base URL.""" - return TEST_URL if use_test else PROD_URL - - def submit_xml( base_url: str, auth: HTTPBasicAuth, @@ -512,7 +589,6 @@ def _do_submission( xml_bytes: bytes, action: str, results: dict[str, list[dict[str, Any]]], - result_key: str, env_label: str, dry_run: bool, ) -> bool: @@ -525,8 +601,7 @@ def _do_submission( action: Label for log messages (``"ADD"`` or ``"MODIFY"``). results: Results dict to accumulate into. - result_key: Key under which successes are stored. - env_label: ``"TEST"`` or ``"PRODUCTION"``. + env_label: ``"TEST server"`` or ``"LIVE server"``. dry_run: If ``True``, skip the actual submission. 
Returns: @@ -559,7 +634,7 @@ def _do_submission( " %s: alias=%s accession=%s status=%s%s", action, acc["alias"], acc["accession"], acc["status"], ext_suffix, ) - results[result_key].append(acc) + results["submitted"].append(acc) else: logger.error("%s FAILED", action) receipt_xml_str = ET.tostring( @@ -618,9 +693,10 @@ def main( """Submit studies to ENA via the Webin REST API v2.""" username, password = get_credentials() - env_label = "TEST" if use_test else "PRODUCTION" + env_label = "TEST server" if use_test else "LIVE server" logger.info("ENA Study Submission — environment: %s", env_label) - base_url = get_base_url(use_test) + base_url = TEST_URL if use_test else PROD_URL + auth = HTTPBasicAuth(username, password) logger.debug("Auth username: %s", username) From ac77c707864b1368dcdff471fe12a6897ccff78d Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova <so.ochkalova@gmail.com> Date: Mon, 23 Mar 2026 17:50:45 +0000 Subject: [PATCH 24/36] update meta.yml for registerstudy --- modules/local/registerstudy/meta.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/modules/local/registerstudy/meta.yml b/modules/local/registerstudy/meta.yml index 549f187..385c79e 100644 --- a/modules/local/registerstudy/meta.yml +++ b/modules/local/registerstudy/meta.yml @@ -2,11 +2,10 @@ name: "registerstudy" description: | Submit a new study to ENA via the Webin REST API v2. - Reads study metadata from a JSON, CSV, or TSV file, checks for - duplicate studies already registered under the Webin account, + Reads study metadata from a JSON, CSV, or TSV file, builds a PROJECT XML submission document, and submits to ENA. Credentials are read from the WEBIN_ACCOUNT and WEBIN_PASSWORD - Nextflow secrets, which are mapped to ENA_WEBIN and ENA_WEBIN_PASSWORD + env variables, which are mapped to ENA_WEBIN and ENA_WEBIN_PASSWORD inside the process. 
keywords: - ena @@ -53,8 +52,7 @@ output: type: file description: | JSON file containing the submission results with keys: - submitted (newly created accessions), duplicates (skipped), - modified (force-updated), and failed. + submitted (newly created accessions) and failed. pattern: "*_accessions.json" - versions: - "versions.yml": From 6ede603bcde4716b51612ccd1c486249c479a33f Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova <so.ochkalova@gmail.com> Date: Tue, 24 Mar 2026 11:35:24 +0000 Subject: [PATCH 25/36] refactor and bugfix registerstudy --- bin/submit_study.py | 193 ++++++++++-------- conf/modules.config | 5 +- .../local/registerstudy/tests/nextflow.config | 11 +- tests/nextflow.config | 29 --- 4 files changed, 109 insertions(+), 129 deletions(-) delete mode 100644 tests/nextflow.config diff --git a/bin/submit_study.py b/bin/submit_study.py index 0fed37e..0c88bfd 100755 --- a/bin/submit_study.py +++ b/bin/submit_study.py @@ -1,11 +1,13 @@ #!/usr/bin/env python3 -"""Submit studies to ENA via the Webin REST API v2. +"""Submit studies to ENA via the Webin drop-box XML submission service. Read a study metadata file (JSON, CSV, or TSV), construct an XML submission document, and submit new studies to ENA. # TODO: Currently script supports multiple input format that might be unnecessary. # TODO: Consider standardising on a single format (e.g. JSON and/or TSV) and deprecating the others. +# TODO: Consider which columns are mandatory vs optional. "alias" is optional, might be worth making it mandatory. +# TODO: Add input file validation and error handling (e.g. missing mandatory fields, long alias). 
Input formats accepted (``--input``): * ``.json`` @@ -114,6 +116,7 @@ import csv import datetime +import hashlib import json import logging import os @@ -169,33 +172,34 @@ def get_credentials() -> tuple[str, str]: # ENA API helpers # ----------------------------------------------------------- -PROD_URL: Final = "https://www.ebi.ac.uk/ena/submit/webin-v2" -TEST_URL: Final = "https://wwwdev.ebi.ac.uk/ena/submit/webin-v2" +PROD_URL: Final = "https://www.ebi.ac.uk/ena/submit/drop-box" +TEST_URL: Final = "https://wwwdev.ebi.ac.uk/ena/submit/drop-box" def submit_xml( base_url: str, auth: HTTPBasicAuth, - xml_bytes: bytes, + submission_xml_bytes: bytes, + project_xml_bytes: bytes, ) -> ET.Element: - """Submit an XML document to ENA via Webin v2. + """Submit study XMLs to ENA via the submit/drop-box endpoint. Args: base_url: ENA submission service base URL. auth: HTTP basic-auth credentials. - xml_bytes: Serialised XML submission document. + submission_xml_bytes: Serialised ``<SUBMISSION>`` XML. + project_xml_bytes: Serialised ``<PROJECT_SET>`` XML. Returns: Parsed receipt XML element tree root. """ url = f"{base_url}/submit" - headers = { - "Content-Type": "application/xml", - "Accept": "application/xml", + files = { + "SUBMISSION": ("submission.xml", submission_xml_bytes, "application/xml"), + "PROJECT": ("project.xml", project_xml_bytes, "application/xml"), } resp = requests.post( - url, data=xml_bytes, - headers=headers, auth=auth, timeout=120, + url, files=files, auth=auth, timeout=120, ) resp.raise_for_status() return ET.fromstring(resp.content) @@ -423,35 +427,24 @@ def write_results( # ----------------------------------------------------------- -def build_submission_xml( - studies: list[dict[str, Any]], +def build_submission_actions_xml( hold_until: str | None = None, action: str = "ADD", ) -> ET.Element: - """Build a WEBIN XML document for submitting studies. + """Build the ``<SUBMISSION>`` actions XML element. 
- Each study in the input list is converted to a PROJECT - element. + This is submitted as the ``SUBMISSION`` multipart field. Args: - studies: Study metadata dicts. hold_until: Optional hold-until date string (``YYYY-MM-DD``). action: Submission action — ``"ADD"`` for new studies or ``"MODIFY"`` to update existing ones. Returns: - Root ``<WEBIN>`` element. + Root ``<SUBMISSION>`` element. """ - webin = ET.Element("WEBIN") - - # SUBMISSION_SET - submission_set = ET.SubElement(webin, "SUBMISSION_SET") - submission = ET.SubElement( - submission_set, "SUBMISSION", - ) - sub_alias = f"study-submission-{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}" - submission.set("alias", sub_alias) + submission = ET.Element("SUBMISSION") actions = ET.SubElement(submission, "ACTIONS") main_action = ET.SubElement(actions, "ACTION") ET.SubElement(main_action, action.upper()) @@ -459,59 +452,74 @@ def build_submission_xml( hold_action = ET.SubElement(actions, "ACTION") hold_el = ET.SubElement(hold_action, "HOLD") hold_el.set("HoldUntilDate", hold_until) + return submission - # PROJECT_SET - project_set = ET.SubElement(webin, "PROJECT_SET") - for study in studies: - _add_project_element(project_set, study) - - return webin - - -def _add_project_element( - project_set: ET.Element, - study: dict[str, Any], -) -> None: - """Append a ``<PROJECT>`` element to *project_set*.""" - alias = study.get( - "alias", - study.get("STUDY_TITLE", "").replace(" ", "_")[:50], - ) - project = ET.SubElement(project_set, "PROJECT") - project.set("alias", alias) - - name_text = study.get("CENTER_PROJECT_NAME", alias) - if name_text: - name_el = ET.SubElement(project, "NAME") - name_el.text = name_text - title_el = ET.SubElement(project, "TITLE") - title_el.text = study.get("STUDY_TITLE", "") +def build_project_set_xml( + studies: list[dict[str, Any]], + test: bool = False, +) -> ET.Element: + """Build the ``<PROJECT_SET>`` XML element. 
- desc_text = ( - study.get("STUDY_ABSTRACT") - or study.get("STUDY_DESCRIPTION", "") - ) - if desc_text: - desc_el = ET.SubElement(project, "DESCRIPTION") - desc_el.text = desc_text + This is submitted as the ``PROJECT`` multipart field. - sp = ET.SubElement(project, "SUBMISSION_PROJECT") - ET.SubElement(sp, "SEQUENCING_PROJECT") + Args: + studies: Study metadata dicts. + test: If ``True``, append a timestamp-based hash to aliases + for uniqueness in test submissions. - study_type = study.get("existing_study_type") - if study_type: - attrs = ET.SubElement( - project, "PROJECT_ATTRIBUTES", + Returns: + Root ``<PROJECT_SET>`` element. + """ + project_set = ET.Element("PROJECT_SET") + for study in studies: + alias = study.get( + "alias", + study.get("STUDY_TITLE", "").replace(" ", "_")[:50], ) - _add_project_attribute( - attrs, "existing_study_type", study_type, + if test: + # Append 8-character hash of current timestamp for uniqueness in test mode + timestamp_hash = hashlib.md5( + datetime.datetime.now().isoformat().encode() + ).hexdigest()[:8] + alias = f"{alias}_{timestamp_hash}" + + project = ET.SubElement(project_set, "PROJECT") + project.set("alias", alias) + + name_text = study.get("CENTER_PROJECT_NAME", alias) + if name_text: + name_el = ET.SubElement(project, "NAME") + name_el.text = name_text + + title_el = ET.SubElement(project, "TITLE") + title_el.text = study.get("STUDY_TITLE", "") + + desc_text = ( + study.get("STUDY_ABSTRACT") + or study.get("STUDY_DESCRIPTION", "") ) - new_type = study.get("new_study_type") - if new_type and study_type == "Other": + if desc_text: + desc_el = ET.SubElement(project, "DESCRIPTION") + desc_el.text = desc_text + + sp = ET.SubElement(project, "SUBMISSION_PROJECT") + ET.SubElement(sp, "SEQUENCING_PROJECT") + # TODO: Check existing_study_type and new_study_type metadata fields, do we need those? 
+ study_type = study.get("existing_study_type") + if study_type: + attrs = ET.SubElement( + project, "PROJECT_ATTRIBUTES", + ) _add_project_attribute( - attrs, "new_study_type", new_type, + attrs, "existing_study_type", study_type, ) + new_type = study.get("new_study_type") + if new_type and study_type == "Other": + _add_project_attribute( + attrs, "new_study_type", new_type, + ) + return project_set def _add_project_attribute( @@ -554,6 +562,8 @@ def parse_xml_receipt( for err in msgs_el.findall("ERROR"): messages.append(f"ERROR: {err.text}") + # TODO: "accession" should be present for successful submissions + # TODO: remove get default and log error if missing. for proj in receipt_root.findall("PROJECT"): acc_info: dict[str, str] = { "alias": proj.get("alias", ""), @@ -586,7 +596,8 @@ def parse_xml_receipt( def _do_submission( base_url: str, auth: Any, - xml_bytes: bytes, + submission_xml_bytes: bytes, + project_xml_bytes: bytes, action: str, results: dict[str, list[dict[str, Any]]], env_label: str, @@ -595,9 +606,10 @@ def _do_submission( """Validate, optionally submit, and parse one batch. Args: - base_url: ENA Webin v2 submission base URL. + base_url: ENA submission base URL. auth: HTTP basic-auth credentials. - xml_bytes: Serialised XML submission document. + submission_xml_bytes: Serialised ``<SUBMISSION>`` actions XML. + project_xml_bytes: Serialised ``<PROJECT_SET>`` XML. action: Label for log messages (``"ADD"`` or ``"MODIFY"``). results: Results dict to accumulate into. 
@@ -609,12 +621,13 @@ def _do_submission( """ if dry_run: logger.info("DRY RUN — skipping %s submission", action) - logger.info("Generated XML:\n%s", xml_bytes.decode("utf-8")) + logger.info("SUBMISSION XML:\n%s", submission_xml_bytes.decode("utf-8")) + logger.info("PROJECT XML:\n%s", project_xml_bytes.decode("utf-8")) return True logger.info("Submitting %s to ENA (%s)...", action, env_label) try: - receipt_root = submit_xml(base_url, auth, xml_bytes) + receipt_root = submit_xml(base_url, auth, submission_xml_bytes, project_xml_bytes) except requests.exceptions.HTTPError as exc: logger.error("HTTP error during %s submission: %s", action, exc) if exc.response is not None: @@ -654,7 +667,7 @@ def _do_submission( @click.command( - help="Submit studies to ENA via the Webin REST API v2.", + help="Register studies with ENA using Webin XML submission service.", ) @click.option( "--input", "input_file", @@ -690,7 +703,7 @@ def main( output: Path | None, validate: bool, ) -> None: - """Submit studies to ENA via the Webin REST API v2.""" + """Register studies with ENA using Webin XML submission service.""" username, password = get_credentials() env_label = "TEST server" if use_test else "LIVE server" @@ -726,15 +739,18 @@ def main( # -- Step 2: Build and submit XML -------------------- logger.info("Building ADD XML for %d study/studies...", len(studies)) - xml_root = build_submission_xml(studies, hold_until=hold_until, action="ADD") - xml_bytes = xml_to_bytes(xml_root) - logger.debug("Generated XML:\n%s", xml_bytes.decode("utf-8")) - logger.info("XML document size: %d bytes", len(xml_bytes)) + submission_root = build_submission_actions_xml(hold_until=hold_until, action="ADD") + project_root = build_project_set_xml(studies, test=use_test) + submission_xml_bytes = xml_to_bytes(submission_root) + project_xml_bytes = xml_to_bytes(project_root) + logger.info("SUBMISSION XML document size: %d bytes", len(submission_xml_bytes)) + logger.debug("SUBMISSION XML:\n%s", 
submission_xml_bytes.decode("utf-8")) + logger.info("PROJECT XML document size: %d bytes", len(project_xml_bytes)) + logger.debug("PROJECT XML:\n%s", project_xml_bytes.decode("utf-8")) ok = _do_submission( - base_url, auth, xml_bytes, + base_url, auth, submission_xml_bytes, project_xml_bytes, action="ADD", results=results, - result_key="submitted", env_label=env_label, dry_run=validate, ) @@ -748,10 +764,11 @@ def main( logger.info("=" * 60) logger.info("SUBMISSION SUMMARY") logger.info(" Submitted (ADD): %d", len(results["submitted"])) - for s in results["submitted"]: - ext = s.get("external_accession", "") - ext_suffix = f" ({ext})" if ext else "" - logger.info(" %s -> %s%s", s["alias"], s["accession"], ext_suffix) + for submission in results["submitted"]: + alias = submission["alias"] + accession = submission["accession"] + external_accession = submission["external_accession"] + logger.info(f" {alias} -> {accession} ({external_accession})") logger.info("=" * 60) diff --git a/conf/modules.config b/conf/modules.config index c45656c..aab05b3 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -71,11 +71,8 @@ process { ] } - withName: 'GENERATE_ASSEMBLY_MANIFEST|ENA_WEBIN_CLI' { + withName: 'GENERATE_ASSEMBLY_MANIFEST|ENA_WEBIN_CLI|REGISTERSTUDY' { ext.args = { params.test_upload ? "--test" : "" } } - withName: 'REGISTERSTUDY' { - ext.args = { params.test_upload ? 
"--test" : "" } - } } diff --git a/modules/local/registerstudy/tests/nextflow.config b/modules/local/registerstudy/tests/nextflow.config index aaf8385..0a1acb3 100644 --- a/modules/local/registerstudy/tests/nextflow.config +++ b/modules/local/registerstudy/tests/nextflow.config @@ -8,16 +8,11 @@ process { withName: REGISTERSTUDY { - ext.args = '--test --validate' + ext.args = '--test' } } env { - ENA_WEBIN = 'Webin-000000' - ENA_WEBIN_PASSWORD = 'dummy-password' -} - -docker { - enabled = true - runOptions = '-u $(id -u):$(id -g)' + ENA_WEBIN = secrets.ENA_WEBIN + ENA_WEBIN_PASSWORD = secrets.ENA_WEBIN_PASSWORD } diff --git a/tests/nextflow.config b/tests/nextflow.config deleted file mode 100644 index be915f5..0000000 --- a/tests/nextflow.config +++ /dev/null @@ -1,29 +0,0 @@ -/* -======================================================================================== - Nextflow config file for running nf-test tests -======================================================================================== -*/ - -// TODO nf-core: Specify any additional parameters here -// Or any resources requirements -params { - modules_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/' - pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/seqsubmit' -} - -process { - resourceLimits = [ - cpus: 2, - memory: '8.GB', - time: '1.h' - ] -} - -// Override secrets-based env vars so nf-test runs don't require a populated keystore. -// Stub-mode tests never use the actual credentials. 
-env { - ENA_WEBIN = "test_webin_account" - ENA_WEBIN_PASSWORD = "test_webin_password" -} - -aws.client.anonymous = true // fixes S3 access issues on self-hosted runners From ab305dabe7da90ceab172af721999fab271f5cd2 Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova <so.ochkalova@gmail.com> Date: Tue, 24 Mar 2026 12:46:20 +0000 Subject: [PATCH 26/36] add real test submission for registerstudy in nf-tests --- bin/submit_study.py | 2 -- modules/local/registerstudy/meta.yml | 8 +++++--- modules/local/registerstudy/tests/main.nf.test | 16 +++++++--------- workflows/genomesubmit.nf | 1 - 4 files changed, 12 insertions(+), 15 deletions(-) diff --git a/bin/submit_study.py b/bin/submit_study.py index 0c88bfd..b4b700f 100755 --- a/bin/submit_study.py +++ b/bin/submit_study.py @@ -136,8 +136,6 @@ # Logging # ----------------------------------------------------------- -# All loggers in the ENA submission scripts share this root, -# so configuring it once propagates to all child loggers. logging.basicConfig( format="%(levelname)s: %(message)s", level=logging.INFO, diff --git a/modules/local/registerstudy/meta.yml b/modules/local/registerstudy/meta.yml index 385c79e..e3e3245 100644 --- a/modules/local/registerstudy/meta.yml +++ b/modules/local/registerstudy/meta.yml @@ -1,10 +1,10 @@ # yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json name: "registerstudy" description: | - Submit a new study to ENA via the Webin REST API v2. + Submit a new study to ENA via via the Webin drop-box XML submission service. Reads study metadata from a JSON, CSV, or TSV file, - builds a PROJECT XML submission document, and submits to ENA. - Credentials are read from the WEBIN_ACCOUNT and WEBIN_PASSWORD + builds SUBMISSION XML and PROJECT XML, and submits to ENA. + Credentials are read from the ENA_WEBIN and ENA_WEBIN_PASSWORD env variables, which are mapped to ENA_WEBIN and ENA_WEBIN_PASSWORD inside the process. 
keywords: @@ -62,5 +62,7 @@ output: authors: - "@timrozday" + - "@ochkalova" maintainers: - "@timrozday" + - "@ochkalova" diff --git a/modules/local/registerstudy/tests/main.nf.test b/modules/local/registerstudy/tests/main.nf.test index 5d61673..cdc0e69 100644 --- a/modules/local/registerstudy/tests/main.nf.test +++ b/modules/local/registerstudy/tests/main.nf.test @@ -7,8 +7,7 @@ nextflow_process { tag "modules" tag "registerstudy" - test("registerstudy - stub") { - options "-stub" + test("registerstudy - submission to ENA test server") { when { process { @@ -24,14 +23,15 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert snapshot(sanitizeOutput(process.out)).match() } + { assert path(process.out.accessions[0][1]).exists() }, + { assert path(process.out.accessions[0][1]).json.submitted instanceof List }, + { assert path(process.out.accessions[0][1]).json.failed.size() == 0 } ) } } - test("registerstudy - dry run against ENA test server") { - // Validates and builds the submission XML but does not submit to ENA. - // Dummy credentials in tests/nextflow.config are sufficient for dry-run mode. 
+ test("registerstudy - stub") { + options "-stub" when { process { @@ -47,9 +47,7 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert path(process.out.accessions[0][1]).exists() }, - { assert path(process.out.accessions[0][1]).json.submitted instanceof List }, - { assert path(process.out.accessions[0][1]).json.failed.size() == 0 } + { assert snapshot(sanitizeOutput(process.out)).match() } ) } } diff --git a/workflows/genomesubmit.nf b/workflows/genomesubmit.nf index afc7859..4b8ca08 100644 --- a/workflows/genomesubmit.nf +++ b/workflows/genomesubmit.nf @@ -259,7 +259,6 @@ workflow GENOMESUBMIT { .map { _meta, json -> def data = new groovy.json.JsonSlurper().parse(json) data.submitted[0]?.accession - ?: data.duplicates[0]?.existing_accession } } From 1ddd2b15982fe07812799a486875f430df26a83c Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova <so.ochkalova@gmail.com> Date: Tue, 24 Mar 2026 12:58:13 +0000 Subject: [PATCH 27/36] update schema --- nextflow_schema.json | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 7e42ad9..d5d0ebb 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -308,7 +308,7 @@ "format": "file-path", "exists": true, "description": "Path to study metadata file (JSON, CSV, or TSV) for registering a new ENA study. Required when submission_study is not provided.", - "help_text": "File containing study metadata fields (STUDY_TITLE, STUDY_ABSTRACT, existing_study_type, alias, etc.). Used by REGISTERSTUDY to create a new study in ENA when no existing submission_study accession is given.", + "help_text": "File containing study metadata fields (required: STUDY_TITLE, optional: alias, STUDY_ABSTRACT, existing_study_type, etc.). 
Used by REGISTERSTUDY to create a new study in ENA when no existing submission_study accession is given.", "fa_icon": "fas fa-file-alt" }, "webincli_submit": { @@ -320,6 +320,20 @@ } } }, + "oneOf": [ + { + "required": ["submission_study"], + "not": { + "required": ["study_metadata"] + } + }, + { + "required": ["study_metadata"], + "not": { + "required": ["submission_study"] + } + } + ], "allOf": [ { "$ref": "#/$defs/input_output_options" From 6026318b4cd3fdda1bd44e1e1b8c19a80d062362 Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova <so.ochkalova@gmail.com> Date: Tue, 24 Mar 2026 13:07:52 +0000 Subject: [PATCH 28/36] revert some minor changes for branch merging later --- conf/modules.config | 1 - modules/local/ena_webin_cli/main.nf | 12 ----------- tests/default.nf.test | 33 +++++++++++++++++++++++++++++ tests/nextflow.config | 14 ++++++++++++ 4 files changed, 47 insertions(+), 13 deletions(-) create mode 100644 tests/default.nf.test create mode 100644 tests/nextflow.config diff --git a/conf/modules.config b/conf/modules.config index aab05b3..f828bef 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -74,5 +74,4 @@ process { withName: 'GENERATE_ASSEMBLY_MANIFEST|ENA_WEBIN_CLI|REGISTERSTUDY' { ext.args = { params.test_upload ? 
"--test" : "" } } - } diff --git a/modules/local/ena_webin_cli/main.nf b/modules/local/ena_webin_cli/main.nf index e5f878e..25b12f4 100644 --- a/modules/local/ena_webin_cli/main.nf +++ b/modules/local/ena_webin_cli/main.nf @@ -58,16 +58,4 @@ process ENA_WEBIN_CLI { false fi """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - """ - touch ${prefix}_webin-cli.report - export STATUS="success" - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - ena-webin-cli: 0.0.0 - END_VERSIONS - """ } diff --git a/tests/default.nf.test b/tests/default.nf.test new file mode 100644 index 0000000..44f2465 --- /dev/null +++ b/tests/default.nf.test @@ -0,0 +1,33 @@ +nextflow_pipeline { + + name "Test pipeline" + script "../main.nf" + tag "pipeline" + + test("-profile test") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/nextflow.config b/tests/nextflow.config new file mode 100644 index 0000000..695d52b --- /dev/null +++ b/tests/nextflow.config @@ -0,0 +1,14 @@ +/* +======================================================================================== + Nextflow config file for running nf-test tests 
+======================================================================================== +*/ + +// TODO nf-core: Specify any additional parameters here +// Or any resources requirements +params { + modules_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/' + pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/seqsubmit' +} + +aws.client.anonymous = true // fixes S3 access issues on self-hosted runners From 8ac674db79b7c85e9a5383538dd23dc81921c0d8 Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova <so.ochkalova@gmail.com> Date: Tue, 24 Mar 2026 13:08:09 +0000 Subject: [PATCH 29/36] update docs --- README.md | 20 ++++++++++------ docs/usage.md | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 450e725..fb8c2cf 100644 --- a/README.md +++ b/README.md @@ -116,15 +116,21 @@ assembly_2,data/contigs_2.fasta.gz,,,42.7,ERR011323,MEGAHIT,1.2.9 > [!NOTE] > If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. +### Submission study + +All data submitted through this pipeline must be associated with an ENA study (project). You can either pass an accession of your existing study via `--submission_study`or provide a metadata file via `--study_metadata` and the pipeline will register the study with ENA before submitting your data. + +See the [usage documentation](docs/usage.md#submission-study) for more details. + ### Required parameters: -| Parameter | Description | -| -------------------- | --------------------------------------------------------------------------------- | -| `--mode` | Type of the data to be submitted. 
Options: `[mags, bins, metagenomic_assemblies]` | -| `--input` | Path to the samplesheet describing the data to be submitted | -| `--outdir` | Path to the output directory for pipeline results | -| `--submission_study` | ENA study accession (PRJ/ERP) to submit the data to | -| `--centre_name` | Name of the submitter's organisation | +| Parameter | Description | +| ------------------------------------------ | ----------------------------------------------------------------------------------------------------------------- | +| `--mode` | Type of the data to be submitted. Options: `[mags, bins, metagenomic_assemblies]` | +| `--input` | Path to the samplesheet describing the data to be submitted | +| `--outdir` | Path to the output directory for pipeline results | +| `--submission_study` OR `--study_metadata` | ENA study accession (PRJ/ERP) to submit the data to OR metadata file in JSON/TSV/CSV format to register new study | +| `--centre_name` | Name of the submitter's organisation | ### Optional parameters: diff --git a/docs/usage.md b/docs/usage.md index 0833bb6..9cab0d0 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -99,6 +99,70 @@ assembly_002,data/assembly_002.fasta.gz,,,42.7,ERR011323,MEGAHIT,1.2.9 An example file is available at [assets/samplesheet_assembly.csv](../assets/samplesheet_assembly.csv). +## Submission study + +All data submitted through this pipeline must be associated with an ENA study (project). You have two options: + +### Option 1 — Use an existing study + +If you already have an ENA study, pass its accession (starting with `PRJ` or `ERP`) via `--submission_study`: + +```bash +--submission_study PRJEB12345 +``` + +You can create a study manually via the [Webin Portal](https://www.ebi.ac.uk/ena/submit/webin/login) and then use the assigned accession here. 
+ +### Option 2 — Register a new study automatically + +Provide a study metadata file via `--study_metadata` and the pipeline will register the study with ENA before submitting your data: + +```bash +--study_metadata study_metadata.json +``` + +The pipeline accepts JSON, CSV, and TSV formats. + +#### JSON formats + +Single study as a flat object: + +```json +{ + "alias": "study-gut-2026", + "STUDY_TITLE": "Gut microbiome study", + "STUDY_ABSTRACT": "Characterisation of gut microbial communities" +} +``` + +#### CSV format + +```csv +alias,STUDY_TITLE,STUDY_ABSTRACT +study-gut-2026,Gut microbiome study,Characterisation of gut microbial communities +``` + +#### TSV format + +```tsv +alias STUDY_TITLE STUDY_ABSTRACT +study-soil-2026 Soil microbiome study Survey of soil microbiota +``` + +#### Study metadata fields + +| Field | Required | Description | +| --------------------- | -------- | ------------------------------------------------------------------------------------------- | +| `STUDY_TITLE` | Yes | Descriptive title of the study. | +| `alias` | No | Unique project alias within your Webin account. Derived from `STUDY_TITLE` if not provided. | +| `STUDY_ABSTRACT` | No | Free-text abstract describing the study. | +| `STUDY_DESCRIPTION` | No | Alternative to `STUDY_ABSTRACT`. | +| `CENTER_PROJECT_NAME` | No | Internal project name at your centre. Defaults to `alias`. | +| `existing_study_type` | No | ENA study type (e.g. `Metagenomics`, `Other`). | +| `new_study_type` | No | Custom study type. Only used when `existing_study_type` is set to `Other`. | + +An example metadata file is available at [assets/study_metadata.json](../assets/study_metadata.json). 
+ ## Running the pipeline General command template: From 6eae0bd3286551d8d59d0dd8a9dd02647f3d6894 Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova <so.ochkalova@gmail.com> Date: Tue, 24 Mar 2026 13:08:40 +0000 Subject: [PATCH 30/36] add tests for pipeline with study registration --- ...assembly_no_study_complete_metadata.config | 37 +++++++++++++++++ ...test_mag_no_study_complete_metadata.config | 40 +++++++++++++++++++ ...ssembly_no_study_complete_metadata.nf.test | 39 ++++++++++++++++++ tests/mag_no_study_complete_metadata.nf.test | 39 ++++++++++++++++++ 4 files changed, 155 insertions(+) create mode 100644 conf/test_assembly_no_study_complete_metadata.config create mode 100644 conf/test_mag_no_study_complete_metadata.config create mode 100644 tests/assembly_no_study_complete_metadata.nf.test create mode 100644 tests/mag_no_study_complete_metadata.nf.test diff --git a/conf/test_assembly_no_study_complete_metadata.config b/conf/test_assembly_no_study_complete_metadata.config new file mode 100644 index 0000000..f729c77 --- /dev/null +++ b/conf/test_assembly_no_study_complete_metadata.config @@ -0,0 +1,37 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/seqsubmit -profile test,<docker/singularity> --outdir <OUTDIR> + +---------------------------------------------------------------------------------------- +*/ + +process { + resourceLimits = [ + cpus: 2, + memory: '8.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test --mode metagenomic_assemblies complete_metadata profile' + config_profile_description = 'Single-case assembly test with complete metadata values provided' + + // Input data + input = params.pipelines_testdata_base_path + 'seqsubmit/samplesheets/assembly_complete_metadata.csv' + + mode = "metagenomic_assemblies" + submission_study = null + study_metadata = "$projectDir/assets/study_metadata.json" + centre_name = "TEST_CENTER" + + test_upload = true + +} + +docker.enabled = true diff --git a/conf/test_mag_no_study_complete_metadata.config b/conf/test_mag_no_study_complete_metadata.config new file mode 100644 index 0000000..dd3e659 --- /dev/null +++ b/conf/test_mag_no_study_complete_metadata.config @@ -0,0 +1,40 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/seqsubmit -profile test,<docker/singularity> --outdir <OUTDIR> + +---------------------------------------------------------------------------------------- +*/ + +process { + resourceLimits = [ + cpus: 2, + memory: '16.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test --mode mags complete_metadata profile' + config_profile_description = 'Single-case MAG test with complete metadata values provided' + + // Input data + input = params.pipelines_testdata_base_path + 'seqsubmit/samplesheets/mag_complete_metadata.csv' + + mode = "mags" + submission_study = null + study_metadata = "$projectDir/assets/study_metadata.json" + centre_name = "TEST_CENTER" + + test_upload = true + + cat_db = null + checkm2_db = null + +} + +docker.enabled = true diff --git a/tests/assembly_no_study_complete_metadata.nf.test b/tests/assembly_no_study_complete_metadata.nf.test new file mode 100644 index 0000000..b6c857b --- /dev/null +++ b/tests/assembly_no_study_complete_metadata.nf.test @@ -0,0 +1,39 @@ +nextflow_pipeline { + + name "Test assembly submission workflow stub - complete_metadata" + script "../main.nf" + tag "pipeline" + tag "mode_assembly" + tag "test_assembly_no_study_complete_metadata" + profile "test_assembly_no_study_complete_metadata" + + test("-profile test_assembly_no_study_complete_metadata") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline 
versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/mag_no_study_complete_metadata.nf.test b/tests/mag_no_study_complete_metadata.nf.test new file mode 100644 index 0000000..d585286 --- /dev/null +++ b/tests/mag_no_study_complete_metadata.nf.test @@ -0,0 +1,39 @@ +nextflow_pipeline { + + name "Test genome submission workflow - complete_metadata" + script "../main.nf" + tag "pipeline" + tag "mode_mag" + tag "test_mag_no_study_complete_metadata" + profile "test_mag_no_study_complete_metadata" + + test("-profile test_mag_no_study_complete_metadata") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}', '**/manifests_test/*']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} From cc29a3e4c04ee4ffe82214d12d2064fc9124de5d Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova 
<so.ochkalova@gmail.com> Date: Tue, 24 Mar 2026 14:27:39 +0000 Subject: [PATCH 31/36] apply linter --- modules/local/registerstudy/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/registerstudy/environment.yml b/modules/local/registerstudy/environment.yml index 6ee92a8..a5e1bf2 100644 --- a/modules/local/registerstudy/environment.yml +++ b/modules/local/registerstudy/environment.yml @@ -7,4 +7,4 @@ dependencies: - conda-forge::python>=3.12 - conda-forge::pip - pip: - - mgnify-pipelines-toolkit==1.4.17 + - mgnify-pipelines-toolkit==1.4.17 From bbd8305ce6ec19897838a5104ff1bbcacc062a06 Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova <so.ochkalova@gmail.com> Date: Tue, 24 Mar 2026 14:35:20 +0000 Subject: [PATCH 32/36] linter on .gitignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 601993a..1c11923 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,4 @@ testing* null/ .nf-test* .idea/ -test_data \ No newline at end of file +test_data From 1a82bec1da2906a6b7cf07296ccf4101d0d447f3 Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova <so.ochkalova@gmail.com> Date: Tue, 24 Mar 2026 15:26:29 +0000 Subject: [PATCH 33/36] revert to REST API usage --- bin/submit_study.py | 192 ++++++++++++++++++++++---------------------- 1 file changed, 95 insertions(+), 97 deletions(-) diff --git a/bin/submit_study.py b/bin/submit_study.py index b4b700f..7bae1ef 100755 --- a/bin/submit_study.py +++ b/bin/submit_study.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Submit studies to ENA via the Webin drop-box XML submission service. +"""Submit studies to ENA via the Webin REST API v2. Read a study metadata file (JSON, CSV, or TSV), construct an XML submission document, and submit new studies to ENA. 
@@ -170,34 +170,33 @@ def get_credentials() -> tuple[str, str]: # ENA API helpers # ----------------------------------------------------------- -PROD_URL: Final = "https://www.ebi.ac.uk/ena/submit/drop-box" -TEST_URL: Final = "https://wwwdev.ebi.ac.uk/ena/submit/drop-box" +PROD_URL: Final = "https://www.ebi.ac.uk/ena/submit/webin-v2" +TEST_URL: Final = "https://wwwdev.ebi.ac.uk/ena/submit/webin-v2" def submit_xml( base_url: str, auth: HTTPBasicAuth, - submission_xml_bytes: bytes, - project_xml_bytes: bytes, + xml_bytes: bytes, ) -> ET.Element: - """Submit study XMLs to ENA via the submit/drop-box endpoint. + """Submit an XML document to ENA via Webin REST API v2. Args: base_url: ENA submission service base URL. auth: HTTP basic-auth credentials. - submission_xml_bytes: Serialised ``<SUBMISSION>`` XML. - project_xml_bytes: Serialised ``<PROJECT_SET>`` XML. + xml_bytes: Serialised XML submission document. Returns: Parsed receipt XML element tree root. """ url = f"{base_url}/submit" - files = { - "SUBMISSION": ("submission.xml", submission_xml_bytes, "application/xml"), - "PROJECT": ("project.xml", project_xml_bytes, "application/xml"), + headers = { + "Content-Type": "application/xml", + "Accept": "application/xml", } resp = requests.post( - url, files=files, auth=auth, timeout=120, + url, data=xml_bytes, + headers=headers, auth=auth, timeout=120, ) resp.raise_for_status() return ET.fromstring(resp.content) @@ -425,24 +424,33 @@ def write_results( # ----------------------------------------------------------- -def build_submission_actions_xml( +def build_submission_xml( + studies: list[dict[str, Any]], hold_until: str | None = None, action: str = "ADD", + test: bool = False, ) -> ET.Element: - """Build the ``<SUBMISSION>`` actions XML element. - - This is submitted as the ``SUBMISSION`` multipart field. + """Build a ``<WEBIN>`` XML document for submitting studies. Args: + studies: Study metadata dicts. hold_until: Optional hold-until date string (``YYYY-MM-DD``). 
action: Submission action — ``"ADD"`` for new studies or ``"MODIFY"`` to update existing ones. + test: If ``True``, append a timestamp-based hash to aliases + for uniqueness in test submissions. Returns: - Root ``<SUBMISSION>`` element. + Root ``<WEBIN>`` element. """ - submission = ET.Element("SUBMISSION") + webin = ET.Element("WEBIN") + + # SUBMISSION_SET + submission_set = ET.SubElement(webin, "SUBMISSION_SET") + submission = ET.SubElement(submission_set, "SUBMISSION") + sub_alias = f"study-submission-{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}" + submission.set("alias", sub_alias) actions = ET.SubElement(submission, "ACTIONS") main_action = ET.SubElement(actions, "ACTION") ET.SubElement(main_action, action.upper()) @@ -450,74 +458,66 @@ def build_submission_actions_xml( hold_action = ET.SubElement(actions, "ACTION") hold_el = ET.SubElement(hold_action, "HOLD") hold_el.set("HoldUntilDate", hold_until) - return submission + # PROJECT_SET + project_set = ET.SubElement(webin, "PROJECT_SET") + for study in studies: + _add_project_element(project_set, study, test=test) + return webin -def build_project_set_xml( - studies: list[dict[str, Any]], - test: bool = False, -) -> ET.Element: - """Build the ``<PROJECT_SET>`` XML element. - - This is submitted as the ``PROJECT`` multipart field. - Args: - studies: Study metadata dicts. - test: If ``True``, append a timestamp-based hash to aliases - for uniqueness in test submissions. - - Returns: - Root ``<PROJECT_SET>`` element. 
- """ - project_set = ET.Element("PROJECT_SET") - for study in studies: - alias = study.get( - "alias", - study.get("STUDY_TITLE", "").replace(" ", "_")[:50], +def _add_project_element( + project_set: ET.Element, + study: dict[str, Any], + test: bool = False, +) -> None: + """Append a ``<PROJECT>`` element to *project_set*.""" + alias = study.get( + "alias", + study.get("STUDY_TITLE", "").replace(" ", "_")[:50], + ) + if test: + # Append 8-character hash of current timestamp for uniqueness in test mode + timestamp_hash = hashlib.md5( + datetime.datetime.now().isoformat().encode() + ).hexdigest()[:8] + alias = f"{alias}_{timestamp_hash}" + + project = ET.SubElement(project_set, "PROJECT") + project.set("alias", alias) + + name_text = study.get("CENTER_PROJECT_NAME", alias) + if name_text: + name_el = ET.SubElement(project, "NAME") + name_el.text = name_text + + title_el = ET.SubElement(project, "TITLE") + title_el.text = study.get("STUDY_TITLE", "") + + desc_text = ( + study.get("STUDY_ABSTRACT") + or study.get("STUDY_DESCRIPTION", "") + ) + if desc_text: + desc_el = ET.SubElement(project, "DESCRIPTION") + desc_el.text = desc_text + + sp = ET.SubElement(project, "SUBMISSION_PROJECT") + ET.SubElement(sp, "SEQUENCING_PROJECT") + # TODO: Check existing_study_type and new_study_type metadata fields, do we need those? 
+ study_type = study.get("existing_study_type") + if study_type: + attrs = ET.SubElement( + project, "PROJECT_ATTRIBUTES", ) - if test: - # Append 8-character hash of current timestamp for uniqueness in test mode - timestamp_hash = hashlib.md5( - datetime.datetime.now().isoformat().encode() - ).hexdigest()[:8] - alias = f"{alias}_{timestamp_hash}" - - project = ET.SubElement(project_set, "PROJECT") - project.set("alias", alias) - - name_text = study.get("CENTER_PROJECT_NAME", alias) - if name_text: - name_el = ET.SubElement(project, "NAME") - name_el.text = name_text - - title_el = ET.SubElement(project, "TITLE") - title_el.text = study.get("STUDY_TITLE", "") - - desc_text = ( - study.get("STUDY_ABSTRACT") - or study.get("STUDY_DESCRIPTION", "") + _add_project_attribute( + attrs, "existing_study_type", study_type, ) - if desc_text: - desc_el = ET.SubElement(project, "DESCRIPTION") - desc_el.text = desc_text - - sp = ET.SubElement(project, "SUBMISSION_PROJECT") - ET.SubElement(sp, "SEQUENCING_PROJECT") - # TODO: Check existing_study_type and new_study_type metadata fields, do we need those? - study_type = study.get("existing_study_type") - if study_type: - attrs = ET.SubElement( - project, "PROJECT_ATTRIBUTES", - ) + new_type = study.get("new_study_type") + if new_type and study_type == "Other": _add_project_attribute( - attrs, "existing_study_type", study_type, + attrs, "new_study_type", new_type, ) - new_type = study.get("new_study_type") - if new_type and study_type == "Other": - _add_project_attribute( - attrs, "new_study_type", new_type, - ) - return project_set def _add_project_attribute( @@ -594,8 +594,7 @@ def parse_xml_receipt( def _do_submission( base_url: str, auth: Any, - submission_xml_bytes: bytes, - project_xml_bytes: bytes, + xml_bytes: bytes, action: str, results: dict[str, list[dict[str, Any]]], env_label: str, @@ -606,8 +605,7 @@ def _do_submission( Args: base_url: ENA submission base URL. auth: HTTP basic-auth credentials. 
- submission_xml_bytes: Serialised ``<SUBMISSION>`` actions XML. - project_xml_bytes: Serialised ``<PROJECT_SET>`` XML. + xml_bytes: Serialised XML submission document. action: Label for log messages (``"ADD"`` or ``"MODIFY"``). results: Results dict to accumulate into. @@ -619,13 +617,12 @@ def _do_submission( """ if dry_run: logger.info("DRY RUN — skipping %s submission", action) - logger.info("SUBMISSION XML:\n%s", submission_xml_bytes.decode("utf-8")) - logger.info("PROJECT XML:\n%s", project_xml_bytes.decode("utf-8")) + logger.info("Generated XML:\n%s", xml_bytes.decode("utf-8")) return True logger.info("Submitting %s to ENA (%s)...", action, env_label) try: - receipt_root = submit_xml(base_url, auth, submission_xml_bytes, project_xml_bytes) + receipt_root = submit_xml(base_url, auth, xml_bytes) except requests.exceptions.HTTPError as exc: logger.error("HTTP error during %s submission: %s", action, exc) if exc.response is not None: @@ -665,7 +662,7 @@ def _do_submission( @click.command( - help="Register studies with ENA using Webin XML submission service.", + help="Submit studies to ENA via the Webin REST API v2.", ) @click.option( "--input", "input_file", @@ -701,7 +698,7 @@ def main( output: Path | None, validate: bool, ) -> None: - """Register studies with ENA using Webin XML submission service.""" + """Submit studies to ENA via the Webin REST API v2.""" username, password = get_credentials() env_label = "TEST server" if use_test else "LIVE server" @@ -737,16 +734,17 @@ def main( # -- Step 2: Build and submit XML -------------------- logger.info("Building ADD XML for %d study/studies...", len(studies)) - submission_root = build_submission_actions_xml(hold_until=hold_until, action="ADD") - project_root = build_project_set_xml(studies, test=use_test) - submission_xml_bytes = xml_to_bytes(submission_root) - project_xml_bytes = xml_to_bytes(project_root) - logger.info("SUBMISSION XML document size: %d bytes", len(submission_xml_bytes)) - 
logger.debug("SUBMISSION XML:\n%s", submission_xml_bytes.decode("utf-8")) - logger.info("PROJECT XML document size: %d bytes", len(project_xml_bytes)) - logger.debug("PROJECT XML:\n%s", project_xml_bytes.decode("utf-8")) + xml_root = build_submission_xml( + studies, + hold_until=hold_until, + action="ADD", + test=use_test, + ) + xml_bytes = xml_to_bytes(xml_root) + logger.info("XML document size: %d bytes", len(xml_bytes)) + logger.debug("Generated XML:\n%s", xml_bytes.decode("utf-8")) ok = _do_submission( - base_url, auth, submission_xml_bytes, project_xml_bytes, + base_url, auth, xml_bytes, action="ADD", results=results, env_label=env_label, From 8e98878fcfa75969be569e68b191283db539fc05 Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova <so.ochkalova@gmail.com> Date: Wed, 25 Mar 2026 10:31:55 +0000 Subject: [PATCH 34/36] refactor input parsing in submit_study.py --- assets/study_metadata.json | 6 + assets/study_metadata.tsv | 2 + bin/submit_study.py | 305 +++++------------- docs/usage.md | 26 +- modules/local/registerstudy/meta.yml | 5 +- .../local/registerstudy/tests/main.nf.test | 29 +- nextflow_schema.json | 2 +- 7 files changed, 133 insertions(+), 242 deletions(-) create mode 100644 assets/study_metadata.json create mode 100644 assets/study_metadata.tsv diff --git a/assets/study_metadata.json b/assets/study_metadata.json new file mode 100644 index 0000000..fbc2b28 --- /dev/null +++ b/assets/study_metadata.json @@ -0,0 +1,6 @@ +{ + "alias": "study-example-2026", + "study_title": "Example metagenome study", + "study_abstract": "Description of the study aims and methods.", + "existing_study_type": "Metagenomics" +} diff --git a/assets/study_metadata.tsv b/assets/study_metadata.tsv new file mode 100644 index 0000000..2389f1d --- /dev/null +++ b/assets/study_metadata.tsv @@ -0,0 +1,2 @@ +alias study_title study_abstract existing_study_type +study-example-2026 Example metagenome study Description of the study aims and methods. 
Metagenomics diff --git a/bin/submit_study.py b/bin/submit_study.py index 7bae1ef..28c1f9a 100755 --- a/bin/submit_study.py +++ b/bin/submit_study.py @@ -1,117 +1,4 @@ #!/usr/bin/env python3 -"""Submit studies to ENA via the Webin REST API v2. - -Read a study metadata file (JSON, CSV, or TSV), construct an -XML submission document, and submit new studies to ENA. - -# TODO: Currently script supports multiple input format that might be unnecessary. -# TODO: Consider standardising on a single format (e.g. JSON and/or TSV) and deprecating the others. -# TODO: Consider which columns are mandatory vs optional. "alias" is optional, might be worth making it mandatory. -# TODO: Add input file validation and error handling (e.g. missing mandatory fields, long alias). -Input formats accepted (``--input``): - -* ``.json`` -* ``.csv`` -* ``.tsv`` - -Example JSON inputs accepted:: - - { - "alias": "study-gut-2026", - "STUDY_TITLE": "Gut microbiome study", - "STUDY_ABSTRACT": "Characterisation of gut microbial communities", - "existing_study_type": "Metagenomics" - } - - [ - { - "alias": "study-gut-2026", - "STUDY_TITLE": "Gut microbiome study", - "STUDY_ABSTRACT": "Characterisation of gut microbial communities", - "existing_study_type": "Metagenomics" - }, - ... 
- ] - - { - "studies": [ - { - "alias": "study-soil-2026", - "STUDY_TITLE": "Soil microbiome study", - "existing_study_type": "Other", - "new_study_type": "Environmental microbiome" - } - ] - } - - { - "data": [ - { - "alias": "study-soil-2026", - "STUDY_TITLE": "Soil microbiome study", - } - ] - } - - { - "Container": { - "Studies": [ - { - "STUDY_TITLE": "Marine metagenome study", - "STUDY_ABSTRACT": "Shotgun metagenomics from seawater" - } - ] - } - } - -Example CSV input accepted:: - - alias,STUDY_TITLE,STUDY_ABSTRACT,existing_study_type - study-gut-2026,Gut microbiome study,Characterisation of gut microbial communities,Metagenomics - -Example TSV input accepted:: - - alias\tSTUDY_TITLE\tSTUDY_ABSTRACT\texisting_study_type - study-soil-2026\tSoil microbiome study\tSurvey of soil microbiota\tMetagenomics - -Study metadata fields: - -Mandatory: - -* ``STUDY_TITLE`` — study title used in ``<TITLE>``. - -Optional: - -* ``alias`` — project alias; if missing, derived from ``STUDY_TITLE`` (first 50 characters). -* ``CENTER_PROJECT_NAME`` — written to ``<NAME>``; defaults to alias. -* ``STUDY_ABSTRACT`` or ``STUDY_DESCRIPTION`` — written to ``<DESCRIPTION>``. -* ``existing_study_type`` — included as PROJECT_ATTRIBUTE. -* ``new_study_type`` — included only when ``existing_study_type == "Other"``. 
- -Credentials are read from environment variables to avoid -secrets appearing in shell history or process listings:: - - export ENA_WEBIN=Webin-XXXXX - export ENA_WEBIN_PASSWORD=XXXXX - -Usage:: - - # Submission to TEST server (submissions are discarded daily): - python bin/submit_study.py \\ - --input studies.json \\ - --test - - # With hold date (max 2 years): - python bin/submit_study.py \\ - --input studies.json \\ - --hold-until 2028-01-01 - - # Log to file: - python bin/submit_study.py \\ - --input studies.json \\ - --test --log submission.log -""" - from __future__ import annotations import csv @@ -122,7 +9,6 @@ import os import sys import xml.etree.ElementTree as ET -from collections.abc import Sequence from io import BytesIO from pathlib import Path from typing import Any, Final @@ -260,21 +146,31 @@ def validate_hold_until(hold_until: str) -> datetime.date: # ----------------------------------------------------------- -# File loading (JSON, CSV, TSV) +# Study metadata field definitions # ----------------------------------------------------------- +#: Fields that must be present and non-empty in every record. +_REQUIRED_FIELDS: Final[frozenset[str]] = frozenset({ + "alias", + "study_title", +}) -def _is_metadata_row(row: Sequence[object]) -> bool: - """Check whether *row* is a non-data header/metadata row. +#: Fields that are recognised but optional. +_OPTIONAL_FIELDS: Final[frozenset[str]] = frozenset({ + "project_name", + "study_abstract", + "study_description", + "existing_study_type", + "new_study_type", +}) - Such rows have at most one non-empty cell and are skipped - during record extraction. - """ - non_empty = sum( - 1 for c in row - if c is not None and str(c).strip() - ) - return non_empty <= 1 +#: All recognised field names (required + optional). 
+_ALL_FIELDS: Final[frozenset[str]] = _REQUIRED_FIELDS | _OPTIONAL_FIELDS + + +# ----------------------------------------------------------- +# File loading (JSON, CSV, TSV) +# ----------------------------------------------------------- def extract_records_from_tabular( @@ -283,8 +179,8 @@ def extract_records_from_tabular( ) -> list[dict[str, str]]: """Extract record dicts from a CSV or TSV file. - Skip an optional leading metadata/label row if detected - (a row with at most one non-empty cell). + Only columns present in _ALL_FIELDS are retained; + unknown columns are ignored. Args: filepath: Path to the tabular file. @@ -293,111 +189,88 @@ def extract_records_from_tabular( Returns: List of record dicts. """ + records = [] + with open(filepath, newline="", encoding="utf-8") as fh: - rows = list(csv.reader(fh, delimiter=delimiter)) - - if not rows: - return [] - - idx = 0 - if _is_metadata_row(rows[idx]): - idx += 1 - if idx >= len(rows): - return [] - - headers = rows[idx] - idx += 1 - - records: list[dict[str, str]] = [] - for row in rows[idx:]: - record: dict[str, str] = {} - for col, val in zip(headers, row): - col = col.strip() - if col and val is not None and val.strip(): - record[col] = val.strip() - if record: - records.append(record) + reader = csv.DictReader(fh, delimiter=delimiter) + for line in reader: + record = {} + for col in _ALL_FIELDS: + value = line.get(col, "").strip() + if value: + record[col] = value + if record: + records.append(record) - return records + return records def extract_records_from_json( - input_data: object, - record_keys: Sequence[str] = ("data",), -) -> list[dict[str, Any]] | None: - """Extract record dicts from a JSON input. - - Handle several JSON shapes: - - * Container format (e.g. DataHarmonizer exports):: + filepath: str | Path, +) -> list[dict[str, Any]]: + """Extract record dicts from a JSON file. - {"Container": {"<ClassName>s": [{...}, ...]}} + Handle two JSON shapes: * Plain list of dicts. 
- * Dict with an entity-specific key or ``data`` key. * Single record object (no wrapper). Args: - input_data: Parsed JSON data (any shape). - record_keys: Dict keys to check for record lists - (e.g. ``["studies", "data"]``). + filepath: Path to the JSON file. Returns: - List of record dicts, or ``None`` if unrecognised. + List of record dicts, or [] if unrecognised. """ + with open(filepath) as fh: + input_data = json.load(fh) + if isinstance(input_data, list): return input_data if isinstance(input_data, dict): - container = input_data.get("Container") - if isinstance(container, dict): - for key, val in container.items(): - if isinstance(val, list): - logger.info("Extracted records from Container.%s", key) - return val - - for key in record_keys: - if key in input_data: - return input_data[key] - return [input_data] - return None + return [] -def load_input_file( +def load_and_validate_input_file( filepath: str | Path, - json_record_keys: Sequence[str] = ("data",), -) -> list[dict[str, Any]] | None: - """Load records from a supported file format. +) -> list[dict[str, Any]]: + """Load and validate records from a supported file format. - Supported formats: JSON, CSV, TSV. + Supported formats: JSON, CSV, TSV. Other formats will cause a ValueError. + Records are validated against _REQUIRED_FIELDS before being returned; + missing required fields will cause a ValueError. Args: filepath: Path to the input file. - json_record_keys: Dict keys to check when parsing - JSON (e.g. ``["studies", "data"]``). Returns: - List of record dicts, or ``None`` if the format is - unrecognised. + List of record dicts. If the file format is + unrecognised (based on file extension) or required fields are missing, + raises ValueError. 
""" ext = Path(filepath).suffix.lower() if ext == ".json": - with open(filepath) as fh: - input_data = json.load(fh) - return extract_records_from_json( - input_data, json_record_keys, - ) - if ext == ".csv": - return extract_records_from_tabular( - filepath, delimiter=",", - ) - if ext == ".tsv": - return extract_records_from_tabular( - filepath, delimiter="\t", - ) - return None + records = extract_records_from_json(filepath) + elif ext == ".csv": + records = extract_records_from_tabular(filepath, delimiter=",") + elif ext == ".tsv": + records = extract_records_from_tabular(filepath, delimiter="\t") + else: + raise ValueError(f"Unsupported file format: {ext}. Supported: .json, .csv, .tsv") + + if not records: + raise ValueError(f"File {filepath} seems to be empty. Check the format and content.") + + for record in records: + for field in _REQUIRED_FIELDS: + if not record.get(field, "").strip(): + raise ValueError( + f"Record with alias {record.get('alias', '<missing>')} is missing required field: {field}" + ) + + return records # ----------------------------------------------------------- @@ -472,10 +345,7 @@ def _add_project_element( test: bool = False, ) -> None: """Append a ``<PROJECT>`` element to *project_set*.""" - alias = study.get( - "alias", - study.get("STUDY_TITLE", "").replace(" ", "_")[:50], - ) + alias = study.get("alias", "") if test: # Append 8-character hash of current timestamp for uniqueness in test mode timestamp_hash = hashlib.md5( @@ -486,17 +356,17 @@ def _add_project_element( project = ET.SubElement(project_set, "PROJECT") project.set("alias", alias) - name_text = study.get("CENTER_PROJECT_NAME", alias) + name_text = study.get("project_name", study.get("study_title", "")) if name_text: name_el = ET.SubElement(project, "NAME") name_el.text = name_text title_el = ET.SubElement(project, "TITLE") - title_el.text = study.get("STUDY_TITLE", "") + title_el.text = study.get("study_title", "") desc_text = ( - study.get("STUDY_ABSTRACT") - or 
study.get("STUDY_DESCRIPTION", "") + study.get("study_abstract") + or study.get("study_description", "") ) if desc_text: desc_el = ET.SubElement(project, "DESCRIPTION") @@ -658,9 +528,6 @@ def _do_submission( # Main # ----------------------------------------------------------- -_JSON_RECORD_KEYS: Final = ("studies", "data") - - @click.command( help="Submit studies to ENA via the Webin REST API v2.", ) @@ -713,20 +580,14 @@ def main( # -- Step 1: Load input file ------------------------- logger.info("Loading input: %s", input_file) - studies = load_input_file( - input_file, json_record_keys=_JSON_RECORD_KEYS, - ) - if studies is None: - logger.error("Unsupported file format. Supported: .json, .csv, .tsv") - sys.exit(1) + try: + studies = load_and_validate_input_file(input_file) + except ValueError as exc: + # Re-raise as click.BadParameter to get nice error formatting without a full stack trace + raise click.BadParameter(str(exc), param_hint="--input") from exc logger.info("Loaded %d study/studies from input", len(studies)) - if not studies: - logger.info("No studies to submit") - write_results({"submitted": [], "failed": []}, output) - return - results: dict[str, list[dict[str, Any]]] = { "submitted": [], "failed": [], diff --git a/docs/usage.md b/docs/usage.md index 9cab0d0..ad32375 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -130,36 +130,36 @@ Single study as a flat object: ```json { "alias": "study-gut-2026", - "STUDY_TITLE": "Gut microbiome study", - "STUDY_ABSTRACT": "Characterisation of gut microbial communities" + "study_title": "Gut microbiome study", + "study_abstract": "Characterisation of gut microbial communities" } ``` #### CSV format ```csv -alias,STUDY_TITLE,STUDY_ABSTRACT +alias,study_title,study_abstract study-gut-2026,Gut microbiome study,Characterisation of gut microbial communities ``` #### TSV format ```tsv -alias STUDY_TITLE STUDY_ABSTRACT +alias study_title study_abstract study-soil-2026 Soil microbiome study Survey of soil microbiota 
``` #### Study metadata fields -| Field | Required | Description | -| --------------------- | -------- | ------------------------------------------------------------------------------------------- | -| `STUDY_TITLE` | Yes | Descriptive title of the study. | -| `alias` | No | Unique project alias within your Webin account. Derived from `STUDY_TITLE` if not provided. | -| `STUDY_ABSTRACT` | No | Free-text abstract describing the study. | -| `STUDY_DESCRIPTION` | No | Alternative to `STUDY_ABSTRACT`. | -| `CENTER_PROJECT_NAME` | No | Internal project name at your centre. Defaults to `alias`. | -| `existing_study_type` | No | ENA study type (e.g. `Metagenomics`, `Other`). | -| `new_study_type` | No | Custom study type. Only used when `existing_study_type` is set to `Other`. | +| Field | Required | Description | +| --------------------- | -------- | ---------------------------------------------------------------------------- | +| `study_title` | Yes | Descriptive title of the study. | +| `alias` | Yes | Unique project alias within your Webin account. Max length is 50 characters. | +| `study_abstract` | No | Free-text abstract describing the study. | +| `study_description` | No | Alternative to `study_abstract`. | +| `project_name` | No | Project name. Defaults to `study_title`. | +| `existing_study_type` | No | ENA study type (e.g. `Metagenomics`, `Other`). | +| `new_study_type` | No | Custom study type. Only used when `existing_study_type` is set to `Other`. | An example metadata file is available at [assets/study_metadata.json](../assets/study_metadata.json). diff --git a/modules/local/registerstudy/meta.yml b/modules/local/registerstudy/meta.yml index e3e3245..f0e6ce7 100644 --- a/modules/local/registerstudy/meta.yml +++ b/modules/local/registerstudy/meta.yml @@ -36,9 +36,8 @@ input: type: file description: | Study metadata file in JSON, CSV, or TSV format. - JSON may follow the DataHarmonizer Container export format or be - a plain list/dict of study records. 
- Required fields per record: STUDY_TITLE, existing_study_type. + JSON may be a plain list of dicts or a single dict of study records. + Required fields per record: study_title, alias. pattern: "*.{json,csv,tsv}" output: diff --git a/modules/local/registerstudy/tests/main.nf.test b/modules/local/registerstudy/tests/main.nf.test index cdc0e69..2ec967f 100644 --- a/modules/local/registerstudy/tests/main.nf.test +++ b/modules/local/registerstudy/tests/main.nf.test @@ -7,14 +7,37 @@ nextflow_process { tag "modules" tag "registerstudy" - test("registerstudy - submission to ENA test server") { + test("registerstudy - submission to ENA test server (JSON metadata)") { when { process { """ input[0] = [ [ id:'example_study' ], - file(params.pipelines_testdata_base_path + "/test_data/study_metadata/example_study.json", checkIfExists: true) + file("$projectDir/assets/study_metadata.json", checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.accessions[0][1]).exists() }, + { assert path(process.out.accessions[0][1]).json.submitted instanceof List }, + { assert path(process.out.accessions[0][1]).json.failed.size() == 0 } + ) + } + } + + test("registerstudy - submission to ENA test server (TSV metadata)") { + + when { + process { + """ + input[0] = [ + [ id:'example_study_tsv' ], + file("$projectDir/assets/study_metadata.tsv", checkIfExists: true) ] """ } @@ -38,7 +61,7 @@ nextflow_process { """ input[0] = [ [ id:'example_study' ], - file(params.pipelines_testdata_base_path + "/test_data/study_metadata/example_study.json", checkIfExists: true) + file("$projectDir/assets/study_metadata.json", checkIfExists: true) ] """ } diff --git a/nextflow_schema.json b/nextflow_schema.json index d5d0ebb..83b1ed2 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -308,7 +308,7 @@ "format": "file-path", "exists": true, "description": "Path to study metadata file (JSON, CSV, or TSV) for registering a new ENA study. 
Required when submission_study is not provided.", - "help_text": "File containing study metadata fields (required: STUDY_TITLE, optional: alias, STUDY_ABSTRACT, existing_study_type, etc.). Used by REGISTERSTUDY to create a new study in ENA when no existing submission_study accession is given.", + "help_text": "File containing study metadata fields (required: study_title and alias, optional: study_abstract, existing_study_type, etc.). Used by REGISTERSTUDY to create a new study in ENA when no existing submission_study accession is given.", "fa_icon": "fas fa-file-alt" }, "webincli_submit": { From 217edea8c477540909e1ae47108c1fef5eb32506 Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova <so.ochkalova@gmail.com> Date: Wed, 25 Mar 2026 10:54:38 +0000 Subject: [PATCH 35/36] update container for mgnify-pipelines-toolkit --- modules/local/registerstudy/environment.yml | 2 +- modules/local/registerstudy/main.nf | 4 +++- modules/local/registerstudy/tests/main.nf.test.snap | 10 +++++----- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/modules/local/registerstudy/environment.yml b/modules/local/registerstudy/environment.yml index a5e1bf2..2faa83d 100644 --- a/modules/local/registerstudy/environment.yml +++ b/modules/local/registerstudy/environment.yml @@ -7,4 +7,4 @@ dependencies: - conda-forge::python>=3.12 - conda-forge::pip - pip: - - mgnify-pipelines-toolkit==1.4.17 + - mgnify-pipelines-toolkit==1.4.21 diff --git a/modules/local/registerstudy/main.nf b/modules/local/registerstudy/main.nf index 99533da..573a38c 100644 --- a/modules/local/registerstudy/main.nf +++ b/modules/local/registerstudy/main.nf @@ -3,7 +3,9 @@ process REGISTERSTUDY { label 'process_single' conda "${moduleDir}/environment.yml" - container "quay.io/microbiome-informatics/mgnify-pipelines-toolkit:1.4.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mgnify-pipelines-toolkit:1.4.21--pyhdfd78af_0': + 'biocontainers/mgnify-pipelines-toolkit:1.4.21--pyhdfd78af_0' }" // ENA_WEBIN and ENA_WEBIN_PASSWORD must be set in the process environment. // In the pipeline, map Nextflow secrets via conf/modules.config or nextflow.config: diff --git a/modules/local/registerstudy/tests/main.nf.test.snap b/modules/local/registerstudy/tests/main.nf.test.snap index 385b735..d1cb6ea 100644 --- a/modules/local/registerstudy/tests/main.nf.test.snap +++ b/modules/local/registerstudy/tests/main.nf.test.snap @@ -11,14 +11,14 @@ ] ], "versions": [ - "versions.yml:md5,ddcc758a7d28faecd4286941889ab7e1" + "versions.yml:md5,29d54944e57cbb7cb12b7605f13fd0fc" ] } ], - "timestamp": "2026-03-13T14:02:21.161445", "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.4" - } + "nf-test": "0.9.0", + "nextflow": "25.04.1" + }, + "timestamp": "2026-03-25T10:54:18.30373" } } \ No newline at end of file From 55373f83f84aab8d4f515e710a48d9e7f1b57bff Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova <so.ochkalova@gmail.com> Date: Wed, 25 Mar 2026 14:06:06 +0000 Subject: [PATCH 36/36] remove docker.enabled = true from test profiles, update nextflow.config --- conf/test_assembly_no_study_complete_metadata.config | 10 ++++------ conf/test_mag_no_study_complete_metadata.config | 2 -- nextflow.config | 2 ++ 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/conf/test_assembly_no_study_complete_metadata.config b/conf/test_assembly_no_study_complete_metadata.config index f729c77..b1c96d7 100644 --- a/conf/test_assembly_no_study_complete_metadata.config +++ b/conf/test_assembly_no_study_complete_metadata.config @@ -25,13 +25,11 @@ params { // Input data input = params.pipelines_testdata_base_path + 'seqsubmit/samplesheets/assembly_complete_metadata.csv' - mode = "metagenomic_assemblies" - submission_study = null - study_metadata = "$projectDir/assets/study_metadata.json" - centre_name = "TEST_CENTER" + mode = 
"metagenomic_assemblies" + submission_study = null + study_metadata = "$projectDir/assets/study_metadata.json" + centre_name = "TEST_CENTER" test_upload = true } - -docker.enabled = true diff --git a/conf/test_mag_no_study_complete_metadata.config b/conf/test_mag_no_study_complete_metadata.config index dd3e659..aea18b1 100644 --- a/conf/test_mag_no_study_complete_metadata.config +++ b/conf/test_mag_no_study_complete_metadata.config @@ -36,5 +36,3 @@ params { checkm2_db = null } - -docker.enabled = true diff --git a/nextflow.config b/nextflow.config index 6e9f1b0..1cb8aff 100644 --- a/nextflow.config +++ b/nextflow.config @@ -187,6 +187,8 @@ profiles { test_genome { includeConfig 'conf/test_genome.config' } test_assembly { includeConfig 'conf/test_assembly.config' } test_full { includeConfig 'conf/test_full.config' } + test_assembly_no_study_complete_metadata { includeConfig 'conf/test_assembly_no_study_complete_metadata.config' } + test_mag_no_study_complete_metadata { includeConfig 'conf/test_mag_no_study_complete_metadata.config' } } // Load nf-core custom profiles from different institutions