From 6f04dd6554b0549574b293fb8bf7ad6a9bd0684d Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Thu, 12 Mar 2026 11:21:03 +0000 Subject: [PATCH 01/36] Added submit_study script from private repo --- .gitignore | 2 + bin/submit_study.py | 825 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 827 insertions(+) create mode 100644 bin/submit_study.py diff --git a/.gitignore b/.gitignore index 1c11923..d8c4dbb 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,5 @@ null/ .nf-test* .idea/ test_data +.claude/ +CLAUDE.md diff --git a/bin/submit_study.py b/bin/submit_study.py new file mode 100644 index 0000000..9ec012a --- /dev/null +++ b/bin/submit_study.py @@ -0,0 +1,825 @@ +#!/usr/bin/env python3 +"""Submit studies to ENA via the Webin REST API v2. + +Read a DataHarmonizer export containing study metadata, +validate it against a LinkML schema and an XSD schema, +check for duplicate studies already registered under the +Webin account, construct an XML submission document, and +submit new studies to ENA. 
+ +Credentials are read from environment variables to avoid +secrets appearing in shell history or process listings:: + + export ENA_USERNAME=Webin-XXXXX + export ENA_PASSWORD=SECRET + +Usage:: + + python scripts/submit_study.py \\ + --input studies.json \\ + --linkml schemas/SRA_study.yaml \\ + --xsd assets/ena_schema \\ + --test + + # With hold date (max 2 years): + python scripts/submit_study.py \\ + --input studies.json \\ + --linkml schemas/SRA_study.yaml \\ + --xsd assets/ena_schema \\ + --hold-until 2028-01-01 + + # Log to file: + python scripts/submit_study.py \\ + --input studies.json \\ + --linkml schemas/SRA_study.yaml \\ + --xsd assets/ena_schema \\ + --test --log submission.log +""" + +from __future__ import annotations + +import logging +import sys +import xml.etree.ElementTree as ET +from pathlib import Path +from typing import Any, Final + +import pendulum +import requests +import typer +from requests.auth import HTTPBasicAuth + +import ena_common as common + +app = typer.Typer( + help="Submit studies to ENA via the Webin REST API v2.", +) + +logger = logging.getLogger("ena_submit.study") + + +# ----------------------------------------------------------- +# Reports API (study-specific) +# ----------------------------------------------------------- + +_PROD_REPORTS_URL: Final = ( + "https://www.ebi.ac.uk/ena/submit/report/projects" +) +_TEST_REPORTS_URL: Final = ( + "https://wwwdev.ebi.ac.uk/ena/submit/report/projects" +) + + +def _normalize_study_report( + report: dict[str, Any], +) -> dict[str, str]: + """Normalise a raw study report dict.""" + return { + "title": ( + report.get("title") + or report.get("studyTitle") + or report.get("STUDY_TITLE", "") + ), + "alias": ( + report.get("alias") + or report.get("studyAlias") + or "" + ), + "accession": ( + report.get("accession") + or report.get("studyAccession") + or report.get("report", {}).get("id", "") + ), + "secondary_accession": ( + report.get("secondaryAccession") + or report.get("secondaryId", 
"") + ), + "status": report.get( + "releaseStatus", "UNKNOWN" + ), + } + + +def fetch_account_studies( + auth: HTTPBasicAuth, + use_test: bool = False, + max_results: int = 5000, +) -> list[dict[str, str]]: + """Fetch all projects from the Webin Reports API. + + Args: + auth: HTTP basic-auth credentials. + use_test: Try the test endpoint before production. + max_results: Maximum number of results to request. + + Returns: + List of normalised study dicts. + """ + return common.fetch_account_records( + auth, + use_test=use_test, + prod_url=_PROD_REPORTS_URL, + test_url=_TEST_REPORTS_URL, + normalizer=_normalize_study_report, + entity_label="studies", + max_results=max_results, + ) + + +def find_duplicate_studies( + new_studies: list[dict[str, Any]], + account_studies: list[dict[str, str]], +) -> dict[int, dict[str, str]]: + """Check new studies against existing account studies. + + Args: + new_studies: Studies the user wants to submit. + account_studies: Existing studies in the account. + + Returns: + Mapping of index to matching study info. + """ + return common.find_duplicates_by_alias_title( + new_studies, account_studies, + title_field="STUDY_TITLE", + entity_label="studies", + ) + + +# ----------------------------------------------------------- +# XML construction +# ----------------------------------------------------------- + + +def build_submission_xml( + studies: list[dict[str, Any]], + hold_until: str | None = None, + action: str = "ADD", +) -> ET.Element: + """Build a WEBIN XML document for submitting studies. + + Each study in the input list is converted to a PROJECT + element. + + Args: + studies: Study metadata dicts. + hold_until: Optional hold-until date string + (``YYYY-MM-DD``). + action: Submission action — ``"ADD"`` for new studies + or ``"MODIFY"`` to update existing ones. + + Returns: + Root ```` element. 
+ """ + webin = ET.Element("WEBIN") + + # SUBMISSION_SET + submission_set = ET.SubElement(webin, "SUBMISSION_SET") + submission = ET.SubElement( + submission_set, "SUBMISSION", + ) + sub_alias = ( + "study-submission-" + + pendulum.now().format("YYYYMMDD-HHmmss") + ) + submission.set("alias", sub_alias) + actions = ET.SubElement(submission, "ACTIONS") + main_action = ET.SubElement(actions, "ACTION") + ET.SubElement(main_action, action.upper()) + if hold_until: + hold_action = ET.SubElement(actions, "ACTION") + hold_el = ET.SubElement(hold_action, "HOLD") + hold_el.set("HoldUntilDate", hold_until) + + # PROJECT_SET + project_set = ET.SubElement(webin, "PROJECT_SET") + for study in studies: + _add_project_element(project_set, study) + + return webin + + +def _add_project_element( + project_set: ET.Element, + study: dict[str, Any], +) -> None: + """Append a ```` element to *project_set*.""" + alias = study.get( + "alias", + study.get("STUDY_TITLE", "").replace(" ", "_")[:50], + ) + project = ET.SubElement(project_set, "PROJECT") + project.set("alias", alias) + + name_text = study.get("CENTER_PROJECT_NAME", alias) + if name_text: + name_el = ET.SubElement(project, "NAME") + name_el.text = name_text + + title_el = ET.SubElement(project, "TITLE") + title_el.text = study.get("STUDY_TITLE", "") + + desc_text = ( + study.get("STUDY_ABSTRACT") + or study.get("STUDY_DESCRIPTION", "") + ) + if desc_text: + desc_el = ET.SubElement(project, "DESCRIPTION") + desc_el.text = desc_text + + sp = ET.SubElement(project, "SUBMISSION_PROJECT") + ET.SubElement(sp, "SEQUENCING_PROJECT") + + study_type = study.get("existing_study_type") + if study_type: + attrs = ET.SubElement( + project, "PROJECT_ATTRIBUTES", + ) + _add_project_attribute( + attrs, "existing_study_type", study_type, + ) + new_type = study.get("new_study_type") + if new_type and study_type == "Other": + _add_project_attribute( + attrs, "new_study_type", new_type, + ) + + +def _add_project_attribute( + parent: ET.Element, + 
tag_text: str, + value_text: str, +) -> None: + """Append a ```` to *parent*.""" + attr = ET.SubElement(parent, "PROJECT_ATTRIBUTE") + tag_el = ET.SubElement(attr, "TAG") + tag_el.text = tag_text + val_el = ET.SubElement(attr, "VALUE") + val_el.text = value_text + + +# ----------------------------------------------------------- +# XSD validation (study-specific fallback) +# ----------------------------------------------------------- + + +def _validate_study_xml_structure( + xml_bytes: bytes, + messages: list[str], +) -> tuple[bool, list[str]]: + """Fallback structural check for study XML.""" + try: + tree = ET.fromstring(xml_bytes) + except ET.ParseError as exc: + messages.append( + f"ERROR: XML is not well-formed: {exc}" + ) + return False, messages + + messages.append( + "XML is well-formed (basic check passed)" + ) + + project_set = tree.find("PROJECT_SET") + if project_set is None: + messages.append( + "ERROR: Missing PROJECT_SET element" + ) + return False, messages + + projects = project_set.findall("PROJECT") + if not projects: + messages.append("ERROR: No PROJECT elements found") + return False, messages + + for proj in projects: + alias = proj.get("alias", "") + title = proj.find("TITLE") + if title is None or not title.text: + messages.append( + f"ERROR: PROJECT '{alias}' missing TITLE" + ) + return False, messages + sp = proj.find("SUBMISSION_PROJECT") + if sp is None: + messages.append( + f"ERROR: PROJECT '{alias}'" + " missing SUBMISSION_PROJECT" + ) + return False, messages + messages.append( + f"OK: PROJECT '{alias}' has required elements" + ) + + return True, messages + + +def validate_against_xsd( + xml_bytes: bytes, + xsd_dir: str | Path, +) -> tuple[bool, list[str]]: + """Validate study XML against ENA.project.xsd. + + Args: + xml_bytes: Serialised XML document. + xsd_dir: Directory containing ``ENA.project.xsd`` + and ``SRA.common.xsd``. + + Returns: + Tuple of (*is_valid*, *messages*). 
+ """ + return common.validate_xml_against_xsd( + xml_bytes, xsd_dir, + xsd_filename="ENA.project.xsd", + fragment_tag="PROJECT_SET", + fallback_checker=_validate_study_xml_structure, + ) + + +# ----------------------------------------------------------- +# Receipt parsing +# ----------------------------------------------------------- + + +def parse_xml_receipt( + receipt_root: ET.Element, +) -> tuple[bool, list[dict[str, str]], list[str]]: + """Parse an ENA XML receipt for study submissions. + + Args: + receipt_root: Root element of the receipt XML. + + Returns: + Tuple of (*success*, *accessions*, *messages*). + """ + success = ( + receipt_root.get("success", "false").lower() + == "true" + ) + accessions: list[dict[str, str]] = [] + messages: list[str] = [] + + msgs_el = receipt_root.find("MESSAGES") + if msgs_el is not None: + for info in msgs_el.findall("INFO"): + messages.append(f"INFO: {info.text}") + for err in msgs_el.findall("ERROR"): + messages.append(f"ERROR: {err.text}") + + for proj in receipt_root.findall("PROJECT"): + acc_info: dict[str, str] = { + "alias": proj.get("alias", ""), + "accession": proj.get("accession", ""), + "status": proj.get("status", ""), + "holdUntilDate": proj.get( + "holdUntilDate", "" + ), + } + ext = proj.find("EXT_ID") + if ext is not None: + acc_info["external_accession"] = ext.get( + "accession", "" + ) + acc_info["external_type"] = ext.get( + "type", "" + ) + accessions.append(acc_info) + + # Some receipts use STUDY instead of PROJECT. 
+ for study in receipt_root.findall("STUDY"): + accessions.append({ + "alias": study.get("alias", ""), + "accession": study.get("accession", ""), + "status": study.get("status", ""), + }) + + return success, accessions, messages + + +# ----------------------------------------------------------- +# Submission helper +# ----------------------------------------------------------- + + +def _do_submission( + base_url: str, + auth: Any, + xml_bytes: bytes, + xsd: Path, + action: str, + results: dict[str, list[dict[str, Any]]], + result_key: str, + env_label: str, + dry_run: bool, +) -> bool: + """Validate, optionally submit, and parse one batch. + + Args: + base_url: ENA Webin v2 submission base URL. + auth: HTTP basic-auth credentials. + xml_bytes: Serialised XML submission document. + xsd: Directory containing the XSD files. + action: Label for log messages (``"ADD"`` or + ``"MODIFY"``). + results: Results dict to accumulate into. + result_key: Key under which successes are stored. + env_label: ``"TEST"`` or ``"PRODUCTION"``. + dry_run: If ``True``, skip the actual submission. + + Returns: + ``True`` if the batch succeeded (or dry run). 
+ """ + xsd_valid, xsd_messages = validate_against_xsd( + xml_bytes, xsd, + ) + for msg in xsd_messages: + logger.info(" %s", msg) + if not xsd_valid: + logger.error( + "XSD validation FAILED (%s)" + " — aborting submission", action, + ) + return False + + logger.info("XSD validation PASSED (%s)", action) + + if dry_run: + logger.info( + "DRY RUN — skipping %s submission", action, + ) + logger.info( + "Generated XML:\n%s", + xml_bytes.decode("utf-8"), + ) + return True + + logger.info( + "Submitting %s to ENA (%s)...", action, env_label, + ) + try: + receipt_root = common.submit_xml( + base_url, auth, xml_bytes, + ) + except requests.exceptions.HTTPError as exc: + logger.error( + "HTTP error during %s submission: %s", + action, exc, + ) + if exc.response is not None: + logger.error( + "Response body: %s", exc.response.text, + ) + return False + + success, accessions, receipt_messages = ( + parse_xml_receipt(receipt_root) + ) + for msg in receipt_messages: + logger.info(" Receipt: %s", msg) + + if success: + logger.info("%s SUCCESSFUL", action) + for acc in accessions: + ext = acc.get("external_accession", "") + ext_suffix = ( + f" (study: {ext})" if ext else "" + ) + logger.info( + " %s: alias=%s accession=%s" + " status=%s%s", + action, acc["alias"], acc["accession"], + acc["status"], ext_suffix, + ) + results[result_key].append(acc) + else: + logger.error("%s FAILED", action) + receipt_xml_str = ET.tostring( + receipt_root, encoding="unicode", + ) + logger.error("Receipt XML: %s", receipt_xml_str) + results["failed"].extend(accessions) + + return success + + +# ----------------------------------------------------------- +# Main +# ----------------------------------------------------------- + +_JSON_RECORD_KEYS: Final = ("studies", "data") + + +@app.command() +def main( + input_file: Path = typer.Option( + ..., "--input", exists=True, + help="Path to study metadata file" + " (JSON, CSV, TSV, XLS, or XLSX)", + ), + linkml: Path = typer.Option( + ..., exists=True, + 
help="Path to LinkML YAML schema" + " (e.g. schemas/SRA_study.yaml)", + ), + xsd: Path = typer.Option( + ..., exists=True, + file_okay=False, resolve_path=True, + help="Directory containing ENA.project.xsd" + " and SRA.common.xsd", + ), + test: bool = typer.Option( + False, "--test", + help="Use the ENA test service" + " (submissions are discarded daily)", + ), + hold_until: str | None = typer.Option( + None, "--hold-until", + help="Hold studies private until this date" + " (YYYY-MM-DD, max 2 years from now)", + ), + log: Path | None = typer.Option( + None, help="Path to log file", + ), + output: Path | None = typer.Option( + None, + help="Path to write JSON accession results" + " (default: stdout)", + ), + max_results: int = typer.Option( + 5000, "--max-results", + help="Maximum number of projects to fetch" + " from the Reports API for duplicate" + " checking", + ), + dry_run: bool = typer.Option( + False, "--dry-run", + help="Validate and build XML but do not" + " submit to ENA", + ), + automated: bool = typer.Option( + False, "--automated", + help="Skip duplicate detection against the" + " Webin Reports API (for automated pipelines)", + ), + force: bool = typer.Option( + False, "--force", + help="Submit duplicate studies using the MODIFY" + " action to overwrite existing ENA records," + " instead of skipping them", + ), +) -> None: + """Submit studies to ENA via the Webin REST API v2.""" + common.setup_logging(log) + username, password = common.get_credentials() + + env_label = "TEST" if test else "PRODUCTION" + logger.info( + "ENA Study Submission — environment: %s", + env_label, + ) + base_url = common.get_base_url(test) + auth = HTTPBasicAuth(username, password) + logger.debug("Auth username: %s", username) + + if hold_until: + common.validate_hold_until(hold_until) + + # -- Step 1: Load input file ------------------------- + logger.info("Loading input: %s", input_file) + studies = common.load_input_file( + input_file, json_record_keys=_JSON_RECORD_KEYS, + ) 
+ if studies is None: + logger.error( + "Unsupported file format." + " Supported: .json, .csv, .tsv, .xlsx, .xls", + ) + sys.exit(1) + + logger.info( + "Loaded %d study/studies from input", + len(studies), + ) + + # -- Step 2: Check for duplicates -------------------- + if automated: + logger.info( + "Automated mode: skipping duplicate detection", + ) + duplicates: dict[int, dict[str, Any]] = {} + else: + account_studies = fetch_account_studies( + auth, use_test=test, + max_results=max_results, + ) + for ps in account_studies: + logger.info( + " Account study: %s | alias=%s" + " | title=%s | status=%s", + ps["accession"], ps["alias"], + ps["title"], ps["status"], + ) + duplicates = find_duplicate_studies( + studies, account_studies, + ) + + results: dict[str, list[dict[str, Any]]] = { + "duplicates": [], + "submitted": [], + "modified": [], + "failed": [], + } + + studies_to_modify: list[dict[str, Any]] = [] + if duplicates: + action_label = ( + "will be re-submitted with MODIFY" + if force else "will NOT be submitted" + ) + logger.warning( + "Found %d duplicate(s) — %s:", + len(duplicates), action_label, + ) + for idx, dup_info in duplicates.items(): + study_title = studies[idx].get( + "STUDY_TITLE", f"study[{idx}]", + ) + logger.warning( + " DUPLICATE: '%s' matches existing %s" + " (accession: %s)", + study_title, + dup_info["match_reason"], + dup_info["accession"], + ) + results["duplicates"].append({ + "input_index": idx, + "title": study_title, + "alias": studies[idx].get("alias", ""), + "existing_accession": ( + dup_info["accession"] + ), + "existing_secondary_accession": ( + dup_info.get( + "secondary_accession", "" + ) + ), + "match_reason": dup_info["match_reason"], + }) + if force: + study_copy = dict(studies[idx]) + existing_alias = dup_info.get("alias", "") + if existing_alias: + study_copy["alias"] = existing_alias + studies_to_modify.append(study_copy) + + studies_to_submit = [ + s for i, s in enumerate(studies) + if i not in duplicates + ] + + if not 
studies_to_submit and not studies_to_modify: + logger.info( + "No studies to submit" + " (all are duplicates or input is empty)", + ) + common.write_results(results, output) + return + + logger.info( + "%d new study/studies to ADD," + " %d duplicate(s) to MODIFY", + len(studies_to_submit), len(studies_to_modify), + ) + + # -- Step 3: Validate against LinkML ----------------- + logger.info("Loading LinkML schema: %s", linkml) + schema = common.load_linkml_schema(linkml) + + logger.info( + "Validating input against LinkML schema...", + ) + linkml_valid, linkml_messages = ( + common.validate_against_linkml( + studies_to_submit + studies_to_modify, schema, + label_fields=["STUDY_TITLE", "alias"], + entity_name="study", + unknown_field_note="will be ignored", + ) + ) + for msg in linkml_messages: + logger.info(" %s", msg) + + if not linkml_valid: + logger.error( + "LinkML validation FAILED" + " — aborting submission", + ) + sys.exit(1) + + logger.info("LinkML validation PASSED") + + overall_ok = True + + # -- Steps 4-7: ADD new studies ---------------------- + if studies_to_submit: + logger.info( + "Building ADD XML for %d new study/studies...", + len(studies_to_submit), + ) + xml_root = build_submission_xml( + studies_to_submit, hold_until=hold_until, + action="ADD", + ) + xml_bytes = common.xml_to_bytes(xml_root) + logger.debug( + "Generated XML (ADD):\n%s", + xml_bytes.decode("utf-8"), + ) + logger.info( + "XML document size (ADD): %d bytes", + len(xml_bytes), + ) + ok = _do_submission( + base_url, auth, xml_bytes, xsd, + action="ADD", + results=results, + result_key="submitted", + env_label=env_label, + dry_run=dry_run, + ) + overall_ok = overall_ok and ok + + # -- Steps 4-7: MODIFY duplicate studies (--force) --- + if studies_to_modify: + logger.info( + "Building MODIFY XML for %d duplicate(s)...", + len(studies_to_modify), + ) + xml_root = build_submission_xml( + studies_to_modify, hold_until=hold_until, + action="MODIFY", + ) + xml_bytes = 
common.xml_to_bytes(xml_root) + logger.debug( + "Generated XML (MODIFY):\n%s", + xml_bytes.decode("utf-8"), + ) + logger.info( + "XML document size (MODIFY): %d bytes", + len(xml_bytes), + ) + ok = _do_submission( + base_url, auth, xml_bytes, xsd, + action="MODIFY", + results=results, + result_key="modified", + env_label=env_label, + dry_run=dry_run, + ) + overall_ok = overall_ok and ok + + if not overall_ok: + sys.exit(1) + + # -- Step 8: Output results -------------------------- + common.write_results(results, output) + + logger.info("=" * 60) + logger.info("SUBMISSION SUMMARY") + logger.info( + " Duplicates skipped: %d", + len(results["duplicates"]) + - len(results["modified"]), + ) + for d in results["duplicates"]: + logger.info( + " %s -> %s", + d["title"], d["existing_accession"], + ) + logger.info( + " Newly submitted (ADD): %d", + len(results["submitted"]), + ) + for s in results["submitted"]: + ext = s.get("external_accession", "") + ext_suffix = f" ({ext})" if ext else "" + logger.info( + " %s -> %s%s", + s["alias"], s["accession"], ext_suffix, + ) + logger.info( + " Modified (MODIFY): %d", + len(results["modified"]), + ) + for m in results["modified"]: + ext = m.get("external_accession", "") + ext_suffix = f" ({ext})" if ext else "" + logger.info( + " %s -> %s%s", + m["alias"], m["accession"], ext_suffix, + ) + logger.info("=" * 60) + + +if __name__ == "__main__": + app() From c6d80fbb5d5e397d4668e7d6aea992282261dd3f Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Thu, 12 Mar 2026 11:38:02 +0000 Subject: [PATCH 02/36] Modified study-submit script to work in mgnify-pipelines-toolkit container. Removed linkml validation which is more useful for interactive submission. 
--- assets/test-fixtures/mimicc_study.csv | 3 + assets/test-fixtures/mimicc_study.json | 15 + assets/test-fixtures/mimicc_study.tsv | 3 + bin/ena_common.py | 726 +++++++++++++++++++++++++ bin/submit_study.py | 232 ++++---- bin/test_submit_study.py | 430 +++++++++++++++ 6 files changed, 1275 insertions(+), 134 deletions(-) create mode 100644 assets/test-fixtures/mimicc_study.csv create mode 100644 assets/test-fixtures/mimicc_study.json create mode 100644 assets/test-fixtures/mimicc_study.tsv create mode 100644 bin/ena_common.py create mode 100644 bin/test_submit_study.py diff --git a/assets/test-fixtures/mimicc_study.csv b/assets/test-fixtures/mimicc_study.csv new file mode 100644 index 0000000..2b68cc1 --- /dev/null +++ b/assets/test-fixtures/mimicc_study.csv @@ -0,0 +1,3 @@ +Generic,,,,,,,, +IS_PRIMARY,STUDY_TITLE,existing_study_type,new_study_type,STUDY_ABSTRACT,CENTER_NAME,CENTER_PROJECT_NAME,PROJECT_ID,STUDY_DESCRIPTION +YES,MIMICC,Metagenomics,,,,,, \ No newline at end of file diff --git a/assets/test-fixtures/mimicc_study.json b/assets/test-fixtures/mimicc_study.json new file mode 100644 index 0000000..cd9af28 --- /dev/null +++ b/assets/test-fixtures/mimicc_study.json @@ -0,0 +1,15 @@ +{ + "schema": "https://github.com/timrozday/ena-submission-dataharmonizer/SRA_study", + "location": "/templates/sra_study", + "version": "1.0.0", + "in_language": "en", + "Container": { + "SRA_studys": [ + { + "IS_PRIMARY": "YES", + "STUDY_TITLE": "MIMICC", + "existing_study_type": "Metagenomics" + } + ] + } +} \ No newline at end of file diff --git a/assets/test-fixtures/mimicc_study.tsv b/assets/test-fixtures/mimicc_study.tsv new file mode 100644 index 0000000..4682df1 --- /dev/null +++ b/assets/test-fixtures/mimicc_study.tsv @@ -0,0 +1,3 @@ +Generic +IS_PRIMARY STUDY_TITLE existing_study_type new_study_type STUDY_ABSTRACT CENTER_NAME CENTER_PROJECT_NAME PROJECT_ID STUDY_DESCRIPTION +YES MIMICC Metagenomics \ No newline at end of file diff --git a/bin/ena_common.py 
b/bin/ena_common.py new file mode 100644 index 0000000..de08c48 --- /dev/null +++ b/bin/ena_common.py @@ -0,0 +1,726 @@ +"""Shared utilities for ENA submission scripts. + +Provide logging, credential management, file loading, +XSD structural validation, Reports API access, duplicate +detection, XML serialisation, and result output used by +``submit_study.py``, ``submit_sample.py``, and +``submit_reads.py``. +""" + +from __future__ import annotations + +import csv +import datetime +import json +import logging +import os +import sys +import xml.etree.ElementTree as ET +from collections.abc import Callable, Sequence +from io import BytesIO +from pathlib import Path +from typing import Any, Final + +import click +import requests +from requests.auth import HTTPBasicAuth + +# All loggers in the ENA submission scripts are children of +# this root, so configuring it once propagates to all. +_LOGGER_NAME: Final = "ena_submit" + +logger = logging.getLogger(_LOGGER_NAME) + + +# ----------------------------------------------------------- +# Constants +# ----------------------------------------------------------- + +PROD_URL: Final = ( + "https://www.ebi.ac.uk/ena/submit/webin-v2" +) +TEST_URL: Final = ( + "https://wwwdev.ebi.ac.uk/ena/submit/webin-v2" +) + +_MAX_HOLD_YEARS: Final = 2 + + +# ----------------------------------------------------------- +# Logging +# ----------------------------------------------------------- + + +def setup_logging(log_file: Path | None = None) -> None: + """Configure stderr and optional file logging. + + Attach handlers to the ``ena_submit`` parent logger. + Child loggers (e.g. ``ena_submit.study``) propagate + their messages to these handlers automatically. + + Args: + log_file: Path to a log file. If provided, + debug-level messages are written there in + addition to stderr. + """ + root = logging.getLogger(_LOGGER_NAME) + + # Avoid duplicate handlers on repeated calls. 
+ if root.handlers: + return + + fmt = logging.Formatter( + "%(asctime)s [%(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + root.setLevel(logging.DEBUG) + + stderr_handler = logging.StreamHandler(sys.stderr) + stderr_handler.setLevel(logging.INFO) + stderr_handler.setFormatter(fmt) + root.addHandler(stderr_handler) + + if log_file: + file_handler = logging.FileHandler(log_file) + file_handler.setLevel(logging.DEBUG) + file_handler.setFormatter(fmt) + root.addHandler(file_handler) + + +# ----------------------------------------------------------- +# Credentials +# ----------------------------------------------------------- + + +def get_credentials() -> tuple[str, str]: + """Read ENA credentials from environment variables. + + Returns: + Tuple of (*username*, *password*). + + Raises: + SystemExit: If either variable is unset or empty. + """ + username = os.environ.get("ENA_USERNAME", "").strip() + password = os.environ.get("ENA_PASSWORD", "").strip() + if not username or not password: + logger.error( + "ENA_USERNAME and ENA_PASSWORD environment" + " variables must be set", + ) + sys.exit(1) + return username, password + + +# ----------------------------------------------------------- +# ENA API helpers +# ----------------------------------------------------------- + + +def get_base_url(use_test: bool) -> str: + """Return the ENA Webin v2 submission base URL.""" + return TEST_URL if use_test else PROD_URL + + +def submit_xml( + base_url: str, + auth: HTTPBasicAuth, + xml_bytes: bytes, +) -> ET.Element: + """Submit an XML document to ENA via Webin v2. + + Args: + base_url: ENA submission service base URL. + auth: HTTP basic-auth credentials. + xml_bytes: Serialised XML submission document. + + Returns: + Parsed receipt XML element tree root. 
+ """ + url = f"{base_url}/submit" + headers = { + "Content-Type": "application/xml", + "Accept": "application/xml", + } + resp = requests.post( + url, data=xml_bytes, + headers=headers, auth=auth, timeout=120, + ) + resp.raise_for_status() + return ET.fromstring(resp.content) + + +# ----------------------------------------------------------- +# XML utilities +# ----------------------------------------------------------- + + +def xml_to_bytes(root: ET.Element) -> bytes: + """Serialise an ElementTree element to UTF-8 bytes.""" + tree = ET.ElementTree(root) + buf = BytesIO() + tree.write(buf, encoding="UTF-8", xml_declaration=True) + return buf.getvalue() + + +# ----------------------------------------------------------- +# Hold-until date validation +# ----------------------------------------------------------- + + +def validate_hold_until(hold_until: str) -> datetime.date: + """Parse and validate a hold-until date string. + + Args: + hold_until: Date string in ``YYYY-MM-DD`` format. + + Returns: + Parsed date. + + Raises: + click.BadParameter: If the date format is invalid, + in the past, or more than 2 years from today. + """ + try: + hold_date = datetime.date.fromisoformat(hold_until) + except ValueError: + raise click.BadParameter( + f"Invalid date format: {hold_until!r}." + " Expected YYYY-MM-DD." + ) from None + + today = datetime.date.today() + max_date = today.replace(year=today.year + _MAX_HOLD_YEARS) + + if hold_date > max_date: + raise click.BadParameter( + f"Hold date {hold_until} is more than" + f" {_MAX_HOLD_YEARS} years from today" + f" ({today}). Maximum allowed: {max_date}." + ) + + if hold_date <= today: + raise click.BadParameter( + f"Hold date {hold_until} is not in the" + f" future (today is {today})." 
+ ) + + return hold_date + + +# ----------------------------------------------------------- +# ENA checklist XML parsing +# ----------------------------------------------------------- + + +def parse_checklist_units( + xml_path: str | Path, +) -> dict[str, str]: + """Parse an ENA checklist XML and return field units. + + Reads the ```` elements from an ENA checklist XML + file (e.g. ``ERC000015.xml``) and returns a mapping from + slot name to unit string for every field that declares a + ```` element. + + Args: + xml_path: Path to the ENA checklist XML file. + + Returns: + Dict mapping slot name to unit string. + Fields without units are absent from the dict. + """ + units: dict[str, str] = {} + try: + tree = ET.parse(str(xml_path)) + except ET.ParseError as exc: + logger.warning( + "Could not parse checklist XML %s: %s", + xml_path, exc, + ) + return units + + for field in tree.iter("FIELD"): + name_el = field.find("NAME") + if name_el is None or not name_el.text: + continue + units_el = field.find("UNITS") + if units_el is None: + continue + unit_el = units_el.find("UNIT") + if unit_el is None or not unit_el.text: + continue + units[name_el.text.strip()] = unit_el.text.strip() + + return units + + +# ----------------------------------------------------------- +# XSD validation (structural fallback only) +# ----------------------------------------------------------- + + +def validate_xml_against_xsd( + xml_bytes: bytes, + fragment_tag: str | None = None, + fallback_checker: Callable[ + [bytes, list[str]], tuple[bool, list[str]] + ] | None = None, +) -> tuple[bool, list[str]]: + """Validate XML bytes using a structural check. + + Full XSD validation via lxml is not available in this + container. Uses *fallback_checker* if provided, + otherwise checks that the document is well-formed XML. + + Args: + xml_bytes: Serialised XML document. + fragment_tag: Unused; kept for API compatibility. 
+ fallback_checker: Optional function called with + (*xml_bytes*, *messages*) that returns + (*is_valid*, *messages*). + + Returns: + Tuple of (*is_valid*, *messages*). + """ + messages: list[str] = [] + + if fallback_checker is not None: + return fallback_checker(xml_bytes, messages) + + try: + ET.fromstring(xml_bytes) + except ET.ParseError as exc: + messages.append( + f"ERROR: XML is not well-formed: {exc}" + ) + return False, messages + + messages.append( + "XML is well-formed (basic check passed)" + ) + return True, messages + + +# ----------------------------------------------------------- +# File loading (JSON, CSV, TSV) +# ----------------------------------------------------------- + + +def _is_metadata_row(row: Sequence[object]) -> bool: + """Check whether *row* is a DataHarmonizer label row. + + These rows have at most one non-empty cell. + """ + non_empty = sum( + 1 for c in row + if c is not None and str(c).strip() + ) + return non_empty <= 1 + + +def extract_records_from_tabular( + filepath: str | Path, + delimiter: str = ",", +) -> list[dict[str, str]]: + """Extract record dicts from a CSV or TSV file. + + Skip an optional DataHarmonizer metadata row if + detected. + + Args: + filepath: Path to the tabular file. + delimiter: Column delimiter character. + + Returns: + List of record dicts. 
+ """ + with open(filepath, newline="", encoding="utf-8") as fh: + rows = list(csv.reader(fh, delimiter=delimiter)) + + if not rows: + return [] + + idx = 0 + if _is_metadata_row(rows[idx]): + idx += 1 + if idx >= len(rows): + return [] + + headers = rows[idx] + idx += 1 + + records: list[dict[str, str]] = [] + for row in rows[idx:]: + record: dict[str, str] = {} + for col, val in zip(headers, row): + col = col.strip() + if col and val is not None and val.strip(): + record[col] = val.strip() + if record: + records.append(record) + + return records + + +def extract_records_from_json( + input_data: object, + record_keys: Sequence[str] = ("data",), +) -> list[dict[str, Any]] | None: + """Extract record dicts from a DataHarmonizer JSON export. + + Handle several JSON shapes: + + * DataHarmonizer Container format:: + + {"Container": {"s": [{...}, ...]}} + + * Plain list of dicts. + * Dict with an entity-specific key or ``data`` key. + * Single record object (no wrapper). + + Args: + input_data: Parsed JSON data (any shape). + record_keys: Dict keys to check for record lists + (e.g. ``["studies", "data"]``). + + Returns: + List of record dicts, or ``None`` if unrecognised. + """ + if isinstance(input_data, list): + return input_data + + if isinstance(input_data, dict): + container = input_data.get("Container") + if isinstance(container, dict): + for key, val in container.items(): + if isinstance(val, list): + logger.info( + "Extracted records from" + " Container.%s", + key, + ) + return val + + for key in record_keys: + if key in input_data: + return input_data[key] + + return [input_data] + + return None + + +def load_input_file( + filepath: str | Path, + json_record_keys: Sequence[str] = ("data",), +) -> list[dict[str, Any]] | None: + """Load records from a supported file format. + + Supported formats: JSON, CSV, TSV. + + Args: + filepath: Path to the input file. + json_record_keys: Dict keys to check when parsing + JSON (e.g. ``["studies", "data"]``). 
+ + Returns: + List of record dicts, or ``None`` if the format is + unrecognised. + """ + ext = Path(filepath).suffix.lower() + if ext == ".json": + with open(filepath) as fh: + input_data = json.load(fh) + return extract_records_from_json( + input_data, json_record_keys, + ) + if ext == ".csv": + return extract_records_from_tabular( + filepath, delimiter=",", + ) + if ext == ".tsv": + return extract_records_from_tabular( + filepath, delimiter="\t", + ) + return None + + +# ----------------------------------------------------------- +# Reports API +# ----------------------------------------------------------- + + +def fetch_from_reports_endpoint( + url: str, + auth: HTTPBasicAuth, + max_results: int = 5000, +) -> list[dict[str, Any]] | None: + """Fetch records from a single Webin Reports endpoint. + + Args: + url: Full URL of the reports endpoint. + auth: HTTP basic-auth credentials. + max_results: Maximum number of results to request. + + Returns: + List of raw report dicts, or ``None`` on error. 
+ """ + params = { + "format": "json", + "max-results": max_results, + } + + req = requests.Request( + "GET", url, params=params, auth=auth, + ) + prepared = req.prepare() + logger.debug( + 'curl -u %s:*** "%s"', + auth.username, prepared.url, + ) + + try: + resp = requests.get( + url, params=params, auth=auth, timeout=60, + ) + logger.info( + "Reports API at %s returned %s", + url, resp.status_code, + ) + resp.raise_for_status() + return resp.json() + + except requests.exceptions.HTTPError as exc: + status = ( + exc.response.status_code + if exc.response is not None + else "unknown" + ) + if status == 404: + logger.info( + "Reports API at %s returned 404" + " — no records yet", + url, + ) + return [] + if status in (401, 403): + logger.warning( + "Reports API at %s returned %s" + " — endpoint may not be available" + " or credentials may differ", + url, status, + ) + return None + logger.warning( + "Reports API at %s returned HTTP %s", + url, status, + ) + return None + + except requests.exceptions.RequestException as exc: + logger.warning( + "Reports API at %s failed: %s", url, exc, + ) + return None + + +def fetch_account_records( + auth: HTTPBasicAuth, + use_test: bool, + prod_url: str, + test_url: str, + normalizer: Callable[ + [dict[str, Any]], dict[str, str] | None + ], + entity_label: str, + max_results: int = 5000, +) -> list[dict[str, str]]: + """Fetch and normalise records from the Reports API. + + Try test endpoint first (if *use_test*), then fall back + to production. + + Args: + auth: HTTP basic-auth credentials. + use_test: Try the test endpoint first. + prod_url: Production reports endpoint URL. + test_url: Test reports endpoint URL. + normalizer: Callable that maps a raw report dict to + a normalised dict, or ``None`` to skip. + entity_label: Label for log messages (e.g. + ``"studies"``). + max_results: Maximum number of results to request. + + Returns: + List of normalised record dicts. 
+ """ + urls = ( + [test_url, prod_url] if use_test + else [prod_url] + ) + + for url in urls: + logger.info( + "Fetching account %s from: %s", + entity_label, url, + ) + raw = fetch_from_reports_endpoint( + url, auth, max_results, + ) + if raw is None: + continue + + records: list[dict[str, str]] = [] + for entry in raw: + report = entry.get("report") + if report is None: + continue + normalized = normalizer(report) + if normalized is not None: + records.append(normalized) + + logger.info( + "Found %d %s in account", + len(records), entity_label, + ) + return records + + logger.warning( + "Could not reach any Webin reports endpoint." + " Duplicate checking for %s will be skipped.", + entity_label, + ) + return [] + + +# ----------------------------------------------------------- +# Duplicate detection (alias + title matching) +# ----------------------------------------------------------- + + +def find_duplicates_by_alias_title( + new_records: Sequence[dict[str, Any]], + account_records: Sequence[dict[str, str]], + title_field: str, + entity_label: str, +) -> dict[int, dict[str, str]]: + """Check new records against account records. + + Match by ``alias`` (preferred) or by the entity-specific + title field against the pre-fetched account records from + the Webin Reports API. + + Args: + new_records: Records the user wants to submit. + account_records: Existing records already registered + under the Webin account. + title_field: Field name for the title in new records + (e.g. ``"STUDY_TITLE"`` or ``"SAMPLE_TITLE"``). + entity_label: Label for log messages. + + Returns: + Mapping of index in *new_records* to matching + existing record info. 
+ """ + duplicates: dict[int, dict[str, str]] = {} + total = len(new_records) + + if not account_records: + return duplicates + + by_title: dict[str, dict[str, str]] = {} + by_alias: dict[str, dict[str, str]] = {} + for rec in account_records: + title = (rec.get("title") or "").strip() + alias = (rec.get("alias") or "").strip() + if title: + by_title[title] = rec + if alias: + by_alias[alias] = rec + + logger.info( + "Checking %d new %s against" + " %d existing account %s...", + total, entity_label, + len(account_records), entity_label, + ) + + for i, record in enumerate(new_records): + new_title = ( + record.get(title_field) or "" + ).strip() + new_alias = (record.get("alias") or "").strip() + + if not new_title and not new_alias: + continue + + match = _match_by_alias_title( + new_alias, new_title, by_alias, by_title, + ) + if match is not None: + duplicates[i] = match + logger.info( + " Duplicate: '%s' matches %s -> %s (%s)", + new_title or new_alias, + match["match_reason"], + match["accession"], + match["status"], + ) + + if len(duplicates) == total: + logger.info( + "All %s are duplicates" + " — skipping further checks", + entity_label, + ) + return duplicates + + return duplicates + + +def _match_by_alias_title( + new_alias: str, + new_title: str, + by_alias: dict[str, dict[str, str]], + by_title: dict[str, dict[str, str]], +) -> dict[str, str] | None: + """Return matching record info or ``None``.""" + if new_alias and new_alias in by_alias: + rec = by_alias[new_alias] + reason = f"alias '{new_alias}'" + elif new_title and new_title in by_title: + rec = by_title[new_title] + reason = f"title '{new_title}'" + else: + return None + + return { + "accession": rec.get("accession", ""), + "secondary_accession": rec.get( + "secondary_accession", "" + ), + "alias": rec.get("alias", ""), + "title": rec.get("title", ""), + "status": rec.get("status", "UNKNOWN"), + "match_reason": reason, + } + + +# ----------------------------------------------------------- +# Result 
output +# ----------------------------------------------------------- + + +def write_results( + results: dict[str, list[dict[str, Any]]], + output_path: Path | None, +) -> None: + """Write JSON results to file or stdout.""" + json_str = json.dumps(results, indent=2) + if output_path: + with open(output_path, "w") as fh: + fh.write(json_str + "\n") + logger.info("Results written to %s", output_path) + else: + print(json_str) diff --git a/bin/submit_study.py b/bin/submit_study.py index 9ec012a..656f746 100644 --- a/bin/submit_study.py +++ b/bin/submit_study.py @@ -2,7 +2,6 @@ """Submit studies to ENA via the Webin REST API v2. Read a DataHarmonizer export containing study metadata, -validate it against a LinkML schema and an XSD schema, check for duplicate studies already registered under the Webin account, construct an XML submission document, and submit new studies to ENA. @@ -15,46 +14,36 @@ Usage:: - python scripts/submit_study.py \\ - --input studies.json \\ - --linkml schemas/SRA_study.yaml \\ - --xsd assets/ena_schema \\ + python scripts/submit_study.py \ + --input studies.json \ --test # With hold date (max 2 years): - python scripts/submit_study.py \\ - --input studies.json \\ - --linkml schemas/SRA_study.yaml \\ - --xsd assets/ena_schema \\ + python scripts/submit_study.py \ + --input studies.json \ --hold-until 2028-01-01 # Log to file: - python scripts/submit_study.py \\ - --input studies.json \\ - --linkml schemas/SRA_study.yaml \\ - --xsd assets/ena_schema \\ + python scripts/submit_study.py \ + --input studies.json \ --test --log submission.log """ from __future__ import annotations +import datetime import logging import sys import xml.etree.ElementTree as ET from pathlib import Path from typing import Any, Final -import pendulum +import click import requests -import typer from requests.auth import HTTPBasicAuth import ena_common as common -app = typer.Typer( - help="Submit studies to ENA via the Webin REST API v2.", -) - logger = 
logging.getLogger("ena_submit.study") @@ -180,7 +169,7 @@ def build_submission_xml( ) sub_alias = ( "study-submission-" - + pendulum.now().format("YYYYMMDD-HHmmss") + + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") ) submission.set("alias", sub_alias) actions = ET.SubElement(submission, "ACTIONS") @@ -259,7 +248,7 @@ def _add_project_attribute( # ----------------------------------------------------------- -# XSD validation (study-specific fallback) +# Structural XML validation (study-specific) # ----------------------------------------------------------- @@ -267,7 +256,7 @@ def _validate_study_xml_structure( xml_bytes: bytes, messages: list[str], ) -> tuple[bool, list[str]]: - """Fallback structural check for study XML.""" + """Structural check for study XML.""" try: tree = ET.fromstring(xml_bytes) except ET.ParseError as exc: @@ -314,24 +303,19 @@ def _validate_study_xml_structure( return True, messages -def validate_against_xsd( +def validate_study_xml( xml_bytes: bytes, - xsd_dir: str | Path, ) -> tuple[bool, list[str]]: - """Validate study XML against ENA.project.xsd. + """Validate study XML structure. Args: xml_bytes: Serialised XML document. - xsd_dir: Directory containing ``ENA.project.xsd`` - and ``SRA.common.xsd``. Returns: Tuple of (*is_valid*, *messages*). """ return common.validate_xml_against_xsd( - xml_bytes, xsd_dir, - xsd_filename="ENA.project.xsd", - fragment_tag="PROJECT_SET", + xml_bytes, fallback_checker=_validate_study_xml_structure, ) @@ -405,7 +389,6 @@ def _do_submission( base_url: str, auth: Any, xml_bytes: bytes, - xsd: Path, action: str, results: dict[str, list[dict[str, Any]]], result_key: str, @@ -418,7 +401,6 @@ def _do_submission( base_url: ENA Webin v2 submission base URL. auth: HTTP basic-auth credentials. xml_bytes: Serialised XML submission document. - xsd: Directory containing the XSD files. action: Label for log messages (``"ADD"`` or ``"MODIFY"``). results: Results dict to accumulate into. 
@@ -429,19 +411,17 @@ def _do_submission( Returns: ``True`` if the batch succeeded (or dry run). """ - xsd_valid, xsd_messages = validate_against_xsd( - xml_bytes, xsd, - ) - for msg in xsd_messages: + xml_valid, xml_messages = validate_study_xml(xml_bytes) + for msg in xml_messages: logger.info(" %s", msg) - if not xsd_valid: + if not xml_valid: logger.error( - "XSD validation FAILED (%s)" + "XML validation FAILED (%s)" " — aborting submission", action, ) return False - logger.info("XSD validation PASSED (%s)", action) + logger.info("XML validation PASSED (%s)", action) if dry_run: logger.info( @@ -509,75 +489,86 @@ def _do_submission( _JSON_RECORD_KEYS: Final = ("studies", "data") -@app.command() +@click.command( + help="Submit studies to ENA via the Webin REST API v2.", +) +@click.option( + "--input", "input_file", + required=True, + type=click.Path(exists=True, path_type=Path), + help="Path to study metadata file (JSON, CSV, or TSV)", +) +@click.option( + "--test", "use_test", + is_flag=True, default=False, + help="Use the ENA test service" + " (submissions are discarded daily)", +) +@click.option( + "--hold-until", + default=None, + help="Hold studies private until this date" + " (YYYY-MM-DD, max 2 years from now)", +) +@click.option( + "--log", "log_file", + type=click.Path(path_type=Path), + default=None, + help="Path to log file", +) +@click.option( + "--output", + type=click.Path(path_type=Path), + default=None, + help="Path to write JSON accession results" + " (default: stdout)", +) +@click.option( + "--max-results", + default=5000, + help="Maximum number of projects to fetch" + " from the Reports API for duplicate checking", +) +@click.option( + "--dry-run", + is_flag=True, default=False, + help="Validate and build XML but do not" + " submit to ENA", +) +@click.option( + "--automated", + is_flag=True, default=False, + help="Skip duplicate detection against the" + " Webin Reports API (for automated pipelines)", +) +@click.option( + "--force", + 
is_flag=True, default=False, + help="Submit duplicate studies using the MODIFY" + " action to overwrite existing ENA records," + " instead of skipping them", +) def main( - input_file: Path = typer.Option( - ..., "--input", exists=True, - help="Path to study metadata file" - " (JSON, CSV, TSV, XLS, or XLSX)", - ), - linkml: Path = typer.Option( - ..., exists=True, - help="Path to LinkML YAML schema" - " (e.g. schemas/SRA_study.yaml)", - ), - xsd: Path = typer.Option( - ..., exists=True, - file_okay=False, resolve_path=True, - help="Directory containing ENA.project.xsd" - " and SRA.common.xsd", - ), - test: bool = typer.Option( - False, "--test", - help="Use the ENA test service" - " (submissions are discarded daily)", - ), - hold_until: str | None = typer.Option( - None, "--hold-until", - help="Hold studies private until this date" - " (YYYY-MM-DD, max 2 years from now)", - ), - log: Path | None = typer.Option( - None, help="Path to log file", - ), - output: Path | None = typer.Option( - None, - help="Path to write JSON accession results" - " (default: stdout)", - ), - max_results: int = typer.Option( - 5000, "--max-results", - help="Maximum number of projects to fetch" - " from the Reports API for duplicate" - " checking", - ), - dry_run: bool = typer.Option( - False, "--dry-run", - help="Validate and build XML but do not" - " submit to ENA", - ), - automated: bool = typer.Option( - False, "--automated", - help="Skip duplicate detection against the" - " Webin Reports API (for automated pipelines)", - ), - force: bool = typer.Option( - False, "--force", - help="Submit duplicate studies using the MODIFY" - " action to overwrite existing ENA records," - " instead of skipping them", - ), + input_file: Path, + use_test: bool, + hold_until: str | None, + log_file: Path | None, + output: Path | None, + max_results: int, + dry_run: bool, + automated: bool, + force: bool, ) -> None: """Submit studies to ENA via the Webin REST API v2.""" - common.setup_logging(log) + 
common.setup_logging(log_file) username, password = common.get_credentials() - env_label = "TEST" if test else "PRODUCTION" + env_label = "TEST" if use_test else "PRODUCTION" logger.info( "ENA Study Submission — environment: %s", env_label, ) - base_url = common.get_base_url(test) + base_url = common.get_base_url(use_test) auth = HTTPBasicAuth(username, password) logger.debug("Auth username: %s", username) @@ -592,7 +583,7 @@ def main( if studies is None: logger.error( "Unsupported file format." - " Supported: .json, .csv, .tsv, .xlsx, .xls", + " Supported: .json, .csv, .tsv", ) sys.exit(1) @@ -609,7 +600,7 @@ def main( duplicates: dict[int, dict[str, Any]] = {} else: account_studies = fetch_account_studies( - auth, use_test=test, + auth, use_test=use_test, max_results=max_results, ) for ps in account_studies: @@ -691,36 +682,9 @@ def main( len(studies_to_submit), len(studies_to_modify), ) - # -- Step 3: Validate against LinkML ----------------- - logger.info("Loading LinkML schema: %s", linkml) - schema = common.load_linkml_schema(linkml) - - logger.info( - "Validating input against LinkML schema...", - ) - linkml_valid, linkml_messages = ( - common.validate_against_linkml( - studies_to_submit + studies_to_modify, schema, - label_fields=["STUDY_TITLE", "alias"], - entity_name="study", - unknown_field_note="will be ignored", - ) - ) - for msg in linkml_messages: - logger.info(" %s", msg) - - if not linkml_valid: - logger.error( - "LinkML validation FAILED" - " — aborting submission", - ) - sys.exit(1) - - logger.info("LinkML validation PASSED") - overall_ok = True - # -- Steps 4-7: ADD new studies ---------------------- + # -- Step 3: ADD new studies ------------------------- if studies_to_submit: logger.info( "Building ADD XML for %d new study/studies...", @@ -740,7 +704,7 @@ def main( len(xml_bytes), ) ok = _do_submission( - base_url, auth, xml_bytes, xsd, + base_url, auth, xml_bytes, action="ADD", results=results, result_key="submitted", @@ -749,7 +713,7 @@ def 
main( ) overall_ok = overall_ok and ok - # -- Steps 4-7: MODIFY duplicate studies (--force) --- + # -- Step 4: MODIFY duplicate studies (--force) ------ if studies_to_modify: logger.info( "Building MODIFY XML for %d duplicate(s)...", @@ -769,7 +733,7 @@ def main( len(xml_bytes), ) ok = _do_submission( - base_url, auth, xml_bytes, xsd, + base_url, auth, xml_bytes, action="MODIFY", results=results, result_key="modified", @@ -781,7 +745,7 @@ def main( if not overall_ok: sys.exit(1) - # -- Step 8: Output results -------------------------- + # -- Step 5: Output results -------------------------- common.write_results(results, output) logger.info("=" * 60) @@ -822,4 +786,4 @@ def main( if __name__ == "__main__": - app() + main() diff --git a/bin/test_submit_study.py b/bin/test_submit_study.py new file mode 100644 index 0000000..5944207 --- /dev/null +++ b/bin/test_submit_study.py @@ -0,0 +1,430 @@ +#!/usr/bin/env python3 +"""Tests for submit_study.py and ena_common.py — study submission pipeline. 
+ +Usage: + pytest bin/test_submit_study.py -v +""" + +from __future__ import annotations + +import json +import os +import sys + +import pytest + +# Ensure the scripts directory is importable +sys.path.insert(0, os.path.dirname(__file__)) + +import ena_common as common +from submit_study import ( + build_submission_xml, + find_duplicate_studies, + validate_study_xml, +) + +# --------------------------------------------------------------------------- +# Paths +# --------------------------------------------------------------------------- + +FIXTURES_DIR = os.path.join( + os.path.dirname(__file__), "..", "assets", "test-fixtures", +) +MIMICC_JSON = os.path.join(FIXTURES_DIR, "mimicc_study.json") +MIMICC_CSV = os.path.join(FIXTURES_DIR, "mimicc_study.csv") +MIMICC_TSV = os.path.join(FIXTURES_DIR, "mimicc_study.tsv") + +_FIXTURES_PRESENT = os.path.isfile(MIMICC_JSON) +requires_fixtures = pytest.mark.skipif( + not _FIXTURES_PRESENT, + reason="mimicc test fixtures not present in assets/test-fixtures/", +) + +_JSON_RECORD_KEYS = ("studies", "data") + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture +def mimicc_json(): + """Load the MIMICC study JSON fixture.""" + with open(MIMICC_JSON) as f: + return json.load(f) + + +# --------------------------------------------------------------------------- +# extract_records_from_json tests +# --------------------------------------------------------------------------- + + +class TestExtractRecordsFromJson: + """Tests for extracting study rows from various JSON formats.""" + + @requires_fixtures + def test_dataharmonizer_container_format(self, mimicc_json): + """The mimicc_study.json fixture uses DataHarmonizer Container format.""" + studies = common.extract_records_from_json( + mimicc_json, record_keys=_JSON_RECORD_KEYS, + ) + assert studies is not None + assert len(studies) == 1 + assert 
studies[0]["STUDY_TITLE"] == "MIMICC" + assert studies[0]["existing_study_type"] == "Metagenomics" + assert studies[0]["IS_PRIMARY"] == "YES" + + def test_plain_list(self): + """Plain list input returns the list as-is.""" + data = [{"STUDY_TITLE": "Test Study", "IS_PRIMARY": "YES"}] + studies = common.extract_records_from_json( + data, record_keys=_JSON_RECORD_KEYS, + ) + assert studies == data + + def test_dict_with_studies_key(self): + """Dict with 'studies' key extracts the list.""" + data = {"studies": [{"STUDY_TITLE": "A"}, {"STUDY_TITLE": "B"}]} + studies = common.extract_records_from_json( + data, record_keys=_JSON_RECORD_KEYS, + ) + assert len(studies) == 2 + + def test_dict_with_data_key(self): + """Dict with 'data' key extracts the list.""" + data = {"data": [{"STUDY_TITLE": "C"}]} + studies = common.extract_records_from_json( + data, record_keys=_JSON_RECORD_KEYS, + ) + assert len(studies) == 1 + + def test_single_study_object(self): + """Single dict input is wrapped in a list.""" + data = {"STUDY_TITLE": "Single"} + studies = common.extract_records_from_json( + data, record_keys=_JSON_RECORD_KEYS, + ) + assert len(studies) == 1 + assert studies[0]["STUDY_TITLE"] == "Single" + + def test_invalid_input(self): + """Non-dict/list input returns None.""" + result = common.extract_records_from_json( + "not a dict or list", record_keys=_JSON_RECORD_KEYS, + ) + assert result is None + + def test_container_with_multiple_studies(self): + """Container format with multiple studies extracts all.""" + data = { + "Container": { + "SRA_studys": [ + {"STUDY_TITLE": "Study A"}, + {"STUDY_TITLE": "Study B"}, + ], + }, + } + studies = common.extract_records_from_json( + data, record_keys=_JSON_RECORD_KEYS, + ) + assert len(studies) == 2 + + +# --------------------------------------------------------------------------- +# XML building tests +# --------------------------------------------------------------------------- + + +class TestBuildSubmissionXml: + """Tests for 
building ENA study submission XML."""
+
+    def test_basic_xml_structure(self):
+        """Built XML contains expected elements and attributes."""
+        studies = [
+            {
+                "alias": "test-study",
+                "STUDY_TITLE": "Test Study",
+                "STUDY_ABSTRACT": "Abstract text.",
+                "existing_study_type": "Metagenomics",
+            },
+        ]
+        root = build_submission_xml(studies)
+        xml_bytes = common.xml_to_bytes(root)
+        xml_str = xml_bytes.decode("utf-8")
+        # NOTE(review): element name reconstructed from garbled export —
+        # verify against the actual build_submission_xml output
+        assert "<PROJECT_SET>" in xml_str
+        assert 'alias="test-study"' in xml_str
+        assert "Test Study" in xml_str
+        assert "Abstract text." in xml_str
+
+
+# ---------------------------------------------------------------------------
+# Duplicate detection tests
+# ---------------------------------------------------------------------------
+
+
+class TestFindDuplicateStudies:
+    """Tests for duplicate detection by alias and title."""
+
+    def _make_account_study(
+        self,
+        title: str = "",
+        alias: str = "",
+        accession: str = "",
+        secondary_accession: str = "",
+        status: str = "PRIVATE",
+    ) -> dict[str, str]:
+        """Build a normalised account study dict."""
+        return {
+            "title": title,
+            "alias": alias,
+            "accession": accession,
+            "secondary_accession": secondary_accession,
+            "status": status,
+        }
+
+    def test_no_duplicates(self):
+        """No match when titles and aliases differ."""
+        new = [{"STUDY_TITLE": "New Study", "alias": "new-1"}]
+        account = [
+            self._make_account_study(
+                title="Other Study", alias="other-1",
+            ),
+        ]
+        dups = find_duplicate_studies(new, account)
+        assert len(dups) == 0
+
+    def test_duplicate_by_title(self):
+        """Exact title match flags a duplicate."""
+        new = [{"STUDY_TITLE": "Existing Study"}]
+        account = [
+            self._make_account_study(
+                title="Existing Study",
+                accession="PRJEB99",
+                status="PRIVATE",
+            ),
+        ]
+        dups = find_duplicate_studies(new, account)
+        assert 0 in dups
+        assert dups[0]["accession"] == "PRJEB99"
+
+    def test_duplicate_by_alias(self):
+        """Alias match flags a duplicate even with different title."""
+        new = [{"STUDY_TITLE": "New Title", "alias": "my-alias"}]
+        account = [
+            self._make_account_study(
+                title="Different Title",
+                alias="my-alias",
+                accession="PRJEB60",
+            ),
+        ]
+        dups = find_duplicate_studies(new, account)
+        assert 0 in dups
+        assert dups[0]["accession"] == "PRJEB60"
+        assert "alias" in dups[0]["match_reason"]
+
+    def test_alias_takes_precedence_over_title(self):
+        """When alias matches, it is reported as the match reason."""
+        new = 
[{"STUDY_TITLE": "Same Title", "alias": "same-alias"}] + account = [ + self._make_account_study( + title="Same Title", + alias="same-alias", + accession="PRJEB70", + ), + ] + dups = find_duplicate_studies(new, account) + assert 0 in dups + assert "alias" in dups[0]["match_reason"] + + def test_partial_title_not_duplicate(self): + """Partial title match does not count as a duplicate.""" + new = [{"STUDY_TITLE": "My Study"}] + account = [ + self._make_account_study( + title="My Study Extended Title", + ), + ] + dups = find_duplicate_studies(new, account) + assert len(dups) == 0 + + def test_empty_account_no_duplicates(self): + """Empty account list produces no duplicates.""" + new = [{"STUDY_TITLE": "Test", "alias": "t"}] + dups = find_duplicate_studies(new, []) + assert len(dups) == 0 + + def test_empty_input_no_duplicates(self): + """Empty input list produces no duplicates.""" + account = [ + self._make_account_study(title="Existing"), + ] + dups = find_duplicate_studies([], account) + assert len(dups) == 0 + + def test_study_without_title_or_alias_skipped(self): + """Studies with no title or alias are not flagged.""" + new = [{}] + account = [ + self._make_account_study(title="Something"), + ] + dups = find_duplicate_studies(new, account) + assert len(dups) == 0 + + def test_mixed_duplicates_and_new(self): + """Mix of duplicate and new studies.""" + account = [ + self._make_account_study( + title="Dup By Title", + alias="dup-title", + accession="PRJEB10", + ), + self._make_account_study( + title="Other", + alias="dup-alias", + accession="PRJEB20", + ), + ] + new = [ + {"STUDY_TITLE": "Dup By Title", "alias": "new-alias"}, + {"STUDY_TITLE": "New Title", "alias": "dup-alias"}, + {"STUDY_TITLE": "Brand New", "alias": "brand-new"}, + ] + dups = find_duplicate_studies(new, account) + assert 0 in dups # title match + assert 1 in dups # alias match + assert 2 not in dups # new + + def test_all_duplicates_early_exit(self): + """All studies being duplicates terminates 
early.""" + account = [ + self._make_account_study( + title="A", accession="PRJEB1", + ), + self._make_account_study( + title="B", accession="PRJEB2", + ), + ] + new = [ + {"STUDY_TITLE": "A"}, + {"STUDY_TITLE": "B"}, + ] + dups = find_duplicate_studies(new, account) + assert len(dups) == 2 + + +# --------------------------------------------------------------------------- +# File loading tests (JSON, CSV, TSV) +# --------------------------------------------------------------------------- + +# The expected study data shared by all supported fixtures +EXPECTED_STUDY = { + "IS_PRIMARY": "YES", + "STUDY_TITLE": "MIMICC", + "existing_study_type": "Metagenomics", +} + + +@requires_fixtures +class TestLoadInputFile: + """Tests for loading study data from JSON, CSV, and TSV files.""" + + def test_load_csv(self): + """CSV file loads correctly.""" + studies = common.load_input_file( + MIMICC_CSV, json_record_keys=_JSON_RECORD_KEYS, + ) + assert studies is not None + assert len(studies) == 1 + for key, val in EXPECTED_STUDY.items(): + assert studies[0][key] == val + + def test_load_tsv(self): + """TSV file loads correctly.""" + studies = common.load_input_file( + MIMICC_TSV, json_record_keys=_JSON_RECORD_KEYS, + ) + assert studies is not None + assert len(studies) == 1 + for key, val in EXPECTED_STUDY.items(): + assert studies[0][key] == val + + def test_load_json(self): + """JSON file loads correctly.""" + studies = common.load_input_file( + MIMICC_JSON, json_record_keys=_JSON_RECORD_KEYS, + ) + assert studies is not None + assert len(studies) == 1 + for key, val in EXPECTED_STUDY.items(): + assert studies[0][key] == val + + def test_all_formats_produce_same_data(self): + """All supported formats should produce the same core study fields.""" + all_studies = [ + common.load_input_file( + path, json_record_keys=_JSON_RECORD_KEYS, + ) + for path in [MIMICC_JSON, MIMICC_CSV, MIMICC_TSV] + ] + for studies in all_studies: + assert len(studies) == 1 + for key, val in 
EXPECTED_STUDY.items(): + assert studies[0][key] == val + + def test_unknown_extension_returns_none(self, tmp_path): + """Unsupported file extension returns None.""" + unknown = tmp_path / "data.parquet" + unknown.write_text("dummy") + result = common.load_input_file( + str(unknown), json_record_keys=_JSON_RECORD_KEYS, + ) + assert result is None + + def test_csv_without_metadata_row(self, tmp_path): + """A CSV with no metadata row should still work.""" + csvfile = tmp_path / "no_meta.csv" + csvfile.write_text("STUDY_TITLE,IS_PRIMARY\nTest,YES\n") + studies = common.load_input_file( + str(csvfile), json_record_keys=_JSON_RECORD_KEYS, + ) + assert len(studies) == 1 + assert studies[0]["STUDY_TITLE"] == "Test" + assert studies[0]["IS_PRIMARY"] == "YES" + + def test_tabular_empty_values_omitted(self, tmp_path): + """Empty cells in tabular files should be omitted.""" + csvfile = tmp_path / "sparse.csv" + csvfile.write_text( + "STUDY_TITLE,STUDY_ABSTRACT,IS_PRIMARY\nTest,,YES\n", + ) + studies = common.load_input_file( + str(csvfile), json_record_keys=_JSON_RECORD_KEYS, + ) + assert len(studies) == 1 + assert "STUDY_ABSTRACT" not in studies[0] + assert studies[0]["STUDY_TITLE"] == "Test" From 55c3ca332633ae3519d435b0aac585b3f65bdf7e Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Thu, 12 Mar 2026 11:52:08 +0000 Subject: [PATCH 03/36] Wrote submit_study module based on submit_study python script. 
--- bin/submit_study.py | 0 modules/local/submit_study/environment.yml | 10 +++ modules/local/submit_study/main.nf | 47 +++++++++++++ modules/local/submit_study/meta.yml | 68 +++++++++++++++++++ modules/local/submit_study/tests/main.nf.test | 56 +++++++++++++++ .../submit_study/tests/main.nf.test.snap | 35 ++++++++++ .../local/submit_study/tests/nextflow.config | 18 +++++ nextflow.config | 10 +++ 8 files changed, 244 insertions(+) mode change 100644 => 100755 bin/submit_study.py create mode 100644 modules/local/submit_study/environment.yml create mode 100644 modules/local/submit_study/main.nf create mode 100644 modules/local/submit_study/meta.yml create mode 100644 modules/local/submit_study/tests/main.nf.test create mode 100644 modules/local/submit_study/tests/main.nf.test.snap create mode 100644 modules/local/submit_study/tests/nextflow.config diff --git a/bin/submit_study.py b/bin/submit_study.py old mode 100644 new mode 100755 diff --git a/modules/local/submit_study/environment.yml b/modules/local/submit_study/environment.yml new file mode 100644 index 0000000..6ee92a8 --- /dev/null +++ b/modules/local/submit_study/environment.yml @@ -0,0 +1,10 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::python>=3.12 + - conda-forge::pip + - pip: + - mgnify-pipelines-toolkit==1.4.17 diff --git a/modules/local/submit_study/main.nf b/modules/local/submit_study/main.nf new file mode 100644 index 0000000..47a9d88 --- /dev/null +++ b/modules/local/submit_study/main.nf @@ -0,0 +1,47 @@ +process SUBMIT_STUDY { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "quay.io/microbiome-informatics/mgnify-pipelines-toolkit:1.4.17" + + // ENA_USERNAME and ENA_PASSWORD must be set in the process environment. 
+ // In the pipeline, map Nextflow secrets via conf/modules.config or nextflow.config: + // env { ENA_USERNAME = secrets.WEBIN_ACCOUNT; ENA_PASSWORD = secrets.WEBIN_PASSWORD } + + input: + tuple val(meta), path(study_metadata) + + output: + tuple val(meta), path("*_accessions.json"), emit: accessions + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + submit_study.py \\ + --input ${study_metadata} \\ + --output ${prefix}_accessions.json \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mgnify-pipelines-toolkit: \$(python -c "import importlib.metadata; print(importlib.metadata.version('mgnify-pipelines-toolkit'))") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + echo '{"submitted":[],"duplicates":[],"modified":[],"failed":[]}' > ${prefix}_accessions.json + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + mgnify-pipelines-toolkit: \$(python -c "import importlib.metadata; print(importlib.metadata.version('mgnify-pipelines-toolkit'))") + END_VERSIONS + """ +} diff --git a/modules/local/submit_study/meta.yml b/modules/local/submit_study/meta.yml new file mode 100644 index 0000000..e09d150 --- /dev/null +++ b/modules/local/submit_study/meta.yml @@ -0,0 +1,68 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "submit_study" +description: | + Submit a new study to ENA via the Webin REST API v2. + Reads study metadata from a JSON, CSV, or TSV file, checks for + duplicate studies already registered under the Webin account, + builds a PROJECT XML submission document, and submits to ENA. + Credentials are read from the WEBIN_ACCOUNT and WEBIN_PASSWORD + Nextflow secrets, which are mapped to ENA_USERNAME and ENA_PASSWORD + inside the process. 
+keywords: + - ena + - submission + - study + - project + - webin +tools: + - mgnify-pipelines-toolkit: + description: | + A toolkit of utilities used in MGnify metagenomics pipelines, + including click, requests, and other dependencies required by + the ENA submission scripts. + homepage: https://github.com/EBI-Metagenomics/mgnify-pipelines-toolkit + documentation: https://github.com/EBI-Metagenomics/mgnify-pipelines-toolkit + tool_dev_url: https://github.com/EBI-Metagenomics/mgnify-pipelines-toolkit + doi: "" + licence: ["Apache-2.0"] + identifier: null + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information. + e.g. `[ id:'sample1' ]` + - study_metadata: + type: file + description: | + Study metadata file in JSON, CSV, or TSV format. + JSON may follow the DataHarmonizer Container export format or be + a plain list/dict of study records. + Required fields per record: STUDY_TITLE, existing_study_type. + pattern: "*.{json,csv,tsv}" + +output: + - accessions: + - meta: + type: map + description: | + Groovy Map containing sample information. + e.g. `[ id:'sample1' ]` + - "*_accessions.json": + type: file + description: | + JSON file containing the submission results with keys: + submitted (newly created accessions), duplicates (skipped), + modified (force-updated), and failed. 
+ pattern: "*_accessions.json" + - versions: + - "versions.yml": + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@timrozday" +maintainers: + - "@timrozday" diff --git a/modules/local/submit_study/tests/main.nf.test b/modules/local/submit_study/tests/main.nf.test new file mode 100644 index 0000000..e37cccc --- /dev/null +++ b/modules/local/submit_study/tests/main.nf.test @@ -0,0 +1,56 @@ +nextflow_process { + name "Test Process SUBMIT_STUDY" + script "../main.nf" + config "./nextflow.config" + process "SUBMIT_STUDY" + + tag "modules" + tag "submit_study" + + test("submit_study - stub") { + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'mimicc' ], + file("${projectDir}/assets/test-fixtures/mimicc_study.json", checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("submit_study - dry run against ENA test server") { + // Requires WEBIN_ACCOUNT and WEBIN_PASSWORD Nextflow secrets. + // Validates and builds the submission XML but does not submit to ENA. 
+ + when { + process { + """ + input[0] = [ + [ id:'mimicc' ], + file("${projectDir}/assets/test-fixtures/mimicc_study.json", checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.accessions[0][1]).exists() }, + { assert path(process.out.accessions[0][1]).json.submitted instanceof List }, + { assert path(process.out.accessions[0][1]).json.failed.size() == 0 } + ) + } + } +} diff --git a/modules/local/submit_study/tests/main.nf.test.snap b/modules/local/submit_study/tests/main.nf.test.snap new file mode 100644 index 0000000..dd56c7c --- /dev/null +++ b/modules/local/submit_study/tests/main.nf.test.snap @@ -0,0 +1,35 @@ +{ + "submit_study - stub": { + "content": [ + { + "0": [ + [ + { + "id": "mimicc" + }, + "mimicc_accessions.json:md5,e43f257b8a1ffb551bf993867c5f1d7f" + ] + ], + "1": [ + "versions.yml:md5,1ffe6cc50bd36f7110413723e0796dd4" + ], + "accessions": [ + [ + { + "id": "mimicc" + }, + "mimicc_accessions.json:md5,e43f257b8a1ffb551bf993867c5f1d7f" + ] + ], + "versions": [ + "versions.yml:md5,1ffe6cc50bd36f7110413723e0796dd4" + ] + } + ], + "timestamp": "2026-03-12T11:51:02.565164", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/modules/local/submit_study/tests/nextflow.config b/modules/local/submit_study/tests/nextflow.config new file mode 100644 index 0000000..3611907 --- /dev/null +++ b/modules/local/submit_study/tests/nextflow.config @@ -0,0 +1,18 @@ +// Test configuration for SUBMIT_STUDY module. +// --test : use the ENA dev server (submissions are discarded daily) +// --automated : skip the Webin Reports duplicate-checking API call +// --dry-run : validate and build XML but do not submit to ENA +// +// Dummy credentials are sufficient for --dry-run --automated mode since +// no HTTP calls are made. 
For real submission tests, replace with secrets: +// env { ENA_USERNAME = secrets.WEBIN_ACCOUNT; ENA_PASSWORD = secrets.WEBIN_PASSWORD } +process { + withName: SUBMIT_STUDY { + ext.args = '--test --automated --dry-run' + } +} + +env { + ENA_USERNAME = 'Webin-000000' + ENA_PASSWORD = 'dummy-password' +} diff --git a/nextflow.config b/nextflow.config index 00d8b79..a6f7ae2 100644 --- a/nextflow.config +++ b/nextflow.config @@ -179,6 +179,16 @@ profiles { } // TODO: figure out how to better orginise tests for different workflow types (bins, mags, metagenomic_assemblies) // test { includeConfig 'conf/test.config' } + test { + docker.enabled = true + conda.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false + docker.runOptions = '-u $(id -u):$(id -g)' + } test_genome { includeConfig 'conf/test_genome.config' } test_assembly { includeConfig 'conf/test_assembly.config' } test_full { includeConfig 'conf/test_full.config' } From 632ef0a4f27525061ebb2a876b371531d4cc6f47 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Thu, 12 Mar 2026 11:59:21 +0000 Subject: [PATCH 04/36] Renamed files and parameters to align to existing registerstudy module, distinguish from it sing submit_study is for raw-reads rather than assemblies, and remove references to mimicc. 
--- .../{mimicc_study.csv => example_study.csv} | 0 .../{mimicc_study.json => example_study.json} | 0 .../{mimicc_study.tsv => example_study.tsv} | 0 bin/ena_common.py | 6 ++-- ...bmit_study.py => submit_rawreads_study.py} | 16 ++++----- ...study.py => test_submit_rawreads_study.py} | 36 +++++++++---------- .../environment.yml | 0 .../main.nf | 8 ++--- .../meta.yml | 4 +-- .../tests/main.nf.test | 20 +++++------ .../tests/main.nf.test.snap | 16 ++++----- .../tests/nextflow.config | 10 +++--- 12 files changed, 58 insertions(+), 58 deletions(-) rename assets/test-fixtures/{mimicc_study.csv => example_study.csv} (100%) rename assets/test-fixtures/{mimicc_study.json => example_study.json} (100%) rename assets/test-fixtures/{mimicc_study.tsv => example_study.tsv} (100%) rename bin/{submit_study.py => submit_rawreads_study.py} (98%) rename bin/{test_submit_study.py => test_submit_rawreads_study.py} (92%) rename modules/local/{submit_study => submit_rawreads_study}/environment.yml (100%) rename modules/local/{submit_study => submit_rawreads_study}/main.nf (84%) rename modules/local/{submit_study => submit_rawreads_study}/meta.yml (95%) rename modules/local/{submit_study => submit_rawreads_study}/tests/main.nf.test (62%) rename modules/local/{submit_study => submit_rawreads_study}/tests/main.nf.test.snap (50%) rename modules/local/{submit_study => submit_rawreads_study}/tests/nextflow.config (62%) diff --git a/assets/test-fixtures/mimicc_study.csv b/assets/test-fixtures/example_study.csv similarity index 100% rename from assets/test-fixtures/mimicc_study.csv rename to assets/test-fixtures/example_study.csv diff --git a/assets/test-fixtures/mimicc_study.json b/assets/test-fixtures/example_study.json similarity index 100% rename from assets/test-fixtures/mimicc_study.json rename to assets/test-fixtures/example_study.json diff --git a/assets/test-fixtures/mimicc_study.tsv b/assets/test-fixtures/example_study.tsv similarity index 100% rename from 
assets/test-fixtures/mimicc_study.tsv rename to assets/test-fixtures/example_study.tsv diff --git a/bin/ena_common.py b/bin/ena_common.py index de08c48..c782140 100644 --- a/bin/ena_common.py +++ b/bin/ena_common.py @@ -101,11 +101,11 @@ def get_credentials() -> tuple[str, str]: Raises: SystemExit: If either variable is unset or empty. """ - username = os.environ.get("ENA_USERNAME", "").strip() - password = os.environ.get("ENA_PASSWORD", "").strip() + username = os.environ.get("ENA_WEBIN", "").strip() + password = os.environ.get("ENA_WEBIN_PASSWORD", "").strip() if not username or not password: logger.error( - "ENA_USERNAME and ENA_PASSWORD environment" + "ENA_WEBIN and ENA_WEBIN_PASSWORD environment" " variables must be set", ) sys.exit(1) diff --git a/bin/submit_study.py b/bin/submit_rawreads_study.py similarity index 98% rename from bin/submit_study.py rename to bin/submit_rawreads_study.py index 656f746..f1850d7 100755 --- a/bin/submit_study.py +++ b/bin/submit_rawreads_study.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Submit studies to ENA via the Webin REST API v2. +"""Submit raw-reads studies to ENA via the Webin REST API v2. 
Read a DataHarmonizer export containing study metadata, check for duplicate studies already registered under the @@ -9,22 +9,22 @@ Credentials are read from environment variables to avoid secrets appearing in shell history or process listings:: - export ENA_USERNAME=Webin-XXXXX - export ENA_PASSWORD=SECRET + export ENA_WEBIN=Webin-XXXXX + export ENA_WEBIN_PASSWORD=SECRET Usage:: - python scripts/submit_study.py \ + python bin/submit_rawreads_study.py \ --input studies.json \ --test # With hold date (max 2 years): - python scripts/submit_study.py \ + python bin/submit_rawreads_study.py \ --input studies.json \ --hold-until 2028-01-01 # Log to file: - python scripts/submit_study.py \ + python bin/submit_rawreads_study.py \ --input studies.json \ --test --log submission.log """ @@ -44,7 +44,7 @@ import ena_common as common -logger = logging.getLogger("ena_submit.study") +logger = logging.getLogger("ena_submit.rawreads_study") # ----------------------------------------------------------- @@ -490,7 +490,7 @@ def _do_submission( @click.command( - help="Submit studies to ENA via the Webin REST API v2.", + help="Submit raw-reads studies to ENA via the Webin REST API v2.", ) @click.option( "--input", "input_file", diff --git a/bin/test_submit_study.py b/bin/test_submit_rawreads_study.py similarity index 92% rename from bin/test_submit_study.py rename to bin/test_submit_rawreads_study.py index 5944207..b7d3dcb 100644 --- a/bin/test_submit_study.py +++ b/bin/test_submit_rawreads_study.py @@ -1,8 +1,8 @@ #!/usr/bin/env python3 -"""Tests for submit_study.py and ena_common.py — study submission pipeline. +"""Tests for submit_rawreads_study.py and ena_common.py — study submission pipeline. 
Usage: - pytest bin/test_submit_study.py -v + pytest bin/test_submit_rawreads_study.py -v """ from __future__ import annotations @@ -17,7 +17,7 @@ sys.path.insert(0, os.path.dirname(__file__)) import ena_common as common -from submit_study import ( +from submit_rawreads_study import ( build_submission_xml, find_duplicate_studies, validate_study_xml, @@ -30,14 +30,14 @@ FIXTURES_DIR = os.path.join( os.path.dirname(__file__), "..", "assets", "test-fixtures", ) -MIMICC_JSON = os.path.join(FIXTURES_DIR, "mimicc_study.json") -MIMICC_CSV = os.path.join(FIXTURES_DIR, "mimicc_study.csv") -MIMICC_TSV = os.path.join(FIXTURES_DIR, "mimicc_study.tsv") +EXAMPLE_STUDY_JSON = os.path.join(FIXTURES_DIR, "example_study.json") +EXAMPLE_STUDY_CSV = os.path.join(FIXTURES_DIR, "example_study.csv") +EXAMPLE_STUDY_TSV = os.path.join(FIXTURES_DIR, "example_study.tsv") -_FIXTURES_PRESENT = os.path.isfile(MIMICC_JSON) +_FIXTURES_PRESENT = os.path.isfile(EXAMPLE_STUDY_JSON) requires_fixtures = pytest.mark.skipif( not _FIXTURES_PRESENT, - reason="mimicc test fixtures not present in assets/test-fixtures/", + reason="example study fixtures not present in assets/test-fixtures/", ) _JSON_RECORD_KEYS = ("studies", "data") @@ -48,9 +48,9 @@ @pytest.fixture -def mimicc_json(): - """Load the MIMICC study JSON fixture.""" - with open(MIMICC_JSON) as f: +def example_study_json(): + """Load the example study JSON fixture.""" + with open(EXAMPLE_STUDY_JSON) as f: return json.load(f) @@ -63,10 +63,10 @@ class TestExtractRecordsFromJson: """Tests for extracting study rows from various JSON formats.""" @requires_fixtures - def test_dataharmonizer_container_format(self, mimicc_json): - """The mimicc_study.json fixture uses DataHarmonizer Container format.""" + def test_dataharmonizer_container_format(self, example_study_json): + """The example_study.json fixture uses DataHarmonizer Container format.""" studies = common.extract_records_from_json( - mimicc_json, record_keys=_JSON_RECORD_KEYS, + 
example_study_json, record_keys=_JSON_RECORD_KEYS, ) assert studies is not None assert len(studies) == 1 @@ -356,7 +356,7 @@ class TestLoadInputFile: def test_load_csv(self): """CSV file loads correctly.""" studies = common.load_input_file( - MIMICC_CSV, json_record_keys=_JSON_RECORD_KEYS, + EXAMPLE_STUDY_CSV, json_record_keys=_JSON_RECORD_KEYS, ) assert studies is not None assert len(studies) == 1 @@ -366,7 +366,7 @@ def test_load_csv(self): def test_load_tsv(self): """TSV file loads correctly.""" studies = common.load_input_file( - MIMICC_TSV, json_record_keys=_JSON_RECORD_KEYS, + EXAMPLE_STUDY_TSV, json_record_keys=_JSON_RECORD_KEYS, ) assert studies is not None assert len(studies) == 1 @@ -376,7 +376,7 @@ def test_load_tsv(self): def test_load_json(self): """JSON file loads correctly.""" studies = common.load_input_file( - MIMICC_JSON, json_record_keys=_JSON_RECORD_KEYS, + EXAMPLE_STUDY_JSON, json_record_keys=_JSON_RECORD_KEYS, ) assert studies is not None assert len(studies) == 1 @@ -389,7 +389,7 @@ def test_all_formats_produce_same_data(self): common.load_input_file( path, json_record_keys=_JSON_RECORD_KEYS, ) - for path in [MIMICC_JSON, MIMICC_CSV, MIMICC_TSV] + for path in [EXAMPLE_STUDY_JSON, EXAMPLE_STUDY_CSV, EXAMPLE_STUDY_TSV] ] for studies in all_studies: assert len(studies) == 1 diff --git a/modules/local/submit_study/environment.yml b/modules/local/submit_rawreads_study/environment.yml similarity index 100% rename from modules/local/submit_study/environment.yml rename to modules/local/submit_rawreads_study/environment.yml diff --git a/modules/local/submit_study/main.nf b/modules/local/submit_rawreads_study/main.nf similarity index 84% rename from modules/local/submit_study/main.nf rename to modules/local/submit_rawreads_study/main.nf index 47a9d88..51bc062 100644 --- a/modules/local/submit_study/main.nf +++ b/modules/local/submit_rawreads_study/main.nf @@ -1,13 +1,13 @@ -process SUBMIT_STUDY { +process SUBMIT_RAWREADS_STUDY { tag "$meta.id" label 
'process_single' conda "${moduleDir}/environment.yml" container "quay.io/microbiome-informatics/mgnify-pipelines-toolkit:1.4.17" - // ENA_USERNAME and ENA_PASSWORD must be set in the process environment. + // ENA_WEBIN and ENA_WEBIN_PASSWORD must be set in the process environment. // In the pipeline, map Nextflow secrets via conf/modules.config or nextflow.config: - // env { ENA_USERNAME = secrets.WEBIN_ACCOUNT; ENA_PASSWORD = secrets.WEBIN_PASSWORD } + // env { ENA_WEBIN = secrets.WEBIN_ACCOUNT; ENA_WEBIN_PASSWORD = secrets.WEBIN_PASSWORD } input: tuple val(meta), path(study_metadata) @@ -23,7 +23,7 @@ process SUBMIT_STUDY { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" """ - submit_study.py \\ + submit_rawreads_study.py \\ --input ${study_metadata} \\ --output ${prefix}_accessions.json \\ ${args} diff --git a/modules/local/submit_study/meta.yml b/modules/local/submit_rawreads_study/meta.yml similarity index 95% rename from modules/local/submit_study/meta.yml rename to modules/local/submit_rawreads_study/meta.yml index e09d150..629512f 100644 --- a/modules/local/submit_study/meta.yml +++ b/modules/local/submit_rawreads_study/meta.yml @@ -1,12 +1,12 @@ # yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json -name: "submit_study" +name: "submit_rawreads_study" description: | Submit a new study to ENA via the Webin REST API v2. Reads study metadata from a JSON, CSV, or TSV file, checks for duplicate studies already registered under the Webin account, builds a PROJECT XML submission document, and submits to ENA. Credentials are read from the WEBIN_ACCOUNT and WEBIN_PASSWORD - Nextflow secrets, which are mapped to ENA_USERNAME and ENA_PASSWORD + Nextflow secrets, which are mapped to ENA_WEBIN and ENA_WEBIN_PASSWORD inside the process. 
keywords: - ena diff --git a/modules/local/submit_study/tests/main.nf.test b/modules/local/submit_rawreads_study/tests/main.nf.test similarity index 62% rename from modules/local/submit_study/tests/main.nf.test rename to modules/local/submit_rawreads_study/tests/main.nf.test index e37cccc..a0cb4fd 100644 --- a/modules/local/submit_study/tests/main.nf.test +++ b/modules/local/submit_rawreads_study/tests/main.nf.test @@ -1,21 +1,21 @@ nextflow_process { - name "Test Process SUBMIT_STUDY" + name "Test Process SUBMIT_RAWREADS_STUDY" script "../main.nf" config "./nextflow.config" - process "SUBMIT_STUDY" + process "SUBMIT_RAWREADS_STUDY" tag "modules" - tag "submit_study" + tag "submit_rawreads_study" - test("submit_study - stub") { + test("submit_rawreads_study - stub") { options "-stub" when { process { """ input[0] = [ - [ id:'mimicc' ], - file("${projectDir}/assets/test-fixtures/mimicc_study.json", checkIfExists: true) + [ id:'example_study' ], + file("${projectDir}/assets/test-fixtures/example_study.json", checkIfExists: true) ] """ } @@ -29,16 +29,16 @@ nextflow_process { } } - test("submit_study - dry run against ENA test server") { - // Requires WEBIN_ACCOUNT and WEBIN_PASSWORD Nextflow secrets. + test("submit_rawreads_study - dry run against ENA test server") { // Validates and builds the submission XML but does not submit to ENA. + // Dummy credentials in tests/nextflow.config are sufficient for dry-run mode. 
when { process { """ input[0] = [ - [ id:'mimicc' ], - file("${projectDir}/assets/test-fixtures/mimicc_study.json", checkIfExists: true) + [ id:'example_study' ], + file("${projectDir}/assets/test-fixtures/example_study.json", checkIfExists: true) ] """ } diff --git a/modules/local/submit_study/tests/main.nf.test.snap b/modules/local/submit_rawreads_study/tests/main.nf.test.snap similarity index 50% rename from modules/local/submit_study/tests/main.nf.test.snap rename to modules/local/submit_rawreads_study/tests/main.nf.test.snap index dd56c7c..08f7fdb 100644 --- a/modules/local/submit_study/tests/main.nf.test.snap +++ b/modules/local/submit_rawreads_study/tests/main.nf.test.snap @@ -1,32 +1,32 @@ { - "submit_study - stub": { + "submit_rawreads_study - stub": { "content": [ { "0": [ [ { - "id": "mimicc" + "id": "example_study" }, - "mimicc_accessions.json:md5,e43f257b8a1ffb551bf993867c5f1d7f" + "example_study_accessions.json:md5,e43f257b8a1ffb551bf993867c5f1d7f" ] ], "1": [ - "versions.yml:md5,1ffe6cc50bd36f7110413723e0796dd4" + "versions.yml:md5,d7080ded74f0381019a674b865daa329" ], "accessions": [ [ { - "id": "mimicc" + "id": "example_study" }, - "mimicc_accessions.json:md5,e43f257b8a1ffb551bf993867c5f1d7f" + "example_study_accessions.json:md5,e43f257b8a1ffb551bf993867c5f1d7f" ] ], "versions": [ - "versions.yml:md5,1ffe6cc50bd36f7110413723e0796dd4" + "versions.yml:md5,d7080ded74f0381019a674b865daa329" ] } ], - "timestamp": "2026-03-12T11:51:02.565164", + "timestamp": "2026-03-12T11:57:10.234715", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.4" diff --git a/modules/local/submit_study/tests/nextflow.config b/modules/local/submit_rawreads_study/tests/nextflow.config similarity index 62% rename from modules/local/submit_study/tests/nextflow.config rename to modules/local/submit_rawreads_study/tests/nextflow.config index 3611907..c4633fa 100644 --- a/modules/local/submit_study/tests/nextflow.config +++ b/modules/local/submit_rawreads_study/tests/nextflow.config @@ 
-1,18 +1,18 @@ -// Test configuration for SUBMIT_STUDY module. +// Test configuration for SUBMIT_RAWREADS_STUDY module. // --test : use the ENA dev server (submissions are discarded daily) // --automated : skip the Webin Reports duplicate-checking API call // --dry-run : validate and build XML but do not submit to ENA // // Dummy credentials are sufficient for --dry-run --automated mode since // no HTTP calls are made. For real submission tests, replace with secrets: -// env { ENA_USERNAME = secrets.WEBIN_ACCOUNT; ENA_PASSWORD = secrets.WEBIN_PASSWORD } +// env { ENA_WEBIN = secrets.WEBIN_ACCOUNT; ENA_WEBIN_PASSWORD = secrets.WEBIN_PASSWORD } process { - withName: SUBMIT_STUDY { + withName: SUBMIT_RAWREADS_STUDY { ext.args = '--test --automated --dry-run' } } env { - ENA_USERNAME = 'Webin-000000' - ENA_PASSWORD = 'dummy-password' + ENA_WEBIN = 'Webin-000000' + ENA_WEBIN_PASSWORD = 'dummy-password' } From 38e74cf16be0112aa449a6746356c184d7b36200 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Thu, 12 Mar 2026 12:24:55 +0000 Subject: [PATCH 05/36] Tidied up code --- bin/ena_common.py | 97 ++++--------- bin/submit_rawreads_study.py | 262 +++++++++-------------------------- 2 files changed, 86 insertions(+), 273 deletions(-) diff --git a/bin/ena_common.py b/bin/ena_common.py index c782140..89e41ab 100644 --- a/bin/ena_common.py +++ b/bin/ena_common.py @@ -36,12 +36,8 @@ # Constants # ----------------------------------------------------------- -PROD_URL: Final = ( - "https://www.ebi.ac.uk/ena/submit/webin-v2" -) -TEST_URL: Final = ( - "https://wwwdev.ebi.ac.uk/ena/submit/webin-v2" -) +PROD_URL: Final = "https://www.ebi.ac.uk/ena/submit/webin-v2" +TEST_URL: Final = "https://wwwdev.ebi.ac.uk/ena/submit/webin-v2" _MAX_HOLD_YEARS: Final = 2 @@ -104,10 +100,7 @@ def get_credentials() -> tuple[str, str]: username = os.environ.get("ENA_WEBIN", "").strip() password = os.environ.get("ENA_WEBIN_PASSWORD", "").strip() if not username or not password: - logger.error( - "ENA_WEBIN 
and ENA_WEBIN_PASSWORD environment" - " variables must be set", - ) + logger.error("ENA_WEBIN and ENA_WEBIN_PASSWORD environment variables must be set") sys.exit(1) return username, password @@ -185,8 +178,7 @@ def validate_hold_until(hold_until: str) -> datetime.date: hold_date = datetime.date.fromisoformat(hold_until) except ValueError: raise click.BadParameter( - f"Invalid date format: {hold_until!r}." - " Expected YYYY-MM-DD." + f"Invalid date format: {hold_until!r}. Expected YYYY-MM-DD." ) from None today = datetime.date.today() @@ -194,15 +186,13 @@ def validate_hold_until(hold_until: str) -> datetime.date: if hold_date > max_date: raise click.BadParameter( - f"Hold date {hold_until} is more than" - f" {_MAX_HOLD_YEARS} years from today" + f"Hold date {hold_until} is more than {_MAX_HOLD_YEARS} years from today" f" ({today}). Maximum allowed: {max_date}." ) if hold_date <= today: raise click.BadParameter( - f"Hold date {hold_until} is not in the" - f" future (today is {today})." + f"Hold date {hold_until} is not in the future (today is {today})." ) return hold_date @@ -262,7 +252,7 @@ def parse_checklist_units( def validate_xml_against_xsd( xml_bytes: bytes, - fragment_tag: str | None = None, + _fragment_tag: str | None = None, # unused; kept for API compatibility fallback_checker: Callable[ [bytes, list[str]], tuple[bool, list[str]] ] | None = None, @@ -275,7 +265,7 @@ def validate_xml_against_xsd( Args: xml_bytes: Serialised XML document. - fragment_tag: Unused; kept for API compatibility. + _fragment_tag: Unused; kept for API compatibility. fallback_checker: Optional function called with (*xml_bytes*, *messages*) that returns (*is_valid*, *messages*). 
@@ -395,11 +385,7 @@ def extract_records_from_json( if isinstance(container, dict): for key, val in container.items(): if isinstance(val, list): - logger.info( - "Extracted records from" - " Container.%s", - key, - ) + logger.info("Extracted records from Container.%s", key) return val for key in record_keys: @@ -471,23 +457,13 @@ def fetch_from_reports_endpoint( "max-results": max_results, } - req = requests.Request( - "GET", url, params=params, auth=auth, - ) + req = requests.Request("GET", url, params=params, auth=auth) prepared = req.prepare() - logger.debug( - 'curl -u %s:*** "%s"', - auth.username, prepared.url, - ) + logger.debug('curl -u %s:*** "%s"', auth.username, prepared.url) try: - resp = requests.get( - url, params=params, auth=auth, timeout=60, - ) - logger.info( - "Reports API at %s returned %s", - url, resp.status_code, - ) + resp = requests.get(url, params=params, auth=auth, timeout=60) + logger.info("Reports API at %s returned %s", url, resp.status_code) resp.raise_for_status() return resp.json() @@ -498,30 +474,20 @@ def fetch_from_reports_endpoint( else "unknown" ) if status == 404: - logger.info( - "Reports API at %s returned 404" - " — no records yet", - url, - ) + logger.info("Reports API at %s returned 404 — no records yet", url) return [] if status in (401, 403): logger.warning( - "Reports API at %s returned %s" - " — endpoint may not be available" + "Reports API at %s returned %s — endpoint may not be available" " or credentials may differ", url, status, ) return None - logger.warning( - "Reports API at %s returned HTTP %s", - url, status, - ) + logger.warning("Reports API at %s returned HTTP %s", url, status) return None except requests.exceptions.RequestException as exc: - logger.warning( - "Reports API at %s failed: %s", url, exc, - ) + logger.warning("Reports API at %s failed: %s", url, exc) return None @@ -561,13 +527,8 @@ def fetch_account_records( ) for url in urls: - logger.info( - "Fetching account %s from: %s", - entity_label, 
url, - ) - raw = fetch_from_reports_endpoint( - url, auth, max_results, - ) + logger.info("Fetching account %s from: %s", entity_label, url) + raw = fetch_from_reports_endpoint(url, auth, max_results) if raw is None: continue @@ -580,15 +541,11 @@ def fetch_account_records( if normalized is not None: records.append(normalized) - logger.info( - "Found %d %s in account", - len(records), entity_label, - ) + logger.info("Found %d %s in account", len(records), entity_label) return records logger.warning( - "Could not reach any Webin reports endpoint." - " Duplicate checking for %s will be skipped.", + "Could not reach any Webin reports endpoint. Duplicate checking for %s will be skipped.", entity_label, ) return [] @@ -640,10 +597,8 @@ def find_duplicates_by_alias_title( by_alias[alias] = rec logger.info( - "Checking %d new %s against" - " %d existing account %s...", - total, entity_label, - len(account_records), entity_label, + "Checking %d new %s against %d existing account %s...", + total, entity_label, len(account_records), entity_label, ) for i, record in enumerate(new_records): @@ -669,11 +624,7 @@ def find_duplicates_by_alias_title( ) if len(duplicates) == total: - logger.info( - "All %s are duplicates" - " — skipping further checks", - entity_label, - ) + logger.info("All %s are duplicates — skipping further checks", entity_label) return duplicates return duplicates diff --git a/bin/submit_rawreads_study.py b/bin/submit_rawreads_study.py index f1850d7..1664a16 100755 --- a/bin/submit_rawreads_study.py +++ b/bin/submit_rawreads_study.py @@ -51,12 +51,8 @@ # Reports API (study-specific) # ----------------------------------------------------------- -_PROD_REPORTS_URL: Final = ( - "https://www.ebi.ac.uk/ena/submit/report/projects" -) -_TEST_REPORTS_URL: Final = ( - "https://wwwdev.ebi.ac.uk/ena/submit/report/projects" -) +_PROD_REPORTS_URL: Final = "https://www.ebi.ac.uk/ena/submit/report/projects" +_TEST_REPORTS_URL: Final = 
"https://wwwdev.ebi.ac.uk/ena/submit/report/projects" def _normalize_study_report( @@ -65,27 +61,16 @@ def _normalize_study_report( """Normalise a raw study report dict.""" return { "title": ( - report.get("title") - or report.get("studyTitle") - or report.get("STUDY_TITLE", "") - ), - "alias": ( - report.get("alias") - or report.get("studyAlias") - or "" + report.get("title") or report.get("studyTitle") or report.get("STUDY_TITLE", "") ), + "alias": report.get("alias") or report.get("studyAlias") or "", "accession": ( report.get("accession") or report.get("studyAccession") or report.get("report", {}).get("id", "") ), - "secondary_accession": ( - report.get("secondaryAccession") - or report.get("secondaryId", "") - ), - "status": report.get( - "releaseStatus", "UNKNOWN" - ), + "secondary_accession": report.get("secondaryAccession") or report.get("secondaryId", ""), + "status": report.get("releaseStatus", "UNKNOWN"), } @@ -167,10 +152,7 @@ def build_submission_xml( submission = ET.SubElement( submission_set, "SUBMISSION", ) - sub_alias = ( - "study-submission-" - + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") - ) + sub_alias = f"study-submission-{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}" submission.set("alias", sub_alias) actions = ET.SubElement(submission, "ACTIONS") main_action = ET.SubElement(actions, "ACTION") @@ -271,9 +253,7 @@ def _validate_study_xml_structure( project_set = tree.find("PROJECT_SET") if project_set is None: - messages.append( - "ERROR: Missing PROJECT_SET element" - ) + messages.append("ERROR: Missing PROJECT_SET element") return False, messages projects = project_set.findall("PROJECT") @@ -285,20 +265,13 @@ def _validate_study_xml_structure( alias = proj.get("alias", "") title = proj.find("TITLE") if title is None or not title.text: - messages.append( - f"ERROR: PROJECT '{alias}' missing TITLE" - ) + messages.append(f"ERROR: PROJECT '{alias}' missing TITLE") return False, messages sp = proj.find("SUBMISSION_PROJECT") if sp is 
None: - messages.append( - f"ERROR: PROJECT '{alias}'" - " missing SUBMISSION_PROJECT" - ) + messages.append(f"ERROR: PROJECT '{alias}' missing SUBMISSION_PROJECT") return False, messages - messages.append( - f"OK: PROJECT '{alias}' has required elements" - ) + messages.append(f"OK: PROJECT '{alias}' has required elements") return True, messages @@ -336,10 +309,7 @@ def parse_xml_receipt( Returns: Tuple of (*success*, *accessions*, *messages*). """ - success = ( - receipt_root.get("success", "false").lower() - == "true" - ) + success = receipt_root.get("success", "false").lower() == "true" accessions: list[dict[str, str]] = [] messages: list[str] = [] @@ -355,18 +325,12 @@ def parse_xml_receipt( "alias": proj.get("alias", ""), "accession": proj.get("accession", ""), "status": proj.get("status", ""), - "holdUntilDate": proj.get( - "holdUntilDate", "" - ), + "holdUntilDate": proj.get("holdUntilDate", ""), } ext = proj.find("EXT_ID") if ext is not None: - acc_info["external_accession"] = ext.get( - "accession", "" - ) - acc_info["external_type"] = ext.get( - "type", "" - ) + acc_info["external_accession"] = ext.get("accession", "") + acc_info["external_type"] = ext.get("type", "") accessions.append(acc_info) # Some receipts use STUDY instead of PROJECT. 
@@ -415,45 +379,26 @@ def _do_submission( for msg in xml_messages: logger.info(" %s", msg) if not xml_valid: - logger.error( - "XML validation FAILED (%s)" - " — aborting submission", action, - ) + logger.error("XML validation FAILED (%s) — aborting submission", action) return False logger.info("XML validation PASSED (%s)", action) if dry_run: - logger.info( - "DRY RUN — skipping %s submission", action, - ) - logger.info( - "Generated XML:\n%s", - xml_bytes.decode("utf-8"), - ) + logger.info("DRY RUN — skipping %s submission", action) + logger.info("Generated XML:\n%s", xml_bytes.decode("utf-8")) return True - logger.info( - "Submitting %s to ENA (%s)...", action, env_label, - ) + logger.info("Submitting %s to ENA (%s)...", action, env_label) try: - receipt_root = common.submit_xml( - base_url, auth, xml_bytes, - ) + receipt_root = common.submit_xml(base_url, auth, xml_bytes) except requests.exceptions.HTTPError as exc: - logger.error( - "HTTP error during %s submission: %s", - action, exc, - ) + logger.error("HTTP error during %s submission: %s", action, exc) if exc.response is not None: - logger.error( - "Response body: %s", exc.response.text, - ) + logger.error("Response body: %s", exc.response.text) return False - success, accessions, receipt_messages = ( - parse_xml_receipt(receipt_root) - ) + success, accessions, receipt_messages = parse_xml_receipt(receipt_root) for msg in receipt_messages: logger.info(" Receipt: %s", msg) @@ -461,14 +406,10 @@ def _do_submission( logger.info("%s SUCCESSFUL", action) for acc in accessions: ext = acc.get("external_accession", "") - ext_suffix = ( - f" (study: {ext})" if ext else "" - ) + ext_suffix = f" (study: {ext})" if ext else "" logger.info( - " %s: alias=%s accession=%s" - " status=%s%s", - action, acc["alias"], acc["accession"], - acc["status"], ext_suffix, + " %s: alias=%s accession=%s status=%s%s", + action, acc["alias"], acc["accession"], acc["status"], ext_suffix, ) results[result_key].append(acc) else: @@ -501,14 
+442,12 @@ def _do_submission( @click.option( "--test", "use_test", is_flag=True, default=False, - help="Use the ENA test service" - " (submissions are discarded daily)", + help="Use the ENA test service (submissions are discarded daily)", ) @click.option( "--hold-until", default=None, - help="Hold studies private until this date" - " (YYYY-MM-DD, max 2 years from now)", + help="Hold studies private until this date (YYYY-MM-DD, max 2 years from now)", ) @click.option( "--log", "log_file", @@ -520,32 +459,27 @@ def _do_submission( "--output", type=click.Path(path_type=Path), default=None, - help="Path to write JSON accession results" - " (default: stdout)", + help="Path to write JSON accession results (default: stdout)", ) @click.option( "--max-results", default=5000, - help="Maximum number of projects to fetch" - " from the Reports API for duplicate checking", + help="Maximum number of projects to fetch from the Reports API for duplicate checking", ) @click.option( "--dry-run", is_flag=True, default=False, - help="Validate and build XML but do not" - " submit to ENA", + help="Validate and build XML but do not submit to ENA", ) @click.option( "--automated", is_flag=True, default=False, - help="Skip duplicate detection against the" - " Webin Reports API (for automated pipelines)", + help="Skip duplicate detection against the Webin Reports API (for automated pipelines)", ) @click.option( "--force", is_flag=True, default=False, - help="Submit duplicate studies using the MODIFY" - " action to overwrite existing ENA records," + help="Submit duplicate studies using the MODIFY action to overwrite existing ENA records," " instead of skipping them", ) def main( @@ -564,10 +498,7 @@ def main( username, password = common.get_credentials() env_label = "TEST" if use_test else "PRODUCTION" - logger.info( - "ENA Study Submission — environment: %s", - env_label, - ) + logger.info("ENA Study Submission — environment: %s", env_label) base_url = common.get_base_url(use_test) auth = 
HTTPBasicAuth(username, password) logger.debug("Auth username: %s", username) @@ -581,22 +512,14 @@ def main( input_file, json_record_keys=_JSON_RECORD_KEYS, ) if studies is None: - logger.error( - "Unsupported file format." - " Supported: .json, .csv, .tsv", - ) + logger.error("Unsupported file format. Supported: .json, .csv, .tsv") sys.exit(1) - logger.info( - "Loaded %d study/studies from input", - len(studies), - ) + logger.info("Loaded %d study/studies from input", len(studies)) # -- Step 2: Check for duplicates -------------------- if automated: - logger.info( - "Automated mode: skipping duplicate detection", - ) + logger.info("Automated mode: skipping duplicate detection") duplicates: dict[int, dict[str, Any]] = {} else: account_studies = fetch_account_studies( @@ -605,10 +528,8 @@ def main( ) for ps in account_studies: logger.info( - " Account study: %s | alias=%s" - " | title=%s | status=%s", - ps["accession"], ps["alias"], - ps["title"], ps["status"], + " Account study: %s | alias=%s | title=%s | status=%s", + ps["accession"], ps["alias"], ps["title"], ps["status"], ) duplicates = find_duplicate_studies( studies, account_studies, @@ -623,37 +544,23 @@ def main( studies_to_modify: list[dict[str, Any]] = [] if duplicates: - action_label = ( - "will be re-submitted with MODIFY" - if force else "will NOT be submitted" - ) + action_label = "will be re-submitted with MODIFY" if force else "will NOT be submitted" logger.warning( "Found %d duplicate(s) — %s:", len(duplicates), action_label, ) for idx, dup_info in duplicates.items(): - study_title = studies[idx].get( - "STUDY_TITLE", f"study[{idx}]", - ) + study_title = studies[idx].get("STUDY_TITLE", f"study[{idx}]") logger.warning( - " DUPLICATE: '%s' matches existing %s" - " (accession: %s)", - study_title, - dup_info["match_reason"], - dup_info["accession"], + " DUPLICATE: '%s' matches existing %s (accession: %s)", + study_title, dup_info["match_reason"], dup_info["accession"], ) results["duplicates"].append({ 
"input_index": idx, "title": study_title, "alias": studies[idx].get("alias", ""), - "existing_accession": ( - dup_info["accession"] - ), - "existing_secondary_accession": ( - dup_info.get( - "secondary_accession", "" - ) - ), + "existing_accession": dup_info["accession"], + "existing_secondary_accession": dup_info.get("secondary_accession", ""), "match_reason": dup_info["match_reason"], }) if force: @@ -669,16 +576,12 @@ def main( ] if not studies_to_submit and not studies_to_modify: - logger.info( - "No studies to submit" - " (all are duplicates or input is empty)", - ) + logger.info("No studies to submit (all are duplicates or input is empty)") common.write_results(results, output) return logger.info( - "%d new study/studies to ADD," - " %d duplicate(s) to MODIFY", + "%d new study/studies to ADD, %d duplicate(s) to MODIFY", len(studies_to_submit), len(studies_to_modify), ) @@ -686,23 +589,11 @@ def main( # -- Step 3: ADD new studies ------------------------- if studies_to_submit: - logger.info( - "Building ADD XML for %d new study/studies...", - len(studies_to_submit), - ) - xml_root = build_submission_xml( - studies_to_submit, hold_until=hold_until, - action="ADD", - ) + logger.info("Building ADD XML for %d new study/studies...", len(studies_to_submit)) + xml_root = build_submission_xml(studies_to_submit, hold_until=hold_until, action="ADD") xml_bytes = common.xml_to_bytes(xml_root) - logger.debug( - "Generated XML (ADD):\n%s", - xml_bytes.decode("utf-8"), - ) - logger.info( - "XML document size (ADD): %d bytes", - len(xml_bytes), - ) + logger.debug("Generated XML (ADD):\n%s", xml_bytes.decode("utf-8")) + logger.info("XML document size (ADD): %d bytes", len(xml_bytes)) ok = _do_submission( base_url, auth, xml_bytes, action="ADD", @@ -715,23 +606,11 @@ def main( # -- Step 4: MODIFY duplicate studies (--force) ------ if studies_to_modify: - logger.info( - "Building MODIFY XML for %d duplicate(s)...", - len(studies_to_modify), - ) - xml_root = build_submission_xml( 
- studies_to_modify, hold_until=hold_until, - action="MODIFY", - ) + logger.info("Building MODIFY XML for %d duplicate(s)...", len(studies_to_modify)) + xml_root = build_submission_xml(studies_to_modify, hold_until=hold_until, action="MODIFY") xml_bytes = common.xml_to_bytes(xml_root) - logger.debug( - "Generated XML (MODIFY):\n%s", - xml_bytes.decode("utf-8"), - ) - logger.info( - "XML document size (MODIFY): %d bytes", - len(xml_bytes), - ) + logger.debug("Generated XML (MODIFY):\n%s", xml_bytes.decode("utf-8")) + logger.info("XML document size (MODIFY): %d bytes", len(xml_bytes)) ok = _do_submission( base_url, auth, xml_bytes, action="MODIFY", @@ -751,39 +630,22 @@ def main( logger.info("=" * 60) logger.info("SUBMISSION SUMMARY") logger.info( - " Duplicates skipped: %d", - len(results["duplicates"]) - - len(results["modified"]), + " Duplicates skipped: %d", len(results["duplicates"]) - len(results["modified"]), ) for d in results["duplicates"]: - logger.info( - " %s -> %s", - d["title"], d["existing_accession"], - ) - logger.info( - " Newly submitted (ADD): %d", - len(results["submitted"]), - ) + logger.info(" %s -> %s", d["title"], d["existing_accession"]) + logger.info(" Newly submitted (ADD): %d", len(results["submitted"])) for s in results["submitted"]: ext = s.get("external_accession", "") ext_suffix = f" ({ext})" if ext else "" - logger.info( - " %s -> %s%s", - s["alias"], s["accession"], ext_suffix, - ) - logger.info( - " Modified (MODIFY): %d", - len(results["modified"]), - ) + logger.info(" %s -> %s%s", s["alias"], s["accession"], ext_suffix) + logger.info(" Modified (MODIFY): %d", len(results["modified"])) for m in results["modified"]: ext = m.get("external_accession", "") ext_suffix = f" ({ext})" if ext else "" - logger.info( - " %s -> %s%s", - m["alias"], m["accession"], ext_suffix, - ) + logger.info(" %s -> %s%s", m["alias"], m["accession"], ext_suffix) logger.info("=" * 60) if __name__ == "__main__": - main() + main() # type: ignore[call-arg] From 
3798fa4c66c6e808361997a1aa3991af56cded93 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Thu, 12 Mar 2026 12:44:28 +0000 Subject: [PATCH 06/36] Rename files --- bin/{ena_common.py => ena_submit_common.py} | 0 bin/submit_rawreads_study.py | 2 +- bin/test_submit_rawreads_study.py | 4 ++-- 3 files changed, 3 insertions(+), 3 deletions(-) rename bin/{ena_common.py => ena_submit_common.py} (100%) diff --git a/bin/ena_common.py b/bin/ena_submit_common.py similarity index 100% rename from bin/ena_common.py rename to bin/ena_submit_common.py diff --git a/bin/submit_rawreads_study.py b/bin/submit_rawreads_study.py index 1664a16..c00ee6d 100755 --- a/bin/submit_rawreads_study.py +++ b/bin/submit_rawreads_study.py @@ -42,7 +42,7 @@ import requests from requests.auth import HTTPBasicAuth -import ena_common as common +import ena_submit_common as common logger = logging.getLogger("ena_submit.rawreads_study") diff --git a/bin/test_submit_rawreads_study.py b/bin/test_submit_rawreads_study.py index b7d3dcb..0612f43 100644 --- a/bin/test_submit_rawreads_study.py +++ b/bin/test_submit_rawreads_study.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Tests for submit_rawreads_study.py and ena_common.py — study submission pipeline. +"""Tests for submit_rawreads_study.py and ena_submit_common.py — study submission pipeline. 
Usage: pytest bin/test_submit_rawreads_study.py -v @@ -16,7 +16,7 @@ # Ensure the scripts directory is importable sys.path.insert(0, os.path.dirname(__file__)) -import ena_common as common +import ena_submit_common as common from submit_rawreads_study import ( build_submission_xml, find_duplicate_studies, From ab5f92d481ec0e9bd0dda759c979faa01e3a0384 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Thu, 12 Mar 2026 13:37:24 +0000 Subject: [PATCH 07/36] Update tests for study_submit --- bin/test_submit_rawreads_study.py | 1650 +++++++++++++++++++++------ conf/modules.config | 6 +- conf/test_assembly.config | 2 - modules/local/ena_webin_cli/main.nf | 12 + modules/local/genome_upload/main.nf | 3 +- nextflow.config | 5 +- nextflow_schema.json | 15 +- tests/default.nf.test | 67 +- tests/default.nf.test.snap | 58 + tests/nextflow.config | 15 + workflows/assemblysubmit.nf | 20 +- workflows/genomesubmit.nf | 20 +- 12 files changed, 1512 insertions(+), 361 deletions(-) create mode 100644 tests/default.nf.test.snap diff --git a/bin/test_submit_rawreads_study.py b/bin/test_submit_rawreads_study.py index 0612f43..f07f85e 100644 --- a/bin/test_submit_rawreads_study.py +++ b/bin/test_submit_rawreads_study.py @@ -1,8 +1,19 @@ #!/usr/bin/env python3 -"""Tests for submit_rawreads_study.py and ena_submit_common.py — study submission pipeline. +"""Tests for submit_rawreads_study.py — raw-reads study submission pipeline. + +Covers: + A. Unit tests for build_submission_xml and _add_project_element + B. Unit tests for validate_study_xml + C. Unit tests for parse_xml_receipt + D. Unit tests for find_duplicate_studies and fetch_account_studies + E. CLI integration tests for main() using click.testing.CliRunner Usage: pytest bin/test_submit_rawreads_study.py -v + +All external I/O (HTTP requests, ENA reports API) is mocked. Tests do NOT +import from ena_submit_common directly — all assertions go through the public +API of submit_rawreads_study. 
""" from __future__ import annotations @@ -10,421 +21,1400 @@ import json import os import sys +import xml.etree.ElementTree as ET +from pathlib import Path +from textwrap import dedent +from typing import Any +from unittest.mock import MagicMock, patch import pytest +from click.testing import CliRunner +from requests.auth import HTTPBasicAuth -# Ensure the scripts directory is importable +# Ensure the scripts directory is on the path before importing the module. sys.path.insert(0, os.path.dirname(__file__)) -import ena_submit_common as common -from submit_rawreads_study import ( +from submit_rawreads_study import ( # noqa: E402 + _normalize_study_report, build_submission_xml, + fetch_account_studies, find_duplicate_studies, + main, + parse_xml_receipt, validate_study_xml, ) # --------------------------------------------------------------------------- -# Paths +# Constants shared across test groups # --------------------------------------------------------------------------- -FIXTURES_DIR = os.path.join( - os.path.dirname(__file__), "..", "assets", "test-fixtures", -) -EXAMPLE_STUDY_JSON = os.path.join(FIXTURES_DIR, "example_study.json") -EXAMPLE_STUDY_CSV = os.path.join(FIXTURES_DIR, "example_study.csv") -EXAMPLE_STUDY_TSV = os.path.join(FIXTURES_DIR, "example_study.tsv") - -_FIXTURES_PRESENT = os.path.isfile(EXAMPLE_STUDY_JSON) -requires_fixtures = pytest.mark.skipif( - not _FIXTURES_PRESENT, - reason="example study fixtures not present in assets/test-fixtures/", -) - -_JSON_RECORD_KEYS = ("studies", "data") +_PROD_REPORTS_URL = "https://www.ebi.ac.uk/ena/submit/report/projects" +_TEST_REPORTS_URL = "https://wwwdev.ebi.ac.uk/ena/submit/report/projects" # --------------------------------------------------------------------------- -# Fixtures +# Shared fixtures # --------------------------------------------------------------------------- @pytest.fixture -def example_study_json(): - """Load the example study JSON fixture.""" - with open(EXAMPLE_STUDY_JSON) as f: - 
return json.load(f) +def basic_study() -> dict[str, Any]: + """Return a minimal valid study metadata dict.""" + return { + "alias": "test-study-001", + "STUDY_TITLE": "A Basic Test Study", + "STUDY_ABSTRACT": "An abstract for the test study.", + "CENTER_PROJECT_NAME": "My Centre Project", + "existing_study_type": "Metagenomics", + } -# --------------------------------------------------------------------------- -# extract_records_from_json tests -# --------------------------------------------------------------------------- +@pytest.fixture +def metagenomics_assembly_study() -> dict[str, Any]: + """Return a study dict representing a metagenomics assembly submission.""" + return { + "alias": "metagenome-assembly-001", + "STUDY_TITLE": "Primary Metagenome Assembly of Soil Sample", + "STUDY_ABSTRACT": "Assembly of contigs from metagenome sequencing of soil.", + "CENTER_PROJECT_NAME": "Soil Metagenome Project", + "existing_study_type": "Metagenomics", + } -class TestExtractRecordsFromJson: - """Tests for extracting study rows from various JSON formats.""" +@pytest.fixture +def mag_genome_study() -> dict[str, Any]: + """Return a study dict representing a MAG/genome submission.""" + return { + "alias": "mag-genome-001", + "STUDY_TITLE": "Metagenome-Assembled Genome from Soil Microbiome", + "STUDY_ABSTRACT": "A high-quality MAG reconstructed from binned metagenome data.", + "existing_study_type": "Other", + "new_study_type": "Genome Sequencing", + } - @requires_fixtures - def test_dataharmonizer_container_format(self, example_study_json): - """The example_study.json fixture uses DataHarmonizer Container format.""" - studies = common.extract_records_from_json( - example_study_json, record_keys=_JSON_RECORD_KEYS, - ) - assert studies is not None - assert len(studies) == 1 - assert studies[0]["STUDY_TITLE"] == "MIMICC" - assert studies[0]["existing_study_type"] == "Metagenomics" - assert studies[0]["IS_PRIMARY"] == "YES" - - def test_plain_list(self): - """Plain list input 
returns the list as-is.""" - data = [{"STUDY_TITLE": "Test Study", "IS_PRIMARY": "YES"}] - studies = common.extract_records_from_json( - data, record_keys=_JSON_RECORD_KEYS, - ) - assert studies == data - def test_dict_with_studies_key(self): - """Dict with 'studies' key extracts the list.""" - data = {"studies": [{"STUDY_TITLE": "A"}, {"STUDY_TITLE": "B"}]} - studies = common.extract_records_from_json( - data, record_keys=_JSON_RECORD_KEYS, - ) - assert len(studies) == 2 +@pytest.fixture +def mock_credentials() -> tuple[str, str]: + """Return mock ENA credentials.""" + return ("Webin-12345", "pass") - def test_dict_with_data_key(self): - """Dict with 'data' key extracts the list.""" - data = {"data": [{"STUDY_TITLE": "C"}]} - studies = common.extract_records_from_json( - data, record_keys=_JSON_RECORD_KEYS, - ) - assert len(studies) == 1 - def test_single_study_object(self): - """Single dict input is wrapped in a list.""" - data = {"STUDY_TITLE": "Single"} - studies = common.extract_records_from_json( - data, record_keys=_JSON_RECORD_KEYS, - ) - assert len(studies) == 1 - assert studies[0]["STUDY_TITLE"] == "Single" +@pytest.fixture +def auth(mock_credentials: tuple[str, str]) -> HTTPBasicAuth: + """Return mock HTTPBasicAuth built from mock credentials.""" + return HTTPBasicAuth(*mock_credentials) - def test_invalid_input(self): - """Non-dict/list input returns None.""" - result = common.extract_records_from_json( - "not a dict or list", record_keys=_JSON_RECORD_KEYS, - ) - assert result is None - - def test_container_with_multiple_studies(self): - """Container format with multiple studies extracts all.""" - data = { - "Container": { - "SRA_studys": [ - {"STUDY_TITLE": "Study A"}, - {"STUDY_TITLE": "Study B"}, - ], - }, - } - studies = common.extract_records_from_json( - data, record_keys=_JSON_RECORD_KEYS, - ) - assert len(studies) == 2 + +@pytest.fixture +def account_study_record() -> dict[str, str]: + """Return a normalised account study record as returned by 
the Reports API.""" + return { + "title": "Existing Study Title", + "alias": "existing-study-alias", + "accession": "PRJEB99001", + "secondary_accession": "ERP099001", + "status": "PRIVATE", + } # --------------------------------------------------------------------------- -# XML building tests +# A. Unit tests for build_submission_xml and _add_project_element # --------------------------------------------------------------------------- class TestBuildSubmissionXml: - """Tests for building ENA study submission XML.""" - - def test_basic_xml_structure(self): - """Built XML contains expected elements and attributes.""" - studies = [ - { - "alias": "test-study", - "STUDY_TITLE": "Test Study", - "STUDY_ABSTRACT": "Abstract text.", - "existing_study_type": "Metagenomics", - }, + """Unit tests for build_submission_xml and _add_project_element.""" + + # ---- helper ------------------------------------------------------- + + @staticmethod + def _to_str(root: ET.Element) -> str: + """Serialise an ElementTree element to a UTF-8 string.""" + return ET.tostring(root, encoding="unicode") + + # ---- A1: Basic study fields ------------------------------------------- + + def test_study_title_round_trips(self, basic_study: dict[str, Any]) -> None: + """STUDY_TITLE is written as the TITLE element text.""" + root = build_submission_xml([basic_study]) + title_el = root.find(".//TITLE") + assert title_el is not None + assert title_el.text == basic_study["STUDY_TITLE"] + + def test_study_abstract_round_trips(self, basic_study: dict[str, Any]) -> None: + """STUDY_ABSTRACT is written as the DESCRIPTION element text.""" + root = build_submission_xml([basic_study]) + desc_el = root.find(".//DESCRIPTION") + assert desc_el is not None + assert desc_el.text == basic_study["STUDY_ABSTRACT"] + + def test_alias_round_trips(self, basic_study: dict[str, Any]) -> None: + """The alias attribute on PROJECT matches the input alias.""" + root = build_submission_xml([basic_study]) + project_el = 
root.find(".//PROJECT") + assert project_el is not None + assert project_el.get("alias") == basic_study["alias"] + + def test_center_project_name_round_trips(self, basic_study: dict[str, Any]) -> None: + """CENTER_PROJECT_NAME is written as the NAME element text.""" + root = build_submission_xml([basic_study]) + name_el = root.find(".//NAME") + assert name_el is not None + assert name_el.text == basic_study["CENTER_PROJECT_NAME"] + + def test_submission_project_present(self, basic_study: dict[str, Any]) -> None: + """SUBMISSION_PROJECT with SEQUENCING_PROJECT is always present.""" + root = build_submission_xml([basic_study]) + sp_el = root.find(".//SUBMISSION_PROJECT") + assert sp_el is not None + seq_el = sp_el.find("SEQUENCING_PROJECT") + assert seq_el is not None + + # ---- A2: Study type PROJECT_ATTRIBUTEs -------------------------------- + + def test_existing_study_type_emitted_as_project_attribute( + self, basic_study: dict[str, Any] + ) -> None: + """existing_study_type is emitted as a PROJECT_ATTRIBUTE TAG/VALUE pair.""" + root = build_submission_xml([basic_study]) + xml_str = self._to_str(root) + assert "existing_study_type" in xml_str + assert basic_study["existing_study_type"] in xml_str + + def test_new_study_type_absent_when_not_other(self, basic_study: dict[str, Any]) -> None: + """new_study_type is NOT emitted when existing_study_type != 'Other'.""" + study = dict(basic_study) + study["new_study_type"] = "Genome Sequencing" + root = build_submission_xml([study]) + xml_str = self._to_str(root) + assert "new_study_type" not in xml_str + + def test_new_study_type_present_when_existing_is_other( + self, mag_genome_study: dict[str, Any] + ) -> None: + """new_study_type appears as a PROJECT_ATTRIBUTE when existing_study_type == 'Other'.""" + root = build_submission_xml([mag_genome_study]) + tags = [ + el.text + for el in root.findall(".//PROJECT_ATTRIBUTE/TAG") + if el.text is not None + ] + values = [ + el.text + for el in 
root.findall(".//PROJECT_ATTRIBUTE/VALUE") + if el.text is not None + ] + assert "existing_study_type" in tags + assert "new_study_type" in tags + assert "Other" in values + assert "Genome Sequencing" in values + + def test_no_project_attributes_when_no_study_type(self) -> None: + """No PROJECT_ATTRIBUTES element when existing_study_type is absent.""" + study = { + "alias": "no-type", + "STUDY_TITLE": "No Type Study", + } + root = build_submission_xml([study]) + attrs_el = root.find(".//PROJECT_ATTRIBUTES") + assert attrs_el is None + + # ---- A3: Hold date ---------------------------------------------------- + + def test_hold_until_present_in_submission(self, basic_study: dict[str, Any]) -> None: + """When hold_until is given, HOLD element with HoldUntilDate appears in SUBMISSION.""" + root = build_submission_xml([basic_study], hold_until="2028-06-15") + hold_el = root.find(".//HOLD") + assert hold_el is not None + assert hold_el.get("HoldUntilDate") == "2028-06-15" + + def test_hold_until_absent_when_not_provided(self, basic_study: dict[str, Any]) -> None: + """When hold_until is not given, no HOLD element appears.""" + root = build_submission_xml([basic_study]) + hold_el = root.find(".//HOLD") + assert hold_el is None + + # ---- A4: MODIFY action ------------------------------------------------ + + def test_modify_action_produces_modify_element(self, basic_study: dict[str, Any]) -> None: + """Using action='MODIFY' produces a MODIFY element instead of ADD.""" + root = build_submission_xml([basic_study], action="MODIFY") + xml_str = self._to_str(root) + assert "" in xml_str + + def test_add_action_produces_add_element(self, basic_study: dict[str, Any]) -> None: + """Default action='ADD' produces an ADD element.""" + root = build_submission_xml([basic_study]) + xml_str = self._to_str(root) + assert "" in xml_str + + def test_modify_action_does_not_produce_add(self, basic_study: dict[str, Any]) -> None: + """MODIFY action does not produce an ADD element.""" + root = 
build_submission_xml([basic_study], action="MODIFY") + xml_str = self._to_str(root) + # Strip the XML preamble to avoid false positives in attributes + assert "" not in xml_str + + # ---- A5: Assembly/metagenomics study ---------------------------------- + + def test_metagenomics_assembly_study_round_trips( + self, metagenomics_assembly_study: dict[str, Any] + ) -> None: + """Metagenomics assembly study dict round-trips correctly into XML.""" + root = build_submission_xml([metagenomics_assembly_study]) + project_el = root.find(".//PROJECT") + assert project_el is not None + assert project_el.get("alias") == metagenomics_assembly_study["alias"] + + title_el = root.find(".//TITLE") + assert title_el is not None + assert title_el.text == metagenomics_assembly_study["STUDY_TITLE"] + + tags = [ + el.text for el in root.findall(".//PROJECT_ATTRIBUTE/TAG") if el.text ] - root = build_submission_xml(studies) - xml_bytes = common.xml_to_bytes(root) - xml_str = xml_bytes.decode("utf-8") - assert "" in xml_str - assert 'alias="test-study"' in xml_str - assert "Test Study" in xml_str - assert "Abstract text." 
in xml_str - assert " None: + """MAG/genome study with existing_study_type=Other produces both PROJECT_ATTRIBUTEs.""" + root = build_submission_xml([mag_genome_study]) + attr_els = root.findall(".//PROJECT_ATTRIBUTE") + assert len(attr_els) == 2 + + pairs: dict[str, str] = {} + for attr_el in attr_els: + tag_el = attr_el.find("TAG") + val_el = attr_el.find("VALUE") + if tag_el is not None and val_el is not None: + pairs[tag_el.text or ""] = val_el.text or "" + + assert pairs.get("existing_study_type") == "Other" + assert pairs.get("new_study_type") == "Genome Sequencing" + + # ---- Multiple studies in one call ------------------------------------- + + def test_multiple_studies_produce_multiple_project_elements( + self, + basic_study: dict[str, Any], + metagenomics_assembly_study: dict[str, Any], + ) -> None: + """Multiple studies in input produce multiple PROJECT elements.""" + root = build_submission_xml([basic_study, metagenomics_assembly_study]) + projects = root.findall(".//PROJECT") + assert len(projects) == 2 + + # ---- Alias auto-derived from title when absent ------------------------ + + def test_alias_derived_from_title_when_absent(self) -> None: + """When no alias is provided, alias is derived from STUDY_TITLE (spaces→underscores).""" + study = {"STUDY_TITLE": "My Derived Title"} + root = build_submission_xml([study]) + project_el = root.find(".//PROJECT") + assert project_el is not None + alias = project_el.get("alias", "") + assert "_" in alias or alias == "My_Derived_Title"[:50] + + +# --------------------------------------------------------------------------- +# B. Unit tests for validate_study_xml +# --------------------------------------------------------------------------- + + +class TestValidateStudyXml: + """Unit tests for validate_study_xml.""" + + @staticmethod + def _build_valid_xml_bytes(alias: str = "study-1", title: str = "Test Study") -> bytes: + """Build a minimal valid study XML document as bytes. 
+ + Args: + alias: The PROJECT alias attribute value. + title: The TITLE element text. + + Returns: + UTF-8 encoded XML bytes. + """ + xml_str = dedent(f"""\ + + + + + {title} + + + + + + + """) + return xml_str.encode("utf-8") + + # ---- B7: Valid XML passes --------------------------------------------- + + def test_valid_assembly_study_xml_passes(self) -> None: + """A valid assembly study XML passes validation without errors.""" + xml_bytes = self._build_valid_xml_bytes( + alias="assembly-study", title="Assembly Study Title" + ) + is_valid, messages = validate_study_xml(xml_bytes) + assert is_valid, f"Expected valid; messages: {messages}" + + def test_valid_metagenomics_xml_passes(self) -> None: + """Well-formed XML with required elements passes validation.""" + study = { + "alias": "meta-study", + "STUDY_TITLE": "Metagenomics Study", + "existing_study_type": "Metagenomics", + } + import ena_submit_common as _common # local import; only for xml_to_bytes helper + + root = build_submission_xml([study]) + xml_bytes = _common.xml_to_bytes(root) is_valid, messages = validate_study_xml(xml_bytes) - for msg in messages: - print(msg) - assert is_valid + assert is_valid, f"Expected valid; messages: {messages}" + + # ---- B8: Missing TITLE ------------------------------------------------ + + def test_missing_title_fails_with_title_in_message(self) -> None: + """A PROJECT without a TITLE element fails validation with 'TITLE' in the message.""" + xml_str = dedent("""\ + + + + + + + + + """) + is_valid, messages = validate_study_xml(xml_str.encode("utf-8")) + assert not is_valid + combined = " ".join(messages) + assert "TITLE" in combined + + # ---- B9: Missing SUBMISSION_PROJECT ----------------------------------- + + def test_missing_submission_project_fails(self) -> None: + """A PROJECT without SUBMISSION_PROJECT fails with 'SUBMISSION_PROJECT' in message.""" + xml_str = dedent("""\ + + + + + Some Title + + + + """) + is_valid, messages = 
validate_study_xml(xml_str.encode("utf-8")) + assert not is_valid + combined = " ".join(messages) + assert "SUBMISSION_PROJECT" in combined + + # ---- B10: Malformed XML ----------------------------------------------- + + def test_malformed_xml_fails_with_not_well_formed_message(self) -> None: + """Malformed XML fails validation with 'not well-formed' in the message.""" + bad_xml = b"Unclosed" + is_valid, messages = validate_study_xml(bad_xml) + assert not is_valid + combined = " ".join(messages).lower() + assert "not well-formed" in combined or "well-formed" in combined + + # ---- Extra structural checks ----------------------------------------- + + def test_empty_title_fails_validation(self) -> None: + """A PROJECT with an empty TITLE element fails validation.""" + xml_str = dedent("""\ + <?xml version='1.0' encoding='UTF-8'?> + <WEBIN> + <PROJECT_SET> + <PROJECT alias="empty-title"> + <TITLE> + + + + + """) + is_valid, messages = validate_study_xml(xml_str.encode("utf-8")) + assert not is_valid + + def test_missing_project_set_fails_validation(self) -> None: + """XML without a PROJECT_SET element fails validation.""" + xml_str = b"" + is_valid, messages = validate_study_xml(xml_str) + assert not is_valid + + def test_validation_returns_tuple_of_bool_and_list(self) -> None: + """validate_study_xml always returns (bool, list).""" + xml_bytes = self._build_valid_xml_bytes() + result = validate_study_xml(xml_bytes) + assert isinstance(result, tuple) + assert len(result) == 2 + is_valid, messages = result + assert isinstance(is_valid, bool) + assert isinstance(messages, list) # --------------------------------------------------------------------------- -# Duplicate detection tests +# C. 
Unit tests for parse_xml_receipt +# --------------------------------------------------------------------------- + + +class TestParseXmlReceipt: + """Unit tests for parse_xml_receipt.""" + + @staticmethod + def _parse(xml_str: str) -> tuple[bool, list[dict[str, str]], list[str]]: + """Parse an XML receipt string via parse_xml_receipt. + + Args: + xml_str: Raw XML receipt string. + + Returns: + Tuple of (success, accessions, messages). + """ + root = ET.fromstring(xml_str) + return parse_xml_receipt(root) + + # ---- C11: Successful PROJECT receipt ---------------------------------- + + def test_successful_project_receipt_returns_true(self) -> None: + """A success='true' receipt returns success=True.""" + xml_str = dedent("""\ + + + + + + """) + success, accessions, messages = self._parse(xml_str) + assert success is True + + def test_successful_project_receipt_accession_round_trips(self) -> None: + """PROJECT accession, alias, status, holdUntilDate, and external_accession round-trip.""" + xml_str = dedent("""\ + + + + + + """) + success, accessions, messages = self._parse(xml_str) + assert len(accessions) == 1 + acc = accessions[0] + assert acc["accession"] == "PRJEB12345" + assert acc["alias"] == "my-study" + assert acc["status"] == "PRIVATE" + assert acc["holdUntilDate"] == "2025-01-15" + assert acc["external_accession"] == "ERP012345" + assert acc["external_type"] == "study" + + # ---- C12: Failed receipt ---------------------------------------------- + + def test_failed_receipt_returns_false(self) -> None: + """A success='false' receipt returns success=False.""" + xml_str = dedent("""\ + + + Center name "Unknown" is not permitted to submit in Webin-12345. + + + """) + success, accessions, messages = self._parse(xml_str) + assert success is False + + def test_failed_receipt_captures_error_message(self) -> None: + """Error text from MESSAGES/ERROR is captured in the messages list.""" + xml_str = dedent("""\ + + + Submission failed due to duplicate alias. 
+ + + """) + _, _, messages = self._parse(xml_str) + assert any("Submission failed due to duplicate alias" in m for m in messages) + + # ---- C13: STUDY tag (alternate ENA format) ---------------------------- + + def test_study_tag_receipt_extracts_accession_and_alias(self) -> None: + """Receipts using STUDY instead of PROJECT still extract accession and alias.""" + xml_str = dedent("""\ + + + + """) + success, accessions, messages = self._parse(xml_str) + assert success is True + assert len(accessions) == 1 + assert accessions[0]["accession"] == "ERP099999" + assert accessions[0]["alias"] == "study-alias-1" + + # ---- C14: MESSAGES with INFO and ERROR -------------------------------- + + def test_receipt_with_info_messages_captured(self) -> None: + """INFO elements in MESSAGES are captured in the messages list.""" + xml_str = dedent("""\ + + + + Submission processed successfully. + + + """) + _, _, messages = self._parse(xml_str) + assert any("Submission processed successfully" in m for m in messages) + assert any(m.startswith("INFO:") for m in messages) + + def test_receipt_with_multiple_error_messages(self) -> None: + """Multiple ERROR elements are all captured.""" + xml_str = dedent("""\ + + + First error. + Second error. + + + """) + _, _, messages = self._parse(xml_str) + error_msgs = [m for m in messages if m.startswith("ERROR:")] + assert len(error_msgs) == 2 + + def test_receipt_both_info_and_error_captured(self) -> None: + """Both INFO and ERROR elements are captured in messages.""" + xml_str = dedent("""\ + + + Partial success. + Some records failed. 
+ + + """) + _, _, messages = self._parse(xml_str) + assert any(m.startswith("INFO:") for m in messages) + assert any(m.startswith("ERROR:") for m in messages) + + def test_receipt_no_messages_element_returns_empty_list(self) -> None: + """A receipt without a MESSAGES element returns an empty messages list.""" + xml_str = dedent("""\ + + + + """) + _, _, messages = self._parse(xml_str) + assert messages == [] + + def test_receipt_success_false_string(self) -> None: + """Receipts with success='false' (string) correctly parse to False.""" + xml_str = "" + success, _, _ = self._parse(xml_str) + assert success is False + + def test_receipt_missing_success_defaults_to_false(self) -> None: + """A receipt without a success attribute defaults to False.""" + xml_str = "" + success, _, _ = self._parse(xml_str) + assert success is False + + +# --------------------------------------------------------------------------- +# D. Unit tests for find_duplicate_studies and fetch_account_studies # --------------------------------------------------------------------------- class TestFindDuplicateStudies: - """Tests for alias/title-based duplicate detection.""" + """Unit tests for find_duplicate_studies.""" - def _make_account_study( - self, + @staticmethod + def _account_record( title: str = "", alias: str = "", - accession: str = "PRJEB99", - secondary_accession: str = "", + accession: str = "PRJEB00001", status: str = "PRIVATE", ) -> dict[str, str]: - """Build a normalised account study dict.""" + """Build a normalised account study record. + + Args: + title: Study title (as returned by Reports API normalizer). + alias: Study alias. + accession: ENA project accession. + status: Release status. + + Returns: + Normalised study dict. 
+ """ return { "title": title, "alias": alias, "accession": accession, - "secondary_accession": secondary_accession, + "secondary_accession": "", "status": status, } - def test_no_duplicates(self): - """No match when titles and aliases differ.""" - new = [{"STUDY_TITLE": "New Study", "alias": "new-1"}] - account = [ - self._make_account_study( - title="Other Study", alias="other-1", - ), - ] - dups = find_duplicate_studies(new, account) - assert len(dups) == 0 + # ---- D15: Exact alias match ------------------------------------------ - def test_duplicate_by_title(self): - """Exact title match flags a duplicate.""" - new = [{"STUDY_TITLE": "Existing Study"}] - account = [ - self._make_account_study( - title="Existing Study", - accession="PRJEB99", - status="PRIVATE", - ), - ] - dups = find_duplicate_studies(new, account) - assert 0 in dups - assert dups[0]["accession"] == "PRJEB99" - - def test_duplicate_by_alias(self): - """Alias match flags a duplicate even with different title.""" - new = [{"STUDY_TITLE": "New Title", "alias": "my-alias"}] - account = [ - self._make_account_study( - title="Different Title", - alias="my-alias", - accession="PRJEB60", - ), - ] - dups = find_duplicate_studies(new, account) + def test_exact_alias_match_detected_as_duplicate(self) -> None: + """An exact alias match is detected as a duplicate.""" + new_studies = [{"STUDY_TITLE": "Different Title", "alias": "my-alias-x"}] + account = [self._account_record(title="Other", alias="my-alias-x", accession="PRJEB10")] + dups = find_duplicate_studies(new_studies, account) assert 0 in dups - assert dups[0]["accession"] == "PRJEB60" + assert dups[0]["accession"] == "PRJEB10" assert "alias" in dups[0]["match_reason"] - def test_alias_takes_precedence_over_title(self): - """When alias matches, it is reported as the match reason.""" - new = [{"STUDY_TITLE": "Same Title", "alias": "same-alias"}] + # ---- D16: Exact title match ------------------------------------------ + + def 
test_exact_title_match_detected_as_duplicate(self) -> None: + """An exact STUDY_TITLE match is detected as a duplicate.""" + new_studies = [{"STUDY_TITLE": "My Metagenomics Study"}] account = [ - self._make_account_study( - title="Same Title", - alias="same-alias", - accession="PRJEB70", - ), + self._account_record(title="My Metagenomics Study", accession="PRJEB20") ] - dups = find_duplicate_studies(new, account) + dups = find_duplicate_studies(new_studies, account) assert 0 in dups - assert "alias" in dups[0]["match_reason"] + assert dups[0]["accession"] == "PRJEB20" + assert "title" in dups[0]["match_reason"] - def test_partial_title_not_duplicate(self): - """Partial title match does not count as a duplicate.""" - new = [{"STUDY_TITLE": "My Study"}] - account = [ - self._make_account_study( - title="My Study Extended Title", - ), - ] - dups = find_duplicate_studies(new, account) - assert len(dups) == 0 + # ---- D17: No match returns empty dict -------------------------------- - def test_empty_account_no_duplicates(self): - """Empty account list produces no duplicates.""" - new = [{"STUDY_TITLE": "Test", "alias": "t"}] - dups = find_duplicate_studies(new, []) - assert len(dups) == 0 + def test_no_match_returns_empty_dict(self) -> None: + """When neither alias nor title matches, an empty dict is returned.""" + new_studies = [{"STUDY_TITLE": "Completely Novel Study", "alias": "novel-alias"}] + account = [self._account_record(title="Existing Study", alias="existing-alias")] + dups = find_duplicate_studies(new_studies, account) + assert dups == {} - def test_empty_input_no_duplicates(self): - """Empty input list produces no duplicates.""" - account = [ - self._make_account_study(title="Existing"), - ] + def test_empty_account_returns_empty_dict(self) -> None: + """Empty account list results in no duplicates.""" + new_studies = [{"STUDY_TITLE": "Any Study"}] + dups = find_duplicate_studies(new_studies, []) + assert dups == {} + + def 
test_empty_new_studies_returns_empty_dict(self) -> None: + """Empty new studies list results in no duplicates.""" + account = [self._account_record(title="Existing")] dups = find_duplicate_studies([], account) - assert len(dups) == 0 + assert dups == {} - def test_study_without_title_or_alias_skipped(self): - """Studies with no title or alias are not flagged.""" - new = [{}] - account = [ - self._make_account_study(title="Something"), - ] - dups = find_duplicate_studies(new, account) - assert len(dups) == 0 + def test_study_without_title_or_alias_not_flagged(self) -> None: + """A study dict with neither title nor alias is not flagged as duplicate.""" + new_studies = [{"IS_PRIMARY": "YES"}] # no STUDY_TITLE, no alias + account = [self._account_record(title="Existing")] + dups = find_duplicate_studies(new_studies, account) + assert dups == {} - def test_mixed_duplicates_and_new(self): - """Mix of duplicate and new studies.""" - account = [ - self._make_account_study( - title="Dup By Title", - alias="dup-title", - accession="PRJEB10", - ), - self._make_account_study( - title="Other", - alias="dup-alias", - accession="PRJEB20", - ), - ] - new = [ - {"STUDY_TITLE": "Dup By Title", "alias": "new-alias"}, - {"STUDY_TITLE": "New Title", "alias": "dup-alias"}, - {"STUDY_TITLE": "Brand New", "alias": "brand-new"}, - ] - dups = find_duplicate_studies(new, account) - assert 0 in dups # title match - assert 1 in dups # alias match - assert 2 not in dups # new + def test_partial_title_not_a_duplicate(self) -> None: + """A partial title match does not count as a duplicate (exact match only).""" + new_studies = [{"STUDY_TITLE": "Metagenomics"}] + account = [self._account_record(title="Metagenomics Assembly Study")] + dups = find_duplicate_studies(new_studies, account) + assert dups == {} - def test_all_duplicates_early_exit(self): - """All studies being duplicates terminates early.""" - account = [ - self._make_account_study( - title="A", accession="PRJEB1", - ), - 
self._make_account_study( - title="B", accession="PRJEB2", - ), + def test_multiple_studies_only_matching_flagged(self) -> None: + """Only the matching study is flagged when multiple new studies are submitted.""" + account = [self._account_record(title="Old Study", alias="old-alias", accession="PRJEB50")] + new_studies = [ + {"STUDY_TITLE": "Old Study"}, + {"STUDY_TITLE": "New Study"}, ] - new = [ - {"STUDY_TITLE": "A"}, - {"STUDY_TITLE": "B"}, + dups = find_duplicate_studies(new_studies, account) + assert 0 in dups + assert 1 not in dups + + def test_duplicate_index_corresponds_to_new_studies_list(self) -> None: + """The index in the duplicates dict matches the position in new_studies.""" + account = [self._account_record(title="Study C", accession="PRJEB33")] + new_studies = [ + {"STUDY_TITLE": "Study A"}, + {"STUDY_TITLE": "Study B"}, + {"STUDY_TITLE": "Study C"}, ] - dups = find_duplicate_studies(new, account) - assert len(dups) == 2 + dups = find_duplicate_studies(new_studies, account) + assert 2 in dups + assert dups[2]["accession"] == "PRJEB33" # --------------------------------------------------------------------------- -# File loading tests (JSON, CSV, TSV) +# D18: _normalize_study_report and fetch_account_studies # --------------------------------------------------------------------------- -# The expected study data shared by all supported fixtures -EXPECTED_STUDY = { - "IS_PRIMARY": "YES", - "STUDY_TITLE": "MIMICC", - "existing_study_type": "Metagenomics", -} + +class TestNormalizeStudyReport: + """Unit tests for _normalize_study_report field normalisation.""" + + def test_title_field_normalised(self) -> None: + """The 'title' field is extracted from the raw report dict.""" + report = {"title": "My Title", "alias": "my-alias", "accession": "PRJEB1"} + result = _normalize_study_report(report) + assert result["title"] == "My Title" + + def test_study_title_fallback(self) -> None: + """studyTitle is used when 'title' is absent.""" + report = {"studyTitle": 
"Study Title Fallback", "alias": "a", "accession": "PRJEB2"} + result = _normalize_study_report(report) + assert result["title"] == "Study Title Fallback" + + def test_alias_field_normalised(self) -> None: + """The 'alias' field is extracted.""" + report = {"title": "T", "alias": "direct-alias", "accession": "PRJEB3"} + result = _normalize_study_report(report) + assert result["alias"] == "direct-alias" + + def test_study_alias_fallback(self) -> None: + """studyAlias is used when 'alias' is absent.""" + report = {"title": "T", "studyAlias": "study-alias-fallback", "accession": "PRJEB4"} + result = _normalize_study_report(report) + assert result["alias"] == "study-alias-fallback" + + def test_accession_field_normalised(self) -> None: + """The 'accession' field is extracted.""" + report = {"title": "T", "alias": "a", "accession": "PRJEB5"} + result = _normalize_study_report(report) + assert result["accession"] == "PRJEB5" + + def test_study_accession_fallback(self) -> None: + """studyAccession is used when 'accession' is absent.""" + report = {"title": "T", "alias": "a", "studyAccession": "PRJEB99"} + result = _normalize_study_report(report) + assert result["accession"] == "PRJEB99" + + def test_missing_fields_default_to_empty_string(self) -> None: + """Missing fields default to empty string without raising.""" + report = {} + result = _normalize_study_report(report) + assert result["title"] == "" + assert result["alias"] == "" + assert result["accession"] == "" + + def test_status_field_defaults_to_unknown(self) -> None: + """The status field defaults to 'UNKNOWN' when absent.""" + report = {"title": "T", "alias": "a", "accession": "PRJEB6"} + result = _normalize_study_report(report) + assert result["status"] == "UNKNOWN" + + def test_release_status_used_for_status(self) -> None: + """releaseStatus is mapped to the 'status' key.""" + report = {"title": "T", "alias": "a", "accession": "PRJEB7", "releaseStatus": "PUBLIC"} + result = _normalize_study_report(report) + 
assert result["status"] == "PUBLIC" -@requires_fixtures -class TestLoadInputFile: - """Tests for loading study data from JSON, CSV, and TSV files.""" +class TestFetchAccountStudies: + """Unit tests for fetch_account_studies calling common.fetch_account_records.""" - def test_load_csv(self): - """CSV file loads correctly.""" - studies = common.load_input_file( - EXAMPLE_STUDY_CSV, json_record_keys=_JSON_RECORD_KEYS, + def test_fetch_calls_fetch_account_records_with_correct_urls( + self, auth: HTTPBasicAuth + ) -> None: + """fetch_account_studies calls common.fetch_account_records with prod/test URLs.""" + target = "submit_rawreads_study.common.fetch_account_records" + with patch(target, return_value=[]) as mock_fetch: + fetch_account_studies(auth, use_test=False) + mock_fetch.assert_called_once() + call_kwargs = mock_fetch.call_args + assert call_kwargs.kwargs.get("prod_url") == _PROD_REPORTS_URL + assert call_kwargs.kwargs.get("test_url") == _TEST_REPORTS_URL + + def test_fetch_passes_normalizer_callable(self, auth: HTTPBasicAuth) -> None: + """fetch_account_studies passes a callable normalizer to fetch_account_records.""" + target = "submit_rawreads_study.common.fetch_account_records" + with patch(target, return_value=[]) as mock_fetch: + fetch_account_studies(auth, use_test=False) + call_kwargs = mock_fetch.call_args + normalizer = call_kwargs.kwargs.get("normalizer") + assert callable(normalizer) + + def test_fetch_normalizer_handles_title_variant(self, auth: HTTPBasicAuth) -> None: + """The normalizer passed to fetch_account_records handles title/studyTitle variants.""" + target = "submit_rawreads_study.common.fetch_account_records" + captured_normalizer = None + + def capture_normalizer(*args: Any, **kwargs: Any) -> list[dict[str, str]]: + nonlocal captured_normalizer + captured_normalizer = kwargs.get("normalizer") + return [] + + with patch(target, side_effect=capture_normalizer): + fetch_account_studies(auth, use_test=False) + + assert captured_normalizer 
is not None + result_title = captured_normalizer({"title": "Direct Title", "accession": "PRJEB1"}) + assert result_title["title"] == "Direct Title" + + result_study_title = captured_normalizer( + {"studyTitle": "Fallback Title", "accession": "PRJEB2"} ) - assert studies is not None - assert len(studies) == 1 - for key, val in EXPECTED_STUDY.items(): - assert studies[0][key] == val - - def test_load_tsv(self): - """TSV file loads correctly.""" - studies = common.load_input_file( - EXAMPLE_STUDY_TSV, json_record_keys=_JSON_RECORD_KEYS, + assert result_study_title["title"] == "Fallback Title" + + def test_fetch_normalizer_handles_alias_variant(self, auth: HTTPBasicAuth) -> None: + """The normalizer handles alias/studyAlias field variants.""" + target = "submit_rawreads_study.common.fetch_account_records" + captured_normalizer = None + + def capture_normalizer(*args: Any, **kwargs: Any) -> list[dict[str, str]]: + nonlocal captured_normalizer + captured_normalizer = kwargs.get("normalizer") + return [] + + with patch(target, side_effect=capture_normalizer): + fetch_account_studies(auth, use_test=False) + + assert captured_normalizer is not None + result = captured_normalizer({"alias": "direct-alias", "accession": "PRJEB3"}) + assert result["alias"] == "direct-alias" + + result_fallback = captured_normalizer( + {"studyAlias": "study-alias-fallback", "accession": "PRJEB4"} ) - assert studies is not None - assert len(studies) == 1 - for key, val in EXPECTED_STUDY.items(): - assert studies[0][key] == val - - def test_load_json(self): - """JSON file loads correctly.""" - studies = common.load_input_file( - EXAMPLE_STUDY_JSON, json_record_keys=_JSON_RECORD_KEYS, + assert result_fallback["alias"] == "study-alias-fallback" + + def test_fetch_normalizer_handles_accession_variant(self, auth: HTTPBasicAuth) -> None: + """The normalizer handles accession/studyAccession field variants.""" + target = "submit_rawreads_study.common.fetch_account_records" + captured_normalizer = None + 
+ def capture_normalizer(*args: Any, **kwargs: Any) -> list[dict[str, str]]: + nonlocal captured_normalizer + captured_normalizer = kwargs.get("normalizer") + return [] + + with patch(target, side_effect=capture_normalizer): + fetch_account_studies(auth, use_test=False) + + assert captured_normalizer is not None + result = captured_normalizer( + {"title": "T", "studyAccession": "PRJEB99", "accession": ""} ) - assert studies is not None - assert len(studies) == 1 - for key, val in EXPECTED_STUDY.items(): - assert studies[0][key] == val - - def test_all_formats_produce_same_data(self): - """All supported formats should produce the same core study fields.""" - all_studies = [ - common.load_input_file( - path, json_record_keys=_JSON_RECORD_KEYS, + # studyAccession falls back when 'accession' is falsy + assert result["accession"] == "PRJEB99" + + +# --------------------------------------------------------------------------- +# E. CLI integration tests for main() using click.testing.CliRunner +# --------------------------------------------------------------------------- + + +def _extract_json_from_output(output: str) -> dict[str, Any]: + """Extract the JSON results dict from mixed CLI output. + + The CLI writes JSON results via ``print()`` to stdout, but logging + also emits to stderr which CliRunner captures in ``result.output``. + This helper finds the last top-level JSON object in the output. + + Args: + output: The full ``result.output`` string from CliRunner. + + Returns: + Parsed JSON dict. + + Raises: + ValueError: If no valid JSON object is found. + """ + # Walk backwards through the output looking for a complete JSON block. + # The results JSON always starts with "{\n " and ends with "\n}". 
+ depth = 0 + end = -1 + start = -1 + for i in range(len(output) - 1, -1, -1): + ch = output[i] + if ch == "}": + if depth == 0: + end = i + depth += 1 + elif ch == "{": + depth -= 1 + if depth == 0: + start = i + break + if start == -1 or end == -1: + raise ValueError(f"No JSON object found in output: {output[:200]!r}") + return json.loads(output[start : end + 1]) + + +def _make_study_json(study: dict[str, Any]) -> str: + """Serialise a study dict into a JSON string using the Container format. + + Args: + study: Study metadata dict. + + Returns: + JSON string in DataHarmonizer Container format. + """ + return json.dumps({ + "Container": { + "SRA_studys": [study], + } + }) + + +def _make_study_csv(study: dict[str, Any]) -> str: + """Serialise a study dict into a minimal CSV string. + + Args: + study: Study metadata dict. + + Returns: + CSV string with header and one data row. + """ + headers = list(study.keys()) + values = [str(study[h]) for h in headers] + return ",".join(headers) + "\n" + ",".join(values) + "\n" + + +def _make_study_tsv(study: dict[str, Any]) -> str: + """Serialise a study dict into a minimal TSV string. + + Args: + study: Study metadata dict. + + Returns: + TSV string with header and one data row. 
+ """ + headers = list(study.keys()) + values = [str(study[h]) for h in headers] + return "\t".join(headers) + "\n" + "\t".join(values) + "\n" + + +@pytest.fixture +def runner() -> CliRunner: + """Return a Click test runner with isolated filesystem.""" + return CliRunner() + + +@pytest.fixture +def minimal_metagenomics_study() -> dict[str, Any]: + """Return a minimal metagenomics study for CLI tests.""" + return { + "alias": "cli-metagenomics-001", + "STUDY_TITLE": "CLI Metagenomics Test Study", + "STUDY_ABSTRACT": "Abstract for CLI test.", + "existing_study_type": "Metagenomics", + } + + +class TestMainCli: + """CLI integration tests for main() using CliRunner.""" + + _CRED_TARGET = "submit_rawreads_study.common.get_credentials" + _SUBMIT_TARGET = "submit_rawreads_study.common.submit_xml" + + def _invoke( + self, + runner: CliRunner, + args: list[str], + input_filename: str, + input_content: str, + ) -> Any: + """Write input file and invoke the CLI. + + Args: + runner: Click CliRunner instance. + args: CLI arguments (excluding --input, which is added automatically). + input_filename: Filename for the temporary input file. + input_content: Content to write to the input file. + + Returns: + Click Result object. 
+ """ + with runner.isolated_filesystem(): + Path(input_filename).write_text(input_content) + result = runner.invoke( + main, + ["--input", input_filename] + args, + catch_exceptions=False, ) - for path in [EXAMPLE_STUDY_JSON, EXAMPLE_STUDY_CSV, EXAMPLE_STUDY_TSV] - ] - for studies in all_studies: - assert len(studies) == 1 - for key, val in EXPECTED_STUDY.items(): - assert studies[0][key] == val - - def test_unknown_extension_returns_none(self, tmp_path): - """Unsupported file extension returns None.""" - unknown = tmp_path / "data.parquet" - unknown.write_text("dummy") - result = common.load_input_file( - str(unknown), json_record_keys=_JSON_RECORD_KEYS, + return result + + # ---- E19: JSON input, automated mode, dry-run ------------------------- + + def test_json_input_automated_dry_run_exits_0( + self, + runner: CliRunner, + minimal_metagenomics_study: dict[str, Any], + ) -> None: + """JSON input with --automated --dry-run exits 0 and output has 'submitted' key.""" + content = _make_study_json(minimal_metagenomics_study) + with patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")): + result = self._invoke( + runner, ["--automated", "--dry-run"], "studies.json", content + ) + assert result.exit_code == 0, f"stdout: {result.output}" + data = _extract_json_from_output(result.output) + assert "submitted" in data + + # ---- E20: CSV input --------------------------------------------------- + + def test_csv_input_automated_dry_run_exits_0( + self, + runner: CliRunner, + minimal_metagenomics_study: dict[str, Any], + ) -> None: + """CSV input with --automated --dry-run exits 0 and output has 'submitted' key.""" + content = _make_study_csv(minimal_metagenomics_study) + with patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")): + result = self._invoke( + runner, ["--automated", "--dry-run"], "studies.csv", content + ) + assert result.exit_code == 0, f"stdout: {result.output}" + data = _extract_json_from_output(result.output) + assert "submitted" in 
data + + # ---- E21: TSV input --------------------------------------------------- + + def test_tsv_input_automated_dry_run_exits_0( + self, + runner: CliRunner, + minimal_metagenomics_study: dict[str, Any], + ) -> None: + """TSV input with --automated --dry-run exits 0 and output has 'submitted' key.""" + content = _make_study_tsv(minimal_metagenomics_study) + with patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")): + result = self._invoke( + runner, ["--automated", "--dry-run"], "studies.tsv", content + ) + assert result.exit_code == 0, f"stdout: {result.output}" + data = _extract_json_from_output(result.output) + assert "submitted" in data + + # ---- E22: Duplicate detection ----------------------------------------- + + def test_duplicate_detection_records_duplicate_and_skips_submission( + self, + runner: CliRunner, + minimal_metagenomics_study: dict[str, Any], + ) -> None: + """When account already has a matching study, duplicate is recorded; nothing submitted.""" + existing = { + "title": minimal_metagenomics_study["STUDY_TITLE"], + "alias": minimal_metagenomics_study["alias"], + "accession": "PRJEB55555", + "secondary_accession": "ERP055555", + "status": "PRIVATE", + } + content = _make_study_json(minimal_metagenomics_study) + with runner.isolated_filesystem(): + Path("studies.json").write_text(content) + with ( + patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")), + patch( + "submit_rawreads_study.fetch_account_studies", + return_value=[existing], + ), + ): + result = runner.invoke( + main, + ["--input", "studies.json"], + catch_exceptions=False, + ) + assert result.exit_code == 0, f"stdout: {result.output}" + data = _extract_json_from_output(result.output) + assert len(data["duplicates"]) == 1 + assert data["duplicates"][0]["existing_accession"] == "PRJEB55555" + assert data["submitted"] == [] + + # ---- E23: --force with duplicate triggers MODIFY ---------------------- + + def test_force_flag_with_duplicate_triggers_modify( + self, 
+ runner: CliRunner, + minimal_metagenomics_study: dict[str, Any], + ) -> None: + """--force with a detected duplicate triggers MODIFY and study appears in 'modified'.""" + existing = { + "title": minimal_metagenomics_study["STUDY_TITLE"], + "alias": minimal_metagenomics_study["alias"], + "accession": "PRJEB66666", + "secondary_accession": "ERP066666", + "status": "PRIVATE", + } + receipt_xml = ET.fromstring( + '' + '' + "" ) - assert result is None - - def test_csv_without_metadata_row(self, tmp_path): - """A CSV with no metadata row should still work.""" - csvfile = tmp_path / "no_meta.csv" - csvfile.write_text("STUDY_TITLE,IS_PRIMARY\nTest,YES\n") - studies = common.load_input_file( - str(csvfile), json_record_keys=_JSON_RECORD_KEYS, + content = _make_study_json(minimal_metagenomics_study) + with runner.isolated_filesystem(): + Path("studies.json").write_text(content) + with ( + patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")), + patch( + "submit_rawreads_study.fetch_account_studies", + return_value=[existing], + ), + patch(self._SUBMIT_TARGET, return_value=receipt_xml), + ): + result = runner.invoke( + main, + ["--input", "studies.json", "--force"], + catch_exceptions=False, + ) + assert result.exit_code == 0, f"stdout: {result.output}" + data = _extract_json_from_output(result.output) + assert len(data["modified"]) == 1 + assert data["modified"][0]["accession"] == "PRJEB66666" + + # ---- E24: Failed submission exits 1 ----------------------------------- + + def test_failed_submission_exits_1( + self, + runner: CliRunner, + minimal_metagenomics_study: dict[str, Any], + ) -> None: + """When common.submit_xml raises HTTPError, the CLI exits with code 1.""" + import requests + + content = _make_study_json(minimal_metagenomics_study) + http_error = requests.exceptions.HTTPError(response=MagicMock(status_code=500, text="err")) + with runner.isolated_filesystem(): + Path("studies.json").write_text(content) + with ( + patch(self._CRED_TARGET, 
return_value=("Webin-12345", "pass")), + patch(self._SUBMIT_TARGET, side_effect=http_error), + ): + result = runner.invoke( + main, + ["--input", "studies.json", "--automated"], + catch_exceptions=False, + ) + assert result.exit_code == 1 + + # ---- E25: MAG/genome study dry-run XML contains both PROJECT_ATTRIBUTEs --- + + def test_mag_genome_study_dry_run_xml_has_both_attributes( + self, + runner: CliRunner, + ) -> None: + """MAG/genome study with existing_study_type=Other produces both PROJECT_ATTRIBUTEs.""" + study = { + "alias": "mag-001", + "STUDY_TITLE": "MAG Genome Study", + "existing_study_type": "Other", + "new_study_type": "Genome Sequencing", + } + content = _make_study_json(study) + with runner.isolated_filesystem(): + Path("studies.json").write_text(content) + with patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")): + result = runner.invoke( + main, + ["--input", "studies.json", "--automated", "--dry-run"], + catch_exceptions=False, + ) + assert result.exit_code == 0, f"output: {result.output}" + data = _extract_json_from_output(result.output) + assert "submitted" in data + # Also verify the XML would contain both attributes by building it directly + root = build_submission_xml([study]) + tags = [el.text for el in root.findall(".//PROJECT_ATTRIBUTE/TAG") if el.text] + assert "existing_study_type" in tags + assert "new_study_type" in tags + + # ---- E26: --hold-until date present in XML ---------------------------- + + def test_hold_until_date_appears_in_submission_xml( + self, + runner: CliRunner, + minimal_metagenomics_study: dict[str, Any], + ) -> None: + """--hold-until date is present in the HOLD element of the generated XML.""" + study = dict(minimal_metagenomics_study) + root = build_submission_xml([study], hold_until="2027-12-31") + hold_el = root.find(".//HOLD") + assert hold_el is not None + assert hold_el.get("HoldUntilDate") == "2027-12-31" + + def test_hold_until_cli_flag_passes_validation( + self, + runner: CliRunner, + 
minimal_metagenomics_study: dict[str, Any], + ) -> None: + """CLI --hold-until with a valid future date exits 0 in dry-run mode.""" + content = _make_study_json(minimal_metagenomics_study) + with patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")): + result = self._invoke( + runner, + ["--automated", "--dry-run", "--hold-until", "2027-06-01"], + "studies.json", + content, + ) + assert result.exit_code == 0, f"output: {result.output}" + + # ---- E27: --output writes results to file ----------------------------- + + def test_output_flag_writes_results_to_file( + self, + runner: CliRunner, + minimal_metagenomics_study: dict[str, Any], + ) -> None: + """--output flag writes JSON results to a file rather than stdout.""" + content = _make_study_json(minimal_metagenomics_study) + with runner.isolated_filesystem(): + Path("studies.json").write_text(content) + with patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")): + result = runner.invoke( + main, + ["--input", "studies.json", "--automated", "--dry-run", + "--output", "results.json"], + catch_exceptions=False, + ) + assert result.exit_code == 0, f"stdout: {result.output}" + # With --output, the JSON results go to file, not stdout (stdout has only logging). 
+ results_path = Path("results.json") + assert results_path.exists(), "results.json was not created" + data = json.loads(results_path.read_text()) + assert "submitted" in data + + # ---- E28: --test flag routes to test base URL ------------------------- + + def test_test_flag_uses_test_base_url( + self, + runner: CliRunner, + minimal_metagenomics_study: dict[str, Any], + ) -> None: + """--test flag results in the test base URL being used for submission.""" + receipt_xml = ET.fromstring( + '' + '' + "" ) - assert len(studies) == 1 - assert studies[0]["STUDY_TITLE"] == "Test" - assert studies[0]["IS_PRIMARY"] == "YES" - - def test_tabular_empty_values_omitted(self, tmp_path): - """Empty cells in tabular files should be omitted.""" - csvfile = tmp_path / "sparse.csv" - csvfile.write_text( - "STUDY_TITLE,STUDY_ABSTRACT,IS_PRIMARY\nTest,,YES\n", + content = _make_study_json(minimal_metagenomics_study) + with runner.isolated_filesystem(): + Path("studies.json").write_text(content) + with ( + patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")), + patch(self._SUBMIT_TARGET, return_value=receipt_xml) as mock_submit, + ): + result = runner.invoke( + main, + ["--input", "studies.json", "--automated", "--test"], + catch_exceptions=False, + ) + assert result.exit_code == 0, f"stdout: {result.output}" + assert mock_submit.called + called_url = mock_submit.call_args[0][0] + assert "wwwdev" in called_url, f"Expected test URL; got {called_url}" + + def test_no_test_flag_uses_production_base_url( + self, + runner: CliRunner, + minimal_metagenomics_study: dict[str, Any], + ) -> None: + """Without --test flag, the production base URL is used.""" + receipt_xml = ET.fromstring( + '' + '' + "" ) - studies = common.load_input_file( - str(csvfile), json_record_keys=_JSON_RECORD_KEYS, + content = _make_study_json(minimal_metagenomics_study) + with runner.isolated_filesystem(): + Path("studies.json").write_text(content) + with ( + patch(self._CRED_TARGET, 
return_value=("Webin-12345", "pass")), + patch(self._SUBMIT_TARGET, return_value=receipt_xml) as mock_submit, + ): + result = runner.invoke( + main, + ["--input", "studies.json", "--automated"], + catch_exceptions=False, + ) + assert result.exit_code == 0, f"stdout: {result.output}" + assert mock_submit.called + called_url = mock_submit.call_args[0][0] + assert "wwwdev" not in called_url, f"Expected prod URL; got {called_url}" + + +# --------------------------------------------------------------------------- +# Parametrized study-type cases +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "study_type,new_type,expect_new_type", + [ + ("Metagenomics", None, False), + ("RNASeq", None, False), + ("Population Genomics", None, False), + ("Other", "Genome Sequencing", True), + ("Other", "Transcriptome Analysis", True), + ("Other", None, False), + ], +) +def test_project_attribute_new_study_type_conditional( + study_type: str, + new_type: str | None, + expect_new_type: bool, +) -> None: + """new_study_type attribute appears iff existing_study_type=='Other' and new_type is set. + + Args: + study_type: Value for existing_study_type. + new_type: Value for new_study_type (or None). + expect_new_type: Whether new_study_type should appear in the XML. 
+ """ + study: dict[str, Any] = { + "alias": "param-test", + "STUDY_TITLE": "Parametrized Study", + "existing_study_type": study_type, + } + if new_type is not None: + study["new_study_type"] = new_type + + root = build_submission_xml([study]) + tags = [el.text for el in root.findall(".//PROJECT_ATTRIBUTE/TAG") if el.text] + if expect_new_type: + assert "new_study_type" in tags, ( + f"Expected new_study_type in tags for {study_type!r} / {new_type!r}" + ) + else: + assert "new_study_type" not in tags, ( + f"Did not expect new_study_type in tags for {study_type!r} / {new_type!r}" ) - assert len(studies) == 1 - assert "STUDY_ABSTRACT" not in studies[0] - assert studies[0]["STUDY_TITLE"] == "Test" + + +@pytest.mark.parametrize( + "hold_until,expect_hold", + [ + ("2027-03-01", True), + ("2028-12-31", True), + (None, False), + ], +) +def test_hold_until_element_conditional(hold_until: str | None, expect_hold: bool) -> None: + """HOLD element appears iff hold_until is provided. + + Args: + hold_until: The hold-until date string, or None. + expect_hold: Whether the HOLD element should appear. + """ + study = {"alias": "hold-test", "STUDY_TITLE": "Hold Date Test"} + root = build_submission_xml([study], hold_until=hold_until) + hold_el = root.find(".//HOLD") + if expect_hold: + assert hold_el is not None + assert hold_el.get("HoldUntilDate") == hold_until + else: + assert hold_el is None + + +@pytest.mark.parametrize("action", ["ADD", "MODIFY"]) +def test_submission_action_element_present(action: str) -> None: + """The correct action element (ADD or MODIFY) appears in the SUBMISSION. + + Args: + action: The submission action string. 
+ """ + study = {"alias": "action-test", "STUDY_TITLE": "Action Test"} + root = build_submission_xml([study], action=action) + xml_str = ET.tostring(root, encoding="unicode") + assert f"<{action}" in xml_str or f"<{action}/>" in xml_str + opposite = "MODIFY" if action == "ADD" else "ADD" + assert f"<{opposite}" not in xml_str diff --git a/conf/modules.config b/conf/modules.config index 1eadfb0..b55d4f9 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -43,7 +43,11 @@ process { ] } - withName: 'GENERATE_ASSEMBLY_MANIFEST|ENA_WEBIN_CLI|REGISTERSTUDY' { + withName: 'GENERATE_ASSEMBLY_MANIFEST|ENA_WEBIN_CLI' { ext.args = { params.test_upload ? "--test" : "" } } + + withName: 'SUBMIT_RAWREADS_STUDY' { + ext.args = { [params.test_upload ? "--test" : "", "--automated"].findAll().join(" ") } + } } diff --git a/conf/test_assembly.config b/conf/test_assembly.config index d94b5bc..389e102 100644 --- a/conf/test_assembly.config +++ b/conf/test_assembly.config @@ -30,8 +30,6 @@ params { mode = "metagenomic_assemblies" submission_study = "PRJEB98843" - ena_raw_reads_study_accession = "PRJEB65995" - library = "metagenome" centre_name = "TEST_CENTER" } diff --git a/modules/local/ena_webin_cli/main.nf b/modules/local/ena_webin_cli/main.nf index 25b12f4..e5f878e 100644 --- a/modules/local/ena_webin_cli/main.nf +++ b/modules/local/ena_webin_cli/main.nf @@ -58,4 +58,16 @@ process ENA_WEBIN_CLI { false fi """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_webin-cli.report + export STATUS="success" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ena-webin-cli: 0.0.0 + END_VERSIONS + """ } diff --git a/modules/local/genome_upload/main.nf b/modules/local/genome_upload/main.nf index f8bf1a5..3c5d348 100644 --- a/modules/local/genome_upload/main.nf +++ b/modules/local/genome_upload/main.nf @@ -11,6 +11,7 @@ process GENOME_UPLOAD { path(mags) path(table_for_upload) val(mags_or_bins_flag) + val(submission_study) output: path 
"results/{MAG,bin}_upload/manifests*/*.manifest" , emit: manifests @@ -34,7 +35,7 @@ process GENOME_UPLOAD { export ENA_WEBIN_PASSWORD=\$WEBIN_PASSWORD genome_upload \\ - -u $params.submission_study \\ + -u $submission_study \\ --genome_info ${table_for_upload} \\ --centre_name $params.centre_name \\ --${mags_or_bins_flag} \\ diff --git a/nextflow.config b/nextflow.config index a6f7ae2..dba0973 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,10 +13,7 @@ params { input = null mode = null // {mags, bins, metagenomic_assemblies} - // TODO rewrite register_study script to remove this unnecessary parameters - ena_raw_reads_study_accession = null - library = null - + study_metadata = null submission_study = null centre_name = null upload_tpa = false diff --git a/nextflow_schema.json b/nextflow_schema.json index d31596a..dedf312 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -261,16 +261,13 @@ "description": "ENA study accession (PRJ/ERP) to submit the data to", "help_text": "Current implementation of pipeline requires to pre-register ENA project (PRJ/ERP) where you want to upload data to. Documentation how to register study: https://ena-docs.readthedocs.io/en/latest/submit/study.html" }, - "library": { + "study_metadata": { "type": "string", - "enum": ["metagenome", "metatranscriptome"], - "description": "Type of library for the submission. Required for creation of the new submission study.", - "help_text": "Uses script register_study from assembly_uploader package that requires this parameter to compose study title." - }, - "ena_raw_reads_study_accession": { - "type": "string", - "description": "ENA study accession (PRJ/ERP) of the raw reads study associated with the assembly submission. Required for creation of the new submission study.", - "help_text": "Uses script register_study from assembly_uploader package that requires this parameter to compose study title and description." 
+ "format": "file-path", + "exists": true, + "description": "Path to study metadata file (JSON, CSV, or TSV) for registering a new ENA study. Required when submission_study is not provided.", + "help_text": "File containing study metadata fields (STUDY_TITLE, STUDY_ABSTRACT, existing_study_type, alias, etc.). Used by SUBMIT_RAWREADS_STUDY to create a new study in ENA when no existing submission_study accession is given.", + "fa_icon": "fas fa-file-alt" }, "webincli_submit": { "type": "boolean", diff --git a/tests/default.nf.test b/tests/default.nf.test index 44f2465..919645d 100644 --- a/tests/default.nf.test +++ b/tests/default.nf.test @@ -4,30 +4,81 @@ nextflow_pipeline { script "../main.nf" tag "pipeline" - test("-profile test") { + test("metagenomic_assemblies mode — submission_study provided (no study registration)") { + // Exercises the assembly submission path using a pre-registered study (stub mode). + // SUBMIT_RAWREADS_STUDY is NOT called here; the module-level nf-test covers it. + // + // A samplesheet is generated on the fly with absolute paths so that nf-schema + // validation succeeds regardless of the nf-test launchDir. 
+ options "-stub" when { params { - outdir = "$outputDir" + def csv = new File("${outputDir}/samplesheet_assembly.csv") + csv.parentFile.mkdirs() + csv.text = [ + "sample,fasta,fastq_1,fastq_2,coverage,run_accession,assembler,assembler_version", + "sample1,${projectDir}/tests/data/contigs.fasta.gz,${projectDir}/tests/data/fastq_1.fastq,${projectDir}/tests/data/fastq_2.fastq,,ERR000001,SPAdes,3.15", + "sample2,${projectDir}/tests/data/invalid_assembly.fasta.gz,,,45,ERR000002,Velvet,1.2.10", + "sample3,${projectDir}/tests/data/contigs.fasta.gz,,,30,ERR000003,MEGAHIT,1.2.9" + ].join("\n") + + outdir = "$outputDir" + input = csv.absolutePath + mode = "metagenomic_assemblies" + submission_study = "PRJEB98843" + centre_name = "TEST_CENTER" } } then { - // stable_name: All files + folders in ${params.outdir}/ with a stable name def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) - // stable_path: All files in ${params.outdir}/ with stable content def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') assertAll( - { assert workflow.success}, + { assert workflow.success }, { assert snapshot( - // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions removeNextflowVersion("$outputDir/pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml"), - // All stable path name, with a relative path stable_name, - // All files with stable contents stable_path ).match() } ) } } + + test("metagenomic_assemblies mode — study_metadata provided (SUBMIT_RAWREADS_STUDY registers study)") { + // Tests the study-registration path in stub mode. SUBMIT_RAWREADS_STUDY stub + // outputs an empty accessions JSON, so this test validates the plumbing rather + // than the end-to-end submission output. 
+ options "-stub" + + when { + params { + def csv = new File("${outputDir}/samplesheet_assembly.csv") + csv.parentFile.mkdirs() + csv.text = [ + "sample,fasta,fastq_1,fastq_2,coverage,run_accession,assembler,assembler_version", + "sample1,${projectDir}/tests/data/contigs.fasta.gz,${projectDir}/tests/data/fastq_1.fastq,${projectDir}/tests/data/fastq_2.fastq,,ERR000001,SPAdes,3.15", + "sample2,${projectDir}/tests/data/invalid_assembly.fasta.gz,,,45,ERR000002,Velvet,1.2.10", + "sample3,${projectDir}/tests/data/contigs.fasta.gz,,,30,ERR000003,MEGAHIT,1.2.9" + ].join("\n") + + outdir = "$outputDir" + input = csv.absolutePath + mode = "metagenomic_assemblies" + study_metadata = "${projectDir}/assets/test-fixtures/example_study.json" + centre_name = "TEST_CENTER" + } + } + + then { + assertAll( + { assert workflow.success }, + { assert workflow.trace.succeeded().any { it.name.contains("SUBMIT_RAWREADS_STUDY") } } + ) + } + } + + // NOTE: The MAGs/bins test requires remote genome files from nf-core/test-datasets + // (https://github.com/nf-core/test-datasets/tree/seqsubmit) and cannot run offline. 
+ // Run it manually with: nf-test test tests/default.nf.test --filter "mags" --profile test_genome,docker } diff --git a/tests/default.nf.test.snap b/tests/default.nf.test.snap new file mode 100644 index 0000000..71a254e --- /dev/null +++ b/tests/default.nf.test.snap @@ -0,0 +1,58 @@ +{ + "metagenomic_assemblies mode \u2014 submission_study provided (no study registration)": { + "content": [ + { + "Workflow": { + "nf-core/seqsubmit": "v1.0.0dev" + } + }, + [ + "coverm", + "coverm/sample1.depth.txt", + "fastavalidator", + "fastavalidator/sample1.success.log", + "fastavalidator/sample2.success.log", + "fastavalidator/sample3.success.log", + "generate", + "generate/PRJEB98843_upload", + "generate/PRJEB98843_upload/test.manifest", + "metagenomic_assemblies", + "metagenomic_assemblies/multiqc", + "metagenomic_assemblies/multiqc/multiqc_data", + "metagenomic_assemblies/multiqc/multiqc_plots", + "metagenomic_assemblies/multiqc/multiqc_report.html", + "metagenomic_assemblies/sample1_assembly_metadata.csv", + "metagenomic_assemblies/sample2_assembly_metadata.csv", + "metagenomic_assemblies/sample3_assembly_metadata.csv", + "metagenomic_assemblies/upload", + "metagenomic_assemblies/upload/webin_cli", + "metagenomic_assemblies/upload/webin_cli/sample1_webin-cli.report", + "metagenomic_assemblies/upload/webin_cli/sample2_webin-cli.report", + "metagenomic_assemblies/upload/webin_cli/sample3_webin-cli.report", + "pipeline_info", + "pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml", + "samplesheet_assembly.csv" + ], + [ + "sample1.depth.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + "sample1.success.log:md5,b0b859eda1db5cd43915846e00ebc22c", + "sample2.success.log:md5,b0b859eda1db5cd43915846e00ebc22c", + "sample3.success.log:md5,b0b859eda1db5cd43915846e00ebc22c", + "test.manifest:md5,d41d8cd98f00b204e9800998ecf8427e", + "multiqc_report.html:md5,d41d8cd98f00b204e9800998ecf8427e", + "sample1_assembly_metadata.csv:md5,e1a00dc628e95c38e18dfd5161fa2ce4", + 
"sample2_assembly_metadata.csv:md5,901e55730b100224efb27f23aabf4f67", + "sample3_assembly_metadata.csv:md5,d5b1575095ece78d988395b874440bef", + "sample1_webin-cli.report:md5,d41d8cd98f00b204e9800998ecf8427e", + "sample2_webin-cli.report:md5,d41d8cd98f00b204e9800998ecf8427e", + "sample3_webin-cli.report:md5,d41d8cd98f00b204e9800998ecf8427e", + "samplesheet_assembly.csv:md5,2f74b281cb7096ad80a378b8960aabee" + ] + ], + "timestamp": "2026-03-12T13:22:15.261886", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/tests/nextflow.config b/tests/nextflow.config index 695d52b..be915f5 100644 --- a/tests/nextflow.config +++ b/tests/nextflow.config @@ -11,4 +11,19 @@ params { pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/seqsubmit' } +process { + resourceLimits = [ + cpus: 2, + memory: '8.GB', + time: '1.h' + ] +} + +// Override secrets-based env vars so nf-test runs don't require a populated keystore. +// Stub-mode tests never use the actual credentials. 
+env { + ENA_WEBIN = "test_webin_account" + ENA_WEBIN_PASSWORD = "test_webin_password" +} + aws.client.anonymous = true // fixes S3 access issues on self-hosted runners diff --git a/workflows/assemblysubmit.nf b/workflows/assemblysubmit.nf index 918e1d7..b383a6c 100644 --- a/workflows/assemblysubmit.nf +++ b/workflows/assemblysubmit.nf @@ -7,7 +7,7 @@ include { COVERM_CONTIG } from '../modules/nf-core/coverm/contig/main' include { FASTAVALIDATOR } from '../modules/nf-core/fastavalidator/main' include { GENERATE_ASSEMBLY_MANIFEST } from '../modules/local/generate_assembly_manifest/main' -include { REGISTERSTUDY } from '../modules/local/registerstudy/main' +include { SUBMIT_RAWREADS_STUDY } from '../modules/local/submit_rawreads_study/main' include { ENA_WEBIN_CLI } from '../modules/local/ena_webin_cli' include { MULTIQC } from '../modules/nf-core/multiqc/main' @@ -99,6 +99,9 @@ workflow ASSEMBLYSUBMIT { .map { meta, coverage_file -> // Read the file and calculate average def lines = coverage_file.readLines() + if (lines.size() < 2) { + return [meta, 0.0] + } def coverages = lines[1..-1].collect { line -> line.split('\t')[1] as Double } @@ -139,6 +142,7 @@ workflow ASSEMBLYSUBMIT { def content = "${header}\n${row}" def csv_file = file("${params.outdir}/${params.mode}/${meta.id}_assembly_metadata.csv") + csv_file.parent.toFile().mkdirs() csv_file.text = content [meta, csv_file] @@ -149,11 +153,17 @@ workflow ASSEMBLYSUBMIT { // Use provided study accession directly study_accession_ch = channel.of(params.submission_study) } else { - // Register a new study - REGISTERSTUDY( - [[id:"study"], params.ena_raw_reads_study_accession, params.centre_name, params.library ] + // Register a new study using the study metadata file + SUBMIT_RAWREADS_STUDY( + channel.of([[id: "study"], file(params.study_metadata)]) ) - study_accession_ch = REGISTERSTUDY.out.study_accession.map { _meta, accession -> accession } + ch_versions = ch_versions.mix(SUBMIT_RAWREADS_STUDY.out.versions) + 
study_accession_ch = SUBMIT_RAWREADS_STUDY.out.accessions + .map { _meta, json -> + def data = new groovy.json.JsonSlurper().parse(json) + data.submitted[0]?.accession + ?: data.duplicates[0]?.existing_accession + } } // Generate assembly manifest files and submit them to ENA diff --git a/workflows/genomesubmit.nf b/workflows/genomesubmit.nf index 063d56c..b34a704 100644 --- a/workflows/genomesubmit.nf +++ b/workflows/genomesubmit.nf @@ -5,6 +5,7 @@ */ include { GENOME_UPLOAD } from '../modules/local/genome_upload' include { ENA_WEBIN_CLI } from '../modules/local/ena_webin_cli' +include { SUBMIT_RAWREADS_STUDY } from '../modules/local/submit_rawreads_study/main' include { RNA_DETECTION } from '../subworkflows/local/rna_detection' @@ -109,10 +110,27 @@ workflow GENOMESUBMIT { newLine: true ) + def study_accession_ch + if (params.submission_study) { + study_accession_ch = channel.of(params.submission_study) + } else { + SUBMIT_RAWREADS_STUDY( + channel.of([[id: "study"], file(params.study_metadata)]) + ) + ch_versions = ch_versions.mix(SUBMIT_RAWREADS_STUDY.out.versions) + study_accession_ch = SUBMIT_RAWREADS_STUDY.out.accessions + .map { _meta, json -> + def data = new groovy.json.JsonSlurper().parse(json) + data.submitted[0]?.accession + ?: data.duplicates[0]?.existing_accession + } + } + GENOME_UPLOAD( genome_fasta.map{meta, fasta -> fasta}.collect(), genome_metadata_csv, - params.mode + params.mode, + study_accession_ch.first() ) ch_versions = ch_versions.mix( GENOME_UPLOAD.out.versions ) From 6f265d8985dc1580bbf59fc081c732f1dde79191 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Thu, 12 Mar 2026 13:56:05 +0000 Subject: [PATCH 08/36] Replaced REGSTERSTUDY module with new one based on submit_study.py script. 
Also renamed submit_rawreads_study to submit_study --- ...bmit_rawreads_study.py => submit_study.py} | 11 +-- ...rawreads_study.py => test_submit_study.py} | 26 +++--- conf/modules.config | 2 +- .../tests/main.nf.test.snap | 32 +++---- modules/local/registerstudy/environment.yml | 8 +- modules/local/registerstudy/main.nf | 41 ++++----- modules/local/registerstudy/meta.yml | 90 +++++++++---------- modules/local/registerstudy/nextflow.config | 9 -- .../local/registerstudy/tests/main.nf.test | 34 +++---- .../registerstudy/tests/main.nf.test.snap | 59 +++--------- .../tests/nextflow.config | 4 +- .../submit_rawreads_study/environment.yml | 10 --- modules/local/submit_rawreads_study/main.nf | 47 ---------- modules/local/submit_rawreads_study/meta.yml | 68 -------------- .../submit_rawreads_study/tests/main.nf.test | 56 ------------ .../tests/main.nf.test.snap | 35 -------- nextflow_schema.json | 2 +- tests/default.nf.test | 8 +- workflows/assemblysubmit.nf | 8 +- workflows/genomesubmit.nf | 8 +- 20 files changed, 140 insertions(+), 418 deletions(-) rename bin/{submit_rawreads_study.py => submit_study.py} (98%) rename bin/{test_submit_rawreads_study.py => test_submit_study.py} (98%) delete mode 100644 modules/local/registerstudy/nextflow.config rename modules/local/{submit_rawreads_study => registerstudy}/tests/nextflow.config (86%) delete mode 100644 modules/local/submit_rawreads_study/environment.yml delete mode 100644 modules/local/submit_rawreads_study/main.nf delete mode 100644 modules/local/submit_rawreads_study/meta.yml delete mode 100644 modules/local/submit_rawreads_study/tests/main.nf.test delete mode 100644 modules/local/submit_rawreads_study/tests/main.nf.test.snap diff --git a/bin/submit_rawreads_study.py b/bin/submit_study.py similarity index 98% rename from bin/submit_rawreads_study.py rename to bin/submit_study.py index c00ee6d..ae72d69 100755 --- a/bin/submit_rawreads_study.py +++ b/bin/submit_study.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 -"""Submit 
raw-reads studies to ENA via the Webin REST API v2. +"""Submit raw-reads, assembly and genome studies to ENA +via the Webin REST API v2. Read a DataHarmonizer export containing study metadata, check for duplicate studies already registered under the @@ -14,17 +15,17 @@ Usage:: - python bin/submit_rawreads_study.py \ + python bin/submit_study.py \ --input studies.json \ --test # With hold date (max 2 years): - python bin/submit_rawreads_study.py \ + python bin/submit_study.py \ --input studies.json \ --hold-until 2028-01-01 # Log to file: - python bin/submit_rawreads_study.py \ + python bin/submit_study.py \ --input studies.json \ --test --log submission.log """ @@ -431,7 +432,7 @@ def _do_submission( @click.command( - help="Submit raw-reads studies to ENA via the Webin REST API v2.", + help="Submit raw-reads, assembly and genome studies to ENA via the Webin REST API v2.", ) @click.option( "--input", "input_file", diff --git a/bin/test_submit_rawreads_study.py b/bin/test_submit_study.py similarity index 98% rename from bin/test_submit_rawreads_study.py rename to bin/test_submit_study.py index f07f85e..d021383 100644 --- a/bin/test_submit_rawreads_study.py +++ b/bin/test_submit_study.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Tests for submit_rawreads_study.py — raw-reads study submission pipeline. +"""Tests for submit_study.py — ENA study submission pipeline. Covers: A. Unit tests for build_submission_xml and _add_project_element @@ -9,11 +9,11 @@ E. CLI integration tests for main() using click.testing.CliRunner Usage: - pytest bin/test_submit_rawreads_study.py -v + pytest bin/test_submit_study.py -v All external I/O (HTTP requests, ENA reports API) is mocked. Tests do NOT import from ena_submit_common directly — all assertions go through the public -API of submit_rawreads_study. +API of submit_study. """ from __future__ import annotations @@ -34,7 +34,7 @@ # Ensure the scripts directory is on the path before importing the module. 
sys.path.insert(0, os.path.dirname(__file__)) -from submit_rawreads_study import ( # noqa: E402 +from bin.submit_study import ( # noqa: E402 _normalize_study_report, build_submission_xml, fetch_account_studies, @@ -816,7 +816,7 @@ def test_fetch_calls_fetch_account_records_with_correct_urls( self, auth: HTTPBasicAuth ) -> None: """fetch_account_studies calls common.fetch_account_records with prod/test URLs.""" - target = "submit_rawreads_study.common.fetch_account_records" + target = "submit_study.common.fetch_account_records" with patch(target, return_value=[]) as mock_fetch: fetch_account_studies(auth, use_test=False) mock_fetch.assert_called_once() @@ -826,7 +826,7 @@ def test_fetch_calls_fetch_account_records_with_correct_urls( def test_fetch_passes_normalizer_callable(self, auth: HTTPBasicAuth) -> None: """fetch_account_studies passes a callable normalizer to fetch_account_records.""" - target = "submit_rawreads_study.common.fetch_account_records" + target = "submit_study.common.fetch_account_records" with patch(target, return_value=[]) as mock_fetch: fetch_account_studies(auth, use_test=False) call_kwargs = mock_fetch.call_args @@ -835,7 +835,7 @@ def test_fetch_passes_normalizer_callable(self, auth: HTTPBasicAuth) -> None: def test_fetch_normalizer_handles_title_variant(self, auth: HTTPBasicAuth) -> None: """The normalizer passed to fetch_account_records handles title/studyTitle variants.""" - target = "submit_rawreads_study.common.fetch_account_records" + target = "submit_study.common.fetch_account_records" captured_normalizer = None def capture_normalizer(*args: Any, **kwargs: Any) -> list[dict[str, str]]: @@ -857,7 +857,7 @@ def capture_normalizer(*args: Any, **kwargs: Any) -> list[dict[str, str]]: def test_fetch_normalizer_handles_alias_variant(self, auth: HTTPBasicAuth) -> None: """The normalizer handles alias/studyAlias field variants.""" - target = "submit_rawreads_study.common.fetch_account_records" + target = 
"submit_study.common.fetch_account_records" captured_normalizer = None def capture_normalizer(*args: Any, **kwargs: Any) -> list[dict[str, str]]: @@ -879,7 +879,7 @@ def capture_normalizer(*args: Any, **kwargs: Any) -> list[dict[str, str]]: def test_fetch_normalizer_handles_accession_variant(self, auth: HTTPBasicAuth) -> None: """The normalizer handles accession/studyAccession field variants.""" - target = "submit_rawreads_study.common.fetch_account_records" + target = "submit_study.common.fetch_account_records" captured_normalizer = None def capture_normalizer(*args: Any, **kwargs: Any) -> list[dict[str, str]]: @@ -1004,8 +1004,8 @@ def minimal_metagenomics_study() -> dict[str, Any]: class TestMainCli: """CLI integration tests for main() using CliRunner.""" - _CRED_TARGET = "submit_rawreads_study.common.get_credentials" - _SUBMIT_TARGET = "submit_rawreads_study.common.submit_xml" + _CRED_TARGET = "submit_study.common.get_credentials" + _SUBMIT_TARGET = "submit_study.common.submit_xml" def _invoke( self, @@ -1106,7 +1106,7 @@ def test_duplicate_detection_records_duplicate_and_skips_submission( with ( patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")), patch( - "submit_rawreads_study.fetch_account_studies", + "submit_study.fetch_account_studies", return_value=[existing], ), ): @@ -1147,7 +1147,7 @@ def test_force_flag_with_duplicate_triggers_modify( with ( patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")), patch( - "submit_rawreads_study.fetch_account_studies", + "submit_study.fetch_account_studies", return_value=[existing], ), patch(self._SUBMIT_TARGET, return_value=receipt_xml), diff --git a/conf/modules.config b/conf/modules.config index b55d4f9..eaef036 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -47,7 +47,7 @@ process { ext.args = { params.test_upload ? "--test" : "" } } - withName: 'SUBMIT_RAWREADS_STUDY' { + withName: 'REGISTERSTUDY' { ext.args = { [params.test_upload ? 
"--test" : "", "--automated"].findAll().join(" ") } } } diff --git a/modules/local/generate_assembly_manifest/tests/main.nf.test.snap b/modules/local/generate_assembly_manifest/tests/main.nf.test.snap index 7fef896..f594383 100644 --- a/modules/local/generate_assembly_manifest/tests/main.nf.test.snap +++ b/modules/local/generate_assembly_manifest/tests/main.nf.test.snap @@ -11,7 +11,7 @@ ] ], "1": [ - "versions.yml:md5,32c079810bf4914d6d49aa9ad121889e" + "versions.yml:md5,0664035de44b4d88c1a70a357c1a24f2" ], "manifest": [ [ @@ -22,20 +22,20 @@ ] ], "versions": [ - "versions.yml:md5,32c079810bf4914d6d49aa9ad121889e" + "versions.yml:md5,0664035de44b4d88c1a70a357c1a24f2" ] }, { "GENERATE_ASSEMBLY_MANIFEST": { - "assembly_uploader": "assembly_uploader 1.3.3" + "assembly_uploader": "assembly_uploader 1.3.4" } } ], + "timestamp": "2026-03-12T13:52:01.267817", "meta": { - "nf-test": "0.9.0", - "nextflow": "25.04.1" - }, - "timestamp": "2025-10-30T15:10:02.229709" + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } }, "GENERATE_ASSEMBLY_MANIFEST completes with expected outputs": { "content": [ @@ -45,34 +45,34 @@ { "id": "test" }, - "233126d4c4d023f18c7836ed36395e3c.manifest:md5,3152b34ddec05a2c9937a2e03416e5e1" + "233126d4c4d0.manifest:md5,8387c0e6c123313259db613612c09dce" ] ], "1": [ - "versions.yml:md5,32c079810bf4914d6d49aa9ad121889e" + "versions.yml:md5,0664035de44b4d88c1a70a357c1a24f2" ], "manifest": [ [ { "id": "test" }, - "233126d4c4d023f18c7836ed36395e3c.manifest:md5,3152b34ddec05a2c9937a2e03416e5e1" + "233126d4c4d0.manifest:md5,8387c0e6c123313259db613612c09dce" ] ], "versions": [ - "versions.yml:md5,32c079810bf4914d6d49aa9ad121889e" + "versions.yml:md5,0664035de44b4d88c1a70a357c1a24f2" ] }, { "GENERATE_ASSEMBLY_MANIFEST": { - "assembly_uploader": "assembly_uploader 1.3.3" + "assembly_uploader": "assembly_uploader 1.3.4" } } ], + "timestamp": "2026-03-12T13:51:56.121365", "meta": { - "nf-test": "0.9.0", - "nextflow": "25.04.1" - }, - "timestamp": 
"2025-10-30T15:09:57.708757" + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } } } \ No newline at end of file diff --git a/modules/local/registerstudy/environment.yml b/modules/local/registerstudy/environment.yml index 80dd37e..6ee92a8 100644 --- a/modules/local/registerstudy/environment.yml +++ b/modules/local/registerstudy/environment.yml @@ -4,7 +4,7 @@ channels: - conda-forge - bioconda dependencies: - # TODO nf-core: List required Conda package(s). - # Software MUST be pinned to channel (i.e. "bioconda"), version (i.e. "1.10"). - # For Conda, the build (i.e. "h9402c20_2") must be EXCLUDED to support installation on different operating systems. - - "bioconda::assembly_uploader=1.3.2" + - conda-forge::python>=3.12 + - conda-forge::pip + - pip: + - mgnify-pipelines-toolkit==1.4.17 diff --git a/modules/local/registerstudy/main.nf b/modules/local/registerstudy/main.nf index 0621043..67766e0 100644 --- a/modules/local/registerstudy/main.nf +++ b/modules/local/registerstudy/main.nf @@ -3,54 +3,45 @@ process REGISTERSTUDY { label 'process_single' conda "${moduleDir}/environment.yml" - container "community.wave.seqera.io/library/pip_assembly-uploader:2a65298c0161c561" + container "quay.io/microbiome-informatics/mgnify-pipelines-toolkit:1.4.17" - input: - tuple val(meta), val(study), val(center), val(library) + // ENA_WEBIN and ENA_WEBIN_PASSWORD must be set in the process environment. 
+ // In the pipeline, map Nextflow secrets via conf/modules.config or nextflow.config: + // env { ENA_WEBIN = secrets.WEBIN_ACCOUNT; ENA_WEBIN_PASSWORD = secrets.WEBIN_PASSWORD } + input: + tuple val(meta), path(study_metadata) output: - tuple val(meta), env("STUDY_ID"), emit: study_accession - path "versions.yml" , emit: versions + tuple val(meta), path("*_accessions.json"), emit: accessions + path "versions.yml", emit: versions when: task.ext.when == null || task.ext.when script: - def args = task.ext.args ?: '' - def args2 = task.ext.args2 ?: '' + def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" """ - echo "Generate study XMLs" - study_xmls \\ - $args \\ - --study ${study} \\ - --library ${library} \\ - --center ${center} \\ - - echo "Submit study to ENA" - submit_study \\ - $args2 \\ - --directory ${study}_upload \\ - --study ${study} 2>&1 | tee report.log - - STUDY_ID=\$(grep 'A new study accession has been created' report.log | grep -oE '(PRJ|ERP)[[:alnum:]_]+[[:digit:]]+') + submit_study.py \\ + --input ${study_metadata} \\ + --output ${prefix}_accessions.json \\ + ${args} cat <<-END_VERSIONS > versions.yml "${task.process}": - assembly_uploader: \$(study_xmls --version) + mgnify-pipelines-toolkit: \$(python -c "import importlib.metadata; print(importlib.metadata.version('mgnify-pipelines-toolkit'))") END_VERSIONS """ stub: - def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" """ - touch ${prefix}.report + echo '{"submitted":[],"duplicates":[],"modified":[],"failed":[]}' > ${prefix}_accessions.json cat <<-END_VERSIONS > versions.yml "${task.process}": - assembly_uploader: \$(study_xmls --version) + mgnify-pipelines-toolkit: \$(python -c "import importlib.metadata; print(importlib.metadata.version('mgnify-pipelines-toolkit'))") END_VERSIONS """ } diff --git a/modules/local/registerstudy/meta.yml b/modules/local/registerstudy/meta.yml index c459a19..549f187 100644 --- a/modules/local/registerstudy/meta.yml 
+++ b/modules/local/registerstudy/meta.yml @@ -1,18 +1,28 @@ # yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json name: "registerstudy" -description: This module registers a study and project and generates accessions that will be used for metagenomic assembly uploads in ENA. The study generated will reference reads from an already public project. +description: | + Submit a new study to ENA via the Webin REST API v2. + Reads study metadata from a JSON, CSV, or TSV file, checks for + duplicate studies already registered under the Webin account, + builds a PROJECT XML submission document, and submits to ENA. + Credentials are read from the WEBIN_ACCOUNT and WEBIN_PASSWORD + Nextflow secrets, which are mapped to ENA_WEBIN and ENA_WEBIN_PASSWORD + inside the process. keywords: - - assembly - - register + - ena + - submission - study + - project + - webin tools: - - "registerstudy": - description: "Nextflow module to register study/project to upload primary metagenome and metatranscriptome - assemblies to ENA on a per-study basis. The scripts generate xmls to register a new study and create manifests - necessary for submission of assemblies using webin-cli." - homepage: "https://github.com/EBI-Metagenomics/assembly_uploader" - documentation: "https://github.com/EBI-Metagenomics/assembly_uploader" - tool_dev_url: "None" + - mgnify-pipelines-toolkit: + description: | + A toolkit of utilities used in MGnify metagenomics pipelines, + including click, requests, and other dependencies required by + the ENA submission scripts. 
+ homepage: https://github.com/EBI-Metagenomics/mgnify-pipelines-toolkit + documentation: https://github.com/EBI-Metagenomics/mgnify-pipelines-toolkit + tool_dev_url: https://github.com/EBI-Metagenomics/mgnify-pipelines-toolkit doi: "" licence: ["Apache-2.0"] identifier: null @@ -21,50 +31,38 @@ input: - - meta: type: map description: | - Groovy Map containing sample information + Groovy Map containing sample information. e.g. `[ id:'sample1' ]` - - study: - type: value - description: | - Study accession with raw reads public in ENA. - Example: "PRJNA312520" - - - center: - type: value + - study_metadata: + type: file description: | - Name of the sequencing or submitting center. - Example: "Wellcome Sanger Institute" - - - library: - type: value - description: | - Library information associated with the study. - Example: "metagenome" - enum: - - metagenome - - metatranscriptome + Study metadata file in JSON, CSV, or TSV format. + JSON may follow the DataHarmonizer Container export format or be + a plain list/dict of study records. + Required fields per record: STUDY_TITLE, existing_study_type. + pattern: "*.{json,csv,tsv}" output: - study_accession: - - - meta: + - accessions: + - meta: type: map description: | - Groovy Map containing sample information + Groovy Map containing sample information. e.g. `[ id:'sample1' ]` - - study: - type: value + - "*_accessions.json": + type: file description: | - Study accession registered in ENA. - Example: "PRJEB312520" - versions: - - "versions.yml": - type: file - description: File containing software versions - pattern: "versions.yml" - ontologies: - - edam: "http://edamontology.org/format_3750" # YAML + JSON file containing the submission results with keys: + submitted (newly created accessions), duplicates (skipped), + modified (force-updated), and failed. 
+ pattern: "*_accessions.json" + - versions: + - "versions.yml": + type: file + description: File containing software versions + pattern: "versions.yml" authors: - - "@alisha246" + - "@timrozday" maintainers: - - "@alisha246" + - "@timrozday" diff --git a/modules/local/registerstudy/nextflow.config b/modules/local/registerstudy/nextflow.config deleted file mode 100644 index 3f71a8e..0000000 --- a/modules/local/registerstudy/nextflow.config +++ /dev/null @@ -1,9 +0,0 @@ -process { - withName: REGISTERSTUDY { - ext.args2 = '--test' - } -} -env { - ENA_WEBIN = secrets.WEBIN_ACCOUNT - ENA_WEBIN_PASSWORD = secrets.WEBIN_PASSWORD -} diff --git a/modules/local/registerstudy/tests/main.nf.test b/modules/local/registerstudy/tests/main.nf.test index d11a6d1..42f6902 100644 --- a/modules/local/registerstudy/tests/main.nf.test +++ b/modules/local/registerstudy/tests/main.nf.test @@ -1,25 +1,21 @@ -// TODO nf-core: Once you have added the required tests, please run the following command to build this file: -// nf-core modules test registerstudy nextflow_process { name "Test Process REGISTERSTUDY" script "../main.nf" - config "../nextflow.config" + config "./nextflow.config" process "REGISTERSTUDY" - tag "modules" tag "registerstudy" - test("registerstudy - should register a study on ENA test server") { + test("registerstudy - stub") { + options "-stub" when { process { """ input[0] = [ - [ id:'test', single_end:false ], // meta map - "PRJNA318468", - "EMG", - "metagenome" + [ id:'example_study' ], + file("${projectDir}/assets/test-fixtures/example_study.json", checkIfExists: true) ] """ } @@ -28,23 +24,21 @@ nextflow_process { then { assertAll( { assert process.success }, - //TODO improve assertions + { assert snapshot(process.out).match() } ) } } - test("registerstudy - stub") { - - options "-stub" + test("registerstudy - dry run against ENA test server") { + // Validates and builds the submission XML but does not submit to ENA. 
+ // Dummy credentials in tests/nextflow.config are sufficient for dry-run mode. when { process { """ input[0] = [ - [ id:'test', single_end:false ], // meta map - "PRJNA318468", - "EMG", - "metagenome" + [ id:'example_study' ], + file("${projectDir}/assets/test-fixtures/example_study.json", checkIfExists: true) ] """ } @@ -53,10 +47,10 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert snapshot(process.out).match() } - //TODO improve assertions + { assert path(process.out.accessions[0][1]).exists() }, + { assert path(process.out.accessions[0][1]).json.submitted instanceof List }, + { assert path(process.out.accessions[0][1]).json.failed.size() == 0 } ) } - } } diff --git a/modules/local/registerstudy/tests/main.nf.test.snap b/modules/local/registerstudy/tests/main.nf.test.snap index 1dd3a79..4b184e9 100644 --- a/modules/local/registerstudy/tests/main.nf.test.snap +++ b/modules/local/registerstudy/tests/main.nf.test.snap @@ -1,72 +1,35 @@ { - "registerstudy - report - stub": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - "" - ] - ], - "1": [ - "versions.yml:md5,ea872d341a2054fde3b2c8f06bbf8177" - ], - "study_accession": [ - [ - { - "id": "test", - "single_end": false - }, - "" - ] - ], - "versions": [ - "versions.yml:md5,ea872d341a2054fde3b2c8f06bbf8177" - ] - } - ], - "meta": { - "nf-test": "0.9.3", - "nextflow": "25.10.0" - }, - "timestamp": "2025-10-28T16:35:02.331026" - }, "registerstudy - stub": { "content": [ { "0": [ [ { - "id": "test", - "single_end": false + "id": "example_study" }, - "" + "example_study_accessions.json:md5,e43f257b8a1ffb551bf993867c5f1d7f" ] ], "1": [ - "versions.yml:md5,1d079512d28737f6b925e85563aa2c53" + "versions.yml:md5,ddcc758a7d28faecd4286941889ab7e1" ], - "study_accession": [ + "accessions": [ [ { - "id": "test", - "single_end": false + "id": "example_study" }, - "" + "example_study_accessions.json:md5,e43f257b8a1ffb551bf993867c5f1d7f" ] ], "versions": [ - 
"versions.yml:md5,1d079512d28737f6b925e85563aa2c53" + "versions.yml:md5,ddcc758a7d28faecd4286941889ab7e1" ] } ], + "timestamp": "2026-03-12T13:52:06.989729", "meta": { - "nf-test": "0.9.0", - "nextflow": "25.04.1" - }, - "timestamp": "2025-10-30T14:58:53.721718" + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } } } \ No newline at end of file diff --git a/modules/local/submit_rawreads_study/tests/nextflow.config b/modules/local/registerstudy/tests/nextflow.config similarity index 86% rename from modules/local/submit_rawreads_study/tests/nextflow.config rename to modules/local/registerstudy/tests/nextflow.config index c4633fa..4a84743 100644 --- a/modules/local/submit_rawreads_study/tests/nextflow.config +++ b/modules/local/registerstudy/tests/nextflow.config @@ -1,4 +1,4 @@ -// Test configuration for SUBMIT_RAWREADS_STUDY module. +// Test configuration for REGISTERSTUDY module. // --test : use the ENA dev server (submissions are discarded daily) // --automated : skip the Webin Reports duplicate-checking API call // --dry-run : validate and build XML but do not submit to ENA @@ -7,7 +7,7 @@ // no HTTP calls are made. 
For real submission tests, replace with secrets: // env { ENA_WEBIN = secrets.WEBIN_ACCOUNT; ENA_WEBIN_PASSWORD = secrets.WEBIN_PASSWORD } process { - withName: SUBMIT_RAWREADS_STUDY { + withName: REGISTERSTUDY { ext.args = '--test --automated --dry-run' } } diff --git a/modules/local/submit_rawreads_study/environment.yml b/modules/local/submit_rawreads_study/environment.yml deleted file mode 100644 index 6ee92a8..0000000 --- a/modules/local/submit_rawreads_study/environment.yml +++ /dev/null @@ -1,10 +0,0 @@ ---- -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json -channels: - - conda-forge - - bioconda -dependencies: - - conda-forge::python>=3.12 - - conda-forge::pip - - pip: - - mgnify-pipelines-toolkit==1.4.17 diff --git a/modules/local/submit_rawreads_study/main.nf b/modules/local/submit_rawreads_study/main.nf deleted file mode 100644 index 51bc062..0000000 --- a/modules/local/submit_rawreads_study/main.nf +++ /dev/null @@ -1,47 +0,0 @@ -process SUBMIT_RAWREADS_STUDY { - tag "$meta.id" - label 'process_single' - - conda "${moduleDir}/environment.yml" - container "quay.io/microbiome-informatics/mgnify-pipelines-toolkit:1.4.17" - - // ENA_WEBIN and ENA_WEBIN_PASSWORD must be set in the process environment. 
- // In the pipeline, map Nextflow secrets via conf/modules.config or nextflow.config: - // env { ENA_WEBIN = secrets.WEBIN_ACCOUNT; ENA_WEBIN_PASSWORD = secrets.WEBIN_PASSWORD } - - input: - tuple val(meta), path(study_metadata) - - output: - tuple val(meta), path("*_accessions.json"), emit: accessions - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - """ - submit_rawreads_study.py \\ - --input ${study_metadata} \\ - --output ${prefix}_accessions.json \\ - ${args} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - mgnify-pipelines-toolkit: \$(python -c "import importlib.metadata; print(importlib.metadata.version('mgnify-pipelines-toolkit'))") - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - """ - echo '{"submitted":[],"duplicates":[],"modified":[],"failed":[]}' > ${prefix}_accessions.json - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - mgnify-pipelines-toolkit: \$(python -c "import importlib.metadata; print(importlib.metadata.version('mgnify-pipelines-toolkit'))") - END_VERSIONS - """ -} diff --git a/modules/local/submit_rawreads_study/meta.yml b/modules/local/submit_rawreads_study/meta.yml deleted file mode 100644 index 629512f..0000000 --- a/modules/local/submit_rawreads_study/meta.yml +++ /dev/null @@ -1,68 +0,0 @@ -# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json -name: "submit_rawreads_study" -description: | - Submit a new study to ENA via the Webin REST API v2. - Reads study metadata from a JSON, CSV, or TSV file, checks for - duplicate studies already registered under the Webin account, - builds a PROJECT XML submission document, and submits to ENA. - Credentials are read from the WEBIN_ACCOUNT and WEBIN_PASSWORD - Nextflow secrets, which are mapped to ENA_WEBIN and ENA_WEBIN_PASSWORD - inside the process. 
-keywords: - - ena - - submission - - study - - project - - webin -tools: - - mgnify-pipelines-toolkit: - description: | - A toolkit of utilities used in MGnify metagenomics pipelines, - including click, requests, and other dependencies required by - the ENA submission scripts. - homepage: https://github.com/EBI-Metagenomics/mgnify-pipelines-toolkit - documentation: https://github.com/EBI-Metagenomics/mgnify-pipelines-toolkit - tool_dev_url: https://github.com/EBI-Metagenomics/mgnify-pipelines-toolkit - doi: "" - licence: ["Apache-2.0"] - identifier: null - -input: - - - meta: - type: map - description: | - Groovy Map containing sample information. - e.g. `[ id:'sample1' ]` - - study_metadata: - type: file - description: | - Study metadata file in JSON, CSV, or TSV format. - JSON may follow the DataHarmonizer Container export format or be - a plain list/dict of study records. - Required fields per record: STUDY_TITLE, existing_study_type. - pattern: "*.{json,csv,tsv}" - -output: - - accessions: - - meta: - type: map - description: | - Groovy Map containing sample information. - e.g. `[ id:'sample1' ]` - - "*_accessions.json": - type: file - description: | - JSON file containing the submission results with keys: - submitted (newly created accessions), duplicates (skipped), - modified (force-updated), and failed. 
- pattern: "*_accessions.json" - - versions: - - "versions.yml": - type: file - description: File containing software versions - pattern: "versions.yml" - -authors: - - "@timrozday" -maintainers: - - "@timrozday" diff --git a/modules/local/submit_rawreads_study/tests/main.nf.test b/modules/local/submit_rawreads_study/tests/main.nf.test deleted file mode 100644 index a0cb4fd..0000000 --- a/modules/local/submit_rawreads_study/tests/main.nf.test +++ /dev/null @@ -1,56 +0,0 @@ -nextflow_process { - name "Test Process SUBMIT_RAWREADS_STUDY" - script "../main.nf" - config "./nextflow.config" - process "SUBMIT_RAWREADS_STUDY" - - tag "modules" - tag "submit_rawreads_study" - - test("submit_rawreads_study - stub") { - options "-stub" - - when { - process { - """ - input[0] = [ - [ id:'example_study' ], - file("${projectDir}/assets/test-fixtures/example_study.json", checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - } - - test("submit_rawreads_study - dry run against ENA test server") { - // Validates and builds the submission XML but does not submit to ENA. - // Dummy credentials in tests/nextflow.config are sufficient for dry-run mode. 
- - when { - process { - """ - input[0] = [ - [ id:'example_study' ], - file("${projectDir}/assets/test-fixtures/example_study.json", checkIfExists: true) - ] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert path(process.out.accessions[0][1]).exists() }, - { assert path(process.out.accessions[0][1]).json.submitted instanceof List }, - { assert path(process.out.accessions[0][1]).json.failed.size() == 0 } - ) - } - } -} diff --git a/modules/local/submit_rawreads_study/tests/main.nf.test.snap b/modules/local/submit_rawreads_study/tests/main.nf.test.snap deleted file mode 100644 index 08f7fdb..0000000 --- a/modules/local/submit_rawreads_study/tests/main.nf.test.snap +++ /dev/null @@ -1,35 +0,0 @@ -{ - "submit_rawreads_study - stub": { - "content": [ - { - "0": [ - [ - { - "id": "example_study" - }, - "example_study_accessions.json:md5,e43f257b8a1ffb551bf993867c5f1d7f" - ] - ], - "1": [ - "versions.yml:md5,d7080ded74f0381019a674b865daa329" - ], - "accessions": [ - [ - { - "id": "example_study" - }, - "example_study_accessions.json:md5,e43f257b8a1ffb551bf993867c5f1d7f" - ] - ], - "versions": [ - "versions.yml:md5,d7080ded74f0381019a674b865daa329" - ] - } - ], - "timestamp": "2026-03-12T11:57:10.234715", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.4" - } - } -} \ No newline at end of file diff --git a/nextflow_schema.json b/nextflow_schema.json index dedf312..2ee3d9c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -266,7 +266,7 @@ "format": "file-path", "exists": true, "description": "Path to study metadata file (JSON, CSV, or TSV) for registering a new ENA study. Required when submission_study is not provided.", - "help_text": "File containing study metadata fields (STUDY_TITLE, STUDY_ABSTRACT, existing_study_type, alias, etc.). 
Used by SUBMIT_RAWREADS_STUDY to create a new study in ENA when no existing submission_study accession is given.", + "help_text": "File containing study metadata fields (STUDY_TITLE, STUDY_ABSTRACT, existing_study_type, alias, etc.). Used by REGISTERSTUDY to create a new study in ENA when no existing submission_study accession is given.", "fa_icon": "fas fa-file-alt" }, "webincli_submit": { diff --git a/tests/default.nf.test b/tests/default.nf.test index 919645d..4a3b628 100644 --- a/tests/default.nf.test +++ b/tests/default.nf.test @@ -6,7 +6,7 @@ nextflow_pipeline { test("metagenomic_assemblies mode — submission_study provided (no study registration)") { // Exercises the assembly submission path using a pre-registered study (stub mode). - // SUBMIT_RAWREADS_STUDY is NOT called here; the module-level nf-test covers it. + // REGISTERSTUDY is NOT called here; the module-level nf-test covers it. // // A samplesheet is generated on the fly with absolute paths so that nf-schema // validation succeeds regardless of the nf-test launchDir. @@ -45,8 +45,8 @@ nextflow_pipeline { } } - test("metagenomic_assemblies mode — study_metadata provided (SUBMIT_RAWREADS_STUDY registers study)") { - // Tests the study-registration path in stub mode. SUBMIT_RAWREADS_STUDY stub + test("metagenomic_assemblies mode — study_metadata provided (REGISTERSTUDY registers study)") { + // Tests the study-registration path in stub mode. REGISTERSTUDY stub // outputs an empty accessions JSON, so this test validates the plumbing rather // than the end-to-end submission output. 
options "-stub" @@ -73,7 +73,7 @@ nextflow_pipeline { then { assertAll( { assert workflow.success }, - { assert workflow.trace.succeeded().any { it.name.contains("SUBMIT_RAWREADS_STUDY") } } + { assert workflow.trace.succeeded().any { it.name.contains("REGISTERSTUDY") } } ) } } diff --git a/workflows/assemblysubmit.nf b/workflows/assemblysubmit.nf index b383a6c..ec1309f 100644 --- a/workflows/assemblysubmit.nf +++ b/workflows/assemblysubmit.nf @@ -7,7 +7,7 @@ include { COVERM_CONTIG } from '../modules/nf-core/coverm/contig/main' include { FASTAVALIDATOR } from '../modules/nf-core/fastavalidator/main' include { GENERATE_ASSEMBLY_MANIFEST } from '../modules/local/generate_assembly_manifest/main' -include { SUBMIT_RAWREADS_STUDY } from '../modules/local/submit_rawreads_study/main' +include { REGISTERSTUDY } from '../modules/local/registerstudy/main' include { ENA_WEBIN_CLI } from '../modules/local/ena_webin_cli' include { MULTIQC } from '../modules/nf-core/multiqc/main' @@ -154,11 +154,11 @@ workflow ASSEMBLYSUBMIT { study_accession_ch = channel.of(params.submission_study) } else { // Register a new study using the study metadata file - SUBMIT_RAWREADS_STUDY( + REGISTERSTUDY( channel.of([[id: "study"], file(params.study_metadata)]) ) - ch_versions = ch_versions.mix(SUBMIT_RAWREADS_STUDY.out.versions) - study_accession_ch = SUBMIT_RAWREADS_STUDY.out.accessions + ch_versions = ch_versions.mix(REGISTERSTUDY.out.versions) + study_accession_ch = REGISTERSTUDY.out.accessions .map { _meta, json -> def data = new groovy.json.JsonSlurper().parse(json) data.submitted[0]?.accession diff --git a/workflows/genomesubmit.nf b/workflows/genomesubmit.nf index b34a704..e9b17bb 100644 --- a/workflows/genomesubmit.nf +++ b/workflows/genomesubmit.nf @@ -5,7 +5,7 @@ */ include { GENOME_UPLOAD } from '../modules/local/genome_upload' include { ENA_WEBIN_CLI } from '../modules/local/ena_webin_cli' -include { SUBMIT_RAWREADS_STUDY } from '../modules/local/submit_rawreads_study/main' +include { 
REGISTERSTUDY } from '../modules/local/registerstudy/main' include { RNA_DETECTION } from '../subworkflows/local/rna_detection' @@ -114,11 +114,11 @@ workflow GENOMESUBMIT { if (params.submission_study) { study_accession_ch = channel.of(params.submission_study) } else { - SUBMIT_RAWREADS_STUDY( + REGISTERSTUDY( channel.of([[id: "study"], file(params.study_metadata)]) ) - ch_versions = ch_versions.mix(SUBMIT_RAWREADS_STUDY.out.versions) - study_accession_ch = SUBMIT_RAWREADS_STUDY.out.accessions + ch_versions = ch_versions.mix(REGISTERSTUDY.out.versions) + study_accession_ch = REGISTERSTUDY.out.accessions .map { _meta, json -> def data = new groovy.json.JsonSlurper().parse(json) data.submitted[0]?.accession From cc840dedea7bc77b175b44f163ed0a4b8a45cab4 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Thu, 12 Mar 2026 14:09:30 +0000 Subject: [PATCH 09/36] Remove pytest from bin --- bin/test_submit_study.py | 1420 -------------------------------------- 1 file changed, 1420 deletions(-) delete mode 100644 bin/test_submit_study.py diff --git a/bin/test_submit_study.py b/bin/test_submit_study.py deleted file mode 100644 index d021383..0000000 --- a/bin/test_submit_study.py +++ /dev/null @@ -1,1420 +0,0 @@ -#!/usr/bin/env python3 -"""Tests for submit_study.py — ENA study submission pipeline. - -Covers: - A. Unit tests for build_submission_xml and _add_project_element - B. Unit tests for validate_study_xml - C. Unit tests for parse_xml_receipt - D. Unit tests for find_duplicate_studies and fetch_account_studies - E. CLI integration tests for main() using click.testing.CliRunner - -Usage: - pytest bin/test_submit_study.py -v - -All external I/O (HTTP requests, ENA reports API) is mocked. Tests do NOT -import from ena_submit_common directly — all assertions go through the public -API of submit_study. 
-""" - -from __future__ import annotations - -import json -import os -import sys -import xml.etree.ElementTree as ET -from pathlib import Path -from textwrap import dedent -from typing import Any -from unittest.mock import MagicMock, patch - -import pytest -from click.testing import CliRunner -from requests.auth import HTTPBasicAuth - -# Ensure the scripts directory is on the path before importing the module. -sys.path.insert(0, os.path.dirname(__file__)) - -from bin.submit_study import ( # noqa: E402 - _normalize_study_report, - build_submission_xml, - fetch_account_studies, - find_duplicate_studies, - main, - parse_xml_receipt, - validate_study_xml, -) - -# --------------------------------------------------------------------------- -# Constants shared across test groups -# --------------------------------------------------------------------------- - -_PROD_REPORTS_URL = "https://www.ebi.ac.uk/ena/submit/report/projects" -_TEST_REPORTS_URL = "https://wwwdev.ebi.ac.uk/ena/submit/report/projects" - -# --------------------------------------------------------------------------- -# Shared fixtures -# --------------------------------------------------------------------------- - - -@pytest.fixture -def basic_study() -> dict[str, Any]: - """Return a minimal valid study metadata dict.""" - return { - "alias": "test-study-001", - "STUDY_TITLE": "A Basic Test Study", - "STUDY_ABSTRACT": "An abstract for the test study.", - "CENTER_PROJECT_NAME": "My Centre Project", - "existing_study_type": "Metagenomics", - } - - -@pytest.fixture -def metagenomics_assembly_study() -> dict[str, Any]: - """Return a study dict representing a metagenomics assembly submission.""" - return { - "alias": "metagenome-assembly-001", - "STUDY_TITLE": "Primary Metagenome Assembly of Soil Sample", - "STUDY_ABSTRACT": "Assembly of contigs from metagenome sequencing of soil.", - "CENTER_PROJECT_NAME": "Soil Metagenome Project", - "existing_study_type": "Metagenomics", - } - - -@pytest.fixture -def 
mag_genome_study() -> dict[str, Any]: - """Return a study dict representing a MAG/genome submission.""" - return { - "alias": "mag-genome-001", - "STUDY_TITLE": "Metagenome-Assembled Genome from Soil Microbiome", - "STUDY_ABSTRACT": "A high-quality MAG reconstructed from binned metagenome data.", - "existing_study_type": "Other", - "new_study_type": "Genome Sequencing", - } - - -@pytest.fixture -def mock_credentials() -> tuple[str, str]: - """Return mock ENA credentials.""" - return ("Webin-12345", "pass") - - -@pytest.fixture -def auth(mock_credentials: tuple[str, str]) -> HTTPBasicAuth: - """Return mock HTTPBasicAuth built from mock credentials.""" - return HTTPBasicAuth(*mock_credentials) - - -@pytest.fixture -def account_study_record() -> dict[str, str]: - """Return a normalised account study record as returned by the Reports API.""" - return { - "title": "Existing Study Title", - "alias": "existing-study-alias", - "accession": "PRJEB99001", - "secondary_accession": "ERP099001", - "status": "PRIVATE", - } - - -# --------------------------------------------------------------------------- -# A. 
Unit tests for build_submission_xml and _add_project_element -# --------------------------------------------------------------------------- - - -class TestBuildSubmissionXml: - """Unit tests for build_submission_xml and _add_project_element.""" - - # ---- helper ------------------------------------------------------- - - @staticmethod - def _to_str(root: ET.Element) -> str: - """Serialise an ElementTree element to a UTF-8 string.""" - return ET.tostring(root, encoding="unicode") - - # ---- A1: Basic study fields ------------------------------------------- - - def test_study_title_round_trips(self, basic_study: dict[str, Any]) -> None: - """STUDY_TITLE is written as the TITLE element text.""" - root = build_submission_xml([basic_study]) - title_el = root.find(".//TITLE") - assert title_el is not None - assert title_el.text == basic_study["STUDY_TITLE"] - - def test_study_abstract_round_trips(self, basic_study: dict[str, Any]) -> None: - """STUDY_ABSTRACT is written as the DESCRIPTION element text.""" - root = build_submission_xml([basic_study]) - desc_el = root.find(".//DESCRIPTION") - assert desc_el is not None - assert desc_el.text == basic_study["STUDY_ABSTRACT"] - - def test_alias_round_trips(self, basic_study: dict[str, Any]) -> None: - """The alias attribute on PROJECT matches the input alias.""" - root = build_submission_xml([basic_study]) - project_el = root.find(".//PROJECT") - assert project_el is not None - assert project_el.get("alias") == basic_study["alias"] - - def test_center_project_name_round_trips(self, basic_study: dict[str, Any]) -> None: - """CENTER_PROJECT_NAME is written as the NAME element text.""" - root = build_submission_xml([basic_study]) - name_el = root.find(".//NAME") - assert name_el is not None - assert name_el.text == basic_study["CENTER_PROJECT_NAME"] - - def test_submission_project_present(self, basic_study: dict[str, Any]) -> None: - """SUBMISSION_PROJECT with SEQUENCING_PROJECT is always present.""" - root = 
build_submission_xml([basic_study]) - sp_el = root.find(".//SUBMISSION_PROJECT") - assert sp_el is not None - seq_el = sp_el.find("SEQUENCING_PROJECT") - assert seq_el is not None - - # ---- A2: Study type PROJECT_ATTRIBUTEs -------------------------------- - - def test_existing_study_type_emitted_as_project_attribute( - self, basic_study: dict[str, Any] - ) -> None: - """existing_study_type is emitted as a PROJECT_ATTRIBUTE TAG/VALUE pair.""" - root = build_submission_xml([basic_study]) - xml_str = self._to_str(root) - assert "existing_study_type" in xml_str - assert basic_study["existing_study_type"] in xml_str - - def test_new_study_type_absent_when_not_other(self, basic_study: dict[str, Any]) -> None: - """new_study_type is NOT emitted when existing_study_type != 'Other'.""" - study = dict(basic_study) - study["new_study_type"] = "Genome Sequencing" - root = build_submission_xml([study]) - xml_str = self._to_str(root) - assert "new_study_type" not in xml_str - - def test_new_study_type_present_when_existing_is_other( - self, mag_genome_study: dict[str, Any] - ) -> None: - """new_study_type appears as a PROJECT_ATTRIBUTE when existing_study_type == 'Other'.""" - root = build_submission_xml([mag_genome_study]) - tags = [ - el.text - for el in root.findall(".//PROJECT_ATTRIBUTE/TAG") - if el.text is not None - ] - values = [ - el.text - for el in root.findall(".//PROJECT_ATTRIBUTE/VALUE") - if el.text is not None - ] - assert "existing_study_type" in tags - assert "new_study_type" in tags - assert "Other" in values - assert "Genome Sequencing" in values - - def test_no_project_attributes_when_no_study_type(self) -> None: - """No PROJECT_ATTRIBUTES element when existing_study_type is absent.""" - study = { - "alias": "no-type", - "STUDY_TITLE": "No Type Study", - } - root = build_submission_xml([study]) - attrs_el = root.find(".//PROJECT_ATTRIBUTES") - assert attrs_el is None - - # ---- A3: Hold date ---------------------------------------------------- - - def 
test_hold_until_present_in_submission(self, basic_study: dict[str, Any]) -> None: - """When hold_until is given, HOLD element with HoldUntilDate appears in SUBMISSION.""" - root = build_submission_xml([basic_study], hold_until="2028-06-15") - hold_el = root.find(".//HOLD") - assert hold_el is not None - assert hold_el.get("HoldUntilDate") == "2028-06-15" - - def test_hold_until_absent_when_not_provided(self, basic_study: dict[str, Any]) -> None: - """When hold_until is not given, no HOLD element appears.""" - root = build_submission_xml([basic_study]) - hold_el = root.find(".//HOLD") - assert hold_el is None - - # ---- A4: MODIFY action ------------------------------------------------ - - def test_modify_action_produces_modify_element(self, basic_study: dict[str, Any]) -> None: - """Using action='MODIFY' produces a MODIFY element instead of ADD.""" - root = build_submission_xml([basic_study], action="MODIFY") - xml_str = self._to_str(root) - assert "" in xml_str - - def test_add_action_produces_add_element(self, basic_study: dict[str, Any]) -> None: - """Default action='ADD' produces an ADD element.""" - root = build_submission_xml([basic_study]) - xml_str = self._to_str(root) - assert "" in xml_str - - def test_modify_action_does_not_produce_add(self, basic_study: dict[str, Any]) -> None: - """MODIFY action does not produce an ADD element.""" - root = build_submission_xml([basic_study], action="MODIFY") - xml_str = self._to_str(root) - # Strip the XML preamble to avoid false positives in attributes - assert "" not in xml_str - - # ---- A5: Assembly/metagenomics study ---------------------------------- - - def test_metagenomics_assembly_study_round_trips( - self, metagenomics_assembly_study: dict[str, Any] - ) -> None: - """Metagenomics assembly study dict round-trips correctly into XML.""" - root = build_submission_xml([metagenomics_assembly_study]) - project_el = root.find(".//PROJECT") - assert project_el is not None - assert project_el.get("alias") == 
metagenomics_assembly_study["alias"] - - title_el = root.find(".//TITLE") - assert title_el is not None - assert title_el.text == metagenomics_assembly_study["STUDY_TITLE"] - - tags = [ - el.text for el in root.findall(".//PROJECT_ATTRIBUTE/TAG") if el.text - ] - values = [ - el.text for el in root.findall(".//PROJECT_ATTRIBUTE/VALUE") if el.text - ] - assert "existing_study_type" in tags - assert "Metagenomics" in values - - # ---- A6: MAG/genome study with Other + new_study_type ----------------- - - def test_mag_genome_study_has_both_project_attributes( - self, mag_genome_study: dict[str, Any] - ) -> None: - """MAG/genome study with existing_study_type=Other produces both PROJECT_ATTRIBUTEs.""" - root = build_submission_xml([mag_genome_study]) - attr_els = root.findall(".//PROJECT_ATTRIBUTE") - assert len(attr_els) == 2 - - pairs: dict[str, str] = {} - for attr_el in attr_els: - tag_el = attr_el.find("TAG") - val_el = attr_el.find("VALUE") - if tag_el is not None and val_el is not None: - pairs[tag_el.text or ""] = val_el.text or "" - - assert pairs.get("existing_study_type") == "Other" - assert pairs.get("new_study_type") == "Genome Sequencing" - - # ---- Multiple studies in one call ------------------------------------- - - def test_multiple_studies_produce_multiple_project_elements( - self, - basic_study: dict[str, Any], - metagenomics_assembly_study: dict[str, Any], - ) -> None: - """Multiple studies in input produce multiple PROJECT elements.""" - root = build_submission_xml([basic_study, metagenomics_assembly_study]) - projects = root.findall(".//PROJECT") - assert len(projects) == 2 - - # ---- Alias auto-derived from title when absent ------------------------ - - def test_alias_derived_from_title_when_absent(self) -> None: - """When no alias is provided, alias is derived from STUDY_TITLE (spaces→underscores).""" - study = {"STUDY_TITLE": "My Derived Title"} - root = build_submission_xml([study]) - project_el = root.find(".//PROJECT") - assert project_el 
is not None - alias = project_el.get("alias", "") - assert "_" in alias or alias == "My_Derived_Title"[:50] - - -# --------------------------------------------------------------------------- -# B. Unit tests for validate_study_xml -# --------------------------------------------------------------------------- - - -class TestValidateStudyXml: - """Unit tests for validate_study_xml.""" - - @staticmethod - def _build_valid_xml_bytes(alias: str = "study-1", title: str = "Test Study") -> bytes: - """Build a minimal valid study XML document as bytes. - - Args: - alias: The PROJECT alias attribute value. - title: The TITLE element text. - - Returns: - UTF-8 encoded XML bytes. - """ - xml_str = dedent(f"""\ - - - - - {title} - - - - - - - """) - return xml_str.encode("utf-8") - - # ---- B7: Valid XML passes --------------------------------------------- - - def test_valid_assembly_study_xml_passes(self) -> None: - """A valid assembly study XML passes validation without errors.""" - xml_bytes = self._build_valid_xml_bytes( - alias="assembly-study", title="Assembly Study Title" - ) - is_valid, messages = validate_study_xml(xml_bytes) - assert is_valid, f"Expected valid; messages: {messages}" - - def test_valid_metagenomics_xml_passes(self) -> None: - """Well-formed XML with required elements passes validation.""" - study = { - "alias": "meta-study", - "STUDY_TITLE": "Metagenomics Study", - "existing_study_type": "Metagenomics", - } - import ena_submit_common as _common # local import; only for xml_to_bytes helper - - root = build_submission_xml([study]) - xml_bytes = _common.xml_to_bytes(root) - is_valid, messages = validate_study_xml(xml_bytes) - assert is_valid, f"Expected valid; messages: {messages}" - - # ---- B8: Missing TITLE ------------------------------------------------ - - def test_missing_title_fails_with_title_in_message(self) -> None: - """A PROJECT without a TITLE element fails validation with 'TITLE' in the message.""" - xml_str = dedent("""\ - - - - - - - - - 
""") - is_valid, messages = validate_study_xml(xml_str.encode("utf-8")) - assert not is_valid - combined = " ".join(messages) - assert "TITLE" in combined - - # ---- B9: Missing SUBMISSION_PROJECT ----------------------------------- - - def test_missing_submission_project_fails(self) -> None: - """A PROJECT without SUBMISSION_PROJECT fails with 'SUBMISSION_PROJECT' in message.""" - xml_str = dedent("""\ - - - - - Some Title - - - - """) - is_valid, messages = validate_study_xml(xml_str.encode("utf-8")) - assert not is_valid - combined = " ".join(messages) - assert "SUBMISSION_PROJECT" in combined - - # ---- B10: Malformed XML ----------------------------------------------- - - def test_malformed_xml_fails_with_not_well_formed_message(self) -> None: - """Malformed XML fails validation with 'not well-formed' in the message.""" - bad_xml = b"Unclosed" - is_valid, messages = validate_study_xml(bad_xml) - assert not is_valid - combined = " ".join(messages).lower() - assert "not well-formed" in combined or "well-formed" in combined - - # ---- Extra structural checks ----------------------------------------- - - def test_empty_title_fails_validation(self) -> None: - """A PROJECT with an empty TITLE element fails validation.""" - xml_str = dedent("""\ - <?xml version='1.0' encoding='UTF-8'?> - <WEBIN> - <PROJECT_SET> - <PROJECT alias="empty-title"> - <TITLE> - - - - - """) - is_valid, messages = validate_study_xml(xml_str.encode("utf-8")) - assert not is_valid - - def test_missing_project_set_fails_validation(self) -> None: - """XML without a PROJECT_SET element fails validation.""" - xml_str = b"" - is_valid, messages = validate_study_xml(xml_str) - assert not is_valid - - def test_validation_returns_tuple_of_bool_and_list(self) -> None: - """validate_study_xml always returns (bool, list).""" - xml_bytes = self._build_valid_xml_bytes() - result = validate_study_xml(xml_bytes) - assert isinstance(result, tuple) - assert len(result) == 2 - is_valid, messages = result - 
assert isinstance(is_valid, bool) - assert isinstance(messages, list) - - -# --------------------------------------------------------------------------- -# C. Unit tests for parse_xml_receipt -# --------------------------------------------------------------------------- - - -class TestParseXmlReceipt: - """Unit tests for parse_xml_receipt.""" - - @staticmethod - def _parse(xml_str: str) -> tuple[bool, list[dict[str, str]], list[str]]: - """Parse an XML receipt string via parse_xml_receipt. - - Args: - xml_str: Raw XML receipt string. - - Returns: - Tuple of (success, accessions, messages). - """ - root = ET.fromstring(xml_str) - return parse_xml_receipt(root) - - # ---- C11: Successful PROJECT receipt ---------------------------------- - - def test_successful_project_receipt_returns_true(self) -> None: - """A success='true' receipt returns success=True.""" - xml_str = dedent("""\ - - - - - - """) - success, accessions, messages = self._parse(xml_str) - assert success is True - - def test_successful_project_receipt_accession_round_trips(self) -> None: - """PROJECT accession, alias, status, holdUntilDate, and external_accession round-trip.""" - xml_str = dedent("""\ - - - - - - """) - success, accessions, messages = self._parse(xml_str) - assert len(accessions) == 1 - acc = accessions[0] - assert acc["accession"] == "PRJEB12345" - assert acc["alias"] == "my-study" - assert acc["status"] == "PRIVATE" - assert acc["holdUntilDate"] == "2025-01-15" - assert acc["external_accession"] == "ERP012345" - assert acc["external_type"] == "study" - - # ---- C12: Failed receipt ---------------------------------------------- - - def test_failed_receipt_returns_false(self) -> None: - """A success='false' receipt returns success=False.""" - xml_str = dedent("""\ - - - Center name "Unknown" is not permitted to submit in Webin-12345. 
- - - """) - success, accessions, messages = self._parse(xml_str) - assert success is False - - def test_failed_receipt_captures_error_message(self) -> None: - """Error text from MESSAGES/ERROR is captured in the messages list.""" - xml_str = dedent("""\ - - - Submission failed due to duplicate alias. - - - """) - _, _, messages = self._parse(xml_str) - assert any("Submission failed due to duplicate alias" in m for m in messages) - - # ---- C13: STUDY tag (alternate ENA format) ---------------------------- - - def test_study_tag_receipt_extracts_accession_and_alias(self) -> None: - """Receipts using STUDY instead of PROJECT still extract accession and alias.""" - xml_str = dedent("""\ - - - - """) - success, accessions, messages = self._parse(xml_str) - assert success is True - assert len(accessions) == 1 - assert accessions[0]["accession"] == "ERP099999" - assert accessions[0]["alias"] == "study-alias-1" - - # ---- C14: MESSAGES with INFO and ERROR -------------------------------- - - def test_receipt_with_info_messages_captured(self) -> None: - """INFO elements in MESSAGES are captured in the messages list.""" - xml_str = dedent("""\ - - - - Submission processed successfully. - - - """) - _, _, messages = self._parse(xml_str) - assert any("Submission processed successfully" in m for m in messages) - assert any(m.startswith("INFO:") for m in messages) - - def test_receipt_with_multiple_error_messages(self) -> None: - """Multiple ERROR elements are all captured.""" - xml_str = dedent("""\ - - - First error. - Second error. - - - """) - _, _, messages = self._parse(xml_str) - error_msgs = [m for m in messages if m.startswith("ERROR:")] - assert len(error_msgs) == 2 - - def test_receipt_both_info_and_error_captured(self) -> None: - """Both INFO and ERROR elements are captured in messages.""" - xml_str = dedent("""\ - - - Partial success. - Some records failed. 
- - - """) - _, _, messages = self._parse(xml_str) - assert any(m.startswith("INFO:") for m in messages) - assert any(m.startswith("ERROR:") for m in messages) - - def test_receipt_no_messages_element_returns_empty_list(self) -> None: - """A receipt without a MESSAGES element returns an empty messages list.""" - xml_str = dedent("""\ - - - - """) - _, _, messages = self._parse(xml_str) - assert messages == [] - - def test_receipt_success_false_string(self) -> None: - """Receipts with success='false' (string) correctly parse to False.""" - xml_str = "" - success, _, _ = self._parse(xml_str) - assert success is False - - def test_receipt_missing_success_defaults_to_false(self) -> None: - """A receipt without a success attribute defaults to False.""" - xml_str = "" - success, _, _ = self._parse(xml_str) - assert success is False - - -# --------------------------------------------------------------------------- -# D. Unit tests for find_duplicate_studies and fetch_account_studies -# --------------------------------------------------------------------------- - - -class TestFindDuplicateStudies: - """Unit tests for find_duplicate_studies.""" - - @staticmethod - def _account_record( - title: str = "", - alias: str = "", - accession: str = "PRJEB00001", - status: str = "PRIVATE", - ) -> dict[str, str]: - """Build a normalised account study record. - - Args: - title: Study title (as returned by Reports API normalizer). - alias: Study alias. - accession: ENA project accession. - status: Release status. - - Returns: - Normalised study dict. 
- """ - return { - "title": title, - "alias": alias, - "accession": accession, - "secondary_accession": "", - "status": status, - } - - # ---- D15: Exact alias match ------------------------------------------ - - def test_exact_alias_match_detected_as_duplicate(self) -> None: - """An exact alias match is detected as a duplicate.""" - new_studies = [{"STUDY_TITLE": "Different Title", "alias": "my-alias-x"}] - account = [self._account_record(title="Other", alias="my-alias-x", accession="PRJEB10")] - dups = find_duplicate_studies(new_studies, account) - assert 0 in dups - assert dups[0]["accession"] == "PRJEB10" - assert "alias" in dups[0]["match_reason"] - - # ---- D16: Exact title match ------------------------------------------ - - def test_exact_title_match_detected_as_duplicate(self) -> None: - """An exact STUDY_TITLE match is detected as a duplicate.""" - new_studies = [{"STUDY_TITLE": "My Metagenomics Study"}] - account = [ - self._account_record(title="My Metagenomics Study", accession="PRJEB20") - ] - dups = find_duplicate_studies(new_studies, account) - assert 0 in dups - assert dups[0]["accession"] == "PRJEB20" - assert "title" in dups[0]["match_reason"] - - # ---- D17: No match returns empty dict -------------------------------- - - def test_no_match_returns_empty_dict(self) -> None: - """When neither alias nor title matches, an empty dict is returned.""" - new_studies = [{"STUDY_TITLE": "Completely Novel Study", "alias": "novel-alias"}] - account = [self._account_record(title="Existing Study", alias="existing-alias")] - dups = find_duplicate_studies(new_studies, account) - assert dups == {} - - def test_empty_account_returns_empty_dict(self) -> None: - """Empty account list results in no duplicates.""" - new_studies = [{"STUDY_TITLE": "Any Study"}] - dups = find_duplicate_studies(new_studies, []) - assert dups == {} - - def test_empty_new_studies_returns_empty_dict(self) -> None: - """Empty new studies list results in no duplicates.""" - account = 
[self._account_record(title="Existing")] - dups = find_duplicate_studies([], account) - assert dups == {} - - def test_study_without_title_or_alias_not_flagged(self) -> None: - """A study dict with neither title nor alias is not flagged as duplicate.""" - new_studies = [{"IS_PRIMARY": "YES"}] # no STUDY_TITLE, no alias - account = [self._account_record(title="Existing")] - dups = find_duplicate_studies(new_studies, account) - assert dups == {} - - def test_partial_title_not_a_duplicate(self) -> None: - """A partial title match does not count as a duplicate (exact match only).""" - new_studies = [{"STUDY_TITLE": "Metagenomics"}] - account = [self._account_record(title="Metagenomics Assembly Study")] - dups = find_duplicate_studies(new_studies, account) - assert dups == {} - - def test_multiple_studies_only_matching_flagged(self) -> None: - """Only the matching study is flagged when multiple new studies are submitted.""" - account = [self._account_record(title="Old Study", alias="old-alias", accession="PRJEB50")] - new_studies = [ - {"STUDY_TITLE": "Old Study"}, - {"STUDY_TITLE": "New Study"}, - ] - dups = find_duplicate_studies(new_studies, account) - assert 0 in dups - assert 1 not in dups - - def test_duplicate_index_corresponds_to_new_studies_list(self) -> None: - """The index in the duplicates dict matches the position in new_studies.""" - account = [self._account_record(title="Study C", accession="PRJEB33")] - new_studies = [ - {"STUDY_TITLE": "Study A"}, - {"STUDY_TITLE": "Study B"}, - {"STUDY_TITLE": "Study C"}, - ] - dups = find_duplicate_studies(new_studies, account) - assert 2 in dups - assert dups[2]["accession"] == "PRJEB33" - - -# --------------------------------------------------------------------------- -# D18: _normalize_study_report and fetch_account_studies -# --------------------------------------------------------------------------- - - -class TestNormalizeStudyReport: - """Unit tests for _normalize_study_report field normalisation.""" - - def 
test_title_field_normalised(self) -> None: - """The 'title' field is extracted from the raw report dict.""" - report = {"title": "My Title", "alias": "my-alias", "accession": "PRJEB1"} - result = _normalize_study_report(report) - assert result["title"] == "My Title" - - def test_study_title_fallback(self) -> None: - """studyTitle is used when 'title' is absent.""" - report = {"studyTitle": "Study Title Fallback", "alias": "a", "accession": "PRJEB2"} - result = _normalize_study_report(report) - assert result["title"] == "Study Title Fallback" - - def test_alias_field_normalised(self) -> None: - """The 'alias' field is extracted.""" - report = {"title": "T", "alias": "direct-alias", "accession": "PRJEB3"} - result = _normalize_study_report(report) - assert result["alias"] == "direct-alias" - - def test_study_alias_fallback(self) -> None: - """studyAlias is used when 'alias' is absent.""" - report = {"title": "T", "studyAlias": "study-alias-fallback", "accession": "PRJEB4"} - result = _normalize_study_report(report) - assert result["alias"] == "study-alias-fallback" - - def test_accession_field_normalised(self) -> None: - """The 'accession' field is extracted.""" - report = {"title": "T", "alias": "a", "accession": "PRJEB5"} - result = _normalize_study_report(report) - assert result["accession"] == "PRJEB5" - - def test_study_accession_fallback(self) -> None: - """studyAccession is used when 'accession' is absent.""" - report = {"title": "T", "alias": "a", "studyAccession": "PRJEB99"} - result = _normalize_study_report(report) - assert result["accession"] == "PRJEB99" - - def test_missing_fields_default_to_empty_string(self) -> None: - """Missing fields default to empty string without raising.""" - report = {} - result = _normalize_study_report(report) - assert result["title"] == "" - assert result["alias"] == "" - assert result["accession"] == "" - - def test_status_field_defaults_to_unknown(self) -> None: - """The status field defaults to 'UNKNOWN' when absent.""" - 
report = {"title": "T", "alias": "a", "accession": "PRJEB6"} - result = _normalize_study_report(report) - assert result["status"] == "UNKNOWN" - - def test_release_status_used_for_status(self) -> None: - """releaseStatus is mapped to the 'status' key.""" - report = {"title": "T", "alias": "a", "accession": "PRJEB7", "releaseStatus": "PUBLIC"} - result = _normalize_study_report(report) - assert result["status"] == "PUBLIC" - - -class TestFetchAccountStudies: - """Unit tests for fetch_account_studies calling common.fetch_account_records.""" - - def test_fetch_calls_fetch_account_records_with_correct_urls( - self, auth: HTTPBasicAuth - ) -> None: - """fetch_account_studies calls common.fetch_account_records with prod/test URLs.""" - target = "submit_study.common.fetch_account_records" - with patch(target, return_value=[]) as mock_fetch: - fetch_account_studies(auth, use_test=False) - mock_fetch.assert_called_once() - call_kwargs = mock_fetch.call_args - assert call_kwargs.kwargs.get("prod_url") == _PROD_REPORTS_URL - assert call_kwargs.kwargs.get("test_url") == _TEST_REPORTS_URL - - def test_fetch_passes_normalizer_callable(self, auth: HTTPBasicAuth) -> None: - """fetch_account_studies passes a callable normalizer to fetch_account_records.""" - target = "submit_study.common.fetch_account_records" - with patch(target, return_value=[]) as mock_fetch: - fetch_account_studies(auth, use_test=False) - call_kwargs = mock_fetch.call_args - normalizer = call_kwargs.kwargs.get("normalizer") - assert callable(normalizer) - - def test_fetch_normalizer_handles_title_variant(self, auth: HTTPBasicAuth) -> None: - """The normalizer passed to fetch_account_records handles title/studyTitle variants.""" - target = "submit_study.common.fetch_account_records" - captured_normalizer = None - - def capture_normalizer(*args: Any, **kwargs: Any) -> list[dict[str, str]]: - nonlocal captured_normalizer - captured_normalizer = kwargs.get("normalizer") - return [] - - with patch(target, 
side_effect=capture_normalizer): - fetch_account_studies(auth, use_test=False) - - assert captured_normalizer is not None - result_title = captured_normalizer({"title": "Direct Title", "accession": "PRJEB1"}) - assert result_title["title"] == "Direct Title" - - result_study_title = captured_normalizer( - {"studyTitle": "Fallback Title", "accession": "PRJEB2"} - ) - assert result_study_title["title"] == "Fallback Title" - - def test_fetch_normalizer_handles_alias_variant(self, auth: HTTPBasicAuth) -> None: - """The normalizer handles alias/studyAlias field variants.""" - target = "submit_study.common.fetch_account_records" - captured_normalizer = None - - def capture_normalizer(*args: Any, **kwargs: Any) -> list[dict[str, str]]: - nonlocal captured_normalizer - captured_normalizer = kwargs.get("normalizer") - return [] - - with patch(target, side_effect=capture_normalizer): - fetch_account_studies(auth, use_test=False) - - assert captured_normalizer is not None - result = captured_normalizer({"alias": "direct-alias", "accession": "PRJEB3"}) - assert result["alias"] == "direct-alias" - - result_fallback = captured_normalizer( - {"studyAlias": "study-alias-fallback", "accession": "PRJEB4"} - ) - assert result_fallback["alias"] == "study-alias-fallback" - - def test_fetch_normalizer_handles_accession_variant(self, auth: HTTPBasicAuth) -> None: - """The normalizer handles accession/studyAccession field variants.""" - target = "submit_study.common.fetch_account_records" - captured_normalizer = None - - def capture_normalizer(*args: Any, **kwargs: Any) -> list[dict[str, str]]: - nonlocal captured_normalizer - captured_normalizer = kwargs.get("normalizer") - return [] - - with patch(target, side_effect=capture_normalizer): - fetch_account_studies(auth, use_test=False) - - assert captured_normalizer is not None - result = captured_normalizer( - {"title": "T", "studyAccession": "PRJEB99", "accession": ""} - ) - # studyAccession falls back when 'accession' is falsy - assert 
result["accession"] == "PRJEB99" - - -# --------------------------------------------------------------------------- -# E. CLI integration tests for main() using click.testing.CliRunner -# --------------------------------------------------------------------------- - - -def _extract_json_from_output(output: str) -> dict[str, Any]: - """Extract the JSON results dict from mixed CLI output. - - The CLI writes JSON results via ``print()`` to stdout, but logging - also emits to stderr which CliRunner captures in ``result.output``. - This helper finds the last top-level JSON object in the output. - - Args: - output: The full ``result.output`` string from CliRunner. - - Returns: - Parsed JSON dict. - - Raises: - ValueError: If no valid JSON object is found. - """ - # Walk backwards through the output looking for a complete JSON block. - # The results JSON always starts with "{\n " and ends with "\n}". - depth = 0 - end = -1 - start = -1 - for i in range(len(output) - 1, -1, -1): - ch = output[i] - if ch == "}": - if depth == 0: - end = i - depth += 1 - elif ch == "{": - depth -= 1 - if depth == 0: - start = i - break - if start == -1 or end == -1: - raise ValueError(f"No JSON object found in output: {output[:200]!r}") - return json.loads(output[start : end + 1]) - - -def _make_study_json(study: dict[str, Any]) -> str: - """Serialise a study dict into a JSON string using the Container format. - - Args: - study: Study metadata dict. - - Returns: - JSON string in DataHarmonizer Container format. - """ - return json.dumps({ - "Container": { - "SRA_studys": [study], - } - }) - - -def _make_study_csv(study: dict[str, Any]) -> str: - """Serialise a study dict into a minimal CSV string. - - Args: - study: Study metadata dict. - - Returns: - CSV string with header and one data row. 
- """ - headers = list(study.keys()) - values = [str(study[h]) for h in headers] - return ",".join(headers) + "\n" + ",".join(values) + "\n" - - -def _make_study_tsv(study: dict[str, Any]) -> str: - """Serialise a study dict into a minimal TSV string. - - Args: - study: Study metadata dict. - - Returns: - TSV string with header and one data row. - """ - headers = list(study.keys()) - values = [str(study[h]) for h in headers] - return "\t".join(headers) + "\n" + "\t".join(values) + "\n" - - -@pytest.fixture -def runner() -> CliRunner: - """Return a Click test runner with isolated filesystem.""" - return CliRunner() - - -@pytest.fixture -def minimal_metagenomics_study() -> dict[str, Any]: - """Return a minimal metagenomics study for CLI tests.""" - return { - "alias": "cli-metagenomics-001", - "STUDY_TITLE": "CLI Metagenomics Test Study", - "STUDY_ABSTRACT": "Abstract for CLI test.", - "existing_study_type": "Metagenomics", - } - - -class TestMainCli: - """CLI integration tests for main() using CliRunner.""" - - _CRED_TARGET = "submit_study.common.get_credentials" - _SUBMIT_TARGET = "submit_study.common.submit_xml" - - def _invoke( - self, - runner: CliRunner, - args: list[str], - input_filename: str, - input_content: str, - ) -> Any: - """Write input file and invoke the CLI. - - Args: - runner: Click CliRunner instance. - args: CLI arguments (excluding --input, which is added automatically). - input_filename: Filename for the temporary input file. - input_content: Content to write to the input file. - - Returns: - Click Result object. 
- """ - with runner.isolated_filesystem(): - Path(input_filename).write_text(input_content) - result = runner.invoke( - main, - ["--input", input_filename] + args, - catch_exceptions=False, - ) - return result - - # ---- E19: JSON input, automated mode, dry-run ------------------------- - - def test_json_input_automated_dry_run_exits_0( - self, - runner: CliRunner, - minimal_metagenomics_study: dict[str, Any], - ) -> None: - """JSON input with --automated --dry-run exits 0 and output has 'submitted' key.""" - content = _make_study_json(minimal_metagenomics_study) - with patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")): - result = self._invoke( - runner, ["--automated", "--dry-run"], "studies.json", content - ) - assert result.exit_code == 0, f"stdout: {result.output}" - data = _extract_json_from_output(result.output) - assert "submitted" in data - - # ---- E20: CSV input --------------------------------------------------- - - def test_csv_input_automated_dry_run_exits_0( - self, - runner: CliRunner, - minimal_metagenomics_study: dict[str, Any], - ) -> None: - """CSV input with --automated --dry-run exits 0 and output has 'submitted' key.""" - content = _make_study_csv(minimal_metagenomics_study) - with patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")): - result = self._invoke( - runner, ["--automated", "--dry-run"], "studies.csv", content - ) - assert result.exit_code == 0, f"stdout: {result.output}" - data = _extract_json_from_output(result.output) - assert "submitted" in data - - # ---- E21: TSV input --------------------------------------------------- - - def test_tsv_input_automated_dry_run_exits_0( - self, - runner: CliRunner, - minimal_metagenomics_study: dict[str, Any], - ) -> None: - """TSV input with --automated --dry-run exits 0 and output has 'submitted' key.""" - content = _make_study_tsv(minimal_metagenomics_study) - with patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")): - result = self._invoke( - runner, 
["--automated", "--dry-run"], "studies.tsv", content - ) - assert result.exit_code == 0, f"stdout: {result.output}" - data = _extract_json_from_output(result.output) - assert "submitted" in data - - # ---- E22: Duplicate detection ----------------------------------------- - - def test_duplicate_detection_records_duplicate_and_skips_submission( - self, - runner: CliRunner, - minimal_metagenomics_study: dict[str, Any], - ) -> None: - """When account already has a matching study, duplicate is recorded; nothing submitted.""" - existing = { - "title": minimal_metagenomics_study["STUDY_TITLE"], - "alias": minimal_metagenomics_study["alias"], - "accession": "PRJEB55555", - "secondary_accession": "ERP055555", - "status": "PRIVATE", - } - content = _make_study_json(minimal_metagenomics_study) - with runner.isolated_filesystem(): - Path("studies.json").write_text(content) - with ( - patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")), - patch( - "submit_study.fetch_account_studies", - return_value=[existing], - ), - ): - result = runner.invoke( - main, - ["--input", "studies.json"], - catch_exceptions=False, - ) - assert result.exit_code == 0, f"stdout: {result.output}" - data = _extract_json_from_output(result.output) - assert len(data["duplicates"]) == 1 - assert data["duplicates"][0]["existing_accession"] == "PRJEB55555" - assert data["submitted"] == [] - - # ---- E23: --force with duplicate triggers MODIFY ---------------------- - - def test_force_flag_with_duplicate_triggers_modify( - self, - runner: CliRunner, - minimal_metagenomics_study: dict[str, Any], - ) -> None: - """--force with a detected duplicate triggers MODIFY and study appears in 'modified'.""" - existing = { - "title": minimal_metagenomics_study["STUDY_TITLE"], - "alias": minimal_metagenomics_study["alias"], - "accession": "PRJEB66666", - "secondary_accession": "ERP066666", - "status": "PRIVATE", - } - receipt_xml = ET.fromstring( - '' - '' - "" - ) - content = 
_make_study_json(minimal_metagenomics_study) - with runner.isolated_filesystem(): - Path("studies.json").write_text(content) - with ( - patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")), - patch( - "submit_study.fetch_account_studies", - return_value=[existing], - ), - patch(self._SUBMIT_TARGET, return_value=receipt_xml), - ): - result = runner.invoke( - main, - ["--input", "studies.json", "--force"], - catch_exceptions=False, - ) - assert result.exit_code == 0, f"stdout: {result.output}" - data = _extract_json_from_output(result.output) - assert len(data["modified"]) == 1 - assert data["modified"][0]["accession"] == "PRJEB66666" - - # ---- E24: Failed submission exits 1 ----------------------------------- - - def test_failed_submission_exits_1( - self, - runner: CliRunner, - minimal_metagenomics_study: dict[str, Any], - ) -> None: - """When common.submit_xml raises HTTPError, the CLI exits with code 1.""" - import requests - - content = _make_study_json(minimal_metagenomics_study) - http_error = requests.exceptions.HTTPError(response=MagicMock(status_code=500, text="err")) - with runner.isolated_filesystem(): - Path("studies.json").write_text(content) - with ( - patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")), - patch(self._SUBMIT_TARGET, side_effect=http_error), - ): - result = runner.invoke( - main, - ["--input", "studies.json", "--automated"], - catch_exceptions=False, - ) - assert result.exit_code == 1 - - # ---- E25: MAG/genome study dry-run XML contains both PROJECT_ATTRIBUTEs --- - - def test_mag_genome_study_dry_run_xml_has_both_attributes( - self, - runner: CliRunner, - ) -> None: - """MAG/genome study with existing_study_type=Other produces both PROJECT_ATTRIBUTEs.""" - study = { - "alias": "mag-001", - "STUDY_TITLE": "MAG Genome Study", - "existing_study_type": "Other", - "new_study_type": "Genome Sequencing", - } - content = _make_study_json(study) - with runner.isolated_filesystem(): - 
Path("studies.json").write_text(content) - with patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")): - result = runner.invoke( - main, - ["--input", "studies.json", "--automated", "--dry-run"], - catch_exceptions=False, - ) - assert result.exit_code == 0, f"output: {result.output}" - data = _extract_json_from_output(result.output) - assert "submitted" in data - # Also verify the XML would contain both attributes by building it directly - root = build_submission_xml([study]) - tags = [el.text for el in root.findall(".//PROJECT_ATTRIBUTE/TAG") if el.text] - assert "existing_study_type" in tags - assert "new_study_type" in tags - - # ---- E26: --hold-until date present in XML ---------------------------- - - def test_hold_until_date_appears_in_submission_xml( - self, - runner: CliRunner, - minimal_metagenomics_study: dict[str, Any], - ) -> None: - """--hold-until date is present in the HOLD element of the generated XML.""" - study = dict(minimal_metagenomics_study) - root = build_submission_xml([study], hold_until="2027-12-31") - hold_el = root.find(".//HOLD") - assert hold_el is not None - assert hold_el.get("HoldUntilDate") == "2027-12-31" - - def test_hold_until_cli_flag_passes_validation( - self, - runner: CliRunner, - minimal_metagenomics_study: dict[str, Any], - ) -> None: - """CLI --hold-until with a valid future date exits 0 in dry-run mode.""" - content = _make_study_json(minimal_metagenomics_study) - with patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")): - result = self._invoke( - runner, - ["--automated", "--dry-run", "--hold-until", "2027-06-01"], - "studies.json", - content, - ) - assert result.exit_code == 0, f"output: {result.output}" - - # ---- E27: --output writes results to file ----------------------------- - - def test_output_flag_writes_results_to_file( - self, - runner: CliRunner, - minimal_metagenomics_study: dict[str, Any], - ) -> None: - """--output flag writes JSON results to a file rather than stdout.""" - content = 
_make_study_json(minimal_metagenomics_study) - with runner.isolated_filesystem(): - Path("studies.json").write_text(content) - with patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")): - result = runner.invoke( - main, - ["--input", "studies.json", "--automated", "--dry-run", - "--output", "results.json"], - catch_exceptions=False, - ) - assert result.exit_code == 0, f"stdout: {result.output}" - # With --output, the JSON results go to file, not stdout (stdout has only logging). - results_path = Path("results.json") - assert results_path.exists(), "results.json was not created" - data = json.loads(results_path.read_text()) - assert "submitted" in data - - # ---- E28: --test flag routes to test base URL ------------------------- - - def test_test_flag_uses_test_base_url( - self, - runner: CliRunner, - minimal_metagenomics_study: dict[str, Any], - ) -> None: - """--test flag results in the test base URL being used for submission.""" - receipt_xml = ET.fromstring( - '' - '' - "" - ) - content = _make_study_json(minimal_metagenomics_study) - with runner.isolated_filesystem(): - Path("studies.json").write_text(content) - with ( - patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")), - patch(self._SUBMIT_TARGET, return_value=receipt_xml) as mock_submit, - ): - result = runner.invoke( - main, - ["--input", "studies.json", "--automated", "--test"], - catch_exceptions=False, - ) - assert result.exit_code == 0, f"stdout: {result.output}" - assert mock_submit.called - called_url = mock_submit.call_args[0][0] - assert "wwwdev" in called_url, f"Expected test URL; got {called_url}" - - def test_no_test_flag_uses_production_base_url( - self, - runner: CliRunner, - minimal_metagenomics_study: dict[str, Any], - ) -> None: - """Without --test flag, the production base URL is used.""" - receipt_xml = ET.fromstring( - '' - '' - "" - ) - content = _make_study_json(minimal_metagenomics_study) - with runner.isolated_filesystem(): - 
Path("studies.json").write_text(content) - with ( - patch(self._CRED_TARGET, return_value=("Webin-12345", "pass")), - patch(self._SUBMIT_TARGET, return_value=receipt_xml) as mock_submit, - ): - result = runner.invoke( - main, - ["--input", "studies.json", "--automated"], - catch_exceptions=False, - ) - assert result.exit_code == 0, f"stdout: {result.output}" - assert mock_submit.called - called_url = mock_submit.call_args[0][0] - assert "wwwdev" not in called_url, f"Expected prod URL; got {called_url}" - - -# --------------------------------------------------------------------------- -# Parametrized study-type cases -# --------------------------------------------------------------------------- - - -@pytest.mark.parametrize( - "study_type,new_type,expect_new_type", - [ - ("Metagenomics", None, False), - ("RNASeq", None, False), - ("Population Genomics", None, False), - ("Other", "Genome Sequencing", True), - ("Other", "Transcriptome Analysis", True), - ("Other", None, False), - ], -) -def test_project_attribute_new_study_type_conditional( - study_type: str, - new_type: str | None, - expect_new_type: bool, -) -> None: - """new_study_type attribute appears iff existing_study_type=='Other' and new_type is set. - - Args: - study_type: Value for existing_study_type. - new_type: Value for new_study_type (or None). - expect_new_type: Whether new_study_type should appear in the XML. 
- """ - study: dict[str, Any] = { - "alias": "param-test", - "STUDY_TITLE": "Parametrized Study", - "existing_study_type": study_type, - } - if new_type is not None: - study["new_study_type"] = new_type - - root = build_submission_xml([study]) - tags = [el.text for el in root.findall(".//PROJECT_ATTRIBUTE/TAG") if el.text] - if expect_new_type: - assert "new_study_type" in tags, ( - f"Expected new_study_type in tags for {study_type!r} / {new_type!r}" - ) - else: - assert "new_study_type" not in tags, ( - f"Did not expect new_study_type in tags for {study_type!r} / {new_type!r}" - ) - - -@pytest.mark.parametrize( - "hold_until,expect_hold", - [ - ("2027-03-01", True), - ("2028-12-31", True), - (None, False), - ], -) -def test_hold_until_element_conditional(hold_until: str | None, expect_hold: bool) -> None: - """HOLD element appears iff hold_until is provided. - - Args: - hold_until: The hold-until date string, or None. - expect_hold: Whether the HOLD element should appear. - """ - study = {"alias": "hold-test", "STUDY_TITLE": "Hold Date Test"} - root = build_submission_xml([study], hold_until=hold_until) - hold_el = root.find(".//HOLD") - if expect_hold: - assert hold_el is not None - assert hold_el.get("HoldUntilDate") == hold_until - else: - assert hold_el is None - - -@pytest.mark.parametrize("action", ["ADD", "MODIFY"]) -def test_submission_action_element_present(action: str) -> None: - """The correct action element (ADD or MODIFY) appears in the SUBMISSION. - - Args: - action: The submission action string. 
- """ - study = {"alias": "action-test", "STUDY_TITLE": "Action Test"} - root = build_submission_xml([study], action=action) - xml_str = ET.tostring(root, encoding="unicode") - assert f"<{action}" in xml_str or f"<{action}/>" in xml_str - opposite = "MODIFY" if action == "ADD" else "ADD" - assert f"<{opposite}" not in xml_str From d2a78b6b60e4107d9905218bfce4ebee563de8b8 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Fri, 13 Mar 2026 12:11:59 +0000 Subject: [PATCH 10/36] In response to PR comments, merged study_submit.py python scripts into one, trialling sanitizeOutput for test snapshot. --- .gitignore | 4 +- bin/ena_submit_common.py | 677 ----------------- bin/submit_study.py | 693 +++++++++++++++++- .../local/registerstudy/tests/main.nf.test | 2 +- 4 files changed, 672 insertions(+), 704 deletions(-) delete mode 100644 bin/ena_submit_common.py diff --git a/.gitignore b/.gitignore index d8c4dbb..601993a 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,4 @@ testing* null/ .nf-test* .idea/ -test_data -.claude/ -CLAUDE.md +test_data \ No newline at end of file diff --git a/bin/ena_submit_common.py b/bin/ena_submit_common.py deleted file mode 100644 index 89e41ab..0000000 --- a/bin/ena_submit_common.py +++ /dev/null @@ -1,677 +0,0 @@ -"""Shared utilities for ENA submission scripts. - -Provide logging, credential management, file loading, -XSD structural validation, Reports API access, duplicate -detection, XML serialisation, and result output used by -``submit_study.py``, ``submit_sample.py``, and -``submit_reads.py``. 
-""" - -from __future__ import annotations - -import csv -import datetime -import json -import logging -import os -import sys -import xml.etree.ElementTree as ET -from collections.abc import Callable, Sequence -from io import BytesIO -from pathlib import Path -from typing import Any, Final - -import click -import requests -from requests.auth import HTTPBasicAuth - -# All loggers in the ENA submission scripts are children of -# this root, so configuring it once propagates to all. -_LOGGER_NAME: Final = "ena_submit" - -logger = logging.getLogger(_LOGGER_NAME) - - -# ----------------------------------------------------------- -# Constants -# ----------------------------------------------------------- - -PROD_URL: Final = "https://www.ebi.ac.uk/ena/submit/webin-v2" -TEST_URL: Final = "https://wwwdev.ebi.ac.uk/ena/submit/webin-v2" - -_MAX_HOLD_YEARS: Final = 2 - - -# ----------------------------------------------------------- -# Logging -# ----------------------------------------------------------- - - -def setup_logging(log_file: Path | None = None) -> None: - """Configure stderr and optional file logging. - - Attach handlers to the ``ena_submit`` parent logger. - Child loggers (e.g. ``ena_submit.study``) propagate - their messages to these handlers automatically. - - Args: - log_file: Path to a log file. If provided, - debug-level messages are written there in - addition to stderr. - """ - root = logging.getLogger(_LOGGER_NAME) - - # Avoid duplicate handlers on repeated calls. 
- if root.handlers: - return - - fmt = logging.Formatter( - "%(asctime)s [%(levelname)s] %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - ) - root.setLevel(logging.DEBUG) - - stderr_handler = logging.StreamHandler(sys.stderr) - stderr_handler.setLevel(logging.INFO) - stderr_handler.setFormatter(fmt) - root.addHandler(stderr_handler) - - if log_file: - file_handler = logging.FileHandler(log_file) - file_handler.setLevel(logging.DEBUG) - file_handler.setFormatter(fmt) - root.addHandler(file_handler) - - -# ----------------------------------------------------------- -# Credentials -# ----------------------------------------------------------- - - -def get_credentials() -> tuple[str, str]: - """Read ENA credentials from environment variables. - - Returns: - Tuple of (*username*, *password*). - - Raises: - SystemExit: If either variable is unset or empty. - """ - username = os.environ.get("ENA_WEBIN", "").strip() - password = os.environ.get("ENA_WEBIN_PASSWORD", "").strip() - if not username or not password: - logger.error("ENA_WEBIN and ENA_WEBIN_PASSWORD environment variables must be set") - sys.exit(1) - return username, password - - -# ----------------------------------------------------------- -# ENA API helpers -# ----------------------------------------------------------- - - -def get_base_url(use_test: bool) -> str: - """Return the ENA Webin v2 submission base URL.""" - return TEST_URL if use_test else PROD_URL - - -def submit_xml( - base_url: str, - auth: HTTPBasicAuth, - xml_bytes: bytes, -) -> ET.Element: - """Submit an XML document to ENA via Webin v2. - - Args: - base_url: ENA submission service base URL. - auth: HTTP basic-auth credentials. - xml_bytes: Serialised XML submission document. - - Returns: - Parsed receipt XML element tree root. 
- """ - url = f"{base_url}/submit" - headers = { - "Content-Type": "application/xml", - "Accept": "application/xml", - } - resp = requests.post( - url, data=xml_bytes, - headers=headers, auth=auth, timeout=120, - ) - resp.raise_for_status() - return ET.fromstring(resp.content) - - -# ----------------------------------------------------------- -# XML utilities -# ----------------------------------------------------------- - - -def xml_to_bytes(root: ET.Element) -> bytes: - """Serialise an ElementTree element to UTF-8 bytes.""" - tree = ET.ElementTree(root) - buf = BytesIO() - tree.write(buf, encoding="UTF-8", xml_declaration=True) - return buf.getvalue() - - -# ----------------------------------------------------------- -# Hold-until date validation -# ----------------------------------------------------------- - - -def validate_hold_until(hold_until: str) -> datetime.date: - """Parse and validate a hold-until date string. - - Args: - hold_until: Date string in ``YYYY-MM-DD`` format. - - Returns: - Parsed date. - - Raises: - click.BadParameter: If the date format is invalid, - in the past, or more than 2 years from today. - """ - try: - hold_date = datetime.date.fromisoformat(hold_until) - except ValueError: - raise click.BadParameter( - f"Invalid date format: {hold_until!r}. Expected YYYY-MM-DD." - ) from None - - today = datetime.date.today() - max_date = today.replace(year=today.year + _MAX_HOLD_YEARS) - - if hold_date > max_date: - raise click.BadParameter( - f"Hold date {hold_until} is more than {_MAX_HOLD_YEARS} years from today" - f" ({today}). Maximum allowed: {max_date}." - ) - - if hold_date <= today: - raise click.BadParameter( - f"Hold date {hold_until} is not in the future (today is {today})." 
- ) - - return hold_date - - -# ----------------------------------------------------------- -# ENA checklist XML parsing -# ----------------------------------------------------------- - - -def parse_checklist_units( - xml_path: str | Path, -) -> dict[str, str]: - """Parse an ENA checklist XML and return field units. - - Reads the ```` elements from an ENA checklist XML - file (e.g. ``ERC000015.xml``) and returns a mapping from - slot name to unit string for every field that declares a - ```` element. - - Args: - xml_path: Path to the ENA checklist XML file. - - Returns: - Dict mapping slot name to unit string. - Fields without units are absent from the dict. - """ - units: dict[str, str] = {} - try: - tree = ET.parse(str(xml_path)) - except ET.ParseError as exc: - logger.warning( - "Could not parse checklist XML %s: %s", - xml_path, exc, - ) - return units - - for field in tree.iter("FIELD"): - name_el = field.find("NAME") - if name_el is None or not name_el.text: - continue - units_el = field.find("UNITS") - if units_el is None: - continue - unit_el = units_el.find("UNIT") - if unit_el is None or not unit_el.text: - continue - units[name_el.text.strip()] = unit_el.text.strip() - - return units - - -# ----------------------------------------------------------- -# XSD validation (structural fallback only) -# ----------------------------------------------------------- - - -def validate_xml_against_xsd( - xml_bytes: bytes, - _fragment_tag: str | None = None, # unused; kept for API compatibility - fallback_checker: Callable[ - [bytes, list[str]], tuple[bool, list[str]] - ] | None = None, -) -> tuple[bool, list[str]]: - """Validate XML bytes using a structural check. - - Full XSD validation via lxml is not available in this - container. Uses *fallback_checker* if provided, - otherwise checks that the document is well-formed XML. - - Args: - xml_bytes: Serialised XML document. - _fragment_tag: Unused; kept for API compatibility. 
- fallback_checker: Optional function called with - (*xml_bytes*, *messages*) that returns - (*is_valid*, *messages*). - - Returns: - Tuple of (*is_valid*, *messages*). - """ - messages: list[str] = [] - - if fallback_checker is not None: - return fallback_checker(xml_bytes, messages) - - try: - ET.fromstring(xml_bytes) - except ET.ParseError as exc: - messages.append( - f"ERROR: XML is not well-formed: {exc}" - ) - return False, messages - - messages.append( - "XML is well-formed (basic check passed)" - ) - return True, messages - - -# ----------------------------------------------------------- -# File loading (JSON, CSV, TSV) -# ----------------------------------------------------------- - - -def _is_metadata_row(row: Sequence[object]) -> bool: - """Check whether *row* is a DataHarmonizer label row. - - These rows have at most one non-empty cell. - """ - non_empty = sum( - 1 for c in row - if c is not None and str(c).strip() - ) - return non_empty <= 1 - - -def extract_records_from_tabular( - filepath: str | Path, - delimiter: str = ",", -) -> list[dict[str, str]]: - """Extract record dicts from a CSV or TSV file. - - Skip an optional DataHarmonizer metadata row if - detected. - - Args: - filepath: Path to the tabular file. - delimiter: Column delimiter character. - - Returns: - List of record dicts. 
- """ - with open(filepath, newline="", encoding="utf-8") as fh: - rows = list(csv.reader(fh, delimiter=delimiter)) - - if not rows: - return [] - - idx = 0 - if _is_metadata_row(rows[idx]): - idx += 1 - if idx >= len(rows): - return [] - - headers = rows[idx] - idx += 1 - - records: list[dict[str, str]] = [] - for row in rows[idx:]: - record: dict[str, str] = {} - for col, val in zip(headers, row): - col = col.strip() - if col and val is not None and val.strip(): - record[col] = val.strip() - if record: - records.append(record) - - return records - - -def extract_records_from_json( - input_data: object, - record_keys: Sequence[str] = ("data",), -) -> list[dict[str, Any]] | None: - """Extract record dicts from a DataHarmonizer JSON export. - - Handle several JSON shapes: - - * DataHarmonizer Container format:: - - {"Container": {"s": [{...}, ...]}} - - * Plain list of dicts. - * Dict with an entity-specific key or ``data`` key. - * Single record object (no wrapper). - - Args: - input_data: Parsed JSON data (any shape). - record_keys: Dict keys to check for record lists - (e.g. ``["studies", "data"]``). - - Returns: - List of record dicts, or ``None`` if unrecognised. - """ - if isinstance(input_data, list): - return input_data - - if isinstance(input_data, dict): - container = input_data.get("Container") - if isinstance(container, dict): - for key, val in container.items(): - if isinstance(val, list): - logger.info("Extracted records from Container.%s", key) - return val - - for key in record_keys: - if key in input_data: - return input_data[key] - - return [input_data] - - return None - - -def load_input_file( - filepath: str | Path, - json_record_keys: Sequence[str] = ("data",), -) -> list[dict[str, Any]] | None: - """Load records from a supported file format. - - Supported formats: JSON, CSV, TSV. - - Args: - filepath: Path to the input file. - json_record_keys: Dict keys to check when parsing - JSON (e.g. ``["studies", "data"]``). 
- - Returns: - List of record dicts, or ``None`` if the format is - unrecognised. - """ - ext = Path(filepath).suffix.lower() - if ext == ".json": - with open(filepath) as fh: - input_data = json.load(fh) - return extract_records_from_json( - input_data, json_record_keys, - ) - if ext == ".csv": - return extract_records_from_tabular( - filepath, delimiter=",", - ) - if ext == ".tsv": - return extract_records_from_tabular( - filepath, delimiter="\t", - ) - return None - - -# ----------------------------------------------------------- -# Reports API -# ----------------------------------------------------------- - - -def fetch_from_reports_endpoint( - url: str, - auth: HTTPBasicAuth, - max_results: int = 5000, -) -> list[dict[str, Any]] | None: - """Fetch records from a single Webin Reports endpoint. - - Args: - url: Full URL of the reports endpoint. - auth: HTTP basic-auth credentials. - max_results: Maximum number of results to request. - - Returns: - List of raw report dicts, or ``None`` on error. 
- """ - params = { - "format": "json", - "max-results": max_results, - } - - req = requests.Request("GET", url, params=params, auth=auth) - prepared = req.prepare() - logger.debug('curl -u %s:*** "%s"', auth.username, prepared.url) - - try: - resp = requests.get(url, params=params, auth=auth, timeout=60) - logger.info("Reports API at %s returned %s", url, resp.status_code) - resp.raise_for_status() - return resp.json() - - except requests.exceptions.HTTPError as exc: - status = ( - exc.response.status_code - if exc.response is not None - else "unknown" - ) - if status == 404: - logger.info("Reports API at %s returned 404 — no records yet", url) - return [] - if status in (401, 403): - logger.warning( - "Reports API at %s returned %s — endpoint may not be available" - " or credentials may differ", - url, status, - ) - return None - logger.warning("Reports API at %s returned HTTP %s", url, status) - return None - - except requests.exceptions.RequestException as exc: - logger.warning("Reports API at %s failed: %s", url, exc) - return None - - -def fetch_account_records( - auth: HTTPBasicAuth, - use_test: bool, - prod_url: str, - test_url: str, - normalizer: Callable[ - [dict[str, Any]], dict[str, str] | None - ], - entity_label: str, - max_results: int = 5000, -) -> list[dict[str, str]]: - """Fetch and normalise records from the Reports API. - - Try test endpoint first (if *use_test*), then fall back - to production. - - Args: - auth: HTTP basic-auth credentials. - use_test: Try the test endpoint first. - prod_url: Production reports endpoint URL. - test_url: Test reports endpoint URL. - normalizer: Callable that maps a raw report dict to - a normalised dict, or ``None`` to skip. - entity_label: Label for log messages (e.g. - ``"studies"``). - max_results: Maximum number of results to request. - - Returns: - List of normalised record dicts. 
- """ - urls = ( - [test_url, prod_url] if use_test - else [prod_url] - ) - - for url in urls: - logger.info("Fetching account %s from: %s", entity_label, url) - raw = fetch_from_reports_endpoint(url, auth, max_results) - if raw is None: - continue - - records: list[dict[str, str]] = [] - for entry in raw: - report = entry.get("report") - if report is None: - continue - normalized = normalizer(report) - if normalized is not None: - records.append(normalized) - - logger.info("Found %d %s in account", len(records), entity_label) - return records - - logger.warning( - "Could not reach any Webin reports endpoint. Duplicate checking for %s will be skipped.", - entity_label, - ) - return [] - - -# ----------------------------------------------------------- -# Duplicate detection (alias + title matching) -# ----------------------------------------------------------- - - -def find_duplicates_by_alias_title( - new_records: Sequence[dict[str, Any]], - account_records: Sequence[dict[str, str]], - title_field: str, - entity_label: str, -) -> dict[int, dict[str, str]]: - """Check new records against account records. - - Match by ``alias`` (preferred) or by the entity-specific - title field against the pre-fetched account records from - the Webin Reports API. - - Args: - new_records: Records the user wants to submit. - account_records: Existing records already registered - under the Webin account. - title_field: Field name for the title in new records - (e.g. ``"STUDY_TITLE"`` or ``"SAMPLE_TITLE"``). - entity_label: Label for log messages. - - Returns: - Mapping of index in *new_records* to matching - existing record info. 
- """ - duplicates: dict[int, dict[str, str]] = {} - total = len(new_records) - - if not account_records: - return duplicates - - by_title: dict[str, dict[str, str]] = {} - by_alias: dict[str, dict[str, str]] = {} - for rec in account_records: - title = (rec.get("title") or "").strip() - alias = (rec.get("alias") or "").strip() - if title: - by_title[title] = rec - if alias: - by_alias[alias] = rec - - logger.info( - "Checking %d new %s against %d existing account %s...", - total, entity_label, len(account_records), entity_label, - ) - - for i, record in enumerate(new_records): - new_title = ( - record.get(title_field) or "" - ).strip() - new_alias = (record.get("alias") or "").strip() - - if not new_title and not new_alias: - continue - - match = _match_by_alias_title( - new_alias, new_title, by_alias, by_title, - ) - if match is not None: - duplicates[i] = match - logger.info( - " Duplicate: '%s' matches %s -> %s (%s)", - new_title or new_alias, - match["match_reason"], - match["accession"], - match["status"], - ) - - if len(duplicates) == total: - logger.info("All %s are duplicates — skipping further checks", entity_label) - return duplicates - - return duplicates - - -def _match_by_alias_title( - new_alias: str, - new_title: str, - by_alias: dict[str, dict[str, str]], - by_title: dict[str, dict[str, str]], -) -> dict[str, str] | None: - """Return matching record info or ``None``.""" - if new_alias and new_alias in by_alias: - rec = by_alias[new_alias] - reason = f"alias '{new_alias}'" - elif new_title and new_title in by_title: - rec = by_title[new_title] - reason = f"title '{new_title}'" - else: - return None - - return { - "accession": rec.get("accession", ""), - "secondary_accession": rec.get( - "secondary_accession", "" - ), - "alias": rec.get("alias", ""), - "title": rec.get("title", ""), - "status": rec.get("status", "UNKNOWN"), - "match_reason": reason, - } - - -# ----------------------------------------------------------- -# Result output -# 
----------------------------------------------------------- - - -def write_results( - results: dict[str, list[dict[str, Any]]], - output_path: Path | None, -) -> None: - """Write JSON results to file or stdout.""" - json_str = json.dumps(results, indent=2) - if output_path: - with open(output_path, "w") as fh: - fh.write(json_str + "\n") - logger.info("Results written to %s", output_path) - else: - print(json_str) diff --git a/bin/submit_study.py b/bin/submit_study.py index ae72d69..463318d 100755 --- a/bin/submit_study.py +++ b/bin/submit_study.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 -"""Submit raw-reads, assembly and genome studies to ENA -via the Webin REST API v2. +"""Submit raw-reads, assembly and genome studies to ENA via the Webin REST API v2. Read a DataHarmonizer export containing study metadata, check for duplicate studies already registered under the @@ -15,27 +14,32 @@ Usage:: - python bin/submit_study.py \ - --input studies.json \ + python bin/submit_study.py \\ + --input studies.json \\ --test # With hold date (max 2 years): - python bin/submit_study.py \ - --input studies.json \ + python bin/submit_study.py \\ + --input studies.json \\ --hold-until 2028-01-01 # Log to file: - python bin/submit_study.py \ - --input studies.json \ + python bin/submit_study.py \\ + --input studies.json \\ --test --log submission.log """ from __future__ import annotations +import csv import datetime +import json import logging +import os import sys import xml.etree.ElementTree as ET +from collections.abc import Callable, Sequence +from io import BytesIO from pathlib import Path from typing import Any, Final @@ -43,9 +47,652 @@ import requests from requests.auth import HTTPBasicAuth -import ena_submit_common as common -logger = logging.getLogger("ena_submit.rawreads_study") +# ----------------------------------------------------------- +# Logging +# ----------------------------------------------------------- + +# All loggers in the ENA submission scripts share this 
root, +# so configuring it once propagates to all child loggers. +_LOGGER_NAME: Final = "ena_submit" + +logger = logging.getLogger("ena_submit.study") + + +def setup_logging(log_file: Path | None = None) -> None: + """Configure stderr and optional file logging. + + Attach handlers to the ``ena_submit`` parent logger. + Child loggers (e.g. ``ena_submit.study``) propagate + their messages to these handlers automatically. + + Args: + log_file: Path to a log file. If provided, + debug-level messages are written there in + addition to stderr. + """ + root = logging.getLogger(_LOGGER_NAME) + + # Avoid duplicate handlers on repeated calls. + if root.handlers: + return + + fmt = logging.Formatter( + "%(asctime)s [%(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + root.setLevel(logging.DEBUG) + + stderr_handler = logging.StreamHandler(sys.stderr) + stderr_handler.setLevel(logging.INFO) + stderr_handler.setFormatter(fmt) + root.addHandler(stderr_handler) + + if log_file: + file_handler = logging.FileHandler(log_file) + file_handler.setLevel(logging.DEBUG) + file_handler.setFormatter(fmt) + root.addHandler(file_handler) + + +# ----------------------------------------------------------- +# Credentials +# ----------------------------------------------------------- + + +def get_credentials() -> tuple[str, str]: + """Read ENA credentials from environment variables. + + Returns: + Tuple of (*username*, *password*). + + Raises: + SystemExit: If either variable is unset or empty. 
+ """ + username = os.environ.get("ENA_WEBIN", "").strip() + password = os.environ.get("ENA_WEBIN_PASSWORD", "").strip() + if not username or not password: + logger.error("ENA_WEBIN and ENA_WEBIN_PASSWORD environment variables must be set") + sys.exit(1) + return username, password + + +# ----------------------------------------------------------- +# ENA API helpers +# ----------------------------------------------------------- + +PROD_URL: Final = "https://www.ebi.ac.uk/ena/submit/webin-v2" +TEST_URL: Final = "https://wwwdev.ebi.ac.uk/ena/submit/webin-v2" + + +def get_base_url(use_test: bool) -> str: + """Return the ENA Webin v2 submission base URL.""" + return TEST_URL if use_test else PROD_URL + + +def submit_xml( + base_url: str, + auth: HTTPBasicAuth, + xml_bytes: bytes, +) -> ET.Element: + """Submit an XML document to ENA via Webin v2. + + Args: + base_url: ENA submission service base URL. + auth: HTTP basic-auth credentials. + xml_bytes: Serialised XML submission document. + + Returns: + Parsed receipt XML element tree root. 
+ """ + url = f"{base_url}/submit" + headers = { + "Content-Type": "application/xml", + "Accept": "application/xml", + } + resp = requests.post( + url, data=xml_bytes, + headers=headers, auth=auth, timeout=120, + ) + resp.raise_for_status() + return ET.fromstring(resp.content) + + +# ----------------------------------------------------------- +# XML utilities +# ----------------------------------------------------------- + + +def xml_to_bytes(root: ET.Element) -> bytes: + """Serialise an ElementTree element to UTF-8 bytes.""" + tree = ET.ElementTree(root) + buf = BytesIO() + tree.write(buf, encoding="UTF-8", xml_declaration=True) + return buf.getvalue() + + +# ----------------------------------------------------------- +# Hold-until date validation +# ----------------------------------------------------------- + +_MAX_HOLD_YEARS: Final = 2 + + +def validate_hold_until(hold_until: str) -> datetime.date: + """Parse and validate a hold-until date string. + + Args: + hold_until: Date string in ``YYYY-MM-DD`` format. + + Returns: + Parsed date. + + Raises: + click.BadParameter: If the date format is invalid, + in the past, or more than 2 years from today. + """ + try: + hold_date = datetime.date.fromisoformat(hold_until) + except ValueError: + raise click.BadParameter( + f"Invalid date format: {hold_until!r}. Expected YYYY-MM-DD." + ) from None + + today = datetime.date.today() + max_date = today.replace(year=today.year + _MAX_HOLD_YEARS) + + if hold_date > max_date: + raise click.BadParameter( + f"Hold date {hold_until} is more than {_MAX_HOLD_YEARS} years from today" + f" ({today}). Maximum allowed: {max_date}." + ) + + if hold_date <= today: + raise click.BadParameter( + f"Hold date {hold_until} is not in the future (today is {today})." 
+ ) + + return hold_date + + +# ----------------------------------------------------------- +# ENA checklist XML parsing +# ----------------------------------------------------------- + + +def parse_checklist_units( + xml_path: str | Path, +) -> dict[str, str]: + """Parse an ENA checklist XML and return field units. + + Reads the ```` elements from an ENA checklist XML + file (e.g. ``ERC000015.xml``) and returns a mapping from + slot name to unit string for every field that declares a + ```` element. + + Args: + xml_path: Path to the ENA checklist XML file. + + Returns: + Dict mapping slot name to unit string. + Fields without units are absent from the dict. + """ + units: dict[str, str] = {} + try: + tree = ET.parse(str(xml_path)) + except ET.ParseError as exc: + logger.warning( + "Could not parse checklist XML %s: %s", + xml_path, exc, + ) + return units + + for field in tree.iter("FIELD"): + name_el = field.find("NAME") + if name_el is None or not name_el.text: + continue + units_el = field.find("UNITS") + if units_el is None: + continue + unit_el = units_el.find("UNIT") + if unit_el is None or not unit_el.text: + continue + units[name_el.text.strip()] = unit_el.text.strip() + + return units + + +# ----------------------------------------------------------- +# XSD validation (structural fallback only) +# ----------------------------------------------------------- + + +def validate_xml_against_xsd( + xml_bytes: bytes, + _fragment_tag: str | None = None, # unused; kept for API compatibility + fallback_checker: Callable[ + [bytes, list[str]], tuple[bool, list[str]] + ] | None = None, +) -> tuple[bool, list[str]]: + """Validate XML bytes using a structural check. + + Full XSD validation via lxml is not available in this + container. Uses *fallback_checker* if provided, + otherwise checks that the document is well-formed XML. + + Args: + xml_bytes: Serialised XML document. + _fragment_tag: Unused; kept for API compatibility. 
+ fallback_checker: Optional function called with + (*xml_bytes*, *messages*) that returns + (*is_valid*, *messages*). + + Returns: + Tuple of (*is_valid*, *messages*). + """ + messages: list[str] = [] + + if fallback_checker is not None: + return fallback_checker(xml_bytes, messages) + + try: + ET.fromstring(xml_bytes) + except ET.ParseError as exc: + messages.append( + f"ERROR: XML is not well-formed: {exc}" + ) + return False, messages + + messages.append( + "XML is well-formed (basic check passed)" + ) + return True, messages + + +# ----------------------------------------------------------- +# File loading (JSON, CSV, TSV) +# ----------------------------------------------------------- + + +def _is_metadata_row(row: Sequence[object]) -> bool: + """Check whether *row* is a DataHarmonizer label row. + + These rows have at most one non-empty cell. + """ + non_empty = sum( + 1 for c in row + if c is not None and str(c).strip() + ) + return non_empty <= 1 + + +def extract_records_from_tabular( + filepath: str | Path, + delimiter: str = ",", +) -> list[dict[str, str]]: + """Extract record dicts from a CSV or TSV file. + + Skip an optional DataHarmonizer metadata row if + detected. + + Args: + filepath: Path to the tabular file. + delimiter: Column delimiter character. + + Returns: + List of record dicts. 
+ """ + with open(filepath, newline="", encoding="utf-8") as fh: + rows = list(csv.reader(fh, delimiter=delimiter)) + + if not rows: + return [] + + idx = 0 + if _is_metadata_row(rows[idx]): + idx += 1 + if idx >= len(rows): + return [] + + headers = rows[idx] + idx += 1 + + records: list[dict[str, str]] = [] + for row in rows[idx:]: + record: dict[str, str] = {} + for col, val in zip(headers, row): + col = col.strip() + if col and val is not None and val.strip(): + record[col] = val.strip() + if record: + records.append(record) + + return records + + +def extract_records_from_json( + input_data: object, + record_keys: Sequence[str] = ("data",), +) -> list[dict[str, Any]] | None: + """Extract record dicts from a DataHarmonizer JSON export. + + Handle several JSON shapes: + + * DataHarmonizer Container format:: + + {"Container": {"s": [{...}, ...]}} + + * Plain list of dicts. + * Dict with an entity-specific key or ``data`` key. + * Single record object (no wrapper). + + Args: + input_data: Parsed JSON data (any shape). + record_keys: Dict keys to check for record lists + (e.g. ``["studies", "data"]``). + + Returns: + List of record dicts, or ``None`` if unrecognised. + """ + if isinstance(input_data, list): + return input_data + + if isinstance(input_data, dict): + container = input_data.get("Container") + if isinstance(container, dict): + for key, val in container.items(): + if isinstance(val, list): + logger.info("Extracted records from Container.%s", key) + return val + + for key in record_keys: + if key in input_data: + return input_data[key] + + return [input_data] + + return None + + +def load_input_file( + filepath: str | Path, + json_record_keys: Sequence[str] = ("data",), +) -> list[dict[str, Any]] | None: + """Load records from a supported file format. + + Supported formats: JSON, CSV, TSV. + + Args: + filepath: Path to the input file. + json_record_keys: Dict keys to check when parsing + JSON (e.g. ``["studies", "data"]``). 
+ + Returns: + List of record dicts, or ``None`` if the format is + unrecognised. + """ + ext = Path(filepath).suffix.lower() + if ext == ".json": + with open(filepath) as fh: + input_data = json.load(fh) + return extract_records_from_json( + input_data, json_record_keys, + ) + if ext == ".csv": + return extract_records_from_tabular( + filepath, delimiter=",", + ) + if ext == ".tsv": + return extract_records_from_tabular( + filepath, delimiter="\t", + ) + return None + + +# ----------------------------------------------------------- +# Reports API +# ----------------------------------------------------------- + + +def fetch_from_reports_endpoint( + url: str, + auth: HTTPBasicAuth, + max_results: int = 5000, +) -> list[dict[str, Any]] | None: + """Fetch records from a single Webin Reports endpoint. + + Args: + url: Full URL of the reports endpoint. + auth: HTTP basic-auth credentials. + max_results: Maximum number of results to request. + + Returns: + List of raw report dicts, or ``None`` on error. 
+ """ + params = { + "format": "json", + "max-results": max_results, + } + + req = requests.Request("GET", url, params=params, auth=auth) + prepared = req.prepare() + logger.debug('curl -u %s:*** "%s"', auth.username, prepared.url) + + try: + resp = requests.get(url, params=params, auth=auth, timeout=60) + logger.info("Reports API at %s returned %s", url, resp.status_code) + resp.raise_for_status() + return resp.json() + + except requests.exceptions.HTTPError as exc: + status = ( + exc.response.status_code + if exc.response is not None + else "unknown" + ) + if status == 404: + logger.info("Reports API at %s returned 404 — no records yet", url) + return [] + if status in (401, 403): + logger.warning( + "Reports API at %s returned %s — endpoint may not be available" + " or credentials may differ", + url, status, + ) + return None + logger.warning("Reports API at %s returned HTTP %s", url, status) + return None + + except requests.exceptions.RequestException as exc: + logger.warning("Reports API at %s failed: %s", url, exc) + return None + + +def fetch_account_records( + auth: HTTPBasicAuth, + use_test: bool, + prod_url: str, + test_url: str, + normalizer: Callable[ + [dict[str, Any]], dict[str, str] | None + ], + entity_label: str, + max_results: int = 5000, +) -> list[dict[str, str]]: + """Fetch and normalise records from the Reports API. + + Try test endpoint first (if *use_test*), then fall back + to production. + + Args: + auth: HTTP basic-auth credentials. + use_test: Try the test endpoint first. + prod_url: Production reports endpoint URL. + test_url: Test reports endpoint URL. + normalizer: Callable that maps a raw report dict to + a normalised dict, or ``None`` to skip. + entity_label: Label for log messages (e.g. + ``"studies"``). + max_results: Maximum number of results to request. + + Returns: + List of normalised record dicts. 
+ """ + urls = ( + [test_url, prod_url] if use_test + else [prod_url] + ) + + for url in urls: + logger.info("Fetching account %s from: %s", entity_label, url) + raw = fetch_from_reports_endpoint(url, auth, max_results) + if raw is None: + continue + + records: list[dict[str, str]] = [] + for entry in raw: + report = entry.get("report") + if report is None: + continue + normalized = normalizer(report) + if normalized is not None: + records.append(normalized) + + logger.info("Found %d %s in account", len(records), entity_label) + return records + + logger.warning( + "Could not reach any Webin reports endpoint." + " Duplicate checking for %s will be skipped.", + entity_label, + ) + return [] + + +# ----------------------------------------------------------- +# Duplicate detection (alias + title matching) +# ----------------------------------------------------------- + + +def find_duplicates_by_alias_title( + new_records: Sequence[dict[str, Any]], + account_records: Sequence[dict[str, str]], + title_field: str, + entity_label: str, +) -> dict[int, dict[str, str]]: + """Check new records against account records. + + Match by ``alias`` (preferred) or by the entity-specific + title field against the pre-fetched account records from + the Webin Reports API. + + Args: + new_records: Records the user wants to submit. + account_records: Existing records already registered + under the Webin account. + title_field: Field name for the title in new records + (e.g. ``"STUDY_TITLE"`` or ``"SAMPLE_TITLE"``). + entity_label: Label for log messages. + + Returns: + Mapping of index in *new_records* to matching + existing record info. 
+ """ + duplicates: dict[int, dict[str, str]] = {} + total = len(new_records) + + if not account_records: + return duplicates + + by_title: dict[str, dict[str, str]] = {} + by_alias: dict[str, dict[str, str]] = {} + for rec in account_records: + title = (rec.get("title") or "").strip() + alias = (rec.get("alias") or "").strip() + if title: + by_title[title] = rec + if alias: + by_alias[alias] = rec + + logger.info( + "Checking %d new %s against %d existing account %s...", + total, entity_label, len(account_records), entity_label, + ) + + for i, record in enumerate(new_records): + new_title = ( + record.get(title_field) or "" + ).strip() + new_alias = (record.get("alias") or "").strip() + + if not new_title and not new_alias: + continue + + match = _match_by_alias_title( + new_alias, new_title, by_alias, by_title, + ) + if match is not None: + duplicates[i] = match + logger.info( + " Duplicate: '%s' matches %s -> %s (%s)", + new_title or new_alias, + match["match_reason"], + match["accession"], + match["status"], + ) + + if len(duplicates) == total: + logger.info("All %s are duplicates — skipping further checks", entity_label) + return duplicates + + return duplicates + + +def _match_by_alias_title( + new_alias: str, + new_title: str, + by_alias: dict[str, dict[str, str]], + by_title: dict[str, dict[str, str]], +) -> dict[str, str] | None: + """Return matching record info or ``None``.""" + if new_alias and new_alias in by_alias: + rec = by_alias[new_alias] + reason = f"alias '{new_alias}'" + elif new_title and new_title in by_title: + rec = by_title[new_title] + reason = f"title '{new_title}'" + else: + return None + + return { + "accession": rec.get("accession", ""), + "secondary_accession": rec.get( + "secondary_accession", "" + ), + "alias": rec.get("alias", ""), + "title": rec.get("title", ""), + "status": rec.get("status", "UNKNOWN"), + "match_reason": reason, + } + + +# ----------------------------------------------------------- +# Result output +# 
----------------------------------------------------------- + + +def write_results( + results: dict[str, list[dict[str, Any]]], + output_path: Path | None, +) -> None: + """Write JSON results to file or stdout.""" + json_str = json.dumps(results, indent=2) + if output_path: + with open(output_path, "w") as fh: + fh.write(json_str + "\n") + logger.info("Results written to %s", output_path) + else: + print(json_str) # ----------------------------------------------------------- @@ -90,7 +737,7 @@ def fetch_account_studies( Returns: List of normalised study dicts. """ - return common.fetch_account_records( + return fetch_account_records( auth, use_test=use_test, prod_url=_PROD_REPORTS_URL, @@ -114,7 +761,7 @@ def find_duplicate_studies( Returns: Mapping of index to matching study info. """ - return common.find_duplicates_by_alias_title( + return find_duplicates_by_alias_title( new_studies, account_studies, title_field="STUDY_TITLE", entity_label="studies", @@ -288,7 +935,7 @@ def validate_study_xml( Returns: Tuple of (*is_valid*, *messages*). 
""" - return common.validate_xml_against_xsd( + return validate_xml_against_xsd( xml_bytes, fallback_checker=_validate_study_xml_structure, ) @@ -392,7 +1039,7 @@ def _do_submission( logger.info("Submitting %s to ENA (%s)...", action, env_label) try: - receipt_root = common.submit_xml(base_url, auth, xml_bytes) + receipt_root = submit_xml(base_url, auth, xml_bytes) except requests.exceptions.HTTPError as exc: logger.error("HTTP error during %s submission: %s", action, exc) if exc.response is not None: @@ -495,21 +1142,21 @@ def main( force: bool, ) -> None: """Submit studies to ENA via the Webin REST API v2.""" - common.setup_logging(log_file) - username, password = common.get_credentials() + setup_logging(log_file) + username, password = get_credentials() env_label = "TEST" if use_test else "PRODUCTION" logger.info("ENA Study Submission — environment: %s", env_label) - base_url = common.get_base_url(use_test) + base_url = get_base_url(use_test) auth = HTTPBasicAuth(username, password) logger.debug("Auth username: %s", username) if hold_until: - common.validate_hold_until(hold_until) + validate_hold_until(hold_until) # -- Step 1: Load input file ------------------------- logger.info("Loading input: %s", input_file) - studies = common.load_input_file( + studies = load_input_file( input_file, json_record_keys=_JSON_RECORD_KEYS, ) if studies is None: @@ -578,7 +1225,7 @@ def main( if not studies_to_submit and not studies_to_modify: logger.info("No studies to submit (all are duplicates or input is empty)") - common.write_results(results, output) + write_results(results, output) return logger.info( @@ -592,7 +1239,7 @@ def main( if studies_to_submit: logger.info("Building ADD XML for %d new study/studies...", len(studies_to_submit)) xml_root = build_submission_xml(studies_to_submit, hold_until=hold_until, action="ADD") - xml_bytes = common.xml_to_bytes(xml_root) + xml_bytes = xml_to_bytes(xml_root) logger.debug("Generated XML (ADD):\n%s", xml_bytes.decode("utf-8")) 
logger.info("XML document size (ADD): %d bytes", len(xml_bytes)) ok = _do_submission( @@ -609,7 +1256,7 @@ def main( if studies_to_modify: logger.info("Building MODIFY XML for %d duplicate(s)...", len(studies_to_modify)) xml_root = build_submission_xml(studies_to_modify, hold_until=hold_until, action="MODIFY") - xml_bytes = common.xml_to_bytes(xml_root) + xml_bytes = xml_to_bytes(xml_root) logger.debug("Generated XML (MODIFY):\n%s", xml_bytes.decode("utf-8")) logger.info("XML document size (MODIFY): %d bytes", len(xml_bytes)) ok = _do_submission( @@ -626,7 +1273,7 @@ def main( sys.exit(1) # -- Step 5: Output results -------------------------- - common.write_results(results, output) + write_results(results, output) logger.info("=" * 60) logger.info("SUBMISSION SUMMARY") diff --git a/modules/local/registerstudy/tests/main.nf.test b/modules/local/registerstudy/tests/main.nf.test index 42f6902..43c72eb 100644 --- a/modules/local/registerstudy/tests/main.nf.test +++ b/modules/local/registerstudy/tests/main.nf.test @@ -24,7 +24,7 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert snapshot(process.out).match() } + { assert snapshot(sanitizeOutput(process.out)).match() } ) } } From 15a0f7efccd2cef1189e5cbf59c944bc78d3fb44 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Fri, 13 Mar 2026 12:15:10 +0000 Subject: [PATCH 11/36] Update nft-utils to get sanatizeOutputs to work --- modules/local/registerstudy/tests/main.nf.test.snap | 13 +------------ nf-test.config | 2 +- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/modules/local/registerstudy/tests/main.nf.test.snap b/modules/local/registerstudy/tests/main.nf.test.snap index 4b184e9..5e2fde1 100644 --- a/modules/local/registerstudy/tests/main.nf.test.snap +++ b/modules/local/registerstudy/tests/main.nf.test.snap @@ -2,17 +2,6 @@ "registerstudy - stub": { "content": [ { - "0": [ - [ - { - "id": "example_study" - }, - "example_study_accessions.json:md5,e43f257b8a1ffb551bf993867c5f1d7f" 
- ] - ], - "1": [ - "versions.yml:md5,ddcc758a7d28faecd4286941889ab7e1" - ], "accessions": [ [ { @@ -26,7 +15,7 @@ ] } ], - "timestamp": "2026-03-12T13:52:06.989729", + "timestamp": "2026-03-13T12:14:02.650852", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.4" diff --git a/nf-test.config b/nf-test.config index 3525ead..613fc05 100644 --- a/nf-test.config +++ b/nf-test.config @@ -19,6 +19,6 @@ config { // load the necessary plugins plugins { - load "nft-utils@0.0.3" + load "nft-utils@0.0.9" } } From 7403a3e62611634307ccae6e0844479fa5b72cfb Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Fri, 13 Mar 2026 12:17:50 +0000 Subject: [PATCH 12/36] Revert test config --- nextflow.config | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/nextflow.config b/nextflow.config index dba0973..dd678eb 100644 --- a/nextflow.config +++ b/nextflow.config @@ -175,17 +175,7 @@ profiles { singularity.runOptions = '--nv' } // TODO: figure out how to better orginise tests for different workflow types (bins, mags, metagenomic_assemblies) - // test { includeConfig 'conf/test.config' } - test { - docker.enabled = true - conda.enabled = false - singularity.enabled = false - podman.enabled = false - shifter.enabled = false - charliecloud.enabled = false - apptainer.enabled = false - docker.runOptions = '-u $(id -u):$(id -g)' - } + test { includeConfig 'conf/test.config' } test_genome { includeConfig 'conf/test_genome.config' } test_assembly { includeConfig 'conf/test_assembly.config' } test_full { includeConfig 'conf/test_full.config' } From 416fa7568b0bc5005b351bacb5950a3f4b4e0f59 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Fri, 13 Mar 2026 12:56:21 +0000 Subject: [PATCH 13/36] Updated tests to run locally --- .../tests/main.nf.test | 2 +- .../tests/main.nf.test.snap | 17 +++-------------- .../local/registerstudy/tests/nextflow.config | 1 + 3 files changed, 5 insertions(+), 15 deletions(-) diff --git 
a/modules/local/generate_assembly_manifest/tests/main.nf.test b/modules/local/generate_assembly_manifest/tests/main.nf.test index 897744a..790d402 100644 --- a/modules/local/generate_assembly_manifest/tests/main.nf.test +++ b/modules/local/generate_assembly_manifest/tests/main.nf.test @@ -28,7 +28,7 @@ nextflow_process { assert process.success assertAll( { assert snapshot( - process.out, + sanitizeOutput(process.out), path(process.out.versions[0]).yaml ).match() }, { assert process.out.manifest.size() == 1 }, diff --git a/modules/local/generate_assembly_manifest/tests/main.nf.test.snap b/modules/local/generate_assembly_manifest/tests/main.nf.test.snap index f594383..5f5b1d7 100644 --- a/modules/local/generate_assembly_manifest/tests/main.nf.test.snap +++ b/modules/local/generate_assembly_manifest/tests/main.nf.test.snap @@ -31,7 +31,7 @@ } } ], - "timestamp": "2026-03-12T13:52:01.267817", + "timestamp": "2026-03-13T12:32:28.183967", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.4" @@ -40,23 +40,12 @@ "GENERATE_ASSEMBLY_MANIFEST completes with expected outputs": { "content": [ { - "0": [ - [ - { - "id": "test" - }, - "233126d4c4d0.manifest:md5,8387c0e6c123313259db613612c09dce" - ] - ], - "1": [ - "versions.yml:md5,0664035de44b4d88c1a70a357c1a24f2" - ], "manifest": [ [ { "id": "test" }, - "233126d4c4d0.manifest:md5,8387c0e6c123313259db613612c09dce" + "233126d4c4d0.manifest:md5,cacedcfcce220081e7aa2f98c2f4ffd6" ] ], "versions": [ @@ -69,7 +58,7 @@ } } ], - "timestamp": "2026-03-12T13:51:56.121365", + "timestamp": "2026-03-13T12:32:23.722449", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.4" diff --git a/modules/local/registerstudy/tests/nextflow.config b/modules/local/registerstudy/tests/nextflow.config index 4a84743..f22b24f 100644 --- a/modules/local/registerstudy/tests/nextflow.config +++ b/modules/local/registerstudy/tests/nextflow.config @@ -6,6 +6,7 @@ // Dummy credentials are sufficient for --dry-run --automated mode since // no HTTP calls are made. 
For real submission tests, replace with secrets: // env { ENA_WEBIN = secrets.WEBIN_ACCOUNT; ENA_WEBIN_PASSWORD = secrets.WEBIN_PASSWORD } + process { withName: REGISTERSTUDY { ext.args = '--test --automated --dry-run' From 7e2b7b8a3ead5d0e8ac9e13b4d5a3f09fa738904 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Fri, 13 Mar 2026 13:29:11 +0000 Subject: [PATCH 14/36] Remove duplicate detection functionality from study submit --- bin/submit_study.py | 496 +++----------------------------------------- 1 file changed, 27 insertions(+), 469 deletions(-) diff --git a/bin/submit_study.py b/bin/submit_study.py index 463318d..c2b165e 100755 --- a/bin/submit_study.py +++ b/bin/submit_study.py @@ -54,47 +54,14 @@ # All loggers in the ENA submission scripts share this root, # so configuring it once propagates to all child loggers. -_LOGGER_NAME: Final = "ena_submit" - +logging.basicConfig( + format="%(levelname)s: %(message)s", + level=logging.INFO, + stream=sys.stderr, +) logger = logging.getLogger("ena_submit.study") -def setup_logging(log_file: Path | None = None) -> None: - """Configure stderr and optional file logging. - - Attach handlers to the ``ena_submit`` parent logger. - Child loggers (e.g. ``ena_submit.study``) propagate - their messages to these handlers automatically. - - Args: - log_file: Path to a log file. If provided, - debug-level messages are written there in - addition to stderr. - """ - root = logging.getLogger(_LOGGER_NAME) - - # Avoid duplicate handlers on repeated calls. 
- if root.handlers: - return - - fmt = logging.Formatter( - "%(asctime)s [%(levelname)s] %(message)s", - datefmt="%Y-%m-%d %H:%M:%S", - ) - root.setLevel(logging.DEBUG) - - stderr_handler = logging.StreamHandler(sys.stderr) - stderr_handler.setLevel(logging.INFO) - stderr_handler.setFormatter(fmt) - root.addHandler(stderr_handler) - - if log_file: - file_handler = logging.FileHandler(log_file) - file_handler.setLevel(logging.DEBUG) - file_handler.setFormatter(fmt) - root.addHandler(file_handler) - - # ----------------------------------------------------------- # Credentials # ----------------------------------------------------------- @@ -449,233 +416,6 @@ def load_input_file( return None -# ----------------------------------------------------------- -# Reports API -# ----------------------------------------------------------- - - -def fetch_from_reports_endpoint( - url: str, - auth: HTTPBasicAuth, - max_results: int = 5000, -) -> list[dict[str, Any]] | None: - """Fetch records from a single Webin Reports endpoint. - - Args: - url: Full URL of the reports endpoint. - auth: HTTP basic-auth credentials. - max_results: Maximum number of results to request. - - Returns: - List of raw report dicts, or ``None`` on error. 
- """ - params = { - "format": "json", - "max-results": max_results, - } - - req = requests.Request("GET", url, params=params, auth=auth) - prepared = req.prepare() - logger.debug('curl -u %s:*** "%s"', auth.username, prepared.url) - - try: - resp = requests.get(url, params=params, auth=auth, timeout=60) - logger.info("Reports API at %s returned %s", url, resp.status_code) - resp.raise_for_status() - return resp.json() - - except requests.exceptions.HTTPError as exc: - status = ( - exc.response.status_code - if exc.response is not None - else "unknown" - ) - if status == 404: - logger.info("Reports API at %s returned 404 — no records yet", url) - return [] - if status in (401, 403): - logger.warning( - "Reports API at %s returned %s — endpoint may not be available" - " or credentials may differ", - url, status, - ) - return None - logger.warning("Reports API at %s returned HTTP %s", url, status) - return None - - except requests.exceptions.RequestException as exc: - logger.warning("Reports API at %s failed: %s", url, exc) - return None - - -def fetch_account_records( - auth: HTTPBasicAuth, - use_test: bool, - prod_url: str, - test_url: str, - normalizer: Callable[ - [dict[str, Any]], dict[str, str] | None - ], - entity_label: str, - max_results: int = 5000, -) -> list[dict[str, str]]: - """Fetch and normalise records from the Reports API. - - Try test endpoint first (if *use_test*), then fall back - to production. - - Args: - auth: HTTP basic-auth credentials. - use_test: Try the test endpoint first. - prod_url: Production reports endpoint URL. - test_url: Test reports endpoint URL. - normalizer: Callable that maps a raw report dict to - a normalised dict, or ``None`` to skip. - entity_label: Label for log messages (e.g. - ``"studies"``). - max_results: Maximum number of results to request. - - Returns: - List of normalised record dicts. 
- """ - urls = ( - [test_url, prod_url] if use_test - else [prod_url] - ) - - for url in urls: - logger.info("Fetching account %s from: %s", entity_label, url) - raw = fetch_from_reports_endpoint(url, auth, max_results) - if raw is None: - continue - - records: list[dict[str, str]] = [] - for entry in raw: - report = entry.get("report") - if report is None: - continue - normalized = normalizer(report) - if normalized is not None: - records.append(normalized) - - logger.info("Found %d %s in account", len(records), entity_label) - return records - - logger.warning( - "Could not reach any Webin reports endpoint." - " Duplicate checking for %s will be skipped.", - entity_label, - ) - return [] - - -# ----------------------------------------------------------- -# Duplicate detection (alias + title matching) -# ----------------------------------------------------------- - - -def find_duplicates_by_alias_title( - new_records: Sequence[dict[str, Any]], - account_records: Sequence[dict[str, str]], - title_field: str, - entity_label: str, -) -> dict[int, dict[str, str]]: - """Check new records against account records. - - Match by ``alias`` (preferred) or by the entity-specific - title field against the pre-fetched account records from - the Webin Reports API. - - Args: - new_records: Records the user wants to submit. - account_records: Existing records already registered - under the Webin account. - title_field: Field name for the title in new records - (e.g. ``"STUDY_TITLE"`` or ``"SAMPLE_TITLE"``). - entity_label: Label for log messages. - - Returns: - Mapping of index in *new_records* to matching - existing record info. 
- """ - duplicates: dict[int, dict[str, str]] = {} - total = len(new_records) - - if not account_records: - return duplicates - - by_title: dict[str, dict[str, str]] = {} - by_alias: dict[str, dict[str, str]] = {} - for rec in account_records: - title = (rec.get("title") or "").strip() - alias = (rec.get("alias") or "").strip() - if title: - by_title[title] = rec - if alias: - by_alias[alias] = rec - - logger.info( - "Checking %d new %s against %d existing account %s...", - total, entity_label, len(account_records), entity_label, - ) - - for i, record in enumerate(new_records): - new_title = ( - record.get(title_field) or "" - ).strip() - new_alias = (record.get("alias") or "").strip() - - if not new_title and not new_alias: - continue - - match = _match_by_alias_title( - new_alias, new_title, by_alias, by_title, - ) - if match is not None: - duplicates[i] = match - logger.info( - " Duplicate: '%s' matches %s -> %s (%s)", - new_title or new_alias, - match["match_reason"], - match["accession"], - match["status"], - ) - - if len(duplicates) == total: - logger.info("All %s are duplicates — skipping further checks", entity_label) - return duplicates - - return duplicates - - -def _match_by_alias_title( - new_alias: str, - new_title: str, - by_alias: dict[str, dict[str, str]], - by_title: dict[str, dict[str, str]], -) -> dict[str, str] | None: - """Return matching record info or ``None``.""" - if new_alias and new_alias in by_alias: - rec = by_alias[new_alias] - reason = f"alias '{new_alias}'" - elif new_title and new_title in by_title: - rec = by_title[new_title] - reason = f"title '{new_title}'" - else: - return None - - return { - "accession": rec.get("accession", ""), - "secondary_accession": rec.get( - "secondary_accession", "" - ), - "alias": rec.get("alias", ""), - "title": rec.get("title", ""), - "status": rec.get("status", "UNKNOWN"), - "match_reason": reason, - } - - # ----------------------------------------------------------- # Result output # 
----------------------------------------------------------- @@ -695,79 +435,6 @@ def write_results( print(json_str) -# ----------------------------------------------------------- -# Reports API (study-specific) -# ----------------------------------------------------------- - -_PROD_REPORTS_URL: Final = "https://www.ebi.ac.uk/ena/submit/report/projects" -_TEST_REPORTS_URL: Final = "https://wwwdev.ebi.ac.uk/ena/submit/report/projects" - - -def _normalize_study_report( - report: dict[str, Any], -) -> dict[str, str]: - """Normalise a raw study report dict.""" - return { - "title": ( - report.get("title") or report.get("studyTitle") or report.get("STUDY_TITLE", "") - ), - "alias": report.get("alias") or report.get("studyAlias") or "", - "accession": ( - report.get("accession") - or report.get("studyAccession") - or report.get("report", {}).get("id", "") - ), - "secondary_accession": report.get("secondaryAccession") or report.get("secondaryId", ""), - "status": report.get("releaseStatus", "UNKNOWN"), - } - - -def fetch_account_studies( - auth: HTTPBasicAuth, - use_test: bool = False, - max_results: int = 5000, -) -> list[dict[str, str]]: - """Fetch all projects from the Webin Reports API. - - Args: - auth: HTTP basic-auth credentials. - use_test: Try the test endpoint before production. - max_results: Maximum number of results to request. - - Returns: - List of normalised study dicts. - """ - return fetch_account_records( - auth, - use_test=use_test, - prod_url=_PROD_REPORTS_URL, - test_url=_TEST_REPORTS_URL, - normalizer=_normalize_study_report, - entity_label="studies", - max_results=max_results, - ) - - -def find_duplicate_studies( - new_studies: list[dict[str, Any]], - account_studies: list[dict[str, str]], -) -> dict[int, dict[str, str]]: - """Check new studies against existing account studies. - - Args: - new_studies: Studies the user wants to submit. - account_studies: Existing studies in the account. - - Returns: - Mapping of index to matching study info. 
- """ - return find_duplicates_by_alias_title( - new_studies, account_studies, - title_field="STUDY_TITLE", - entity_label="studies", - ) - - # ----------------------------------------------------------- # XML construction # ----------------------------------------------------------- @@ -1110,39 +777,19 @@ def _do_submission( help="Path to write JSON accession results (default: stdout)", ) @click.option( - "--max-results", - default=5000, - help="Maximum number of projects to fetch from the Reports API for duplicate checking", -) -@click.option( - "--dry-run", + "--validate", is_flag=True, default=False, help="Validate and build XML but do not submit to ENA", ) -@click.option( - "--automated", - is_flag=True, default=False, - help="Skip duplicate detection against the Webin Reports API (for automated pipelines)", -) -@click.option( - "--force", - is_flag=True, default=False, - help="Submit duplicate studies using the MODIFY action to overwrite existing ENA records," - " instead of skipping them", -) def main( input_file: Path, use_test: bool, hold_until: str | None, log_file: Path | None, output: Path | None, - max_results: int, - dry_run: bool, - automated: bool, - force: bool, + validate: bool, ) -> None: """Submit studies to ENA via the Webin REST API v2.""" - setup_logging(log_file) username, password = get_credentials() env_label = "TEST" if use_test else "PRODUCTION" @@ -1165,133 +812,44 @@ def main( logger.info("Loaded %d study/studies from input", len(studies)) - # -- Step 2: Check for duplicates -------------------- - if automated: - logger.info("Automated mode: skipping duplicate detection") - duplicates: dict[int, dict[str, Any]] = {} - else: - account_studies = fetch_account_studies( - auth, use_test=use_test, - max_results=max_results, - ) - for ps in account_studies: - logger.info( - " Account study: %s | alias=%s | title=%s | status=%s", - ps["accession"], ps["alias"], ps["title"], ps["status"], - ) - duplicates = find_duplicate_studies( - studies, 
account_studies, - ) + if not studies: + logger.info("No studies to submit") + write_results({"submitted": [], "failed": []}, output) + return results: dict[str, list[dict[str, Any]]] = { - "duplicates": [], "submitted": [], - "modified": [], "failed": [], } - studies_to_modify: list[dict[str, Any]] = [] - if duplicates: - action_label = "will be re-submitted with MODIFY" if force else "will NOT be submitted" - logger.warning( - "Found %d duplicate(s) — %s:", - len(duplicates), action_label, - ) - for idx, dup_info in duplicates.items(): - study_title = studies[idx].get("STUDY_TITLE", f"study[{idx}]") - logger.warning( - " DUPLICATE: '%s' matches existing %s (accession: %s)", - study_title, dup_info["match_reason"], dup_info["accession"], - ) - results["duplicates"].append({ - "input_index": idx, - "title": study_title, - "alias": studies[idx].get("alias", ""), - "existing_accession": dup_info["accession"], - "existing_secondary_accession": dup_info.get("secondary_accession", ""), - "match_reason": dup_info["match_reason"], - }) - if force: - study_copy = dict(studies[idx]) - existing_alias = dup_info.get("alias", "") - if existing_alias: - study_copy["alias"] = existing_alias - studies_to_modify.append(study_copy) - - studies_to_submit = [ - s for i, s in enumerate(studies) - if i not in duplicates - ] - - if not studies_to_submit and not studies_to_modify: - logger.info("No studies to submit (all are duplicates or input is empty)") - write_results(results, output) - return - - logger.info( - "%d new study/studies to ADD, %d duplicate(s) to MODIFY", - len(studies_to_submit), len(studies_to_modify), + # -- Step 2: Build and submit XML -------------------- + logger.info("Building ADD XML for %d study/studies...", len(studies)) + xml_root = build_submission_xml(studies, hold_until=hold_until, action="ADD") + xml_bytes = xml_to_bytes(xml_root) + logger.debug("Generated XML:\n%s", xml_bytes.decode("utf-8")) + logger.info("XML document size: %d bytes", len(xml_bytes)) + 
ok = _do_submission( + base_url, auth, xml_bytes, + action="ADD", + results=results, + result_key="submitted", + env_label=env_label, + dry_run=validate, ) - overall_ok = True - - # -- Step 3: ADD new studies ------------------------- - if studies_to_submit: - logger.info("Building ADD XML for %d new study/studies...", len(studies_to_submit)) - xml_root = build_submission_xml(studies_to_submit, hold_until=hold_until, action="ADD") - xml_bytes = xml_to_bytes(xml_root) - logger.debug("Generated XML (ADD):\n%s", xml_bytes.decode("utf-8")) - logger.info("XML document size (ADD): %d bytes", len(xml_bytes)) - ok = _do_submission( - base_url, auth, xml_bytes, - action="ADD", - results=results, - result_key="submitted", - env_label=env_label, - dry_run=dry_run, - ) - overall_ok = overall_ok and ok - - # -- Step 4: MODIFY duplicate studies (--force) ------ - if studies_to_modify: - logger.info("Building MODIFY XML for %d duplicate(s)...", len(studies_to_modify)) - xml_root = build_submission_xml(studies_to_modify, hold_until=hold_until, action="MODIFY") - xml_bytes = xml_to_bytes(xml_root) - logger.debug("Generated XML (MODIFY):\n%s", xml_bytes.decode("utf-8")) - logger.info("XML document size (MODIFY): %d bytes", len(xml_bytes)) - ok = _do_submission( - base_url, auth, xml_bytes, - action="MODIFY", - results=results, - result_key="modified", - env_label=env_label, - dry_run=dry_run, - ) - overall_ok = overall_ok and ok - - if not overall_ok: + if not ok: sys.exit(1) - # -- Step 5: Output results -------------------------- + # -- Step 3: Output results -------------------------- write_results(results, output) logger.info("=" * 60) logger.info("SUBMISSION SUMMARY") - logger.info( - " Duplicates skipped: %d", len(results["duplicates"]) - len(results["modified"]), - ) - for d in results["duplicates"]: - logger.info(" %s -> %s", d["title"], d["existing_accession"]) - logger.info(" Newly submitted (ADD): %d", len(results["submitted"])) + logger.info(" Submitted (ADD): %d", 
len(results["submitted"])) for s in results["submitted"]: ext = s.get("external_accession", "") ext_suffix = f" ({ext})" if ext else "" logger.info(" %s -> %s%s", s["alias"], s["accession"], ext_suffix) - logger.info(" Modified (MODIFY): %d", len(results["modified"])) - for m in results["modified"]: - ext = m.get("external_accession", "") - ext_suffix = f" ({ext})" if ext else "" - logger.info(" %s -> %s%s", m["alias"], m["accession"], ext_suffix) logger.info("=" * 60) From 1e87af775a500e8de78ee7c6a7a1c572539fc1f1 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Fri, 13 Mar 2026 13:33:52 +0000 Subject: [PATCH 15/36] Remove unused code --- bin/submit_study.py | 49 +-------------------------------------------- 1 file changed, 1 insertion(+), 48 deletions(-) diff --git a/bin/submit_study.py b/bin/submit_study.py index c2b165e..57edb92 100755 --- a/bin/submit_study.py +++ b/bin/submit_study.py @@ -182,53 +182,6 @@ def validate_hold_until(hold_until: str) -> datetime.date: return hold_date -# ----------------------------------------------------------- -# ENA checklist XML parsing -# ----------------------------------------------------------- - - -def parse_checklist_units( - xml_path: str | Path, -) -> dict[str, str]: - """Parse an ENA checklist XML and return field units. - - Reads the ```` elements from an ENA checklist XML - file (e.g. ``ERC000015.xml``) and returns a mapping from - slot name to unit string for every field that declares a - ```` element. - - Args: - xml_path: Path to the ENA checklist XML file. - - Returns: - Dict mapping slot name to unit string. - Fields without units are absent from the dict. 
- """ - units: dict[str, str] = {} - try: - tree = ET.parse(str(xml_path)) - except ET.ParseError as exc: - logger.warning( - "Could not parse checklist XML %s: %s", - xml_path, exc, - ) - return units - - for field in tree.iter("FIELD"): - name_el = field.find("NAME") - if name_el is None or not name_el.text: - continue - units_el = field.find("UNITS") - if units_el is None: - continue - unit_el = units_el.find("UNIT") - if unit_el is None or not unit_el.text: - continue - units[name_el.text.strip()] = unit_el.text.strip() - - return units - - # ----------------------------------------------------------- # XSD validation (structural fallback only) # ----------------------------------------------------------- @@ -746,7 +699,7 @@ def _do_submission( @click.command( - help="Submit raw-reads, assembly and genome studies to ENA via the Webin REST API v2.", + help="Submit studies to ENA via the Webin REST API v2.", ) @click.option( "--input", "input_file", From ffc4e904a712fd53c0ab66620a3aa6b5c70b7fbb Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Fri, 13 Mar 2026 13:40:18 +0000 Subject: [PATCH 16/36] Move test fixtures to nf-croe test-datasets --- assets/test-fixtures/example_study.csv | 3 --- assets/test-fixtures/example_study.json | 15 --------------- assets/test-fixtures/example_study.tsv | 3 --- modules/local/registerstudy/tests/main.nf.test | 4 ++-- tests/default.nf.test | 2 +- 5 files changed, 3 insertions(+), 24 deletions(-) delete mode 100644 assets/test-fixtures/example_study.csv delete mode 100644 assets/test-fixtures/example_study.json delete mode 100644 assets/test-fixtures/example_study.tsv diff --git a/assets/test-fixtures/example_study.csv b/assets/test-fixtures/example_study.csv deleted file mode 100644 index 2b68cc1..0000000 --- a/assets/test-fixtures/example_study.csv +++ /dev/null @@ -1,3 +0,0 @@ -Generic,,,,,,,, -IS_PRIMARY,STUDY_TITLE,existing_study_type,new_study_type,STUDY_ABSTRACT,CENTER_NAME,CENTER_PROJECT_NAME,PROJECT_ID,STUDY_DESCRIPTION 
-YES,MIMICC,Metagenomics,,,,,, \ No newline at end of file diff --git a/assets/test-fixtures/example_study.json b/assets/test-fixtures/example_study.json deleted file mode 100644 index cd9af28..0000000 --- a/assets/test-fixtures/example_study.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "schema": "https://github.com/timrozday/ena-submission-dataharmonizer/SRA_study", - "location": "/templates/sra_study", - "version": "1.0.0", - "in_language": "en", - "Container": { - "SRA_studys": [ - { - "IS_PRIMARY": "YES", - "STUDY_TITLE": "MIMICC", - "existing_study_type": "Metagenomics" - } - ] - } -} \ No newline at end of file diff --git a/assets/test-fixtures/example_study.tsv b/assets/test-fixtures/example_study.tsv deleted file mode 100644 index 4682df1..0000000 --- a/assets/test-fixtures/example_study.tsv +++ /dev/null @@ -1,3 +0,0 @@ -Generic -IS_PRIMARY STUDY_TITLE existing_study_type new_study_type STUDY_ABSTRACT CENTER_NAME CENTER_PROJECT_NAME PROJECT_ID STUDY_DESCRIPTION -YES MIMICC Metagenomics \ No newline at end of file diff --git a/modules/local/registerstudy/tests/main.nf.test b/modules/local/registerstudy/tests/main.nf.test index 43c72eb..5d61673 100644 --- a/modules/local/registerstudy/tests/main.nf.test +++ b/modules/local/registerstudy/tests/main.nf.test @@ -15,7 +15,7 @@ nextflow_process { """ input[0] = [ [ id:'example_study' ], - file("${projectDir}/assets/test-fixtures/example_study.json", checkIfExists: true) + file(params.pipelines_testdata_base_path + "/test_data/study_metadata/example_study.json", checkIfExists: true) ] """ } @@ -38,7 +38,7 @@ nextflow_process { """ input[0] = [ [ id:'example_study' ], - file("${projectDir}/assets/test-fixtures/example_study.json", checkIfExists: true) + file(params.pipelines_testdata_base_path + "/test_data/study_metadata/example_study.json", checkIfExists: true) ] """ } diff --git a/tests/default.nf.test b/tests/default.nf.test index 4a3b628..9ed7563 100644 --- a/tests/default.nf.test +++ b/tests/default.nf.test @@ 
-65,7 +65,7 @@ nextflow_pipeline { outdir = "$outputDir" input = csv.absolutePath mode = "metagenomic_assemblies" - study_metadata = "${projectDir}/assets/test-fixtures/example_study.json" + study_metadata = params.pipelines_testdata_base_path + "/test_data/study_metadata/example_study.json" centre_name = "TEST_CENTER" } } From 3824311ce3f94ea762aa0a40de40f846b447d819 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Fri, 13 Mar 2026 14:04:50 +0000 Subject: [PATCH 17/36] Fix the tests using nf-core test-datasets and removing inputs and outputs that have been removed previously --- .../tests/main.nf.test.snap | 8 ++++---- modules/local/registerstudy/main.nf | 2 +- .../local/registerstudy/tests/main.nf.test.snap | 4 ++-- modules/local/registerstudy/tests/nextflow.config | 14 +++++++++----- tests/default.nf.test | 2 +- workflows/assemblysubmit.nf | 1 - 6 files changed, 17 insertions(+), 14 deletions(-) diff --git a/modules/local/generate_assembly_manifest/tests/main.nf.test.snap b/modules/local/generate_assembly_manifest/tests/main.nf.test.snap index 5f5b1d7..cf8a9e1 100644 --- a/modules/local/generate_assembly_manifest/tests/main.nf.test.snap +++ b/modules/local/generate_assembly_manifest/tests/main.nf.test.snap @@ -11,7 +11,7 @@ ] ], "1": [ - "versions.yml:md5,0664035de44b4d88c1a70a357c1a24f2" + "versions.yml:md5,4711ed8f2fd35e895aefafebd29f0333" ], "manifest": [ [ @@ -22,16 +22,16 @@ ] ], "versions": [ - "versions.yml:md5,0664035de44b4d88c1a70a357c1a24f2" + "versions.yml:md5,4711ed8f2fd35e895aefafebd29f0333" ] }, { "GENERATE_ASSEMBLY_MANIFEST": { - "assembly_uploader": "assembly_uploader 1.3.4" + "assembly_uploader": null } } ], - "timestamp": "2026-03-13T12:32:28.183967", + "timestamp": "2026-03-13T14:02:14.937082", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.4" diff --git a/modules/local/registerstudy/main.nf b/modules/local/registerstudy/main.nf index 67766e0..99533da 100644 --- a/modules/local/registerstudy/main.nf +++ b/modules/local/registerstudy/main.nf 
@@ -37,7 +37,7 @@ process REGISTERSTUDY { stub: def prefix = task.ext.prefix ?: "${meta.id}" """ - echo '{"submitted":[],"duplicates":[],"modified":[],"failed":[]}' > ${prefix}_accessions.json + echo '{"submitted":[],"failed":[]}' > ${prefix}_accessions.json cat <<-END_VERSIONS > versions.yml "${task.process}": diff --git a/modules/local/registerstudy/tests/main.nf.test.snap b/modules/local/registerstudy/tests/main.nf.test.snap index 5e2fde1..385b735 100644 --- a/modules/local/registerstudy/tests/main.nf.test.snap +++ b/modules/local/registerstudy/tests/main.nf.test.snap @@ -7,7 +7,7 @@ { "id": "example_study" }, - "example_study_accessions.json:md5,e43f257b8a1ffb551bf993867c5f1d7f" + "example_study_accessions.json:md5,83600b2fb33a560c25351dbd4a9bdba2" ] ], "versions": [ @@ -15,7 +15,7 @@ ] } ], - "timestamp": "2026-03-13T12:14:02.650852", + "timestamp": "2026-03-13T14:02:21.161445", "meta": { "nf-test": "0.9.4", "nextflow": "25.10.4" diff --git a/modules/local/registerstudy/tests/nextflow.config b/modules/local/registerstudy/tests/nextflow.config index f22b24f..aaf8385 100644 --- a/modules/local/registerstudy/tests/nextflow.config +++ b/modules/local/registerstudy/tests/nextflow.config @@ -1,15 +1,14 @@ // Test configuration for REGISTERSTUDY module. -// --test : use the ENA dev server (submissions are discarded daily) -// --automated : skip the Webin Reports duplicate-checking API call -// --dry-run : validate and build XML but do not submit to ENA +// --test : use the ENA dev server (submissions are discarded daily) +// --validate : validate and build XML but do not submit to ENA // -// Dummy credentials are sufficient for --dry-run --automated mode since +// Dummy credentials are sufficient for --validate mode since // no HTTP calls are made. 
For real submission tests, replace with secrets: // env { ENA_WEBIN = secrets.WEBIN_ACCOUNT; ENA_WEBIN_PASSWORD = secrets.WEBIN_PASSWORD } process { withName: REGISTERSTUDY { - ext.args = '--test --automated --dry-run' + ext.args = '--test --validate' } } @@ -17,3 +16,8 @@ env { ENA_WEBIN = 'Webin-000000' ENA_WEBIN_PASSWORD = 'dummy-password' } + +docker { + enabled = true + runOptions = '-u $(id -u):$(id -g)' +} diff --git a/tests/default.nf.test b/tests/default.nf.test index 9ed7563..b436ff9 100644 --- a/tests/default.nf.test +++ b/tests/default.nf.test @@ -65,7 +65,7 @@ nextflow_pipeline { outdir = "$outputDir" input = csv.absolutePath mode = "metagenomic_assemblies" - study_metadata = params.pipelines_testdata_base_path + "/test_data/study_metadata/example_study.json" + study_metadata = "https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/seqsubmit/test_data/study_metadata/example_study.json" centre_name = "TEST_CENTER" } } diff --git a/workflows/assemblysubmit.nf b/workflows/assemblysubmit.nf index ec1309f..a7897ba 100644 --- a/workflows/assemblysubmit.nf +++ b/workflows/assemblysubmit.nf @@ -162,7 +162,6 @@ workflow ASSEMBLYSUBMIT { .map { _meta, json -> def data = new groovy.json.JsonSlurper().parse(json) data.submitted[0]?.accession - ?: data.duplicates[0]?.existing_accession } } From 5d0c83ee25fa6d744574ea21a2c96a4a14040a70 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Fri, 13 Mar 2026 14:07:47 +0000 Subject: [PATCH 18/36] Remove references to dataharmonizer --- bin/submit_study.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/bin/submit_study.py b/bin/submit_study.py index 57edb92..04fff2a 100755 --- a/bin/submit_study.py +++ b/bin/submit_study.py @@ -1,10 +1,8 @@ #!/usr/bin/env python3 """Submit raw-reads, assembly and genome studies to ENA via the Webin REST API v2. 
-Read a DataHarmonizer export containing study metadata, -check for duplicate studies already registered under the -Webin account, construct an XML submission document, and -submit new studies to ENA. +Read a study metadata file (JSON, CSV, or TSV), construct an +XML submission document, and submit new studies to ENA. Credentials are read from environment variables to avoid secrets appearing in shell history or process listings:: @@ -235,9 +233,10 @@ def validate_xml_against_xsd( def _is_metadata_row(row: Sequence[object]) -> bool: - """Check whether *row* is a DataHarmonizer label row. + """Check whether *row* is a non-data header/metadata row. - These rows have at most one non-empty cell. + Such rows have at most one non-empty cell and are skipped + during record extraction. """ non_empty = sum( 1 for c in row @@ -252,8 +251,8 @@ def extract_records_from_tabular( ) -> list[dict[str, str]]: """Extract record dicts from a CSV or TSV file. - Skip an optional DataHarmonizer metadata row if - detected. + Skip an optional leading metadata/label row if detected + (a row with at most one non-empty cell). Args: filepath: Path to the tabular file. @@ -294,11 +293,11 @@ def extract_records_from_json( input_data: object, record_keys: Sequence[str] = ("data",), ) -> list[dict[str, Any]] | None: - """Extract record dicts from a DataHarmonizer JSON export. + """Extract record dicts from a JSON input. Handle several JSON shapes: - * DataHarmonizer Container format:: + * Container format (e.g. 
DataHarmonizer exports):: {"Container": {"s": [{...}, ...]}} From fc14c4b8c406ad39f9d07631171f75d9c9cebd3d Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Fri, 13 Mar 2026 14:10:03 +0000 Subject: [PATCH 19/36] Remove xml validation from submit study script --- bin/submit_study.py | 73 --------------------------------------------- 1 file changed, 73 deletions(-) diff --git a/bin/submit_study.py b/bin/submit_study.py index 04fff2a..cc9d698 100755 --- a/bin/submit_study.py +++ b/bin/submit_study.py @@ -180,53 +180,6 @@ def validate_hold_until(hold_until: str) -> datetime.date: return hold_date -# ----------------------------------------------------------- -# XSD validation (structural fallback only) -# ----------------------------------------------------------- - - -def validate_xml_against_xsd( - xml_bytes: bytes, - _fragment_tag: str | None = None, # unused; kept for API compatibility - fallback_checker: Callable[ - [bytes, list[str]], tuple[bool, list[str]] - ] | None = None, -) -> tuple[bool, list[str]]: - """Validate XML bytes using a structural check. - - Full XSD validation via lxml is not available in this - container. Uses *fallback_checker* if provided, - otherwise checks that the document is well-formed XML. - - Args: - xml_bytes: Serialised XML document. - _fragment_tag: Unused; kept for API compatibility. - fallback_checker: Optional function called with - (*xml_bytes*, *messages*) that returns - (*is_valid*, *messages*). - - Returns: - Tuple of (*is_valid*, *messages*). 
- """ - messages: list[str] = [] - - if fallback_checker is not None: - return fallback_checker(xml_bytes, messages) - - try: - ET.fromstring(xml_bytes) - except ET.ParseError as exc: - messages.append( - f"ERROR: XML is not well-formed: {exc}" - ) - return False, messages - - messages.append( - "XML is well-formed (basic check passed)" - ) - return True, messages - - # ----------------------------------------------------------- # File loading (JSON, CSV, TSV) # ----------------------------------------------------------- @@ -543,23 +496,6 @@ def _validate_study_xml_structure( return True, messages -def validate_study_xml( - xml_bytes: bytes, -) -> tuple[bool, list[str]]: - """Validate study XML structure. - - Args: - xml_bytes: Serialised XML document. - - Returns: - Tuple of (*is_valid*, *messages*). - """ - return validate_xml_against_xsd( - xml_bytes, - fallback_checker=_validate_study_xml_structure, - ) - - # ----------------------------------------------------------- # Receipt parsing # ----------------------------------------------------------- @@ -642,15 +578,6 @@ def _do_submission( Returns: ``True`` if the batch succeeded (or dry run). 
""" - xml_valid, xml_messages = validate_study_xml(xml_bytes) - for msg in xml_messages: - logger.info(" %s", msg) - if not xml_valid: - logger.error("XML validation FAILED (%s) — aborting submission", action) - return False - - logger.info("XML validation PASSED (%s)", action) - if dry_run: logger.info("DRY RUN — skipping %s submission", action) logger.info("Generated XML:\n%s", xml_bytes.decode("utf-8")) From 7e586c44ed7e3953a6a69da2fb67df4e807a3fb2 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Fri, 13 Mar 2026 14:13:49 +0000 Subject: [PATCH 20/36] Split e2e tests into seperate files per workflow --- tests/default.nf.test | 84 -------------------------------------- tests/default.nf.test.snap | 58 -------------------------- 2 files changed, 142 deletions(-) delete mode 100644 tests/default.nf.test delete mode 100644 tests/default.nf.test.snap diff --git a/tests/default.nf.test b/tests/default.nf.test deleted file mode 100644 index b436ff9..0000000 --- a/tests/default.nf.test +++ /dev/null @@ -1,84 +0,0 @@ -nextflow_pipeline { - - name "Test pipeline" - script "../main.nf" - tag "pipeline" - - test("metagenomic_assemblies mode — submission_study provided (no study registration)") { - // Exercises the assembly submission path using a pre-registered study (stub mode). - // REGISTERSTUDY is NOT called here; the module-level nf-test covers it. - // - // A samplesheet is generated on the fly with absolute paths so that nf-schema - // validation succeeds regardless of the nf-test launchDir. 
- options "-stub" - - when { - params { - def csv = new File("${outputDir}/samplesheet_assembly.csv") - csv.parentFile.mkdirs() - csv.text = [ - "sample,fasta,fastq_1,fastq_2,coverage,run_accession,assembler,assembler_version", - "sample1,${projectDir}/tests/data/contigs.fasta.gz,${projectDir}/tests/data/fastq_1.fastq,${projectDir}/tests/data/fastq_2.fastq,,ERR000001,SPAdes,3.15", - "sample2,${projectDir}/tests/data/invalid_assembly.fasta.gz,,,45,ERR000002,Velvet,1.2.10", - "sample3,${projectDir}/tests/data/contigs.fasta.gz,,,30,ERR000003,MEGAHIT,1.2.9" - ].join("\n") - - outdir = "$outputDir" - input = csv.absolutePath - mode = "metagenomic_assemblies" - submission_study = "PRJEB98843" - centre_name = "TEST_CENTER" - } - } - - then { - def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) - def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') - assertAll( - { assert workflow.success }, - { assert snapshot( - removeNextflowVersion("$outputDir/pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml"), - stable_name, - stable_path - ).match() } - ) - } - } - - test("metagenomic_assemblies mode — study_metadata provided (REGISTERSTUDY registers study)") { - // Tests the study-registration path in stub mode. REGISTERSTUDY stub - // outputs an empty accessions JSON, so this test validates the plumbing rather - // than the end-to-end submission output. 
- options "-stub" - - when { - params { - def csv = new File("${outputDir}/samplesheet_assembly.csv") - csv.parentFile.mkdirs() - csv.text = [ - "sample,fasta,fastq_1,fastq_2,coverage,run_accession,assembler,assembler_version", - "sample1,${projectDir}/tests/data/contigs.fasta.gz,${projectDir}/tests/data/fastq_1.fastq,${projectDir}/tests/data/fastq_2.fastq,,ERR000001,SPAdes,3.15", - "sample2,${projectDir}/tests/data/invalid_assembly.fasta.gz,,,45,ERR000002,Velvet,1.2.10", - "sample3,${projectDir}/tests/data/contigs.fasta.gz,,,30,ERR000003,MEGAHIT,1.2.9" - ].join("\n") - - outdir = "$outputDir" - input = csv.absolutePath - mode = "metagenomic_assemblies" - study_metadata = "https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/seqsubmit/test_data/study_metadata/example_study.json" - centre_name = "TEST_CENTER" - } - } - - then { - assertAll( - { assert workflow.success }, - { assert workflow.trace.succeeded().any { it.name.contains("REGISTERSTUDY") } } - ) - } - } - - // NOTE: The MAGs/bins test requires remote genome files from nf-core/test-datasets - // (https://github.com/nf-core/test-datasets/tree/seqsubmit) and cannot run offline. 
- // Run it manually with: nf-test test tests/default.nf.test --filter "mags" --profile test_genome,docker -} diff --git a/tests/default.nf.test.snap b/tests/default.nf.test.snap deleted file mode 100644 index 71a254e..0000000 --- a/tests/default.nf.test.snap +++ /dev/null @@ -1,58 +0,0 @@ -{ - "metagenomic_assemblies mode \u2014 submission_study provided (no study registration)": { - "content": [ - { - "Workflow": { - "nf-core/seqsubmit": "v1.0.0dev" - } - }, - [ - "coverm", - "coverm/sample1.depth.txt", - "fastavalidator", - "fastavalidator/sample1.success.log", - "fastavalidator/sample2.success.log", - "fastavalidator/sample3.success.log", - "generate", - "generate/PRJEB98843_upload", - "generate/PRJEB98843_upload/test.manifest", - "metagenomic_assemblies", - "metagenomic_assemblies/multiqc", - "metagenomic_assemblies/multiqc/multiqc_data", - "metagenomic_assemblies/multiqc/multiqc_plots", - "metagenomic_assemblies/multiqc/multiqc_report.html", - "metagenomic_assemblies/sample1_assembly_metadata.csv", - "metagenomic_assemblies/sample2_assembly_metadata.csv", - "metagenomic_assemblies/sample3_assembly_metadata.csv", - "metagenomic_assemblies/upload", - "metagenomic_assemblies/upload/webin_cli", - "metagenomic_assemblies/upload/webin_cli/sample1_webin-cli.report", - "metagenomic_assemblies/upload/webin_cli/sample2_webin-cli.report", - "metagenomic_assemblies/upload/webin_cli/sample3_webin-cli.report", - "pipeline_info", - "pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml", - "samplesheet_assembly.csv" - ], - [ - "sample1.depth.txt:md5,d41d8cd98f00b204e9800998ecf8427e", - "sample1.success.log:md5,b0b859eda1db5cd43915846e00ebc22c", - "sample2.success.log:md5,b0b859eda1db5cd43915846e00ebc22c", - "sample3.success.log:md5,b0b859eda1db5cd43915846e00ebc22c", - "test.manifest:md5,d41d8cd98f00b204e9800998ecf8427e", - "multiqc_report.html:md5,d41d8cd98f00b204e9800998ecf8427e", - "sample1_assembly_metadata.csv:md5,e1a00dc628e95c38e18dfd5161fa2ce4", - 
"sample2_assembly_metadata.csv:md5,901e55730b100224efb27f23aabf4f67", - "sample3_assembly_metadata.csv:md5,d5b1575095ece78d988395b874440bef", - "sample1_webin-cli.report:md5,d41d8cd98f00b204e9800998ecf8427e", - "sample2_webin-cli.report:md5,d41d8cd98f00b204e9800998ecf8427e", - "sample3_webin-cli.report:md5,d41d8cd98f00b204e9800998ecf8427e", - "samplesheet_assembly.csv:md5,2f74b281cb7096ad80a378b8960aabee" - ] - ], - "timestamp": "2026-03-12T13:22:15.261886", - "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.4" - } - } -} \ No newline at end of file From d80420be2a86248360dd4bec9995b88d77985585 Mon Sep 17 00:00:00 2001 From: Tim Rozday Date: Fri, 13 Mar 2026 14:24:18 +0000 Subject: [PATCH 21/36] nf-core linting --- ro-crate-metadata.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index befa2db..3ff34eb 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -23,7 +23,7 @@ "@type": "Dataset", "creativeWorkStatus": "InProgress", "datePublished": "2025-11-20T09:32:34+00:00", - "description": "

\n \n \n \"nf-core/seqsubmit\"\n \n

\n\n[![Open in GitHub Codespaces](https://img.shields.io/badge/Open_In_GitHub_Codespaces-black?labelColor=grey&logo=github)](https://github.com/codespaces/new/nf-core/seqsubmit)\n[![GitHub Actions CI Status](https://github.com/nf-core/seqsubmit/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/seqsubmit/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/seqsubmit/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/seqsubmit/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/seqsubmit/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.04.0-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.5.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.5.1)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/seqsubmit)\n\n[![Get help on 
Slack](http://img.shields.io/badge/slack-nf--core%20%23seqsubmit-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/seqsubmit)[![Follow on Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/seqsubmit** is a Nextflow pipeline for submitting sequence data to [ENA](https://www.ebi.ac.uk/ena/browser/home).\nCurrently, the pipeline supports three submission modes, each routed to a dedicated workflow and requiring its own input samplesheet structure:\n\n- `mags` for Metagenome Assembled Genomes (MAGs) submission with `GENOMESUBMIT` workflow\n- `bins` for bins submission with `GENOMESUBMIT` workflow\n- `metagenomic_assemblies` for assembly submission with `ASSEMBLYSUBMIT` workflow\n\n![seqsubmit workflow diagram](assets/seqsubmit_schema.png)\n\n## Requirements\n\n- [Nextflow](https://www.nextflow.io/) `>=25.04.0`\n- Webin account registered at https://www.ebi.ac.uk/ena/submit/webin/login\n- Raw reads used to assemble contigs submitted to [INSDC](https://www.insdc.org/) and associated accessions available\n\nSetup your environment secrets before running the pipeline:\n\n`nextflow secrets set WEBIN_ACCOUNT \"Webin-XXX\"`\n\n`nextflow secrets set WEBIN_PASSWORD \"XXX\"`\n\nMake sure you update commands above with your authorised credentials.\n\n## Input samplesheets\n\n### `mags` and `bins` modes (`GENOMESUBMIT`)\n\nThe input must follow `assets/schema_input_genome.json`.\n\nRequired columns:\n\n- `sample`\n- `fasta` (must end with `.fa.gz` or `.fasta.gz`)\n- `accession`\n- `assembly_software`\n- `binning_software`\n- `binning_parameters`\n- 
`stats_generation_software`\n- `metagenome`\n- `environmental_medium`\n- `broad_environment`\n- `local_environment`\n- `co-assembly`\n\nColumns that required for now, but will be optional in the nearest future:\n\n- `completeness`\n- `contamination`\n- `genome_coverage`\n- `rRNA_presence`\n- `NCBI_lineage`\n\nThose fields are metadata required for [genome_uploader](https://github.com/EBI-Metagenomics/genome_uploader) package. They are described in [docs](https://github.com/EBI-Metagenomics/genome_uploader/blob/main/README.md#input-tsv-and-fields).\n\nExample `samplesheet_genome.csv`:\n\n```csv\nsample,fasta,accession,assembly_software,binning_software,binning_parameters,stats_generation_software,completeness,contamination,genome_coverage,metagenome,co-assembly,broad_environment,local_environment,environmental_medium,rRNA_presence,NCBI_lineage\nlachnospira_eligens,data/bin_lachnospira_eligens.fa.gz,SRR24458089,spades_v3.15.5,metabat2_v2.6,default,CheckM2_v1.0.1,61.0,0.21,32.07,sediment metagenome,false,marine,cable_bacteria,marine_sediment,false,d__Bacteria;p__Proteobacteria;s_unclassified_Proteobacteria\n```\n\n### `metagenomic_assemblies` mode (`ASSEMBLYSUBMIT`)\n\nThe input must follow `assets/schema_input_assembly.json`.\n\nRequired columns:\n\n- `sample`\n- `fasta` (must end with `.fa.gz` or `.fasta.gz`)\n- `run_accession`\n- `assembler`\n- `assembler_version`\n\nAt least one of the following must be provided per row:\n\n- reads (`fastq_1`, optional `fastq_2` for paired-end)\n- `coverage`\n\nIf `coverage` is missing and reads are provided, the workflow calculates average coverage with `coverm`.\n\nExample `samplesheet_assembly.csv`:\n\n```csv\nsample,fasta,fastq_1,fastq_2,coverage,run_accession,assembler,assembler_version\nassembly_1,data/contigs_1.fasta.gz,data/reads_1.fastq.gz,data/reads_2.fastq.gz,,ERR011322,SPAdes,3.15.5\nassembly_2,data/contigs_2.fasta.gz,,,42.7,ERR011323,MEGAHIT,1.2.9\n```\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and 
nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\n### Required parameters:\n\n| Parameter | Description |\n| -------------------- | --------------------------------------------------------------------------------- |\n| `--mode` | Type of the data to be submitted. Options: `[mags, bins, metagenomic_assemblies]` |\n| `--input` | Path to the samplesheet describing the data to be submitted |\n| `--outdir` | Path to the output directory for pipeline results |\n| `--submission_study` | ENA study accession (PRJ/ERP) to submit the data to |\n| `--centre_name` | Name of the submitter's organisation |\n\n### Optional parameters:\n\n| Parameter | Description |\n| ------------------- | ---------------------------------------------------------------------------------------- |\n| `--upload_tpa` | Flag to control the type of assembly study (third party assembly or not). Default: false |\n| `--test_upload` | Upload to TEST ENA server instead of LIVE. Default: false |\n| `--webincli_submit` | If set to false, submissions will be validated, but not submitted. 
Default: true |\n\nGeneral command template:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile \\\n --mode \\\n --input \\\n --centre_name \\\n --submission_study \\\n --outdir \n```\n\nValidation run (submission to the ENA TEST server) in `mags` mode:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile docker \\\n --mode mags \\\n --input assets/samplesheet_genomes.csv \\\n --submission_study \\\n --centre_name TEST_CENTER \\\n --webincli_submit true \\\n --test_upload true \\\n --outdir results/validate_mags\n```\n\nValidation run (submission to the ENA TEST server) in `metagenomic_assemblies` mode:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile docker \\\n --mode metagenomic_assemblies \\\n --input assets/samplesheet_assembly.csv \\\n --submission_study \\\n --centre_name TEST_CENTER \\\n --webincli_submit true \\\n --test_upload true \\\n --outdir results/validate_assemblies\n```\n\nLive submission example:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile docker \\\n --mode metagenomic_assemblies \\\n --input assets/samplesheet_assembly.csv \\\n --submission_study PRJEB98843 \\\n --test_upload false \\\n --webincli_submit true \\\n --outdir results/live_assembly\n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/seqsubmit/usage) and the [parameter documentation](https://nf-co.re/seqsubmit/parameters).\n\n## Pipeline output\n\nKey output locations in `--outdir`:\n\n- `upload/manifests/`: generated manifest files for submission\n- `upload/webin_cli/`: ENA Webin CLI reports\n- `multiqc/`: MultiQC summary report\n- `pipeline_info/`: execution reports, trace, DAG, and software versions\n\nFor full details, see the [output documentation](https://nf-co.re/seqsubmit/output).\n\n## Credits\n\nnf-core/seqsubmit was originally written by [Martin Beracochea](https://github.com/mberacochea), [Ekaterina Sakharova](https://github.com/KateSakharova), [Sofiia Ochkalova](https://github.com/ochkalova), [Evangelos Karatzas](https://github.com/vagkaratzas).\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#seqsubmit` channel](https://nfcore.slack.com/channels/seqsubmit) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\n\n\n\n\n\n\nIf you use this pipeline please make sure to cite all used software.\nThis pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/main/LICENSE).\n\n> **MGnify: the microbiome sequence data analysis resource in 2023**\n>\n> Richardson L, Allen B, Baldi G, Beracochea M, Bileschi ML, Burdett T, et al.\n>\n> Vol. 51, Nucleic Acids Research. Oxford University Press (OUP); 2022. p. 
D753\u20139. Available from: http://dx.doi.org/10.1093/nar/gkac1080\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", + "description": "

\n \n \n \"nf-core/seqsubmit\"\n \n

\n\n[![Open in GitHub Codespaces](https://img.shields.io/badge/Open_In_GitHub_Codespaces-black?labelColor=grey&logo=github)](https://github.com/codespaces/new/nf-core/seqsubmit)\n[![GitHub Actions CI Status](https://github.com/nf-core/seqsubmit/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/seqsubmit/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/seqsubmit/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/seqsubmit/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/seqsubmit/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.04.0-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.5.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.5.1)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/seqsubmit)\n\n[![Get help on 
Slack](http://img.shields.io/badge/slack-nf--core%20%23seqsubmit-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/seqsubmit)[![Follow on Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/seqsubmit** is a Nextflow pipeline for submitting sequence data to [ENA](https://www.ebi.ac.uk/ena/browser/home).\nCurrently, the pipeline supports three submission modes, each routed to a dedicated workflow and requiring its own input samplesheet structure:\n\n- `mags` for Metagenome Assembled Genomes (MAGs) submission with `GENOMESUBMIT` workflow\n- `bins` for bins submission with `GENOMESUBMIT` workflow\n- `metagenomic_assemblies` for assembly submission with `ASSEMBLYSUBMIT` workflow\n\n![seqsubmit workflow diagram](assets/seqsubmit_schema.png)\n\n## Requirements\n\n- [Nextflow](https://www.nextflow.io/) `>=25.04.0`\n- Webin account registered at https://www.ebi.ac.uk/ena/submit/webin/login\n- Raw reads used to assemble contigs submitted to [INSDC](https://www.insdc.org/) and associated accessions available\n\nSetup your environment secrets before running the pipeline:\n\n`nextflow secrets set WEBIN_ACCOUNT \"Webin-XXX\"`\n\n`nextflow secrets set WEBIN_PASSWORD \"XXX\"`\n\nMake sure you update commands above with your authorised credentials.\n\n## Input samplesheets\n\nFor detailed descriptions of all samplesheet columns, see the [usage documentation](docs/usage.md#samplesheet-input).\n\n### `mags` and `bins` modes (`GENOMESUBMIT`)\n\nThe input must follow `assets/schema_input_genome.json`.\n\nRequired columns:\n\n- `sample`\n- `fasta` (must end with `.fa.gz` or 
`.fasta.gz`)\n- `accession`\n- `assembly_software`\n- `binning_software`\n- `binning_parameters`\n- `stats_generation_software`\n- `metagenome`\n- `environmental_medium`\n- `broad_environment`\n- `local_environment`\n- `co-assembly`\n\nColumns that required for now, but will be optional in the nearest future:\n\n- `completeness`\n- `contamination`\n- `genome_coverage`\n- `RNA_presence`\n- `NCBI_lineage`\n\nThose fields are metadata required for [genome_uploader](https://github.com/EBI-Metagenomics/genome_uploader) package.\n\nExample `samplesheet_genome.csv`:\n\n```csv\nsample,fasta,accession,assembly_software,binning_software,binning_parameters,stats_generation_software,completeness,contamination,genome_coverage,metagenome,co-assembly,broad_environment,local_environment,environmental_medium,RNA_presence,NCBI_lineage\nlachnospira_eligens,data/bin_lachnospira_eligens.fa.gz,SRR24458089,spades_v3.15.5,metabat2_v2.6,default,CheckM2_v1.0.1,61.0,0.21,32.07,sediment metagenome,No,marine,cable_bacteria,marine_sediment,No,d__Bacteria;p__Proteobacteria;s_unclassified_Proteobacteria\n```\n\n### `metagenomic_assemblies` mode (`ASSEMBLYSUBMIT`)\n\nThe input must follow `assets/schema_input_assembly.json`.\n\nRequired columns:\n\n- `sample`\n- `fasta` (must end with `.fa.gz` or `.fasta.gz`)\n- `run_accession`\n- `assembler`\n- `assembler_version`\n\nAt least one of the following must be provided per row:\n\n- reads (`fastq_1`, optional `fastq_2` for paired-end)\n- `coverage`\n\nIf `coverage` is missing and reads are provided, the workflow calculates average coverage with `coverm`.\n\nExample `samplesheet_assembly.csv`:\n\n```csv\nsample,fasta,fastq_1,fastq_2,coverage,run_accession,assembler,assembler_version\nassembly_1,data/contigs_1.fasta.gz,data/reads_1.fastq.gz,data/reads_2.fastq.gz,,ERR011322,SPAdes,3.15.5\nassembly_2,data/contigs_2.fasta.gz,,,42.7,ERR011323,MEGAHIT,1.2.9\n```\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this 
page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\n### Required parameters:\n\n| Parameter | Description |\n| -------------------- | --------------------------------------------------------------------------------- |\n| `--mode` | Type of the data to be submitted. Options: `[mags, bins, metagenomic_assemblies]` |\n| `--input` | Path to the samplesheet describing the data to be submitted |\n| `--outdir` | Path to the output directory for pipeline results |\n| `--submission_study` | ENA study accession (PRJ/ERP) to submit the data to |\n| `--centre_name` | Name of the submitter's organisation |\n\n### Optional parameters:\n\n| Parameter | Description |\n| ------------------- | ---------------------------------------------------------------------------------------- |\n| `--upload_tpa` | Flag to control the type of assembly study (third party assembly or not). Default: false |\n| `--test_upload` | Upload to TEST ENA server instead of LIVE. Default: false |\n| `--webincli_submit` | If set to false, submissions will be validated, but not submitted. 
Default: true |\n\nGeneral command template:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile \\\n --mode \\\n --input \\\n --centre_name \\\n --submission_study \\\n --outdir \n```\n\nValidation run (submission to the ENA TEST server) in `mags` mode:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile docker \\\n --mode mags \\\n --input assets/samplesheet_genomes.csv \\\n --submission_study \\\n --centre_name TEST_CENTER \\\n --webincli_submit true \\\n --test_upload true \\\n --outdir results/validate_mags\n```\n\nValidation run (submission to the ENA TEST server) in `metagenomic_assemblies` mode:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile docker \\\n --mode metagenomic_assemblies \\\n --input assets/samplesheet_assembly.csv \\\n --submission_study \\\n --centre_name TEST_CENTER \\\n --webincli_submit true \\\n --test_upload true \\\n --outdir results/validate_assemblies\n```\n\nLive submission example:\n\n```bash\nnextflow run nf-core/seqsubmit \\\n -profile docker \\\n --mode metagenomic_assemblies \\\n --input assets/samplesheet_assembly.csv \\\n --submission_study PRJEB98843 \\\n --test_upload false \\\n --webincli_submit true \\\n --outdir results/live_assembly\n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. 
Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/seqsubmit/usage) and the [parameter documentation](https://nf-co.re/seqsubmit/parameters).\n\n## Pipeline output\n\nKey output locations in `--outdir`:\n\n- `upload/manifests/`: generated manifest files for submission\n- `upload/webin_cli/`: ENA Webin CLI reports\n- `multiqc/`: MultiQC summary report\n- `pipeline_info/`: execution reports, trace, DAG, and software versions\n\nFor full details, see the [output documentation](https://nf-co.re/seqsubmit/output).\n\n## Credits\n\nnf-core/seqsubmit was originally written by [Martin Beracochea](https://github.com/mberacochea), [Ekaterina Sakharova](https://github.com/KateSakharova), [Sofiia Ochkalova](https://github.com/ochkalova), [Evangelos Karatzas](https://github.com/vagkaratzas).\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#seqsubmit` channel](https://nfcore.slack.com/channels/seqsubmit) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\n\n\n\n\n\n\nIf you use this pipeline please make sure to cite all used software.\nThis pipeline uses code and infrastructure developed and maintained by the [nf-core](https://nf-co.re) community, reused here under the [MIT license](https://github.com/nf-core/tools/blob/main/LICENSE).\n\n> **MGnify: the microbiome sequence data analysis resource in 2023**\n>\n> Richardson L, Allen B, Baldi G, Beracochea M, Bileschi ML, Burdett T, et al.\n>\n> Vol. 51, Nucleic Acids Research. Oxford University Press (OUP); 2022. p. 
D753\u20139. Available from: http://dx.doi.org/10.1093/nar/gkac1080\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", "hasPart": [ { "@id": "main.nf" From 0d743f25a9417978d94195146fa9eec6c767d860 Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova Date: Mon, 23 Mar 2026 14:48:13 +0000 Subject: [PATCH 22/36] remove dead code --- bin/submit_study.py | 59 +++------------------------------------------ conf/modules.config | 2 +- 2 files changed, 4 insertions(+), 57 deletions(-) diff --git a/bin/submit_study.py b/bin/submit_study.py index cc9d698..b644ca3 100755 --- a/bin/submit_study.py +++ b/bin/submit_study.py @@ -8,10 +8,11 @@ secrets appearing in shell history or process listings:: export ENA_WEBIN=Webin-XXXXX - export ENA_WEBIN_PASSWORD=SECRET + export ENA_WEBIN_PASSWORD=XXXXX Usage:: + # Submission to TEST server (submissions are discarded daily): python bin/submit_study.py \\ --input studies.json \\ --test @@ -36,7 +37,7 @@ import os import sys import xml.etree.ElementTree as ET -from collections.abc import Callable, Sequence +from collections.abc import Sequence from io import BytesIO from pathlib import Path from typing import Any, Final @@ -449,53 +450,6 @@ def _add_project_attribute( val_el.text = value_text -# ----------------------------------------------------------- -# Structural XML validation (study-specific) -# ----------------------------------------------------------- - - -def _validate_study_xml_structure( - xml_bytes: bytes, - messages: list[str], -) -> tuple[bool, 
list[str]]: - """Structural check for study XML.""" - try: - tree = ET.fromstring(xml_bytes) - except ET.ParseError as exc: - messages.append( - f"ERROR: XML is not well-formed: {exc}" - ) - return False, messages - - messages.append( - "XML is well-formed (basic check passed)" - ) - - project_set = tree.find("PROJECT_SET") - if project_set is None: - messages.append("ERROR: Missing PROJECT_SET element") - return False, messages - - projects = project_set.findall("PROJECT") - if not projects: - messages.append("ERROR: No PROJECT elements found") - return False, messages - - for proj in projects: - alias = proj.get("alias", "") - title = proj.find("TITLE") - if title is None or not title.text: - messages.append(f"ERROR: PROJECT '{alias}' missing TITLE") - return False, messages - sp = proj.find("SUBMISSION_PROJECT") - if sp is None: - messages.append(f"ERROR: PROJECT '{alias}' missing SUBMISSION_PROJECT") - return False, messages - messages.append(f"OK: PROJECT '{alias}' has required elements") - - return True, messages - - # ----------------------------------------------------------- # Receipt parsing # ----------------------------------------------------------- @@ -643,12 +597,6 @@ def _do_submission( default=None, help="Hold studies private until this date (YYYY-MM-DD, max 2 years from now)", ) -@click.option( - "--log", "log_file", - type=click.Path(path_type=Path), - default=None, - help="Path to log file", -) @click.option( "--output", type=click.Path(path_type=Path), @@ -664,7 +612,6 @@ def main( input_file: Path, use_test: bool, hold_until: str | None, - log_file: Path | None, output: Path | None, validate: bool, ) -> None: diff --git a/conf/modules.config b/conf/modules.config index 94f2e94..4497a5f 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -57,6 +57,6 @@ process { } withName: 'REGISTERSTUDY' { - ext.args = { [params.test_upload ? "--test" : "", "--automated"].findAll().join(" ") } + ext.args = { params.test_upload ? 
"--test" : "" } } } From 763caeadcf75aff130d1d31d73b25c88d755f5f7 Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova Date: Mon, 23 Mar 2026 17:50:16 +0000 Subject: [PATCH 23/36] add input format examples, do minor simplifications of the code --- bin/submit_study.py | 102 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 89 insertions(+), 13 deletions(-) diff --git a/bin/submit_study.py b/bin/submit_study.py index b644ca3..0fed37e 100755 --- a/bin/submit_study.py +++ b/bin/submit_study.py @@ -1,9 +1,91 @@ #!/usr/bin/env python3 -"""Submit raw-reads, assembly and genome studies to ENA via the Webin REST API v2. +"""Submit studies to ENA via the Webin REST API v2. Read a study metadata file (JSON, CSV, or TSV), construct an XML submission document, and submit new studies to ENA. +# TODO: Currently script supports multiple input format that might be unnecessary. +# TODO: Consider standardising on a single format (e.g. JSON and/or TSV) and deprecating the others. +Input formats accepted (``--input``): + +* ``.json`` +* ``.csv`` +* ``.tsv`` + +Example JSON inputs accepted:: + + { + "alias": "study-gut-2026", + "STUDY_TITLE": "Gut microbiome study", + "STUDY_ABSTRACT": "Characterisation of gut microbial communities", + "existing_study_type": "Metagenomics" + } + + [ + { + "alias": "study-gut-2026", + "STUDY_TITLE": "Gut microbiome study", + "STUDY_ABSTRACT": "Characterisation of gut microbial communities", + "existing_study_type": "Metagenomics" + }, + ... 
+ ] + + { + "studies": [ + { + "alias": "study-soil-2026", + "STUDY_TITLE": "Soil microbiome study", + "existing_study_type": "Other", + "new_study_type": "Environmental microbiome" + } + ] + } + + { + "data": [ + { + "alias": "study-soil-2026", + "STUDY_TITLE": "Soil microbiome study", + } + ] + } + + { + "Container": { + "Studies": [ + { + "STUDY_TITLE": "Marine metagenome study", + "STUDY_ABSTRACT": "Shotgun metagenomics from seawater" + } + ] + } + } + +Example CSV input accepted:: + + alias,STUDY_TITLE,STUDY_ABSTRACT,existing_study_type + study-gut-2026,Gut microbiome study,Characterisation of gut microbial communities,Metagenomics + +Example TSV input accepted:: + + alias\tSTUDY_TITLE\tSTUDY_ABSTRACT\texisting_study_type + study-soil-2026\tSoil microbiome study\tSurvey of soil microbiota\tMetagenomics + +Study metadata fields: + +Mandatory: + +* ``STUDY_TITLE`` — study title used in ````. + +Optional: + +* ``alias`` — project alias; if missing, derived from ``STUDY_TITLE`` (first 50 characters). +* ``CENTER_PROJECT_NAME`` — written to ``<NAME>``; defaults to alias. +* ``STUDY_ABSTRACT`` or ``STUDY_DESCRIPTION`` — written to ``<DESCRIPTION>``. +* ``existing_study_type`` — included as PROJECT_ATTRIBUTE. +* ``new_study_type`` — included only when ``existing_study_type == "Other"``. 
+ Credentials are read from environment variables to avoid secrets appearing in shell history or process listings:: @@ -58,7 +140,7 @@ level=logging.INFO, stream=sys.stderr, ) -logger = logging.getLogger("ena_submit.study") +logger = logging.getLogger() # ----------------------------------------------------------- @@ -91,11 +173,6 @@ def get_credentials() -> tuple[str, str]: TEST_URL: Final = "https://wwwdev.ebi.ac.uk/ena/submit/webin-v2" -def get_base_url(use_test: bool) -> str: - """Return the ENA Webin v2 submission base URL.""" - return TEST_URL if use_test else PROD_URL - - def submit_xml( base_url: str, auth: HTTPBasicAuth, @@ -512,7 +589,6 @@ def _do_submission( xml_bytes: bytes, action: str, results: dict[str, list[dict[str, Any]]], - result_key: str, env_label: str, dry_run: bool, ) -> bool: @@ -525,8 +601,7 @@ def _do_submission( action: Label for log messages (``"ADD"`` or ``"MODIFY"``). results: Results dict to accumulate into. - result_key: Key under which successes are stored. - env_label: ``"TEST"`` or ``"PRODUCTION"``. + env_label: ``"TEST server"`` or ``"LIVE server"``. dry_run: If ``True``, skip the actual submission. 
Returns: @@ -559,7 +634,7 @@ def _do_submission( " %s: alias=%s accession=%s status=%s%s", action, acc["alias"], acc["accession"], acc["status"], ext_suffix, ) - results[result_key].append(acc) + results["submitted"].append(acc) else: logger.error("%s FAILED", action) receipt_xml_str = ET.tostring( @@ -618,9 +693,10 @@ def main( """Submit studies to ENA via the Webin REST API v2.""" username, password = get_credentials() - env_label = "TEST" if use_test else "PRODUCTION" + env_label = "TEST server" if use_test else "LIVE server" logger.info("ENA Study Submission — environment: %s", env_label) - base_url = get_base_url(use_test) + base_url = TEST_URL if use_test else PROD_URL + auth = HTTPBasicAuth(username, password) logger.debug("Auth username: %s", username) From ac77c707864b1368dcdff471fe12a6897ccff78d Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova <so.ochkalova@gmail.com> Date: Mon, 23 Mar 2026 17:50:45 +0000 Subject: [PATCH 24/36] update meta.yml for registerstudy --- modules/local/registerstudy/meta.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/modules/local/registerstudy/meta.yml b/modules/local/registerstudy/meta.yml index 549f187..385c79e 100644 --- a/modules/local/registerstudy/meta.yml +++ b/modules/local/registerstudy/meta.yml @@ -2,11 +2,10 @@ name: "registerstudy" description: | Submit a new study to ENA via the Webin REST API v2. - Reads study metadata from a JSON, CSV, or TSV file, checks for - duplicate studies already registered under the Webin account, + Reads study metadata from a JSON, CSV, or TSV file, builds a PROJECT XML submission document, and submits to ENA. Credentials are read from the WEBIN_ACCOUNT and WEBIN_PASSWORD - Nextflow secrets, which are mapped to ENA_WEBIN and ENA_WEBIN_PASSWORD + env variables, which are mapped to ENA_WEBIN and ENA_WEBIN_PASSWORD inside the process. 
keywords: - ena @@ -53,8 +52,7 @@ output: type: file description: | JSON file containing the submission results with keys: - submitted (newly created accessions), duplicates (skipped), - modified (force-updated), and failed. + submitted (newly created accessions) and failed. pattern: "*_accessions.json" - versions: - "versions.yml": From 6ede603bcde4716b51612ccd1c486249c479a33f Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova <so.ochkalova@gmail.com> Date: Tue, 24 Mar 2026 11:35:24 +0000 Subject: [PATCH 25/36] refactor and bugfix registerstudy --- bin/submit_study.py | 193 ++++++++++-------- conf/modules.config | 5 +- .../local/registerstudy/tests/nextflow.config | 11 +- tests/nextflow.config | 29 --- 4 files changed, 109 insertions(+), 129 deletions(-) delete mode 100644 tests/nextflow.config diff --git a/bin/submit_study.py b/bin/submit_study.py index 0fed37e..0c88bfd 100755 --- a/bin/submit_study.py +++ b/bin/submit_study.py @@ -1,11 +1,13 @@ #!/usr/bin/env python3 -"""Submit studies to ENA via the Webin REST API v2. +"""Submit studies to ENA via the Webin drop-box XML submission service. Read a study metadata file (JSON, CSV, or TSV), construct an XML submission document, and submit new studies to ENA. # TODO: Currently script supports multiple input format that might be unnecessary. # TODO: Consider standardising on a single format (e.g. JSON and/or TSV) and deprecating the others. +# TODO: Consider which columns are mandatory vs optional. "alias" is optional, might be worth making it mandatory. +# TODO: Add input file validation and error handling (e.g. missing mandatory fields, long alias). 
Input formats accepted (``--input``): * ``.json`` @@ -114,6 +116,7 @@ import csv import datetime +import hashlib import json import logging import os @@ -169,33 +172,34 @@ def get_credentials() -> tuple[str, str]: # ENA API helpers # ----------------------------------------------------------- -PROD_URL: Final = "https://www.ebi.ac.uk/ena/submit/webin-v2" -TEST_URL: Final = "https://wwwdev.ebi.ac.uk/ena/submit/webin-v2" +PROD_URL: Final = "https://www.ebi.ac.uk/ena/submit/drop-box" +TEST_URL: Final = "https://wwwdev.ebi.ac.uk/ena/submit/drop-box" def submit_xml( base_url: str, auth: HTTPBasicAuth, - xml_bytes: bytes, + submission_xml_bytes: bytes, + project_xml_bytes: bytes, ) -> ET.Element: - """Submit an XML document to ENA via Webin v2. + """Submit study XMLs to ENA via the submit/drop-box endpoint. Args: base_url: ENA submission service base URL. auth: HTTP basic-auth credentials. - xml_bytes: Serialised XML submission document. + submission_xml_bytes: Serialised ``<SUBMISSION>`` XML. + project_xml_bytes: Serialised ``<PROJECT_SET>`` XML. Returns: Parsed receipt XML element tree root. """ url = f"{base_url}/submit" - headers = { - "Content-Type": "application/xml", - "Accept": "application/xml", + files = { + "SUBMISSION": ("submission.xml", submission_xml_bytes, "application/xml"), + "PROJECT": ("project.xml", project_xml_bytes, "application/xml"), } resp = requests.post( - url, data=xml_bytes, - headers=headers, auth=auth, timeout=120, + url, files=files, auth=auth, timeout=120, ) resp.raise_for_status() return ET.fromstring(resp.content) @@ -423,35 +427,24 @@ def write_results( # ----------------------------------------------------------- -def build_submission_xml( - studies: list[dict[str, Any]], +def build_submission_actions_xml( hold_until: str | None = None, action: str = "ADD", ) -> ET.Element: - """Build a WEBIN XML document for submitting studies. + """Build the ``<SUBMISSION>`` actions XML element. 
- Each study in the input list is converted to a PROJECT - element. + This is submitted as the ``SUBMISSION`` multipart field. Args: - studies: Study metadata dicts. hold_until: Optional hold-until date string (``YYYY-MM-DD``). action: Submission action — ``"ADD"`` for new studies or ``"MODIFY"`` to update existing ones. Returns: - Root ``<WEBIN>`` element. + Root ``<SUBMISSION>`` element. """ - webin = ET.Element("WEBIN") - - # SUBMISSION_SET - submission_set = ET.SubElement(webin, "SUBMISSION_SET") - submission = ET.SubElement( - submission_set, "SUBMISSION", - ) - sub_alias = f"study-submission-{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}" - submission.set("alias", sub_alias) + submission = ET.Element("SUBMISSION") actions = ET.SubElement(submission, "ACTIONS") main_action = ET.SubElement(actions, "ACTION") ET.SubElement(main_action, action.upper()) @@ -459,59 +452,74 @@ def build_submission_xml( hold_action = ET.SubElement(actions, "ACTION") hold_el = ET.SubElement(hold_action, "HOLD") hold_el.set("HoldUntilDate", hold_until) + return submission - # PROJECT_SET - project_set = ET.SubElement(webin, "PROJECT_SET") - for study in studies: - _add_project_element(project_set, study) - - return webin - - -def _add_project_element( - project_set: ET.Element, - study: dict[str, Any], -) -> None: - """Append a ``<PROJECT>`` element to *project_set*.""" - alias = study.get( - "alias", - study.get("STUDY_TITLE", "").replace(" ", "_")[:50], - ) - project = ET.SubElement(project_set, "PROJECT") - project.set("alias", alias) - - name_text = study.get("CENTER_PROJECT_NAME", alias) - if name_text: - name_el = ET.SubElement(project, "NAME") - name_el.text = name_text - title_el = ET.SubElement(project, "TITLE") - title_el.text = study.get("STUDY_TITLE", "") +def build_project_set_xml( + studies: list[dict[str, Any]], + test: bool = False, +) -> ET.Element: + """Build the ``<PROJECT_SET>`` XML element. 
- desc_text = ( - study.get("STUDY_ABSTRACT") - or study.get("STUDY_DESCRIPTION", "") - ) - if desc_text: - desc_el = ET.SubElement(project, "DESCRIPTION") - desc_el.text = desc_text + This is submitted as the ``PROJECT`` multipart field. - sp = ET.SubElement(project, "SUBMISSION_PROJECT") - ET.SubElement(sp, "SEQUENCING_PROJECT") + Args: + studies: Study metadata dicts. + test: If ``True``, append a timestamp-based hash to aliases + for uniqueness in test submissions. - study_type = study.get("existing_study_type") - if study_type: - attrs = ET.SubElement( - project, "PROJECT_ATTRIBUTES", + Returns: + Root ``<PROJECT_SET>`` element. + """ + project_set = ET.Element("PROJECT_SET") + for study in studies: + alias = study.get( + "alias", + study.get("STUDY_TITLE", "").replace(" ", "_")[:50], ) - _add_project_attribute( - attrs, "existing_study_type", study_type, + if test: + # Append 8-character hash of current timestamp for uniqueness in test mode + timestamp_hash = hashlib.md5( + datetime.datetime.now().isoformat().encode() + ).hexdigest()[:8] + alias = f"{alias}_{timestamp_hash}" + + project = ET.SubElement(project_set, "PROJECT") + project.set("alias", alias) + + name_text = study.get("CENTER_PROJECT_NAME", alias) + if name_text: + name_el = ET.SubElement(project, "NAME") + name_el.text = name_text + + title_el = ET.SubElement(project, "TITLE") + title_el.text = study.get("STUDY_TITLE", "") + + desc_text = ( + study.get("STUDY_ABSTRACT") + or study.get("STUDY_DESCRIPTION", "") ) - new_type = study.get("new_study_type") - if new_type and study_type == "Other": + if desc_text: + desc_el = ET.SubElement(project, "DESCRIPTION") + desc_el.text = desc_text + + sp = ET.SubElement(project, "SUBMISSION_PROJECT") + ET.SubElement(sp, "SEQUENCING_PROJECT") + # TODO: Check existing_study_type and new_study_type metadata fields, do we need those? 
+ study_type = study.get("existing_study_type") + if study_type: + attrs = ET.SubElement( + project, "PROJECT_ATTRIBUTES", + ) _add_project_attribute( - attrs, "new_study_type", new_type, + attrs, "existing_study_type", study_type, ) + new_type = study.get("new_study_type") + if new_type and study_type == "Other": + _add_project_attribute( + attrs, "new_study_type", new_type, + ) + return project_set def _add_project_attribute( @@ -554,6 +562,8 @@ def parse_xml_receipt( for err in msgs_el.findall("ERROR"): messages.append(f"ERROR: {err.text}") + # TODO: "accession" should be present for successful submissions + # TODO: remove get default and log error if missing. for proj in receipt_root.findall("PROJECT"): acc_info: dict[str, str] = { "alias": proj.get("alias", ""), @@ -586,7 +596,8 @@ def parse_xml_receipt( def _do_submission( base_url: str, auth: Any, - xml_bytes: bytes, + submission_xml_bytes: bytes, + project_xml_bytes: bytes, action: str, results: dict[str, list[dict[str, Any]]], env_label: str, @@ -595,9 +606,10 @@ def _do_submission( """Validate, optionally submit, and parse one batch. Args: - base_url: ENA Webin v2 submission base URL. + base_url: ENA submission base URL. auth: HTTP basic-auth credentials. - xml_bytes: Serialised XML submission document. + submission_xml_bytes: Serialised ``<SUBMISSION>`` actions XML. + project_xml_bytes: Serialised ``<PROJECT_SET>`` XML. action: Label for log messages (``"ADD"`` or ``"MODIFY"``). results: Results dict to accumulate into. 
@@ -609,12 +621,13 @@ def _do_submission( """ if dry_run: logger.info("DRY RUN — skipping %s submission", action) - logger.info("Generated XML:\n%s", xml_bytes.decode("utf-8")) + logger.info("SUBMISSION XML:\n%s", submission_xml_bytes.decode("utf-8")) + logger.info("PROJECT XML:\n%s", project_xml_bytes.decode("utf-8")) return True logger.info("Submitting %s to ENA (%s)...", action, env_label) try: - receipt_root = submit_xml(base_url, auth, xml_bytes) + receipt_root = submit_xml(base_url, auth, submission_xml_bytes, project_xml_bytes) except requests.exceptions.HTTPError as exc: logger.error("HTTP error during %s submission: %s", action, exc) if exc.response is not None: @@ -654,7 +667,7 @@ def _do_submission( @click.command( - help="Submit studies to ENA via the Webin REST API v2.", + help="Register studies with ENA using Webin XML submission service.", ) @click.option( "--input", "input_file", @@ -690,7 +703,7 @@ def main( output: Path | None, validate: bool, ) -> None: - """Submit studies to ENA via the Webin REST API v2.""" + """Register studies with ENA using Webin XML submission service.""" username, password = get_credentials() env_label = "TEST server" if use_test else "LIVE server" @@ -726,15 +739,18 @@ def main( # -- Step 2: Build and submit XML -------------------- logger.info("Building ADD XML for %d study/studies...", len(studies)) - xml_root = build_submission_xml(studies, hold_until=hold_until, action="ADD") - xml_bytes = xml_to_bytes(xml_root) - logger.debug("Generated XML:\n%s", xml_bytes.decode("utf-8")) - logger.info("XML document size: %d bytes", len(xml_bytes)) + submission_root = build_submission_actions_xml(hold_until=hold_until, action="ADD") + project_root = build_project_set_xml(studies, test=use_test) + submission_xml_bytes = xml_to_bytes(submission_root) + project_xml_bytes = xml_to_bytes(project_root) + logger.info("SUBMISSION XML document size: %d bytes", len(submission_xml_bytes)) + logger.debug("SUBMISSION XML:\n%s", 
submission_xml_bytes.decode("utf-8")) + logger.info("PROJECT XML document size: %d bytes", len(project_xml_bytes)) + logger.debug("PROJECT XML:\n%s", project_xml_bytes.decode("utf-8")) ok = _do_submission( - base_url, auth, xml_bytes, + base_url, auth, submission_xml_bytes, project_xml_bytes, action="ADD", results=results, - result_key="submitted", env_label=env_label, dry_run=validate, ) @@ -748,10 +764,11 @@ def main( logger.info("=" * 60) logger.info("SUBMISSION SUMMARY") logger.info(" Submitted (ADD): %d", len(results["submitted"])) - for s in results["submitted"]: - ext = s.get("external_accession", "") - ext_suffix = f" ({ext})" if ext else "" - logger.info(" %s -> %s%s", s["alias"], s["accession"], ext_suffix) + for submission in results["submitted"]: + alias = submission["alias"] + accession = submission["accession"] + external_accession = submission["external_accession"] + logger.info(f" {alias} -> {accession} ({external_accession})") logger.info("=" * 60) diff --git a/conf/modules.config b/conf/modules.config index c45656c..aab05b3 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -71,11 +71,8 @@ process { ] } - withName: 'GENERATE_ASSEMBLY_MANIFEST|ENA_WEBIN_CLI' { + withName: 'GENERATE_ASSEMBLY_MANIFEST|ENA_WEBIN_CLI|REGISTERSTUDY' { ext.args = { params.test_upload ? "--test" : "" } } - withName: 'REGISTERSTUDY' { - ext.args = { params.test_upload ? 
"--test" : "" } - } } diff --git a/modules/local/registerstudy/tests/nextflow.config b/modules/local/registerstudy/tests/nextflow.config index aaf8385..0a1acb3 100644 --- a/modules/local/registerstudy/tests/nextflow.config +++ b/modules/local/registerstudy/tests/nextflow.config @@ -8,16 +8,11 @@ process { withName: REGISTERSTUDY { - ext.args = '--test --validate' + ext.args = '--test' } } env { - ENA_WEBIN = 'Webin-000000' - ENA_WEBIN_PASSWORD = 'dummy-password' -} - -docker { - enabled = true - runOptions = '-u $(id -u):$(id -g)' + ENA_WEBIN = secrets.ENA_WEBIN + ENA_WEBIN_PASSWORD = secrets.ENA_WEBIN_PASSWORD } diff --git a/tests/nextflow.config b/tests/nextflow.config deleted file mode 100644 index be915f5..0000000 --- a/tests/nextflow.config +++ /dev/null @@ -1,29 +0,0 @@ -/* -======================================================================================== - Nextflow config file for running nf-test tests -======================================================================================== -*/ - -// TODO nf-core: Specify any additional parameters here -// Or any resources requirements -params { - modules_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/' - pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/seqsubmit' -} - -process { - resourceLimits = [ - cpus: 2, - memory: '8.GB', - time: '1.h' - ] -} - -// Override secrets-based env vars so nf-test runs don't require a populated keystore. -// Stub-mode tests never use the actual credentials. 
-env { - ENA_WEBIN = "test_webin_account" - ENA_WEBIN_PASSWORD = "test_webin_password" -} - -aws.client.anonymous = true // fixes S3 access issues on self-hosted runners From ab305dabe7da90ceab172af721999fab271f5cd2 Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova <so.ochkalova@gmail.com> Date: Tue, 24 Mar 2026 12:46:20 +0000 Subject: [PATCH 26/36] add real test submission for registerstudy in nf-tests --- bin/submit_study.py | 2 -- modules/local/registerstudy/meta.yml | 8 +++++--- modules/local/registerstudy/tests/main.nf.test | 16 +++++++--------- workflows/genomesubmit.nf | 1 - 4 files changed, 12 insertions(+), 15 deletions(-) diff --git a/bin/submit_study.py b/bin/submit_study.py index 0c88bfd..b4b700f 100755 --- a/bin/submit_study.py +++ b/bin/submit_study.py @@ -136,8 +136,6 @@ # Logging # ----------------------------------------------------------- -# All loggers in the ENA submission scripts share this root, -# so configuring it once propagates to all child loggers. logging.basicConfig( format="%(levelname)s: %(message)s", level=logging.INFO, diff --git a/modules/local/registerstudy/meta.yml b/modules/local/registerstudy/meta.yml index 385c79e..e3e3245 100644 --- a/modules/local/registerstudy/meta.yml +++ b/modules/local/registerstudy/meta.yml @@ -1,10 +1,10 @@ # yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json name: "registerstudy" description: | - Submit a new study to ENA via the Webin REST API v2. + Submit a new study to ENA via via the Webin drop-box XML submission service. Reads study metadata from a JSON, CSV, or TSV file, - builds a PROJECT XML submission document, and submits to ENA. - Credentials are read from the WEBIN_ACCOUNT and WEBIN_PASSWORD + builds SUBMISSION XML and PROJECT XML, and submits to ENA. + Credentials are read from the ENA_WEBIN and ENA_WEBIN_PASSWORD env variables, which are mapped to ENA_WEBIN and ENA_WEBIN_PASSWORD inside the process. 
keywords: @@ -62,5 +62,7 @@ output: authors: - "@timrozday" + - "@ochkalova" maintainers: - "@timrozday" + - "@ochkalova" diff --git a/modules/local/registerstudy/tests/main.nf.test b/modules/local/registerstudy/tests/main.nf.test index 5d61673..cdc0e69 100644 --- a/modules/local/registerstudy/tests/main.nf.test +++ b/modules/local/registerstudy/tests/main.nf.test @@ -7,8 +7,7 @@ nextflow_process { tag "modules" tag "registerstudy" - test("registerstudy - stub") { - options "-stub" + test("registerstudy - submission to ENA test server") { when { process { @@ -24,14 +23,15 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert snapshot(sanitizeOutput(process.out)).match() } + { assert path(process.out.accessions[0][1]).exists() }, + { assert path(process.out.accessions[0][1]).json.submitted instanceof List }, + { assert path(process.out.accessions[0][1]).json.failed.size() == 0 } ) } } - test("registerstudy - dry run against ENA test server") { - // Validates and builds the submission XML but does not submit to ENA. - // Dummy credentials in tests/nextflow.config are sufficient for dry-run mode. 
+ test("registerstudy - stub") { + options "-stub" when { process { @@ -47,9 +47,7 @@ nextflow_process { then { assertAll( { assert process.success }, - { assert path(process.out.accessions[0][1]).exists() }, - { assert path(process.out.accessions[0][1]).json.submitted instanceof List }, - { assert path(process.out.accessions[0][1]).json.failed.size() == 0 } + { assert snapshot(sanitizeOutput(process.out)).match() } ) } } diff --git a/workflows/genomesubmit.nf b/workflows/genomesubmit.nf index afc7859..4b8ca08 100644 --- a/workflows/genomesubmit.nf +++ b/workflows/genomesubmit.nf @@ -259,7 +259,6 @@ workflow GENOMESUBMIT { .map { _meta, json -> def data = new groovy.json.JsonSlurper().parse(json) data.submitted[0]?.accession - ?: data.duplicates[0]?.existing_accession } } From 1ddd2b15982fe07812799a486875f430df26a83c Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova <so.ochkalova@gmail.com> Date: Tue, 24 Mar 2026 12:58:13 +0000 Subject: [PATCH 27/36] update schema --- nextflow_schema.json | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/nextflow_schema.json b/nextflow_schema.json index 7e42ad9..d5d0ebb 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -308,7 +308,7 @@ "format": "file-path", "exists": true, "description": "Path to study metadata file (JSON, CSV, or TSV) for registering a new ENA study. Required when submission_study is not provided.", - "help_text": "File containing study metadata fields (STUDY_TITLE, STUDY_ABSTRACT, existing_study_type, alias, etc.). Used by REGISTERSTUDY to create a new study in ENA when no existing submission_study accession is given.", + "help_text": "File containing study metadata fields (required: STUDY_TITLE, optional: alias, STUDY_ABSTRACT, existing_study_type, etc.). 
Used by REGISTERSTUDY to create a new study in ENA when no existing submission_study accession is given.", "fa_icon": "fas fa-file-alt" }, "webincli_submit": { @@ -320,6 +320,20 @@ } } }, + "oneOf": [ + { + "required": ["submission_study"], + "not": { + "required": ["study_metadata"] + } + }, + { + "required": ["study_metadata"], + "not": { + "required": ["submission_study"] + } + } + ], "allOf": [ { "$ref": "#/$defs/input_output_options" From 6026318b4cd3fdda1bd44e1e1b8c19a80d062362 Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova <so.ochkalova@gmail.com> Date: Tue, 24 Mar 2026 13:07:52 +0000 Subject: [PATCH 28/36] revert some minor changes for branch merging later --- conf/modules.config | 1 - modules/local/ena_webin_cli/main.nf | 12 ----------- tests/default.nf.test | 33 +++++++++++++++++++++++++++++ tests/nextflow.config | 14 ++++++++++++ 4 files changed, 47 insertions(+), 13 deletions(-) create mode 100644 tests/default.nf.test create mode 100644 tests/nextflow.config diff --git a/conf/modules.config b/conf/modules.config index aab05b3..f828bef 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -74,5 +74,4 @@ process { withName: 'GENERATE_ASSEMBLY_MANIFEST|ENA_WEBIN_CLI|REGISTERSTUDY' { ext.args = { params.test_upload ? 
"--test" : "" } } - } diff --git a/modules/local/ena_webin_cli/main.nf b/modules/local/ena_webin_cli/main.nf index e5f878e..25b12f4 100644 --- a/modules/local/ena_webin_cli/main.nf +++ b/modules/local/ena_webin_cli/main.nf @@ -58,16 +58,4 @@ process ENA_WEBIN_CLI { false fi """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - """ - touch ${prefix}_webin-cli.report - export STATUS="success" - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - ena-webin-cli: 0.0.0 - END_VERSIONS - """ } diff --git a/tests/default.nf.test b/tests/default.nf.test new file mode 100644 index 0000000..44f2465 --- /dev/null +++ b/tests/default.nf.test @@ -0,0 +1,33 @@ +nextflow_pipeline { + + name "Test pipeline" + script "../main.nf" + tag "pipeline" + + test("-profile test") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + assertAll( + { assert workflow.success}, + { assert snapshot( + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/nextflow.config b/tests/nextflow.config new file mode 100644 index 0000000..695d52b --- /dev/null +++ b/tests/nextflow.config @@ -0,0 +1,14 @@ +/* +======================================================================================== + Nextflow config file for running nf-test tests 
+======================================================================================== +*/ + +// TODO nf-core: Specify any additional parameters here +// Or any resources requirements +params { + modules_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/' + pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/seqsubmit' +} + +aws.client.anonymous = true // fixes S3 access issues on self-hosted runners From 8ac674db79b7c85e9a5383538dd23dc81921c0d8 Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova <so.ochkalova@gmail.com> Date: Tue, 24 Mar 2026 13:08:09 +0000 Subject: [PATCH 29/36] update docs --- README.md | 20 ++++++++++------ docs/usage.md | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 450e725..fb8c2cf 100644 --- a/README.md +++ b/README.md @@ -116,15 +116,21 @@ assembly_2,data/contigs_2.fasta.gz,,,42.7,ERR011323,MEGAHIT,1.2.9 > [!NOTE] > If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. +### Submission study + +All data submitted through this pipeline must be associated with an ENA study (project). You can either pass an accession of your existing study via `--submission_study`or provide a metadata file via `--study_metadata` and the pipeline will register the study with ENA before submitting your data. + +See the [usage documentation](docs/usage.md#submission-study) for more details. + ### Required parameters: -| Parameter | Description | -| -------------------- | --------------------------------------------------------------------------------- | -| `--mode` | Type of the data to be submitted. 
Options: `[mags, bins, metagenomic_assemblies]` | -| `--input` | Path to the samplesheet describing the data to be submitted | -| `--outdir` | Path to the output directory for pipeline results | -| `--submission_study` | ENA study accession (PRJ/ERP) to submit the data to | -| `--centre_name` | Name of the submitter's organisation | +| Parameter | Description | +| ------------------------------------------ | ----------------------------------------------------------------------------------------------------------------- | +| `--mode` | Type of the data to be submitted. Options: `[mags, bins, metagenomic_assemblies]` | +| `--input` | Path to the samplesheet describing the data to be submitted | +| `--outdir` | Path to the output directory for pipeline results | +| `--submission_study` OR `--study_metadata` | ENA study accession (PRJ/ERP) to submit the data to OR metadata file in JSON/TSV/CSV format to register new study | +| `--centre_name` | Name of the submitter's organisation | ### Optional parameters: diff --git a/docs/usage.md b/docs/usage.md index 0833bb6..9cab0d0 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -99,6 +99,70 @@ assembly_002,data/assembly_002.fasta.gz,,,42.7,ERR011323,MEGAHIT,1.2.9 An example file is available at [assets/samplesheet_assembly.csv](../assets/samplesheet_assembly.csv). +## Submission study + +All data submitted through this pipeline must be associated with an ENA study (project). You have two options: + +### Option 1 — Use an existing study + +If you already have an ENA study, pass its accession (starting with `PRJ` or `ERP`) via `--submission_study`: + +```bash +--submission_study PRJEB12345 +``` + +You can create a study manually via the [Webin Portal](https://www.ebi.ac.uk/ena/submit/webin/login) and then use the assigned accession here. 
+ +### Option 2 — Register a new study automatically + +Provide a study metadata file via `--study_metadata` and the pipeline will register the study with ENA before submitting your data: + +```bash +--study_metadata study_metadata.json +``` + +The pipeline accepts JSON, CSV, and TSV formats. + +#### JSON formats + +Single study as a flat object: + +```json +{ + "alias": "study-gut-2026", + "STUDY_TITLE": "Gut microbiome study", + "STUDY_ABSTRACT": "Characterisation of gut microbial communities" +} +``` + +#### CSV format + +```csv +alias,STUDY_TITLE,STUDY_ABSTRACT +study-gut-2026,Gut microbiome study,Characterisation of gut microbial communities +``` + +#### TSV format + +```tsv +alias STUDY_TITLE STUDY_ABSTRACT +study-soil-2026 Soil microbiome study Survey of soil microbiota +``` + +#### Study metadata fields + +| Field | Required | Description | +| --------------------- | -------- | ------------------------------------------------------------------------------------------- | +| `STUDY_TITLE` | Yes | Descriptive title of the study. | +| `alias` | No | Unique project alias within your Webin account. Derived from `STUDY_TITLE` if not provided. | +| `STUDY_ABSTRACT` | No | Free-text abstract describing the study. | +| `STUDY_DESCRIPTION` | No | Alternative to `STUDY_ABSTRACT`. | +| `CENTER_PROJECT_NAME` | No | Internal project name at your centre. Defaults to `alias`. | +| `existing_study_type` | No | ENA study type (e.g. `Metagenomics`, `Other`). | +| `new_study_type` | No | Custom study type. Only used when `existing_study_type` is set to `Other`. | + +An example metadata file is available at [assets/study_metadata.json](../assets/study_metadata.json). 
+ ## Running the pipeline General command template: From 6eae0bd3286551d8d59d0dd8a9dd02647f3d6894 Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova <so.ochkalova@gmail.com> Date: Tue, 24 Mar 2026 13:08:40 +0000 Subject: [PATCH 30/36] add tests for pipeline with study registration --- ...assembly_no_study_complete_metadata.config | 37 +++++++++++++++++ ...test_mag_no_study_complete_metadata.config | 40 +++++++++++++++++++ ...ssembly_no_study_complete_metadata.nf.test | 39 ++++++++++++++++++ tests/mag_no_study_complete_metadata.nf.test | 39 ++++++++++++++++++ 4 files changed, 155 insertions(+) create mode 100644 conf/test_assembly_no_study_complete_metadata.config create mode 100644 conf/test_mag_no_study_complete_metadata.config create mode 100644 tests/assembly_no_study_complete_metadata.nf.test create mode 100644 tests/mag_no_study_complete_metadata.nf.test diff --git a/conf/test_assembly_no_study_complete_metadata.config b/conf/test_assembly_no_study_complete_metadata.config new file mode 100644 index 0000000..f729c77 --- /dev/null +++ b/conf/test_assembly_no_study_complete_metadata.config @@ -0,0 +1,37 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/seqsubmit -profile test,<docker/singularity> --outdir <OUTDIR> + +---------------------------------------------------------------------------------------- +*/ + +process { + resourceLimits = [ + cpus: 2, + memory: '8.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test --mode metagenomic_assemblies complete_metadata profile' + config_profile_description = 'Single-case assembly test with complete metadata values provided' + + // Input data + input = params.pipelines_testdata_base_path + 'seqsubmit/samplesheets/assembly_complete_metadata.csv' + + mode = "metagenomic_assemblies" + submission_study = null + study_metadata = "$projectDir/assets/study_metadata.json" + centre_name = "TEST_CENTER" + + test_upload = true + +} + +docker.enabled = true diff --git a/conf/test_mag_no_study_complete_metadata.config b/conf/test_mag_no_study_complete_metadata.config new file mode 100644 index 0000000..dd3e659 --- /dev/null +++ b/conf/test_mag_no_study_complete_metadata.config @@ -0,0 +1,40 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/seqsubmit -profile test,<docker/singularity> --outdir <OUTDIR> + +---------------------------------------------------------------------------------------- +*/ + +process { + resourceLimits = [ + cpus: 2, + memory: '16.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test --mode mags complete_metadata profile' + config_profile_description = 'Single-case MAG test with complete metadata values provided' + + // Input data + input = params.pipelines_testdata_base_path + 'seqsubmit/samplesheets/mag_complete_metadata.csv' + + mode = "mags" + submission_study = null + study_metadata = "$projectDir/assets/study_metadata.json" + centre_name = "TEST_CENTER" + + test_upload = true + + cat_db = null + checkm2_db = null + +} + +docker.enabled = true diff --git a/tests/assembly_no_study_complete_metadata.nf.test b/tests/assembly_no_study_complete_metadata.nf.test new file mode 100644 index 0000000..b6c857b --- /dev/null +++ b/tests/assembly_no_study_complete_metadata.nf.test @@ -0,0 +1,39 @@ +nextflow_pipeline { + + name "Test assembly submission workflow stub - complete_metadata" + script "../main.nf" + tag "pipeline" + tag "mode_assembly" + tag "test_assembly_no_study_complete_metadata" + profile "test_assembly_no_study_complete_metadata" + + test("-profile test_assembly_no_study_complete_metadata") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline 
versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} diff --git a/tests/mag_no_study_complete_metadata.nf.test b/tests/mag_no_study_complete_metadata.nf.test new file mode 100644 index 0000000..d585286 --- /dev/null +++ b/tests/mag_no_study_complete_metadata.nf.test @@ -0,0 +1,39 @@ +nextflow_pipeline { + + name "Test genome submission workflow - complete_metadata" + script "../main.nf" + tag "pipeline" + tag "mode_mag" + tag "test_mag_no_study_complete_metadata" + profile "test_mag_no_study_complete_metadata" + + test("-profile test_mag_no_study_complete_metadata") { + + when { + params { + outdir = "$outputDir" + } + } + + then { + // stable_name: All files + folders in ${params.outdir}/ with a stable name + def stable_name = getAllFilesFromDir(params.outdir, relative: true, includeDir: true, ignore: ['pipeline_info/*.{html,json,txt}', '**/manifests_test/*']) + // stable_path: All files in ${params.outdir}/ with stable content + def stable_path = getAllFilesFromDir(params.outdir, ignoreFile: 'tests/.nftignore') + // Early failure no need to test the rest of snapshots + assert workflow.success + assertAll( + { assert snapshot( + // Number of successful tasks + workflow.trace.succeeded().size(), + // pipeline versions.yml file for multiqc from which Nextflow version is removed because we test pipelines on multiple Nextflow versions + removeNextflowVersion("$outputDir/pipeline_info/nf_core_seqsubmit_software_mqc_versions.yml"), + // All stable path name, with a relative path + stable_name, + // All files with stable contents + stable_path + ).match() } + ) + } + } +} From cc29a3e4c04ee4ffe82214d12d2064fc9124de5d Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova 
<so.ochkalova@gmail.com> Date: Tue, 24 Mar 2026 14:27:39 +0000 Subject: [PATCH 31/36] apply linter --- modules/local/registerstudy/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/registerstudy/environment.yml b/modules/local/registerstudy/environment.yml index 6ee92a8..a5e1bf2 100644 --- a/modules/local/registerstudy/environment.yml +++ b/modules/local/registerstudy/environment.yml @@ -7,4 +7,4 @@ dependencies: - conda-forge::python>=3.12 - conda-forge::pip - pip: - - mgnify-pipelines-toolkit==1.4.17 + - mgnify-pipelines-toolkit==1.4.17 From bbd8305ce6ec19897838a5104ff1bbcacc062a06 Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova <so.ochkalova@gmail.com> Date: Tue, 24 Mar 2026 14:35:20 +0000 Subject: [PATCH 32/36] linter on .gitignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 601993a..1c11923 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,4 @@ testing* null/ .nf-test* .idea/ -test_data \ No newline at end of file +test_data From 1a82bec1da2906a6b7cf07296ccf4101d0d447f3 Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova <so.ochkalova@gmail.com> Date: Tue, 24 Mar 2026 15:26:29 +0000 Subject: [PATCH 33/36] revert to REST API usage --- bin/submit_study.py | 192 ++++++++++++++++++++++---------------------- 1 file changed, 95 insertions(+), 97 deletions(-) diff --git a/bin/submit_study.py b/bin/submit_study.py index b4b700f..7bae1ef 100755 --- a/bin/submit_study.py +++ b/bin/submit_study.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Submit studies to ENA via the Webin drop-box XML submission service. +"""Submit studies to ENA via the Webin REST API v2. Read a study metadata file (JSON, CSV, or TSV), construct an XML submission document, and submit new studies to ENA. 
@@ -170,34 +170,33 @@ def get_credentials() -> tuple[str, str]: # ENA API helpers # ----------------------------------------------------------- -PROD_URL: Final = "https://www.ebi.ac.uk/ena/submit/drop-box" -TEST_URL: Final = "https://wwwdev.ebi.ac.uk/ena/submit/drop-box" +PROD_URL: Final = "https://www.ebi.ac.uk/ena/submit/webin-v2" +TEST_URL: Final = "https://wwwdev.ebi.ac.uk/ena/submit/webin-v2" def submit_xml( base_url: str, auth: HTTPBasicAuth, - submission_xml_bytes: bytes, - project_xml_bytes: bytes, + xml_bytes: bytes, ) -> ET.Element: - """Submit study XMLs to ENA via the submit/drop-box endpoint. + """Submit an XML document to ENA via Webin REST API v2. Args: base_url: ENA submission service base URL. auth: HTTP basic-auth credentials. - submission_xml_bytes: Serialised ``<SUBMISSION>`` XML. - project_xml_bytes: Serialised ``<PROJECT_SET>`` XML. + xml_bytes: Serialised XML submission document. Returns: Parsed receipt XML element tree root. """ url = f"{base_url}/submit" - files = { - "SUBMISSION": ("submission.xml", submission_xml_bytes, "application/xml"), - "PROJECT": ("project.xml", project_xml_bytes, "application/xml"), + headers = { + "Content-Type": "application/xml", + "Accept": "application/xml", } resp = requests.post( - url, files=files, auth=auth, timeout=120, + url, data=xml_bytes, + headers=headers, auth=auth, timeout=120, ) resp.raise_for_status() return ET.fromstring(resp.content) @@ -425,24 +424,33 @@ def write_results( # ----------------------------------------------------------- -def build_submission_actions_xml( +def build_submission_xml( + studies: list[dict[str, Any]], hold_until: str | None = None, action: str = "ADD", + test: bool = False, ) -> ET.Element: - """Build the ``<SUBMISSION>`` actions XML element. - - This is submitted as the ``SUBMISSION`` multipart field. + """Build a ``<WEBIN>`` XML document for submitting studies. Args: + studies: Study metadata dicts. hold_until: Optional hold-until date string (``YYYY-MM-DD``). 
action: Submission action — ``"ADD"`` for new studies or ``"MODIFY"`` to update existing ones. + test: If ``True``, append a timestamp-based hash to aliases + for uniqueness in test submissions. Returns: - Root ``<SUBMISSION>`` element. + Root ``<WEBIN>`` element. """ - submission = ET.Element("SUBMISSION") + webin = ET.Element("WEBIN") + + # SUBMISSION_SET + submission_set = ET.SubElement(webin, "SUBMISSION_SET") + submission = ET.SubElement(submission_set, "SUBMISSION") + sub_alias = f"study-submission-{datetime.datetime.now().strftime('%Y%m%d-%H%M%S')}" + submission.set("alias", sub_alias) actions = ET.SubElement(submission, "ACTIONS") main_action = ET.SubElement(actions, "ACTION") ET.SubElement(main_action, action.upper()) @@ -450,74 +458,66 @@ def build_submission_actions_xml( hold_action = ET.SubElement(actions, "ACTION") hold_el = ET.SubElement(hold_action, "HOLD") hold_el.set("HoldUntilDate", hold_until) - return submission + # PROJECT_SET + project_set = ET.SubElement(webin, "PROJECT_SET") + for study in studies: + _add_project_element(project_set, study, test=test) + return webin -def build_project_set_xml( - studies: list[dict[str, Any]], - test: bool = False, -) -> ET.Element: - """Build the ``<PROJECT_SET>`` XML element. - - This is submitted as the ``PROJECT`` multipart field. - Args: - studies: Study metadata dicts. - test: If ``True``, append a timestamp-based hash to aliases - for uniqueness in test submissions. - - Returns: - Root ``<PROJECT_SET>`` element. 
- """ - project_set = ET.Element("PROJECT_SET") - for study in studies: - alias = study.get( - "alias", - study.get("STUDY_TITLE", "").replace(" ", "_")[:50], +def _add_project_element( + project_set: ET.Element, + study: dict[str, Any], + test: bool = False, +) -> None: + """Append a ``<PROJECT>`` element to *project_set*.""" + alias = study.get( + "alias", + study.get("STUDY_TITLE", "").replace(" ", "_")[:50], + ) + if test: + # Append 8-character hash of current timestamp for uniqueness in test mode + timestamp_hash = hashlib.md5( + datetime.datetime.now().isoformat().encode() + ).hexdigest()[:8] + alias = f"{alias}_{timestamp_hash}" + + project = ET.SubElement(project_set, "PROJECT") + project.set("alias", alias) + + name_text = study.get("CENTER_PROJECT_NAME", alias) + if name_text: + name_el = ET.SubElement(project, "NAME") + name_el.text = name_text + + title_el = ET.SubElement(project, "TITLE") + title_el.text = study.get("STUDY_TITLE", "") + + desc_text = ( + study.get("STUDY_ABSTRACT") + or study.get("STUDY_DESCRIPTION", "") + ) + if desc_text: + desc_el = ET.SubElement(project, "DESCRIPTION") + desc_el.text = desc_text + + sp = ET.SubElement(project, "SUBMISSION_PROJECT") + ET.SubElement(sp, "SEQUENCING_PROJECT") + # TODO: Check existing_study_type and new_study_type metadata fields, do we need those? 
+ study_type = study.get("existing_study_type") + if study_type: + attrs = ET.SubElement( + project, "PROJECT_ATTRIBUTES", ) - if test: - # Append 8-character hash of current timestamp for uniqueness in test mode - timestamp_hash = hashlib.md5( - datetime.datetime.now().isoformat().encode() - ).hexdigest()[:8] - alias = f"{alias}_{timestamp_hash}" - - project = ET.SubElement(project_set, "PROJECT") - project.set("alias", alias) - - name_text = study.get("CENTER_PROJECT_NAME", alias) - if name_text: - name_el = ET.SubElement(project, "NAME") - name_el.text = name_text - - title_el = ET.SubElement(project, "TITLE") - title_el.text = study.get("STUDY_TITLE", "") - - desc_text = ( - study.get("STUDY_ABSTRACT") - or study.get("STUDY_DESCRIPTION", "") + _add_project_attribute( + attrs, "existing_study_type", study_type, ) - if desc_text: - desc_el = ET.SubElement(project, "DESCRIPTION") - desc_el.text = desc_text - - sp = ET.SubElement(project, "SUBMISSION_PROJECT") - ET.SubElement(sp, "SEQUENCING_PROJECT") - # TODO: Check existing_study_type and new_study_type metadata fields, do we need those? - study_type = study.get("existing_study_type") - if study_type: - attrs = ET.SubElement( - project, "PROJECT_ATTRIBUTES", - ) + new_type = study.get("new_study_type") + if new_type and study_type == "Other": _add_project_attribute( - attrs, "existing_study_type", study_type, + attrs, "new_study_type", new_type, ) - new_type = study.get("new_study_type") - if new_type and study_type == "Other": - _add_project_attribute( - attrs, "new_study_type", new_type, - ) - return project_set def _add_project_attribute( @@ -594,8 +594,7 @@ def parse_xml_receipt( def _do_submission( base_url: str, auth: Any, - submission_xml_bytes: bytes, - project_xml_bytes: bytes, + xml_bytes: bytes, action: str, results: dict[str, list[dict[str, Any]]], env_label: str, @@ -606,8 +605,7 @@ def _do_submission( Args: base_url: ENA submission base URL. auth: HTTP basic-auth credentials. 
- submission_xml_bytes: Serialised ``<SUBMISSION>`` actions XML. - project_xml_bytes: Serialised ``<PROJECT_SET>`` XML. + xml_bytes: Serialised XML submission document. action: Label for log messages (``"ADD"`` or ``"MODIFY"``). results: Results dict to accumulate into. @@ -619,13 +617,12 @@ def _do_submission( """ if dry_run: logger.info("DRY RUN — skipping %s submission", action) - logger.info("SUBMISSION XML:\n%s", submission_xml_bytes.decode("utf-8")) - logger.info("PROJECT XML:\n%s", project_xml_bytes.decode("utf-8")) + logger.info("Generated XML:\n%s", xml_bytes.decode("utf-8")) return True logger.info("Submitting %s to ENA (%s)...", action, env_label) try: - receipt_root = submit_xml(base_url, auth, submission_xml_bytes, project_xml_bytes) + receipt_root = submit_xml(base_url, auth, xml_bytes) except requests.exceptions.HTTPError as exc: logger.error("HTTP error during %s submission: %s", action, exc) if exc.response is not None: @@ -665,7 +662,7 @@ def _do_submission( @click.command( - help="Register studies with ENA using Webin XML submission service.", + help="Submit studies to ENA via the Webin REST API v2.", ) @click.option( "--input", "input_file", @@ -701,7 +698,7 @@ def main( output: Path | None, validate: bool, ) -> None: - """Register studies with ENA using Webin XML submission service.""" + """Submit studies to ENA via the Webin REST API v2.""" username, password = get_credentials() env_label = "TEST server" if use_test else "LIVE server" @@ -737,16 +734,17 @@ def main( # -- Step 2: Build and submit XML -------------------- logger.info("Building ADD XML for %d study/studies...", len(studies)) - submission_root = build_submission_actions_xml(hold_until=hold_until, action="ADD") - project_root = build_project_set_xml(studies, test=use_test) - submission_xml_bytes = xml_to_bytes(submission_root) - project_xml_bytes = xml_to_bytes(project_root) - logger.info("SUBMISSION XML document size: %d bytes", len(submission_xml_bytes)) - 
logger.debug("SUBMISSION XML:\n%s", submission_xml_bytes.decode("utf-8")) - logger.info("PROJECT XML document size: %d bytes", len(project_xml_bytes)) - logger.debug("PROJECT XML:\n%s", project_xml_bytes.decode("utf-8")) + xml_root = build_submission_xml( + studies, + hold_until=hold_until, + action="ADD", + test=use_test, + ) + xml_bytes = xml_to_bytes(xml_root) + logger.info("XML document size: %d bytes", len(xml_bytes)) + logger.debug("Generated XML:\n%s", xml_bytes.decode("utf-8")) ok = _do_submission( - base_url, auth, submission_xml_bytes, project_xml_bytes, + base_url, auth, xml_bytes, action="ADD", results=results, env_label=env_label, From 8e98878fcfa75969be569e68b191283db539fc05 Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova <so.ochkalova@gmail.com> Date: Wed, 25 Mar 2026 10:31:55 +0000 Subject: [PATCH 34/36] refactor input parsing in submit_study.py --- assets/study_metadata.json | 6 + assets/study_metadata.tsv | 2 + bin/submit_study.py | 305 +++++------------- docs/usage.md | 26 +- modules/local/registerstudy/meta.yml | 5 +- .../local/registerstudy/tests/main.nf.test | 29 +- nextflow_schema.json | 2 +- 7 files changed, 133 insertions(+), 242 deletions(-) create mode 100644 assets/study_metadata.json create mode 100644 assets/study_metadata.tsv diff --git a/assets/study_metadata.json b/assets/study_metadata.json new file mode 100644 index 0000000..fbc2b28 --- /dev/null +++ b/assets/study_metadata.json @@ -0,0 +1,6 @@ +{ + "alias": "study-example-2026", + "study_title": "Example metagenome study", + "study_abstract": "Description of the study aims and methods.", + "existing_study_type": "Metagenomics" +} diff --git a/assets/study_metadata.tsv b/assets/study_metadata.tsv new file mode 100644 index 0000000..2389f1d --- /dev/null +++ b/assets/study_metadata.tsv @@ -0,0 +1,2 @@ +alias study_title study_abstract existing_study_type +study-example-2026 Example metagenome study Description of the study aims and methods. 
Metagenomics diff --git a/bin/submit_study.py b/bin/submit_study.py index 7bae1ef..28c1f9a 100755 --- a/bin/submit_study.py +++ b/bin/submit_study.py @@ -1,117 +1,4 @@ #!/usr/bin/env python3 -"""Submit studies to ENA via the Webin REST API v2. - -Read a study metadata file (JSON, CSV, or TSV), construct an -XML submission document, and submit new studies to ENA. - -# TODO: Currently script supports multiple input format that might be unnecessary. -# TODO: Consider standardising on a single format (e.g. JSON and/or TSV) and deprecating the others. -# TODO: Consider which columns are mandatory vs optional. "alias" is optional, might be worth making it mandatory. -# TODO: Add input file validation and error handling (e.g. missing mandatory fields, long alias). -Input formats accepted (``--input``): - -* ``.json`` -* ``.csv`` -* ``.tsv`` - -Example JSON inputs accepted:: - - { - "alias": "study-gut-2026", - "STUDY_TITLE": "Gut microbiome study", - "STUDY_ABSTRACT": "Characterisation of gut microbial communities", - "existing_study_type": "Metagenomics" - } - - [ - { - "alias": "study-gut-2026", - "STUDY_TITLE": "Gut microbiome study", - "STUDY_ABSTRACT": "Characterisation of gut microbial communities", - "existing_study_type": "Metagenomics" - }, - ... 
- ] - - { - "studies": [ - { - "alias": "study-soil-2026", - "STUDY_TITLE": "Soil microbiome study", - "existing_study_type": "Other", - "new_study_type": "Environmental microbiome" - } - ] - } - - { - "data": [ - { - "alias": "study-soil-2026", - "STUDY_TITLE": "Soil microbiome study", - } - ] - } - - { - "Container": { - "Studies": [ - { - "STUDY_TITLE": "Marine metagenome study", - "STUDY_ABSTRACT": "Shotgun metagenomics from seawater" - } - ] - } - } - -Example CSV input accepted:: - - alias,STUDY_TITLE,STUDY_ABSTRACT,existing_study_type - study-gut-2026,Gut microbiome study,Characterisation of gut microbial communities,Metagenomics - -Example TSV input accepted:: - - alias\tSTUDY_TITLE\tSTUDY_ABSTRACT\texisting_study_type - study-soil-2026\tSoil microbiome study\tSurvey of soil microbiota\tMetagenomics - -Study metadata fields: - -Mandatory: - -* ``STUDY_TITLE`` — study title used in ``<TITLE>``. - -Optional: - -* ``alias`` — project alias; if missing, derived from ``STUDY_TITLE`` (first 50 characters). -* ``CENTER_PROJECT_NAME`` — written to ``<NAME>``; defaults to alias. -* ``STUDY_ABSTRACT`` or ``STUDY_DESCRIPTION`` — written to ``<DESCRIPTION>``. -* ``existing_study_type`` — included as PROJECT_ATTRIBUTE. -* ``new_study_type`` — included only when ``existing_study_type == "Other"``. 
- -Credentials are read from environment variables to avoid -secrets appearing in shell history or process listings:: - - export ENA_WEBIN=Webin-XXXXX - export ENA_WEBIN_PASSWORD=XXXXX - -Usage:: - - # Submission to TEST server (submissions are discarded daily): - python bin/submit_study.py \\ - --input studies.json \\ - --test - - # With hold date (max 2 years): - python bin/submit_study.py \\ - --input studies.json \\ - --hold-until 2028-01-01 - - # Log to file: - python bin/submit_study.py \\ - --input studies.json \\ - --test --log submission.log -""" - from __future__ import annotations import csv @@ -122,7 +9,6 @@ import os import sys import xml.etree.ElementTree as ET -from collections.abc import Sequence from io import BytesIO from pathlib import Path from typing import Any, Final @@ -260,21 +146,31 @@ def validate_hold_until(hold_until: str) -> datetime.date: # ----------------------------------------------------------- -# File loading (JSON, CSV, TSV) +# Study metadata field definitions # ----------------------------------------------------------- +#: Fields that must be present and non-empty in every record. +_REQUIRED_FIELDS: Final[frozenset[str]] = frozenset({ + "alias", + "study_title", +}) -def _is_metadata_row(row: Sequence[object]) -> bool: - """Check whether *row* is a non-data header/metadata row. +#: Fields that are recognised but optional. +_OPTIONAL_FIELDS: Final[frozenset[str]] = frozenset({ + "project_name", + "study_abstract", + "study_description", + "existing_study_type", + "new_study_type", +}) - Such rows have at most one non-empty cell and are skipped - during record extraction. - """ - non_empty = sum( - 1 for c in row - if c is not None and str(c).strip() - ) - return non_empty <= 1 +#: All recognised field names (required + optional). 
+_ALL_FIELDS: Final[frozenset[str]] = _REQUIRED_FIELDS | _OPTIONAL_FIELDS + + +# ----------------------------------------------------------- +# File loading (JSON, CSV, TSV) +# ----------------------------------------------------------- def extract_records_from_tabular( @@ -283,8 +179,8 @@ def extract_records_from_tabular( ) -> list[dict[str, str]]: """Extract record dicts from a CSV or TSV file. - Skip an optional leading metadata/label row if detected - (a row with at most one non-empty cell). + Only columns present in _ALL_FIELDS are retained; + unknown columns are ignored. Args: filepath: Path to the tabular file. @@ -293,111 +189,88 @@ def extract_records_from_tabular( Returns: List of record dicts. """ + records = [] + with open(filepath, newline="", encoding="utf-8") as fh: - rows = list(csv.reader(fh, delimiter=delimiter)) - - if not rows: - return [] - - idx = 0 - if _is_metadata_row(rows[idx]): - idx += 1 - if idx >= len(rows): - return [] - - headers = rows[idx] - idx += 1 - - records: list[dict[str, str]] = [] - for row in rows[idx:]: - record: dict[str, str] = {} - for col, val in zip(headers, row): - col = col.strip() - if col and val is not None and val.strip(): - record[col] = val.strip() - if record: - records.append(record) + reader = csv.DictReader(fh, delimiter=delimiter) + for line in reader: + record = {} + for col in _ALL_FIELDS: + value = line.get(col, "").strip() + if value: + record[col] = value + if record: + records.append(record) - return records + return records def extract_records_from_json( - input_data: object, - record_keys: Sequence[str] = ("data",), -) -> list[dict[str, Any]] | None: - """Extract record dicts from a JSON input. - - Handle several JSON shapes: - - * Container format (e.g. DataHarmonizer exports):: + filepath: str | Path, +) -> list[dict[str, Any]]: + """Extract record dicts from a JSON file. - {"Container": {"<ClassName>s": [{...}, ...]}} + Handle two JSON shapes: * Plain list of dicts. 
- * Dict with an entity-specific key or ``data`` key. * Single record object (no wrapper). Args: - input_data: Parsed JSON data (any shape). - record_keys: Dict keys to check for record lists - (e.g. ``["studies", "data"]``). + filepath: Path to the JSON file. Returns: - List of record dicts, or ``None`` if unrecognised. + List of record dicts, or [] if unrecognised. """ + with open(filepath) as fh: + input_data = json.load(fh) + if isinstance(input_data, list): return input_data if isinstance(input_data, dict): - container = input_data.get("Container") - if isinstance(container, dict): - for key, val in container.items(): - if isinstance(val, list): - logger.info("Extracted records from Container.%s", key) - return val - - for key in record_keys: - if key in input_data: - return input_data[key] - return [input_data] - return None + return [] -def load_input_file( +def load_and_validate_input_file( filepath: str | Path, - json_record_keys: Sequence[str] = ("data",), -) -> list[dict[str, Any]] | None: - """Load records from a supported file format. +) -> list[dict[str, Any]]: + """Load and validate records from a supported file format. - Supported formats: JSON, CSV, TSV. + Supported formats: JSON, CSV, TSV. Other formats will cause a ValueError. + Records are validated against _REQUIRED_FIELDS before being returned; + missing required fields will cause a ValueError. Args: filepath: Path to the input file. - json_record_keys: Dict keys to check when parsing - JSON (e.g. ``["studies", "data"]``). Returns: - List of record dicts, or ``None`` if the format is - unrecognised. + List of record dicts. If the file format is + unrecognised (based on file extension) or required fields are missing, + raises ValueError. 
""" ext = Path(filepath).suffix.lower() if ext == ".json": - with open(filepath) as fh: - input_data = json.load(fh) - return extract_records_from_json( - input_data, json_record_keys, - ) - if ext == ".csv": - return extract_records_from_tabular( - filepath, delimiter=",", - ) - if ext == ".tsv": - return extract_records_from_tabular( - filepath, delimiter="\t", - ) - return None + records = extract_records_from_json(filepath) + elif ext == ".csv": + records = extract_records_from_tabular(filepath, delimiter=",") + elif ext == ".tsv": + records = extract_records_from_tabular(filepath, delimiter="\t") + else: + raise ValueError(f"Unsupported file format: {ext}. Supported: .json, .csv, .tsv") + + if not records: + raise ValueError(f"File {filepath} seems to be empty. Check the format and content.") + + for record in records: + for field in _REQUIRED_FIELDS: + if not record.get(field, "").strip(): + raise ValueError( + f"Record with alias {record.get('alias', '<missing>')} is missing required field: {field}" + ) + + return records # ----------------------------------------------------------- @@ -472,10 +345,7 @@ def _add_project_element( test: bool = False, ) -> None: """Append a ``<PROJECT>`` element to *project_set*.""" - alias = study.get( - "alias", - study.get("STUDY_TITLE", "").replace(" ", "_")[:50], - ) + alias = study.get("alias", "") if test: # Append 8-character hash of current timestamp for uniqueness in test mode timestamp_hash = hashlib.md5( @@ -486,17 +356,17 @@ def _add_project_element( project = ET.SubElement(project_set, "PROJECT") project.set("alias", alias) - name_text = study.get("CENTER_PROJECT_NAME", alias) + name_text = study.get("project_name", study.get("study_title", "")) if name_text: name_el = ET.SubElement(project, "NAME") name_el.text = name_text title_el = ET.SubElement(project, "TITLE") - title_el.text = study.get("STUDY_TITLE", "") + title_el.text = study.get("study_title", "") desc_text = ( - study.get("STUDY_ABSTRACT") - or 
study.get("STUDY_DESCRIPTION", "") + study.get("study_abstract") + or study.get("study_description", "") ) if desc_text: desc_el = ET.SubElement(project, "DESCRIPTION") @@ -658,9 +528,6 @@ def _do_submission( # Main # ----------------------------------------------------------- -_JSON_RECORD_KEYS: Final = ("studies", "data") - - @click.command( help="Submit studies to ENA via the Webin REST API v2.", ) @@ -713,20 +580,14 @@ def main( # -- Step 1: Load input file ------------------------- logger.info("Loading input: %s", input_file) - studies = load_input_file( - input_file, json_record_keys=_JSON_RECORD_KEYS, - ) - if studies is None: - logger.error("Unsupported file format. Supported: .json, .csv, .tsv") - sys.exit(1) + try: + studies = load_and_validate_input_file(input_file) + except ValueError as exc: + # Re-raise as click.BadParameter to get nice error formatting without a full stack trace + raise click.BadParameter(str(exc), param_hint="--input") from exc logger.info("Loaded %d study/studies from input", len(studies)) - if not studies: - logger.info("No studies to submit") - write_results({"submitted": [], "failed": []}, output) - return - results: dict[str, list[dict[str, Any]]] = { "submitted": [], "failed": [], diff --git a/docs/usage.md b/docs/usage.md index 9cab0d0..ad32375 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -130,36 +130,36 @@ Single study as a flat object: ```json { "alias": "study-gut-2026", - "STUDY_TITLE": "Gut microbiome study", - "STUDY_ABSTRACT": "Characterisation of gut microbial communities" + "study_title": "Gut microbiome study", + "study_abstract": "Characterisation of gut microbial communities" } ``` #### CSV format ```csv -alias,STUDY_TITLE,STUDY_ABSTRACT +alias,study_title,study_abstract study-gut-2026,Gut microbiome study,Characterisation of gut microbial communities ``` #### TSV format ```tsv -alias STUDY_TITLE STUDY_ABSTRACT +alias study_title study_abstract study-soil-2026 Soil microbiome study Survey of soil microbiota 
``` #### Study metadata fields -| Field | Required | Description | -| --------------------- | -------- | ------------------------------------------------------------------------------------------- | -| `STUDY_TITLE` | Yes | Descriptive title of the study. | -| `alias` | No | Unique project alias within your Webin account. Derived from `STUDY_TITLE` if not provided. | -| `STUDY_ABSTRACT` | No | Free-text abstract describing the study. | -| `STUDY_DESCRIPTION` | No | Alternative to `STUDY_ABSTRACT`. | -| `CENTER_PROJECT_NAME` | No | Internal project name at your centre. Defaults to `alias`. | -| `existing_study_type` | No | ENA study type (e.g. `Metagenomics`, `Other`). | -| `new_study_type` | No | Custom study type. Only used when `existing_study_type` is set to `Other`. | +| Field | Required | Description | +| --------------------- | -------- | ---------------------------------------------------------------------------- | +| `study_title` | Yes | Descriptive title of the study. | +| `alias` | Yes | Unique project alias within your Webin account. Max length is 50 characters. | +| `study_abstract` | No | Free-text abstract describing the study. | +| `study_description` | No | Alternative to `study_abstract`. | +| `project_name` | No | Project name. Defaults to `study_title`. | +| `existing_study_type` | No | ENA study type (e.g. `Metagenomics`, `Other`). | +| `new_study_type` | No | Custom study type. Only used when `existing_study_type` is set to `Other`. | An example metadata file is available at [assets/study_metadata.json](../assets/study_metadata.json). diff --git a/modules/local/registerstudy/meta.yml b/modules/local/registerstudy/meta.yml index e3e3245..f0e6ce7 100644 --- a/modules/local/registerstudy/meta.yml +++ b/modules/local/registerstudy/meta.yml @@ -36,9 +36,8 @@ input: type: file description: | Study metadata file in JSON, CSV, or TSV format. - JSON may follow the DataHarmonizer Container export format or be - a plain list/dict of study records. 
- Required fields per record: STUDY_TITLE, existing_study_type. + JSON may be a plain list of dicts or a single dict of study records. + Required fields per record: study_title, alias. pattern: "*.{json,csv,tsv}" output: diff --git a/modules/local/registerstudy/tests/main.nf.test b/modules/local/registerstudy/tests/main.nf.test index cdc0e69..2ec967f 100644 --- a/modules/local/registerstudy/tests/main.nf.test +++ b/modules/local/registerstudy/tests/main.nf.test @@ -7,14 +7,37 @@ nextflow_process { tag "modules" tag "registerstudy" - test("registerstudy - submission to ENA test server") { + test("registerstudy - submission to ENA test server (JSON metadata)") { when { process { """ input[0] = [ [ id:'example_study' ], - file(params.pipelines_testdata_base_path + "/test_data/study_metadata/example_study.json", checkIfExists: true) + file("$projectDir/assets/study_metadata.json", checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.accessions[0][1]).exists() }, + { assert path(process.out.accessions[0][1]).json.submitted instanceof List }, + { assert path(process.out.accessions[0][1]).json.failed.size() == 0 } + ) + } + } + + test("registerstudy - submission to ENA test server (TSV metadata)") { + + when { + process { + """ + input[0] = [ + [ id:'example_study_tsv' ], + file("$projectDir/assets/study_metadata.tsv", checkIfExists: true) ] """ } @@ -38,7 +61,7 @@ nextflow_process { """ input[0] = [ [ id:'example_study' ], - file(params.pipelines_testdata_base_path + "/test_data/study_metadata/example_study.json", checkIfExists: true) + file("$projectDir/assets/study_metadata.json", checkIfExists: true) ] """ } diff --git a/nextflow_schema.json b/nextflow_schema.json index d5d0ebb..83b1ed2 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -308,7 +308,7 @@ "format": "file-path", "exists": true, "description": "Path to study metadata file (JSON, CSV, or TSV) for registering a new ENA study. 
Required when submission_study is not provided.", - "help_text": "File containing study metadata fields (required: STUDY_TITLE, optional: alias, STUDY_ABSTRACT, existing_study_type, etc.). Used by REGISTERSTUDY to create a new study in ENA when no existing submission_study accession is given.", + "help_text": "File containing study metadata fields (required: study_title and alias, optional: study_abstract, existing_study_type, etc.). Used by REGISTERSTUDY to create a new study in ENA when no existing submission_study accession is given.", "fa_icon": "fas fa-file-alt" }, "webincli_submit": { From 217edea8c477540909e1ae47108c1fef5eb32506 Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova <so.ochkalova@gmail.com> Date: Wed, 25 Mar 2026 10:54:38 +0000 Subject: [PATCH 35/36] update container for mgnify-pipelines-toolkit --- modules/local/registerstudy/environment.yml | 2 +- modules/local/registerstudy/main.nf | 4 +++- modules/local/registerstudy/tests/main.nf.test.snap | 10 +++++----- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/modules/local/registerstudy/environment.yml b/modules/local/registerstudy/environment.yml index a5e1bf2..2faa83d 100644 --- a/modules/local/registerstudy/environment.yml +++ b/modules/local/registerstudy/environment.yml @@ -7,4 +7,4 @@ dependencies: - conda-forge::python>=3.12 - conda-forge::pip - pip: - - mgnify-pipelines-toolkit==1.4.17 + - mgnify-pipelines-toolkit==1.4.21 diff --git a/modules/local/registerstudy/main.nf b/modules/local/registerstudy/main.nf index 99533da..573a38c 100644 --- a/modules/local/registerstudy/main.nf +++ b/modules/local/registerstudy/main.nf @@ -3,7 +3,9 @@ process REGISTERSTUDY { label 'process_single' conda "${moduleDir}/environment.yml" - container "quay.io/microbiome-informatics/mgnify-pipelines-toolkit:1.4.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mgnify-pipelines-toolkit:1.4.21--pyhdfd78af_0': + 'biocontainers/mgnify-pipelines-toolkit:1.4.21--pyhdfd78af_0' }" // ENA_WEBIN and ENA_WEBIN_PASSWORD must be set in the process environment. // In the pipeline, map Nextflow secrets via conf/modules.config or nextflow.config: diff --git a/modules/local/registerstudy/tests/main.nf.test.snap b/modules/local/registerstudy/tests/main.nf.test.snap index 385b735..d1cb6ea 100644 --- a/modules/local/registerstudy/tests/main.nf.test.snap +++ b/modules/local/registerstudy/tests/main.nf.test.snap @@ -11,14 +11,14 @@ ] ], "versions": [ - "versions.yml:md5,ddcc758a7d28faecd4286941889ab7e1" + "versions.yml:md5,29d54944e57cbb7cb12b7605f13fd0fc" ] } ], - "timestamp": "2026-03-13T14:02:21.161445", "meta": { - "nf-test": "0.9.4", - "nextflow": "25.10.4" - } + "nf-test": "0.9.0", + "nextflow": "25.04.1" + }, + "timestamp": "2026-03-25T10:54:18.30373" } } \ No newline at end of file From 55373f83f84aab8d4f515e710a48d9e7f1b57bff Mon Sep 17 00:00:00 2001 From: Sofia Ochkalova <so.ochkalova@gmail.com> Date: Wed, 25 Mar 2026 14:06:06 +0000 Subject: [PATCH 36/36] remove docker.enabled = true from test profiles, update nextflow.config --- conf/test_assembly_no_study_complete_metadata.config | 10 ++++------ conf/test_mag_no_study_complete_metadata.config | 2 -- nextflow.config | 2 ++ 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/conf/test_assembly_no_study_complete_metadata.config b/conf/test_assembly_no_study_complete_metadata.config index f729c77..b1c96d7 100644 --- a/conf/test_assembly_no_study_complete_metadata.config +++ b/conf/test_assembly_no_study_complete_metadata.config @@ -25,13 +25,11 @@ params { // Input data input = params.pipelines_testdata_base_path + 'seqsubmit/samplesheets/assembly_complete_metadata.csv' - mode = "metagenomic_assemblies" - submission_study = null - study_metadata = "$projectDir/assets/study_metadata.json" - centre_name = "TEST_CENTER" + mode = 
"metagenomic_assemblies" + submission_study = null + study_metadata = "$projectDir/assets/study_metadata.json" + centre_name = "TEST_CENTER" test_upload = true } - -docker.enabled = true diff --git a/conf/test_mag_no_study_complete_metadata.config b/conf/test_mag_no_study_complete_metadata.config index dd3e659..aea18b1 100644 --- a/conf/test_mag_no_study_complete_metadata.config +++ b/conf/test_mag_no_study_complete_metadata.config @@ -36,5 +36,3 @@ params { checkm2_db = null } - -docker.enabled = true diff --git a/nextflow.config b/nextflow.config index 6e9f1b0..1cb8aff 100644 --- a/nextflow.config +++ b/nextflow.config @@ -187,6 +187,8 @@ profiles { test_genome { includeConfig 'conf/test_genome.config' } test_assembly { includeConfig 'conf/test_assembly.config' } test_full { includeConfig 'conf/test_full.config' } + test_assembly_no_study_complete_metadata { includeConfig 'conf/test_assembly_no_study_complete_metadata.config' } + test_mag_no_study_complete_metadata { includeConfig 'conf/test_mag_no_study_complete_metadata.config' } } // Load nf-core custom profiles from different institutions