nextstrain · joverlee521 · Oct 8, 2025 · Sep 26, 2025 · Sep 27, 2025 · Sep 27, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,16 @@ Instead, changes appear below grouped by the date they were added to the workflo
 
 ## 2025
 
+* 08 October 2025: phylogenetic - Major update to the definition of inputs. ([#339][])
+    * Configs are now required to include the `inputs` param to define inputs for the workflow
+
+        ```yaml
+        inputs:
+          - name: ncbi
+            metadata: "https://data.nextstrain.org/files/workflows/mpox/metadata.tsv.zst"
+            sequences: "https://data.nextstrain.org/files/workflows/mpox/sequences.fasta.zst"
+        ```
+
 * 02 July 2025: phylogenetic - config schema updates for easier config overlays ([#321][])
     * new required config params
         * `exclude` - path to exclude.txt for `augur filter`
@@ -65,4 +75,5 @@ Instead, changes appear below grouped by the date they were added to the workflo
 [#318]: https://github.com/nextstrain/mpox/pull/318
 [#319]: https://github.com/nextstrain/mpox/pull/319
 [#321]: https://github.com/nextstrain/mpox/pull/321
+[#339]: https://github.com/nextstrain/mpox/pull/339
 [NCBI Datasets mnemonics]: https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/dataformat/tsv/dataformat_tsv_virus-genome/#fields
diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
@@ -2,6 +2,10 @@ from packaging import version
 from augur.__version__ import __version__ as augur_version
 import os
 import sys
+from snakemake.utils import min_version
+
+# Minimum Snakemake version needed for the storage plugins used in remote_files.smk
+min_version("8.0.0")
 
 # The 'exec &> >(tee {log:q})' in each rule makes Python think it's not connected to terminal
 # Setting unbuffered makes it behave as expected
@@ -46,7 +50,9 @@ rule all:
         """
 
 
+include: "../shared/vendored/snakemake/remote_files.smk"
 include: "rules/config.smk"
+include: "rules/merge_inputs.smk"
 include: "rules/prepare_sequences.smk"
 include: "rules/construct_phylogeny.smk"
 include: "rules/annotate_phylogeny.smk"

diff --git a/phylogenetic/build-configs/chores/chores.smk b/phylogenetic/build-configs/chores/chores.smk
@@ -10,8 +10,8 @@ rule update_example_data:
     - ensures all clades and lineages are accounted for using --group-by
     """
     input:
-        sequences="data/sequences.fasta",
-        metadata="data/metadata.tsv",
+        sequences="results/sequences.fasta",
+        metadata="results/metadata.tsv",
     output:
         sequences="example_data/sequences.fasta",
         metadata="example_data/metadata.tsv",

diff --git a/phylogenetic/build-configs/ci/config.yaml b/phylogenetic/build-configs/ci/config.yaml
@@ -1,5 +1,7 @@
-custom_rules:
-  - build-configs/ci/copy_example_data.smk
+inputs:
+  - name: example
+    metadata: "example_data/metadata.tsv"
+    sequences: "example_data/sequences.fasta"
 
 reference: "defaults/reference.fasta"
 genome_annotation: "defaults/genome_annotation.gff3"

diff --git a/phylogenetic/build-configs/ci/copy_example_data.smk b/phylogenetic/build-configs/ci/copy_example_data.smk
diff --git a/phylogenetic/defaults/clade-i/config.yaml b/phylogenetic/defaults/clade-i/config.yaml
@@ -1,3 +1,8 @@
+inputs:
+  - name: ncbi
+    metadata: "https://data.nextstrain.org/files/workflows/mpox/metadata.tsv.zst"
+    sequences: "https://data.nextstrain.org/files/workflows/mpox/sequences.fasta.zst"
+
 reference: "defaults/clade-i/reference.fasta"
 genome_annotation: "defaults/clade-i/genome_annotation.gff3"
 genbank_reference: "defaults/clade-i/reference.gb"

diff --git a/phylogenetic/defaults/hmpxv1/config.yaml b/phylogenetic/defaults/hmpxv1/config.yaml
@@ -1,3 +1,8 @@
+inputs:
+  - name: ncbi
+    metadata: "https://data.nextstrain.org/files/workflows/mpox/metadata.tsv.zst"
+    sequences: "https://data.nextstrain.org/files/workflows/mpox/sequences.fasta.zst"
+
 reference: "defaults/reference.fasta"
 genome_annotation: "defaults/genome_annotation.gff3"
 genbank_reference: "defaults/reference.gb"

diff --git a/phylogenetic/defaults/hmpxv1_big/config.yaml b/phylogenetic/defaults/hmpxv1_big/config.yaml
@@ -1,3 +1,8 @@
+inputs:
+  - name: ncbi
+    metadata: "https://data.nextstrain.org/files/workflows/mpox/metadata.tsv.zst"
+    sequences: "https://data.nextstrain.org/files/workflows/mpox/sequences.fasta.zst"
+
 reference: "defaults/reference.fasta"
 genome_annotation: "defaults/genome_annotation.gff3"
 genbank_reference: "defaults/reference.gb"

diff --git a/phylogenetic/defaults/mpxv/config.yaml b/phylogenetic/defaults/mpxv/config.yaml
@@ -1,3 +1,8 @@
+inputs:
+  - name: ncbi
+    metadata: "https://data.nextstrain.org/files/workflows/mpox/metadata.tsv.zst"
+    sequences: "https://data.nextstrain.org/files/workflows/mpox/sequences.fasta.zst"
+
 auspice_config: "defaults/mpxv/auspice_config.json"
 include: "defaults/mpxv/include.txt"
 exclude: "defaults/exclude.txt"

diff --git a/phylogenetic/rules/merge_inputs.smk b/phylogenetic/rules/merge_inputs.smk
@@ -0,0 +1,186 @@
+"""
+This part of the workflow merges inputs based on what is defined in the config.
+
+OUTPUTS:
+
+    metadata  = results/metadata.tsv
+    sequences = results/sequences.fasta
+
+The config dict is expected to have a top-level `inputs` list that defines the
+separate inputs' name, metadata, and sequences. Optionally, the config can have
+a top-level `additional_inputs` list that is used to define additional data that
+are combined with the default inputs:
+
+```yaml
+inputs:
+    - name: default
+      metadata: <path-or-url>
+      sequences: <path-or-url>
+
+additional_inputs:
+    - name: private
+      metadata: <path-or-url>
+      sequences: <path-or-url>
+```
+
+Supports any of the compression formats that are supported by `augur read-file`,
+see <https://docs.nextstrain.org/projects/augur/page/usage/cli/read-file.html>
+"""
+
+from pathlib import Path
+
+
+def _gather_inputs():
+    all_inputs = [*config["inputs"], *config.get("additional_inputs", [])]
+
+    if len(all_inputs) == 0:
+        raise InvalidConfigError(
+            "Config must define at least one element in config.inputs or config.additional_inputs lists"
+        )
+    if not all([isinstance(i, dict) for i in all_inputs]):
+        raise InvalidConfigError(
+            "All of the elements in config.inputs and config.additional_inputs lists must be dictionaries. "
+            "If you've used a command line '--config' double check your quoting."
+        )
+    if len({i["name"] for i in all_inputs}) != len(all_inputs):
+        raise InvalidConfigError(
+            "Names of inputs (config.inputs and config.additional_inputs) must be unique"
+        )
+    if not all(
+        ["name" in i and ("sequences" in i or "metadata" in i) for i in all_inputs]
+    ):
+        raise InvalidConfigError(
+            "Each input (config.inputs and config.additional_inputs) must have a 'name' and 'metadata' and/or 'sequences'"
+        )
+    if not any(["metadata" in i for i in all_inputs]):
+        raise InvalidConfigError("At least one input must have 'metadata'")
+    if not any(["sequences" in i for i in all_inputs]):
+        raise InvalidConfigError("At least one input must have 'sequences'")
+
+    available_keys = set(["name", "metadata", "sequences"])
+    if any([len(set(el.keys()) - available_keys) > 0 for el in all_inputs]):
+        raise InvalidConfigError(
+            f"Each input (config.inputs and config.additional_inputs) can only include keys of {', '.join(available_keys)}"
+        )
+
+    return {
+        el["name"]: {k: (v if k == "name" else path_or_url(v)) for k, v in el.items()}
+        for el in all_inputs
+    }
+
+
+input_sources = _gather_inputs()
+_input_metadata = [
+    info["metadata"] for info in input_sources.values() if info.get("metadata", None)
+]
+_input_sequences = [
+    info["sequences"] for info in input_sources.values() if info.get("sequences", None)
+]
+
+
+# Only one of the two rules below is defined for a given config, so
+# results/metadata.tsv always has exactly one producing rule.
+if len(_input_metadata) == 1:
+
+    rule decompress_metadata:
+        """
+        This rule is invoked when there is a single metadata input to
+        ensure that we have a decompressed input for downstream rules to match
+        the output of rule.merge_metadata.
+        """
+        input:
+            metadata=_input_metadata[0],
+        output:
+            metadata="results/metadata.tsv",
+        log:
+            "logs/decompress_metadata.txt",
+        benchmark:
+            "benchmarks/decompress_metadata.txt"
+        shell:
+            r"""
+            exec &> >(tee {log:q})
+
+            augur read-file {input.metadata:q} > {output.metadata:q}
+            """
+
+else:
+
+    rule merge_metadata:
+        """
+        This rule is invoked when there are multiple defined metadata inputs
+        (config.inputs + config.additional_inputs)
+        """
+        input:
+            **{
+                name: info["metadata"]
+                for name, info in input_sources.items()
+                if info.get("metadata", None)
+            },
+        params:
+            # One "<input name>=<path>" string per metadata input, built from
+            # the keyword names given to `input` above.
+            metadata=lambda w, input: list(map("=".join, input.items())),
+            id_field=config["strain_id_field"],
+        output:
+            metadata="results/metadata.tsv",
+        log:
+            "logs/merge_metadata.txt",
+        benchmark:
+            "benchmarks/merge_metadata.txt"
+        shell:
+            r"""
+            exec &> >(tee {log:q})
+
+            augur merge \
+                --metadata {params.metadata:q} \
+                --metadata-id-columns {params.id_field:q} \
+                --output-metadata {output.metadata:q}
+            """
+
+
+# Only one of the two rules below is defined for a given config, so
+# results/sequences.fasta always has exactly one producing rule.
+if len(_input_sequences) == 1:
+
+    rule decompress_sequences:
+        """
+        This rule is invoked when there is a single sequences input to
+        ensure that we have a decompressed input for downstream rules to match
+        the output of rule.merge_sequences.
+        """
+        input:
+            sequences=_input_sequences[0],
+        output:
+            sequences="results/sequences.fasta",
+        log:
+            "logs/decompress_sequences.txt",
+        benchmark:
+            "benchmarks/decompress_sequences.txt"
+        shell:
+            r"""
+            exec &> >(tee {log:q})
+
+            augur read-file {input.sequences:q} > {output.sequences:q}
+            """
+
+else:
+
+    # Unlike merge_metadata, the shell command below receives the inputs as
+    # plain paths ({input:q}); the input names are not passed along.
+    rule merge_sequences:
+        """
+        This rule is invoked when there are multiple defined sequences inputs
+        (config.inputs + config.additional_inputs)
+        """
+        input:
+            **{
+                name: info["sequences"]
+                for name, info in input_sources.items()
+                if info.get("sequences", None)
+            },
+        output:
+            sequences="results/sequences.fasta",
+        log:
+            "logs/merge_sequences.txt",
+        benchmark:
+            "benchmarks/merge_sequences.txt"
+        shell:
+            r"""
+            exec &> >(tee {log:q})
+
+            augur merge \
+                --sequences {input:q} \
+                --output-sequences {output.sequences:q}
+            """
diff --git a/phylogenetic/rules/prepare_sequences.smk b/phylogenetic/rules/prepare_sequences.smk
@@ -3,6 +3,8 @@ This part of the workflow prepares sequences for constructing the phylogenetic t
 
 REQUIRED INPUTS:
 
+    metadata    = results/metadata.tsv
+    sequences   = results/sequences.fasta
     include     = path to file of sequences to force include
     exclude     = path to file of sequences to exclude
     reference   = path to reference sequence FASTA for Nextclade alignment
@@ -16,59 +18,13 @@ OUTPUTS:
 """
 
 
-rule download:
-    """
-    Downloading sequences and metadata from data.nextstrain.org
-    """
-    output:
-        sequences="data/sequences.fasta.zst",
-        metadata="data/metadata.tsv.zst",
-    params:
-        sequences_url="https://data.nextstrain.org/files/workflows/mpox/sequences.fasta.zst",
-        metadata_url="https://data.nextstrain.org/files/workflows/mpox/metadata.tsv.zst",
-    log:
-        "logs/download.txt",
-    benchmark:
-        "benchmarks/download.txt"
-    shell:
-        r"""
-        exec &> >(tee {log:q})
-
-        curl -fsSL --compressed {params.sequences_url:q} --output {output.sequences:q}
-        curl -fsSL --compressed {params.metadata_url:q} --output {output.metadata:q}
-        """
-
-
-rule decompress:
-    """
-    Decompressing sequences and metadata
-    """
-    input:
-        sequences="data/sequences.fasta.zst",
-        metadata="data/metadata.tsv.zst",
-    output:
-        sequences="data/sequences.fasta",
-        metadata="data/metadata.tsv",
-    log:
-        "logs/decompress.txt",
-    benchmark:
-        "benchmarks/decompress.txt"
-    shell:
-        r"""
-        exec &> >(tee {log:q})
-
-        zstd --decompress --stdout {input.sequences:q} > {output.sequences:q}
-        zstd --decompress --stdout {input.metadata:q} > {output.metadata:q}
-        """
-
-
 rule filter:
     """
     Removing strains that do not satisfy certain requirements.
     """
     input:
-        sequences="data/sequences.fasta",
-        metadata="data/metadata.tsv",
+        sequences="results/sequences.fasta",
+        metadata="results/metadata.tsv",
         exclude=config["exclude"],
     output:
         sequences=build_dir + "/{build_name}/good_sequences.fasta",

diff --git a/shared/vendored/.github/workflows/ci.yaml b/shared/vendored/.github/workflows/ci.yaml
@@ -11,5 +11,5 @@ jobs:
   shellcheck:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v5
       - uses: nextstrain/.github/actions/shellcheck@master
diff --git a/shared/vendored/.github/workflows/pre-commit.yaml b/shared/vendored/.github/workflows/pre-commit.yaml
@@ -7,8 +7,8 @@ jobs:
   pre-commit:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-python@v5
+      - uses: actions/checkout@v5
+      - uses: actions/setup-python@v6
         with:
           python-version: "3.12"
       - uses: pre-commit/action@v3.0.1