nextstrain · rneher · Aug 15, 2025 · Aug 22, 2025
diff --git a/nextclade_genome/Snakefile b/nextclade_genome/Snakefile
@@ -0,0 +1,224 @@
+
+configfile: os.path.join(workflow.basedir, "defaults/config.yaml")
+
+rule all:
+    input:
+        tree_json = "dataset/genome/tree.json"
+
+
+rule download:
+    """Downloading sequences and metadata from data.nextstrain.org"""
+    output:
+        sequences = "data/sequences.fasta.zst",
+        metadata = "data/metadata.tsv.zst"
+    params:
+        sequences_url = "https://data.nextstrain.org/files/workflows/rubella/sequences.fasta.zst",
+        metadata_url = "https://data.nextstrain.org/files/workflows/rubella/metadata.tsv.zst"
+    shell:
+        """
+        curl -fsSL --compressed {params.sequences_url:q} --output {output.sequences}
+        curl -fsSL --compressed {params.metadata_url:q} --output {output.metadata}
+        """
+
+rule decompress:
+    """Decompressing sequences and metadata"""
+    input:
+        sequences = "data/sequences.fasta.zst",
+        metadata = "data/metadata.tsv.zst"
+    output:
+        sequences = "data/sequences.fasta",
+        metadata = "data/metadata.tsv"
+    shell:
+        """
+        zstd -d -c {input.sequences} > {output.sequences}
+        zstd -d -c {input.metadata} > {output.metadata}
+        """
+
+rule filter:
+    input:
+        sequences = "data/sequences.fasta",
+        metadata = "data/metadata.tsv",
+        exclude = lambda w: config[w.build]["filter"]["exclude"],
+        include = lambda w: config[w.build]["filter"]["include"]
+    output:
+        sequences = "results/{build}/pre-filtered.fasta"
+    params:
+        group_by = lambda w: config[w.build]["filter"]["group_by"],
+        subsample_max_sequences = lambda w: config[w.build]["filter"]["subsample_max_sequences"],
+        min_date = lambda w: config[w.build]["filter"]["min_date"],
+        min_length = lambda w: config[w.build]["filter"]["min_length"],
+        strain_id = lambda w: config[w.build]["strain_id_field"]
+    shell:
+        """
+        augur filter \
+            --sequences {input.sequences} \
+            --metadata {input.metadata} \
+            --metadata-id-columns {params.strain_id} \
+            --exclude {input.exclude} \
+            --include {input.include} \
+            --output-sequences {output.sequences} \
+            --group-by {params.group_by} \
+            --subsample-max-sequences {params.subsample_max_sequences} \
+            --min-length {params.min_length}
+        """
+
+rule align:
+    input:
+        sequences = "results/{build}/pre-filtered.fasta"
+    output:
+        sequences = "results/{build}/aligned.fasta",
+        translations = "results/{build}/translations/touch.txt"
+    params:
+        dataset = lambda w: config[w.build]['files']['dataset_files'],
+        translations = "results/{build}/translations"
+    threads: workflow.cores
+    shell:
+        """
+        nextclade3 run \
+           --jobs {threads} \
+           --input-ref {params.dataset}/reference.fasta \
+           --input-pathogen-json {params.dataset}/pathogen.json \
+           --input-annotation {params.dataset}/annotation.gff3 \
+           --output-fasta {output.sequences} \
+           --output-translations {params.translations}/{{cds}}.fasta \
+           --silent \
+           {input.sequences} & touch {output.translations}
+        """
+
+
+rule tree:
+    """Building tree"""
+    input:
+        alignment = "results/{build}/aligned.fasta"
+    output:
+        tree = "results/{build}/tree_raw.nwk"
+    shell:
+        """
+        augur tree \
+            --alignment {input.alignment} \
+            --output {output.tree}
+        """
+
+rule refine:
+    """
+    Refining tree
+      - estimate timetree
+      - use {params.coalescent} coalescent timescale
+      - estimate {params.date_inference} node dates
+      - filter tips more than {params.clock_filter_iqd} IQDs from clock expectation
+    """
+    input:
+        tree = "results/{build}/tree_raw.nwk",
+        alignment = "results/{build}/aligned.fasta",
+        metadata = "data/metadata.tsv"
+    output:
+        tree = "results/{build}/tree.nwk",
+        node_data = "results/{build}/branch_lengths.json"
+    params:
+        strain_id = lambda w: config[w.build]["strain_id_field"]
+    shell:
+        """
+        augur refine \
+            --tree {input.tree} \
+            --alignment {input.alignment} \
+            --metadata {input.metadata} \
+            --metadata-id-columns {params.strain_id} \
+            --output-tree {output.tree} \
+            --output-node-data {output.node_data} \
+            --root mid_point
+        """
+
+rule ancestral:
+    message:
+        """
+        Reconstructing ancestral sequences and mutations
+          - inferring ambiguous mutations
+        """
+    input:
+        tree="results/{build}/tree.nwk",
+        alignment="results/{build}/aligned.fasta",
+        annotation=lambda w: config[w.build]['files']['reference'],
+        reference= lambda w: config[w.build]['files']['dataset_files'] + '/reference.fasta',
+    output:
+        node_data= "results/{build}/muts.json",
+    params:
+        inference="joint",
+        translations= "results/{build}/translations/%GENE.fasta",
+        genes = lambda w: config[w.build]['genes']
+    shell:
+        """
+        augur ancestral \
+            --tree {input.tree} \
+            --alignment {input.alignment} \
+            --inference {params.inference} \
+            --infer-ambiguous \
+            --genes {params.genes} \
+            --annotation {input.annotation} \
+            --translations {params.translations:q} \
+            --root-sequence {input.reference} \
+            --output-node-data {output.node_data}
+        """
+
+rule clades:
+    input:
+        tree = "results/{build}/tree.nwk",
+        muts = "results/{build}/muts.json",
+        clade_defs = lambda w: config[w.build]["files"]["clades"]
+    output:
+        clades = "results/{build}/clades.json"
+    shell:
+        """
+        augur clades \
+            --tree {input.tree} \
+            --mutations {input.muts} \
+            --clades {input.clade_defs} \
+            --output {output.clades}
+        """
+
+
+rule export:
+    """Exporting data files for auspice"""
+    input:
+        tree = "results/{build}/tree.nwk",
+        metadata = "data/metadata.tsv",
+        branch_lengths = "results/{build}/branch_lengths.json",
+        clades = "results/{build}/clades.json",
+        muts = "results/{build}/muts.json",
+        colors = lambda w: config[w.build]["files"]["colors"],
+        auspice_config = lambda w: config[w.build]["files"]["auspice_config"]
+    output:
+        auspice_json = "auspice/measles_{build}.json"
+    params:
+        strain_id = lambda w: config[w.build]["strain_id_field"],
+        metadata_columns = lambda w: config[w.build]["export"]["metadata_columns"]
+    shell:
+        """
+        augur export v2 \
+            --tree {input.tree} \
+            --metadata {input.metadata} \
+            --metadata-id-columns {params.strain_id} \
+            --node-data {input.branch_lengths} {input.muts} {input.clades} \
+            --colors {input.colors} \
+            --metadata-columns {params.metadata_columns} \
+            --auspice-config {input.auspice_config} \
+            --include-root-sequence-inline \
+            --output {output.auspice_json}
+        """
+
+
+rule assemble_dataset:
+    """Assembling the dataset for Nextstrain"""
+    input:
+        auspice_json = "auspice/measles_{build}.json",
+    output:
+        reference = "dataset/{build}/reference.fasta",
+        tree_json = "dataset/{build}/tree.json"
+    params:
+        dataset = lambda w: config[w.build]['files']['dataset_files']
+    threads: workflow.cores
+    shell:
+        """
+        cp {params.dataset}/* dataset/{wildcards.build}/
+        cp {input.auspice_json} dataset/{wildcards.build}/tree.json
+        """
+
diff --git a/nextclade_genome/defaults/auspice_config_genome.json b/nextclade_genome/defaults/auspice_config_genome.json
@@ -0,0 +1,70 @@
+{
+  "title": "Nextclade dataset for Rubella virus -- full genome",
+  "maintainers": [
+    {"name": "the Nextstrain team", "url": "https://nextstrain.org/team"}
+  ],
+  "build_url": "https://github.com/nextstrain/rubella",
+  "extensions": {
+    "nextclade": {
+      "ref_nodes":{
+        "default":"__root__",
+        "search":[
+          {
+            "name": "L78917",
+            "displayname": "Vaccine strain L78917 (RA 27/3)",
+            "description": "Show mutations relative to the vaccine strain RA 27/3",
+            "criteria":[
+              {
+                "node": [{"name": ["L78917"]}]
+              }
+            ]
+          }
+        ]
+      }
+    }
+  },
+  "colorings": [
+    {
+      "key": "gt",
+      "title": "Genotype",
+      "type": "categorical"
+    },
+    {
+      "key": "clade_membership",
+      "title": "Genotype (Nextstrain)",
+      "type": "categorical"
+    },
+    {
+      "key": "region",
+      "title": "Region",
+      "type": "categorical"
+    },
+    {
+      "key": "country",
+      "title": "Country",
+      "type": "categorical"
+    },
+    {
+      "key": "is_reference",
+      "title": "WHO Reference",
+      "type": "categorical"
+    }
+  ],
+  "geo_resolutions": [
+    "country",
+    "region"
+  ],
+  "display_defaults": {
+    "map_triplicate": true,
+    "color_by": "clade_membership"
+  },
+  "filters": [
+    "clade_membership",
+    "region",
+    "country",
+    "author"
+  ],
+  "metadata_columns": [
+    "author"
+  ]
+}
diff --git a/nextclade_genome/defaults/clades_genome.tsv b/nextclade_genome/defaults/clades_genome.tsv
@@ -0,0 +1,52 @@
+clade	gene	site	alt
+# 1A	nuc	8490	A
+# 1A	nuc	8745	T
+# 1A(VAX)	nuc	8327	T # including this excludes JF727653
+1A	nuc	8884	C
+# 1A	nuc	8934	T ## this is the one that hits _very_ basal, bad choice
+
+1B	nuc	8787	T
+1B	nuc	8781	A
+
+1C	nuc	8781	T
+1C	nuc	8865	C
+1C	nuc	8994	C
+1C	nuc	9207	T
+
+1D	nuc	8440	C
+1D	nuc	9303	T
+
+1E	nuc	8301	G
+1E	nuc	8667	T
+1E	nuc	9231	A
+1E	nuc	9240	C
+
+1F	nuc	8439	T
+1F	nuc	9306	T
+1F	nuc	9546	A
+1F	nuc	9651	C
+
+1G	nuc	8265	C
+1G	nuc	8508	T
+1G	nuc	8577	T
+
+1H	nuc	8388	A
+1H	nuc	8442	A
+1H	nuc	8637	T
+
+1I	nuc	9012	A
+
+1J	nuc	9255	G
+
+2A	nuc	8565	T
+2A	nuc	8754	G
+2A	nuc	9249	C
+
+2B	nuc	9213	C
+2B	nuc	9333	G
+2B	nuc	9348	A
+
+2C	nuc	8376	C
+2C	nuc	8553	A
+2C	nuc	9027	T
+2C	nuc	9623	C