Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
224 changes: 224 additions & 0 deletions nextclade_genome/Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@

configfile: os.path.join(workflow.basedir, "defaults/config.yaml")

rule all:
input:
tree_json = "dataset/genome/tree.json"


rule download:
"""Downloading sequences and metadata from data.nextstrain.org"""
output:
sequences = "data/sequences.fasta.zst",
metadata = "data/metadata.tsv.zst"
params:
sequences_url = "https://data.nextstrain.org/files/workflows/rubella/sequences.fasta.zst",
metadata_url = "https://data.nextstrain.org/files/workflows/rubella/metadata.tsv.zst"
shell:
"""
curl -fsSL --compressed {params.sequences_url:q} --output {output.sequences}
curl -fsSL --compressed {params.metadata_url:q} --output {output.metadata}
"""

rule decompress:
"""Decompressing sequences and metadata"""
input:
sequences = "data/sequences.fasta.zst",
metadata = "data/metadata.tsv.zst"
output:
sequences = "data/sequences.fasta",
metadata = "data/metadata.tsv"
shell:
"""
zstd -d -c {input.sequences} > {output.sequences}
zstd -d -c {input.metadata} > {output.metadata}
"""

rule filter:
input:
sequences = "data/sequences.fasta",
metadata = "data/metadata.tsv",
exclude = lambda w: config[w.build]["filter"]["exclude"],
include = lambda w: config[w.build]["filter"]["include"]
output:
sequences = "results/{build}/pre-filtered.fasta"
params:
group_by = lambda w: config[w.build]["filter"]["group_by"],
subsample_max_sequences = lambda w: config[w.build]["filter"]["subsample_max_sequences"],
min_date = lambda w: config[w.build]["filter"]["min_date"],
min_length = lambda w: config[w.build]["filter"]["min_length"],
strain_id = lambda w: config[w.build]["strain_id_field"]
shell:
"""
augur filter \
--sequences {input.sequences} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--exclude {input.exclude} \
--include {input.include} \
--output-sequences {output.sequences} \
--group-by {params.group_by} \
--subsample-max-sequences {params.subsample_max_sequences} \
--min-length {params.min_length}
"""

rule align:
input:
sequences = "results/{build}/pre-filtered.fasta"
output:
sequences = "results/{build}/aligned.fasta",
translations = "results/{build}/translations/touch.txt"
params:
dataset = lambda w: config[w.build]['files']['dataset_files'],
translations = "results/{build}/translations"
threads: workflow.cores
shell:
"""
nextclade3 run \
--jobs {threads} \
--input-ref {params.dataset}/reference.fasta \
--input-pathogen-json {params.dataset}/pathogen.json \
--input-annotation {params.dataset}/annotation.gff3 \
--output-fasta {output.sequences} \
--output-translations {params.translations}/{{cds}}.fasta \
--silent \
{input.sequences} & touch {output.translations}
"""


rule tree:
"""Building tree"""
input:
alignment = "results/{build}/aligned.fasta"
output:
tree = "results/{build}/tree_raw.nwk"
shell:
"""
augur tree \
--alignment {input.alignment} \
--output {output.tree}
"""

rule refine:
"""
Refining tree
- estimate timetree
- use {params.coalescent} coalescent timescale
- estimate {params.date_inference} node dates
- filter tips more than {params.clock_filter_iqd} IQDs from clock expectation
"""
input:
tree = "results/{build}/tree_raw.nwk",
alignment = "results/{build}/aligned.fasta",
metadata = "data/metadata.tsv"
output:
tree = "results/{build}/tree.nwk",
node_data = "results/{build}/branch_lengths.json"
params:
strain_id = lambda w: config[w.build]["strain_id_field"]
shell:
"""
augur refine \
--tree {input.tree} \
--alignment {input.alignment} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--output-tree {output.tree} \
--output-node-data {output.node_data} \
--root mid_point
"""

rule ancestral:
message:
"""
Reconstructing ancestral sequences and mutations
- inferring ambiguous mutations
"""
input:
tree="results/{build}/tree.nwk",
alignment="results/{build}/aligned.fasta",
annotation=lambda w: config[w.build]['files']['reference'],
reference= lambda w: config[w.build]['files']['dataset_files'] + '/reference.fasta',
output:
node_data= "results/{build}/muts.json",
params:
inference="joint",
translations= "results/{build}/translations/%GENE.fasta",
genes = lambda w: config[w.build]['genes']
shell:
"""
augur ancestral \
--tree {input.tree} \
--alignment {input.alignment} \
--inference {params.inference} \
--infer-ambiguous \
--genes {params.genes} \
--annotation {input.annotation} \
--translations {params.translations:q} \
--root-sequence {input.reference} \
--output-node-data {output.node_data}
"""

rule clades:
input:
tree = "results/{build}/tree.nwk",
muts = "results/{build}/muts.json",
clade_defs = lambda w: config[w.build]["files"]["clades"]
output:
clades = "results/{build}/clades.json"
shell:
"""
augur clades \
--tree {input.tree} \
--mutations {input.muts} \
--clades {input.clade_defs} \
--output {output.clades}
"""


rule export:
"""Exporting data files for auspice"""
input:
tree = "results/{build}/tree.nwk",
metadata = "data/metadata.tsv",
branch_lengths = "results/{build}/branch_lengths.json",
clades = "results/{build}/clades.json",
muts = "results/{build}/muts.json",
colors = lambda w: config[w.build]["files"]["colors"],
auspice_config = lambda w: config[w.build]["files"]["auspice_config"]
output:
auspice_json = "auspice/measles_{build}.json"
params:
strain_id = lambda w: config[w.build]["strain_id_field"],
metadata_columns = lambda w: config[w.build]["export"]["metadata_columns"]
shell:
"""
augur export v2 \
--tree {input.tree} \
--metadata {input.metadata} \
--metadata-id-columns {params.strain_id} \
--node-data {input.branch_lengths} {input.muts} {input.clades} \
--colors {input.colors} \
--metadata-columns {params.metadata_columns} \
--auspice-config {input.auspice_config} \
--include-root-sequence-inline \
--output {output.auspice_json}
"""


rule assemble_dataset:
"""Assembling the dataset for Nextstrain"""
input:
auspice_json = "auspice/measles_{build}.json",
output:
reference = "dataset/{build}/reference.fasta",
tree_json = "dataset/{build}/tree.json"
params:
dataset = lambda w: config[w.build]['files']['dataset_files']
threads: workflow.cores
shell:
"""
cp {params.dataset}/* dataset/{wildcards.build}/
cp {input.auspice_json} dataset/{wildcards.build}/tree.json
"""

70 changes: 70 additions & 0 deletions nextclade_genome/defaults/auspice_config_genome.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
{
"title": "Nextclade dataset for Rubella virus -- full genome",
"maintainers": [
{"name": "the Nextstrain team", "url": "https://nextstrain.org/team"}
],
"build_url": "https://github.com/nextstrain/rubella",
"extensions": {
"nextclade": {
"ref_nodes":{
"default":"__root__",
"search":[
{
"name": "L78917",
"displayname": "Vaccine strain L78917 (RA 27/3)",
"description": "Show mutations relative to the vaccine strain RA 27/3",
"criteria":[
{
"node": [{"name": ["L78917"]}]
}
]
}
]
}
}
},
"colorings": [
{
"key": "gt",
"title": "Genotype",
"type": "categorical"
},
{
"key": "clade_membership",
"title": "Genotype (Nextstrain)",
"type": "categorical"
},
{
"key": "region",
"title": "Region",
"type": "categorical"
},
{
"key": "country",
"title": "Country",
"type": "categorical"
},
{
"key": "is_reference",
"title": "WHO Reference",
"type": "categorical"
}
],
"geo_resolutions": [
"country",
"region"
],
"display_defaults": {
"map_triplicate": true,
"color_by": "clade_membership"
},
"filters": [
"clade_membership",
"region",
"country",
"author"
],
"metadata_columns": [
"author"
]
}
52 changes: 52 additions & 0 deletions nextclade_genome/defaults/clades_genome.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
clade gene site alt
# 1A nuc 8490 A
# 1A nuc 8745 T
# 1A(VAX) nuc 8327 T # including this excludes JF727653
1A nuc 8884 C
# 1A nuc 8934 T ## this is the one that hits _very_ basal, bad choice

1B nuc 8787 T
1B nuc 8781 A

1C nuc 8781 T
1C nuc 8865 C
1C nuc 8994 C
1C nuc 9207 T

1D nuc 8440 C
1D nuc 9303 T

1E nuc 8301 G
1E nuc 8667 T
1E nuc 9231 A
1E nuc 9240 C

1F nuc 8439 T
1F nuc 9306 T
1F nuc 9546 A
1F nuc 9651 C

1G nuc 8265 C
1G nuc 8508 T
1G nuc 8577 T

1H nuc 8388 A
1H nuc 8442 A
1H nuc 8637 T

1I nuc 9012 A

1J nuc 9255 G

2A nuc 8565 T
2A nuc 8754 G
2A nuc 9249 C

2B nuc 9213 C
2B nuc 9333 G
2B nuc 9348 A

2C nuc 8376 C
2C nuc 8553 A
2C nuc 9027 T
2C nuc 9623 C
Loading
Loading