diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 95b4eba..8f911b9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,18 +1,18 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.3.0 + rev: v6.0.0 hooks: - id: check-yaml - repo: https://github.com/snakemake/snakefmt - rev: v0.6.1 + rev: v2.0.1 hooks: - id: snakefmt - - repo: https://github.com/psf/black - rev: 22.6.0 + - repo: https://github.com/psf/black-pre-commit-mirror + rev: 26.5.1 hooks: - id: black - repo: https://github.com/PyCQA/isort.git - rev: 5.10.1 + rev: 9.0.0a3 hooks: - id: isort - repo: https://github.com/python-poetry/poetry diff --git a/mccoy/workflow/Snakefile b/mccoy/workflow/Snakefile index 94970c0..a5bd1b8 100644 --- a/mccoy/workflow/Snakefile +++ b/mccoy/workflow/Snakefile @@ -4,7 +4,6 @@ import yaml import snakemake import pathlib - PROJECT_DIR = Path(config['project_path']) RESOURCES_DIR = PROJECT_DIR / "resources" INPUT_DATA = config['data'] @@ -44,7 +43,6 @@ onstart: print("Workflow paths:") print(f"\t{'snakefile':20s} ➡ {workflow.snakefile}") print(f"\t{'working directory':20s} ➡ {workflow.basedir}") - print("Environment:") shell = lambda cmd: subprocess.run(cmd, shell=True, stdout=subprocess.PIPE).stdout.decode().rstrip() print(f"\t{shell('python --version'):20s} ➡ {shell('which python')}") diff --git a/mccoy/workflow/rules/align.smk b/mccoy/workflow/rules/align.smk index eb118ae..5b9872d 100644 --- a/mccoy/workflow/rules/align.smk +++ b/mccoy/workflow/rules/align.smk @@ -1,20 +1,20 @@ rule align: """ - Use `MAFFT `_ to align the combined sequence file against the project - reference. +Use `MAFFT `_ to align the combined sequence file against the project +reference. - :input original: the combined sequence file generated from the :smk:ref:`combine` rule - :input reference: the project reference sequence, provided during McCoy project creation +:input original: the combined sequence file generated from the :smk:ref:`combine` rule +:input reference: the project reference sequence, provided during McCoy project creation - :config align.mafft: a list of command line arguments passed directly to MAFFT - :config align.threads: the number of threads (cores) to use for a single MAFFT call - :config align.resources: the resources to request when submitting to a cluster +:config align.mafft: a list of command line arguments passed directly to MAFFT +:config align.threads: the number of threads (cores) to use for a single MAFFT call +:config align.resources: the resources to request when submitting to a cluster - :output: the aligned version of the original input file - :params: the command-line arguments passed to MAFFT (set in `align.mafft` config entry) - :threads: set to `align.threads` from the config file if present, else set by the number of cores available to the workflow (up-to `threads_max`) - :resources: set to `align.resources` in the project config, if present - """ +:output: the aligned version of the original input file +:params: the command-line arguments passed to MAFFT (set in `align.mafft` config entry) +:threads: set to `align.threads` from the config file if present, else set by the number of cores available to the workflow (up-to `threads_max`) +:resources: set to `align.resources` in the project config, if present +""" input: original="data/combined/{id}.fasta", reference=RESOURCES_DIR / "reference.fasta", @@ -24,16 +24,16 @@ rule align: "logs/align-{id}.txt", conda: "../envs/mafft.yml" - params: - lambda wildcards: " ".join(config["align"]["mafft"]), threads: config["align"].get("threads", config["all"]["threads_max"]) resources: **config["align"].get("resources", {}), + params: + lambda wildcards: " ".join(config["align"]["mafft"]), shell: """ REFNAME=$(head -n1 {input.reference} | tr -d '>') - mafft --thread {threads} {params} {input.original} {input.reference} 2> {log} \ - | seqkit grep -rvip "^$REFNAME" > {output} 2> {log} + mafft --thread {threads} {params} {input.original} {input.reference} 2>{log} \ + | seqkit grep -rvip "^$REFNAME" >{output} 2>{log} """ @@ -41,8 +41,6 @@ rule alignment_stats: input: alignment=rules.align.output, reference=RESOURCES_DIR / "reference.fasta", - conda: - "../envs/steenwyk.yml" output: summary="results/aligned/{id}.summary.txt", gc_content="results/aligned/{id}.gc_content.txt", @@ -51,13 +49,15 @@ rule alignment_stats: pairwise_identity_verbose="results/aligned/{id}.pairwise_identity_verbose.txt", # position_specific_score_matrix="results/aligned/{id}.position_specific_score_matrix.txt", # sum_of_pairs_score="results/aligned/{id}.sum_of_pairs_score.txt", + conda: + "../envs/steenwyk.yml" shell: """ - biokit alignment_summary {input.alignment} > {output.summary} - phykit gc_content {input.alignment} > {output.gc_content} - phykit relative_composition_variability {input.alignment} > {output.relative_composition_variability} - phykit pairwise_identity {input.alignment} > {output.pairwise_identity} - phykit pairwise_identity {input.alignment} --verbose > {output.pairwise_identity_verbose} + biokit alignment_summary {input.alignment} >{output.summary} + phykit gc_content {input.alignment} >{output.gc_content} + phykit relative_composition_variability {input.alignment} >{output.relative_composition_variability} + phykit pairwise_identity {input.alignment} >{output.pairwise_identity} + phykit pairwise_identity {input.alignment} --verbose >{output.pairwise_identity_verbose} # phykit sum_of_pairs_score {input.alignment} --reference {input.reference} > output.sum_of_pairs_score # biokit position_specific_score_matrix {input.alignment} > output.position_specific_score_matrix @@ -67,11 +67,11 @@ rule alignment_stats: rule pairwise_identity_histogram: input: rules.alignment_stats.output.pairwise_identity_verbose, - conda: - "../envs/plot_traces.yml" output: svg="results/aligned/{id}.pairwise_identity_verbose.svg", html="results/aligned/{id}.pairwise_identity_verbose.html", + conda: + "../envs/plot_traces.yml" shell: """ ${{CONDA_PREFIX}}/bin/python {SCRIPT_DIR}/pairwise_identity_histogram.py {input} {output.svg} {output.html} diff --git a/mccoy/workflow/rules/beast.smk b/mccoy/workflow/rules/beast.smk index e61feb4..039bd86 100644 --- a/mccoy/workflow/rules/beast.smk +++ b/mccoy/workflow/rules/beast.smk @@ -1,22 +1,22 @@ rule onlinebeast: """ - Use `online-beast `_ to add any new - sequences to the Beast2 analysis from an inherited run and update the state. +Use `online-beast `_ to add any new +sequences to the Beast2 analysis from an inherited run and update the state. - .. warning:: - This rule will only run if the ``--inherit`` or ``--inherit-last`` flags are passed to McCoy. +.. warning:: + This rule will only run if the ``--inherit`` or ``--inherit-last`` flags are passed to McCoy. - :input xml: the template file generated by the :smk:ref:`dynamicbeast` rule - :input state: the statefile from the inherited McCoy run. - This is compied into the ``data`` directory by the McCoy CLI. - :input alignment: the aligned sequences from the :smk:ref:`align` rule +:input xml: the template file generated by the :smk:ref:`dynamicbeast` rule +:input state: the statefile from the inherited McCoy run. + This is compied into the ``data`` directory by the McCoy CLI. +:input alignment: the aligned sequences from the :smk:ref:`align` rule - :output: the updated state file produced by online-beast. +:output: the updated state file produced by online-beast. - **Note:** No XML file is produced as we are using a template XML which - doesn't actually contain the sequences in it. - """ + **Note:** No XML file is produced as we are using a template XML which + doesn't actually contain the sequences in it. +""" input: xml=rules.dynamicbeast.output, state="data/{id}-beast.xml.state", @@ -44,34 +44,35 @@ def beast_params(wildcards): rule beast: """ - Run Beast2, either restarting from a state file or from scratch. +Run Beast2, either restarting from a state file or from scratch. - .. note:: - GPU acceleration is requested if available by default. If you are running on a machine with a compatible GPU then - the code will crash when using the bioconda package. To avoid this, either: +.. note:: + GPU acceleration is requested if available by default. If you are running on a machine with a compatible GPU then + the code will crash when using the bioconda package. To avoid this, either: - 1. ensure you pass ``--use-envmodules`` to McCoy and set the ``envmodules`` directives of this rule appropriately, or - 2. remove the ``-beagle_GPU`` flag from the ``beast.beast`` entry in your McCoy config file. + 1. ensure you pass ``--use-envmodules`` to McCoy and set the ``envmodules`` directives of this rule appropriately, or + 2. remove the ``-beagle_GPU`` flag from the ``beast.beast`` entry in your McCoy config file. - :input alignment: the aligned fasta file output from :smk:ref:`align` - :input template: the Beast 2 input XML file, templated with `feast `_. - If ``inherit`` is set in the config then the output of the :smk:ref:`onlinebeast` rule is used, - otherwise the output of the :smk:ref:`dynamicbeast` rule is used. +:input alignment: the aligned fasta file output from :smk:ref:`align` +:input template: the Beast 2 input XML file, templated with `feast `_. + If ``inherit`` is set in the config then the output of the :smk:ref:`onlinebeast` rule is used, + otherwise the output of the :smk:ref:`dynamicbeast` rule is used. - :output: the tree log, trace log, and statefile from Beast2 +:output: the tree log, trace log, and statefile from Beast2 - :config inherit: are we inheriting from a previous run? - :config beast.dynamic: the dynamic variables used to populate the feast template. - :config beast.beast: Beast2 command line arguments to pass (beyond the params, statefile and input) - :config beast.threads: the number of cores to run with (both locally or when submitting to a cluster) - :config beast.resources: the resources to request when submitting to a cluster +:config inherit: are we inheriting from a previous run? +:config beast.dynamic: the dynamic variables used to populate the feast template. +:config beast.beast: Beast2 command line arguments to pass (beyond the params, statefile and input) +:config beast.threads: the number of cores to run with (both locally or when submitting to a cluster) +:config beast.resources: the resources to request when submitting to a cluster - :envmodules: environment variables to load for the Spartan HPC system +:envmodules: environment variables to load for the Spartan HPC system - ..note:: - GPU acceleration is **not** requested by default. If you are running on a machine with a compatible GPU then - please replace ``-beagle`` with ``-beagle_GPU`` in the ``beast.beast`` entry in your McCoy ``config.yaml`` file. - """ +..note:: + GPU acceleration is **not** requested by default. If you are running on a machine with a compatible GPU then + please replace ``-beagle`` with ``-beagle_GPU`` in the ``beast.beast`` entry in your McCoy ``config.yaml`` file. + +""" input: alignment=rules.align.output, template=rules.dynamicbeast.output, @@ -84,25 +85,25 @@ rule beast: "logs/{id}_beast.log", conda: "../envs/beast.yml" - params: - dynamic=lambda wildcards: ",".join(config["beast"]["dynamic"]), - beast=beast_params, + envmodules: + *config["beast"].get("envmodules", []), threads: config["beast"].get("threads", config["all"]["threads_max"]) resources: **config["beast"].get("resources", {}), - envmodules: - *config["beast"].get("envmodules", []), + params: + dynamic=lambda wildcards: ",".join(config["beast"]["dynamic"]), + beast=beast_params, shell: """ if [[ -n "{input.statefile}" ]]; then cp {input.statefile} {output.statefile}; fi - beast -D 'alignment={input.alignment},tracelog={output.tracelog},treelog={output.treelog},mcmc.threads={threads},{params.dynamic}' {params.beast} -statefile {output.statefile} {input.template} 1>&2 2> {log} + beast -D 'alignment={input.alignment},tracelog={output.tracelog},treelog={output.treelog},mcmc.threads={threads},{params.dynamic}' {params.beast} -statefile {output.statefile} {input.template} 1>&2 2>{log} """ rule plot_traces: """ - Makes trace plots from the beast log file. - """ +Makes trace plots from the beast log file. +""" input: expand(rules.beast.output.tracelog, id=config['id']), output: @@ -117,8 +118,8 @@ rule plot_traces: rule arviz: """ - Makes trace plots from the beast log file. - """ +Makes trace plots from the beast log file. +""" input: expand(rules.beast.output.tracelog, id=config['id']), output: @@ -135,8 +136,8 @@ rule arviz: rule max_clade_credibility_tree: """ - Makes trace plots from the beast log file. - """ +Makes trace plots from the beast log file. +""" input: expand(rules.beast.output.treelog, id=config['id']), output: @@ -151,8 +152,8 @@ rule max_clade_credibility_tree: rule max_clade_credibility_tree_newick: """ - Makes trace plots from the beast log file. - """ +Makes trace plots from the beast log file. +""" input: expand(rules.max_clade_credibility_tree.output, id=config['id']), output: @@ -165,8 +166,8 @@ rule max_clade_credibility_tree_newick: rule max_clade_credibility_tree_render: """ - Renders the consensus maximum likelihood tree from iqtree in SVG and HTML format. - """ +Renders the consensus maximum likelihood tree from iqtree in SVG and HTML format. +""" input: expand(rules.max_clade_credibility_tree_newick.output, id=config['id']), output: diff --git a/mccoy/workflow/rules/combine.smk b/mccoy/workflow/rules/combine.smk index 27690d1..ad4b358 100644 --- a/mccoy/workflow/rules/combine.smk +++ b/mccoy/workflow/rules/combine.smk @@ -1,10 +1,10 @@ rule combine: """ - Combine multiple sequence files together into a single file. +Combine multiple sequence files together into a single file. - :input data: the sequence files to be concatenated - :output: a single concatenated fasta file - """ +:input data: the sequence files to be concatenated +:output: a single concatenated fasta file +""" input: data=INPUT_DATA, output: @@ -15,5 +15,5 @@ rule combine: "../envs/combine.yml" shell: """ - cat {input.data} | sed s/\@/_/g | seqkit rmdup -n -o {output} 2> {log} + cat {input.data} | sed s/\@/_/g | seqkit rmdup -n -o {output} 2>{log} """ diff --git a/mccoy/workflow/rules/dynamicbeast.smk b/mccoy/workflow/rules/dynamicbeast.smk index a325bb7..93a512c 100644 --- a/mccoy/workflow/rules/dynamicbeast.smk +++ b/mccoy/workflow/rules/dynamicbeast.smk @@ -1,16 +1,16 @@ rule dynamicbeast: """ - Use `dynamicbeast `_ to generate - a dynamic BEAST2 XML template from a standard static one, for use with - `feast `_. +Use `dynamicbeast `_ to generate +a dynamic BEAST2 XML template from a standard static one, for use with +`feast `_. - :input template: the BEAST XML template file - :input phytest_report: the phytest report file generated by the :smk:ref:`phytest` rule. - By using this as an input, we ensure that this rule (and all downstream rules) - only run when our quality control checks pass. +:input template: the BEAST XML template file +:input phytest_report: the phytest report file generated by the :smk:ref:`phytest` rule. + By using this as an input, we ensure that this rule (and all downstream rules) + only run when our quality control checks pass. - :output: the dynamic XML template file to be used by the :smk:ref:`onlinebeast` and/or :smk:ref:`beast` rules - """ +:output: the dynamic XML template file to be used by the :smk:ref:`onlinebeast` and/or :smk:ref:`beast` rules +""" input: template=RESOURCES_DIR / "template.xml", phytest_report="results/{id}-phytest.html", @@ -22,5 +22,5 @@ rule dynamicbeast: "../envs/dynamicbeast.yml" shell: """ - dynamic-beast {input.template} > {output} 2> {log} + dynamic-beast {input.template} >{output} 2>{log} """ diff --git a/mccoy/workflow/rules/phytest.smk b/mccoy/workflow/rules/phytest.smk index 5790a81..61f705c 100644 --- a/mccoy/workflow/rules/phytest.smk +++ b/mccoy/workflow/rules/phytest.smk @@ -1,13 +1,13 @@ rule phytest: """ - Run quality control checks using `phytest `_. +Run quality control checks using `phytest `_. - :input alignment: the aligned fasta file from the :smk:ref:`align` rule - :input tree: the maximum likelihood tree generated by the :smk:ref:`tree` rule - :input phytest_file: the phytest test file +:input alignment: the aligned fasta file from the :smk:ref:`align` rule +:input tree: the maximum likelihood tree generated by the :smk:ref:`tree` rule +:input phytest_file: the phytest test file - :output: The phytest html report, placed into the Snakemake report - """ +:output: The phytest html report, placed into the Snakemake report +""" input: alignment="results/aligned/{id}.fasta", tree="results/tree/{id}.fasta.treefile", @@ -20,5 +20,5 @@ rule phytest: "../envs/phytest.yml" shell: """ - phytest {input.phytest_file} -s {input.alignment} -t {input.tree} --report {output} -v > {log} + phytest {input.phytest_file} -s {input.alignment} -t {input.tree} --report {output} -v >{log} """ diff --git a/mccoy/workflow/rules/report.smk b/mccoy/workflow/rules/report.smk index 1ce6e07..4333e55 100644 --- a/mccoy/workflow/rules/report.smk +++ b/mccoy/workflow/rules/report.smk @@ -23,7 +23,7 @@ rule dag_svg: "../envs/graphviz.yml" shell: """ - dot -Tsvg {input} > {output} + dot -Tsvg {input} >{output} """ @@ -53,32 +53,26 @@ rule report: loader = jinja2.FileSystemLoader(report_dir) env = jinja2.Environment(loader=loader, autoescape=jinja2.select_autoescape()) - def include_file_unsafe(name): if name: return Path(str(name)).read_text() return "" - def include_file(name): if name: return markupsafe.Markup(include_file_unsafe(name)) return "" - def include_raw(name): if name: file = report_dir / name return markupsafe.Markup(file.read_text()) return "" - env.globals['include_file_unsafe'] = include_file_unsafe env.globals['include_file'] = include_file env.globals['include_raw'] = include_raw - output_path = Path(output.html).resolve() - template = env.get_template("report-template.html") try: result = template.render( @@ -89,8 +83,6 @@ rule report: ) except Exception as err: print(f"could not render template: {err}") - - with open(output_path, 'w') as f: print(f"Writing result to {output_path}") f.write(result) diff --git a/mccoy/workflow/rules/tree.smk b/mccoy/workflow/rules/tree.smk index d192d18..f29b578 100644 --- a/mccoy/workflow/rules/tree.smk +++ b/mccoy/workflow/rules/tree.smk @@ -5,19 +5,19 @@ def iqtree_random_seed(wildcards): rule tree: """ - Use `iqtree `_ to generate the maximum likelihood phylogenomic tree. +Use `iqtree `_ to generate the maximum likelihood phylogenomic tree. - :input: the aligned fasta file from the :smk:ref:`align` rule - :output: the files output by iqtree, most notably `*.treefile` +:input: the aligned fasta file from the :smk:ref:`align` rule +:output: the files output by iqtree, most notably `*.treefile` - :config tree.iqtree2: the iqtree config parameters passed on the command line +:config tree.iqtree2: the iqtree config parameters passed on the command line - **Note:** `-pre` is set automatically by McCoy. - `-seed` will also be take on the value of the environment variable `IQTREE_SEED` if set. + **Note:** `-pre` is set automatically by McCoy. + `-seed` will also be take on the value of the environment variable `IQTREE_SEED` if set. - :config tree.threads: the maximum number of threads available to iqtree - :config tree.resources: the resources to request when submitting this rule to a cluster - """ +:config tree.threads: the maximum number of threads available to iqtree +:config tree.resources: the resources to request when submitting this rule to a cluster +""" input: "results/aligned/{id}.fasta", output: @@ -36,45 +36,45 @@ rule tree: "logs/tree-{id}.txt", conda: "../envs/iqtree.yml" + threads: config["tree"].get("threads", config["all"]["threads_max"]) + resources: + **config['tree'].get('resources', {}), params: config=lambda wildcards: " ".join(config["tree"]["iqtree2"]), pre=lambda wildcards, output: Path(output[0]).with_suffix(''), seed=iqtree_random_seed, - threads: config["tree"].get("threads", config["all"]["threads_max"]) - resources: - **config['tree'].get('resources', {}), shell: """ - iqtree2 -s {input} -st DNA -pre {params.pre} {params.config} {params.seed} -ntmax {threads} 2>&1 > {log} + iqtree2 -s {input} -st DNA -pre {params.pre} {params.config} {params.seed} -ntmax {threads} 2>&1 >{log} """ rule render_mltree: """ - Renders the maximum likelihood tree from iqtree in SVG and HTML format. +Renders the maximum likelihood tree from iqtree in SVG and HTML format. - :input: the tree file produced by the :smk:ref:`tree` rule +:input: the tree file produced by the :smk:ref:`tree` rule - :output svg: the maximum likeihood tree (svg) - :output html: the maximum likeihood tree (html) - """ +:output svg: the maximum likeihood tree (svg) +:output html: the maximum likeihood tree (html) +""" input: "results/tree/{id}.fasta.treefile", output: svg="results/tree/{id}-mltree.svg", html="results/tree/{id}-mltree.html", - conda: - "../envs/toytree.yml" log: "logs/render_tree-{id}.txt", + conda: + "../envs/toytree.yml" shell: "${{CONDA_PREFIX}}/bin/python {SCRIPT_DIR}/render_tree.py {input} --svg {output.svg} --html {output.html}" rule render_consensus_mltree: """ - Renders the consensus maximum likelihood tree from iqtree in SVG and HTML format. - """ +Renders the consensus maximum likelihood tree from iqtree in SVG and HTML format. +""" input: "results/tree/{id}.fasta.contree", output: