diff --git a/CITATIONS.md b/CITATIONS.md index 843f5d3..31893a3 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -14,6 +14,10 @@ > Andrews, S. (2010). FastQC: A Quality Control Tool for High Throughput Sequence Data [Online]. +- [DIAMOND](https://github.com/bbuchfink/diamond) + +> Buchfink B, Xie C, Huson DH, "Fast and sensitive protein alignment using DIAMOND", Nature Methods 12, 59-60 (2015). doi:10.1038/nmeth.3176 + - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. diff --git a/README.md b/README.md index 7405207..490f461 100644 --- a/README.md +++ b/README.md @@ -29,7 +29,10 @@ -1. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) + + +1. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) +2. 
Functional Annotation ([`DIAMOND`](https://github.com/bbuchfink/diamond)) ## Usage diff --git a/modules.json b/modules.json index 1bfd609..3b1f9f4 100644 --- a/modules.json +++ b/modules.json @@ -5,6 +5,11 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "diamond/blastp": { + "branch": "master", + "git_sha": "05954dab2ff481bcb999f24455da29a5828af08d", + "installed_by": ["modules"] + }, "multiqc": { "branch": "master", "git_sha": "f0719ae309075ae4a291533883847c3f7c441dad", diff --git a/modules/nf-core/diamond/blastp/environment.yml b/modules/nf-core/diamond/blastp/environment.yml new file mode 100644 index 0000000..6a9b16a --- /dev/null +++ b/modules/nf-core/diamond/blastp/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::diamond=2.1.11 diff --git a/modules/nf-core/diamond/blastp/main.nf b/modules/nf-core/diamond/blastp/main.nf new file mode 100644 index 0000000..6dd8d39 --- /dev/null +++ b/modules/nf-core/diamond/blastp/main.nf @@ -0,0 +1,111 @@ +process DIAMOND_BLASTP { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/diamond:2.1.11--h5ca1c30_0' : + 'biocontainers/diamond:2.1.11--h5ca1c30_0' }" + + input: + tuple val(meta) , path(fasta) + tuple val(meta2), path(db) + val outfmt + val blast_columns + + output: + tuple val(meta), path('*.{blast,blast.gz}'), optional: true, emit: blast + tuple val(meta), path('*.{xml,xml.gz}') , optional: true, emit: xml + tuple val(meta), path('*.{txt,txt.gz}') , optional: true, emit: txt + tuple val(meta), path('*.{daa,daa.gz}') , optional: true, emit: daa + tuple val(meta), path('*.{sam,sam.gz}') , optional: true, emit: sam + tuple val(meta), path('*.{tsv,tsv.gz}') , optional: true, emit: tsv + tuple val(meta), path('*.{paf,paf.gz}') , optional: true, emit: paf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + def columns = blast_columns ? "${blast_columns}" : '' + def out_ext = "" + + if (outfmt == 0) { + out_ext = "blast" + } else if (outfmt == 5) { + out_ext = "xml" + } else if (outfmt == 6) { + out_ext = "txt" + } else if (outfmt == 100) { + out_ext = "daa" + } else if (outfmt == 101) { + out_ext = "sam" + } else if (outfmt == 102) { + out_ext = "tsv" + } else if (outfmt == 103) { + out_ext = "paf" + } else { + log.warn("Unknown output file format provided (${outfmt}): selecting DIAMOND default of tabular BLAST output (txt)") + outfmt = 6 + out_ext = 'txt' + } + + if ( args =~ /--compress\s+1/ ) out_ext += '.gz' + + """ + diamond \\ + blastp \\ + --threads ${task.cpus} \\ + --db ${db} \\ + --query ${fasta} \\ + --outfmt ${outfmt} ${columns} \\ + ${args} \\ + --out ${prefix}.${out_ext} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + diamond: \$(diamond --version 2>&1 | tail -n 1 | sed 's/^diamond version //') + END_VERSIONS + """ + + stub: + // ext.args must be read here too: 'args' defined under script: is out of scope in stub:, and the --compress check below depends on it + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + def out_ext = "" + + if (outfmt == 0) { + out_ext = "blast" 
+ } else if (outfmt == 5) { + out_ext = "xml" + } else if (outfmt == 6) { + out_ext = "txt" + } else if (outfmt == 100) { + out_ext = "daa" + } else if (outfmt == 101) { + out_ext = "sam" + } else if (outfmt == 102) { + out_ext = "tsv" + } else if (outfmt == 103) { + out_ext = "paf" + } else { + log.warn("Unknown output file format provided (${outfmt}): selecting DIAMOND default of tabular BLAST output (txt)") + outfmt = 6 + out_ext = 'txt' + } + + if ( args =~ /--compress\s+1/ ) out_ext += '.gz' + + """ + touch ${prefix}.${out_ext} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + diamond: \$(diamond --version 2>&1 | tail -n 1 | sed 's/^diamond version //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/diamond/blastp/meta.yml b/modules/nf-core/diamond/blastp/meta.yml new file mode 100644 index 0000000..239e252 --- /dev/null +++ b/modules/nf-core/diamond/blastp/meta.yml @@ -0,0 +1,154 @@ +name: diamond_blastp +description: Queries a DIAMOND database using blastp mode +keywords: + - fasta + - diamond + - blastp + - DNA sequence +tools: + - diamond: + description: Accelerated BLAST compatible local sequence aligner + homepage: https://github.com/bbuchfink/diamond + documentation: https://github.com/bbuchfink/diamond/wiki + tool_dev_url: https://github.com/bbuchfink/diamond + doi: "10.1038/s41592-021-01101-x" + licence: ["GPL v3.0"] + identifier: biotools:diamond +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Input fasta file containing query sequences + pattern: "*.{fa,fasta,fa.gz,fasta.gz}" + ontologies: + - edam: http://edamontology.org/format_1929 # FASTA + - - meta2: + type: map + description: | + Groovy Map containing db information + e.g. 
[ id:'test2', single_end:false ] + - db: + type: file + description: File of the indexed DIAMOND database + pattern: "*.dmnd" + ontologies: [] + - - outfmt: + type: integer + description: | + Specify the type of output file to be generated. + 0, .blast, BLAST pairwise format. + 5, .xml, BLAST XML format. + 6, .txt, BLAST tabular format (default). This format can be customized, the 6 may be followed by a space-separated list of the blast_columns keywords, each specifying a field of the output. + 100, .daa, DIAMOND alignment archive (DAA). The DAA format is a proprietary binary format that can subsequently be used to generate other output formats using the view command. It is also supported by MEGAN and allows a quick import of results. + 101, .sam, SAM format. + 102, .tsv, Taxonomic classification. This format will not print alignments but only a taxonomic classification for each query using the LCA algorithm. + 103, .paf, PAF format. The custom fields in the format are AS (bit score), ZR (raw score) and ZE (e-value). + pattern: "0|5|6|100|101|102|103" + - - blast_columns: + type: string + description: | + Optional space separated list of DIAMOND tabular BLAST output keywords + used in conjunction with the --outfmt 6 option (txt). + Options: + qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore +output: + - blast: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.{blast,blast.gz}": + type: file + description: File containing blastp hits + pattern: "*.{blast,blast.gz}" + ontologies: + - edam: http://edamontology.org/format_3836 # BLAST XML v2 results format + - xml: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "*.{xml,xml.gz}": + type: file + description: File containing blastp hits + pattern: "*.{xml,xml.gz}" + ontologies: + - edam: http://edamontology.org/format_2332 # XML + - txt: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.{txt,txt.gz}": + type: file + description: File containing hits in tabular BLAST format. + pattern: "*.{txt,txt.gz}" + ontologies: + - edam: http://edamontology.org/format_1333 # BLAST results + - daa: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.{daa,daa.gz}": + type: file + description: File containing hits in DAA format + pattern: "*.{daa,daa.gz}" + ontologies: [] + - sam: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.{sam,sam.gz}": + type: file + description: File containing aligned reads in SAM format + pattern: "*.{sam,sam.gz}" + ontologies: + - edam: http://edamontology.org/format_2573 # SAM + - tsv: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.{tsv,tsv.gz}": + type: file + description: Tab separated file containing taxonomic classification of hits + pattern: "*.{tsv,tsv.gz}" + ontologies: + - edam: http://edamontology.org/format_3475 # TSV + - paf: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - "*.{paf,paf.gz}": + type: file + description: File containing aligned reads in pairwise mapping format (PAF) + pattern: "*.{paf,paf.gz}" + ontologies: [] + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@spficklin" + - "@jfy133" +maintainers: + - "@spficklin" + - "@jfy133" + - "@vagkaratzas" diff --git a/modules/nf-core/diamond/blastp/tests/main.nf.test b/modules/nf-core/diamond/blastp/tests/main.nf.test new file mode 100644 index 0000000..9211915 --- /dev/null +++ b/modules/nf-core/diamond/blastp/tests/main.nf.test @@ -0,0 +1,141 @@ +nextflow_process { + + name "Test Process DIAMOND_BLASTP" + script "../main.nf" + process "DIAMOND_BLASTP" + tag "modules" + tag "modules_nfcore" + tag "diamond" + tag "diamond/makedb" + tag "diamond/blastp" + + setup { + run("DIAMOND_MAKEDB") { + script "../../makedb/main.nf" + process { + """ + input[0] = [ [id:'test2'], [ file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta', checkIfExists: true) ] ] + input[1] = [] + input[2] = [] + input[3] = [] + """ + } + } + } + + test("sarscov2 - proteome - txt") { + + when { + process { + """ + input[0] = [ [id:'test'], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta', checkIfExists: true) ] + input[1] = DIAMOND_MAKEDB.out.db + input[2] = 6 + input[3] = 'qseqid qlen' + """ + } + } + + then { + assertAll( + { assert process.success }, + { 
assert snapshot(process.out).match("gz_txt")} + ) + } + + } + + test("sarscov2 - proteome - daa") { + + when { + process { + """ + input[0] = [ [id:'test'], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta', checkIfExists: true) ] + input[1] = DIAMOND_MAKEDB.out.db + input[2] = 100 + input[3] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert process.out.daa }, + { assert snapshot(process.out.versions).match("daa") } + ) + } + + } + + test("sarscov2 - proteome - txt - gz") { + + config "./nextflow.config" + + when { + process { + """ + input[0] = [ [id:'test'], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta', checkIfExists: true) ] + input[1] = DIAMOND_MAKEDB.out.db + input[2] = 6 + input[3] = 'qseqid qlen' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match("txt_gz") } + ) + } + + } + + test("sarscov2 - proteome - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ [id:'test'], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/proteome.fasta', checkIfExists: true) ] + input[1] = DIAMOND_MAKEDB.out.db + input[2] = 6 + input[3] = 'qseqid qlen' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match("stub") } + ) + } + + } + +} diff --git a/modules/nf-core/diamond/blastp/tests/main.nf.test.snap b/modules/nf-core/diamond/blastp/tests/main.nf.test.snap new file mode 100644 index 0000000..44d5043 --- /dev/null +++ b/modules/nf-core/diamond/blastp/tests/main.nf.test.snap @@ -0,0 +1,290 @@ +{ + "sarscov2 - proteome - txt": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + [ + { + "id": "test" + }, + "test.txt:md5,8131b1afd717f3d5f2f2417c5b562e6e" + ] + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + + ], + "6": [ + + ], + "7": [ + "versions.yml:md5,5f638327037bee3c00e17521c04a652f" + ], + "blast": [ + + ], + "daa": [ + 
+ ], + "paf": [ + + ], + "sam": [ + + ], + "tsv": [ + + ], + "txt": [ + [ + { + "id": "test" + }, + "test.txt:md5,8131b1afd717f3d5f2f2417c5b562e6e" + ] + ], + "versions": [ + "versions.yml:md5,5f638327037bee3c00e17521c04a652f" + ], + "xml": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.2" + }, + "timestamp": "2025-01-28T10:25:13.48912978" + }, + "txt_gz": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + [ + { + "id": "test" + }, + "test.txt.gz:md5,8131b1afd717f3d5f2f2417c5b562e6e" + ] + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + + ], + "6": [ + + ], + "7": [ + "versions.yml:md5,5f638327037bee3c00e17521c04a652f" + ], + "blast": [ + + ], + "daa": [ + + ], + "paf": [ + + ], + "sam": [ + + ], + "tsv": [ + + ], + "txt": [ + [ + { + "id": "test" + }, + "test.txt.gz:md5,8131b1afd717f3d5f2f2417c5b562e6e" + ] + ], + "versions": [ + "versions.yml:md5,5f638327037bee3c00e17521c04a652f" + ], + "xml": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.2" + }, + "timestamp": "2025-01-28T10:36:04.361504205" + }, + "gz_txt": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + [ + { + "id": "test" + }, + "test.txt:md5,8131b1afd717f3d5f2f2417c5b562e6e" + ] + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + + ], + "6": [ + + ], + "7": [ + "versions.yml:md5,5f638327037bee3c00e17521c04a652f" + ], + "blast": [ + + ], + "daa": [ + + ], + "paf": [ + + ], + "sam": [ + + ], + "tsv": [ + + ], + "txt": [ + [ + { + "id": "test" + }, + "test.txt:md5,8131b1afd717f3d5f2f2417c5b562e6e" + ] + ], + "versions": [ + "versions.yml:md5,5f638327037bee3c00e17521c04a652f" + ], + "xml": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.2" + }, + "timestamp": "2025-01-28T10:25:20.993203497" + }, + "daa": { + "content": [ + [ + "versions.yml:md5,5f638327037bee3c00e17521c04a652f" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.2" + }, + "timestamp": "2025-01-28T10:25:28.126992812" + }, + "stub": { + 
"content": [ + { + "0": [ + + ], + "1": [ + + ], + "2": [ + [ + { + "id": "test" + }, + "test.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + + ], + "4": [ + + ], + "5": [ + + ], + "6": [ + + ], + "7": [ + "versions.yml:md5,5f638327037bee3c00e17521c04a652f" + ], + "blast": [ + + ], + "daa": [ + + ], + "paf": [ + + ], + "sam": [ + + ], + "tsv": [ + + ], + "txt": [ + [ + { + "id": "test" + }, + "test.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,5f638327037bee3c00e17521c04a652f" + ], + "xml": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.2" + }, + "timestamp": "2025-01-28T10:25:34.911633513" + } +} \ No newline at end of file diff --git a/modules/nf-core/diamond/blastp/tests/nextflow.config b/modules/nf-core/diamond/blastp/tests/nextflow.config new file mode 100644 index 0000000..bd28cb1 --- /dev/null +++ b/modules/nf-core/diamond/blastp/tests/nextflow.config @@ -0,0 +1,7 @@ +process { + + withName: DIAMOND_BLASTP { + ext.args = '--compress 1' + } + +} diff --git a/nextflow.config b/nextflow.config index 1dcfdb8..80172df 100644 --- a/nextflow.config +++ b/nextflow.config @@ -18,6 +18,11 @@ params { igenomes_base = 's3://ngi-igenomes/igenomes/' igenomes_ignore = false + // DIAMOND options + diamond_db = null + diamond_outfmt = 102 + diamond_blast_columns = '' + // MultiQC options multiqc_config = null multiqc_title = null diff --git a/subworkflows/local/functional_annotation/main.nf b/subworkflows/local/functional_annotation/main.nf index af1134b..36bf2d3 100644 --- a/subworkflows/local/functional_annotation/main.nf +++ b/subworkflows/local/functional_annotation/main.nf @@ -1,3 +1,5 @@ +include { DIAMOND_BLASTP } from '../../../modules/nf-core/diamond/blastp/main' + workflow FUNCTIONAL_ANNOTATION { take: @@ -9,8 +11,19 @@ workflow FUNCTIONAL_ANNOTATION { // TODO nf-core: substitute modules here for the modules of your subworkflow + ch_diamond_db = Channel.of( [ [id:"diamond_db"], 
file(params.diamond_db, checkIfExists: true) ] ).first() // .first() turns the one-item queue channel into a value channel so the database is re-used for every FASTA in ch_fasta + + DIAMOND_BLASTP ( + ch_fasta, + ch_diamond_db, + params.diamond_outfmt, + params.diamond_blast_columns, + ) + ch_versions = ch_versions.mix(DIAMOND_BLASTP.out.versions.first()) + emit: // TODO nf-core: edit emitted channels + ch_diamond_tsv = DIAMOND_BLASTP.out.tsv // channel: [ val(meta), path(tsv) ] versions = ch_versions // channel: [ versions.yml ] } diff --git a/subworkflows/local/functional_annotation/meta.yml b/subworkflows/local/functional_annotation/meta.yml index cf6ee7a..9fce546 100644 --- a/subworkflows/local/functional_annotation/meta.yml +++ b/subworkflows/local/functional_annotation/meta.yml @@ -4,6 +4,7 @@ description: Functional annotation of proteins keywords: - fasta components: + - diamond/blastp input: - ch_fasta: type: file @@ -12,6 +13,18 @@ input: Structure: [ val(meta), path(fasta) ] pattern: "*.{fa,fasta,fa.gz,fasta.gz}" output: + - tsv: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.{tsv,tsv.gz}": + type: file + description: Tab separated file containing taxonomic classification of hits + pattern: "*.{tsv,tsv.gz}" + ontologies: + - edam: http://edamontology.org/format_3475 # TSV - versions: type: file description: |