diff --git a/modules/nf-core/gatk4/cleansam/environment.yml b/modules/nf-core/gatk4/cleansam/environment.yml
new file mode 100644
index 00000000000..e6f0f713b9a
--- /dev/null
+++ b/modules/nf-core/gatk4/cleansam/environment.yml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  # renovate: datasource=conda depName=bioconda/gatk4
+  - "bioconda::gatk4=4.6.2.0"
diff --git a/modules/nf-core/gatk4/cleansam/main.nf b/modules/nf-core/gatk4/cleansam/main.nf
new file mode 100644
index 00000000000..9d447570524
--- /dev/null
+++ b/modules/nf-core/gatk4/cleansam/main.nf
@@ -0,0 +1,79 @@
+// Cleans a BAM with GATK4 CleanSam; can also emit a BAM index and an MD5 file.
+process GATK4_CLEANSAM {
+    tag "${meta.id}"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
+        ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/ce/ced519873646379e287bc28738bdf88e975edd39a92e7bc6a34bccd37153d9d0/data'
+        : 'community.wave.seqera.io/library/gatk4_gcnvkernel:edb12e4f0bf02cd3'}"
+
+    input:
+    tuple val(meta), path(bam)
+    tuple val(meta2), path(fasta)
+    tuple val(meta3), path(fasta_index)
+    // input file must be sorted for index to be created
+    val create_index
+    val create_md5
+
+    output:
+    tuple val(meta), path("*.bam"), emit: bam
+    tuple val(meta), path("*.bai"), emit: bai, optional: true
+    tuple val(meta), path("*.md5"), emit: md5, optional: true
+    tuple val("${task.process}"), val('gatk'), eval("gatk CleanSam --version | grep GATK | sed 's/.*(GATK) v//'"), topic: versions, emit: versions_gatk
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    // Optional flags collapse to an empty string when the matching input is unset/false
+    def index = create_index ? "--CREATE_INDEX true" : ""
+    def reference = fasta ? "--REFERENCE_SEQUENCE ${fasta}" : ""
+    def md5 = create_md5 ? "--CREATE_MD5_FILE true" : ""
+    // JVM heap: 80% of the task memory in MB, or 3072 MB when task.memory is not set
+    def avail_mem = 3072
+    if (!task.memory) {
+        log.info('[GATK CleanSam] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.')
+    }
+    else {
+        avail_mem = (task.memory.mega * 0.8).intValue()
+    }
+
+    // Fail early when the output BAM name would equal the input BAM name
+    if ("${bam}" == "${prefix}.bam") {
+        error("Input and output names are the same, use \"task.ext.prefix\" to disambiguate!")
+    }
+
+    """
+    gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\
+        CleanSam \\
+        ${args} \\
+        ${reference} \\
+        ${index} \\
+        ${md5} \\
+        --INPUT ${bam} \\
+        --OUTPUT ${prefix}.bam
+
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def index = create_index ? "touch ${prefix}.bam.bai" : ""
+    def md5 = create_md5 ? "touch ${prefix}.md5" : ""
+    if ("${bam}" == "${prefix}.bam") {
+        error("Input and output names are the same, use \"task.ext.prefix\" to disambiguate!")
+    }
+    // Run each touch as a separate command: joining them with line continuations
+    // turned the later "touch" words into arguments of the first command, which
+    // left a stray file literally named "touch" in the work directory.
+    // NOTE(review): the recorded non-stub snapshot lists ${prefix}.bai and
+    // ${prefix}.bam.md5, while the stub creates ${prefix}.bam.bai and ${prefix}.md5;
+    // names are left unchanged here so the existing stub snapshots keep matching.
+    """
+    touch ${prefix}.bam
+    ${index}
+    ${md5}
+    """
+}
diff --git a/modules/nf-core/gatk4/cleansam/meta.yml b/modules/nf-core/gatk4/cleansam/meta.yml
new file mode 100644
index 00000000000..6e63a112db1
--- /dev/null
+++ b/modules/nf-core/gatk4/cleansam/meta.yml
@@ -0,0 +1,123 @@
+name: "gatk4_cleansam"
+description: Cleans the provided BAM, soft-clipping beyond-end-of-reference
+  alignments and setting MAPQ to 0 for unmapped reads
+keywords:
+  - clean
+  - bam
+  - gatk
+  - sam
+  - clipping
+tools:
+  - "gatk":
+      description: |
+        Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools
+        with a primary focus on variant discovery and genotyping. Its powerful processing engine
+        and high-performance computing features make it capable of taking on projects of any size.
+ homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672 + doi: 10.1158/1538-7445.AM2017-3590 + licence: + - "Apache-2.0" + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM file + pattern: "*.{bam}" + ontologies: + - edam: "http://edamontology.org/format_2572" + - - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta: + type: file + description: Reference FASTA file + pattern: "*.{fasta}" + ontologies: + - edam: "http://edamontology.org/format_1929" + - - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta_index: + type: file + description: Index file for the reference FASTA + pattern: "*.{fai}" + ontologies: + - edam: "http://edamontology.org/format_3703" + - create_index: + type: boolean + description: Whether to create an index file for the output BAM + - create_md5: + type: boolean + description: Whether to create an MD5 checksum for the output BAM +output: + bam: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.bam": + type: file + description: Cleaned BAM file + pattern: "*.{bam}" + ontologies: + - edam: "http://edamontology.org/format_2572" + bai: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.bai": + type: file + description: Index file for the cleaned BAM file + pattern: "*.{bai}" + ontologies: + - edam: "http://edamontology.org/format_3327" + md5: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ]
+      - "*.md5":
+          type: file
+          description: MD5 checksum for the cleaned BAM file
+          pattern: "*.{md5}"
+          ontologies:
+            - edam: "http://edamontology.org/format_3823"
+  versions_gatk:
+    - - ${task.process}:
+          type: string
+          description: The process the versions were collected from
+      - gatk:
+          type: string
+          description: The tool name
+      - "gatk CleanSam --version | grep GATK | sed 's/.*(GATK) v//'":
+          type: eval
+          description: The command used to generate the version of the tool
+topics:
+  versions:
+    - - ${task.process}:
+          type: string
+          description: The process the versions were collected from
+      - gatk:
+          type: string
+          description: The tool name
+      - "gatk CleanSam --version | grep GATK | sed 's/.*(GATK) v//'":
+          type: eval
+          description: The command used to generate the version of the tool
+authors:
+  - "@sofiademmou"
+maintainers:
+  - "@sofiademmou"
diff --git a/modules/nf-core/gatk4/cleansam/tests/main.nf.test b/modules/nf-core/gatk4/cleansam/tests/main.nf.test
new file mode 100644
index 00000000000..e197c256422
--- /dev/null
+++ b/modules/nf-core/gatk4/cleansam/tests/main.nf.test
@@ -0,0 +1,197 @@
+// nf-test suite for GATK4_CLEANSAM: three real runs followed by three -stub runs.
+nextflow_process {
+
+    name "Test Process GATK4_CLEANSAM"
+    script "../main.nf"
+    process "GATK4_CLEANSAM"
+    config "./nextflow.config" // test config; sets ext.prefix for GATK4_CLEANSAM
+
+    tag "modules"
+    tag "modules_nfcore"
+    tag "gatk4"
+    tag "gatk4/cleansam"
+
+    test("sarscov2 - bam") { // empty reference tuples, create_index and create_md5 both false
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test', single_end:true ], // meta map
+                    file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.single_end.sorted.bam', checkIfExists: true)
+                ]
+                input[1] = [[], []]
+                input[2] = [[], []]
+                input[3] = false
+                input[4] = false
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(
+                    bam(process.out.bam[0][1]).getReadsMD5(), // snapshots getReadsMD5() of the output BAM
+                    process.out.findAll { key, val -> key.startsWith("versions") } // only outputs whose key starts with "versions"
+                    ).match()}
+            )
+        }
+    }
+
+    test("sarscov2 - bam + fasta") { // reference FASTA and .fai supplied, no index, no MD5
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test', single_end:true ], // meta map
+                    file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.single_end.sorted.bam', checkIfExists: true)
+                ]
+                input[1] = [
+                    [ id:'test', single_end:true ], // meta2 map
+                    file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)
+                ]
+                input[2] = [
+                    [ id:'test', single_end:true ], // meta3 map
+                    file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true)
+                ]
+                input[3] = false
+                input[4] = false
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(
+                    bam(process.out.bam[0][1]).getReadsMD5(),
+                    process.out.findAll { key, val -> key.startsWith("versions") }
+                    ).match()}
+            )
+        }
+    }
+
+    test("sarscov2 - bam + fasta - bai + md5") { // reference supplied, create_index and create_md5 both true
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test', single_end:true ], // meta map
+                    file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.single_end.sorted.bam', checkIfExists: true)
+                ]
+                input[1] = [
+                    [ id:'test', single_end:true ], // meta2 map
+                    file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)
+                ]
+                input[2] = [
+                    [ id:'test', single_end:true ], // meta3 map
+                    file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true)
+                ]
+                input[3] = true
+                input[4] = true
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(
+                    bam(process.out.bam[0][1]).getReadsMD5(),
+                    file(process.out.bai[0][1]).name, // the index is compared by file name only
+                    process.out.md5,
+                    process.out.findAll { key, val -> key.startsWith("versions") }
+                    ).match()}
+            )
+        }
+    }
+
+    test("sarscov2 - stub") { // stub counterpart of "sarscov2 - bam"
+        options "-stub"
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test', single_end:true ], // meta map
+                    file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.single_end.sorted.bam', checkIfExists: true)
+                ]
+                input[1] = [[], []]
+                input[2] = [[], []]
+                input[3] = false
+                input[4] = false
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(sanitizeOutput(process.out)).match() } // stub runs snapshot the whole sanitized output
+            )
+        }
+    }
+    test("sarscov2 - bam + fasta - stub") { // stub counterpart of "sarscov2 - bam + fasta"
+        options "-stub"
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test', single_end:true ], // meta map
+                    file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.single_end.sorted.bam', checkIfExists: true)
+                ]
+                input[1] = [
+                    [ id:'test', single_end:true ], // meta2 map
+                    file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)
+                ]
+                input[2] = [
+                    [ id:'test', single_end:true ], // meta3 map
+                    file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true)
+                ]
+                input[3] = false
+                input[4] = false
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(sanitizeOutput(process.out)).match() }
+            )
+        }
+    }
+
+    test("sarscov2 - bam + fasta - bai + md5 - stub") { // stub counterpart of the index + MD5 run
+        options "-stub"
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id:'test', single_end:true ], // meta map
+                    file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.single_end.sorted.bam', checkIfExists: true)
+                ]
+                input[1] = [
+                    [ id:'test', single_end:true ], // meta2 map
+                    file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)
+                ]
+                input[2] = [
+                    [ id:'test', single_end:true ], // meta3 map
+                    file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true)
+                ]
+                input[3] = true
+                input[4] = true
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(sanitizeOutput(process.out)).match() }
+            )
+        }
+    }
+}
diff --git a/modules/nf-core/gatk4/cleansam/tests/main.nf.test.snap b/modules/nf-core/gatk4/cleansam/tests/main.nf.test.snap
new file mode 100644
index 00000000000..dd92e91464f
---
/dev/null +++ b/modules/nf-core/gatk4/cleansam/tests/main.nf.test.snap @@ -0,0 +1,180 @@ +{ + "sarscov2 - bam + fasta - bai + md5 - stub": { + "content": [ + { + "bai": [ + [ + { + "id": "test", + "single_end": true + }, + "test.cleaned.bam.bai:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "bam": [ + [ + { + "id": "test", + "single_end": true + }, + "test.cleaned.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "md5": [ + [ + { + "id": "test", + "single_end": true + }, + "test.cleaned.md5:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_gatk": [ + [ + "GATK4_CLEANSAM", + "gatk", + "4.6.2.0" + ] + ] + } + ], + "timestamp": "2026-04-24T16:44:14.582571234", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.03.2" + } + }, + "sarscov2 - stub": { + "content": [ + { + "bai": [ + + ], + "bam": [ + [ + { + "id": "test", + "single_end": true + }, + "test.cleaned.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "md5": [ + + ], + "versions_gatk": [ + [ + "GATK4_CLEANSAM", + "gatk", + "4.6.2.0" + ] + ] + } + ], + "timestamp": "2026-04-23T12:01:12.579259621", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.03.2" + } + }, + "sarscov2 - bam": { + "content": [ + "94fcf617f5b994584c4e8d4044e16b4f", + { + "versions_gatk": [ + [ + "GATK4_CLEANSAM", + "gatk", + "4.6.2.0" + ] + ] + } + ], + "timestamp": "2026-04-24T10:35:34.181426878", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.03.2" + } + }, + "sarscov2 - bam + fasta - bai + md5": { + "content": [ + "94fcf617f5b994584c4e8d4044e16b4f", + "test.cleaned.bai", + [ + [ + { + "id": "test", + "single_end": true + }, + "test.cleaned.bam.md5:md5,1cf672787405b01fec776818fb312abc" + ] + ], + { + "versions_gatk": [ + [ + "GATK4_CLEANSAM", + "gatk", + "4.6.2.0" + ] + ] + } + ], + "timestamp": "2026-04-24T10:37:53.473973418", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.03.2" + } + }, + "sarscov2 - bam + fasta": { + "content": [ + "94fcf617f5b994584c4e8d4044e16b4f", + { + "versions_gatk": [ + [ + 
"GATK4_CLEANSAM", + "gatk", + "4.6.2.0" + ] + ] + } + ], + "timestamp": "2026-04-24T10:35:41.370786571", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.03.2" + } + }, + "sarscov2 - bam + fasta - stub": { + "content": [ + { + "bai": [ + + ], + "bam": [ + [ + { + "id": "test", + "single_end": true + }, + "test.cleaned.bam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "md5": [ + + ], + "versions_gatk": [ + [ + "GATK4_CLEANSAM", + "gatk", + "4.6.2.0" + ] + ] + } + ], + "timestamp": "2026-04-24T16:44:08.721122709", + "meta": { + "nf-test": "0.9.5", + "nextflow": "26.03.2" + } + } +} \ No newline at end of file diff --git a/modules/nf-core/gatk4/cleansam/tests/nextflow.config b/modules/nf-core/gatk4/cleansam/tests/nextflow.config new file mode 100644 index 00000000000..4e288a9c562 --- /dev/null +++ b/modules/nf-core/gatk4/cleansam/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: 'GATK4_CLEANSAM'{ + ext.prefix = { "${meta.id}.cleaned"} + } +}