From 09785650d2ffd77d76e54415b23fc520ca4890dd Mon Sep 17 00:00:00 2001
From: sahuno
Date: Thu, 23 Apr 2026 22:44:50 -0400
Subject: [PATCH] feat: add modkit/entropy module

Add new nf-core module wrapping `modkit entropy`, which computes
methylation entropy over genomic windows from one or more mod-BAMs.
Supports an optional BED of regions for per-region descriptive
statistics; emits a genome-wide BED otherwise.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../nf-core/modkit/entropy/environment.yml    |   7 +
 modules/nf-core/modkit/entropy/main.nf        |  52 ++++++
 modules/nf-core/modkit/entropy/meta.yml       | 169 ++++++++++++++++++
 .../nf-core/modkit/entropy/tests/main.nf.test |  76 ++++++++
 .../modkit/entropy/tests/main.nf.test.snap    |  83 +++++++++
 .../modkit/entropy/tests/nextflow.config      |   5 +
 6 files changed, 392 insertions(+)
 create mode 100644 modules/nf-core/modkit/entropy/environment.yml
 create mode 100644 modules/nf-core/modkit/entropy/main.nf
 create mode 100644 modules/nf-core/modkit/entropy/meta.yml
 create mode 100644 modules/nf-core/modkit/entropy/tests/main.nf.test
 create mode 100644 modules/nf-core/modkit/entropy/tests/main.nf.test.snap
 create mode 100644 modules/nf-core/modkit/entropy/tests/nextflow.config

diff --git a/modules/nf-core/modkit/entropy/environment.yml b/modules/nf-core/modkit/entropy/environment.yml
new file mode 100644
index 000000000000..dac7b506e89f
--- /dev/null
+++ b/modules/nf-core/modkit/entropy/environment.yml
@@ -0,0 +1,7 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - "bioconda::ont-modkit=0.6.1"
diff --git a/modules/nf-core/modkit/entropy/main.nf b/modules/nf-core/modkit/entropy/main.nf
new file mode 100644
index 000000000000..579c8a61fd8e
--- /dev/null
+++ b/modules/nf-core/modkit/entropy/main.nf
@@ -0,0 +1,52 @@
+process MODKIT_ENTROPY {
+    tag "$meta.id"
+    label 'process_high'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/ont-modkit:0.6.1--hcdda2d0_0':
+        'quay.io/biocontainers/ont-modkit:0.6.1--hcdda2d0_0' }"
+
+    input:
+    // stageAs 'in/?/*' prevents filename collisions when multiple BAMs from the
+    // same sample (e.g. technical replicates) are passed to a single run.
+    tuple val(meta), path(bams, stageAs: "in/?/*"), path(bais, stageAs: "in/?/*")
+    tuple val(meta2), path(fasta), path(fai)
+    tuple val(meta3), path(regions)
+
+    output:
+    tuple val(meta), path("*.bed")                     , emit: bed        , optional: true
+    tuple val(meta), path("entropy_regions/*.bed")     , emit: regions_bed, optional: true
+    tuple val(meta), path("entropy_regions/*.bedgraph"), emit: bedgraph   , optional: true
+    tuple val(meta), path("entropy_regions/*.tsv")     , emit: tsv        , optional: true
+    tuple val(meta), path("*.log")                     , emit: log        , optional: true
+    tuple val("${task.process}"), val('modkit'), eval("modkit --version | sed 's/modkit //'"), emit: versions_modkit, topic: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def bam_args = bams instanceof List ? bams.collect { "--in-bam ${it}" }.join(' ') : "--in-bam ${bams}"
+    // modkit entropy's --out-bed expects a FILE without --regions, and a DIRECTORY with --regions
+    def out_arg = regions ? "--regions ${regions} --out-bed entropy_regions --prefix ${prefix}" : "--out-bed ${prefix}.bed"
+    def mkdir = regions ? "mkdir -p entropy_regions" : ""
+    """
+    ${mkdir}
+
+    modkit \\
+        entropy \\
+        $args \\
+        --threads ${task.cpus} \\
+        --ref ${fasta} \\
+        ${out_arg} \\
+        ${bam_args}
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch ${prefix}.bed
+    """
+}
diff --git a/modules/nf-core/modkit/entropy/meta.yml b/modules/nf-core/modkit/entropy/meta.yml
new file mode 100644
index 000000000000..96101d10d9a5
--- /dev/null
+++ b/modules/nf-core/modkit/entropy/meta.yml
@@ -0,0 +1,169 @@
+name: modkit_entropy
+description: |
+  Calculate methylation entropy over genomic windows from one or more mod-BAMs.
+  Entropy is a per-window measure of the diversity of methylation patterns
+  across reads covering the window and is complementary to mean methylation.
+  When a BED is supplied via the `regions` input (not `ext.args`), modkit
+  writes per-region BED/bedgraph/tsv files into a directory; otherwise a single
+  genome-wide BED file is produced.
+keywords:
+  - modkit
+  - methylation
+  - entropy
+  - nanopore
+  - ont
+  - modbam
+tools:
+  - "modkit":
+      description: A bioinformatics tool for working with modified bases in Oxford Nanopore
+        sequencing data.
+      homepage: https://github.com/nanoporetech/modkit
+      documentation: https://nanoporetech.github.io/modkit/
+      tool_dev_url: https://github.com/nanoporetech/modkit
+      licence:
+        - "Oxford Nanopore Technologies PLC. Public License Version 1.0"
+      identifier: ""
+input:
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g. `[ id:'sample1' ]`. The output inherits this meta.
+    - bams:
+        type: list
+        description: |
+          One or more modBAM files to compute entropy from. Passing more than
+          one BAM aggregates counts across them (multi-sample entropy).
+        pattern: "*.{bam,cram}"
+        ontologies:
+          - edam: http://edamontology.org/format_2572
+    - bais:
+        type: list
+        description: |
+          BAM indices (`.bai` or `.csi`) matching each input BAM, one-to-one.
+        pattern: "*.{bai,csi}"
+        ontologies: []
+  - - meta2:
+        type: map
+        description: |
+          Groovy Map containing reference information
+          e.g. `[ id:'mm10' ]`.
+    - fasta:
+        type: file
+        description: Reference FASTA the BAM was aligned to.
+        pattern: "*.{fa,fasta,fna}"
+        ontologies:
+          - edam: http://edamontology.org/format_1929
+    - fai:
+        type: file
+        description: Samtools FASTA index for `fasta`.
+        pattern: "*.fai"
+        ontologies:
+          - edam: http://edamontology.org/format_3475
+  - - meta3:
+        type: map
+        description: |
+          Groovy Map containing region information
+          e.g. `[ id:'promoters' ]`. Pass `[[], []]` as the whole tuple to skip.
+    - regions:
+        type: file
+        description: |
+          Optional BED file of regions over which to compute per-region
+          descriptive statistics. When provided, modkit writes per-region
+          outputs (`.bed`, `.bedgraph`, `.tsv`) into a directory.
+        pattern: "*.{bed,bed.gz}"
+        ontologies:
+          - edam: http://edamontology.org/format_3003
+output:
+  bed:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1' ]`.
+      - "*.bed":
+          type: file
+          description: |
+            Genome-wide entropy BED file, produced when `regions` is not
+            provided. One row per window.
+          pattern: "*.bed"
+          ontologies:
+            - edam: http://edamontology.org/format_3003
+  regions_bed:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1' ]`.
+      - "entropy_regions/*.bed":
+          type: file
+          description: |
+            Per-region entropy BED files, produced when `regions` is provided.
+          pattern: "*.bed"
+          ontologies:
+            - edam: http://edamontology.org/format_3003
+  bedgraph:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1' ]`.
+      - "entropy_regions/*.bedgraph":
+          type: file
+          description: |
+            Per-region entropy bedgraph, produced when `regions` is provided.
+          pattern: "*.bedgraph"
+          ontologies:
+            - edam: http://edamontology.org/format_3583
+  tsv:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1' ]`.
+      - "entropy_regions/*.tsv":
+          type: file
+          description: |
+            Per-region descriptive statistics TSV, produced when `regions`
+            is provided.
+          pattern: "*.tsv"
+          ontologies:
+            - edam: http://edamontology.org/format_3475
+  log:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'sample1' ]`.
+      - "*.log":
+          type: file
+          description: |
+            Optional modkit debug log (only emitted when `--log-filepath
+            <prefix>.log` is passed via `ext.args`).
+          pattern: "*.log"
+          ontologies: []
+  versions_modkit:
+    - - ${task.process}:
+          type: string
+          description: The name of the process
+      - modkit:
+          type: string
+          description: The name of the tool
+      - modkit --version | sed 's/modkit //':
+          type: eval
+          description: The expression to obtain the version of the tool
+topics:
+  versions:
+    - - ${task.process}:
+          type: string
+          description: The name of the process
+      - modkit:
+          type: string
+          description: The name of the tool
+      - modkit --version | sed 's/modkit //':
+          type: eval
+          description: The expression to obtain the version of the tool
+authors:
+  - "@sahuno"
+maintainers:
+  - "@sahuno"
diff --git a/modules/nf-core/modkit/entropy/tests/main.nf.test b/modules/nf-core/modkit/entropy/tests/main.nf.test
new file mode 100644
index 000000000000..771e6b6fa5fb
--- /dev/null
+++ b/modules/nf-core/modkit/entropy/tests/main.nf.test
@@ -0,0 +1,76 @@
+nextflow_process {
+
+    name "Test Process MODKIT_ENTROPY"
+    script "../main.nf"
+    process "MODKIT_ENTROPY"
+    config "./nextflow.config"
+
+    tag "modules"
+    tag "modules_nfcore"
+    tag "modkit"
+    tag "modkit/entropy"
+
+    test("homo sapiens - nanopore modbam - cpg - stub") {
+
+        options "-stub"
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id: 'test' ],
+                    [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/nanopore/bam/test.sorted.phased.bam', checkIfExists: true) ],
+                    [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/nanopore/bam/test.sorted.phased.bam.bai', checkIfExists: true) ]
+                ]
+                input[1] = [
+                    [ id: 'genome' ],
+                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true),
+                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta.fai', checkIfExists: true)
+                ]
+                input[2] = [[],[]]
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+    }
+
+    test("homo sapiens - nanopore modbam - cpg") {
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id: 'test' ],
+                    [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/nanopore/bam/test.sorted.phased.bam', checkIfExists: true) ],
+                    [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/nanopore/bam/test.sorted.phased.bam.bai', checkIfExists: true) ]
+                ]
+                input[1] = [
+                    [ id: 'genome' ],
+                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true),
+                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta.fai', checkIfExists: true)
+                ]
+                input[2] = [[],[]]
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert process.out.bed },
+                { assert path(process.out.bed[0][1]).exists() },
+                { assert path(process.out.bed[0][1]).size() > 0 },
+                // modkit entropy's BED output isn't byte-deterministic across CPU
+                // counts / architectures (float precision differs). Assert structure
+                // only and snapshot just the versions topic.
+                { assert snapshot(process.out.versions_modkit).match() }
+            )
+        }
+    }
+}
diff --git a/modules/nf-core/modkit/entropy/tests/main.nf.test.snap b/modules/nf-core/modkit/entropy/tests/main.nf.test.snap
new file mode 100644
index 000000000000..55282725a97c
--- /dev/null
+++ b/modules/nf-core/modkit/entropy/tests/main.nf.test.snap
@@ -0,0 +1,83 @@
+{
+    "homo sapiens - nanopore modbam - cpg": {
+        "content": [
+            [
+                [
+                    "MODKIT_ENTROPY",
+                    "modkit",
+                    "0.6.1"
+                ]
+            ]
+        ],
+        "timestamp": "2026-04-23T23:01:43.40082036",
+        "meta": {
+            "nf-test": "0.9.5",
+            "nextflow": "25.04.6"
+        }
+    },
+    "homo sapiens - nanopore modbam - cpg - stub": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.bed:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "1": [
+                    
+                ],
+                "2": [
+                    
+                ],
+                "3": [
+                    
+                ],
+                "4": [
+                    
+                ],
+                "5": [
+                    [
+                        "MODKIT_ENTROPY",
+                        "modkit",
+                        "0.6.1"
+                    ]
+                ],
+                "bed": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.bed:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "bedgraph": [
+                    
+                ],
+                "log": [
+                    
+                ],
+                "regions_bed": [
+                    
+                ],
+                "tsv": [
+                    
+                ],
+                "versions_modkit": [
+                    [
+                        "MODKIT_ENTROPY",
+                        "modkit",
+                        "0.6.1"
+                    ]
+                ]
+            }
+        ],
+        "timestamp": "2026-04-23T23:01:16.647684082",
+        "meta": {
+            "nf-test": "0.9.5",
+            "nextflow": "25.04.6"
+        }
+    }
+}
\ No newline at end of file
diff --git a/modules/nf-core/modkit/entropy/tests/nextflow.config b/modules/nf-core/modkit/entropy/tests/nextflow.config
new file mode 100644
index 000000000000..e118b5753a3a
--- /dev/null
+++ b/modules/nf-core/modkit/entropy/tests/nextflow.config
@@ -0,0 +1,5 @@
+process {
+    withName: 'MODKIT_ENTROPY' {
+        ext.args = '--cpg'
+    }
+}