From c8ad07698d8decd3fa91367c0befcb4a570950f5 Mon Sep 17 00:00:00 2001
From: sahuno
Date: Thu, 23 Apr 2026 22:02:11 -0400
Subject: [PATCH] feat: add modkit/validate module

Add new nf-core module wrapping `modkit validate`, which benchmarks
modified-base calls in one or more mod-BAMs against paired ground-truth
BED files and emits a tab-separated summary of correct / incorrect /
filtered calls per class.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../nf-core/modkit/validate/environment.yml   |   7 ++
 modules/nf-core/modkit/validate/main.nf       |  50 ++++++++
 modules/nf-core/modkit/validate/meta.yml      | 110 ++++++++++++++++++
 .../modkit/validate/tests/main.nf.test        |  65 +++++++++++
 .../modkit/validate/tests/main.nf.test.snap   |  49 ++++++++
 .../modkit/validate/tests/nextflow.config     |   5 +
 6 files changed, 286 insertions(+)
 create mode 100644 modules/nf-core/modkit/validate/environment.yml
 create mode 100644 modules/nf-core/modkit/validate/main.nf
 create mode 100644 modules/nf-core/modkit/validate/meta.yml
 create mode 100644 modules/nf-core/modkit/validate/tests/main.nf.test
 create mode 100644 modules/nf-core/modkit/validate/tests/main.nf.test.snap
 create mode 100644 modules/nf-core/modkit/validate/tests/nextflow.config

diff --git a/modules/nf-core/modkit/validate/environment.yml b/modules/nf-core/modkit/validate/environment.yml
new file mode 100644
index 000000000000..dac7b506e89f
--- /dev/null
+++ b/modules/nf-core/modkit/validate/environment.yml
@@ -0,0 +1,7 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+  - conda-forge
+  - bioconda
+dependencies:
+  - "bioconda::ont-modkit=0.6.1"
diff --git a/modules/nf-core/modkit/validate/main.nf b/modules/nf-core/modkit/validate/main.nf
new file mode 100644
index 000000000000..17d2441bc85e
--- /dev/null
+++ b/modules/nf-core/modkit/validate/main.nf
@@ -0,0 +1,50 @@
+process MODKIT_VALIDATE {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/ont-modkit:0.6.1--hcdda2d0_0':
+        'quay.io/biocontainers/ont-modkit:0.6.1--hcdda2d0_0' }"
+
+    input:
+    // Multiple sample pairs can be supplied as list inputs; BAMs and BEDs are
+    // paired by index and passed as repeated `--bam-and-bed <bam> <bed>`.
+    // BAMs and indices share one staging directory per list position so each
+    // index sits next to its BAM (readers resolve `<bam>.bai`/`.csi` there).
+    tuple val(meta), path(bams, stageAs: "bams/?/*"), path(bais, stageAs: "bams/?/*"), path(truth_beds, stageAs: "truth/?/*")
+
+    output:
+    tuple val(meta), path("*.tsv"), emit: tsv
+    tuple val(meta), path("*.log"), emit: log, optional: true
+    tuple val("${task.process}"), val('modkit'), eval("modkit --version | sed 's/modkit //'"), emit: versions_modkit, topic: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    // A single (non-list) input arrives as a bare path; normalise to lists.
+    def bam_list = bams instanceof List ? bams : [bams]
+    def bed_list = truth_beds instanceof List ? truth_beds : [truth_beds]
+    if (bam_list.size() != bed_list.size()) {
+        error "MODKIT_VALIDATE: bams and truth_beds must have the same length (got ${bam_list.size()} vs ${bed_list.size()})"
+    }
+    // Pair the i-th BAM with the i-th truth BED, one `--bam-and-bed` per pair.
+    def pair_args = [bam_list, bed_list].transpose().collect { b, g -> "--bam-and-bed ${b} ${g}" }.join(' ')
+    """
+    modkit \\
+        validate \\
+        $args \\
+        --threads ${task.cpus} \\
+        ${pair_args} \\
+        --out-filepath ${prefix}.tsv
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch ${prefix}.tsv
+    """
+}
diff --git a/modules/nf-core/modkit/validate/meta.yml b/modules/nf-core/modkit/validate/meta.yml
new file mode 100644
index 000000000000..fb851c83d1f9
--- /dev/null
+++ b/modules/nf-core/modkit/validate/meta.yml
@@ -0,0 +1,110 @@
+name: modkit_validate
+description: |
+  Validate base modification calls in one or more mod-BAMs against a matched
+  ground-truth BED file containing the expected modification state at
+  reference positions. Reports per-sample and combined confusion-matrix style
+  statistics (accuracy, filtered calls, probability distribution) as a
+  tab-separated machine-parseable table.
+keywords:
+  - modkit
+  - methylation
+  - validate
+  - nanopore
+  - ont
+  - modbam
+  - ground-truth
+  - benchmark
+tools:
+  - "modkit":
+      description: A bioinformatics tool for working with modified bases in Oxford Nanopore
+        sequencing data.
+      homepage: https://github.com/nanoporetech/modkit
+      documentation: https://nanoporetech.github.io/modkit/
+      tool_dev_url: https://github.com/nanoporetech/modkit
+      licence:
+        - "Oxford Nanopore Technologies PLC. Public License Version 1.0"
+      identifier: ""
+input:
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing sample (or cohort) information
+          e.g. `[ id:'test_run' ]`. The output inherits this meta.
+    - bams:
+        type: list
+        description: |
+          One or more modBAM files. BAMs are paired 1:1 with `truth_beds`
+          by list index and passed as repeated `--bam-and-bed <bam> <bed>`
+          arguments to `modkit validate`.
+        pattern: "*.{bam,cram}"
+        ontologies:
+          - edam: http://edamontology.org/format_2572
+    - bais:
+        type: list
+        description: |
+          BAM indices (`.bai` or `.csi`) matching each input BAM.
+        pattern: "*.{bai,csi}"
+        ontologies: []
+    - truth_beds:
+        type: list
+        description: |
+          Ground-truth BED files (one per BAM). The name field must be the
+          short modified base code (single letter or ChEBI ID) or `-` to
+          mark a canonical reference position. Paired 1:1 with `bams`.
+        pattern: "*.{bed,bed.gz}"
+        ontologies:
+          - edam: http://edamontology.org/format_3003
+output:
+  tsv:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'test_run' ]`.
+      - "*.tsv":
+          type: file
+          description: |
+            Machine-parseable validation summary produced by
+            `modkit validate --out-filepath`. Columns include counts of
+            correct / incorrect / filtered calls per class.
+          pattern: "*.tsv"
+          ontologies:
+            - edam: http://edamontology.org/format_3475
+  log:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. `[ id:'test_run' ]`.
+      - "*.log":
+          type: file
+          description: |
+            Optional modkit debug log (only emitted when `--log-filepath
+            <prefix>.log` is passed via `ext.args`).
+          pattern: "*.log"
+          ontologies: []
+  versions_modkit:
+    - - ${task.process}:
+          type: string
+          description: The name of the process
+      - modkit:
+          type: string
+          description: The name of the tool
+      - modkit --version | sed 's/modkit //':
+          type: eval
+          description: The expression to obtain the version of the tool
+topics:
+  versions:
+    - - ${task.process}:
+          type: string
+          description: The name of the process
+      - modkit:
+          type: string
+          description: The name of the tool
+      - modkit --version | sed 's/modkit //':
+          type: eval
+          description: The expression to obtain the version of the tool
+authors:
+  - "@sahuno"
+maintainers:
+  - "@sahuno"
diff --git a/modules/nf-core/modkit/validate/tests/main.nf.test b/modules/nf-core/modkit/validate/tests/main.nf.test
new file mode 100644
index 000000000000..ddbd9239bccf
--- /dev/null
+++ b/modules/nf-core/modkit/validate/tests/main.nf.test
@@ -0,0 +1,65 @@
+nextflow_process {
+
+    name "Test Process MODKIT_VALIDATE"
+    script "../main.nf"
+    process "MODKIT_VALIDATE"
+    config "./nextflow.config"
+
+    tag "modules"
+    tag "modules_nfcore"
+    tag "modkit"
+    tag "modkit/validate"
+    tag "modkit/pileup"
+
+    // A real validation run needs a BAM paired with a ground-truth BED whose
+    // `name` column is the short modified-base code (m/h/a/...) or `-` for a
+    // canonical reference position.
+    //
+    // The public modules test data doesn't ship such a BED. The setup below
+    // runs `MODKIT_PILEUP` so a non-stub test can derive one from its output
+    // via `cut -f1-6` (a bedMethyl's first six columns already satisfy the
+    // BED6 + mod-code-in-name contract) as a self-consistency smoke test.
+    // NOTE: only the stub test is defined so far; the pileup output is not
+    // consumed by any test in this file yet.
+    setup {
+        run("MODKIT_PILEUP") {
+            script "../../pileup/main.nf"
+            process {
+                """
+                input[0] = [
+                    [ id: 'test' ],
+                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/nanopore/bam/test.sorted.phased.bam', checkIfExists: true),
+                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/nanopore/bam/test.sorted.phased.bam.bai', checkIfExists: true)
+                ]
+                input[1] = [[],[],[]]
+                input[2] = [[],[]]
+                """
+            }
+        }
+    }
+
+    test("homo sapiens - modbam + truth bed - stub") {
+
+        options "-stub"
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [ id: 'test' ],
+                    [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/nanopore/bam/test.sorted.phased.bam', checkIfExists: true) ],
+                    [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/nanopore/bam/test.sorted.phased.bam.bai', checkIfExists: true) ],
+                    [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.bed', checkIfExists: true) ] // placeholder truth BED for stub — real validation needs a truth BED whose name column is the mod code (m/h/-)
+                ]
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+    }
+}
diff --git a/modules/nf-core/modkit/validate/tests/main.nf.test.snap b/modules/nf-core/modkit/validate/tests/main.nf.test.snap
new file mode 100644
index 000000000000..418f7a58fe71
--- /dev/null
+++ b/modules/nf-core/modkit/validate/tests/main.nf.test.snap
@@ -0,0 +1,49 @@
+{
+    "homo sapiens - modbam + truth bed - stub": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "1": [
+                    
+                ],
+                "2": [
+                    [
+                        "MODKIT_VALIDATE",
+                        "modkit",
+                        "0.6.1"
+                    ]
+                ],
+                "log": [
+                    
+                ],
+                "tsv": [
+                    [
+                        {
+                            "id": "test"
+                        },
+                        "test.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
+                    ]
+                ],
+                "versions_modkit": [
+                    [
+                        "MODKIT_VALIDATE",
+                        "modkit",
+                        "0.6.1"
+                    ]
+                ]
+            }
+        ],
+        "timestamp": "2026-04-23T22:01:55.666359485",
+        "meta": {
+            "nf-test": "0.9.5",
+            "nextflow": "25.04.6"
+        }
+    }
+}
\ No newline at end of file
diff --git a/modules/nf-core/modkit/validate/tests/nextflow.config b/modules/nf-core/modkit/validate/tests/nextflow.config
new file mode 100644
index 000000000000..bd2f46cb54a7
--- /dev/null
+++ b/modules/nf-core/modkit/validate/tests/nextflow.config
@@ -0,0 +1,5 @@
+process {
+    withName: 'MODKIT_VALIDATE' {
+        ext.args = '-c C'
+    }
+}