From 8a37c4f40f663cb85dfbc0cfd7d6ffd00c94a579 Mon Sep 17 00:00:00 2001
From: Sofia Ochkalova
Date: Fri, 24 Apr 2026 10:11:26 +0100
Subject: [PATCH 1/2] add CREATE_ASSEMBLY_METADATA_CSV module, add versions tracking for all modules

---
Review notes (text between '---' and the diffstat is ignored by git am):
 * This patch was rebuilt from a whitespace-collapsed copy. Indentation and
   column alignment are reconstructed, so apply with `git am --ignore-whitespace`
   or regenerate it with `git format-patch` after applying the changes by hand.
 * create_assembly_metadata_csv/main.nf was edited during review, so the blob
   hash on its `index` line is stale.
 * The CSV is now written with echo: the heredoc body at column 0 prevented the
   script indentation from being stripped, which left `cat <<-END_VERSIONS`
   unterminated. A coverage of 0 is no longer written as an empty field.

 conf/modules.config                           |  8 ++
 .../create_assembly_metadata_csv/main.nf      | 52 +++++++++++
 .../create_assembly_metadata_csv/meta.yml     | 48 ++++++++++
 .../tests/main.nf.test                        | 93 +++++++++++++++++++
 workflows/assemblysubmit.nf                   | 38 +++-----
 5 files changed, 215 insertions(+), 24 deletions(-)
 create mode 100644 modules/local/create_assembly_metadata_csv/main.nf
 create mode 100644 modules/local/create_assembly_metadata_csv/meta.yml
 create mode 100644 modules/local/create_assembly_metadata_csv/tests/main.nf.test

diff --git a/conf/modules.config b/conf/modules.config
index 7f87441..fc03999 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -144,6 +144,14 @@ process {
     // SUBMISSION AND MANIFEST GENERATION
     //
 
+    withName: 'CREATE_ASSEMBLY_METADATA_CSV' {
+        publishDir = [
+            path: { "${params.outdir}/${params.mode}" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
+
     withName: 'GENOME_UPLOAD' {
         publishDir = [
             path: { "${params.outdir}/${params.mode}/upload/manifests" },
diff --git a/modules/local/create_assembly_metadata_csv/main.nf b/modules/local/create_assembly_metadata_csv/main.nf
new file mode 100644
index 0000000..91fe5bd
--- /dev/null
+++ b/modules/local/create_assembly_metadata_csv/main.nf
@@ -0,0 +1,52 @@
+process CREATE_ASSEMBLY_METADATA_CSV {
+    tag "$meta.id"
+    label 'process_single'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/31/31f1c42a25a80ebc296a0aa07d83b3f0e408f9f3c240f9375c55d9790576c1de/data' :
+        'community.wave.seqera.io/library/pip_pygments:37b2b421ce07e516' }"
+
+    input:
+    tuple val(meta), path(fasta)
+
+    output:
+    tuple val(meta), path("${meta.id}_assembly_metadata.csv"), emit: csv
+    path "versions.yml"                                       , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // `?:` treats 0 as missing, so coverage gets an explicit null check to keep a real 0
+    def run_accession     = meta.run_accession ?: ''
+    def coverage          = meta.coverage != null ? meta.coverage : ''
+    def assembler         = meta.assembler ?: ''
+    def assembler_version = meta.assembler_version ?: ''
+    def sample            = '' // Sample column left empty because co-assemblies are not supported
+
+    // The CSV is written with echo rather than a heredoc: heredoc body lines at column 0
+    // leave no common indentation to strip from the script, and the indented
+    // END_VERSIONS terminator would then never close the versions heredoc below.
+    """
+    echo "Runs,Coverage,Assembler,Version,Filepath,Sample" > ${meta.id}_assembly_metadata.csv
+    echo "${run_accession},${coverage},${assembler},${assembler_version},${fasta.name},${sample}" >> ${meta.id}_assembly_metadata.csv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        bash: \$(bash --version | head -n1 | sed 's/.*version //; s/ .*//')
+    END_VERSIONS
+    """
+
+    stub:
+    def coverage = meta.coverage != null ? meta.coverage : ''
+    """
+    echo "Runs,Coverage,Assembler,Version,Filepath,Sample" > ${meta.id}_assembly_metadata.csv
+    echo "${meta.run_accession ?: ''},${coverage},${meta.assembler ?: ''},${meta.assembler_version ?: ''},${fasta.name}," >> ${meta.id}_assembly_metadata.csv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        bash: \$(bash --version | head -n1 | sed 's/.*version //; s/ .*//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/create_assembly_metadata_csv/meta.yml b/modules/local/create_assembly_metadata_csv/meta.yml
new file mode 100644
index 0000000..b1301c7
--- /dev/null
+++ b/modules/local/create_assembly_metadata_csv/meta.yml
@@ -0,0 +1,48 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
+name: "create_assembly_metadata_csv"
+description: Create input CSV metadata file for assembly_uploader tool
+keywords:
+  - assembly
+  - metadata
+  - ena
+  - submission
+tools:
+  - bash:
+      description: Bash shell scripting
+      homepage: https://www.gnu.org/software/bash/
+
+input:
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing sample information:
+          - id: Sample identifier
+          - run_accession: ENA run accession
+          - coverage: Assembly coverage value
+          - assembler: Name of the assembler used
+          - assembler_version: Version of the assembler
+    - fasta:
+        type: file
+        description: Assembly FASTA file
+        pattern: "*.{fasta,fa,fna}"
+
+output:
+  - csv:
+      - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information (same as input)
+      - "*.csv":
+          type: file
+          description: CSV file containing assembly metadata to be used as input for the assembly_uploader
+          pattern: "*_assembly_metadata.csv"
+  - versions:
+      - versions.yml:
+          type: file
+          description: File containing software versions
+          pattern: "versions.yml"
+
+authors:
+  - "@ochkalova"
+maintainers:
+  - "@ochkalova"
diff --git a/modules/local/create_assembly_metadata_csv/tests/main.nf.test b/modules/local/create_assembly_metadata_csv/tests/main.nf.test
new file mode 100644
index 0000000..6140369
--- /dev/null
+++ b/modules/local/create_assembly_metadata_csv/tests/main.nf.test
@@ -0,0 +1,93 @@
+nextflow_process {
+
+    name "Test Process CREATE_ASSEMBLY_METADATA_CSV"
+    script "../main.nf"
+    process "CREATE_ASSEMBLY_METADATA_CSV"
+
+    tag "modules"
+    tag "modules_local"
+    tag "create_assembly_metadata_csv"
+
+    test("test_create_assembly_metadata_csv - complete metadata") {
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [
+                        id: 'test_sample',
+                        run_accession: 'ERR123456',
+                        coverage: 50.5,
+                        assembler: 'metaspades',
+                        assembler_version: '3.15.5'
+                    ],
+                    file('https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/genome/NC_045512.2/GCF_009858895.2_ASM985889v3_genomic.200409.fna.gz')
+                ]
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+
+    }
+
+    test("test_create_assembly_metadata_csv - minimal metadata") {
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [
+                        id: 'minimal_sample'
+                    ],
+                    file('https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/genome/NC_045512.2/GCF_009858895.2_ASM985889v3_genomic.200409.fna.gz')
+                ]
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+
+    }
+
+    test("test_create_assembly_metadata_csv - stub") {
+
+        options "-stub"
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [
+                        id: 'stub_sample',
+                        run_accession: 'ERR999999',
+                        coverage: 75.0,
+                        assembler: 'megahit',
+                        assembler_version: '1.2.9'
+                    ],
+                    file('https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/genome/NC_045512.2/GCF_009858895.2_ASM985889v3_genomic.200409.fna.gz')
+                ]
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+
+    }
+
+}
diff --git a/workflows/assemblysubmit.nf b/workflows/assemblysubmit.nf
index 5743f7d..4f1f2ab 100644
--- a/workflows/assemblysubmit.nf
+++ b/workflows/assemblysubmit.nf
@@ -6,6 +6,7 @@
 include { COVERM_CONTIG } from '../modules/nf-core/coverm/contig/main'
 include { FASTAVALIDATOR } from '../modules/nf-core/fastavalidator/main'
 
+include { CREATE_ASSEMBLY_METADATA_CSV } from '../modules/local/create_assembly_metadata_csv/main'
 include { GENERATE_ASSEMBLY_MANIFEST } from '../modules/local/generate_assembly_manifest/main'
 include { REGISTERSTUDY } from '../modules/local/registerstudy/main'
 include { ENA_WEBIN_CLI_WRAPPER as SUBMIT } from '../modules/local/ena_webin_cli_wrapper'
@@ -72,6 +73,8 @@ workflow ASSEMBLYSUBMIT {
         assembly_fasta,
         "true" // enables number of contigs check - ENA requires more than 1 contig for an assembly submission
     )
+    ch_versions = ch_versions.mix(FASTAVALIDATOR.out.versions)
+
     validated_fastas = assembly_fasta.join(FASTAVALIDATOR.out.success_log)
         .map { meta, fasta, _log ->
             [meta, fasta]
@@ -85,12 +88,14 @@ workflow ASSEMBLYSUBMIT {
             reads: [ meta, fastq ]
         }
         .set { coverm_input }
+
     COVERM_CONTIG (
         coverm_input.reads,
         coverm_input.assembly,
         false, // bam_input
         false // interleaved
     )
+    ch_versions = ch_versions.mix(COVERM_CONTIG.out.versions)
 
     // Calculate average coverage using splitCsv operator
     average_coverage_ch = COVERM_CONTIG.out.coverage
@@ -121,29 +126,11 @@ workflow ASSEMBLYSUBMIT {
         .filter { meta, _fasta -> meta.coverage != null }
         .mix( assemblies_with_added_cov_ch )
 
-    assembly_metadata_csv = assemblies_with_coverage
-        .map { meta, fasta ->
-            def header = 'Runs,Coverage,Assembler,Version,Filepath,Sample'
-            def row = [
-                meta.run_accession ?: '',
-                meta.coverage ?: '',
-                meta.assembler ?: '',
-                meta.assembler_version ?: '',
-                fasta.name,
-                '' // Sample column left empty because co assemblies are not supported
-            ].join(',')
-
-            def content = "${header}\n${row}"
-
-            // Create output directory if it doesn't exist
-            def outDir = file("${params.outdir}/${params.mode}")
-            outDir.mkdirs()
-
-            def csv_file = file("${outDir}/${meta.id}_assembly_metadata.csv")
-            csv_file.text = content
-
-            [meta, csv_file]
-        }
+    // Create CSV with assembly metadata for manifest generation
+    CREATE_ASSEMBLY_METADATA_CSV(
+        assemblies_with_coverage
+    )
+    ch_versions = ch_versions.mix(CREATE_ASSEMBLY_METADATA_CSV.out.versions)
 
     def study_accession_ch
     if (params.submission_study) {
@@ -164,18 +151,21 @@ workflow ASSEMBLYSUBMIT {
 
     // Generate assembly manifest files and submit them to ENA
     GENERATE_ASSEMBLY_MANIFEST(
-        assemblies_with_coverage.join(assembly_metadata_csv),
+        assemblies_with_coverage.join(CREATE_ASSEMBLY_METADATA_CSV.out.csv),
         study_accession_ch.first()
     )
+    ch_versions = ch_versions.mix(GENERATE_ASSEMBLY_MANIFEST.out.versions.first())
 
     ENA_WEBIN_CLI_DOWNLOAD (
         params.webin_cli_version
     )
+    ch_versions = ch_versions.mix(ENA_WEBIN_CLI_DOWNLOAD.out.versions)
 
     SUBMIT (
         assemblies_with_coverage.join(GENERATE_ASSEMBLY_MANIFEST.out.manifest),
         ENA_WEBIN_CLI_DOWNLOAD.out.webin_cli_jar
     )
+    ch_versions = ch_versions.mix(SUBMIT.out.versions)
 
     //
     // Collate and save software versions

From 4bb213f484f86e82ea9a1388e1a96d69667cea79 Mon Sep 17 00:00:00 2001
From: Sofia Ochkalova
Date: Fri, 24 Apr 2026 11:13:03 +0100
Subject: [PATCH 2/2] add CREATE_GENOME_METADATA_TSV module

---
Review notes (text between '---' and the diffstat is ignored by git am):
 * This patch was rebuilt from a whitespace-collapsed copy. Indentation and
   column alignment are reconstructed, so apply with `git am --ignore-whitespace`
   or regenerate it with `git format-patch` after applying the changes by hand.
 * conf/modules.config, create_genome_metadata_tsv/main.nf, its main.nf.test and
   workflows/genomesubmit.nf were edited during review; hunk sizes and the
   diffstat are updated, but the blob hashes on their `index` lines are stale.
 * Changes: per-genome ext.prefix so published TSVs do not overwrite each other;
   stub derives co-assembly / rRNA_presence like the script; tests now pass the
   [meta, fasta] tuple the process declares; collectFile keeps a single header.

 conf/modules.config                           |   9 ++
 .../local/create_genome_metadata_tsv/main.nf  |  50 +++++++++
 .../local/create_genome_metadata_tsv/meta.yml |  60 +++++++++++
 .../tests/main.nf.test                        | 102 +++++++++++++++++++
 workflows/genomesubmit.nf                     |  70 ++++---------
 5 files changed, 240 insertions(+), 51 deletions(-)
 create mode 100644 modules/local/create_genome_metadata_tsv/main.nf
 create mode 100644 modules/local/create_genome_metadata_tsv/meta.yml
 create mode 100644 modules/local/create_genome_metadata_tsv/tests/main.nf.test

diff --git a/conf/modules.config b/conf/modules.config
index fc03999..406602d 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -152,6 +152,15 @@ process {
         ]
     }
 
+    withName: 'CREATE_GENOME_METADATA_TSV' {
+        ext.prefix = { "${meta.id}_genome_metadata" }
+        publishDir = [
+            path: { "${params.outdir}/genome_metadata" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
+
     withName: 'GENOME_UPLOAD' {
         publishDir = [
             path: { "${params.outdir}/${params.mode}/upload/manifests" },
diff --git a/modules/local/create_genome_metadata_tsv/main.nf b/modules/local/create_genome_metadata_tsv/main.nf
new file mode 100644
index 0000000..3c8327e
--- /dev/null
+++ b/modules/local/create_genome_metadata_tsv/main.nf
@@ -0,0 +1,50 @@
+process CREATE_GENOME_METADATA_TSV {
+    tag "${meta.id}"
+    label 'process_single'
+
+    conda "${moduleDir}/environment.yml"
+    container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'oras://community.wave.seqera.io/library/bash:5.2.26' :
+        'community.wave.seqera.io/library/bash:5.2.26'}"
+
+    input:
+    tuple val(meta), path(fasta)
+
+    output:
+    path "*.tsv"       , emit: tsv
+    path "versions.yml", emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def prefix = task.ext.prefix ?: 'genomes_metadata'
+    def co_assembly_value = meta.co_assembly == "Yes" ? "True" : "False"
+    def rna_presence_value = meta.RNA_presence == "Yes" ? "True" : "False"
+    """
+    # Create header
+    echo -e "genome_name\\tgenome_path\\taccessions\\tassembly_software\\tbinning_software\\tbinning_parameters\\tstats_generation_software\\tcompleteness\\tcontamination\\tgenome_coverage\\tmetagenome\\tco-assembly\\tbroad_environment\\tlocal_environment\\tenvironmental_medium\\trRNA_presence\\tNCBI_lineage" > ${prefix}.tsv
+
+    # Add data row
+    echo -e "${meta.id}\\t${fasta.getName()}\\t${meta.accession}\\t${meta.assembly_software}\\t${meta.binning_software}\\t${meta.binning_parameters}\\t${meta.stats_generation_software}\\t${meta.completeness}\\t${meta.contamination}\\t${meta.genome_coverage}\\t${meta.metagenome}\\t${co_assembly_value}\\t${meta.broad_environment}\\t${meta.local_environment}\\t${meta.environmental_medium}\\t${rna_presence_value}\\t${meta.NCBI_lineage}" >> ${prefix}.tsv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        bash: \$(bash --version | head -n 1 | sed 's/GNU bash, version //; s/ .*//')
+    END_VERSIONS
+    """
+
+    stub:
+    def prefix = task.ext.prefix ?: 'genomes_metadata'
+    def co_assembly_value = meta.co_assembly == "Yes" ? "True" : "False"
+    def rna_presence_value = meta.RNA_presence == "Yes" ? "True" : "False"
+    """
+    echo -e "genome_name\\tgenome_path\\taccessions\\tassembly_software\\tbinning_software\\tbinning_parameters\\tstats_generation_software\\tcompleteness\\tcontamination\\tgenome_coverage\\tmetagenome\\tco-assembly\\tbroad_environment\\tlocal_environment\\tenvironmental_medium\\trRNA_presence\\tNCBI_lineage" > ${prefix}.tsv
+    echo -e "${meta.id}\\t${fasta.getName()}\\t${meta.accession}\\t${meta.assembly_software}\\t${meta.binning_software}\\t${meta.binning_parameters}\\t${meta.stats_generation_software}\\t${meta.completeness}\\t${meta.contamination}\\t${meta.genome_coverage}\\t${meta.metagenome}\\t${co_assembly_value}\\t${meta.broad_environment}\\t${meta.local_environment}\\t${meta.environmental_medium}\\t${rna_presence_value}\\t${meta.NCBI_lineage}" >> ${prefix}.tsv
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        bash: \$(bash --version | head -n 1 | sed 's/GNU bash, version //; s/ .*//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/create_genome_metadata_tsv/meta.yml b/modules/local/create_genome_metadata_tsv/meta.yml
new file mode 100644
index 0000000..74752ad
--- /dev/null
+++ b/modules/local/create_genome_metadata_tsv/meta.yml
@@ -0,0 +1,60 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
+name: "create_genome_metadata_tsv"
+description: Creates a tab-separated TSV file containing metadata for genome MAGs/bins submissions
+keywords:
+  - genomics
+  - metadata
+  - submission
+  - metagenomics
+  - MAG
+  - bins
+tools:
+  - bash:
+      description: GNU Bourne-Again SHell
+      homepage: https://www.gnu.org/software/bash/
+      documentation: https://www.gnu.org/software/bash/manual/
+      licence: ["GPL-3.0-or-later"]
+      identifier: ""
+
+input:
+  - - meta:
+        type: map
+        description: |
+          Groovy Map containing genome metadata. Required fields:
+          - id: Genome identifier
+          - accession: Sample accession
+          - assembly_software: Software used for assembly
+          - binning_software: Software used for binning
+          - binning_parameters: Parameters used for binning
+          - stats_generation_software: Software used for stats generation
+          - completeness: Genome completeness percentage
+          - contamination: Genome contamination percentage
+          - genome_coverage: Genome coverage value
+          - metagenome: Metagenome identifier
+          - co_assembly: Co-assembly status ("Yes" or other)
+          - broad_environment: Broad environmental context
+          - local_environment: Local environmental context
+          - environmental_medium: Environmental medium description
+          - RNA_presence: RNA presence status ("Yes" or other)
+          - NCBI_lineage: NCBI taxonomic lineage
+    - fasta:
+        type: file
+        description: Genome FASTA file
+        pattern: "*.{fasta,fa,fna}"
+
+output:
+  - tsv:
+      - "*.tsv":
+          type: file
+          description: Tab-separated TSV file with genome metadata
+          pattern: "*.tsv"
+  - versions:
+      - versions.yml:
+          type: file
+          description: File containing software versions
+          pattern: "versions.yml"
+
+authors:
+  - "@ochkalova"
+maintainers:
+  - "@ochkalova"
diff --git a/modules/local/create_genome_metadata_tsv/tests/main.nf.test b/modules/local/create_genome_metadata_tsv/tests/main.nf.test
new file mode 100644
index 0000000..9fa9efb
--- /dev/null
+++ b/modules/local/create_genome_metadata_tsv/tests/main.nf.test
@@ -0,0 +1,102 @@
+nextflow_process {
+
+    name "Test Process CREATE_GENOME_METADATA_TSV"
+    script "../main.nf"
+    process "CREATE_GENOME_METADATA_TSV"
+
+    tag "modules"
+    tag "modules_local"
+    tag "create_genome_metadata_tsv"
+
+    test("create_genome_metadata_tsv - complete metadata") {
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [
+                        id: 'test_genome',
+                        accession: 'SAMEA0000001',
+                        assembly_software: 'metaspades_v3.15.5',
+                        binning_software: 'metabat2_v2.15',
+                        binning_parameters: 'default',
+                        stats_generation_software: 'CheckM2_v1.0.1',
+                        completeness: 95.5,
+                        contamination: 1.2,
+                        genome_coverage: 30.0,
+                        metagenome: 'marine metagenome',
+                        co_assembly: 'No',
+                        broad_environment: 'marine',
+                        local_environment: 'sea water',
+                        environmental_medium: 'water',
+                        RNA_presence: 'Yes',
+                        NCBI_lineage: 'd__Bacteria'
+                    ],
+                    file('https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/genome/NC_045512.2/GCF_009858895.2_ASM985889v3_genomic.200409.fna.gz')
+                ]
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+
+    }
+
+    test("create_genome_metadata_tsv - minimal metadata") {
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [
+                        id: 'minimal_genome'
+                    ],
+                    file('https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/genome/NC_045512.2/GCF_009858895.2_ASM985889v3_genomic.200409.fna.gz')
+                ]
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+
+    }
+
+    test("create_genome_metadata_tsv - stub") {
+
+        options "-stub"
+
+        when {
+            process {
+                """
+                input[0] = [
+                    [
+                        id: 'stub_genome',
+                        co_assembly: 'Yes',
+                        RNA_presence: 'No'
+                    ],
+                    file('https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/genome/NC_045512.2/GCF_009858895.2_ASM985889v3_genomic.200409.fna.gz')
+                ]
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+
+    }
+
+}
diff --git a/workflows/genomesubmit.nf b/workflows/genomesubmit.nf
index a17299c..c33c3e8 100644
--- a/workflows/genomesubmit.nf
+++ b/workflows/genomesubmit.nf
@@ -8,6 +8,7 @@ include { ENA_WEBIN_CLI_WRAPPER as SUBMIT } from '../modules/local/ena_webin_c
 include { ENA_WEBIN_CLI_DOWNLOAD } from '../modules/local/ena_webin_cli_download'
 include { REGISTERSTUDY } from '../modules/local/registerstudy/main'
 include { RENAME_FASTA_FOR_CATPACK } from '../modules/local/rename_fasta_for_catpack'
+include { CREATE_GENOME_METADATA_TSV } from '../modules/local/create_genome_metadata_tsv/main'
 
 include { FASTAVALIDATOR } from '../modules/nf-core/fastavalidator/main'
 include { COVERM_GENOME } from '../modules/nf-core/coverm/genome'
@@ -122,7 +123,7 @@ workflow GENOMESUBMIT {
 
     // --------- For genomes without RNA_presence info, calculate rRNA and tRNA
     fasta_updated_with_coverage
-        .branch { meta, fasta ->
+        .branch { meta, _fasta ->
             rna_prediction_input: meta.RNA_presence == null
             rna_present: true // Everything else goes here
         }
@@ -145,7 +146,7 @@ workflow GENOMESUBMIT {
 
     // --------- Completeness and contamination calculation
     fasta_updated_with_rna
-        .branch { meta, fasta ->
+        .branch { meta, _fasta ->
             genome_evaluation_input: meta.completeness == null || meta.contamination == null || meta.stats_generation_software == null
             evaluation_present: true // Everything else goes here
         }
@@ -176,7 +177,7 @@ workflow GENOMESUBMIT {
 
     // --------- Taxonomy
     fasta_updated_with_stats
-        .branch { meta, fasta ->
+        .branch { meta, _fasta ->
             genome_taxonomy_input: meta.NCBI_lineage == null
             taxonomy_present: true // Everything else goes here
         }
@@ -215,52 +216,19 @@ workflow GENOMESUBMIT {
         }
         .mix(branched_taxonomy_results.taxonomy_present)
 
-    // --------- Combine metadata into TSV
-    genome_metadata_csv = fasta_updated_with_taxonomy
-        .map { meta, fasta ->
-            [
-                meta.id,
-                fasta.getName(),
-                meta.accession,
-                meta.assembly_software,
-                meta.binning_software,
-                meta.binning_parameters,
-                meta.stats_generation_software,
-                meta.completeness,
-                meta.contamination,
-                meta.genome_coverage,
-                meta.metagenome,
-                meta.co_assembly == "Yes" ? "True" : "False",
-                meta.broad_environment,
-                meta.local_environment,
-                meta.environmental_medium,
-                meta.RNA_presence == "Yes" ? "True" : "False",
-                meta.NCBI_lineage
-            ].join('\t')
-        }
+    // --------- Combine metadata into TSV using module
+    CREATE_GENOME_METADATA_TSV (
+        fasta_updated_with_taxonomy
+    )
+    ch_versions = ch_versions.mix(CREATE_GENOME_METADATA_TSV.out.versions)
+
+    // Merge the per-genome TSVs (one header + one row each) into a single file with one header
+    genome_metadata_tsv = CREATE_GENOME_METADATA_TSV.out.tsv
         .collectFile(
-            name: 'genomes_metadata.csv',
-            storeDir: "${params.outdir}/${params.mode}",
-            seed: [
-                'genome_name',
-                'genome_path',
-                'accessions',
-                'assembly_software',
-                'binning_software',
-                'binning_parameters',
-                'stats_generation_software',
-                'completeness',
-                'contamination',
-                'genome_coverage',
-                'metagenome',
-                'co-assembly',
-                'broad_environment',
-                'local_environment',
-                'environmental_medium',
-                'rRNA_presence',
-                'NCBI_lineage'
-            ].join('\t'),
-            newLine: true
+            name: 'genomes_metadata.tsv',
+            storeDir: "${params.outdir}/${params.mode}",
+            keepHeader: true,
+            skip: 1
         )
 
     // --------- Register study if accession not provided
@@ -281,8 +249,8 @@ workflow GENOMESUBMIT {
 
     // --------- Generate manifests
     CREATE_MANIFESTS(
-        fasta_updated_with_stats.map{meta, fasta -> fasta}.collect(),
-        genome_metadata_csv,
+        fasta_updated_with_stats.map{_meta, fasta -> fasta}.collect(),
+        genome_metadata_tsv,
         params.mode, // mags or bins
         study_accession_ch.first()
     )
@@ -303,7 +271,7 @@ workflow GENOMESUBMIT {
         .join(
             manifests_ch.map { meta, manifest -> [meta.id, manifest] } // Has only [id: prefix]
         )
-        .map { id, full_meta, fasta, manifest ->
+        .map { _id, full_meta, fasta, manifest ->
             [full_meta, fasta, manifest]
         }
 