diff --git a/main.nf b/main.nf
index 98d7d67..64a3ced 100644
--- a/main.nf
+++ b/main.nf
@@ -49,6 +49,7 @@ workflow NFCORE_PROTEINANNOTATOR {
         params.skip_interproscan,
         params.interproscan_db_url,
         params.interproscan_db,
+        params.interproscan_batch_size,
         params.skip_s4pred
     )
     emit:
diff --git a/nextflow.config b/nextflow.config
index d79a287..f567691 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -33,6 +33,7 @@ params {
     interproscan_db = null
     interproscan_applications = 'Hamap,PANTHER,PIRSF,TIGRFAM,sfld'
     interproscan_enableprecalc = false
+    interproscan_batch_size = 1000
 
     // Secondary structure prediction (s4pred)
     skip_s4pred = false
diff --git a/nextflow_schema.json b/nextflow_schema.json
index b7ad6d8..10aaf42 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -320,6 +320,14 @@
                     "help_text": "This increases the speed of functional annotation with InterProScan by pre-calculating matches found in the UniProtKB, thereby identifying unique matches in the query sequences for faster annotation. By default this is turned off.\n\nFor more information about this flag see the tool [documentation](https://interproscan-docs.readthedocs.io/en/latest/HowToRun.html).\n\n> Modifies tool parameter(s):\n> - InterProScan: `---diasable-precalc`",
                     "description": "Pre-calculates residue mutual matches.",
                     "fa_icon": "fas fa-clock"
+                },
+                "interproscan_batch_size": {
+                    "type": "integer",
+                    "default": 1000,
+                    "minimum": 1,
+                    "description": "Number of sequences per InterProScan batch.",
+                    "help_text": "Split input FASTA files into batches of this many sequences before running InterProScan. This enables parallel processing of large proteomes and reduces memory usage per job. Results are automatically concatenated after all batches complete. Default: 1000 sequences per batch.",
+                    "fa_icon": "fas fa-layer-group"
                 }
             },
             "help_text": "This subworkflow adds additional protein annotations to all input sequences. Currently, only annotation with InterProScan is integrated in the subworkflow.",
diff --git a/subworkflows/local/functional_annotation/main.nf b/subworkflows/local/functional_annotation/main.nf
index 240c504..e25f866 100644
--- a/subworkflows/local/functional_annotation/main.nf
+++ b/subworkflows/local/functional_annotation/main.nf
@@ -4,10 +4,11 @@ include { INTERPROSCAN } from '../../../modules/nf-core/interproscan/main'
 
 workflow FUNCTIONAL_ANNOTATION {
     take:
-    ch_fasta            // channel: [ val(meta), [ fasta ] ]
-    skip_interproscan   // boolean
-    interproscan_db_url // string, url to download db
-    interproscan_db     // string, existing db
+    ch_fasta                // channel: [ val(meta), [ fasta ] ]
+    skip_interproscan       // boolean
+    interproscan_db_url     // string, url to download db
+    interproscan_db         // string, existing db
+    interproscan_batch_size // integer, number of sequences per batch
 
     main:
     ch_interproscan_tsv = channel.empty()
@@ -25,8 +26,31 @@ workflow FUNCTIONAL_ANNOTATION {
             ch_interproscan_db = UNTAR.out.untar.map{ f -> f[1] }
         }
 
-        INTERPROSCAN( ch_fasta, ch_interproscan_db )
-        ch_interproscan_tsv = ch_interproscan_tsv.mix(INTERPROSCAN.out.tsv)
+        // Split each FASTA into batches so InterProScan can run them in parallel.
+        // Every batch gets a unique meta.id and remembers its sample in original_id.
+        ch_fasta_batched = ch_fasta
+            .flatMap { meta, fasta ->
+                def chunks = fasta.splitFasta(by: interproscan_batch_size, file: true)
+                // Normalise to a list in case a lone chunk comes back as a bare Path
+                def chunk_list = chunks instanceof Path ? [ chunks ] : ( chunks ?: [] )
+                chunk_list.withIndex().collect { chunk, idx ->
+                    [ meta + [ id: "${meta.id}_batch${idx}", original_id: meta.id ], chunk ]
+                }
+            }
+
+        INTERPROSCAN( ch_fasta_batched, ch_interproscan_db )
+
+        // Concatenate the per-batch TSVs into one file per sample, named after the
+        // original id; collectFile emits once all batches have finished.
+        ch_merged_tsv = INTERPROSCAN.out.tsv
+            .collectFile { meta, tsv -> [ "${meta.original_id}.tsv", tsv ] }
+            .map { merged -> [ merged.baseName, merged ] }
+
+        // Re-attach the untouched sample meta so downstream sees [ meta, tsv ] again
+        ch_interproscan_tsv = ch_fasta
+            .map { meta, _fasta -> [ meta.id, meta ] }
+            .join( ch_merged_tsv )
+            .map { _id, meta, tsv -> [ meta, tsv ] }
     }
 
     emit:
diff --git a/workflows/proteinannotator.nf b/workflows/proteinannotator.nf
index fae1d7a..f84e356 100644
--- a/workflows/proteinannotator.nf
+++ b/workflows/proteinannotator.nf
@@ -29,10 +29,11 @@ workflow PROTEINANNOTATOR {
     skip_funfam         // boolean
     funfam_db           // string, path to the pfam HMM database, if already exists
     funfam_latest_link  // string, path to the latest pfam HMM database, to download
-    skip_interproscan   // boolean
-    interproscan_db_url // string, url to download db
-    interproscan_db     // string, existing db
-    skip_s4pred         // boolean
+    skip_interproscan       // boolean
+    interproscan_db_url     // string, url to download db
+    interproscan_db         // string, existing db
+    interproscan_batch_size // integer, number of sequences per batch
+    skip_s4pred             // boolean
 
     main:
 
@@ -57,7 +58,8 @@ workflow PROTEINANNOTATOR {
         FAA_SEQFU_SEQKIT.out.fasta,
         skip_interproscan,
         interproscan_db_url,
-        interproscan_db
+        interproscan_db,
+        interproscan_batch_size
     )
     ch_versions = ch_versions.mix( FUNCTIONAL_ANNOTATION.out.versions )
 