nf-core · olgabot · Feb 9, 2026 · Feb 9, 2026 · Feb 9, 2026 · Feb 17, 2026
diff --git a/main.nf b/main.nf
@@ -49,6 +49,7 @@ workflow NFCORE_PROTEINANNOTATOR {
         params.skip_interproscan,
         params.interproscan_db_url,
         params.interproscan_db,
+        params.interproscan_batch_size,
         params.skip_s4pred
     )
     emit:

diff --git a/nextflow.config b/nextflow.config
@@ -33,6 +33,7 @@ params {
     interproscan_db            = null
     interproscan_applications  = 'Hamap,PANTHER,PIRSF,TIGRFAM,sfld'
     interproscan_enableprecalc = false
+    interproscan_batch_size    = 1000
 
     // Secondary structure prediction (s4pred)
     skip_s4pred   = false

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -320,6 +320,14 @@
                     "help_text": "This increases the speed of functional annotation with InterProScan by pre-calculating matches found in the UniProtKB, thereby identifying unique matches in the query sequences for faster annotation. By default this is turned off.\n\nFor more information about this flag see the tool [documentation](https://interproscan-docs.readthedocs.io/en/latest/HowToRun.html).\n\n> Modifies tool parameter(s):\n> - InterProScan: `--disable-precalc`",
                     "description": "Pre-calculates residue mutual matches.",
                     "fa_icon": "fas fa-clock"
+                },
+                "interproscan_batch_size": {
+                    "type": "integer",
+                    "default": 1000,
+                    "minimum": 1,
+                    "description": "Number of sequences per InterProScan batch.",
+                    "help_text": "Split input FASTA files into batches of this many sequences before running InterProScan. This enables parallel processing of large proteomes and reduces memory usage per job. Results are automatically concatenated after all batches complete. Default: 1000 sequences per batch.",
+                    "fa_icon": "fas fa-layer-group"
                 }
             },
             "help_text": "This subworkflow adds additional protein annotations to all input sequences. Currently, only annotation with InterProScan is integrated in the subworkflow.",

diff --git a/subworkflows/local/functional_annotation/main.nf b/subworkflows/local/functional_annotation/main.nf
@@ -4,10 +4,11 @@ include { INTERPROSCAN } from '../../../modules/nf-core/interproscan/main'
 
 workflow FUNCTIONAL_ANNOTATION {
     take:
-    ch_fasta            // channel: [ val(meta), [ fasta ] ]
-    skip_interproscan   // boolean
-    interproscan_db_url // string, url to download db
-    interproscan_db     // string, existing db
+    ch_fasta                // channel: [ val(meta), [ fasta ] ]
+    skip_interproscan       // boolean
+    interproscan_db_url     // string, url to download db
+    interproscan_db         // string, existing db
+    interproscan_batch_size // integer, number of sequences per batch
 
     main:
     ch_interproscan_tsv = channel.empty()
@@ -25,8 +26,24 @@ workflow FUNCTIONAL_ANNOTATION {
             ch_interproscan_db = UNTAR.out.untar.map{ f -> f[1] }
         }
 
-        INTERPROSCAN( ch_fasta, ch_interproscan_db )
-        ch_interproscan_tsv = ch_interproscan_tsv.mix(INTERPROSCAN.out.tsv)
+        // Split each FASTA into batches so InterProScan can run in parallel, one task per batch
+        ch_fasta_batched = ch_fasta
+            .flatMap { meta, fasta ->
+                // splitFasta may hand back a single Path or a list of Paths; normalise to a list
+                def chunks = fasta.splitFasta(by: interproscan_batch_size, file: true)
+                def is_single = chunks instanceof Path
+                def chunk_list = is_single ? [ chunks ] : chunks
+                // A lone Path keeps the sample id; list entries get an indexed "_batch<idx>" id
+                chunk_list.withIndex().collect { chunk, idx ->
+                    def batch_id = is_single ? meta.id : "${meta.id}_batch${idx}"
+                    // original_id is recorded on every emitted meta so batches trace back to their sample
+                    [ meta + [ id: batch_id, original_id: meta.id ], chunk ]
+                }
+            }
+
+        // NOTE(review): per-batch TSVs are passed on as-is; no concatenation happens in this block
+        INTERPROSCAN( ch_fasta_batched, ch_interproscan_db )
+        ch_interproscan_tsv = INTERPROSCAN.out.tsv
     }
 
     emit:

diff --git a/workflows/proteinannotator.nf b/workflows/proteinannotator.nf
@@ -29,10 +29,11 @@ workflow PROTEINANNOTATOR {
     skip_funfam         // boolean
     funfam_db           // string, path to the pfam HMM database, if already exists
     funfam_latest_link  // string, path to the latest pfam HMM database, to download
-    skip_interproscan   // boolean
-    interproscan_db_url // string, url to download db
-    interproscan_db     // string, existing db
-    skip_s4pred         // boolean
+    skip_interproscan       // boolean
+    interproscan_db_url     // string, url to download db
+    interproscan_db         // string, existing db
+    interproscan_batch_size // integer, number of sequences per batch
+    skip_s4pred             // boolean
 
     main:
 
@@ -57,7 +58,8 @@ workflow PROTEINANNOTATOR {
         FAA_SEQFU_SEQKIT.out.fasta,
         skip_interproscan,
         interproscan_db_url,
-        interproscan_db
+        interproscan_db,
+        interproscan_batch_size
     )
     ch_versions = ch_versions.mix( FUNCTIONAL_ANNOTATION.out.versions )