Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ workflow NFCORE_PROTEINANNOTATOR {
params.skip_interproscan,
params.interproscan_db_url,
params.interproscan_db,
params.interproscan_batch_size,
params.skip_s4pred
)
emit:
Expand Down
1 change: 1 addition & 0 deletions nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ params {
interproscan_db = null
interproscan_applications = 'Hamap,PANTHER,PIRSF,TIGRFAM,sfld'
interproscan_enableprecalc = false
interproscan_batch_size = 1000

// Secondary structure prediction (s4pred)
skip_s4pred = false
Expand Down
8 changes: 8 additions & 0 deletions nextflow_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,14 @@
"help_text": "This increases the speed of functional annotation with InterProScan by pre-calculating matches found in the UniProtKB, thereby identifying unique matches in the query sequences for faster annotation. By default this is turned off.\n\nFor more information about this flag see the tool [documentation](https://interproscan-docs.readthedocs.io/en/latest/HowToRun.html).\n\n> Modifies tool parameter(s):\n> - InterProScan: `--disable-precalc`",
"description": "Enables lookup of pre-calculated InterProScan matches.",
"fa_icon": "fas fa-clock"
},
"interproscan_batch_size": {
"type": "integer",
"default": 1000,
"minimum": 1,
"description": "Number of sequences per InterProScan batch.",
"help_text": "Split input FASTA files into batches of this many sequences before running InterProScan. This enables parallel processing of large proteomes and reduces memory usage per job. Results are automatically concatenated after all batches complete. Default: 1000 sequences per batch.",
"fa_icon": "fas fa-layer-group"
}
},
"help_text": "This subworkflow adds additional protein annotations to all input sequences. Currently, only annotation with InterProScan is integrated in the subworkflow.",
Expand Down
29 changes: 23 additions & 6 deletions subworkflows/local/functional_annotation/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@ include { INTERPROSCAN } from '../../../modules/nf-core/interproscan/main'

workflow FUNCTIONAL_ANNOTATION {
take:
ch_fasta // channel: [ val(meta), [ fasta ] ]
skip_interproscan // boolean
interproscan_db_url // string, url to download db
interproscan_db // string, existing db
ch_fasta // channel: [ val(meta), [ fasta ] ]
skip_interproscan // boolean
interproscan_db_url // string, url to download db
interproscan_db // string, existing db
interproscan_batch_size // integer, number of sequences per batch

main:
ch_interproscan_tsv = channel.empty()
Expand All @@ -25,8 +26,24 @@ workflow FUNCTIONAL_ANNOTATION {
ch_interproscan_db = UNTAR.out.untar.map{ f -> f[1] }
}

INTERPROSCAN( ch_fasta, ch_interproscan_db )
ch_interproscan_tsv = ch_interproscan_tsv.mix(INTERPROSCAN.out.tsv)
// Split FASTA into batches for parallel InterProScan processing
ch_fasta_batched = ch_fasta
.flatMap { meta, fasta ->
def chunks = fasta.splitFasta(by: interproscan_batch_size, file: true)
if (chunks instanceof Path) {
// Single chunk (fewer sequences than batch size)
return [[ meta, chunks ]]
}
chunks.withIndex().collect { chunk, idx ->
def new_meta = meta.clone()
new_meta.original_id = meta.id
new_meta.id = "${meta.id}_batch${idx}"
[ new_meta, chunk ]
}
}

INTERPROSCAN( ch_fasta_batched, ch_interproscan_db )
ch_interproscan_tsv = INTERPROSCAN.out.tsv
}

emit:
Expand Down
12 changes: 7 additions & 5 deletions workflows/proteinannotator.nf
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,11 @@ workflow PROTEINANNOTATOR {
skip_funfam // boolean
funfam_db // string, path to the pfam HMM database, if already exists
funfam_latest_link // string, path to the latest pfam HMM database, to download
skip_interproscan // boolean
interproscan_db_url // string, url to download db
interproscan_db // string, existing db
skip_s4pred // boolean
skip_interproscan // boolean
interproscan_db_url // string, url to download db
interproscan_db // string, existing db
interproscan_batch_size // integer, number of sequences per batch
skip_s4pred // boolean

main:

Expand All @@ -57,7 +58,8 @@ workflow PROTEINANNOTATOR {
FAA_SEQFU_SEQKIT.out.fasta,
skip_interproscan,
interproscan_db_url,
interproscan_db
interproscan_db,
interproscan_batch_size
)
ch_versions = ch_versions.mix( FUNCTIONAL_ANNOTATION.out.versions )

Expand Down
Loading