From 88107cd27581a84c4658612b10199fad44762ebb Mon Sep 17 00:00:00 2001 From: vagkaratzas Date: Mon, 9 Feb 2026 14:53:17 +0000 Subject: [PATCH 1/3] zenodo updated --- README.md | 5 ++--- ro-crate-metadata.json | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index fb552f8..2eef432 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ [![Open in GitHub Codespaces](https://img.shields.io/badge/Open_In_GitHub_Codespaces-black?labelColor=grey&logo=github)](https://github.com/codespaces/new/nf-core/proteinannotator) [![GitHub Actions CI Status](https://github.com/nf-core/proteinannotator/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/proteinannotator/actions/workflows/nf-test.yml) -[![GitHub Actions Linting Status](https://github.com/nf-core/proteinannotator/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/proteinannotator/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/proteinannotator/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX) +[![GitHub Actions Linting Status](https://github.com/nf-core/proteinannotator/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/proteinannotator/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/proteinannotator/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.18547735-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.18547735) [![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com) 
[![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.10.0-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/) @@ -99,8 +99,7 @@ For further information or help, don't hesitate to get in touch on the [Slack `# ## Citations - - +If you use nf-core/proteinannotator for your analysis, please cite it using the following doi: [10.5281/zenodo.18547735](https://doi.org/10.5281/zenodo.18547735) An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. diff --git a/ro-crate-metadata.json b/ro-crate-metadata.json index 51bb9a3..8058a02 100644 --- a/ro-crate-metadata.json +++ b/ro-crate-metadata.json @@ -23,7 +23,7 @@ "@type": "Dataset", "creativeWorkStatus": "Stable", "datePublished": "2026-02-09T10:42:29+00:00", - "description": "

\n \n \n \"nf-core/proteinannotator\"\n \n

\n\n[![Open in GitHub Codespaces](https://img.shields.io/badge/Open_In_GitHub_Codespaces-black?labelColor=grey&logo=github)](https://github.com/codespaces/new/nf-core/proteinannotator)\n[![GitHub Actions CI Status](https://github.com/nf-core/proteinannotator/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/proteinannotator/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/proteinannotator/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/proteinannotator/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/proteinannotator/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.10.0-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.5.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.5.1)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/proteinannotator)\n\n[![Get 
help on Slack](http://img.shields.io/badge/slack-nf--core%20%23proteinannotator-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/proteinannotator)[![Follow on Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/proteinannotator** is a bioinformatics pipeline that computes statistics for protein FASTA inputs and produces protein annotations based on predicted sequence features, including conserved domains, functions, and secondary structure.\n\n

\n \n \n \"nf-core/proteinannotator\"\n \n

\n\n### Check quality and pre-process\n\nGenerate input amino acid sequence statistics with ([`SeqFu`](https://github.com/telatin/seqfu2/)) and pre-process them (i.e., gap removal, convert to upper case, validate, filter by length, replace special characters such as `/`, and remove duplicate sequences) with ([`SeqKit`](https://github.com/shenwei356/seqkit/))\n\n### Annotate sequences\n\n1. Conserved domain annotation with ([`hmmer`](https://github.com/EddyRivasLab/hmmer/)) against databases\n such as [Pfam](https://ftp.ebi.ac.uk/pub/databases/Pfam/) and [FunFam](https://download.cathdb.info/cath/releases/all-releases/)\n2. Functional annotation:\n - ([`InterProScan`](https://interproscan-docs.readthedocs.io/en/v5/)) a software tool used to analyze protein sequences by scanning them against the signatures of protein families, domains, and sites in the [InterPro](https://www.ebi.ac.uk/interpro/) database, helping to identify their functional characteristics.\n3. Predict secondary structure compositional features such as \u03b1-helices, \u03b2-strands and coils with ([`s4pred`](https://github.com/psipred/s4pred))\n4. Present QC stats for input sequences before and after initial pre-processing with ([`MultiQC`](http://multiqc.info/))\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. 
Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nFirst, prepare a samplesheet with your input data that looks as follows:\n\n`samplesheet.csv`:\n\n```csv\nid,fasta\nspecies1,species1_proteins.fasta\nspecies2,species2_proteins.fasta\n```\n\nEach row represents a FASTA file of proteins from a single species.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run nf-core/proteinannotator \\\n -profile \\\n --input samplesheet.csv \\\n --outdir \n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/proteinannotator/usage) and the [parameter documentation](https://nf-co.re/proteinannotator/parameters).\n\n## Pipeline output\n\nTo see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/proteinannotator/results) tab on the nf-core website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://nf-co.re/proteinannotator/output).\n\n## Credits\n\nnf-core/proteinannotator was originally written by Olga Botvinnik and Evangelos Karatzas.\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n- [Michael L Heuer](https://github.com/heuermh)\n- [Edmund Miller](https://github.com/edmundmiller)\n- [Eric Wei](https://github.com/eweizy)\n- [Martin Beracochea](https://github.com/mberacochea)\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing 
guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#proteinannotator` channel](https://nfcore.slack.com/channels/proteinannotator) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\n\n\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", + "description": "

\n \n \n \"nf-core/proteinannotator\"\n \n

\n\n[![Open in GitHub Codespaces](https://img.shields.io/badge/Open_In_GitHub_Codespaces-black?labelColor=grey&logo=github)](https://github.com/codespaces/new/nf-core/proteinannotator)\n[![GitHub Actions CI Status](https://github.com/nf-core/proteinannotator/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/proteinannotator/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/proteinannotator/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/proteinannotator/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/proteinannotator/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.18547735-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.18547735)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A525.10.0-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.5.1-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.5.1)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/proteinannotator)\n\n[![Get 
help on Slack](http://img.shields.io/badge/slack-nf--core%20%23proteinannotator-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/proteinannotator)[![Follow on Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/proteinannotator** is a bioinformatics pipeline that computes statistics for protein FASTA inputs and produces protein annotations based on predicted sequence features, including conserved domains, functions, and secondary structure.\n\n

\n \n \n \"nf-core/proteinannotator\"\n \n

\n\n### Check quality and pre-process\n\nGenerate input amino acid sequence statistics with ([`SeqFu`](https://github.com/telatin/seqfu2/)) and pre-process them (i.e., gap removal, convert to upper case, validate, filter by length, replace special characters such as `/`, and remove duplicate sequences) with ([`SeqKit`](https://github.com/shenwei356/seqkit/))\n\n### Annotate sequences\n\n1. Conserved domain annotation with ([`hmmer`](https://github.com/EddyRivasLab/hmmer/)) against databases\n such as [Pfam](https://ftp.ebi.ac.uk/pub/databases/Pfam/) and [FunFam](https://download.cathdb.info/cath/releases/all-releases/)\n2. Functional annotation:\n - ([`InterProScan`](https://interproscan-docs.readthedocs.io/en/v5/)) a software tool used to analyze protein sequences by scanning them against the signatures of protein families, domains, and sites in the [InterPro](https://www.ebi.ac.uk/interpro/) database, helping to identify their functional characteristics.\n3. Predict secondary structure compositional features such as \u03b1-helices, \u03b2-strands and coils with ([`s4pred`](https://github.com/psipred/s4pred))\n4. Present QC stats for input sequences before and after initial pre-processing with ([`MultiQC`](http://multiqc.info/))\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. 
Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nFirst, prepare a samplesheet with your input data that looks as follows:\n\n`samplesheet.csv`:\n\n```csv\nid,fasta\nspecies1,species1_proteins.fasta\nspecies2,species2_proteins.fasta\n```\n\nEach row represents a FASTA file of proteins from a single species.\n\nNow, you can run the pipeline using:\n\n```bash\nnextflow run nf-core/proteinannotator \\\n -profile \\\n --input samplesheet.csv \\\n --outdir \n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/proteinannotator/usage) and the [parameter documentation](https://nf-co.re/proteinannotator/parameters).\n\n## Pipeline output\n\nTo see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/proteinannotator/results) tab on the nf-core website pipeline page.\nFor more details about the output files and reports, please refer to the\n[output documentation](https://nf-co.re/proteinannotator/output).\n\n## Credits\n\nnf-core/proteinannotator was originally written by Olga Botvinnik and Evangelos Karatzas.\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n- [Michael L Heuer](https://github.com/heuermh)\n- [Edmund Miller](https://github.com/edmundmiller)\n- [Eric Wei](https://github.com/eweizy)\n- [Martin Beracochea](https://github.com/mberacochea)\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing 
guidelines](.github/CONTRIBUTING.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#proteinannotator` channel](https://nfcore.slack.com/channels/proteinannotator) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\nIf you use nf-core/proteinannotator for your analysis, please cite it using the following doi: [10.5281/zenodo.18547735](https://doi.org/10.5281/zenodo.18547735)\n\nAn extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", "hasPart": [ { "@id": "main.nf" From 2ad495f1cb19324660d4569380cffe17ab2ffc62 Mon Sep 17 00:00:00 2001 From: angelphanth Date: Thu, 12 Mar 2026 16:50:26 +0000 Subject: [PATCH 2/3] copying integration of pfam and funfam --- conf/modules.config | 22 ++++++++++++ conf/test.config | 1 + main.nf | 3 ++ nextflow.config | 3 ++ nextflow_schema.json | 17 +++++++++ subworkflows/local/domain_annotation/main.nf | 36 +++++++++++++++++++ subworkflows/local/domain_annotation/meta.yml | 16 +++++++++ .../main.nf | 4 +-- workflows/proteinannotator.nf | 8 ++++- 9 files changed, 107 insertions(+), 3 deletions(-) diff --git a/conf/modules.config b/conf/modules.config index ec1428c..9f40a6d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -90,6 +90,17 @@ process { ] } + withName: 'NFCORE_PROTEINANNOTATOR:PROTEINANNOTATOR:DOMAIN_ANNOTATION:WGET_MROOT' { + ext.prefix = "HMM" + ext.suffix = "tar.gz" + ext.args = '--no-check-certificate' // NOTE: disables TLS certificate verification for the MetagRoot download + publishDir = [ + path: { 
"${params.outdir}/downloaded_dbs/" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + withName: 'NFCORE_PROTEINANNOTATOR:PROTEINANNOTATOR:DOMAIN_ANNOTATION:HMMSEARCH_PFAM' { ext.args = { "-E ${params.hmmsearch_evalue_cutoff}" } publishDir = [ @@ -110,6 +121,17 @@ process { ] } + withName: 'NFCORE_PROTEINANNOTATOR:PROTEINANNOTATOR:DOMAIN_ANNOTATION:HMMSEARCH_MROOT' { + ext.args = { "-E ${params.hmmsearch_evalue_cutoff}" } + publishDir = [ + path: { "${params.outdir}/domain_annotation/mroot/" }, + mode: params.publish_dir_mode, + pattern: "*.domtbl.gz", + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'NFCORE_PROTEINANNOTATOR:PROTEINANNOTATOR:FUNCTIONAL_ANNOTATION:ARIA2' { publishDir = [ path: { "${params.outdir}/downloaded_dbs/" }, diff --git a/conf/test.config b/conf/test.config index 252ec87..02c92be 100644 --- a/conf/test.config +++ b/conf/test.config @@ -27,6 +27,7 @@ params { // Domain annotation pfam_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/pfam/Pfam-A_test.hmm.gz' funfam_latest_link = params.pipelines_testdata_base_path + 'proteinannotator/testdata/funfam/funfam-hmm3-v4_3_0_test.lib.gz' + mroot_latest_link = 'https://pavlopoulos-lab.org/metagroot/DownloadHmm' // Functional annotation interproscan_db_url = params.pipelines_testdata_base_path + 'proteinannotator/testdata/interproscan/interproscan_test.tar.gz' interproscan_applications = 'Hamap,TIGRFAM,sfld' diff --git a/main.nf b/main.nf index 98d7d67..d7f1972 100644 --- a/main.nf +++ b/main.nf @@ -46,6 +46,9 @@ workflow NFCORE_PROTEINANNOTATOR { params.skip_funfam, params.funfam_db, params.funfam_latest_link, + params.skip_mroot, + params.mroot_db, + params.mroot_latest_link, params.skip_interproscan, params.interproscan_db_url, params.interproscan_db, diff --git a/nextflow.config b/nextflow.config index e56f91f..f83d29d 100644 --- a/nextflow.config +++ 
b/nextflow.config @@ -25,6 +25,9 @@ params { skip_funfam = false funfam_db = null funfam_latest_link = "https://download.cathdb.info/cath/releases/all-releases/v4_3_0/sequence-data/funfam-hmm3-v4_3_0.lib.gz" + skip_mroot = false + mroot_db = null + mroot_latest_link = "https://pavlopoulos-lab.org/metagroot/DownloadHmm" hmmsearch_evalue_cutoff = 0.001 // Functional annotation diff --git a/nextflow_schema.json b/nextflow_schema.json index b7ad6d8..754cd6c 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -276,6 +276,23 @@ "default": "https://download.cathdb.info/cath/releases/all-releases/v4_3_0/sequence-data/funfam-hmm3-v4_3_0.lib.gz", "description": "CATH hosted link to the latest available (v4_3_0) FunFam HMM database file." }, + "skip_mroot": { + "type": "boolean", + "fa_icon": "fas fa-ban", + "description": "Skip the domain annotation with the MetagRoot database.", + "help": "Skips the domain annotation of input sequence against a MetagRoot database." + }, + "mroot_db": { + "type": "string", + "format": "file-path", + "description": "Path to an already installed MetagRoot HMM database (.tar.gz).", + "help_text": "If left null and skip_mroot is false, the pipeline will start downloading the latest MetagRoot HMM library." + }, + "mroot_latest_link": { + "type": "string", + "default": "https://pavlopoulos-lab.org/metagroot/DownloadHmm", + "description": "MetagRoot hosted link to the latest available MetagRoot HMM database file." 
+ }, "hmmsearch_evalue_cutoff": { "type": "number", "default": 0.001, diff --git a/subworkflows/local/domain_annotation/main.nf b/subworkflows/local/domain_annotation/main.nf index 1ec8289..4e76a56 100644 --- a/subworkflows/local/domain_annotation/main.nf +++ b/subworkflows/local/domain_annotation/main.nf @@ -1,7 +1,10 @@ include { ARIA2 as ARIA2_PFAM } from '../../../modules/nf-core/aria2/main' include { ARIA2 as ARIA2_FUNFAM } from '../../../modules/nf-core/aria2/main' +include { WGET as WGET_MROOT } from '../../../modules/nf-core/wget/main' include { HMMER_HMMSEARCH as HMMSEARCH_PFAM } from '../../../modules/nf-core/hmmer/hmmsearch/main' include { HMMER_HMMSEARCH as HMMSEARCH_FUNFAM } from '../../../modules/nf-core/hmmer/hmmsearch/main' +include { HMMER_HMMSEARCH as HMMSEARCH_MROOT } from '../../../modules/nf-core/hmmer/hmmsearch/main' +include { UNTAR as UNTAR_MROOT } from '../../../modules/nf-core/untar/main' workflow DOMAIN_ANNOTATION { take: @@ -12,12 +15,16 @@ workflow DOMAIN_ANNOTATION { skip_funfam // boolean funfam_db // string, path to the funfam HMM database, if already exists funfam_latest_link // string, path to the latest funfam HMM database, to download + skip_mroot // boolean + mroot_db // string, path to the metagroot HMM database, if already exists + mroot_latest_link // string, path to the latest metagroot HMM database, to download main: ch_versions = channel.empty() ch_pfam_domains = channel.empty() ch_funfam_domains = channel.empty() + ch_mroot_domains = channel.empty() if (!skip_pfam) { if (!pfam_db) { @@ -59,8 +66,37 @@ workflow DOMAIN_ANNOTATION { ch_funfam_domains = HMMSEARCH_FUNFAM.out.domain_summary } + if (!skip_mroot) { + if (!mroot_db) { + ch_mroot_link = channel.of([ [ id: 'mroot' ], mroot_latest_link ]) + // download file from url + WGET_MROOT( ch_mroot_link ) + // untar file if its a tar.gz + UNTAR_MROOT( WGET_MROOT.out.outfile ) + // extract hmm files from dir + ch_mroot_db = UNTAR_MROOT.out.untar + .map { + meta, dir -> + // 
collect all .hmm files from dir + def hmm_files = file("${dir}/**/*.hmm") + tuple(meta, hmm_files) + } + } else { + ch_mroot_db = channel.of([ [ id: 'mroot' ], mroot_db ]) + } + + ch_input_for_hmmsearch_mroot = ch_fasta + .combine(ch_mroot_db) + .map{ meta, seqs, _meta2, models -> [meta, models, seqs, false, false, true] } + + HMMSEARCH_MROOT( ch_input_for_hmmsearch_mroot ) + ch_versions = ch_versions.mix( HMMSEARCH_MROOT.out.versions.first() ) + ch_mroot_domains = HMMSEARCH_MROOT.out.domain_summary + } + emit: pfam_domains = ch_pfam_domains funfam_domains = ch_funfam_domains + mroot_domains = ch_mroot_domains versions = ch_versions } diff --git a/subworkflows/local/domain_annotation/meta.yml b/subworkflows/local/domain_annotation/meta.yml index e04e241..ad37de9 100644 --- a/subworkflows/local/domain_annotation/meta.yml +++ b/subworkflows/local/domain_annotation/meta.yml @@ -42,6 +42,18 @@ input: type: string description: | Path to the latest FunFam HMM database, to download + - skip_mroot: + type: boolean + description: | + Skip domain annotation with MetagRoot + - mroot_db: + type: string + description: | + Path to an existing HMM MetagRoot library on the system. If provided, the WGET_MROOT db download and UNTAR_MROOT extraction will be skipped. 
+ - mroot_latest_link: + type: string + description: | + Path to the latest MetagRoot HMM database, to download output: - pfam_domains: type: file @@ -51,6 +63,10 @@ output: type: file description: | domtbl.gz files with funfam domain annotation for input amino acid sequences + - mroot_domains: + type: file + description: | + domtbl.gz files with metagroot domain annotation for input amino acid sequences - versions: type: file description: | diff --git a/subworkflows/local/utils_nfcore_proteinannotator_pipeline/main.nf b/subworkflows/local/utils_nfcore_proteinannotator_pipeline/main.nf index 1ba3ccc..7ef2d1a 100644 --- a/subworkflows/local/utils_nfcore_proteinannotator_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_proteinannotator_pipeline/main.nf @@ -180,7 +180,7 @@ def toolCitationText() { params.skip_preprocessing ? "" : "Input sequences were preprocessed with SeqKit (gap trimming, length filtering, validation, duplicate removal) (Shen et al. 2024)." ].join(' ').trim() - def domain_annotation_text = (params.skip_pfam && params.skip_funfam) ? "" : "Domains were annotated with hmmer/hmmsearch (Eddy et al. 2011)." + def domain_annotation_text = (params.skip_pfam && params.skip_funfam && params.skip_mroot) ? "" : "Domains were annotated with hmmer/hmmsearch (Eddy et al. 2011)." def prediction_text = params.skip_s4pred ? "" : "Secondary structures were predicted via the s4pred software (Moffat et al. 2021)." @@ -202,7 +202,7 @@ def toolBibliographyText() { params.skip_preprocessing ? '' : '
<li>Shen, W., Sipos, B., & Zhao, L. (2024). SeqKit2: A Swiss army knife for sequence and alignment processing. Imeta, 3(3), e191. doi: 10.1002/imt2.191</li>' ].join(' ').trim() - def domain_annotation_text = (params.skip_pfam && params.skip_funfam) ? '' : '<li>Eddy, S. R. (2011). Accelerated profile HMM searches. PLoS computational biology, 7(10), e1002195. doi: 10.1371/journal.pcbi.1002195</li>' + def domain_annotation_text = (params.skip_pfam && params.skip_funfam && params.skip_mroot) ? '' : '<li>Eddy, S. R. (2011). Accelerated profile HMM searches. PLoS computational biology, 7(10), e1002195. doi: 10.1371/journal.pcbi.1002195</li>' def prediction_text = params.skip_s4pred ? '' : '<li>Moffat, L., & Jones, D. T. (2021). Increasing the accuracy of single sequence prediction methods using a deep semi-supervised learning framework. Bioinformatics, 37(21), 3744-3751. doi: 10.1093/bioinformatics/btab491</li>' diff --git a/workflows/proteinannotator.nf b/workflows/proteinannotator.nf index fae1d7a..4398d69 100644 --- a/workflows/proteinannotator.nf +++ b/workflows/proteinannotator.nf @@ -29,6 +29,9 @@ workflow PROTEINANNOTATOR { skip_funfam // boolean funfam_db // string, path to the pfam HMM database, if already exists funfam_latest_link // string, path to the latest pfam HMM database, to download + skip_mroot // boolean + mroot_db // string, path to the metagroot HMM database, if already exists + mroot_latest_link // string, path to the latest metagroot HMM database, to download skip_interproscan // boolean interproscan_db_url // string, url to download db interproscan_db // string, existing db @@ -49,7 +52,10 @@ workflow PROTEINANNOTATOR { pfam_latest_link, skip_funfam, funfam_db, - funfam_latest_link + funfam_latest_link, + skip_mroot, + mroot_db, + mroot_latest_link ) ch_versions = ch_versions.mix( DOMAIN_ANNOTATION.out.versions ) From 63fd565dcbe9a311932fe8feff14a960bdaadc38 Mon Sep 17 00:00:00 2001 From: angelphanth Date: Thu, 12 Mar 2026 16:53:15 +0000 Subject: [PATCH 3/3] nf-core modules wget and untar as a workaround for aria2 to get metagroot tar file? 
--- modules.json | 5 ++ modules/nf-core/wget/environment.yml | 7 ++ modules/nf-core/wget/main.nf | 48 ++++++++++++++ modules/nf-core/wget/meta.yml | 52 +++++++++++++++ modules/nf-core/wget/tests/main.nf.test | 62 +++++++++++++++++ modules/nf-core/wget/tests/main.nf.test.snap | 70 ++++++++++++++++++++ modules/nf-core/wget/tests/nextflow.config | 6 ++ 7 files changed, 250 insertions(+) create mode 100644 modules/nf-core/wget/environment.yml create mode 100644 modules/nf-core/wget/main.nf create mode 100644 modules/nf-core/wget/meta.yml create mode 100644 modules/nf-core/wget/tests/main.nf.test create mode 100644 modules/nf-core/wget/tests/main.nf.test.snap create mode 100644 modules/nf-core/wget/tests/nextflow.config diff --git a/modules.json b/modules.json index 37ba5b8..6a31b58 100644 --- a/modules.json +++ b/modules.json @@ -59,6 +59,11 @@ "branch": "master", "git_sha": "447f7bc0fa41dfc2400c8cad4c0291880dc060cf", "installed_by": ["modules"] + }, + "wget": { + "branch": "master", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", + "installed_by": ["modules"] } } }, diff --git a/modules/nf-core/wget/environment.yml b/modules/nf-core/wget/environment.yml new file mode 100644 index 0000000..9eb304e --- /dev/null +++ b/modules/nf-core/wget/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::wget=1.21.4 diff --git a/modules/nf-core/wget/main.nf b/modules/nf-core/wget/main.nf new file mode 100644 index 0000000..9bc6f15 --- /dev/null +++ b/modules/nf-core/wget/main.nf @@ -0,0 +1,48 @@ +process WGET { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/3b/3b54fa9135194c72a18d00db6b399c03248103f87e43ca75e4b50d61179994b3/data': + 'community.wave.seqera.io/library/wget:1.21.4--8b0fcde81c17be5e' }" + + input: + tuple val(meta), val(url) + + output: + tuple val(meta), path("${prefix}.${suffix}"), emit: outfile + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.suffix ?: 'html' + """ + wget \\ + -O - \\ + $args \\ + $url \\ + > ${prefix}.${suffix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + wget: \$(wget --version | head -1 | cut -d ' ' -f 3) + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.suffix ?: 'html' + """ + touch ${prefix}.${suffix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + wget: \$(wget --version | head -1 | cut -d ' ' -f 3) + END_VERSIONS + """ +} diff --git a/modules/nf-core/wget/meta.yml b/modules/nf-core/wget/meta.yml new file mode 100644 index 0000000..56df0af --- /dev/null +++ b/modules/nf-core/wget/meta.yml @@ -0,0 +1,52 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "wget" +description: The non-interactive network downloader +keywords: + - "wget" + - "download" + - "network" +tools: + - "wget": + description: "wget is a free utility for non-interactive download of files from + the Web." + homepage: "https://www.gnu.org/software/wget/" + documentation: "https://www.gnu.org/software/wget/manual/wget.html" + licence: ["GPL"] + identifier: "" + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1' ]` + - url: + type: string + description: URL to download + pattern: "^https?://*.*" + +output: + outfile: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1' ]` + - ${prefix}.${suffix}: + type: file + description: Downloaded file + pattern: "*.*" + + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@itrujnara" +maintainers: + - "@itrujnara" diff --git a/modules/nf-core/wget/tests/main.nf.test b/modules/nf-core/wget/tests/main.nf.test new file mode 100644 index 0000000..e094288 --- /dev/null +++ b/modules/nf-core/wget/tests/main.nf.test @@ -0,0 +1,62 @@ +// nf-core modules test wget +nextflow_process { + + name "Test Process WGET" + script "../main.nf" + process "WGET" + + tag "modules" + tag "modules_nfcore" + tag "wget" + + test("sarscov2 - gff") { + + config "./nextflow.config" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + "https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/modules/data/genomics/sarscov2/genome/genome.gff3", + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - gff - stub") { + + options "-stub" + + config "./nextflow.config" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + "https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/modules/data/genomics/sarscov2/genome/genome.gff3", + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/wget/tests/main.nf.test.snap b/modules/nf-core/wget/tests/main.nf.test.snap new file mode 100644 index 0000000..6c05160 --- /dev/null +++ 
b/modules/nf-core/wget/tests/main.nf.test.snap @@ -0,0 +1,70 @@ +{ + "sarscov2 - gff": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.gff3:md5,357435a81a9981a0128e840ebe11051e" + ] + ], + "1": [ + "versions.yml:md5,a747f72db5fc051f64676a0ba6f32f35" + ], + "outfile": [ + [ + { + "id": "test" + }, + "test.gff3:md5,357435a81a9981a0128e840ebe11051e" + ] + ], + "versions": [ + "versions.yml:md5,a747f72db5fc051f64676a0ba6f32f35" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.10.4" + }, + "timestamp": "2025-03-26T12:27:32.67617" + }, + "sarscov2 - gff - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.gff3:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,a747f72db5fc051f64676a0ba6f32f35" + ], + "outfile": [ + [ + { + "id": "test", + "single_end": false + }, + "test.gff3:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,a747f72db5fc051f64676a0ba6f32f35" + ] + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.10.4" + }, + "timestamp": "2025-03-26T12:21:06.414955" + } +} \ No newline at end of file diff --git a/modules/nf-core/wget/tests/nextflow.config b/modules/nf-core/wget/tests/nextflow.config new file mode 100644 index 0000000..236f4e1 --- /dev/null +++ b/modules/nf-core/wget/tests/nextflow.config @@ -0,0 +1,6 @@ +process { + withName: "WGET" { + ext.prefix = "test" + ext.suffix = "gff3" + } +}