From acbe9d6b803c145e38a3e1d899fcf667122f77a8 Mon Sep 17 00:00:00 2001 From: Edward Zhou Date: Wed, 3 Jun 2026 11:15:02 -0400 Subject: [PATCH] Add life sciences plugins --- .agents/plugins/marketplace.json | 24 + README.md | 14 +- .../.codex-plugin/plugin.json | 53 + plugins/life-science-research/README.md | 175 + .../life-science-research/assets/app-icon.png | Bin 0 -> 17108 bytes .../skills/alphafold-skill/SKILL.md | 41 + .../skills/alphafold-skill/agents/openai.yaml | 3 + .../alphafold-skill/scripts/rest_request.py | 291 ++ .../skills/bgee-skill/SKILL.md | 36 + .../skills/bgee-skill/agents/openai.yaml | 3 + .../bgee-skill/scripts/sparql_request.py | 200 ++ .../skills/bindingdb-skill/SKILL.md | 38 + .../skills/bindingdb-skill/agents/openai.yaml | 3 + .../bindingdb-skill/scripts/rest_request.py | 291 ++ .../skills/biobankjapan-phewas-skill/SKILL.md | 41 + .../agents/openai.yaml | 3 + .../scripts/biobankjapan_phewas.py | 226 ++ .../scripts/variant_resolution.py | 397 +++ .../skills/biorxiv-skill/SKILL.md | 40 + .../skills/biorxiv-skill/agents/openai.yaml | 3 + .../biorxiv-skill/scripts/rest_request.py | 291 ++ .../biostudies-arrayexpress-skill/SKILL.md | 38 + .../agents/openai.yaml | 3 + .../scripts/rest_request.py | 291 ++ .../skills/cbioportal-skill/SKILL.md | 39 + .../cbioportal-skill/agents/openai.yaml | 3 + .../cbioportal-skill/scripts/rest_request.py | 291 ++ .../skills/cellxgene-skill/SKILL.md | 37 + .../skills/cellxgene-skill/agents/openai.yaml | 3 + .../cellxgene-skill/scripts/rest_request.py | 291 ++ .../skills/chebi-skill/SKILL.md | 38 + .../skills/chebi-skill/agents/openai.yaml | 3 + .../chebi-skill/scripts/rest_request.py | 291 ++ .../skills/chembl-skill/SKILL.md | 40 + .../skills/chembl-skill/agents/openai.yaml | 3 + .../chembl-skill/scripts/rest_request.py | 291 ++ .../skills/civic-skill/SKILL.md | 35 + .../skills/civic-skill/agents/openai.yaml | 3 + .../civic-skill/scripts/civic_graphql.py | 154 + .../skills/clinicaltrials-skill/SKILL.md | 40 + 
.../clinicaltrials-skill/agents/openai.yaml | 3 + .../scripts/clinicaltrials_client.py | 230 ++ .../skills/clinvar-variation-skill/SKILL.md | 43 + .../agents/openai.yaml | 3 + .../scripts/clinvar_variation.py | 193 + .../skills/efo-ontology-skill/SKILL.md | 39 + .../efo-ontology-skill/agents/openai.yaml | 3 + .../scripts/rest_request.py | 291 ++ .../skills/encode-skill/SKILL.md | 39 + .../skills/encode-skill/agents/openai.yaml | 3 + .../encode-skill/scripts/rest_request.py | 291 ++ .../skills/ensembl-skill/SKILL.md | 39 + .../skills/ensembl-skill/agents/openai.yaml | 3 + .../ensembl-skill/scripts/rest_request.py | 291 ++ .../skills/epigraphdb-skill/SKILL.md | 39 + .../epigraphdb-skill/agents/openai.yaml | 3 + .../epigraphdb-skill/scripts/rest_request.py | 291 ++ .../skills/eqtl-catalogue-skill/SKILL.md | 40 + .../eqtl-catalogue-skill/agents/openai.yaml | 3 + .../scripts/rest_request.py | 373 ++ .../scripts/test_rest_request.py | 95 + .../skills/eva-skill/SKILL.md | 37 + .../skills/eva-skill/agents/openai.yaml | 3 + .../skills/eva-skill/scripts/rest_request.py | 291 ++ .../skills/finngen-phewas-skill/SKILL.md | 41 + .../finngen-phewas-skill/agents/openai.yaml | 3 + .../scripts/finngen_phewas.py | 223 ++ .../scripts/variant_resolution.py | 397 +++ .../genebass-gene-burden-skill/SKILL.md | 36 + .../agents/openai.yaml | 3 + .../scripts/genebass_gene_burden.py | 287 ++ .../skills/gnomad-graphql-skill/SKILL.md | 38 + .../gnomad-graphql-skill/agents/openai.yaml | 3 + .../scripts/gnomad_graphql.py | 154 + .../skills/gtex-eqtl-skill/SKILL.md | 99 + .../skills/gtex-eqtl-skill/agents/openai.yaml | 3 + .../gtex-eqtl-skill/scripts/gtex_eqtl.py | 146 + .../scripts/variant_resolution.py | 397 +++ .../skills/gwas-catalog-skill/SKILL.md | 41 + .../gwas-catalog-skill/agents/openai.yaml | 3 + .../scripts/rest_request.py | 291 ++ .../skills/hmdb-skill/SKILL.md | 38 + .../skills/hmdb-skill/agents/openai.yaml | 3 + .../skills/hmdb-skill/scripts/rest_request.py | 291 ++ 
.../skills/human-protein-atlas-skill/SKILL.md | 40 + .../agents/openai.yaml | 3 + .../scripts/rest_request.py | 291 ++ .../skills/ipd-skill/SKILL.md | 38 + .../skills/ipd-skill/agents/openai.yaml | 3 + .../skills/ipd-skill/scripts/rest_request.py | 291 ++ .../locus-to-gene-mapper-skill/SKILL.md | 353 ++ .../agents/openai.yaml | 3 + .../scripts/map_locus_to_gene.py | 2209 ++++++++++++ .../scripts/test_map_locus_to_gene.py | 150 + .../skills/metabolights-skill/SKILL.md | 37 + .../metabolights-skill/agents/openai.yaml | 3 + .../scripts/rest_request.py | 291 ++ .../skills/mgnify-skill/SKILL.md | 37 + .../skills/mgnify-skill/agents/openai.yaml | 3 + .../mgnify-skill/scripts/rest_request.py | 291 ++ .../skills/ncbi-blast-skill/SKILL.md | 62 + .../ncbi-blast-skill/agents/openai.yaml | 3 + .../references/blast-common-url-api.txt | 75 + .../references/intent-notes.txt | 7 + .../ncbi-blast-skill/scripts/ncbi_blast.py | 765 ++++ .../scripts/test_ncbi_blast.py | 464 +++ .../skills/ncbi-clinicaltables-skill/SKILL.md | 42 + .../agents/openai.yaml | 3 + .../scripts/ncbi_gene_clinicaltables.py | 146 + .../skills/ncbi-datasets-skill/SKILL.md | 39 + .../ncbi-datasets-skill/agents/openai.yaml | 3 + .../scripts/ncbi_datasets.py | 233 ++ .../skills/ncbi-entrez-skill/SKILL.md | 43 + .../ncbi-entrez-skill/agents/openai.yaml | 3 + .../ncbi-entrez-skill/references/geo.md | 22 + .../ncbi-entrez-skill/scripts/ncbi_entrez.py | 300 ++ .../skills/ncbi-pmc-skill/SKILL.md | 37 + .../skills/ncbi-pmc-skill/agents/openai.yaml | 3 + .../skills/ncbi-pmc-skill/scripts/ncbi_pmc.py | 256 ++ .../skills/opentargets-skill/SKILL.md | 59 + .../opentargets-skill/agents/openai.yaml | 3 + .../scripts/opentargets_disease_heatmap.py | 344 ++ .../scripts/opentargets_graphql.py | 154 + .../skills/pharmgkb-skill/SKILL.md | 38 + .../skills/pharmgkb-skill/agents/openai.yaml | 3 + .../pharmgkb-skill/scripts/rest_request.py | 291 ++ .../skills/pride-skill/SKILL.md | 37 + .../skills/pride-skill/agents/openai.yaml | 3 + 
.../pride-skill/scripts/rest_request.py | 291 ++ .../skills/proteomexchange-skill/SKILL.md | 39 + .../proteomexchange-skill/agents/openai.yaml | 3 + .../scripts/rest_request.py | 291 ++ .../skills/pubchem-pug-skill/SKILL.md | 38 + .../pubchem-pug-skill/agents/openai.yaml | 3 + .../pubchem-pug-skill/scripts/rest_request.py | 291 ++ .../skills/quickgo-skill/SKILL.md | 39 + .../skills/quickgo-skill/agents/openai.yaml | 3 + .../quickgo-skill/scripts/rest_request.py | 291 ++ .../skills/rcsb-pdb-skill/SKILL.md | 38 + .../skills/rcsb-pdb-skill/agents/openai.yaml | 3 + .../rcsb-pdb-skill/scripts/rest_request.py | 291 ++ .../skills/reactome-skill/SKILL.md | 39 + .../skills/reactome-skill/agents/openai.yaml | 3 + .../reactome-skill/scripts/rest_request.py | 291 ++ .../skills/research-router-skill/SKILL.md | 145 + .../research-router-skill/agents/openai.yaml | 4 + .../skills/rhea-skill/SKILL.md | 37 + .../skills/rhea-skill/agents/openai.yaml | 3 + .../skills/rhea-skill/scripts/rest_request.py | 291 ++ .../skills/rnacentral-skill/SKILL.md | 39 + .../rnacentral-skill/agents/openai.yaml | 3 + .../rnacentral-skill/scripts/rest_request.py | 291 ++ .../skills/string-skill/SKILL.md | 41 + .../skills/string-skill/agents/openai.yaml | 3 + .../string-skill/scripts/rest_request.py | 291 ++ .../skills/tpmi-phewas-skill/SKILL.md | 41 + .../tpmi-phewas-skill/agents/openai.yaml | 3 + .../tpmi-phewas-skill/scripts/tpmi_phewas.py | 228 ++ .../scripts/variant_resolution.py | 397 +++ .../skills/ukb-topmed-phewas-skill/SKILL.md | 41 + .../agents/openai.yaml | 3 + .../scripts/ukb_topmed_phewas.py | 228 ++ .../scripts/variant_resolution.py | 397 +++ .../skills/uniprot-skill/SKILL.md | 41 + .../skills/uniprot-skill/agents/openai.yaml | 3 + .../uniprot-skill/scripts/rest_request.py | 291 ++ .../ngs-analysis/.codex-plugin/plugin.json | 53 + plugins/ngs-analysis/README.md | 458 +++ plugins/ngs-analysis/assets/app-icon.png | Bin 0 -> 39140 bytes .../references/database-registry.json | 100 + 
.../references/intake-schema.json | 135 + .../references/pipeline-registry.json | 762 ++++ .../references/reference-registry.json | 71 + .../references/run-envelope-schema.json | 55 + .../references/runtime-install-guidance.md | 85 + .../scripts/ngs_epigenomics_utils.py | 627 ++++ .../ngs-analysis/scripts/ngs_planner_utils.py | 74 + plugins/ngs-analysis/scripts/ngs_preflight.py | 414 +++ .../scripts/ngs_reference_manager.py | 1616 +++++++++ .../ngs-analysis/scripts/ngs_resource_gate.py | 162 + plugins/ngs-analysis/scripts/ngs_run_utils.py | 586 +++ .../scripts/ngs_visualization_utils.py | 779 ++++ .../scripts/run_amplicon_microbiome.py | 1208 +++++++ .../scripts/run_atacseq_peaks_qc.py | 732 ++++ .../ngs-analysis/scripts/run_bcl_to_fastq.py | 823 +++++ .../scripts/run_bulk_rnaseq_counts_qc.py | 1055 ++++++ .../scripts/run_bulk_rnaseq_de.py | 707 ++++ .../scripts/run_chip_cutrun_peaks_qc.py | 781 ++++ .../scripts/run_dna_germline_variants.py | 632 ++++ .../scripts/run_dna_somatic_variants.py | 844 +++++ .../scripts/run_dna_umi_panel_variants.py | 997 ++++++ .../scripts/run_dna_variant_calling.py | 1064 ++++++ .../scripts/run_fastq_assay_package.py | 3137 +++++++++++++++++ plugins/ngs-analysis/scripts/run_fastq_qc.py | 1259 +++++++ .../scripts/run_nfcore_pipeline.py | 529 +++ .../scripts/run_scrnaseq_fastq_to_count.py | 863 +++++ .../scripts/run_scrnaseq_post_count_qc.py | 1342 +++++++ .../scripts/run_shotgun_metagenomics.py | 1377 ++++++++ .../skills/ngs-amplicon-microbiome/SKILL.md | 99 + .../agents/openai.yaml | 4 + .../skills/ngs-analysis-router/SKILL.md | 97 + .../ngs-analysis-router/agents/openai.yaml | 4 + .../skills/ngs-atacseq-peaks-qc/SKILL.md | 79 + .../ngs-atacseq-peaks-qc/agents/openai.yaml | 4 + .../skills/ngs-bcl-to-fastq/SKILL.md | 91 + .../ngs-bcl-to-fastq/agents/openai.yaml | 4 + .../skills/ngs-bulk-rnaseq-counts-qc/SKILL.md | 70 + .../agents/openai.yaml | 4 + .../SKILL.md | 74 + .../agents/openai.yaml | 4 + .../skills/ngs-bulk-rnaseq/SKILL.md | 102 
+ .../skills/ngs-bulk-rnaseq/agents/openai.yaml | 4 + .../skills/ngs-chip-cutrun-peaks-qc/SKILL.md | 80 + .../agents/openai.yaml | 4 + .../skills/ngs-dna-germline-variants/SKILL.md | 79 + .../agents/openai.yaml | 4 + .../skills/ngs-dna-somatic-variants/SKILL.md | 67 + .../agents/openai.yaml | 4 + .../ngs-dna-umi-panel-variants/SKILL.md | 66 + .../agents/openai.yaml | 4 + .../skills/ngs-dna-variant-calling/SKILL.md | 88 + .../agents/openai.yaml | 4 + .../skills/ngs-epigenomics-peaks/SKILL.md | 80 + .../ngs-epigenomics-peaks/agents/openai.yaml | 4 + .../ngs-analysis/skills/ngs-fastq-qc/SKILL.md | 127 + .../skills/ngs-fastq-qc/agents/openai.yaml | 4 + .../skills/ngs-runtime-env/SKILL.md | 91 + .../skills/ngs-runtime-env/agents/openai.yaml | 4 + .../skills/ngs-scrna-seq/SKILL.md | 88 + .../skills/ngs-scrna-seq/agents/openai.yaml | 4 + .../skills/ngs-shotgun-metagenomics/SKILL.md | 103 + .../agents/openai.yaml | 4 + .../ngs-analysis/skills/scrna-seq-qc/SKILL.md | 97 + .../skills/scrna-seq-qc/agents/openai.yaml | 4 + .../qc-annotation-umap-heuristics.md | 184 + .../tests/test_bcl_to_fastq_runner.py | 164 + .../test_bulk_rnaseq_counts_qc_runner.py | 160 + .../tests/test_new_backend_planners.py | 1025 ++++++ .../ngs-analysis/tests/test_ngs_preflight.py | 77 + .../test_scrnaseq_post_count_qc_runner.py | 68 + .../amplicon_microbiome/run_dada2_backend.R | 223 ++ .../bulk_rnaseq_counts_qc/Snakefile.smk | 110 + .../aggregate_salmon_quant.py | 178 + .../run_bulk_de.R | 455 +++ .../scrnaseq_fastq_to_count/Snakefile.smk | 63 + .../run_star_genome_generate.py | 67 + .../scrnaseq_fastq_to_count/run_starsolo.py | 125 + 247 files changed, 49337 insertions(+), 5 deletions(-) create mode 100644 plugins/life-science-research/.codex-plugin/plugin.json create mode 100644 plugins/life-science-research/README.md create mode 100644 plugins/life-science-research/assets/app-icon.png create mode 100644 plugins/life-science-research/skills/alphafold-skill/SKILL.md create mode 100644 
plugins/life-science-research/skills/alphafold-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/alphafold-skill/scripts/rest_request.py create mode 100644 plugins/life-science-research/skills/bgee-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/bgee-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/bgee-skill/scripts/sparql_request.py create mode 100644 plugins/life-science-research/skills/bindingdb-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/bindingdb-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/bindingdb-skill/scripts/rest_request.py create mode 100644 plugins/life-science-research/skills/biobankjapan-phewas-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/biobankjapan-phewas-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/biobankjapan-phewas-skill/scripts/biobankjapan_phewas.py create mode 100644 plugins/life-science-research/skills/biobankjapan-phewas-skill/scripts/variant_resolution.py create mode 100644 plugins/life-science-research/skills/biorxiv-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/biorxiv-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/biorxiv-skill/scripts/rest_request.py create mode 100644 plugins/life-science-research/skills/biostudies-arrayexpress-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/biostudies-arrayexpress-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/biostudies-arrayexpress-skill/scripts/rest_request.py create mode 100644 plugins/life-science-research/skills/cbioportal-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/cbioportal-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/cbioportal-skill/scripts/rest_request.py create mode 100644 
plugins/life-science-research/skills/cellxgene-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/cellxgene-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/cellxgene-skill/scripts/rest_request.py create mode 100644 plugins/life-science-research/skills/chebi-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/chebi-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/chebi-skill/scripts/rest_request.py create mode 100644 plugins/life-science-research/skills/chembl-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/chembl-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/chembl-skill/scripts/rest_request.py create mode 100644 plugins/life-science-research/skills/civic-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/civic-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/civic-skill/scripts/civic_graphql.py create mode 100644 plugins/life-science-research/skills/clinicaltrials-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/clinicaltrials-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/clinicaltrials-skill/scripts/clinicaltrials_client.py create mode 100644 plugins/life-science-research/skills/clinvar-variation-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/clinvar-variation-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/clinvar-variation-skill/scripts/clinvar_variation.py create mode 100644 plugins/life-science-research/skills/efo-ontology-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/efo-ontology-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/efo-ontology-skill/scripts/rest_request.py create mode 100644 plugins/life-science-research/skills/encode-skill/SKILL.md create mode 100644 
plugins/life-science-research/skills/encode-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/encode-skill/scripts/rest_request.py create mode 100644 plugins/life-science-research/skills/ensembl-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/ensembl-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/ensembl-skill/scripts/rest_request.py create mode 100644 plugins/life-science-research/skills/epigraphdb-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/epigraphdb-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/epigraphdb-skill/scripts/rest_request.py create mode 100644 plugins/life-science-research/skills/eqtl-catalogue-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/eqtl-catalogue-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/eqtl-catalogue-skill/scripts/rest_request.py create mode 100644 plugins/life-science-research/skills/eqtl-catalogue-skill/scripts/test_rest_request.py create mode 100644 plugins/life-science-research/skills/eva-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/eva-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/eva-skill/scripts/rest_request.py create mode 100644 plugins/life-science-research/skills/finngen-phewas-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/finngen-phewas-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/finngen-phewas-skill/scripts/finngen_phewas.py create mode 100644 plugins/life-science-research/skills/finngen-phewas-skill/scripts/variant_resolution.py create mode 100644 plugins/life-science-research/skills/genebass-gene-burden-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/genebass-gene-burden-skill/agents/openai.yaml create mode 100644 
plugins/life-science-research/skills/genebass-gene-burden-skill/scripts/genebass_gene_burden.py create mode 100644 plugins/life-science-research/skills/gnomad-graphql-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/gnomad-graphql-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/gnomad-graphql-skill/scripts/gnomad_graphql.py create mode 100644 plugins/life-science-research/skills/gtex-eqtl-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/gtex-eqtl-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/gtex-eqtl-skill/scripts/gtex_eqtl.py create mode 100644 plugins/life-science-research/skills/gtex-eqtl-skill/scripts/variant_resolution.py create mode 100644 plugins/life-science-research/skills/gwas-catalog-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/gwas-catalog-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/gwas-catalog-skill/scripts/rest_request.py create mode 100644 plugins/life-science-research/skills/hmdb-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/hmdb-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/hmdb-skill/scripts/rest_request.py create mode 100644 plugins/life-science-research/skills/human-protein-atlas-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/human-protein-atlas-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/human-protein-atlas-skill/scripts/rest_request.py create mode 100644 plugins/life-science-research/skills/ipd-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/ipd-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/ipd-skill/scripts/rest_request.py create mode 100644 plugins/life-science-research/skills/locus-to-gene-mapper-skill/SKILL.md create mode 100644 
plugins/life-science-research/skills/locus-to-gene-mapper-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/locus-to-gene-mapper-skill/scripts/map_locus_to_gene.py create mode 100644 plugins/life-science-research/skills/locus-to-gene-mapper-skill/scripts/test_map_locus_to_gene.py create mode 100644 plugins/life-science-research/skills/metabolights-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/metabolights-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/metabolights-skill/scripts/rest_request.py create mode 100644 plugins/life-science-research/skills/mgnify-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/mgnify-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/mgnify-skill/scripts/rest_request.py create mode 100644 plugins/life-science-research/skills/ncbi-blast-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/ncbi-blast-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/ncbi-blast-skill/references/blast-common-url-api.txt create mode 100644 plugins/life-science-research/skills/ncbi-blast-skill/references/intent-notes.txt create mode 100644 plugins/life-science-research/skills/ncbi-blast-skill/scripts/ncbi_blast.py create mode 100644 plugins/life-science-research/skills/ncbi-blast-skill/scripts/test_ncbi_blast.py create mode 100644 plugins/life-science-research/skills/ncbi-clinicaltables-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/ncbi-clinicaltables-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/ncbi-clinicaltables-skill/scripts/ncbi_gene_clinicaltables.py create mode 100644 plugins/life-science-research/skills/ncbi-datasets-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/ncbi-datasets-skill/agents/openai.yaml create mode 100644 
plugins/life-science-research/skills/ncbi-datasets-skill/scripts/ncbi_datasets.py create mode 100644 plugins/life-science-research/skills/ncbi-entrez-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/ncbi-entrez-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/ncbi-entrez-skill/references/geo.md create mode 100644 plugins/life-science-research/skills/ncbi-entrez-skill/scripts/ncbi_entrez.py create mode 100644 plugins/life-science-research/skills/ncbi-pmc-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/ncbi-pmc-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/ncbi-pmc-skill/scripts/ncbi_pmc.py create mode 100644 plugins/life-science-research/skills/opentargets-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/opentargets-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/opentargets-skill/scripts/opentargets_disease_heatmap.py create mode 100644 plugins/life-science-research/skills/opentargets-skill/scripts/opentargets_graphql.py create mode 100644 plugins/life-science-research/skills/pharmgkb-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/pharmgkb-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/pharmgkb-skill/scripts/rest_request.py create mode 100644 plugins/life-science-research/skills/pride-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/pride-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/pride-skill/scripts/rest_request.py create mode 100644 plugins/life-science-research/skills/proteomexchange-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/proteomexchange-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/proteomexchange-skill/scripts/rest_request.py create mode 100644 plugins/life-science-research/skills/pubchem-pug-skill/SKILL.md create 
mode 100644 plugins/life-science-research/skills/pubchem-pug-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/pubchem-pug-skill/scripts/rest_request.py create mode 100644 plugins/life-science-research/skills/quickgo-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/quickgo-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/quickgo-skill/scripts/rest_request.py create mode 100644 plugins/life-science-research/skills/rcsb-pdb-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/rcsb-pdb-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/rcsb-pdb-skill/scripts/rest_request.py create mode 100644 plugins/life-science-research/skills/reactome-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/reactome-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/reactome-skill/scripts/rest_request.py create mode 100644 plugins/life-science-research/skills/research-router-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/research-router-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/rhea-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/rhea-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/rhea-skill/scripts/rest_request.py create mode 100644 plugins/life-science-research/skills/rnacentral-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/rnacentral-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/rnacentral-skill/scripts/rest_request.py create mode 100644 plugins/life-science-research/skills/string-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/string-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/string-skill/scripts/rest_request.py create mode 100644 
plugins/life-science-research/skills/tpmi-phewas-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/tpmi-phewas-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/tpmi-phewas-skill/scripts/tpmi_phewas.py create mode 100644 plugins/life-science-research/skills/tpmi-phewas-skill/scripts/variant_resolution.py create mode 100644 plugins/life-science-research/skills/ukb-topmed-phewas-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/ukb-topmed-phewas-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/ukb-topmed-phewas-skill/scripts/ukb_topmed_phewas.py create mode 100644 plugins/life-science-research/skills/ukb-topmed-phewas-skill/scripts/variant_resolution.py create mode 100644 plugins/life-science-research/skills/uniprot-skill/SKILL.md create mode 100644 plugins/life-science-research/skills/uniprot-skill/agents/openai.yaml create mode 100644 plugins/life-science-research/skills/uniprot-skill/scripts/rest_request.py create mode 100644 plugins/ngs-analysis/.codex-plugin/plugin.json create mode 100644 plugins/ngs-analysis/README.md create mode 100644 plugins/ngs-analysis/assets/app-icon.png create mode 100644 plugins/ngs-analysis/references/database-registry.json create mode 100644 plugins/ngs-analysis/references/intake-schema.json create mode 100644 plugins/ngs-analysis/references/pipeline-registry.json create mode 100644 plugins/ngs-analysis/references/reference-registry.json create mode 100644 plugins/ngs-analysis/references/run-envelope-schema.json create mode 100644 plugins/ngs-analysis/references/runtime-install-guidance.md create mode 100644 plugins/ngs-analysis/scripts/ngs_epigenomics_utils.py create mode 100644 plugins/ngs-analysis/scripts/ngs_planner_utils.py create mode 100755 plugins/ngs-analysis/scripts/ngs_preflight.py create mode 100644 plugins/ngs-analysis/scripts/ngs_reference_manager.py create mode 100644 plugins/ngs-analysis/scripts/ngs_resource_gate.py 
create mode 100644 plugins/ngs-analysis/scripts/ngs_run_utils.py create mode 100644 plugins/ngs-analysis/scripts/ngs_visualization_utils.py create mode 100644 plugins/ngs-analysis/scripts/run_amplicon_microbiome.py create mode 100644 plugins/ngs-analysis/scripts/run_atacseq_peaks_qc.py create mode 100644 plugins/ngs-analysis/scripts/run_bcl_to_fastq.py create mode 100644 plugins/ngs-analysis/scripts/run_bulk_rnaseq_counts_qc.py create mode 100644 plugins/ngs-analysis/scripts/run_bulk_rnaseq_de.py create mode 100644 plugins/ngs-analysis/scripts/run_chip_cutrun_peaks_qc.py create mode 100644 plugins/ngs-analysis/scripts/run_dna_germline_variants.py create mode 100644 plugins/ngs-analysis/scripts/run_dna_somatic_variants.py create mode 100644 plugins/ngs-analysis/scripts/run_dna_umi_panel_variants.py create mode 100644 plugins/ngs-analysis/scripts/run_dna_variant_calling.py create mode 100644 plugins/ngs-analysis/scripts/run_fastq_assay_package.py create mode 100755 plugins/ngs-analysis/scripts/run_fastq_qc.py create mode 100644 plugins/ngs-analysis/scripts/run_nfcore_pipeline.py create mode 100755 plugins/ngs-analysis/scripts/run_scrnaseq_fastq_to_count.py create mode 100644 plugins/ngs-analysis/scripts/run_scrnaseq_post_count_qc.py create mode 100644 plugins/ngs-analysis/scripts/run_shotgun_metagenomics.py create mode 100644 plugins/ngs-analysis/skills/ngs-amplicon-microbiome/SKILL.md create mode 100644 plugins/ngs-analysis/skills/ngs-amplicon-microbiome/agents/openai.yaml create mode 100644 plugins/ngs-analysis/skills/ngs-analysis-router/SKILL.md create mode 100644 plugins/ngs-analysis/skills/ngs-analysis-router/agents/openai.yaml create mode 100644 plugins/ngs-analysis/skills/ngs-atacseq-peaks-qc/SKILL.md create mode 100644 plugins/ngs-analysis/skills/ngs-atacseq-peaks-qc/agents/openai.yaml create mode 100644 plugins/ngs-analysis/skills/ngs-bcl-to-fastq/SKILL.md create mode 100644 plugins/ngs-analysis/skills/ngs-bcl-to-fastq/agents/openai.yaml create mode 100644 
plugins/ngs-analysis/skills/ngs-bulk-rnaseq-counts-qc/SKILL.md create mode 100644 plugins/ngs-analysis/skills/ngs-bulk-rnaseq-counts-qc/agents/openai.yaml create mode 100644 plugins/ngs-analysis/skills/ngs-bulk-rnaseq-differential-expression/SKILL.md create mode 100644 plugins/ngs-analysis/skills/ngs-bulk-rnaseq-differential-expression/agents/openai.yaml create mode 100644 plugins/ngs-analysis/skills/ngs-bulk-rnaseq/SKILL.md create mode 100644 plugins/ngs-analysis/skills/ngs-bulk-rnaseq/agents/openai.yaml create mode 100644 plugins/ngs-analysis/skills/ngs-chip-cutrun-peaks-qc/SKILL.md create mode 100644 plugins/ngs-analysis/skills/ngs-chip-cutrun-peaks-qc/agents/openai.yaml create mode 100644 plugins/ngs-analysis/skills/ngs-dna-germline-variants/SKILL.md create mode 100644 plugins/ngs-analysis/skills/ngs-dna-germline-variants/agents/openai.yaml create mode 100644 plugins/ngs-analysis/skills/ngs-dna-somatic-variants/SKILL.md create mode 100644 plugins/ngs-analysis/skills/ngs-dna-somatic-variants/agents/openai.yaml create mode 100644 plugins/ngs-analysis/skills/ngs-dna-umi-panel-variants/SKILL.md create mode 100644 plugins/ngs-analysis/skills/ngs-dna-umi-panel-variants/agents/openai.yaml create mode 100644 plugins/ngs-analysis/skills/ngs-dna-variant-calling/SKILL.md create mode 100644 plugins/ngs-analysis/skills/ngs-dna-variant-calling/agents/openai.yaml create mode 100644 plugins/ngs-analysis/skills/ngs-epigenomics-peaks/SKILL.md create mode 100644 plugins/ngs-analysis/skills/ngs-epigenomics-peaks/agents/openai.yaml create mode 100644 plugins/ngs-analysis/skills/ngs-fastq-qc/SKILL.md create mode 100644 plugins/ngs-analysis/skills/ngs-fastq-qc/agents/openai.yaml create mode 100644 plugins/ngs-analysis/skills/ngs-runtime-env/SKILL.md create mode 100644 plugins/ngs-analysis/skills/ngs-runtime-env/agents/openai.yaml create mode 100644 plugins/ngs-analysis/skills/ngs-scrna-seq/SKILL.md create mode 100644 plugins/ngs-analysis/skills/ngs-scrna-seq/agents/openai.yaml create 
mode 100644 plugins/ngs-analysis/skills/ngs-shotgun-metagenomics/SKILL.md create mode 100644 plugins/ngs-analysis/skills/ngs-shotgun-metagenomics/agents/openai.yaml create mode 100644 plugins/ngs-analysis/skills/scrna-seq-qc/SKILL.md create mode 100644 plugins/ngs-analysis/skills/scrna-seq-qc/agents/openai.yaml create mode 100644 plugins/ngs-analysis/skills/scrna-seq-qc/references/qc-annotation-umap-heuristics.md create mode 100644 plugins/ngs-analysis/tests/test_bcl_to_fastq_runner.py create mode 100644 plugins/ngs-analysis/tests/test_bulk_rnaseq_counts_qc_runner.py create mode 100644 plugins/ngs-analysis/tests/test_new_backend_planners.py create mode 100644 plugins/ngs-analysis/tests/test_ngs_preflight.py create mode 100644 plugins/ngs-analysis/tests/test_scrnaseq_post_count_qc_runner.py create mode 100644 plugins/ngs-analysis/workflows/amplicon_microbiome/run_dada2_backend.R create mode 100644 plugins/ngs-analysis/workflows/bulk_rnaseq_counts_qc/Snakefile.smk create mode 100644 plugins/ngs-analysis/workflows/bulk_rnaseq_counts_qc/aggregate_salmon_quant.py create mode 100644 plugins/ngs-analysis/workflows/bulk_rnaseq_differential_expression/run_bulk_de.R create mode 100644 plugins/ngs-analysis/workflows/scrnaseq_fastq_to_count/Snakefile.smk create mode 100644 plugins/ngs-analysis/workflows/scrnaseq_fastq_to_count/run_star_genome_generate.py create mode 100644 plugins/ngs-analysis/workflows/scrnaseq_fastq_to_count/run_starsolo.py diff --git a/.agents/plugins/marketplace.json b/.agents/plugins/marketplace.json index 28515ab..060b8c2 100644 --- a/.agents/plugins/marketplace.json +++ b/.agents/plugins/marketplace.json @@ -51,6 +51,30 @@ "authentication": "ON_USE" }, "category": "Business" + }, + { + "name": "ngs-analysis", + "source": { + "source": "local", + "path": "./plugins/ngs-analysis" + }, + "policy": { + "installation": "AVAILABLE", + "authentication": "ON_USE" + }, + "category": "Research" + }, + { + "name": "life-science-research", + "source": { + "source": 
"local", + "path": "./plugins/life-science-research" + }, + "policy": { + "installation": "AVAILABLE", + "authentication": "ON_USE" + }, + "category": "Research" } ] } diff --git a/README.md b/README.md index e9a700c..7be1e22 100644 --- a/README.md +++ b/README.md @@ -3,10 +3,10 @@ Role-based plugins make Codex easier to customize for a team's day-to-day work. These templates package domain-specific skills, connector bindings, and starter assets so teams can adapt Codex for roles like sales, data analytics, product -design, and financial markets. They were built with OpenAI subject matter -experts around workflows that are already helping teams move faster internally -and with alpha partners. Over the coming weeks, we'll continue expanding this -collection with more roles, workflows, and examples. +design, financial markets, and life sciences research. They were built with +OpenAI subject matter experts around workflows that are already helping teams +move faster internally and with alpha partners. Over the coming weeks, we'll +continue expanding this collection with more roles, workflows, and examples. The plugins are intended to be customized before use. Connector-backed plugins may include placeholder app and connector ids that must be replaced with ids @@ -20,6 +20,8 @@ available to the target workspace. | [Data Analytics](./plugins/data-analytics) | Query, visualize, explain, and validate datasets; build dashboards; and investigate metrics. | Databricks, Snowflake, BigQuery, Hex, Amplitude, Mixpanel, Statsig, Metabase, ThoughtSpot, Google Drive, Slack, Microsoft 365, and more | | [Product Design](./plugins/product-design) | Create product specs, prototypes, UI critiques, and product design artifacts. | Sites | | [Financial Markets](./plugins/financial-markets) | Build public-equity research, earnings analysis, valuation work, model updates, long/short pitches, risk reviews, dashboards, and investment memos. 
| FactSet, LSEG, Morningstar, Daloopa, Quartr, S&P, PitchBook, Slack, Google Drive, Gmail, SharePoint, Teams, and more | +| [Life Science Research](./plugins/life-science-research) | Synthesize evidence-backed life-sciences research across genetics, omics, biology, chemistry, structure, clinical evidence, and public dataset discovery. | None | +| [Life Sciences NGS Analysis](./plugins/ngs-analysis) | Guide sequencing intake, route BCL/FASTQ and count-matrix workflows, validate references, and prepare reproducible local run envelopes for supported NGS assays. | None | ## Repository Layout @@ -30,7 +32,9 @@ available to the target workspace. |-- sales/ |-- data-analytics/ |-- product-design/ - `-- financial-markets/ + |-- financial-markets/ + |-- life-science-research/ + `-- ngs-analysis/ ``` Each plugin generally follows this structure: diff --git a/plugins/life-science-research/.codex-plugin/plugin.json b/plugins/life-science-research/.codex-plugin/plugin.json new file mode 100644 index 0000000..da83fe1 --- /dev/null +++ b/plugins/life-science-research/.codex-plugin/plugin.json @@ -0,0 +1,53 @@ +{ + "name": "life-science-research", + "version": "1.0.0", + "description": "General life-sciences research workflows with query routing, evidence synthesis, and optional parallel subagent analysis across genetics, omics, biology, chemistry, structure, clinical evidence, and public dataset discovery.", + "author": { + "name": "OpenAI" + }, + "homepage": "https://github.com/openai/openai/tree/master/plugins/life-science-research", + "repository": "https://github.com/openai/openai/tree/master/plugins/life-science-research", + "license": "Proprietary", + "keywords": [ + "life-science", + "research", + "bioinformatics", + "human-genetics", + "functional-genomics", + "transcriptomics", + "proteomics", + "metabolomics", + "clinical-research", + "drug-discovery", + "skill-routing", + "evidence-synthesis", + "parallel-analysis", + "gwas", + "variant-interpretation", + 
"pathway-biology", + "protein-structure" + ], + "skills": "./skills/", + "interface": { + "displayName": "Life Science Research", + "shortDescription": "General life-sciences research with routing, evidence synthesis, and optional parallel subagent analysis", + "longDescription": "Internal life-science research workflows that help Codex interpret a user's research question, normalize the relevant entities, choose the right skills, and synthesize evidence-backed answers across public resources. The plugin spans human genetics, functional genomics, expression, pathways, protein structure, chemistry, pharmacology, literature, clinical evidence, and public study discovery, with a research-router entrypoint for broad tasks and optional subagent-assisted parallel work when evidence lanes are independent.", + "developerName": "OpenAI", + "category": "Research", + "capabilities": [ + "Interactive", + "Read", + "Write" + ], + "websiteURL": "https://openai.com/", + "privacyPolicyURL": "https://openai.com/policies/row-privacy-policy/", + "termsOfServiceURL": "https://openai.com/policies/row-terms-of-use/", + "defaultPrompt": [ + "Use relevant skills and databases to support life-science research tasks." + ], + "brandColor": "#166534", + "composerIcon": "./assets/app-icon.png", + "logo": "./assets/app-icon.png", + "screenshots": [] + } +} diff --git a/plugins/life-science-research/README.md b/plugins/life-science-research/README.md new file mode 100644 index 0000000..9833b98 --- /dev/null +++ b/plugins/life-science-research/README.md @@ -0,0 +1,175 @@ +# Life Science Research Plugin + +This plugin is a general life-sciences research layer for Codex. It packages a broad set of modular skills that can be composed to answer questions across human genetics, functional genomics, expression, pathway biology, protein structure, chemistry, clinical evidence, and public study discovery. + +The goal is not to force every request through one fixed workflow. 
The goal is to help Codex understand the user's research question, normalize the relevant entities, choose the smallest useful set of skills, and synthesize a concise evidence-backed answer. + +The plugin now includes a `research-router-skill` that should be treated as the default entrypoint for broad, ambiguous, or multi-step life-sciences research tasks. + +## What This Plugin Should Do + +When a user invokes this plugin, treat it as a general research copilot for life sciences: + +1. Understand the research task. + Determine whether the user is asking for gene or target background, variant interpretation, locus-to-gene prioritization, pathway context, expression profiling, structure lookup, chemistry or ligand evidence, clinical-trial landscape, literature discovery, or dataset discovery. +2. Normalize the core entities. + Resolve the gene, protein, disease, phenotype, variant, compound, tissue, cell type, species, accession, or pathway identifiers before branching into downstream lookups. +3. Route to the right skills. + Prefer the minimum number of skills needed to answer the question well. Use single-source lookups for focused questions and multi-skill chains only when the question requires synthesis. +4. Parallelize only when it helps. + If the work breaks into independent evidence lanes and Codex subagents are available, use them for bounded parallel retrieval and analysis. Keep initial scoping, entity normalization, and final synthesis with the coordinating agent. +5. Cross-check evidence across sources. + Where the answer matters, compare orthogonal evidence types instead of over-indexing on one source. +6. Synthesize for the user. + Return a concise research answer with the key evidence, important caveats, and clear next steps. Save raw payloads only when the user asks for them. 
+ +## Research Patterns + +This plugin is meant to support workflows like: + +- target and gene background research +- variant interpretation and identifier resolution +- locus-to-gene prioritization +- cohort replication and PheWAS follow-up +- expression and tissue or cell-type context +- pathway and network interpretation +- protein, structure, and function lookup +- chemistry, ligand, and pharmacology research +- clinical, translational, and cancer evidence review +- literature, preprint, and public dataset discovery +- metabolomics, proteomics, and microbiome context gathering + +## Entry Point + +- `research-router-skill`: the default orchestration layer for broad life-sciences questions. It classifies the request, normalizes entities, selects downstream skills, decides whether parallel subagents are useful, and synthesizes the final answer. + +## Skill Families + +The plugin currently bundles 50 skills. The most useful way to think about them is by research area rather than as a flat list. 
+ +### Human Genetics And Variant Evidence + +- `opentargets-skill` +- `gwas-catalog-skill` +- `clinvar-variation-skill` +- `gnomad-graphql-skill` +- `ensembl-skill` +- `eva-skill` +- `epigraphdb-skill` +- `genebass-gene-burden-skill` +- `gtex-eqtl-skill` +- `eqtl-catalogue-skill` +- `locus-to-gene-mapper-skill` +- `finngen-phewas-skill` +- `ukb-topmed-phewas-skill` +- `biobankjapan-phewas-skill` +- `tpmi-phewas-skill` + +### Expression, Cell Context, And Functional Genomics + +- `bgee-skill` +- `human-protein-atlas-skill` +- `cellxgene-skill` +- `encode-skill` +- `rnacentral-skill` + +### Protein, Structure, Pathway, And Functional Biology + +- `alphafold-skill` +- `rcsb-pdb-skill` +- `uniprot-skill` +- `string-skill` +- `quickgo-skill` +- `reactome-skill` +- `rhea-skill` + +### Chemistry, Metabolites, And Pharmacology + +- `bindingdb-skill` +- `chembl-skill` +- `pubchem-pug-skill` +- `chebi-skill` +- `pharmgkb-skill` +- `hmdb-skill` + +### Clinical, Translational, And Disease Evidence + +- `clinicaltrials-skill` +- `cbioportal-skill` +- `civic-skill` +- `ipd-skill` + +### Literature, Search, And Public Study Discovery + +- `ncbi-entrez-skill` +- `ncbi-pmc-skill` +- `biorxiv-skill` +- `biostudies-arrayexpress-skill` +- `ncbi-datasets-skill` +- `ncbi-blast-skill` +- `ncbi-clinicaltables-skill` + +### Multi-Omics, Proteomics, And Specialized Data Sources + +- `pride-skill` +- `proteomexchange-skill` +- `metabolights-skill` +- `mgnify-skill` +- `efo-ontology-skill` + +## Recommended Query Strategy + +For broad or ambiguous requests, route work in this order: + +1. Clarify the objective from the user prompt. + Is the user trying to explain biology, prioritize targets, interpret a variant, find public evidence, or discover studies and datasets? +2. Resolve identifiers and ontology terms first. + Use entity-normalization and ontology skills before deeper evidence retrieval. +3. Pull evidence from the smallest relevant set of source families. 
+ For example: + genetics plus expression for target prioritization + structure plus chemistry for ligandability questions + literature plus datasets for exploratory research + clinical plus pharmacology for translational questions +4. Parallelize only if the evidence lanes are independent. + Good examples include genetics versus expression, structure versus chemistry, or literature versus clinical evidence for the same question. Avoid parallelization for narrow lookups or tightly coupled chains where every step depends on the previous one. +5. Reconcile disagreements. + Call out conflicts across datasets, ancestry limitations, tissue specificity, study design caveats, and evidence gaps. +6. End with a direct synthesis. + Answer the user's actual question instead of returning an unsorted dump of source results. + +## Subagent Guidance + +When Codex subagents are available, use them as a retrieval and analysis accelerator, not as a replacement for core reasoning. + +Use subagents when: + +- the request spans multiple evidence families that can be gathered independently +- several genes, variants, compounds, or datasets need side-by-side comparison +- a broad research brief benefits from separate lane summaries before synthesis + +Keep the coordinating agent responsible for: + +- interpreting the user request +- defining scope and analysis lanes +- resolving identifiers and canonical entities +- reconciling conflicting evidence +- writing the final synthesis + +Each subagent should receive a bounded objective and return concise findings, caveats, sources used, and any artifact paths. The final answer should present one integrated conclusion rather than a stack of disconnected sub-results. 
+ +## Example Prompts + +- `Use Life Science Research to summarize the public genetics and expression evidence linking IL6R to asthma.` +- `Find preprints, public datasets, and pathway context relevant to TREM2 in microglia.` +- `Map the most plausible causal genes at this inflammatory bowel disease locus and explain why.` +- `Summarize known structure, ligand, and pathway information for EGFR.` +- `Pull ClinicalTrials.gov, ChEMBL, and PharmGKB context for JAK inhibitors in alopecia areata.` +- `Find metabolomics and proteomics resources relevant to MASLD and PPARG.` +- `Interpret this variant using ClinVar, gnomAD, Ensembl, and cohort association evidence.` + +## Operational Notes + +The plugin does not require plugin-local app connectors or MCP servers. The bundled skills are self-contained under `plugins/life-science-research/skills/` and generally call their own scripts or public APIs directly. + +This plugin should be treated as a routing and synthesis layer over those skills. A focused question may require only one skill. A broader research question may require a short multi-skill chain, and when the work splits naturally into independent lanes, optional subagent-assisted parallel analysis before final synthesis. 
diff --git a/plugins/life-science-research/assets/app-icon.png b/plugins/life-science-research/assets/app-icon.png new file mode 100644 index 0000000000000000000000000000000000000000..a3a7ce94b70a565df5506c8db89707abdfead4d9 GIT binary patch literal 17108 zcmeIZ`9GBV8$W)Hu||{>NmMEdp_C=dsE`zLvZO+FiZ&8uZ|-v1L@H&EIjIyOArj4$ zR6?dC*=0}m?99yfx^>R^`~%<5PoH;>$2qu{*Xw#+>+@RfE7q1~vuDZ90)W}(TesK% z@Z+!iAS{SK=!;c?@rTICt-HMdMCXzJQ6M&61}{>)Y|J)6`bW7RFU*qvCN!XH z9R=7b6VG^^R&Ou-NGmhZ=73guh<&(aOWk>3g!2PUU1yxiUWI>L!aq9GPTUt=ReVT@ z4ti#DVE8gX*JJ%wBaVIdw*1^;1;{^oaB|;BL#0wmDuunD4;T@4O&0dcsyP&fQM$s8 z!KWL5ZHt%x;JU91>$l5n`Kmx9)Lu3(7 zH5TZ->UQ=m+S z{@C8rO5nH>Z?z9du-H53C!6$7-_=_LYFSipE3dW7Zziv|H*84Ft;_+p7(rl%RQXg+ z6f9o_45>4GpoXp{tkjCr8`|=Ac-x1nvce>sL2F*6a zE7{Vnn#$OJ2H&eayEgr=A|nv6u-#jF&KI<{185ySy-v+6q#cN-VY$+;ZDrN{@r)+T^2hdq87!GOT(FkJ=(moO~pF#`R&#f62skVC%jPV%=0Wi#K0 zk0FSgznpfX5St&}?6>(%3;Ab}dBk|<%-hkaiZ49Z0TG-U7{xb&n|Y0t9z-Qz?cb${ zf|5X5>|GO6hJ;zB076gqOvnp#qt13WckqFDSI^*wF=cB}e8^{2OF2^9;tyXiLXR&HawwNJnzwUSA5q4p}y<>_!;$6SSqUi9P)fQmd5L@PkH#2hS?;VS*@e(G5)_TJ=r)^kyX>Vvb5?@Y8~E zHN}sHwhapx3IGv+_ItrK70t*P6+_c+sY~n|A1ch+5HQ-mIhRNK)dL`Vx;6QT=WwZS z(l%5r2JOYFgsgE1w_vp3hx)TLV%}I(jt^P_ZU2K@UWU#z^#p2NSgBFqnN%HVu)cn) zr7qcOmpzrS0(Y428g8#$WJ7~lJ9o6aNVc+zqB2ZSc4>|c+e<6ySJDMQ%r>OjNoTj3 z5UmtR{rMMwSY(kAdnwxP*Bx}3LcQ6bD#-6*JygcT%PY`+UF*MXZ{0cHV>@rZtPl)4 zG>&?FS75Iv@3d7@R<8QEBdMgC3jS{X15FPfG#wJ>(WFt_k!uyS+@ljv5(*}ehsNm9)(z(p#poo z->0TFY0s)OBeoFU(dMUsx9R+hKJ@8+^c^sa^mi>b#uGklu|(b~&CSporv@$mCr<33 z|D|T1G#04mE0bDHkSLS<#26SZt9QDzTs!ig$gpO zQDbh&iNBdQB0w=q$y>|OQGl|dx9%ZcJE@lf8C${yXvU*Q<8y3vJx)BJr*BCGuU3Y* zhEWn}qk>lC|3LsX6M#nHCx2K2h5R8!Bme)uTmRz*sRsYk4I0UY|Cz)8%z;#c|2yX3 zwaa$U1F@DKf#6{(R~SX**iMU2%YoL)ekz%1QCKe#?};y|;8+<&@(~XZ zdG{c&6jlV78vbghm$mWAA`0!8%^Q^&P(YYr%J5Cdiw6y!e@eMBJqz*|g1#jA^pkZV zdpQb1kXCo+O-K+xxFx{A<-=aQRgqL8UB<&`p*qY`WKy|jvJI>~g8MO{dNalU|&&z>04ulgCM;;Ir#0KS)V(!5{Q8`0E5 zbHmFQ#5rOj>2-e|ARU5^XCywn1x&pd1f!VZx&p1 zFDRU&Lb29R1X^Ze?KamlN}h_KY#QR*K76J7C`h4JNtGZoo?i=X>CwJsuT5oWAog0; 
z8~Qqe4i{=?q(Qra7|HV=Vc3M_CWhjYGPvZ<(G=cK3T+UDz73@qa^q)0ooLp5dMgl@ zH=6wsAH@9=6CD8!K9D+?(Wfg8d9Ir3f8|Jk@&y3p*O(`|^eGTiG9C(XY8M4$^c8{C zT;9Po^E@aEdEY6`FHMsixM{6b$0!;um?^N4;RBa(Gr-%31vYFwy!iOqZ! z-L2xpIplS4)|rjulNt+>jfZkTMg}tnWxqb7`+0J3QpaMH0-FyCfrd=hbKYK#&y)l! z9z`d1MG!9qRA`4$p<}tN=ZPSLnpIS>^8hk`>5nBQdd!08jKR^@sxE?E2FSJkv=a~R zduL2aKu+|;~|&fE=aRN74|>jD&@ENmkuxvzld-^ zrJt!Q>_M7r;w8Y%v{C4!8u)(@2DU{&6>kKHi#3RfZ<9C_RV2ITa^s%R z78}Iu>;#i3APg~ID}4Iw_L?nZ)_f)mt%2_d(TKttzWAID`ELz@VTFQMZ;Ndo-Bd4( zIdXpc$4^qOiAK9m@YCLr#O?PV*B+CgNvk!cp;#sn`Fc10YXn^kDNk|@iBLq}8{(!E zio2u<%iZ}w zEFSl$L+%~JV^~LTG&A+=Tl6k9j>6_cK^!|{dLxyGRT3C1>BAPhw-jjqKkkk_2bWEHaEXpWpOfjE z`xJE}62PtOc%M(5$j_E2(5V3gD+O(HZ>3^X{_+GCuB;a8TmO83X$l|!wfHm@@`>U#k(8I9BnE(DO z4UjkgHw-;ICkWPyF45sz;;v$J?X-Q!6*bb)J^rMF$+JVeU|03;Npm4gmHx`)4i`#bBT26NIG%OUG@ zZq?M&(K4W0s}bu>{H&#@G2+KFCYkb>Q=_(vZ^n4IHGSzn!3W$v+(}yX(UAVMR+tD{ z%Lj7roo0K=ArC(&;On_M7hIf?ZAp2Wv!PxeXjlL9 zc>|a6z(FNBn~`~LO@B(-dub>@IC^aTr&VJ&Q6OE{ieK11=4UCQC5Qis^)D-TbY}Nt2(UI|wMNJ>Nc)b}sx?Bu7P1Ia;ToZ%~Kb^(4)is!f59n|C z*USVhFf6+F$0VZo^tYt+AX=CQECRjMbQvb}ftl#^G>)5s3*+mCN}GIpju80%Sy>uv zZ<+WoT~I@Z4`i00VX*e2`WOAl4}xwOcOv?ANO<=T*b~Ddr{Ky8ju|O|lP6ny1*X8o zEfjZ|fn?w!j=S!k3oIIHI2=9M`Cj>BP^<6hbwuMjKA>i<3$Y;OSytWBx7?0=bPllK zF;=~A9KhpscoYO3Kl2HO(dl*RA}O?a_zZU*(8AmG8vhAEjI%!19_UO>V-<~XLQxoO zFqn?R&0?Ryz4Y6}I13_KR49*oW1+6Ur%;}j zfpU=O-8Fa-tI6!xoqJ2AB`L#rO#j2b1(AKi(|@U_8sRh-$!L4dBjGV*dmK5fvQLDe z-2+4dI@8W>W*r?EoBy>gwx2Ap=S|ey!74lGt+XAb(42*VA%ucFdj=6`zUy%X7!Jl` z=KX69pm)rF_u%gUq-O!J7?A*U%|k7%3kBMx-l`wy?%C9T0uWiVV(;|pqm%V$;Al2F zJ&}NX!~ovx{{j$A0=2baIS|)0QK7_GchKRD%{V;0B7Csyqw-AY88|t4Gv>wEgKI8D z#ganz%ci&XDz$EdJNx-T@4$@911h=+Z!93|EvasO_D=xPQ2jsRuvhJYStjpjSMf+!|B;8?t;(*D;j>`)$8f8zmAeyg0` zE_xjtmjV&_8*Rh=m|m(dV?H0az4{ks5U<;UAazgH)j4EyfL8Qh2vTm^e)-IO>S}+-brpr;k9parr_lhz!anbkPFMb5h7~A-_&y4~iY}g143c3T3FEa$U zi-Mr#lF_(99F0+F2VEsCBe|HY%oDM_Rke|P=nNNnsK40;IjEWI{g0{Ksk!q88wAaw z_b>l8GqMu>}Od4pz9LunG1`a$#J|ESHo3hH31L&znUzaDz 
zsz(Ov)}?Z>G7Lh8v7ncTMx$I^^{+5k`|EXMTY7#erG`PrYhft+#z$`V3xSrJn*Ktgpuw^89otuWp7#Gf z^dJZ!vne!RM*?%QJ=qU4g8Da7;DqmF25(CB@`IgMeNr7nnyvZar10;&R@3c^{R3@Y z_{4VsnuV!CkPLG$m?<24d)ddw?)OzFa?$9Flrh)j2~J2J{r0+vcS7~$ggq|#p~=i@ z!tiM0@-^uQtwDSXawz*?AJcQaSq_Zuz(}1v>8?lOP1%ulg~CVpRHr`oV_nut2-{Qp;yPs% zvDlPLA9L|@r-?FWUN{hIJ~ZtNXJ*aSQgMEdPIL;xM$os^xUgVT0|U-}a-S%_`HaO! zChLx4f>6RwM)GjCjKj~Qv{u{mexbcT)t_}e@zH(HnSj}R1G^-1+x%mm^Ew`J5eoa+ z?>H-e{jFMqzxqD0W>FE;EA&WzRvuT`#iMk^;k(X4jNP_xZj(Jekl%(8-lengGt-l; zj0ktK%PF8d6;6Dc$>;`7VwdUiOfWT0B41yZ1ZgExkiFsiQhpeKtZwFWp4~&-*2k3M z=+$Tlz8VcJW6rrj1-hx*j0vg`Wd(X58(dD(&P2w?`>YEGJ1-sRbOWB`) z+mOnQyUNcH^Q#UE(pHOkQ%u%Fh}R;Z+=w9hf}?}z%hAGqez>(f7w~+4N>%qz8Qy%D zYLhvmhj}_TU`T@a+*ZWgNqlX)P*Kf?i8#aIclM}diG<&5*CeK=)zTas8@8Dmt%}4v zyS&@^_92ZEJ2^rOZ(U)d6p`v{=6m@md(aHVgrD5EP!fSSJcRd+~97iKxfp6W~JGOSLNgic1Zi}4E@%~xpt;tw|FWori zWd6BMmb{~%1idNC(!UAO%=s}p$6(+3Q^zg78#|QIJBc&xqIaGEN? zpzYi!tUow~9AIN|RQpkR%>`7jwz$xF?rRYMBR`LN`besbQZx|62_8c)by=#^SS2Bb zOCWCuP6=hQggKqXSWmhw(d9DBad)k_sl+!`jQOO}TwNUsCzpgXe?6Oj~1F@Upu4j%z=m2!5s37MgFEAcrw-4)C3#ULv7D>=VAML5@W|GeB%wGl?Mt;eb zXrqzaTUy9S!lDU-Oek8}to3P;XiPMNA!XXx#YALiODqz20SIG4i4jm*k6$6K-cr5J}I z^75X=MDqD>2U_U{bvX@0}+NoAwKN?v5H0gi0DU zsBeS?6!3;VBy zS@thvh0@*|=_T0V3GC4LoEwFnyi8a?_1}V&po79BnH?LDQ-pnOwcClPTv+H_rrs~2KELti2x(Oa=N(!jX~8J6Mve5Fmx5nur@Y5Ek>K?rHiVlN~e-Gv(KMHI-<9BW#P#>^^`5$L*XzIQqCcV<$iBxY=^o(lKy?FF(2EOtBQl zHZ*)@P}lcW5#o9Fol{G&2!3HF;r>tAl)zTFhJ9B4XXWXGRTL}pK?)W%X0ez4<7Yay z0&8!~RKFmKyqPhi%*xDB5bCucOW7=a5r~eulCCChg5@0aJJZ|E*?LHwOC#xY zWv^Sxc&L}HX>4ZkgPcS9(tl`YteM#TeY!y7&XXCPX(sUQ)`Tgm4L8qRL?Kxx2Yzyq zVk}Cztm1KE#q7O^ELrDzLYH|gTP&3J2Og~ZuoPp@86~oYBD54-nlQ(^r=u_xw?SnH12HumS zN@X0MP3g^~2APF!|+PqU3d>`&rLX5EZ`KjE8r?5Uspx}|zq2hfktoisp>=p3h z?YnX83~C)MYI^d+SnHtAhVzyV1$&CO%{;-FDR)?}=)RnM0@$60wexHFcsE+k&=TJYEQZr-{| zOreSN4@_G$&-|sfp5>K9J%Z|rnt-AHK(+`;h(g;=Vp(ji8)J0$tRk>@B=4K%v^`?1`Pii1q!PyBFUY9iu>G%pN~ ztzME$mcl4#?rK@mOyyW$xMUU6vDS9yWX8%d%rL~Rpj3C06*!5I6Aic5hD?=SJ>_|Ub#Jy*IMl*?3qcuh5lkOuf2h*NEv 
zkOYfw9PlG!hRIxz>gZE`Ip3YPaj89z0^1_?fZ6&b8<#$pQN4zvdYOK3^69id1GWHfSly`@)1v*;79*wrh-=lRu$bMrv#Tf@#k z6}KuhpS9#G>S*YTvf<(m&~Pc~rKg^{ePZYB+0cCB_Zt*^kG2Vt4zr{*p8YvScA5 zxEVB-Byf*~Bb#YKL#bd&srB@!_8q(~3>WW(7zxXgA$WY2~A}~5nap;qp;<7iG$7lG>IQ@@hbjVPH&Hg=!KBcs&}gW9_;;2HQ(Je<66ut^nd-ADGe#XlH6 z{9v%_!UQk>(KQMTS7)Sg`elNK_ggDog#dG`>)#fn-8prrtQR>*Z!hAt#PQZWth#Fe z97X7gshOsS6f-ugK8OX%u`hA-`Cm%q3GQ4N4xQlnwL_tCr+a<91aZE zGG28lrW~x3dEXwzB$ou30o%y+eC{%tMGKdibsVz5mf*Lfe7_>p@mEZDJa zk|~I@oG06gcqD!tsX1#SN+u8nL*LZvxz#1u<#6#0rF==Poe#rTfMK@sySn|IFL>08 zO5qgi#48t1We9`+o5=FXZ<3;*vHfgkRjTZ6n?@x*;;bg0^}MY807WaX>D3Zet|jho zjK$pr@bzc(QJl1OF_;*mOMmHn|K8*f7J2<&^RfKZ#%-|m`?wwk&a=A-m9zD7tKmZ)lndI78hbz87hM&WN^vciKwb48wP`;!|OfC;=cfzrA zLJ5BU`Zi^Ra~&U=hmsAelwZnwCtw%%rXX$KwOL4fh-X`p#Q9iX5r#A}+1fS%9$!T& zX}Vf)biKHq6|f~_VZGn$D7>awz-OK6$@b$TG8Q$#unGp;VG1suaJJ5K4oJGf@eMqJPn$M&&OOFTbG*`-k4j%k~Bebku- zZnX(V6d3voC^l;;w0AqM3FAvHPFOjd^JO!YF~tvyqfGhm8i?Z}rx7~hRC3VFiDJpC zFlz~P>3HFo43wc?!3~Nm)~LV0(G&b9Ff;#H9IYxRs*I(C1HJkj?5Z14F)$UWnSMQE<%t~;E0}mZ~j-+So1b@y%u{}1Y;MS{Yqx%T*wVaWf_LEaq zKd|OfPBKXM%s#HQ4i;Im_yvI>#~Z5hq~R+2?2GR(HoU~;K71caK75;Vh-6IOY7ywh z-VFKR0gBcITrsCraNNJFUJS@b``FC8#Tkm zsyEJe^Mi)tcxzBWH!vq?!1nVgS{9ZcBR~Zi!?cP?+mCBdYZ$(VUjbaSqB73%5i{Jb ziKsHQPSqFfzzVj@x9LVI`k&+cIW{8rUb+PR*97D2+o%`VX!=7TezF&(5N)V4veB56 zO+rBydCSSIpjedy({q$eq=MVSHf*D&M1+C6x^|R~HwdYDC=komciNo;*P7C++Q7AO4?W$F7 zWp||Mr035)QOc8zSiP}y5^YkNYqNf(I(2&DQ4{tHCRa60*KwN|4Za(@1n@P7luwU0 zs&k!CA+7rN^wg_H-7Z`TT~`{eFn8_vPznYMaoX6(ofl^x7H>-~H_wUBEkTKb{`p-m z{Z|3nJRf0b?rGo_x&YOPnaA-cUaStom(ag`ese19K?Aqd=t{}!OjMSsdT#FP?@Tka zR-472z|GO}27R(c8$FKu6zXT+C{y5e=Z63jU@N|GmW1r@AFnbMe3FfkyA30a^yv!Y zUpFm2U<+*ceNA#o0sl!ww=aKwFY)aU=Pk!mKPV{hdv{@n3b*_cl7vrf-!^!domU`c zeP0agF+;C-91reJcEPUV^3H_Rf?Zz~(H%b8^!jULV!^ONC|ZPf*+yPJPdYp4DLf&A zPL>L|h3!O}a^K1EXpjgdw=N?!@EesKt3PYNDj|vjZVf`oXZMUS551T-n|O&&SjTRy zt-Zx`8fYO0Qz(y_U^&W*l@b@IlYQ^ChUxw zQa>30dkyDi2&H|P0`C6FVnwtvx33*|cyZo7Yu$+98`wkqXt+M$XsbUvW%~F1kG)3R zi1Db5vj%Prcm7aH{b}5fAz*b)UX6$@zpqd1CaVS41OP@#F7XmMeOHaAK5yYLGw^`x 
z%~xyhyYWXC?^iO9xf12KLyr66z74Nm4OPQW{K=kBB(codeYf7UPer{CQ-fqkD^K`$>*IARkPB@J{SMsnK_C0?;k8@Vp>JVPC2clY;tV- z{EU|ai>gYEGjQeRw^o%cH=N)4-*XW2|K2q`kQl`;D+-vKI9q%WqMzoSee}DW@YDSp5*=fNz*)N(8Fl=Ku+?mGp%WN`ZPV}E| ze32E`^0gAqSviQkKPdX6 zIF+2b4Q{ESu$1gPl5b<)fZoISee27J@RpvM&XkSDgIX~DrZ1}H&=kJmvShS_yO=u> zjZL|-QA%6!l1jQVQq{HIix-H^Dre0pq1U(sbmau%*k6S!E*#WQyFDs(HO~0}Tiv#R z``Tld(_{-ms=SO!rPR{OF9SLFg1=(_a4ko}Nt`VENND*jTk~MGJBC`eUsf0|k~=sD z3tDc^q)tkz`z^%B_11VyD->Jc4Gp}pnn%6)$Cjmd-TRw+LExzG{le1CA~3!wYmQ6o zN`HC8Y5A`HyT?9=R@h|-1!9N{U!HvBg>C#ilA}Jpec>~;w8OXWnv=q&{XBe$7g5zt znC%kisU%Z&u6m~eoTeN;}n?3^HYd5!*=qiOEB#`e1cs)5d5 z_37nmdoIyC%&CN9y}$OpG*eF`iwKI5=XzMGe|7pGeVpR892gsNH&nK#kdttrfAgqh zlflUG)0*Trc1LskH{uLe`Zx54AmLhRL{^+PmLjW~Ukw%Tl#5GezOFoqYiN!lwXDv) zO?TwE~v7hYq&?6O@{LT$% z<|)2GLp$S7k0;NTh3svMkknYQKKxfK`+mu+6hJ(6-2n;`fr6cJ9r1Anh1A z{mFz6^iJRaT=MEQ3-OGMO4W%ZA?zWYUxJ%keh}iB2l@m z3{q=FYzy)?Uh$~wv%>tXYw;8^&n-rm?vLev4-w>rTXe8dA@<0|kQMNy>{X&Mmd<4q zgea9fTc+D?+~AU3acb3aC$9llW$bN!3>(SzZM&k=Uxl=fmfO(7+$0R}Fs5WznUyVa6ozFmTYBOe5x2rL;j3i=o_R?S?ATQ~h@M*3Hgwj~%9-oSwn% z!{CKYrP!b~Qf%~c(G5MxKlao8mpqS=m`eJg#;gl^h-s4J$(irp-e^f(mgzLom5C1O zSP}dhGZE#!sGK!8i1=aL^wiKT9^FVs#7P&atCd=TNF4QihLzE~ zN3q`, `uniprot/summary/.json`, `sequence/summary`, and `annotations/.json`. +- Keep sequence-style inputs compact and prefer rerunning instead of copying prior output back into context. + +## Input +- Read one JSON object from stdin. 
+- Required fields: `base_url`, `path` +- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common AlphaFold patterns: + - `{"base_url":"https://alphafold.ebi.ac.uk/api","path":"prediction/Q5VSL9"}` + - `{"base_url":"https://alphafold.ebi.ac.uk/api","path":"uniprot/summary/Q5VSL9.json"}` + - `{"base_url":"https://alphafold.ebi.ac.uk/api","path":"annotations/Q5VSL9.json","params":{"type":"MUTAGEN"},"max_items":3}` + +## Output +- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records` or a compact `summary`. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` such as `invalid_json`, `invalid_input`, `network_error`, or `invalid_response`. + +## Execution +```bash +echo '{"base_url":"https://alphafold.ebi.ac.uk/api","path":"prediction/Q5VSL9"}' | python scripts/rest_request.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. 
diff --git a/plugins/life-science-research/skills/alphafold-skill/agents/openai.yaml b/plugins/life-science-research/skills/alphafold-skill/agents/openai.yaml new file mode 100644 index 0000000..6e63d26 --- /dev/null +++ b/plugins/life-science-research/skills/alphafold-skill/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "AlphaFold" + short_description: "Fetch AlphaFold structure summaries" diff --git a/plugins/life-science-research/skills/alphafold-skill/scripts/rest_request.py b/plugins/life-science-research/skills/alphafold-skill/scripts/rest_request.py new file mode 100644 index 0000000..4de3cf1 --- /dev/null +++ b/plugins/life-science-research/skills/alphafold-skill/scripts/rest_request.py @@ -0,0 +1,291 @@ +#!/usr/bin/env python3 +"""Generic compact REST client for ChatGPT-imported skills.""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path +from typing import Any + +try: + import requests +except ImportError as exc: # pragma: no cover + requests = None + REQUESTS_IMPORT_ERROR = exc +else: + REQUESTS_IMPORT_ERROR = None + + +def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: + return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} + + +def _require_object(name: str, value: Any) -> dict[str, Any]: + if value is None: + return {} + if not isinstance(value, dict): + raise ValueError(f"`{name}` must be an object.") + return value + + +def _require_bool(name: str, value: Any, default: bool) -> bool: + if value is None: + return default + if not isinstance(value, bool): + raise ValueError(f"`{name}` must be a boolean.") + return value + + +def _require_int(name: str, value: Any, default: int) -> int: + if value is None: + return default + if not isinstance(value, int) or value <= 0: + raise ValueError(f"`{name}` must be a positive integer.") + return value + + +def _require_str(name: str, value: Any, required: bool = False) -> str | 
None: + if value is None: + if required: + raise ValueError(f"`{name}` is required.") + return None + if not isinstance(value, str) or not value.strip(): + raise ValueError(f"`{name}` must be a non-empty string.") + return value.strip() + + +def _service_name(base_url: str) -> str: + host = base_url.split("://", 1)[-1].split("/", 1)[0] + return host.replace(".", "-") + + +def _build_url(base_url: str, path: str) -> str: + if path.startswith(("http://", "https://")): + return path + return base_url.rstrip("/") + "/" + path.lstrip("/") + + +def _get_by_path(value: Any, path: str) -> Any: + current = value + for part in path.split("."): + if isinstance(current, list): + if not part.isdigit(): + raise ValueError(f"`record_path` segment {part!r} must be a list index.") + index = int(part) + if index >= len(current): + raise ValueError(f"`record_path` index {index} is out of range.") + current = current[index] + elif isinstance(current, dict): + if part not in current: + raise ValueError(f"`record_path` key {part!r} was not present in the response.") + current = current[part] + else: + raise ValueError(f"`record_path` segment {part!r} could not be applied.") + return current + + +def _infer_target(data: Any) -> tuple[str | None, Any]: + if isinstance(data, list): + return "$", data + if isinstance(data, dict): + embedded = data.get("_embedded") + if isinstance(embedded, dict): + for key, value in embedded.items(): + if isinstance(value, list): + return f"_embedded.{key}", value + for key in ( + "collection", + "results", + "structures", + "activities", + "molecules", + "mechanisms", + "records", + "items", + ): + value = data.get(key) + if isinstance(value, list): + return key, value + return None, data + + +def _compact(value: Any, max_items: int, max_depth: int) -> Any: + if isinstance(value, str): + return value if len(value) <= 240 else value[:240] + "..." + if max_depth <= 0: + if isinstance(value, (dict, list)): + return "..." 
def _save_raw_output(
    raw_output: str, raw_output_path: str | None, base_url: str, suffix: str
) -> str:
    """Write the raw response text to disk and return the path used.

    Args:
        raw_output: Text to persist.
        raw_output_path: Caller-supplied destination, or ``None`` to use a
            default under ``/tmp`` derived from the service host name.
        base_url: Base URL of the service; only used to build the default name.
        suffix: File extension (without dot) for the default name.

    Returns:
        The path the text was written to, as a string.

    Raises:
        OSError: If the directory cannot be created or the file cannot be written.
    """
    path = Path(raw_output_path or f"/tmp/{_service_name(base_url)}-raw.{suffix}")
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(raw_output, encoding="utf-8")
    return str(path)


def parse_input(payload: Any) -> dict[str, Any]:
    """Validate the stdin payload and return a normalized request config.

    Args:
        payload: Decoded JSON input; must be a dict with at least ``base_url``
            and ``path``.

    Returns:
        A dict with every supported option present and defaults applied.

    Raises:
        ValueError: If the payload is not an object or any field is invalid.
    """
    if not isinstance(payload, dict):
        raise ValueError("Input must be one JSON object.")
    base_url = _require_str("base_url", payload.get("base_url"), required=True)
    path = _require_str("path", payload.get("path"), required=True)
    method = (_require_str("method", payload.get("method")) or "GET").upper()
    if method not in {"GET", "POST"}:
        raise ValueError("`method` must be GET or POST.")
    json_body = payload.get("json_body")
    form_body = payload.get("form_body")
    if json_body is not None and form_body is not None:
        raise ValueError("Provide only one of `json_body` or `form_body`.")
    response_format = (
        _require_str("response_format", payload.get("response_format")) or "auto"
    ).lower()
    if response_format not in {"auto", "json", "text"}:
        raise ValueError("`response_format` must be auto, json, or text.")
    return {
        "base_url": base_url,
        "path": path,
        "method": method,
        "params": _require_object("params", payload.get("params")),
        "headers": _require_object("headers", payload.get("headers")),
        "json_body": json_body,
        "form_body": _require_object("form_body", form_body) if form_body is not None else None,
        "record_path": _require_str("record_path", payload.get("record_path")),
        "response_format": response_format,
        "max_items": _require_int("max_items", payload.get("max_items"), 5),
        "max_depth": _require_int("max_depth", payload.get("max_depth"), 3),
        "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30),
        "save_raw": _require_bool("save_raw", payload.get("save_raw"), False),
        "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")),
    }


def _json_result(config: dict[str, Any], status_code: int, data: Any) -> dict[str, Any]:
    """Build the success envelope for a decoded JSON response."""
    raw_output_path = None
    if config["save_raw"]:
        # Serialize only when the caller asked for the raw file; pretty-printing
        # a large payload just to discard it is wasted work.
        raw_output_path = _save_raw_output(
            json.dumps(data, indent=2), config["raw_output_path"], config["base_url"], "json"
        )

    record_path = config["record_path"]
    if record_path is None:
        path_used, target = _infer_target(data)
    else:
        path_used, target = record_path, _get_by_path(data, record_path)

    out: dict[str, Any] = {
        "ok": True,
        "source": _service_name(config["base_url"]),
        "path": config["path"],
        "method": config["method"],
        "status_code": status_code,
        "record_path": path_used,
        "raw_output_path": raw_output_path,
        "warnings": [],
    }
    if isinstance(target, list):
        records = target[: config["max_items"]]
        out.update(
            {
                "record_count_returned": len(records),
                "record_count_available": len(target),
                "truncated": len(records) < len(target),
                "records": _compact(records, config["max_items"], config["max_depth"]),
            }
        )
    else:
        out["summary"] = _compact(target, config["max_items"], config["max_depth"])
        if isinstance(target, dict):
            out["top_keys"] = list(target)[: config["max_items"]]
    return out


def _text_result(
    config: dict[str, Any], response: Any, content_type: str, warnings: list[str]
) -> dict[str, Any]:
    """Build the success envelope for a response summarized as plain text."""
    raw_output_path = None
    if config["save_raw"]:
        raw_output_path = _save_raw_output(
            response.text, config["raw_output_path"], config["base_url"], "txt"
        )
    text_head = response.text[:800]
    return {
        "ok": True,
        "source": _service_name(config["base_url"]),
        "path": config["path"],
        "method": config["method"],
        "status_code": response.status_code,
        "content_type": content_type,
        # When the full text was saved, point at the file instead of echoing it.
        "text_head": None if raw_output_path else text_head,
        "text_head_truncated": False
        if raw_output_path
        else len(text_head) < len(response.text),
        "raw_output_path": raw_output_path,
        "warnings": warnings,
    }


def _summarize_response(config: dict[str, Any], response: Any) -> dict[str, Any]:
    """Pick JSON or text summarization for ``response``.

    ``response_format="json"`` is strict: a body that does not decode raises
    ``ValueError``. In ``auto`` mode the JSON detection is only a heuristic
    (content type or a leading ``{``/``[``), so a body that fails to decode,
    including an empty one, is summarized as text with a warning instead.
    """
    content_type = (response.headers.get("content-type") or "").lower()
    response_format = config["response_format"]
    warnings: list[str] = []

    if response_format == "json":
        return _json_result(config, response.status_code, response.json())

    if response_format == "auto" and (
        "json" in content_type or response.text.lstrip().startswith(("{", "["))
    ):
        try:
            data = response.json()
        except ValueError:
            if response.text.strip():
                warnings.append(
                    "Response looked like JSON but could not be parsed; returning text instead."
                )
            else:
                warnings.append("Response body was empty; nothing to parse as JSON.")
        else:
            return _json_result(config, response.status_code, data)

    return _text_result(config, response, content_type, warnings)


def execute(payload: Any) -> dict[str, Any]:
    """Run one HTTP request described by ``payload`` and return a compact summary.

    Args:
        payload: Decoded JSON input; validated by ``parse_input``.

    Returns:
        A success envelope (``ok=True``) with ``records``, ``summary`` or
        ``text_head``, or a failure envelope built by ``error``.

    Raises:
        ValueError: If ``payload`` fails validation (raised by ``parse_input``
            before any request is made).
    """
    if requests is None:
        return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}")
    config = parse_input(payload)
    url = _build_url(config["base_url"], config["path"])

    request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]}
    if config["json_body"] is not None:
        request_kwargs["json"] = config["json_body"]
    if config["form_body"] is not None:
        request_kwargs["data"] = config["form_body"]

    try:
        with requests.Session() as session:
            session.headers.update(config["headers"])
            response = session.request(config["method"], url, **request_kwargs)
            response.raise_for_status()
            return _summarize_response(config, response)
    except ValueError as exc:
        # Undecodable JSON or a `record_path` that does not match the payload.
        return error("invalid_response", str(exc))
    except requests.RequestException as exc:
        return error("network_error", f"Request failed: {exc}")
    except OSError as exc:
        # RequestException derives from OSError and is handled above, so what
        # reaches this clause is in practice a failure to write the raw file.
        return error("write_error", f"Could not write raw output: {exc}")


def main() -> int:
    """Read one JSON payload from stdin, run it, and print the JSON result.

    Returns:
        Process exit code: 0 on success, 1 when the request failed, 2 when the
        input could not be parsed or validated.
    """
    try:
        payload = json.load(sys.stdin)
    except Exception as exc:  # noqa: BLE001
        sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}")))
        return 2
    try:
        output = execute(payload)
    except ValueError as exc:
        output = error("invalid_input", str(exc))
        code = 2
    else:
        code = 0 if output.get("ok") else 1
    sys.stdout.write(json.dumps(output))
    return code


if __name__ == "__main__":
    raise SystemExit(main())
diff --git a/plugins/life-science-research/skills/bgee-skill/SKILL.md b/plugins/life-science-research/skills/bgee-skill/SKILL.md new file mode 100644 index 0000000..5882216 --- /dev/null +++ b/plugins/life-science-research/skills/bgee-skill/SKILL.md @@ -0,0 +1,36 @@ +--- +name: bgee-skill +description: Submit compact Bgee SPARQL requests for healthy wild-type expression metadata and ontology-aware lookup patterns. Use when a user wants concise Bgee summaries; save raw results only on request. +--- + +## Operating rules +- Use `scripts/sparql_request.py` for all Bgee SPARQL work. +- Start with small `SELECT` or `ASK` queries and add `LIMIT` early. +- Prefer ontology-aware, healthy wild-type expression questions over broad triple dumps. +- Use `query_path` for longer SPARQL documents instead of pasting large inline queries. +- Re-run requests in long conversations instead of relying on older tool output. + +## Execution behavior +- Return concise markdown summaries from the SPARQL JSON by default. +- Return raw results only if the user explicitly asks for machine-readable output. +- Default to JSON result format unless the user explicitly asks for text output. + +## Input +- Read one JSON object from stdin. +- Required field: `query` or `query_path` +- Optional fields: `method`, `params`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common Bgee patterns: + - `{"query":"ASK {}"}` + - `{"query":"SELECT * WHERE { ?s ?p ?o } LIMIT 3","max_items":3}` + +## Output +- Success returns `ok`, `source`, a compact `summary`, and `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` such as `invalid_json`, `invalid_input`, `network_error`, or `invalid_response`. + +## Execution +```bash +echo '{"query":"ASK {}"}' | python scripts/sparql_request.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/sparql_request.py`. 
ENDPOINT = "https://www.bgee.org/sparql/"

# Media types requested from the endpoint, most preferred first.
_ACCEPT = (
    "application/sparql-results+json, application/json;q=0.9, "
    "text/tab-separated-values;q=0.8, text/plain;q=0.5"
)

# Integer options and the defaults applied when the payload omits them.
_INT_DEFAULTS = {"max_items": 5, "max_depth": 3, "timeout_sec": 60}


def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]:
    """Build the failure envelope written to stdout.

    Args:
        code: Short machine-readable error code (for example ``invalid_input``).
        message: Human-readable description of the failure.
        warnings: Optional non-fatal notes; ``None`` becomes an empty list.

    Returns:
        A dict with ``ok=False``, an ``error`` object and a ``warnings`` list.
    """
    return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []}


def _compact(value: Any, max_items: int, max_depth: int) -> Any:
    """Return a size-bounded copy of a JSON-like value.

    Strings longer than 240 characters are cut and suffixed with ``...``.
    Lists and dicts keep at most ``max_items`` entries and record how many
    were dropped. Containers below ``max_depth`` levels are replaced by
    ``"..."``.
    """
    if isinstance(value, str):
        return value if len(value) <= 240 else value[:240] + "..."
    if max_depth <= 0:
        return "..." if isinstance(value, (dict, list)) else value
    if isinstance(value, list):
        shown: list[Any] = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]]
        if len(value) > max_items:
            shown.append(f"... (+{len(value) - max_items} more)")
        return shown
    if isinstance(value, dict):
        items = list(value.items())
        kept: dict[str, Any] = {
            str(key): _compact(item, max_items, max_depth - 1) for key, item in items[:max_items]
        }
        if len(items) > max_items:
            kept["_truncated_keys"] = len(items) - max_items
        return kept
    return value


def _positive_int(payload: dict[str, Any], key: str, default: int) -> int:
    """Return ``payload[key]`` (or ``default`` if absent) as a positive integer.

    Raises:
        ValueError: If the value is not an integer greater than zero. Booleans
            are rejected explicitly because ``bool`` is a subclass of ``int``.
    """
    value = payload.get(key, default)
    if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
        raise ValueError(f"`{key}` must be a positive integer.")
    return value


def _read_query_file(query_path: str) -> str:
    """Read a SPARQL document from disk.

    Raises:
        ValueError: If the file cannot be read, is not UTF-8, or is blank.
            Read failures are converted so the caller reports them as invalid
            input instead of crashing with a traceback.
    """
    try:
        text = Path(query_path).read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        raise ValueError(f"Could not read `query_path` {query_path!r}: {exc}") from exc
    if not text.strip():
        raise ValueError("`query_path` must point to a non-empty SPARQL document.")
    return text


def parse_input(payload: Any) -> dict[str, Any]:
    """Validate the stdin payload and return a normalized request config.

    The payload is never modified. When both ``query`` and ``query_path`` are
    given, the inline ``query`` is used.

    Args:
        payload: Decoded JSON input; must be a dict containing ``query`` or
            ``query_path``.

    Returns:
        A dict with keys ``query``, ``method``, ``params``, ``response_format``,
        ``max_items``, ``max_depth``, ``timeout_sec``, ``save_raw`` and
        ``raw_output_path``.

    Raises:
        ValueError: If the payload is not an object, a field has the wrong
            type or value, or the query file cannot be read.
    """
    if not isinstance(payload, dict):
        raise ValueError("Input must be one JSON object.")
    query = payload.get("query")
    query_path = payload.get("query_path")
    if query is None and query_path is None:
        raise ValueError("Provide `query` or `query_path`.")
    if query is not None and (not isinstance(query, str) or not query.strip()):
        raise ValueError("`query` must be a non-empty string.")
    if query_path is not None and (not isinstance(query_path, str) or not query_path.strip()):
        raise ValueError("`query_path` must be a non-empty string.")

    # An explicit JSON null means "use the default", the same as an absent key.
    method_raw = payload.get("method")
    method = "GET" if method_raw is None else str(method_raw).upper()
    if method not in {"GET", "POST"}:
        raise ValueError("`method` must be GET or POST.")

    params = payload.get("params") or {}
    if not isinstance(params, dict):
        raise ValueError("`params` must be an object.")

    format_raw = payload.get("response_format")
    response_format = "auto" if format_raw is None else str(format_raw).lower()
    if response_format not in {"auto", "json", "text"}:
        raise ValueError("`response_format` must be auto, json, or text.")

    save_raw = payload.get("save_raw", False)
    if not isinstance(save_raw, bool):
        raise ValueError("`save_raw` must be a boolean.")

    raw_output_path = payload.get("raw_output_path")
    if raw_output_path is not None and (
        not isinstance(raw_output_path, str) or not raw_output_path.strip()
    ):
        raise ValueError("`raw_output_path` must be a non-empty string.")

    limits = {key: _positive_int(payload, key, default) for key, default in _INT_DEFAULTS.items()}

    query_text = query.strip() if query is not None else _read_query_file(query_path.strip())
    return {
        "query": query_text,
        "method": method,
        "params": params,
        "response_format": response_format,
        "max_items": limits["max_items"],
        "max_depth": limits["max_depth"],
        "timeout_sec": limits["timeout_sec"],
        "save_raw": save_raw,
        "raw_output_path": raw_output_path.strip() if isinstance(raw_output_path, str) else None,
    }


def _save_raw(text: str, content_type: str, raw_output_path: str | None) -> str:
    """Write the raw response body to disk and return the path used.

    Raises:
        OSError: If the directory cannot be created or the file cannot be written.
    """
    suffix = "json" if "json" in content_type else "txt"
    path = Path(raw_output_path or f"/tmp/bgee-sparql.{suffix}")
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(text, encoding="utf-8")
    return str(path)


def _summarize(data: dict[str, Any], max_items: int, max_depth: int) -> tuple[list[str], Any]:
    """Summarize a decoded SPARQL JSON results object.

    Returns:
        ``(top_keys, summary)``. For an ``ASK`` response the summary holds only
        the ``boolean``; otherwise it holds ``head``, the record counts and the
        first ``max_items`` bindings.
    """
    if "boolean" in data:
        return ["boolean"], {"boolean": data["boolean"]}

    results = data.get("results", {})
    bindings = results.get("bindings", []) if isinstance(results, dict) else []
    if not isinstance(bindings, list):
        bindings = []
    shown = bindings[:max_items]
    # Compact `head` and the records individually. Compacting the assembled
    # summary again would apply `max_items` to the summary's own keys (dropping
    # `records` whenever max_items < 5) and consume one extra depth level, which
    # turns every binding value into "..." at the default depth.
    summary = {
        "head": _compact(data.get("head", {}), max_items, max_depth),
        "record_count_returned": len(shown),
        "record_count_available": len(bindings),
        "truncated": len(bindings) > max_items,
        "records": _compact(shown, max_items, max_depth),
    }
    return list(summary), summary


def execute(payload: Any) -> dict[str, Any]:
    """Send one SPARQL query to the Bgee endpoint and return a compact summary.

    Args:
        payload: Decoded JSON input; validated by ``parse_input``.

    Returns:
        A success envelope (``ok=True``) with either ``summary``/``top_keys``
        or ``text_head``, or a failure envelope built by ``error``.

    Raises:
        ValueError: If ``payload`` fails validation (raised by ``parse_input``
            before any request is made).
    """
    if requests is None:
        return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}")
    config = parse_input(payload)
    headers = {"Accept": _ACCEPT}
    try:
        if config["method"] == "GET":
            params = dict(config["params"])
            params.setdefault("query", config["query"])
            params.setdefault("format", "json")
            response = requests.get(
                ENDPOINT, params=params, headers=headers, timeout=config["timeout_sec"]
            )
        else:
            response = requests.post(
                ENDPOINT,
                data={"query": config["query"], **config["params"]},
                headers=headers,
                timeout=config["timeout_sec"],
            )
        response.raise_for_status()
    except requests.RequestException as exc:
        return error("network_error", f"SPARQL request failed: {exc}")

    content_type = (response.headers.get("content-type") or "").lower()

    raw_output_path = None
    if config["save_raw"]:
        try:
            raw_output_path = _save_raw(response.text, content_type, config["raw_output_path"])
        except OSError as exc:
            return error("write_error", f"Could not write raw output: {exc}")

    response_format = config["response_format"]
    warnings: list[str] = []
    looks_like_json = "json" in content_type or response.text.lstrip().startswith("{")
    if response_format == "json" or (response_format == "auto" and looks_like_json):
        try:
            data = response.json()
        except ValueError as exc:
            if response_format == "json":
                return error("invalid_response", str(exc))
            # In auto mode the JSON detection is only a heuristic; fall back to text.
            warnings.append(
                "Response looked like JSON but could not be parsed; returning text instead."
            )
        else:
            if not isinstance(data, dict):
                return error(
                    "invalid_response",
                    f"Expected a SPARQL JSON results object, got {type(data).__name__}.",
                )
            top_keys, summary = _summarize(data, config["max_items"], config["max_depth"])
            return {
                "ok": True,
                "source": "bgee-sparql",
                "top_keys": top_keys,
                "summary": summary,
                "raw_output_path": raw_output_path,
                "warnings": warnings,
            }

    return {
        "ok": True,
        "source": "bgee-sparql",
        "content_type": content_type,
        # When the full text was saved, point at the file instead of echoing it.
        "text_head": None if raw_output_path else response.text[:800],
        "text_head_truncated": False if raw_output_path else len(response.text) > 800,
        "raw_output_path": raw_output_path,
        "warnings": warnings,
    }


def main() -> int:
    """Read one JSON payload from stdin, run it, and print the JSON result.

    Returns:
        Process exit code: 0 on success, 1 when the request failed, 2 when the
        input could not be parsed or validated.
    """
    try:
        payload = json.load(sys.stdin)
    except Exception as exc:  # noqa: BLE001
        sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}")))
        return 2
    try:
        output = execute(payload)
    except ValueError as exc:
        output = error("invalid_input", str(exc))
        code = 2
    else:
        code = 0 if output.get("ok") else 1
    sys.stdout.write(json.dumps(output))
    return code


if __name__ == "__main__":
    raise SystemExit(main())
/dev/null +++ b/plugins/life-science-research/skills/bindingdb-skill/SKILL.md @@ -0,0 +1,38 @@ +--- +name: bindingdb-skill +description: Submit compact BindingDB REST API requests for ligand-target binding lookups by PDB, UniProt, or similarity search. Use when a user wants concise BindingDB summaries; save raw payloads only on request. +--- + +## Operating rules +- Use `scripts/rest_request.py` for all BindingDB API calls. +- Use `base_url=https://bindingdb.org`. +- Add `response=application/json` in `params` when you want structured output; some empty-result cases may still return an empty body. +- For broad lookup endpoints, start around `max_items=10`; similarity-style queries are better with `5-10`. +- Re-run requests in long conversations instead of relying on older tool output. +- Treat displayed `...` in tool previews as UI truncation, not literal request content. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Prefer these paths: `rest/getLigandsByPDBs`, `rest/getLigandsByUniprots`, `rest/getLigandsBySmiles`, and `rest/getTargetsByCompound`. +- If the user needs the full payload, set `save_raw=true` and report the saved file path instead of pasting large response bodies into chat. + +## Input +- Read one JSON object from stdin. 
+- Required fields: `base_url`, `path` +- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common BindingDB patterns: + - `{"base_url":"https://bindingdb.org","path":"rest/getLigandsByPDBs","params":{"pdb":"1Q0L","cutoff":100,"identity":92,"response":"application/json"},"max_items":10}` + - `{"base_url":"https://bindingdb.org","path":"rest/getLigandsBySmiles","params":{"smiles":"CC(=O)OC1=CC=CC=C1C(=O)O","cutoff":0.9,"response":"application/json"},"max_items":5}` + +## Output +- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records`, a compact `summary`, or `text_head`. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"base_url":"https://bindingdb.org","path":"rest/getLigandsByPDBs","params":{"pdb":"1Q0L","cutoff":100,"identity":92,"response":"application/json"},"max_items":10}' | python scripts/rest_request.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. 
def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]:
    """Build the failure envelope written to stdout.

    Args:
        code: Short machine-readable error code (for example ``invalid_input``).
        message: Human-readable description of the failure.
        warnings: Optional non-fatal notes; ``None`` becomes an empty list.

    Returns:
        A dict with ``ok=False``, an ``error`` object and a ``warnings`` list.
    """
    return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []}


def _require_object(name: str, value: Any) -> dict[str, Any]:
    """Return ``value`` as a JSON object, treating a missing value as ``{}``.

    Raises:
        ValueError: If ``value`` is present but is not a dict.
    """
    if value is None:
        return {}
    if not isinstance(value, dict):
        raise ValueError(f"`{name}` must be an object.")
    return value


def _require_bool(name: str, value: Any, default: bool) -> bool:
    """Return ``value`` as a bool, or ``default`` when it is missing.

    Raises:
        ValueError: If ``value`` is present but is not a bool.
    """
    if value is None:
        return default
    if not isinstance(value, bool):
        raise ValueError(f"`{name}` must be a boolean.")
    return value


def _require_int(name: str, value: Any, default: int) -> int:
    """Return ``value`` as a positive int, or ``default`` when it is missing.

    Raises:
        ValueError: If ``value`` is present but is not an integer greater than
            zero. Booleans are rejected explicitly: ``bool`` is a subclass of
            ``int``, so JSON ``true`` would otherwise be accepted as ``1``.
    """
    if value is None:
        return default
    if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
        raise ValueError(f"`{name}` must be a positive integer.")
    return value


def _require_str(name: str, value: Any, required: bool = False) -> str | None:
    """Return ``value`` stripped of surrounding whitespace.

    Args:
        name: Field name used in error messages.
        value: Raw value from the payload.
        required: Whether a missing value is an error.

    Returns:
        The stripped string, or ``None`` when the value is missing and optional.

    Raises:
        ValueError: If a required value is missing, or the value is not a
            non-blank string.
    """
    if value is None:
        if required:
            raise ValueError(f"`{name}` is required.")
        return None
    if not isinstance(value, str) or not value.strip():
        raise ValueError(f"`{name}` must be a non-empty string.")
    return value.strip()


def _service_name(base_url: str) -> str:
    """Derive a short label from the host part of ``base_url`` (dots become dashes)."""
    host = base_url.split("://", 1)[-1].split("/", 1)[0]
    return host.replace(".", "-")


def _build_url(base_url: str, path: str) -> str:
    """Join ``base_url`` and ``path``; an absolute ``path`` URL is used as is."""
    if path.startswith(("http://", "https://")):
        return path
    return base_url.rstrip("/") + "/" + path.lstrip("/")


def _get_by_path(value: Any, path: str) -> Any:
    """Walk a dotted ``record_path`` through nested dicts and lists.

    Args:
        value: Decoded JSON document.
        path: Dot-separated keys and list indexes, for example ``data.items.0``.

    Returns:
        The value found at ``path``.

    Raises:
        ValueError: If a segment does not match the structure of ``value``.
    """
    current = value
    for part in path.split("."):
        if isinstance(current, list):
            # str.isdigit() alone also accepts characters such as "²" that
            # int() cannot convert, so restrict list indexes to ASCII digits.
            if not (part.isascii() and part.isdigit()):
                raise ValueError(f"`record_path` segment {part!r} must be a list index.")
            index = int(part)
            if index >= len(current):
                raise ValueError(f"`record_path` index {index} is out of range.")
            current = current[index]
        elif isinstance(current, dict):
            if part not in current:
                raise ValueError(f"`record_path` key {part!r} was not present in the response.")
            current = current[part]
        else:
            raise ValueError(f"`record_path` segment {part!r} could not be applied.")
    return current


def _infer_target(data: Any) -> tuple[str | None, Any]:
    """Guess which part of a JSON document holds the records.

    Returns:
        ``(path, target)``. ``path`` is ``"$"`` for a top-level list, the key
        of the first list found under ``_embedded`` or among a fixed set of
        common collection keys, or ``None`` when the whole document is used.
    """
    if isinstance(data, list):
        return "$", data
    if isinstance(data, dict):
        embedded = data.get("_embedded")
        if isinstance(embedded, dict):
            for key, value in embedded.items():
                if isinstance(value, list):
                    return f"_embedded.{key}", value
        for key in (
            "collection",
            "results",
            "structures",
            "activities",
            "molecules",
            "mechanisms",
            "records",
            "items",
        ):
            value = data.get(key)
            if isinstance(value, list):
                return key, value
    return None, data


def _compact(value: Any, max_items: int, max_depth: int) -> Any:
    """Return a size-bounded copy of a JSON-like value.

    Strings longer than 240 characters are cut and suffixed with ``...``.
    Lists and dicts keep at most ``max_items`` entries and record how many
    were dropped. Containers below ``max_depth`` levels are replaced by
    ``"..."``.
    """
    if isinstance(value, str):
        return value if len(value) <= 240 else value[:240] + "..."
    if max_depth <= 0:
        return "..." if isinstance(value, (dict, list)) else value
    if isinstance(value, list):
        shown: list[Any] = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]]
        if len(value) > max_items:
            shown.append(f"... (+{len(value) - max_items} more)")
        return shown
    if isinstance(value, dict):
        items = list(value.items())
        kept: dict[str, Any] = {
            str(key): _compact(item, max_items, max_depth - 1) for key, item in items[:max_items]
        }
        if len(items) > max_items:
            kept["_truncated_keys"] = len(items) - max_items
        return kept
    return value


def _save_raw_output(
    raw_output: str, raw_output_path: str | None, base_url: str, suffix: str
) -> str:
    """Write the raw response text to disk and return the path used.

    Args:
        raw_output: Text to persist.
        raw_output_path: Caller-supplied destination, or ``None`` to use a
            default under ``/tmp`` derived from the service host name.
        base_url: Base URL of the service; only used to build the default name.
        suffix: File extension (without dot) for the default name.

    Returns:
        The path the text was written to, as a string.

    Raises:
        OSError: If the directory cannot be created or the file cannot be written.
    """
    path = Path(raw_output_path or f"/tmp/{_service_name(base_url)}-raw.{suffix}")
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(raw_output, encoding="utf-8")
    return str(path)


def parse_input(payload: Any) -> dict[str, Any]:
    """Validate the stdin payload and return a normalized request config.

    Args:
        payload: Decoded JSON input; must be a dict with at least ``base_url``
            and ``path``.

    Returns:
        A dict with every supported option present and defaults applied.

    Raises:
        ValueError: If the payload is not an object or any field is invalid.
    """
    if not isinstance(payload, dict):
        raise ValueError("Input must be one JSON object.")
    base_url = _require_str("base_url", payload.get("base_url"), required=True)
    path = _require_str("path", payload.get("path"), required=True)
    method = (_require_str("method", payload.get("method")) or "GET").upper()
    if method not in {"GET", "POST"}:
        raise ValueError("`method` must be GET or POST.")
    json_body = payload.get("json_body")
    form_body = payload.get("form_body")
    if json_body is not None and form_body is not None:
        raise ValueError("Provide only one of `json_body` or `form_body`.")
    response_format = (
        _require_str("response_format", payload.get("response_format")) or "auto"
    ).lower()
    if response_format not in {"auto", "json", "text"}:
        raise ValueError("`response_format` must be auto, json, or text.")
    return {
        "base_url": base_url,
        "path": path,
        "method": method,
        "params": _require_object("params", payload.get("params")),
        "headers": _require_object("headers", payload.get("headers")),
        "json_body": json_body,
        "form_body": _require_object("form_body", form_body) if form_body is not None else None,
        "record_path": _require_str("record_path", payload.get("record_path")),
        "response_format": response_format,
        "max_items": _require_int("max_items", payload.get("max_items"), 5),
        "max_depth": _require_int("max_depth", payload.get("max_depth"), 3),
        "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30),
        "save_raw": _require_bool("save_raw", payload.get("save_raw"), False),
        "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")),
    }


def _json_result(config: dict[str, Any], status_code: int, data: Any) -> dict[str, Any]:
    """Build the success envelope for a decoded JSON response."""
    raw_output_path = None
    if config["save_raw"]:
        # Serialize only when the caller asked for the raw file; pretty-printing
        # a large payload just to discard it is wasted work.
        raw_output_path = _save_raw_output(
            json.dumps(data, indent=2), config["raw_output_path"], config["base_url"], "json"
        )

    record_path = config["record_path"]
    if record_path is None:
        path_used, target = _infer_target(data)
    else:
        path_used, target = record_path, _get_by_path(data, record_path)

    out: dict[str, Any] = {
        "ok": True,
        "source": _service_name(config["base_url"]),
        "path": config["path"],
        "method": config["method"],
        "status_code": status_code,
        "record_path": path_used,
        "raw_output_path": raw_output_path,
        "warnings": [],
    }
    if isinstance(target, list):
        records = target[: config["max_items"]]
        out.update(
            {
                "record_count_returned": len(records),
                "record_count_available": len(target),
                "truncated": len(records) < len(target),
                "records": _compact(records, config["max_items"], config["max_depth"]),
            }
        )
    else:
        out["summary"] = _compact(target, config["max_items"], config["max_depth"])
        if isinstance(target, dict):
            out["top_keys"] = list(target)[: config["max_items"]]
    return out


def _text_result(
    config: dict[str, Any], response: Any, content_type: str, warnings: list[str]
) -> dict[str, Any]:
    """Build the success envelope for a response summarized as plain text."""
    raw_output_path = None
    if config["save_raw"]:
        raw_output_path = _save_raw_output(
            response.text, config["raw_output_path"], config["base_url"], "txt"
        )
    text_head = response.text[:800]
    return {
        "ok": True,
        "source": _service_name(config["base_url"]),
        "path": config["path"],
        "method": config["method"],
        "status_code": response.status_code,
        "content_type": content_type,
        # When the full text was saved, point at the file instead of echoing it.
        "text_head": None if raw_output_path else text_head,
        "text_head_truncated": False
        if raw_output_path
        else len(text_head) < len(response.text),
        "raw_output_path": raw_output_path,
        "warnings": warnings,
    }


def _summarize_response(config: dict[str, Any], response: Any) -> dict[str, Any]:
    """Pick JSON or text summarization for ``response``.

    ``response_format="json"`` is strict: a body that does not decode raises
    ``ValueError``. In ``auto`` mode the JSON detection is only a heuristic
    (content type or a leading ``{``/``[``), so a body that fails to decode,
    including an empty one, is summarized as text with a warning instead.
    """
    content_type = (response.headers.get("content-type") or "").lower()
    response_format = config["response_format"]
    warnings: list[str] = []

    if response_format == "json":
        return _json_result(config, response.status_code, response.json())

    if response_format == "auto" and (
        "json" in content_type or response.text.lstrip().startswith(("{", "["))
    ):
        try:
            data = response.json()
        except ValueError:
            if response.text.strip():
                warnings.append(
                    "Response looked like JSON but could not be parsed; returning text instead."
                )
            else:
                warnings.append("Response body was empty; nothing to parse as JSON.")
        else:
            return _json_result(config, response.status_code, data)

    return _text_result(config, response, content_type, warnings)


def execute(payload: Any) -> dict[str, Any]:
    """Run one HTTP request described by ``payload`` and return a compact summary.

    Args:
        payload: Decoded JSON input; validated by ``parse_input``.

    Returns:
        A success envelope (``ok=True``) with ``records``, ``summary`` or
        ``text_head``, or a failure envelope built by ``error``.

    Raises:
        ValueError: If ``payload`` fails validation (raised by ``parse_input``
            before any request is made).
    """
    if requests is None:
        return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}")
    config = parse_input(payload)
    url = _build_url(config["base_url"], config["path"])

    request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]}
    if config["json_body"] is not None:
        request_kwargs["json"] = config["json_body"]
    if config["form_body"] is not None:
        request_kwargs["data"] = config["form_body"]

    try:
        with requests.Session() as session:
            session.headers.update(config["headers"])
            response = session.request(config["method"], url, **request_kwargs)
            response.raise_for_status()
            return _summarize_response(config, response)
    except ValueError as exc:
        # Undecodable JSON or a `record_path` that does not match the payload.
        return error("invalid_response", str(exc))
    except requests.RequestException as exc:
        return error("network_error", f"Request failed: {exc}")
    except OSError as exc:
        # RequestException derives from OSError and is handled above, so what
        # reaches this clause is in practice a failure to write the raw file.
        return error("write_error", f"Could not write raw output: {exc}")


def main() -> int:
    """Read one JSON payload from stdin, run it, and print the JSON result.

    Returns:
        Process exit code: 0 on success, 1 when the request failed, 2 when the
        input could not be parsed or validated.
    """
    try:
        payload = json.load(sys.stdin)
    except Exception as exc:  # noqa: BLE001
        sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}")))
        return 2
    try:
        output = execute(payload)
    except ValueError as exc:
        output = error("invalid_input", str(exc))
        code = 2
    else:
        code = 0 if output.get("ok") else 1
    sys.stdout.write(json.dumps(output))
    return code


if __name__ == "__main__":
    raise SystemExit(main())
diff --git a/plugins/life-science-research/skills/biobankjapan-phewas-skill/SKILL.md b/plugins/life-science-research/skills/biobankjapan-phewas-skill/SKILL.md new file mode 100644 index 0000000..d0bee3a --- /dev/null +++ b/plugins/life-science-research/skills/biobankjapan-phewas-skill/SKILL.md @@ -0,0 +1,41 @@ +--- +name: biobankjapan-phewas-skill +description: Fetch compact BioBank Japan PheWAS summaries for single variants by accepting rsID, GRCh38, or GRCh37 input and resolving to the required GRCh37 query. Use when a user wants concise BBJ association results for one variant +--- + +## Operating rules +- Use `scripts/biobankjapan_phewas.py` for all BioBank Japan PheWAS lookups. +- Accept exactly one of `rsid`, `grch37`, `grch38`, or `variant`; resolve to the canonical GRCh37 `chr:pos-ref-alt` query before calling BioBank Japan. +- The script accepts `max_results`; start with `max_results=10` and only increase it if the first slice is insufficient. +- Re-run the lookup in long conversations instead of relying on older tool output. +- Treat displayed `...` in tool previews as UI truncation, not literal request content. +- If the user needs the full association payload, set `save_raw=true` and report `raw_output_path` instead of pasting large arrays into chat. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Return the JSON verbatim only if the user explicitly asks for machine-readable output. +- Surface the canonical queried variant, total association count, and whether the results were truncated. +- Increase `max_results` gradually instead of asking for large association dumps in one call. + +## Input +- Read one JSON object from stdin, or a single JSON string containing the variant. 
+- Required input: exactly one of `rsid`, `grch37`, `grch38`, or `variant` +- Optional fields: `max_results`, `save_raw`, `raw_output_path`, `timeout_sec` +- Common patterns: + - `{"grch37":"10:114758349-C-T","max_results":10}` + - `{"grch38":"10:112998590-C-T","max_results":10}` + - `{"rsid":"rs7903146","max_results":10}` + - `{"variant":"10:114758349:C:T","max_results":25,"save_raw":true}` + +## Output +- Success returns `ok`, `source`, `input`, `query_variant`, `max_results_applied`, `association_count`, `association_count_total`, `truncated`, `associations`, `variant`, `variant_url`, `raw_output_path`, and `warnings`. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"grch37":"10:114758349-C-T","max_results":10}' | python scripts/biobankjapan_phewas.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file, `scripts/biobankjapan_phewas.py`, and the `scripts/variant_resolution.py` helper module that it imports. diff --git a/plugins/life-science-research/skills/biobankjapan-phewas-skill/agents/openai.yaml b/plugins/life-science-research/skills/biobankjapan-phewas-skill/agents/openai.yaml new file mode 100644 index 0000000..384ca2c --- /dev/null +++ b/plugins/life-science-research/skills/biobankjapan-phewas-skill/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "BioBank Japan PheWAS" + short_description: "Fetch BioBank Japan PheWAS associations" diff --git a/plugins/life-science-research/skills/biobankjapan-phewas-skill/scripts/biobankjapan_phewas.py b/plugins/life-science-research/skills/biobankjapan-phewas-skill/scripts/biobankjapan_phewas.py new file mode 100644 index 0000000..18e1f65 --- /dev/null +++ b/plugins/life-science-research/skills/biobankjapan-phewas-skill/scripts/biobankjapan_phewas.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python3 +"""biobankjapan-phewas + +Fetch BioBank Japan PheWAS associations for one variant input.
BBJ_BASE = "https://pheweb.jp"
USER_AGENT = "biobankjapan-phewas-skill/1.0 (+requests)"
DEFAULT_TIMEOUT_S = 20
DEFAULT_MAX_RESULTS = 10
# Characters outside this set are replaced when a variant is used in a file name.
SAFE_PATH_RE = re.compile(r"[^A-Za-z0-9._-]+")


def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]:
    """Build the failure envelope written to stdout.

    Args:
        code: Short machine-readable error code (for example ``invalid_input``).
        message: Human-readable description of the failure.
        warnings: Optional non-fatal notes; ``None`` becomes an empty list.

    Returns:
        A dict with ``ok=False``, an ``error`` object and a ``warnings`` list.
    """
    return {
        "ok": False,
        "error": {"code": code, "message": message},
        "warnings": warnings or [],
    }


def parse_input(payload: Any) -> tuple[str, str, int, bool, str | None, float]:
    """Validate the stdin payload.

    Args:
        payload: Either a bare variant string (treated as a GRCh37 variant) or
            a JSON object carrying the variant plus optional settings.

    Returns:
        ``(input_type, variant, max_results, save_raw, raw_output_path,
        timeout_sec)`` with defaults applied; ``timeout_sec`` is always a float.

    Raises:
        ValueError: If the payload has the wrong type or a field is invalid.
            Booleans are rejected for the numeric fields because ``bool`` is a
            subclass of ``int``.
    """
    if isinstance(payload, str):
        variant = payload.strip()
        if not variant:
            raise ValueError("Input variant string must not be empty.")
        return "grch37", variant, DEFAULT_MAX_RESULTS, False, None, float(DEFAULT_TIMEOUT_S)

    if not isinstance(payload, dict):
        raise ValueError("Input must be a JSON string or object.")

    input_type, variant = extract_variant_input(payload, default_build_key="grch37")

    max_results = payload.get("max_results", DEFAULT_MAX_RESULTS)
    if isinstance(max_results, bool) or not isinstance(max_results, int) or max_results <= 0:
        raise ValueError("`max_results` must be a positive integer when provided.")

    save_raw = payload.get("save_raw", False)
    if not isinstance(save_raw, bool):
        raise ValueError("`save_raw` must be a boolean when provided.")

    raw_output_path = payload.get("raw_output_path")
    if raw_output_path is not None:
        if not isinstance(raw_output_path, str) or not raw_output_path.strip():
            raise ValueError("`raw_output_path` must be a non-empty string when provided.")
        raw_output_path = raw_output_path.strip()

    timeout_sec = payload.get("timeout_sec", DEFAULT_TIMEOUT_S)
    if (
        isinstance(timeout_sec, bool)
        or not isinstance(timeout_sec, (int, float))
        or timeout_sec <= 0
    ):
        raise ValueError("`timeout_sec` must be a positive number when provided.")

    return input_type, variant, max_results, save_raw, raw_output_path, float(timeout_sec)


def fetch_bbj_variant(
    session: requests.Session,
    variant_str: str,
    timeout_sec: float,
) -> tuple[Any | None, int | None]:
    """Fetch the variant record from the BioBank Japan PheWeb API.

    Args:
        session: Open ``requests`` session used for the call.
        variant_str: Canonical variant identifier placed in the URL path.
        timeout_sec: Request timeout in seconds.

    Returns:
        ``(data, status_code)``; ``(None, 404)`` when the variant is unknown.

    Raises:
        requests.RequestException: On transport errors or non-404 HTTP errors.
        ValueError: If the response body is not valid JSON.
    """
    encoded = requests.utils.quote(variant_str, safe=":-")
    url = f"{BBJ_BASE}/api/variant/{encoded}"
    headers = {
        "Accept": "application/json",
        "User-Agent": USER_AGENT,
    }

    response = session.get(url, headers=headers, timeout=timeout_sec)
    if response.status_code == 404:
        return None, 404
    response.raise_for_status()
    try:
        data = response.json()
    except ValueError as exc:
        # Newer requests releases raise a JSONDecodeError that is also a
        # RequestException; re-raise a plain ValueError so callers that check
        # RequestException first do not report bad JSON as a network failure.
        raise ValueError(str(exc)) from exc
    return data, response.status_code


def extract_associations(data: Any) -> list[Any]:
    """Return the association list from an API payload, or ``[]`` if absent.

    A dict contributes its ``phenos`` list; a bare list is returned as is.
    """
    if isinstance(data, dict) and isinstance(data.get("phenos"), list):
        return data["phenos"]
    if isinstance(data, list):
        return data
    return []


def resolve_raw_output_path(canonical_variant: str, raw_output_path: str | None) -> Path:
    """Choose where the raw payload is written.

    Returns:
        The caller-supplied path (with ``~`` expanded), or a default under
        ``/tmp`` named after the sanitized variant.
    """
    if raw_output_path:
        return Path(raw_output_path).expanduser()

    safe_variant = SAFE_PATH_RE.sub("_", canonical_variant).strip("._") or "variant"
    return Path("/tmp") / f"biobankjapan-phewas-{safe_variant}.json"


def write_raw_json(path: Path, data: Any) -> None:
    """Serialize ``data`` as JSON to ``path``, creating parent directories.

    Raises:
        OSError: If the directory cannot be created or the file cannot be written.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(data), encoding="utf-8")


def _emit(output: dict[str, Any], code: int) -> int:
    """Write ``output`` as JSON to stdout and return ``code`` unchanged."""
    sys.stdout.write(json.dumps(output))
    return code


def main() -> int:
    """Read one payload from stdin, query BioBank Japan, and print the JSON result.

    Returns:
        Process exit code: 0 on success (including "variant not found"), 1 when
        resolution, the request or the raw-file write failed, 2 when the input
        could not be parsed or validated.
    """
    warnings: list[str] = []

    try:
        payload = json.load(sys.stdin)
    except Exception as exc:  # noqa: BLE001
        return _emit(error("invalid_json", f"Could not parse JSON input: {exc}"), 2)

    try:
        input_type, input_variant, max_results, save_raw, raw_output_path, timeout_sec = (
            parse_input(payload)
        )
    except ValueError as exc:
        return _emit(error("invalid_input", str(exc)), 2)

    try:
        resolution = resolve_query_variant(
            input_type=input_type,
            input_value=input_variant,
            target_build="GRCh37",
        )
        parsed = dict(resolution["query_variant"])
        warnings.extend(resolution["warnings"])
    except VariantResolutionError as exc:
        return _emit(error(exc.code, exc.message, exc.warnings), 1)
    except requests.RequestException as exc:
        return _emit(error("network_error", f"Variant resolution failed: {exc}"), 1)

    try:
        # The context manager closes the session (and its pooled connections)
        # on every exit path.
        with requests.Session() as session:
            data, status_code = fetch_bbj_variant(session, parsed["canonical"], timeout_sec)
    except requests.RequestException as exc:
        return _emit(error("network_error", f"BBJ request failed: {exc}"), 1)
    except ValueError as exc:
        return _emit(error("invalid_response", f"BBJ returned non-JSON: {exc}"), 1)

    variant_url = f"{BBJ_BASE}/variant/{parsed['canonical']}"
    saved_raw_output_path: str | None = None
    if save_raw and data is not None:
        raw_path = resolve_raw_output_path(parsed["canonical"], raw_output_path)
        try:
            write_raw_json(raw_path, data)
        except OSError as exc:
            return _emit(error("write_error", f"Could not write raw output: {exc}"), 1)
        saved_raw_output_path = str(raw_path)

    if status_code == 404:
        warnings.append("Variant not found in BioBank Japan PheWAS API (HTTP 404).")

    # For a 404 `data` is None, so the list is empty and `variant` stays None.
    all_associations = extract_associations(data)
    total = len(all_associations)
    associations = all_associations[:max_results]

    variant_info = None
    if isinstance(data, dict):
        variant_info = {
            "chrom": data.get("chrom"),
            "pos": data.get("pos"),
            "ref": data.get("ref"),
            "alt": data.get("alt"),
            "rsids": data.get("rsids"),
            "variant_name": data.get("variant_name"),
            "nearest_genes": data.get("nearest_genes"),
        }

    output = {
        "ok": True,
        "source": "biobank-japan",
        "input": resolution["input"],
        "query_variant": parsed,
        "max_results_applied": max_results,
        "association_count": len(associations),
        "association_count_total": total,
        "truncated": len(associations) < total,
        "associations": associations,
        "variant": variant_info,
        "variant_url": variant_url,
        "raw_output_path": saved_raw_output_path,
        "warnings": warnings,
    }
    return _emit(output, 0)


if __name__ == "__main__":
    raise SystemExit(main())
def extract_variant_input(payload: Any, *, default_build_key: str) -> tuple[str, str]:
    """Pick the single variant identifier out of a request payload.

    Args:
        payload: Either a bare variant string, or an object carrying exactly
            one non-empty string under `rsid`, `grch37`, `grch38` or `variant`.
        default_build_key: Input type assigned to a bare string and to the
            build-agnostic `variant` key.

    Returns:
        A `(input_type, input_value)` pair; the value is whitespace-stripped.

    Raises:
        ValueError: If the payload is neither a string nor an object, or if it
            does not carry exactly one usable variant field.
    """
    if isinstance(payload, str):
        return default_build_key, payload.strip()

    if not isinstance(payload, dict):
        raise ValueError("Input must be a JSON string or object.")

    provided: list[tuple[str, str]] = []
    for key in ("rsid", "grch37", "grch38", "variant"):
        value = payload.get(key)
        if isinstance(value, str) and value.strip():
            provided.append((key, value.strip()))

    if not provided:
        # The accepted fourth key is literally `variant`; the default build only
        # says how that key is interpreted.
        raise ValueError(
            "Provide exactly one of `rsid`, `grch37`, `grch38`, or `variant` "
            f"(`variant` is read as `{default_build_key}`)."
        )
    if len(provided) != 1:
        raise ValueError(
            "Provide exactly one variant input: `rsid`, `grch37`, `grch38`, or `variant`."
        )

    input_type, input_value = provided[0]
    if input_type == "variant":
        input_type = default_build_key
    return input_type, input_value
def resolve_rsid_both_builds(rsid: str) -> dict[str, Any]:
    """Look an rsID up on GRCh38 and GRCh37 and merge the two answers.

    Each lookup is best-effort: a failure is recorded as a warning and that
    build's coordinates are reported as `None`. Alleles are taken from the
    GRCh38 hit when it has them, otherwise from the GRCh37 hit.

    Returns:
        A dict with `rsid`, per-build `chr`/`pos` under `grch38` and `grch37`,
        the merged `ref` and `alts`, and the collected `warnings`.
    """
    notes: list[str] = []
    hits: dict[str, Any] = {}
    for assembly in ("GRCh38", "GRCh37"):
        try:
            hits[assembly] = lookup_rsid(rsid, assembly)
        except Exception as exc:  # noqa: BLE001
            hits[assembly] = None
            notes.append(f"{assembly} lookup failed: {type(exc).__name__}: {exc}")

    newer = hits["GRCh38"]
    older = hits["GRCh37"]

    # Prefer GRCh38 alleles; fall back to GRCh37 when GRCh38 has none.
    if newer and newer.ref:
        ref = newer.ref
    else:
        ref = older.ref if older else None

    if newer and newer.alts:
        alts = newer.alts
    else:
        alts = older.alts if older else []

    def position_of(hit: Any) -> dict[str, Any]:
        if hit:
            return {"chr": hit.chr, "pos": hit.pos}
        return {"chr": None, "pos": None}

    return {
        "rsid": rsid,
        "grch38": position_of(newer),
        "grch37": position_of(older),
        "ref": ref,
        "alts": alts,
        "warnings": notes,
    }
def _record_for_build(
    resolved: dict[str, Any],
    build_key: str,
    ref: str | None,
    alt: str | None,
) -> dict[str, Any] | None:
    """Build the variant record for one build, or None when its chr/pos is missing."""
    coords = resolved.get(build_key, {})
    if not (coords.get("chr") and coords.get("pos")):
        return None
    return build_variant_record(coords["chr"], coords["pos"], ref, alt)


def resolve_variant(input_type: str, input_value: str) -> dict[str, Any]:
    """Resolve an rsID or a build-specific variant string on both genome builds.

    Args:
        input_type: `"rsid"`, `"grch37"` or `"grch38"`.
        input_value: The rsID, or a chrom-pos-ref-alt string for the given build.

    Returns:
        A dict with the echoed `input`, the `rsid`, one variant record (or
        `None`) under each of `grch37` and `grch38`, and `warnings`.

    Raises:
        ValueError: For an rsID not starting with `rs`, an unsupported
            `input_type`, or a malformed variant string.
        VariantResolutionError: With code `not_found` when no rsID overlaps the
            given position.
    """
    warnings: list[str] = []

    if input_type == "rsid":
        rsid = input_value.strip()
        if not rsid.startswith("rs"):
            raise ValueError("rsid must start with 'rs'.")

        resolved = resolve_rsid_both_builds(rsid)
        warnings.extend(resolved.get("warnings", []))

        ref = resolved.get("ref")
        alts = resolved.get("alts") or []
        # Only the first alternate allele is carried forward.
        alt = alts[0] if alts else None

        return {
            "input": {"type": "rsid", "value": rsid},
            "rsid": rsid,
            "grch37": _record_for_build(resolved, "grch37", ref, alt),
            "grch38": _record_for_build(resolved, "grch38", ref, alt),
            "warnings": warnings,
        }

    if input_type not in {"grch37", "grch38"}:
        raise ValueError(f"Unsupported input type: {input_type!r}")

    build = "GRCh37" if input_type == "grch37" else "GRCh38"
    chrom, pos, ref_in, alt_in = parse_variant_string(input_value)

    resolved = resolve_position_both_builds(chrom, pos, build)
    if not resolved:
        raise VariantResolutionError(
            "not_found",
            f"No rsID found at {chrom}:{pos} on {build} via Ensembl overlap endpoint.",
        )

    warnings.extend(resolved.get("warnings", []))

    ref = resolved.get("ref")
    alts = resolved.get("alts") or []
    if not ref:
        # Mirror the ALT fallback below: with no resolved reference allele,
        # keep the caller's REF so a canonical ID can still be built.
        ref = ref_in
        warnings.append(f"No resolved ref allele; using input ref {ref_in}.")
    elif ref_in != ref:
        warnings.append(f"Input ref {ref_in} != resolved ref {ref}; keeping resolved ref.")

    alt = alt_in if alt_in in alts else (alts[0] if alts else alt_in)
    if alts and alt_in not in alts:
        warnings.append(f"Input alt {alt_in} not among resolved alts {alts}; using {alt}.")

    return {
        "input": {"type": input_type, "value": input_value},
        "rsid": resolved.get("rsid"),
        "grch37": _record_for_build(resolved, "grch37", ref, alt),
        "grch38": _record_for_build(resolved, "grch38", ref, alt),
        "warnings": warnings,
    }
a/plugins/life-science-research/skills/biorxiv-skill/SKILL.md b/plugins/life-science-research/skills/biorxiv-skill/SKILL.md new file mode 100644 index 0000000..d8e657a --- /dev/null +++ b/plugins/life-science-research/skills/biorxiv-skill/SKILL.md @@ -0,0 +1,40 @@ +--- +name: biorxiv-skill +description: Submit compact bioRxiv and medRxiv API requests for details, publication-linkage, and DOI lookups. Use when a user wants concise preprint metadata summaries +--- + +## Operating rules +- Use `scripts/rest_request.py` for all bioRxiv and medRxiv API calls. +- Use `base_url=https://api.biorxiv.org`. +- The script accepts `max_items`; for `details` and `pubs` pages, start around `max_items=10`. +- Prefer one cursor page at a time instead of increasing page size or pasting long collections into chat. +- Re-run requests in long conversations instead of relying on older tool output. +- Treat displayed `...` in tool previews as UI truncation, not part of the true request. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Return the raw script JSON only if the user explicitly asks for machine-readable output. +- Prefer these paths: `details/{server}/{start_date}/{end_date}/{cursor}/json`, `details/{server}/{doi}/na/json`, `pubs/{server}/{start_date}/{end_date}/{cursor}`, and `pubs/{server}/{doi}/na/json`. +- If the user needs full page contents, set `save_raw=true` and report the saved file path rather than pasting large collections into chat. + +## Input +- Read one JSON object from stdin. 
+- Required fields: `base_url`, `path` +- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common biorxiv patterns: + - `{"base_url":"https://api.biorxiv.org","path":"details/biorxiv/2025-03-21/2025-03-28/0/json","record_path":"collection","max_items":10}` + - `{"base_url":"https://api.biorxiv.org","path":"details/medrxiv/10.1101/2020.09.09.20191205/na/json","record_path":"collection","max_items":10}` + - `{"base_url":"https://api.biorxiv.org","path":"pubs/medrxiv/2020-03-01/2020-03-30/0","record_path":"collection","max_items":10}` + +## Output +- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records` or a compact `summary`. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"base_url":"https://api.biorxiv.org","path":"details/biorxiv/2025-03-21/2025-03-28/0/json","record_path":"collection","max_items":10}' | python scripts/rest_request.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. 
def _require_int(name: str, value: Any, default: int) -> int:
    """Validate an optional positive-integer field.

    Args:
        name: Field name, used in the error message.
        value: Raw value from the request payload; `None` means "not given".
        default: Value returned when the field is absent.

    Returns:
        `default` when `value` is `None`, otherwise `value` itself.

    Raises:
        ValueError: If `value` is not an integer greater than zero. Booleans
            are rejected even though `bool` is a subclass of `int`, so JSON
            `true` is not silently read as 1.
    """
    if value is None:
        return default
    if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
        raise ValueError(f"`{name}` must be a positive integer.")
    return value
def _get_by_path(value: Any, path: str) -> Any:
    """Walk a dotted `record_path` through nested dicts and lists.

    Args:
        value: Decoded JSON document to traverse.
        path: Dot-separated segments; a segment applied to a list must be a
            non-negative decimal index, a segment applied to a dict is a key.

    Returns:
        The value found at the end of the path.

    Raises:
        ValueError: If a segment is not a valid list index, the index is out of
            range, a key is missing, or a segment is applied to a scalar.
    """
    current = value
    for part in path.split("."):
        if isinstance(current, list):
            # isdecimal() matches exactly what int() can parse; isdigit() also
            # accepts characters such as superscript digits that int() rejects.
            if not part.isdecimal():
                raise ValueError(f"`record_path` segment {part!r} must be a list index.")
            index = int(part)
            if index >= len(current):
                raise ValueError(f"`record_path` index {index} is out of range.")
            current = current[index]
        elif isinstance(current, dict):
            if part not in current:
                raise ValueError(f"`record_path` key {part!r} was not present in the response.")
            current = current[part]
        else:
            raise ValueError(f"`record_path` segment {part!r} could not be applied.")
    return current
def _save_raw_output(
    raw_output: str, raw_output_path: str | None, base_url: str, suffix: str
) -> str:
    """Write the raw response text to disk and return the path that was used.

    When no `raw_output_path` is given, the file goes to `/tmp` under a name
    derived from the service host and `suffix`. Missing parent directories are
    created.
    """
    if raw_output_path:
        destination = Path(raw_output_path)
    else:
        destination = Path(f"/tmp/{_service_name(base_url)}-raw.{suffix}")
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(raw_output, encoding="utf-8")
    return str(destination)
def execute(payload: Any) -> dict[str, Any]:
    """Perform one REST request and return a compact JSON-serialisable result.

    Args:
        payload: Decoded request object; validated by `parse_input`.

    Returns:
        On success, a dict with `ok=True` plus either compacted `records`
        (list target), a compacted `summary` (non-list JSON target) or a
        `text_head` (non-JSON response). On failure, an `error(...)` dict with
        code `missing_dependency`, `invalid_response`, `network_error` or
        `write_error`.

    Raises:
        ValueError: If `payload` fails validation in `parse_input`.
    """
    if requests is None:
        return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}")
    config = parse_input(payload)
    session = requests.Session()
    session.headers.update(config["headers"])
    url = _build_url(config["base_url"], config["path"])

    request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]}
    if config["json_body"] is not None:
        request_kwargs["json"] = config["json_body"]
    if config["form_body"] is not None:
        request_kwargs["data"] = config["form_body"]

    try:
        response = session.request(config["method"], url, **request_kwargs)
        response.raise_for_status()
        content_type = (response.headers.get("content-type") or "").lower()
        wants_json = config["response_format"] == "json"
        wants_text = config["response_format"] == "text"
        # In auto mode, sniff the body too: some services send JSON with a
        # non-JSON content type.
        auto_json = not wants_text and (
            "json" in content_type or response.text.lstrip().startswith(("{", "["))
        )

        if wants_json or auto_json:
            data = response.json()
            raw_output_path = None
            if config["save_raw"]:
                try:
                    raw_output_path = _save_raw_output(
                        json.dumps(data, indent=2),
                        config["raw_output_path"],
                        config["base_url"],
                        "json",
                    )
                except OSError as exc:
                    # Report an unwritable path in the JSON envelope instead of
                    # letting it escape as a traceback.
                    return error("write_error", f"Could not write raw output: {exc}")

            record_path = config["record_path"]
            path_used, target = (
                _infer_target(data)
                if record_path is None
                else (record_path, _get_by_path(data, record_path))
            )
            out = {
                "ok": True,
                "source": _service_name(config["base_url"]),
                "path": config["path"],
                "method": config["method"],
                "status_code": response.status_code,
                "record_path": path_used,
                "raw_output_path": raw_output_path,
                "warnings": [],
            }
            if isinstance(target, list):
                records = target[: config["max_items"]]
                out.update(
                    {
                        "record_count_returned": len(records),
                        "record_count_available": len(target),
                        "truncated": len(records) < len(target),
                        "records": _compact(records, config["max_items"], config["max_depth"]),
                    }
                )
            else:
                out["summary"] = _compact(target, config["max_items"], config["max_depth"])
                if isinstance(target, dict):
                    out["top_keys"] = list(target)[: config["max_items"]]
            return out

        raw_output_path = None
        if config["save_raw"]:
            try:
                raw_output_path = _save_raw_output(
                    response.text, config["raw_output_path"], config["base_url"], "txt"
                )
            except OSError as exc:
                return error("write_error", f"Could not write raw output: {exc}")
        text_head = response.text[:800]
        return {
            "ok": True,
            "source": _service_name(config["base_url"]),
            "path": config["path"],
            "method": config["method"],
            "status_code": response.status_code,
            "content_type": content_type,
            # When the full text was saved, the preview is omitted.
            "text_head": None if raw_output_path else text_head,
            "text_head_truncated": False
            if raw_output_path
            else len(text_head) < len(response.text),
            "raw_output_path": raw_output_path,
            "warnings": [],
        }
    except ValueError as exc:
        # Covers undecodable JSON bodies and `record_path` lookups that fail.
        return error("invalid_response", str(exc))
    except requests.RequestException as exc:
        return error("network_error", f"Request failed: {exc}")
    finally:
        session.close()
diff --git a/plugins/life-science-research/skills/biostudies-arrayexpress-skill/SKILL.md b/plugins/life-science-research/skills/biostudies-arrayexpress-skill/SKILL.md new file mode 100644 index 0000000..2e9d0c9 --- /dev/null +++ b/plugins/life-science-research/skills/biostudies-arrayexpress-skill/SKILL.md @@ -0,0 +1,38 @@ +--- +name: biostudies-arrayexpress-skill +description: Submit compact BioStudies and ArrayExpress API requests for free-text search and accession-based study retrieval. Use when a user wants concise BioStudies summaries +--- + +## Operating rules +- Use `scripts/rest_request.py` for all BioStudies and ArrayExpress calls. +- Use `base_url=https://www.ebi.ac.uk/biostudies/api/v1`. +- Search pages are better with `pageSize=10` and `max_items=10`; accession lookups usually do not need `max_items`. +- Re-run requests in long conversations instead of relying on older tool output. +- Treat displayed `...` in tool previews as UI truncation, not literal request content. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Prefer these paths: `search`, `ArrayExpress/search`, `studies/{accession}`, and `studies/{accession}/info`. +- If the user needs the full payload, set `save_raw=true` and report the saved file path instead of pasting large study records into chat. + +## Input +- Read one JSON object from stdin. 
+- Required fields: `base_url`, `path` +- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common BioStudies patterns: + - `{"base_url":"https://www.ebi.ac.uk/biostudies/api/v1","path":"search","params":{"query":"rna","page":1,"pageSize":10},"record_path":"hits","max_items":10}` + - `{"base_url":"https://www.ebi.ac.uk/biostudies/api/v1","path":"ArrayExpress/search","params":{"query":"single cell","page":1,"pageSize":10},"record_path":"hits","max_items":10}` + - `{"base_url":"https://www.ebi.ac.uk/biostudies/api/v1","path":"studies/E-MTAB-6701"}` + +## Output +- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records` or a compact `summary`. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"base_url":"https://www.ebi.ac.uk/biostudies/api/v1","path":"search","params":{"query":"rna","page":1,"pageSize":10},"record_path":"hits","max_items":10}' | python scripts/rest_request.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. 
def _require_object(name: str, value: Any) -> dict[str, Any]:
    """Validate an optional JSON-object field.

    Returns the dict unchanged, or an empty dict when the field is absent
    (`None`).

    Raises:
        ValueError: If `value` is present but is not a dict.
    """
    if isinstance(value, dict):
        return value
    if value is None:
        return {}
    raise ValueError(f"`{name}` must be an object.")
def _compact(value: Any, max_items: int, max_depth: int) -> Any:
    """Shrink a decoded JSON value so it is cheap to print.

    Strings longer than 240 characters are cut and suffixed with `...`.
    Lists and dicts keep at most `max_items` entries per level and are
    replaced by the string `"..."` once `max_depth` is used up. A truncated
    list gets a trailing `"... (+N more)"` marker; a truncated dict gets a
    `_truncated_keys` count. Any other value is returned unchanged.
    """
    if isinstance(value, str):
        if len(value) > 240:
            return value[:240] + "..."
        return value

    if not isinstance(value, (dict, list)):
        return value
    if max_depth <= 0:
        return "..."

    remaining_depth = max_depth - 1

    if isinstance(value, list):
        shortened = [
            _compact(element, max_items, remaining_depth) for element in value[:max_items]
        ]
        hidden = len(value) - max_items
        if hidden > 0:
            shortened.append(f"... (+{hidden} more)")
        return shortened

    pairs = list(value.items())
    trimmed: dict[str, Any] = {
        str(key): _compact(child, max_items, remaining_depth) for key, child in pairs[:max_items]
    }
    dropped = len(pairs) - max_items
    if dropped > 0:
        trimmed["_truncated_keys"] = dropped
    return trimmed
"json_body": json_body, + "form_body": _require_object("form_body", form_body) if form_body is not None else None, + "record_path": _require_str("record_path", payload.get("record_path")), + "response_format": response_format, + "max_items": _require_int("max_items", payload.get("max_items"), 5), + "max_depth": _require_int("max_depth", payload.get("max_depth"), 3), + "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30), + "save_raw": _require_bool("save_raw", payload.get("save_raw"), False), + "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")), + } + + +def execute(payload: Any) -> dict[str, Any]: + if requests is None: + return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}") + config = parse_input(payload) + session = requests.Session() + session.headers.update(config["headers"]) + url = _build_url(config["base_url"], config["path"]) + + request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]} + if config["json_body"] is not None: + request_kwargs["json"] = config["json_body"] + if config["form_body"] is not None: + request_kwargs["data"] = config["form_body"] + + try: + response = session.request(config["method"], url, **request_kwargs) + response.raise_for_status() + content_type = (response.headers.get("content-type") or "").lower() + wants_json = config["response_format"] == "json" + wants_text = config["response_format"] == "text" + auto_json = not wants_text and ( + "json" in content_type or response.text.lstrip().startswith(("{", "[")) + ) + + if wants_json or auto_json: + data = response.json() + raw_output = json.dumps(data, indent=2) + raw_output_path = None + if config["save_raw"]: + raw_output_path = _save_raw_output( + raw_output, config["raw_output_path"], config["base_url"], "json" + ) + + record_path = config["record_path"] + path_used, target = ( + _infer_target(data) + if record_path is None + else (record_path, 
_get_by_path(data, record_path)) + ) + out = { + "ok": True, + "source": _service_name(config["base_url"]), + "path": config["path"], + "method": config["method"], + "status_code": response.status_code, + "record_path": path_used, + "raw_output_path": raw_output_path, + "warnings": [], + } + if isinstance(target, list): + records = target[: config["max_items"]] + out.update( + { + "record_count_returned": len(records), + "record_count_available": len(target), + "truncated": len(records) < len(target), + "records": _compact(records, config["max_items"], config["max_depth"]), + } + ) + else: + out["summary"] = _compact(target, config["max_items"], config["max_depth"]) + if isinstance(target, dict): + out["top_keys"] = list(target)[: config["max_items"]] + return out + + raw_output_path = None + if config["save_raw"]: + raw_output_path = _save_raw_output( + response.text, config["raw_output_path"], config["base_url"], "txt" + ) + text_head = response.text[:800] + return { + "ok": True, + "source": _service_name(config["base_url"]), + "path": config["path"], + "method": config["method"], + "status_code": response.status_code, + "content_type": content_type, + "text_head": None if raw_output_path else text_head, + "text_head_truncated": False + if raw_output_path + else len(text_head) < len(response.text), + "raw_output_path": raw_output_path, + "warnings": [], + } + except ValueError as exc: + return error("invalid_response", str(exc)) + except requests.RequestException as exc: + return error("network_error", f"Request failed: {exc}") + finally: + session.close() + + +def main() -> int: + try: + payload = json.load(sys.stdin) + except Exception as exc: # noqa: BLE001 + sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}"))) + return 2 + try: + output = execute(payload) + except ValueError as exc: + output = error("invalid_input", str(exc)) + code = 2 + else: + code = 0 if output.get("ok") else 1 + sys.stdout.write(json.dumps(output)) + 
return code + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/plugins/life-science-research/skills/cbioportal-skill/SKILL.md b/plugins/life-science-research/skills/cbioportal-skill/SKILL.md new file mode 100644 index 0000000..ed882d8 --- /dev/null +++ b/plugins/life-science-research/skills/cbioportal-skill/SKILL.md @@ -0,0 +1,39 @@ +--- +name: cbioportal-skill +description: Submit compact cBioPortal API requests for studies, molecular profiles, mutations, clinical data, and samples. Use when a user wants concise cBioPortal summaries +--- + +## Operating rules +- Use `scripts/rest_request.py` for all cBioPortal API calls. +- Use `base_url=https://www.cbioportal.org/api`. +- Collection endpoints are better with `pageSize=10` and `max_items=10`; single study or profile lookups usually do not need `max_items`. +- Use `method=POST` plus `json_body` for fetch-style endpoints such as mutation fetches. +- Send `Accept: application/json` in `headers`. +- Re-run requests in long conversations instead of relying on older tool output. +- Treat displayed `...` in tool previews as UI truncation, not literal request content. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Prefer these paths: `studies`, `studies/<studyId>/molecular-profiles`, `molecular-profiles/<molecularProfileId>/mutations/fetch`, and study-level clinical or sample endpoints. +- If the user needs the full payload, set `save_raw=true` and report the saved file path. + +## Input +- Read one JSON object from stdin. 
+- Required fields: `base_url`, `path` +- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common cBioPortal patterns: + - `{"base_url":"https://www.cbioportal.org/api","path":"studies","params":{"keyword":"breast","projection":"SUMMARY","pageSize":10},"headers":{"Accept":"application/json"},"max_items":10}` + - `{"base_url":"https://www.cbioportal.org/api","path":"molecular-profiles/brca_tcga_mutations/mutations/fetch","method":"POST","json_body":{"sampleListId":"brca_tcga_all","entrezGeneIds":[7157]},"headers":{"Accept":"application/json"},"max_items":10}` + +## Output +- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records` or a compact `summary`. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"base_url":"https://www.cbioportal.org/api","path":"studies","params":{"keyword":"breast","projection":"SUMMARY","pageSize":10},"headers":{"Accept":"application/json"},"max_items":10}' | python scripts/rest_request.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. 
diff --git a/plugins/life-science-research/skills/cbioportal-skill/agents/openai.yaml b/plugins/life-science-research/skills/cbioportal-skill/agents/openai.yaml new file mode 100644 index 0000000..8334833 --- /dev/null +++ b/plugins/life-science-research/skills/cbioportal-skill/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "cBioPortal" + short_description: "Fetch cBioPortal study and mutation summaries" diff --git a/plugins/life-science-research/skills/cbioportal-skill/scripts/rest_request.py b/plugins/life-science-research/skills/cbioportal-skill/scripts/rest_request.py new file mode 100644 index 0000000..4de3cf1 --- /dev/null +++ b/plugins/life-science-research/skills/cbioportal-skill/scripts/rest_request.py @@ -0,0 +1,291 @@ +#!/usr/bin/env python3 +"""Generic compact REST client for ChatGPT-imported skills.""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path +from typing import Any + +try: + import requests +except ImportError as exc: # pragma: no cover + requests = None + REQUESTS_IMPORT_ERROR = exc +else: + REQUESTS_IMPORT_ERROR = None + + +def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: + return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} + + +def _require_object(name: str, value: Any) -> dict[str, Any]: + if value is None: + return {} + if not isinstance(value, dict): + raise ValueError(f"`{name}` must be an object.") + return value + + +def _require_bool(name: str, value: Any, default: bool) -> bool: + if value is None: + return default + if not isinstance(value, bool): + raise ValueError(f"`{name}` must be a boolean.") + return value + + +def _require_int(name: str, value: Any, default: int) -> int: + if value is None: + return default + if not isinstance(value, int) or value <= 0: + raise ValueError(f"`{name}` must be a positive integer.") + return value + + +def _require_str(name: str, value: Any, required: bool 
= False) -> str | None: + if value is None: + if required: + raise ValueError(f"`{name}` is required.") + return None + if not isinstance(value, str) or not value.strip(): + raise ValueError(f"`{name}` must be a non-empty string.") + return value.strip() + + +def _service_name(base_url: str) -> str: + host = base_url.split("://", 1)[-1].split("/", 1)[0] + return host.replace(".", "-") + + +def _build_url(base_url: str, path: str) -> str: + if path.startswith(("http://", "https://")): + return path + return base_url.rstrip("/") + "/" + path.lstrip("/") + + +def _get_by_path(value: Any, path: str) -> Any: + current = value + for part in path.split("."): + if isinstance(current, list): + if not part.isdigit(): + raise ValueError(f"`record_path` segment {part!r} must be a list index.") + index = int(part) + if index >= len(current): + raise ValueError(f"`record_path` index {index} is out of range.") + current = current[index] + elif isinstance(current, dict): + if part not in current: + raise ValueError(f"`record_path` key {part!r} was not present in the response.") + current = current[part] + else: + raise ValueError(f"`record_path` segment {part!r} could not be applied.") + return current + + +def _infer_target(data: Any) -> tuple[str | None, Any]: + if isinstance(data, list): + return "$", data + if isinstance(data, dict): + embedded = data.get("_embedded") + if isinstance(embedded, dict): + for key, value in embedded.items(): + if isinstance(value, list): + return f"_embedded.{key}", value + for key in ( + "collection", + "results", + "structures", + "activities", + "molecules", + "mechanisms", + "records", + "items", + ): + value = data.get(key) + if isinstance(value, list): + return key, value + return None, data + + +def _compact(value: Any, max_items: int, max_depth: int) -> Any: + if isinstance(value, str): + return value if len(value) <= 240 else value[:240] + "..." + if max_depth <= 0: + if isinstance(value, (dict, list)): + return "..." 
+ return value + if isinstance(value, list): + out = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]] + if len(value) > max_items: + out.append(f"... (+{len(value) - max_items} more)") + return out + if isinstance(value, dict): + out: dict[str, Any] = {} + items = list(value.items()) + for key, item in items[:max_items]: + out[str(key)] = _compact(item, max_items, max_depth - 1) + if len(items) > max_items: + out["_truncated_keys"] = len(items) - max_items + return out + return value + + +def _save_raw_output( + raw_output: str, raw_output_path: str | None, base_url: str, suffix: str +) -> str: + path = Path(raw_output_path or f"/tmp/{_service_name(base_url)}-raw.{suffix}") + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(raw_output, encoding="utf-8") + return str(path) + + +def parse_input(payload: Any) -> dict[str, Any]: + if not isinstance(payload, dict): + raise ValueError("Input must be one JSON object.") + base_url = _require_str("base_url", payload.get("base_url"), required=True) + path = _require_str("path", payload.get("path"), required=True) + method = (_require_str("method", payload.get("method")) or "GET").upper() + if method not in {"GET", "POST"}: + raise ValueError("`method` must be GET or POST.") + json_body = payload.get("json_body") + form_body = payload.get("form_body") + if json_body is not None and form_body is not None: + raise ValueError("Provide only one of `json_body` or `form_body`.") + response_format = ( + _require_str("response_format", payload.get("response_format")) or "auto" + ).lower() + if response_format not in {"auto", "json", "text"}: + raise ValueError("`response_format` must be auto, json, or text.") + return { + "base_url": base_url, + "path": path, + "method": method, + "params": _require_object("params", payload.get("params")), + "headers": _require_object("headers", payload.get("headers")), + "json_body": json_body, + "form_body": _require_object("form_body", form_body) if 
form_body is not None else None, + "record_path": _require_str("record_path", payload.get("record_path")), + "response_format": response_format, + "max_items": _require_int("max_items", payload.get("max_items"), 5), + "max_depth": _require_int("max_depth", payload.get("max_depth"), 3), + "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30), + "save_raw": _require_bool("save_raw", payload.get("save_raw"), False), + "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")), + } + + +def execute(payload: Any) -> dict[str, Any]: + if requests is None: + return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}") + config = parse_input(payload) + session = requests.Session() + session.headers.update(config["headers"]) + url = _build_url(config["base_url"], config["path"]) + + request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]} + if config["json_body"] is not None: + request_kwargs["json"] = config["json_body"] + if config["form_body"] is not None: + request_kwargs["data"] = config["form_body"] + + try: + response = session.request(config["method"], url, **request_kwargs) + response.raise_for_status() + content_type = (response.headers.get("content-type") or "").lower() + wants_json = config["response_format"] == "json" + wants_text = config["response_format"] == "text" + auto_json = not wants_text and ( + "json" in content_type or response.text.lstrip().startswith(("{", "[")) + ) + + if wants_json or auto_json: + data = response.json() + raw_output = json.dumps(data, indent=2) + raw_output_path = None + if config["save_raw"]: + raw_output_path = _save_raw_output( + raw_output, config["raw_output_path"], config["base_url"], "json" + ) + + record_path = config["record_path"] + path_used, target = ( + _infer_target(data) + if record_path is None + else (record_path, _get_by_path(data, record_path)) + ) + out = { + "ok": True, + "source": 
_service_name(config["base_url"]), + "path": config["path"], + "method": config["method"], + "status_code": response.status_code, + "record_path": path_used, + "raw_output_path": raw_output_path, + "warnings": [], + } + if isinstance(target, list): + records = target[: config["max_items"]] + out.update( + { + "record_count_returned": len(records), + "record_count_available": len(target), + "truncated": len(records) < len(target), + "records": _compact(records, config["max_items"], config["max_depth"]), + } + ) + else: + out["summary"] = _compact(target, config["max_items"], config["max_depth"]) + if isinstance(target, dict): + out["top_keys"] = list(target)[: config["max_items"]] + return out + + raw_output_path = None + if config["save_raw"]: + raw_output_path = _save_raw_output( + response.text, config["raw_output_path"], config["base_url"], "txt" + ) + text_head = response.text[:800] + return { + "ok": True, + "source": _service_name(config["base_url"]), + "path": config["path"], + "method": config["method"], + "status_code": response.status_code, + "content_type": content_type, + "text_head": None if raw_output_path else text_head, + "text_head_truncated": False + if raw_output_path + else len(text_head) < len(response.text), + "raw_output_path": raw_output_path, + "warnings": [], + } + except ValueError as exc: + return error("invalid_response", str(exc)) + except requests.RequestException as exc: + return error("network_error", f"Request failed: {exc}") + finally: + session.close() + + +def main() -> int: + try: + payload = json.load(sys.stdin) + except Exception as exc: # noqa: BLE001 + sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}"))) + return 2 + try: + output = execute(payload) + except ValueError as exc: + output = error("invalid_input", str(exc)) + code = 2 + else: + code = 0 if output.get("ok") else 1 + sys.stdout.write(json.dumps(output)) + return code + + +if __name__ == "__main__": + raise SystemExit(main()) 
diff --git a/plugins/life-science-research/skills/cellxgene-skill/SKILL.md b/plugins/life-science-research/skills/cellxgene-skill/SKILL.md new file mode 100644 index 0000000..688593c --- /dev/null +++ b/plugins/life-science-research/skills/cellxgene-skill/SKILL.md @@ -0,0 +1,37 @@ +--- +name: cellxgene-skill +description: Submit compact CELLxGENE Discover API requests for public collection and dataset metadata. Use when a user wants concise single-cell collection summaries +--- + +## Operating rules +- Use `scripts/rest_request.py` for all CELLxGENE Discover calls. +- Use `base_url=https://api.cellxgene.cziscience.com/curation/v1`. +- Prefer targeted collection detail lookups rather than full archive dumps by default. +- The public `collections` list can be large and may require a higher `timeout_sec`; collection detail lookups are usually the better first call. +- Re-run requests in long conversations instead of relying on older tool output. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Return raw JSON only if the user explicitly asks for machine-readable output. +- Prefer these paths: `collections/<collection_id>` first, then `collections` when the user explicitly wants broad archive discovery. + +## Input +- Read one JSON object from stdin. +- Required fields: `base_url`, `path` +- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common CELLxGENE patterns: + - `{"base_url":"https://api.cellxgene.cziscience.com/curation/v1","path":"collections/db468083-041c-41ca-8f6f-bf991a070adf","max_items":5}` + - `{"base_url":"https://api.cellxgene.cziscience.com/curation/v1","path":"collections","timeout_sec":60,"max_items":5}` + +## Output +- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records` or a compact `summary`. 
+- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"base_url":"https://api.cellxgene.cziscience.com/curation/v1","path":"collections/db468083-041c-41ca-8f6f-bf991a070adf","max_items":5}' | python scripts/rest_request.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. diff --git a/plugins/life-science-research/skills/cellxgene-skill/agents/openai.yaml b/plugins/life-science-research/skills/cellxgene-skill/agents/openai.yaml new file mode 100644 index 0000000..8b32425 --- /dev/null +++ b/plugins/life-science-research/skills/cellxgene-skill/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "CELLxGENE" + short_description: "Fetch CELLxGENE collection summaries" diff --git a/plugins/life-science-research/skills/cellxgene-skill/scripts/rest_request.py b/plugins/life-science-research/skills/cellxgene-skill/scripts/rest_request.py new file mode 100644 index 0000000..4de3cf1 --- /dev/null +++ b/plugins/life-science-research/skills/cellxgene-skill/scripts/rest_request.py @@ -0,0 +1,291 @@ +#!/usr/bin/env python3 +"""Generic compact REST client for ChatGPT-imported skills.""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path +from typing import Any + +try: + import requests +except ImportError as exc: # pragma: no cover + requests = None + REQUESTS_IMPORT_ERROR = exc +else: + REQUESTS_IMPORT_ERROR = None + + +def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: + return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} + + +def _require_object(name: str, value: Any) -> dict[str, Any]: + if value is None: + return {} + if not isinstance(value, dict): + raise ValueError(f"`{name}` must be an object.") + return value + + +def _require_bool(name: str, value: 
Any, default: bool) -> bool: + if value is None: + return default + if not isinstance(value, bool): + raise ValueError(f"`{name}` must be a boolean.") + return value + + +def _require_int(name: str, value: Any, default: int) -> int: + if value is None: + return default + if not isinstance(value, int) or value <= 0: + raise ValueError(f"`{name}` must be a positive integer.") + return value + + +def _require_str(name: str, value: Any, required: bool = False) -> str | None: + if value is None: + if required: + raise ValueError(f"`{name}` is required.") + return None + if not isinstance(value, str) or not value.strip(): + raise ValueError(f"`{name}` must be a non-empty string.") + return value.strip() + + +def _service_name(base_url: str) -> str: + host = base_url.split("://", 1)[-1].split("/", 1)[0] + return host.replace(".", "-") + + +def _build_url(base_url: str, path: str) -> str: + if path.startswith(("http://", "https://")): + return path + return base_url.rstrip("/") + "/" + path.lstrip("/") + + +def _get_by_path(value: Any, path: str) -> Any: + current = value + for part in path.split("."): + if isinstance(current, list): + if not part.isdigit(): + raise ValueError(f"`record_path` segment {part!r} must be a list index.") + index = int(part) + if index >= len(current): + raise ValueError(f"`record_path` index {index} is out of range.") + current = current[index] + elif isinstance(current, dict): + if part not in current: + raise ValueError(f"`record_path` key {part!r} was not present in the response.") + current = current[part] + else: + raise ValueError(f"`record_path` segment {part!r} could not be applied.") + return current + + +def _infer_target(data: Any) -> tuple[str | None, Any]: + if isinstance(data, list): + return "$", data + if isinstance(data, dict): + embedded = data.get("_embedded") + if isinstance(embedded, dict): + for key, value in embedded.items(): + if isinstance(value, list): + return f"_embedded.{key}", value + for key in ( + "collection", + 
"results", + "structures", + "activities", + "molecules", + "mechanisms", + "records", + "items", + ): + value = data.get(key) + if isinstance(value, list): + return key, value + return None, data + + +def _compact(value: Any, max_items: int, max_depth: int) -> Any: + if isinstance(value, str): + return value if len(value) <= 240 else value[:240] + "..." + if max_depth <= 0: + if isinstance(value, (dict, list)): + return "..." + return value + if isinstance(value, list): + out = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]] + if len(value) > max_items: + out.append(f"... (+{len(value) - max_items} more)") + return out + if isinstance(value, dict): + out: dict[str, Any] = {} + items = list(value.items()) + for key, item in items[:max_items]: + out[str(key)] = _compact(item, max_items, max_depth - 1) + if len(items) > max_items: + out["_truncated_keys"] = len(items) - max_items + return out + return value + + +def _save_raw_output( + raw_output: str, raw_output_path: str | None, base_url: str, suffix: str +) -> str: + path = Path(raw_output_path or f"/tmp/{_service_name(base_url)}-raw.{suffix}") + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(raw_output, encoding="utf-8") + return str(path) + + +def parse_input(payload: Any) -> dict[str, Any]: + if not isinstance(payload, dict): + raise ValueError("Input must be one JSON object.") + base_url = _require_str("base_url", payload.get("base_url"), required=True) + path = _require_str("path", payload.get("path"), required=True) + method = (_require_str("method", payload.get("method")) or "GET").upper() + if method not in {"GET", "POST"}: + raise ValueError("`method` must be GET or POST.") + json_body = payload.get("json_body") + form_body = payload.get("form_body") + if json_body is not None and form_body is not None: + raise ValueError("Provide only one of `json_body` or `form_body`.") + response_format = ( + _require_str("response_format", payload.get("response_format")) or 
"auto" + ).lower() + if response_format not in {"auto", "json", "text"}: + raise ValueError("`response_format` must be auto, json, or text.") + return { + "base_url": base_url, + "path": path, + "method": method, + "params": _require_object("params", payload.get("params")), + "headers": _require_object("headers", payload.get("headers")), + "json_body": json_body, + "form_body": _require_object("form_body", form_body) if form_body is not None else None, + "record_path": _require_str("record_path", payload.get("record_path")), + "response_format": response_format, + "max_items": _require_int("max_items", payload.get("max_items"), 5), + "max_depth": _require_int("max_depth", payload.get("max_depth"), 3), + "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30), + "save_raw": _require_bool("save_raw", payload.get("save_raw"), False), + "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")), + } + + +def execute(payload: Any) -> dict[str, Any]: + if requests is None: + return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}") + config = parse_input(payload) + session = requests.Session() + session.headers.update(config["headers"]) + url = _build_url(config["base_url"], config["path"]) + + request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]} + if config["json_body"] is not None: + request_kwargs["json"] = config["json_body"] + if config["form_body"] is not None: + request_kwargs["data"] = config["form_body"] + + try: + response = session.request(config["method"], url, **request_kwargs) + response.raise_for_status() + content_type = (response.headers.get("content-type") or "").lower() + wants_json = config["response_format"] == "json" + wants_text = config["response_format"] == "text" + auto_json = not wants_text and ( + "json" in content_type or response.text.lstrip().startswith(("{", "[")) + ) + + if wants_json or auto_json: + data = response.json() + 
raw_output = json.dumps(data, indent=2) + raw_output_path = None + if config["save_raw"]: + raw_output_path = _save_raw_output( + raw_output, config["raw_output_path"], config["base_url"], "json" + ) + + record_path = config["record_path"] + path_used, target = ( + _infer_target(data) + if record_path is None + else (record_path, _get_by_path(data, record_path)) + ) + out = { + "ok": True, + "source": _service_name(config["base_url"]), + "path": config["path"], + "method": config["method"], + "status_code": response.status_code, + "record_path": path_used, + "raw_output_path": raw_output_path, + "warnings": [], + } + if isinstance(target, list): + records = target[: config["max_items"]] + out.update( + { + "record_count_returned": len(records), + "record_count_available": len(target), + "truncated": len(records) < len(target), + "records": _compact(records, config["max_items"], config["max_depth"]), + } + ) + else: + out["summary"] = _compact(target, config["max_items"], config["max_depth"]) + if isinstance(target, dict): + out["top_keys"] = list(target)[: config["max_items"]] + return out + + raw_output_path = None + if config["save_raw"]: + raw_output_path = _save_raw_output( + response.text, config["raw_output_path"], config["base_url"], "txt" + ) + text_head = response.text[:800] + return { + "ok": True, + "source": _service_name(config["base_url"]), + "path": config["path"], + "method": config["method"], + "status_code": response.status_code, + "content_type": content_type, + "text_head": None if raw_output_path else text_head, + "text_head_truncated": False + if raw_output_path + else len(text_head) < len(response.text), + "raw_output_path": raw_output_path, + "warnings": [], + } + except ValueError as exc: + return error("invalid_response", str(exc)) + except requests.RequestException as exc: + return error("network_error", f"Request failed: {exc}") + finally: + session.close() + + +def main() -> int: + try: + payload = json.load(sys.stdin) + except 
Exception as exc: # noqa: BLE001 + sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}"))) + return 2 + try: + output = execute(payload) + except ValueError as exc: + output = error("invalid_input", str(exc)) + code = 2 + else: + code = 0 if output.get("ok") else 1 + sys.stdout.write(json.dumps(output)) + return code + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/plugins/life-science-research/skills/chebi-skill/SKILL.md b/plugins/life-science-research/skills/chebi-skill/SKILL.md new file mode 100644 index 0000000..c2b0300 --- /dev/null +++ b/plugins/life-science-research/skills/chebi-skill/SKILL.md @@ -0,0 +1,38 @@ +--- +name: chebi-skill +description: Submit compact ChEBI 2.0 API requests for chemical search, compound lookup, ontology traversal, and structure metadata. Use when a user wants concise ChEBI summaries +--- + +## Operating rules +- Use `scripts/rest_request.py` for all ChEBI calls. +- Use `base_url=https://www.ebi.ac.uk`. +- Prefer the documented public routes under `chebi/backend/api/public/`. +- Start with `es_search/` for free-text lookup and use `compound/<chebi_id>/` for targeted records. +- Re-run requests in long conversations instead of relying on older tool output. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Return raw JSON only if the user explicitly asks for machine-readable output. +- Prefer these paths: `chebi/backend/api/public/es_search/`, `chebi/backend/api/public/compound/<chebi_id>/`, and ontology child or parent routes. + +## Input +- Read one JSON object from stdin. 
+- Required fields: `base_url`, `path` +- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common ChEBI patterns: + - `{"base_url":"https://www.ebi.ac.uk","path":"chebi/backend/api/public/es_search/","params":{"query":"caffeine","size":10},"record_path":"results","max_items":10}` + - `{"base_url":"https://www.ebi.ac.uk","path":"chebi/backend/api/public/compound/CHEBI:27732/"}` + - `{"base_url":"https://www.ebi.ac.uk","path":"chebi/backend/api/public/ontology/children/CHEBI:27732/"}` + +## Output +- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records` or a compact `summary`. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"base_url":"https://www.ebi.ac.uk","path":"chebi/backend/api/public/es_search/","params":{"query":"caffeine","size":10},"record_path":"results","max_items":10}' | python scripts/rest_request.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. 
diff --git a/plugins/life-science-research/skills/chebi-skill/agents/openai.yaml b/plugins/life-science-research/skills/chebi-skill/agents/openai.yaml new file mode 100644 index 0000000..ab9e06b --- /dev/null +++ b/plugins/life-science-research/skills/chebi-skill/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "ChEBI" + short_description: "Fetch ChEBI compound summaries" diff --git a/plugins/life-science-research/skills/chebi-skill/scripts/rest_request.py b/plugins/life-science-research/skills/chebi-skill/scripts/rest_request.py new file mode 100644 index 0000000..4de3cf1 --- /dev/null +++ b/plugins/life-science-research/skills/chebi-skill/scripts/rest_request.py @@ -0,0 +1,291 @@ +#!/usr/bin/env python3 +"""Generic compact REST client for ChatGPT-imported skills.""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path +from typing import Any + +try: + import requests +except ImportError as exc: # pragma: no cover + requests = None + REQUESTS_IMPORT_ERROR = exc +else: + REQUESTS_IMPORT_ERROR = None + + +def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: + return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} + + +def _require_object(name: str, value: Any) -> dict[str, Any]: + if value is None: + return {} + if not isinstance(value, dict): + raise ValueError(f"`{name}` must be an object.") + return value + + +def _require_bool(name: str, value: Any, default: bool) -> bool: + if value is None: + return default + if not isinstance(value, bool): + raise ValueError(f"`{name}` must be a boolean.") + return value + + +def _require_int(name: str, value: Any, default: int) -> int: + if value is None: + return default + if not isinstance(value, int) or value <= 0: + raise ValueError(f"`{name}` must be a positive integer.") + return value + + +def _require_str(name: str, value: Any, required: bool = False) -> str | None: + if value is None: + if 
required: + raise ValueError(f"`{name}` is required.") + return None + if not isinstance(value, str) or not value.strip(): + raise ValueError(f"`{name}` must be a non-empty string.") + return value.strip() + + +def _service_name(base_url: str) -> str: + host = base_url.split("://", 1)[-1].split("/", 1)[0] + return host.replace(".", "-") + + +def _build_url(base_url: str, path: str) -> str: + if path.startswith(("http://", "https://")): + return path + return base_url.rstrip("/") + "/" + path.lstrip("/") + + +def _get_by_path(value: Any, path: str) -> Any: + current = value + for part in path.split("."): + if isinstance(current, list): + if not part.isdigit(): + raise ValueError(f"`record_path` segment {part!r} must be a list index.") + index = int(part) + if index >= len(current): + raise ValueError(f"`record_path` index {index} is out of range.") + current = current[index] + elif isinstance(current, dict): + if part not in current: + raise ValueError(f"`record_path` key {part!r} was not present in the response.") + current = current[part] + else: + raise ValueError(f"`record_path` segment {part!r} could not be applied.") + return current + + +def _infer_target(data: Any) -> tuple[str | None, Any]: + if isinstance(data, list): + return "$", data + if isinstance(data, dict): + embedded = data.get("_embedded") + if isinstance(embedded, dict): + for key, value in embedded.items(): + if isinstance(value, list): + return f"_embedded.{key}", value + for key in ( + "collection", + "results", + "structures", + "activities", + "molecules", + "mechanisms", + "records", + "items", + ): + value = data.get(key) + if isinstance(value, list): + return key, value + return None, data + + +def _compact(value: Any, max_items: int, max_depth: int) -> Any: + if isinstance(value, str): + return value if len(value) <= 240 else value[:240] + "..." + if max_depth <= 0: + if isinstance(value, (dict, list)): + return "..." 
+ return value + if isinstance(value, list): + out = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]] + if len(value) > max_items: + out.append(f"... (+{len(value) - max_items} more)") + return out + if isinstance(value, dict): + out: dict[str, Any] = {} + items = list(value.items()) + for key, item in items[:max_items]: + out[str(key)] = _compact(item, max_items, max_depth - 1) + if len(items) > max_items: + out["_truncated_keys"] = len(items) - max_items + return out + return value + + +def _save_raw_output( + raw_output: str, raw_output_path: str | None, base_url: str, suffix: str +) -> str: + path = Path(raw_output_path or f"/tmp/{_service_name(base_url)}-raw.{suffix}") + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(raw_output, encoding="utf-8") + return str(path) + + +def parse_input(payload: Any) -> dict[str, Any]: + if not isinstance(payload, dict): + raise ValueError("Input must be one JSON object.") + base_url = _require_str("base_url", payload.get("base_url"), required=True) + path = _require_str("path", payload.get("path"), required=True) + method = (_require_str("method", payload.get("method")) or "GET").upper() + if method not in {"GET", "POST"}: + raise ValueError("`method` must be GET or POST.") + json_body = payload.get("json_body") + form_body = payload.get("form_body") + if json_body is not None and form_body is not None: + raise ValueError("Provide only one of `json_body` or `form_body`.") + response_format = ( + _require_str("response_format", payload.get("response_format")) or "auto" + ).lower() + if response_format not in {"auto", "json", "text"}: + raise ValueError("`response_format` must be auto, json, or text.") + return { + "base_url": base_url, + "path": path, + "method": method, + "params": _require_object("params", payload.get("params")), + "headers": _require_object("headers", payload.get("headers")), + "json_body": json_body, + "form_body": _require_object("form_body", form_body) if 
form_body is not None else None, + "record_path": _require_str("record_path", payload.get("record_path")), + "response_format": response_format, + "max_items": _require_int("max_items", payload.get("max_items"), 5), + "max_depth": _require_int("max_depth", payload.get("max_depth"), 3), + "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30), + "save_raw": _require_bool("save_raw", payload.get("save_raw"), False), + "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")), + } + + +def execute(payload: Any) -> dict[str, Any]: + if requests is None: + return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}") + config = parse_input(payload) + session = requests.Session() + session.headers.update(config["headers"]) + url = _build_url(config["base_url"], config["path"]) + + request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]} + if config["json_body"] is not None: + request_kwargs["json"] = config["json_body"] + if config["form_body"] is not None: + request_kwargs["data"] = config["form_body"] + + try: + response = session.request(config["method"], url, **request_kwargs) + response.raise_for_status() + content_type = (response.headers.get("content-type") or "").lower() + wants_json = config["response_format"] == "json" + wants_text = config["response_format"] == "text" + auto_json = not wants_text and ( + "json" in content_type or response.text.lstrip().startswith(("{", "[")) + ) + + if wants_json or auto_json: + data = response.json() + raw_output = json.dumps(data, indent=2) + raw_output_path = None + if config["save_raw"]: + raw_output_path = _save_raw_output( + raw_output, config["raw_output_path"], config["base_url"], "json" + ) + + record_path = config["record_path"] + path_used, target = ( + _infer_target(data) + if record_path is None + else (record_path, _get_by_path(data, record_path)) + ) + out = { + "ok": True, + "source": 
_service_name(config["base_url"]), + "path": config["path"], + "method": config["method"], + "status_code": response.status_code, + "record_path": path_used, + "raw_output_path": raw_output_path, + "warnings": [], + } + if isinstance(target, list): + records = target[: config["max_items"]] + out.update( + { + "record_count_returned": len(records), + "record_count_available": len(target), + "truncated": len(records) < len(target), + "records": _compact(records, config["max_items"], config["max_depth"]), + } + ) + else: + out["summary"] = _compact(target, config["max_items"], config["max_depth"]) + if isinstance(target, dict): + out["top_keys"] = list(target)[: config["max_items"]] + return out + + raw_output_path = None + if config["save_raw"]: + raw_output_path = _save_raw_output( + response.text, config["raw_output_path"], config["base_url"], "txt" + ) + text_head = response.text[:800] + return { + "ok": True, + "source": _service_name(config["base_url"]), + "path": config["path"], + "method": config["method"], + "status_code": response.status_code, + "content_type": content_type, + "text_head": None if raw_output_path else text_head, + "text_head_truncated": False + if raw_output_path + else len(text_head) < len(response.text), + "raw_output_path": raw_output_path, + "warnings": [], + } + except ValueError as exc: + return error("invalid_response", str(exc)) + except requests.RequestException as exc: + return error("network_error", f"Request failed: {exc}") + finally: + session.close() + + +def main() -> int: + try: + payload = json.load(sys.stdin) + except Exception as exc: # noqa: BLE001 + sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}"))) + return 2 + try: + output = execute(payload) + except ValueError as exc: + output = error("invalid_input", str(exc)) + code = 2 + else: + code = 0 if output.get("ok") else 1 + sys.stdout.write(json.dumps(output)) + return code + + +if __name__ == "__main__": + raise SystemExit(main()) 
diff --git a/plugins/life-science-research/skills/chembl-skill/SKILL.md b/plugins/life-science-research/skills/chembl-skill/SKILL.md new file mode 100644 index 0000000..d5518e2 --- /dev/null +++ b/plugins/life-science-research/skills/chembl-skill/SKILL.md @@ -0,0 +1,40 @@ +--- +name: chembl-skill +description: Submit compact ChEMBL API requests for activity, molecule, target, mechanism, and text-search endpoints. Use when a user wants concise ChEMBL summaries +--- + +## Operating rules +- Use `scripts/rest_request.py` for all ChEMBL API calls. +- Use `base_url=https://www.ebi.ac.uk/chembl/api/data`. +- The script accepts `max_items`; for activity, mechanism, and text-search collections, start with API `limit=10` and `max_items=10`. +- Single molecule or target lookups usually do not need `max_items`. +- Re-run requests in long conversations instead of relying on older tool output. +- Treat displayed `...` in tool previews as UI truncation, not literal request content. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Return the script JSON verbatim only if the user explicitly asks for machine-readable output. +- Prefer these paths: `activity.json`, `molecule/<chembl_id>.json`, `target/<chembl_id>.json`, `mechanism.json`, and `molecule/search.json`. +- Use `record_path` to target list fields like `activities`, `mechanisms`, or `molecules`. + +## Input +- Read one JSON object from stdin. 
+- Required fields: `base_url`, `path` +- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common ChEMBL patterns: + - `{"base_url":"https://www.ebi.ac.uk/chembl/api/data","path":"activity.json","params":{"molecule_chembl_id":"CHEMBL25","limit":10},"record_path":"activities","max_items":10}` + - `{"base_url":"https://www.ebi.ac.uk/chembl/api/data","path":"molecule/CHEMBL25.json"}` + - `{"base_url":"https://www.ebi.ac.uk/chembl/api/data","path":"molecule/search.json","params":{"q":"imatinib","limit":10},"record_path":"molecules","max_items":10}` + +## Output +- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records` or a compact `summary`. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"base_url":"https://www.ebi.ac.uk/chembl/api/data","path":"activity.json","params":{"molecule_chembl_id":"CHEMBL25","limit":10},"record_path":"activities","max_items":10}' | python scripts/rest_request.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. 
diff --git a/plugins/life-science-research/skills/chembl-skill/agents/openai.yaml b/plugins/life-science-research/skills/chembl-skill/agents/openai.yaml new file mode 100644 index 0000000..c033fd0 --- /dev/null +++ b/plugins/life-science-research/skills/chembl-skill/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "ChEMBL" + short_description: "Fetch ChEMBL activity and target summaries" diff --git a/plugins/life-science-research/skills/chembl-skill/scripts/rest_request.py b/plugins/life-science-research/skills/chembl-skill/scripts/rest_request.py new file mode 100644 index 0000000..4de3cf1 --- /dev/null +++ b/plugins/life-science-research/skills/chembl-skill/scripts/rest_request.py @@ -0,0 +1,291 @@ +#!/usr/bin/env python3 +"""Generic compact REST client for ChatGPT-imported skills.""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path +from typing import Any + +try: + import requests +except ImportError as exc: # pragma: no cover + requests = None + REQUESTS_IMPORT_ERROR = exc +else: + REQUESTS_IMPORT_ERROR = None + + +def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: + return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} + + +def _require_object(name: str, value: Any) -> dict[str, Any]: + if value is None: + return {} + if not isinstance(value, dict): + raise ValueError(f"`{name}` must be an object.") + return value + + +def _require_bool(name: str, value: Any, default: bool) -> bool: + if value is None: + return default + if not isinstance(value, bool): + raise ValueError(f"`{name}` must be a boolean.") + return value + + +def _require_int(name: str, value: Any, default: int) -> int: + if value is None: + return default + if not isinstance(value, int) or value <= 0: + raise ValueError(f"`{name}` must be a positive integer.") + return value + + +def _require_str(name: str, value: Any, required: bool = False) -> str | None: + if 
value is None: + if required: + raise ValueError(f"`{name}` is required.") + return None + if not isinstance(value, str) or not value.strip(): + raise ValueError(f"`{name}` must be a non-empty string.") + return value.strip() + + +def _service_name(base_url: str) -> str: + host = base_url.split("://", 1)[-1].split("/", 1)[0] + return host.replace(".", "-") + + +def _build_url(base_url: str, path: str) -> str: + if path.startswith(("http://", "https://")): + return path + return base_url.rstrip("/") + "/" + path.lstrip("/") + + +def _get_by_path(value: Any, path: str) -> Any: + current = value + for part in path.split("."): + if isinstance(current, list): + if not part.isdigit(): + raise ValueError(f"`record_path` segment {part!r} must be a list index.") + index = int(part) + if index >= len(current): + raise ValueError(f"`record_path` index {index} is out of range.") + current = current[index] + elif isinstance(current, dict): + if part not in current: + raise ValueError(f"`record_path` key {part!r} was not present in the response.") + current = current[part] + else: + raise ValueError(f"`record_path` segment {part!r} could not be applied.") + return current + + +def _infer_target(data: Any) -> tuple[str | None, Any]: + if isinstance(data, list): + return "$", data + if isinstance(data, dict): + embedded = data.get("_embedded") + if isinstance(embedded, dict): + for key, value in embedded.items(): + if isinstance(value, list): + return f"_embedded.{key}", value + for key in ( + "collection", + "results", + "structures", + "activities", + "molecules", + "mechanisms", + "records", + "items", + ): + value = data.get(key) + if isinstance(value, list): + return key, value + return None, data + + +def _compact(value: Any, max_items: int, max_depth: int) -> Any: + if isinstance(value, str): + return value if len(value) <= 240 else value[:240] + "..." + if max_depth <= 0: + if isinstance(value, (dict, list)): + return "..." 
+ return value + if isinstance(value, list): + out = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]] + if len(value) > max_items: + out.append(f"... (+{len(value) - max_items} more)") + return out + if isinstance(value, dict): + out: dict[str, Any] = {} + items = list(value.items()) + for key, item in items[:max_items]: + out[str(key)] = _compact(item, max_items, max_depth - 1) + if len(items) > max_items: + out["_truncated_keys"] = len(items) - max_items + return out + return value + + +def _save_raw_output( + raw_output: str, raw_output_path: str | None, base_url: str, suffix: str +) -> str: + path = Path(raw_output_path or f"/tmp/{_service_name(base_url)}-raw.{suffix}") + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(raw_output, encoding="utf-8") + return str(path) + + +def parse_input(payload: Any) -> dict[str, Any]: + if not isinstance(payload, dict): + raise ValueError("Input must be one JSON object.") + base_url = _require_str("base_url", payload.get("base_url"), required=True) + path = _require_str("path", payload.get("path"), required=True) + method = (_require_str("method", payload.get("method")) or "GET").upper() + if method not in {"GET", "POST"}: + raise ValueError("`method` must be GET or POST.") + json_body = payload.get("json_body") + form_body = payload.get("form_body") + if json_body is not None and form_body is not None: + raise ValueError("Provide only one of `json_body` or `form_body`.") + response_format = ( + _require_str("response_format", payload.get("response_format")) or "auto" + ).lower() + if response_format not in {"auto", "json", "text"}: + raise ValueError("`response_format` must be auto, json, or text.") + return { + "base_url": base_url, + "path": path, + "method": method, + "params": _require_object("params", payload.get("params")), + "headers": _require_object("headers", payload.get("headers")), + "json_body": json_body, + "form_body": _require_object("form_body", form_body) if 
form_body is not None else None, + "record_path": _require_str("record_path", payload.get("record_path")), + "response_format": response_format, + "max_items": _require_int("max_items", payload.get("max_items"), 5), + "max_depth": _require_int("max_depth", payload.get("max_depth"), 3), + "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30), + "save_raw": _require_bool("save_raw", payload.get("save_raw"), False), + "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")), + } + + +def execute(payload: Any) -> dict[str, Any]: + if requests is None: + return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}") + config = parse_input(payload) + session = requests.Session() + session.headers.update(config["headers"]) + url = _build_url(config["base_url"], config["path"]) + + request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]} + if config["json_body"] is not None: + request_kwargs["json"] = config["json_body"] + if config["form_body"] is not None: + request_kwargs["data"] = config["form_body"] + + try: + response = session.request(config["method"], url, **request_kwargs) + response.raise_for_status() + content_type = (response.headers.get("content-type") or "").lower() + wants_json = config["response_format"] == "json" + wants_text = config["response_format"] == "text" + auto_json = not wants_text and ( + "json" in content_type or response.text.lstrip().startswith(("{", "[")) + ) + + if wants_json or auto_json: + data = response.json() + raw_output = json.dumps(data, indent=2) + raw_output_path = None + if config["save_raw"]: + raw_output_path = _save_raw_output( + raw_output, config["raw_output_path"], config["base_url"], "json" + ) + + record_path = config["record_path"] + path_used, target = ( + _infer_target(data) + if record_path is None + else (record_path, _get_by_path(data, record_path)) + ) + out = { + "ok": True, + "source": 
_service_name(config["base_url"]), + "path": config["path"], + "method": config["method"], + "status_code": response.status_code, + "record_path": path_used, + "raw_output_path": raw_output_path, + "warnings": [], + } + if isinstance(target, list): + records = target[: config["max_items"]] + out.update( + { + "record_count_returned": len(records), + "record_count_available": len(target), + "truncated": len(records) < len(target), + "records": _compact(records, config["max_items"], config["max_depth"]), + } + ) + else: + out["summary"] = _compact(target, config["max_items"], config["max_depth"]) + if isinstance(target, dict): + out["top_keys"] = list(target)[: config["max_items"]] + return out + + raw_output_path = None + if config["save_raw"]: + raw_output_path = _save_raw_output( + response.text, config["raw_output_path"], config["base_url"], "txt" + ) + text_head = response.text[:800] + return { + "ok": True, + "source": _service_name(config["base_url"]), + "path": config["path"], + "method": config["method"], + "status_code": response.status_code, + "content_type": content_type, + "text_head": None if raw_output_path else text_head, + "text_head_truncated": False + if raw_output_path + else len(text_head) < len(response.text), + "raw_output_path": raw_output_path, + "warnings": [], + } + except ValueError as exc: + return error("invalid_response", str(exc)) + except requests.RequestException as exc: + return error("network_error", f"Request failed: {exc}") + finally: + session.close() + + +def main() -> int: + try: + payload = json.load(sys.stdin) + except Exception as exc: # noqa: BLE001 + sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}"))) + return 2 + try: + output = execute(payload) + except ValueError as exc: + output = error("invalid_input", str(exc)) + code = 2 + else: + code = 0 if output.get("ok") else 1 + sys.stdout.write(json.dumps(output)) + return code + + +if __name__ == "__main__": + raise SystemExit(main()) 
diff --git a/plugins/life-science-research/skills/civic-skill/SKILL.md b/plugins/life-science-research/skills/civic-skill/SKILL.md new file mode 100644 index 0000000..91801c2 --- /dev/null +++ b/plugins/life-science-research/skills/civic-skill/SKILL.md @@ -0,0 +1,35 @@ +--- +name: civic-skill +description: Submit compact CIViC GraphQL requests for cancer variant interpretation schema inspection and targeted evidence retrieval. Use when a user wants concise CIViC summaries +--- + +## Operating rules +- Use `scripts/civic_graphql.py` for all CIViC GraphQL work. +- Keep selection sets narrow and start with schema or targeted entity queries. +- Use `query_path` for longer GraphQL documents instead of pasting large inline queries. +- Re-run requests in long conversations instead of relying on older tool output. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Return raw JSON only if the user explicitly asks for machine-readable output. +- Prefer sanity, schema, and targeted evidence queries over broad graph dumps. + +## Input +- Read one JSON object from stdin. +- Required field: `query` or `query_path` +- Optional fields: `variables`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common CIViC patterns: + - `{"query":"query { __typename }"}` + - `{"query":"query { __schema { queryType { fields { name } } } }","max_items":20}` + +## Output +- Success returns `ok`, `source`, `top_keys`, a compact `summary`, and `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` such as `invalid_json`, `invalid_input`, `network_error`, `invalid_response`, or `graphql_error`. + +## Execution +```bash +echo '{"query":"query { __typename }"}' | python scripts/civic_graphql.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/civic_graphql.py`. 
diff --git a/plugins/life-science-research/skills/civic-skill/agents/openai.yaml b/plugins/life-science-research/skills/civic-skill/agents/openai.yaml new file mode 100644 index 0000000..7518caa --- /dev/null +++ b/plugins/life-science-research/skills/civic-skill/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "CIViC" + short_description: "Fetch CIViC cancer variant summaries" diff --git a/plugins/life-science-research/skills/civic-skill/scripts/civic_graphql.py b/plugins/life-science-research/skills/civic-skill/scripts/civic_graphql.py new file mode 100644 index 0000000..5f9dea7 --- /dev/null +++ b/plugins/life-science-research/skills/civic-skill/scripts/civic_graphql.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +"""Compact CIViC GraphQL client for ChatGPT-imported skills.""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path +from typing import Any + +try: + import requests +except ImportError as exc: # pragma: no cover + requests = None + REQUESTS_IMPORT_ERROR = exc +else: + REQUESTS_IMPORT_ERROR = None + +ENDPOINT = "https://civicdb.org/api/graphql" + + +def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: + return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} + + +def _compact(value: Any, max_items: int, max_depth: int) -> Any: + if isinstance(value, str): + return value if len(value) <= 240 else value[:240] + "..." + if max_depth <= 0: + if isinstance(value, (dict, list)): + return "..." + return value + if isinstance(value, list): + out = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]] + if len(value) > max_items: + out.append(f"... 
(+{len(value) - max_items} more)") + return out + if isinstance(value, dict): + out: dict[str, Any] = {} + items = list(value.items()) + for key, item in items[:max_items]: + out[str(key)] = _compact(item, max_items, max_depth - 1) + if len(items) > max_items: + out["_truncated_keys"] = len(items) - max_items + return out + return value + + +def parse_input(payload: Any) -> dict[str, Any]: + if not isinstance(payload, dict): + raise ValueError("Input must be one JSON object.") + query = payload.get("query") + query_path = payload.get("query_path") + if query is None and query_path is None: + raise ValueError("Provide `query` or `query_path`.") + if query is not None and (not isinstance(query, str) or not query.strip()): + raise ValueError("`query` must be a non-empty string.") + if query_path is not None and (not isinstance(query_path, str) or not query_path.strip()): + raise ValueError("`query_path` must be a non-empty string.") + variables = payload.get("variables") or {} + if not isinstance(variables, dict): + raise ValueError("`variables` must be an object.") + save_raw = payload.get("save_raw", False) + if not isinstance(save_raw, bool): + raise ValueError("`save_raw` must be a boolean.") + raw_output_path = payload.get("raw_output_path") + if raw_output_path is not None and ( + not isinstance(raw_output_path, str) or not raw_output_path.strip() + ): + raise ValueError("`raw_output_path` must be a non-empty string.") + for key in ("max_items", "max_depth", "timeout_sec"): + value = payload.get(key, 5 if key == "max_items" else 3 if key == "max_depth" else 60) + if not isinstance(value, int) or value <= 0: + raise ValueError(f"`{key}` must be a positive integer.") + payload[key] = value + query_text = ( + query.strip() if isinstance(query, str) else Path(query_path).read_text(encoding="utf-8") + ) + return { + "query": query_text, + "variables": variables, + "max_items": payload["max_items"], + "max_depth": payload["max_depth"], + "timeout_sec": 
payload["timeout_sec"], + "save_raw": save_raw, + "raw_output_path": raw_output_path.strip() if isinstance(raw_output_path, str) else None, + } + + +def execute(payload: Any) -> dict[str, Any]: + if requests is None: + return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}") + config = parse_input(payload) + try: + response = requests.post( + ENDPOINT, + json={"query": config["query"], "variables": config["variables"]}, + timeout=config["timeout_sec"], + ) + response.raise_for_status() + data = response.json() + except ValueError as exc: + return error("invalid_response", str(exc)) + except requests.RequestException as exc: + return error("network_error", f"GraphQL request failed: {exc}") + + raw_output_path = None + if config["save_raw"]: + raw_text = json.dumps(data, indent=2) + path = Path(config["raw_output_path"] or "/tmp/civic-graphql.json") + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(raw_text, encoding="utf-8") + raw_output_path = str(path) + + if "errors" in data: + return error( + "graphql_error", + json.dumps(data["errors"])[:500], + warnings=[f"raw_output_path={raw_output_path}"] if raw_output_path else [], + ) + + payload_data = data.get("data") + if not isinstance(payload_data, dict): + return error("invalid_response", "GraphQL response did not include a `data` object.") + + return { + "ok": True, + "source": "civic-graphql", + "top_keys": list(payload_data)[: config["max_items"]], + "summary": _compact(payload_data, config["max_items"], config["max_depth"]), + "raw_output_path": raw_output_path, + "warnings": [], + } + + +def main() -> int: + try: + payload = json.load(sys.stdin) + except Exception as exc: # noqa: BLE001 + sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}"))) + return 2 + try: + output = execute(payload) + except ValueError as exc: + output = error("invalid_input", str(exc)) + code = 2 + else: + code = 0 if output.get("ok") else 1 + 
sys.stdout.write(json.dumps(output)) + return code + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/plugins/life-science-research/skills/clinicaltrials-skill/SKILL.md b/plugins/life-science-research/skills/clinicaltrials-skill/SKILL.md new file mode 100644 index 0000000..7f1911d --- /dev/null +++ b/plugins/life-science-research/skills/clinicaltrials-skill/SKILL.md @@ -0,0 +1,40 @@ +--- +name: clinicaltrials-skill +description: Submit compact ClinicalTrials.gov API v2 requests for study search, metadata, enums, search areas, and field statistics. Use when a user wants concise ClinicalTrials.gov summaries +--- + +## Operating rules +- Use `scripts/clinicaltrials_client.py` for all ClinicalTrials.gov v2 calls. +- Study searches are better with `max_items=10` and `max_pages=1`; only increase pages when the user explicitly wants more than the first page. +- Use targeted `params` instead of broad unfiltered study dumps. +- Re-run requests in long conversations instead of relying on older tool output. +- Treat displayed `...` in tool previews as UI truncation, not literal request content. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Prefer `action=studies` for search and `action=metadata|search_areas|enums|stats_size|field_values|field_sizes` for API introspection and field stats. +- If the user needs full pages or aggregated responses, set `save_raw=true` and report the saved file path. + +## Input +- Read one JSON object from stdin. 
+- Required field: `action` (always pass it explicitly; the script falls back to `studies` when it is omitted) +- Supported actions: `studies`, `metadata`, `search_areas`, `enums`, `stats_size`, `field_values`, `field_sizes`, `request` +- Optional fields: `path` (required when `action=request`), `params`, `max_items`, `max_depth`, `max_pages`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common ClinicalTrials.gov patterns: + - `{"action":"studies","params":{"query.cond":"prostate cancer","filter.overallStatus":"RECRUITING","pageSize":10},"max_items":10,"max_pages":1}` + - `{"action":"metadata"}` + - `{"action":"field_values","params":{"field":"protocolSection.identificationModule.organization.fullName"}}` + +## Output +- `action=studies` returns `pages_fetched`, `next_page_token`, count metadata, and compact `records`. +- Other actions return either compact `records` or a compact `summary`. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"action":"studies","params":{"query.cond":"prostate cancer","filter.overallStatus":"RECRUITING","pageSize":10},"max_items":10,"max_pages":1}' | python scripts/clinicaltrials_client.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/clinicaltrials_client.py`. 
diff --git a/plugins/life-science-research/skills/clinicaltrials-skill/agents/openai.yaml b/plugins/life-science-research/skills/clinicaltrials-skill/agents/openai.yaml
new file mode 100644
index 0000000..d0612ed
--- /dev/null
+++ b/plugins/life-science-research/skills/clinicaltrials-skill/agents/openai.yaml
@@ -0,0 +1,3 @@
+interface:
+  display_name: "ClinicalTrials.gov"
+  short_description: "Fetch ClinicalTrials.gov study summaries"
diff --git a/plugins/life-science-research/skills/clinicaltrials-skill/scripts/clinicaltrials_client.py b/plugins/life-science-research/skills/clinicaltrials-skill/scripts/clinicaltrials_client.py
new file mode 100644
index 0000000..5e399f9
--- /dev/null
+++ b/plugins/life-science-research/skills/clinicaltrials-skill/scripts/clinicaltrials_client.py
@@ -0,0 +1,276 @@
+#!/usr/bin/env python3
+"""Compact ClinicalTrials.gov v2 helper for ChatGPT-imported skills."""
+
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+from typing import Any
+
+try:
+    import requests
+except ImportError as exc:  # pragma: no cover
+    requests = None
+    REQUESTS_IMPORT_ERROR = exc
+else:
+    REQUESTS_IMPORT_ERROR = None
+
+BASE_URL = "https://clinicaltrials.gov/api/v2"
+PATHS = {
+    "studies": "/studies",
+    "metadata": "/studies/metadata",
+    "search_areas": "/studies/search-areas",
+    "enums": "/studies/enums",
+    "stats_size": "/stats/size",
+    "field_values": "/stats/field/values",
+    "field_sizes": "/stats/field/sizes",
+}
+
+
+def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]:
+    """Build the `ok=false` envelope that is printed as JSON on failure."""
+    return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []}
+
+
+def _compact(value: Any, max_items: int, max_depth: int) -> Any:
+    """Return a size-bounded copy of `value` for display.
+
+    Strings longer than 240 characters are cut and suffixed with "...", lists and dicts keep
+    at most `max_items` entries, and containers nested deeper than `max_depth` become "...".
+    """
+    if isinstance(value, str):
+        return value if len(value) <= 240 else value[:240] + "..."
+    if max_depth <= 0:
+        if isinstance(value, (dict, list)):
+            return "..."
+        return value
+    if isinstance(value, list):
+        out = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]]
+        if len(value) > max_items:
+            out.append(f"... (+{len(value) - max_items} more)")
+        return out
+    if isinstance(value, dict):
+        out: dict[str, Any] = {}
+        items = list(value.items())
+        for key, item in items[:max_items]:
+            out[str(key)] = _compact(item, max_items, max_depth - 1)
+        if len(items) > max_items:
+            out["_truncated_keys"] = len(items) - max_items
+        return out
+    return value
+
+
+def _require_int(name: str, value: Any, default: int) -> int:
+    """Return `value` if it is a positive integer, or `default` when it is None.
+
+    Raises ValueError otherwise. Booleans are rejected explicitly: `bool` is a subclass of
+    `int`, so JSON `true` would otherwise be accepted as 1.
+    """
+    if value is None:
+        return default
+    if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
+        raise ValueError(f"`{name}` must be a positive integer.")
+    return value
+
+
+def _require_bool(name: str, value: Any, default: bool) -> bool:
+    """Return `value` if it is a boolean, or `default` when it is None; else raise ValueError."""
+    if value is None:
+        return default
+    if not isinstance(value, bool):
+        raise ValueError(f"`{name}` must be a boolean.")
+    return value
+
+
+def parse_input(payload: Any) -> dict[str, Any]:
+    """Validate the stdin payload and return the normalized request config.
+
+    Raises ValueError with a user-facing message when a field is malformed.
+    """
+    if not isinstance(payload, dict):
+        raise ValueError("Input must be one JSON object.")
+    action = payload.get("action", "studies")
+    # Check the type first: an unhashable value (list/object) would make the set lookup raise
+    # TypeError instead of the ValueError that `main` reports as invalid input.
+    if not isinstance(action, str) or action not in set(PATHS) | {"request"}:
+        raise ValueError(
+            "`action` must be one of: studies, metadata, search_areas, enums, stats_size, field_values, field_sizes, request."
+        )
+    params = payload.get("params") or {}
+    if not isinstance(params, dict):
+        raise ValueError("`params` must be an object.")
+    if action == "request":
+        path = payload.get("path")
+        if not isinstance(path, str) or not path.strip():
+            raise ValueError("`path` is required when `action=request`.")
+        path = path.strip()
+        # BASE_URL has no trailing slash, so a relative path needs a leading one.
+        if not path.startswith("/"):
+            path = "/" + path
+    else:
+        # `path` only applies to `action=request`; named actions always use their endpoint.
+        path = PATHS[action]
+    raw_output_path = payload.get("raw_output_path")
+    if raw_output_path is not None and not isinstance(raw_output_path, str):
+        raise ValueError("`raw_output_path` must be a string.")
+    return {
+        "action": action,
+        "path": path,
+        "params": params,
+        "max_items": _require_int("max_items", payload.get("max_items"), 10),
+        "max_depth": _require_int("max_depth", payload.get("max_depth"), 3),
+        "max_pages": _require_int("max_pages", payload.get("max_pages"), 1),
+        "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30),
+        "save_raw": _require_bool("save_raw", payload.get("save_raw"), False),
+        "raw_output_path": raw_output_path,
+    }
+
+
+def _save_raw(data: Any, raw_output_path: str | None) -> str:
+    """Write `data` as indented JSON and return the path used (default is under /tmp)."""
+    path = Path(raw_output_path or "/tmp/clinicaltrials-response.json")
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(json.dumps(data, indent=2), encoding="utf-8")
+    return str(path)
+
+
+def execute(payload: Any) -> dict[str, Any]:
+    """Run one ClinicalTrials.gov request and return the compact result envelope.
+
+    Raises ValueError for invalid input (from `parse_input`). HTTP errors, unexpected response
+    shapes and file-write failures are returned as `ok=false` envelopes instead.
+    """
+    if requests is None:
+        return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}")
+    config = parse_input(payload)
+    session = requests.Session()
+    session.headers.update({"Accept": "application/json"})
+    try:
+        if config["action"] == "studies":
+            next_page_token = None
+            if isinstance(config["params"].get("pageToken"), str):
+                next_page_token = config["params"]["pageToken"]
+            studies: list[Any] = []
+            pages: list[dict[str, Any]] = []
+            total_count: int | None = None
+            pages_fetched = 0
+            for _ in range(config["max_pages"]):
+                params = dict(config["params"])
+                if next_page_token:
+                    params["pageToken"] = next_page_token
+                response = session.get(
+                    BASE_URL + config["path"], params=params, timeout=config["timeout_sec"]
+                )
+                response.raise_for_status()
+                page = response.json()
+                if not isinstance(page, dict):
+                    # Without this guard `page.get` would raise an uncaught AttributeError.
+                    raise ValueError("Studies response was not a JSON object.")
+                pages.append(page)
+                pages_fetched += 1
+                page_studies = page.get("studies") if isinstance(page.get("studies"), list) else []
+                studies.extend(page_studies)
+                if isinstance(page.get("totalCount"), int):
+                    total_count = page["totalCount"]
+                next_page_token = (
+                    page.get("nextPageToken")
+                    if isinstance(page.get("nextPageToken"), str)
+                    else None
+                )
+                if not next_page_token:
+                    break
+            raw_output_path = (
+                _save_raw(pages if len(pages) > 1 else pages[0], config["raw_output_path"])
+                if config["save_raw"] and pages
+                else None
+            )
+            available = total_count if isinstance(total_count, int) else len(studies)
+            return {
+                "ok": True,
+                "source": "clinicaltrials-v2",
+                "action": "studies",
+                "pages_fetched": pages_fetched,
+                "next_page_token": next_page_token,
+                "record_count_returned": min(len(studies), config["max_items"]),
+                "record_count_available": available,
+                "truncated": len(studies) > config["max_items"] or next_page_token is not None,
+                "records": _compact(
+                    studies[: config["max_items"]], config["max_items"], config["max_depth"]
+                ),
+                "raw_output_path": raw_output_path,
+                "warnings": [],
+            }
+
+        response = session.get(
+            BASE_URL + config["path"], params=config["params"], timeout=config["timeout_sec"]
+        )
+        response.raise_for_status()
+        data = response.json()
+        raw_output_path = _save_raw(data, config["raw_output_path"]) if config["save_raw"] else None
+        target = data
+        top_keys = list(data)[: config["max_items"]] if isinstance(data, dict) else None
+        if isinstance(data, dict):
+            # Prefer the first well-known list field as the record set.
+            for key in ("studies", "fields", "areas", "enums", "values", "sizes"):
+                value = data.get(key)
+                if isinstance(value, list):
+                    target = value
+                    break
+        output = {
+            "ok": True,
+            "source": "clinicaltrials-v2",
+            "action": config["action"],
+            "raw_output_path": raw_output_path,
+            "warnings": [],
+        }
+        if isinstance(target, list):
+            output.update(
+                {
+                    "record_count_returned": min(len(target), config["max_items"]),
+                    "record_count_available": len(target),
+                    "truncated": len(target) > config["max_items"],
+                    "records": _compact(
+                        target[: config["max_items"]], config["max_items"], config["max_depth"]
+                    ),
+                }
+            )
+        else:
+            output["summary"] = _compact(target, config["max_items"], config["max_depth"])
+            output["top_keys"] = top_keys
+        return output
+    except ValueError as exc:
+        return error("invalid_response", str(exc))
+    except requests.RequestException as exc:
+        return error("network_error", f"Request failed: {exc}")
+    except OSError as exc:
+        # `requests.RequestException` derives from OSError, so this clause must come after it;
+        # what reaches here is a local failure such as an unwritable `raw_output_path`.
+        return error("io_error", f"I/O failure: {exc}")
+    finally:
+        session.close()
+
+
+def main() -> int:
+    """Read one JSON object from stdin, print the result as JSON, and return the exit code.
+
+    Exit codes: 0 on success, 1 when the request failed, 2 for unparseable or invalid input.
+    """
+    try:
+        payload = json.load(sys.stdin)
+    except Exception as exc:  # noqa: BLE001
+        sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}")))
+        return 2
+    try:
+        output = execute(payload)
+    except ValueError as exc:
+        output = error("invalid_input", str(exc))
+        code = 2
+    else:
+        code = 0 if output.get("ok") else 1
+    sys.stdout.write(json.dumps(output))
+    return code
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/plugins/life-science-research/skills/clinvar-variation-skill/SKILL.md b/plugins/life-science-research/skills/clinvar-variation-skill/SKILL.md
new file mode 100644
index 0000000..e2bc215
--- /dev/null
+++ b/plugins/life-science-research/skills/clinvar-variation-skill/SKILL.md
@@ -0,0 +1,43 @@
+---
+name: clinvar-variation-skill
+description: Submit compact ClinVar Clinical Tables and NCBI Variation requests for search, VCV, RCV, SCV, and RefSNP lookups. Use when a user wants variant-level summaries or identifier mapping
+---
+
+## Operating rules
+- Use `scripts/clinvar_variation.py` for all ClinVar and NCBI Variation work.
+- The script accepts `max_items`; for `action=search`, start around `max_items=10`.
+- For `vcv`, `rcv`, `scv`, and `refsnp`, omit `max_items` unless you need to trim nested arrays in the summary.
+- Re-run requests in long conversations instead of relying on prior tool output.
+- Treat displayed `...` in tool previews as UI truncation, not literal request content.
+- If the user asks for full JSON, set `save_raw=true` and report the saved file path instead of pasting large payloads into chat. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Return the JSON verbatim only if the user explicitly asks for machine-readable output. +- Use `action=search` for the Clinical Tables endpoint. +- Use `action=vcv|rcv|scv|refsnp` for NCBI Variation beta objects. + +## Input +- Read one JSON object from stdin. +- Required field: `action` +- Action-specific required fields: + - `search`: `terms` + - `vcv`: `vcv` + - `rcv`: `rcv` + - `scv`: `scv` + - `refsnp`: `refsnp` +- Optional fields: `params`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` + +## Output +- `search` returns `total`, `identifiers`, `display_rows`, `extra_fields`, and truncation metadata. +- `vcv|rcv|scv|refsnp` return a compact `summary` and optional `top_keys`. +- Use `raw_output_path` when `save_raw=true`. +- Failures return `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"action":"search","terms":"VCV000013080","max_items":10}' | python scripts/clinvar_variation.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/clinvar_variation.py`. 
diff --git a/plugins/life-science-research/skills/clinvar-variation-skill/agents/openai.yaml b/plugins/life-science-research/skills/clinvar-variation-skill/agents/openai.yaml
new file mode 100644
index 0000000..6238e40
--- /dev/null
+++ b/plugins/life-science-research/skills/clinvar-variation-skill/agents/openai.yaml
@@ -0,0 +1,3 @@
+interface:
+  display_name: "ClinVar / NCBI Variation"
+  short_description: "Fetch ClinVar and NCBI Variation summaries"
diff --git a/plugins/life-science-research/skills/clinvar-variation-skill/scripts/clinvar_variation.py b/plugins/life-science-research/skills/clinvar-variation-skill/scripts/clinvar_variation.py
new file mode 100644
index 0000000..ce5d23b
--- /dev/null
+++ b/plugins/life-science-research/skills/clinvar-variation-skill/scripts/clinvar_variation.py
@@ -0,0 +1,237 @@
+#!/usr/bin/env python3
+"""Compact ClinVar + NCBI Variation helper for ChatGPT-imported skills."""
+
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+from typing import Any
+
+try:
+    import requests
+except ImportError as exc:  # pragma: no cover
+    requests = None
+    REQUESTS_IMPORT_ERROR = exc
+else:
+    REQUESTS_IMPORT_ERROR = None
+
+CLINICAL_TABLES_URL = "https://clinicaltables.nlm.nih.gov/api/variants/v4/search"
+VARIATION_BASE = "https://api.ncbi.nlm.nih.gov/variation/v0/beta"
+
+
+def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]:
+    """Build the `ok=false` envelope that is printed as JSON on failure."""
+    return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []}
+
+
+def _compact(value: Any, max_items: int, max_depth: int) -> Any:
+    """Return a size-bounded copy of `value` for display.
+
+    Strings longer than 240 characters are cut and suffixed with "...", lists and dicts keep
+    at most `max_items` entries, and containers nested deeper than `max_depth` become "...".
+    """
+    if isinstance(value, str):
+        return value if len(value) <= 240 else value[:240] + "..."
+    if max_depth <= 0:
+        if isinstance(value, (dict, list)):
+            return "..."
+        return value
+    if isinstance(value, list):
+        out = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]]
+        if len(value) > max_items:
+            out.append(f"... (+{len(value) - max_items} more)")
+        return out
+    if isinstance(value, dict):
+        out: dict[str, Any] = {}
+        items = list(value.items())
+        for key, item in items[:max_items]:
+            out[str(key)] = _compact(item, max_items, max_depth - 1)
+        if len(items) > max_items:
+            out["_truncated_keys"] = len(items) - max_items
+        return out
+    return value
+
+
+def _strip_prefix(text: str, prefix: str) -> str:
+    """Remove one leading `prefix` from `text`, ignoring case; otherwise return `text` as is.
+
+    `str.lstrip(prefix)` is not equivalent: it strips any run of the listed characters.
+    """
+    if text[: len(prefix)].lower() == prefix.lower():
+        return text[len(prefix) :]
+    return text
+
+
+def parse_input(payload: Any) -> dict[str, Any]:
+    """Validate the stdin payload and return the normalized request config.
+
+    Raises ValueError with a user-facing message when a field is malformed.
+    """
+    if not isinstance(payload, dict):
+        raise ValueError("Input must be one JSON object.")
+    action = payload.get("action")
+    # Type-check first: an unhashable value would make the set lookup raise TypeError.
+    if not isinstance(action, str) or action not in {"search", "vcv", "rcv", "scv", "refsnp"}:
+        raise ValueError("`action` must be one of: search, vcv, rcv, scv, refsnp.")
+    max_items = payload.get("max_items", 5)
+    max_depth = payload.get("max_depth", 3)
+    timeout_sec = payload.get("timeout_sec", 30)
+    save_raw = payload.get("save_raw", False)
+    for name, value in {
+        "max_items": max_items,
+        "max_depth": max_depth,
+        "timeout_sec": timeout_sec,
+    }.items():
+        # `bool` is a subclass of `int`; reject it so JSON `true` is not read as 1.
+        if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
+            raise ValueError(f"`{name}` must be a positive integer.")
+    if not isinstance(save_raw, bool):
+        raise ValueError("`save_raw` must be a boolean.")
+    raw_output_path = payload.get("raw_output_path")
+    if raw_output_path is not None and (
+        not isinstance(raw_output_path, str) or not raw_output_path.strip()
+    ):
+        raise ValueError("`raw_output_path` must be a non-empty string.")
+    return {
+        "action": action,
+        "terms": payload.get("terms"),
+        "vcv": payload.get("vcv"),
+        "rcv": payload.get("rcv"),
+        "scv": payload.get("scv"),
+        "refsnp": payload.get("refsnp"),
+        "params": payload.get("params") or {},
+        "max_items": max_items,
+        "max_depth": max_depth,
+        "timeout_sec": timeout_sec,
+        "save_raw": save_raw,
+        "raw_output_path": raw_output_path.strip() if isinstance(raw_output_path, str) else None,
+    }
+
+
+def _save(raw_output: str, raw_output_path: str | None) -> str:
+    """Write `raw_output` to `raw_output_path` (default under /tmp) and return the path used."""
+    path = Path(raw_output_path or "/tmp/clinvar-variation-raw.json")
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(raw_output, encoding="utf-8")
+    return str(path)
+
+
+def _build_request(config: dict[str, Any]) -> tuple[str, dict[str, Any] | None]:
+    """Return the `(url, query_params)` pair for a validated config.
+
+    Raises ValueError when the field required by the chosen action is missing or malformed.
+    """
+    action = config["action"]
+    if action == "search":
+        terms = config["terms"]
+        if not isinstance(terms, str) or not terms.strip():
+            raise ValueError("`terms` is required for `search`.")
+        if not isinstance(config["params"], dict):
+            raise ValueError("`params` must be an object.")
+        params = {"terms": terms.strip(), "maxList": config["max_items"]}
+        # Caller-supplied params are applied last, so they may override the defaults above.
+        params.update(config["params"])
+        return CLINICAL_TABLES_URL, params
+    identifier = config[action]
+    if (
+        isinstance(identifier, bool)
+        or not isinstance(identifier, (str, int))
+        or not str(identifier).strip()
+    ):
+        raise ValueError(f"`{action}` is required for `{action}`.")
+    text = str(identifier).strip()
+    if action == "vcv":
+        return f"{VARIATION_BASE}/clinvar/variation/{_strip_prefix(text, 'VCV')}", None
+    if action == "refsnp":
+        return f"{VARIATION_BASE}/refsnp/{_strip_prefix(text, 'rs')}", None
+    # `rcv` and `scv` identifiers are passed through unchanged.
+    return f"{VARIATION_BASE}/clinvar/{action}/{text}", None
+
+
+def execute(payload: Any) -> dict[str, Any]:
+    """Run one ClinVar / NCBI Variation request and return the compact result envelope.
+
+    Invalid input, HTTP errors, non-JSON responses and file-write failures are all returned
+    as `ok=false` envelopes, so `main` can always print JSON.
+    """
+    if requests is None:
+        return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}")
+    try:
+        config = parse_input(payload)
+        url, params = _build_request(config)
+    except ValueError as exc:
+        return error("invalid_input", str(exc))
+    try:
+        response = requests.get(url, params=params, timeout=config["timeout_sec"])
+        response.raise_for_status()
+        data = response.json()
+        raw_output = json.dumps(data, indent=2)
+        raw_output_path = (
+            _save(raw_output, config["raw_output_path"]) if config["save_raw"] else None
+        )
+    except ValueError as exc:
+        # Only `response.json()` can raise ValueError here, so this is a server-side problem.
+        return error("invalid_response", f"Response was not valid JSON: {exc}")
+    except requests.RequestException as exc:
+        return error("network_error", f"Request failed: {exc}")
+    except OSError as exc:
+        # `requests.RequestException` derives from OSError, so this clause must come after it;
+        # what reaches here is a local failure such as an unwritable `raw_output_path`.
+        return error("io_error", f"I/O failure: {exc}")
+
+    if config["action"] == "search":
+        if not isinstance(data, list) or len(data) < 4:
+            return error(
+                "invalid_response",
+                "Clinical Tables response did not match the expected list shape.",
+            )
+        total = data[0]
+        identifiers = data[1] if isinstance(data[1], list) else []
+        extra_fields = data[2]
+        display_rows = data[3] if isinstance(data[3], list) else []
+        returned = display_rows[: config["max_items"]]
+        return {
+            "ok": True,
+            "source": "clinvar-clinicaltables",
+            "action": "search",
+            "total": total,
+            "record_count_returned": len(returned),
+            "record_count_available": len(display_rows),
+            # Truncated when rows were cut locally or the server reports more matches.
+            "truncated": len(returned) < len(display_rows)
+            or (isinstance(total, int) and len(returned) < total),
+            "identifiers": identifiers[: config["max_items"]],
+            "display_rows": _compact(returned, config["max_items"], config["max_depth"]),
+            "extra_fields": _compact(extra_fields, config["max_items"], config["max_depth"]),
+            "raw_output_path": raw_output_path,
+            "warnings": [],
+        }
+    return {
+        "ok": True,
+        "source": "clinvar-variation",
+        "action": config["action"],
+        "summary": _compact(data, config["max_items"], config["max_depth"]),
+        "top_keys": list(data)[: config["max_items"]] if isinstance(data, dict) else None,
+        "raw_output_path": raw_output_path,
+        "warnings": [],
+    }
+
+
+def main() -> int:
+    """Read one JSON object from stdin, print the result as JSON, and return the exit code.
+
+    Exit codes: 0 on success, 1 when `execute` reports a failure, 2 for unparseable stdin.
+    """
+    try:
+        payload = json.load(sys.stdin)
+    except Exception as exc:  # noqa: BLE001
+        sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}")))
+        return 2
+    output = execute(payload)
+    sys.stdout.write(json.dumps(output))
+    return 0 if output.get("ok") else 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/plugins/life-science-research/skills/efo-ontology-skill/SKILL.md b/plugins/life-science-research/skills/efo-ontology-skill/SKILL.md
new file mode 100644
index 0000000..bb0a151
--- /dev/null
+++ b/plugins/life-science-research/skills/efo-ontology-skill/SKILL.md
@@ -0,0 +1,39 @@
+---
+name: efo-ontology-skill
+description: Submit compact EFO OLS4 requests for search, term lookup, children, and descendants. Use when a user wants concise EFO resolution or ontology-expansion summaries +--- + +## Operating rules +- Use `scripts/rest_request.py` for all OLS4 and EFO API calls. +- Use `base_url=https://www.ebi.ac.uk/ols4/api`. +- Search, children, and descendant endpoints are better with `max_items=10`; single term lookups usually do not need `max_items`. +- Use the smallest ontology expansion that answers the question. +- Re-run requests in long conversations instead of relying on older tool output. +- Treat displayed `...` in tool previews as UI truncation, not literal request content. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Prefer these paths: `search`, `ontologies/efo/terms/<double-URL-encoded term IRI>`, and the corresponding `children` or `descendants` paths. +- If the user needs the full payload, set `save_raw=true` and report the saved file path. + +## Input +- Read one JSON object from stdin. +- Required fields: `base_url`, `path` +- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common OLS4 patterns: + - `{"base_url":"https://www.ebi.ac.uk/ols4/api","path":"search","params":{"q":"asthma","ontology":"efo"},"record_path":"response.docs","max_items":10}` + - `{"base_url":"https://www.ebi.ac.uk/ols4/api","path":"ontologies/efo/terms/http%253A%252F%252Fwww.ebi.ac.uk%252Fefo%252FEFO_0000270"}` + - `{"base_url":"https://www.ebi.ac.uk/ols4/api","path":"ontologies/efo/terms/http%253A%252F%252Fwww.ebi.ac.uk%252Fefo%252FEFO_0000270/descendants","record_path":"_embedded.terms","max_items":10}` + +## Output +- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records` or a compact `summary`. +- Use `raw_output_path` when `save_raw=true`. 
+- Failure returns `ok=false` with `error.code` and `error.message`.
+
+## Execution
+```bash
+echo '{"base_url":"https://www.ebi.ac.uk/ols4/api","path":"search","params":{"q":"asthma","ontology":"efo"},"record_path":"response.docs","max_items":10}' | python scripts/rest_request.py
+```
+
+## References
+- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`.
diff --git a/plugins/life-science-research/skills/efo-ontology-skill/agents/openai.yaml b/plugins/life-science-research/skills/efo-ontology-skill/agents/openai.yaml
new file mode 100644
index 0000000..7ad851b
--- /dev/null
+++ b/plugins/life-science-research/skills/efo-ontology-skill/agents/openai.yaml
@@ -0,0 +1,3 @@
+interface:
+  display_name: "EFO Ontology"
+  short_description: "Resolve EFO terms and ontology expansions"
diff --git a/plugins/life-science-research/skills/efo-ontology-skill/scripts/rest_request.py b/plugins/life-science-research/skills/efo-ontology-skill/scripts/rest_request.py
new file mode 100644
index 0000000..4de3cf1
--- /dev/null
+++ b/plugins/life-science-research/skills/efo-ontology-skill/scripts/rest_request.py
@@ -0,0 +1,348 @@
+#!/usr/bin/env python3
+"""Generic compact REST client for ChatGPT-imported skills."""
+
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+from typing import Any
+
+try:
+    import requests
+except ImportError as exc:  # pragma: no cover
+    requests = None
+    REQUESTS_IMPORT_ERROR = exc
+else:
+    REQUESTS_IMPORT_ERROR = None
+
+
+def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]:
+    """Build the `ok=false` envelope that is printed as JSON on failure."""
+    return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []}
+
+
+def _require_object(name: str, value: Any) -> dict[str, Any]:
+    """Return `value` if it is a dict, `{}` when it is None; else raise ValueError."""
+    if value is None:
+        return {}
+    if not isinstance(value, dict):
+        raise ValueError(f"`{name}` must be an object.")
+    return value
+
+
+def _require_bool(name: str, value: Any, default: bool) -> bool:
+    """Return `value` if it is a boolean, or `default` when it is None; else raise ValueError."""
+    if value is None:
+        return default
+    if not isinstance(value, bool):
+        raise ValueError(f"`{name}` must be a boolean.")
+    return value
+
+
+def _require_int(name: str, value: Any, default: int) -> int:
+    """Return `value` if it is a positive integer, or `default` when it is None.
+
+    Raises ValueError otherwise. Booleans are rejected explicitly: `bool` is a subclass of
+    `int`, so JSON `true` would otherwise be accepted as 1.
+    """
+    if value is None:
+        return default
+    if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
+        raise ValueError(f"`{name}` must be a positive integer.")
+    return value
+
+
+def _require_str(name: str, value: Any, required: bool = False) -> str | None:
+    """Return `value` stripped if it is a non-empty string; None when absent and not required.
+
+    Raises ValueError when a required value is missing or the value is not a non-empty string.
+    """
+    if value is None:
+        if required:
+            raise ValueError(f"`{name}` is required.")
+        return None
+    if not isinstance(value, str) or not value.strip():
+        raise ValueError(f"`{name}` must be a non-empty string.")
+    return value.strip()
+
+
+def _service_name(base_url: str) -> str:
+    """Derive a label from the host part of `base_url`, with dots replaced by dashes."""
+    host = base_url.split("://", 1)[-1].split("/", 1)[0]
+    return host.replace(".", "-")
+
+
+def _build_url(base_url: str, path: str) -> str:
+    """Join `base_url` and `path` with one slash; an absolute http(s) `path` is used as is."""
+    if path.startswith(("http://", "https://")):
+        return path
+    return base_url.rstrip("/") + "/" + path.lstrip("/")
+
+
+def _get_by_path(value: Any, path: str) -> Any:
+    """Walk `value` along the dot-separated `path` (dict keys and list indexes).
+
+    Raises ValueError when a segment cannot be resolved.
+    """
+    current = value
+    for part in path.split("."):
+        if isinstance(current, list):
+            if not part.isdigit():
+                raise ValueError(f"`record_path` segment {part!r} must be a list index.")
+            index = int(part)
+            if index >= len(current):
+                raise ValueError(f"`record_path` index {index} is out of range.")
+            current = current[index]
+        elif isinstance(current, dict):
+            if part not in current:
+                raise ValueError(f"`record_path` key {part!r} was not present in the response.")
+            current = current[part]
+        else:
+            raise ValueError(f"`record_path` segment {part!r} could not be applied.")
+    return current
+
+
+def _infer_target(data: Any) -> tuple[str | None, Any]:
+    """Guess which part of `data` holds the records; return `(path_label, target)`.
+
+    A top-level list is labelled "$"; otherwise the first list under `_embedded` or under a
+    well-known key is used. When nothing matches, `(None, data)` is returned.
+    """
+    if isinstance(data, list):
+        return "$", data
+    if isinstance(data, dict):
+        embedded = data.get("_embedded")
+        if isinstance(embedded, dict):
+            for key, value in embedded.items():
+                if isinstance(value, list):
+                    return f"_embedded.{key}", value
+        for key in (
+            "collection",
+            "results",
+            "structures",
+            "activities",
+            "molecules",
+            "mechanisms",
+            "records",
+            "items",
+        ):
+            value = data.get(key)
+            if isinstance(value, list):
+                return key, value
+    return None, data
+
+
+def _compact(value: Any, max_items: int, max_depth: int) -> Any:
+    """Return a size-bounded copy of `value` for display.
+
+    Strings longer than 240 characters are cut and suffixed with "...", lists and dicts keep
+    at most `max_items` entries, and containers nested deeper than `max_depth` become "...".
+    """
+    if isinstance(value, str):
+        return value if len(value) <= 240 else value[:240] + "..."
+    if max_depth <= 0:
+        if isinstance(value, (dict, list)):
+            return "..."
+        return value
+    if isinstance(value, list):
+        out = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]]
+        if len(value) > max_items:
+            out.append(f"... (+{len(value) - max_items} more)")
+        return out
+    if isinstance(value, dict):
+        out: dict[str, Any] = {}
+        items = list(value.items())
+        for key, item in items[:max_items]:
+            out[str(key)] = _compact(item, max_items, max_depth - 1)
+        if len(items) > max_items:
+            out["_truncated_keys"] = len(items) - max_items
+        return out
+    return value
+
+
+def _save_raw_output(
+    raw_output: str, raw_output_path: str | None, base_url: str, suffix: str
+) -> str:
+    """Write `raw_output` to `raw_output_path` (default under /tmp) and return the path used."""
+    path = Path(raw_output_path or f"/tmp/{_service_name(base_url)}-raw.{suffix}")
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(raw_output, encoding="utf-8")
+    return str(path)
+
+
+def parse_input(payload: Any) -> dict[str, Any]:
+    """Validate the stdin payload and return the normalized request config.
+
+    Raises ValueError with a user-facing message when a field is malformed.
+    """
+    if not isinstance(payload, dict):
+        raise ValueError("Input must be one JSON object.")
+    base_url = _require_str("base_url", payload.get("base_url"), required=True)
+    path = _require_str("path", payload.get("path"), required=True)
+    method = (_require_str("method", payload.get("method")) or "GET").upper()
+    if method not in {"GET", "POST"}:
+        raise ValueError("`method` must be GET or POST.")
+    json_body = payload.get("json_body")
+    form_body = payload.get("form_body")
+    if json_body is not None and form_body is not None:
+        raise ValueError("Provide only one of `json_body` or `form_body`.")
+    response_format = (
+        _require_str("response_format", payload.get("response_format")) or "auto"
+    ).lower()
+    if response_format not in {"auto", "json", "text"}:
+        raise ValueError("`response_format` must be auto, json, or text.")
+    return {
+        "base_url": base_url,
+        "path": path,
+        "method": method,
+        "params": _require_object("params", payload.get("params")),
+        "headers": _require_object("headers", payload.get("headers")),
+        "json_body": json_body,
+        "form_body": _require_object("form_body", form_body) if form_body is not None else None,
+        "record_path": _require_str("record_path", payload.get("record_path")),
+        "response_format": response_format,
+        "max_items": _require_int("max_items", payload.get("max_items"), 5),
+        "max_depth": _require_int("max_depth", payload.get("max_depth"), 3),
+        "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30),
+        "save_raw": _require_bool("save_raw", payload.get("save_raw"), False),
+        "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")),
+    }
+
+
+def execute(payload: Any) -> dict[str, Any]:
+    """Send one HTTP request and return the compact result envelope.
+
+    Raises ValueError for invalid input (from `parse_input`). HTTP errors, unparseable JSON,
+    unresolved `record_path` values and file-write failures are returned as `ok=false`.
+    """
+    if requests is None:
+        return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}")
+    config = parse_input(payload)
+    session = requests.Session()
+    session.headers.update(config["headers"])
+    url = _build_url(config["base_url"], config["path"])
+
+    request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]}
+    if config["json_body"] is not None:
+        request_kwargs["json"] = config["json_body"]
+    if config["form_body"] is not None:
+        request_kwargs["data"] = config["form_body"]
+
+    try:
+        response = session.request(config["method"], url, **request_kwargs)
+        response.raise_for_status()
+        content_type = (response.headers.get("content-type") or "").lower()
+        wants_json = config["response_format"] == "json"
+        wants_text = config["response_format"] == "text"
+        auto_json = not wants_text and (
+            "json" in content_type or response.text.lstrip().startswith(("{", "["))
+        )
+
+        data: Any = None
+        parsed_json = False
+        if wants_json or auto_json:
+            try:
+                data = response.json()
+                parsed_json = True
+            except ValueError:
+                # A body that merely starts with "{" or "[" was only guessed to be JSON, so
+                # fall through to the text branch; re-raise when JSON was requested or declared.
+                if wants_json or "json" in content_type:
+                    raise
+
+        if parsed_json:
+            raw_output = json.dumps(data, indent=2)
+            raw_output_path = None
+            if config["save_raw"]:
+                raw_output_path = _save_raw_output(
+                    raw_output, config["raw_output_path"], config["base_url"], "json"
+                )
+
+            record_path = config["record_path"]
+            path_used, target = (
+                _infer_target(data)
+                if record_path is None
+                else (record_path, _get_by_path(data, record_path))
+            )
+            out = {
+                "ok": True,
+                "source": _service_name(config["base_url"]),
+                "path": config["path"],
+                "method": config["method"],
+                "status_code": response.status_code,
+                "record_path": path_used,
+                "raw_output_path": raw_output_path,
+                "warnings": [],
+            }
+            if isinstance(target, list):
+                records = target[: config["max_items"]]
+                out.update(
+                    {
+                        "record_count_returned": len(records),
+                        "record_count_available": len(target),
+                        "truncated": len(records) < len(target),
+                        "records": _compact(records, config["max_items"], config["max_depth"]),
+                    }
+                )
+            else:
+                out["summary"] = _compact(target, config["max_items"], config["max_depth"])
+                if isinstance(target, dict):
+                    out["top_keys"] = list(target)[: config["max_items"]]
+            return out
+
+        raw_output_path = None
+        if config["save_raw"]:
+            raw_output_path = _save_raw_output(
+                response.text, config["raw_output_path"], config["base_url"], "txt"
+            )
+        text_head = response.text[:800]
+        return {
+            "ok": True,
+            "source": _service_name(config["base_url"]),
+            "path": config["path"],
+            "method": config["method"],
+            "status_code": response.status_code,
+            "content_type": content_type,
+            "text_head": None if raw_output_path else text_head,
+            "text_head_truncated": False
+            if raw_output_path
+            else len(text_head) < len(response.text),
+            "raw_output_path": raw_output_path,
+            "warnings": [],
+        }
+    except ValueError as exc:
+        return error("invalid_response", str(exc))
+    except requests.RequestException as exc:
+        return error("network_error", f"Request failed: {exc}")
+    except OSError as exc:
+        # `requests.RequestException` derives from OSError, so this clause must come after it;
+        # what reaches here is a local failure such as an unwritable `raw_output_path`.
+        return error("io_error", f"I/O failure: {exc}")
+    finally:
+        session.close()
+
+
+def main() -> int:
+    """Read one JSON object from stdin, print the result as JSON, and return the exit code.
+
+    Exit codes: 0 on success, 1 when the request failed, 2 for unparseable or invalid input.
+    """
+    try:
+        payload = json.load(sys.stdin)
+    except Exception as exc:  # noqa: BLE001
+        sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}")))
+        return 2
+    try:
+        output = execute(payload)
+    except ValueError as exc:
+        output = error("invalid_input", str(exc))
+        code = 2
+    else:
+        code = 0 if output.get("ok") else 1
+    sys.stdout.write(json.dumps(output))
+    return code
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/plugins/life-science-research/skills/encode-skill/SKILL.md b/plugins/life-science-research/skills/encode-skill/SKILL.md
new file mode 100644
index 0000000..b085fcd
--- /dev/null
+++ b/plugins/life-science-research/skills/encode-skill/SKILL.md
@@ -0,0 +1,39 @@
+---
+name: encode-skill
+description: Submit compact ENCODE REST API requests for object lookups, portal-style search, and metadata retrieval. Use when a user wants concise ENCODE summaries
+---
+
+## Operating rules
+- Use `scripts/rest_request.py` for all ENCODE API calls.
+- Use `base_url=https://www.encodeproject.org`.
+- Object lookups usually do not need `max_items`; portal-style search endpoints are better with `limit=10` and `max_items=10`.
+- Send `Accept: application/json` in `headers` and add `format=json` in `params` when needed.
+- Keep request volume modest and avoid large unfiltered searches.
+- Re-run requests in long conversations instead of relying on older tool output.
+- Treat displayed `...` in tool previews as UI truncation, not literal request content.
+
+## Execution behavior
+- Return concise markdown summaries from the script JSON by default.
+- Prefer accession paths such as `biosamples/<accession>/` and search paths such as `search/`.
+- If the user needs the full payload, set `save_raw=true` and report the saved file path.
+
+## Input
+- Read one JSON object from stdin.
+- Required fields: `base_url`, `path` +- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common ENCODE patterns: + - `{"base_url":"https://www.encodeproject.org","path":"biosamples/ENCBS000AAA/","params":{"frame":"object","format":"json"},"headers":{"Accept":"application/json"}}` + - `{"base_url":"https://www.encodeproject.org","path":"search/","params":{"type":"Experiment","assay_term_name":"RNA-seq","limit":10,"format":"json"},"record_path":"@graph","headers":{"Accept":"application/json"},"max_items":10}` + +## Output +- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records` or a compact `summary`. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"base_url":"https://www.encodeproject.org","path":"search/","params":{"type":"Experiment","assay_term_name":"RNA-seq","limit":10,"format":"json"},"record_path":"@graph","headers":{"Accept":"application/json"},"max_items":10}' | python scripts/rest_request.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. 
diff --git a/plugins/life-science-research/skills/encode-skill/agents/openai.yaml b/plugins/life-science-research/skills/encode-skill/agents/openai.yaml
new file mode 100644
index 0000000..93044b0
--- /dev/null
+++ b/plugins/life-science-research/skills/encode-skill/agents/openai.yaml
@@ -0,0 +1,3 @@
+interface:
+  display_name: "ENCODE"
+  short_description: "Fetch ENCODE dataset summaries"
diff --git a/plugins/life-science-research/skills/encode-skill/scripts/rest_request.py b/plugins/life-science-research/skills/encode-skill/scripts/rest_request.py
new file mode 100644
index 0000000..4de3cf1
--- /dev/null
+++ b/plugins/life-science-research/skills/encode-skill/scripts/rest_request.py
@@ -0,0 +1,359 @@
+#!/usr/bin/env python3
+"""Generic compact REST client for ChatGPT-imported skills.
+
+Reads one JSON request object from stdin, performs a single GET or POST with
+``requests``, and writes one compact JSON result object to stdout.
+"""
+
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+from typing import Any
+
+try:
+    import requests
+except ImportError as exc:  # pragma: no cover
+    requests = None
+    REQUESTS_IMPORT_ERROR = exc
+else:
+    REQUESTS_IMPORT_ERROR = None
+
+
+def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]:
+    """Return the failure envelope: ``ok=False`` plus an error code and message."""
+    return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []}
+
+
+def _require_object(name: str, value: Any) -> dict[str, Any]:
+    """Return ``value`` if it is a dict and ``{}`` if it is None; else raise ValueError."""
+    if value is None:
+        return {}
+    if not isinstance(value, dict):
+        raise ValueError(f"`{name}` must be an object.")
+    return value
+
+
+def _require_bool(name: str, value: Any, default: bool) -> bool:
+    """Return ``value`` if it is a bool and ``default`` if it is None; else raise ValueError."""
+    if value is None:
+        return default
+    if not isinstance(value, bool):
+        raise ValueError(f"`{name}` must be a boolean.")
+    return value
+
+
+def _require_int(name: str, value: Any, default: int) -> int:
+    """Return ``value`` if it is a positive int and ``default`` if it is None.
+
+    Raises ValueError for anything else, including booleans.
+    """
+    if value is None:
+        return default
+    # bool is a subclass of int, so JSON `true` would otherwise pass as 1.
+    if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
+        raise ValueError(f"`{name}` must be a positive integer.")
+    return value
+
+
+def _require_str(name: str, value: Any, required: bool = False) -> str | None:
+    """Return the stripped string, or None when the value is absent and optional.
+
+    Raises ValueError when a required value is missing or when the value is not
+    a non-empty string.
+    """
+    if value is None:
+        if required:
+            raise ValueError(f"`{name}` is required.")
+        return None
+    if not isinstance(value, str) or not value.strip():
+        raise ValueError(f"`{name}` must be a non-empty string.")
+    return value.strip()
+
+
+def _service_name(base_url: str) -> str:
+    """Return the host part of ``base_url`` with dots replaced by dashes."""
+    host = base_url.split("://", 1)[-1].split("/", 1)[0]
+    return host.replace(".", "-")
+
+
+def _build_url(base_url: str, path: str) -> str:
+    """Join ``base_url`` and ``path``; an absolute http(s) ``path`` is returned as is."""
+    if path.startswith(("http://", "https://")):
+        return path
+    return base_url.rstrip("/") + "/" + path.lstrip("/")
+
+
+def _get_by_path(value: Any, path: str) -> Any:
+    """Walk a dotted ``record_path`` through nested dicts and lists.
+
+    Raises ValueError when a segment cannot be applied to the current value.
+    """
+    current = value
+    for part in path.split("."):
+        if isinstance(current, list):
+            # isdecimal() admits only digits that int() can parse.
+            if not part.isdecimal():
+                raise ValueError(f"`record_path` segment {part!r} must be a list index.")
+            index = int(part)
+            if index >= len(current):
+                raise ValueError(f"`record_path` index {index} is out of range.")
+            current = current[index]
+        elif isinstance(current, dict):
+            if part not in current:
+                raise ValueError(f"`record_path` key {part!r} was not present in the response.")
+            current = current[part]
+        else:
+            raise ValueError(f"`record_path` segment {part!r} could not be applied.")
+    return current
+
+
+def _infer_target(data: Any) -> tuple[str | None, Any]:
+    """Guess which part of a JSON payload holds the records.
+
+    Returns ``(path, target)``: ``"$"`` for a top-level list, otherwise the first
+    list found under ``_embedded`` or under a well-known key, else ``(None, data)``.
+    """
+    if isinstance(data, list):
+        return "$", data
+    if isinstance(data, dict):
+        embedded = data.get("_embedded")
+        if isinstance(embedded, dict):
+            for key, value in embedded.items():
+                if isinstance(value, list):
+                    return f"_embedded.{key}", value
+        for key in (
+            "collection",
+            "results",
+            "structures",
+            "activities",
+            "molecules",
+            "mechanisms",
+            "records",
+            "items",
+        ):
+            value = data.get(key)
+            if isinstance(value, list):
+                return key, value
+    return None, data
+
+
+def _compact(value: Any, max_items: int, max_depth: int) -> Any:
+    """Return a size-bounded copy of ``value`` for display.
+
+    Strings are cut to 240 characters, lists and dicts to ``max_items`` entries,
+    and containers nested deeper than ``max_depth`` are replaced by ``"..."``.
+    """
+    if isinstance(value, str):
+        return value if len(value) <= 240 else value[:240] + "..."
+    if max_depth <= 0:
+        if isinstance(value, (dict, list)):
+            return "..."
+        return value
+    if isinstance(value, list):
+        compact_list = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]]
+        if len(value) > max_items:
+            compact_list.append(f"... (+{len(value) - max_items} more)")
+        return compact_list
+    if isinstance(value, dict):
+        compact_dict: dict[str, Any] = {}
+        items = list(value.items())
+        for key, item in items[:max_items]:
+            compact_dict[str(key)] = _compact(item, max_items, max_depth - 1)
+        if len(items) > max_items:
+            compact_dict["_truncated_keys"] = len(items) - max_items
+        return compact_dict
+    return value
+
+
+def _save_raw_output(
+    raw_output: str, raw_output_path: str | None, base_url: str, suffix: str
+) -> str:
+    """Write ``raw_output`` to ``raw_output_path`` and return the path as a string.
+
+    Without an explicit path the file is ``/tmp/<service>-raw.<suffix>``; missing
+    parent directories are created.
+    """
+    path = Path(raw_output_path or f"/tmp/{_service_name(base_url)}-raw.{suffix}")
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(raw_output, encoding="utf-8")
+    return str(path)
+
+
+def parse_input(payload: Any) -> dict[str, Any]:
+    """Validate the request payload and return a normalized config dict.
+
+    Applies the defaults (GET, ``auto`` response format, 5 items, depth 3,
+    30 second timeout, no raw output) and raises ValueError on invalid input.
+    """
+    if not isinstance(payload, dict):
+        raise ValueError("Input must be one JSON object.")
+    base_url = _require_str("base_url", payload.get("base_url"), required=True)
+    path = _require_str("path", payload.get("path"), required=True)
+    method = (_require_str("method", payload.get("method")) or "GET").upper()
+    if method not in {"GET", "POST"}:
+        raise ValueError("`method` must be GET or POST.")
+    json_body = payload.get("json_body")
+    form_body = payload.get("form_body")
+    if json_body is not None and form_body is not None:
+        raise ValueError("Provide only one of `json_body` or `form_body`.")
+    response_format = (
+        _require_str("response_format", payload.get("response_format")) or "auto"
+    ).lower()
+    if response_format not in {"auto", "json", "text"}:
+        raise ValueError("`response_format` must be auto, json, or text.")
+    return {
+        "base_url": base_url,
+        "path": path,
+        "method": method,
+        "params": _require_object("params", payload.get("params")),
+        "headers": _require_object("headers", payload.get("headers")),
+        "json_body": json_body,
+        "form_body": _require_object("form_body", form_body) if form_body is not None else None,
+        "record_path": _require_str("record_path", payload.get("record_path")),
+        "response_format": response_format,
+        "max_items": _require_int("max_items", payload.get("max_items"), 5),
+        "max_depth": _require_int("max_depth", payload.get("max_depth"), 3),
+        "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30),
+        "save_raw": _require_bool("save_raw", payload.get("save_raw"), False),
+        "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")),
+    }
+
+
+def _json_result(config: dict[str, Any], response: Any, data: Any) -> dict[str, Any]:
+    """Build the success envelope for a decoded JSON body (``records`` or ``summary``)."""
+    raw_output_path = None
+    if config["save_raw"]:
+        raw_output_path = _save_raw_output(
+            json.dumps(data, indent=2), config["raw_output_path"], config["base_url"], "json"
+        )
+
+    record_path = config["record_path"]
+    path_used, target = (
+        _infer_target(data)
+        if record_path is None
+        else (record_path, _get_by_path(data, record_path))
+    )
+    out = {
+        "ok": True,
+        "source": _service_name(config["base_url"]),
+        "path": config["path"],
+        "method": config["method"],
+        "status_code": response.status_code,
+        "record_path": path_used,
+        "raw_output_path": raw_output_path,
+        "warnings": [],
+    }
+    if isinstance(target, list):
+        records = target[: config["max_items"]]
+        out.update(
+            {
+                "record_count_returned": len(records),
+                "record_count_available": len(target),
+                "truncated": len(records) < len(target),
+                "records": _compact(records, config["max_items"], config["max_depth"]),
+            }
+        )
+    else:
+        out["summary"] = _compact(target, config["max_items"], config["max_depth"])
+        if isinstance(target, dict):
+            out["top_keys"] = list(target)[: config["max_items"]]
+    return out
+
+
+def _text_result(config: dict[str, Any], response: Any, content_type: str) -> dict[str, Any]:
+    """Build the success envelope for a body handled as text."""
+    raw_output_path = None
+    if config["save_raw"]:
+        raw_output_path = _save_raw_output(
+            response.text, config["raw_output_path"], config["base_url"], "txt"
+        )
+    text_head = response.text[:800]
+    return {
+        "ok": True,
+        "source": _service_name(config["base_url"]),
+        "path": config["path"],
+        "method": config["method"],
+        "status_code": response.status_code,
+        "content_type": content_type,
+        # The inline preview is omitted once the full body has been saved to disk.
+        "text_head": None if raw_output_path else text_head,
+        "text_head_truncated": False
+        if raw_output_path
+        else len(text_head) < len(response.text),
+        "raw_output_path": raw_output_path,
+        "warnings": [],
+    }
+
+
+def execute(payload: Any) -> dict[str, Any]:
+    """Validate ``payload``, send the request, and return a compact result envelope.
+
+    Raises ValueError for invalid input (from ``parse_input``). Transport and
+    response problems are returned as ``error(...)`` envelopes instead.
+    """
+    if requests is None:
+        return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}")
+    config = parse_input(payload)
+    url = _build_url(config["base_url"], config["path"])
+    response_format = config["response_format"]
+
+    request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]}
+    if config["json_body"] is not None:
+        request_kwargs["json"] = config["json_body"]
+    if config["form_body"] is not None:
+        request_kwargs["data"] = config["form_body"]
+
+    # The context manager closes the session on every exit path.
+    with requests.Session() as session:
+        session.headers.update(config["headers"])
+        try:
+            response = session.request(config["method"], url, **request_kwargs)
+            response.raise_for_status()
+            content_type = (response.headers.get("content-type") or "").lower()
+            declared_json = "json" in content_type
+            looks_like_json = response.text.lstrip().startswith(("{", "["))
+            if response_format == "json" or (
+                response_format == "auto" and (declared_json or looks_like_json)
+            ):
+                try:
+                    data = response.json()
+                except ValueError:
+                    # A body that merely starts with "{" or "[" is only a guess; fall
+                    # back to text unless JSON was requested or declared by the server.
+                    if response_format == "json" or declared_json:
+                        raise
+                else:
+                    return _json_result(config, response, data)
+            return _text_result(config, response, content_type)
+        except ValueError as exc:
+            return error("invalid_response", str(exc))
+        except requests.RequestException as exc:
+            return error("network_error", f"Request failed: {exc}")
+
+
+def main() -> int:
+    """Read one JSON payload from stdin and write the JSON result to stdout.
+
+    Returns the exit code: 0 on success, 1 when ``execute`` reports a failure,
+    and 2 when the input is not valid JSON or fails validation.
+    """
+    try:
+        payload = json.load(sys.stdin)
+    except Exception as exc:  # noqa: BLE001
+        sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}")))
+        return 2
+    try:
+        output = execute(payload)
+    except ValueError as exc:
+        output = error("invalid_input", str(exc))
+        code = 2
+    else:
+        code = 0 if output.get("ok") else 1
+    sys.stdout.write(json.dumps(output))
+    return code
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/plugins/life-science-research/skills/ensembl-skill/SKILL.md b/plugins/life-science-research/skills/ensembl-skill/SKILL.md new file mode 100644 index 0000000..653dc43 --- /dev/null +++ b/plugins/life-science-research/skills/ensembl-skill/SKILL.md @@ -0,0 +1,39 @@ +--- +name: ensembl-skill +description: Submit compact Ensembl REST API requests for lookup, overlap, cross-reference, and variation endpoints. Use when a user wants concise Ensembl summaries. +--- + +## Operating rules +- Use `scripts/rest_request.py` for all Ensembl API calls. +- Use `base_url=https://rest.ensembl.org`. +- The script accepts `max_items`; object lookups usually do not need it, but `overlap` and `xrefs` are better with `max_items=10`. +- Send JSON-friendly headers such as `Accept: application/json` and `Content-Type: application/json`. +- Re-run requests in long conversations instead of relying on older tool output. +- Treat displayed `...` in tool previews as UI truncation, not part of the true request. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Return the script JSON verbatim only if the user explicitly asks for machine-readable output. +- Prefer these paths: `lookup/id/<id>`, `overlap/region/<species>/<region>`, `xrefs/id/<id>`, and `variation/<species>/<id>`. +- Use `save_raw=true` when the user needs the full payload. + +## Input +- Read one JSON object from stdin. 
+- Required fields: `base_url`, `path` +- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common Ensembl patterns: + - `{"base_url":"https://rest.ensembl.org","path":"lookup/id/ENSG00000141510","headers":{"Accept":"application/json","Content-Type":"application/json"}}` + - `{"base_url":"https://rest.ensembl.org","path":"overlap/region/homo_sapiens/1:1000000-1002000","params":{"feature":"gene"},"headers":{"Accept":"application/json","Content-Type":"application/json"},"max_items":10}` + +## Output +- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records` or a compact `summary`. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"base_url":"https://rest.ensembl.org","path":"lookup/id/ENSG00000141510","headers":{"Accept":"application/json","Content-Type":"application/json"}}' | python scripts/rest_request.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. 
diff --git a/plugins/life-science-research/skills/ensembl-skill/agents/openai.yaml b/plugins/life-science-research/skills/ensembl-skill/agents/openai.yaml
new file mode 100644
index 0000000..5a9cf25
--- /dev/null
+++ b/plugins/life-science-research/skills/ensembl-skill/agents/openai.yaml
@@ -0,0 +1,3 @@
+interface:
+  display_name: "Ensembl"
+  short_description: "Fetch Ensembl gene and variation summaries"
diff --git a/plugins/life-science-research/skills/ensembl-skill/scripts/rest_request.py b/plugins/life-science-research/skills/ensembl-skill/scripts/rest_request.py
new file mode 100644
index 0000000..4de3cf1
--- /dev/null
+++ b/plugins/life-science-research/skills/ensembl-skill/scripts/rest_request.py
@@ -0,0 +1,359 @@
+#!/usr/bin/env python3
+"""Generic compact REST client for ChatGPT-imported skills.
+
+Reads one JSON request object from stdin, performs a single GET or POST with
+``requests``, and writes one compact JSON result object to stdout.
+"""
+
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+from typing import Any
+
+try:
+    import requests
+except ImportError as exc:  # pragma: no cover
+    requests = None
+    REQUESTS_IMPORT_ERROR = exc
+else:
+    REQUESTS_IMPORT_ERROR = None
+
+
+def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]:
+    """Return the failure envelope: ``ok=False`` plus an error code and message."""
+    return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []}
+
+
+def _require_object(name: str, value: Any) -> dict[str, Any]:
+    """Return ``value`` if it is a dict and ``{}`` if it is None; else raise ValueError."""
+    if value is None:
+        return {}
+    if not isinstance(value, dict):
+        raise ValueError(f"`{name}` must be an object.")
+    return value
+
+
+def _require_bool(name: str, value: Any, default: bool) -> bool:
+    """Return ``value`` if it is a bool and ``default`` if it is None; else raise ValueError."""
+    if value is None:
+        return default
+    if not isinstance(value, bool):
+        raise ValueError(f"`{name}` must be a boolean.")
+    return value
+
+
+def _require_int(name: str, value: Any, default: int) -> int:
+    """Return ``value`` if it is a positive int and ``default`` if it is None.
+
+    Raises ValueError for anything else, including booleans.
+    """
+    if value is None:
+        return default
+    # bool is a subclass of int, so JSON `true` would otherwise pass as 1.
+    if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
+        raise ValueError(f"`{name}` must be a positive integer.")
+    return value
+
+
+def _require_str(name: str, value: Any, required: bool = False) -> str | None:
+    """Return the stripped string, or None when the value is absent and optional.
+
+    Raises ValueError when a required value is missing or when the value is not
+    a non-empty string.
+    """
+    if value is None:
+        if required:
+            raise ValueError(f"`{name}` is required.")
+        return None
+    if not isinstance(value, str) or not value.strip():
+        raise ValueError(f"`{name}` must be a non-empty string.")
+    return value.strip()
+
+
+def _service_name(base_url: str) -> str:
+    """Return the host part of ``base_url`` with dots replaced by dashes."""
+    host = base_url.split("://", 1)[-1].split("/", 1)[0]
+    return host.replace(".", "-")
+
+
+def _build_url(base_url: str, path: str) -> str:
+    """Join ``base_url`` and ``path``; an absolute http(s) ``path`` is returned as is."""
+    if path.startswith(("http://", "https://")):
+        return path
+    return base_url.rstrip("/") + "/" + path.lstrip("/")
+
+
+def _get_by_path(value: Any, path: str) -> Any:
+    """Walk a dotted ``record_path`` through nested dicts and lists.
+
+    Raises ValueError when a segment cannot be applied to the current value.
+    """
+    current = value
+    for part in path.split("."):
+        if isinstance(current, list):
+            # isdecimal() admits only digits that int() can parse.
+            if not part.isdecimal():
+                raise ValueError(f"`record_path` segment {part!r} must be a list index.")
+            index = int(part)
+            if index >= len(current):
+                raise ValueError(f"`record_path` index {index} is out of range.")
+            current = current[index]
+        elif isinstance(current, dict):
+            if part not in current:
+                raise ValueError(f"`record_path` key {part!r} was not present in the response.")
+            current = current[part]
+        else:
+            raise ValueError(f"`record_path` segment {part!r} could not be applied.")
+    return current
+
+
+def _infer_target(data: Any) -> tuple[str | None, Any]:
+    """Guess which part of a JSON payload holds the records.
+
+    Returns ``(path, target)``: ``"$"`` for a top-level list, otherwise the first
+    list found under ``_embedded`` or under a well-known key, else ``(None, data)``.
+    """
+    if isinstance(data, list):
+        return "$", data
+    if isinstance(data, dict):
+        embedded = data.get("_embedded")
+        if isinstance(embedded, dict):
+            for key, value in embedded.items():
+                if isinstance(value, list):
+                    return f"_embedded.{key}", value
+        for key in (
+            "collection",
+            "results",
+            "structures",
+            "activities",
+            "molecules",
+            "mechanisms",
+            "records",
+            "items",
+        ):
+            value = data.get(key)
+            if isinstance(value, list):
+                return key, value
+    return None, data
+
+
+def _compact(value: Any, max_items: int, max_depth: int) -> Any:
+    """Return a size-bounded copy of ``value`` for display.
+
+    Strings are cut to 240 characters, lists and dicts to ``max_items`` entries,
+    and containers nested deeper than ``max_depth`` are replaced by ``"..."``.
+    """
+    if isinstance(value, str):
+        return value if len(value) <= 240 else value[:240] + "..."
+    if max_depth <= 0:
+        if isinstance(value, (dict, list)):
+            return "..."
+        return value
+    if isinstance(value, list):
+        compact_list = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]]
+        if len(value) > max_items:
+            compact_list.append(f"... (+{len(value) - max_items} more)")
+        return compact_list
+    if isinstance(value, dict):
+        compact_dict: dict[str, Any] = {}
+        items = list(value.items())
+        for key, item in items[:max_items]:
+            compact_dict[str(key)] = _compact(item, max_items, max_depth - 1)
+        if len(items) > max_items:
+            compact_dict["_truncated_keys"] = len(items) - max_items
+        return compact_dict
+    return value
+
+
+def _save_raw_output(
+    raw_output: str, raw_output_path: str | None, base_url: str, suffix: str
+) -> str:
+    """Write ``raw_output`` to ``raw_output_path`` and return the path as a string.
+
+    Without an explicit path the file is ``/tmp/<service>-raw.<suffix>``; missing
+    parent directories are created.
+    """
+    path = Path(raw_output_path or f"/tmp/{_service_name(base_url)}-raw.{suffix}")
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(raw_output, encoding="utf-8")
+    return str(path)
+
+
+def parse_input(payload: Any) -> dict[str, Any]:
+    """Validate the request payload and return a normalized config dict.
+
+    Applies the defaults (GET, ``auto`` response format, 5 items, depth 3,
+    30 second timeout, no raw output) and raises ValueError on invalid input.
+    """
+    if not isinstance(payload, dict):
+        raise ValueError("Input must be one JSON object.")
+    base_url = _require_str("base_url", payload.get("base_url"), required=True)
+    path = _require_str("path", payload.get("path"), required=True)
+    method = (_require_str("method", payload.get("method")) or "GET").upper()
+    if method not in {"GET", "POST"}:
+        raise ValueError("`method` must be GET or POST.")
+    json_body = payload.get("json_body")
+    form_body = payload.get("form_body")
+    if json_body is not None and form_body is not None:
+        raise ValueError("Provide only one of `json_body` or `form_body`.")
+    response_format = (
+        _require_str("response_format", payload.get("response_format")) or "auto"
+    ).lower()
+    if response_format not in {"auto", "json", "text"}:
+        raise ValueError("`response_format` must be auto, json, or text.")
+    return {
+        "base_url": base_url,
+        "path": path,
+        "method": method,
+        "params": _require_object("params", payload.get("params")),
+        "headers": _require_object("headers", payload.get("headers")),
+        "json_body": json_body,
+        "form_body": _require_object("form_body", form_body) if form_body is not None else None,
+        "record_path": _require_str("record_path", payload.get("record_path")),
+        "response_format": response_format,
+        "max_items": _require_int("max_items", payload.get("max_items"), 5),
+        "max_depth": _require_int("max_depth", payload.get("max_depth"), 3),
+        "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30),
+        "save_raw": _require_bool("save_raw", payload.get("save_raw"), False),
+        "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")),
+    }
+
+
+def _json_result(config: dict[str, Any], response: Any, data: Any) -> dict[str, Any]:
+    """Build the success envelope for a decoded JSON body (``records`` or ``summary``)."""
+    raw_output_path = None
+    if config["save_raw"]:
+        raw_output_path = _save_raw_output(
+            json.dumps(data, indent=2), config["raw_output_path"], config["base_url"], "json"
+        )
+
+    record_path = config["record_path"]
+    path_used, target = (
+        _infer_target(data)
+        if record_path is None
+        else (record_path, _get_by_path(data, record_path))
+    )
+    out = {
+        "ok": True,
+        "source": _service_name(config["base_url"]),
+        "path": config["path"],
+        "method": config["method"],
+        "status_code": response.status_code,
+        "record_path": path_used,
+        "raw_output_path": raw_output_path,
+        "warnings": [],
+    }
+    if isinstance(target, list):
+        records = target[: config["max_items"]]
+        out.update(
+            {
+                "record_count_returned": len(records),
+                "record_count_available": len(target),
+                "truncated": len(records) < len(target),
+                "records": _compact(records, config["max_items"], config["max_depth"]),
+            }
+        )
+    else:
+        out["summary"] = _compact(target, config["max_items"], config["max_depth"])
+        if isinstance(target, dict):
+            out["top_keys"] = list(target)[: config["max_items"]]
+    return out
+
+
+def _text_result(config: dict[str, Any], response: Any, content_type: str) -> dict[str, Any]:
+    """Build the success envelope for a body handled as text."""
+    raw_output_path = None
+    if config["save_raw"]:
+        raw_output_path = _save_raw_output(
+            response.text, config["raw_output_path"], config["base_url"], "txt"
+        )
+    text_head = response.text[:800]
+    return {
+        "ok": True,
+        "source": _service_name(config["base_url"]),
+        "path": config["path"],
+        "method": config["method"],
+        "status_code": response.status_code,
+        "content_type": content_type,
+        # The inline preview is omitted once the full body has been saved to disk.
+        "text_head": None if raw_output_path else text_head,
+        "text_head_truncated": False
+        if raw_output_path
+        else len(text_head) < len(response.text),
+        "raw_output_path": raw_output_path,
+        "warnings": [],
+    }
+
+
+def execute(payload: Any) -> dict[str, Any]:
+    """Validate ``payload``, send the request, and return a compact result envelope.
+
+    Raises ValueError for invalid input (from ``parse_input``). Transport and
+    response problems are returned as ``error(...)`` envelopes instead.
+    """
+    if requests is None:
+        return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}")
+    config = parse_input(payload)
+    url = _build_url(config["base_url"], config["path"])
+    response_format = config["response_format"]
+
+    request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]}
+    if config["json_body"] is not None:
+        request_kwargs["json"] = config["json_body"]
+    if config["form_body"] is not None:
+        request_kwargs["data"] = config["form_body"]
+
+    # The context manager closes the session on every exit path.
+    with requests.Session() as session:
+        session.headers.update(config["headers"])
+        try:
+            response = session.request(config["method"], url, **request_kwargs)
+            response.raise_for_status()
+            content_type = (response.headers.get("content-type") or "").lower()
+            declared_json = "json" in content_type
+            looks_like_json = response.text.lstrip().startswith(("{", "["))
+            if response_format == "json" or (
+                response_format == "auto" and (declared_json or looks_like_json)
+            ):
+                try:
+                    data = response.json()
+                except ValueError:
+                    # A body that merely starts with "{" or "[" is only a guess; fall
+                    # back to text unless JSON was requested or declared by the server.
+                    if response_format == "json" or declared_json:
+                        raise
+                else:
+                    return _json_result(config, response, data)
+            return _text_result(config, response, content_type)
+        except ValueError as exc:
+            return error("invalid_response", str(exc))
+        except requests.RequestException as exc:
+            return error("network_error", f"Request failed: {exc}")
+
+
+def main() -> int:
+    """Read one JSON payload from stdin and write the JSON result to stdout.
+
+    Returns the exit code: 0 on success, 1 when ``execute`` reports a failure,
+    and 2 when the input is not valid JSON or fails validation.
+    """
+    try:
+        payload = json.load(sys.stdin)
+    except Exception as exc:  # noqa: BLE001
+        sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}")))
+        return 2
+    try:
+        output = execute(payload)
+    except ValueError as exc:
+        output = error("invalid_input", str(exc))
+        code = 2
+    else:
+        code = 0 if output.get("ok") else 1
+    sys.stdout.write(json.dumps(output))
+    return code
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/plugins/life-science-research/skills/epigraphdb-skill/SKILL.md b/plugins/life-science-research/skills/epigraphdb-skill/SKILL.md new file mode 100644 index 0000000..ca58a98 --- /dev/null +++ b/plugins/life-science-research/skills/epigraphdb-skill/SKILL.md @@ -0,0 +1,39 @@ +--- +name: epigraphdb-skill +description: Submit compact EpiGraphDB API requests for ontology, literature, MR, gene-drug, and support-path evidence. Use when a user wants concise EpiGraphDB summaries. +--- + +## Operating rules +- Use `scripts/rest_request.py` for all EpiGraphDB API calls. +- Use `base_url=https://api.epigraphdb.org`. +- Start with `max_items=10` for list-style endpoints; use smaller caps for literature-heavy or pairwise endpoints if the response fans out quickly. +- Prefer the connectivity guard endpoints first when endpoint availability matters: `ping`, `builds`, and `meta/api-endpoints`. +- Re-run requests in long conversations instead of relying on older tool output. +- Treat displayed `...` in tool previews as UI truncation, not literal request content. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Prefer targeted paths such as `ontology/gwas-efo`, `gene/drugs`, `gene/druggability/ppi`, `mr`, and `literature/gwas`. +- If the user needs the full payload, set `save_raw=true` and report the saved file path. + +## Input +- Read one JSON object from stdin. 
+- Required fields: `base_url`, `path` +- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common EpiGraphDB patterns: + - `{"base_url":"https://api.epigraphdb.org","path":"ping"}` + - `{"base_url":"https://api.epigraphdb.org","path":"ontology/gwas-efo","params":{"trait":"asthma","score_threshold":0.8,"fuzzy":true},"max_items":10}` + - `{"base_url":"https://api.epigraphdb.org","path":"gene/drugs","params":{"gene_name":"IL6R"},"max_items":10}` + +## Output +- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records` or a compact `summary`. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"base_url":"https://api.epigraphdb.org","path":"ontology/gwas-efo","params":{"trait":"asthma","score_threshold":0.8,"fuzzy":true},"max_items":10}' | python scripts/rest_request.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. 
diff --git a/plugins/life-science-research/skills/epigraphdb-skill/agents/openai.yaml b/plugins/life-science-research/skills/epigraphdb-skill/agents/openai.yaml
new file mode 100644
index 0000000..7df58b6
--- /dev/null
+++ b/plugins/life-science-research/skills/epigraphdb-skill/agents/openai.yaml
@@ -0,0 +1,3 @@
+interface:
+  display_name: "EpiGraphDB"
+  short_description: "Fetch EpiGraphDB evidence summaries"
diff --git a/plugins/life-science-research/skills/epigraphdb-skill/scripts/rest_request.py b/plugins/life-science-research/skills/epigraphdb-skill/scripts/rest_request.py
new file mode 100644
index 0000000..4de3cf1
--- /dev/null
+++ b/plugins/life-science-research/skills/epigraphdb-skill/scripts/rest_request.py
@@ -0,0 +1,359 @@
+#!/usr/bin/env python3
+"""Generic compact REST client for ChatGPT-imported skills.
+
+Reads one JSON request object from stdin, performs a single GET or POST with
+``requests``, and writes one compact JSON result object to stdout.
+"""
+
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+from typing import Any
+
+try:
+    import requests
+except ImportError as exc:  # pragma: no cover
+    requests = None
+    REQUESTS_IMPORT_ERROR = exc
+else:
+    REQUESTS_IMPORT_ERROR = None
+
+
+def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]:
+    """Return the failure envelope: ``ok=False`` plus an error code and message."""
+    return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []}
+
+
+def _require_object(name: str, value: Any) -> dict[str, Any]:
+    """Return ``value`` if it is a dict and ``{}`` if it is None; else raise ValueError."""
+    if value is None:
+        return {}
+    if not isinstance(value, dict):
+        raise ValueError(f"`{name}` must be an object.")
+    return value
+
+
+def _require_bool(name: str, value: Any, default: bool) -> bool:
+    """Return ``value`` if it is a bool and ``default`` if it is None; else raise ValueError."""
+    if value is None:
+        return default
+    if not isinstance(value, bool):
+        raise ValueError(f"`{name}` must be a boolean.")
+    return value
+
+
+def _require_int(name: str, value: Any, default: int) -> int:
+    """Return ``value`` if it is a positive int and ``default`` if it is None.
+
+    Raises ValueError for anything else, including booleans.
+    """
+    if value is None:
+        return default
+    # bool is a subclass of int, so JSON `true` would otherwise pass as 1.
+    if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
+        raise ValueError(f"`{name}` must be a positive integer.")
+    return value
+
+
+def _require_str(name: str, value: Any, required: bool = False) -> str | None:
+    """Return the stripped string, or None when the value is absent and optional.
+
+    Raises ValueError when a required value is missing or when the value is not
+    a non-empty string.
+    """
+    if value is None:
+        if required:
+            raise ValueError(f"`{name}` is required.")
+        return None
+    if not isinstance(value, str) or not value.strip():
+        raise ValueError(f"`{name}` must be a non-empty string.")
+    return value.strip()
+
+
+def _service_name(base_url: str) -> str:
+    """Return the host part of ``base_url`` with dots replaced by dashes."""
+    host = base_url.split("://", 1)[-1].split("/", 1)[0]
+    return host.replace(".", "-")
+
+
+def _build_url(base_url: str, path: str) -> str:
+    """Join ``base_url`` and ``path``; an absolute http(s) ``path`` is returned as is."""
+    if path.startswith(("http://", "https://")):
+        return path
+    return base_url.rstrip("/") + "/" + path.lstrip("/")
+
+
+def _get_by_path(value: Any, path: str) -> Any:
+    """Walk a dotted ``record_path`` through nested dicts and lists.
+
+    Raises ValueError when a segment cannot be applied to the current value.
+    """
+    current = value
+    for part in path.split("."):
+        if isinstance(current, list):
+            # isdecimal() admits only digits that int() can parse.
+            if not part.isdecimal():
+                raise ValueError(f"`record_path` segment {part!r} must be a list index.")
+            index = int(part)
+            if index >= len(current):
+                raise ValueError(f"`record_path` index {index} is out of range.")
+            current = current[index]
+        elif isinstance(current, dict):
+            if part not in current:
+                raise ValueError(f"`record_path` key {part!r} was not present in the response.")
+            current = current[part]
+        else:
+            raise ValueError(f"`record_path` segment {part!r} could not be applied.")
+    return current
+
+
+def _infer_target(data: Any) -> tuple[str | None, Any]:
+    """Guess which part of a JSON payload holds the records.
+
+    Returns ``(path, target)``: ``"$"`` for a top-level list, otherwise the first
+    list found under ``_embedded`` or under a well-known key, else ``(None, data)``.
+    """
+    if isinstance(data, list):
+        return "$", data
+    if isinstance(data, dict):
+        embedded = data.get("_embedded")
+        if isinstance(embedded, dict):
+            for key, value in embedded.items():
+                if isinstance(value, list):
+                    return f"_embedded.{key}", value
+        for key in (
+            "collection",
+            "results",
+            "structures",
+            "activities",
+            "molecules",
+            "mechanisms",
+            "records",
+            "items",
+        ):
+            value = data.get(key)
+            if isinstance(value, list):
+                return key, value
+    return None, data
+
+
+def _compact(value: Any, max_items: int, max_depth: int) -> Any:
+    """Return a size-bounded copy of ``value`` for display.
+
+    Strings are cut to 240 characters, lists and dicts to ``max_items`` entries,
+    and containers nested deeper than ``max_depth`` are replaced by ``"..."``.
+    """
+    if isinstance(value, str):
+        return value if len(value) <= 240 else value[:240] + "..."
+    if max_depth <= 0:
+        if isinstance(value, (dict, list)):
+            return "..."
+        return value
+    if isinstance(value, list):
+        compact_list = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]]
+        if len(value) > max_items:
+            compact_list.append(f"... (+{len(value) - max_items} more)")
+        return compact_list
+    if isinstance(value, dict):
+        compact_dict: dict[str, Any] = {}
+        items = list(value.items())
+        for key, item in items[:max_items]:
+            compact_dict[str(key)] = _compact(item, max_items, max_depth - 1)
+        if len(items) > max_items:
+            compact_dict["_truncated_keys"] = len(items) - max_items
+        return compact_dict
+    return value
+
+
+def _save_raw_output(
+    raw_output: str, raw_output_path: str | None, base_url: str, suffix: str
+) -> str:
+    """Write ``raw_output`` to ``raw_output_path`` and return the path as a string.
+
+    Without an explicit path the file is ``/tmp/<service>-raw.<suffix>``; missing
+    parent directories are created.
+    """
+    path = Path(raw_output_path or f"/tmp/{_service_name(base_url)}-raw.{suffix}")
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(raw_output, encoding="utf-8")
+    return str(path)
+
+
+def parse_input(payload: Any) -> dict[str, Any]:
+    """Validate the request payload and return a normalized config dict.
+
+    Applies the defaults (GET, ``auto`` response format, 5 items, depth 3,
+    30 second timeout, no raw output) and raises ValueError on invalid input.
+    """
+    if not isinstance(payload, dict):
+        raise ValueError("Input must be one JSON object.")
+    base_url = _require_str("base_url", payload.get("base_url"), required=True)
+    path = _require_str("path", payload.get("path"), required=True)
+    method = (_require_str("method", payload.get("method")) or "GET").upper()
+    if method not in {"GET", "POST"}:
+        raise ValueError("`method` must be GET or POST.")
+    json_body = payload.get("json_body")
+    form_body = payload.get("form_body")
+    if json_body is not None and form_body is not None:
+        raise ValueError("Provide only one of `json_body` or `form_body`.")
+    response_format = (
+        _require_str("response_format", payload.get("response_format")) or "auto"
+    ).lower()
+    if response_format not in {"auto", "json", "text"}:
+        raise ValueError("`response_format` must be auto, json, or text.")
+    return {
+        "base_url": base_url,
+        "path": path,
+        "method": method,
+        "params": _require_object("params", payload.get("params")),
+        "headers": _require_object("headers", payload.get("headers")),
+        "json_body": json_body,
+        "form_body": _require_object("form_body", form_body) if form_body is not None else None,
+        "record_path": _require_str("record_path", payload.get("record_path")),
+        "response_format": response_format,
+        "max_items": _require_int("max_items", payload.get("max_items"), 5),
+        "max_depth": _require_int("max_depth", payload.get("max_depth"), 3),
+        "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30),
+        "save_raw": _require_bool("save_raw", payload.get("save_raw"), False),
+        "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")),
+    }
+
+
+def _json_result(config: dict[str, Any], response: Any, data: Any) -> dict[str, Any]:
+    """Build the success envelope for a decoded JSON body (``records`` or ``summary``)."""
+    raw_output_path = None
+    if config["save_raw"]:
+        raw_output_path = _save_raw_output(
+            json.dumps(data, indent=2), config["raw_output_path"], config["base_url"], "json"
+        )
+
+    record_path = config["record_path"]
+    path_used, target = (
+        _infer_target(data)
+        if record_path is None
+        else (record_path, _get_by_path(data, record_path))
+    )
+    out = {
+        "ok": True,
+        "source": _service_name(config["base_url"]),
+        "path": config["path"],
+        "method": config["method"],
+        "status_code": response.status_code,
+        "record_path": path_used,
+        "raw_output_path": raw_output_path,
+        "warnings": [],
+    }
+    if isinstance(target, list):
+        records = target[: config["max_items"]]
+        out.update(
+            {
+                "record_count_returned": len(records),
+                "record_count_available": len(target),
+                "truncated": len(records) < len(target),
+                "records": _compact(records, config["max_items"], config["max_depth"]),
+            }
+        )
+    else:
+        out["summary"] = _compact(target, config["max_items"], config["max_depth"])
+        if isinstance(target, dict):
+            out["top_keys"] = list(target)[: config["max_items"]]
+    return out
+
+
+def _text_result(config: dict[str, Any], response: Any, content_type: str) -> dict[str, Any]:
+    """Build the success envelope for a body handled as text."""
+    raw_output_path = None
+    if config["save_raw"]:
+        raw_output_path = _save_raw_output(
+            response.text, config["raw_output_path"], config["base_url"], "txt"
+        )
+    text_head = response.text[:800]
+    return {
+        "ok": True,
+        "source": _service_name(config["base_url"]),
+        "path": config["path"],
+        "method": config["method"],
+        "status_code": response.status_code,
+        "content_type": content_type,
+        # The inline preview is omitted once the full body has been saved to disk.
+        "text_head": None if raw_output_path else text_head,
+        "text_head_truncated": False
+        if raw_output_path
+        else len(text_head) < len(response.text),
+        "raw_output_path": raw_output_path,
+        "warnings": [],
+    }
+
+
+def execute(payload: Any) -> dict[str, Any]:
+    """Validate ``payload``, send the request, and return a compact result envelope.
+
+    Raises ValueError for invalid input (from ``parse_input``). Transport and
+    response problems are returned as ``error(...)`` envelopes instead.
+    """
+    if requests is None:
+        return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}")
+    config = parse_input(payload)
+    url = _build_url(config["base_url"], config["path"])
+    response_format = config["response_format"]
+
+    request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]}
+    if config["json_body"] is not None:
+        request_kwargs["json"] = config["json_body"]
+    if config["form_body"] is not None:
+        request_kwargs["data"] = config["form_body"]
+
+    # The context manager closes the session on every exit path.
+    with requests.Session() as session:
+        session.headers.update(config["headers"])
+        try:
+            response = session.request(config["method"], url, **request_kwargs)
+            response.raise_for_status()
+            content_type = (response.headers.get("content-type") or "").lower()
+            declared_json = "json" in content_type
+            looks_like_json = response.text.lstrip().startswith(("{", "["))
+            if response_format == "json" or (
+                response_format == "auto" and (declared_json or looks_like_json)
+            ):
+                try:
+                    data = response.json()
+                except ValueError:
+                    # A body that merely starts with "{" or "[" is only a guess; fall
+                    # back to text unless JSON was requested or declared by the server.
+                    if response_format == "json" or declared_json:
+                        raise
+                else:
+                    return _json_result(config, response, data)
+            return _text_result(config, response, content_type)
+        except ValueError as exc:
+            return error("invalid_response", str(exc))
+        except requests.RequestException as exc:
+            return error("network_error", f"Request failed: {exc}")
+
+
+def main() -> int:
+    """Read one JSON payload from stdin and write the JSON result to stdout.
+
+    Returns the exit code: 0 on success, 1 when ``execute`` reports a failure,
+    and 2 when the input is not valid JSON or fails validation.
+    """
+    try:
+        payload = json.load(sys.stdin)
+    except Exception as exc:  # noqa: BLE001
+        sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}")))
+        return 2
+    try:
+        output = execute(payload)
+    except ValueError as exc:
+        output = error("invalid_input", str(exc))
+        code = 2
+    else:
+        code = 0 if output.get("ok") else 1
+    sys.stdout.write(json.dumps(output))
+    return code
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/plugins/life-science-research/skills/eqtl-catalogue-skill/SKILL.md b/plugins/life-science-research/skills/eqtl-catalogue-skill/SKILL.md new file mode 100644 index 0000000..f0a151c --- /dev/null +++ b/plugins/life-science-research/skills/eqtl-catalogue-skill/SKILL.md @@ -0,0 +1,40 @@ +--- +name: eqtl-catalogue-skill +description: Submit compact eQTL Catalogue API requests for association retrieval and documented metadata endpoints. Use when a user wants concise public eQTL Catalogue summaries +--- + +## Operating rules +- Use `scripts/rest_request.py` for all eQTL Catalogue calls. +- Use `base_url=https://www.ebi.ac.uk/eqtl/api`. +- Prefer targeted association endpoints over broad list endpoints. +- The public API currently appears strict about query validation, and live smoke tests returned intermittent `400`/`500`/timeout failures even with documented parameter sets; treat this source as usable but upstream-fragile. +- For association endpoints, the script now backfills compatibility defaults for `quant_method`, `p_lower`, `p_upper`, and blank filter strings because the live API is currently rejecting omitted optional filters. +- Prefer `variant_id` in requests; the script mirrors it to the legacy `snp` query key to accommodate the current server-side validator. +- Re-run requests in long conversations instead of relying on older tool output. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Return raw JSON only if the user explicitly asks for machine-readable output. +- Prefer documented versioned paths such as `v3/studies`, `v3/associations`, `v3/studies/{study_id}/associations`, or legacy `v1/.../associations` routes with explicit filters, and surface upstream `400`/`500` errors verbatim when they occur. + +## Input +- Read one JSON object from stdin. 
+- Required fields: `base_url`, `path` +- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common eQTL Catalogue patterns: + - `{"base_url":"https://www.ebi.ac.uk/eqtl/api","path":"v3/studies","max_items":10}` + - `{"base_url":"https://www.ebi.ac.uk/eqtl/api","path":"v3/associations","params":{"gene_id":"ENSG00000141510","rsid":"rs7903146","size":10},"max_items":10}` + - `{"base_url":"https://www.ebi.ac.uk/eqtl/api","path":"v1/genes/ENSG00000141510/associations","params":{"variant_id":"rs7903146","size":10},"max_items":10}` + +## Output +- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records` or a compact `summary`. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"base_url":"https://www.ebi.ac.uk/eqtl/api","path":"v3/studies","max_items":10}' | python scripts/rest_request.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. 
diff --git a/plugins/life-science-research/skills/eqtl-catalogue-skill/agents/openai.yaml b/plugins/life-science-research/skills/eqtl-catalogue-skill/agents/openai.yaml new file mode 100644 index 0000000..1757358 --- /dev/null +++ b/plugins/life-science-research/skills/eqtl-catalogue-skill/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "eQTL Catalogue" + short_description: "Fetch eQTL Catalogue summaries" diff --git a/plugins/life-science-research/skills/eqtl-catalogue-skill/scripts/rest_request.py b/plugins/life-science-research/skills/eqtl-catalogue-skill/scripts/rest_request.py new file mode 100644 index 0000000..7a8c1ea --- /dev/null +++ b/plugins/life-science-research/skills/eqtl-catalogue-skill/scripts/rest_request.py @@ -0,0 +1,373 @@ +#!/usr/bin/env python3 +"""Generic compact REST client for ChatGPT-imported skills.""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path +from typing import Any + +try: + import requests +except ImportError as exc: # pragma: no cover + requests = None + REQUESTS_IMPORT_ERROR = exc +else: + REQUESTS_IMPORT_ERROR = None + +EQTLCAT_BASE_URL = "https://www.ebi.ac.uk/eqtl/api" +EQTLCAT_ASSOCIATION_FLOAT_DEFAULTS = { + "p_lower": 0, + "p_upper": 1, +} +EQTLCAT_ASSOCIATION_STR_DEFAULTS = { + "quant_method": "ge", + "snp": "", + "study": "", + "tissue": "", + "gene_id": "", + "molecular_trait_id": "", + "qtl_group": "", +} + + +def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: + return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} + + +def _require_object(name: str, value: Any) -> dict[str, Any]: + if value is None: + return {} + if not isinstance(value, dict): + raise ValueError(f"`{name}` must be an object.") + return value + + +def _require_bool(name: str, value: Any, default: bool) -> bool: + if value is None: + return default + if not isinstance(value, bool): + raise ValueError(f"`{name}` must 
be a boolean.") + return value + + +def _require_int(name: str, value: Any, default: int) -> int: + if value is None: + return default + if not isinstance(value, int) or value <= 0: + raise ValueError(f"`{name}` must be a positive integer.") + return value + + +def _require_str(name: str, value: Any, required: bool = False) -> str | None: + if value is None: + if required: + raise ValueError(f"`{name}` is required.") + return None + if not isinstance(value, str) or not value.strip(): + raise ValueError(f"`{name}` must be a non-empty string.") + return value.strip() + + +def _service_name(base_url: str) -> str: + host = base_url.split("://", 1)[-1].split("/", 1)[0] + return host.replace(".", "-") + + +def _drop_none_values(value: dict[str, Any]) -> dict[str, Any]: + return {key: item for key, item in value.items() if item is not None} + + +def _build_url(base_url: str, path: str) -> str: + if path.startswith(("http://", "https://")): + return path + return base_url.rstrip("/") + "/" + path.lstrip("/") + + +def _get_by_path(value: Any, path: str) -> Any: + current = value + for part in path.split("."): + if isinstance(current, list): + if not part.isdigit(): + raise ValueError(f"`record_path` segment {part!r} must be a list index.") + index = int(part) + if index >= len(current): + raise ValueError(f"`record_path` index {index} is out of range.") + current = current[index] + elif isinstance(current, dict): + if part not in current: + raise ValueError(f"`record_path` key {part!r} was not present in the response.") + current = current[part] + else: + raise ValueError(f"`record_path` segment {part!r} could not be applied.") + return current + + +def _infer_target(data: Any) -> tuple[str | None, Any]: + if isinstance(data, list): + return "$", data + if isinstance(data, dict): + embedded = data.get("_embedded") + if isinstance(embedded, dict): + for key, value in embedded.items(): + if isinstance(value, list): + return f"_embedded.{key}", value + for key in ( + 
"collection", + "results", + "structures", + "activities", + "molecules", + "mechanisms", + "records", + "items", + ): + value = data.get(key) + if isinstance(value, list): + return key, value + return None, data + + +def _compact(value: Any, max_items: int, max_depth: int) -> Any: + if isinstance(value, str): + return value if len(value) <= 240 else value[:240] + "..." + if max_depth <= 0: + if isinstance(value, (dict, list)): + return "..." + return value + if isinstance(value, list): + out = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]] + if len(value) > max_items: + out.append(f"... (+{len(value) - max_items} more)") + return out + if isinstance(value, dict): + out: dict[str, Any] = {} + items = list(value.items()) + for key, item in items[:max_items]: + out[str(key)] = _compact(item, max_items, max_depth - 1) + if len(items) > max_items: + out["_truncated_keys"] = len(items) - max_items + return out + return value + + +def _save_raw_output( + raw_output: str, raw_output_path: str | None, base_url: str, suffix: str +) -> str: + path = Path(raw_output_path or f"/tmp/{_service_name(base_url)}-raw.{suffix}") + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(raw_output, encoding="utf-8") + return str(path) + + +def _is_eqtl_catalogue_association_request(base_url: str, path: str) -> bool: + return base_url.rstrip("/") == EQTLCAT_BASE_URL and "associations" in path + + +def _normalize_eqtl_catalogue_params( + base_url: str, path: str, params: dict[str, Any] +) -> dict[str, Any]: + normalized = _drop_none_values(params) + if not _is_eqtl_catalogue_association_request(base_url, path): + return normalized + + # The live eQTL Catalogue API currently validates these query keys as + # non-null even when the docs say they are optional. Backfill safe defaults + # so omitted filters do not trigger upstream Pydantic errors. 
+ variant_id = normalized.get("variant_id") + snp = normalized.get("snp") + if variant_id and not snp: + normalized["snp"] = variant_id + elif snp and not variant_id: + normalized["variant_id"] = snp + + for key, value in EQTLCAT_ASSOCIATION_FLOAT_DEFAULTS.items(): + if key not in normalized: + normalized[key] = value + for key, value in EQTLCAT_ASSOCIATION_STR_DEFAULTS.items(): + if key not in normalized: + normalized[key] = value + return normalized + + +def _extract_error_message(response: Any) -> str: + body = (getattr(response, "text", "") or "").strip() + content_type = str((getattr(response, "headers", {}) or {}).get("content-type") or "").lower() + + if "json" in content_type and hasattr(response, "json"): + try: + data = response.json() + except ValueError: + data = None + if isinstance(data, dict): + for key in ("message", "detail", "error"): + value = data.get(key) + if isinstance(value, str) and value.strip(): + return value.strip() + + if body: + return body[:800] + return f"HTTP {getattr(response, 'status_code', 'unknown')}" + + +def parse_input(payload: Any) -> dict[str, Any]: + if not isinstance(payload, dict): + raise ValueError("Input must be one JSON object.") + base_url = _require_str("base_url", payload.get("base_url"), required=True) + path = _require_str("path", payload.get("path"), required=True) + method = (_require_str("method", payload.get("method")) or "GET").upper() + if method not in {"GET", "POST"}: + raise ValueError("`method` must be GET or POST.") + json_body = payload.get("json_body") + form_body = payload.get("form_body") + if json_body is not None and form_body is not None: + raise ValueError("Provide only one of `json_body` or `form_body`.") + response_format = ( + _require_str("response_format", payload.get("response_format")) or "auto" + ).lower() + if response_format not in {"auto", "json", "text"}: + raise ValueError("`response_format` must be auto, json, or text.") + params = _require_object("params", 
payload.get("params")) + headers = _drop_none_values(_require_object("headers", payload.get("headers"))) + return { + "base_url": base_url, + "path": path, + "method": method, + "params": _normalize_eqtl_catalogue_params(base_url, path, params), + "headers": headers, + "json_body": json_body, + "form_body": _drop_none_values(_require_object("form_body", form_body)) + if form_body is not None + else None, + "record_path": _require_str("record_path", payload.get("record_path")), + "response_format": response_format, + "max_items": _require_int("max_items", payload.get("max_items"), 5), + "max_depth": _require_int("max_depth", payload.get("max_depth"), 3), + "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30), + "save_raw": _require_bool("save_raw", payload.get("save_raw"), False), + "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")), + } + + +def execute(payload: Any) -> dict[str, Any]: + if requests is None: + return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}") + config = parse_input(payload) + session = requests.Session() + session.headers.update(config["headers"]) + url = _build_url(config["base_url"], config["path"]) + + request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]} + if config["json_body"] is not None: + request_kwargs["json"] = config["json_body"] + if config["form_body"] is not None: + request_kwargs["data"] = config["form_body"] + + try: + response = session.request(config["method"], url, **request_kwargs) + response.raise_for_status() + content_type = (response.headers.get("content-type") or "").lower() + wants_json = config["response_format"] == "json" + wants_text = config["response_format"] == "text" + auto_json = not wants_text and ( + "json" in content_type or response.text.lstrip().startswith(("{", "[")) + ) + + if wants_json or auto_json: + data = response.json() + raw_output = json.dumps(data, indent=2) + 
raw_output_path = None + if config["save_raw"]: + raw_output_path = _save_raw_output( + raw_output, config["raw_output_path"], config["base_url"], "json" + ) + + record_path = config["record_path"] + path_used, target = ( + _infer_target(data) + if record_path is None + else (record_path, _get_by_path(data, record_path)) + ) + out = { + "ok": True, + "source": _service_name(config["base_url"]), + "path": config["path"], + "method": config["method"], + "status_code": response.status_code, + "record_path": path_used, + "raw_output_path": raw_output_path, + "warnings": [], + } + if isinstance(target, list): + records = target[: config["max_items"]] + out.update( + { + "record_count_returned": len(records), + "record_count_available": len(target), + "truncated": len(records) < len(target), + "records": _compact(records, config["max_items"], config["max_depth"]), + } + ) + else: + out["summary"] = _compact(target, config["max_items"], config["max_depth"]) + if isinstance(target, dict): + out["top_keys"] = list(target)[: config["max_items"]] + return out + + raw_output_path = None + if config["save_raw"]: + raw_output_path = _save_raw_output( + response.text, config["raw_output_path"], config["base_url"], "txt" + ) + text_head = response.text[:800] + return { + "ok": True, + "source": _service_name(config["base_url"]), + "path": config["path"], + "method": config["method"], + "status_code": response.status_code, + "content_type": content_type, + "text_head": None if raw_output_path else text_head, + "text_head_truncated": False + if raw_output_path + else len(text_head) < len(response.text), + "raw_output_path": raw_output_path, + "warnings": [], + } + except ValueError as exc: + return error("invalid_response", str(exc)) + except requests.HTTPError as exc: + response = exc.response + if response is None: + return error("http_error", f"Request failed: {exc}") + message = _extract_error_message(response) + return error( + "http_error", + f"HTTP {response.status_code} for 
{config['path']}: {message}", + ) + except requests.RequestException as exc: + return error("network_error", f"Request failed: {exc}") + finally: + session.close() + + +def main() -> int: + try: + payload = json.load(sys.stdin) + except Exception as exc: # noqa: BLE001 + sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}"))) + return 2 + try: + output = execute(payload) + except ValueError as exc: + output = error("invalid_input", str(exc)) + code = 2 + else: + code = 0 if output.get("ok") else 1 + sys.stdout.write(json.dumps(output)) + return code + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/plugins/life-science-research/skills/eqtl-catalogue-skill/scripts/test_rest_request.py b/plugins/life-science-research/skills/eqtl-catalogue-skill/scripts/test_rest_request.py new file mode 100644 index 0000000..8b41bc2 --- /dev/null +++ b/plugins/life-science-research/skills/eqtl-catalogue-skill/scripts/test_rest_request.py @@ -0,0 +1,95 @@ +import importlib.util +import unittest +from pathlib import Path +from unittest import mock + +import requests + +MODULE_PATH = Path(__file__).with_name("rest_request.py") +SPEC = importlib.util.spec_from_file_location("eqtl_catalogue_rest_request", MODULE_PATH) +rest_request = importlib.util.module_from_spec(SPEC) +assert SPEC.loader is not None +SPEC.loader.exec_module(rest_request) + + +class _FakeResponse: + def __init__(self, status_code: int, body: str, content_type: str = "application/json") -> None: + self.status_code = status_code + self.text = body + self.headers = {"content-type": content_type} + + def raise_for_status(self) -> None: + if self.status_code >= 400: + raise requests.HTTPError(f"{self.status_code} error", response=self) + + def json(self): + import json + + return json.loads(self.text) + + +class _FakeSession: + def __init__(self, response: _FakeResponse) -> None: + self.headers = {} + self._response = response + + def request(self, method: str, url: 
class RestRequestTests(unittest.TestCase):
    """Offline checks for the eQTL Catalogue specific behaviour of ``rest_request``."""

    def test_parse_input_backfills_eqtl_association_defaults(self) -> None:
        """Association paths mirror `variant_id` to `snp` and gain the default filters."""
        parsed = rest_request.parse_input(
            {
                "base_url": "https://www.ebi.ac.uk/eqtl/api",
                "path": "genes/ENSG00000141510/associations",
                "params": {"variant_id": "rs7903146", "size": 10},
            }
        )
        params = parsed["params"]

        # Checked in insertion order, matching the sequence of the individual assertions.
        expected = {
            "variant_id": "rs7903146",
            "snp": "rs7903146",
            "quant_method": "ge",
            "p_lower": 0,
            "p_upper": 1,
            "study": "",
            "tissue": "",
            "gene_id": "",
            "molecular_trait_id": "",
            "qtl_group": "",
        }
        for key, value in expected.items():
            self.assertEqual(params[key], value)

    def test_parse_input_does_not_backfill_other_services(self) -> None:
        """Params for a different base URL pass through untouched."""
        request = {
            "base_url": "https://www.ebi.ac.uk/gwas/rest/api/v2",
            "path": "associations",
            "params": {"mapped_gene": "BRCA1"},
        }

        self.assertEqual(rest_request.parse_input(request)["params"], {"mapped_gene": "BRCA1"})

    def test_execute_surfaces_http_error_body(self) -> None:
        """An HTTP 400 with a JSON `message` is reported as `http_error` with that message."""
        request = {
            "base_url": "https://www.ebi.ac.uk/eqtl/api",
            "path": "studies/BadStudy/associations",
            "params": {"size": 1},
        }
        fake_session = _FakeSession(
            _FakeResponse(400, '{"message":"upstream validation exploded"}')
        )

        with mock.patch.object(rest_request.requests, "Session", return_value=fake_session):
            result = rest_request.execute(request)

        self.assertFalse(result["ok"])
        self.assertEqual(result["error"]["code"], "http_error")
        for fragment in ("HTTP 400", "upstream validation exploded"):
            self.assertIn(fragment, result["error"]["message"])
--git a/plugins/life-science-research/skills/eva-skill/SKILL.md b/plugins/life-science-research/skills/eva-skill/SKILL.md new file mode 100644 index 0000000..76e357f --- /dev/null +++ b/plugins/life-science-research/skills/eva-skill/SKILL.md @@ -0,0 +1,37 @@ +--- +name: eva-skill +description: Submit compact EVA REST requests for species metadata and archived variant lookups. Use when a user wants concise European Variation Archive summaries +--- + +## Operating rules +- Use `scripts/rest_request.py` for all EVA calls. +- Use `base_url=https://www.ebi.ac.uk/eva/webservices/rest/v1`. +- Prefer metadata and targeted variant lookups over broad genomic window pulls. +- Keep region queries narrow by species, assembly, or small coordinate windows when possible. +- Re-run requests in long conversations instead of relying on older tool output. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Return raw JSON only if the user explicitly asks for machine-readable output. +- Prefer these paths: `meta/species/list` and targeted variant or region routes from the EVA REST API. + +## Input +- Read one JSON object from stdin. +- Required fields: `base_url`, `path` +- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common EVA patterns: + - `{"base_url":"https://www.ebi.ac.uk/eva/webservices/rest/v1","path":"meta/species/list","record_path":"response.0.result","max_items":10}` + - `{"base_url":"https://www.ebi.ac.uk/eva/webservices/rest/v1","path":"variants/rs699","max_items":10}` + +## Output +- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records` or a compact `summary`. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. 
+ +## Execution +```bash +echo '{"base_url":"https://www.ebi.ac.uk/eva/webservices/rest/v1","path":"meta/species/list","record_path":"response.0.result","max_items":10}' | python scripts/rest_request.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. diff --git a/plugins/life-science-research/skills/eva-skill/agents/openai.yaml b/plugins/life-science-research/skills/eva-skill/agents/openai.yaml new file mode 100644 index 0000000..95cd98e --- /dev/null +++ b/plugins/life-science-research/skills/eva-skill/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "EVA" + short_description: "Fetch European Variation Archive summaries" diff --git a/plugins/life-science-research/skills/eva-skill/scripts/rest_request.py b/plugins/life-science-research/skills/eva-skill/scripts/rest_request.py new file mode 100644 index 0000000..4de3cf1 --- /dev/null +++ b/plugins/life-science-research/skills/eva-skill/scripts/rest_request.py @@ -0,0 +1,291 @@ +#!/usr/bin/env python3 +"""Generic compact REST client for ChatGPT-imported skills.""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path +from typing import Any + +try: + import requests +except ImportError as exc: # pragma: no cover + requests = None + REQUESTS_IMPORT_ERROR = exc +else: + REQUESTS_IMPORT_ERROR = None + + +def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: + return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} + + +def _require_object(name: str, value: Any) -> dict[str, Any]: + if value is None: + return {} + if not isinstance(value, dict): + raise ValueError(f"`{name}` must be an object.") + return value + + +def _require_bool(name: str, value: Any, default: bool) -> bool: + if value is None: + return default + if not isinstance(value, bool): + raise ValueError(f"`{name}` must be a boolean.") + 
return value + + +def _require_int(name: str, value: Any, default: int) -> int: + if value is None: + return default + if not isinstance(value, int) or value <= 0: + raise ValueError(f"`{name}` must be a positive integer.") + return value + + +def _require_str(name: str, value: Any, required: bool = False) -> str | None: + if value is None: + if required: + raise ValueError(f"`{name}` is required.") + return None + if not isinstance(value, str) or not value.strip(): + raise ValueError(f"`{name}` must be a non-empty string.") + return value.strip() + + +def _service_name(base_url: str) -> str: + host = base_url.split("://", 1)[-1].split("/", 1)[0] + return host.replace(".", "-") + + +def _build_url(base_url: str, path: str) -> str: + if path.startswith(("http://", "https://")): + return path + return base_url.rstrip("/") + "/" + path.lstrip("/") + + +def _get_by_path(value: Any, path: str) -> Any: + current = value + for part in path.split("."): + if isinstance(current, list): + if not part.isdigit(): + raise ValueError(f"`record_path` segment {part!r} must be a list index.") + index = int(part) + if index >= len(current): + raise ValueError(f"`record_path` index {index} is out of range.") + current = current[index] + elif isinstance(current, dict): + if part not in current: + raise ValueError(f"`record_path` key {part!r} was not present in the response.") + current = current[part] + else: + raise ValueError(f"`record_path` segment {part!r} could not be applied.") + return current + + +def _infer_target(data: Any) -> tuple[str | None, Any]: + if isinstance(data, list): + return "$", data + if isinstance(data, dict): + embedded = data.get("_embedded") + if isinstance(embedded, dict): + for key, value in embedded.items(): + if isinstance(value, list): + return f"_embedded.{key}", value + for key in ( + "collection", + "results", + "structures", + "activities", + "molecules", + "mechanisms", + "records", + "items", + ): + value = data.get(key) + if isinstance(value, 
list): + return key, value + return None, data + + +def _compact(value: Any, max_items: int, max_depth: int) -> Any: + if isinstance(value, str): + return value if len(value) <= 240 else value[:240] + "..." + if max_depth <= 0: + if isinstance(value, (dict, list)): + return "..." + return value + if isinstance(value, list): + out = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]] + if len(value) > max_items: + out.append(f"... (+{len(value) - max_items} more)") + return out + if isinstance(value, dict): + out: dict[str, Any] = {} + items = list(value.items()) + for key, item in items[:max_items]: + out[str(key)] = _compact(item, max_items, max_depth - 1) + if len(items) > max_items: + out["_truncated_keys"] = len(items) - max_items + return out + return value + + +def _save_raw_output( + raw_output: str, raw_output_path: str | None, base_url: str, suffix: str +) -> str: + path = Path(raw_output_path or f"/tmp/{_service_name(base_url)}-raw.{suffix}") + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(raw_output, encoding="utf-8") + return str(path) + + +def parse_input(payload: Any) -> dict[str, Any]: + if not isinstance(payload, dict): + raise ValueError("Input must be one JSON object.") + base_url = _require_str("base_url", payload.get("base_url"), required=True) + path = _require_str("path", payload.get("path"), required=True) + method = (_require_str("method", payload.get("method")) or "GET").upper() + if method not in {"GET", "POST"}: + raise ValueError("`method` must be GET or POST.") + json_body = payload.get("json_body") + form_body = payload.get("form_body") + if json_body is not None and form_body is not None: + raise ValueError("Provide only one of `json_body` or `form_body`.") + response_format = ( + _require_str("response_format", payload.get("response_format")) or "auto" + ).lower() + if response_format not in {"auto", "json", "text"}: + raise ValueError("`response_format` must be auto, json, or text.") + return 
{ + "base_url": base_url, + "path": path, + "method": method, + "params": _require_object("params", payload.get("params")), + "headers": _require_object("headers", payload.get("headers")), + "json_body": json_body, + "form_body": _require_object("form_body", form_body) if form_body is not None else None, + "record_path": _require_str("record_path", payload.get("record_path")), + "response_format": response_format, + "max_items": _require_int("max_items", payload.get("max_items"), 5), + "max_depth": _require_int("max_depth", payload.get("max_depth"), 3), + "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30), + "save_raw": _require_bool("save_raw", payload.get("save_raw"), False), + "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")), + } + + +def execute(payload: Any) -> dict[str, Any]: + if requests is None: + return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}") + config = parse_input(payload) + session = requests.Session() + session.headers.update(config["headers"]) + url = _build_url(config["base_url"], config["path"]) + + request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]} + if config["json_body"] is not None: + request_kwargs["json"] = config["json_body"] + if config["form_body"] is not None: + request_kwargs["data"] = config["form_body"] + + try: + response = session.request(config["method"], url, **request_kwargs) + response.raise_for_status() + content_type = (response.headers.get("content-type") or "").lower() + wants_json = config["response_format"] == "json" + wants_text = config["response_format"] == "text" + auto_json = not wants_text and ( + "json" in content_type or response.text.lstrip().startswith(("{", "[")) + ) + + if wants_json or auto_json: + data = response.json() + raw_output = json.dumps(data, indent=2) + raw_output_path = None + if config["save_raw"]: + raw_output_path = _save_raw_output( + raw_output, 
config["raw_output_path"], config["base_url"], "json" + ) + + record_path = config["record_path"] + path_used, target = ( + _infer_target(data) + if record_path is None + else (record_path, _get_by_path(data, record_path)) + ) + out = { + "ok": True, + "source": _service_name(config["base_url"]), + "path": config["path"], + "method": config["method"], + "status_code": response.status_code, + "record_path": path_used, + "raw_output_path": raw_output_path, + "warnings": [], + } + if isinstance(target, list): + records = target[: config["max_items"]] + out.update( + { + "record_count_returned": len(records), + "record_count_available": len(target), + "truncated": len(records) < len(target), + "records": _compact(records, config["max_items"], config["max_depth"]), + } + ) + else: + out["summary"] = _compact(target, config["max_items"], config["max_depth"]) + if isinstance(target, dict): + out["top_keys"] = list(target)[: config["max_items"]] + return out + + raw_output_path = None + if config["save_raw"]: + raw_output_path = _save_raw_output( + response.text, config["raw_output_path"], config["base_url"], "txt" + ) + text_head = response.text[:800] + return { + "ok": True, + "source": _service_name(config["base_url"]), + "path": config["path"], + "method": config["method"], + "status_code": response.status_code, + "content_type": content_type, + "text_head": None if raw_output_path else text_head, + "text_head_truncated": False + if raw_output_path + else len(text_head) < len(response.text), + "raw_output_path": raw_output_path, + "warnings": [], + } + except ValueError as exc: + return error("invalid_response", str(exc)) + except requests.RequestException as exc: + return error("network_error", f"Request failed: {exc}") + finally: + session.close() + + +def main() -> int: + try: + payload = json.load(sys.stdin) + except Exception as exc: # noqa: BLE001 + sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}"))) + return 2 + try: + 
output = execute(payload) + except ValueError as exc: + output = error("invalid_input", str(exc)) + code = 2 + else: + code = 0 if output.get("ok") else 1 + sys.stdout.write(json.dumps(output)) + return code + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/plugins/life-science-research/skills/finngen-phewas-skill/SKILL.md b/plugins/life-science-research/skills/finngen-phewas-skill/SKILL.md new file mode 100644 index 0000000..14e0225 --- /dev/null +++ b/plugins/life-science-research/skills/finngen-phewas-skill/SKILL.md @@ -0,0 +1,41 @@ +--- +name: finngen-phewas-skill +description: Fetch compact FinnGen PheWAS summaries for single variants by accepting rsID, GRCh37, or GRCh38 input and resolving to the required GRCh38 query. Use when a user wants concise FinnGen association results for one variant +--- + +## Operating rules +- Use `scripts/finngen_phewas.py` for all FinnGen PheWAS lookups. +- Accept exactly one of `rsid`, `grch37`, `grch38`, or `variant`; resolve to the canonical GRCh38 `chr:pos-ref-alt` query before calling FinnGen. +- The script accepts `max_results`; start with `max_results=10` and only increase it if the first slice is insufficient. +- Re-run the lookup in long conversations instead of relying on older tool output. +- Treat displayed `...` in tool previews as UI truncation, not literal request content. +- If the user needs the full association payload, set `save_raw=true` and report `raw_output_path` instead of pasting large arrays into chat. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Return the JSON verbatim only if the user explicitly asks for machine-readable output. +- Surface the canonical queried variant, total association count, truncation status, and any returned `regions`. +- Increase `max_results` gradually instead of asking for large association dumps in one call. + +## Input +- Read one JSON object from stdin, or a single JSON string containing the variant. 
+- Required input: exactly one of `rsid`, `grch37`, `grch38`, or `variant` +- Optional fields: `max_results`, `save_raw`, `raw_output_path`, `timeout_sec` +- Common patterns: + - `{"grch38":"10:112998590-C-T","max_results":10}` + - `{"grch37":"10:114758349-C-T","max_results":10}` + - `{"rsid":"rs7903146","max_results":10}` + - `{"variant":"10:112998590:C:T","max_results":25,"save_raw":true}` + +## Output +- Success returns `ok`, `source`, `input`, `query_variant`, `max_results_applied`, `association_count`, `association_count_total`, `truncated`, `associations`, `variant`, `regions`, `variant_url`, `raw_output_path`, and `warnings`. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"grch38":"10:112998590-C-T","max_results":10}' | python scripts/finngen_phewas.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/finngen_phewas.py`. diff --git a/plugins/life-science-research/skills/finngen-phewas-skill/agents/openai.yaml b/plugins/life-science-research/skills/finngen-phewas-skill/agents/openai.yaml new file mode 100644 index 0000000..52cd3c1 --- /dev/null +++ b/plugins/life-science-research/skills/finngen-phewas-skill/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "FinnGen PheWAS" + short_description: "Fetch FinnGen PheWAS associations" diff --git a/plugins/life-science-research/skills/finngen-phewas-skill/scripts/finngen_phewas.py b/plugins/life-science-research/skills/finngen-phewas-skill/scripts/finngen_phewas.py new file mode 100644 index 0000000..1cfcee4 --- /dev/null +++ b/plugins/life-science-research/skills/finngen-phewas-skill/scripts/finngen_phewas.py @@ -0,0 +1,223 @@ +#!/usr/bin/env python3 +"""finngen-phewas + +Fetch FinnGen PheWAS associations for one variant input. 
def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]:
    """Build the failure envelope that the script prints as JSON.

    Args:
        code: Short machine-readable error identifier.
        message: Human-readable explanation.
        warnings: Optional warnings gathered before the failure; a missing or
            empty value becomes an empty list.

    Returns:
        A dict with ``ok`` set to ``False``, the nested ``error`` object and
        the ``warnings`` list, in that key order.
    """
    failure: dict[str, Any] = {"ok": False}
    failure["error"] = {"code": code, "message": message}
    failure["warnings"] = warnings if warnings else []
    return failure
def main() -> int:
    """Run one FinnGen PheWAS lookup: JSON request on stdin, JSON result on stdout.

    The input variant is resolved to a GRCh38 query, fetched from FinnGen,
    optionally written to disk in raw form, and summarised with at most
    ``max_results`` associations.

    Returns:
        Process exit code. ``0`` on success (a FinnGen 404 counts as success
        with an empty result and a warning), ``2`` when the input cannot be
        parsed or validated, ``1`` for resolution, network, response or
        file-write failures.
    """
    warnings: list[str] = []

    try:
        payload = json.load(sys.stdin)
    except Exception as exc:  # noqa: BLE001
        sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}")))
        return 2

    try:
        input_type, input_variant, max_results, save_raw, raw_output_path, timeout_sec = (
            parse_input(payload)
        )
    except ValueError as exc:
        sys.stdout.write(json.dumps(error("invalid_input", str(exc))))
        return 2

    try:
        resolution = resolve_query_variant(
            input_type=input_type,
            input_value=input_variant,
            target_build="GRCh38",
        )
        parsed = dict(resolution["query_variant"])
        warnings.extend(resolution["warnings"])
    except VariantResolutionError as exc:
        sys.stdout.write(json.dumps(error(exc.code, exc.message, exc.warnings)))
        return 1
    except requests.RequestException as exc:
        sys.stdout.write(json.dumps(error("network_error", f"Variant resolution failed: {exc}")))
        return 1
    except ValueError as exc:
        # The resolver raises plain ValueError for a malformed variant string
        # or an rsID that does not start with "rs". Report it as structured
        # invalid input instead of letting the script die with a traceback.
        # Kept after the RequestException handler so request errors that also
        # derive from ValueError are still reported as network errors.
        sys.stdout.write(json.dumps(error("invalid_input", str(exc))))
        return 2

    # The session is only needed for this one request; close it deterministically.
    with requests.Session() as session:
        try:
            data, status_code = fetch_finngen_variant(session, parsed["canonical"], timeout_sec)
        except requests.RequestException as exc:
            sys.stdout.write(json.dumps(error("network_error", f"FinnGen request failed: {exc}")))
            return 1
        except ValueError as exc:
            sys.stdout.write(
                json.dumps(error("invalid_response", f"FinnGen returned non-JSON: {exc}"))
            )
            return 1

    variant_url = f"{FINNGEN_BASE}/variant/{parsed['canonical']}"
    saved_raw_output_path: str | None = None
    # On a 404 `data` is None, so nothing is written even when save_raw is set.
    if save_raw and data is not None:
        raw_path = resolve_raw_output_path(parsed["canonical"], raw_output_path)
        try:
            write_raw_json(raw_path, data)
        except OSError as exc:
            sys.stdout.write(json.dumps(error("write_error", f"Could not write raw output: {exc}")))
            return 1
        saved_raw_output_path = str(raw_path)

    if status_code == 404:
        warnings.append("Variant not found in FinnGen PheWAS API (HTTP 404).")
        output = {
            "ok": True,
            "source": "finngen",
            "input": resolution["input"],
            "query_variant": parsed,
            "max_results_applied": max_results,
            "association_count": 0,
            "association_count_total": 0,
            "truncated": False,
            "associations": [],
            "variant": None,
            "regions": [],
            "variant_url": variant_url,
            "raw_output_path": None,
            "warnings": warnings,
        }
        sys.stdout.write(json.dumps(output))
        return 0

    associations = extract_associations(data)
    total = len(associations)
    if total > max_results:
        associations = associations[:max_results]
    truncated = len(associations) < total

    variant_info = data.get("variant") if isinstance(data, dict) else None
    regions = (
        data.get("regions")
        if isinstance(data, dict) and isinstance(data.get("regions"), list)
        else []
    )

    output = {
        "ok": True,
        "source": "finngen",
        "input": resolution["input"],
        "query_variant": parsed,
        "max_results_applied": max_results,
        "association_count": len(associations),
        "association_count_total": total,
        "truncated": truncated,
        "associations": associations,
        "variant": variant_info,
        "regions": regions,
        "variant_url": variant_url,
        "raw_output_path": saved_raw_output_path,
        "warnings": warnings,
    }
    sys.stdout.write(json.dumps(output))
    return 0
def extract_variant_input(payload: Any, *, default_build_key: str) -> tuple[str, str]:
    """Pick the single variant input out of a request payload.

    Args:
        payload: Either a bare variant string or a dict that carries exactly
            one of the keys ``rsid``, ``grch37``, ``grch38`` or ``variant``.
        default_build_key: Input type assigned to a bare string and to the
            build-agnostic ``variant`` key.

    Returns:
        ``(input_type, input_value)`` with the value stripped of surrounding
        whitespace. ``input_type`` is ``rsid``, ``grch37`` or ``grch38``, or
        ``default_build_key`` for a bare string or the ``variant`` key.

    Raises:
        ValueError: If the payload is neither a string nor a dict, or if it
            does not provide exactly one usable variant input.
    """
    if isinstance(payload, str):
        return default_build_key, payload.strip()

    if not isinstance(payload, dict):
        raise ValueError("Input must be a JSON string or object.")

    # Only non-empty string values count as provided; anything else is ignored.
    provided: list[tuple[str, str]] = []
    for key in ("rsid", "grch37", "grch38", "variant"):
        value = payload.get(key)
        if isinstance(value, str) and value.strip():
            provided.append((key, value.strip()))

    if not provided:
        # Name the accepted keys literally. The fourth key is `variant`, not
        # the default build key, which would repeat `grch37` or `grch38`.
        raise ValueError("Provide exactly one of `rsid`, `grch37`, `grch38`, or `variant`.")
    if len(provided) != 1:
        raise ValueError(
            "Provide exactly one variant input: `rsid`, `grch37`, `grch38`, or `variant`."
        )

    input_type, input_value = provided[0]
    if input_type == "variant":
        input_type = default_build_key
    return input_type, input_value
def resolve_position_both_builds(chrom: str, pos: int, build: str) -> dict[str, Any] | None:
    """Resolve a position on one build to an rsID and its coordinates on both builds.

    The position is looked up on ``build`` to obtain an rsID, and that rsID is
    then looked up on the other build. A failure of the second lookup is
    recorded as a warning rather than raised.

    Args:
        chrom: Chromosome name.
        pos: Position on ``build``.
        build: ``"GRCh37"`` or ``"hg19"`` for GRCh37; any other value is
            treated as GRCh38.

    Returns:
        ``None`` when no rsID is found at the position. Otherwise a dict with
        ``rsid``, ``grch38``, ``grch37``, ``ref``, ``alts`` and ``warnings``.
        For GRCh37 input the alleles from the GRCh38 lookup are preferred
        when that lookup returned any.
    """
    from_grch37 = build in {"hg19", "GRCh37"}

    hit = lookup_position(chrom, pos, build)
    if not hit:
        return None
    rsid, ref, alts = hit

    notes: list[str] = []
    counterpart = None
    try:
        counterpart = lookup_rsid(rsid, "GRCh38" if from_grch37 else "GRCh37")
    except Exception as exc:  # noqa: BLE001
        notes.append(f"Other-build lookup failed: {type(exc).__name__}: {exc}")

    given = {"chr": chrom, "pos": pos}
    mapped = {
        "chr": counterpart.chr if counterpart else None,
        "pos": counterpart.pos if counterpart else None,
    }

    if not from_grch37:
        return {
            "rsid": rsid,
            "grch38": given,
            "grch37": mapped,
            "ref": ref,
            "alts": alts,
            "warnings": notes,
        }

    # GRCh37 input: take each allele field from the GRCh38 lookup when it has one.
    if counterpart and counterpart.ref:
        ref = counterpart.ref
    if counterpart and counterpart.alts:
        alts = counterpart.alts
    return {
        "rsid": rsid,
        "grch38": mapped,
        "grch37": given,
        "ref": ref,
        "alts": alts,
        "warnings": notes,
    }
def resolve_query_variant(
    *,
    input_type: str,
    input_value: str,
    target_build: str,
) -> dict[str, Any]:
    """Produce the variant record to query on ``target_build``.

    Input already expressed on the target build is only parsed and
    canonicalised. Any other input type goes through ``resolve_variant``.

    Args:
        input_type: ``rsid``, ``grch37`` or ``grch38``.
        input_value: The rsID or variant string.
        target_build: Build the caller needs, e.g. ``"GRCh38"``.

    Returns:
        A dict with ``input``, ``query_variant``, ``rsid``, ``grch37``,
        ``grch38`` and ``warnings``.

    Raises:
        VariantResolutionError: With code ``resolution_failed`` when the
            resolved record for the target build is missing or has no
            canonical form.
        ValueError: Propagated from parsing when the variant string is malformed.
    """
    target_key = build_key_for(target_build)

    if input_type != target_key:
        resolved = resolve_variant(input_type, input_value)
        carried = list(resolved.get("warnings") or [])
        candidate = resolved.get(target_key)
        if isinstance(candidate, dict) and candidate.get("canonical"):
            return {
                "input": resolved["input"],
                "query_variant": candidate,
                "rsid": resolved.get("rsid"),
                "grch37": resolved.get("grch37"),
                "grch38": resolved.get("grch38"),
                "warnings": carried,
            }
        raise VariantResolutionError(
            "resolution_failed",
            f"Could not resolve input variant to {target_build}.",
            warnings=carried,
        )

    # Already on the target build: parse and canonicalise without any lookup.
    chrom, pos, ref, alt = parse_variant_string(input_value)
    record = build_variant_record(chrom, pos, ref, alt)
    on_grch37 = target_key == "grch37"
    on_grch38 = target_key == "grch38"
    return {
        "input": {"type": input_type, "value": input_value},
        "query_variant": record,
        "rsid": None,
        "grch37": record if on_grch37 else None,
        "grch38": record if on_grch38 else None,
        "warnings": [],
    }
b/plugins/life-science-research/skills/genebass-gene-burden-skill/SKILL.md new file mode 100644 index 0000000..a997c7a --- /dev/null +++ b/plugins/life-science-research/skills/genebass-gene-burden-skill/SKILL.md @@ -0,0 +1,36 @@ +--- +name: genebass-gene-burden-skill +description: Submit compact Genebass gene burden requests for one Ensembl gene ID and one burden set. Use when a user wants concise Genebass PheWAS summaries +--- + +## Operating rules +- Use `scripts/genebass_gene_burden.py` for all Genebass calls. +- This skill accepts one Ensembl gene ID per invocation. +- `max_results` is flexible; start around `25` for broad summaries and increase only if the user explicitly wants more associations. +- Re-run requests in long conversations instead of relying on older tool output. +- Treat displayed `...` in tool previews as UI truncation, not literal request content. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Return raw JSON only if the user explicitly asks for machine-readable output. +- Supported burden sets are `pLoF`, `missense|LC`, and `synonymous`, with the aliases already handled by the script. +- If the user needs the full result set, increase `max_results` deliberately instead of dumping everything by default. + +## Input +- Read JSON from stdin as either a string Ensembl ID or an object. +- String form: + - `"ENSG00000173531"` +- Object form: + - `{"ensembl_gene_id":"ENSG00000173531","burden_set":"pLoF","max_results":25}` + +## Output +- Success returns `ok`, `source`, input metadata, `gene`, association counts, `truncated`, and compact `associations`. +- Failure returns `ok=false` with `error.code` and `error.message`. 
+ +## Execution +```bash +echo '{"ensembl_gene_id":"ENSG00000173531","burden_set":"pLoF","max_results":25}' | python scripts/genebass_gene_burden.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/genebass_gene_burden.py`. diff --git a/plugins/life-science-research/skills/genebass-gene-burden-skill/agents/openai.yaml b/plugins/life-science-research/skills/genebass-gene-burden-skill/agents/openai.yaml new file mode 100644 index 0000000..297702f --- /dev/null +++ b/plugins/life-science-research/skills/genebass-gene-burden-skill/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "Genebass Gene Burden" + short_description: "Fetch Genebass gene burden summaries" diff --git a/plugins/life-science-research/skills/genebass-gene-burden-skill/scripts/genebass_gene_burden.py b/plugins/life-science-research/skills/genebass-gene-burden-skill/scripts/genebass_gene_burden.py new file mode 100644 index 0000000..c4c7495 --- /dev/null +++ b/plugins/life-science-research/skills/genebass-gene-burden-skill/scripts/genebass_gene_burden.py @@ -0,0 +1,287 @@ +#!/usr/bin/env python3 +"""genebass-gene-burden + +Fetch Genebass gene burden PheWAS associations for one Ensembl gene input. +Input JSON on stdin: + - {"ensembl_gene_id":"ENSG00000173531"} + - {"ensembl_gene_id":"ENSG00000173531","burden_set":"pLoF","max_results":100} + - "ENSG00000173531" +Output JSON on stdout. 
def normalize_gene_id(raw: str) -> str:
    """Validate an Ensembl gene ID and return it upper-cased without its version.

    Args:
        raw: Candidate ID; surrounding whitespace is ignored and a trailing
            ``.version`` suffix is accepted.

    Returns:
        The upper-cased ID with any ``.version`` suffix removed.

    Raises:
        ValueError: If the value is empty or does not match ``ENSG_RE``.
    """
    candidate = raw.strip()
    if candidate == "":
        raise ValueError("Ensembl gene ID is empty.")
    if ENSG_RE.match(candidate) is None:
        raise ValueError(
            "Invalid Ensembl gene ID. Expected format like ENSG00000173531 (optional .version)."
        )
    # Everything before the first dot is the stable ID; with no dot the
    # partition returns the whole string unchanged.
    stable_id, _, _ = candidate.upper().partition(".")
    return stable_id
def parse_input(payload: Any) -> tuple[str, str, int | None]:
    """Validate the request payload and extract the Genebass query parameters.

    Args:
        payload: Either a bare Ensembl gene ID string, or a dict with the gene
            under ``ensembl_gene_id`` (``gene_id`` and ``gene`` are accepted
            as fallbacks) plus optional ``burden_set`` and ``max_results``.

    Returns:
        ``(gene_id, burden_set, max_results)``. ``burden_set`` defaults to
        ``"pLoF"``. ``max_results`` is ``None`` when not supplied.

    Raises:
        ValueError: If the payload has the wrong shape or any field fails
            validation.
    """
    if isinstance(payload, str):
        return normalize_gene_id(payload), "pLoF", None

    if not isinstance(payload, dict):
        raise ValueError("Input must be a JSON string or object.")

    gene = payload.get("ensembl_gene_id") or payload.get("gene_id") or payload.get("gene")
    if not gene or not isinstance(gene, str):
        raise ValueError("Provide `ensembl_gene_id` as a non-empty string.")
    gene_id = normalize_gene_id(gene)

    burden_raw = payload.get("burden_set", "pLoF")
    if not isinstance(burden_raw, str) or not burden_raw.strip():
        raise ValueError("`burden_set` must be a non-empty string when provided.")
    burden_set = normalize_burden_set(burden_raw)

    max_results = payload.get("max_results")
    if max_results is None:
        return gene_id, burden_set, None

    # bool is a subclass of int, so JSON `true` would otherwise be accepted
    # and silently treated as max_results=1.
    if isinstance(max_results, bool) or not isinstance(max_results, int) or max_results <= 0:
        raise ValueError("`max_results` must be a positive integer when provided.")
    return gene_id, burden_set, max_results
def build_phenotype_id(row: dict[str, Any]) -> str:
    """Compose the phenotype identifier used to look up a description.

    The identifier joins ``trait_type``, ``phenocode``, ``pheno_sex``,
    ``coding`` and ``modifier`` with ``-``, in that order.

    Args:
        row: One PheWAS association row.

    Returns:
        The joined identifier. A field that is absent or explicitly ``None``
        contributes an empty segment.
    """
    fields = ("trait_type", "phenocode", "pheno_sex", "coding", "modifier")
    # A present-but-null field must not be rendered as the literal text
    # "None"; treat it like a missing field.
    # NOTE(review): assumes description keys use an empty segment for unset
    # fields - confirm against the Genebass phenotypes payload.
    segments = ("" if row.get(name) is None else str(row.get(name)) for name in fields)
    return "-".join(segments)
def main() -> int:
    """Run one Genebass gene-burden lookup: JSON request on stdin, JSON result on stdout.

    Returns:
        Process exit code. ``0`` on success (a 404 from Genebass counts as
        success with an empty result and a warning), ``2`` when the input
        cannot be parsed or validated, ``1`` for network or upstream failures.
    """
    notes: list[str] = []

    def _emit(document: dict[str, Any], exit_code: int) -> int:
        # Print the JSON document and hand back the exit code to return.
        sys.stdout.write(json.dumps(document))
        return exit_code

    try:
        payload = json.load(sys.stdin)
    except Exception as exc:  # noqa: BLE001
        return _emit(error("invalid_json", f"Could not parse JSON input: {exc}"), 2)

    try:
        gene_id, burden_set, max_results = parse_input(payload)
    except ValueError as exc:
        return _emit(error("invalid_input", str(exc)), 2)

    try:
        query_url, data, status_code = fetch_gene_phewas(gene_id, burden_set)
    except requests.RequestException as exc:
        return _emit(error("network_error", f"Genebass request failed: {exc}"), 1)
    except RuntimeError as exc:
        detail = str(exc)
        if "HTTP 500" not in detail:
            return _emit(error("upstream_error", detail), 1)
        # A 500 gets a dedicated code and a hint about the likely cause.
        hint = (
            "Genebass returned HTTP 500. This often means an unknown gene ID or invalid "
            "burden_set. Use Ensembl IDs and burden_set in "
            "{pLoF, missense|LC, synonymous} (aliases are accepted)."
        )
        return _emit(error("not_found_or_invalid_upstream_request", hint), 1)

    def _result(
        gene_summary: dict[str, Any] | None,
        shown: list[dict[str, Any]],
        total: int,
    ) -> dict[str, Any]:
        # Success envelope shared by the "not found" and normal paths.
        return {
            "ok": True,
            "source": "genebass",
            "input": {"type": "ensembl_gene_id", "value": gene_id},
            "burden_set": burden_set,
            "query_url": query_url,
            "gene": gene_summary,
            "association_count": len(shown),
            "association_count_total": total,
            "truncated": len(shown) < total,
            "associations": shown,
            "warnings": notes,
        }

    if status_code == 404:
        notes.append("Gene not found in Genebass PheWAS API (HTTP 404).")
        return _emit(_result(None, [], 0), 0)

    # Descriptions are best-effort: a failure here only adds a warning.
    description_map: dict[str, str] = {}
    try:
        description_map = build_description_map(fetch_phenotypes_metadata())
    except requests.RequestException as exc:
        notes.append(f"Could not fetch phenotype descriptions: {exc}")
    except RuntimeError as exc:
        notes.append(str(exc))

    gene, rows = unpack_phewas_payload(data)
    every_association = transform_rows(rows, description_map)
    shown = every_association if max_results is None else every_association[:max_results]

    gene_summary = (
        {
            "gene_id": gene.get("gene_id"),
            "symbol": gene.get("symbol"),
            "name": gene.get("name"),
        }
        if isinstance(gene, dict)
        else None
    )

    return _emit(_result(gene_summary, shown, len(every_association)), 0)
b/plugins/life-science-research/skills/gnomad-graphql-skill/SKILL.md @@ -0,0 +1,38 @@ +--- +name: gnomad-graphql-skill +description: Submit compact gnomAD GraphQL requests for frequency, gene constraint, and variant context queries. Use when a user wants concise gnomAD summaries +--- + +## Operating rules +- Use `scripts/gnomad_graphql.py` for all gnomAD GraphQL work. +- For nested GraphQL results, start with `max_items=3` to `5`. +- Keep selection sets narrow and page or filter at the query level instead of asking for broad dumps. +- Use `query_path` for long GraphQL documents instead of pasting large inline queries. +- Re-run requests in long conversations instead of relying on earlier tool output. +- Treat displayed `...` in tool previews as UI truncation, not part of the real query. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Return raw JSON only if the user explicitly asks for machine-readable output. +- Prefer targeted queries for variant frequency, gene constraint, or transcript consequence context. +- If the user needs the full payload, set `save_raw=true` and report the saved file path. + +## Input +- Read one JSON object from stdin. +- Required field: `query` or `query_path` +- Optional fields: `variables`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common gnomAD patterns: + - `{"query":"query { meta { clinvar_release_date } }"}` + - `{"query":"query Variant($variantId: String!, $dataset: DatasetId!) { variant(variantId: $variantId, dataset: $dataset) { variantId genome { ac an af } } }","variables":{"variantId":"1-55516888-G-GA","dataset":"gnomad_r4"},"max_items":3}` + +## Output +- Success returns `ok`, `source`, `top_keys`, a compact `summary`, and `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` such as `invalid_json`, `invalid_input`, `network_error`, `invalid_response`, or `graphql_error`. 
#!/usr/bin/env python3
"""Compact gnomAD GraphQL client for ChatGPT-imported skills.

Reads one JSON request object from stdin, POSTs the GraphQL document to the
gnomAD API, and writes exactly one JSON result object to stdout.
"""

import json
import sys
from pathlib import Path
from typing import Any, Optional

try:
    import requests
except ImportError as exc:  # pragma: no cover
    requests = None
    REQUESTS_IMPORT_ERROR = exc
else:
    REQUESTS_IMPORT_ERROR = None

ENDPOINT = "https://gnomad.broadinstitute.org/api"
DEFAULT_RAW_OUTPUT_PATH = "/tmp/gnomad-graphql.json"
MAX_STRING_LENGTH = 240
MAX_ERROR_LENGTH = 500
# Defaults for the positive-integer knobs, in the order they are validated.
INT_FIELD_DEFAULTS = {"max_items": 5, "max_depth": 3, "timeout_sec": 60}


def error(code: str, message: str, warnings: Optional[list[str]] = None) -> dict[str, Any]:
    """Return the failure envelope shared by every error path."""
    return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []}


def _compact(value: Any, max_items: int, max_depth: int) -> Any:
    """Return a size-bounded copy of ``value`` for display.

    Strings are cut at ``MAX_STRING_LENGTH`` characters, lists and dicts keep
    at most ``max_items`` entries (with a marker describing what was dropped),
    and containers below ``max_depth`` levels collapse to ``"..."``.
    """
    if isinstance(value, str):
        if len(value) <= MAX_STRING_LENGTH:
            return value
        return value[:MAX_STRING_LENGTH] + "..."
    if max_depth <= 0:
        return "..." if isinstance(value, (dict, list)) else value
    if isinstance(value, list):
        compacted = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]]
        if len(value) > max_items:
            compacted.append(f"... (+{len(value) - max_items} more)")
        return compacted
    if isinstance(value, dict):
        items = list(value.items())
        summary: dict[str, Any] = {
            str(key): _compact(item, max_items, max_depth - 1) for key, item in items[:max_items]
        }
        if len(items) > max_items:
            summary["_truncated_keys"] = len(items) - max_items
        return summary
    return value


def _require_positive_int(name: str, value: Any) -> int:
    """Validate a positive integer field.

    ``bool`` is a subclass of ``int``, so it is rejected explicitly; otherwise
    ``"timeout_sec": true`` would silently become a one-second timeout.
    """
    if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
        raise ValueError(f"`{name}` must be a positive integer.")
    return value


def _load_query(query: Any, query_path: Any) -> str:
    """Return the GraphQL document, preferring inline ``query`` over ``query_path``.

    Raises:
        ValueError: if the file cannot be read or contains only whitespace, so
            the caller reports ``invalid_input`` instead of a traceback.
    """
    if isinstance(query, str):
        return query.strip()
    try:
        text = Path(query_path).read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        raise ValueError(f"Could not read `query_path` {query_path!r}: {exc}") from exc
    if not text.strip():
        raise ValueError("`query_path` must point to a non-empty GraphQL document.")
    return text


def parse_input(payload: Any) -> dict[str, Any]:
    """Validate the stdin payload and return a normalized config dict.

    The input object is not modified.

    Raises:
        ValueError: on any missing, mistyped, or unreadable field.
    """
    if not isinstance(payload, dict):
        raise ValueError("Input must be one JSON object.")
    query = payload.get("query")
    query_path = payload.get("query_path")
    if query is None and query_path is None:
        raise ValueError("Provide `query` or `query_path`.")
    if query is not None and (not isinstance(query, str) or not query.strip()):
        raise ValueError("`query` must be a non-empty string.")
    if query_path is not None and (not isinstance(query_path, str) or not query_path.strip()):
        raise ValueError("`query_path` must be a non-empty string.")
    variables = payload.get("variables") or {}
    if not isinstance(variables, dict):
        raise ValueError("`variables` must be an object.")
    save_raw = payload.get("save_raw", False)
    if not isinstance(save_raw, bool):
        raise ValueError("`save_raw` must be a boolean.")
    raw_output_path = payload.get("raw_output_path")
    if raw_output_path is not None and (
        not isinstance(raw_output_path, str) or not raw_output_path.strip()
    ):
        raise ValueError("`raw_output_path` must be a non-empty string.")
    limits = {
        key: _require_positive_int(key, payload.get(key, default))
        for key, default in INT_FIELD_DEFAULTS.items()
    }
    return {
        "query": _load_query(query, query_path),
        "variables": variables,
        **limits,
        "save_raw": save_raw,
        "raw_output_path": raw_output_path.strip() if isinstance(raw_output_path, str) else None,
    }


def _save_raw(data: Any, target: Optional[str], warnings: list[str]) -> Optional[str]:
    """Best-effort dump of the full response; a failure becomes a warning."""
    path = Path(target or DEFAULT_RAW_OUTPUT_PATH)
    try:
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(json.dumps(data, indent=2), encoding="utf-8")
    except OSError as exc:
        warnings.append(f"Could not save raw output to {path}: {exc}")
        return None
    return str(path)


def _build_result(data: Any, config: dict[str, Any]) -> dict[str, Any]:
    """Turn a decoded GraphQL response into the success or failure envelope."""
    warnings: list[str] = []
    raw_output_path = None
    if config["save_raw"]:
        raw_output_path = _save_raw(data, config["raw_output_path"], warnings)

    # A JSON array/scalar/null is not a GraphQL response; guard before `.get`.
    if not isinstance(data, dict):
        return error("invalid_response", "GraphQL response was not a JSON object.", warnings)

    if data.get("errors"):
        if raw_output_path:
            warnings.append(f"raw_output_path={raw_output_path}")
        return error(
            "graphql_error", json.dumps(data["errors"])[:MAX_ERROR_LENGTH], warnings=warnings
        )

    payload_data = data.get("data")
    if not isinstance(payload_data, dict):
        return error(
            "invalid_response", "GraphQL response did not include a `data` object.", warnings
        )

    return {
        "ok": True,
        "source": "gnomad-graphql",
        "top_keys": list(payload_data)[: config["max_items"]],
        "summary": _compact(payload_data, config["max_items"], config["max_depth"]),
        "raw_output_path": raw_output_path,
        "warnings": warnings,
    }


def execute(payload: Any) -> dict[str, Any]:
    """Run one GraphQL request and return the result envelope.

    Raises:
        ValueError: if ``payload`` fails validation (mapped to
            ``invalid_input`` by ``main``).
    """
    if requests is None:
        return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}")
    config = parse_input(payload)
    try:
        response = requests.post(
            ENDPOINT,
            json={"query": config["query"], "variables": config["variables"]},
            timeout=config["timeout_sec"],
        )
        response.raise_for_status()
        data = response.json()
    except ValueError as exc:
        # Listed first: a JSON decode failure is a ValueError (and, in recent
        # requests releases, also a RequestException).
        return error("invalid_response", str(exc))
    except requests.RequestException as exc:
        return error("network_error", f"GraphQL request failed: {exc}")
    return _build_result(data, config)


def main() -> int:
    """CLI entry point; exit code 0 = ok, 1 = request failure, 2 = bad input."""
    try:
        payload = json.load(sys.stdin)
    except Exception as exc:  # noqa: BLE001
        sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}")))
        return 2
    try:
        output = execute(payload)
    except ValueError as exc:
        output = error("invalid_input", str(exc))
        code = 2
    else:
        code = 0 if output.get("ok") else 1
    sys.stdout.write(json.dumps(output))
    return code


if __name__ == "__main__":
    raise SystemExit(main())
+ +# Output + +Success shape: + +```json +{ + "ok": true, + "source": "gtex-v2", + "input": {"type": "grch38", "value": "10-112998590-C-T"}, + "query_variant": { + "chr": "10", + "pos": 112998590, + "ref": "C", + "alt": "T", + "canonical": "10:112998590-C-T", + "variant_id": "chr10_112998590_C_T_b38" + }, + "eqtl_count": 2, + "eqtl_count_total": 2, + "truncated": false, + "eqtls": [], + "paging_info": {}, + "warnings": [] +} +``` + +Failure shape: + +```json +{ + "ok": false, + "error": {"code": "...", "message": "..."}, + "warnings": [] +} +``` + +# Execution + +Use: + +- `scripts/gtex_eqtl.py` + +The script reads JSON from stdin and prints JSON to stdout. + +Example: + +```bash +echo '{"grch38":"10-112998590-C-T","max_results":5}' | python scripts/gtex_eqtl.py +``` diff --git a/plugins/life-science-research/skills/gtex-eqtl-skill/agents/openai.yaml b/plugins/life-science-research/skills/gtex-eqtl-skill/agents/openai.yaml new file mode 100644 index 0000000..ee75153 --- /dev/null +++ b/plugins/life-science-research/skills/gtex-eqtl-skill/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "GTEx eQTL" + short_description: "Fetch GTEx eQTL associations" diff --git a/plugins/life-science-research/skills/gtex-eqtl-skill/scripts/gtex_eqtl.py b/plugins/life-science-research/skills/gtex-eqtl-skill/scripts/gtex_eqtl.py new file mode 100644 index 0000000..1b3dd82 --- /dev/null +++ b/plugins/life-science-research/skills/gtex-eqtl-skill/scripts/gtex_eqtl.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +"""gtex-eqtl + +Fetch GTEx single-tissue eQTL associations for one variant input. +Input JSON on stdin: + - {"grch38":"10-112998590-C-T"} + - {"grch37":"10:114758349:C:T","max_results":100} + - {"rsid":"rs7903146","max_results":100} + - "10-112998590-C-T" +Output JSON on stdout. 
#!/usr/bin/env python3
"""gtex-eqtl

Fetch GTEx single-tissue eQTL associations for one variant input.
Input JSON on stdin:
  - {"grch38":"10-112998590-C-T"}
  - {"grch37":"10:114758349:C:T","max_results":100}
  - {"rsid":"rs7903146","max_results":100}
  - "10-112998590-C-T"
Output JSON on stdout.
"""

import json
import sys
from typing import Any, Optional

import requests
from variant_resolution import (
    VariantResolutionError,
    extract_variant_input,
    resolve_query_variant,
)

GTEX_API = "https://gtexportal.org/api/v2"
USER_AGENT = "gtex-eqtl-skill/1.0 (+requests)"
DEFAULT_TIMEOUT_S = 25


def error(code: str, message: str, warnings: Optional[list[str]] = None) -> dict[str, Any]:
    """Return the failure envelope shared by every error path."""
    return {
        "ok": False,
        "error": {"code": code, "message": message},
        "warnings": warnings or [],
    }


def parse_input(payload: Any) -> tuple[str, str, Optional[int]]:
    """Validate stdin JSON and return ``(input_type, variant, max_results)``.

    A bare string is treated as a GRCh38 variant with no row limit.

    Raises:
        ValueError: if the payload shape, variant keys, or ``max_results`` are
            invalid.
    """
    if isinstance(payload, str):
        return "grch38", payload.strip(), None

    if not isinstance(payload, dict):
        raise ValueError("Input must be a JSON string or object.")

    input_type, variant = extract_variant_input(payload, default_build_key="grch38")

    max_results = payload.get("max_results")
    if max_results is None:
        return input_type, variant, None

    # bool is a subclass of int; `"max_results": true` must not mean "1 row".
    if isinstance(max_results, bool) or not isinstance(max_results, int) or max_results <= 0:
        raise ValueError("`max_results` must be a positive integer when provided.")
    return input_type, variant, max_results


def build_variant_id(parsed: dict[str, Any]) -> str:
    """Format a resolved variant as a GTEx id: ``chr{chrom}_{pos}_{ref}_{alt}_b38``."""
    return f"chr{parsed['chr']}_{parsed['pos']}_{parsed['ref']}_{parsed['alt']}_b38"


def fetch_eqtls(variant_id: str) -> Any:
    """GET the single-tissue eQTL associations for ``variant_id``.

    Raises:
        requests.RequestException: on transport errors or non-2xx status.
        ValueError: if the body is not valid JSON.
    """
    resp = requests.get(
        f"{GTEX_API}/association/singleTissueEqtl",
        params={"variantId": variant_id},  # requests percent-encodes the value
        headers={"Accept": "application/json", "User-Agent": USER_AGENT},
        timeout=DEFAULT_TIMEOUT_S,
    )
    resp.raise_for_status()
    return resp.json()


def extract_rows(data: Any) -> list[Any]:
    """Return the eQTL rows from either ``{"data": [...]}`` or a bare list."""
    if isinstance(data, dict) and isinstance(data.get("data"), list):
        return data["data"]
    if isinstance(data, list):
        return data
    return []


def _emit(result: dict[str, Any], exit_code: int) -> int:
    """Write one JSON object to stdout and hand back the exit code."""
    sys.stdout.write(json.dumps(result))
    return exit_code


def main() -> int:
    """CLI entry point; exit code 0 = ok, 1 = lookup failure, 2 = bad input."""
    warnings: list[str] = []

    try:
        payload = json.load(sys.stdin)
    except Exception as exc:  # noqa: BLE001
        return _emit(error("invalid_json", f"Could not parse JSON input: {exc}"), 2)

    try:
        input_type, input_variant, max_results = parse_input(payload)
    except ValueError as exc:
        return _emit(error("invalid_input", str(exc)), 2)

    try:
        resolution = resolve_query_variant(
            input_type=input_type,
            input_value=input_variant,
            target_build="GRCh38",
        )
    except VariantResolutionError as exc:
        return _emit(error(exc.code, exc.message, exc.warnings), 1)
    except requests.RequestException as exc:
        return _emit(error("network_error", f"Variant resolution failed: {exc}"), 1)
    except ValueError as exc:
        # The resolver raises ValueError for malformed variant strings and
        # rsIDs; report them as input errors instead of crashing.
        return _emit(error("invalid_input", str(exc)), 2)

    parsed = dict(resolution["query_variant"])
    parsed["variant_id"] = build_variant_id(parsed)
    warnings.extend(resolution["warnings"])

    try:
        data = fetch_eqtls(parsed["variant_id"])
    except requests.RequestException as exc:
        return _emit(error("network_error", f"GTEx request failed: {exc}"), 1)
    except ValueError as exc:
        return _emit(error("invalid_response", f"GTEx returned non-JSON: {exc}"), 1)

    rows = extract_rows(data)
    total = len(rows)
    if max_results is not None and total > max_results:
        rows = rows[:max_results]
    truncated = len(rows) < total

    paging_info = data.get("paging_info") if isinstance(data, dict) else None
    # NOTE(review): assumes GTEx v2 reports the overall row count as
    # `paging_info.totalNumberOfItems`; confirm against the API docs. When the
    # key is absent this check is a no-op.
    if isinstance(paging_info, dict):
        reported_total = paging_info.get("totalNumberOfItems")
        if (
            isinstance(reported_total, int)
            and not isinstance(reported_total, bool)
            and reported_total > total
        ):
            warnings.append(
                f"GTEx reports {reported_total} eQTL rows but only the first page "
                f"({total}) was fetched."
            )

    return _emit(
        {
            "ok": True,
            "source": "gtex-v2",
            "input": resolution["input"],
            "query_variant": parsed,
            "eqtl_count": len(rows),
            "eqtl_count_total": total,
            "truncated": truncated,
            "eqtls": rows,
            "paging_info": paging_info,
            "warnings": warnings,
        },
        0,
    )


if __name__ == "__main__":
    raise SystemExit(main())
"""Resolve one variant input (rsID or chrom-pos-ref-alt) to GRCh37/GRCh38 via Ensembl REST."""

import re
from dataclasses import dataclass
from typing import Any, Optional
from urllib.parse import quote

try:
    import requests
except ImportError as exc:  # pragma: no cover
    # Guarded like the sibling REST/GraphQL scripts so the pure parsing helpers
    # stay importable; network helpers report the missing dependency.
    requests = None
    REQUESTS_IMPORT_ERROR = exc
else:
    REQUESTS_IMPORT_ERROR = None

ENSEMBL_GRCH38 = "https://rest.ensembl.org"
ENSEMBL_GRCH37 = "https://grch37.rest.ensembl.org"

DEFAULT_TIMEOUT_S = 15
USER_AGENT = "variant-coordinate-finder/1.0 (+requests)"

SEP_RE = re.compile(r"[-:_/\s]+")
CHR_RE = re.compile(r"^(?:chr)?([0-9]{1,2}|X|Y|M|MT)$", re.IGNORECASE)
ALLELE_RE = re.compile(r"^[A-Za-z*]+$")

_GRCH37_ALIASES = frozenset({"GRCh37", "hg19"})


class VariantResolutionError(Exception):
    """Resolution failure carrying a machine-readable ``code`` and ``warnings``."""

    def __init__(self, code: str, message: str, warnings: Optional[list[str]] = None):
        super().__init__(message)
        self.code = code
        self.message = message
        self.warnings = warnings or []


@dataclass
class Coord:
    """One Ensembl mapping: chromosome, 1-based start, and the split allele string."""

    chr: str
    pos: int
    ref: Optional[str]
    alts: list[str]


def _is_grch37(build: str) -> bool:
    """True for ``GRCh37``/``hg19``; every other value is treated as GRCh38."""
    return build in _GRCH37_ALIASES


def build_key_for(build: str) -> str:
    """Map a build name to the result-dict key (``"grch37"`` or ``"grch38"``)."""
    return "grch37" if _is_grch37(build) else "grch38"


def build_variant_record(
    chrom: str,
    pos: int,
    ref: Optional[str],
    alt: Optional[str],
) -> dict[str, Any]:
    """Return a variant dict; ``canonical`` is added only when both alleles are known."""
    record: dict[str, Any] = {
        "chr": chrom,
        "pos": pos,
        "ref": ref,
        "alt": alt,
    }
    if ref is not None and alt is not None:
        record["canonical"] = f"{chrom}:{pos}-{ref}-{alt}"
    return record


def parse_variant_string(value: str) -> tuple[str, int, str, str]:
    """Parse ``chrom-pos-ref-alt`` (separators ``- : _ /`` or whitespace).

    Returns ``(chrom, pos, ref, alt)`` with any ``chr`` prefix dropped, ``M``
    normalized to ``MT``, and alleles upper-cased.

    Raises:
        ValueError: if the string is empty or any component is invalid.
    """
    raw = value.strip()
    if not raw:
        raise ValueError("Variant string is empty.")

    parts = [part for part in SEP_RE.split(raw) if part]
    if len(parts) != 4:
        raise ValueError(
            "Invalid variant format. Expected chrom-pos-ref-alt with flexible separators."
        )

    chrom_raw, pos_raw, ref_raw, alt_raw = parts
    match = CHR_RE.match(chrom_raw)
    if not match:
        raise ValueError(f"Invalid chromosome: {chrom_raw!r}")

    chrom = match.group(1).upper()
    if chrom == "M":
        chrom = "MT"

    try:
        pos = int(pos_raw)
    except ValueError as exc:
        raise ValueError(f"Invalid position: {pos_raw!r}") from exc
    if pos <= 0:
        raise ValueError("Position must be > 0.")

    ref = ref_raw.upper()
    alt = alt_raw.upper()
    if not ALLELE_RE.match(ref):
        raise ValueError(f"Invalid REF allele: {ref_raw!r}")
    if not ALLELE_RE.match(alt):
        raise ValueError(f"Invalid ALT allele: {alt_raw!r}")

    return chrom, pos, ref, alt


def extract_variant_input(payload: Any, *, default_build_key: str) -> tuple[str, str]:
    """Pick the single variant field out of ``payload``.

    Returns ``(input_type, value)``. A bare string or a ``variant`` key is
    interpreted using ``default_build_key``.

    Raises:
        ValueError: if zero or more than one variant field is supplied.
    """
    if isinstance(payload, str):
        return default_build_key, payload.strip()

    if not isinstance(payload, dict):
        raise ValueError("Input must be a JSON string or object.")

    provided: list[tuple[str, str]] = []
    for key in ("rsid", "grch37", "grch38", "variant"):
        value = payload.get(key)
        if isinstance(value, str) and value.strip():
            provided.append((key, value.strip()))

    if not provided:
        # The fourth accepted key is literally `variant`; the default build only
        # decides how that key is interpreted.
        raise ValueError(
            "Provide exactly one of `rsid`, `grch37`, `grch38`, or `variant` "
            f"(`variant` is read as {default_build_key})."
        )
    if len(provided) != 1:
        raise ValueError(
            "Provide exactly one variant input: `rsid`, `grch37`, `grch38`, or `variant`."
        )

    input_type, input_value = provided[0]
    if input_type == "variant":
        input_type = default_build_key
    return input_type, input_value


def _server_for(build: str) -> str:
    """Return the Ensembl REST host serving ``build``."""
    return ENSEMBL_GRCH37 if _is_grch37(build) else ENSEMBL_GRCH38


def _assembly_cmp(build: str) -> str:
    """Return the ``assembly_name`` value Ensembl uses for ``build``."""
    return "GRCh37" if _is_grch37(build) else "GRCh38"


def _get_json(url: str, *, timeout: int = DEFAULT_TIMEOUT_S) -> Any:
    """GET ``url`` and decode the JSON body.

    Raises:
        VariantResolutionError: if ``requests`` is not installed.
        requests.RequestException: on transport errors or non-2xx status.
    """
    if requests is None:
        raise VariantResolutionError(
            "missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}"
        )
    headers = {
        "Accept": "application/json",
        "User-Agent": USER_AGENT,
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.json()


def lookup_rsid(rsid: str, build: str = "GRCh38") -> Optional[Coord]:
    """Return the first mapping of ``rsid`` on ``build``, or ``None`` if there is none."""
    server = _server_for(build)
    asm = _assembly_cmp(build)
    url = f"{server}/variation/human/{quote(rsid, safe='')}?content-type=application/json"

    data = _get_json(url, timeout=DEFAULT_TIMEOUT_S)
    mappings = data.get("mappings") if isinstance(data, dict) else None
    if not mappings:
        return None

    for mapping in mappings:
        if (
            isinstance(mapping, dict)
            and mapping.get("assembly_name") == asm
            and mapping.get("seq_region_name")
            and mapping.get("start") is not None
        ):
            allele_string = mapping.get("allele_string")
            # `allele_string` is split on "/": first entry is REF, the rest ALTs.
            alleles = allele_string.split("/") if isinstance(allele_string, str) else []
            return Coord(
                chr=str(mapping["seq_region_name"]),
                pos=int(mapping["start"]),
                ref=alleles[0] if alleles else None,
                alts=alleles[1:],
            )

    return None


def lookup_position(
    chrom: str,
    pos: int,
    build: str = "GRCh38",
) -> Optional[tuple[str, Optional[str], list[str]]]:
    """Return ``(rsid, ref, alts)`` for the first ``rs*`` variant overlapping ``chrom:pos``."""
    server = _server_for(build)
    url = (
        f"{server}/overlap/region/human/{chrom}:{pos}-{pos}"
        "?feature=variation;content-type=application/json"
    )

    data = _get_json(url, timeout=DEFAULT_TIMEOUT_S)
    if not isinstance(data, list) or not data:
        return None

    for variant in data:
        if (
            isinstance(variant, dict)
            and isinstance(variant.get("id"), str)
            and variant["id"].startswith("rs")
        ):
            alleles = variant.get("alleles") if isinstance(variant.get("alleles"), list) else []
            ref = alleles[0] if alleles else None
            return variant["id"], ref, alleles[1:]

    return None


def _coords_of(coord: Optional[Coord]) -> dict[str, Any]:
    """Return ``{"chr", "pos"}`` for a mapping, with ``None`` values when unmapped."""
    return {"chr": coord.chr if coord else None, "pos": coord.pos if coord else None}


def resolve_rsid_both_builds(rsid: str) -> dict[str, Any]:
    """Look ``rsid`` up on both builds; per-build failures become warnings."""
    g38 = None
    g37 = None
    warnings: list[str] = []

    try:
        g38 = lookup_rsid(rsid, "GRCh38")
    except Exception as exc:  # noqa: BLE001
        warnings.append(f"GRCh38 lookup failed: {type(exc).__name__}: {exc}")

    try:
        g37 = lookup_rsid(rsid, "GRCh37")
    except Exception as exc:  # noqa: BLE001
        warnings.append(f"GRCh37 lookup failed: {type(exc).__name__}: {exc}")

    # Prefer GRCh38 alleles and fall back to GRCh37 when GRCh38 has none.
    ref = (g38.ref if g38 else None) or (g37.ref if g37 else None)
    alts = (g38.alts if (g38 and g38.alts) else []) or (g37.alts if g37 else [])

    return {
        "rsid": rsid,
        "grch38": _coords_of(g38),
        "grch37": _coords_of(g37),
        "ref": ref,
        "alts": alts,
        "warnings": warnings,
    }


def resolve_position_both_builds(chrom: str, pos: int, build: str) -> Optional[dict[str, Any]]:
    """Find the rsID at ``chrom:pos`` on ``build`` and map it to the other build.

    Returns ``None`` when no ``rs*`` variant overlaps the position. A failed
    other-build lookup is reported in ``warnings`` rather than raised.
    """
    is_hg19 = _is_grch37(build)
    other_build = "GRCh38" if is_hg19 else "GRCh37"

    pos_result = lookup_position(chrom, pos, build)
    if not pos_result:
        return None

    rsid, ref, alts = pos_result

    other = None
    warnings: list[str] = []
    try:
        other = lookup_rsid(rsid, other_build)
    except Exception as exc:  # noqa: BLE001
        warnings.append(f"Other-build lookup failed: {type(exc).__name__}: {exc}")

    if is_hg19:
        # For GRCh37 input the GRCh38 alleles win whenever they are available.
        return {
            "rsid": rsid,
            "grch38": _coords_of(other),
            "grch37": {"chr": chrom, "pos": pos},
            "ref": (other.ref if other and other.ref else ref),
            "alts": (other.alts if other and other.alts else alts),
            "warnings": warnings,
        }

    return {
        "rsid": rsid,
        "grch38": {"chr": chrom, "pos": pos},
        "grch37": _coords_of(other),
        "ref": ref,
        "alts": alts,
        "warnings": warnings,
    }


def _record_for(
    resolved: dict[str, Any], key: str, ref: Optional[str], alt: Optional[str]
) -> Optional[dict[str, Any]]:
    """Build the variant record for build ``key``, or ``None`` if it is unmapped."""
    coords = resolved.get(key) or {}
    if coords.get("chr") and coords.get("pos"):
        return build_variant_record(coords["chr"], coords["pos"], ref, alt)
    return None


def resolve_variant(input_type: str, input_value: str) -> dict[str, Any]:
    """Resolve an rsID or build-specific variant string on both builds.

    Raises:
        ValueError: for a malformed rsID/variant string or unknown ``input_type``.
        VariantResolutionError: ``not_found`` when no rsID overlaps the position.
    """
    warnings: list[str] = []

    if input_type == "rsid":
        rsid = input_value.strip()
        if not rsid.startswith("rs"):
            raise ValueError("rsid must start with 'rs'.")

        resolved = resolve_rsid_both_builds(rsid)
        warnings.extend(resolved.get("warnings", []))

        ref = resolved.get("ref")
        alts = resolved.get("alts") or []
        alt = alts[0] if alts else None

        return {
            "input": {"type": "rsid", "value": rsid},
            "rsid": rsid,
            "grch37": _record_for(resolved, "grch37", ref, alt),
            "grch38": _record_for(resolved, "grch38", ref, alt),
            "warnings": warnings,
        }

    if input_type not in {"grch37", "grch38"}:
        raise ValueError(f"Unsupported input type: {input_type!r}")

    build = "GRCh37" if input_type == "grch37" else "GRCh38"
    chrom, pos, ref_in, alt_in = parse_variant_string(input_value)

    resolved = resolve_position_both_builds(chrom, pos, build)
    if not resolved:
        raise VariantResolutionError(
            "not_found",
            f"No rsID found at {chrom}:{pos} on {build} via Ensembl overlap endpoint.",
        )

    warnings.extend(resolved.get("warnings", []))

    ref = resolved.get("ref")
    alts = resolved.get("alts") or []
    if ref and ref_in != ref:
        warnings.append(f"Input ref {ref_in} != resolved ref {ref}; keeping resolved ref.")

    # Keep the caller's ALT when Ensembl knows it; otherwise use the first
    # resolved ALT, or the input when Ensembl reports none.
    alt = alt_in if alt_in in alts else (alts[0] if alts else alt_in)
    if alts and alt_in not in alts:
        warnings.append(f"Input alt {alt_in} not among resolved alts {alts}; using {alt}.")

    return {
        "input": {"type": input_type, "value": input_value},
        "rsid": resolved.get("rsid"),
        "grch37": _record_for(resolved, "grch37", ref, alt),
        "grch38": _record_for(resolved, "grch38", ref, alt),
        "warnings": warnings,
    }


def resolve_query_variant(
    *,
    input_type: str,
    input_value: str,
    target_build: str,
) -> dict[str, Any]:
    """Return the variant to query on ``target_build``.

    Input already on the target build is parsed locally with no network call;
    anything else is resolved through Ensembl.

    Raises:
        ValueError: for malformed input.
        VariantResolutionError: when the variant cannot be mapped to
            ``target_build``.
    """
    target_key = build_key_for(target_build)
    if input_type == target_key:
        chrom, pos, ref, alt = parse_variant_string(input_value)
        target_variant = build_variant_record(chrom, pos, ref, alt)
        return {
            "input": {"type": input_type, "value": input_value},
            "query_variant": target_variant,
            "rsid": None,
            "grch37": target_variant if target_key == "grch37" else None,
            "grch38": target_variant if target_key == "grch38" else None,
            "warnings": [],
        }

    resolved = resolve_variant(input_type, input_value)
    target_variant = resolved.get(target_key)
    if not isinstance(target_variant, dict) or not target_variant.get("canonical"):
        raise VariantResolutionError(
            "resolution_failed",
            f"Could not resolve input variant to {target_build}.",
            warnings=list(resolved.get("warnings") or []),
        )

    return {
        "input": resolved["input"],
        "query_variant": target_variant,
        "rsid": resolved.get("rsid"),
        "grch37": resolved.get("grch37"),
        "grch38": resolved.get("grch38"),
        "warnings": list(resolved.get("warnings") or []),
    }
mode 100644 index 0000000..4d90a8e --- /dev/null +++ b/plugins/life-science-research/skills/gwas-catalog-skill/SKILL.md @@ -0,0 +1,41 @@ +--- +name: gwas-catalog-skill +description: Submit compact GWAS Catalog REST API v2 requests for studies, associations, SNPs, EFO traits, genes, publications, loci, and metadata. Use when a user wants concise GWAS Catalog summaries. +--- + +## Operating rules +- Use `scripts/rest_request.py` for all GWAS Catalog API calls. +- Use `base_url=https://www.ebi.ac.uk/gwas/rest/api/v2`. +- The script accepts `max_items`; for collection endpoints, start with API `size=10` and `max_items=10`. +- Single-resource endpoints such as `studies/{accession_id}` generally do not need `max_items`. +- Use `record_path` to target `_embedded.{resource}` lists, for example `_embedded.studies`. +- Re-run requests in long conversations instead of relying on older tool output. +- Treat displayed `...` in tool previews as UI truncation, not literal request content. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Return the script JSON verbatim only if the user explicitly asks for machine-readable output. +- Prefer these paths: `metadata`, `studies`, `studies/{accession_id}`, `associations`, `snps`, `efoTraits`, `genes`, `publications`, and `loci`. +- Use `save_raw=true` if the user needs the full HATEOAS payload or pagination links. + +## Input +- Read one JSON object from stdin.
+- Required fields: `base_url`, `path` +- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common GWAS Catalog patterns: + - `{"base_url":"https://www.ebi.ac.uk/gwas/rest/api/v2","path":"metadata"}` + - `{"base_url":"https://www.ebi.ac.uk/gwas/rest/api/v2","path":"studies","params":{"efo_trait":"asthma","size":10},"record_path":"_embedded.studies","max_items":10}` + - `{"base_url":"https://www.ebi.ac.uk/gwas/rest/api/v2","path":"associations","params":{"mapped_gene":"BRCA1","size":10},"record_path":"_embedded.associations","max_items":10}` + +## Output +- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records` or a compact `summary`. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"base_url":"https://www.ebi.ac.uk/gwas/rest/api/v2","path":"studies","params":{"efo_trait":"asthma","size":10},"record_path":"_embedded.studies","max_items":10}' | python scripts/rest_request.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. 
#!/usr/bin/env python3
"""Generic compact REST client for ChatGPT-imported skills.

Reads one JSON request description from stdin, performs a single GET or POST,
and writes one compact JSON result object to stdout.
"""

import json
import sys
from pathlib import Path
from typing import Any, Optional

try:
    import requests
except ImportError as exc:  # pragma: no cover
    requests = None
    REQUESTS_IMPORT_ERROR = exc
else:
    REQUESTS_IMPORT_ERROR = None

MAX_STRING_LENGTH = 240
TEXT_HEAD_LENGTH = 800
# Top-level keys probed, in order, when looking for the record list.
COLLECTION_KEYS = (
    "collection",
    "results",
    "structures",
    "activities",
    "molecules",
    "mechanisms",
    "records",
    "items",
)


def error(code: str, message: str, warnings: Optional[list[str]] = None) -> dict[str, Any]:
    """Return the failure envelope shared by every error path."""
    return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []}


def _require_object(name: str, value: Any) -> dict[str, Any]:
    """Return ``value`` as a dict (``None`` becomes ``{}``) or raise ``ValueError``."""
    if value is None:
        return {}
    if not isinstance(value, dict):
        raise ValueError(f"`{name}` must be an object.")
    return value


def _require_bool(name: str, value: Any, default: bool) -> bool:
    """Return ``value`` as a bool (``None`` becomes ``default``) or raise ``ValueError``."""
    if value is None:
        return default
    if not isinstance(value, bool):
        raise ValueError(f"`{name}` must be a boolean.")
    return value


def _require_int(name: str, value: Any, default: int) -> int:
    """Return ``value`` as a positive int (``None`` becomes ``default``).

    ``bool`` is a subclass of ``int`` and is rejected explicitly so that
    ``"timeout_sec": true`` cannot silently become a one-second timeout.
    """
    if value is None:
        return default
    if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
        raise ValueError(f"`{name}` must be a positive integer.")
    return value


def _require_str(name: str, value: Any, required: bool = False) -> Optional[str]:
    """Return ``value`` stripped, or ``None`` when absent and not ``required``."""
    if value is None:
        if required:
            raise ValueError(f"`{name}` is required.")
        return None
    if not isinstance(value, str) or not value.strip():
        raise ValueError(f"`{name}` must be a non-empty string.")
    return value.strip()


def _service_name(base_url: str) -> str:
    """Derive a label from the URL host, e.g. ``www.ebi.ac.uk`` -> ``www-ebi-ac-uk``."""
    host = base_url.split("://", 1)[-1].split("/", 1)[0]
    return host.replace(".", "-")


def _build_url(base_url: str, path: str) -> str:
    """Join ``base_url`` and ``path``; an absolute ``path`` URL is used as-is."""
    if path.startswith(("http://", "https://")):
        return path
    return base_url.rstrip("/") + "/" + path.lstrip("/")


def _get_by_path(value: Any, path: str) -> Any:
    """Walk a dotted ``record_path`` (dict keys and list indexes) through ``value``.

    Raises:
        ValueError: if a segment is missing, out of range, or cannot be applied.
    """
    current = value
    for part in path.split("."):
        if isinstance(current, list):
            if not part.isdigit():
                raise ValueError(f"`record_path` segment {part!r} must be a list index.")
            index = int(part)
            if index >= len(current):
                raise ValueError(f"`record_path` index {index} is out of range.")
            current = current[index]
        elif isinstance(current, dict):
            if part not in current:
                raise ValueError(f"`record_path` key {part!r} was not present in the response.")
            current = current[part]
        else:
            raise ValueError(f"`record_path` segment {part!r} could not be applied.")
    return current


def _infer_target(data: Any) -> tuple[Optional[str], Any]:
    """Guess where the record list lives; returns ``(path_used, target)``.

    Checks, in order: a top-level list, the first list under ``_embedded``,
    then the keys in ``COLLECTION_KEYS``. Falls back to ``(None, data)``.
    """
    if isinstance(data, list):
        return "$", data
    if isinstance(data, dict):
        embedded = data.get("_embedded")
        if isinstance(embedded, dict):
            for key, value in embedded.items():
                if isinstance(value, list):
                    return f"_embedded.{key}", value
        for key in COLLECTION_KEYS:
            value = data.get(key)
            if isinstance(value, list):
                return key, value
    return None, data


def _compact(value: Any, max_items: int, max_depth: int) -> Any:
    """Return a size-bounded copy of ``value`` for display.

    Strings are cut at ``MAX_STRING_LENGTH`` characters, lists and dicts keep
    at most ``max_items`` entries (with a marker describing what was dropped),
    and containers below ``max_depth`` levels collapse to ``"..."``.
    """
    if isinstance(value, str):
        if len(value) <= MAX_STRING_LENGTH:
            return value
        return value[:MAX_STRING_LENGTH] + "..."
    if max_depth <= 0:
        return "..." if isinstance(value, (dict, list)) else value
    if isinstance(value, list):
        compacted = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]]
        if len(value) > max_items:
            compacted.append(f"... (+{len(value) - max_items} more)")
        return compacted
    if isinstance(value, dict):
        items = list(value.items())
        summary: dict[str, Any] = {
            str(key): _compact(item, max_items, max_depth - 1) for key, item in items[:max_items]
        }
        if len(items) > max_items:
            summary["_truncated_keys"] = len(items) - max_items
        return summary
    return value


def _save_raw_output(
    raw_output: str, raw_output_path: Optional[str], base_url: str, suffix: str
) -> str:
    """Write ``raw_output`` to disk and return the path that was written."""
    path = Path(raw_output_path or f"/tmp/{_service_name(base_url)}-raw.{suffix}")
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(raw_output, encoding="utf-8")
    return str(path)


def _try_save_raw(
    raw_output: str, config: dict[str, Any], suffix: str, warnings: list[str]
) -> Optional[str]:
    """Best-effort raw dump: a filesystem failure becomes a warning, not a crash."""
    try:
        return _save_raw_output(raw_output, config["raw_output_path"], config["base_url"], suffix)
    except OSError as exc:
        warnings.append(f"Could not save raw output: {exc}")
        return None


def parse_input(payload: Any) -> dict[str, Any]:
    """Validate the stdin payload and return a normalized config dict.

    Raises:
        ValueError: on any missing or mistyped field.
    """
    if not isinstance(payload, dict):
        raise ValueError("Input must be one JSON object.")
    base_url = _require_str("base_url", payload.get("base_url"), required=True)
    path = _require_str("path", payload.get("path"), required=True)
    method = (_require_str("method", payload.get("method")) or "GET").upper()
    if method not in {"GET", "POST"}:
        raise ValueError("`method` must be GET or POST.")
    json_body = payload.get("json_body")
    form_body = payload.get("form_body")
    if json_body is not None and form_body is not None:
        raise ValueError("Provide only one of `json_body` or `form_body`.")
    response_format = (
        _require_str("response_format", payload.get("response_format")) or "auto"
    ).lower()
    if response_format not in {"auto", "json", "text"}:
        raise ValueError("`response_format` must be auto, json, or text.")
    return {
        "base_url": base_url,
        "path": path,
        "method": method,
        "params": _require_object("params", payload.get("params")),
        "headers": _require_object("headers", payload.get("headers")),
        "json_body": json_body,
        "form_body": _require_object("form_body", form_body) if form_body is not None else None,
        "record_path": _require_str("record_path", payload.get("record_path")),
        "response_format": response_format,
        "max_items": _require_int("max_items", payload.get("max_items"), 5),
        "max_depth": _require_int("max_depth", payload.get("max_depth"), 3),
        "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30),
        "save_raw": _require_bool("save_raw", payload.get("save_raw"), False),
        "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")),
    }


def _should_parse_json(response: Any, response_format: str) -> bool:
    """Decide whether the body is decoded as JSON.

    ``json`` forces it, ``text`` forbids it, and ``auto`` sniffs the
    content type and the first non-blank character of the body.
    """
    if response_format == "json":
        return True
    if response_format == "text":
        return False
    content_type = (response.headers.get("content-type") or "").lower()
    return "json" in content_type or response.text.lstrip().startswith(("{", "["))


def _json_result(response: Any, config: dict[str, Any]) -> dict[str, Any]:
    """Build the success envelope for a JSON body.

    Raises:
        ValueError: if the body is not valid JSON or ``record_path`` does not
            match the response.
    """
    data = response.json()
    warnings: list[str] = []
    raw_output_path = None
    if config["save_raw"]:
        raw_output_path = _try_save_raw(json.dumps(data, indent=2), config, "json", warnings)

    record_path = config["record_path"]
    if record_path is None:
        path_used, target = _infer_target(data)
    else:
        path_used, target = record_path, _get_by_path(data, record_path)

    out: dict[str, Any] = {
        "ok": True,
        "source": _service_name(config["base_url"]),
        "path": config["path"],
        "method": config["method"],
        "status_code": response.status_code,
        "record_path": path_used,
        "raw_output_path": raw_output_path,
        "warnings": warnings,
    }
    if isinstance(target, list):
        records = target[: config["max_items"]]
        out.update(
            {
                "record_count_returned": len(records),
                "record_count_available": len(target),
                "truncated": len(records) < len(target),
                "records": _compact(records, config["max_items"], config["max_depth"]),
            }
        )
    else:
        out["summary"] = _compact(target, config["max_items"], config["max_depth"])
        if isinstance(target, dict):
            out["top_keys"] = list(target)[: config["max_items"]]
    return out


def _text_result(response: Any, config: dict[str, Any]) -> dict[str, Any]:
    """Build the success envelope for a non-JSON body.

    The text preview is omitted only when the full body was saved to disk.
    """
    warnings: list[str] = []
    raw_output_path = None
    if config["save_raw"]:
        raw_output_path = _try_save_raw(response.text, config, "txt", warnings)
    text_head = response.text[:TEXT_HEAD_LENGTH]
    return {
        "ok": True,
        "source": _service_name(config["base_url"]),
        "path": config["path"],
        "method": config["method"],
        "status_code": response.status_code,
        "content_type": (response.headers.get("content-type") or "").lower(),
        "text_head": None if raw_output_path else text_head,
        "text_head_truncated": False
        if raw_output_path
        else len(text_head) < len(response.text),
        "raw_output_path": raw_output_path,
        "warnings": warnings,
    }


def execute(payload: Any) -> dict[str, Any]:
    """Run one REST request and return the result envelope.

    Raises:
        ValueError: if ``payload`` fails validation (mapped to
            ``invalid_input`` by ``main``).
    """
    if requests is None:
        return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}")
    config = parse_input(payload)
    url = _build_url(config["base_url"], config["path"])

    request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]}
    if config["json_body"] is not None:
        request_kwargs["json"] = config["json_body"]
    if config["form_body"] is not None:
        request_kwargs["data"] = config["form_body"]

    # Transport and decoding failures are handled separately: several requests
    # errors (invalid URL, invalid header) are also ValueError subclasses and
    # must not be reported as a malformed response.
    try:
        with requests.Session() as session:
            session.headers.update(config["headers"])
            response = session.request(config["method"], url, **request_kwargs)
            response.raise_for_status()
    except requests.RequestException as exc:
        return error("network_error", f"Request failed: {exc}")

    try:
        if _should_parse_json(response, config["response_format"]):
            return _json_result(response, config)
        return _text_result(response, config)
    except ValueError as exc:
        return error("invalid_response", str(exc))


def main() -> int:
    """CLI entry point; exit code 0 = ok, 1 = request failure, 2 = bad input."""
    try:
        payload = json.load(sys.stdin)
    except Exception as exc:  # noqa: BLE001
        sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}")))
        return 2
    try:
        output = execute(payload)
    except ValueError as exc:
        output = error("invalid_input", str(exc))
        code = 2
    else:
        code = 0 if output.get("ok") else 1
    sys.stdout.write(json.dumps(output))
    return code


if __name__ == "__main__":
    raise SystemExit(main())
diff --git a/plugins/life-science-research/skills/hmdb-skill/SKILL.md b/plugins/life-science-research/skills/hmdb-skill/SKILL.md new file mode 100644 index 0000000..bee98f0 --- /dev/null +++ b/plugins/life-science-research/skills/hmdb-skill/SKILL.md @@ -0,0 +1,38 @@ +--- +name: hmdb-skill +description: Submit compact HMDB search requests for metabolites, proteins, diseases, and pathways. Use when a user wants concise HMDB summaries +--- + +## Operating rules +- Use `scripts/rest_request.py` for all HMDB calls. +- Use `base_url=https://hmdb.ca`. +- Search endpoints are better with `per_page=10` and `max_items=10`. +- Keep category-specific requests narrow instead of broad searches across multiple categories at once. +- Re-run requests in long conversations instead of relying on older tool output. +- Treat displayed `...` in tool previews as UI truncation, not literal request content. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Prefer `unearth/q` with explicit `query`, `category`, and `format=json`. +- If the user needs the full payload, set `save_raw=true` and report the saved file path. + +## Input +- Read one JSON object from stdin. +- Required fields: `base_url`, `path` +- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common HMDB patterns: + - `{"base_url":"https://hmdb.ca","path":"unearth/q","params":{"query":"serotonin","category":"metabolites","format":"json","per_page":10},"record_path":"metabolites","max_items":10}` + - `{"base_url":"https://hmdb.ca","path":"unearth/q","params":{"query":"glycolysis","category":"pathways","format":"json","per_page":10},"max_items":10}` + +## Output +- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records` or a compact `summary`. +- Use `raw_output_path` when `save_raw=true`. 
+- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"base_url":"https://hmdb.ca","path":"unearth/q","params":{"query":"serotonin","category":"metabolites","format":"json","per_page":10},"record_path":"metabolites","max_items":10}' | python scripts/rest_request.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. diff --git a/plugins/life-science-research/skills/hmdb-skill/agents/openai.yaml b/plugins/life-science-research/skills/hmdb-skill/agents/openai.yaml new file mode 100644 index 0000000..a5cef90 --- /dev/null +++ b/plugins/life-science-research/skills/hmdb-skill/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "HMDB" + short_description: "Fetch HMDB metabolite summaries" diff --git a/plugins/life-science-research/skills/hmdb-skill/scripts/rest_request.py b/plugins/life-science-research/skills/hmdb-skill/scripts/rest_request.py new file mode 100644 index 0000000..4de3cf1 --- /dev/null +++ b/plugins/life-science-research/skills/hmdb-skill/scripts/rest_request.py @@ -0,0 +1,291 @@ +#!/usr/bin/env python3 +"""Generic compact REST client for ChatGPT-imported skills.""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path +from typing import Any + +try: + import requests +except ImportError as exc: # pragma: no cover + requests = None + REQUESTS_IMPORT_ERROR = exc +else: + REQUESTS_IMPORT_ERROR = None + + +def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: + return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} + + +def _require_object(name: str, value: Any) -> dict[str, Any]: + if value is None: + return {} + if not isinstance(value, dict): + raise ValueError(f"`{name}` must be an object.") + return value + + +def _require_bool(name: str, value: Any, default: bool) -> bool: + if value is 
None: + return default + if not isinstance(value, bool): + raise ValueError(f"`{name}` must be a boolean.") + return value + + +def _require_int(name: str, value: Any, default: int) -> int: + if value is None: + return default + if not isinstance(value, int) or value <= 0: + raise ValueError(f"`{name}` must be a positive integer.") + return value + + +def _require_str(name: str, value: Any, required: bool = False) -> str | None: + if value is None: + if required: + raise ValueError(f"`{name}` is required.") + return None + if not isinstance(value, str) or not value.strip(): + raise ValueError(f"`{name}` must be a non-empty string.") + return value.strip() + + +def _service_name(base_url: str) -> str: + host = base_url.split("://", 1)[-1].split("/", 1)[0] + return host.replace(".", "-") + + +def _build_url(base_url: str, path: str) -> str: + if path.startswith(("http://", "https://")): + return path + return base_url.rstrip("/") + "/" + path.lstrip("/") + + +def _get_by_path(value: Any, path: str) -> Any: + current = value + for part in path.split("."): + if isinstance(current, list): + if not part.isdigit(): + raise ValueError(f"`record_path` segment {part!r} must be a list index.") + index = int(part) + if index >= len(current): + raise ValueError(f"`record_path` index {index} is out of range.") + current = current[index] + elif isinstance(current, dict): + if part not in current: + raise ValueError(f"`record_path` key {part!r} was not present in the response.") + current = current[part] + else: + raise ValueError(f"`record_path` segment {part!r} could not be applied.") + return current + + +def _infer_target(data: Any) -> tuple[str | None, Any]: + if isinstance(data, list): + return "$", data + if isinstance(data, dict): + embedded = data.get("_embedded") + if isinstance(embedded, dict): + for key, value in embedded.items(): + if isinstance(value, list): + return f"_embedded.{key}", value + for key in ( + "collection", + "results", + "structures", + "activities", 
+ "molecules", + "mechanisms", + "records", + "items", + ): + value = data.get(key) + if isinstance(value, list): + return key, value + return None, data + + +def _compact(value: Any, max_items: int, max_depth: int) -> Any: + if isinstance(value, str): + return value if len(value) <= 240 else value[:240] + "..." + if max_depth <= 0: + if isinstance(value, (dict, list)): + return "..." + return value + if isinstance(value, list): + out = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]] + if len(value) > max_items: + out.append(f"... (+{len(value) - max_items} more)") + return out + if isinstance(value, dict): + out: dict[str, Any] = {} + items = list(value.items()) + for key, item in items[:max_items]: + out[str(key)] = _compact(item, max_items, max_depth - 1) + if len(items) > max_items: + out["_truncated_keys"] = len(items) - max_items + return out + return value + + +def _save_raw_output( + raw_output: str, raw_output_path: str | None, base_url: str, suffix: str +) -> str: + path = Path(raw_output_path or f"/tmp/{_service_name(base_url)}-raw.{suffix}") + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(raw_output, encoding="utf-8") + return str(path) + + +def parse_input(payload: Any) -> dict[str, Any]: + if not isinstance(payload, dict): + raise ValueError("Input must be one JSON object.") + base_url = _require_str("base_url", payload.get("base_url"), required=True) + path = _require_str("path", payload.get("path"), required=True) + method = (_require_str("method", payload.get("method")) or "GET").upper() + if method not in {"GET", "POST"}: + raise ValueError("`method` must be GET or POST.") + json_body = payload.get("json_body") + form_body = payload.get("form_body") + if json_body is not None and form_body is not None: + raise ValueError("Provide only one of `json_body` or `form_body`.") + response_format = ( + _require_str("response_format", payload.get("response_format")) or "auto" + ).lower() + if response_format not 
in {"auto", "json", "text"}: + raise ValueError("`response_format` must be auto, json, or text.") + return { + "base_url": base_url, + "path": path, + "method": method, + "params": _require_object("params", payload.get("params")), + "headers": _require_object("headers", payload.get("headers")), + "json_body": json_body, + "form_body": _require_object("form_body", form_body) if form_body is not None else None, + "record_path": _require_str("record_path", payload.get("record_path")), + "response_format": response_format, + "max_items": _require_int("max_items", payload.get("max_items"), 5), + "max_depth": _require_int("max_depth", payload.get("max_depth"), 3), + "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30), + "save_raw": _require_bool("save_raw", payload.get("save_raw"), False), + "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")), + } + + +def execute(payload: Any) -> dict[str, Any]: + if requests is None: + return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}") + config = parse_input(payload) + session = requests.Session() + session.headers.update(config["headers"]) + url = _build_url(config["base_url"], config["path"]) + + request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]} + if config["json_body"] is not None: + request_kwargs["json"] = config["json_body"] + if config["form_body"] is not None: + request_kwargs["data"] = config["form_body"] + + try: + response = session.request(config["method"], url, **request_kwargs) + response.raise_for_status() + content_type = (response.headers.get("content-type") or "").lower() + wants_json = config["response_format"] == "json" + wants_text = config["response_format"] == "text" + auto_json = not wants_text and ( + "json" in content_type or response.text.lstrip().startswith(("{", "[")) + ) + + if wants_json or auto_json: + data = response.json() + raw_output = json.dumps(data, indent=2) + 
raw_output_path = None + if config["save_raw"]: + raw_output_path = _save_raw_output( + raw_output, config["raw_output_path"], config["base_url"], "json" + ) + + record_path = config["record_path"] + path_used, target = ( + _infer_target(data) + if record_path is None + else (record_path, _get_by_path(data, record_path)) + ) + out = { + "ok": True, + "source": _service_name(config["base_url"]), + "path": config["path"], + "method": config["method"], + "status_code": response.status_code, + "record_path": path_used, + "raw_output_path": raw_output_path, + "warnings": [], + } + if isinstance(target, list): + records = target[: config["max_items"]] + out.update( + { + "record_count_returned": len(records), + "record_count_available": len(target), + "truncated": len(records) < len(target), + "records": _compact(records, config["max_items"], config["max_depth"]), + } + ) + else: + out["summary"] = _compact(target, config["max_items"], config["max_depth"]) + if isinstance(target, dict): + out["top_keys"] = list(target)[: config["max_items"]] + return out + + raw_output_path = None + if config["save_raw"]: + raw_output_path = _save_raw_output( + response.text, config["raw_output_path"], config["base_url"], "txt" + ) + text_head = response.text[:800] + return { + "ok": True, + "source": _service_name(config["base_url"]), + "path": config["path"], + "method": config["method"], + "status_code": response.status_code, + "content_type": content_type, + "text_head": None if raw_output_path else text_head, + "text_head_truncated": False + if raw_output_path + else len(text_head) < len(response.text), + "raw_output_path": raw_output_path, + "warnings": [], + } + except ValueError as exc: + return error("invalid_response", str(exc)) + except requests.RequestException as exc: + return error("network_error", f"Request failed: {exc}") + finally: + session.close() + + +def main() -> int: + try: + payload = json.load(sys.stdin) + except Exception as exc: # noqa: BLE001 + 
sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}"))) + return 2 + try: + output = execute(payload) + except ValueError as exc: + output = error("invalid_input", str(exc)) + code = 2 + else: + code = 0 if output.get("ok") else 1 + sys.stdout.write(json.dumps(output)) + return code + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/plugins/life-science-research/skills/human-protein-atlas-skill/SKILL.md b/plugins/life-science-research/skills/human-protein-atlas-skill/SKILL.md new file mode 100644 index 0000000..f001023 --- /dev/null +++ b/plugins/life-science-research/skills/human-protein-atlas-skill/SKILL.md @@ -0,0 +1,40 @@ +--- +name: human-protein-atlas-skill +description: Submit compact Human Protein Atlas requests for gene JSON, search downloads, and page-level tissue or cell-line lookups. Use when a user wants concise Human Protein Atlas summaries; save raw JSON or HTML only on request. +--- + +## Operating rules +- Use `scripts/rest_request.py` for all Human Protein Atlas calls. +- Use `base_url=https://www.proteinatlas.org`. +- The script accepts `max_items`; single gene entry lookups usually do not need it, while search and download endpoints are better with `max_items=10`. +- Re-run requests in long conversations instead of relying on older tool output. +- Treat displayed `...` in tool previews as UI truncation, not literal request content. +- If the user asks for full HTML or JSON, set `save_raw=true` and report the saved file path instead of pasting large payloads into chat. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Return the script JSON verbatim only if the user explicitly asks for machine-readable output. +- Prefer these paths: `.json`, `api/search_download.php`, `search/tissue/`, and `search/cellline/`. +- For page-level search endpoints, prefer `response_format=text` so the script returns only `text_head` unless raw output is requested. 
+ +## Input +- Read one JSON object from stdin. +- Required fields: `base_url`, `path` +- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common HPA patterns: + - `{"base_url":"https://www.proteinatlas.org","path":"ENSG00000141510.json"}` + - `{"base_url":"https://www.proteinatlas.org","path":"api/search_download.php","params":{"search":"TP53","format":"json","columns":"g,gs,tissue","compress":"no"},"max_items":10}` + - `{"base_url":"https://www.proteinatlas.org","path":"search/tissue/TP53","response_format":"text"}` + +## Output +- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records`, a compact `summary`, or `text_head`. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"base_url":"https://www.proteinatlas.org","path":"ENSG00000141510.json"}' | python scripts/rest_request.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. 
diff --git a/plugins/life-science-research/skills/human-protein-atlas-skill/agents/openai.yaml b/plugins/life-science-research/skills/human-protein-atlas-skill/agents/openai.yaml new file mode 100644 index 0000000..e3b4da8 --- /dev/null +++ b/plugins/life-science-research/skills/human-protein-atlas-skill/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "Human Protein Atlas" + short_description: "Fetch Human Protein Atlas summaries" diff --git a/plugins/life-science-research/skills/human-protein-atlas-skill/scripts/rest_request.py b/plugins/life-science-research/skills/human-protein-atlas-skill/scripts/rest_request.py new file mode 100644 index 0000000..4de3cf1 --- /dev/null +++ b/plugins/life-science-research/skills/human-protein-atlas-skill/scripts/rest_request.py @@ -0,0 +1,291 @@ +#!/usr/bin/env python3 +"""Generic compact REST client for ChatGPT-imported skills.""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path +from typing import Any + +try: + import requests +except ImportError as exc: # pragma: no cover + requests = None + REQUESTS_IMPORT_ERROR = exc +else: + REQUESTS_IMPORT_ERROR = None + + +def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: + return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} + + +def _require_object(name: str, value: Any) -> dict[str, Any]: + if value is None: + return {} + if not isinstance(value, dict): + raise ValueError(f"`{name}` must be an object.") + return value + + +def _require_bool(name: str, value: Any, default: bool) -> bool: + if value is None: + return default + if not isinstance(value, bool): + raise ValueError(f"`{name}` must be a boolean.") + return value + + +def _require_int(name: str, value: Any, default: int) -> int: + if value is None: + return default + if not isinstance(value, int) or value <= 0: + raise ValueError(f"`{name}` must be a positive integer.") + return value + + 
+def _require_str(name: str, value: Any, required: bool = False) -> str | None: + if value is None: + if required: + raise ValueError(f"`{name}` is required.") + return None + if not isinstance(value, str) or not value.strip(): + raise ValueError(f"`{name}` must be a non-empty string.") + return value.strip() + + +def _service_name(base_url: str) -> str: + host = base_url.split("://", 1)[-1].split("/", 1)[0] + return host.replace(".", "-") + + +def _build_url(base_url: str, path: str) -> str: + if path.startswith(("http://", "https://")): + return path + return base_url.rstrip("/") + "/" + path.lstrip("/") + + +def _get_by_path(value: Any, path: str) -> Any: + current = value + for part in path.split("."): + if isinstance(current, list): + if not part.isdigit(): + raise ValueError(f"`record_path` segment {part!r} must be a list index.") + index = int(part) + if index >= len(current): + raise ValueError(f"`record_path` index {index} is out of range.") + current = current[index] + elif isinstance(current, dict): + if part not in current: + raise ValueError(f"`record_path` key {part!r} was not present in the response.") + current = current[part] + else: + raise ValueError(f"`record_path` segment {part!r} could not be applied.") + return current + + +def _infer_target(data: Any) -> tuple[str | None, Any]: + if isinstance(data, list): + return "$", data + if isinstance(data, dict): + embedded = data.get("_embedded") + if isinstance(embedded, dict): + for key, value in embedded.items(): + if isinstance(value, list): + return f"_embedded.{key}", value + for key in ( + "collection", + "results", + "structures", + "activities", + "molecules", + "mechanisms", + "records", + "items", + ): + value = data.get(key) + if isinstance(value, list): + return key, value + return None, data + + +def _compact(value: Any, max_items: int, max_depth: int) -> Any: + if isinstance(value, str): + return value if len(value) <= 240 else value[:240] + "..." 
+ if max_depth <= 0: + if isinstance(value, (dict, list)): + return "..." + return value + if isinstance(value, list): + out = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]] + if len(value) > max_items: + out.append(f"... (+{len(value) - max_items} more)") + return out + if isinstance(value, dict): + out: dict[str, Any] = {} + items = list(value.items()) + for key, item in items[:max_items]: + out[str(key)] = _compact(item, max_items, max_depth - 1) + if len(items) > max_items: + out["_truncated_keys"] = len(items) - max_items + return out + return value + + +def _save_raw_output( + raw_output: str, raw_output_path: str | None, base_url: str, suffix: str +) -> str: + path = Path(raw_output_path or f"/tmp/{_service_name(base_url)}-raw.{suffix}") + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(raw_output, encoding="utf-8") + return str(path) + + +def parse_input(payload: Any) -> dict[str, Any]: + if not isinstance(payload, dict): + raise ValueError("Input must be one JSON object.") + base_url = _require_str("base_url", payload.get("base_url"), required=True) + path = _require_str("path", payload.get("path"), required=True) + method = (_require_str("method", payload.get("method")) or "GET").upper() + if method not in {"GET", "POST"}: + raise ValueError("`method` must be GET or POST.") + json_body = payload.get("json_body") + form_body = payload.get("form_body") + if json_body is not None and form_body is not None: + raise ValueError("Provide only one of `json_body` or `form_body`.") + response_format = ( + _require_str("response_format", payload.get("response_format")) or "auto" + ).lower() + if response_format not in {"auto", "json", "text"}: + raise ValueError("`response_format` must be auto, json, or text.") + return { + "base_url": base_url, + "path": path, + "method": method, + "params": _require_object("params", payload.get("params")), + "headers": _require_object("headers", payload.get("headers")), + "json_body": 
json_body, + "form_body": _require_object("form_body", form_body) if form_body is not None else None, + "record_path": _require_str("record_path", payload.get("record_path")), + "response_format": response_format, + "max_items": _require_int("max_items", payload.get("max_items"), 5), + "max_depth": _require_int("max_depth", payload.get("max_depth"), 3), + "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30), + "save_raw": _require_bool("save_raw", payload.get("save_raw"), False), + "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")), + } + + +def execute(payload: Any) -> dict[str, Any]: + if requests is None: + return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}") + config = parse_input(payload) + session = requests.Session() + session.headers.update(config["headers"]) + url = _build_url(config["base_url"], config["path"]) + + request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]} + if config["json_body"] is not None: + request_kwargs["json"] = config["json_body"] + if config["form_body"] is not None: + request_kwargs["data"] = config["form_body"] + + try: + response = session.request(config["method"], url, **request_kwargs) + response.raise_for_status() + content_type = (response.headers.get("content-type") or "").lower() + wants_json = config["response_format"] == "json" + wants_text = config["response_format"] == "text" + auto_json = not wants_text and ( + "json" in content_type or response.text.lstrip().startswith(("{", "[")) + ) + + if wants_json or auto_json: + data = response.json() + raw_output = json.dumps(data, indent=2) + raw_output_path = None + if config["save_raw"]: + raw_output_path = _save_raw_output( + raw_output, config["raw_output_path"], config["base_url"], "json" + ) + + record_path = config["record_path"] + path_used, target = ( + _infer_target(data) + if record_path is None + else (record_path, _get_by_path(data, 
record_path)) + ) + out = { + "ok": True, + "source": _service_name(config["base_url"]), + "path": config["path"], + "method": config["method"], + "status_code": response.status_code, + "record_path": path_used, + "raw_output_path": raw_output_path, + "warnings": [], + } + if isinstance(target, list): + records = target[: config["max_items"]] + out.update( + { + "record_count_returned": len(records), + "record_count_available": len(target), + "truncated": len(records) < len(target), + "records": _compact(records, config["max_items"], config["max_depth"]), + } + ) + else: + out["summary"] = _compact(target, config["max_items"], config["max_depth"]) + if isinstance(target, dict): + out["top_keys"] = list(target)[: config["max_items"]] + return out + + raw_output_path = None + if config["save_raw"]: + raw_output_path = _save_raw_output( + response.text, config["raw_output_path"], config["base_url"], "txt" + ) + text_head = response.text[:800] + return { + "ok": True, + "source": _service_name(config["base_url"]), + "path": config["path"], + "method": config["method"], + "status_code": response.status_code, + "content_type": content_type, + "text_head": None if raw_output_path else text_head, + "text_head_truncated": False + if raw_output_path + else len(text_head) < len(response.text), + "raw_output_path": raw_output_path, + "warnings": [], + } + except ValueError as exc: + return error("invalid_response", str(exc)) + except requests.RequestException as exc: + return error("network_error", f"Request failed: {exc}") + finally: + session.close() + + +def main() -> int: + try: + payload = json.load(sys.stdin) + except Exception as exc: # noqa: BLE001 + sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}"))) + return 2 + try: + output = execute(payload) + except ValueError as exc: + output = error("invalid_input", str(exc)) + code = 2 + else: + code = 0 if output.get("ok") else 1 + sys.stdout.write(json.dumps(output)) + return code + + 
+if __name__ == "__main__": + raise SystemExit(main()) diff --git a/plugins/life-science-research/skills/ipd-skill/SKILL.md b/plugins/life-science-research/skills/ipd-skill/SKILL.md new file mode 100644 index 0000000..e7b2e0e --- /dev/null +++ b/plugins/life-science-research/skills/ipd-skill/SKILL.md @@ -0,0 +1,38 @@ +--- +name: ipd-skill +description: Submit compact IPD REST requests for HLA allele and cell-level metadata using the public IPD query API. Use when a user wants concise IPD summaries; save raw JSON or text only on request. +--- + +## Operating rules +- Use `scripts/rest_request.py` for all IPD calls. +- Use `base_url=https://www.ebi.ac.uk/cgi-bin/ipd/api`. +- The most stable public routes are `allele` and `cell`. +- For HLA allele browsing, pass `project=HLA` and keep `limit` modest. +- Re-run requests in long conversations instead of relying on older tool output. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Return raw JSON or text only if the user explicitly asks for machine-readable output. +- Prefer these paths: `allele`, `cell`, and `allele/download`. + +## Input +- Read one JSON object from stdin. 
+- Required fields: `base_url`, `path` +- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common IPD patterns: + - `{"base_url":"https://www.ebi.ac.uk/cgi-bin/ipd/api","path":"allele","params":{"project":"HLA","limit":10},"record_path":"data","max_items":10}` + - `{"base_url":"https://www.ebi.ac.uk/cgi-bin/ipd/api","path":"allele","params":{"project":"HLA","query":"contains(name,\"A*01\")","limit":10},"record_path":"data","max_items":10}` + - `{"base_url":"https://www.ebi.ac.uk/cgi-bin/ipd/api","path":"cell","params":{"limit":10},"record_path":"data","max_items":10}` + +## Output +- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records`, a compact `summary`, or `text_head`. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"base_url":"https://www.ebi.ac.uk/cgi-bin/ipd/api","path":"allele","params":{"project":"HLA","limit":10},"record_path":"data","max_items":10}' | python scripts/rest_request.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. 
diff --git a/plugins/life-science-research/skills/ipd-skill/agents/openai.yaml b/plugins/life-science-research/skills/ipd-skill/agents/openai.yaml new file mode 100644 index 0000000..72894e1 --- /dev/null +++ b/plugins/life-science-research/skills/ipd-skill/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "IPD" + short_description: "Fetch IPD allele summaries" diff --git a/plugins/life-science-research/skills/ipd-skill/scripts/rest_request.py b/plugins/life-science-research/skills/ipd-skill/scripts/rest_request.py new file mode 100644 index 0000000..4de3cf1 --- /dev/null +++ b/plugins/life-science-research/skills/ipd-skill/scripts/rest_request.py @@ -0,0 +1,291 @@ +#!/usr/bin/env python3 +"""Generic compact REST client for ChatGPT-imported skills.""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path +from typing import Any + +try: + import requests +except ImportError as exc: # pragma: no cover + requests = None + REQUESTS_IMPORT_ERROR = exc +else: + REQUESTS_IMPORT_ERROR = None + + +def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: + return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} + + +def _require_object(name: str, value: Any) -> dict[str, Any]: + if value is None: + return {} + if not isinstance(value, dict): + raise ValueError(f"`{name}` must be an object.") + return value + + +def _require_bool(name: str, value: Any, default: bool) -> bool: + if value is None: + return default + if not isinstance(value, bool): + raise ValueError(f"`{name}` must be a boolean.") + return value + + +def _require_int(name: str, value: Any, default: int) -> int: + if value is None: + return default + if not isinstance(value, int) or value <= 0: + raise ValueError(f"`{name}` must be a positive integer.") + return value + + +def _require_str(name: str, value: Any, required: bool = False) -> str | None: + if value is None: + if required: + raise 
ValueError(f"`{name}` is required.") + return None + if not isinstance(value, str) or not value.strip(): + raise ValueError(f"`{name}` must be a non-empty string.") + return value.strip() + + +def _service_name(base_url: str) -> str: + host = base_url.split("://", 1)[-1].split("/", 1)[0] + return host.replace(".", "-") + + +def _build_url(base_url: str, path: str) -> str: + if path.startswith(("http://", "https://")): + return path + return base_url.rstrip("/") + "/" + path.lstrip("/") + + +def _get_by_path(value: Any, path: str) -> Any: + current = value + for part in path.split("."): + if isinstance(current, list): + if not part.isdigit(): + raise ValueError(f"`record_path` segment {part!r} must be a list index.") + index = int(part) + if index >= len(current): + raise ValueError(f"`record_path` index {index} is out of range.") + current = current[index] + elif isinstance(current, dict): + if part not in current: + raise ValueError(f"`record_path` key {part!r} was not present in the response.") + current = current[part] + else: + raise ValueError(f"`record_path` segment {part!r} could not be applied.") + return current + + +def _infer_target(data: Any) -> tuple[str | None, Any]: + if isinstance(data, list): + return "$", data + if isinstance(data, dict): + embedded = data.get("_embedded") + if isinstance(embedded, dict): + for key, value in embedded.items(): + if isinstance(value, list): + return f"_embedded.{key}", value + for key in ( + "collection", + "results", + "structures", + "activities", + "molecules", + "mechanisms", + "records", + "items", + ): + value = data.get(key) + if isinstance(value, list): + return key, value + return None, data + + +def _compact(value: Any, max_items: int, max_depth: int) -> Any: + if isinstance(value, str): + return value if len(value) <= 240 else value[:240] + "..." + if max_depth <= 0: + if isinstance(value, (dict, list)): + return "..." 
+ return value + if isinstance(value, list): + out = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]] + if len(value) > max_items: + out.append(f"... (+{len(value) - max_items} more)") + return out + if isinstance(value, dict): + out: dict[str, Any] = {} + items = list(value.items()) + for key, item in items[:max_items]: + out[str(key)] = _compact(item, max_items, max_depth - 1) + if len(items) > max_items: + out["_truncated_keys"] = len(items) - max_items + return out + return value + + +def _save_raw_output( + raw_output: str, raw_output_path: str | None, base_url: str, suffix: str +) -> str: + path = Path(raw_output_path or f"/tmp/{_service_name(base_url)}-raw.{suffix}") + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(raw_output, encoding="utf-8") + return str(path) + + +def parse_input(payload: Any) -> dict[str, Any]: + if not isinstance(payload, dict): + raise ValueError("Input must be one JSON object.") + base_url = _require_str("base_url", payload.get("base_url"), required=True) + path = _require_str("path", payload.get("path"), required=True) + method = (_require_str("method", payload.get("method")) or "GET").upper() + if method not in {"GET", "POST"}: + raise ValueError("`method` must be GET or POST.") + json_body = payload.get("json_body") + form_body = payload.get("form_body") + if json_body is not None and form_body is not None: + raise ValueError("Provide only one of `json_body` or `form_body`.") + response_format = ( + _require_str("response_format", payload.get("response_format")) or "auto" + ).lower() + if response_format not in {"auto", "json", "text"}: + raise ValueError("`response_format` must be auto, json, or text.") + return { + "base_url": base_url, + "path": path, + "method": method, + "params": _require_object("params", payload.get("params")), + "headers": _require_object("headers", payload.get("headers")), + "json_body": json_body, + "form_body": _require_object("form_body", form_body) if 
form_body is not None else None, + "record_path": _require_str("record_path", payload.get("record_path")), + "response_format": response_format, + "max_items": _require_int("max_items", payload.get("max_items"), 5), + "max_depth": _require_int("max_depth", payload.get("max_depth"), 3), + "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30), + "save_raw": _require_bool("save_raw", payload.get("save_raw"), False), + "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")), + } + + +def execute(payload: Any) -> dict[str, Any]: + if requests is None: + return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}") + config = parse_input(payload) + session = requests.Session() + session.headers.update(config["headers"]) + url = _build_url(config["base_url"], config["path"]) + + request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]} + if config["json_body"] is not None: + request_kwargs["json"] = config["json_body"] + if config["form_body"] is not None: + request_kwargs["data"] = config["form_body"] + + try: + response = session.request(config["method"], url, **request_kwargs) + response.raise_for_status() + content_type = (response.headers.get("content-type") or "").lower() + wants_json = config["response_format"] == "json" + wants_text = config["response_format"] == "text" + auto_json = not wants_text and ( + "json" in content_type or response.text.lstrip().startswith(("{", "[")) + ) + + if wants_json or auto_json: + data = response.json() + raw_output = json.dumps(data, indent=2) + raw_output_path = None + if config["save_raw"]: + raw_output_path = _save_raw_output( + raw_output, config["raw_output_path"], config["base_url"], "json" + ) + + record_path = config["record_path"] + path_used, target = ( + _infer_target(data) + if record_path is None + else (record_path, _get_by_path(data, record_path)) + ) + out = { + "ok": True, + "source": 
_service_name(config["base_url"]), + "path": config["path"], + "method": config["method"], + "status_code": response.status_code, + "record_path": path_used, + "raw_output_path": raw_output_path, + "warnings": [], + } + if isinstance(target, list): + records = target[: config["max_items"]] + out.update( + { + "record_count_returned": len(records), + "record_count_available": len(target), + "truncated": len(records) < len(target), + "records": _compact(records, config["max_items"], config["max_depth"]), + } + ) + else: + out["summary"] = _compact(target, config["max_items"], config["max_depth"]) + if isinstance(target, dict): + out["top_keys"] = list(target)[: config["max_items"]] + return out + + raw_output_path = None + if config["save_raw"]: + raw_output_path = _save_raw_output( + response.text, config["raw_output_path"], config["base_url"], "txt" + ) + text_head = response.text[:800] + return { + "ok": True, + "source": _service_name(config["base_url"]), + "path": config["path"], + "method": config["method"], + "status_code": response.status_code, + "content_type": content_type, + "text_head": None if raw_output_path else text_head, + "text_head_truncated": False + if raw_output_path + else len(text_head) < len(response.text), + "raw_output_path": raw_output_path, + "warnings": [], + } + except ValueError as exc: + return error("invalid_response", str(exc)) + except requests.RequestException as exc: + return error("network_error", f"Request failed: {exc}") + finally: + session.close() + + +def main() -> int: + try: + payload = json.load(sys.stdin) + except Exception as exc: # noqa: BLE001 + sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}"))) + return 2 + try: + output = execute(payload) + except ValueError as exc: + output = error("invalid_input", str(exc)) + code = 2 + else: + code = 0 if output.get("ok") else 1 + sys.stdout.write(json.dumps(output)) + return code + + +if __name__ == "__main__": + raise SystemExit(main()) 
diff --git a/plugins/life-science-research/skills/locus-to-gene-mapper-skill/SKILL.md b/plugins/life-science-research/skills/locus-to-gene-mapper-skill/SKILL.md new file mode 100644 index 0000000..70d94fb --- /dev/null +++ b/plugins/life-science-research/skills/locus-to-gene-mapper-skill/SKILL.md @@ -0,0 +1,353 @@ +--- +name: locus-to-gene-mapper-skill +description: Map GWAS loci to ranked candidate genes using a deterministic multi-skill chain (EFO -> GWAS -> coordinates -> Open Targets L2G/coloc -> eQTL -> burden/coding context), with reproducible tables and optional figures. Use when a user provides a trait/EFO term and/or lead variants and needs locus-to-gene prioritization for downstream biology decisions. +--- + +## Locus-to-Gene Mapper + +Generate a reproducible locus-to-gene mapping for one trait (or a seed set of lead variants), with explicit evidence attribution and conservative confidence labels. + +This skill is optimized for bioinformaticians who need executable, traceable mapping from variant signals to plausible causal genes. 
+ +## Required Inputs + +Provide at least one anchor source: + +- `trait_query` (string), for example `chronic obstructive pulmonary disease` +- `efo_id` (string), for example `EFO_0000341` +- `seed_rsids` (list[string]), for example `["rs1873625", "rs7903146"]` + +## Optional Inputs + +- `target_gene` (string), optional gene of interest for highlighting in output +- `show_child_traits` (bool), default `true` +- `phenotype_terms` (list[string]), optional additional terms to include when finding anchors +- `max_anchor_associations` (int), default `1200` +- `max_loci` (int), default `25` +- `max_genes_per_locus` (int), default `10` +- `max_coloc_rows_per_locus` (int), default `100` +- `max_eqtl_rows_per_variant` (int), default `200` +- `genebass_burden_sets` (list[string]), default `["pLoF", "missense|LC"]` +- `include_clinvar` (bool), default `true` +- `include_gnomad_context` (bool), default `true` +- `include_hpa_tissue_context` (bool), default `true` +- `include_figures` (bool), default `false` +- `disable_default_seeds` (bool), default `false`; if `false`, common traits automatically get built-in seed rsIDs +- `figure_output_dir` (string), default `./output/figures` +- `mapping_output_path` (string), default `./output/locus_to_gene_mapping.json` +- `summary_output_path` (string), default `./output/locus_to_gene_summary.md` + +## Runtime Requirements + +- Python `3.11+` +- `requests` +- Optional for figure generation: `matplotlib`, `seaborn`, `pandas` + +## Bundled Script (Deterministic Runner) + +- Primary entrypoint: `scripts/map_locus_to_gene.py` +- This script: + - resolves trait/EFO and anchor variants, + - resolves seed and anchor rsID coordinates directly through NCBI RefSNP/dbSNP placements, + - gathers locus-to-gene evidence through the chained skills, + - writes mapping JSON and summary markdown, + - optionally renders figures when plotting deps are available. 
+ +Run: + +```bash +python locus-to-gene-mapper-skill/scripts/map_locus_to_gene.py \ + --input-json /path/to/input.json \ + --print-result +``` + +Quick start (no input JSON file): + +```bash +python locus-to-gene-mapper-skill/scripts/map_locus_to_gene.py \ + --trait-query "type 2 diabetes" \ + --print-result +``` + +Trait-only runs default to `include_figures=true` unless explicitly disabled with `--no-include-figures`. + +Minimal input JSON: + +```json +{ + "trait_query": "type 2 diabetes" +} +``` + +Built-in default seeds (when `disable_default_seeds=false`): + +- `type 2 diabetes` / `t2d` -> `rs7903146`, `rs13266634`, `rs7756992`, `rs5219`, `rs1801282`, `rs4402960` +- `coronary artery disease` / `cad` -> `rs1333049`, `rs4977574`, `rs9349379`, `rs6725887`, `rs1746048`, `rs3184504` +- `body mass index` / `bmi` -> `rs9939609`, `rs17782313`, `rs6548238`, `rs10938397`, `rs7498665`, `rs7138803` +- `asthma` -> `rs7216389`, `rs2305480`, `rs9273349` +- `rheumatoid arthritis` -> `rs2476601`, `rs3761847`, `rs660895` +- `alzheimer disease` -> `rs429358`, `rs7412`, `rs6733839`, `rs11136000`, `rs3851179` +- `ldl cholesterol` / `total cholesterol` -> `rs7412`, `rs429358`, `rs6511720`, `rs629301`, `rs12740374`, `rs11591147` + +## Autonomous Execution Contract (Embedded Behavior) + +When a user asks for locus-to-gene mapping and gives only a trait (for example, `type 2 diabetes`), do the following automatically: + +1. Run the bundled script with `--trait-query "<trait_query>" --print-result`, substituting the user's trait text (no manual JSON required). +2. If it returns `No anchors remained`, rerun once with a built-in default seed rsID for that trait (unless `disable_default_seeds=true`). +3. Read the generated `mapping_output_path` and `summary_output_path`. +4. Return this concise response structure: + - `Top 5 cross-locus prioritized genes` + - `Per-locus top gene (score, confidence)` + - `Visualization artifact` (figure path(s) or Mermaid fallback block) + - `Warnings and limitations` +5. 
For inline image rendering in chat: + - read `inline_image_markdown` from script result + - emit those lines exactly as plain markdown (no code fences) + - if inline rendering still fails, instruct user to upload PNG files into the chat + +Do not ask the user to run python manually unless execution is actually blocked. + +## Skill Chaining Order (Mandatory) + +Use these skills in order. Skip only when an earlier step is not needed by provided inputs. + +1. `efo-ontology-skill` + - Resolve `trait_query` to canonical EFO term and synonyms. + - Expand descendants when `show_child_traits=true`. +2. `gwas-catalog-skill` + - Discover anchor variants for the trait/EFO scope. + - Pull association/study metadata for locus context. +3. Built-in NCBI RefSNP coordinate resolution + - Normalize each anchor rsID to GRCh37/GRCh38 top-level chromosome placements. +4. `opentargets-skill` + - Retrieve credible set context, L2G predictions, and colocalisation evidence per locus. +5. `gtex-eqtl-skill` + - Retrieve single-tissue eQTL support for anchor variants. +6. `genebass-gene-burden-skill` + - Retrieve rare-variant burden support for candidate genes. +7. `clinvar-variation-skill` (when `include_clinvar=true`) + - Add variant clinical/coding annotations. +8. `gnomad-graphql-skill` (when `include_gnomad_context=true`) + - Add frequency and gene-level constraint context. +9. `human-protein-atlas-skill` (when `include_hpa_tissue_context=true`) + - Add tissue plausibility context for top genes. + +Never perform additional retrieval after final candidate-gene scoring starts. + +## Output Contract (Required) + +Always return: + +1. `locus_to_gene_mapping.json` +2. 
`locus_to_gene_summary.md` + +### JSON contract + +```json +{ + "meta": { + "trait_query": "...", + "efo_id": "EFO_...", + "generated_at": "ISO-8601", + "sources_queried": [] + }, + "anchors": [ + { + "rsid": "rs...", + "grch38": {"chr": "3", "pos": 49629531, "ref": "A", "alt": "C"}, + "lead_trait": "...", + "p_value": 2e-11, + "cohort": "..." + } + ], + "loci": [ + { + "locus_id": "chr3:49000000-50200000", + "lead_rsid": "rs...", + "candidate_genes": [ + { + "symbol": "MST1", + "ensembl_id": "ENSG...", + "overall_score": 0.71, + "confidence": "High|Medium|Low|VeryLow", + "evidence": { + "l2g_max": 0.83, + "coloc_max_h4": 0.84, + "eqtl_tissues": ["Lung"], + "rare_variant_support": "none|nominal|strong", + "coding_support": "none|noncoding|coding", + "clinvar_support": "none|present", + "gnomad_context": "...", + "hpa_tissue_support": ["lung"] + }, + "rationale": [ + "..." + ], + "limitations": [ + "..." + ] + } + ] + } + ], + "cross_locus_ranked_genes": [ + { + "symbol": "...", + "supporting_loci": 3, + "mean_score": 0.62, + "max_score": 0.81 + } + ], + "warnings": [], + "limitations": [] +} +``` + +### Markdown summary contract + +The summary must include sections in this exact order: + +1. `Objective` +2. `Inputs and scope` +3. `Anchor variant summary` +4. `Per-locus top genes` +5. `Cross-locus prioritized genes` +6. `Key caveats` +7. `Recommended next analyses` + +## Optional Figure Contract + +Only produce figures when `include_figures=true`. + +If figures are generated, append this block to JSON: + +```json +{ + "figures": [ + { + "id": "locus_gene_heatmap", + "path": "./output/figures/locus_gene_heatmap.png", + "caption": "Top candidate genes by evidence component across loci" + } + ] +} +``` + +Recommended figure set: + +1. `locus_gene_heatmap.png` + - Rows: top genes, columns: evidence components (`L2G`, `coloc`, `eQTL`, `burden`, `coding`). +2. `locus_score_decomposition.png` + - Stacked bars per locus for top 3 genes. +3. 
`tissue_support_dotplot.png` + - Gene-by-tissue evidence dots from GTEx/HPA context. + +If plotting dependencies are unavailable, skip PNG generation and output Mermaid diagrams in markdown as fallback. +The script also returns `inline_image_markdown` and `render_instructions` fields to support inline chat rendering. + +## Scoring Rules (Deterministic) + +For each candidate gene per locus, compute: + +- `l2g_component`: max L2G score for the gene in locus (`0..1`) +- `coloc_component`: max `h4` (or `clpp` when only CLPP is available), clipped to `0..1` +- `eqtl_component`: `min(1, relevant_tissue_hits / 3)` +- `burden_component`: + - `1.0` if burden `p < 2.5e-6` + - `0.6` if `2.5e-6 <= p < 0.05` + - `0.0` otherwise +- `coding_component`: + - `1.0` for coding consequence in target gene with supportive ClinVar annotation + - `0.6` for coding consequence in target gene without supportive ClinVar annotation + - `0.3` for noncoding-in-gene support only + - `0.0` otherwise + +Overall score: + +`overall_score = 0.40*l2g + 0.25*coloc + 0.15*eqtl + 0.10*burden + 0.10*coding` + +Confidence label: + +- `High` if score `>= 0.75` +- `Medium` if `0.55 <= score < 0.75` +- `Low` if `0.35 <= score < 0.55` +- `VeryLow` if score `< 0.35` + +## Pipeline Contract + +### Phase 0: Validate and normalize input + +- Enforce that at least one of `trait_query`, `efo_id`, `seed_rsids` is present. +- Normalize rsID formatting and deduplicate seed variants. +- Resolve free-text trait to one canonical EFO term when needed. + +### Phase 1: Build anchor set + +- If trait/EFO input is provided, pull associations and rank anchors by p-value and effect availability. +- Merge trait-derived anchors with user-supplied `seed_rsids`. +- Cap anchors using `max_loci` and log dropped anchors in `warnings`. + +### Phase 2: Gather locus-to-gene evidence + +- Normalize anchor coordinates (both builds when possible). +- Pull Open Targets locus evidence (credible set/L2G/coloc). 
+- Pull GTEx variant-level eQTL rows. +- Pull gene-level burden results for mapped candidate genes. +- Pull ClinVar and gnomAD context when enabled. + +### Phase 3: Harmonize and score + +- Build a per-locus candidate-gene table. +- Compute deterministic component scores and overall score. +- Create cross-locus aggregate rankings. + +### Phase 4: Synthesize outputs + +- Write JSON mapping file. +- Write markdown summary in exact section order. +- Optionally generate figures and append `figures` metadata. + +### Phase 5: QC gates + +Fail the run when any of the following occurs: + +- No anchors after normalization. +- Unresolved GRCh38 coordinates are treated as an analytically clean pass instead of being surfaced as `status=degraded`. +- Any locus has candidate genes without score fields. +- `overall_score` outside `0..1`. +- Summary section order mismatch. +- Claim of causality without explicit evidence support in rationale text. + +## Public Interface + +```python +def map_locus_to_gene(input_json: dict) -> dict: + ... +``` + +Return: + +```json +{ + "status": "ok", + "mapping_output_path": "./output/locus_to_gene_mapping.json", + "summary_output_path": "./output/locus_to_gene_summary.md", + "figure_paths": [], + "warnings": [], + "limitations": [] +} +``` + +## Non-Invention Rules + +- Never invent rsIDs, p-values, scores, cohort labels, tissues, or gene links. +- Never silently impute missing evidence as positive support. +- When evidence is missing, record it as a limitation and reduce confidence. +- Keep evidence provenance explicit (`source skill` + endpoint family) in rationale lines. + +## Non-Goals + +- Do not claim definitive causal genes from association evidence alone. +- Do not run fine-mapping methods not directly provided by upstream sources. +- Do not collapse multiple independent signals into one without stating assumptions. 
diff --git a/plugins/life-science-research/skills/locus-to-gene-mapper-skill/agents/openai.yaml b/plugins/life-science-research/skills/locus-to-gene-mapper-skill/agents/openai.yaml new file mode 100644 index 0000000..f6f710c --- /dev/null +++ b/plugins/life-science-research/skills/locus-to-gene-mapper-skill/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "Locus-to-Gene Mapper" + short_description: "Map GWAS loci to ranked candidate genes" diff --git a/plugins/life-science-research/skills/locus-to-gene-mapper-skill/scripts/map_locus_to_gene.py b/plugins/life-science-research/skills/locus-to-gene-mapper-skill/scripts/map_locus_to_gene.py new file mode 100644 index 0000000..49c2c27 --- /dev/null +++ b/plugins/life-science-research/skills/locus-to-gene-mapper-skill/scripts/map_locus_to_gene.py @@ -0,0 +1,2209 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import argparse +import datetime as dt +import json +import math +import re +import subprocess +import sys +import time +from pathlib import Path +from typing import Any + +import requests + +GWAS_BASE = "https://www.ebi.ac.uk/gwas/rest/api" +EFO_BASE = "https://www.ebi.ac.uk/ols4/api" +OT_BASE = "https://api.platform.opentargets.org/api/v4/graphql" +GNOMAD_BASE = "https://gnomad.broadinstitute.org/api" +REFSNP_BASE = "https://api.ncbi.nlm.nih.gov/variation/v0/refsnp" + +DEFAULT_LOCUS_PADDING_BP = 1_000_000 +REFSEQ_CHROMOSOMES = {f"NC_{i:06d}": str(i) for i in range(1, 23)} +REFSEQ_CHROMOSOMES.update({"NC_000023": "X", "NC_000024": "Y", "NC_012920": "MT"}) + +REPO_ROOT = Path(__file__).resolve().parents[2] +GTEX_EQTL_SCRIPT = REPO_ROOT / "gtex-eqtl-skill" / "scripts" / "gtex_eqtl.py" +GENEBASS_GENE_BURDEN_SCRIPT = ( + REPO_ROOT / "genebass-gene-burden-skill" / "scripts" / "genebass_gene_burden.py" +) + +TOKEN_STOPWORDS = { + "disease", + "disorder", + "trait", + "syndrome", + "chronic", + "acute", + "self", + "reported", + "unknown", +} + +DEFAULT_TRAIT_SEED_RSIDS: dict[str, 
list[str]] = { + "type 2 diabetes": ["rs7903146", "rs13266634", "rs7756992", "rs5219", "rs1801282", "rs4402960"], + "type ii diabetes": [ + "rs7903146", + "rs13266634", + "rs7756992", + "rs5219", + "rs1801282", + "rs4402960", + ], + "t2d": ["rs7903146", "rs13266634", "rs7756992", "rs5219", "rs1801282", "rs4402960"], + "coronary artery disease": [ + "rs1333049", + "rs4977574", + "rs9349379", + "rs6725887", + "rs1746048", + "rs3184504", + ], + "cad": ["rs1333049", "rs4977574", "rs9349379", "rs6725887", "rs1746048", "rs3184504"], + "body mass index": [ + "rs9939609", + "rs17782313", + "rs6548238", + "rs10938397", + "rs7498665", + "rs7138803", + ], + "bmi": ["rs9939609", "rs17782313", "rs6548238", "rs10938397", "rs7498665", "rs7138803"], + "asthma": ["rs7216389", "rs2305480", "rs9273349"], + "rheumatoid arthritis": ["rs2476601", "rs3761847", "rs660895"], + "alzheimer disease": ["rs429358", "rs7412", "rs6733839", "rs11136000", "rs3851179"], + "alzheimers disease": ["rs429358", "rs7412", "rs6733839", "rs11136000", "rs3851179"], + "ldl cholesterol": ["rs7412", "rs429358", "rs6511720", "rs629301", "rs12740374", "rs11591147"], + "total cholesterol": [ + "rs7412", + "rs429358", + "rs6511720", + "rs629301", + "rs12740374", + "rs11591147", + ], +} + +SEARCH_STUDY_QUERY = """ +query searchStudy($q: String!, $page: Pagination) { + search(queryString: $q, entityNames: ["study"], page: $page) { + total + hits { + score + object { + ... on Study { + id + projectId + traitFromSource + hasSumstats + } + } + } + } +} +""" + +STUDY_CREDIBLE_SETS_QUERY = """ +query studyCredibleSets($studyId: String!, $page: Pagination) { + study(studyId: $studyId) { + id + projectId + traitFromSource + credibleSets(page: $page) { + count + rows { + studyLocusId + chromosome + position + pValueExponent + pValueMantissa + variant { id rsIds } + } + } + } +} +""" + +CREDIBLE_SETS_DETAIL_BATCH_QUERY = """ +query l2gAndColoc($studyLocusIds: [String!]!) 
{ + credibleSets(studyLocusIds: $studyLocusIds) { + rows { + studyLocusId + l2GPredictions { + rows { score target { id approvedSymbol } } + } + colocalisation(page: {index: 0, size: 100}) { + rows { + colocalisationMethod + h4 + clpp + otherStudyLocus { studyId studyLocusId } + } + } + } + } +} +""" + +SEARCH_TARGET_QUERY = """ +query searchTarget($q: String!) { + search(queryString: $q, entityNames: ["target"], page: {index: 0, size: 10}) { + hits { + score + object { + ... on Target { + id + approvedSymbol + approvedName + } + } + } + } +} +""" + +GNOMAD_GENE_QUERY = """ +query GeneConstraint($geneSymbol: String!, $referenceGenome: ReferenceGenomeId!) { + gene(gene_symbol: $geneSymbol, reference_genome: $referenceGenome) { + symbol + gencode_symbol + gnomad_constraint { + exp_lof + obs_lof + oe_lof + oe_lof_lower + oe_lof_upper + lof_z + mis_z + pLI + } + } +} +""" + +CODING_SEQUENCE_TERMS = { + "missense_variant", + "stop_gained", + "stop_lost", + "frameshift_variant", + "protein_altering_variant", + "inframe_insertion", + "inframe_deletion", + "splice_donor_variant", + "splice_acceptor_variant", +} + + +def now_iso() -> str: + return dt.datetime.now(dt.timezone.utc).isoformat() + + +def ensure_parent(path: Path) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + + +def dedupe_keep_order(items: list[str]) -> list[str]: + seen: set[str] = set() + out: list[str] = [] + for item in items: + s = str(item).strip() + if not s: + continue + if s in seen: + continue + seen.add(s) + out.append(s) + return out + + +def safe_float(value: Any) -> float | None: + if value is None: + return None + if isinstance(value, (int, float)): + return float(value) + s = str(value).strip() + if not s: + return None + s = s.replace(",", "") + match = re.search(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?", s) + if not match: + return None + try: + return float(match.group(0)) + except ValueError: + return None + + +def coerce_dict(value: Any) -> dict[str, Any]: + return value if 
isinstance(value, dict) else {} + + +def coerce_list_of_dicts(value: Any) -> list[dict[str, Any]]: + if not isinstance(value, list): + return [] + return [item for item in value if isinstance(item, dict)] + + +def as_string_list(value: Any) -> list[str]: + if value is None: + return [] + if isinstance(value, list): + return [str(v).strip() for v in value if str(v).strip()] + s = str(value).strip() + return [s] if s else [] + + +def normalize_rsid(value: str) -> str | None: + m = re.search(r"(rs\d+)", value.strip(), flags=re.IGNORECASE) + if not m: + return None + return m.group(1).lower().replace("rs", "rs") + + +def normalize_trait_key(value: str) -> str: + return re.sub(r"[^a-z0-9]+", " ", value.lower()).strip() + + +def tokenize(value: str) -> set[str]: + tokens = {tok for tok in re.findall(r"[A-Za-z0-9]+", value.lower()) if len(tok) > 2} + return {tok for tok in tokens if tok not in TOKEN_STOPWORDS} + + +def lexical_match_score(text: str, term: str) -> float: + text_n = re.sub(r"[^a-z0-9]+", " ", text.lower()).strip() + term_n = re.sub(r"[^a-z0-9]+", " ", term.lower()).strip() + if not text_n or not term_n: + return 0.0 + if term_n in text_n: + return 1.0 + + text_tokens = tokenize(text_n) + term_tokens = tokenize(term_n) + if not text_tokens or not term_tokens: + return 0.0 + + overlap = len(text_tokens.intersection(term_tokens)) + if overlap == 0: + return 0.0 + + coverage = overlap / max(len(term_tokens), 1) + precision = overlap / max(len(text_tokens), 1) + score = 0.6 * coverage + 0.4 * precision + if overlap >= 2: + score += 0.1 + return min(score, 1.0) + + +def safe_get_json( + url: str, params: dict[str, Any] | None = None, timeout: int = 45 +) -> dict[str, Any]: + response = requests.get(url, params=params, timeout=timeout) + response.raise_for_status() + payload = response.json() + if isinstance(payload, dict): + return payload + return {"results": payload} + + +def safe_post_json(url: str, payload: dict[str, Any], timeout: int = 60) -> dict[str, 
Any]: + response = requests.post(url, json=payload, timeout=timeout) + response.raise_for_status() + data = response.json() + if isinstance(data, dict): + return data + return {"results": data} + + +def run_json_skill_script( + script_path: Path, + payload: dict[str, Any], + limitations: list[str], + timeout_s: int = 45, +) -> dict[str, Any] | None: + if not script_path.exists(): + limitations.append(f"Missing skill script: {script_path}") + return None + try: + proc = subprocess.run( + [sys.executable, str(script_path)], + input=json.dumps(payload), + text=True, + capture_output=True, + timeout=timeout_s, + check=False, + ) + except Exception as exc: + limitations.append(f"Failed to execute {script_path.name}: {exc}") + return None + + if proc.returncode != 0: + stderr = (proc.stderr or "").strip() + stdout = (proc.stdout or "").strip() + details = stderr or stdout or f"exit_code={proc.returncode}" + limitations.append(f"{script_path.name} failed: {details}") + return None + + out = (proc.stdout or "").strip() + if not out: + limitations.append(f"{script_path.name} returned empty output") + return None + + try: + parsed = json.loads(out) + except Exception as exc: + limitations.append(f"{script_path.name} returned non-JSON output: {exc}") + return None + + if not isinstance(parsed, dict): + limitations.append(f"{script_path.name} returned unexpected JSON shape") + return None + + return parsed + + +def resolve_efo(trait_query: str, warnings: list[str], limitations: list[str]) -> dict[str, Any]: + if not trait_query: + return { + "anchor_label": "", + "efo_id": None, + "anchor_iri": None, + "synonyms": [], + "descendants": [], + "resolver_source": "efo-ontology-skill", + } + + params = { + "q": trait_query, + "ontology": "efo", + "type": "class", + "queryFields": "label,synonym,short_form,obo_id", + "rows": 25, + "exact": "false", + "local": "true", + } + try: + search_data = safe_get_json(f"{EFO_BASE}/search", params=params) + docs = (search_data.get("response") 
or {}).get("docs") or [] + if not docs: + warnings.append("No EFO hit found for trait_query; continuing with free-text only.") + return { + "anchor_label": trait_query, + "efo_id": None, + "anchor_iri": None, + "synonyms": [], + "descendants": [], + "resolver_source": "efo-ontology-skill", + } + + top = coerce_dict(docs[0]) + iri = top.get("iri") + label = str(top.get("label") or trait_query) + efo_id = top.get("obo_id") + synonyms = as_string_list(top.get("synonym")) + + descendants: list[str] = [] + if iri: + encoded = requests.utils.quote(requests.utils.quote(str(iri), safe=""), safe="") + page = 0 + total_pages = 1 + while page < total_pages and page < 6: + desc_data = safe_get_json( + f"{EFO_BASE}/ontologies/efo/terms/{encoded}/descendants", + params={"size": 200, "page": page}, + ) + rows = (desc_data.get("_embedded") or {}).get("terms") or [] + descendants.extend( + [str(row.get("label")).strip() for row in rows if coerce_dict(row).get("label")] + ) + page_info = coerce_dict(desc_data.get("page")) + total_pages = int(page_info.get("totalPages", 0) or 0) + page += 1 + + return { + "anchor_label": label, + "efo_id": efo_id, + "anchor_iri": iri, + "synonyms": dedupe_keep_order(synonyms), + "descendants": dedupe_keep_order(descendants), + "resolver_source": "efo-ontology-skill", + } + except Exception as exc: + limitations.append(f"EFO resolver unavailable: {exc}") + return { + "anchor_label": trait_query, + "efo_id": None, + "anchor_iri": None, + "synonyms": [], + "descendants": [], + "resolver_source": "efo-ontology-skill", + } + + +def gwas_iter_associations( + params: dict[str, Any], + max_rows: int, + page_size: int = 200, + max_pages: int = 25, +) -> list[dict[str, Any]]: + rows: list[dict[str, Any]] = [] + page = 0 + total_pages = 1 + while page < total_pages and page < max_pages and len(rows) < max_rows: + q = dict(params) + q.update({"size": page_size, "page": page}) + data = safe_get_json(f"{GWAS_BASE}/v2/associations", params=q, timeout=45) + chunk = 
def parse_rsid_from_association(row: dict[str, Any]) -> str | None:
    """Return the first rsID that ``normalize_rsid`` accepts for an association row.

    Candidate fields are tried in a fixed order and the first truthy
    normalization result wins:

    1. ``rs_id`` of each dict inside the ``snp_allele`` list,
    2. the first element of the ``snp_effect_allele`` list,
    3. the row-level ``rs_id``,
    4. the ``_links.snp.href`` string.

    Returns ``None`` when no candidate normalizes to a truthy value.
    """
    allele_entries = row.get("snp_allele")
    if isinstance(allele_entries, list):
        for entry in allele_entries:
            if not isinstance(entry, dict) or not entry.get("rs_id"):
                continue
            found = normalize_rsid(str(entry["rs_id"]))
            if found:
                return found

    effect_alleles = row.get("snp_effect_allele")
    if isinstance(effect_alleles, list) and effect_alleles:
        # Only the first effect-allele token is inspected.
        found = normalize_rsid(str(effect_alleles[0]))
        if found:
            return found

    if row.get("rs_id"):
        found = normalize_rsid(str(row["rs_id"]))
        if found:
            return found

    href = coerce_dict(coerce_dict(row.get("_links")).get("snp")).get("href")
    if isinstance(href, str):
        found = normalize_rsid(href)
        if found:
            return found
    return None


def extract_trait_name(row: dict[str, Any]) -> str:
    """Pick a display trait name for an association row.

    The first truthy ``efo_trait`` among the dict entries of ``efo_traits``
    is preferred.  Otherwise ``reported_trait`` is used: the string itself,
    or the first element when it is a non-empty list.  Returns ``""`` when
    neither field yields a name.
    """
    traits = row.get("efo_traits")
    if isinstance(traits, list):
        named = next(
            (
                entry["efo_trait"]
                for entry in traits
                if isinstance(entry, dict) and entry.get("efo_trait")
            ),
            None,
        )
        if named is not None:
            return str(named)

    reported = row.get("reported_trait")
    if isinstance(reported, str):
        return reported
    if isinstance(reported, list) and reported:
        return str(reported[0])
    return ""


def extract_mapped_genes(row: dict[str, Any]) -> list[str]:
    """Flatten ``mapped_genes`` into gene tokens, de-duplicated via ``dedupe_keep_order``.

    Each string entry is split on commas; tokens are stripped and empty
    tokens dropped.  Non-string entries and a non-list ``mapped_genes`` value
    contribute nothing.
    """
    entries = row.get("mapped_genes")
    symbols: list[str] = []
    for entry in entries if isinstance(entries, list) else []:
        if isinstance(entry, str):
            symbols.extend(token.strip() for token in entry.split(",") if token.strip())
    return dedupe_keep_order(symbols)
def fetch_gwas_study_metadata(
    accession_ids: list[str], limitations: list[str]
) -> dict[str, dict[str, Any]]:
    """Fetch per-study metadata from ``{GWAS_BASE}/v2/studies/<accession>``.

    Accessions are de-duplicated and visited in sorted order; falsy ones are
    skipped.  For every successful lookup the result maps the accession to
    ``cohort`` and ``discovery_ancestry`` (comma-joined strings) and the raw
    ``initial_sample_size``.  A failed lookup appends a message to
    ``limitations`` and leaves that accession out of the result.
    """
    metadata: dict[str, dict[str, Any]] = {}
    for study_acc in sorted(set(accession_ids)):
        if study_acc:
            try:
                study = safe_get_json(f"{GWAS_BASE}/v2/studies/{study_acc}", timeout=45)
                cohort_text = ", ".join(as_string_list(study.get("cohort")))
                ancestry_text = ", ".join(as_string_list(study.get("discovery_ancestry")))
                metadata[study_acc] = {
                    "cohort": cohort_text,
                    "discovery_ancestry": ancestry_text,
                    "initial_sample_size": study.get("initial_sample_size"),
                }
                # Short pause between successful requests.
                time.sleep(0.03)
            except Exception as exc:
                limitations.append(f"GWAS study metadata unavailable for {study_acc}: {exc}")
    return metadata


def chromosome_from_refseq(seq_id: str) -> str | None:
    """Look up ``seq_id`` in ``REFSEQ_CHROMOSOMES`` after dropping its ``.version`` suffix."""
    unversioned, _, _ = seq_id.partition(".")
    return REFSEQ_CHROMOSOMES.get(unversioned)


def assembly_key_from_traits(traits: list[dict[str, Any]]) -> str | None:
    """Return ``"grch38"`` or ``"grch37"`` for the first trait naming such an assembly.

    Traits are scanned in order and matched on the prefix of
    ``assembly_name``; ``None`` is returned when no trait matches.
    """
    prefix_to_key = (("GRCh38", "grch38"), ("GRCh37", "grch37"))
    for trait in traits:
        name = str(trait.get("assembly_name") or "")
        for prefix, key in prefix_to_key:
            if name.startswith(prefix):
                return key
    return None
def fetch_refsnp_payload(rsid: str, limitations: list[str]) -> dict[str, Any] | None:
    """Fetch the RefSNP JSON for ``rsid`` from ``{REFSNP_BASE}/<digits>``.

    Only the digit characters of ``rsid`` are used to build the URL.  Returns
    ``None`` when ``rsid`` contains no digits, or when the request raises; in
    the latter case a message is appended to ``limitations``.
    """
    numeric_id = "".join(filter(str.isdigit, rsid))
    if numeric_id == "":
        return None
    try:
        return safe_get_json(f"{REFSNP_BASE}/{numeric_id}", timeout=35)
    except Exception as exc:
        limitations.append(f"RefSNP lookup failed for {rsid}: {exc}")
    return None
def resolve_anchor_coordinates(
    anchors: list[dict[str, Any]], warnings: list[str], limitations: list[str]
) -> None:
    """Annotate each anchor in place with coordinates and a ``locus_id``.

    Anchors without an ``rsid`` are left untouched.  For the others,
    ``grch38`` / ``grch37`` are set to the resolved coordinate dict (or
    ``None`` when empty).  ``locus_id`` becomes
    ``chr<CHR>:<start>-<end>``, a window of ``DEFAULT_LOCUS_PADDING_BP`` on
    each side of the GRCh38 position with the start floored at 1.  When the
    GRCh38 chromosome or position is missing or unusable, it falls back to
    ``rsid:<rsid>``.
    """
    for anchor in anchors:
        rsid = str(anchor.get("rsid") or "")
        if rsid == "":
            continue

        by_assembly = resolve_refsnp_coordinates(rsid, warnings, limitations)
        build38 = coerce_dict(by_assembly.get("grch38"))
        build37 = coerce_dict(by_assembly.get("grch37"))
        anchor["grch38"] = build38 or None
        anchor["grch37"] = build37 or None

        locus_id = f"rsid:{rsid}"
        chrom, position = build38.get("chr"), build38.get("pos")
        if chrom is not None and position is not None:
            try:
                center = int(position)
                lower = max(1, center - DEFAULT_LOCUS_PADDING_BP)
                upper = center + DEFAULT_LOCUS_PADDING_BP
                locus_id = f"chr{str(chrom).upper()}:{lower}-{upper}"
            except Exception:
                # Unparseable position: keep the rsID-based identifier.
                locus_id = f"rsid:{rsid}"
        anchor["locus_id"] = locus_id


def ot_query(query: str, variables: dict[str, Any], limitations: list[str]) -> dict[str, Any]:
    """POST a GraphQL query to ``OT_BASE`` and return its ``data`` object.

    Returns ``{}`` and appends a message to ``limitations`` when the request
    raises or when the response carries a truthy ``errors`` field.
    """
    body = {"query": query, "variables": variables}
    try:
        response = safe_post_json(OT_BASE, body, timeout=120)
    except Exception as exc:
        limitations.append(f"Open Targets request failed: {exc}")
        return {}

    graphql_errors = response.get("errors")
    if graphql_errors:
        limitations.append(f"Open Targets GraphQL error: {graphql_errors}")
        return {}
    return coerce_dict(response.get("data"))
list[dict[str, Any]]: + by_id: dict[str, dict[str, Any]] = {} + for term in terms: + if not term: + continue + data = ot_query( + SEARCH_STUDY_QUERY, {"q": term, "page": {"index": 0, "size": 25}}, limitations + ) + hits = coerce_list_of_dicts(coerce_dict(data.get("search")).get("hits")) + for hit in hits: + obj = coerce_dict(hit.get("object")) + study_id = obj.get("id") + if not study_id: + continue + study = by_id.get(study_id) + score = safe_float(hit.get("score")) or 0.0 + if study is None: + by_id[study_id] = { + "id": study_id, + "projectId": obj.get("projectId"), + "traitFromSource": obj.get("traitFromSource"), + "hasSumstats": bool(obj.get("hasSumstats")), + "best_score": score, + "matched_terms": [term], + } + else: + study["best_score"] = max(float(study.get("best_score") or 0.0), score) + if term not in study["matched_terms"]: + study["matched_terms"].append(term) + + studies = sorted( + by_id.values(), key=lambda row: float(row.get("best_score") or 0.0), reverse=True + ) + if not studies: + return [] + + with_sumstats = [s for s in studies if s.get("hasSumstats")] + chosen = with_sumstats[:max_studies] if with_sumstats else studies[:max_studies] + return chosen + + +def fetch_ot_l2g_coloc_for_anchors( + anchor_rsids: list[str], + trait_terms: list[str], + max_coloc_rows_per_locus: int, + limitations: list[str], + warnings: list[str], +) -> dict[str, Any]: + result: dict[str, Any] = { + "per_anchor": {rsid: {"l2g": [], "coloc": []} for rsid in anchor_rsids}, + "studies_used": [], + "matched_study_loci": 0, + } + if not anchor_rsids or not trait_terms: + return result + + studies = search_ot_studies(trait_terms, max_studies=8, limitations=limitations) + if not studies: + warnings.append( + "No Open Targets studies found for trait terms; L2G/coloc components may be sparse." 
+ ) + return result + + anchor_set = set(anchor_rsids) + study_locus_to_anchors: dict[str, set[str]] = {} + + for study in studies: + study_id = str(study.get("id") or "") + if not study_id: + continue + data = ot_query( + STUDY_CREDIBLE_SETS_QUERY, + {"studyId": study_id, "page": {"index": 0, "size": 800}}, + limitations, + ) + study_payload = coerce_dict(data.get("study")) + cs_rows = coerce_list_of_dicts(coerce_dict(study_payload.get("credibleSets")).get("rows")) + for row in cs_rows: + study_locus_id = row.get("studyLocusId") + if not study_locus_id: + continue + variant = coerce_dict(row.get("variant")) + rsids = [normalize_rsid(str(r)) for r in as_string_list(variant.get("rsIds"))] + matched = {r for r in rsids if r and r in anchor_set} + if not matched: + continue + study_locus_to_anchors.setdefault(str(study_locus_id), set()).update(matched) + + result["studies_used"].append( + { + "id": study_id, + "projectId": study.get("projectId"), + "traitFromSource": study.get("traitFromSource"), + "matched_terms": study.get("matched_terms", []), + "credible_set_count": len(cs_rows), + } + ) + time.sleep(0.06) + + study_locus_ids = sorted(study_locus_to_anchors.keys()) + result["matched_study_loci"] = len(study_locus_ids) + if not study_locus_ids: + warnings.append("No Open Targets credible sets were matched to anchor rsIDs.") + return result + + chunk_size = 40 + for i in range(0, len(study_locus_ids), chunk_size): + chunk = study_locus_ids[i : i + chunk_size] + data = ot_query(CREDIBLE_SETS_DETAIL_BATCH_QUERY, {"studyLocusIds": chunk}, limitations) + cs_rows = coerce_list_of_dicts(coerce_dict(data.get("credibleSets")).get("rows")) + + for row in cs_rows: + study_locus_id = str(row.get("studyLocusId") or "") + if not study_locus_id: + continue + matched_anchors = study_locus_to_anchors.get(study_locus_id, set()) + if not matched_anchors: + continue + + l2g_rows = coerce_list_of_dicts(coerce_dict(row.get("l2GPredictions")).get("rows")) + coloc_rows = 
def extract_eqtl_gene_symbol(row: dict[str, Any]) -> str | None:
    """Return the first non-blank gene symbol found in an eQTL row, stripped.

    Row-level keys are checked first (``geneSymbol``, ``gene_symbol``,
    ``geneName``, ``gene_name``, ``symbol``), then — when ``gene`` is a dict —
    its ``symbol``, ``geneSymbol`` and ``approvedSymbol``.  Non-string values
    are ignored.  Returns ``None`` when nothing usable is present.
    """
    values = [
        row.get(key)
        for key in ("geneSymbol", "gene_symbol", "geneName", "gene_name", "symbol")
    ]
    nested = row.get("gene")
    if isinstance(nested, dict):
        values += [nested.get(key) for key in ("symbol", "geneSymbol", "approvedSymbol")]

    return next(
        (value.strip() for value in values if isinstance(value, str) and value.strip()),
        None,
    )


def extract_eqtl_tissue(row: dict[str, Any]) -> str:
    """Return the first non-blank tissue label in an eQTL row, or ``"."``.

    Keys are tried in the order ``tissueSiteDetailId``, ``tissue``,
    ``tissue_id``, ``tissueSiteDetail``; non-string values are ignored.
    """
    for key in ("tissueSiteDetailId", "tissue", "tissue_id", "tissueSiteDetail"):
        value = row.get(key)
        if isinstance(value, str) and value.strip():
            return value.strip()
    return "."
def fetch_gtex_support(
    anchors: list[dict[str, Any]],
    max_results: int,
    limitations: list[str],
    warnings: list[str],
) -> dict[str, dict[str, set[str]]]:
    """Collect eQTL gene -> tissue support per anchor via ``GTEX_EQTL_SCRIPT``.

    Anchors lacking a complete GRCh38 ``chr``/``pos``/``ref``/``alt`` are
    skipped.  For the rest the script is run with a
    ``"<chr>:<pos>-<ref>-<alt>"`` variant string.  A falsy result is skipped
    silently; a result whose ``ok`` is falsy adds a message to ``warnings``.
    The return value maps rsID -> gene symbol -> set of tissue labels.  An
    rsID whose result has a list under ``eqtls`` gets an entry even if no
    row yields a gene symbol.
    """
    support: dict[str, dict[str, set[str]]] = {}

    for anchor in anchors:
        rsid = str(anchor.get("rsid") or "")
        build38 = coerce_dict(anchor.get("grch38"))
        chrom = build38.get("chr")
        position = build38.get("pos")
        ref_allele = build38.get("ref")
        alt_allele = build38.get("alt")
        has_variant = (
            chrom is not None and position is not None and ref_allele and alt_allele
        )
        if not has_variant:
            continue

        request = {
            "grch38": f"{chrom}:{position}-{ref_allele}-{alt_allele}",
            "max_results": max_results,
        }
        response = run_json_skill_script(GTEX_EQTL_SCRIPT, request, limitations, timeout_s=40)
        if not response:
            continue
        if not response.get("ok"):
            err = coerce_dict(response.get("error")).get("message")
            warnings.append(f"GTEx lookup failed for {rsid}: {err}")
            continue

        eqtl_rows = response.get("eqtls")
        if not isinstance(eqtl_rows, list):
            continue

        tissues_by_gene = support.setdefault(rsid, {})
        for eqtl in eqtl_rows:
            if isinstance(eqtl, dict):
                gene_symbol = extract_eqtl_gene_symbol(eqtl)
                if gene_symbol:
                    tissues_by_gene.setdefault(gene_symbol, set()).add(
                        extract_eqtl_tissue(eqtl)
                    )

    return support
def fetch_genebass_support(
    symbol_to_ensembl: dict[str, str],
    burden_sets: list[str],
    trait_terms: list[str],
    max_results: int,
    limitations: list[str],
) -> dict[str, dict[str, Any]]:
    """Summarize gene-burden support per gene via ``GENEBASS_GENE_BURDEN_SCRIPT``.

    The script is run once per gene and burden set.  Rows without a parseable
    ``skat_o_pvalue`` are ignored.  A row counts as supporting when there are
    no non-empty trait terms, or when its ``phenotype_description`` scores at
    least 0.58 against any term with ``lexical_match_score``.  The smallest
    supporting p-value and its phenotype are kept.

    Every gene gets an entry with ``best_p``, ``best_phenotype``,
    ``supporting_rows`` and ``support``: ``"strong"`` below 2.5e-6,
    ``"nominal"`` below 0.05, otherwise ``"none"``.
    """
    usable_terms = [term for term in trait_terms if term]
    summary: dict[str, dict[str, Any]] = {}

    for symbol, ensembl_id in symbol_to_ensembl.items():
        min_p: float | None = None
        min_p_phenotype: str | None = None
        n_supporting = 0

        for burden_set in burden_sets:
            request = {
                "ensembl_gene_id": ensembl_id,
                "burden_set": burden_set,
                "max_results": max_results,
            }
            response = run_json_skill_script(
                GENEBASS_GENE_BURDEN_SCRIPT, request, limitations, timeout_s=45
            )
            if not (response and response.get("ok")):
                continue
            association_rows = response.get("associations")
            if not isinstance(association_rows, list):
                continue

            for assoc in association_rows:
                if not isinstance(assoc, dict):
                    continue
                phenotype = str(assoc.get("phenotype_description") or "")
                p_value = safe_float(assoc.get("skat_o_pvalue"))
                if p_value is None:
                    continue

                match_score = 0.0
                if usable_terms:
                    match_score = max(
                        (lexical_match_score(phenotype, term) for term in usable_terms),
                        default=0.0,
                    )
                if not usable_terms or match_score >= 0.58:
                    n_supporting += 1
                    if min_p is None or p_value < min_p:
                        min_p = p_value
                        min_p_phenotype = phenotype

        support = "none"
        if min_p is not None:
            if min_p < 2.5e-6:
                support = "strong"
            elif min_p < 0.05:
                support = "nominal"

        summary[symbol] = {
            "best_p": min_p,
            "best_phenotype": min_p_phenotype,
            "support": support,
            "supporting_rows": n_supporting,
        }

    return summary
dict[str, dict[str, Any]] = {} + + for rsid in rsids: + payload = fetch_refsnp_payload(rsid, limitations) + if not payload: + continue + + snapshot = coerce_dict(payload.get("primary_snapshot_data")) + genes = { + str(item.get("locus") or item.get("name")).strip() + for item in coerce_list_of_dicts(snapshot.get("genes")) + if item.get("locus") or item.get("name") + } + coding_genes: set[str] = set() + consequence_terms: set[str] = set() + + for allele_ann in coerce_list_of_dicts(snapshot.get("allele_annotations")): + for asm_ann in coerce_list_of_dicts(allele_ann.get("assembly_annotation")): + for gene in coerce_list_of_dicts(asm_ann.get("genes")): + gene_symbol = str(gene.get("locus") or gene.get("name") or "").strip() + if gene_symbol: + genes.add(gene_symbol) + is_coding = False + for so in coerce_list_of_dicts(gene.get("sequence_ontology")): + term = str(so.get("name") or "").strip() + if term: + consequence_terms.add(term) + for rna in coerce_list_of_dicts(gene.get("rnas")): + for so in coerce_list_of_dicts(rna.get("sequence_ontology")): + term = str(so.get("name") or "").strip() + if term: + consequence_terms.add(term) + protein = rna.get("protein") + protein_items = [protein] if isinstance(protein, dict) else protein + if not isinstance(protein_items, list): + protein_items = [] + for protein_item in protein_items: + if not isinstance(protein_item, dict): + continue + for so in coerce_list_of_dicts(protein_item.get("sequence_ontology")): + term = str(so.get("name") or "").strip() + if not term: + continue + consequence_terms.add(term) + if term in CODING_SEQUENCE_TERMS: + is_coding = True + if gene_symbol and is_coding: + coding_genes.add(gene_symbol) + + out[rsid] = { + "genes": sorted(genes), + "coding_genes": sorted(coding_genes), + "consequence_terms": sorted(consequence_terms), + } + time.sleep(0.05) + + return out + + +def fetch_gnomad_gene_constraints( + symbols: list[str], + limitations: list[str], +) -> dict[str, dict[str, Any]]: + out: dict[str, 
def support_from_burden(best_p: float | None) -> tuple[str, float]:
    """Map a best burden p-value to a ``(label, weight)`` pair.

    ``("strong", 1.0)`` below 2.5e-6, ``("nominal", 0.6)`` below 0.05, and
    ``("none", 0.0)`` otherwise, including when ``best_p`` is ``None``.
    """
    if best_p is not None:
        for ceiling, label, weight in ((2.5e-6, "strong", 1.0), (0.05, "nominal", 0.6)):
            if best_p < ceiling:
                return label, weight
    return "none", 0.0


def coding_component(coding_support: str, clinvar_support: str) -> float:
    """Score the coding-evidence component.

    ``"coding"`` scores 1.0 when ``clinvar_support`` is ``"present"`` and 0.6
    otherwise; ``"noncoding"`` scores 0.3; anything else scores 0.0.
    """
    if coding_support == "coding":
        return 1.0 if clinvar_support == "present" else 0.6
    return 0.3 if coding_support == "noncoding" else 0.0


def confidence_label(score: float) -> str:
    """Bucket a score: ``High`` >= 0.75, ``Medium`` >= 0.55, ``Low`` >= 0.35, else ``VeryLow``."""
    for floor, label in ((0.75, "High"), (0.55, "Medium"), (0.35, "Low")):
        if score >= floor:
            return label
    return "VeryLow"
def clamp01(value: float) -> float:
    """Clamp ``value`` into the closed interval [0.0, 1.0].

    NaN is mapped to 0.0.  Without the explicit check,
    ``max(0.0, min(1.0, nan))`` evaluates to 1.0 because every comparison
    against NaN is false, so an undefined score would be reported as the
    maximum.
    """
    if math.isnan(value):
        return 0.0
    return max(0.0, min(1.0, value))


def markdown_image_tag(alt_text: str, absolute_path: str) -> str:
    """Return a Markdown image tag for ``absolute_path`` with ``alt_text``."""
    # Use angle-bracket URL form so paths with spaces still render.
    return f"![{alt_text}](<{absolute_path}>)"


def build_inline_image_markdown(figure_entries: list[dict[str, Any]]) -> list[str]:
    """Return one Markdown image tag per figure entry that has a non-blank ``path``.

    The alt text is the entry's ``caption``, falling back to its ``id`` and
    then to the literal ``"figure"``.  Entries without a path are skipped.
    """
    tags: list[str] = []
    for entry in figure_entries:
        path = str(entry.get("path") or "").strip()
        if not path:
            continue
        caption = str(entry.get("caption") or entry.get("id") or "figure").strip()
        tags.append(markdown_image_tag(caption, path))
    return tags
+ ) + lines.append("") + + lines.append("## Inputs and scope") + lines.append(f"- Trait query: `{trait_query or '.'}`") + lines.append(f"- EFO ID: `{efo_id}`") + lines.append( + f"- Anchor variants: `{len(as_string_list(mapping_payload.get('anchors')))} loci seeds in output payload`" + ) + lines.append(f"- Generated at: `{meta.get('generated_at')}`") + lines.append("") + + lines.append("## Anchor variant summary") + anchors = coerce_list_of_dicts(mapping_payload.get("anchors")) + if not anchors: + lines.append("No anchors were retained after normalization.") + else: + for anchor in anchors[:20]: + rsid = anchor.get("rsid") or "." + p = anchor.get("p_value") + p_txt = f"{p:.3g}" if isinstance(p, (int, float)) else "." + trait = anchor.get("lead_trait") or "." + locus_id = anchor.get("locus_id") or "." + lines.append(f"- `{rsid}` | p={p_txt} | trait={trait} | locus={locus_id}") + lines.append("") + + lines.append("## Per-locus top genes") + if not loci: + lines.append("No loci available.") + else: + for locus in loci: + locus_id = locus.get("locus_id") or "." + lead_rsid = locus.get("lead_rsid") or "." + lines.append(f"### {locus_id} (lead `{lead_rsid}`)") + genes = coerce_list_of_dicts(locus.get("candidate_genes")) + if not genes: + lines.append("- No candidate genes scored.") + continue + for gene in genes[:5]: + symbol = gene.get("symbol") or "." + score = safe_float(gene.get("overall_score")) or 0.0 + conf = gene.get("confidence") or "." 
+ evidence = coerce_dict(gene.get("evidence")) + l2g = safe_float(evidence.get("l2g_max")) or 0.0 + coloc = safe_float(evidence.get("coloc_max_h4")) or 0.0 + tissues = as_string_list(evidence.get("eqtl_tissues")) + lines.append( + f"- `{symbol}` | score={score:.3f} ({conf}) | L2G={l2g:.3f} | coloc={coloc:.3f} | eQTL tissues={len([t for t in tissues if t != '.'])}" + ) + lines.append("") + + lines.append("## Cross-locus prioritized genes") + if not cross: + lines.append("No cross-locus aggregated ranking available.") + else: + for row in cross[:15]: + symbol = row.get("symbol") or "." + supporting_loci = row.get("supporting_loci") or 0 + mean_score = safe_float(row.get("mean_score")) or 0.0 + max_score = safe_float(row.get("max_score")) or 0.0 + lines.append( + f"- `{symbol}` | supporting_loci={supporting_loci} | mean_score={mean_score:.3f} | max_score={max_score:.3f}" + ) + lines.append("") + + lines.append("## Key caveats") + caveats = dedupe_keep_order(limitations + warnings) + if not caveats: + lines.append("- No major caveats recorded.") + else: + for item in caveats[:20]: + lines.append(f"- {item}") + lines.append("") + + lines.append("## Recommended next analyses") + lines.append("1. Run fine-mapping/conditional analysis on top loci before causal claims.") + lines.append( + "2. Validate top genes with independent cohort summary statistics where available." + ) + lines.append( + "3. Add tissue- and cell-type-specific molecular QTL datasets for stronger functional assignment." + ) + lines.append("4. 
def validate_summary_section_order(summary_markdown: str) -> None:
    """Check that the summary's leading ``## `` headings appear in the required order.

    Level-2 headings are collected from lines that start with ``"## "`` after
    stripping whitespace.  The first seven must equal the required section
    list exactly; extra headings after them are allowed.

    Raises:
        ValueError: if the leading headings differ from the required list.
    """
    required = [
        "Objective",
        "Inputs and scope",
        "Anchor variant summary",
        "Per-locus top genes",
        "Cross-locus prioritized genes",
        "Key caveats",
        "Recommended next analyses",
    ]

    seen: list[str] = []
    for raw_line in summary_markdown.splitlines():
        stripped = raw_line.strip()
        if stripped.startswith("## "):
            seen.append(stripped[3:].strip())

    leading = seen[: len(required)]
    if leading != required:
        raise ValueError(
            "Summary section order mismatch. "
            f"Expected first headings {required}, found {leading}."
        )
(figure_output_dir / "locus_gene_heatmap.png").resolve() + fig.savefig(heatmap_path, dpi=180) + plt.close(fig) + figure_entries.append( + { + "id": "locus_gene_heatmap", + "path": str(heatmap_path), + "caption": "Top candidate genes by evidence component across loci", + } + ) + + # Stacked decomposition for top gene per locus. + locus_labels: list[str] = [] + l2g_vals: list[float] = [] + coloc_vals: list[float] = [] + eqtl_vals: list[float] = [] + burden_vals: list[float] = [] + coding_vals: list[float] = [] + + for locus in loci: + genes = coerce_list_of_dicts(locus.get("candidate_genes")) + if not genes: + continue + top_gene = genes[0] + evidence = coerce_dict(top_gene.get("evidence")) + locus_labels.append(str(locus.get("lead_rsid") or locus.get("locus_id") or ".")) + l2g_vals.append(0.40 * (safe_float(evidence.get("l2g_max")) or 0.0)) + coloc_vals.append(0.25 * (safe_float(evidence.get("coloc_max_h4")) or 0.0)) + eqtl_vals.append( + 0.15 + * min( + 1.0, + len([t for t in as_string_list(evidence.get("eqtl_tissues")) if t != "."]) / 3.0, + ) + ) + burden_vals.append( + 0.10 + * ( + 1.0 + if evidence.get("rare_variant_support") == "strong" + else (0.6 if evidence.get("rare_variant_support") == "nominal" else 0.0) + ) + ) + coding_vals.append( + 0.10 + * ( + 1.0 + if evidence.get("coding_support") == "coding" + else (0.3 if evidence.get("coding_support") == "noncoding" else 0.0) + ) + ) + + if locus_labels: + fig, ax = plt.subplots(figsize=(max(6, len(locus_labels) * 0.8), 4.2)) + x = range(len(locus_labels)) + bottom = [0.0 for _ in locus_labels] + for label, vals, color in [ + ("L2G", l2g_vals, "#1f77b4"), + ("coloc", coloc_vals, "#ff7f0e"), + ("eQTL", eqtl_vals, "#2ca02c"), + ("burden", burden_vals, "#d62728"), + ("coding", coding_vals, "#9467bd"), + ]: + ax.bar(x, vals, bottom=bottom, label=label, color=color) + bottom = [b + v for b, v in zip(bottom, vals)] + ax.set_xticks(list(x)) + ax.set_xticklabels(locus_labels, rotation=30, ha="right") + ax.set_ylim(0, 
1.05) + ax.set_ylabel("Weighted score contribution") + ax.set_title("Top-gene score decomposition by locus") + ax.legend(loc="upper right", fontsize=8) + fig.tight_layout() + stack_path = (figure_output_dir / "locus_score_decomposition.png").resolve() + fig.savefig(stack_path, dpi=180) + plt.close(fig) + figure_entries.append( + { + "id": "locus_score_decomposition", + "path": str(stack_path), + "caption": "Weighted score decomposition for top gene in each locus", + } + ) + + # Tissue support dotplot. + tissue_points: list[tuple[str, str]] = [] + for locus in loci: + for gene in coerce_list_of_dicts(locus.get("candidate_genes"))[:4]: + symbol = str(gene.get("symbol") or "") + if not symbol: + continue + evidence = coerce_dict(gene.get("evidence")) + tissues = [t for t in as_string_list(evidence.get("eqtl_tissues")) if t and t != "."] + hpa = [t for t in as_string_list(evidence.get("hpa_tissue_support")) if t and t != "."] + for tissue in dedupe_keep_order(tissues + hpa): + tissue_points.append((symbol, tissue)) + + if tissue_points: + unique_genes = sorted({g for g, _ in tissue_points}) + unique_tissues = sorted({t for _, t in tissue_points}) + gene_index = {g: i for i, g in enumerate(unique_genes)} + tissue_index = {t: i for i, t in enumerate(unique_tissues)} + x_vals = [tissue_index[t] for _, t in tissue_points] + y_vals = [gene_index[g] for g, _ in tissue_points] + + fig, ax = plt.subplots( + figsize=(max(6, len(unique_tissues) * 0.45), max(3.5, len(unique_genes) * 0.35)) + ) + ax.scatter(x_vals, y_vals, s=35, alpha=0.75) + ax.set_xticks(range(len(unique_tissues))) + ax.set_xticklabels(unique_tissues, rotation=45, ha="right") + ax.set_yticks(range(len(unique_genes))) + ax.set_yticklabels(unique_genes) + ax.set_title("Tissue support (GTEx/HPA)") + fig.tight_layout() + dot_path = (figure_output_dir / "tissue_support_dotplot.png").resolve() + fig.savefig(dot_path, dpi=180) + plt.close(fig) + figure_entries.append( + { + "id": "tissue_support_dotplot", + "path": 
str(dot_path), + "caption": "Gene-by-tissue support dots from GTEx/HPA context", + } + ) + + return figure_entries, None + + +def build_anchors( + input_json: dict[str, Any], + efo_payload: dict[str, Any], + warnings: list[str], + limitations: list[str], +) -> list[dict[str, Any]]: + trait_query = str(input_json.get("trait_query") or "").strip() + explicit_efo_id = str(input_json.get("efo_id") or "").strip() or None + efo_id = explicit_efo_id or efo_payload.get("efo_id") + show_child_traits = bool(input_json.get("show_child_traits", True)) + max_anchor_associations = int(input_json.get("max_anchor_associations") or 1200) + max_loci = int(input_json.get("max_loci") or 25) + phenotype_terms = as_string_list(input_json.get("phenotype_terms")) + + normalized_rows: list[dict[str, Any]] = [] + + query_specs: list[dict[str, Any]] = [] + if efo_id: + query_specs.append({"efo_id": efo_id, "show_child_traits": show_child_traits}) + if trait_query: + query_specs.append({"efo_trait": trait_query}) + for term in phenotype_terms[:8]: + query_specs.append({"efo_trait": term}) + + if query_specs: + per_query_limit = max(100, math.ceil(max_anchor_associations / len(query_specs))) + for query in query_specs: + try: + rows = gwas_iter_associations(query, max_rows=per_query_limit) + for row in rows: + normalized = normalize_anchor_row(row) + if normalized: + normalized_rows.append(normalized) + except Exception as exc: + limitations.append(f"GWAS anchor retrieval failed for query {query}: {exc}") + + # seed rsids always participate + seed_rsids = [ + normalize_rsid(s) for s in as_string_list(input_json.get("seed_rsids")) if normalize_rsid(s) + ] + + best_by_rsid: dict[str, dict[str, Any]] = {} + for row in normalized_rows: + rsid = str(row.get("rsid") or "") + if not rsid: + continue + current = best_by_rsid.get(rsid) + p = safe_float(row.get("p_value")) + if current is None: + best_by_rsid[rsid] = row + continue + cp = safe_float(current.get("p_value")) + if cp is None or (p is not 
None and p < cp): + best_by_rsid[rsid] = row + + ranked = sorted( + best_by_rsid.values(), + key=lambda r: ( + safe_float(r.get("p_value")) if safe_float(r.get("p_value")) is not None else 1e99 + ), + ) + + anchors: list[dict[str, Any]] = [] + for row in ranked: + if len(anchors) >= max_loci: + break + anchors.append( + { + "rsid": row.get("rsid"), + "lead_trait": row.get("lead_trait") or "", + "p_value": safe_float(row.get("p_value")), + "cohort": row.get("cohort") or "", + "accession_id": row.get("accession_id"), + "mapped_genes": dedupe_keep_order(as_string_list(row.get("mapped_genes"))), + } + ) + + current_rsids = {str(anchor.get("rsid")) for anchor in anchors} + for seed in seed_rsids: + if seed in current_rsids: + continue + if len(anchors) >= max_loci: + break + anchors.append( + { + "rsid": seed, + "lead_trait": trait_query, + "p_value": None, + "cohort": "", + "accession_id": None, + "mapped_genes": [], + } + ) + current_rsids.add(seed) + + accession_ids = [str(a.get("accession_id")) for a in anchors if a.get("accession_id")] + study_index = fetch_gwas_study_metadata(accession_ids, limitations) + for anchor in anchors: + accession_id = anchor.get("accession_id") + if accession_id and accession_id in study_index: + anchor["cohort"] = study_index[accession_id].get("cohort") or anchor.get("cohort") + + if not anchors: + warnings.append("No anchors derived from GWAS queries and seed variants.") + + resolve_anchor_coordinates(anchors, warnings, limitations) + return anchors + + +def group_anchors_by_locus(anchors: list[dict[str, Any]]) -> list[dict[str, Any]]: + by_locus: dict[str, list[dict[str, Any]]] = {} + for anchor in anchors: + locus_id = str(anchor.get("locus_id") or f"rsid:{anchor.get('rsid')}") + by_locus.setdefault(locus_id, []).append(anchor) + + grouped: list[dict[str, Any]] = [] + for locus_id, rows in by_locus.items(): + rows_sorted = sorted( + rows, + key=lambda r: ( + safe_float(r.get("p_value")) if safe_float(r.get("p_value")) is not None 
else 1e99 + ), + ) + grouped.append( + { + "locus_id": locus_id, + "anchors": rows_sorted, + "lead_rsid": rows_sorted[0].get("rsid") if rows_sorted else None, + } + ) + grouped.sort( + key=lambda g: ( + safe_float(coerce_list_of_dicts(g.get("anchors"))[0].get("p_value")) + if coerce_list_of_dicts(g.get("anchors")) + and safe_float(coerce_list_of_dicts(g.get("anchors"))[0].get("p_value")) is not None + else 1e99 + ) + ) + return grouped + + +def map_locus_to_gene(input_json: dict[str, Any]) -> dict[str, Any]: + warnings: list[str] = [] + limitations: list[str] = [] + + normalized_input: dict[str, Any] = dict(input_json) + + trait_query = str(normalized_input.get("trait_query") or "").strip() + efo_id_input = str(normalized_input.get("efo_id") or "").strip() + seed_rsids = [ + normalize_rsid(s) + for s in as_string_list(normalized_input.get("seed_rsids")) + if normalize_rsid(s) + ] + + disable_default_seeds = bool(normalized_input.get("disable_default_seeds", False)) + if trait_query and not seed_rsids and not disable_default_seeds: + preset = DEFAULT_TRAIT_SEED_RSIDS.get(normalize_trait_key(trait_query)) + if preset: + seed_rsids = dedupe_keep_order([normalize_rsid(s) or s for s in preset]) + normalized_input["seed_rsids"] = seed_rsids + warnings.append( + f"Applied default seed rsIDs for trait '{trait_query}': {', '.join(seed_rsids)}." 
+ ) + + if not trait_query and not efo_id_input and not seed_rsids: + raise ValueError("Provide at least one anchor source: trait_query, efo_id, or seed_rsids.") + + max_genes_per_locus = int(normalized_input.get("max_genes_per_locus") or 10) + max_coloc_rows_per_locus = int(normalized_input.get("max_coloc_rows_per_locus") or 100) + max_eqtl_rows_per_variant = int(normalized_input.get("max_eqtl_rows_per_variant") or 200) + burden_sets = as_string_list(normalized_input.get("genebass_burden_sets")) or [ + "pLoF", + "missense|LC", + ] + include_clinvar = bool(normalized_input.get("include_clinvar", True)) + include_gnomad_context = bool(normalized_input.get("include_gnomad_context", True)) + include_hpa_tissue_context = bool(normalized_input.get("include_hpa_tissue_context", True)) + include_figures = bool(normalized_input.get("include_figures", False)) + + mapping_output_path = Path( + str(normalized_input.get("mapping_output_path") or "./output/locus_to_gene_mapping.json") + ) + summary_output_path = Path( + str(normalized_input.get("summary_output_path") or "./output/locus_to_gene_summary.md") + ) + figure_output_dir = Path(str(normalized_input.get("figure_output_dir") or "./output/figures")) + + efo_payload = resolve_efo(trait_query, warnings, limitations) + if efo_id_input: + efo_payload["efo_id"] = efo_id_input + + anchors = build_anchors(normalized_input, efo_payload, warnings, limitations) + if not anchors: + raise ValueError("No anchors remained after normalization.") + + unresolved_coord_rsids = [ + str(anchor.get("rsid")) + for anchor in anchors + if anchor.get("rsid") and not coerce_dict(anchor.get("grch38")) + ] + if unresolved_coord_rsids: + limitations.append( + "Unresolved GRCh38 coordinates for anchors: " + + ", ".join(dedupe_keep_order(unresolved_coord_rsids)) + ) + + anchor_rsids = dedupe_keep_order([str(a.get("rsid")) for a in anchors if a.get("rsid")]) + trait_terms = dedupe_keep_order( + [ + trait_query, + str(efo_payload.get("anchor_label") or 
""), + *as_string_list(efo_payload.get("synonyms"))[:12], + *as_string_list(normalized_input.get("phenotype_terms")), + ] + ) + + ot_support = fetch_ot_l2g_coloc_for_anchors( + anchor_rsids=anchor_rsids, + trait_terms=trait_terms, + max_coloc_rows_per_locus=max_coloc_rows_per_locus, + limitations=limitations, + warnings=warnings, + ) + + gtex_support = fetch_gtex_support( + anchors=anchors, + max_results=max_eqtl_rows_per_variant, + limitations=limitations, + warnings=warnings, + ) + + refsnp_annotations = ( + fetch_refsnp_annotations(anchor_rsids, limitations) if include_clinvar else {} + ) + + grouped_loci = group_anchors_by_locus(anchors) + + all_candidate_symbols: list[str] = [] + for locus in grouped_loci: + locus_symbols: list[str] = [] + for anchor in coerce_list_of_dicts(locus.get("anchors")): + locus_symbols.extend(as_string_list(anchor.get("mapped_genes"))) + rsid = str(anchor.get("rsid") or "") + annot = coerce_dict(refsnp_annotations.get(rsid)) + locus_symbols.extend(as_string_list(annot.get("coding_genes"))) + locus_symbols.extend(as_string_list(annot.get("genes"))) + l2g_rows = coerce_list_of_dicts( + coerce_dict(ot_support.get("per_anchor", {})).get(rsid, {}).get("l2g") + ) + for row in l2g_rows: + symbol = str(row.get("symbol") or "").strip() + if symbol: + locus_symbols.append(symbol) + + target_gene = str(normalized_input.get("target_gene") or "").strip() + if target_gene: + locus_symbols.append(target_gene) + + locus_symbols = dedupe_keep_order(locus_symbols) + if not locus_symbols: + locus_symbols = ["UNMAPPED_GENE"] + locus["candidate_symbols"] = locus_symbols + all_candidate_symbols.extend(locus_symbols) + + unique_symbols = dedupe_keep_order(all_candidate_symbols) + symbol_to_ensembl = resolve_ensembl_ids_for_symbols(unique_symbols, limitations) + genebass_support = fetch_genebass_support( + symbol_to_ensembl=symbol_to_ensembl, + burden_sets=burden_sets, + trait_terms=trait_terms, + max_results=int(normalized_input.get("genebass_max_results") 
or 300), + limitations=limitations, + ) + + gnomad_constraints = ( + fetch_gnomad_gene_constraints(unique_symbols, limitations) if include_gnomad_context else {} + ) + + hpa_support: dict[str, list[str]] = {} + if include_hpa_tissue_context: + limitations.append( + "Human Protein Atlas API enrichment is not implemented in this script yet; hpa_tissue_support is left empty." + ) + + loci_output: list[dict[str, Any]] = [] + cross_locus_rows: list[dict[str, Any]] = [] + + for locus in grouped_loci: + anchors_in_locus = coerce_list_of_dicts(locus.get("anchors")) + symbols = as_string_list(locus.get("candidate_symbols")) + gene_rows: list[dict[str, Any]] = [] + + for symbol in symbols: + l2g_scores: list[float] = [] + coloc_values: list[float] = [] + eqtl_tissues: set[str] = set() + mapped_hit = False + clinvar_present = False + coding_hit = False + + for anchor in anchors_in_locus: + rsid = str(anchor.get("rsid") or "") + mapped_genes_upper = {g.upper() for g in as_string_list(anchor.get("mapped_genes"))} + if symbol.upper() in mapped_genes_upper: + mapped_hit = True + + support = coerce_dict(coerce_dict(ot_support.get("per_anchor", {})).get(rsid)) + l2g_rows = coerce_list_of_dicts(support.get("l2g")) + matched_l2g = [ + safe_float(row.get("score")) or 0.0 + for row in l2g_rows + if str(row.get("symbol") or "").upper() == symbol.upper() + ] + l2g_scores.extend(matched_l2g) + + if matched_l2g: + for coloc_row in coerce_list_of_dicts(support.get("coloc")): + h4 = safe_float(coloc_row.get("h4")) + clpp = safe_float(coloc_row.get("clpp")) + coloc_values.append( + h4 if h4 is not None else (clpp if clpp is not None else 0.0) + ) + + tissues = coerce_dict(gtex_support.get(rsid, {})).get(symbol) + if isinstance(tissues, set): + eqtl_tissues.update(tissues) + elif isinstance(tissues, list): + eqtl_tissues.update([str(t) for t in tissues if str(t).strip()]) + + annot = coerce_dict(refsnp_annotations.get(rsid)) + genes = {g.upper() for g in as_string_list(annot.get("genes"))} + 
coding_genes = {g.upper() for g in as_string_list(annot.get("coding_genes"))} + if symbol.upper() in genes: + clinvar_present = True + if symbol.upper() in coding_genes: + coding_hit = True + + l2g_component = clamp01(max(l2g_scores) if l2g_scores else 0.0) + coloc_component = clamp01(max(coloc_values) if coloc_values else 0.0) + if l2g_component <= 0.0 and coloc_component > 0.0: + # Avoid applying coloc to genes with no gene-level assignment signal. + coloc_component = 0.0 + + relevant_eqtl_tissues = [t for t in sorted(eqtl_tissues) if t and t != "."] + eqtl_component = clamp01(min(1.0, len(relevant_eqtl_tissues) / 3.0)) + + gene_burden = coerce_dict(genebass_support.get(symbol)) + best_burden_p = safe_float(gene_burden.get("best_p")) + rare_variant_support, burden_component = support_from_burden(best_burden_p) + + if coding_hit: + coding_support = "coding" + elif clinvar_present or mapped_hit: + coding_support = "noncoding" + else: + coding_support = "none" + + clinvar_support = "present" if clinvar_present else "none" + coding_comp = coding_component(coding_support, clinvar_support) + + overall = clamp01( + 0.40 * l2g_component + + 0.25 * coloc_component + + 0.15 * eqtl_component + + 0.10 * burden_component + + 0.10 * coding_comp + ) + confidence = confidence_label(overall) + + evidence = { + "l2g_max": round(l2g_component, 6), + "coloc_max_h4": round(coloc_component, 6), + "eqtl_tissues": relevant_eqtl_tissues, + "rare_variant_support": rare_variant_support, + "coding_support": coding_support, + "clinvar_support": clinvar_support, + "gnomad_context": format_gnomad_context(gnomad_constraints.get(symbol)), + "hpa_tissue_support": hpa_support.get(symbol, []), + } + + rationale: list[str] = [] + gene_limits: list[str] = [] + + if l2g_component > 0: + rationale.append( + f"Open Targets L2G max score {l2g_component:.3f} for matched anchor locus." 
+ ) + else: + gene_limits.append("No matched L2G support found for this gene in anchored loci.") + + if coloc_component > 0: + rationale.append( + f"Colocalisation signal present (max h4/clpp proxy {coloc_component:.3f})." + ) + else: + gene_limits.append("No coloc support assigned to this gene in matched loci.") + + if relevant_eqtl_tissues: + rationale.append( + f"GTEx eQTL support observed in {len(relevant_eqtl_tissues)} tissue(s): {', '.join(relevant_eqtl_tissues[:4])}." + ) + else: + gene_limits.append("No GTEx eQTL rows mapped to this gene from anchor variants.") + + if best_burden_p is not None: + rationale.append( + f"Genebass burden support is {rare_variant_support} (best trait-matched p={best_burden_p:.3g})." + ) + else: + gene_limits.append("No trait-matched Genebass burden support found.") + + if coding_support == "coding": + rationale.append("Coding consequence support present from rsID annotation.") + elif coding_support == "noncoding": + rationale.append( + "Locus membership or noncoding variant annotation supports proximity to this gene." + ) + else: + gene_limits.append("No coding or in-gene annotation support identified.") + + gene_row = { + "symbol": symbol, + "ensembl_id": symbol_to_ensembl.get(symbol), + "overall_score": round(overall, 6), + "confidence": confidence, + "evidence": evidence, + "rationale": rationale, + "limitations": gene_limits, + } + gene_rows.append(gene_row) + cross_locus_rows.append({"symbol": symbol, "score": overall}) + + gene_rows.sort( + key=lambda row: ( + -safe_float(row.get("overall_score")) + if safe_float(row.get("overall_score")) is not None + else 0.0, + str(row.get("symbol") or ""), + ) + ) + gene_rows = gene_rows[:max_genes_per_locus] + + loci_output.append( + { + "locus_id": locus.get("locus_id"), + "lead_rsid": locus.get("lead_rsid"), + "candidate_genes": gene_rows, + } + ) + + if not loci_output: + raise ValueError("No loci available after candidate gene scoring.") + + # Cross-locus aggregate ranking. 
+ aggregate: dict[str, list[float]] = {} + for locus in loci_output: + for gene in coerce_list_of_dicts(locus.get("candidate_genes")): + symbol = str(gene.get("symbol") or "") + score = safe_float(gene.get("overall_score")) + if not symbol or score is None: + continue + aggregate.setdefault(symbol, []).append(score) + + cross_locus_ranked_genes: list[dict[str, Any]] = [] + for symbol, scores in aggregate.items(): + cross_locus_ranked_genes.append( + { + "symbol": symbol, + "supporting_loci": len(scores), + "mean_score": round(sum(scores) / len(scores), 6), + "max_score": round(max(scores), 6), + } + ) + cross_locus_ranked_genes.sort( + key=lambda row: ( + -safe_float(row.get("max_score")) + if safe_float(row.get("max_score")) is not None + else 0.0, + -safe_float(row.get("mean_score")) + if safe_float(row.get("mean_score")) is not None + else 0.0, + str(row.get("symbol") or ""), + ) + ) + + # QC gates. + for locus in loci_output: + genes = coerce_list_of_dicts(locus.get("candidate_genes")) + if not genes: + raise ValueError(f"Locus {locus.get('locus_id')} has no candidate genes after scoring.") + for gene in genes: + if "overall_score" not in gene: + raise ValueError( + f"Gene row missing overall_score in locus {locus.get('locus_id')}." 
+ ) + score = safe_float(gene.get("overall_score")) + if score is None or score < 0 or score > 1: + raise ValueError( + f"overall_score outside [0,1] for gene {gene.get('symbol')} in locus {locus.get('locus_id')}" + ) + + mapping_payload: dict[str, Any] = { + "meta": { + "trait_query": trait_query, + "efo_id": efo_payload.get("efo_id"), + "generated_at": now_iso(), + "sources_queried": [ + "efo-ontology-skill", + "gwas-catalog-skill", + "ncbi-refsnp-coordinate-resolution", + "opentargets-skill", + "gtex-eqtl-skill", + "genebass-gene-burden-skill", + "clinvar-variation-skill" + if include_clinvar + else "clinvar-variation-skill(skipped)", + "gnomad-graphql-skill" + if include_gnomad_context + else "gnomad-graphql-skill(skipped)", + "human-protein-atlas-skill" + if include_hpa_tissue_context + else "human-protein-atlas-skill(skipped)", + ], + }, + "anchors": anchors, + "loci": loci_output, + "cross_locus_ranked_genes": cross_locus_ranked_genes, + "warnings": dedupe_keep_order(warnings), + "limitations": dedupe_keep_order(limitations), + } + + figure_entries: list[dict[str, Any]] = [] + figure_fallback_mermaid: str | None = None + if include_figures: + figure_entries, figure_fallback_mermaid = generate_optional_figures( + loci_output, figure_output_dir, warnings + ) + if not figure_entries and not figure_fallback_mermaid: + figure_fallback_mermaid = ( + "graph LR\n" + "A[Anchor variants] --> B[Locus grouping]\n" + "B --> C[Evidence scoring]\n" + "C --> D[Per-locus top genes]\n" + "D --> E[Cross-locus ranking]" + ) + warnings.append( + "No figure PNGs were generated; emitted Mermaid fallback visualization." 
+ ) + if figure_entries: + mapping_payload["figures"] = figure_entries + mapping_payload["inline_image_markdown"] = build_inline_image_markdown(figure_entries) + + summary = build_summary_markdown(mapping_payload, figure_entries, figure_fallback_mermaid) + validate_summary_section_order(summary) + + ensure_parent(mapping_output_path) + ensure_parent(summary_output_path) + + mapping_output_path.write_text(json.dumps(mapping_payload, indent=2), encoding="utf-8") + summary_output_path.write_text(summary, encoding="utf-8") + + critical_limitations = [ + item for item in limitations if item.startswith("Unresolved GRCh38 coordinates") + ] + + return { + "status": "degraded" if critical_limitations else "ok", + "mapping_output_path": str(mapping_output_path), + "summary_output_path": str(summary_output_path), + "figure_paths": [str(fig.get("path")) for fig in figure_entries], + "inline_image_markdown": build_inline_image_markdown(figure_entries), + "render_instructions": ( + "Paste `inline_image_markdown` lines directly in the chat as plain markdown. " + "Do not wrap them in code fences." + ), + "warnings": dedupe_keep_order(warnings), + "limitations": dedupe_keep_order(limitations), + } + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Map GWAS loci to ranked candidate genes.") + parser.add_argument("--input-json", help="Path to input JSON file.") + parser.add_argument( + "--trait-query", help="Trait/disease free-text query (for example: type 2 diabetes)." + ) + parser.add_argument("--efo-id", help="Explicit EFO ID anchor (for example: EFO_0001360).") + parser.add_argument( + "--seed-rsid", + action="append", + default=[], + help="Seed rsID anchor (repeatable), for example: --seed-rsid rs7903146", + ) + parser.add_argument("--target-gene", help="Optional target gene to highlight.") + parser.add_argument( + "--include-figures", + action=argparse.BooleanOptionalAction, + default=None, + help="Render optional figures. 
Trait-only runs default to true unless --no-include-figures is set.", + ) + parser.add_argument("--mapping-output-path", help="Optional mapping JSON output path.") + parser.add_argument("--summary-output-path", help="Optional summary markdown output path.") + parser.add_argument("--figure-output-dir", help="Optional figure output directory.") + parser.add_argument( + "--print-inline-image-markdown", + action="store_true", + help="Print render-ready markdown image tags as plain lines (not code-fenced).", + ) + parser.add_argument("--print-result", action="store_true", help="Print JSON result to stdout.") + return parser.parse_args() + + +def main() -> int: + args = parse_args() + payload: dict[str, Any] + if args.input_json: + input_path = Path(args.input_json) + try: + payload = json.loads(input_path.read_text(encoding="utf-8")) + except Exception as exc: + err = {"status": "error", "error": f"Could not parse input JSON: {exc}"} + print(json.dumps(err, indent=2)) + return 2 + if args.include_figures is not None: + payload["include_figures"] = bool(args.include_figures) + else: + payload = {} + if args.trait_query: + payload["trait_query"] = args.trait_query + if args.efo_id: + payload["efo_id"] = args.efo_id + seed_rsids = [normalize_rsid(s) for s in args.seed_rsid if normalize_rsid(s)] + if seed_rsids: + payload["seed_rsids"] = seed_rsids + if args.target_gene: + payload["target_gene"] = args.target_gene + if args.include_figures is None: + payload["include_figures"] = True + else: + payload["include_figures"] = bool(args.include_figures) + if args.mapping_output_path: + payload["mapping_output_path"] = args.mapping_output_path + if args.summary_output_path: + payload["summary_output_path"] = args.summary_output_path + if args.figure_output_dir: + payload["figure_output_dir"] = args.figure_output_dir + + try: + result = map_locus_to_gene(payload) + except Exception as exc: + err = {"status": "error", "error": str(exc)} + print(json.dumps(err, indent=2)) + return 1 
+ + if args.print_inline_image_markdown: + for line in as_string_list(result.get("inline_image_markdown")): + print(line) + if args.print_result: + print(json.dumps(result, indent=2)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/plugins/life-science-research/skills/locus-to-gene-mapper-skill/scripts/test_map_locus_to_gene.py b/plugins/life-science-research/skills/locus-to-gene-mapper-skill/scripts/test_map_locus_to_gene.py new file mode 100644 index 0000000..5fa4881 --- /dev/null +++ b/plugins/life-science-research/skills/locus-to-gene-mapper-skill/scripts/test_map_locus_to_gene.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import importlib.util +import unittest +from pathlib import Path +from unittest import mock + +SCRIPT_PATH = Path(__file__).with_name("map_locus_to_gene.py") +SPEC = importlib.util.spec_from_file_location("map_locus_to_gene", SCRIPT_PATH) +assert SPEC and SPEC.loader +map_locus_to_gene = importlib.util.module_from_spec(SPEC) +SPEC.loader.exec_module(map_locus_to_gene) + + +def refsnp_payload() -> dict: + return { + "primary_snapshot_data": { + "placements_with_allele": [ + { + "seq_id": "NC_000010.11", + "placement_annot": { + "seq_id_traits_by_assembly": [ + { + "assembly_name": "GRCh38.p14", + "is_top_level": True, + "is_chromosome": True, + "is_alt": False, + "is_patch": False, + } + ] + }, + "alleles": [ + { + "allele": { + "spdi": { + "position": 112998589, + "deleted_sequence": "C", + "inserted_sequence": "C", + } + } + }, + { + "allele": { + "spdi": { + "position": 112998589, + "deleted_sequence": "C", + "inserted_sequence": "G", + } + } + }, + { + "allele": { + "spdi": { + "position": 112998589, + "deleted_sequence": "C", + "inserted_sequence": "T", + } + } + }, + ], + }, + { + "seq_id": "NC_000010.10", + "placement_annot": { + "seq_id_traits_by_assembly": [ + { + "assembly_name": "GRCh37.p13", + "is_top_level": True, + "is_chromosome": True, + "is_alt": False, + 
"is_patch": False, + } + ] + }, + "alleles": [ + { + "allele": { + "spdi": { + "position": 114758348, + "deleted_sequence": "C", + "inserted_sequence": "C", + } + } + }, + { + "allele": { + "spdi": { + "position": 114758348, + "deleted_sequence": "C", + "inserted_sequence": "T", + } + } + }, + ], + }, + ], + "allele_annotations": [ + { + "assembly_annotation": [ + { + "seq_id": "NC_000010.11", + "genes": [ + { + "name": "transcription factor 7 like 2", + "locus": "TCF7L2", + "rnas": [ + { + "sequence_ontology": [ + {"name": "intron_variant"}, + ] + } + ], + } + ], + } + ] + } + ], + } + } + + +class RefSnpResolutionTests(unittest.TestCase): + def test_refsnp_base_uses_current_numeric_lookup_endpoint(self) -> None: + self.assertEqual( + map_locus_to_gene.REFSNP_BASE, + "https://api.ncbi.nlm.nih.gov/variation/v0/refsnp", + ) + + def test_resolve_refsnp_coordinates_uses_top_level_grch_placements(self) -> None: + with mock.patch.object(map_locus_to_gene, "safe_get_json", return_value=refsnp_payload()): + coords = map_locus_to_gene.resolve_refsnp_coordinates("rs7903146", [], []) + + self.assertEqual(coords["grch38"]["chr"], "10") + self.assertEqual(coords["grch38"]["pos"], 112998590) + self.assertEqual(coords["grch38"]["ref"], "C") + self.assertEqual(coords["grch38"]["alt"], "T") + self.assertEqual(coords["grch37"]["pos"], 114758349) + + def test_fetch_refsnp_annotations_uses_gene_locus_symbols(self) -> None: + with mock.patch.object(map_locus_to_gene, "safe_get_json", return_value=refsnp_payload()): + annotations = map_locus_to_gene.fetch_refsnp_annotations(["rs7903146"], []) + + self.assertEqual(annotations["rs7903146"]["genes"], ["TCF7L2"]) + self.assertIn("intron_variant", annotations["rs7903146"]["consequence_terms"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/plugins/life-science-research/skills/metabolights-skill/SKILL.md b/plugins/life-science-research/skills/metabolights-skill/SKILL.md new file mode 100644 index 0000000..16d4e75 --- 
/dev/null +++ b/plugins/life-science-research/skills/metabolights-skill/SKILL.md @@ -0,0 +1,37 @@ +--- +name: metabolights-skill +description: Submit compact MetaboLights requests for study discovery and study-level metabolomics metadata. Use when a user wants concise MetaboLights summaries +--- + +## Operating rules +- Use `scripts/rest_request.py` for all MetaboLights calls. +- Use `base_url=https://www.ebi.ac.uk/metabolights/ws`. +- Start with `studies` for archive browsing and `studies/` for targeted records. +- Keep study discovery narrow and paged rather than pulling very large pages. +- Re-run requests in long conversations instead of relying on older tool output. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Return raw JSON only if the user explicitly asks for machine-readable output. +- Prefer these paths: `studies` and `studies/`. + +## Input +- Read one JSON object from stdin. +- Required fields: `base_url`, `path` +- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common MetaboLights patterns: + - `{"base_url":"https://www.ebi.ac.uk/metabolights/ws","path":"studies","record_path":"content","max_items":10}` + - `{"base_url":"https://www.ebi.ac.uk/metabolights/ws","path":"studies/MTBLS1"}` + +## Output +- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records` or a compact `summary`. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"base_url":"https://www.ebi.ac.uk/metabolights/ws","path":"studies","record_path":"content","max_items":10}' | python scripts/rest_request.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. 
#!/usr/bin/env python3
"""Generic compact REST client for ChatGPT-imported skills.

Reads one JSON request object from stdin, performs a single GET or POST with
``requests``, and writes one compact JSON result object to stdout.
"""

from __future__ import annotations

import json
import sys
from pathlib import Path
from typing import Any

try:
    import requests
except ImportError as exc:  # pragma: no cover
    requests = None
    REQUESTS_IMPORT_ERROR = exc
else:
    REQUESTS_IMPORT_ERROR = None

# Top-level keys probed, in this order, for a record list when the caller
# supplies no `record_path`.
_COLLECTION_KEYS = (
    "collection",
    "results",
    "structures",
    "activities",
    "molecules",
    "mechanisms",
    "records",
    "items",
)
_MAX_STRING_CHARS = 240  # longer strings are cut and suffixed with "..."
_TEXT_HEAD_CHARS = 800  # size of the preview returned for non-JSON bodies


def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]:
    """Return the failure envelope: ``ok=False`` plus an error code/message."""
    return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []}


def _require_object(name: str, value: Any) -> dict[str, Any]:
    """Return *value* if it is a dict, ``{}`` if it is None; else raise ValueError."""
    if value is None:
        return {}
    if not isinstance(value, dict):
        raise ValueError(f"`{name}` must be an object.")
    return value


def _require_bool(name: str, value: Any, default: bool) -> bool:
    """Return *value* if it is a bool, *default* if it is None; else raise ValueError."""
    if value is None:
        return default
    if not isinstance(value, bool):
        raise ValueError(f"`{name}` must be a boolean.")
    return value


def _require_int(name: str, value: Any, default: int) -> int:
    """Return *value* if it is a positive int, *default* if it is None.

    Raises:
        ValueError: if *value* is a bool, a non-int, or not greater than zero.
    """
    if value is None:
        return default
    # bool is a subclass of int, so JSON `true` would otherwise be accepted
    # here and silently behave as 1.
    if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
        raise ValueError(f"`{name}` must be a positive integer.")
    return value


def _require_str(name: str, value: Any, required: bool = False) -> str | None:
    """Return the stripped string, or None when an optional field is absent.

    Raises:
        ValueError: if a required field is None, or the value is not a
            non-blank string.
    """
    if value is None:
        if required:
            raise ValueError(f"`{name}` is required.")
        return None
    if not isinstance(value, str) or not value.strip():
        raise ValueError(f"`{name}` must be a non-empty string.")
    return value.strip()


def _service_name(base_url: str) -> str:
    """Return the host part of *base_url* with dots replaced by hyphens."""
    host = base_url.split("://", 1)[-1].split("/", 1)[0]
    return host.replace(".", "-")


def _build_url(base_url: str, path: str) -> str:
    """Join *base_url* and *path*; an absolute http(s) *path* is returned as is."""
    if path.startswith(("http://", "https://")):
        return path
    return base_url.rstrip("/") + "/" + path.lstrip("/")


def _get_by_path(value: Any, path: str) -> Any:
    """Walk a dot-separated *path* through nested dicts and lists.

    Segments applied to a list must be non-negative decimal indexes.

    Raises:
        ValueError: if a segment cannot be applied to the current value.
    """
    current = value
    for part in path.split("."):
        if isinstance(current, list):
            # isdecimal() (unlike isdigit()) rejects characters such as "²"
            # that int() cannot parse, so the error below stays accurate.
            if not part.isdecimal():
                raise ValueError(f"`record_path` segment {part!r} must be a list index.")
            index = int(part)
            if index >= len(current):
                raise ValueError(f"`record_path` index {index} is out of range.")
            current = current[index]
        elif isinstance(current, dict):
            if part not in current:
                raise ValueError(f"`record_path` key {part!r} was not present in the response.")
            current = current[part]
        else:
            raise ValueError(f"`record_path` segment {part!r} could not be applied.")
    return current


def _infer_target(data: Any) -> tuple[str | None, Any]:
    """Guess which part of a JSON payload holds the records.

    Returns ``(path_used, target)``: ``"$"`` for a top-level list, the first
    list under ``_embedded``, the first list under a known collection key,
    or ``(None, data)`` when nothing matches.
    """
    if isinstance(data, list):
        return "$", data
    if isinstance(data, dict):
        embedded = data.get("_embedded")
        if isinstance(embedded, dict):
            for key, value in embedded.items():
                if isinstance(value, list):
                    return f"_embedded.{key}", value
        for key in _COLLECTION_KEYS:
            value = data.get(key)
            if isinstance(value, list):
                return key, value
    return None, data


def _compact(value: Any, max_items: int, max_depth: int) -> Any:
    """Return a size-bounded copy of *value*.

    Strings are cut at 240 characters, lists and dicts are limited to
    *max_items* entries (with a marker describing what was dropped), and
    containers below *max_depth* are replaced by ``"..."``.
    """
    if isinstance(value, str):
        if len(value) <= _MAX_STRING_CHARS:
            return value
        return value[:_MAX_STRING_CHARS] + "..."
    if max_depth <= 0:
        return "..." if isinstance(value, (dict, list)) else value
    if isinstance(value, list):
        compacted_list = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]]
        if len(value) > max_items:
            compacted_list.append(f"... (+{len(value) - max_items} more)")
        return compacted_list
    if isinstance(value, dict):
        items = list(value.items())
        compacted_map: dict[str, Any] = {
            str(key): _compact(item, max_items, max_depth - 1) for key, item in items[:max_items]
        }
        if len(items) > max_items:
            compacted_map["_truncated_keys"] = len(items) - max_items
        return compacted_map
    return value


def _save_raw_output(
    raw_output: str, raw_output_path: str | None, base_url: str, suffix: str
) -> str:
    """Write *raw_output* to disk and return the path that was written.

    Without an explicit *raw_output_path* the file goes to
    ``/tmp/<service-name>-raw.<suffix>``. Parent directories are created.
    """
    path = Path(raw_output_path or f"/tmp/{_service_name(base_url)}-raw.{suffix}")
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(raw_output, encoding="utf-8")
    return str(path)


def parse_input(payload: Any) -> dict[str, Any]:
    """Validate the request object and return a config dict with defaults filled in.

    Raises:
        ValueError: if the payload is not an object or any field is invalid.
    """
    if not isinstance(payload, dict):
        raise ValueError("Input must be one JSON object.")
    base_url = _require_str("base_url", payload.get("base_url"), required=True)
    path = _require_str("path", payload.get("path"), required=True)
    method = (_require_str("method", payload.get("method")) or "GET").upper()
    if method not in {"GET", "POST"}:
        raise ValueError("`method` must be GET or POST.")
    json_body = payload.get("json_body")
    form_body = payload.get("form_body")
    if json_body is not None and form_body is not None:
        raise ValueError("Provide only one of `json_body` or `form_body`.")
    response_format = (
        _require_str("response_format", payload.get("response_format")) or "auto"
    ).lower()
    if response_format not in {"auto", "json", "text"}:
        raise ValueError("`response_format` must be auto, json, or text.")
    return {
        "base_url": base_url,
        "path": path,
        "method": method,
        "params": _require_object("params", payload.get("params")),
        "headers": _require_object("headers", payload.get("headers")),
        "json_body": json_body,
        "form_body": _require_object("form_body", form_body) if form_body is not None else None,
        "record_path": _require_str("record_path", payload.get("record_path")),
        "response_format": response_format,
        "max_items": _require_int("max_items", payload.get("max_items"), 5),
        "max_depth": _require_int("max_depth", payload.get("max_depth"), 3),
        "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30),
        "save_raw": _require_bool("save_raw", payload.get("save_raw"), False),
        "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")),
    }


def _is_json_response(response_format: str, content_type: str, response: Any) -> bool:
    """Decide whether the body should be parsed as JSON.

    ``json`` and ``text`` force the choice; ``auto`` looks at the content
    type and then at the first non-blank character of the body.
    """
    if response_format == "json":
        return True
    if response_format == "text":
        return False
    return "json" in content_type or response.text.lstrip().startswith(("{", "["))


def _json_result(config: dict[str, Any], response: Any) -> dict[str, Any]:
    """Build the success envelope for a JSON body (compact records or summary)."""
    data = response.json()
    raw_output_path = None
    if config["save_raw"]:
        # Only serialise the pretty-printed copy when it is actually saved.
        raw_output_path = _save_raw_output(
            json.dumps(data, indent=2), config["raw_output_path"], config["base_url"], "json"
        )

    record_path = config["record_path"]
    if record_path is None:
        path_used, target = _infer_target(data)
    else:
        path_used, target = record_path, _get_by_path(data, record_path)

    out = {
        "ok": True,
        "source": _service_name(config["base_url"]),
        "path": config["path"],
        "method": config["method"],
        "status_code": response.status_code,
        "record_path": path_used,
        "raw_output_path": raw_output_path,
        "warnings": [],
    }
    if isinstance(target, list):
        records = target[: config["max_items"]]
        out.update(
            {
                "record_count_returned": len(records),
                "record_count_available": len(target),
                "truncated": len(records) < len(target),
                "records": _compact(records, config["max_items"], config["max_depth"]),
            }
        )
    else:
        out["summary"] = _compact(target, config["max_items"], config["max_depth"])
        if isinstance(target, dict):
            out["top_keys"] = list(target)[: config["max_items"]]
    return out


def _text_result(config: dict[str, Any], response: Any, content_type: str) -> dict[str, Any]:
    """Build the success envelope for a non-JSON body.

    Returns an 800-character preview, or only the saved file path when
    ``save_raw`` is set.
    """
    raw_output_path = None
    if config["save_raw"]:
        raw_output_path = _save_raw_output(
            response.text, config["raw_output_path"], config["base_url"], "txt"
        )
    text_head = response.text[:_TEXT_HEAD_CHARS]
    return {
        "ok": True,
        "source": _service_name(config["base_url"]),
        "path": config["path"],
        "method": config["method"],
        "status_code": response.status_code,
        "content_type": content_type,
        "text_head": None if raw_output_path else text_head,
        "text_head_truncated": False
        if raw_output_path
        else len(text_head) < len(response.text),
        "raw_output_path": raw_output_path,
        "warnings": [],
    }


def execute(payload: Any) -> dict[str, Any]:
    """Validate *payload*, perform the HTTP request, and return the result envelope.

    Request, response and raw-output failures are returned as ``ok=False``
    envelopes.

    Raises:
        ValueError: if *payload* fails validation in ``parse_input``.
    """
    if requests is None:
        return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}")
    config = parse_input(payload)
    url = _build_url(config["base_url"], config["path"])

    request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]}
    if config["json_body"] is not None:
        request_kwargs["json"] = config["json_body"]
    if config["form_body"] is not None:
        request_kwargs["data"] = config["form_body"]

    # The context manager closes the session on every exit path.
    with requests.Session() as session:
        session.headers.update(config["headers"])
        try:
            response = session.request(config["method"], url, **request_kwargs)
            response.raise_for_status()
            content_type = (response.headers.get("content-type") or "").lower()
            if _is_json_response(config["response_format"], content_type, response):
                return _json_result(config, response)
            return _text_result(config, response, content_type)
        except ValueError as exc:
            return error("invalid_response", str(exc))
        except requests.RequestException as exc:
            return error("network_error", f"Request failed: {exc}")
        except OSError as exc:
            # RequestException is itself an OSError subclass, so this clause
            # must come last; it keeps local I/O failures (e.g. an unwritable
            # raw output path) inside the JSON error contract.
            return error("io_error", f"Local I/O failed: {exc}")


def main() -> int:
    """Run one request from stdin; return 0 on success, 1 on failure, 2 on bad input."""
    try:
        payload = json.load(sys.stdin)
    except Exception as exc:  # noqa: BLE001
        sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}")))
        return 2
    try:
        output = execute(payload)
    except ValueError as exc:
        output = error("invalid_input", str(exc))
        code = 2
    else:
        code = 0 if output.get("ok") else 1
    sys.stdout.write(json.dumps(output))
    return code


if __name__ == "__main__":
    raise SystemExit(main())
diff --git a/plugins/life-science-research/skills/mgnify-skill/SKILL.md b/plugins/life-science-research/skills/mgnify-skill/SKILL.md new file mode 100644 index 0000000..564ea0c --- /dev/null +++ b/plugins/life-science-research/skills/mgnify-skill/SKILL.md @@ -0,0 +1,37 @@ +--- +name: mgnify-skill +description: Submit compact MGnify API requests for microbiome studies, samples, and biome metadata. Use when a user wants concise MGnify summaries +--- + +## Operating rules +- Use `scripts/rest_request.py` for all MGnify calls. +- Use `base_url=https://www.ebi.ac.uk/metagenomics/api/v1`. +- MGnify uses JSON:API-style responses. Prefer `record_path=data` for collection endpoints. +- Keep requests narrow by study accession, sample accession, or biome whenever possible. +- Re-run requests in long conversations instead of relying on older tool output. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Return raw JSON only if the user explicitly asks for machine-readable output. +- Prefer these paths: `studies`, `samples`, and `biomes`. + +## Input +- Read one JSON object from stdin. +- Required fields: `base_url`, `path` +- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common MGnify patterns: + - `{"base_url":"https://www.ebi.ac.uk/metagenomics/api/v1","path":"studies","params":{"page_size":10},"record_path":"data","max_items":10}` + - `{"base_url":"https://www.ebi.ac.uk/metagenomics/api/v1","path":"biomes","params":{"page_size":10},"record_path":"data","max_items":10}` + +## Output +- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records` or a compact `summary`. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. 
#!/usr/bin/env python3
"""Generic compact REST client for ChatGPT-imported skills.

Reads one JSON request object from stdin, performs a single GET or POST with
``requests``, and writes one compact JSON result object to stdout.
"""

from __future__ import annotations

import json
import sys
from pathlib import Path
from typing import Any

try:
    import requests
except ImportError as exc:  # pragma: no cover
    requests = None
    REQUESTS_IMPORT_ERROR = exc
else:
    REQUESTS_IMPORT_ERROR = None

# Top-level keys probed, in this order, for a record list when the caller
# supplies no `record_path`.
_COLLECTION_KEYS = (
    "collection",
    "results",
    "structures",
    "activities",
    "molecules",
    "mechanisms",
    "records",
    "items",
)
_MAX_STRING_CHARS = 240  # longer strings are cut and suffixed with "..."
_TEXT_HEAD_CHARS = 800  # size of the preview returned for non-JSON bodies


def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]:
    """Return the failure envelope: ``ok=False`` plus an error code/message."""
    return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []}


def _require_object(name: str, value: Any) -> dict[str, Any]:
    """Return *value* if it is a dict, ``{}`` if it is None; else raise ValueError."""
    if value is None:
        return {}
    if not isinstance(value, dict):
        raise ValueError(f"`{name}` must be an object.")
    return value


def _require_bool(name: str, value: Any, default: bool) -> bool:
    """Return *value* if it is a bool, *default* if it is None; else raise ValueError."""
    if value is None:
        return default
    if not isinstance(value, bool):
        raise ValueError(f"`{name}` must be a boolean.")
    return value


def _require_int(name: str, value: Any, default: int) -> int:
    """Return *value* if it is a positive int, *default* if it is None.

    Raises:
        ValueError: if *value* is a bool, a non-int, or not greater than zero.
    """
    if value is None:
        return default
    # bool is a subclass of int, so JSON `true` would otherwise be accepted
    # here and silently behave as 1.
    if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
        raise ValueError(f"`{name}` must be a positive integer.")
    return value


def _require_str(name: str, value: Any, required: bool = False) -> str | None:
    """Return the stripped string, or None when an optional field is absent.

    Raises:
        ValueError: if a required field is None, or the value is not a
            non-blank string.
    """
    if value is None:
        if required:
            raise ValueError(f"`{name}` is required.")
        return None
    if not isinstance(value, str) or not value.strip():
        raise ValueError(f"`{name}` must be a non-empty string.")
    return value.strip()


def _service_name(base_url: str) -> str:
    """Return the host part of *base_url* with dots replaced by hyphens."""
    host = base_url.split("://", 1)[-1].split("/", 1)[0]
    return host.replace(".", "-")


def _build_url(base_url: str, path: str) -> str:
    """Join *base_url* and *path*; an absolute http(s) *path* is returned as is."""
    if path.startswith(("http://", "https://")):
        return path
    return base_url.rstrip("/") + "/" + path.lstrip("/")


def _get_by_path(value: Any, path: str) -> Any:
    """Walk a dot-separated *path* through nested dicts and lists.

    Segments applied to a list must be non-negative decimal indexes.

    Raises:
        ValueError: if a segment cannot be applied to the current value.
    """
    current = value
    for part in path.split("."):
        if isinstance(current, list):
            # isdecimal() (unlike isdigit()) rejects characters such as "²"
            # that int() cannot parse, so the error below stays accurate.
            if not part.isdecimal():
                raise ValueError(f"`record_path` segment {part!r} must be a list index.")
            index = int(part)
            if index >= len(current):
                raise ValueError(f"`record_path` index {index} is out of range.")
            current = current[index]
        elif isinstance(current, dict):
            if part not in current:
                raise ValueError(f"`record_path` key {part!r} was not present in the response.")
            current = current[part]
        else:
            raise ValueError(f"`record_path` segment {part!r} could not be applied.")
    return current


def _infer_target(data: Any) -> tuple[str | None, Any]:
    """Guess which part of a JSON payload holds the records.

    Returns ``(path_used, target)``: ``"$"`` for a top-level list, the first
    list under ``_embedded``, the first list under a known collection key,
    or ``(None, data)`` when nothing matches.
    """
    if isinstance(data, list):
        return "$", data
    if isinstance(data, dict):
        embedded = data.get("_embedded")
        if isinstance(embedded, dict):
            for key, value in embedded.items():
                if isinstance(value, list):
                    return f"_embedded.{key}", value
        for key in _COLLECTION_KEYS:
            value = data.get(key)
            if isinstance(value, list):
                return key, value
    return None, data


def _compact(value: Any, max_items: int, max_depth: int) -> Any:
    """Return a size-bounded copy of *value*.

    Strings are cut at 240 characters, lists and dicts are limited to
    *max_items* entries (with a marker describing what was dropped), and
    containers below *max_depth* are replaced by ``"..."``.
    """
    if isinstance(value, str):
        if len(value) <= _MAX_STRING_CHARS:
            return value
        return value[:_MAX_STRING_CHARS] + "..."
    if max_depth <= 0:
        return "..." if isinstance(value, (dict, list)) else value
    if isinstance(value, list):
        compacted_list = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]]
        if len(value) > max_items:
            compacted_list.append(f"... (+{len(value) - max_items} more)")
        return compacted_list
    if isinstance(value, dict):
        items = list(value.items())
        compacted_map: dict[str, Any] = {
            str(key): _compact(item, max_items, max_depth - 1) for key, item in items[:max_items]
        }
        if len(items) > max_items:
            compacted_map["_truncated_keys"] = len(items) - max_items
        return compacted_map
    return value


def _save_raw_output(
    raw_output: str, raw_output_path: str | None, base_url: str, suffix: str
) -> str:
    """Write *raw_output* to disk and return the path that was written.

    Without an explicit *raw_output_path* the file goes to
    ``/tmp/<service-name>-raw.<suffix>``. Parent directories are created.
    """
    path = Path(raw_output_path or f"/tmp/{_service_name(base_url)}-raw.{suffix}")
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(raw_output, encoding="utf-8")
    return str(path)


def parse_input(payload: Any) -> dict[str, Any]:
    """Validate the request object and return a config dict with defaults filled in.

    Raises:
        ValueError: if the payload is not an object or any field is invalid.
    """
    if not isinstance(payload, dict):
        raise ValueError("Input must be one JSON object.")
    base_url = _require_str("base_url", payload.get("base_url"), required=True)
    path = _require_str("path", payload.get("path"), required=True)
    method = (_require_str("method", payload.get("method")) or "GET").upper()
    if method not in {"GET", "POST"}:
        raise ValueError("`method` must be GET or POST.")
    json_body = payload.get("json_body")
    form_body = payload.get("form_body")
    if json_body is not None and form_body is not None:
        raise ValueError("Provide only one of `json_body` or `form_body`.")
    response_format = (
        _require_str("response_format", payload.get("response_format")) or "auto"
    ).lower()
    if response_format not in {"auto", "json", "text"}:
        raise ValueError("`response_format` must be auto, json, or text.")
    return {
        "base_url": base_url,
        "path": path,
        "method": method,
        "params": _require_object("params", payload.get("params")),
        "headers": _require_object("headers", payload.get("headers")),
        "json_body": json_body,
        "form_body": _require_object("form_body", form_body) if form_body is not None else None,
        "record_path": _require_str("record_path", payload.get("record_path")),
        "response_format": response_format,
        "max_items": _require_int("max_items", payload.get("max_items"), 5),
        "max_depth": _require_int("max_depth", payload.get("max_depth"), 3),
        "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30),
        "save_raw": _require_bool("save_raw", payload.get("save_raw"), False),
        "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")),
    }


def _is_json_response(response_format: str, content_type: str, response: Any) -> bool:
    """Decide whether the body should be parsed as JSON.

    ``json`` and ``text`` force the choice; ``auto`` looks at the content
    type and then at the first non-blank character of the body.
    """
    if response_format == "json":
        return True
    if response_format == "text":
        return False
    return "json" in content_type or response.text.lstrip().startswith(("{", "["))


def _json_result(config: dict[str, Any], response: Any) -> dict[str, Any]:
    """Build the success envelope for a JSON body (compact records or summary)."""
    data = response.json()
    raw_output_path = None
    if config["save_raw"]:
        # Only serialise the pretty-printed copy when it is actually saved.
        raw_output_path = _save_raw_output(
            json.dumps(data, indent=2), config["raw_output_path"], config["base_url"], "json"
        )

    record_path = config["record_path"]
    if record_path is None:
        path_used, target = _infer_target(data)
    else:
        path_used, target = record_path, _get_by_path(data, record_path)

    out = {
        "ok": True,
        "source": _service_name(config["base_url"]),
        "path": config["path"],
        "method": config["method"],
        "status_code": response.status_code,
        "record_path": path_used,
        "raw_output_path": raw_output_path,
        "warnings": [],
    }
    if isinstance(target, list):
        records = target[: config["max_items"]]
        out.update(
            {
                "record_count_returned": len(records),
                "record_count_available": len(target),
                "truncated": len(records) < len(target),
                "records": _compact(records, config["max_items"], config["max_depth"]),
            }
        )
    else:
        out["summary"] = _compact(target, config["max_items"], config["max_depth"])
        if isinstance(target, dict):
            out["top_keys"] = list(target)[: config["max_items"]]
    return out


def _text_result(config: dict[str, Any], response: Any, content_type: str) -> dict[str, Any]:
    """Build the success envelope for a non-JSON body.

    Returns an 800-character preview, or only the saved file path when
    ``save_raw`` is set.
    """
    raw_output_path = None
    if config["save_raw"]:
        raw_output_path = _save_raw_output(
            response.text, config["raw_output_path"], config["base_url"], "txt"
        )
    text_head = response.text[:_TEXT_HEAD_CHARS]
    return {
        "ok": True,
        "source": _service_name(config["base_url"]),
        "path": config["path"],
        "method": config["method"],
        "status_code": response.status_code,
        "content_type": content_type,
        "text_head": None if raw_output_path else text_head,
        "text_head_truncated": False
        if raw_output_path
        else len(text_head) < len(response.text),
        "raw_output_path": raw_output_path,
        "warnings": [],
    }


def execute(payload: Any) -> dict[str, Any]:
    """Validate *payload*, perform the HTTP request, and return the result envelope.

    Request, response and raw-output failures are returned as ``ok=False``
    envelopes.

    Raises:
        ValueError: if *payload* fails validation in ``parse_input``.
    """
    if requests is None:
        return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}")
    config = parse_input(payload)
    url = _build_url(config["base_url"], config["path"])

    request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]}
    if config["json_body"] is not None:
        request_kwargs["json"] = config["json_body"]
    if config["form_body"] is not None:
        request_kwargs["data"] = config["form_body"]

    # The context manager closes the session on every exit path.
    with requests.Session() as session:
        session.headers.update(config["headers"])
        try:
            response = session.request(config["method"], url, **request_kwargs)
            response.raise_for_status()
            content_type = (response.headers.get("content-type") or "").lower()
            if _is_json_response(config["response_format"], content_type, response):
                return _json_result(config, response)
            return _text_result(config, response, content_type)
        except ValueError as exc:
            return error("invalid_response", str(exc))
        except requests.RequestException as exc:
            return error("network_error", f"Request failed: {exc}")
        except OSError as exc:
            # RequestException is itself an OSError subclass, so this clause
            # must come last; it keeps local I/O failures (e.g. an unwritable
            # raw output path) inside the JSON error contract.
            return error("io_error", f"Local I/O failed: {exc}")


def main() -> int:
    """Run one request from stdin; return 0 on success, 1 on failure, 2 on bad input."""
    try:
        payload = json.load(sys.stdin)
    except Exception as exc:  # noqa: BLE001
        sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}")))
        return 2
    try:
        output = execute(payload)
    except ValueError as exc:
        output = error("invalid_input", str(exc))
        code = 2
    else:
        code = 0 if output.get("ok") else 1
    sys.stdout.write(json.dumps(output))
    return code


if __name__ == "__main__":
    raise SystemExit(main())
+- `submit` and `run` require `program`, `database`, `query_fasta`, and `email` (or `NCBI_EMAIL`). +- `status` and `fetch` require `rid`. +- `program` must be one of `blastn`, `blastp`, `blastx`, `tblastn`, or `tblastx`. +- `result_format` defaults to `json2` for `run` and `fetch`. +- `tool` defaults to `NCBI_TOOL`, then `ncbi-blast-skill`. +- `max_hits` defaults to `5`; `max_queries` defaults to `5`. +- `hitlist_size` defaults to `50`; `descriptions` and `alignments` default to `5`. +- `wait_timeout_sec` defaults to `900`. +- `save_raw` defaults to `false`. +- If `save_raw=true` and `raw_output_path` is omitted, the script writes to `/tmp/ncbi-blast-<rid>.<ext>`. +- `query_fasta` may contain multi-FASTA input; compact summaries still cap per-query output with `max_hits` and `max_queries`. + +## Output + +- Common success fields: `ok`, `source`, `action`, `warnings`. +- `submit` returns `rid`, `rtoe_seconds`, and `status="SUBMITTED"`. +- `status` returns `rid`, normalized `status`, and `has_hits`. +- `run` and `fetch` with `result_format=json2` return `rid`, `status`, `has_hits`, `result_format`, `query_count_returned`, `query_count_available`, `query_summaries_truncated`, `query_summaries`, and `raw_output_path`. +- Each `query_summary` contains `query_title`, `hit_count_returned`, `hit_count_available`, `truncated`, and `top_hits`. +- Each `top_hit` contains `rank`, `accession`, `title`, `evalue`, and `bit_score`. +- `fetch` with `result_format=text` returns `text_head` capped at 800 characters unless `save_raw=true`; when `save_raw=true`, it returns only the artifact path. +- Failures return `ok=false`, `error.code`, `error.message`, and `warnings`. + +## Execution + +- Run `python scripts/ncbi_blast.py`. +- If `requests` is missing, install it once before first use with `python -m pip install requests`.
+ +```bash +echo '{"action":"run","program":"blastp","database":"swissprot","query_fasta":">q1\nMTEYK...","email":"you@example.com"}' | python scripts/ncbi_blast.py +``` + +## References + +- Load `references/blast-common-url-api.txt` only for parameter details or uncommon BLAST options. +- Do not load `references/intent-notes.txt` during normal skill execution; it is not runtime guidance. diff --git a/plugins/life-science-research/skills/ncbi-blast-skill/agents/openai.yaml b/plugins/life-science-research/skills/ncbi-blast-skill/agents/openai.yaml new file mode 100644 index 0000000..80dc033 --- /dev/null +++ b/plugins/life-science-research/skills/ncbi-blast-skill/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "NCBI BLAST" + short_description: "Run and summarize NCBI BLAST jobs" diff --git a/plugins/life-science-research/skills/ncbi-blast-skill/references/blast-common-url-api.txt b/plugins/life-science-research/skills/ncbi-blast-skill/references/blast-common-url-api.txt new file mode 100644 index 0000000..cf14c9d --- /dev/null +++ b/plugins/life-science-research/skills/ncbi-blast-skill/references/blast-common-url-api.txt @@ -0,0 +1,75 @@ +NCBI BLAST Common URL API (REST) notes +Source docs: https://ncbi.github.io/blast-cloud/dev/api.html +Primary endpoint: https://blast.ncbi.nlm.nih.gov/Blast.cgi + +Overview: +- BLAST searches are submitted with CMD=Put. +- Status checks and result retrieval use CMD=Get. +- Service is shared/public; avoid high-frequency polling and identify requests with tool/email. + +Usage guidelines: +- Do not contact server more often than once every 10 seconds. +- Do not poll a single RID more often than once a minute. +- Include URL parameters email and tool. +- If submitting >50 searches, run on weekends or 9 pm to 5 am US Eastern on weekdays. +- If submitting >100 searches in 24h, jobs may be moved to slower queue or blocked. +- For short queries, batch multiple sequences in one submission when feasible. 
+Required submit parameters (CMD=Put): +- CMD=Put +- QUERY=<query sequence(s), FASTA or accession> +- DATABASE=<database name> +- PROGRAM=<blastn|blastp|blastx|tblastn|tblastx> + - megablast enabled via PROGRAM=blastn and MEGABLAST=on + +Common optional submit parameters: +- FILTER (F/T/L/mL) +- EXPECT (e-value) +- WORD_SIZE +- GAPCOSTS +- MATRIX +- COMPOSITION_BASED_STATISTICS +- HITLIST_SIZE +- SHORT_QUERY_ADJUST +- FORMAT_TYPE (HTML, Text, XML2, JSON2, JSON2_S, SAM, etc.) +- DESCRIPTIONS +- ALIGNMENTS +- NCBI_GI +- NO_DATABASE_OVERRIDE (experimental) +- tool +- email + +Required retrieve/check parameters (CMD=Get): +- CMD=Get +- RID=<rid> + +Common optional retrieve parameters: +- FORMAT_OBJECT=SearchInfo (for status) +- FORMAT_TYPE (HTML, Text, XML2, JSON2, JSON2_S, SAM, CSV with tabular view) +- DESCRIPTIONS +- ALIGNMENTS +- NCBI_GI + +Typical flow: +1) Submit search with CMD=Put and parse RID + RTOE from response. +2) Wait at least max(RTOE, 10s) before first status check. +3) Poll status with CMD=Get&FORMAT_OBJECT=SearchInfo&RID=<rid>, no more than once/min per RID. +4) If READY and ThereAreHits=yes, retrieve results via CMD=Get&RID=<rid>&FORMAT_TYPE=Text (or JSON2/XML2). +5) Handle FAILED, UNKNOWN, and no-hit states explicitly. + +Example status values in SearchInfo: +- Status=WAITING +- Status=READY + ThereAreHits=yes/no +- Status=FAILED +- Status=UNKNOWN (expired) + +Example submit request shape: +POST https://blast.ncbi.nlm.nih.gov/Blast.cgi +Content-Type: application/x-www-form-urlencoded +CMD=Put&PROGRAM=blastn&DATABASE=core_nt&QUERY=>q1%0AACGT...
+ +Example status request shape: +GET https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Get&FORMAT_OBJECT=SearchInfo&RID= + +Example result request shape: +GET https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Get&FORMAT_TYPE=Text&RID= diff --git a/plugins/life-science-research/skills/ncbi-blast-skill/references/intent-notes.txt b/plugins/life-science-research/skills/ncbi-blast-skill/references/intent-notes.txt new file mode 100644 index 0000000..d0a39ac --- /dev/null +++ b/plugins/life-science-research/skills/ncbi-blast-skill/references/intent-notes.txt @@ -0,0 +1,7 @@ +User intent for BLAST skill generation: +- Build a dedicated NCBI BLAST API skill from Common URL API docs and sample Perl workflow. +- Keep examples runnable and practical for submission, polling, and result retrieval. +- Include rate-limit guardrails and good API citizenship expectations. +- Emphasize required parameters for CMD=Put and CMD=Get. +- Cover common program/database choices and response status handling. +- Make the skill concise and directly usable for sequence search tasks. diff --git a/plugins/life-science-research/skills/ncbi-blast-skill/scripts/ncbi_blast.py b/plugins/life-science-research/skills/ncbi-blast-skill/scripts/ncbi_blast.py new file mode 100644 index 0000000..65ced8e --- /dev/null +++ b/plugins/life-science-research/skills/ncbi-blast-skill/scripts/ncbi_blast.py @@ -0,0 +1,765 @@ +#!/usr/bin/env python3 +"""ncbi_blast + +Compact NCBI BLAST Common URL API helper. + +Reads one JSON object from stdin and prints one JSON object to stdout. 
+""" + +from __future__ import annotations + +import io +import json +import os +import re +import sys +import time +import zipfile +from pathlib import Path +from typing import Any, Callable + +try: + import requests +except ImportError as exc: # pragma: no cover - exercised via runtime guard + requests = None + REQUESTS_IMPORT_ERROR = exc +else: + REQUESTS_IMPORT_ERROR = None + +BLAST_URL = "https://blast.ncbi.nlm.nih.gov/Blast.cgi" +DEFAULT_TOOL = "ncbi-blast-skill" +DEFAULT_RESULT_FORMAT = "json2" +DEFAULT_MAX_HITS = 5 +DEFAULT_MAX_QUERIES = 5 +DEFAULT_HITLIST_SIZE = 50 +DEFAULT_DESCRIPTIONS = 5 +DEFAULT_ALIGNMENTS = 5 +DEFAULT_WAIT_TIMEOUT_SEC = 900 +MIN_REQUEST_INTERVAL_SEC = 10 +MIN_POLL_INTERVAL_SEC = 60 + +RID_RE = re.compile(r"^\s*RID\s*=\s*(\S+)", re.MULTILINE) +RTOE_RE = re.compile(r"^\s*RTOE\s*=\s*(\d+)", re.MULTILINE) +STATUS_RE = re.compile(r"Status=(WAITING|READY|FAILED|UNKNOWN)") + +VALID_ACTIONS = {"submit", "status", "fetch", "run"} +VALID_PROGRAMS = {"blastn", "blastp", "blastx", "tblastn", "tblastx"} +VALID_RESULT_FORMATS = {"json2", "text"} + + +def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: + return { + "ok": False, + "error": {"code": code, "message": message}, + "warnings": warnings or [], + } + + +def _parse_positive_int(payload: dict[str, Any], key: str, default: int) -> int: + value = payload.get(key, default) + if not isinstance(value, int) or value <= 0: + raise ValueError(f"`{key}` must be a positive integer.") + return value + + +def _parse_bool(payload: dict[str, Any], key: str, default: bool) -> bool: + value = payload.get(key, default) + if not isinstance(value, bool): + raise ValueError(f"`{key}` must be a boolean.") + return value + + +def _parse_str(payload: dict[str, Any], key: str) -> str | None: + value = payload.get(key) + if value is None: + return None + if not isinstance(value, str) or not value.strip(): + raise ValueError(f"`{key}` must be a non-empty string when provided.") + 
def parse_input(payload: Any) -> dict[str, Any]:
    """Validate the decoded stdin payload and normalize it into a config dict.

    Args:
        payload: Decoded JSON value; must be an object.

    Returns:
        A dict with the keys ``action``, ``program``, ``database``,
        ``query_fasta``, ``rid``, ``tool``, ``email``, ``result_format``,
        ``max_hits``, ``max_queries``, ``hitlist_size``, ``descriptions``,
        ``alignments``, ``wait_timeout_sec``, ``megablast``, ``save_raw`` and
        ``raw_output_path``.

    Raises:
        ValueError: If a field has the wrong type or value, or if a field
            required by the selected action is missing.
    """
    if not isinstance(payload, dict):
        raise ValueError("Input must be one JSON object.")

    action = payload.get("action")
    if not isinstance(action, str) or action not in VALID_ACTIONS:
        raise ValueError("`action` must be one of: submit, status, fetch, run.")

    program = _parse_str(payload, "program")
    if program is not None and program not in VALID_PROGRAMS:
        raise ValueError("`program` must be one of: blastn, blastp, blastx, tblastn, tblastx.")

    database = _parse_str(payload, "database")
    query_fasta = _parse_str(payload, "query_fasta")
    rid = _parse_str(payload, "rid")

    # Environment fallbacks may be set but blank; treat blank as "not set" so
    # an empty NCBI_EMAIL cannot satisfy the required-email check below and an
    # empty NCBI_TOOL does not replace the default tool name.
    tool = (
        _parse_str(payload, "tool")
        or (os.environ.get("NCBI_TOOL") or "").strip()
        or DEFAULT_TOOL
    )
    email = (
        _parse_str(payload, "email")
        or (os.environ.get("NCBI_EMAIL") or "").strip()
        or None
    )

    result_format_raw = payload.get("result_format", DEFAULT_RESULT_FORMAT)
    if (
        not isinstance(result_format_raw, str)
        or result_format_raw.lower() not in VALID_RESULT_FORMATS
    ):
        raise ValueError("`result_format` must be `json2` or `text`.")
    result_format = result_format_raw.lower()

    raw_output_path = _parse_str(payload, "raw_output_path")

    config = {
        "action": action,
        "program": program,
        "database": database,
        "query_fasta": query_fasta,
        "rid": rid,
        "tool": tool,
        "email": email,
        "result_format": result_format,
        "max_hits": _parse_positive_int(payload, "max_hits", DEFAULT_MAX_HITS),
        "max_queries": _parse_positive_int(payload, "max_queries", DEFAULT_MAX_QUERIES),
        "hitlist_size": _parse_positive_int(payload, "hitlist_size", DEFAULT_HITLIST_SIZE),
        "descriptions": _parse_positive_int(payload, "descriptions", DEFAULT_DESCRIPTIONS),
        "alignments": _parse_positive_int(payload, "alignments", DEFAULT_ALIGNMENTS),
        "wait_timeout_sec": _parse_positive_int(
            payload, "wait_timeout_sec", DEFAULT_WAIT_TIMEOUT_SEC
        ),
        "megablast": _parse_bool(payload, "megablast", False),
        "save_raw": _parse_bool(payload, "save_raw", False),
        "raw_output_path": raw_output_path,
    }

    # Submitting a search needs the full query description plus a contact email.
    if action in {"submit", "run"}:
        if program is None:
            raise ValueError(f"`program` is required for `{action}`.")
        if database is None:
            raise ValueError(f"`database` is required for `{action}`.")
        if query_fasta is None:
            raise ValueError(f"`query_fasta` is required for `{action}`.")
        if email is None:
            raise ValueError(f"`email` is required for `{action}` or via NCBI_EMAIL.")

    # Polling and fetching operate on an existing request id.
    if action in {"status", "fetch"} and rid is None:
        raise ValueError(f"`rid` is required for `{action}`.")

    return config
config["hitlist_size"], + "tool": config["tool"], + "email": config["email"], + } + if config["megablast"] and config["program"] == "blastn": + params["MEGABLAST"] = "on" + + response = throttle.request(session, "POST", data=params, timeout=60) + text = response.text + + rid_match = RID_RE.search(text) + rtoe_match = RTOE_RE.search(text) + if not rid_match: + raise ValueError("BLAST submit response did not include an RID.") + + rid = rid_match.group(1) + rtoe_seconds = int(rtoe_match.group(1)) if rtoe_match else 10 + + return { + "ok": True, + "source": "ncbi-blast", + "action": "submit", + "rid": rid, + "rtoe_seconds": rtoe_seconds, + "status": "SUBMITTED", + "warnings": [], + } + + +def parse_search_info(body: str) -> dict[str, Any]: + status_match = STATUS_RE.search(body) + if not status_match: + raise ValueError("BLAST SearchInfo response did not include a recognized status.") + status = status_match.group(1) + has_hits = "ThereAreHits=yes" in body + return {"status": status, "has_hits": has_hits} + + +def get_search_info( + session: requests.Session, + throttle: RequestThrottle, + rid: str, + tool: str | None, + email: str | None, +) -> dict[str, Any]: + params = { + "CMD": "Get", + "FORMAT_OBJECT": "SearchInfo", + "RID": rid, + } + if tool: + params["tool"] = tool + if email: + params["email"] = email + + response = throttle.request(session, "GET", params=params, timeout=30) + parsed = parse_search_info(response.text) + return { + "ok": True, + "source": "ncbi-blast", + "action": "status", + "rid": rid, + "status": parsed["status"], + "has_hits": parsed["has_hits"], + "warnings": [], + } + + +def _derive_raw_output_path(rid: str, result_format: str, raw_output_path: str | None) -> Path: + if raw_output_path: + return Path(raw_output_path) + suffix = "json" if result_format == "json2" else "txt" + return Path(f"/tmp/ncbi-blast-{rid}.{suffix}") + + +def _save_raw_output( + rid: str, + result_format: str, + raw_output: str, + raw_output_path: str | None, +) -> 
str: + path = _derive_raw_output_path(rid, result_format, raw_output_path) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(raw_output, encoding="utf-8") + return str(path) + + +def _summarize_json2_payload( + data: Any, + max_queries: int, + max_hits: int, +) -> dict[str, Any]: + reports = data.get("BlastOutput2") + if isinstance(reports, dict): + reports = [reports] + if not isinstance(reports, list): + raise ValueError("BLAST JSON2 response did not include `BlastOutput2`.") + + summaries: list[dict[str, Any]] = [] + query_count_available = len(reports) + has_hits = False + + for index, report in enumerate(reports[:max_queries], start=1): + if not isinstance(report, dict): + raise ValueError("BLAST JSON2 report entry was not an object.") + report_body = report.get("report", {}) + if not isinstance(report_body, dict): + raise ValueError("BLAST JSON2 report payload was not an object.") + search = report_body.get("results", {}).get("search", {}) + if not isinstance(search, dict): + raise ValueError("BLAST JSON2 search payload was not an object.") + + query_title = search.get("query_title") or search.get("query_id") or f"query_{index}" + hits = search.get("hits") or [] + if not isinstance(hits, list): + raise ValueError("BLAST JSON2 hits payload was not a list.") + + top_hits = [] + for rank, hit in enumerate(hits[:max_hits], start=1): + if not isinstance(hit, dict): + continue + descriptions = hit.get("description") or [] + desc = descriptions[0] if descriptions and isinstance(descriptions[0], dict) else {} + hsps = hit.get("hsps") or [] + hsp = hsps[0] if hsps and isinstance(hsps[0], dict) else {} + top_hits.append( + { + "rank": rank, + "accession": desc.get("accession") or desc.get("id"), + "title": desc.get("title"), + "evalue": hsp.get("evalue"), + "bit_score": hsp.get("bit_score"), + } + ) + + hit_count_available = len(hits) + has_hits = has_hits or hit_count_available > 0 + summaries.append( + { + "query_title": query_title, + 
"hit_count_returned": len(top_hits), + "hit_count_available": hit_count_available, + "truncated": len(top_hits) < hit_count_available, + "top_hits": top_hits, + } + ) + + return { + "query_count_returned": len(summaries), + "query_count_available": query_count_available, + "query_summaries_truncated": len(summaries) < query_count_available, + "query_summaries": summaries, + "has_hits": has_hits, + } + + +def _load_json_member(zip_file: zipfile.ZipFile, member_name: str) -> tuple[Any, str]: + try: + text = zip_file.read(member_name).decode("utf-8") + except UnicodeDecodeError as exc: + raise ValueError(f"BLAST JSON2 archive member {member_name!r} was not UTF-8 text.") from exc + try: + return json.loads(text), text + except ValueError as exc: + raise ValueError(f"BLAST JSON2 archive member {member_name!r} was not valid JSON.") from exc + + +def _merge_blast_payloads(payloads: list[dict[str, Any]]) -> dict[str, Any]: + if not payloads: + raise ValueError("BLAST JSON2 archive did not contain any payload JSON files.") + if len(payloads) == 1: + return payloads[0] + + merged = dict(payloads[0]) + merged_reports: list[Any] = [] + for payload in payloads: + reports = payload.get("BlastOutput2") + if isinstance(reports, dict): + merged_reports.append(reports) + continue + if not isinstance(reports, list): + raise ValueError("BLAST JSON2 payload did not include `BlastOutput2`.") + merged_reports.extend(reports) + merged["BlastOutput2"] = merged_reports + return merged + + +def _extract_json2_payload(response: requests.Response) -> tuple[Any, str]: + content_type = (response.headers.get("content-type") or "").lower() + raw_bytes = response.content + if content_type.startswith("application/zip") or raw_bytes.startswith(b"PK\x03\x04"): + try: + zip_file = zipfile.ZipFile(io.BytesIO(raw_bytes)) + except zipfile.BadZipFile as exc: + raise ValueError( + "BLAST JSON2 response looked like a ZIP archive but could not be opened." 
+ ) from exc + + with zip_file: + json_members = [name for name in zip_file.namelist() if name.lower().endswith(".json")] + if not json_members: + raise ValueError("BLAST JSON2 archive did not contain any JSON members.") + + manifest: dict[str, Any] | None = None + manifest_members: list[str] = [] + for member_name in json_members: + payload, _ = _load_json_member(zip_file, member_name) + if isinstance(payload, dict) and isinstance(payload.get("BlastJSON"), list): + manifest = payload + manifest_members = [ + item.get("File") + for item in payload["BlastJSON"] + if isinstance(item, dict) and isinstance(item.get("File"), str) + ] + break + + payload_objects: list[dict[str, Any]] = [] + if manifest_members: + for member_name in manifest_members: + if member_name not in zip_file.namelist(): + raise ValueError( + f"BLAST JSON2 archive referenced missing member {member_name!r}." + ) + payload, _ = _load_json_member(zip_file, member_name) + if not isinstance(payload, dict): + raise ValueError( + f"BLAST JSON2 archive member {member_name!r} was not an object." + ) + payload_objects.append(payload) + else: + for member_name in json_members: + payload, _ = _load_json_member(zip_file, member_name) + if isinstance(payload, dict) and "BlastOutput2" in payload: + payload_objects.append(payload) + + if not payload_objects: + if manifest is not None: + raise ValueError( + "BLAST JSON2 archive manifest did not point to any payload JSON files." 
def _fetch_result_ready(
    session: requests.Session,
    throttle: RequestThrottle,
    rid: str,
    config: dict[str, Any],
) -> dict[str, Any]:
    """Download the formatted results for a search already known to be READY.

    Args:
        session: HTTP session used for the request.
        throttle: Rate limiter through which the request is issued.
        rid: BLAST request id to fetch.
        config: Parsed configuration; reads ``result_format``, ``tool``,
            ``email``, ``save_raw``, ``raw_output_path``, ``max_queries``,
            ``max_hits``, ``descriptions`` and ``alignments``.

    Returns:
        A success payload. For ``json2`` it carries the per-query summaries;
        for ``text`` it carries either the saved file path (``save_raw``) or
        the first 800 characters of the report.

    Raises:
        ValueError: If a JSON2 response cannot be decoded. When ``save_raw``
            is set the undecoded body is written first and the message names
            the file.
    """
    result_format = config["result_format"]
    params: dict[str, Any] = {
        "CMD": "Get",
        "RID": rid,
    }
    if config["tool"]:
        params["tool"] = config["tool"]
    if config["email"]:
        params["email"] = config["email"]

    if result_format == "json2":
        params["FORMAT_TYPE"] = "JSON2"
        response = throttle.request(session, "GET", params=params, timeout=60)
        try:
            data, raw_json_text = _extract_json2_payload(response)
        except ValueError as exc:
            if not config["save_raw"]:
                raise
            # Keep the exact bytes for debugging: the body may be a corrupt ZIP
            # archive, which `response.text` would decode lossily.
            failed_path = _derive_raw_output_path(
                rid, result_format, config["raw_output_path"]
            )
            failed_path.parent.mkdir(parents=True, exist_ok=True)
            failed_path.write_bytes(response.content)
            raise ValueError(f"{exc} Raw response saved to {failed_path}.") from exc

        raw_output_path = None
        if config["save_raw"]:
            raw_output_path = _save_raw_output(
                rid=rid,
                result_format=result_format,
                raw_output=raw_json_text,
                raw_output_path=config["raw_output_path"],
            )

        summary = _summarize_json2_payload(
            data,
            max_queries=config["max_queries"],
            max_hits=config["max_hits"],
        )

        return {
            "ok": True,
            "source": "ncbi-blast",
            "action": "fetch",
            "rid": rid,
            "status": "READY",
            "has_hits": summary["has_hits"],
            "result_format": result_format,
            "query_count_returned": summary["query_count_returned"],
            "query_count_available": summary["query_count_available"],
            "query_summaries_truncated": summary["query_summaries_truncated"],
            "query_summaries": summary["query_summaries"],
            "raw_output_path": raw_output_path,
            "warnings": [],
        }

    params["FORMAT_TYPE"] = "Text"
    params["DESCRIPTIONS"] = config["descriptions"]
    params["ALIGNMENTS"] = config["alignments"]
    response = throttle.request(session, "GET", params=params, timeout=60)
    text = response.text

    result: dict[str, Any] = {
        "ok": True,
        "source": "ncbi-blast",
        "action": "fetch",
        "rid": rid,
        "status": "READY",
        "has_hits": True,
        "result_format": result_format,
    }
    if config["save_raw"]:
        # The full report is on disk, so no inline preview is returned.
        result["raw_output_path"] = _save_raw_output(
            rid=rid,
            result_format=result_format,
            raw_output=text,
            raw_output_path=config["raw_output_path"],
        )
    else:
        text_head = text[:800]
        result["text_head"] = text_head
        result["text_head_truncated"] = len(text_head) < len(text)
        result["raw_output_path"] = None
    result["warnings"] = []
    return result
"ncbi-blast", + "action": "fetch", + "rid": rid, + "status": status, + "has_hits": False, + "result_format": config["result_format"], + "raw_output_path": None, + "warnings": [], + } + + return _fetch_result_ready(session=session, throttle=throttle, rid=rid, config=config) + + +def run_action( + session: requests.Session, + throttle: RequestThrottle, + config: dict[str, Any], + sleep_fn: Callable[[float], None] = time.sleep, + clock_fn: Callable[[], float] = time.time, +) -> dict[str, Any]: + submit_payload = submit_search(session=session, throttle=throttle, config=config) + rid = submit_payload["rid"] + rtoe_seconds = submit_payload["rtoe_seconds"] + deadline = clock_fn() + config["wait_timeout_sec"] + + initial_wait = max(rtoe_seconds, MIN_REQUEST_INTERVAL_SEC) + now = clock_fn() + if now + initial_wait > deadline: + return { + "ok": True, + "source": "ncbi-blast", + "action": "run", + "rid": rid, + "rtoe_seconds": rtoe_seconds, + "status": "WAITING", + "has_hits": False, + "result_format": config["result_format"], + "raw_output_path": None, + "warnings": [], + } + sleep_fn(initial_wait) + + while True: + status_payload = get_search_info( + session=session, + throttle=throttle, + rid=rid, + tool=config["tool"], + email=config["email"], + ) + status = status_payload["status"] + has_hits = status_payload["has_hits"] + + if status == "READY": + if not has_hits: + return { + "ok": True, + "source": "ncbi-blast", + "action": "run", + "rid": rid, + "rtoe_seconds": rtoe_seconds, + "status": status, + "has_hits": False, + "result_format": config["result_format"], + "raw_output_path": None, + "warnings": [], + } + + fetch_config = dict(config) + fetch_config["rid"] = rid + fetch_payload = fetch_action( + session=session, + throttle=throttle, + config=fetch_config, + status_payload=status_payload, + ) + if fetch_payload.get("ok"): + fetch_payload["action"] = "run" + fetch_payload["rtoe_seconds"] = rtoe_seconds + return fetch_payload + + if status == "FAILED": + return 
def execute(
    payload: Any,
    *,
    session: requests.Session | None = None,
    sleep_fn: Callable[[float], None] = time.sleep,
    clock_fn: Callable[[], float] = time.time,
) -> dict[str, Any]:
    """Validate *payload* and run the requested BLAST action.

    Args:
        payload: Decoded JSON request object.
        session: Optional HTTP session to reuse. When omitted a session is
            created for this call and closed before returning.
        sleep_fn: Sleep function handed to the throttle and the run loop.
        clock_fn: Clock function handed to the throttle and the run loop.

    Returns:
        The action's result payload, or an error payload with code
        ``missing_dependency``, ``invalid_response`` or ``network_error``.

    Raises:
        ValueError: If *payload* fails input validation (raised by
            ``parse_input`` before any request is made).
    """
    if requests is None:
        return error(
            "missing_dependency",
            f"`requests` is required to run this script: {REQUESTS_IMPORT_ERROR}",
        )

    config = parse_input(payload)
    # Decide ownership once with an identity check so that selecting and
    # closing the session agree even if an injected session object is falsy.
    owns_session = session is None
    active_session = make_session(config["tool"], config["email"]) if owns_session else session
    throttle = RequestThrottle(sleep_fn=sleep_fn, clock_fn=clock_fn)
    action = config["action"]

    try:
        if action == "submit":
            return submit_search(session=active_session, throttle=throttle, config=config)
        if action == "status":
            return get_search_info(
                session=active_session,
                throttle=throttle,
                rid=config["rid"],
                tool=config["tool"],
                email=config["email"],
            )
        if action == "fetch":
            return fetch_action(session=active_session, throttle=throttle, config=config)
        # `parse_input` only admits submit/status/fetch/run, so this is "run".
        return run_action(
            session=active_session,
            throttle=throttle,
            config=config,
            sleep_fn=sleep_fn,
            clock_fn=clock_fn,
        )
    except ValueError as exc:
        return error("invalid_response", str(exc))
    except requests.RequestException as exc:
        return error("network_error", f"BLAST request failed: {exc}")
    finally:
        if owns_session:
            active_session.close()
not parse JSON input: {exc}"))) + return 2 + + try: + output = execute(payload) + except ValueError as exc: + output = error("invalid_input", str(exc)) + exit_code = 2 + else: + if output.get("ok"): + exit_code = 0 + elif output.get("error", {}).get("code") in {"invalid_json", "invalid_input"}: + exit_code = 2 + else: + exit_code = 1 + + sys.stdout.write(json.dumps(output)) + return exit_code + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/plugins/life-science-research/skills/ncbi-blast-skill/scripts/test_ncbi_blast.py b/plugins/life-science-research/skills/ncbi-blast-skill/scripts/test_ncbi_blast.py new file mode 100644 index 0000000..447635f --- /dev/null +++ b/plugins/life-science-research/skills/ncbi-blast-skill/scripts/test_ncbi_blast.py @@ -0,0 +1,464 @@ +#!/usr/bin/env python3 +"""Unit tests for ncbi_blast.py.""" + +from __future__ import annotations + +import importlib.util +import io +import json +import tempfile +import unittest +import zipfile +from pathlib import Path +from unittest import mock + +SCRIPT_PATH = Path(__file__).with_name("ncbi_blast.py") +SPEC = importlib.util.spec_from_file_location("ncbi_blast", SCRIPT_PATH) +assert SPEC and SPEC.loader +ncbi_blast = importlib.util.module_from_spec(SPEC) +SPEC.loader.exec_module(ncbi_blast) + + +class FakeClock: + def __init__(self) -> None: + self.now = 0.0 + + def time(self) -> float: + return self.now + + def sleep(self, seconds: float) -> None: + self.now += seconds + + +class FakeResponse: + def __init__( + self, + text: str, + *, + json_data: object | None = None, + status_code: int = 200, + headers: dict[str, str] | None = None, + content: bytes | None = None, + ) -> None: + self.text = text + self._json_data = json_data + self.status_code = status_code + self.headers = headers or {} + self.content = content if content is not None else text.encode("utf-8") + + def raise_for_status(self) -> None: + if self.status_code >= 400: + raise ncbi_blast.requests.HTTPError(f"HTTP 
class FakeSession:
    """Scripted stand-in for ``requests.Session`` that replays canned responses."""

    def __init__(self, responses: list[FakeResponse]) -> None:
        # Copy so that consuming responses never mutates the caller's list.
        self.responses = list(responses)
        # One (method, url, kwargs) tuple per request, in call order.
        self.calls: list[tuple[str, str, dict[str, object]]] = []
        self.headers: dict[str, str] = {}

    def request(self, method: str, url: str, **kwargs: object) -> FakeResponse:
        """Record the call and hand back the next queued response."""
        recorded = (method, url, kwargs)
        self.calls.append(recorded)
        if self.responses:
            return self.responses.pop(0)
        raise AssertionError("Unexpected extra HTTP request.")

    def close(self) -> None:
        """Accept the close call; there is nothing to release."""
        return None
False), + ("Status=FAILED\n", "FAILED", False), + ("Status=UNKNOWN\n", "UNKNOWN", False), + ] + + for body, expected_status, expected_hits in cases: + with self.subTest(status=expected_status, hits=expected_hits): + session = FakeSession([FakeResponse(body)]) + result = ncbi_blast.execute( + {"action": "status", "rid": "RID123"}, + session=session, + ) + self.assertTrue(result["ok"]) + self.assertEqual(result["status"], expected_status) + self.assertEqual(result["has_hits"], expected_hits) + + def test_run_returns_compact_json2_summary_with_caps(self) -> None: + clock = FakeClock() + blast_json = { + "BlastOutput2": [ + { + "report": { + "results": { + "search": { + "query_title": "q1", + "hits": [ + { + "description": [{"accession": "A1", "title": "Alpha"}], + "hsps": [{"evalue": 0.0, "bit_score": 99.9}], + }, + { + "description": [{"accession": "A2", "title": "Beta"}], + "hsps": [{"evalue": 1e-10, "bit_score": 88.8}], + }, + ], + } + } + } + }, + { + "report": { + "results": { + "search": { + "query_title": "q2", + "hits": [ + { + "description": [{"accession": "B1", "title": "Gamma"}], + "hsps": [{"evalue": 2e-5, "bit_score": 77.7}], + } + ], + } + } + } + }, + ] + } + session = FakeSession( + [ + FakeResponse("RID = RID123\nRTOE = 1\n"), + FakeResponse("Status=READY\nThereAreHits=yes\n"), + FakeResponse(json.dumps(blast_json), json_data=blast_json), + ] + ) + + result = ncbi_blast.execute( + { + "action": "run", + "program": "blastp", + "database": "swissprot", + "query_fasta": ">q1\nMTEYK", + "email": "user@example.com", + "max_hits": 1, + "max_queries": 1, + }, + session=session, + sleep_fn=clock.sleep, + clock_fn=clock.time, + ) + + self.assertTrue(result["ok"]) + self.assertEqual(result["action"], "run") + self.assertEqual(result["status"], "READY") + self.assertTrue(result["has_hits"]) + self.assertEqual(result["result_format"], "json2") + self.assertEqual(result["query_count_returned"], 1) + self.assertEqual(result["query_count_available"], 2) + 
self.assertTrue(result["query_summaries_truncated"]) + self.assertEqual(result["query_summaries"][0]["hit_count_returned"], 1) + self.assertEqual(result["query_summaries"][0]["hit_count_available"], 2) + self.assertTrue(result["query_summaries"][0]["truncated"]) + self.assertEqual(result["query_summaries"][0]["top_hits"][0]["accession"], "A1") + + def test_run_returns_waiting_when_timeout_expires_before_poll(self) -> None: + session = FakeSession([FakeResponse("RID = RID123\nRTOE = 1\n")]) + + result = ncbi_blast.execute( + { + "action": "run", + "program": "blastp", + "database": "swissprot", + "query_fasta": ">q1\nMTEYK", + "email": "user@example.com", + "wait_timeout_sec": 5, + }, + session=session, + sleep_fn=lambda _: None, + clock_fn=lambda: 0.0, + ) + + self.assertTrue(result["ok"]) + self.assertEqual(result["status"], "WAITING") + self.assertEqual(result["rid"], "RID123") + self.assertEqual(result["rtoe_seconds"], 1) + + def test_fetch_json2_sets_truncation_fields(self) -> None: + blast_json = { + "BlastOutput2": [ + { + "report": { + "results": { + "search": { + "query_id": "query_one", + "hits": [ + { + "description": [{"accession": "A1", "title": "Alpha"}], + "hsps": [{"evalue": 0.0, "bit_score": 99.9}], + }, + { + "description": [{"accession": "A2", "title": "Beta"}], + "hsps": [{"evalue": 1.0, "bit_score": 88.8}], + }, + ], + } + } + } + } + ] + } + session = FakeSession( + [ + FakeResponse("Status=READY\nThereAreHits=yes\n"), + FakeResponse(json.dumps(blast_json), json_data=blast_json), + ] + ) + clock = FakeClock() + + result = ncbi_blast.execute( + { + "action": "fetch", + "rid": "RID123", + "max_hits": 1, + }, + session=session, + sleep_fn=clock.sleep, + clock_fn=clock.time, + ) + + self.assertTrue(result["ok"]) + self.assertEqual(result["status"], "READY") + self.assertEqual(result["query_summaries"][0]["query_title"], "query_one") + self.assertEqual(result["query_summaries"][0]["hit_count_available"], 2) + 
self.assertTrue(result["query_summaries"][0]["truncated"]) + + def test_fetch_json2_unpacks_zip_payload(self) -> None: + zipped_payload = self._json2_zip_bytes( + [ + ( + "RID123_1.json", + { + "BlastOutput2": { + "report": { + "results": { + "search": { + "query_title": "q1", + "hits": [ + { + "description": [ + {"accession": "A1", "title": "Alpha"} + ], + "hsps": [{"evalue": 0.0, "bit_score": 99.9}], + } + ], + } + } + } + } + }, + ) + ] + ) + session = FakeSession( + [ + FakeResponse("Status=READY\nThereAreHits=yes\n"), + FakeResponse( + "", + headers={"content-type": "application/zip"}, + content=zipped_payload, + ), + ] + ) + clock = FakeClock() + with tempfile.TemporaryDirectory() as tmpdir: + output_path = Path(tmpdir) / "blast.json" + result = ncbi_blast.execute( + { + "action": "fetch", + "rid": "RID123", + "save_raw": True, + "raw_output_path": str(output_path), + }, + session=session, + sleep_fn=clock.sleep, + clock_fn=clock.time, + ) + + self.assertTrue(result["ok"]) + self.assertEqual(result["status"], "READY") + self.assertEqual(result["query_summaries"][0]["query_title"], "q1") + self.assertEqual(result["query_summaries"][0]["top_hits"][0]["accession"], "A1") + self.assertEqual(result["raw_output_path"], str(output_path)) + self.assertTrue(output_path.exists()) + + def test_fetch_text_caps_text_head(self) -> None: + text_body = "A" * 1200 + session = FakeSession( + [ + FakeResponse("Status=READY\nThereAreHits=yes\n"), + FakeResponse(text_body), + ] + ) + clock = FakeClock() + + result = ncbi_blast.execute( + { + "action": "fetch", + "rid": "RID123", + "result_format": "text", + }, + session=session, + sleep_fn=clock.sleep, + clock_fn=clock.time, + ) + + self.assertTrue(result["ok"]) + self.assertEqual(len(result["text_head"]), 800) + self.assertTrue(result["text_head_truncated"]) + self.assertIsNone(result["raw_output_path"]) + + def test_fetch_text_save_raw_writes_artifact(self) -> None: + text_body = "BLAST-TEXT\n" * 100 + with 
tempfile.TemporaryDirectory() as tmpdir: + output_path = Path(tmpdir) / "blast.txt" + session = FakeSession( + [ + FakeResponse("Status=READY\nThereAreHits=yes\n"), + FakeResponse(text_body), + ] + ) + clock = FakeClock() + + result = ncbi_blast.execute( + { + "action": "fetch", + "rid": "RID123", + "result_format": "text", + "save_raw": True, + "raw_output_path": str(output_path), + }, + session=session, + sleep_fn=clock.sleep, + clock_fn=clock.time, + ) + + self.assertTrue(result["ok"]) + self.assertEqual(result["raw_output_path"], str(output_path)) + self.assertTrue(output_path.exists()) + self.assertEqual(output_path.read_text(encoding="utf-8"), text_body) + self.assertNotIn("text_head", result) + + def test_fetch_json2_invalid_response_still_saves_raw_when_requested(self) -> None: + raw_body = "not json" + with tempfile.TemporaryDirectory() as tmpdir: + output_path = Path(tmpdir) / "blast.json" + session = FakeSession( + [ + FakeResponse("Status=READY\nThereAreHits=yes\n"), + FakeResponse(raw_body), + ] + ) + clock = FakeClock() + + result = ncbi_blast.execute( + { + "action": "fetch", + "rid": "RID123", + "save_raw": True, + "raw_output_path": str(output_path), + }, + session=session, + sleep_fn=clock.sleep, + clock_fn=clock.time, + ) + + self.assertFalse(result["ok"]) + self.assertEqual(result["error"]["code"], "invalid_response") + self.assertIn(str(output_path), result["error"]["message"]) + self.assertTrue(output_path.exists()) + self.assertEqual(output_path.read_text(encoding="utf-8"), raw_body) + + def test_invalid_inputs_return_invalid_input(self) -> None: + bad_payloads = [ + { + "action": "submit", + "program": "blastp", + "database": "swissprot", + "email": "user@example.com", + }, + {"action": "status"}, + {"action": "bogus"}, + { + "action": "fetch", + "rid": "RID123", + "max_hits": 0, + }, + { + "action": "run", + "program": "blastp", + "database": "swissprot", + "query_fasta": ">q1\nMTEYK", + }, + ] + + for payload in bad_payloads: + with 
self.subTest(payload=payload): + stdin = io.StringIO(json.dumps(payload)) + stdout = io.StringIO() + with ( + mock.patch.object(ncbi_blast.sys, "stdin", stdin), + mock.patch.object(ncbi_blast.sys, "stdout", stdout), + ): + exit_code = ncbi_blast.main() + output = json.loads(stdout.getvalue()) + self.assertEqual(exit_code, 2) + self.assertFalse(output["ok"]) + self.assertEqual(output["error"]["code"], "invalid_input") + + +if __name__ == "__main__": + unittest.main() diff --git a/plugins/life-science-research/skills/ncbi-clinicaltables-skill/SKILL.md b/plugins/life-science-research/skills/ncbi-clinicaltables-skill/SKILL.md new file mode 100644 index 0000000..982cc26 --- /dev/null +++ b/plugins/life-science-research/skills/ncbi-clinicaltables-skill/SKILL.md @@ -0,0 +1,42 @@ +--- +name: ncbi-clinicaltables-skill +description: Submit compact Clinical Tables NCBI Gene requests for human gene lookup, pagination, and field selection. Use when a user wants concise autocomplete-style human gene search results +--- + +## Operating rules +- Use `scripts/ncbi_gene_clinicaltables.py` for all Clinical Tables gene searches. +- The script accepts `max_items`; for search pages, start with `count=10` and `max_items=10`. +- Use `params` for endpoint options like `df`, `ef`, `sf`, `q`, `offset`, and `count`. +- Prefer `ncbi-entrez-skill` when the user wants general Entrez Gene records rather than autocomplete/search rows. +- Page with `offset` instead of asking for large pulls. +- Re-run requests in long conversations instead of relying on older tool output. +- Treat displayed `...` in tool previews as UI truncation, not literal request content. +- If the user asks for the full payload, set `save_raw=true` and report the saved file path instead of pasting large response arrays into chat. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Return the JSON verbatim only if the user explicitly asks for machine-readable output. 
+- Use `terms` for the primary search text. +- Keep `count` modest and page with `offset` instead of pulling large result sets at once. + +## Input +- Read one JSON object from stdin. +- Required field: `terms` +- Optional fields: `params`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common NCBI Gene patterns: + - `{"terms":"TP53","params":{"df":"GeneID,Symbol,description"}}` + - `{"terms":"BRCA","params":{"count":10,"df":"chromosome,GeneID,Symbol,description,type_of_gene"},"max_items":10}` + - `{"terms":"kinase","params":{"count":10,"offset":10,"df":"GeneID,Symbol,description"},"max_items":10}` + +## Output +- Success returns `ok`, `source`, `terms`, `total`, `codes`, `display_rows`, `extra_fields`, and truncation metadata. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"terms":"TP53","params":{"count":10,"df":"GeneID,Symbol,description"},"max_items":10}' | python scripts/ncbi_gene_clinicaltables.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/ncbi_gene_clinicaltables.py`. 
def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]:
    """Build the failure envelope written to stdout.

    Args:
        code: Short machine-readable error identifier (e.g. ``invalid_input``).
        message: Human-readable explanation.
        warnings: Optional warning strings; ``None`` becomes an empty list.

    Returns:
        ``{"ok": False, "error": {"code", "message"}, "warnings": [...]}``.
    """
    return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []}


def _compact(value: Any, max_items: int, max_depth: int) -> Any:
    """Return a size-bounded copy of a JSON-like value.

    Strings longer than 240 characters are cut and suffixed with ``...``.
    Lists and dicts keep at most ``max_items`` entries and are replaced by the
    string ``"..."`` once ``max_depth`` is exhausted. Dropped list items are
    reported by a trailing marker string, dropped dict keys by a
    ``_truncated_keys`` count.
    """
    if isinstance(value, str):
        return value if len(value) <= 240 else value[:240] + "..."
    if max_depth <= 0:
        if isinstance(value, (dict, list)):
            return "..."
        return value
    if isinstance(value, list):
        out = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]]
        if len(value) > max_items:
            out.append(f"... (+{len(value) - max_items} more)")
        return out
    if isinstance(value, dict):
        out: dict[str, Any] = {}
        items = list(value.items())
        for key, item in items[:max_items]:
            out[str(key)] = _compact(item, max_items, max_depth - 1)
        if len(items) > max_items:
            out["_truncated_keys"] = len(items) - max_items
        return out
    return value


def parse_input(payload: Any) -> dict[str, Any]:
    """Validate the stdin payload and return a normalized config dict.

    Args:
        payload: Decoded JSON input; must be an object with a non-blank
            string ``terms``.

    Returns:
        Dict with ``terms`` (stripped), ``params``, ``max_items`` (default 5),
        ``max_depth`` (default 3), ``timeout_sec`` (default 30), ``save_raw``
        (default False) and ``raw_output_path`` (stripped string or ``None``).

    Raises:
        ValueError: If any field has the wrong type or an invalid value.
    """
    if not isinstance(payload, dict):
        raise ValueError("Input must be one JSON object.")
    terms = payload.get("terms")
    if not isinstance(terms, str) or not terms.strip():
        raise ValueError("`terms` is required.")
    params = payload.get("params") or {}
    if not isinstance(params, dict):
        raise ValueError("`params` must be an object.")
    max_items = payload.get("max_items", 5)
    max_depth = payload.get("max_depth", 3)
    timeout_sec = payload.get("timeout_sec", 30)
    save_raw = payload.get("save_raw", False)
    for name, value in {
        "max_items": max_items,
        "max_depth": max_depth,
        "timeout_sec": timeout_sec,
    }.items():
        # bool is a subclass of int, so JSON `true` would otherwise pass as 1.
        if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
            raise ValueError(f"`{name}` must be a positive integer.")
    if not isinstance(save_raw, bool):
        raise ValueError("`save_raw` must be a boolean.")
    raw_output_path = payload.get("raw_output_path")
    if raw_output_path is not None and (
        not isinstance(raw_output_path, str) or not raw_output_path.strip()
    ):
        raise ValueError("`raw_output_path` must be a non-empty string.")
    return {
        "terms": terms.strip(),
        "params": params,
        "max_items": max_items,
        "max_depth": max_depth,
        "timeout_sec": timeout_sec,
        "save_raw": save_raw,
        "raw_output_path": raw_output_path.strip() if isinstance(raw_output_path, str) else None,
    }


def _build_result(
    data: list[Any], config: dict[str, Any], raw_output_path: str | None
) -> dict[str, Any]:
    """Turn the 4-element response list into the success envelope.

    The elements are read positionally as total, codes, extra fields and
    display rows. ``truncated`` is true whenever fewer rows are returned than
    the response carried or than the reported integer total.
    """
    total = data[0]
    codes = data[1] if isinstance(data[1], list) else []
    extra_fields = data[2]
    display_rows = data[3] if isinstance(data[3], list) else []
    limit = config["max_items"]
    shown = display_rows[:limit]
    # Compare against what is actually emitted, not what the API sent back;
    # otherwise rows cut by `max_items` are silently unreported.
    truncated = len(shown) < len(display_rows)
    if isinstance(total, int) and not isinstance(total, bool):
        truncated = truncated or len(shown) < total
    return {
        "ok": True,
        "source": "ncbi-gene-clinicaltables",
        "terms": config["terms"],
        "total": total,
        "record_count_returned": len(shown),
        "record_count_available": len(display_rows),
        "truncated": truncated,
        "codes": codes[:limit],
        "display_rows": _compact(shown, limit, config["max_depth"]),
        "extra_fields": _compact(extra_fields, limit, config["max_depth"]),
        "raw_output_path": raw_output_path,
        "warnings": [],
    }


def execute(payload: Any) -> dict[str, Any]:
    """Run one Clinical Tables gene search and return a compact result.

    Args:
        payload: Decoded JSON input (see ``parse_input``).

    Returns:
        The success envelope from ``_build_result`` or an ``error`` envelope
        with code ``missing_dependency``, ``invalid_input``, ``network_error``,
        ``invalid_response`` or ``io_error``. This function does not raise for
        those conditions.
    """
    if requests is None:
        return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}")
    try:
        config = parse_input(payload)
    except ValueError as exc:
        return error("invalid_input", str(exc))

    # Caller-supplied params win over the defaults derived from the config.
    query = {"terms": config["terms"], "count": config["max_items"]}
    query.update(config["params"])
    try:
        response = requests.get(ENDPOINT, params=query, timeout=config["timeout_sec"])
        response.raise_for_status()
    except requests.RequestException as exc:
        return error("network_error", f"Request failed: {exc}")

    # A malformed body is the server's fault, not the caller's: decode
    # failures are ValueError subclasses and must not surface as invalid_input.
    try:
        data = response.json()
    except ValueError as exc:
        return error("invalid_response", f"NCBI Gene response was not valid JSON: {exc}")
    if not isinstance(data, list) or len(data) < 4:
        return error(
            "invalid_response", "NCBI Gene response did not match the expected list shape."
        )

    raw_output_path = None
    if config["save_raw"]:
        try:
            path = Path(config["raw_output_path"] or "/tmp/ncbi-gene-search.json")
            path.parent.mkdir(parents=True, exist_ok=True)
            path.write_text(json.dumps(data, indent=2), encoding="utf-8")
        except OSError as exc:
            return error("io_error", f"Could not save raw response: {exc}")
        raw_output_path = str(path)
    return _build_result(data, config, raw_output_path)


def main() -> int:
    """Read one JSON object from stdin, write one JSON result to stdout.

    Returns:
        0 on success, 2 for unparsable or invalid input, 1 for any other
        failure.
    """
    try:
        payload = json.load(sys.stdin)
    except Exception as exc:  # noqa: BLE001
        sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}")))
        return 2
    output = execute(payload)
    sys.stdout.write(json.dumps(output))
    if output.get("ok"):
        return 0
    # Usage errors exit with 2, like the unparsable-JSON path above.
    return 2 if output.get("error", {}).get("code") == "invalid_input" else 1


if __name__ == "__main__":
    raise SystemExit(main())
b/plugins/life-science-research/skills/ncbi-datasets-skill/SKILL.md new file mode 100644 index 0000000..aa80feb --- /dev/null +++ b/plugins/life-science-research/skills/ncbi-datasets-skill/SKILL.md @@ -0,0 +1,39 @@ +--- +name: ncbi-datasets-skill +description: Submit compact NCBI Datasets v2 requests for assembly, genome, taxonomy, and related metadata endpoints. Use when a user wants concise NCBI Datasets summaries; save raw JSON or text only on request. +--- + +## Operating rules +- Use `scripts/ncbi_datasets.py` for all Datasets v2 calls in this package. +- Use explicit REST `path` values relative to `https://api.ncbi.nlm.nih.gov/datasets/v2`. +- Prefer targeted metadata paths instead of broad unfiltered pulls. +- Re-run requests in long conversations instead of relying on older tool output. +- Treat displayed `...` in tool previews as UI truncation, not literal request content. + +## Execution behavior +- Return concise markdown summaries from the script output by default. +- Return raw JSON or text only if the user explicitly asks for machine-readable output. +- Prefer targeted endpoint calls instead of broad unfiltered dumps. +- If the user needs the full raw response, set `save_raw=true` and report the saved file path. + +## Input +- Read one JSON object from stdin. +- Required field: `path` +- Optional fields: `params`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common Datasets patterns: + - `{"path":"genome/taxon/9606/dataset_report","params":{"page_size":10},"record_path":"reports","max_items":10}` + - `{"path":"genome/accession/GCF_000001405.40/dataset_report"}` + - `{"path":"taxonomy/taxon/9606"}` + +## Output +- Success returns `ok`, `source`, path metadata, and either compact `records`, a compact `summary`, or `text_head`. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. 
def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]:
    """Build the failure envelope written to stdout.

    Returns:
        ``{"ok": False, "error": {"code", "message"}, "warnings": [...]}``;
        a ``None`` ``warnings`` argument becomes an empty list.
    """
    return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []}


def _require_str(name: str, value: Any, required: bool = False) -> str | None:
    """Return ``value`` stripped, or ``None`` when it is absent and optional.

    Raises:
        ValueError: If a required value is missing, or the value is not a
            non-blank string.
    """
    if value is None:
        if required:
            raise ValueError(f"`{name}` is required.")
        return None
    if not isinstance(value, str) or not value.strip():
        raise ValueError(f"`{name}` must be a non-empty string.")
    return value.strip()


def _require_int(name: str, value: Any, default: int) -> int:
    """Return ``value`` if it is a positive integer, ``default`` if it is ``None``.

    Raises:
        ValueError: For non-integers, booleans, zero or negative numbers.
    """
    if value is None:
        return default
    # bool is a subclass of int, so JSON `true` would otherwise pass as 1.
    if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
        raise ValueError(f"`{name}` must be a positive integer.")
    return value


def _require_bool(name: str, value: Any, default: bool) -> bool:
    """Return ``value`` if it is a bool, ``default`` if it is ``None``.

    Raises:
        ValueError: If ``value`` is any other type.
    """
    if value is None:
        return default
    if not isinstance(value, bool):
        raise ValueError(f"`{name}` must be a boolean.")
    return value


def _require_object(name: str, value: Any) -> dict[str, Any]:
    """Return ``value`` if it is a dict, ``{}`` if it is ``None``.

    Raises:
        ValueError: If ``value`` is any other type.
    """
    if value is None:
        return {}
    if not isinstance(value, dict):
        raise ValueError(f"`{name}` must be an object.")
    return value


def _get_by_path(value: Any, path: str) -> Any:
    """Walk a dot-separated ``path`` through nested dicts and lists.

    List levels need a non-negative decimal index segment; dict levels need an
    existing key.

    Raises:
        ValueError: If a segment cannot be applied to the current value.
    """
    current = value
    for part in path.split("."):
        if isinstance(current, list):
            # isdecimal() accepts exactly what int() can parse; isdigit() also
            # admits characters such as superscripts that int() rejects.
            if not part.isdecimal():
                raise ValueError(f"`record_path` segment {part!r} must be a list index.")
            index = int(part)
            if index >= len(current):
                raise ValueError(f"`record_path` index {index} is out of range.")
            current = current[index]
        elif isinstance(current, dict):
            if part not in current:
                raise ValueError(f"`record_path` key {part!r} was not present in the response.")
            current = current[part]
        else:
            raise ValueError(f"`record_path` segment {part!r} could not be applied.")
    return current


def _infer_target(data: Any) -> tuple[str | None, Any]:
    """Guess which part of a response holds the records.

    Returns:
        ``("$", data)`` for a top-level list, ``(key, list)`` for the first of
        a few well-known keys holding a list, otherwise ``(None, data)``.
    """
    if isinstance(data, list):
        return "$", data
    if isinstance(data, dict):
        for key in ("result", "results", "records", "uids", "documents"):
            value = data.get(key)
            if isinstance(value, list):
                return key, value
    return None, data


def _compact(value: Any, max_items: int, max_depth: int) -> Any:
    """Return a size-bounded copy of a JSON-like value.

    Strings longer than 240 characters are cut and suffixed with ``...``.
    Lists and dicts keep at most ``max_items`` entries and are replaced by the
    string ``"..."`` once ``max_depth`` is exhausted.
    """
    if isinstance(value, str):
        return value if len(value) <= 240 else value[:240] + "..."
    if max_depth <= 0:
        if isinstance(value, (dict, list)):
            return "..."
        return value
    if isinstance(value, list):
        out = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]]
        if len(value) > max_items:
            out.append(f"... (+{len(value) - max_items} more)")
        return out
    if isinstance(value, dict):
        out: dict[str, Any] = {}
        items = list(value.items())
        for key, item in items[:max_items]:
            out[str(key)] = _compact(item, max_items, max_depth - 1)
        if len(items) > max_items:
            out["_truncated_keys"] = len(items) - max_items
        return out
    return value


def _save_raw_text(raw_text: str, raw_output_path: str | None, suffix: str) -> str:
    """Write ``raw_text`` as UTF-8 and return the path used.

    Falls back to ``/tmp/ncbi-datasets.<suffix>`` when no path is given and
    creates missing parent directories.

    Raises:
        OSError: If the directory or file cannot be written.
    """
    path = Path(raw_output_path or f"/tmp/ncbi-datasets.{suffix}")
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(raw_text, encoding="utf-8")
    return str(path)


def parse_input(payload: Any) -> dict[str, Any]:
    """Validate the stdin payload and return a normalized config dict.

    Defaults: ``response_format="auto"``, ``max_items=10``, ``max_depth=3``,
    ``timeout_sec=30``, ``save_raw=False``.

    Raises:
        ValueError: If the payload is not an object, ``path`` is missing, or
            any field has the wrong type or value.
    """
    if not isinstance(payload, dict):
        raise ValueError("Input must be one JSON object.")
    response_format = (
        _require_str("response_format", payload.get("response_format")) or "auto"
    ).lower()
    if response_format not in {"auto", "json", "text"}:
        raise ValueError("`response_format` must be auto, json, or text.")
    return {
        "path": _require_str("path", payload.get("path"), required=True),
        "params": _require_object("params", payload.get("params")),
        "record_path": _require_str("record_path", payload.get("record_path")),
        "response_format": response_format,
        "max_items": _require_int("max_items", payload.get("max_items"), 10),
        "max_depth": _require_int("max_depth", payload.get("max_depth"), 3),
        "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30),
        "save_raw": _require_bool("save_raw", payload.get("save_raw"), False),
        "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")),
    }


def _json_output(data: Any, config: dict[str, Any], raw_output_path: str | None) -> dict[str, Any]:
    """Build the success envelope for a decoded JSON response.

    Uses ``config["record_path"]`` when given, otherwise ``_infer_target``.
    A list target yields ``records`` plus count/truncation metadata; any other
    target yields a compact ``summary`` (and ``top_keys`` for dicts).

    Raises:
        ValueError: If ``record_path`` cannot be resolved in ``data``.
    """
    record_path = config["record_path"]
    path_used, target = (
        _infer_target(data)
        if record_path is None
        else (record_path, _get_by_path(data, record_path))
    )
    output = {
        "ok": True,
        "source": "ncbi-datasets",
        "path": config["path"],
        "record_path": path_used,
        "raw_output_path": raw_output_path,
        "warnings": [],
    }
    if isinstance(target, list):
        output.update(
            {
                "record_count_returned": min(len(target), config["max_items"]),
                "record_count_available": len(target),
                "truncated": len(target) > config["max_items"],
                "records": _compact(
                    target[: config["max_items"]], config["max_items"], config["max_depth"]
                ),
            }
        )
    else:
        output["summary"] = _compact(target, config["max_items"], config["max_depth"])
        if isinstance(target, dict):
            output["top_keys"] = list(target)[: config["max_items"]]
    return output


def _choose_format(response_format: str, content_type: str, text: str) -> str:
    """Decide whether a response body is handled as ``"json"`` or ``"text"``.

    An explicit ``text`` or ``json`` request always wins; ``auto`` picks JSON
    when the content type mentions json or the body starts with ``{``.
    """
    if response_format == "text":
        return "text"
    if response_format == "json":
        return "json"
    if "json" in content_type or text.lstrip().startswith("{"):
        return "json"
    return "text"


def execute(payload: Any) -> dict[str, Any]:
    """Run one Datasets v2 GET request and return a compact result.

    Args:
        payload: Decoded JSON input (see ``parse_input``).

    Returns:
        A success envelope (JSON records/summary or ``text_head``) or an
        ``error`` envelope with code ``missing_dependency``,
        ``invalid_response``, ``network_error`` or ``io_error``.

    Raises:
        ValueError: If ``payload`` fails validation; ``main`` reports this as
            ``invalid_input``.
    """
    if requests is None:
        return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}")
    config = parse_input(payload)
    try:
        url = DATASETS_BASE.rstrip("/") + "/" + config["path"].lstrip("/")
        response = requests.get(url, params=config["params"], timeout=config["timeout_sec"])
        response.raise_for_status()
        text = response.text
        content_type = (response.headers.get("content-type") or "").lower()
        if _choose_format(config["response_format"], content_type, text) == "json":
            data = response.json()
            raw_output_path = None
            if config["save_raw"]:
                raw_output_path = _save_raw_text(
                    json.dumps(data, indent=2), config["raw_output_path"], "json"
                )
            return _json_output(data, config, raw_output_path)
        raw_output_path = (
            _save_raw_text(text, config["raw_output_path"], "txt") if config["save_raw"] else None
        )
        # The preview is omitted once the full text has been saved to disk.
        text_head = None if raw_output_path else text[:800]
        return {
            "ok": True,
            "source": "ncbi-datasets",
            "path": config["path"],
            "text_head": text_head,
            "text_head_truncated": False if raw_output_path else len(text) > 800,
            "raw_output_path": raw_output_path,
            "warnings": [],
        }
    except ValueError as exc:
        return error("invalid_response", str(exc))
    except requests.RequestException as exc:
        # Must precede OSError: requests' exceptions derive from IOError.
        return error("network_error", f"Request failed: {exc}")
    except OSError as exc:
        return error("io_error", f"Could not save raw response: {exc}")


def main() -> int:
    """Read one JSON object from stdin, write one JSON result to stdout.

    Returns:
        0 on success, 2 for unparsable or invalid input, 1 for other failures.
    """
    try:
        payload = json.load(sys.stdin)
    except Exception as exc:  # noqa: BLE001
        sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}")))
        return 2
    try:
        output = execute(payload)
    except ValueError as exc:
        output = error("invalid_input", str(exc))
        code = 2
    else:
        code = 0 if output.get("ok") else 1
    sys.stdout.write(json.dumps(output))
    return code


if __name__ == "__main__":
    raise SystemExit(main())
+ +## Execution behavior +- Return concise markdown summaries from the script output by default. +- Return raw JSON or XML only if the user explicitly asks for machine-readable output. +- Prefer targeted endpoint calls instead of broad unfiltered dumps. +- If the user needs the full raw response, set `save_raw=true` and report the saved file path. + +## Input +- Read one JSON object from stdin. +- Required field: `endpoint` +- Optional fields: `params`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common Entrez patterns: + - `{"endpoint":"esearch","params":{"db":"pubmed","term":"KRAS AND colorectal cancer","retmode":"json","retmax":10},"max_items":10}` + - `{"endpoint":"esummary","params":{"db":"gene","id":"7157","retmode":"json"},"max_items":10}` + - `{"endpoint":"efetch","params":{"db":"protein","id":"NP_000537.3","retmode":"xml"},"response_format":"xml","max_items":10}` + - `{"endpoint":"elink","params":{"dbfrom":"gds","db":"pubmed","id":"200000001","retmode":"json"},"max_items":10}` + +## Output +- Success returns `ok`, `source`, endpoint metadata, and either compact `records`, a compact `summary`, or `text_head`. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"endpoint":"esearch","params":{"db":"gene","term":"TP53[gene] AND human[orgn]","retmode":"json","retmax":10},"max_items":10}' | python scripts/ncbi_entrez.py +``` + +## References +- Load `references/geo.md` only when the user specifically needs GEO query patterns. +- Keep the import package limited to this file, `references/geo.md`, and `scripts/ncbi_entrez.py`. 
diff --git a/plugins/life-science-research/skills/ncbi-entrez-skill/agents/openai.yaml b/plugins/life-science-research/skills/ncbi-entrez-skill/agents/openai.yaml new file mode 100644 index 0000000..9159ce9 --- /dev/null +++ b/plugins/life-science-research/skills/ncbi-entrez-skill/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "NCBI Entrez" + short_description: "Fetch NCBI Entrez search and metadata summaries" diff --git a/plugins/life-science-research/skills/ncbi-entrez-skill/references/geo.md b/plugins/life-science-research/skills/ncbi-entrez-skill/references/geo.md new file mode 100644 index 0000000..0428ee1 --- /dev/null +++ b/plugins/life-science-research/skills/ncbi-entrez-skill/references/geo.md @@ -0,0 +1,22 @@ +# GEO Within Entrez + +Use GEO through Entrez rather than a separate GEO skill. + +## Database choices +- Use `db=gds` for GEO DataSets and series-level metadata. +- Use `db=geoprofiles` for profile-level records. + +## Common workflow +1. Run `esearch` to find candidate UIDs. +2. Run `esummary` or `efetch` for targeted metadata. +3. Run `elink` when the user wants linked PubMed articles or related Entrez records. + +## Query patterns +- Series searches often use `GSE[ETYP]`. +- Sample searches often use `GSM[ETYP]`. +- Combine accession filters with disease or organism terms when narrowing results. 
def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]:
    """Build the failure envelope written to stdout.

    Returns:
        ``{"ok": False, "error": {"code", "message"}, "warnings": [...]}``;
        a ``None`` ``warnings`` argument becomes an empty list.
    """
    return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []}


def _require_str(name: str, value: Any, required: bool = False) -> str | None:
    """Return ``value`` stripped, or ``None`` when it is absent and optional.

    Raises:
        ValueError: If a required value is missing, or the value is not a
            non-blank string.
    """
    if value is None:
        if required:
            raise ValueError(f"`{name}` is required.")
        return None
    if not isinstance(value, str) or not value.strip():
        raise ValueError(f"`{name}` must be a non-empty string.")
    return value.strip()


def _require_int(name: str, value: Any, default: int) -> int:
    """Return ``value`` if it is a positive integer, ``default`` if it is ``None``.

    Raises:
        ValueError: For non-integers, booleans, zero or negative numbers.
    """
    if value is None:
        return default
    # bool is a subclass of int, so JSON `true` would otherwise pass as 1.
    if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
        raise ValueError(f"`{name}` must be a positive integer.")
    return value


def _require_bool(name: str, value: Any, default: bool) -> bool:
    """Return ``value`` if it is a bool, ``default`` if it is ``None``.

    Raises:
        ValueError: If ``value`` is any other type.
    """
    if value is None:
        return default
    if not isinstance(value, bool):
        raise ValueError(f"`{name}` must be a boolean.")
    return value


def _require_object(name: str, value: Any) -> dict[str, Any]:
    """Return ``value`` if it is a dict, ``{}`` if it is ``None``.

    Raises:
        ValueError: If ``value`` is any other type.
    """
    if value is None:
        return {}
    if not isinstance(value, dict):
        raise ValueError(f"`{name}` must be an object.")
    return value


def _get_by_path(value: Any, path: str) -> Any:
    """Walk a dot-separated ``path`` through nested dicts and lists.

    List levels need a non-negative decimal index segment; dict levels need an
    existing key.

    Raises:
        ValueError: If a segment cannot be applied to the current value.
    """
    current = value
    for part in path.split("."):
        if isinstance(current, list):
            # isdecimal() accepts exactly what int() can parse; isdigit() also
            # admits characters such as superscripts that int() rejects.
            if not part.isdecimal():
                raise ValueError(f"`record_path` segment {part!r} must be a list index.")
            index = int(part)
            if index >= len(current):
                raise ValueError(f"`record_path` index {index} is out of range.")
            current = current[index]
        elif isinstance(current, dict):
            if part not in current:
                raise ValueError(f"`record_path` key {part!r} was not present in the response.")
            current = current[part]
        else:
            raise ValueError(f"`record_path` segment {part!r} could not be applied.")
    return current


def _infer_target(data: Any) -> tuple[str | None, Any]:
    """Guess which part of a response holds the records.

    Returns:
        ``("$", data)`` for a top-level list, ``(key, list)`` for the first of
        a few well-known keys holding a list, ``("esearchresult.idlist", ids)``
        for an esearch-shaped dict, otherwise ``(None, data)``.
    """
    if isinstance(data, list):
        return "$", data
    if isinstance(data, dict):
        for key in ("result", "results", "records", "uids", "documents"):
            value = data.get(key)
            if isinstance(value, list):
                return key, value
        if "esearchresult" in data:
            result = data["esearchresult"]
            if isinstance(result, dict) and isinstance(result.get("idlist"), list):
                return "esearchresult.idlist", result["idlist"]
    return None, data


def _compact(value: Any, max_items: int, max_depth: int) -> Any:
    """Return a size-bounded copy of a JSON-like value.

    Strings longer than 240 characters are cut and suffixed with ``...``.
    Lists and dicts keep at most ``max_items`` entries and are replaced by the
    string ``"..."`` once ``max_depth`` is exhausted.
    """
    if isinstance(value, str):
        return value if len(value) <= 240 else value[:240] + "..."
    if max_depth <= 0:
        if isinstance(value, (dict, list)):
            return "..."
        return value
    if isinstance(value, list):
        out = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]]
        if len(value) > max_items:
            out.append(f"... (+{len(value) - max_items} more)")
        return out
    if isinstance(value, dict):
        out: dict[str, Any] = {}
        items = list(value.items())
        for key, item in items[:max_items]:
            out[str(key)] = _compact(item, max_items, max_depth - 1)
        if len(items) > max_items:
            out["_truncated_keys"] = len(items) - max_items
        return out
    return value


def _xml_to_simple(elem: ET.Element, max_items: int, max_depth: int) -> Any:
    """Convert an XML element into nested dicts, lists and strings.

    Leaf elements become their stripped text. Children are keyed by tag with
    any namespace removed; repeated tags collapse into a list. Only the first
    ``max_items`` children are visited (the remainder is counted under
    ``_truncated_children``) and elements below ``max_depth`` become ``"..."``.
    Element attributes are not included.
    """
    children = list(elem)
    text = (elem.text or "").strip()
    if not children:
        return text
    if max_depth <= 0:
        return "..."
    grouped: dict[str, Any] = {}
    for child in children[:max_items]:
        tag = child.tag.split("}", 1)[-1]
        value = _xml_to_simple(child, max_items, max_depth - 1)
        if tag in grouped:
            current = grouped[tag]
            if not isinstance(current, list):
                grouped[tag] = [current]
            grouped[tag].append(value)
        else:
            grouped[tag] = value
    if len(children) > max_items:
        grouped["_truncated_children"] = len(children) - max_items
    if text:
        grouped["_text"] = text
    return grouped


def _save_raw_text(raw_text: str, raw_output_path: str | None, suffix: str) -> str:
    """Write ``raw_text`` as UTF-8 and return the path used.

    Falls back to ``/tmp/ncbi-entrez.<suffix>`` when no path is given and
    creates missing parent directories.

    Raises:
        OSError: If the directory or file cannot be written.
    """
    path = Path(raw_output_path or f"/tmp/ncbi-entrez.{suffix}")
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(raw_text, encoding="utf-8")
    return str(path)


def parse_input(payload: Any) -> dict[str, Any]:
    """Validate the stdin payload and return a normalized config dict.

    Defaults: ``response_format="auto"``, ``max_items=10``, ``max_depth=3``,
    ``timeout_sec=30``, ``save_raw=False``.

    Raises:
        ValueError: If the payload is not an object, ``endpoint`` is missing,
            or any field has the wrong type or value.
    """
    if not isinstance(payload, dict):
        raise ValueError("Input must be one JSON object.")
    response_format = (
        _require_str("response_format", payload.get("response_format")) or "auto"
    ).lower()
    if response_format not in {"auto", "json", "text", "xml"}:
        raise ValueError("`response_format` must be auto, json, text, or xml.")
    return {
        "endpoint": _require_str("endpoint", payload.get("endpoint"), required=True),
        "params": _require_object("params", payload.get("params")),
        "record_path": _require_str("record_path", payload.get("record_path")),
        "response_format": response_format,
        "max_items": _require_int("max_items", payload.get("max_items"), 10),
        "max_depth": _require_int("max_depth", payload.get("max_depth"), 3),
        "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30),
        "save_raw": _require_bool("save_raw", payload.get("save_raw"), False),
        "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")),
    }


def _ncbi_common_params(params: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of ``params`` with identity fields filled from the environment.

    ``api_key`` comes from ``NCBI_API_KEY`` or ``NCBI_EUTILS_API_KEY``, ``tool``
    from ``NCBI_TOOL`` and ``email`` from ``NCBI_EMAIL``. Values already present
    in ``params`` are never overwritten.
    """
    merged = dict(params)
    api_key = os.environ.get("NCBI_API_KEY") or os.environ.get("NCBI_EUTILS_API_KEY")
    tool = os.environ.get("NCBI_TOOL")
    email = os.environ.get("NCBI_EMAIL")
    if api_key and "api_key" not in merged:
        merged["api_key"] = api_key
    if tool and "tool" not in merged:
        merged["tool"] = tool
    if email and "email" not in merged:
        merged["email"] = email
    return merged


def _json_or_xml_output(
    data: Any, config: dict[str, Any], raw_output_path: str | None
) -> dict[str, Any]:
    """Build the success envelope for decoded JSON or converted XML.

    Uses ``config["record_path"]`` when given, otherwise ``_infer_target``.
    A list target yields ``records`` plus count/truncation metadata; any other
    target yields a compact ``summary`` (and ``top_keys`` for dicts).

    Raises:
        ValueError: If ``record_path`` cannot be resolved in ``data``.
    """
    record_path = config["record_path"]
    path_used, target = (
        _infer_target(data)
        if record_path is None
        else (record_path, _get_by_path(data, record_path))
    )
    output = {
        "ok": True,
        "source": "ncbi-entrez",
        "endpoint": config["endpoint"],
        "record_path": path_used,
        "raw_output_path": raw_output_path,
        "warnings": [],
    }
    if isinstance(target, list):
        output.update(
            {
                "record_count_returned": min(len(target), config["max_items"]),
                "record_count_available": len(target),
                "truncated": len(target) > config["max_items"],
                "records": _compact(
                    target[: config["max_items"]], config["max_items"], config["max_depth"]
                ),
            }
        )
    else:
        output["summary"] = _compact(target, config["max_items"], config["max_depth"])
        if isinstance(target, dict):
            output["top_keys"] = list(target)[: config["max_items"]]
    return output


def _choose_format(response_format: str, content_type: str, text: str) -> str:
    """Decide whether a body is handled as ``"json"``, ``"xml"`` or ``"text"``.

    An explicit ``text`` request always wins. Otherwise JSON is chosen when it
    was requested, the body starts with ``{`` or the content type mentions
    json; then XML when it was requested or the body starts with ``<``.
    """
    if response_format == "text":
        return "text"
    head = text.lstrip()
    if response_format == "json" or head.startswith("{") or "json" in content_type:
        return "json"
    if response_format == "xml" or head.startswith("<"):
        return "xml"
    return "text"


def _redact_secret(message: str, secret: Any) -> str:
    """Replace ``secret`` (raw and URL-encoded) in ``message`` with ``***``."""
    from urllib.parse import quote_plus

    if not secret:
        return message
    raw = str(secret)
    for variant in {raw, quote_plus(raw)}:
        message = message.replace(variant, "***")
    return message


def execute(payload: Any) -> dict[str, Any]:
    """Run one E-utilities GET request and return a compact result.

    Args:
        payload: Decoded JSON input (see ``parse_input``).

    Returns:
        A success envelope (records/summary or ``text_head``) or an ``error``
        envelope with code ``missing_dependency``, ``invalid_response``,
        ``network_error`` or ``io_error``.

    Raises:
        ValueError: If ``payload`` fails validation; ``main`` reports this as
            ``invalid_input``.
    """
    if requests is None:
        return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}")
    config = parse_input(payload)
    query = _ncbi_common_params(config["params"])
    try:
        suffix = config["endpoint"]
        if not suffix.endswith(".fcgi"):
            suffix = f"{suffix}.fcgi"
        url = f"{EUTILS_BASE}/{suffix.lstrip('/')}"
        response = requests.get(url, params=query, timeout=config["timeout_sec"])
        response.raise_for_status()

        content_type = (response.headers.get("content-type") or "").lower()
        text = response.text
        kind = _choose_format(config["response_format"], content_type, text)

        if kind == "json":
            data = response.json()
            raw_output_path = None
            if config["save_raw"]:
                raw_output_path = _save_raw_text(
                    json.dumps(data, indent=2), config["raw_output_path"], "json"
                )
            return _json_or_xml_output(data, config, raw_output_path)

        if kind == "xml":
            root = ET.fromstring(text)
            # NOTE(review): limits are applied during conversion, so a
            # `record_path` cannot see past `max_items`/`max_depth` — confirm
            # this is intended.
            data = {
                root.tag.split("}", 1)[-1]: _xml_to_simple(
                    root, config["max_items"], config["max_depth"]
                )
            }
            raw_output_path = None
            if config["save_raw"]:
                raw_output_path = _save_raw_text(text, config["raw_output_path"], "xml")
            return _json_or_xml_output(data, config, raw_output_path)

        raw_output_path = None
        if config["save_raw"]:
            raw_output_path = _save_raw_text(text, config["raw_output_path"], "txt")
        # The preview is omitted once the full text has been saved to disk.
        text_head = None if raw_output_path else text[:800]
        return {
            "ok": True,
            "source": "ncbi-entrez",
            "endpoint": config["endpoint"],
            "text_head": text_head,
            "text_head_truncated": False if raw_output_path else len(text) > 800,
            "raw_output_path": raw_output_path,
            "warnings": [],
        }
    except ValueError as exc:
        return error("invalid_response", str(exc))
    except ET.ParseError as exc:
        return error("invalid_response", f"Could not parse XML response: {exc}")
    except requests.RequestException as exc:
        # Must precede OSError: requests' exceptions derive from IOError.
        # The exception text can quote the request URL, which carries the API
        # key as a query parameter; never echo it back to the caller.
        detail = _redact_secret(str(exc), query.get("api_key"))
        return error("network_error", f"Request failed: {detail}")
    except OSError as exc:
        return error("io_error", f"Could not save raw response: {exc}")


def main() -> int:
    """Read one JSON object from stdin, write one JSON result to stdout.

    Returns:
        0 on success, 2 for unparsable or invalid input, 1 for other failures.
    """
    try:
        payload = json.load(sys.stdin)
    except Exception as exc:  # noqa: BLE001
        sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}")))
        return 2
    try:
        output = execute(payload)
    except ValueError as exc:
        output = error("invalid_input", str(exc))
        code = 2
    else:
        code = 0 if output.get("ok") else 1
    sys.stdout.write(json.dumps(output))
    return code


if __name__ == "__main__":
    raise SystemExit(main())
+- Optional fields: `params`, `record_path`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common PMC Open Access patterns: + - `{"params":{"id":"PMC3257301"},"max_items":10}` + - `{"params":{"id":"10.1093/nar/gkr1184"},"max_items":10}` + +## Output +- Success returns `ok`, `source`, and a compact `summary`. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"params":{"id":"PMC3257301"},"max_items":10}' | python scripts/ncbi_pmc.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/ncbi_pmc.py`. diff --git a/plugins/life-science-research/skills/ncbi-pmc-skill/agents/openai.yaml b/plugins/life-science-research/skills/ncbi-pmc-skill/agents/openai.yaml new file mode 100644 index 0000000..6e66635 --- /dev/null +++ b/plugins/life-science-research/skills/ncbi-pmc-skill/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "NCBI PMC" + short_description: "Fetch PMC Open Access summaries" diff --git a/plugins/life-science-research/skills/ncbi-pmc-skill/scripts/ncbi_pmc.py b/plugins/life-science-research/skills/ncbi-pmc-skill/scripts/ncbi_pmc.py new file mode 100644 index 0000000..b4fa120 --- /dev/null +++ b/plugins/life-science-research/skills/ncbi-pmc-skill/scripts/ncbi_pmc.py @@ -0,0 +1,256 @@ +#!/usr/bin/env python3 +"""Compact NCBI PMC Open Access helper for imported skills.""" + +from __future__ import annotations + +import json +import os +import sys +import xml.etree.ElementTree as ET +from pathlib import Path +from typing import Any + +try: + import requests +except ImportError as exc: # pragma: no cover + requests = None + REQUESTS_IMPORT_ERROR = exc +else: + REQUESTS_IMPORT_ERROR = None + +PMC_OA_URL = "https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi" + + +def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: + 
return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} + + +def _require_str(name: str, value: Any, required: bool = False) -> str | None: + if value is None: + if required: + raise ValueError(f"`{name}` is required.") + return None + if not isinstance(value, str) or not value.strip(): + raise ValueError(f"`{name}` must be a non-empty string.") + return value.strip() + + +def _require_int(name: str, value: Any, default: int) -> int: + if value is None: + return default + if not isinstance(value, int) or value <= 0: + raise ValueError(f"`{name}` must be a positive integer.") + return value + + +def _require_bool(name: str, value: Any, default: bool) -> bool: + if value is None: + return default + if not isinstance(value, bool): + raise ValueError(f"`{name}` must be a boolean.") + return value + + +def _require_object(name: str, value: Any) -> dict[str, Any]: + if value is None: + return {} + if not isinstance(value, dict): + raise ValueError(f"`{name}` must be an object.") + return value + + +def _get_by_path(value: Any, path: str) -> Any: + current = value + for part in path.split("."): + if isinstance(current, list): + if not part.isdigit(): + raise ValueError(f"`record_path` segment {part!r} must be a list index.") + index = int(part) + if index >= len(current): + raise ValueError(f"`record_path` index {index} is out of range.") + current = current[index] + elif isinstance(current, dict): + if part not in current: + raise ValueError(f"`record_path` key {part!r} was not present in the response.") + current = current[part] + else: + raise ValueError(f"`record_path` segment {part!r} could not be applied.") + return current + + +def _infer_target(data: Any) -> tuple[str | None, Any]: + if isinstance(data, list): + return "$", data + if isinstance(data, dict): + for key in ("records", "items", "documents"): + value = data.get(key) + if isinstance(value, list): + return key, value + return None, data + + +def _compact(value: Any, 
max_items: int, max_depth: int) -> Any: + if isinstance(value, str): + return value if len(value) <= 240 else value[:240] + "..." + if max_depth <= 0: + if isinstance(value, (dict, list)): + return "..." + return value + if isinstance(value, list): + out = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]] + if len(value) > max_items: + out.append(f"... (+{len(value) - max_items} more)") + return out + if isinstance(value, dict): + out: dict[str, Any] = {} + items = list(value.items()) + for key, item in items[:max_items]: + out[str(key)] = _compact(item, max_items, max_depth - 1) + if len(items) > max_items: + out["_truncated_keys"] = len(items) - max_items + return out + return value + + +def _xml_to_simple(elem: ET.Element, max_items: int, max_depth: int) -> Any: + children = list(elem) + text = (elem.text or "").strip() + if not children: + return text + if max_depth <= 0: + return "..." + grouped: dict[str, Any] = {} + for child in children[:max_items]: + tag = child.tag.split("}", 1)[-1] + value = _xml_to_simple(child, max_items, max_depth - 1) + if tag in grouped: + current = grouped[tag] + if not isinstance(current, list): + grouped[tag] = [current] + grouped[tag].append(value) + else: + grouped[tag] = value + if len(children) > max_items: + grouped["_truncated_children"] = len(children) - max_items + if text: + grouped["_text"] = text + return grouped + + +def _save_raw_text(raw_text: str, raw_output_path: str | None, suffix: str) -> str: + path = Path(raw_output_path or f"/tmp/ncbi-pmc.{suffix}") + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(raw_text, encoding="utf-8") + return str(path) + + +def parse_input(payload: Any) -> dict[str, Any]: + if not isinstance(payload, dict): + raise ValueError("Input must be one JSON object.") + return { + "params": _require_object("params", payload.get("params")), + "record_path": _require_str("record_path", payload.get("record_path")), + "max_items": _require_int("max_items", 
payload.get("max_items"), 10), + "max_depth": _require_int("max_depth", payload.get("max_depth"), 3), + "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30), + "save_raw": _require_bool("save_raw", payload.get("save_raw"), False), + "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")), + } + + +def _ncbi_common_params(params: dict[str, Any]) -> dict[str, Any]: + merged = dict(params) + api_key = os.environ.get("NCBI_API_KEY") or os.environ.get("NCBI_EUTILS_API_KEY") + tool = os.environ.get("NCBI_TOOL") + email = os.environ.get("NCBI_EMAIL") + if api_key and "api_key" not in merged: + merged["api_key"] = api_key + if tool and "tool" not in merged: + merged["tool"] = tool + if email and "email" not in merged: + merged["email"] = email + return merged + + +def _summary_output( + data: Any, config: dict[str, Any], raw_output_path: str | None +) -> dict[str, Any]: + record_path = config["record_path"] + path_used, target = ( + _infer_target(data) + if record_path is None + else (record_path, _get_by_path(data, record_path)) + ) + output = { + "ok": True, + "source": "ncbi-pmc-oa", + "record_path": path_used, + "raw_output_path": raw_output_path, + "warnings": [], + } + if isinstance(target, list): + output.update( + { + "record_count_returned": min(len(target), config["max_items"]), + "record_count_available": len(target), + "truncated": len(target) > config["max_items"], + "records": _compact( + target[: config["max_items"]], config["max_items"], config["max_depth"] + ), + } + ) + else: + output["summary"] = _compact(target, config["max_items"], config["max_depth"]) + if isinstance(target, dict): + output["top_keys"] = list(target)[: config["max_items"]] + return output + + +def execute(payload: Any) -> dict[str, Any]: + if requests is None: + return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}") + config = parse_input(payload) + try: + response = requests.get( + PMC_OA_URL, 
params=_ncbi_common_params(config["params"]), timeout=config["timeout_sec"] + ) + response.raise_for_status() + raw_output_path = ( + _save_raw_text(response.text, config["raw_output_path"], "xml") + if config["save_raw"] + else None + ) + root = ET.fromstring(response.text) + data = { + root.tag.split("}", 1)[-1]: _xml_to_simple( + root, config["max_items"], config["max_depth"] + ) + } + return _summary_output(data, config, raw_output_path) + except ValueError as exc: + return error("invalid_response", str(exc)) + except ET.ParseError as exc: + return error("invalid_response", f"Could not parse XML response: {exc}") + except requests.RequestException as exc: + return error("network_error", f"Request failed: {exc}") + + +def main() -> int: + try: + payload = json.load(sys.stdin) + except Exception as exc: # noqa: BLE001 + sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}"))) + return 2 + try: + output = execute(payload) + except ValueError as exc: + output = error("invalid_input", str(exc)) + code = 2 + else: + code = 0 if output.get("ok") else 1 + sys.stdout.write(json.dumps(output)) + return code + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/plugins/life-science-research/skills/opentargets-skill/SKILL.md b/plugins/life-science-research/skills/opentargets-skill/SKILL.md new file mode 100644 index 0000000..2ae0627 --- /dev/null +++ b/plugins/life-science-research/skills/opentargets-skill/SKILL.md @@ -0,0 +1,59 @@ +--- +name: opentargets-skill +description: Submit compact Open Targets Platform GraphQL requests for target, disease, drug, variant, study, and search data, including associated-disease datasource heatmap matrices. Use when a user wants concise Open Targets summaries or per-datasource evidence context +--- + +## Operating rules +- Use `scripts/opentargets_graphql.py` for all Open Targets GraphQL work. 
+- Use `scripts/opentargets_disease_heatmap.py` when the user wants the associated-disease bubble grid or a disease-by-datasource evidence matrix. +- `scripts/opentargets_graphql.py` accepts `max_items`; for nested GraphQL results, start with `max_items=3` to `5`. +- Keep GraphQL selection sets narrow and page connection-style fields conservatively. +- Use `query_path` for long GraphQL documents instead of pasting large inline query strings. +- Re-run requests in long conversations instead of relying on earlier tool output. +- Treat displayed `...` in tool previews as UI truncation, not part of the real query. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Return the JSON verbatim only if the user explicitly asks for machine-readable output. +- Prefer targeted GraphQL queries that select only the fields needed for the user task. +- Use schema introspection only when necessary; do not dump large schema payloads into chat. +- For the associated-disease heatmap, treat `datasourceScores` as evidence-source breadth/context. Do not treat heatmap breadth alone as proof of causal target assignment, mechanism, or direction of effect. + +## Input +- Read one JSON object from stdin. +- Required field for `scripts/opentargets_graphql.py`: `query` or `query_path` (the heatmap helper requires `ensembl_id` instead) +- Optional fields: `variables`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common Open Targets patterns: + - `{"query":"query { __typename }"}` + - `{"query":"query searchAny($q: String!) { search(queryString: $q) { total hits { entity score object { ... on Target { id approvedSymbol } } } } }","variables":{"q":"MST1"},"max_items":3}` + +## Output +- Success returns `ok`, `source`, `top_keys`, a compact `summary`, and `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` such as `invalid_json`, `invalid_input`, `network_error`, `invalid_response`, or `graphql_error`. 
+ +## Execution +```bash +echo '{"query":"query { __typename }"}' | python scripts/opentargets_graphql.py +``` + +Associated-disease heatmap helper: + +```bash +echo '{ + "ensembl_id":"ENSG00000186868", + "page_size":50, + "max_pages":4, + "disease_name_filter":"alzh" +}' | python scripts/opentargets_disease_heatmap.py +``` + +The helper paginates `associatedDiseases`, collects `datasourceScores`, and returns: + +- `matrix.columns`: datasource IDs plus display labels +- `matrix.rows`: diseases with `datasource_scores` +- `summary.rows_preview`: top datasource signals per disease + +Use the disease-name filter as a client-side substring filter similar to the UI. If you later need the overall association score column, inspect the GraphQL row type first before adding candidate fields such as `score` or `associationScore`. + +## References +- No additional runtime references are required; keep the import package limited to this file and the bundled scripts in `scripts/`. diff --git a/plugins/life-science-research/skills/opentargets-skill/agents/openai.yaml b/plugins/life-science-research/skills/opentargets-skill/agents/openai.yaml new file mode 100644 index 0000000..35353c9 --- /dev/null +++ b/plugins/life-science-research/skills/opentargets-skill/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "Open Targets" + short_description: "Fetch Open Targets evidence summaries" diff --git a/plugins/life-science-research/skills/opentargets-skill/scripts/opentargets_disease_heatmap.py b/plugins/life-science-research/skills/opentargets-skill/scripts/opentargets_disease_heatmap.py new file mode 100644 index 0000000..def0c3f --- /dev/null +++ b/plugins/life-science-research/skills/opentargets-skill/scripts/opentargets_disease_heatmap.py @@ -0,0 +1,344 @@ +#!/usr/bin/env python3 +"""Fetch Open Targets associated-disease datasource scores as a heatmap matrix.""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path +from typing 
import Any + +try: + import requests +except ImportError as exc: # pragma: no cover + requests = None + REQUESTS_IMPORT_ERROR = exc +else: + REQUESTS_IMPORT_ERROR = None + +ENDPOINT = "https://api.platform.opentargets.org/api/v4/graphql" + +QUERY = """ +query associatedDiseasesHeatmap($ensemblId: String!, $size: Int!, $index: Int!) { + target(ensemblId: $ensemblId) { + id + approvedSymbol + associatedDiseases(page: { size: $size, index: $index }) { + count + rows { + disease { id name } + datasourceScores { id score } + } + } + } +} +""" + +DATASOURCE_LABELS = { + "ot_genetics_portal": "GWAS associations", + "gene_burden": "Gene Burden", + "eva": "ClinVar", + "gene2phenotype": "Gene2phenotype", + "gene2phenotype_literature": "Gene2phenotype literature", + "genomics_england": "GEL PanelApp", + "uniprot_literature": "UniProt literature", + "uniprot_variants": "UniProt curated variants", + "orphanet": "Orphanet", + "clingen": "ClinGen", + "cancer_gene_census": "Cancer Gene Census", + "intogen": "IntOGen", + "eva_somatic": "ClinVar (somatic)", + "cancer_biomarkers": "Cancer Biomarkers", + "chembl": "ChEMBL", + "crispr_screen": "CRISPR Screens", + "project_score": "Project Score", + "reactome": "Reactome", + "europepmc": "Europe PMC", + "expression_atlas": "Expression Atlas", + "impc": "IMPC", +} + +PREFERRED_COLUMN_ORDER = [ + "ot_genetics_portal", + "gene_burden", + "eva", + "genomics_england", + "gene2phenotype", + "uniprot_literature", + "uniprot_variants", + "orphanet", + "clingen", + "cancer_gene_census", + "intogen", + "eva_somatic", + "cancer_biomarkers", + "chembl", + "crispr_screen", + "project_score", + "reactome", + "europepmc", + "expression_atlas", + "impc", +] + + +def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: + return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} + + +def safe_float(value: Any) -> float | None: + if isinstance(value, (int, float)): + return float(value) + 
if isinstance(value, str): + try: + return float(value) + except ValueError: + return None + return None + + +def prettify_datasource_id(datasource_id: str) -> str: + words = datasource_id.replace("-", "_").split("_") + if not words: + return datasource_id + pieces: list[str] = [] + for word in words: + lowered = word.lower() + if lowered == "pmc": + pieces.append("PMC") + elif lowered == "crispr": + pieces.append("CRISPR") + elif lowered == "impc": + pieces.append("IMPC") + elif lowered == "chembl": + pieces.append("ChEMBL") + else: + pieces.append(word.capitalize()) + return " ".join(pieces) + + +def label_for_datasource(datasource_id: str) -> str: + return DATASOURCE_LABELS.get(datasource_id, prettify_datasource_id(datasource_id)) + + +def parse_input(payload: Any) -> dict[str, Any]: + if not isinstance(payload, dict): + raise ValueError("Input must be one JSON object.") + ensembl_id = payload.get("ensembl_id") or payload.get("ensemblId") + if not isinstance(ensembl_id, str) or not ensembl_id.strip(): + raise ValueError("Provide `ensembl_id`.") + page_size = payload.get("page_size", payload.get("size", 50)) + max_pages = payload.get("max_pages", 10) + if not isinstance(page_size, int) or page_size <= 0: + raise ValueError("`page_size` must be a positive integer.") + if not isinstance(max_pages, int) or max_pages <= 0: + raise ValueError("`max_pages` must be a positive integer.") + disease_filter = payload.get("disease_name_filter") or payload.get("diseaseNameFilter") + if disease_filter is not None and not isinstance(disease_filter, str): + raise ValueError("`disease_name_filter` must be a string.") + save_raw = payload.get("save_raw", False) + if not isinstance(save_raw, bool): + raise ValueError("`save_raw` must be a boolean.") + raw_output_path = payload.get("raw_output_path") + if raw_output_path is not None and ( + not isinstance(raw_output_path, str) or not raw_output_path.strip() + ): + raise ValueError("`raw_output_path` must be a non-empty string.") + 
return { + "ensembl_id": ensembl_id.strip(), + "page_size": page_size, + "max_pages": max_pages, + "disease_name_filter": disease_filter.strip() if isinstance(disease_filter, str) else None, + "save_raw": save_raw, + "raw_output_path": raw_output_path.strip() if isinstance(raw_output_path, str) else None, + } + + +def fetch_page(ensembl_id: str, page_size: int, page_index: int) -> dict[str, Any]: + response = requests.post( + ENDPOINT, + json={ + "query": QUERY, + "variables": {"ensemblId": ensembl_id, "size": page_size, "index": page_index}, + }, + timeout=60, + ) + response.raise_for_status() + data = response.json() + if "errors" in data: + raise RuntimeError(json.dumps(data["errors"])[:500]) + payload = data.get("data") + if not isinstance(payload, dict): + raise RuntimeError("GraphQL response did not include a `data` object.") + target = payload.get("target") + if not isinstance(target, dict): + raise RuntimeError("Target was not found in the GraphQL response.") + return target + + +def sort_datasource_ids(datasource_ids: set[str]) -> list[str]: + preferred_index = {item: idx for idx, item in enumerate(PREFERRED_COLUMN_ORDER)} + return sorted( + datasource_ids, + key=lambda item: ( + preferred_index.get(item, len(preferred_index)), + label_for_datasource(item), + ), + ) + + +def build_top_datasources(score_map: dict[str, float], limit: int = 3) -> list[dict[str, Any]]: + ranked = sorted(score_map.items(), key=lambda item: item[1], reverse=True)[:limit] + return [ + {"id": datasource_id, "label": label_for_datasource(datasource_id), "score": score} + for datasource_id, score in ranked + ] + + +def execute(payload: Any) -> dict[str, Any]: + if requests is None: + return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}") + + config = parse_input(payload) + fetched_rows: list[dict[str, Any]] = [] + raw_pages: list[dict[str, Any]] = [] + warnings: list[str] = [] + total_count: int | None = None + target_id: str | None = None + 
approved_symbol: str | None = None + + try: + for page_index in range(config["max_pages"]): + target = fetch_page(config["ensembl_id"], config["page_size"], page_index) + if target_id is None: + target_id = str(target.get("id") or config["ensembl_id"]) + if approved_symbol is None and target.get("approvedSymbol"): + approved_symbol = str(target.get("approvedSymbol")) + associated = target.get("associatedDiseases") or {} + if not isinstance(associated, dict): + raise RuntimeError("`associatedDiseases` was missing from the target payload.") + if total_count is None and isinstance(associated.get("count"), int): + total_count = int(associated["count"]) + rows = associated.get("rows") or [] + if not isinstance(rows, list): + raise RuntimeError("`associatedDiseases.rows` was not a list.") + fetched_rows.extend(rows) + raw_pages.append({"index": page_index, "rows": rows}) + if not rows or len(rows) < config["page_size"]: + break + if total_count is not None and len(fetched_rows) >= total_count: + break + else: + warnings.append( + f"Stopped after `max_pages={config['max_pages']}` before exhausting associated disease pages." 
+ ) + except ValueError as exc: + return error("invalid_response", str(exc), warnings=warnings) + except requests.RequestException as exc: + return error("network_error", f"GraphQL request failed: {exc}", warnings=warnings) + except RuntimeError as exc: + return error("graphql_error", str(exc), warnings=warnings) + + disease_filter = config["disease_name_filter"] + filtered_rows: list[dict[str, Any]] = [] + datasource_ids: set[str] = set() + for row in fetched_rows: + disease = row.get("disease") or {} + if not isinstance(disease, dict): + continue + disease_name = str(disease.get("name") or "").strip() + if disease_filter and disease_filter.casefold() not in disease_name.casefold(): + continue + score_map: dict[str, float] = {} + for item in row.get("datasourceScores") or []: + if not isinstance(item, dict): + continue + datasource_id = str(item.get("id") or "").strip() + score = safe_float(item.get("score")) + if not datasource_id or score is None: + continue + score_map[datasource_id] = score + datasource_ids.add(datasource_id) + filtered_rows.append( + { + "disease_id": str(disease.get("id") or ""), + "disease_name": disease_name, + "datasource_scores": score_map, + } + ) + + ordered_datasource_ids = sort_datasource_ids(datasource_ids) + columns = [ + {"id": datasource_id, "label": label_for_datasource(datasource_id)} + for datasource_id in ordered_datasource_ids + ] + row_preview = [ + { + "disease_name": row["disease_name"], + "top_datasources": build_top_datasources(row["datasource_scores"]), + } + for row in filtered_rows[:5] + ] + + raw_output_path = None + if config["save_raw"]: + raw_text = json.dumps( + { + "query_name": "associatedDiseasesHeatmap", + "target": {"id": target_id, "approvedSymbol": approved_symbol}, + "page_size": config["page_size"], + "max_pages": config["max_pages"], + "pages": raw_pages, + }, + indent=2, + ) + path = Path(config["raw_output_path"] or "/tmp/opentargets-associated-diseases.json") + path.parent.mkdir(parents=True, 
exist_ok=True) + path.write_text(raw_text, encoding="utf-8") + raw_output_path = str(path) + + if disease_filter and not filtered_rows: + warnings.append(f"No diseases matched `disease_name_filter={disease_filter}`.") + + return { + "ok": True, + "source": "opentargets-disease-heatmap", + "summary": { + "target": {"id": target_id or config["ensembl_id"], "approved_symbol": approved_symbol}, + "pages_fetched": len(raw_pages), + "fetched_rows": len(fetched_rows), + "returned_rows": len(filtered_rows), + "total_count": total_count, + "disease_name_filter": disease_filter, + "columns": columns, + "rows_preview": row_preview, + }, + "matrix": { + "target": {"id": target_id or config["ensembl_id"], "approved_symbol": approved_symbol}, + "columns": columns, + "rows": filtered_rows, + }, + "raw_output_path": raw_output_path, + "warnings": warnings, + } + + +def main() -> int: + try: + payload = json.load(sys.stdin) + except Exception as exc: # noqa: BLE001 + sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}"))) + return 2 + try: + output = execute(payload) + except ValueError as exc: + output = error("invalid_input", str(exc)) + code = 2 + else: + code = 0 if output.get("ok") else 1 + sys.stdout.write(json.dumps(output)) + return code + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/plugins/life-science-research/skills/opentargets-skill/scripts/opentargets_graphql.py b/plugins/life-science-research/skills/opentargets-skill/scripts/opentargets_graphql.py new file mode 100644 index 0000000..f7d624f --- /dev/null +++ b/plugins/life-science-research/skills/opentargets-skill/scripts/opentargets_graphql.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +"""Compact Open Targets GraphQL client for ChatGPT-imported skills.""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path +from typing import Any + +try: + import requests +except ImportError as exc: # pragma: no cover + requests = None 
+ REQUESTS_IMPORT_ERROR = exc +else: + REQUESTS_IMPORT_ERROR = None + +ENDPOINT = "https://api.platform.opentargets.org/api/v4/graphql" + + +def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: + return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} + + +def _compact(value: Any, max_items: int, max_depth: int) -> Any: + if isinstance(value, str): + return value if len(value) <= 240 else value[:240] + "..." + if max_depth <= 0: + if isinstance(value, (dict, list)): + return "..." + return value + if isinstance(value, list): + out = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]] + if len(value) > max_items: + out.append(f"... (+{len(value) - max_items} more)") + return out + if isinstance(value, dict): + out: dict[str, Any] = {} + items = list(value.items()) + for key, item in items[:max_items]: + out[str(key)] = _compact(item, max_items, max_depth - 1) + if len(items) > max_items: + out["_truncated_keys"] = len(items) - max_items + return out + return value + + +def parse_input(payload: Any) -> dict[str, Any]: + if not isinstance(payload, dict): + raise ValueError("Input must be one JSON object.") + query = payload.get("query") + query_path = payload.get("query_path") + if query is None and query_path is None: + raise ValueError("Provide `query` or `query_path`.") + if query is not None and (not isinstance(query, str) or not query.strip()): + raise ValueError("`query` must be a non-empty string.") + if query_path is not None and (not isinstance(query_path, str) or not query_path.strip()): + raise ValueError("`query_path` must be a non-empty string.") + variables = payload.get("variables") or {} + if not isinstance(variables, dict): + raise ValueError("`variables` must be an object.") + save_raw = payload.get("save_raw", False) + if not isinstance(save_raw, bool): + raise ValueError("`save_raw` must be a boolean.") + raw_output_path = payload.get("raw_output_path") + 
if raw_output_path is not None and ( + not isinstance(raw_output_path, str) or not raw_output_path.strip() + ): + raise ValueError("`raw_output_path` must be a non-empty string.") + for key in ("max_items", "max_depth", "timeout_sec"): + value = payload.get(key, 5 if key == "max_items" else 3 if key == "max_depth" else 60) + if not isinstance(value, int) or value <= 0: + raise ValueError(f"`{key}` must be a positive integer.") + payload[key] = value + query_text = ( + query.strip() if isinstance(query, str) else Path(query_path).read_text(encoding="utf-8") + ) + return { + "query": query_text, + "variables": variables, + "max_items": payload["max_items"], + "max_depth": payload["max_depth"], + "timeout_sec": payload["timeout_sec"], + "save_raw": save_raw, + "raw_output_path": raw_output_path.strip() if isinstance(raw_output_path, str) else None, + } + + +def execute(payload: Any) -> dict[str, Any]: + if requests is None: + return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}") + config = parse_input(payload) + try: + response = requests.post( + ENDPOINT, + json={"query": config["query"], "variables": config["variables"]}, + timeout=config["timeout_sec"], + ) + response.raise_for_status() + data = response.json() + except ValueError as exc: + return error("invalid_response", str(exc)) + except requests.RequestException as exc: + return error("network_error", f"GraphQL request failed: {exc}") + + raw_output_path = None + if config["save_raw"]: + raw_text = json.dumps(data, indent=2) + path = Path(config["raw_output_path"] or "/tmp/opentargets-graphql.json") + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(raw_text, encoding="utf-8") + raw_output_path = str(path) + + if "errors" in data: + return error( + "graphql_error", + json.dumps(data["errors"])[:500], + warnings=[f"raw_output_path={raw_output_path}"] if raw_output_path else [], + ) + + payload_data = data.get("data") + if not isinstance(payload_data, dict): + 
return error("invalid_response", "GraphQL response did not include a `data` object.") + + return { + "ok": True, + "source": "opentargets-graphql", + "top_keys": list(payload_data)[: config["max_items"]], + "summary": _compact(payload_data, config["max_items"], config["max_depth"]), + "raw_output_path": raw_output_path, + "warnings": [], + } + + +def main() -> int: + try: + payload = json.load(sys.stdin) + except Exception as exc: # noqa: BLE001 + sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}"))) + return 2 + try: + output = execute(payload) + except ValueError as exc: + output = error("invalid_input", str(exc)) + code = 2 + else: + code = 0 if output.get("ok") else 1 + sys.stdout.write(json.dumps(output)) + return code + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/plugins/life-science-research/skills/pharmgkb-skill/SKILL.md b/plugins/life-science-research/skills/pharmgkb-skill/SKILL.md new file mode 100644 index 0000000..8a72c64 --- /dev/null +++ b/plugins/life-science-research/skills/pharmgkb-skill/SKILL.md @@ -0,0 +1,38 @@ +--- +name: pharmgkb-skill +description: Submit compact PharmGKB API requests for genes, variants, clinical annotations, dosing guidelines, and search. Use when a user wants concise PharmGKB summaries +--- + +## Operating rules +- Use `scripts/rest_request.py` for all PharmGKB API calls. +- Use `base_url=https://api.pharmgkb.org/v1/data`. +- Single object lookups usually do not need `max_items`; list and search endpoints are better with `max_items=10`. +- Re-run requests in long conversations instead of relying on older tool output. +- Treat displayed `...` in tool previews as UI truncation, not literal request content. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Prefer these paths: `gene/`, `variant/`, `clinicalAnnotation`, `dosingGuideline`, and search endpoints. 
+- If the user needs the full payload, set `save_raw=true` and report the saved file path. + +## Input +- Read one JSON object from stdin. +- Required fields: `base_url`, `path` +- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common PharmGKB patterns: + - `{"base_url":"https://api.pharmgkb.org/v1/data","path":"gene/PA36679"}` + - `{"base_url":"https://api.pharmgkb.org/v1/data","path":"clinicalAnnotation","params":{"relatedChemicals.accessionId":"PA449726","limit":10},"max_items":10}` + - `{"base_url":"https://api.pharmgkb.org/v1/data","path":"variant/PA166158545"}` + +## Output +- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records` or a compact `summary`. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"base_url":"https://api.pharmgkb.org/v1/data","path":"gene/PA36679"}' | python scripts/rest_request.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. 
diff --git a/plugins/life-science-research/skills/pharmgkb-skill/agents/openai.yaml b/plugins/life-science-research/skills/pharmgkb-skill/agents/openai.yaml new file mode 100644 index 0000000..54a71f6 --- /dev/null +++ b/plugins/life-science-research/skills/pharmgkb-skill/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "PharmGKB" + short_description: "Fetch PharmGKB pharmacogenomics summaries" diff --git a/plugins/life-science-research/skills/pharmgkb-skill/scripts/rest_request.py b/plugins/life-science-research/skills/pharmgkb-skill/scripts/rest_request.py new file mode 100644 index 0000000..4de3cf1 --- /dev/null +++ b/plugins/life-science-research/skills/pharmgkb-skill/scripts/rest_request.py @@ -0,0 +1,291 @@ +#!/usr/bin/env python3 +"""Generic compact REST client for ChatGPT-imported skills.""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path +from typing import Any + +try: + import requests +except ImportError as exc: # pragma: no cover + requests = None + REQUESTS_IMPORT_ERROR = exc +else: + REQUESTS_IMPORT_ERROR = None + + +def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: + return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} + + +def _require_object(name: str, value: Any) -> dict[str, Any]: + if value is None: + return {} + if not isinstance(value, dict): + raise ValueError(f"`{name}` must be an object.") + return value + + +def _require_bool(name: str, value: Any, default: bool) -> bool: + if value is None: + return default + if not isinstance(value, bool): + raise ValueError(f"`{name}` must be a boolean.") + return value + + +def _require_int(name: str, value: Any, default: int) -> int: + if value is None: + return default + if not isinstance(value, int) or value <= 0: + raise ValueError(f"`{name}` must be a positive integer.") + return value + + +def _require_str(name: str, value: Any, required: bool = False) -> str | 
None: + if value is None: + if required: + raise ValueError(f"`{name}` is required.") + return None + if not isinstance(value, str) or not value.strip(): + raise ValueError(f"`{name}` must be a non-empty string.") + return value.strip() + + +def _service_name(base_url: str) -> str: + host = base_url.split("://", 1)[-1].split("/", 1)[0] + return host.replace(".", "-") + + +def _build_url(base_url: str, path: str) -> str: + if path.startswith(("http://", "https://")): + return path + return base_url.rstrip("/") + "/" + path.lstrip("/") + + +def _get_by_path(value: Any, path: str) -> Any: + current = value + for part in path.split("."): + if isinstance(current, list): + if not part.isdigit(): + raise ValueError(f"`record_path` segment {part!r} must be a list index.") + index = int(part) + if index >= len(current): + raise ValueError(f"`record_path` index {index} is out of range.") + current = current[index] + elif isinstance(current, dict): + if part not in current: + raise ValueError(f"`record_path` key {part!r} was not present in the response.") + current = current[part] + else: + raise ValueError(f"`record_path` segment {part!r} could not be applied.") + return current + + +def _infer_target(data: Any) -> tuple[str | None, Any]: + if isinstance(data, list): + return "$", data + if isinstance(data, dict): + embedded = data.get("_embedded") + if isinstance(embedded, dict): + for key, value in embedded.items(): + if isinstance(value, list): + return f"_embedded.{key}", value + for key in ( + "collection", + "results", + "structures", + "activities", + "molecules", + "mechanisms", + "records", + "items", + ): + value = data.get(key) + if isinstance(value, list): + return key, value + return None, data + + +def _compact(value: Any, max_items: int, max_depth: int) -> Any: + if isinstance(value, str): + return value if len(value) <= 240 else value[:240] + "..." + if max_depth <= 0: + if isinstance(value, (dict, list)): + return "..." 
+ return value + if isinstance(value, list): + out = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]] + if len(value) > max_items: + out.append(f"... (+{len(value) - max_items} more)") + return out + if isinstance(value, dict): + out: dict[str, Any] = {} + items = list(value.items()) + for key, item in items[:max_items]: + out[str(key)] = _compact(item, max_items, max_depth - 1) + if len(items) > max_items: + out["_truncated_keys"] = len(items) - max_items + return out + return value + + +def _save_raw_output( + raw_output: str, raw_output_path: str | None, base_url: str, suffix: str +) -> str: + path = Path(raw_output_path or f"/tmp/{_service_name(base_url)}-raw.{suffix}") + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(raw_output, encoding="utf-8") + return str(path) + + +def parse_input(payload: Any) -> dict[str, Any]: + if not isinstance(payload, dict): + raise ValueError("Input must be one JSON object.") + base_url = _require_str("base_url", payload.get("base_url"), required=True) + path = _require_str("path", payload.get("path"), required=True) + method = (_require_str("method", payload.get("method")) or "GET").upper() + if method not in {"GET", "POST"}: + raise ValueError("`method` must be GET or POST.") + json_body = payload.get("json_body") + form_body = payload.get("form_body") + if json_body is not None and form_body is not None: + raise ValueError("Provide only one of `json_body` or `form_body`.") + response_format = ( + _require_str("response_format", payload.get("response_format")) or "auto" + ).lower() + if response_format not in {"auto", "json", "text"}: + raise ValueError("`response_format` must be auto, json, or text.") + return { + "base_url": base_url, + "path": path, + "method": method, + "params": _require_object("params", payload.get("params")), + "headers": _require_object("headers", payload.get("headers")), + "json_body": json_body, + "form_body": _require_object("form_body", form_body) if 
form_body is not None else None, + "record_path": _require_str("record_path", payload.get("record_path")), + "response_format": response_format, + "max_items": _require_int("max_items", payload.get("max_items"), 5), + "max_depth": _require_int("max_depth", payload.get("max_depth"), 3), + "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30), + "save_raw": _require_bool("save_raw", payload.get("save_raw"), False), + "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")), + } + + +def execute(payload: Any) -> dict[str, Any]: + if requests is None: + return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}") + config = parse_input(payload) + session = requests.Session() + session.headers.update(config["headers"]) + url = _build_url(config["base_url"], config["path"]) + + request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]} + if config["json_body"] is not None: + request_kwargs["json"] = config["json_body"] + if config["form_body"] is not None: + request_kwargs["data"] = config["form_body"] + + try: + response = session.request(config["method"], url, **request_kwargs) + response.raise_for_status() + content_type = (response.headers.get("content-type") or "").lower() + wants_json = config["response_format"] == "json" + wants_text = config["response_format"] == "text" + auto_json = not wants_text and ( + "json" in content_type or response.text.lstrip().startswith(("{", "[")) + ) + + if wants_json or auto_json: + data = response.json() + raw_output = json.dumps(data, indent=2) + raw_output_path = None + if config["save_raw"]: + raw_output_path = _save_raw_output( + raw_output, config["raw_output_path"], config["base_url"], "json" + ) + + record_path = config["record_path"] + path_used, target = ( + _infer_target(data) + if record_path is None + else (record_path, _get_by_path(data, record_path)) + ) + out = { + "ok": True, + "source": 
_service_name(config["base_url"]), + "path": config["path"], + "method": config["method"], + "status_code": response.status_code, + "record_path": path_used, + "raw_output_path": raw_output_path, + "warnings": [], + } + if isinstance(target, list): + records = target[: config["max_items"]] + out.update( + { + "record_count_returned": len(records), + "record_count_available": len(target), + "truncated": len(records) < len(target), + "records": _compact(records, config["max_items"], config["max_depth"]), + } + ) + else: + out["summary"] = _compact(target, config["max_items"], config["max_depth"]) + if isinstance(target, dict): + out["top_keys"] = list(target)[: config["max_items"]] + return out + + raw_output_path = None + if config["save_raw"]: + raw_output_path = _save_raw_output( + response.text, config["raw_output_path"], config["base_url"], "txt" + ) + text_head = response.text[:800] + return { + "ok": True, + "source": _service_name(config["base_url"]), + "path": config["path"], + "method": config["method"], + "status_code": response.status_code, + "content_type": content_type, + "text_head": None if raw_output_path else text_head, + "text_head_truncated": False + if raw_output_path + else len(text_head) < len(response.text), + "raw_output_path": raw_output_path, + "warnings": [], + } + except ValueError as exc: + return error("invalid_response", str(exc)) + except requests.RequestException as exc: + return error("network_error", f"Request failed: {exc}") + finally: + session.close() + + +def main() -> int: + try: + payload = json.load(sys.stdin) + except Exception as exc: # noqa: BLE001 + sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}"))) + return 2 + try: + output = execute(payload) + except ValueError as exc: + output = error("invalid_input", str(exc)) + code = 2 + else: + code = 0 if output.get("ok") else 1 + sys.stdout.write(json.dumps(output)) + return code + + +if __name__ == "__main__": + raise SystemExit(main()) 
diff --git a/plugins/life-science-research/skills/pride-skill/SKILL.md b/plugins/life-science-research/skills/pride-skill/SKILL.md new file mode 100644 index 0000000..d172534 --- /dev/null +++ b/plugins/life-science-research/skills/pride-skill/SKILL.md @@ -0,0 +1,37 @@ +--- +name: pride-skill +description: Submit compact PRIDE Archive API requests for proteomics project discovery and project-level metadata. Use when a user wants concise PRIDE summaries. +--- + +## Operating rules +- Use `scripts/rest_request.py` for all PRIDE Archive calls. +- Use `base_url=https://www.ebi.ac.uk/pride/ws/archive/v2`. +- Start with `projects` for discovery and keep page sizes modest. +- Prefer project-level metadata lookups over broad archive dumps. +- Re-run requests in long conversations instead of relying on older tool output. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Return raw JSON only if the user explicitly asks for machine-readable output. +- Prefer these paths: `projects` and `projects/{accession}` (for example `projects/PXD001357`). + +## Input +- Read one JSON object from stdin. +- Required fields: `base_url`, `path` +- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common PRIDE patterns: + - `{"base_url":"https://www.ebi.ac.uk/pride/ws/archive/v2","path":"projects","params":{"keyword":"proteomics","pageSize":10},"max_items":10}` + - `{"base_url":"https://www.ebi.ac.uk/pride/ws/archive/v2","path":"projects/PXD001357"}` + +## Output +- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records` or a compact `summary`. +- When `save_raw=true`, read the saved file location from `raw_output_path`. +- Failure returns `ok=false` with `error.code` and `error.message`.
+ +## Execution +```bash +echo '{"base_url":"https://www.ebi.ac.uk/pride/ws/archive/v2","path":"projects","params":{"keyword":"proteomics","pageSize":10},"max_items":10}' | python scripts/rest_request.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. diff --git a/plugins/life-science-research/skills/pride-skill/agents/openai.yaml b/plugins/life-science-research/skills/pride-skill/agents/openai.yaml new file mode 100644 index 0000000..fa2b133 --- /dev/null +++ b/plugins/life-science-research/skills/pride-skill/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "PRIDE" + short_description: "Fetch PRIDE proteomics summaries" diff --git a/plugins/life-science-research/skills/pride-skill/scripts/rest_request.py b/plugins/life-science-research/skills/pride-skill/scripts/rest_request.py new file mode 100644 index 0000000..4de3cf1 --- /dev/null +++ b/plugins/life-science-research/skills/pride-skill/scripts/rest_request.py @@ -0,0 +1,291 @@ +#!/usr/bin/env python3 +"""Generic compact REST client for ChatGPT-imported skills.""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path +from typing import Any + +try: + import requests +except ImportError as exc: # pragma: no cover + requests = None + REQUESTS_IMPORT_ERROR = exc +else: + REQUESTS_IMPORT_ERROR = None + + +def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: + return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} + + +def _require_object(name: str, value: Any) -> dict[str, Any]: + if value is None: + return {} + if not isinstance(value, dict): + raise ValueError(f"`{name}` must be an object.") + return value + + +def _require_bool(name: str, value: Any, default: bool) -> bool: + if value is None: + return default + if not isinstance(value, bool): + raise ValueError(f"`{name}` must be a 
boolean.") + return value + + +def _require_int(name: str, value: Any, default: int) -> int: + if value is None: + return default + if not isinstance(value, int) or value <= 0: + raise ValueError(f"`{name}` must be a positive integer.") + return value + + +def _require_str(name: str, value: Any, required: bool = False) -> str | None: + if value is None: + if required: + raise ValueError(f"`{name}` is required.") + return None + if not isinstance(value, str) or not value.strip(): + raise ValueError(f"`{name}` must be a non-empty string.") + return value.strip() + + +def _service_name(base_url: str) -> str: + host = base_url.split("://", 1)[-1].split("/", 1)[0] + return host.replace(".", "-") + + +def _build_url(base_url: str, path: str) -> str: + if path.startswith(("http://", "https://")): + return path + return base_url.rstrip("/") + "/" + path.lstrip("/") + + +def _get_by_path(value: Any, path: str) -> Any: + current = value + for part in path.split("."): + if isinstance(current, list): + if not part.isdigit(): + raise ValueError(f"`record_path` segment {part!r} must be a list index.") + index = int(part) + if index >= len(current): + raise ValueError(f"`record_path` index {index} is out of range.") + current = current[index] + elif isinstance(current, dict): + if part not in current: + raise ValueError(f"`record_path` key {part!r} was not present in the response.") + current = current[part] + else: + raise ValueError(f"`record_path` segment {part!r} could not be applied.") + return current + + +def _infer_target(data: Any) -> tuple[str | None, Any]: + if isinstance(data, list): + return "$", data + if isinstance(data, dict): + embedded = data.get("_embedded") + if isinstance(embedded, dict): + for key, value in embedded.items(): + if isinstance(value, list): + return f"_embedded.{key}", value + for key in ( + "collection", + "results", + "structures", + "activities", + "molecules", + "mechanisms", + "records", + "items", + ): + value = data.get(key) + if 
isinstance(value, list): + return key, value + return None, data + + +def _compact(value: Any, max_items: int, max_depth: int) -> Any: + if isinstance(value, str): + return value if len(value) <= 240 else value[:240] + "..." + if max_depth <= 0: + if isinstance(value, (dict, list)): + return "..." + return value + if isinstance(value, list): + out = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]] + if len(value) > max_items: + out.append(f"... (+{len(value) - max_items} more)") + return out + if isinstance(value, dict): + out: dict[str, Any] = {} + items = list(value.items()) + for key, item in items[:max_items]: + out[str(key)] = _compact(item, max_items, max_depth - 1) + if len(items) > max_items: + out["_truncated_keys"] = len(items) - max_items + return out + return value + + +def _save_raw_output( + raw_output: str, raw_output_path: str | None, base_url: str, suffix: str +) -> str: + path = Path(raw_output_path or f"/tmp/{_service_name(base_url)}-raw.{suffix}") + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(raw_output, encoding="utf-8") + return str(path) + + +def parse_input(payload: Any) -> dict[str, Any]: + if not isinstance(payload, dict): + raise ValueError("Input must be one JSON object.") + base_url = _require_str("base_url", payload.get("base_url"), required=True) + path = _require_str("path", payload.get("path"), required=True) + method = (_require_str("method", payload.get("method")) or "GET").upper() + if method not in {"GET", "POST"}: + raise ValueError("`method` must be GET or POST.") + json_body = payload.get("json_body") + form_body = payload.get("form_body") + if json_body is not None and form_body is not None: + raise ValueError("Provide only one of `json_body` or `form_body`.") + response_format = ( + _require_str("response_format", payload.get("response_format")) or "auto" + ).lower() + if response_format not in {"auto", "json", "text"}: + raise ValueError("`response_format` must be auto, json, or 
text.") + return { + "base_url": base_url, + "path": path, + "method": method, + "params": _require_object("params", payload.get("params")), + "headers": _require_object("headers", payload.get("headers")), + "json_body": json_body, + "form_body": _require_object("form_body", form_body) if form_body is not None else None, + "record_path": _require_str("record_path", payload.get("record_path")), + "response_format": response_format, + "max_items": _require_int("max_items", payload.get("max_items"), 5), + "max_depth": _require_int("max_depth", payload.get("max_depth"), 3), + "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30), + "save_raw": _require_bool("save_raw", payload.get("save_raw"), False), + "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")), + } + + +def execute(payload: Any) -> dict[str, Any]: + if requests is None: + return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}") + config = parse_input(payload) + session = requests.Session() + session.headers.update(config["headers"]) + url = _build_url(config["base_url"], config["path"]) + + request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]} + if config["json_body"] is not None: + request_kwargs["json"] = config["json_body"] + if config["form_body"] is not None: + request_kwargs["data"] = config["form_body"] + + try: + response = session.request(config["method"], url, **request_kwargs) + response.raise_for_status() + content_type = (response.headers.get("content-type") or "").lower() + wants_json = config["response_format"] == "json" + wants_text = config["response_format"] == "text" + auto_json = not wants_text and ( + "json" in content_type or response.text.lstrip().startswith(("{", "[")) + ) + + if wants_json or auto_json: + data = response.json() + raw_output = json.dumps(data, indent=2) + raw_output_path = None + if config["save_raw"]: + raw_output_path = _save_raw_output( + 
raw_output, config["raw_output_path"], config["base_url"], "json" + ) + + record_path = config["record_path"] + path_used, target = ( + _infer_target(data) + if record_path is None + else (record_path, _get_by_path(data, record_path)) + ) + out = { + "ok": True, + "source": _service_name(config["base_url"]), + "path": config["path"], + "method": config["method"], + "status_code": response.status_code, + "record_path": path_used, + "raw_output_path": raw_output_path, + "warnings": [], + } + if isinstance(target, list): + records = target[: config["max_items"]] + out.update( + { + "record_count_returned": len(records), + "record_count_available": len(target), + "truncated": len(records) < len(target), + "records": _compact(records, config["max_items"], config["max_depth"]), + } + ) + else: + out["summary"] = _compact(target, config["max_items"], config["max_depth"]) + if isinstance(target, dict): + out["top_keys"] = list(target)[: config["max_items"]] + return out + + raw_output_path = None + if config["save_raw"]: + raw_output_path = _save_raw_output( + response.text, config["raw_output_path"], config["base_url"], "txt" + ) + text_head = response.text[:800] + return { + "ok": True, + "source": _service_name(config["base_url"]), + "path": config["path"], + "method": config["method"], + "status_code": response.status_code, + "content_type": content_type, + "text_head": None if raw_output_path else text_head, + "text_head_truncated": False + if raw_output_path + else len(text_head) < len(response.text), + "raw_output_path": raw_output_path, + "warnings": [], + } + except ValueError as exc: + return error("invalid_response", str(exc)) + except requests.RequestException as exc: + return error("network_error", f"Request failed: {exc}") + finally: + session.close() + + +def main() -> int: + try: + payload = json.load(sys.stdin) + except Exception as exc: # noqa: BLE001 + sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}"))) + return 2 + 
try: + output = execute(payload) + except ValueError as exc: + output = error("invalid_input", str(exc)) + code = 2 + else: + code = 0 if output.get("ok") else 1 + sys.stdout.write(json.dumps(output)) + return code + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/plugins/life-science-research/skills/proteomexchange-skill/SKILL.md b/plugins/life-science-research/skills/proteomexchange-skill/SKILL.md new file mode 100644 index 0000000..f67cf36 --- /dev/null +++ b/plugins/life-science-research/skills/proteomexchange-skill/SKILL.md @@ -0,0 +1,39 @@ +--- +name: proteomexchange-skill +description: Submit compact ProteomeXchange PROXI requests for datasets, libraries, peptidoforms, proteins, PSMs, spectra, and USI examples. Use when a user wants concise PROXI summaries +--- + +## Operating rules +- Use `scripts/rest_request.py` for all ProteomeXchange PROXI calls. +- Use `base_url=https://proteomecentral.proteomexchange.org/api/proxi/v0.1`. +- Collection endpoints are better with `max_items=10`; targeted identifier lookups usually do not need `max_items`. +- Keep requests narrow by identifier, spectrum, or dataset whenever possible. +- Re-run requests in long conversations instead of relying on older tool output. +- Treat displayed `...` in tool previews as UI truncation, not literal request content. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Prefer these paths: `datasets`, `datasets/`, `libraries`, `peptidoforms`, `proteins`, `psms`, `spectra`, and `usi_examples`. +- If the user needs the full payload, set `save_raw=true` and report the saved file path. + +## Input +- Read one JSON object from stdin. 
+- Required fields: `base_url`, `path` +- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common PROXI patterns: + - `{"base_url":"https://proteomecentral.proteomexchange.org/api/proxi/v0.1","path":"datasets","max_items":10}` + - `{"base_url":"https://proteomecentral.proteomexchange.org/api/proxi/v0.1","path":"datasets/PXD000001"}` + - `{"base_url":"https://proteomecentral.proteomexchange.org/api/proxi/v0.1","path":"usi_examples","max_items":10}` + +## Output +- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records` or a compact `summary`. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"base_url":"https://proteomecentral.proteomexchange.org/api/proxi/v0.1","path":"datasets","max_items":10}' | python scripts/rest_request.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. 
diff --git a/plugins/life-science-research/skills/proteomexchange-skill/agents/openai.yaml b/plugins/life-science-research/skills/proteomexchange-skill/agents/openai.yaml new file mode 100644 index 0000000..8bac552 --- /dev/null +++ b/plugins/life-science-research/skills/proteomexchange-skill/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "ProteomeXchange" + short_description: "Fetch ProteomeXchange PROXI summaries" diff --git a/plugins/life-science-research/skills/proteomexchange-skill/scripts/rest_request.py b/plugins/life-science-research/skills/proteomexchange-skill/scripts/rest_request.py new file mode 100644 index 0000000..4de3cf1 --- /dev/null +++ b/plugins/life-science-research/skills/proteomexchange-skill/scripts/rest_request.py @@ -0,0 +1,291 @@ +#!/usr/bin/env python3 +"""Generic compact REST client for ChatGPT-imported skills.""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path +from typing import Any + +try: + import requests +except ImportError as exc: # pragma: no cover + requests = None + REQUESTS_IMPORT_ERROR = exc +else: + REQUESTS_IMPORT_ERROR = None + + +def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]: + return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []} + + +def _require_object(name: str, value: Any) -> dict[str, Any]: + if value is None: + return {} + if not isinstance(value, dict): + raise ValueError(f"`{name}` must be an object.") + return value + + +def _require_bool(name: str, value: Any, default: bool) -> bool: + if value is None: + return default + if not isinstance(value, bool): + raise ValueError(f"`{name}` must be a boolean.") + return value + + +def _require_int(name: str, value: Any, default: int) -> int: + if value is None: + return default + if not isinstance(value, int) or value <= 0: + raise ValueError(f"`{name}` must be a positive integer.") + return value + + +def _require_str(name: str, 
value: Any, required: bool = False) -> str | None: + if value is None: + if required: + raise ValueError(f"`{name}` is required.") + return None + if not isinstance(value, str) or not value.strip(): + raise ValueError(f"`{name}` must be a non-empty string.") + return value.strip() + + +def _service_name(base_url: str) -> str: + host = base_url.split("://", 1)[-1].split("/", 1)[0] + return host.replace(".", "-") + + +def _build_url(base_url: str, path: str) -> str: + if path.startswith(("http://", "https://")): + return path + return base_url.rstrip("/") + "/" + path.lstrip("/") + + +def _get_by_path(value: Any, path: str) -> Any: + current = value + for part in path.split("."): + if isinstance(current, list): + if not part.isdigit(): + raise ValueError(f"`record_path` segment {part!r} must be a list index.") + index = int(part) + if index >= len(current): + raise ValueError(f"`record_path` index {index} is out of range.") + current = current[index] + elif isinstance(current, dict): + if part not in current: + raise ValueError(f"`record_path` key {part!r} was not present in the response.") + current = current[part] + else: + raise ValueError(f"`record_path` segment {part!r} could not be applied.") + return current + + +def _infer_target(data: Any) -> tuple[str | None, Any]: + if isinstance(data, list): + return "$", data + if isinstance(data, dict): + embedded = data.get("_embedded") + if isinstance(embedded, dict): + for key, value in embedded.items(): + if isinstance(value, list): + return f"_embedded.{key}", value + for key in ( + "collection", + "results", + "structures", + "activities", + "molecules", + "mechanisms", + "records", + "items", + ): + value = data.get(key) + if isinstance(value, list): + return key, value + return None, data + + +def _compact(value: Any, max_items: int, max_depth: int) -> Any: + if isinstance(value, str): + return value if len(value) <= 240 else value[:240] + "..." 
+ if max_depth <= 0: + if isinstance(value, (dict, list)): + return "..." + return value + if isinstance(value, list): + out = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]] + if len(value) > max_items: + out.append(f"... (+{len(value) - max_items} more)") + return out + if isinstance(value, dict): + out: dict[str, Any] = {} + items = list(value.items()) + for key, item in items[:max_items]: + out[str(key)] = _compact(item, max_items, max_depth - 1) + if len(items) > max_items: + out["_truncated_keys"] = len(items) - max_items + return out + return value + + +def _save_raw_output( + raw_output: str, raw_output_path: str | None, base_url: str, suffix: str +) -> str: + path = Path(raw_output_path or f"/tmp/{_service_name(base_url)}-raw.{suffix}") + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(raw_output, encoding="utf-8") + return str(path) + + +def parse_input(payload: Any) -> dict[str, Any]: + if not isinstance(payload, dict): + raise ValueError("Input must be one JSON object.") + base_url = _require_str("base_url", payload.get("base_url"), required=True) + path = _require_str("path", payload.get("path"), required=True) + method = (_require_str("method", payload.get("method")) or "GET").upper() + if method not in {"GET", "POST"}: + raise ValueError("`method` must be GET or POST.") + json_body = payload.get("json_body") + form_body = payload.get("form_body") + if json_body is not None and form_body is not None: + raise ValueError("Provide only one of `json_body` or `form_body`.") + response_format = ( + _require_str("response_format", payload.get("response_format")) or "auto" + ).lower() + if response_format not in {"auto", "json", "text"}: + raise ValueError("`response_format` must be auto, json, or text.") + return { + "base_url": base_url, + "path": path, + "method": method, + "params": _require_object("params", payload.get("params")), + "headers": _require_object("headers", payload.get("headers")), + "json_body": 
json_body, + "form_body": _require_object("form_body", form_body) if form_body is not None else None, + "record_path": _require_str("record_path", payload.get("record_path")), + "response_format": response_format, + "max_items": _require_int("max_items", payload.get("max_items"), 5), + "max_depth": _require_int("max_depth", payload.get("max_depth"), 3), + "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30), + "save_raw": _require_bool("save_raw", payload.get("save_raw"), False), + "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")), + } + + +def execute(payload: Any) -> dict[str, Any]: + if requests is None: + return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}") + config = parse_input(payload) + session = requests.Session() + session.headers.update(config["headers"]) + url = _build_url(config["base_url"], config["path"]) + + request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]} + if config["json_body"] is not None: + request_kwargs["json"] = config["json_body"] + if config["form_body"] is not None: + request_kwargs["data"] = config["form_body"] + + try: + response = session.request(config["method"], url, **request_kwargs) + response.raise_for_status() + content_type = (response.headers.get("content-type") or "").lower() + wants_json = config["response_format"] == "json" + wants_text = config["response_format"] == "text" + auto_json = not wants_text and ( + "json" in content_type or response.text.lstrip().startswith(("{", "[")) + ) + + if wants_json or auto_json: + data = response.json() + raw_output = json.dumps(data, indent=2) + raw_output_path = None + if config["save_raw"]: + raw_output_path = _save_raw_output( + raw_output, config["raw_output_path"], config["base_url"], "json" + ) + + record_path = config["record_path"] + path_used, target = ( + _infer_target(data) + if record_path is None + else (record_path, _get_by_path(data, 
record_path)) + ) + out = { + "ok": True, + "source": _service_name(config["base_url"]), + "path": config["path"], + "method": config["method"], + "status_code": response.status_code, + "record_path": path_used, + "raw_output_path": raw_output_path, + "warnings": [], + } + if isinstance(target, list): + records = target[: config["max_items"]] + out.update( + { + "record_count_returned": len(records), + "record_count_available": len(target), + "truncated": len(records) < len(target), + "records": _compact(records, config["max_items"], config["max_depth"]), + } + ) + else: + out["summary"] = _compact(target, config["max_items"], config["max_depth"]) + if isinstance(target, dict): + out["top_keys"] = list(target)[: config["max_items"]] + return out + + raw_output_path = None + if config["save_raw"]: + raw_output_path = _save_raw_output( + response.text, config["raw_output_path"], config["base_url"], "txt" + ) + text_head = response.text[:800] + return { + "ok": True, + "source": _service_name(config["base_url"]), + "path": config["path"], + "method": config["method"], + "status_code": response.status_code, + "content_type": content_type, + "text_head": None if raw_output_path else text_head, + "text_head_truncated": False + if raw_output_path + else len(text_head) < len(response.text), + "raw_output_path": raw_output_path, + "warnings": [], + } + except ValueError as exc: + return error("invalid_response", str(exc)) + except requests.RequestException as exc: + return error("network_error", f"Request failed: {exc}") + finally: + session.close() + + +def main() -> int: + try: + payload = json.load(sys.stdin) + except Exception as exc: # noqa: BLE001 + sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}"))) + return 2 + try: + output = execute(payload) + except ValueError as exc: + output = error("invalid_input", str(exc)) + code = 2 + else: + code = 0 if output.get("ok") else 1 + sys.stdout.write(json.dumps(output)) + return code + + 
+if __name__ == "__main__": + raise SystemExit(main()) diff --git a/plugins/life-science-research/skills/pubchem-pug-skill/SKILL.md b/plugins/life-science-research/skills/pubchem-pug-skill/SKILL.md new file mode 100644 index 0000000..84b1e1d --- /dev/null +++ b/plugins/life-science-research/skills/pubchem-pug-skill/SKILL.md @@ -0,0 +1,38 @@ +--- +name: pubchem-pug-skill +description: Submit compact PubChem PUG REST requests for compound properties, descriptions, assay summaries, and substance metadata. Use when a user wants concise PubChem summaries +--- + +## Operating rules +- Use `scripts/rest_request.py` for all PubChem PUG calls. +- Use `base_url=https://pubchem.ncbi.nlm.nih.gov/rest/pug`. +- Property and description endpoints usually return a single focused record; assay or broader list endpoints are better with `max_items=10`. +- Re-run requests in long conversations instead of relying on older tool output. +- Treat displayed `...` in tool previews as UI truncation, not literal request content. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Prefer property, description, assay summary, and substance paths instead of broad record dumps. +- If the user needs the full payload, set `save_raw=true` and report the saved file path. + +## Input +- Read one JSON object from stdin. 
+- Required fields: `base_url`, `path` +- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common PubChem patterns: + - `{"base_url":"https://pubchem.ncbi.nlm.nih.gov/rest/pug","path":"compound/name/aspirin/property/MolecularFormula,MolecularWeight/JSON","record_path":"PropertyTable.Properties"}` + - `{"base_url":"https://pubchem.ncbi.nlm.nih.gov/rest/pug","path":"compound/cid/2244/description/JSON","record_path":"InformationList.Information","max_items":10}` + - `{"base_url":"https://pubchem.ncbi.nlm.nih.gov/rest/pug","path":"assay/aid/1706/summary/JSON","record_path":"AssaySummaries","max_items":10}` + +## Output +- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records`, a compact `summary`, or (for non-JSON responses) `text_head`. +- `raw_output_path` holds the saved file location when `save_raw=true` and is `null` otherwise. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"base_url":"https://pubchem.ncbi.nlm.nih.gov/rest/pug","path":"compound/name/aspirin/property/MolecularFormula,MolecularWeight/JSON","record_path":"PropertyTable.Properties"}' | python scripts/rest_request.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`.
#!/usr/bin/env python3
"""Generic compact REST client for ChatGPT-imported skills.

Reads one JSON object describing a request from stdin, performs a single GET
or POST through ``requests``, and writes one JSON object to stdout: a compact
preview of the response on success, or ``{"ok": false, "error": {...}}`` on
failure.
"""

from __future__ import annotations

import json
import sys
from pathlib import Path
from typing import Any

try:
    import requests
except ImportError as exc:  # pragma: no cover
    requests = None
    REQUESTS_IMPORT_ERROR = exc
else:
    REQUESTS_IMPORT_ERROR = None

# Strings longer than this are cut and suffixed with "..." in previews.
_MAX_STRING_CHARS = 240
# Number of leading characters of a non-JSON body echoed back as `text_head`.
_TEXT_HEAD_CHARS = 800
# Top-level keys probed, in this order, when no `record_path` is supplied.
_RECORD_LIST_KEYS = (
    "collection",
    "results",
    "structures",
    "activities",
    "molecules",
    "mechanisms",
    "records",
    "items",
)


def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]:
    """Build the failure object: ``ok`` false, an ``error`` pair, and ``warnings``."""
    return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []}


def _require_object(name: str, value: Any) -> dict[str, Any]:
    """Return *value* if it is a dict, ``{}`` if it is None; otherwise raise ValueError."""
    if value is None:
        return {}
    if not isinstance(value, dict):
        raise ValueError(f"`{name}` must be an object.")
    return value


def _require_bool(name: str, value: Any, default: bool) -> bool:
    """Return *value* if it is a bool, *default* if it is None; otherwise raise ValueError."""
    if value is None:
        return default
    if not isinstance(value, bool):
        raise ValueError(f"`{name}` must be a boolean.")
    return value


def _require_int(name: str, value: Any, default: int) -> int:
    """Return *value* if it is a positive int, *default* if it is None.

    Raises:
        ValueError: for non-integers, booleans, zero, and negative numbers.
    """
    if value is None:
        return default
    # bool is a subclass of int, so JSON `true` would otherwise be accepted as 1.
    if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
        raise ValueError(f"`{name}` must be a positive integer.")
    return value


def _require_str(name: str, value: Any, required: bool = False) -> str | None:
    """Return *value* stripped of surrounding whitespace, or None when absent.

    Raises:
        ValueError: if *value* is None while *required* is true, or if it is
            not a string, or is empty/whitespace-only.
    """
    if value is None:
        if required:
            raise ValueError(f"`{name}` is required.")
        return None
    if not isinstance(value, str) or not value.strip():
        raise ValueError(f"`{name}` must be a non-empty string.")
    return value.strip()


def _service_name(base_url: str) -> str:
    """Return the part of *base_url* between the scheme and the first ``/``, dots replaced by dashes."""
    host = base_url.split("://", 1)[-1].split("/", 1)[0]
    return host.replace(".", "-")


def _build_url(base_url: str, path: str) -> str:
    """Join *base_url* and *path* with exactly one slash.

    A *path* that already starts with ``http://`` or ``https://`` is returned
    unchanged and *base_url* is ignored.
    """
    if path.startswith(("http://", "https://")):
        return path
    return base_url.rstrip("/") + "/" + path.lstrip("/")


def _get_by_path(value: Any, path: str) -> Any:
    """Walk *value* along the dot-separated *path* and return what is found there.

    Segments index lists (decimal digits only) or look up dict keys.

    Raises:
        ValueError: if a segment cannot be applied to the current node.
    """
    current = value
    for part in path.split("."):
        if isinstance(current, list):
            if not part.isdigit():
                raise ValueError(f"`record_path` segment {part!r} must be a list index.")
            index = int(part)
            if index >= len(current):
                raise ValueError(f"`record_path` index {index} is out of range.")
            current = current[index]
        elif isinstance(current, dict):
            if part not in current:
                raise ValueError(f"`record_path` key {part!r} was not present in the response.")
            current = current[part]
        else:
            raise ValueError(f"`record_path` segment {part!r} could not be applied.")
    return current


def _infer_target(data: Any) -> tuple[str | None, Any]:
    """Guess which part of *data* holds the records.

    Returns ``("$", data)`` for a top-level list; for a dict, the first list
    found under ``_embedded`` and then under the keys in ``_RECORD_LIST_KEYS``;
    otherwise ``(None, data)``.
    """
    if isinstance(data, list):
        return "$", data
    if isinstance(data, dict):
        embedded = data.get("_embedded")
        if isinstance(embedded, dict):
            for key, value in embedded.items():
                if isinstance(value, list):
                    return f"_embedded.{key}", value
        for key in _RECORD_LIST_KEYS:
            value = data.get(key)
            if isinstance(value, list):
                return key, value
    return None, data


def _compact(value: Any, max_items: int, max_depth: int) -> Any:
    """Return a size-bounded copy of *value* for display.

    Long strings are cut to ``_MAX_STRING_CHARS`` plus ``"..."``. Lists and
    dicts keep their first *max_items* entries, with a marker for the rest,
    and collapse to ``"..."`` once *max_depth* is exhausted. Other scalars
    pass through untouched.
    """
    if isinstance(value, str):
        if len(value) <= _MAX_STRING_CHARS:
            return value
        return value[:_MAX_STRING_CHARS] + "..."
    if not isinstance(value, (dict, list)):
        return value
    if max_depth <= 0:
        return "..."
    if isinstance(value, list):
        preview = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]]
        if len(value) > max_items:
            preview.append(f"... (+{len(value) - max_items} more)")
        return preview
    compacted: dict[str, Any] = {}
    items = list(value.items())
    for key, item in items[:max_items]:
        compacted[str(key)] = _compact(item, max_items, max_depth - 1)
    if len(items) > max_items:
        compacted["_truncated_keys"] = len(items) - max_items
    return compacted


def _save_raw_output(
    raw_output: str, raw_output_path: str | None, base_url: str, suffix: str
) -> str:
    """Write *raw_output* as UTF-8 and return the path that was written.

    Falls back to ``/tmp/<service>-raw.<suffix>`` when *raw_output_path* is
    not given; missing parent directories are created.
    """
    path = Path(raw_output_path or f"/tmp/{_service_name(base_url)}-raw.{suffix}")
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(raw_output, encoding="utf-8")
    return str(path)


def parse_input(payload: Any) -> dict[str, Any]:
    """Validate the request description and fill in defaults.

    Returns:
        A config dict with keys ``base_url``, ``path``, ``method``, ``params``,
        ``headers``, ``json_body``, ``form_body``, ``record_path``,
        ``response_format``, ``max_items``, ``max_depth``, ``timeout_sec``,
        ``save_raw`` and ``raw_output_path``.

    Raises:
        ValueError: if *payload* is not a dict or any field fails validation.
    """
    if not isinstance(payload, dict):
        raise ValueError("Input must be one JSON object.")
    base_url = _require_str("base_url", payload.get("base_url"), required=True)
    path = _require_str("path", payload.get("path"), required=True)
    method = (_require_str("method", payload.get("method")) or "GET").upper()
    if method not in {"GET", "POST"}:
        raise ValueError("`method` must be GET or POST.")
    json_body = payload.get("json_body")
    form_body = payload.get("form_body")
    if json_body is not None and form_body is not None:
        raise ValueError("Provide only one of `json_body` or `form_body`.")
    response_format = (
        _require_str("response_format", payload.get("response_format")) or "auto"
    ).lower()
    if response_format not in {"auto", "json", "text"}:
        raise ValueError("`response_format` must be auto, json, or text.")
    return {
        "base_url": base_url,
        "path": path,
        "method": method,
        "params": _require_object("params", payload.get("params")),
        "headers": _require_object("headers", payload.get("headers")),
        "json_body": json_body,
        "form_body": _require_object("form_body", form_body) if form_body is not None else None,
        "record_path": _require_str("record_path", payload.get("record_path")),
        "response_format": response_format,
        "max_items": _require_int("max_items", payload.get("max_items"), 5),
        "max_depth": _require_int("max_depth", payload.get("max_depth"), 3),
        "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30),
        "save_raw": _require_bool("save_raw", payload.get("save_raw"), False),
        "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")),
    }


def _summarize_json(config: dict[str, Any], response: Any, data: Any) -> dict[str, Any]:
    """Build the success object for a decoded JSON body (``records`` or ``summary``)."""
    raw_output_path = None
    if config["save_raw"]:
        # Only pay for the pretty-printed dump when it is actually written out.
        raw_output_path = _save_raw_output(
            json.dumps(data, indent=2), config["raw_output_path"], config["base_url"], "json"
        )

    record_path = config["record_path"]
    if record_path is None:
        path_used, target = _infer_target(data)
    else:
        path_used, target = record_path, _get_by_path(data, record_path)

    out: dict[str, Any] = {
        "ok": True,
        "source": _service_name(config["base_url"]),
        "path": config["path"],
        "method": config["method"],
        "status_code": response.status_code,
        "record_path": path_used,
        "raw_output_path": raw_output_path,
        "warnings": [],
    }
    if isinstance(target, list):
        records = target[: config["max_items"]]
        out.update(
            {
                "record_count_returned": len(records),
                "record_count_available": len(target),
                "truncated": len(records) < len(target),
                "records": _compact(records, config["max_items"], config["max_depth"]),
            }
        )
    else:
        out["summary"] = _compact(target, config["max_items"], config["max_depth"])
        if isinstance(target, dict):
            out["top_keys"] = list(target)[: config["max_items"]]
    return out


def _summarize_text(config: dict[str, Any], response: Any, content_type: str) -> dict[str, Any]:
    """Build the success object for a non-JSON body (``text_head`` or a saved file)."""
    raw_output_path = None
    if config["save_raw"]:
        raw_output_path = _save_raw_output(
            response.text, config["raw_output_path"], config["base_url"], "txt"
        )
    text_head = response.text[:_TEXT_HEAD_CHARS]
    return {
        "ok": True,
        "source": _service_name(config["base_url"]),
        "path": config["path"],
        "method": config["method"],
        "status_code": response.status_code,
        "content_type": content_type,
        # When the body was saved to disk the preview is suppressed entirely.
        "text_head": None if raw_output_path else text_head,
        "text_head_truncated": False
        if raw_output_path
        else len(text_head) < len(response.text),
        "raw_output_path": raw_output_path,
        "warnings": [],
    }


def execute(payload: Any) -> dict[str, Any]:
    """Run the request described by *payload* and return the result object.

    Returns a success object from ``_summarize_json`` / ``_summarize_text``,
    or an ``error(...)`` object with one of these codes:

    * ``missing_dependency`` - ``requests`` could not be imported.
    * ``invalid_response`` - the body was expected to be JSON but did not
      parse, or ``record_path`` could not be applied to it.
    * ``network_error`` - any ``requests.RequestException``, including the
      ``HTTPError`` raised by ``raise_for_status()``.
    * ``raw_output_error`` - the raw output file could not be written.

    Raises:
        ValueError: propagated from ``parse_input`` for an invalid *payload*.
    """
    if requests is None:
        return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}")
    config = parse_input(payload)
    url = _build_url(config["base_url"], config["path"])

    request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]}
    if config["json_body"] is not None:
        request_kwargs["json"] = config["json_body"]
    if config["form_body"] is not None:
        request_kwargs["data"] = config["form_body"]

    with requests.Session() as session:
        session.headers.update(config["headers"])
        try:
            response = session.request(config["method"], url, **request_kwargs)
            response.raise_for_status()
            content_type = (response.headers.get("content-type") or "").lower()

            response_format = config["response_format"]
            declared_json = "json" in content_type
            # In auto mode a body that merely starts with "{" or "[" is only a guess.
            sniffed_json = (
                response_format == "auto"
                and not declared_json
                and response.text.lstrip().startswith(("{", "["))
            )
            if response_format == "json" or (response_format == "auto" and declared_json) or sniffed_json:
                try:
                    data = response.json()
                except ValueError:
                    if not sniffed_json:
                        raise
                    # The guess was wrong: treat the body as plain text instead of failing.
                else:
                    return _summarize_json(config, response, data)
            return _summarize_text(config, response, content_type)
        except ValueError as exc:
            return error("invalid_response", str(exc))
        except requests.RequestException as exc:
            # Must precede OSError: RequestException is itself an OSError subclass.
            return error("network_error", f"Request failed: {exc}")
        except OSError as exc:
            return error("raw_output_error", f"Could not save raw output: {exc}")


def main() -> int:
    """Read the request from stdin, write the result to stdout, return the exit code.

    Exit codes: 0 on success, 1 when the request ran but failed, 2 when the
    input was not valid JSON or failed validation.
    """
    try:
        payload = json.load(sys.stdin)
    except Exception as exc:  # noqa: BLE001
        sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}")))
        return 2
    try:
        output = execute(payload)
    except ValueError as exc:
        output = error("invalid_input", str(exc))
        code = 2
    else:
        code = 0 if output.get("ok") else 1
    sys.stdout.write(json.dumps(output))
    return code


if __name__ == "__main__":
    raise SystemExit(main())
diff --git a/plugins/life-science-research/skills/quickgo-skill/SKILL.md b/plugins/life-science-research/skills/quickgo-skill/SKILL.md new file mode 100644 index 0000000..3252c80 --- /dev/null +++ b/plugins/life-science-research/skills/quickgo-skill/SKILL.md @@ -0,0 +1,39 @@ +--- +name: quickgo-skill +description: Submit compact QuickGO requests for GO terms, annotations, and ontology traversal. Use when a user wants concise QuickGO summaries +--- + +## Operating rules +- Use `scripts/rest_request.py` for all QuickGO API calls. +- Use `base_url=https://www.ebi.ac.uk/QuickGO/services`. +- GO term lookups usually do not need `max_items`; annotation and traversal endpoints are better with `limit=10` and `max_items=10`. +- Send `Accept: application/json` in `headers`. +- Re-run requests in long conversations instead of relying on older tool output. +- Treat displayed `...` in tool previews as UI truncation, not literal request content. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Prefer these paths: `ontology/go/terms/`, `annotation/search`, and ontology child or ancestor endpoints. +- Treat `annotation/search` as upstream-fragile when QuickGO's annotation Solr backend is unavailable; fall back to ontology term lookup or UniProt GO annotations when appropriate. +- If the user needs the full payload, set `save_raw=true` and report the saved file path. + +## Input +- Read one JSON object from stdin. 
+- Required fields: `base_url`, `path` +- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common QuickGO patterns: + - `{"base_url":"https://www.ebi.ac.uk/QuickGO/services","path":"ontology/go/terms/GO:0008150,GO:0003674","headers":{"Accept":"application/json"},"record_path":"results","max_items":10}` + - `{"base_url":"https://www.ebi.ac.uk/QuickGO/services","path":"annotation/search","params":{"geneProductId":"P04637","limit":10},"headers":{"Accept":"application/json"},"record_path":"results","max_items":10}` + +## Output +- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records`, a compact `summary`, or (for non-JSON responses) `text_head`. +- `raw_output_path` holds the saved file location when `save_raw=true` and is `null` otherwise. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"base_url":"https://www.ebi.ac.uk/QuickGO/services","path":"ontology/go/terms/GO:0006915","headers":{"Accept":"application/json"},"record_path":"results","max_items":10}' | python scripts/rest_request.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`.
#!/usr/bin/env python3
"""Generic compact REST client for ChatGPT-imported skills.

Reads one JSON object describing a request from stdin, performs a single GET
or POST through ``requests``, and writes one JSON object to stdout: a compact
preview of the response on success, or ``{"ok": false, "error": {...}}`` on
failure.
"""

from __future__ import annotations

import json
import sys
from pathlib import Path
from typing import Any

try:
    import requests
except ImportError as exc:  # pragma: no cover
    requests = None
    REQUESTS_IMPORT_ERROR = exc
else:
    REQUESTS_IMPORT_ERROR = None

# Strings longer than this are cut and suffixed with "..." in previews.
_MAX_STRING_CHARS = 240
# Number of leading characters of a non-JSON body echoed back as `text_head`.
_TEXT_HEAD_CHARS = 800
# Top-level keys probed, in this order, when no `record_path` is supplied.
_RECORD_LIST_KEYS = (
    "collection",
    "results",
    "structures",
    "activities",
    "molecules",
    "mechanisms",
    "records",
    "items",
)


def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]:
    """Build the failure object: ``ok`` false, an ``error`` pair, and ``warnings``."""
    return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []}


def _require_object(name: str, value: Any) -> dict[str, Any]:
    """Return *value* if it is a dict, ``{}`` if it is None; otherwise raise ValueError."""
    if value is None:
        return {}
    if not isinstance(value, dict):
        raise ValueError(f"`{name}` must be an object.")
    return value


def _require_bool(name: str, value: Any, default: bool) -> bool:
    """Return *value* if it is a bool, *default* if it is None; otherwise raise ValueError."""
    if value is None:
        return default
    if not isinstance(value, bool):
        raise ValueError(f"`{name}` must be a boolean.")
    return value


def _require_int(name: str, value: Any, default: int) -> int:
    """Return *value* if it is a positive int, *default* if it is None.

    Raises:
        ValueError: for non-integers, booleans, zero, and negative numbers.
    """
    if value is None:
        return default
    # bool is a subclass of int, so JSON `true` would otherwise be accepted as 1.
    if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
        raise ValueError(f"`{name}` must be a positive integer.")
    return value


def _require_str(name: str, value: Any, required: bool = False) -> str | None:
    """Return *value* stripped of surrounding whitespace, or None when absent.

    Raises:
        ValueError: if *value* is None while *required* is true, or if it is
            not a string, or is empty/whitespace-only.
    """
    if value is None:
        if required:
            raise ValueError(f"`{name}` is required.")
        return None
    if not isinstance(value, str) or not value.strip():
        raise ValueError(f"`{name}` must be a non-empty string.")
    return value.strip()


def _service_name(base_url: str) -> str:
    """Return the part of *base_url* between the scheme and the first ``/``, dots replaced by dashes."""
    host = base_url.split("://", 1)[-1].split("/", 1)[0]
    return host.replace(".", "-")


def _build_url(base_url: str, path: str) -> str:
    """Join *base_url* and *path* with exactly one slash.

    A *path* that already starts with ``http://`` or ``https://`` is returned
    unchanged and *base_url* is ignored.
    """
    if path.startswith(("http://", "https://")):
        return path
    return base_url.rstrip("/") + "/" + path.lstrip("/")


def _get_by_path(value: Any, path: str) -> Any:
    """Walk *value* along the dot-separated *path* and return what is found there.

    Segments index lists (decimal digits only) or look up dict keys.

    Raises:
        ValueError: if a segment cannot be applied to the current node.
    """
    current = value
    for part in path.split("."):
        if isinstance(current, list):
            if not part.isdigit():
                raise ValueError(f"`record_path` segment {part!r} must be a list index.")
            index = int(part)
            if index >= len(current):
                raise ValueError(f"`record_path` index {index} is out of range.")
            current = current[index]
        elif isinstance(current, dict):
            if part not in current:
                raise ValueError(f"`record_path` key {part!r} was not present in the response.")
            current = current[part]
        else:
            raise ValueError(f"`record_path` segment {part!r} could not be applied.")
    return current


def _infer_target(data: Any) -> tuple[str | None, Any]:
    """Guess which part of *data* holds the records.

    Returns ``("$", data)`` for a top-level list; for a dict, the first list
    found under ``_embedded`` and then under the keys in ``_RECORD_LIST_KEYS``;
    otherwise ``(None, data)``.
    """
    if isinstance(data, list):
        return "$", data
    if isinstance(data, dict):
        embedded = data.get("_embedded")
        if isinstance(embedded, dict):
            for key, value in embedded.items():
                if isinstance(value, list):
                    return f"_embedded.{key}", value
        for key in _RECORD_LIST_KEYS:
            value = data.get(key)
            if isinstance(value, list):
                return key, value
    return None, data


def _compact(value: Any, max_items: int, max_depth: int) -> Any:
    """Return a size-bounded copy of *value* for display.

    Long strings are cut to ``_MAX_STRING_CHARS`` plus ``"..."``. Lists and
    dicts keep their first *max_items* entries, with a marker for the rest,
    and collapse to ``"..."`` once *max_depth* is exhausted. Other scalars
    pass through untouched.
    """
    if isinstance(value, str):
        if len(value) <= _MAX_STRING_CHARS:
            return value
        return value[:_MAX_STRING_CHARS] + "..."
    if not isinstance(value, (dict, list)):
        return value
    if max_depth <= 0:
        return "..."
    if isinstance(value, list):
        preview = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]]
        if len(value) > max_items:
            preview.append(f"... (+{len(value) - max_items} more)")
        return preview
    compacted: dict[str, Any] = {}
    items = list(value.items())
    for key, item in items[:max_items]:
        compacted[str(key)] = _compact(item, max_items, max_depth - 1)
    if len(items) > max_items:
        compacted["_truncated_keys"] = len(items) - max_items
    return compacted


def _save_raw_output(
    raw_output: str, raw_output_path: str | None, base_url: str, suffix: str
) -> str:
    """Write *raw_output* as UTF-8 and return the path that was written.

    Falls back to ``/tmp/<service>-raw.<suffix>`` when *raw_output_path* is
    not given; missing parent directories are created.
    """
    path = Path(raw_output_path or f"/tmp/{_service_name(base_url)}-raw.{suffix}")
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(raw_output, encoding="utf-8")
    return str(path)


def parse_input(payload: Any) -> dict[str, Any]:
    """Validate the request description and fill in defaults.

    Returns:
        A config dict with keys ``base_url``, ``path``, ``method``, ``params``,
        ``headers``, ``json_body``, ``form_body``, ``record_path``,
        ``response_format``, ``max_items``, ``max_depth``, ``timeout_sec``,
        ``save_raw`` and ``raw_output_path``.

    Raises:
        ValueError: if *payload* is not a dict or any field fails validation.
    """
    if not isinstance(payload, dict):
        raise ValueError("Input must be one JSON object.")
    base_url = _require_str("base_url", payload.get("base_url"), required=True)
    path = _require_str("path", payload.get("path"), required=True)
    method = (_require_str("method", payload.get("method")) or "GET").upper()
    if method not in {"GET", "POST"}:
        raise ValueError("`method` must be GET or POST.")
    json_body = payload.get("json_body")
    form_body = payload.get("form_body")
    if json_body is not None and form_body is not None:
        raise ValueError("Provide only one of `json_body` or `form_body`.")
    response_format = (
        _require_str("response_format", payload.get("response_format")) or "auto"
    ).lower()
    if response_format not in {"auto", "json", "text"}:
        raise ValueError("`response_format` must be auto, json, or text.")
    return {
        "base_url": base_url,
        "path": path,
        "method": method,
        "params": _require_object("params", payload.get("params")),
        "headers": _require_object("headers", payload.get("headers")),
        "json_body": json_body,
        "form_body": _require_object("form_body", form_body) if form_body is not None else None,
        "record_path": _require_str("record_path", payload.get("record_path")),
        "response_format": response_format,
        "max_items": _require_int("max_items", payload.get("max_items"), 5),
        "max_depth": _require_int("max_depth", payload.get("max_depth"), 3),
        "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30),
        "save_raw": _require_bool("save_raw", payload.get("save_raw"), False),
        "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")),
    }


def _summarize_json(config: dict[str, Any], response: Any, data: Any) -> dict[str, Any]:
    """Build the success object for a decoded JSON body (``records`` or ``summary``)."""
    raw_output_path = None
    if config["save_raw"]:
        # Only pay for the pretty-printed dump when it is actually written out.
        raw_output_path = _save_raw_output(
            json.dumps(data, indent=2), config["raw_output_path"], config["base_url"], "json"
        )

    record_path = config["record_path"]
    if record_path is None:
        path_used, target = _infer_target(data)
    else:
        path_used, target = record_path, _get_by_path(data, record_path)

    out: dict[str, Any] = {
        "ok": True,
        "source": _service_name(config["base_url"]),
        "path": config["path"],
        "method": config["method"],
        "status_code": response.status_code,
        "record_path": path_used,
        "raw_output_path": raw_output_path,
        "warnings": [],
    }
    if isinstance(target, list):
        records = target[: config["max_items"]]
        out.update(
            {
                "record_count_returned": len(records),
                "record_count_available": len(target),
                "truncated": len(records) < len(target),
                "records": _compact(records, config["max_items"], config["max_depth"]),
            }
        )
    else:
        out["summary"] = _compact(target, config["max_items"], config["max_depth"])
        if isinstance(target, dict):
            out["top_keys"] = list(target)[: config["max_items"]]
    return out


def _summarize_text(config: dict[str, Any], response: Any, content_type: str) -> dict[str, Any]:
    """Build the success object for a non-JSON body (``text_head`` or a saved file)."""
    raw_output_path = None
    if config["save_raw"]:
        raw_output_path = _save_raw_output(
            response.text, config["raw_output_path"], config["base_url"], "txt"
        )
    text_head = response.text[:_TEXT_HEAD_CHARS]
    return {
        "ok": True,
        "source": _service_name(config["base_url"]),
        "path": config["path"],
        "method": config["method"],
        "status_code": response.status_code,
        "content_type": content_type,
        # When the body was saved to disk the preview is suppressed entirely.
        "text_head": None if raw_output_path else text_head,
        "text_head_truncated": False
        if raw_output_path
        else len(text_head) < len(response.text),
        "raw_output_path": raw_output_path,
        "warnings": [],
    }


def execute(payload: Any) -> dict[str, Any]:
    """Run the request described by *payload* and return the result object.

    Returns a success object from ``_summarize_json`` / ``_summarize_text``,
    or an ``error(...)`` object with one of these codes:

    * ``missing_dependency`` - ``requests`` could not be imported.
    * ``invalid_response`` - the body was expected to be JSON but did not
      parse, or ``record_path`` could not be applied to it.
    * ``network_error`` - any ``requests.RequestException``, including the
      ``HTTPError`` raised by ``raise_for_status()``.
    * ``raw_output_error`` - the raw output file could not be written.

    Raises:
        ValueError: propagated from ``parse_input`` for an invalid *payload*.
    """
    if requests is None:
        return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}")
    config = parse_input(payload)
    url = _build_url(config["base_url"], config["path"])

    request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]}
    if config["json_body"] is not None:
        request_kwargs["json"] = config["json_body"]
    if config["form_body"] is not None:
        request_kwargs["data"] = config["form_body"]

    with requests.Session() as session:
        session.headers.update(config["headers"])
        try:
            response = session.request(config["method"], url, **request_kwargs)
            response.raise_for_status()
            content_type = (response.headers.get("content-type") or "").lower()

            response_format = config["response_format"]
            declared_json = "json" in content_type
            # In auto mode a body that merely starts with "{" or "[" is only a guess.
            sniffed_json = (
                response_format == "auto"
                and not declared_json
                and response.text.lstrip().startswith(("{", "["))
            )
            if response_format == "json" or (response_format == "auto" and declared_json) or sniffed_json:
                try:
                    data = response.json()
                except ValueError:
                    if not sniffed_json:
                        raise
                    # The guess was wrong: treat the body as plain text instead of failing.
                else:
                    return _summarize_json(config, response, data)
            return _summarize_text(config, response, content_type)
        except ValueError as exc:
            return error("invalid_response", str(exc))
        except requests.RequestException as exc:
            # Must precede OSError: RequestException is itself an OSError subclass.
            return error("network_error", f"Request failed: {exc}")
        except OSError as exc:
            return error("raw_output_error", f"Could not save raw output: {exc}")


def main() -> int:
    """Read the request from stdin, write the result to stdout, return the exit code.

    Exit codes: 0 on success, 1 when the request ran but failed, 2 when the
    input was not valid JSON or failed validation.
    """
    try:
        payload = json.load(sys.stdin)
    except Exception as exc:  # noqa: BLE001
        sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}")))
        return 2
    try:
        output = execute(payload)
    except ValueError as exc:
        output = error("invalid_input", str(exc))
        code = 2
    else:
        code = 0 if output.get("ok") else 1
    sys.stdout.write(json.dumps(output))
    return code


if __name__ == "__main__":
    raise SystemExit(main())
diff --git a/plugins/life-science-research/skills/rcsb-pdb-skill/SKILL.md b/plugins/life-science-research/skills/rcsb-pdb-skill/SKILL.md new file mode 100644 index 0000000..2bb3e62 --- /dev/null +++ b/plugins/life-science-research/skills/rcsb-pdb-skill/SKILL.md @@ -0,0 +1,38 @@ +--- +name: rcsb-pdb-skill +description: Submit compact RCSB PDB requests for core metadata, Search API queries, and FASTA downloads. Use when a user wants concise RCSB summaries; save raw JSON or FASTA only on request. +--- + +## Operating rules +- Use `scripts/rest_request.py` for all RCSB PDB and Search API calls. +- Use `base_url=https://data.rcsb.org/rest/v1` for core metadata, `https://search.rcsb.org/rcsbsearch/v2` for Search API, and `https://www.rcsb.org` for FASTA downloads. +- Core entry or assembly lookups usually do not need `max_items`; Search API results are better with query pager rows around `10` and `max_items=10`. +- Re-run requests in long conversations instead of relying on older tool output. +- Treat displayed `...` in tool previews as UI truncation, not literal request content. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Prefer core metadata endpoints for focused lookups and Search API POST requests for discovery. +- For FASTA downloads, use `response_format=text` so the script returns a short `text_head` unless raw output is requested. + +## Input +- Read one JSON object from stdin. 
+- Required fields: `base_url`, `path` +- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common RCSB patterns: + - `{"base_url":"https://data.rcsb.org/rest/v1","path":"core/entry/4hhb"}` + - `{"base_url":"https://search.rcsb.org/rcsbsearch/v2","path":"query","method":"POST","json_body":{"query":{"type":"terminal","service":"full_text","parameters":{"value":"hemoglobin"}},"return_type":"entry","request_options":{"pager":{"start":0,"rows":10}}},"record_path":"result_set","max_items":10}` + - `{"base_url":"https://www.rcsb.org","path":"fasta/entry/4HHB/download","response_format":"text"}` + +## Output +- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records`, a compact `summary`, or `text_head`. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"base_url":"https://data.rcsb.org/rest/v1","path":"core/entry/4hhb"}' | python scripts/rest_request.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. 
#!/usr/bin/env python3
"""Generic compact REST client for ChatGPT-imported skills.

Reads one JSON request object from stdin, performs a single GET or POST call
through ``requests``, and writes one compact JSON result object to stdout.
"""

from __future__ import annotations

import json
import sys
from pathlib import Path
from typing import Any

try:
    import requests
except ImportError as exc:  # pragma: no cover
    # Stay importable without `requests`; execute() reports the missing
    # dependency as a structured error instead of failing at import time.
    requests = None
    REQUESTS_IMPORT_ERROR = exc
else:
    REQUESTS_IMPORT_ERROR = None

# Top-level response keys probed, in this order, for the list of records.
_RECORD_LIST_KEYS = (
    "collection",
    "results",
    "structures",
    "activities",
    "molecules",
    "mechanisms",
    "records",
    "items",
)
# Longest string kept verbatim by _compact().
_MAX_STRING_CHARS = 240
# Number of characters of a non-JSON body echoed back as `text_head`.
_TEXT_HEAD_CHARS = 800


def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]:
    """Build the failure envelope: ``ok`` false, an ``error`` object, and ``warnings``."""
    return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []}


def _require_object(name: str, value: Any) -> dict[str, Any]:
    """Return ``value`` if it is a dict, ``{}`` if it is ``None``; otherwise raise ValueError."""
    if value is None:
        return {}
    if not isinstance(value, dict):
        raise ValueError(f"`{name}` must be an object.")
    return value


def _require_bool(name: str, value: Any, default: bool) -> bool:
    """Return ``value`` if it is a bool, ``default`` if it is ``None``; otherwise raise ValueError."""
    if value is None:
        return default
    if not isinstance(value, bool):
        raise ValueError(f"`{name}` must be a boolean.")
    return value


def _require_int(name: str, value: Any, default: int) -> int:
    """Return ``value`` as a positive integer, or ``default`` when it is ``None``.

    Raises:
        ValueError: If ``value`` is a bool, is not an int, or is not greater than zero.
    """
    if value is None:
        return default
    # bool is a subclass of int, so JSON `true` would otherwise be accepted as 1.
    if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
        raise ValueError(f"`{name}` must be a positive integer.")
    return value


def _require_str(name: str, value: Any, required: bool = False) -> str | None:
    """Return ``value`` stripped of surrounding whitespace, or ``None`` when it is absent.

    Raises:
        ValueError: If ``value`` is ``None`` while ``required`` is true, or if it is
            not a string or is blank after stripping.
    """
    if value is None:
        if required:
            raise ValueError(f"`{name}` is required.")
        return None
    if not isinstance(value, str) or not value.strip():
        raise ValueError(f"`{name}` must be a non-empty string.")
    return value.strip()


def _service_name(base_url: str) -> str:
    """Return the host part of ``base_url`` with dots replaced by dashes."""
    host = base_url.split("://", 1)[-1].split("/", 1)[0]
    return host.replace(".", "-")


def _build_url(base_url: str, path: str) -> str:
    """Join ``base_url`` and ``path`` with one slash; an absolute http(s) ``path`` wins as-is."""
    if path.startswith(("http://", "https://")):
        return path
    return base_url.rstrip("/") + "/" + path.lstrip("/")


def _get_by_path(value: Any, path: str) -> Any:
    """Walk ``value`` along a dot-separated ``path`` of dict keys and list indexes.

    Raises:
        ValueError: If a segment is not a valid index for a list, is out of range,
            is missing from a dict, or is applied to a scalar.
    """
    current = value
    for part in path.split("."):
        if isinstance(current, list):
            # str.isdigit() alone also accepts characters such as "²" that int() rejects.
            if not (part.isascii() and part.isdigit()):
                raise ValueError(f"`record_path` segment {part!r} must be a list index.")
            index = int(part)
            if index >= len(current):
                raise ValueError(f"`record_path` index {index} is out of range.")
            current = current[index]
        elif isinstance(current, dict):
            if part not in current:
                raise ValueError(f"`record_path` key {part!r} was not present in the response.")
            current = current[part]
        else:
            raise ValueError(f"`record_path` segment {part!r} could not be applied.")
    return current


def _infer_target(data: Any) -> tuple[str | None, Any]:
    """Guess where the record list lives when no ``record_path`` was given.

    Returns ``("$", data)`` for a top-level list, the first list found under
    ``_embedded``, the first list found under one of ``_RECORD_LIST_KEYS``, or
    ``(None, data)`` when nothing matches.
    """
    if isinstance(data, list):
        return "$", data
    if isinstance(data, dict):
        embedded = data.get("_embedded")
        if isinstance(embedded, dict):
            for key, value in embedded.items():
                if isinstance(value, list):
                    return f"_embedded.{key}", value
        for key in _RECORD_LIST_KEYS:
            value = data.get(key)
            if isinstance(value, list):
                return key, value
    return None, data


def _compact(value: Any, max_items: int, max_depth: int) -> Any:
    """Return a size-bounded copy of ``value`` for display.

    Strings longer than 240 characters are cut and suffixed with ``...``. Once
    ``max_depth`` reaches zero, nested dicts and lists are replaced by ``"..."``.
    Lists keep their first ``max_items`` entries plus a ``... (+N more)`` marker;
    dicts keep their first ``max_items`` keys plus a ``_truncated_keys`` count.
    """
    if isinstance(value, str):
        if len(value) <= _MAX_STRING_CHARS:
            return value
        return value[:_MAX_STRING_CHARS] + "..."
    if max_depth <= 0:
        return "..." if isinstance(value, (dict, list)) else value
    if isinstance(value, list):
        compact_list = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]]
        if len(value) > max_items:
            compact_list.append(f"... (+{len(value) - max_items} more)")
        return compact_list
    if isinstance(value, dict):
        items = list(value.items())
        compact_dict: dict[str, Any] = {
            str(key): _compact(item, max_items, max_depth - 1) for key, item in items[:max_items]
        }
        if len(items) > max_items:
            compact_dict["_truncated_keys"] = len(items) - max_items
        return compact_dict
    return value


def _save_raw_output(
    raw_output: str, raw_output_path: str | None, base_url: str, suffix: str
) -> str:
    """Write ``raw_output`` as UTF-8 and return the path that was written.

    Without ``raw_output_path`` the file goes to ``/tmp/<service>-raw.<suffix>``.
    Missing parent directories are created.
    """
    path = Path(raw_output_path or f"/tmp/{_service_name(base_url)}-raw.{suffix}")
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(raw_output, encoding="utf-8")
    return str(path)


def parse_input(payload: Any) -> dict[str, Any]:
    """Validate the request object and return it with defaults filled in.

    ``base_url`` and ``path`` are required. Defaults: ``method`` GET,
    ``response_format`` auto, ``max_items`` 5, ``max_depth`` 3,
    ``timeout_sec`` 30, ``save_raw`` false, ``params`` and ``headers`` empty.

    Raises:
        ValueError: If ``payload`` is not a dict or any field fails validation.
    """
    if not isinstance(payload, dict):
        raise ValueError("Input must be one JSON object.")
    base_url = _require_str("base_url", payload.get("base_url"), required=True)
    path = _require_str("path", payload.get("path"), required=True)
    method = (_require_str("method", payload.get("method")) or "GET").upper()
    if method not in {"GET", "POST"}:
        raise ValueError("`method` must be GET or POST.")
    json_body = payload.get("json_body")
    form_body = payload.get("form_body")
    if json_body is not None and form_body is not None:
        raise ValueError("Provide only one of `json_body` or `form_body`.")
    response_format = (
        _require_str("response_format", payload.get("response_format")) or "auto"
    ).lower()
    if response_format not in {"auto", "json", "text"}:
        raise ValueError("`response_format` must be auto, json, or text.")
    return {
        "base_url": base_url,
        "path": path,
        "method": method,
        "params": _require_object("params", payload.get("params")),
        "headers": _require_object("headers", payload.get("headers")),
        "json_body": json_body,
        "form_body": _require_object("form_body", form_body) if form_body is not None else None,
        "record_path": _require_str("record_path", payload.get("record_path")),
        "response_format": response_format,
        "max_items": _require_int("max_items", payload.get("max_items"), 5),
        "max_depth": _require_int("max_depth", payload.get("max_depth"), 3),
        "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30),
        "save_raw": _require_bool("save_raw", payload.get("save_raw"), False),
        "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")),
    }


def _summarize_json(config: dict[str, Any], response: Any) -> dict[str, Any]:
    """Build the success envelope for a response body parsed as JSON."""
    data = response.json()
    raw_output_path = None
    if config["save_raw"]:
        # Serialize only when the dump is actually written to disk.
        raw_output_path = _save_raw_output(
            json.dumps(data, indent=2), config["raw_output_path"], config["base_url"], "json"
        )

    record_path = config["record_path"]
    if record_path is None:
        path_used, target = _infer_target(data)
    else:
        path_used, target = record_path, _get_by_path(data, record_path)

    out: dict[str, Any] = {
        "ok": True,
        "source": _service_name(config["base_url"]),
        "path": config["path"],
        "method": config["method"],
        "status_code": response.status_code,
        "record_path": path_used,
        "raw_output_path": raw_output_path,
        "warnings": [],
    }
    if isinstance(target, list):
        records = target[: config["max_items"]]
        out.update(
            {
                "record_count_returned": len(records),
                "record_count_available": len(target),
                "truncated": len(records) < len(target),
                "records": _compact(records, config["max_items"], config["max_depth"]),
            }
        )
    else:
        out["summary"] = _compact(target, config["max_items"], config["max_depth"])
        if isinstance(target, dict):
            out["top_keys"] = list(target)[: config["max_items"]]
    return out


def _summarize_text(config: dict[str, Any], response: Any, content_type: str) -> dict[str, Any]:
    """Build the success envelope for a response body treated as plain text."""
    raw_output_path = None
    if config["save_raw"]:
        raw_output_path = _save_raw_output(
            response.text, config["raw_output_path"], config["base_url"], "txt"
        )
    text_head = response.text[:_TEXT_HEAD_CHARS]
    return {
        "ok": True,
        "source": _service_name(config["base_url"]),
        "path": config["path"],
        "method": config["method"],
        "status_code": response.status_code,
        "content_type": content_type,
        # When the full body was saved, the inline preview is omitted.
        "text_head": None if raw_output_path else text_head,
        "text_head_truncated": False
        if raw_output_path
        else len(text_head) < len(response.text),
        "raw_output_path": raw_output_path,
        "warnings": [],
    }


def execute(payload: Any) -> dict[str, Any]:
    """Run one request described by ``payload`` and return a result envelope.

    Returns a dict with ``ok`` true and either ``records``/``summary`` (JSON) or
    ``text_head`` (text), or an ``error(...)`` envelope with code
    ``missing_dependency``, ``invalid_response``, ``network_error`` or ``io_error``.

    Raises:
        ValueError: If ``payload`` fails validation in ``parse_input``.
    """
    if requests is None:
        return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}")
    config = parse_input(payload)
    url = _build_url(config["base_url"], config["path"])

    request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]}
    if config["json_body"] is not None:
        request_kwargs["json"] = config["json_body"]
    if config["form_body"] is not None:
        request_kwargs["data"] = config["form_body"]

    with requests.Session() as session:
        session.headers.update(config["headers"])
        try:
            response = session.request(config["method"], url, **request_kwargs)
            response.raise_for_status()
            content_type = (response.headers.get("content-type") or "").lower()
            wants_json = config["response_format"] == "json"
            wants_text = config["response_format"] == "text"
            # In auto mode, fall back to sniffing the body when the header is unhelpful.
            auto_json = not wants_text and (
                "json" in content_type or response.text.lstrip().startswith(("{", "["))
            )
            if wants_json or auto_json:
                return _summarize_json(config, response)
            return _summarize_text(config, response, content_type)
        except ValueError as exc:
            # Covers undecodable JSON bodies and failed `record_path` lookups.
            return error("invalid_response", str(exc))
        except requests.RequestException as exc:
            return error("network_error", f"Request failed: {exc}")
        except OSError as exc:
            # RequestException is handled above, so what reaches this clause is
            # most likely a filesystem failure while saving the raw output.
            return error("io_error", f"Could not write raw output: {exc}")


def main() -> int:
    """Read a JSON request from stdin, write the JSON result to stdout, return the exit code.

    Exit codes: 0 on success, 1 when the result has ``ok`` false, 2 when stdin is
    not valid JSON or the request fails validation.
    """
    try:
        payload = json.load(sys.stdin)
    except Exception as exc:  # noqa: BLE001
        sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}")))
        return 2
    try:
        output = execute(payload)
    except ValueError as exc:
        output = error("invalid_input", str(exc))
        code = 2
    else:
        code = 0 if output.get("ok") else 1
    sys.stdout.write(json.dumps(output))
    return code


if __name__ == "__main__":
    raise SystemExit(main())
diff --git a/plugins/life-science-research/skills/reactome-skill/SKILL.md b/plugins/life-science-research/skills/reactome-skill/SKILL.md new file mode 100644 index 0000000..407f5ff --- /dev/null +++ b/plugins/life-science-research/skills/reactome-skill/SKILL.md @@ -0,0 +1,39 @@ +--- +name: reactome-skill +description: Submit compact Reactome ContentService requests for pathway, event, participant, search, and diagram-related data. Use when a user wants concise Reactome summaries +--- + +## Operating rules +- Use `scripts/rest_request.py` for all Reactome ContentService calls. +- Use `base_url=https://reactome.org/ContentService`. +- Single pathway or event lookups usually do not need `max_items`; list-style pathway membership calls are better with `max_items=10`. +- Send `Accept: application/json` in `headers` when requesting JSON. +- Re-run requests in long conversations instead of relying on older tool output. +- Treat displayed `...` in tool previews as UI truncation, not literal request content. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Prefer these paths: `data/query/`, `data/pathways/low/entity/`, `data/participants/`, and search endpoints. +- If the user needs the full payload, set `save_raw=true` and report the saved file path. + +## Input +- Read one JSON object from stdin. 
+- Required fields: `base_url`, `path` +- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common Reactome patterns: + - `{"base_url":"https://reactome.org/ContentService","path":"data/query/R-HSA-199420","headers":{"Accept":"application/json"}}` + - `{"base_url":"https://reactome.org/ContentService","path":"data/pathways/low/entity/P38398","params":{"species":"Homo sapiens"},"headers":{"Accept":"application/json"},"max_items":10}` + - `{"base_url":"https://reactome.org/ContentService","path":"data/participants/R-HSA-199420","headers":{"Accept":"application/json"},"max_items":10}` + +## Output +- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records`, a compact `summary`, or (for non-JSON responses) a `text_head` preview. +- When `save_raw=true`, the location of the saved payload is reported in `raw_output_path`. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"base_url":"https://reactome.org/ContentService","path":"data/query/R-HSA-199420","headers":{"Accept":"application/json"}}' | python scripts/rest_request.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`.
#!/usr/bin/env python3
"""Generic compact REST client for ChatGPT-imported skills.

Reads one JSON request object from stdin, performs a single GET or POST call
through ``requests``, and writes one compact JSON result object to stdout.
"""

from __future__ import annotations

import json
import sys
from pathlib import Path
from typing import Any

try:
    import requests
except ImportError as exc:  # pragma: no cover
    # Stay importable without `requests`; execute() reports the missing
    # dependency as a structured error instead of failing at import time.
    requests = None
    REQUESTS_IMPORT_ERROR = exc
else:
    REQUESTS_IMPORT_ERROR = None

# Top-level response keys probed, in this order, for the list of records.
_RECORD_LIST_KEYS = (
    "collection",
    "results",
    "structures",
    "activities",
    "molecules",
    "mechanisms",
    "records",
    "items",
)
# Longest string kept verbatim by _compact().
_MAX_STRING_CHARS = 240
# Number of characters of a non-JSON body echoed back as `text_head`.
_TEXT_HEAD_CHARS = 800


def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]:
    """Build the failure envelope: ``ok`` false, an ``error`` object, and ``warnings``."""
    return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []}


def _require_object(name: str, value: Any) -> dict[str, Any]:
    """Return ``value`` if it is a dict, ``{}`` if it is ``None``; otherwise raise ValueError."""
    if value is None:
        return {}
    if not isinstance(value, dict):
        raise ValueError(f"`{name}` must be an object.")
    return value


def _require_bool(name: str, value: Any, default: bool) -> bool:
    """Return ``value`` if it is a bool, ``default`` if it is ``None``; otherwise raise ValueError."""
    if value is None:
        return default
    if not isinstance(value, bool):
        raise ValueError(f"`{name}` must be a boolean.")
    return value


def _require_int(name: str, value: Any, default: int) -> int:
    """Return ``value`` as a positive integer, or ``default`` when it is ``None``.

    Raises:
        ValueError: If ``value`` is a bool, is not an int, or is not greater than zero.
    """
    if value is None:
        return default
    # bool is a subclass of int, so JSON `true` would otherwise be accepted as 1.
    if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
        raise ValueError(f"`{name}` must be a positive integer.")
    return value


def _require_str(name: str, value: Any, required: bool = False) -> str | None:
    """Return ``value`` stripped of surrounding whitespace, or ``None`` when it is absent.

    Raises:
        ValueError: If ``value`` is ``None`` while ``required`` is true, or if it is
            not a string or is blank after stripping.
    """
    if value is None:
        if required:
            raise ValueError(f"`{name}` is required.")
        return None
    if not isinstance(value, str) or not value.strip():
        raise ValueError(f"`{name}` must be a non-empty string.")
    return value.strip()


def _service_name(base_url: str) -> str:
    """Return the host part of ``base_url`` with dots replaced by dashes."""
    host = base_url.split("://", 1)[-1].split("/", 1)[0]
    return host.replace(".", "-")


def _build_url(base_url: str, path: str) -> str:
    """Join ``base_url`` and ``path`` with one slash; an absolute http(s) ``path`` wins as-is."""
    if path.startswith(("http://", "https://")):
        return path
    return base_url.rstrip("/") + "/" + path.lstrip("/")


def _get_by_path(value: Any, path: str) -> Any:
    """Walk ``value`` along a dot-separated ``path`` of dict keys and list indexes.

    Raises:
        ValueError: If a segment is not a valid index for a list, is out of range,
            is missing from a dict, or is applied to a scalar.
    """
    current = value
    for part in path.split("."):
        if isinstance(current, list):
            # str.isdigit() alone also accepts characters such as "²" that int() rejects.
            if not (part.isascii() and part.isdigit()):
                raise ValueError(f"`record_path` segment {part!r} must be a list index.")
            index = int(part)
            if index >= len(current):
                raise ValueError(f"`record_path` index {index} is out of range.")
            current = current[index]
        elif isinstance(current, dict):
            if part not in current:
                raise ValueError(f"`record_path` key {part!r} was not present in the response.")
            current = current[part]
        else:
            raise ValueError(f"`record_path` segment {part!r} could not be applied.")
    return current


def _infer_target(data: Any) -> tuple[str | None, Any]:
    """Guess where the record list lives when no ``record_path`` was given.

    Returns ``("$", data)`` for a top-level list, the first list found under
    ``_embedded``, the first list found under one of ``_RECORD_LIST_KEYS``, or
    ``(None, data)`` when nothing matches.
    """
    if isinstance(data, list):
        return "$", data
    if isinstance(data, dict):
        embedded = data.get("_embedded")
        if isinstance(embedded, dict):
            for key, value in embedded.items():
                if isinstance(value, list):
                    return f"_embedded.{key}", value
        for key in _RECORD_LIST_KEYS:
            value = data.get(key)
            if isinstance(value, list):
                return key, value
    return None, data


def _compact(value: Any, max_items: int, max_depth: int) -> Any:
    """Return a size-bounded copy of ``value`` for display.

    Strings longer than 240 characters are cut and suffixed with ``...``. Once
    ``max_depth`` reaches zero, nested dicts and lists are replaced by ``"..."``.
    Lists keep their first ``max_items`` entries plus a ``... (+N more)`` marker;
    dicts keep their first ``max_items`` keys plus a ``_truncated_keys`` count.
    """
    if isinstance(value, str):
        if len(value) <= _MAX_STRING_CHARS:
            return value
        return value[:_MAX_STRING_CHARS] + "..."
    if max_depth <= 0:
        return "..." if isinstance(value, (dict, list)) else value
    if isinstance(value, list):
        compact_list = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]]
        if len(value) > max_items:
            compact_list.append(f"... (+{len(value) - max_items} more)")
        return compact_list
    if isinstance(value, dict):
        items = list(value.items())
        compact_dict: dict[str, Any] = {
            str(key): _compact(item, max_items, max_depth - 1) for key, item in items[:max_items]
        }
        if len(items) > max_items:
            compact_dict["_truncated_keys"] = len(items) - max_items
        return compact_dict
    return value


def _save_raw_output(
    raw_output: str, raw_output_path: str | None, base_url: str, suffix: str
) -> str:
    """Write ``raw_output`` as UTF-8 and return the path that was written.

    Without ``raw_output_path`` the file goes to ``/tmp/<service>-raw.<suffix>``.
    Missing parent directories are created.
    """
    path = Path(raw_output_path or f"/tmp/{_service_name(base_url)}-raw.{suffix}")
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(raw_output, encoding="utf-8")
    return str(path)


def parse_input(payload: Any) -> dict[str, Any]:
    """Validate the request object and return it with defaults filled in.

    ``base_url`` and ``path`` are required. Defaults: ``method`` GET,
    ``response_format`` auto, ``max_items`` 5, ``max_depth`` 3,
    ``timeout_sec`` 30, ``save_raw`` false, ``params`` and ``headers`` empty.

    Raises:
        ValueError: If ``payload`` is not a dict or any field fails validation.
    """
    if not isinstance(payload, dict):
        raise ValueError("Input must be one JSON object.")
    base_url = _require_str("base_url", payload.get("base_url"), required=True)
    path = _require_str("path", payload.get("path"), required=True)
    method = (_require_str("method", payload.get("method")) or "GET").upper()
    if method not in {"GET", "POST"}:
        raise ValueError("`method` must be GET or POST.")
    json_body = payload.get("json_body")
    form_body = payload.get("form_body")
    if json_body is not None and form_body is not None:
        raise ValueError("Provide only one of `json_body` or `form_body`.")
    response_format = (
        _require_str("response_format", payload.get("response_format")) or "auto"
    ).lower()
    if response_format not in {"auto", "json", "text"}:
        raise ValueError("`response_format` must be auto, json, or text.")
    return {
        "base_url": base_url,
        "path": path,
        "method": method,
        "params": _require_object("params", payload.get("params")),
        "headers": _require_object("headers", payload.get("headers")),
        "json_body": json_body,
        "form_body": _require_object("form_body", form_body) if form_body is not None else None,
        "record_path": _require_str("record_path", payload.get("record_path")),
        "response_format": response_format,
        "max_items": _require_int("max_items", payload.get("max_items"), 5),
        "max_depth": _require_int("max_depth", payload.get("max_depth"), 3),
        "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30),
        "save_raw": _require_bool("save_raw", payload.get("save_raw"), False),
        "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")),
    }


def _summarize_json(config: dict[str, Any], response: Any) -> dict[str, Any]:
    """Build the success envelope for a response body parsed as JSON."""
    data = response.json()
    raw_output_path = None
    if config["save_raw"]:
        # Serialize only when the dump is actually written to disk.
        raw_output_path = _save_raw_output(
            json.dumps(data, indent=2), config["raw_output_path"], config["base_url"], "json"
        )

    record_path = config["record_path"]
    if record_path is None:
        path_used, target = _infer_target(data)
    else:
        path_used, target = record_path, _get_by_path(data, record_path)

    out: dict[str, Any] = {
        "ok": True,
        "source": _service_name(config["base_url"]),
        "path": config["path"],
        "method": config["method"],
        "status_code": response.status_code,
        "record_path": path_used,
        "raw_output_path": raw_output_path,
        "warnings": [],
    }
    if isinstance(target, list):
        records = target[: config["max_items"]]
        out.update(
            {
                "record_count_returned": len(records),
                "record_count_available": len(target),
                "truncated": len(records) < len(target),
                "records": _compact(records, config["max_items"], config["max_depth"]),
            }
        )
    else:
        out["summary"] = _compact(target, config["max_items"], config["max_depth"])
        if isinstance(target, dict):
            out["top_keys"] = list(target)[: config["max_items"]]
    return out


def _summarize_text(config: dict[str, Any], response: Any, content_type: str) -> dict[str, Any]:
    """Build the success envelope for a response body treated as plain text."""
    raw_output_path = None
    if config["save_raw"]:
        raw_output_path = _save_raw_output(
            response.text, config["raw_output_path"], config["base_url"], "txt"
        )
    text_head = response.text[:_TEXT_HEAD_CHARS]
    return {
        "ok": True,
        "source": _service_name(config["base_url"]),
        "path": config["path"],
        "method": config["method"],
        "status_code": response.status_code,
        "content_type": content_type,
        # When the full body was saved, the inline preview is omitted.
        "text_head": None if raw_output_path else text_head,
        "text_head_truncated": False
        if raw_output_path
        else len(text_head) < len(response.text),
        "raw_output_path": raw_output_path,
        "warnings": [],
    }


def execute(payload: Any) -> dict[str, Any]:
    """Run one request described by ``payload`` and return a result envelope.

    Returns a dict with ``ok`` true and either ``records``/``summary`` (JSON) or
    ``text_head`` (text), or an ``error(...)`` envelope with code
    ``missing_dependency``, ``invalid_response``, ``network_error`` or ``io_error``.

    Raises:
        ValueError: If ``payload`` fails validation in ``parse_input``.
    """
    if requests is None:
        return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}")
    config = parse_input(payload)
    url = _build_url(config["base_url"], config["path"])

    request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]}
    if config["json_body"] is not None:
        request_kwargs["json"] = config["json_body"]
    if config["form_body"] is not None:
        request_kwargs["data"] = config["form_body"]

    with requests.Session() as session:
        session.headers.update(config["headers"])
        try:
            response = session.request(config["method"], url, **request_kwargs)
            response.raise_for_status()
            content_type = (response.headers.get("content-type") or "").lower()
            wants_json = config["response_format"] == "json"
            wants_text = config["response_format"] == "text"
            # In auto mode, fall back to sniffing the body when the header is unhelpful.
            auto_json = not wants_text and (
                "json" in content_type or response.text.lstrip().startswith(("{", "["))
            )
            if wants_json or auto_json:
                return _summarize_json(config, response)
            return _summarize_text(config, response, content_type)
        except ValueError as exc:
            # Covers undecodable JSON bodies and failed `record_path` lookups.
            return error("invalid_response", str(exc))
        except requests.RequestException as exc:
            return error("network_error", f"Request failed: {exc}")
        except OSError as exc:
            # RequestException is handled above, so what reaches this clause is
            # most likely a filesystem failure while saving the raw output.
            return error("io_error", f"Could not write raw output: {exc}")


def main() -> int:
    """Read a JSON request from stdin, write the JSON result to stdout, return the exit code.

    Exit codes: 0 on success, 1 when the result has ``ok`` false, 2 when stdin is
    not valid JSON or the request fails validation.
    """
    try:
        payload = json.load(sys.stdin)
    except Exception as exc:  # noqa: BLE001
        sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}")))
        return 2
    try:
        output = execute(payload)
    except ValueError as exc:
        output = error("invalid_input", str(exc))
        code = 2
    else:
        code = 0 if output.get("ok") else 1
    sys.stdout.write(json.dumps(output))
    return code


if __name__ == "__main__":
    raise SystemExit(main())
diff --git a/plugins/life-science-research/skills/research-router-skill/SKILL.md b/plugins/life-science-research/skills/research-router-skill/SKILL.md new file mode 100644 index 0000000..9aff213 --- /dev/null +++ b/plugins/life-science-research/skills/research-router-skill/SKILL.md @@ -0,0 +1,145 @@ +--- +name: research-router-skill +description: Route broad or ambiguous life-sciences research requests to the right skills, normalize core entities, optionally parallelize independent evidence gathering with subagents when available, and synthesize a concise evidence-backed answer. Use when a user asks a general life-sciences question that could span multiple sources or analysis types. +--- + +## Research Router + +Use this skill as the default orchestration layer for broad life-sciences research requests. + +Do not use it for narrow single-source lookups when a more specific skill already matches the request cleanly. + +## Primary Responsibility + +Turn an open-ended research question into a small, defensible retrieval plan: + +1. understand the research objective +2. normalize the main entities +3. select the minimum useful set of downstream skills +4. gather evidence +5. synthesize the answer for the user + +The router owns the framing and the final synthesis. It should not dump raw source payloads unless the user explicitly asks for them. 
+ +## When To Use This Skill + +Use this skill when any of the following are true: + +- the user asks a broad question such as `what is known about ...` +- the question could require more than one evidence type +- the right source is unclear at the start +- the request mixes entities, for example gene plus disease, variant plus phenotype, protein plus ligand, or pathway plus dataset +- the user wants a synthesized answer rather than a single database lookup + +## Research Task Classification + +Start by classifying the request into one or more lanes: + +- human genetics and variant interpretation +- locus-to-gene prioritization +- expression, tissue, or cell-type context +- pathway, network, or functional biology +- protein structure and mechanism +- chemistry, ligands, and pharmacology +- clinical, translational, or cancer evidence +- literature, preprints, and public dataset discovery +- metabolomics, proteomics, or microbiome context + +Prefer 1 to 3 lanes. Only expand further if the user explicitly asks for a broad landscape review. + +## Entity Normalization + +Normalize the key entities before deep retrieval. + +Common patterns: + +- gene or protein: `ncbi-clinicaltables-skill`, `ensembl-skill`, `uniprot-skill` +- disease or phenotype: `efo-ontology-skill`, `opentargets-skill` +- variant: `clinvar-variation-skill`, `ensembl-skill`, cohort-specific PheWAS skills +- compound or metabolite: `chembl-skill`, `pubchem-pug-skill`, `chebi-skill`, `hmdb-skill` +- pathway or function: `reactome-skill`, `quickgo-skill`, `string-skill` +- accession or dataset identifier: `ncbi-datasets-skill`, `biostudies-arrayexpress-skill`, `pride-skill`, `metabolights-skill` + +Do not start broad evidence collection until the important entities are stable enough to route correctly. + +## Skill Selection Heuristics + +Choose the smallest set of skills that can answer the question well. 
+ +Examples: + +- target or disease evidence review: + `opentargets-skill`, `gwas-catalog-skill`, `gtex-eqtl-skill`, `human-protein-atlas-skill` +- variant interpretation: + `clinvar-variation-skill`, `gnomad-graphql-skill`, `ensembl-skill`, one or more cohort PheWAS skills +- locus-to-gene mapping: + `locus-to-gene-mapper-skill`, or its component genetics skills when the user wants a custom workflow +- structure and mechanism: + `alphafold-skill`, `rcsb-pdb-skill`, `uniprot-skill`, `reactome-skill` +- chemistry and pharmacology: + `chembl-skill`, `bindingdb-skill`, `pubchem-pug-skill`, `pharmgkb-skill` +- clinical and translational: + `clinicaltrials-skill`, `cbioportal-skill`, `civic-skill` +- literature and dataset discovery: + `ncbi-entrez-skill`, `ncbi-pmc-skill`, `biorxiv-skill`, `biostudies-arrayexpress-skill`, `ncbi-datasets-skill` + +Prefer direct lookups before expensive multi-step chains. + +## Subagent And Parallelization Guidance + +If Codex subagents are available, use them only when the work cleanly decomposes into independent lanes. 
+ +Good candidates for subagents: + +- genetics, expression, structure, chemistry, and clinical evidence can be gathered independently for the same question +- multiple loci, variants, genes, compounds, or datasets need parallel comparison +- a broad landscape review requires separate evidence summaries before synthesis + +Keep these steps with the coordinating agent: + +- initial interpretation of the user request +- entity normalization and final scope decisions +- conflict resolution across evidence sources +- final synthesis and recommendation writing + +Avoid subagents when: + +- one specific skill already answers the question +- later steps depend tightly on earlier intermediate outputs +- the work is mostly identifier resolution or narrow follow-up lookup +- the extra coordination cost is likely to exceed the retrieval benefit + +When delegating, give each subagent a bounded read-only objective such as one evidence family or one comparison unit. Each subagent should return: + +- what it checked +- the key findings +- the main caveats +- which skills or sources it used +- any artifact paths it produced + +The coordinating agent is responsible for reconciling overlaps, contradictions, and evidence gaps. + +## Output Contract + +Return a concise answer structured around the user's question, not around the tools. + +Unless the user asks for a different format, include: + +1. direct answer or working conclusion +2. key evidence by lane +3. main caveats or unresolved questions +4. 
recommended next analyses or follow-up lookups + +If the task is exploratory, explicitly distinguish: + +- evidence that supports a conclusion +- evidence that is only suggestive +- evidence that is missing or contradictory + +## Operating Rules + +- prefer concise source-backed synthesis over large raw dumps +- escalate to multi-skill workflows only when the question requires synthesis +- state important cohort, ancestry, assay, tissue, and study-design limitations +- do not overstate causality from association-only evidence +- if a downstream skill can answer the request directly, hand off to it instead of keeping the router in the foreground diff --git a/plugins/life-science-research/skills/research-router-skill/agents/openai.yaml b/plugins/life-science-research/skills/research-router-skill/agents/openai.yaml new file mode 100644 index 0000000..087ac92 --- /dev/null +++ b/plugins/life-science-research/skills/research-router-skill/agents/openai.yaml @@ -0,0 +1,4 @@ +interface: + display_name: "Research Router" + short_description: "Route broad life-sciences research across the right skills" + default_prompt: "Interpret this life-sciences research question, normalize the key entities, choose the smallest useful set of skills, and synthesize a concise evidence-backed answer. Use subagents only when the work cleanly decomposes into independent evidence lanes." diff --git a/plugins/life-science-research/skills/rhea-skill/SKILL.md b/plugins/life-science-research/skills/rhea-skill/SKILL.md new file mode 100644 index 0000000..d330a6c --- /dev/null +++ b/plugins/life-science-research/skills/rhea-skill/SKILL.md @@ -0,0 +1,37 @@ +--- +name: rhea-skill +description: Submit compact Rhea reaction search requests for biochemical reactions and reaction IDs. Use when a user wants concise Rhea summaries +--- + +## Operating rules +- Use `scripts/rest_request.py` for all Rhea calls. +- Use `base_url=https://www.rhea-db.org`. 
+- Start with the `rhea` search endpoint plus `format=json`. +- Keep queries narrow by reaction ID, compound name, EC number, or free-text reaction term. +- Re-run requests in long conversations instead of relying on older tool output. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Return raw JSON only if the user explicitly asks for machine-readable output. +- Prefer these patterns: reaction search by `query`, targeted ID search via `query=RHEA:<id>` (for example `RHEA:47148`), and small result windows. + +## Input +- Read one JSON object from stdin. +- Required fields: `base_url`, `path` +- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common Rhea patterns: + - `{"base_url":"https://www.rhea-db.org","path":"rhea","params":{"query":"caffeine","format":"json"},"record_path":"results","max_items":10}` + - `{"base_url":"https://www.rhea-db.org","path":"rhea","params":{"query":"RHEA:47148","format":"json"},"record_path":"results","max_items":5}` + +## Output +- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records`, a compact `summary`, or (for non-JSON responses) a `text_head` preview. +- When `save_raw=true`, the location of the saved payload is reported in `raw_output_path`. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"base_url":"https://www.rhea-db.org","path":"rhea","params":{"query":"caffeine","format":"json"},"record_path":"results","max_items":10}' | python scripts/rest_request.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`.
diff --git a/plugins/life-science-research/skills/rhea-skill/agents/openai.yaml b/plugins/life-science-research/skills/rhea-skill/agents/openai.yaml
new file mode 100644
index 0000000..cb54c93
--- /dev/null
+++ b/plugins/life-science-research/skills/rhea-skill/agents/openai.yaml
@@ -0,0 +1,3 @@
+interface:
+  display_name: "Rhea"
+  short_description: "Fetch Rhea reaction summaries"
diff --git a/plugins/life-science-research/skills/rhea-skill/scripts/rest_request.py b/plugins/life-science-research/skills/rhea-skill/scripts/rest_request.py
new file mode 100644
index 0000000..4de3cf1
--- /dev/null
+++ b/plugins/life-science-research/skills/rhea-skill/scripts/rest_request.py
@@ -0,0 +1,347 @@
+#!/usr/bin/env python3
+"""Generic compact REST client for ChatGPT-imported skills.
+
+Reads one JSON request object from stdin, performs a single GET or POST with
+`requests`, and writes one compact JSON summary of the response to stdout.
+"""
+
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+from typing import Any
+
+try:
+    import requests
+except ImportError as exc:  # pragma: no cover
+    requests = None
+    REQUESTS_IMPORT_ERROR = exc
+else:
+    REQUESTS_IMPORT_ERROR = None
+
+
+def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]:
+    """Build the failure envelope: `ok=False`, `error.code`, `error.message`, `warnings`."""
+    return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []}
+
+
+def _require_object(name: str, value: Any) -> dict[str, Any]:
+    """Return a dict `value`, or `{}` when it is None; raise ValueError otherwise."""
+    if value is None:
+        return {}
+    if not isinstance(value, dict):
+        raise ValueError(f"`{name}` must be an object.")
+    return value
+
+
+def _require_bool(name: str, value: Any, default: bool) -> bool:
+    """Return a bool `value`, or `default` when it is None; raise ValueError otherwise."""
+    if value is None:
+        return default
+    if not isinstance(value, bool):
+        raise ValueError(f"`{name}` must be a boolean.")
+    return value
+
+
+def _require_int(name: str, value: Any, default: int) -> int:
+    """Return a positive int `value`, or `default` when it is None; raise ValueError otherwise."""
+    if value is None:
+        return default
+    # bool is a subclass of int, so without this check JSON `true` would pass as 1.
+    if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
+        raise ValueError(f"`{name}` must be a positive integer.")
+    return value
+
+
+def _require_str(name: str, value: Any, required: bool = False) -> str | None:
+    """Return the stripped string, or None when `value` is None and not `required`.
+
+    Raises ValueError for a missing required value, a non-string, or a blank string.
+    """
+    if value is None:
+        if required:
+            raise ValueError(f"`{name}` is required.")
+        return None
+    if not isinstance(value, str) or not value.strip():
+        raise ValueError(f"`{name}` must be a non-empty string.")
+    return value.strip()
+
+
+def _service_name(base_url: str) -> str:
+    """Return the host part of `base_url` with dots replaced by hyphens."""
+    host = base_url.split("://", 1)[-1].split("/", 1)[0]
+    return host.replace(".", "-")
+
+
+def _build_url(base_url: str, path: str) -> str:
+    """Join `base_url` and `path` with one slash; return an absolute http(s) `path` unchanged."""
+    if path.startswith(("http://", "https://")):
+        return path
+    return base_url.rstrip("/") + "/" + path.lstrip("/")
+
+
+def _get_by_path(value: Any, path: str) -> Any:
+    """Walk the dot-separated `path` through nested dicts and lists.
+
+    Raises ValueError when a list segment is not a decimal index, an index is out
+    of range, a dict key is missing, or the current value is neither dict nor list.
+    """
+    current = value
+    for part in path.split("."):
+        if isinstance(current, list):
+            if not part.isdigit():
+                raise ValueError(f"`record_path` segment {part!r} must be a list index.")
+            index = int(part)
+            if index >= len(current):
+                raise ValueError(f"`record_path` index {index} is out of range.")
+            current = current[index]
+        elif isinstance(current, dict):
+            if part not in current:
+                raise ValueError(f"`record_path` key {part!r} was not present in the response.")
+            current = current[part]
+        else:
+            raise ValueError(f"`record_path` segment {part!r} could not be applied.")
+    return current
+
+
+def _infer_target(data: Any) -> tuple[str | None, Any]:
+    """Guess where the record list lives and return `(path_label, target)`.
+
+    A top-level list is labelled "$"; otherwise the first list under `_embedded`
+    or under a known collection key is used. Falls back to `(None, data)`.
+    """
+    if isinstance(data, list):
+        return "$", data
+    if isinstance(data, dict):
+        embedded = data.get("_embedded")
+        if isinstance(embedded, dict):
+            for key, value in embedded.items():
+                if isinstance(value, list):
+                    return f"_embedded.{key}", value
+        for key in (
+            "collection",
+            "results",
+            "structures",
+            "activities",
+            "molecules",
+            "mechanisms",
+            "records",
+            "items",
+        ):
+            value = data.get(key)
+            if isinstance(value, list):
+                return key, value
+    return None, data
+
+
+def _compact(value: Any, max_items: int, max_depth: int) -> Any:
+    """Return a size-bounded copy of `value` for display.
+
+    Strings are cut to 240 characters plus "...", containers keep at most
+    `max_items` entries with a truncation marker, and once `max_depth` is
+    exhausted nested containers collapse to "...".
+    """
+    if isinstance(value, str):
+        return value if len(value) <= 240 else value[:240] + "..."
+    if max_depth <= 0:
+        if isinstance(value, (dict, list)):
+            return "..."
+        return value
+    if isinstance(value, list):
+        compact_list = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]]
+        if len(value) > max_items:
+            compact_list.append(f"... (+{len(value) - max_items} more)")
+        return compact_list
+    if isinstance(value, dict):
+        compact_dict: dict[str, Any] = {}
+        items = list(value.items())
+        for key, item in items[:max_items]:
+            compact_dict[str(key)] = _compact(item, max_items, max_depth - 1)
+        if len(items) > max_items:
+            compact_dict["_truncated_keys"] = len(items) - max_items
+        return compact_dict
+    return value
+
+
+def _save_raw_output(
+    raw_output: str, raw_output_path: str | None, base_url: str, suffix: str
+) -> str:
+    """Write `raw_output` as UTF-8 and return the path used.
+
+    Without `raw_output_path` the file goes to `/tmp/<service>-raw.<suffix>`.
+    Parent directories are created; filesystem failures raise OSError.
+    """
+    path = Path(raw_output_path or f"/tmp/{_service_name(base_url)}-raw.{suffix}")
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(raw_output, encoding="utf-8")
+    return str(path)
+
+
+def parse_input(payload: Any) -> dict[str, Any]:
+    """Validate the request payload and return a normalized config dict.
+
+    Applies the defaults `method=GET`, `response_format=auto`, `max_items=5`,
+    `max_depth=3`, `timeout_sec=30`, and `save_raw=False`. Raises ValueError
+    on any invalid or missing field.
+    """
+    if not isinstance(payload, dict):
+        raise ValueError("Input must be one JSON object.")
+    base_url = _require_str("base_url", payload.get("base_url"), required=True)
+    path = _require_str("path", payload.get("path"), required=True)
+    method = (_require_str("method", payload.get("method")) or "GET").upper()
+    if method not in {"GET", "POST"}:
+        raise ValueError("`method` must be GET or POST.")
+    json_body = payload.get("json_body")
+    form_body = payload.get("form_body")
+    if json_body is not None and form_body is not None:
+        raise ValueError("Provide only one of `json_body` or `form_body`.")
+    response_format = (
+        _require_str("response_format", payload.get("response_format")) or "auto"
+    ).lower()
+    if response_format not in {"auto", "json", "text"}:
+        raise ValueError("`response_format` must be auto, json, or text.")
+    return {
+        "base_url": base_url,
+        "path": path,
+        "method": method,
+        "params": _require_object("params", payload.get("params")),
+        "headers": _require_object("headers", payload.get("headers")),
+        "json_body": json_body,
+        "form_body": _require_object("form_body", form_body) if form_body is not None else None,
+        "record_path": _require_str("record_path", payload.get("record_path")),
+        "response_format": response_format,
+        "max_items": _require_int("max_items", payload.get("max_items"), 5),
+        "max_depth": _require_int("max_depth", payload.get("max_depth"), 3),
+        "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30),
+        "save_raw": _require_bool("save_raw", payload.get("save_raw"), False),
+        "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")),
+    }
+
+
+def execute(payload: Any) -> dict[str, Any]:
+    """Run one request and return the compact result or an error envelope.
+
+    Raises ValueError for invalid input (from `parse_input`). Failures after
+    the request is built are returned as `invalid_response`, `network_error`,
+    or `io_error` envelopes instead of being raised.
+    """
+    if requests is None:
+        return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}")
+    config = parse_input(payload)
+    session = requests.Session()
+    session.headers.update(config["headers"])
+    url = _build_url(config["base_url"], config["path"])
+
+    request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]}
+    if config["json_body"] is not None:
+        request_kwargs["json"] = config["json_body"]
+    if config["form_body"] is not None:
+        request_kwargs["data"] = config["form_body"]
+
+    try:
+        response = session.request(config["method"], url, **request_kwargs)
+        response.raise_for_status()
+        content_type = (response.headers.get("content-type") or "").lower()
+        wants_json = config["response_format"] == "json"
+        wants_text = config["response_format"] == "text"
+        auto_json = not wants_text and (
+            "json" in content_type or response.text.lstrip().startswith(("{", "["))
+        )
+
+        if wants_json or auto_json:
+            data = response.json()
+            raw_output = json.dumps(data, indent=2)
+            raw_output_path = None
+            if config["save_raw"]:
+                raw_output_path = _save_raw_output(
+                    raw_output, config["raw_output_path"], config["base_url"], "json"
+                )
+
+            record_path = config["record_path"]
+            path_used, target = (
+                _infer_target(data)
+                if record_path is None
+                else (record_path, _get_by_path(data, record_path))
+            )
+            out = {
+                "ok": True,
+                "source": _service_name(config["base_url"]),
+                "path": config["path"],
+                "method": config["method"],
+                "status_code": response.status_code,
+                "record_path": path_used,
+                "raw_output_path": raw_output_path,
+                "warnings": [],
+            }
+            if isinstance(target, list):
+                records = target[: config["max_items"]]
+                out.update(
+                    {
+                        "record_count_returned": len(records),
+                        "record_count_available": len(target),
+                        "truncated": len(records) < len(target),
+                        "records": _compact(records, config["max_items"], config["max_depth"]),
+                    }
+                )
+            else:
+                out["summary"] = _compact(target, config["max_items"], config["max_depth"])
+                if isinstance(target, dict):
+                    out["top_keys"] = list(target)[: config["max_items"]]
+            return out
+
+        raw_output_path = None
+        if config["save_raw"]:
+            raw_output_path = _save_raw_output(
+                response.text, config["raw_output_path"], config["base_url"], "txt"
+            )
+        text_head = response.text[:800]
+        return {
+            "ok": True,
+            "source": _service_name(config["base_url"]),
+            "path": config["path"],
+            "method": config["method"],
+            "status_code": response.status_code,
+            "content_type": content_type,
+            "text_head": None if raw_output_path else text_head,
+            "text_head_truncated": False
+            if raw_output_path
+            else len(text_head) < len(response.text),
+            "raw_output_path": raw_output_path,
+            "warnings": [],
+        }
+    except ValueError as exc:
+        # Covers undecodable JSON bodies and `record_path` lookups that do not match.
+        return error("invalid_response", str(exc))
+    except requests.RequestException as exc:
+        return error("network_error", f"Request failed: {exc}")
+    except OSError as exc:
+        # Must follow RequestException (an OSError subclass); e.g. unwritable `raw_output_path`.
+        return error("io_error", f"Local I/O failed: {exc}")
+    finally:
+        session.close()
+
+
+def main() -> int:
+    """Read JSON from stdin, print the result JSON, and return the exit code.
+
+    Exit codes: 0 on success, 1 for an error envelope, 2 for unparseable or invalid input.
+    """
+    try:
+        payload = json.load(sys.stdin)
+    except Exception as exc:  # noqa: BLE001
+        sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}")))
+        return 2
+    try:
+        output = execute(payload)
+    except ValueError as exc:
+        output = error("invalid_input", str(exc))
+        code = 2
+    else:
+        code = 0 if output.get("ok") else 1
+    sys.stdout.write(json.dumps(output))
+    return code
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/plugins/life-science-research/skills/rnacentral-skill/SKILL.md b/plugins/life-science-research/skills/rnacentral-skill/SKILL.md
new file mode 100644
index 0000000..d399558
--- /dev/null
+++ b/plugins/life-science-research/skills/rnacentral-skill/SKILL.md
@@ -0,0 +1,39 @@
+---
+name: rnacentral-skill
+description: Submit compact RNAcentral API requests for RNA entry browsing, single-entry lookup, and cross-reference retrieval. Use when a user wants concise RNAcentral summaries.
+---
+
+## Operating rules
+- Use `scripts/rest_request.py` for all RNAcentral calls.
+- Use `base_url=https://rnacentral.org/api/v1`.
+- Keep the trailing slash on collection and record paths to avoid redirects.
+- Start with targeted lookups such as `rna/{urs_id}/` because broad `rna/` browsing can be slow or time out.
+- Re-run requests in long conversations instead of relying on older tool output.
+
+## Execution behavior
+- Return concise markdown summaries from the script JSON by default.
+- Return raw JSON only if the user explicitly asks for machine-readable output.
+- Prefer these paths: `rna/{urs_id}/`, `rna/{urs_id}/{taxid}/`, `rna/{urs_id}/xrefs/`, and targeted `rna/` searches with `q` plus small `page_size`.
+
+## Input
+- Read one JSON object from stdin.
+- Required fields: `base_url`, `path`
+- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path`
+- Common RNAcentral patterns:
+  - `{"base_url":"https://rnacentral.org/api/v1","path":"rna/URS000075C808/9606/","max_items":10}`
+  - `{"base_url":"https://rnacentral.org/api/v1","path":"rna/","params":{"q":"TP53","page_size":10},"record_path":"results","max_items":10}`
+  - `{"base_url":"https://rnacentral.org/api/v1","path":"rna/URS0000000001/"}`
+  - `{"base_url":"https://rnacentral.org/api/v1","path":"rna/URS0000000001/xrefs/","record_path":"results","max_items":10}`
+
+## Output
+- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records` or a compact `summary`.
+- Use `raw_output_path` when `save_raw=true`.
+- Failure returns `ok=false` with `error.code` and `error.message`.
+
+## Execution
+```bash
+echo '{"base_url":"https://rnacentral.org/api/v1","path":"rna/URS000075C808/9606/","max_items":10}' | python scripts/rest_request.py
+```
+
+## References
+- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`.
diff --git a/plugins/life-science-research/skills/rnacentral-skill/agents/openai.yaml b/plugins/life-science-research/skills/rnacentral-skill/agents/openai.yaml
new file mode 100644
index 0000000..b05942d
--- /dev/null
+++ b/plugins/life-science-research/skills/rnacentral-skill/agents/openai.yaml
@@ -0,0 +1,3 @@
+interface:
+  display_name: "RNAcentral"
+  short_description: "Fetch RNAcentral summaries"
diff --git a/plugins/life-science-research/skills/rnacentral-skill/scripts/rest_request.py b/plugins/life-science-research/skills/rnacentral-skill/scripts/rest_request.py
new file mode 100644
index 0000000..4de3cf1
--- /dev/null
+++ b/plugins/life-science-research/skills/rnacentral-skill/scripts/rest_request.py
@@ -0,0 +1,347 @@
+#!/usr/bin/env python3
+"""Generic compact REST client for ChatGPT-imported skills.
+
+Reads one JSON request object from stdin, performs a single GET or POST with
+`requests`, and writes one compact JSON summary of the response to stdout.
+"""
+
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+from typing import Any
+
+try:
+    import requests
+except ImportError as exc:  # pragma: no cover
+    requests = None
+    REQUESTS_IMPORT_ERROR = exc
+else:
+    REQUESTS_IMPORT_ERROR = None
+
+
+def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]:
+    """Build the failure envelope: `ok=False`, `error.code`, `error.message`, `warnings`."""
+    return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []}
+
+
+def _require_object(name: str, value: Any) -> dict[str, Any]:
+    """Return a dict `value`, or `{}` when it is None; raise ValueError otherwise."""
+    if value is None:
+        return {}
+    if not isinstance(value, dict):
+        raise ValueError(f"`{name}` must be an object.")
+    return value
+
+
+def _require_bool(name: str, value: Any, default: bool) -> bool:
+    """Return a bool `value`, or `default` when it is None; raise ValueError otherwise."""
+    if value is None:
+        return default
+    if not isinstance(value, bool):
+        raise ValueError(f"`{name}` must be a boolean.")
+    return value
+
+
+def _require_int(name: str, value: Any, default: int) -> int:
+    """Return a positive int `value`, or `default` when it is None; raise ValueError otherwise."""
+    if value is None:
+        return default
+    # bool is a subclass of int, so without this check JSON `true` would pass as 1.
+    if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
+        raise ValueError(f"`{name}` must be a positive integer.")
+    return value
+
+
+def _require_str(name: str, value: Any, required: bool = False) -> str | None:
+    """Return the stripped string, or None when `value` is None and not `required`.
+
+    Raises ValueError for a missing required value, a non-string, or a blank string.
+    """
+    if value is None:
+        if required:
+            raise ValueError(f"`{name}` is required.")
+        return None
+    if not isinstance(value, str) or not value.strip():
+        raise ValueError(f"`{name}` must be a non-empty string.")
+    return value.strip()
+
+
+def _service_name(base_url: str) -> str:
+    """Return the host part of `base_url` with dots replaced by hyphens."""
+    host = base_url.split("://", 1)[-1].split("/", 1)[0]
+    return host.replace(".", "-")
+
+
+def _build_url(base_url: str, path: str) -> str:
+    """Join `base_url` and `path` with one slash; return an absolute http(s) `path` unchanged."""
+    if path.startswith(("http://", "https://")):
+        return path
+    return base_url.rstrip("/") + "/" + path.lstrip("/")
+
+
+def _get_by_path(value: Any, path: str) -> Any:
+    """Walk the dot-separated `path` through nested dicts and lists.
+
+    Raises ValueError when a list segment is not a decimal index, an index is out
+    of range, a dict key is missing, or the current value is neither dict nor list.
+    """
+    current = value
+    for part in path.split("."):
+        if isinstance(current, list):
+            if not part.isdigit():
+                raise ValueError(f"`record_path` segment {part!r} must be a list index.")
+            index = int(part)
+            if index >= len(current):
+                raise ValueError(f"`record_path` index {index} is out of range.")
+            current = current[index]
+        elif isinstance(current, dict):
+            if part not in current:
+                raise ValueError(f"`record_path` key {part!r} was not present in the response.")
+            current = current[part]
+        else:
+            raise ValueError(f"`record_path` segment {part!r} could not be applied.")
+    return current
+
+
+def _infer_target(data: Any) -> tuple[str | None, Any]:
+    """Guess where the record list lives and return `(path_label, target)`.
+
+    A top-level list is labelled "$"; otherwise the first list under `_embedded`
+    or under a known collection key is used. Falls back to `(None, data)`.
+    """
+    if isinstance(data, list):
+        return "$", data
+    if isinstance(data, dict):
+        embedded = data.get("_embedded")
+        if isinstance(embedded, dict):
+            for key, value in embedded.items():
+                if isinstance(value, list):
+                    return f"_embedded.{key}", value
+        for key in (
+            "collection",
+            "results",
+            "structures",
+            "activities",
+            "molecules",
+            "mechanisms",
+            "records",
+            "items",
+        ):
+            value = data.get(key)
+            if isinstance(value, list):
+                return key, value
+    return None, data
+
+
+def _compact(value: Any, max_items: int, max_depth: int) -> Any:
+    """Return a size-bounded copy of `value` for display.
+
+    Strings are cut to 240 characters plus "...", containers keep at most
+    `max_items` entries with a truncation marker, and once `max_depth` is
+    exhausted nested containers collapse to "...".
+    """
+    if isinstance(value, str):
+        return value if len(value) <= 240 else value[:240] + "..."
+    if max_depth <= 0:
+        if isinstance(value, (dict, list)):
+            return "..."
+        return value
+    if isinstance(value, list):
+        compact_list = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]]
+        if len(value) > max_items:
+            compact_list.append(f"... (+{len(value) - max_items} more)")
+        return compact_list
+    if isinstance(value, dict):
+        compact_dict: dict[str, Any] = {}
+        items = list(value.items())
+        for key, item in items[:max_items]:
+            compact_dict[str(key)] = _compact(item, max_items, max_depth - 1)
+        if len(items) > max_items:
+            compact_dict["_truncated_keys"] = len(items) - max_items
+        return compact_dict
+    return value
+
+
+def _save_raw_output(
+    raw_output: str, raw_output_path: str | None, base_url: str, suffix: str
+) -> str:
+    """Write `raw_output` as UTF-8 and return the path used.
+
+    Without `raw_output_path` the file goes to `/tmp/<service>-raw.<suffix>`.
+    Parent directories are created; filesystem failures raise OSError.
+    """
+    path = Path(raw_output_path or f"/tmp/{_service_name(base_url)}-raw.{suffix}")
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(raw_output, encoding="utf-8")
+    return str(path)
+
+
+def parse_input(payload: Any) -> dict[str, Any]:
+    """Validate the request payload and return a normalized config dict.
+
+    Applies the defaults `method=GET`, `response_format=auto`, `max_items=5`,
+    `max_depth=3`, `timeout_sec=30`, and `save_raw=False`. Raises ValueError
+    on any invalid or missing field.
+    """
+    if not isinstance(payload, dict):
+        raise ValueError("Input must be one JSON object.")
+    base_url = _require_str("base_url", payload.get("base_url"), required=True)
+    path = _require_str("path", payload.get("path"), required=True)
+    method = (_require_str("method", payload.get("method")) or "GET").upper()
+    if method not in {"GET", "POST"}:
+        raise ValueError("`method` must be GET or POST.")
+    json_body = payload.get("json_body")
+    form_body = payload.get("form_body")
+    if json_body is not None and form_body is not None:
+        raise ValueError("Provide only one of `json_body` or `form_body`.")
+    response_format = (
+        _require_str("response_format", payload.get("response_format")) or "auto"
+    ).lower()
+    if response_format not in {"auto", "json", "text"}:
+        raise ValueError("`response_format` must be auto, json, or text.")
+    return {
+        "base_url": base_url,
+        "path": path,
+        "method": method,
+        "params": _require_object("params", payload.get("params")),
+        "headers": _require_object("headers", payload.get("headers")),
+        "json_body": json_body,
+        "form_body": _require_object("form_body", form_body) if form_body is not None else None,
+        "record_path": _require_str("record_path", payload.get("record_path")),
+        "response_format": response_format,
+        "max_items": _require_int("max_items", payload.get("max_items"), 5),
+        "max_depth": _require_int("max_depth", payload.get("max_depth"), 3),
+        "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30),
+        "save_raw": _require_bool("save_raw", payload.get("save_raw"), False),
+        "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")),
+    }
+
+
+def execute(payload: Any) -> dict[str, Any]:
+    """Run one request and return the compact result or an error envelope.
+
+    Raises ValueError for invalid input (from `parse_input`). Failures after
+    the request is built are returned as `invalid_response`, `network_error`,
+    or `io_error` envelopes instead of being raised.
+    """
+    if requests is None:
+        return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}")
+    config = parse_input(payload)
+    session = requests.Session()
+    session.headers.update(config["headers"])
+    url = _build_url(config["base_url"], config["path"])
+
+    request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]}
+    if config["json_body"] is not None:
+        request_kwargs["json"] = config["json_body"]
+    if config["form_body"] is not None:
+        request_kwargs["data"] = config["form_body"]
+
+    try:
+        response = session.request(config["method"], url, **request_kwargs)
+        response.raise_for_status()
+        content_type = (response.headers.get("content-type") or "").lower()
+        wants_json = config["response_format"] == "json"
+        wants_text = config["response_format"] == "text"
+        auto_json = not wants_text and (
+            "json" in content_type or response.text.lstrip().startswith(("{", "["))
+        )
+
+        if wants_json or auto_json:
+            data = response.json()
+            raw_output = json.dumps(data, indent=2)
+            raw_output_path = None
+            if config["save_raw"]:
+                raw_output_path = _save_raw_output(
+                    raw_output, config["raw_output_path"], config["base_url"], "json"
+                )
+
+            record_path = config["record_path"]
+            path_used, target = (
+                _infer_target(data)
+                if record_path is None
+                else (record_path, _get_by_path(data, record_path))
+            )
+            out = {
+                "ok": True,
+                "source": _service_name(config["base_url"]),
+                "path": config["path"],
+                "method": config["method"],
+                "status_code": response.status_code,
+                "record_path": path_used,
+                "raw_output_path": raw_output_path,
+                "warnings": [],
+            }
+            if isinstance(target, list):
+                records = target[: config["max_items"]]
+                out.update(
+                    {
+                        "record_count_returned": len(records),
+                        "record_count_available": len(target),
+                        "truncated": len(records) < len(target),
+                        "records": _compact(records, config["max_items"], config["max_depth"]),
+                    }
+                )
+            else:
+                out["summary"] = _compact(target, config["max_items"], config["max_depth"])
+                if isinstance(target, dict):
+                    out["top_keys"] = list(target)[: config["max_items"]]
+            return out
+
+        raw_output_path = None
+        if config["save_raw"]:
+            raw_output_path = _save_raw_output(
+                response.text, config["raw_output_path"], config["base_url"], "txt"
+            )
+        text_head = response.text[:800]
+        return {
+            "ok": True,
+            "source": _service_name(config["base_url"]),
+            "path": config["path"],
+            "method": config["method"],
+            "status_code": response.status_code,
+            "content_type": content_type,
+            "text_head": None if raw_output_path else text_head,
+            "text_head_truncated": False
+            if raw_output_path
+            else len(text_head) < len(response.text),
+            "raw_output_path": raw_output_path,
+            "warnings": [],
+        }
+    except ValueError as exc:
+        # Covers undecodable JSON bodies and `record_path` lookups that do not match.
+        return error("invalid_response", str(exc))
+    except requests.RequestException as exc:
+        return error("network_error", f"Request failed: {exc}")
+    except OSError as exc:
+        # Must follow RequestException (an OSError subclass); e.g. unwritable `raw_output_path`.
+        return error("io_error", f"Local I/O failed: {exc}")
+    finally:
+        session.close()
+
+
+def main() -> int:
+    """Read JSON from stdin, print the result JSON, and return the exit code.
+
+    Exit codes: 0 on success, 1 for an error envelope, 2 for unparseable or invalid input.
+    """
+    try:
+        payload = json.load(sys.stdin)
+    except Exception as exc:  # noqa: BLE001
+        sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}")))
+        return 2
+    try:
+        output = execute(payload)
+    except ValueError as exc:
+        output = error("invalid_input", str(exc))
+        code = 2
+    else:
+        code = 0 if output.get("ok") else 1
+    sys.stdout.write(json.dumps(output))
+    return code
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/plugins/life-science-research/skills/string-skill/SKILL.md b/plugins/life-science-research/skills/string-skill/SKILL.md
new file mode 100644
index 0000000..3132a92
--- /dev/null
+++ b/plugins/life-science-research/skills/string-skill/SKILL.md
@@ -0,0 +1,41 @@
+---
+name: string-skill
+description: Submit compact STRING API requests for network, interaction partner, and enrichment endpoints. Use when a user wants concise STRING summaries.
+---
+
+## Operating rules
+- Use `scripts/rest_request.py` for all STRING API calls.
+- Use `base_url=https://string-db.org/api/json`.
+- Use `method=POST` with `form_body` for STRING endpoints.
+- Include `caller_identity` in `form_body`; keep it stable within a session when possible.
+- The script accepts `max_items`; for `network` and `interaction_partners`, start with API `limit=10` and `max_items=10`.
+- For `enrichment`, summarize the top `5` to `10` rows unless the user asks for more.
+- Re-run requests in long conversations instead of relying on prior tool output.
+- Treat displayed `...` in tool previews as UI truncation, not literal request content.
+
+## Execution behavior
+- Return concise markdown summaries from the script JSON by default.
+- Return the script JSON verbatim only if the user explicitly asks for machine-readable output.
+- Prefer these paths: `network`, `interaction_partners`, and `enrichment`.
+- For long identifier lists, keep the request small and paged; if full results are needed, use `save_raw=true`.
+
+## Input
+- Read one JSON object from stdin.
+- Required fields: `base_url`, `path`
+- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path`
+- Common STRING patterns:
+  - `{"base_url":"https://string-db.org/api/json","path":"network","method":"POST","form_body":{"identifiers":"TP53","species":9606,"caller_identity":"chatgpt-skill","limit":10},"max_items":10}`
+  - `{"base_url":"https://string-db.org/api/json","path":"interaction_partners","method":"POST","form_body":{"identifiers":"TP53","species":9606,"caller_identity":"chatgpt-skill","limit":10},"max_items":10}`
+
+## Output
+- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records` or a compact `summary`.
+- Use `raw_output_path` when `save_raw=true`.
+- Failure returns `ok=false` with `error.code` and `error.message`.
+
+## Execution
+```bash
+echo '{"base_url":"https://string-db.org/api/json","path":"network","method":"POST","form_body":{"identifiers":"TP53","species":9606,"caller_identity":"chatgpt-skill","limit":10},"max_items":10}' | python scripts/rest_request.py
+```
+
+## References
+- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`.
diff --git a/plugins/life-science-research/skills/string-skill/agents/openai.yaml b/plugins/life-science-research/skills/string-skill/agents/openai.yaml
new file mode 100644
index 0000000..ce88f7b
--- /dev/null
+++ b/plugins/life-science-research/skills/string-skill/agents/openai.yaml
@@ -0,0 +1,3 @@
+interface:
+  display_name: "STRING"
+  short_description: "Fetch STRING network summaries"
diff --git a/plugins/life-science-research/skills/string-skill/scripts/rest_request.py b/plugins/life-science-research/skills/string-skill/scripts/rest_request.py
new file mode 100644
index 0000000..4de3cf1
--- /dev/null
+++ b/plugins/life-science-research/skills/string-skill/scripts/rest_request.py
@@ -0,0 +1,347 @@
+#!/usr/bin/env python3
+"""Generic compact REST client for ChatGPT-imported skills.
+
+Reads one JSON request object from stdin, performs a single GET or POST with
+`requests`, and writes one compact JSON summary of the response to stdout.
+"""
+
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+from typing import Any
+
+try:
+    import requests
+except ImportError as exc:  # pragma: no cover
+    requests = None
+    REQUESTS_IMPORT_ERROR = exc
+else:
+    REQUESTS_IMPORT_ERROR = None
+
+
+def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]:
+    """Build the failure envelope: `ok=False`, `error.code`, `error.message`, `warnings`."""
+    return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []}
+
+
+def _require_object(name: str, value: Any) -> dict[str, Any]:
+    """Return a dict `value`, or `{}` when it is None; raise ValueError otherwise."""
+    if value is None:
+        return {}
+    if not isinstance(value, dict):
+        raise ValueError(f"`{name}` must be an object.")
+    return value
+
+
+def _require_bool(name: str, value: Any, default: bool) -> bool:
+    """Return a bool `value`, or `default` when it is None; raise ValueError otherwise."""
+    if value is None:
+        return default
+    if not isinstance(value, bool):
+        raise ValueError(f"`{name}` must be a boolean.")
+    return value
+
+
+def _require_int(name: str, value: Any, default: int) -> int:
+    """Return a positive int `value`, or `default` when it is None; raise ValueError otherwise."""
+    if value is None:
+        return default
+    # bool is a subclass of int, so without this check JSON `true` would pass as 1.
+    if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
+        raise ValueError(f"`{name}` must be a positive integer.")
+    return value
+
+
+def _require_str(name: str, value: Any, required: bool = False) -> str | None:
+    """Return the stripped string, or None when `value` is None and not `required`.
+
+    Raises ValueError for a missing required value, a non-string, or a blank string.
+    """
+    if value is None:
+        if required:
+            raise ValueError(f"`{name}` is required.")
+        return None
+    if not isinstance(value, str) or not value.strip():
+        raise ValueError(f"`{name}` must be a non-empty string.")
+    return value.strip()
+
+
+def _service_name(base_url: str) -> str:
+    """Return the host part of `base_url` with dots replaced by hyphens."""
+    host = base_url.split("://", 1)[-1].split("/", 1)[0]
+    return host.replace(".", "-")
+
+
+def _build_url(base_url: str, path: str) -> str:
+    """Join `base_url` and `path` with one slash; return an absolute http(s) `path` unchanged."""
+    if path.startswith(("http://", "https://")):
+        return path
+    return base_url.rstrip("/") + "/" + path.lstrip("/")
+
+
+def _get_by_path(value: Any, path: str) -> Any:
+    """Walk the dot-separated `path` through nested dicts and lists.
+
+    Raises ValueError when a list segment is not a decimal index, an index is out
+    of range, a dict key is missing, or the current value is neither dict nor list.
+    """
+    current = value
+    for part in path.split("."):
+        if isinstance(current, list):
+            if not part.isdigit():
+                raise ValueError(f"`record_path` segment {part!r} must be a list index.")
+            index = int(part)
+            if index >= len(current):
+                raise ValueError(f"`record_path` index {index} is out of range.")
+            current = current[index]
+        elif isinstance(current, dict):
+            if part not in current:
+                raise ValueError(f"`record_path` key {part!r} was not present in the response.")
+            current = current[part]
+        else:
+            raise ValueError(f"`record_path` segment {part!r} could not be applied.")
+    return current
+
+
+def _infer_target(data: Any) -> tuple[str | None, Any]:
+    """Guess where the record list lives and return `(path_label, target)`.
+
+    A top-level list is labelled "$"; otherwise the first list under `_embedded`
+    or under a known collection key is used. Falls back to `(None, data)`.
+    """
+    if isinstance(data, list):
+        return "$", data
+    if isinstance(data, dict):
+        embedded = data.get("_embedded")
+        if isinstance(embedded, dict):
+            for key, value in embedded.items():
+                if isinstance(value, list):
+                    return f"_embedded.{key}", value
+        for key in (
+            "collection",
+            "results",
+            "structures",
+            "activities",
+            "molecules",
+            "mechanisms",
+            "records",
+            "items",
+        ):
+            value = data.get(key)
+            if isinstance(value, list):
+                return key, value
+    return None, data
+
+
+def _compact(value: Any, max_items: int, max_depth: int) -> Any:
+    """Return a size-bounded copy of `value` for display.
+
+    Strings are cut to 240 characters plus "...", containers keep at most
+    `max_items` entries with a truncation marker, and once `max_depth` is
+    exhausted nested containers collapse to "...".
+    """
+    if isinstance(value, str):
+        return value if len(value) <= 240 else value[:240] + "..."
+    if max_depth <= 0:
+        if isinstance(value, (dict, list)):
+            return "..."
+        return value
+    if isinstance(value, list):
+        compact_list = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]]
+        if len(value) > max_items:
+            compact_list.append(f"... (+{len(value) - max_items} more)")
+        return compact_list
+    if isinstance(value, dict):
+        compact_dict: dict[str, Any] = {}
+        items = list(value.items())
+        for key, item in items[:max_items]:
+            compact_dict[str(key)] = _compact(item, max_items, max_depth - 1)
+        if len(items) > max_items:
+            compact_dict["_truncated_keys"] = len(items) - max_items
+        return compact_dict
+    return value
+
+
+def _save_raw_output(
+    raw_output: str, raw_output_path: str | None, base_url: str, suffix: str
+) -> str:
+    """Write `raw_output` as UTF-8 and return the path used.
+
+    Without `raw_output_path` the file goes to `/tmp/<service>-raw.<suffix>`.
+    Parent directories are created; filesystem failures raise OSError.
+    """
+    path = Path(raw_output_path or f"/tmp/{_service_name(base_url)}-raw.{suffix}")
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(raw_output, encoding="utf-8")
+    return str(path)
+
+
+def parse_input(payload: Any) -> dict[str, Any]:
+    """Validate the request payload and return a normalized config dict.
+
+    Applies the defaults `method=GET`, `response_format=auto`, `max_items=5`,
+    `max_depth=3`, `timeout_sec=30`, and `save_raw=False`. Raises ValueError
+    on any invalid or missing field.
+    """
+    if not isinstance(payload, dict):
+        raise ValueError("Input must be one JSON object.")
+    base_url = _require_str("base_url", payload.get("base_url"), required=True)
+    path = _require_str("path", payload.get("path"), required=True)
+    method = (_require_str("method", payload.get("method")) or "GET").upper()
+    if method not in {"GET", "POST"}:
+        raise ValueError("`method` must be GET or POST.")
+    json_body = payload.get("json_body")
+    form_body = payload.get("form_body")
+    if json_body is not None and form_body is not None:
+        raise ValueError("Provide only one of `json_body` or `form_body`.")
+    response_format = (
+        _require_str("response_format", payload.get("response_format")) or "auto"
+    ).lower()
+    if response_format not in {"auto", "json", "text"}:
+        raise ValueError("`response_format` must be auto, json, or text.")
+    return {
+        "base_url": base_url,
+        "path": path,
+        "method": method,
+        "params": _require_object("params", payload.get("params")),
+        "headers": _require_object("headers", payload.get("headers")),
+        "json_body": json_body,
+        "form_body": _require_object("form_body", form_body) if form_body is not None else None,
+        "record_path": _require_str("record_path", payload.get("record_path")),
+        "response_format": response_format,
+        "max_items": _require_int("max_items", payload.get("max_items"), 5),
+        "max_depth": _require_int("max_depth", payload.get("max_depth"), 3),
+        "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30),
+        "save_raw": _require_bool("save_raw", payload.get("save_raw"), False),
+        "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")),
+    }
+
+
+def execute(payload: Any) -> dict[str, Any]:
+    """Run one request and return the compact result or an error envelope.
+
+    Raises ValueError for invalid input (from `parse_input`). Failures after
+    the request is built are returned as `invalid_response`, `network_error`,
+    or `io_error` envelopes instead of being raised.
+    """
+    if requests is None:
+        return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}")
+    config = parse_input(payload)
+    session = requests.Session()
+    session.headers.update(config["headers"])
+    url = _build_url(config["base_url"], config["path"])
+
+    request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]}
+    if config["json_body"] is not None:
+        request_kwargs["json"] = config["json_body"]
+    if config["form_body"] is not None:
+        request_kwargs["data"] = config["form_body"]
+
+    try:
+        response = session.request(config["method"], url, **request_kwargs)
+        response.raise_for_status()
+        content_type = (response.headers.get("content-type") or "").lower()
+        wants_json = config["response_format"] == "json"
+        wants_text = config["response_format"] == "text"
+        auto_json = not wants_text and (
+            "json" in content_type or response.text.lstrip().startswith(("{", "["))
+        )
+
+        if wants_json or auto_json:
+            data = response.json()
+            raw_output = json.dumps(data, indent=2)
+            raw_output_path = None
+            if config["save_raw"]:
+                raw_output_path = _save_raw_output(
+                    raw_output, config["raw_output_path"], config["base_url"], "json"
+                )
+
+            record_path = config["record_path"]
+            path_used, target = (
+                _infer_target(data)
+                if record_path is None
+                else (record_path, _get_by_path(data, record_path))
+            )
+            out = {
+                "ok": True,
+                "source": _service_name(config["base_url"]),
+                "path": config["path"],
+                "method": config["method"],
+                "status_code": response.status_code,
+                "record_path": path_used,
+                "raw_output_path": raw_output_path,
+                "warnings": [],
+            }
+            if isinstance(target, list):
+                records = target[: config["max_items"]]
+                out.update(
+                    {
+                        "record_count_returned": len(records),
+                        "record_count_available": len(target),
+                        "truncated": len(records) < len(target),
+                        "records": _compact(records, config["max_items"], config["max_depth"]),
+                    }
+                )
+            else:
+                out["summary"] = _compact(target, config["max_items"], config["max_depth"])
+                if isinstance(target, dict):
+                    out["top_keys"] = list(target)[: config["max_items"]]
+            return out
+
+        raw_output_path = None
+        if config["save_raw"]:
+            raw_output_path = _save_raw_output(
+                response.text, config["raw_output_path"], config["base_url"], "txt"
+            )
+        text_head = response.text[:800]
+        return {
+            "ok": True,
+            "source": _service_name(config["base_url"]),
+            "path": config["path"],
+            "method": config["method"],
+            "status_code": response.status_code,
+            "content_type": content_type,
+            "text_head": None if raw_output_path else text_head,
+            "text_head_truncated": False
+            if raw_output_path
+            else len(text_head) < len(response.text),
+            "raw_output_path": raw_output_path,
+            "warnings": [],
+        }
+    except ValueError as exc:
+        # Covers undecodable JSON bodies and `record_path` lookups that do not match.
+        return error("invalid_response", str(exc))
+    except requests.RequestException as exc:
+        return error("network_error", f"Request failed: {exc}")
+    except OSError as exc:
+        # Must follow RequestException (an OSError subclass); e.g. unwritable `raw_output_path`.
+        return error("io_error", f"Local I/O failed: {exc}")
+    finally:
+        session.close()
+
+
+def main() -> int:
+    """Read JSON from stdin, print the result JSON, and return the exit code.
+
+    Exit codes: 0 on success, 1 for an error envelope, 2 for unparseable or invalid input.
+    """
+    try:
+        payload = json.load(sys.stdin)
+    except Exception as exc:  # noqa: BLE001
+        sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}")))
+        return 2
+    try:
+        output = execute(payload)
+    except ValueError as exc:
+        output = error("invalid_input", str(exc))
+        code = 2
+    else:
+        code = 0 if output.get("ok") else 1
+    sys.stdout.write(json.dumps(output))
+    return code
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/plugins/life-science-research/skills/tpmi-phewas-skill/SKILL.md b/plugins/life-science-research/skills/tpmi-phewas-skill/SKILL.md new file mode 100644 index 0000000..fc74a11 --- /dev/null +++ b/plugins/life-science-research/skills/tpmi-phewas-skill/SKILL.md @@ -0,0 +1,41 @@ +--- +name: tpmi-phewas-skill +description: Fetch compact TPMI PheWAS summaries for single variants by accepting rsID, GRCh37, or GRCh38 input and resolving to the required GRCh38 query. Use when a user wants concise TPMI association results for one variant +--- + +## Operating rules +- Use `scripts/tpmi_phewas.py` for all TPMI PheWAS lookups. +- Accept exactly one of `rsid`, `grch37`, `grch38`, or `variant`; resolve to the canonical GRCh38 `chr:pos-ref-alt` query before calling TPMI. +- The script accepts `max_results`; start with `max_results=10` and only increase it if the first slice is insufficient. +- Re-run the lookup in long conversations instead of relying on older tool output. +- Treat displayed `...` in tool previews as UI truncation, not literal request content. +- If the user needs the full association payload, set `save_raw=true` and report `raw_output_path` instead of pasting large arrays into chat. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Return the JSON verbatim only if the user explicitly asks for machine-readable output. +- Surface the canonical queried variant, total association count, and whether the results were truncated. +- Increase `max_results` gradually instead of asking for large association dumps in one call. + +## Input +- Read one JSON object from stdin, or a single JSON string containing the variant. 
+- Required input: exactly one of `rsid`, `grch37`, `grch38`, or `variant` +- Optional fields: `max_results`, `save_raw`, `raw_output_path`, `timeout_sec` +- Common patterns: + - `{"grch38":"6:160540105-T-C","max_results":10}` + - `{"grch37":"6:162447146-T-C","max_results":10}` + - `{"rsid":"rs9273363","max_results":10}` + - `{"variant":"6:160540105:T:C","max_results":25,"save_raw":true}` + +## Output +- Success returns `ok`, `source`, `input`, `query_variant`, `max_results_applied`, `association_count`, `association_count_total`, `truncated`, `associations`, `variant`, `variant_url`, `raw_output_path`, and `warnings`. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"grch38":"6:160540105-T-C","max_results":10}' | python scripts/tpmi_phewas.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file, `scripts/tpmi_phewas.py`, and the `scripts/variant_resolution.py` helper it imports. diff --git a/plugins/life-science-research/skills/tpmi-phewas-skill/agents/openai.yaml b/plugins/life-science-research/skills/tpmi-phewas-skill/agents/openai.yaml new file mode 100644 index 0000000..6f7cb9f --- /dev/null +++ b/plugins/life-science-research/skills/tpmi-phewas-skill/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "TPMI PheWAS" + short_description: "Fetch TPMI PheWAS associations" diff --git a/plugins/life-science-research/skills/tpmi-phewas-skill/scripts/tpmi_phewas.py b/plugins/life-science-research/skills/tpmi-phewas-skill/scripts/tpmi_phewas.py new file mode 100644 index 0000000..1a62a78 --- /dev/null +++ b/plugins/life-science-research/skills/tpmi-phewas-skill/scripts/tpmi_phewas.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python3 +"""tpmi-phewas + +Fetch TPMI PheWAS associations for one variant input.
def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]:
    """Build the failure envelope written to stdout when a lookup cannot complete.

    Args:
        code: Short machine-readable error identifier.
        message: Human-readable explanation.
        warnings: Optional warnings to carry along; a falsy value (``None`` or
            an empty list) is replaced by a new empty list.

    Returns:
        A dict with ``ok`` set to False, an ``error`` object holding ``code``
        and ``message``, and a ``warnings`` list.
    """
    failure: dict[str, Any] = {"ok": False}
    failure["error"] = {"code": code, "message": message}
    failure["warnings"] = warnings if warnings else []
    return failure
def main() -> int:
    """Read one JSON request from stdin, query TPMI PheWAS, and print one JSON result.

    Steps: parse stdin, validate the payload, resolve the input to a GRCh38
    ``chr:pos-ref-alt`` query, fetch the variant from TPMI, optionally save the
    raw payload, then emit a compact summary truncated to ``max_results``.

    Returns:
        Process exit code: 0 on success (including "variant not found"),
        1 for resolution, network, response, or file-write failures, and
        2 for unparseable or invalid input.
    """
    warnings: list[str] = []

    def emit(result: dict[str, Any], exit_code: int) -> int:
        # Every exit path writes exactly one JSON document to stdout.
        sys.stdout.write(json.dumps(result))
        return exit_code

    try:
        payload = json.load(sys.stdin)
    except Exception as exc:  # noqa: BLE001
        return emit(error("invalid_json", f"Could not parse JSON input: {exc}"), 2)

    try:
        input_type, input_variant, max_results, save_raw, raw_output_path, timeout_sec = (
            parse_input(payload)
        )
    except ValueError as exc:
        return emit(error("invalid_input", str(exc)), 2)

    try:
        resolution = resolve_query_variant(
            input_type=input_type,
            input_value=input_variant,
            target_build="GRCh38",
        )
        parsed = dict(resolution["query_variant"])
        warnings.extend(resolution["warnings"])
    except VariantResolutionError as exc:
        return emit(error(exc.code, exc.message, exc.warnings), 1)
    except requests.RequestException as exc:
        return emit(error("network_error", f"Variant resolution failed: {exc}"), 1)
    except ValueError as exc:
        # The variant string is only parsed during resolution, so a malformed
        # variant or an rsID without the "rs" prefix surfaces here. Report it as
        # invalid input rather than letting it escape as a traceback.
        return emit(error("invalid_input", str(exc)), 2)

    try:
        # The context manager closes the session's pooled connections.
        with requests.Session() as session:
            data, status_code = fetch_tpmi_variant(session, parsed["canonical"], timeout_sec)
    except ValueError as exc:
        # Must precede RequestException: recent `requests` releases raise a JSON
        # decode error that subclasses both, and a non-JSON body is a response
        # problem, not a network problem.
        return emit(error("invalid_response", f"TPMI returned non-JSON: {exc}"), 1)
    except requests.RequestException as exc:
        return emit(error("network_error", f"TPMI request failed: {exc}"), 1)

    variant_url = f"{TPMI_BASE}/variant/{parsed['canonical']}"
    saved_raw_output_path: str | None = None
    if save_raw and data is not None:
        raw_path = resolve_raw_output_path(parsed["canonical"], raw_output_path)
        try:
            write_raw_json(raw_path, data)
        except OSError as exc:
            return emit(error("write_error", f"Could not write raw output: {exc}"), 1)
        saved_raw_output_path = str(raw_path)

    associations: list[Any] = []
    total = 0
    variant_info = None
    if status_code == 404 or data is None:
        warnings.append("Variant not found in TPMI PheWAS API.")
    else:
        all_associations = extract_associations(data)
        total = len(all_associations)
        associations = all_associations[:max_results]
        if isinstance(data, dict):
            variant_info = {
                "chrom": data.get("chrom"),
                "pos": data.get("pos"),
                "ref": data.get("ref"),
                "alt": data.get("alt"),
                "rsids": data.get("rsids"),
                "variant_name": data.get("variant_name"),
                "nearest_genes": data.get("nearest_genes"),
            }

    output = {
        "ok": True,
        "source": "tpmi",
        "input": resolution["input"],
        "query_variant": parsed,
        "max_results_applied": max_results,
        "association_count": len(associations),
        "association_count_total": total,
        "truncated": len(associations) < total,
        "associations": associations,
        "variant": variant_info,
        "variant_url": variant_url,
        # Stays None when nothing was saved, including the not-found case.
        "raw_output_path": saved_raw_output_path,
        "warnings": warnings,
    }
    return emit(output, 0)
def extract_variant_input(payload: Any, *, default_build_key: str) -> tuple[str, str]:
    """Pick the single variant input out of a request payload.

    Args:
        payload: Either a bare variant string or a dict that carries exactly
            one non-empty string under ``rsid``, ``grch37``, ``grch38``, or
            ``variant``. Values that are missing, not strings, or blank are
            treated as not provided.
        default_build_key: Input type assigned to a bare string and to the
            build-agnostic ``variant`` key.

    Returns:
        ``(input_type, input_value)`` with the value stripped of surrounding
        whitespace; ``input_type`` is ``rsid``, ``grch37``, or ``grch38``
        (or ``default_build_key`` for bare strings and ``variant``).

    Raises:
        ValueError: If the payload is neither a string nor a dict, or if it
            does not contain exactly one variant input.
    """
    if isinstance(payload, str):
        return default_build_key, payload.strip()

    if not isinstance(payload, dict):
        raise ValueError("Input must be a JSON string or object.")

    provided: list[tuple[str, str]] = []
    for key in ("rsid", "grch37", "grch38", "variant"):
        value = payload.get(key)
        if isinstance(value, str) and value.strip():
            provided.append((key, value.strip()))

    if not provided:
        # Name the actual accepted keys. Interpolating the default build key
        # here produced a message that listed the same build twice and never
        # mentioned `variant`.
        raise ValueError("Provide exactly one of `rsid`, `grch37`, `grch38`, or `variant`.")
    if len(provided) != 1:
        raise ValueError(
            "Provide exactly one variant input: `rsid`, `grch37`, `grch38`, or `variant`."
        )

    input_type, input_value = provided[0]
    if input_type == "variant":
        input_type = default_build_key
    return input_type, input_value
def resolve_rsid_both_builds(rsid: str) -> dict[str, Any]:
    """Look up ``rsid`` on GRCh38 and GRCh37 and merge the two answers.

    Each build is queried independently through ``lookup_rsid``. A failing
    lookup never aborts the call; it is recorded as a warning and that build's
    coordinates are reported as ``None``.

    Returns:
        A dict with ``rsid``, per-build ``{"chr", "pos"}`` objects under
        ``grch38`` and ``grch37``, the preferred ``ref`` and ``alts`` (GRCh38
        first, GRCh37 as fallback), and the collected ``warnings``.
    """
    notes: list[str] = []
    found: dict[str, Any] = {}
    for build in ("GRCh38", "GRCh37"):
        try:
            found[build] = lookup_rsid(rsid, build)
        except Exception as exc:  # noqa: BLE001
            # Best-effort by design: keep going with whatever the other build gives.
            found[build] = None
            notes.append(f"{build} lookup failed: {type(exc).__name__}: {exc}")

    newer = found["GRCh38"]
    older = found["GRCh37"]

    # Alleles come from GRCh38 when it has them, otherwise from GRCh37.
    ref = newer.ref if newer else None
    if not ref:
        ref = older.ref if older else None

    alts = newer.alts if (newer and newer.alts) else []
    if not alts:
        alts = older.alts if older else []

    def position_of(coord: Any) -> dict[str, Any]:
        if coord:
            return {"chr": coord.chr, "pos": coord.pos}
        return {"chr": None, "pos": None}

    return {
        "rsid": rsid,
        "grch38": position_of(newer),
        "grch37": position_of(older),
        "ref": ref,
        "alts": alts,
        "warnings": notes,
    }
def _record_for_build(
    resolved: dict[str, Any],
    build_key: str,
    ref: str | None,
    alt: str | None,
) -> dict[str, Any] | None:
    """Return the variant record for ``build_key``, or None if its coordinates are missing."""
    coords = resolved.get(build_key) or {}
    if coords.get("chr") and coords.get("pos"):
        return build_variant_record(coords["chr"], coords["pos"], ref, alt)
    return None


def resolve_variant(input_type: str, input_value: str) -> dict[str, Any]:
    """Resolve an rsID or a build-specific variant string to records on both builds.

    Args:
        input_type: ``"rsid"``, ``"grch37"``, or ``"grch38"``.
        input_value: The rsID, or a ``chrom-pos-ref-alt`` string on that build.

    Returns:
        A dict with ``input`` (echo of type and value), ``rsid``, ``grch37``
        and ``grch38`` (variant records, or ``None`` when that build could not
        be located), and ``warnings``.

    Raises:
        ValueError: If an rsID does not start with ``rs``, the input type is
            unsupported, or the variant string cannot be parsed.
        VariantResolutionError: With code ``not_found`` when no rsID overlaps
            the given position.
    """
    warnings: list[str] = []

    if input_type == "rsid":
        rsid = input_value.strip()
        if not rsid.startswith("rs"):
            raise ValueError("rsid must start with 'rs'.")

        resolved = resolve_rsid_both_builds(rsid)
        warnings.extend(resolved.get("warnings", []))

        ref = resolved.get("ref")
        alts = resolved.get("alts") or []
        alt = alts[0] if alts else None
        if len(alts) > 1:
            # An rsID alone cannot say which alt the caller means. Say so
            # rather than silently querying an arbitrary one.
            warnings.append(f"rsID {rsid} has multiple alt alleles {alts}; using {alt}.")

        return {
            "input": {"type": "rsid", "value": rsid},
            "rsid": rsid,
            "grch37": _record_for_build(resolved, "grch37", ref, alt),
            "grch38": _record_for_build(resolved, "grch38", ref, alt),
            "warnings": warnings,
        }

    if input_type not in {"grch37", "grch38"}:
        raise ValueError(f"Unsupported input type: {input_type!r}")

    build = "GRCh37" if input_type == "grch37" else "GRCh38"
    chrom, pos, ref_in, alt_in = parse_variant_string(input_value)

    resolved = resolve_position_both_builds(chrom, pos, build)
    if not resolved:
        raise VariantResolutionError(
            "not_found",
            f"No rsID found at {chrom}:{pos} on {build} via Ensembl overlap endpoint.",
        )

    warnings.extend(resolved.get("warnings", []))

    resolved_ref = resolved.get("ref")
    alts = resolved.get("alts") or []
    if resolved_ref and ref_in != resolved_ref:
        warnings.append(f"Input ref {ref_in} != resolved ref {resolved_ref}; keeping resolved ref.")
    # Mirror the alt handling below: when the lookup returned no alleles, fall
    # back to what the caller supplied instead of producing a record that has
    # no canonical form.
    ref = resolved_ref or ref_in

    alt = alt_in if alt_in in alts else (alts[0] if alts else alt_in)
    if alts and alt_in not in alts:
        warnings.append(f"Input alt {alt_in} not among resolved alts {alts}; using {alt}.")

    return {
        "input": {"type": input_type, "value": input_value},
        "rsid": resolved.get("rsid"),
        "grch37": _record_for_build(resolved, "grch37", ref, alt),
        "grch38": _record_for_build(resolved, "grch38", ref, alt),
        "warnings": warnings,
    }
a/plugins/life-science-research/skills/ukb-topmed-phewas-skill/SKILL.md b/plugins/life-science-research/skills/ukb-topmed-phewas-skill/SKILL.md new file mode 100644 index 0000000..9ebbc5d --- /dev/null +++ b/plugins/life-science-research/skills/ukb-topmed-phewas-skill/SKILL.md @@ -0,0 +1,41 @@ +--- +name: ukb-topmed-phewas-skill +description: Fetch compact UKB-TOPMed PheWAS summaries for single variants by accepting rsID, GRCh37, or GRCh38 input and resolving to the required GRCh38 query. Use when a user wants concise UKB-TOPMed association results for one variant +--- + +## Operating rules +- Use `scripts/ukb_topmed_phewas.py` for all UKB-TOPMed PheWAS lookups. +- Accept exactly one of `rsid`, `grch37`, `grch38`, or `variant`; resolve to the canonical GRCh38 `chr:pos-ref-alt` query before calling UKB-TOPMed. +- The script accepts `max_results`; start with `max_results=10` and only increase it if the first slice is insufficient. +- Re-run the lookup in long conversations instead of relying on older tool output. +- Treat displayed `...` in tool previews as UI truncation, not literal request content. +- If the user needs the full association payload, set `save_raw=true` and report `raw_output_path` instead of pasting large arrays into chat. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Return the JSON verbatim only if the user explicitly asks for machine-readable output. +- Surface the canonical queried variant, total association count, and whether the results were truncated. +- Increase `max_results` gradually instead of asking for large association dumps in one call. + +## Input +- Read one JSON object from stdin, or a single JSON string containing the variant. 
+- Required input: exactly one of `rsid`, `grch37`, `grch38`, or `variant` +- Optional fields: `max_results`, `save_raw`, `raw_output_path`, `timeout_sec` +- Common patterns: + - `{"grch38":"10:112998590-C-T","max_results":10}` + - `{"grch37":"10:114758349-C-T","max_results":10}` + - `{"rsid":"rs7903146","max_results":10}` + - `{"variant":"10:112998590:C:T","max_results":25,"save_raw":true}` + +## Output +- Success returns `ok`, `source`, `input`, `query_variant`, `max_results_applied`, `association_count`, `association_count_total`, `truncated`, `associations`, `variant`, `variant_url`, `raw_output_path`, and `warnings`. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"grch38":"10:112998590-C-T","max_results":10}' | python scripts/ukb_topmed_phewas.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file, `scripts/ukb_topmed_phewas.py`, and the `scripts/variant_resolution.py` helper it imports. diff --git a/plugins/life-science-research/skills/ukb-topmed-phewas-skill/agents/openai.yaml b/plugins/life-science-research/skills/ukb-topmed-phewas-skill/agents/openai.yaml new file mode 100644 index 0000000..d0e5704 --- /dev/null +++ b/plugins/life-science-research/skills/ukb-topmed-phewas-skill/agents/openai.yaml @@ -0,0 +1,3 @@ +interface: + display_name: "UKB-TOPMed PheWAS" + short_description: "Fetch UKB-TOPMed PheWAS associations" diff --git a/plugins/life-science-research/skills/ukb-topmed-phewas-skill/scripts/ukb_topmed_phewas.py b/plugins/life-science-research/skills/ukb-topmed-phewas-skill/scripts/ukb_topmed_phewas.py new file mode 100644 index 0000000..c6a1d9f --- /dev/null +++ b/plugins/life-science-research/skills/ukb-topmed-phewas-skill/scripts/ukb_topmed_phewas.py @@ -0,0 +1,228 @@ +#!/usr/bin/env python3 +"""ukb-topmed-phewas + +Fetch UKB-TOPMed PheWAS associations for one variant input.
def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]:
    """Assemble the JSON failure envelope for a UKB-TOPMed lookup.

    Args:
        code: Short machine-readable error identifier.
        message: Human-readable explanation.
        warnings: Optional warnings to attach; ``None`` or an empty list
            results in a new empty list.

    Returns:
        A dict with ``ok`` set to False, an ``error`` object holding ``code``
        and ``message``, and a ``warnings`` list.
    """
    details = {"code": code, "message": message}
    attached = warnings if warnings else []
    return {"ok": False, "error": details, "warnings": attached}
def main() -> int:
    """Read one JSON request from stdin, query UKB-TOPMed PheWAS, and print one JSON result.

    Steps: parse stdin, validate the payload, resolve the input to a GRCh38
    ``chr:pos-ref-alt`` query, fetch the variant from UKB-TOPMed, optionally
    save the raw payload, then emit a compact summary truncated to
    ``max_results``.

    Returns:
        Process exit code: 0 on success (including an HTTP 404 "not found"),
        1 for resolution, network, response, or file-write failures, and
        2 for unparseable or invalid input.
    """
    warnings: list[str] = []

    def emit(result: dict[str, Any], exit_code: int) -> int:
        # Every exit path writes exactly one JSON document to stdout.
        sys.stdout.write(json.dumps(result))
        return exit_code

    try:
        payload = json.load(sys.stdin)
    except Exception as exc:  # noqa: BLE001
        return emit(error("invalid_json", f"Could not parse JSON input: {exc}"), 2)

    try:
        input_type, input_variant, max_results, save_raw, raw_output_path, timeout_sec = (
            parse_input(payload)
        )
    except ValueError as exc:
        return emit(error("invalid_input", str(exc)), 2)

    try:
        resolution = resolve_query_variant(
            input_type=input_type,
            input_value=input_variant,
            target_build="GRCh38",
        )
        parsed = dict(resolution["query_variant"])
        warnings.extend(resolution["warnings"])
    except VariantResolutionError as exc:
        return emit(error(exc.code, exc.message, exc.warnings), 1)
    except requests.RequestException as exc:
        return emit(error("network_error", f"Variant resolution failed: {exc}"), 1)
    except ValueError as exc:
        # The variant string is only parsed during resolution, so a malformed
        # variant or an rsID without the "rs" prefix surfaces here. Report it as
        # invalid input rather than letting it escape as a traceback.
        return emit(error("invalid_input", str(exc)), 2)

    try:
        # The context manager closes the session's pooled connections.
        with requests.Session() as session:
            data, status_code = fetch_variant(session, parsed["canonical"], timeout_sec)
    except ValueError as exc:
        # Must precede RequestException: recent `requests` releases raise a JSON
        # decode error that subclasses both, and a non-JSON body is a response
        # problem, not a network problem.
        return emit(error("invalid_response", f"UKB-TOPMed returned non-JSON: {exc}"), 1)
    except requests.RequestException as exc:
        return emit(error("network_error", f"UKB-TOPMed request failed: {exc}"), 1)

    variant_url = f"{UKB_TOPMED_BASE}/variant/{parsed['canonical']}"
    saved_raw_output_path: str | None = None
    if save_raw and data is not None:
        raw_path = resolve_raw_output_path(parsed["canonical"], raw_output_path)
        try:
            write_raw_json(raw_path, data)
        except OSError as exc:
            return emit(error("write_error", f"Could not write raw output: {exc}"), 1)
        saved_raw_output_path = str(raw_path)

    associations: list[Any] = []
    total = 0
    variant_info = None
    if status_code == 404:
        warnings.append("Variant not found in UKB-TOPMed PheWAS API (HTTP 404).")
    else:
        all_associations = extract_associations(data)
        total = len(all_associations)
        associations = all_associations[:max_results]
        if isinstance(data, dict):
            variant_info = {
                "chrom": data.get("chrom"),
                "pos": data.get("pos"),
                "ref": data.get("ref"),
                "alt": data.get("alt"),
                "rsids": data.get("rsids"),
                "variant_name": data.get("variant_name"),
                "nearest_genes": data.get("nearest_genes"),
            }

    output = {
        "ok": True,
        "source": "ukb-topmed",
        "input": resolution["input"],
        "query_variant": parsed,
        "max_results_applied": max_results,
        "association_count": len(associations),
        "association_count_total": total,
        "truncated": len(associations) < total,
        "associations": associations,
        "variant": variant_info,
        "variant_url": variant_url,
        # Stays None when nothing was saved, including the 404 case.
        "raw_output_path": saved_raw_output_path,
        "warnings": warnings,
    }
    return emit(output, 0)
"grch38" + + +def build_variant_record( + chrom: str, + pos: int, + ref: str | None, + alt: str | None, +) -> dict[str, Any]: + record: dict[str, Any] = { + "chr": chrom, + "pos": pos, + "ref": ref, + "alt": alt, + } + if ref is not None and alt is not None: + record["canonical"] = f"{chrom}:{pos}-{ref}-{alt}" + return record + + +def parse_variant_string(value: str) -> tuple[str, int, str, str]: + raw = value.strip() + if not raw: + raise ValueError("Variant string is empty.") + + parts = [part for part in SEP_RE.split(raw) if part] + if len(parts) != 4: + raise ValueError( + "Invalid variant format. Expected chrom-pos-ref-alt with flexible separators." + ) + + chrom_raw, pos_raw, ref_raw, alt_raw = parts + match = CHR_RE.match(chrom_raw) + if not match: + raise ValueError(f"Invalid chromosome: {chrom_raw!r}") + + chrom = match.group(1).upper() + if chrom == "M": + chrom = "MT" + + try: + pos = int(pos_raw) + except ValueError as exc: + raise ValueError(f"Invalid position: {pos_raw!r}") from exc + if pos <= 0: + raise ValueError("Position must be > 0.") + + ref = ref_raw.upper() + alt = alt_raw.upper() + if not ALLELE_RE.match(ref): + raise ValueError(f"Invalid REF allele: {ref_raw!r}") + if not ALLELE_RE.match(alt): + raise ValueError(f"Invalid ALT allele: {alt_raw!r}") + + return chrom, pos, ref, alt + + +def extract_variant_input(payload: Any, *, default_build_key: str) -> tuple[str, str]: + if isinstance(payload, str): + return default_build_key, payload.strip() + + if not isinstance(payload, dict): + raise ValueError("Input must be a JSON string or object.") + + provided: list[tuple[str, str]] = [] + for key in ("rsid", "grch37", "grch38", "variant"): + value = payload.get(key) + if isinstance(value, str) and value.strip(): + provided.append((key, value.strip())) + + if not provided: + raise ValueError( + f"Provide exactly one of `rsid`, `grch37`, `grch38`, or `{default_build_key}`." 
+ ) + if len(provided) != 1: + raise ValueError( + "Provide exactly one variant input: `rsid`, `grch37`, `grch38`, or `variant`." + ) + + input_type, input_value = provided[0] + if input_type == "variant": + input_type = default_build_key + return input_type, input_value + + +def _server_for(build: str) -> str: + return ENSEMBL_GRCH37 if build in {"GRCh37", "hg19"} else ENSEMBL_GRCH38 + + +def _assembly_cmp(build: str) -> str: + return "GRCh37" if build in {"GRCh37", "hg19"} else "GRCh38" + + +def _get_json(url: str, *, timeout: int = DEFAULT_TIMEOUT_S) -> Any: + headers = { + "Accept": "application/json", + "User-Agent": USER_AGENT, + } + response = requests.get(url, headers=headers, timeout=timeout) + response.raise_for_status() + return response.json() + + +def lookup_rsid(rsid: str, build: str = "GRCh38") -> Coord | None: + server = _server_for(build) + asm = _assembly_cmp(build) + url = ( + f"{server}/variation/human/{requests.utils.quote(rsid, safe='')}" + "?content-type=application/json" + ) + + data = _get_json(url, timeout=DEFAULT_TIMEOUT_S) + mappings = data.get("mappings") if isinstance(data, dict) else None + if not mappings: + return None + + for mapping in mappings: + if ( + isinstance(mapping, dict) + and mapping.get("assembly_name") == asm + and mapping.get("seq_region_name") + and mapping.get("start") is not None + ): + allele_string = mapping.get("allele_string") + alleles = allele_string.split("/") if isinstance(allele_string, str) else [] + ref = alleles[0] if alleles else None + alts = alleles[1:] if len(alleles) > 1 else [] + return Coord( + chr=str(mapping["seq_region_name"]), + pos=int(mapping["start"]), + ref=ref, + alts=alts, + ) + + return None + + +def lookup_position( + chrom: str, + pos: int, + build: str = "GRCh38", +) -> tuple[str, str | None, list[str]] | None: + server = _server_for(build) + url = ( + f"{server}/overlap/region/human/{chrom}:{pos}-{pos}" + "?feature=variation;content-type=application/json" + ) + + data = 
_get_json(url, timeout=DEFAULT_TIMEOUT_S) + if not isinstance(data, list) or not data: + return None + + for variant in data: + if ( + isinstance(variant, dict) + and isinstance(variant.get("id"), str) + and variant["id"].startswith("rs") + ): + alleles = variant.get("alleles") if isinstance(variant.get("alleles"), list) else [] + ref = alleles[0] if alleles else None + alts = alleles[1:] if len(alleles) > 1 else [] + return variant["id"], ref, alts + + return None + + +def resolve_rsid_both_builds(rsid: str) -> dict[str, Any]: + g38 = None + g37 = None + warnings: list[str] = [] + + try: + g38 = lookup_rsid(rsid, "GRCh38") + except Exception as exc: # noqa: BLE001 + warnings.append(f"GRCh38 lookup failed: {type(exc).__name__}: {exc}") + + try: + g37 = lookup_rsid(rsid, "GRCh37") + except Exception as exc: # noqa: BLE001 + warnings.append(f"GRCh37 lookup failed: {type(exc).__name__}: {exc}") + + ref = (g38.ref if g38 else None) or (g37.ref if g37 else None) + alts = (g38.alts if (g38 and g38.alts) else []) or (g37.alts if g37 else []) + + return { + "rsid": rsid, + "grch38": {"chr": g38.chr if g38 else None, "pos": g38.pos if g38 else None}, + "grch37": {"chr": g37.chr if g37 else None, "pos": g37.pos if g37 else None}, + "ref": ref, + "alts": alts, + "warnings": warnings, + } + + +def resolve_position_both_builds(chrom: str, pos: int, build: str) -> dict[str, Any] | None: + is_hg19 = build in {"hg19", "GRCh37"} + other_build = "GRCh38" if is_hg19 else "GRCh37" + + pos_result = lookup_position(chrom, pos, build) + if not pos_result: + return None + + rsid, ref, alts = pos_result + + other = None + warnings: list[str] = [] + try: + other = lookup_rsid(rsid, other_build) + except Exception as exc: # noqa: BLE001 + warnings.append(f"Other-build lookup failed: {type(exc).__name__}: {exc}") + + if is_hg19: + return { + "rsid": rsid, + "grch38": {"chr": other.chr if other else None, "pos": other.pos if other else None}, + "grch37": {"chr": chrom, "pos": pos}, + "ref": 
(other.ref if other and other.ref else ref), + "alts": (other.alts if other and other.alts else alts), + "warnings": warnings, + } + + return { + "rsid": rsid, + "grch38": {"chr": chrom, "pos": pos}, + "grch37": {"chr": other.chr if other else None, "pos": other.pos if other else None}, + "ref": ref, + "alts": alts, + "warnings": warnings, + } + + +def resolve_variant(input_type: str, input_value: str) -> dict[str, Any]: + warnings: list[str] = [] + + if input_type == "rsid": + rsid = input_value.strip() + if not rsid.startswith("rs"): + raise ValueError("rsid must start with 'rs'.") + + resolved = resolve_rsid_both_builds(rsid) + warnings.extend(resolved.get("warnings", [])) + + ref = resolved.get("ref") + alts = resolved.get("alts") or [] + alt = alts[0] if alts else None + + g37 = None + if resolved.get("grch37", {}).get("chr") and resolved.get("grch37", {}).get("pos"): + g37 = build_variant_record( + resolved["grch37"]["chr"], + resolved["grch37"]["pos"], + ref, + alt, + ) + + g38 = None + if resolved.get("grch38", {}).get("chr") and resolved.get("grch38", {}).get("pos"): + g38 = build_variant_record( + resolved["grch38"]["chr"], + resolved["grch38"]["pos"], + ref, + alt, + ) + + return { + "input": {"type": "rsid", "value": rsid}, + "rsid": rsid, + "grch37": g37, + "grch38": g38, + "warnings": warnings, + } + + if input_type not in {"grch37", "grch38"}: + raise ValueError(f"Unsupported input type: {input_type!r}") + + build = "GRCh37" if input_type == "grch37" else "GRCh38" + chrom, pos, ref_in, alt_in = parse_variant_string(input_value) + + resolved = resolve_position_both_builds(chrom, pos, build) + if not resolved: + raise VariantResolutionError( + "not_found", + f"No rsID found at {chrom}:{pos} on {build} via Ensembl overlap endpoint.", + ) + + warnings.extend(resolved.get("warnings", [])) + + ref = resolved.get("ref") + alts = resolved.get("alts") or [] + if ref and ref_in != ref: + warnings.append(f"Input ref {ref_in} != resolved ref {ref}; keeping 
resolved ref.") + + alt = alt_in if alt_in in alts else (alts[0] if alts else alt_in) + if alts and alt_in not in alts: + warnings.append(f"Input alt {alt_in} not among resolved alts {alts}; using {alt}.") + + rsid = resolved.get("rsid") + + g37 = None + if resolved.get("grch37", {}).get("chr") and resolved.get("grch37", {}).get("pos"): + g37 = build_variant_record( + resolved["grch37"]["chr"], + resolved["grch37"]["pos"], + ref, + alt, + ) + + g38 = None + if resolved.get("grch38", {}).get("chr") and resolved.get("grch38", {}).get("pos"): + g38 = build_variant_record( + resolved["grch38"]["chr"], + resolved["grch38"]["pos"], + ref, + alt, + ) + + return { + "input": {"type": input_type, "value": input_value}, + "rsid": rsid, + "grch37": g37, + "grch38": g38, + "warnings": warnings, + } + + +def resolve_query_variant( + *, + input_type: str, + input_value: str, + target_build: str, +) -> dict[str, Any]: + target_key = build_key_for(target_build) + if input_type == target_key: + chrom, pos, ref, alt = parse_variant_string(input_value) + target_variant = build_variant_record(chrom, pos, ref, alt) + return { + "input": {"type": input_type, "value": input_value}, + "query_variant": target_variant, + "rsid": None, + "grch37": target_variant if target_key == "grch37" else None, + "grch38": target_variant if target_key == "grch38" else None, + "warnings": [], + } + + resolved = resolve_variant(input_type, input_value) + target_variant = resolved.get(target_key) + if not isinstance(target_variant, dict) or not target_variant.get("canonical"): + raise VariantResolutionError( + "resolution_failed", + f"Could not resolve input variant to {target_build}.", + warnings=list(resolved.get("warnings") or []), + ) + + return { + "input": resolved["input"], + "query_variant": target_variant, + "rsid": resolved.get("rsid"), + "grch37": resolved.get("grch37"), + "grch38": resolved.get("grch38"), + "warnings": list(resolved.get("warnings") or []), + } diff --git 
a/plugins/life-science-research/skills/uniprot-skill/SKILL.md b/plugins/life-science-research/skills/uniprot-skill/SKILL.md new file mode 100644 index 0000000..24a5de1 --- /dev/null +++ b/plugins/life-science-research/skills/uniprot-skill/SKILL.md @@ -0,0 +1,41 @@ +--- +name: uniprot-skill +description: Submit compact UniProt REST API requests for UniProtKB, UniRef, UniParc, and FASTA stream endpoints. Use when a user wants concise UniProt summaries; save raw JSON or FASTA only on request. +--- + +## Operating rules +- Use `scripts/rest_request.py` for all UniProt API calls. +- Use `base_url=https://rest.uniprot.org`. +- The script accepts `max_items`; for search endpoints, start with API `size=10` and `max_items=10`. +- Single accession or cluster lookups usually do not need `max_items`. +- Re-run requests in long conversations instead of relying on older tool output. +- Treat displayed `...` in tool previews as UI truncation, not part of the real request. +- If the user asks for full JSON or FASTA, set `save_raw=true` and report the saved file path instead of pasting the payload into chat. + +## Execution behavior +- Return concise markdown summaries from the script JSON by default. +- Return the script JSON verbatim only if the user explicitly asks for machine-readable output. +- Prefer these paths: `uniprotkb/search`, `uniprotkb/`, `uniref/`, `uniparc/search`, and `uniprotkb/stream`. +- For `stream`, use `response_format=text` so the script returns only a short `text_head` unless raw output is requested. + +## Input +- Read one JSON object from stdin. 
+- Required fields: `base_url`, `path` +- Optional fields: `method`, `params`, `headers`, `json_body`, `form_body`, `record_path`, `response_format`, `max_items`, `max_depth`, `timeout_sec`, `save_raw`, `raw_output_path` +- Common UniProt patterns: + - `{"base_url":"https://rest.uniprot.org","path":"uniprotkb/search","params":{"query":"gene:TP53 AND organism_id:9606","fields":"accession,gene_names","size":10,"format":"json"},"record_path":"results","max_items":10}` + - `{"base_url":"https://rest.uniprot.org","path":"uniprotkb/P04637","params":{"format":"json"}}` + - `{"base_url":"https://rest.uniprot.org","path":"uniprotkb/stream","params":{"query":"organism_id:562","format":"fasta","size":2},"response_format":"text"}` + +## Output +- Success returns `ok`, `source`, `path`, `method`, `status_code`, `warnings`, and either compact `records`, a compact `summary`, or `text_head`. +- Use `raw_output_path` when `save_raw=true`. +- Failure returns `ok=false` with `error.code` and `error.message`. + +## Execution +```bash +echo '{"base_url":"https://rest.uniprot.org","path":"uniprotkb/search","params":{"query":"gene:TP53 AND organism_id:9606","fields":"accession,gene_names","size":10,"format":"json"},"record_path":"results","max_items":10}' | python scripts/rest_request.py +``` + +## References +- No additional runtime references are required; keep the import package limited to this file and `scripts/rest_request.py`. 
# `requests` is optional at import time so that a missing dependency is
# reported as a structured JSON error by `execute` instead of a traceback.
try:
    import requests
except ImportError as exc:  # pragma: no cover
    requests = None
    REQUESTS_IMPORT_ERROR = exc
else:
    REQUESTS_IMPORT_ERROR = None


def error(code: str, message: str, warnings: list[str] | None = None) -> dict[str, Any]:
    """Build the failure envelope: ``ok=False`` plus ``error`` and ``warnings``."""
    return {"ok": False, "error": {"code": code, "message": message}, "warnings": warnings or []}


def _require_object(name: str, value: Any) -> dict[str, Any]:
    """Return *value* as a dict (``{}`` for None); raise ValueError otherwise."""
    if value is None:
        return {}
    if not isinstance(value, dict):
        raise ValueError(f"`{name}` must be an object.")
    return value


def _require_bool(name: str, value: Any, default: bool) -> bool:
    """Return *value* if it is a bool, *default* for None; raise ValueError otherwise."""
    if value is None:
        return default
    if not isinstance(value, bool):
        raise ValueError(f"`{name}` must be a boolean.")
    return value


def _require_int(name: str, value: Any, default: int) -> int:
    """Return *value* if it is a positive int, *default* for None.

    Raises:
        ValueError: for non-integers, booleans, zero and negative numbers.
    """
    if value is None:
        return default
    # bool is a subclass of int, so JSON `true` would otherwise pass as 1.
    if isinstance(value, bool) or not isinstance(value, int) or value <= 0:
        raise ValueError(f"`{name}` must be a positive integer.")
    return value


def _require_str(name: str, value: Any, required: bool = False) -> str | None:
    """Return the stripped string, or None when absent and not *required*.

    Raises:
        ValueError: when a required value is missing, or the value is not a
            non-blank string.
    """
    if value is None:
        if required:
            raise ValueError(f"`{name}` is required.")
        return None
    if not isinstance(value, str) or not value.strip():
        raise ValueError(f"`{name}` must be a non-empty string.")
    return value.strip()


def _service_name(base_url: str) -> str:
    """Derive a label from the host part of *base_url*, with dots turned into dashes."""
    host = base_url.split("://", 1)[-1].split("/", 1)[0]
    return host.replace(".", "-")


def _build_url(base_url: str, path: str) -> str:
    """Join *base_url* and *path* with one slash; an absolute *path* is returned as-is."""
    if path.startswith(("http://", "https://")):
        return path
    return base_url.rstrip("/") + "/" + path.lstrip("/")


def _get_by_path(value: Any, path: str) -> Any:
    """Walk *value* along a dot-separated *path* of dict keys and list indexes.

    Raises:
        ValueError: when a segment is not a list index where one is needed,
            is out of range, is a missing key, or is applied to a scalar.
    """
    current = value
    for part in path.split("."):
        if isinstance(current, list):
            if not part.isdigit():
                raise ValueError(f"`record_path` segment {part!r} must be a list index.")
            index = int(part)
            if index >= len(current):
                raise ValueError(f"`record_path` index {index} is out of range.")
            current = current[index]
        elif isinstance(current, dict):
            if part not in current:
                raise ValueError(f"`record_path` key {part!r} was not present in the response.")
            current = current[part]
        else:
            raise ValueError(f"`record_path` segment {part!r} could not be applied.")
    return current


def _infer_target(data: Any) -> tuple[str | None, Any]:
    """Guess where the record list lives when no ``record_path`` is given.

    Returns ``(path_label, target)``: ``"$"`` for a top-level list, the first
    list under ``_embedded``, then the first list under a known collection
    key; otherwise ``(None, data)``.
    """
    if isinstance(data, list):
        return "$", data
    if isinstance(data, dict):
        embedded = data.get("_embedded")
        if isinstance(embedded, dict):
            for key, value in embedded.items():
                if isinstance(value, list):
                    return f"_embedded.{key}", value
        for key in (
            "collection",
            "results",
            "structures",
            "activities",
            "molecules",
            "mechanisms",
            "records",
            "items",
        ):
            value = data.get(key)
            if isinstance(value, list):
                return key, value
    return None, data


def _compact(value: Any, max_items: int, max_depth: int) -> Any:
    """Shrink *value* for display.

    Strings longer than 240 characters are cut and suffixed with ``...``;
    containers below *max_depth* collapse to ``"..."``; lists and dicts keep
    at most *max_items* entries, with a marker describing what was dropped.
    """
    if isinstance(value, str):
        return value if len(value) <= 240 else value[:240] + "..."
    if max_depth <= 0:
        if isinstance(value, (dict, list)):
            return "..."
        return value
    if isinstance(value, list):
        compact_list = [_compact(item, max_items, max_depth - 1) for item in value[:max_items]]
        if len(value) > max_items:
            compact_list.append(f"... (+{len(value) - max_items} more)")
        return compact_list
    if isinstance(value, dict):
        compact_dict: dict[str, Any] = {}
        items = list(value.items())
        for key, item in items[:max_items]:
            compact_dict[str(key)] = _compact(item, max_items, max_depth - 1)
        if len(items) > max_items:
            compact_dict["_truncated_keys"] = len(items) - max_items
        return compact_dict
    return value


def _save_raw_output(
    raw_output: str, raw_output_path: str | None, base_url: str, suffix: str
) -> str:
    """Write *raw_output* as UTF-8 and return the path used.

    Without an explicit *raw_output_path* the file goes to
    ``/tmp/<service>-raw.<suffix>``. Parent directories are created.
    """
    path = Path(raw_output_path or f"/tmp/{_service_name(base_url)}-raw.{suffix}")
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(raw_output, encoding="utf-8")
    return str(path)


def parse_input(payload: Any) -> dict[str, Any]:
    """Validate the stdin payload and return the normalised request config.

    Defaults: ``method=GET``, ``response_format=auto``, ``max_items=5``,
    ``max_depth=3``, ``timeout_sec=30``, ``save_raw=False``.

    Raises:
        ValueError: for a non-object payload, a missing ``base_url``/``path``,
            an unsupported method or response format, both ``json_body`` and
            ``form_body`` supplied, or any field of the wrong type.
    """
    if not isinstance(payload, dict):
        raise ValueError("Input must be one JSON object.")
    base_url = _require_str("base_url", payload.get("base_url"), required=True)
    path = _require_str("path", payload.get("path"), required=True)
    method = (_require_str("method", payload.get("method")) or "GET").upper()
    if method not in {"GET", "POST"}:
        raise ValueError("`method` must be GET or POST.")
    json_body = payload.get("json_body")
    form_body = payload.get("form_body")
    if json_body is not None and form_body is not None:
        raise ValueError("Provide only one of `json_body` or `form_body`.")
    response_format = (
        _require_str("response_format", payload.get("response_format")) or "auto"
    ).lower()
    if response_format not in {"auto", "json", "text"}:
        raise ValueError("`response_format` must be auto, json, or text.")
    return {
        "base_url": base_url,
        "path": path,
        "method": method,
        "params": _require_object("params", payload.get("params")),
        "headers": _require_object("headers", payload.get("headers")),
        "json_body": json_body,
        "form_body": _require_object("form_body", form_body) if form_body is not None else None,
        "record_path": _require_str("record_path", payload.get("record_path")),
        "response_format": response_format,
        "max_items": _require_int("max_items", payload.get("max_items"), 5),
        "max_depth": _require_int("max_depth", payload.get("max_depth"), 3),
        "timeout_sec": _require_int("timeout_sec", payload.get("timeout_sec"), 30),
        "save_raw": _require_bool("save_raw", payload.get("save_raw"), False),
        "raw_output_path": _require_str("raw_output_path", payload.get("raw_output_path")),
    }


def execute(payload: Any) -> dict[str, Any]:
    """Run one request described by *payload* and return the compact result.

    JSON responses yield ``records`` (for a list target) or ``summary``;
    other responses yield ``text_head`` (first 800 characters), which is
    withheld when the raw body was saved to disk. Network errors and
    response-side ValueErrors are returned as error envelopes; input
    validation errors from ``parse_input`` propagate as ValueError.
    """
    if requests is None:
        return error("missing_dependency", f"`requests` is required: {REQUESTS_IMPORT_ERROR}")
    config = parse_input(payload)
    session = requests.Session()
    session.headers.update(config["headers"])
    url = _build_url(config["base_url"], config["path"])

    request_kwargs: dict[str, Any] = {"params": config["params"], "timeout": config["timeout_sec"]}
    if config["json_body"] is not None:
        request_kwargs["json"] = config["json_body"]
    if config["form_body"] is not None:
        request_kwargs["data"] = config["form_body"]

    try:
        response = session.request(config["method"], url, **request_kwargs)
        response.raise_for_status()
        content_type = (response.headers.get("content-type") or "").lower()
        wants_json = config["response_format"] == "json"
        wants_text = config["response_format"] == "text"
        # In auto mode, trust the content type first, then sniff the body.
        auto_json = not wants_text and (
            "json" in content_type or response.text.lstrip().startswith(("{", "["))
        )

        if wants_json or auto_json:
            data = response.json()
            raw_output = json.dumps(data, indent=2)
            raw_output_path = None
            if config["save_raw"]:
                raw_output_path = _save_raw_output(
                    raw_output, config["raw_output_path"], config["base_url"], "json"
                )

            record_path = config["record_path"]
            path_used, target = (
                _infer_target(data)
                if record_path is None
                else (record_path, _get_by_path(data, record_path))
            )
            out = {
                "ok": True,
                "source": _service_name(config["base_url"]),
                "path": config["path"],
                "method": config["method"],
                "status_code": response.status_code,
                "record_path": path_used,
                "raw_output_path": raw_output_path,
                "warnings": [],
            }
            if isinstance(target, list):
                records = target[: config["max_items"]]
                out.update(
                    {
                        "record_count_returned": len(records),
                        "record_count_available": len(target),
                        "truncated": len(records) < len(target),
                        "records": _compact(records, config["max_items"], config["max_depth"]),
                    }
                )
            else:
                out["summary"] = _compact(target, config["max_items"], config["max_depth"])
                if isinstance(target, dict):
                    out["top_keys"] = list(target)[: config["max_items"]]
            return out

        raw_output_path = None
        if config["save_raw"]:
            raw_output_path = _save_raw_output(
                response.text, config["raw_output_path"], config["base_url"], "txt"
            )
        text_head = response.text[:800]
        return {
            "ok": True,
            "source": _service_name(config["base_url"]),
            "path": config["path"],
            "method": config["method"],
            "status_code": response.status_code,
            "content_type": content_type,
            "text_head": None if raw_output_path else text_head,
            "text_head_truncated": False
            if raw_output_path
            else len(text_head) < len(response.text),
            "raw_output_path": raw_output_path,
            "warnings": [],
        }
    except ValueError as exc:
        # Covers undecodable JSON bodies and `record_path` lookups that fail.
        return error("invalid_response", str(exc))
    except requests.RequestException as exc:
        return error("network_error", f"Request failed: {exc}")
    finally:
        session.close()


def main() -> int:
    """Read one JSON object from stdin, run it, and print the JSON result.

    Exit codes: 0 on success, 1 when the request failed, 2 for unparsable
    or invalid input.
    """
    try:
        payload = json.load(sys.stdin)
    except Exception as exc:  # noqa: BLE001
        sys.stdout.write(json.dumps(error("invalid_json", f"Could not parse JSON input: {exc}")))
        return 2
    try:
        output = execute(payload)
    except ValueError as exc:
        output = error("invalid_input", str(exc))
        code = 2
    else:
        code = 0 if output.get("ok") else 1
    sys.stdout.write(json.dumps(output))
    return code


if __name__ == "__main__":
    raise SystemExit(main())
diff --git a/plugins/ngs-analysis/.codex-plugin/plugin.json b/plugins/ngs-analysis/.codex-plugin/plugin.json new file mode 100644 index 0000000..c84e07d --- /dev/null +++ b/plugins/ngs-analysis/.codex-plugin/plugin.json @@ -0,0 +1,53 @@ +{ + "name": "ngs-analysis", + "version": "1.0.1", + "description": "Guided NGS intake, local execution, and public-pipeline routing for BCL, FASTQ, DNA variant, RNA-seq, single-cell, epigenomics, amplicon, and metagenomics analyses, with deeper decision skills for high-risk assay branches.", + "author": { + "name": "OpenAI", + "email": "support@openai.com", + "url": "https://openai.com/" + }, + "homepage": "https://openai.com/", + "repository": "https://github.com/openai/openai", + "license": "MIT", + "keywords": [ + "ngs", + "sequencing", + "bioinformatics", + "fastq", + "bcl", + "rnaseq", + "scrnaseq", + "variant-calling", + "atacseq", + "chipseq", + "microbiome", + "metagenomics", + "pipeline-routing", + "nextflow", + "nf-core" + ], + "skills": "./skills/", + "interface": { + "displayName": "Life Sciences NGS Analysis", + "shortDescription": "Guided NGS routing and local execution for sequencing analysis", + "longDescription": "A guided intake, routing, and execution plugin for next-generation sequencing workflows. It helps Codex inspect local sequencing inputs, ask only the missing assay-specific questions, choose public or freely accessible runtime-installable packages where possible, check existing tool availability before any install, and execute supported local workflows with validation, logs, manifests, QC reports, and artifact indexes. 
It includes deeper decision skills for BCL demultiplexing, FASTQ QC execution and interpretation, germline, somatic and UMI-panel DNA variants, bulk RNA-seq count generation and differential expression, ATAC-seq, ChIP-seq/CUT&RUN/CUT&Tag, and embedded post-count scRNA-seq QC.", + "developerName": "OpenAI", + "category": "Research", + "capabilities": [ + "Interactive", + "Read", + "Write" + ], + "websiteURL": "https://openai.com/", + "privacyPolicyURL": "https://openai.com/policies/privacy-policy/", + "termsOfServiceURL": "https://openai.com/policies/terms-of-use/", + "defaultPrompt": [ + "Guide me through the minimum required NGS analysis questions, inspect available BCL/FASTQ files or count matrices, choose the right public pipeline or deeper assay-specific skill, check whether required tools already exist, and execute supported local workflows with pre-execution validation and artifact capture." + ], + "brandColor": "#0F766E", + "composerIcon": "./assets/app-icon.png", + "logo": "./assets/app-icon.png", + "screenshots": [] + } +} diff --git a/plugins/ngs-analysis/README.md b/plugins/ngs-analysis/README.md new file mode 100644 index 0000000..19c6aab --- /dev/null +++ b/plugins/ngs-analysis/README.md @@ -0,0 +1,458 @@ +# Life Sciences NGS Analysis Plugin + +This plugin provides a guided intake and execution layer for common next-generation sequencing analyses. It routes users from BCL or FASTQ files to public, reproducible pipelines while checking local tool availability before installing anything. + +## What It Does + +- Inspects sequencing inputs before asking questions. +- Asks the minimum assay-specific questions needed to choose an analysis route. +- Prefers public, runtime-installable tools and nf-core workflows where practical. +- Runs tool preflight checks before suggesting downloads or installs. +- Keeps proprietary, credentialed, or cloud-upload paths explicit instead of silently using them. 
+- Treats preflight as validation before executing approved local workflows where supported. +- Produces timestamped run directories with manifests, validation summaries, logs, QC reports, exact command timing/return-code detail, checksummed artifact indexes, and input-to-output lineage tables. +- Produces native visualization bundles under `visualizations/` when a lane has enough downstream data to plot. + +## Included Skills + +- `ngs-analysis-router`: top-level intake and routing. +- `ngs-runtime-env`: package/tool existence checks and install planning. +- `ngs-bcl-to-fastq`: BCL run-folder validation, demultiplexing, and demux metric review. +- `ngs-fastq-qc`: FASTQ quality control, trimming decisions, and MultiQC interpretation. +- `ngs-dna-variant-calling`: WGS/WES/panel variant dispatcher. +- `ngs-dna-germline-variants`: germline WGS/WES/panel variant calling and QC. +- `ngs-dna-somatic-variants`: tumor-normal and tumor-only somatic variant calling and QC. +- `ngs-dna-umi-panel-variants`: UMI, duplex, and low-frequency targeted panel workflows. +- `ngs-bulk-rnaseq`: bulk RNA-seq dispatcher. +- `ngs-bulk-rnaseq-counts-qc`: bulk RNA-seq FASTQ-to-count processing and QC. +- `ngs-bulk-rnaseq-differential-expression`: bulk RNA-seq count-matrix differential expression. +- `ngs-scrna-seq`: single-cell or single-nucleus RNA-seq FASTQ-to-count kickoff. +- `scrna-seq-qc`: embedded post-count single-cell QC, annotation, clustering, and UMAP guidance. +- `ngs-epigenomics-peaks`: ATAC-seq, ChIP-seq, CUT&RUN, and CUT&Tag dispatcher. +- `ngs-atacseq-peaks-qc`: ATAC-seq QC, peak, consensus, and differential accessibility workflows. +- `ngs-chip-cutrun-peaks-qc`: ChIP-seq, CUT&RUN, and CUT&Tag QC, control, peak, and differential binding workflows. +- `ngs-amplicon-microbiome`: 16S/18S/ITS/COI amplicon analysis kickoff. +- `ngs-shotgun-metagenomics`: shotgun metagenomics taxonomic and functional profiling kickoff. 
+ +## Capability Status + +This package is intentionally mixed maturity. Use the status below when deciding what to run versus what to treat as planning guidance. + +Local execution lanes: + +- `ngs-fastq-qc`: plugin-owned local runner for FASTQ validation, FastQC/MultiQC execution, optional trimming, logs, summaries, and artifact indexes. +- `ngs-bulk-rnaseq-counts-qc`: plugin-owned local runner for bulk RNA-seq FASTQ validation, FastQC/MultiQC, Salmon transcript quantification, TPM/NumReads/effective-length matrices, logs, summaries, and artifact indexes. +- `ngs-bulk-rnaseq-differential-expression`: plugin-owned local runner for count-matrix validation, contrast/replicate checks, automatic DESeq2/edgeR/limma method selection, QC plots, normalized matrices, result tables, logs, summaries, and artifact indexes. +- `ngs-scrna-seq`: plugin-owned local FASTQ-to-count runner for STARsolo-backed scRNA/snRNA count generation. +- `scrna-seq-qc`: post-count QC and annotation guidance, plus a matrix-level runner for 10x-style matrices. The runner uses conservative PBMC marker fallback when no matched reference is provided, so tissue-specific annotation should be reviewed or replaced before broader use. +- `ngs-dna-variant-calling`: plugin-owned BAM/CRAM-to-VCF execution package using samtools/bcftools for focused local checks, with nf-core/sarek still preferred for full WGS/WES/panel workflows. +- `ngs-dna-germline-variants`: plugin-owned higher-fidelity germline runner for BQSR, per-sample gVCFs, and optional joint genotyping when a local GATK toolchain and matched known-sites resources are available. +- `ngs-epigenomics-peaks`: plugin-owned FASTQ validation/QC execution package for ATAC-seq, ChIP-seq, CUT&RUN, and CUT&Tag intake, with readiness artifacts for the alignment and peak-calling stage. +- `ngs-amplicon-microbiome`: plugin-owned FASTQ validation/QC execution package for marker-gene amplicon inputs, with explicit primer/taxonomy backend readiness artifacts. 
+- `ngs-shotgun-metagenomics`: plugin-owned FASTQ validation/QC execution package for shotgun metagenomics inputs, with explicit database-gated taxonomic profiling status. +- `ngs-bcl-to-fastq`: plugin-owned BCL run-folder and sample-sheet validator that executes BCL Convert or legacy bcl2fastq when an installed converter is available. + +Dispatch lanes: + +- `ngs-bulk-rnaseq`: routes users to the counts/QC runner when starting from FASTQs, or to the differential-expression runner when starting from an expression matrix. + +Dispatch and subtype lanes: + +- `ngs-dna-germline-variants` +- `ngs-dna-somatic-variants` +- `ngs-dna-umi-panel-variants` +- `ngs-atacseq-peaks-qc` +- `ngs-chip-cutrun-peaks-qc` + +These lanes route to the shared DNA or epigenomics execution packages when a compact local run is appropriate, and remain responsible for assay-specific guidance, metadata checks, controls, and full-workflow handoff. + +## Runtime Preflight + +From the repo root: + +```bash +python plugins/ngs-analysis/scripts/ngs_preflight.py --list +python plugins/ngs-analysis/scripts/ngs_preflight.py --pipeline bulk_rnaseq --emit-install-plan +python plugins/ngs-analysis/scripts/ngs_preflight.py --pipeline bulk_rnaseq_counts_qc --emit-install-plan +python plugins/ngs-analysis/scripts/ngs_preflight.py --pipeline bulk_rnaseq_differential_expression --emit-install-plan +python plugins/ngs-analysis/scripts/ngs_preflight.py --profile local_light --emit-install-plan +python plugins/ngs-analysis/scripts/ngs_preflight.py --tool fastqc --network-checks +python plugins/ngs-analysis/scripts/ngs_preflight.py --pipeline shotgun_metagenomics --manager micromamba --install-plan-outdir runtime_readiness/shotgun_install +``` + +The script checks local executables first with `PATH` lookup. Optional network checks query package indexes or container registries only when requested. It does not install packages unless `--install-missing --yes` is explicitly provided. 
+ +Use `--install-plan-outdir` for a permission-ready package handoff. It writes `install_plan.json` as the canonical review artifact and a guarded `install_commands.sh` companion generated from the same plan. The shell script exits in review-only mode unless `NGS_RUN_INSTALL_COMMANDS=1` is set after explicit user approval. Reference and database downloads remain separate and should be handled through `ngs_reference_manager.py setup-plan`. + +## Pipeline Dependency Matrix + +Use this matrix as a quick lane-to-package guide before running a workflow. The structured source of truth remains `references/pipeline-registry.json`; use `ngs_preflight.py --pipeline <pipeline> --emit-install-plan` to check the current machine and generate a reviewable install plan. + +| Pipeline lane | Required or primary tools | Useful optional tools | Reference/database requirements | +| --- | --- | --- | --- | +| BCL-to-FASTQ | `bcl-convert` | `bcl2fastq` for legacy compatibility | Illumina run folder with `RunInfo.xml`, optional `RunParameters.xml`, `Data/Intensities/BaseCalls`, and a compatible sample sheet | +| FASTQ QC and trimming | `snakemake`, `fastqc`, `multiqc` | `fastp`, `cutadapt`, `seqkit` | FASTQ files and sample sheet or explicit sample/R1/R2 paths | +| Bulk RNA-seq counts/QC | `snakemake`, `fastqc`, `multiqc`, `salmon` | `star`, `subread`, `seqkit` | Transcriptome FASTA, genome FASTA, GTF, and optional registered genome bundle | +| Bulk RNA-seq differential expression | `Rscript` plus available DE backend packages | `DESeq2`, `edgeR`, `limma`, `marimo` review surface | Count/expression matrix, sample metadata, contrast table, and design context | +| scRNA FASTQ-to-count | `snakemake`, `STAR` or configured STAR container | `kb-python`, `cellranger` when explicitly requested and licensed | Genome FASTA, GTF, chemistry/whitelist inputs, and optional registered genome bundle | +| scRNA post-count QC | Python/R analysis environment with `scanpy` and runner dependencies | Bioconductor QC 
packages, `marimo` review surface | 10x-style matrix bundle, barcodes/features, metadata, and optional raw droplet matrix | +| Generic DNA variant calling | `samtools`, `bcftools` | `bwa-mem2`, `gatk`, `deepvariant` container | Reference FASTA, indexes, optional target/region, optional annotation VCF | +| Germline DNA variants | `gatk`, `samtools` | `bcftools`, `bwa-mem2`, `deepvariant` container | Matched reference FASTA, known-sites resources, optional target BED, optional cohort/joint-calling resources | +| Somatic DNA variants | `gatk`, `samtools`, `bcftools` | panel-of-normals resources | Tumor-normal or tumor-only sample sheet, reference FASTA, germline resource, optional panel of normals and target BED | +| UMI panel variants | `samtools`, `bcftools`, `fgbio` when consensus generation is needed | `bwa-mem2`, `gatk`, duplex-review helpers | Reference FASTA, target BED, UMI read structure/tags, consensus or raw UMI inputs, optional hotspot/review VCF | +| ATAC-seq peaks/QC | `bowtie2`, `samtools`, `bedtools`, `macs2`, `bamCoverage` | `computeMatrix`, `plotProfile`, `plotHeatmap`, `homer`, `multiqc` | Bowtie2 index, genome size, blacklist BED, optional TSS BED, optional registered genome bundle | +| ChIP/CUT&RUN/CUT&Tag peaks/QC | `bowtie2`, `samtools`, `bedtools`, `macs2`, `bamCoverage` | `homer`, `multiqc` | Bowtie2 index, genome size, IP/control metadata, optional blacklist/TSS annotations and registered genome bundle | +| Amplicon microbiome | `qiime2` or `dada2` | `cutadapt`, `seqkit`, `multiqc` | Primer/marker metadata, taxonomy classifier or database, sample metadata, optional ASV/taxonomy tables for review | +| Shotgun metagenomics | `kraken2`, `bracken` by default | `kneaddata`, `humann`, `metaphlan`, `seqkit`, `multiqc` | Kraken2 database, optional host-depletion reference, optional Bracken and HUMAnN databases | +| nf-core adapter | `nextflow` | `multiqc`, Docker/Singularity/Apptainer or site profile | Pipeline-specific reference/database bundle and 
any required container/runtime profile | + +Package checks and reference/database checks are intentionally separate. Missing executables should produce an install plan; missing references or databases should produce resource readiness and setup-plan artifacts before any large download is attempted. + +## FASTQ QC Local Execution + +The FASTQ QC lane accepts a sample sheet or a single sample, validates FASTQ structure and pairing, runs FastQC/MultiQC through a local Snakemake workflow, and writes a standardized run envelope. By default, outputs are written under `ngs_runs/fastq_qc/` in the current working directory; pass `--outdir` to choose a different run directory. + +```bash +python plugins/ngs-analysis/scripts/run_fastq_qc.py \ + --sample-sheet samplesheet.csv \ + --execute +``` + +Single paired sample: + +```bash +python plugins/ngs-analysis/scripts/run_fastq_qc.py \ + --sample sampleA \ + --r1 sampleA_R1.fastq.gz \ + --r2 sampleA_R2.fastq.gz \ + --execute +``` + +Optional trimming and re-QC: + +```bash +python plugins/ngs-analysis/scripts/run_fastq_qc.py \ + --sample-sheet samplesheet.csv \ + --trim-mode fastp \ + --execute +``` + +Each successful run writes `run_manifest.json`, `config.json`, `validation/`, `workflow/Snakefile`, `logs/`, `artifact_index.json`, `summary.md`, FastQC/MultiQC outputs, browser helpers plus `visualizations/localhost_launch_hint.txt` for the preferred localhost review path, and `qc_interpretation.json`. + +## Bulk RNA-seq Counts/QC Local Execution + +The bulk RNA-seq counts/QC lane accepts an nf-core-style sample sheet, validates FASTQ paths and read structure, runs FastQC/MultiQC plus Salmon transcript quantification through a plugin-owned Snakemake workflow, and writes the standard run envelope. 
+ +```bash +python plugins/ngs-analysis/scripts/run_bulk_rnaseq_counts_qc.py \ + --sample-sheet samplesheet.csv \ + --fastq-root path/to/fastqs \ + --transcriptome-fasta reference/transcriptome.fasta \ + --genome-fasta reference/genome.fa \ + --annotation-gtf reference/genes.gtf \ + --execute +``` + +Each successful run writes `run_manifest.json`, `config.json`, `validation/`, `workflow/Snakefile`, `logs/`, `versions/software_versions.json`, `artifact_index.json` with per-file SHA256 checksums, `summary.md`, a review bundle under `visualizations/`, browser helpers plus `visualizations/localhost_launch_hint.txt` for the preferred localhost review path, Salmon `quant.sf` files, and `rnaseq_salmon/matrices/{tpm,num_reads,effective_length,samples}.tsv`. + +The counts/QC runner also writes a run-local resource-readiness bundle under `resources/`. It is advisory by default for explicitly supplied local references; use `--genome-build`, `--bundle-root <bundle>=<path>`, and `--require-resource-plan` when a registered genome bundle must be complete before the run can be marked ready. + +## Bulk RNA-seq Differential Expression Local Execution + +The differential-expression lane accepts a count or expression matrix, sample metadata, and a contrast table. It validates sample matching, replicate sufficiency, matrix scale, and R/Bioconductor package availability. With `--method auto`, integer-like `raw_counts` prefer DESeq2 when available, then edgeR; non-integer inputs route to `limma_log2`. Use `--input-mode` to declare `raw_counts`, `normalized_expression`, or `log_expression`; `auto` infers the mode and emits a warning when normalization is skipped because the matrix is already transformed. 
+ +```bash +python plugins/ngs-analysis/scripts/run_bulk_rnaseq_de.py \ + --count-matrix count_matrix.tsv \ + --sample-metadata sample_metadata.tsv \ + --contrasts contrasts.tsv \ + --input-mode auto \ + --execute +``` + +Each successful run writes `run_manifest.json`, `config.json`, `validation/`, `workflow/scripts/run_bulk_de.R`, `logs/`, `manifest/contrast_status.tsv`, input-mode-aware matrix artifacts, `qc/design_matrix.tsv`, `qc/design_diagnostics.tsv`, `qc/sample_outlier_metrics.tsv`, `qc/statistical_warnings.tsv`, `qc/mean_variance_trend.png`, per-contrast result tables, explicit `.not_tested.tsv` stubs for blocked contrasts, clearer limma volcano/MA plots when applicable, a review bundle under `visualizations/`, `notebooks/bulk_rnaseq_de_review.marimo.py`, an auto-launched localhost Marimo review app recorded in `notebooks/marimo_server.json`, `versions/`, checksummed `artifact_index.json`, and `summary.md`. + +## scRNA FASTQ-to-count Local Execution + +The plugin-owned scRNA execution lane accepts local FASTQs, validates barcode-versus-cDNA pairing, runs STARsolo through a dedicated Snakemake workflow, and writes a standardized run envelope. By default, outputs are written under `ngs_runs/scrnaseq_fastq_to_count/` in the current working directory; pass `--outdir` to choose a different run directory. + +```bash +python plugins/ngs-analysis/scripts/run_scrnaseq_fastq_to_count.py \ + --sample-sheet samplesheet.csv \ + --genome-fasta reference/genome.fa \ + --annotation-gtf reference/genes.gtf \ + --cb-whitelist reference/whitelist.txt \ + --execute +``` + +Each successful run writes `run_manifest.json`, `manifest/lineage.tsv`, `manifest/working_samplesheet.csv`, `manifest/inputs_manifest.tsv`, `config.json`, `validation/`, `workflow/Snakefile`, `logs/`, `versions/software_versions.json`, `artifact_index.json`, `summary.md`, and STARsolo count artifacts. 
The run manifest records pinned STAR image metadata, chemistry-detection evidence, and explicit STARsolo cell-calling filter settings. + +The FASTQ-to-count runner also writes advisory `resources/resource_plan.json`, `resource_manifest.tsv`, `resource_env.sh`, `resource_readiness.md`, and resource setup-plan artifacts by default. Add `--require-resource-plan` with a registered `--genome-build` and `--bundle-root` when genome bundle completeness should block readiness. + +## scRNA Post-count Execution + +For 10x-style matrix bundles, the package includes a post-count QC runner: + +```bash +python plugins/ngs-analysis/scripts/run_scrnaseq_post_count_qc.py \ + --input-dir path/to/scrna_bundle +``` + +The input directory should contain `matrix/`, `manifest.tsv`, and `dataset_metadata.json`, unless explicit paths are provided. An optional `--raw-matrix-dir` enables emptyDrops-style cell-calling checks when a raw droplet matrix is available. This runner emits a standard envelope with `run_manifest.json`, `manifest/lineage.tsv`, `validation/tool_preflight.json`, `versions/software_versions.json`, checksummed `artifact_index.json`, `summary.md`, `provenance/analysis_status.json`, and `visualizations/index.html`. It also auto-launches a localhost Marimo review app recorded in `notebooks/marimo_server.json` and emits `notebooks/scrna_qc_review.marimo.py` as a notebook backup over the portable PNG/CSV/H5AD outputs. Tissue-specific annotation and integration choices should still be reviewed against the dataset and reference context. + +## DNA Variant Calling Execution + +The DNA variant-calling execution package accepts a BAM/CRAM sample sheet plus a matching reference FASTA, validates alignment/reference inputs, runs samtools QC, calls variants with bcftools, and writes the standard run envelope. Use nf-core/sarek or a lab-validated workflow for full germline, somatic, trio, or panel analysis. 
+ +```bash +python plugins/ngs-analysis/scripts/run_dna_variant_calling.py \ + --sample-sheet dna_samples.tsv \ + --reference-fasta reference.fa \ + --region chr20:1-100000 \ + --filter-min-qual 30 \ + --filter-min-site-dp 10 \ + --execute +``` + +Add a small known-variant annotation layer by passing a bgzip/tabix-indexed resource VCF: + +```bash +python plugins/ngs-analysis/scripts/run_dna_variant_calling.py \ + --sample-sheet dna_samples.tsv \ + --reference-fasta reference.fa \ + --annotation-vcf gnomad_small.vcf.gz \ + --execute +``` + +Each successful run writes `run_manifest.json`, `validation/`, `logs/`, `qc/*.flagstat.txt`, `qc/*.idxstats.tsv`, `qc/*.coverage.tsv`, `qc/*.depth.tsv` when a region is provided, `qc/*.callability.json`, `qc/*.variant_summary.json`, `variants/*.vcf.gz`, optional `variants/*.annotated.vcf.gz`, optional `variants/*.filtered.vcf.gz`, `variants/*.bcftools_stats.txt`, `artifact_index.json`, and `summary.md`. + +The generic BAM/CRAM-to-VCF runner now emits advisory `resources/` readiness artifacts for the selected genome bundle. Use `--require-resource-plan` when missing registered references should block readiness; otherwise the explicit `--reference-fasta` remains enough for focused local checks. + +## Germline DNA Variant Calling Execution + +For germline-specific local runs that should own BQSR and cohort assumptions, use the dedicated runner: + +```bash +python plugins/ngs-analysis/scripts/run_dna_germline_variants.py \ + --sample-sheet dna_samples.tsv \ + --reference-fasta reference.fa \ + --known-sites dbsnp.vcf.gz \ + --known-sites mills.vcf.gz \ + --emit-gvcf \ + --joint-call \ + --execute +``` + +The runner validates resource completeness and writes a standard run envelope even when execution is blocked by missing GATK or mismatched resource bundles. 
Successful runs emit per-sample recalibration tables and BAMs, `gvcf/*.g.vcf.gz`, optional `joint/cohort.joint.vcf.gz`, `qc/*.flagstat.txt`, `qc/*.idxstats.tsv`, `artifact_index.json`, and `summary.md`. + +The germline runner also writes the same advisory `resources/` bundle by default, with `--require-resource-plan` available for runs that must prove a complete registered reference and known-sites bundle. + +## Somatic And UMI DNA Variant Execution + +Somatic and UMI panel lanes now have dedicated runners instead of relying on the generic BAM-to-VCF path. + +```bash +python plugins/ngs-analysis/scripts/run_dna_somatic_variants.py \ + --sample-sheet somatic_pairs.tsv \ + --reference-fasta reference.fa \ + --germline-resource af-only-gnomad.vcf.gz \ + --panel-of-normals pon.vcf.gz \ + --execute +``` + +The somatic runner validates tumor-normal/tumor-only pairing, writes `workflow/somatic_command_plan.json`, emits Mutect2/contamination/filtering command plans, records tumor-only and missing-resource caveats, and produces filtered VCF artifacts when GATK resources are available. It also writes `qc/somatic_pair_review.{tsv,json}` so each pair has an explicit matched-normal status, PON/germline-resource status, contamination-table status, filtered-VCF status, and parsed variant-count summary when backend artifacts exist. + +```bash +python plugins/ngs-analysis/scripts/run_dna_umi_panel_variants.py \ + --sample-sheet umi_panel_samples.tsv \ + --reference-fasta reference.fa \ + --target-bed panel_targets.bed \ + --umi-mode duplex \ + --umi-tag RX \ + --execute +``` + +The UMI runner validates raw versus consensus BAM state, writes `workflow/umi_panel_command_plan.json`, emits fgbio consensus and consensus-BAM variant-calling commands, and records `qc/umi_postrun_summary.{tsv,json}` with consensus read counts, target coverage, variant stats, and family-size/duplex metrics when those backend artifacts exist. 
It also writes `qc/umi_molecular_evidence_contract.{tsv,json}` to make the low-AF review contract explicit: consensus BAM, family-size/molecule metrics, consensus VCF, variant stats, hotspot review, and duplex review readiness stay visible per sample. + +Somatic and UMI direct runners now write advisory resource-readiness bundles by default under `resources/`. Use `--genome-build`, `--bundle-root grch38_core=/refs/GRCh38`, and `--require-resource-plan` when the run should be blocked unless the registered reference bundle is complete; leave the default advisory mode for custom or reduced references where the explicit FASTA/BED inputs are enough for a local check. + +## FASTQ Assay Execution + +The epigenomics, amplicon microbiome, and shotgun metagenomics execution packages share a FASTQ validation/QC runner. It resolves sample-sheet paths, validates read structure, runs seqkit stats and FastQC/MultiQC when available, then writes lane-specific readiness/status artifacts for the next workflow stage. + +```bash +python plugins/ngs-analysis/scripts/run_fastq_assay_package.py \ + --lane epigenomics_peaks \ + --sample-sheet assay_samples.csv \ + --execute +``` + +Supported lanes are `epigenomics_peaks`, `amplicon_microbiome`, and `shotgun_metagenomics`. The shotgun lane can run Kraken2 only when both `kraken2` and a database path are provided; otherwise it records the database/tool blocker explicitly. + +Each run also writes `visualizations/index.html` and `visualizations/visualization_manifest.json`. Successful FASTQ-assay executions emit `qc_verdict.json`; shotgun and amplicon runs also emit `qc_interpretation.json` with machine-readable reason codes, readiness verdicts, and concrete follow-on commands for backend generation plus plot re-rendering. The common `run_manifest.json` includes audit metadata such as plugin version, exact argv, environment snapshot, input-file checksums, a parameter hash, and `manifest/lineage.tsv`. 
With FASTQ-only inputs, the visual bundle points to read-level QC and the lane readiness artifact. When downstream tables are available, the runner can also create native plot/table bundles: + +```bash +python plugins/ngs-analysis/scripts/run_fastq_assay_package.py \ + --lane amplicon_microbiome \ + --sample-sheet amplicon_samples.tsv \ + --asv-table asv_table.tsv \ + --taxonomy-table taxonomy.tsv \ + --execute +``` + +Amplicon visualizations include alpha diversity tables/plots, Bray-Curtis PCoA, rarefaction curves, and taxa barplots. Shotgun visualizations can be generated from Kraken reports, Bracken tables, and HUMAnN path/gene-family tables: + +```bash +python plugins/ngs-analysis/scripts/run_fastq_assay_package.py \ + --lane shotgun_metagenomics \ + --sample-sheet shotgun_samples.csv \ + --kraken-report sample.report.txt \ + --bracken-table sample.bracken.tsv \ + --humann-pathabundance humann_pathabundance.tsv \ + --humann-genefamilies humann_genefamilies.tsv \ + --execute +``` + +For amplicon lanes, the runner also emits `methods/amplicon_methods.json` and a concrete backend handoff bundle under `workflow/amplicon_backend_*.{json,sh}`. If downstream ASV/taxonomy inputs are labeled synthetic or introduce sample columns that are not present in the real sample sheet, the run is marked review-only and beta-diversity/PCoA are blocked unless `--allow-synthetic-diversity` is passed explicitly. 
+ +## Assay Backend Execution + +Dedicated backend runners expand beyond read-QC/readiness packages when the required tools and references are present: + +```bash +python plugins/ngs-analysis/scripts/run_atacseq_peaks_qc.py \ + --sample-sheet atac_samples.csv \ + --bowtie2-index /refs/GRCh38/bowtie2/genome \ + --genome-size hs \ + --blacklist-bed /refs/GRCh38/blacklists/encode_blacklist.bed \ + --tss-bed /refs/GRCh38/tss.bed \ + --execute +``` + +```bash +python plugins/ngs-analysis/scripts/run_chip_cutrun_peaks_qc.py \ + --sample-sheet chip_samples.csv \ + --assay chipseq \ + --target-class tf \ + --peak-mode narrow \ + --bowtie2-index /refs/GRCh38/bowtie2/genome \ + --genome-size hs \ + --execute +``` + +These runners produce command plans and envelopes for alignment, filtering, MACS2 peaks, FRiP, consensus peaks, bigWig tracks, ATAC TSS matrices, and motif handoff artifacts. + +Epigenomics backend runs also write normalized review outputs: `qc/atacseq_qc_summary.{tsv,json}` or `qc/chip_cutrun_qc_summary.{tsv,json}`, native dashboards under `qc/*_dashboard.html`, compact SVG plots for FRiP/peak counts and insert-size distributions, `tracks/browser_tracks.tsv`, `tracks/browser_track_preview.html`, `tracks/ucsc_track_lines.txt`, `tracks/igv_session.xml`, and `motifs/motif_summary.tsv`. ATAC runs generate TSS profile/heatmap commands when `--tss-bed` is supplied; ATAC and ChIP/CUT&RUN runs can add HOMER motif commands with `--run-motifs --motif-genome <genome>`. + +ATAC and ChIP/CUT&RUN direct runners also write advisory resource-readiness bundles by default. Use `--require-resource-plan` with an explicit `--genome-build` and `--bundle-root` when reference completeness should block execution readiness. 
+ +```bash +python plugins/ngs-analysis/scripts/run_amplicon_microbiome.py \ + --sample-sheet amplicon_samples.tsv \ + --backend qiime2 \ + --primer-forward GTGYCAGCMGCCGCGGTAA \ + --primer-reverse GGACTACNVGGGTWTCTAAT \ + --taxonomy-classifier silva-138-classifier.qza \ + --metadata sample_metadata.tsv \ + --execute +``` + +```bash +python plugins/ngs-analysis/scripts/run_shotgun_metagenomics.py \ + --sample-sheet shotgun_samples.csv \ + --kraken-db /db/kraken2/standard \ + --host-reference /refs/human_kneaddata_db \ + --run-bracken \ + --run-humann \ + --humann-db /db/humann \ + --execute +``` + +Use `--backend dada2` when the user wants the direct R/Bioconductor path. The plugin now ships `workflows/amplicon_microbiome/run_dada2_backend.R`, checks the `dada2` R package before execution, runs ASV inference when the package is available, and writes `tables/asv_table.tsv`, `tables/representative_sequences.fasta`, `tables/read_retention.tsv`, optional `tables/taxonomy.tsv`, and `dada2/dada2_backend_state.rds`. + +The amplicon runner keeps database/tool blockers explicit, exports QIIME2 denoising stats, normalizes QIIME2 exports, and preserves native DADA2 outputs in the same review contract. BIOM-only exports are reported with a concrete `biom convert` command rather than silently treated as parsed tables. When a normalized ASV/feature table is present, the runner now derives `tables/alpha_diversity.tsv`, `tables/bray_curtis_distance.tsv`, `tables/top_taxa_or_features.tsv`, `tables/amplicon_diversity_summary.json`, and native SVG/HTML review artifacts under `visualizations/`. + +The amplicon direct runner writes advisory taxonomy-database readiness bundles by default. Use `--bundle-root silva_138_amplicon=/db/silva`, `--include-optional-resources`, and `--require-resource-plan` when database completeness should block readiness. 
+ +The shotgun runner keeps database/tool blockers explicit, runs KneadData host depletion when `--host-reference` is supplied, routes Kraken2 and HUMAnN over the cleaned reads, and normalizes backend outputs into `tables/bracken_est_reads_matrix.tsv`, `tables/bracken_relative_abundance_matrix.tsv`, `tables/humann_pathabundance_matrix.tsv`, `tables/humann_genefamilies_matrix.tsv`, plus Bracken/HUMAnN summary JSON. It also derives `tables/top_bracken_taxa.tsv`, `tables/top_humann_pathways.tsv`, `tables/top_humann_gene_families.tsv`, `tables/metagenomics_backend_review.json`, and native dashboard/SVG review artifacts when those matrices are available. Missing database outputs remain `not_available` in the visualization manifest. + +For direct Kraken2/Bracken/HUMAnN runs, the shotgun runner also writes `resources/resource_plan.json`, `resources/resource_manifest.tsv`, `resources/resource_env.sh`, `resources/resource_readiness.md`, and resource setup-plan artifacts. The Kraken2 database contract is always required; Bracken and HUMAnN are promoted to blocking resource checks when `--run-bracken` or `--run-humann` is requested. + +## nf-core Adapter + +When the user wants nf-core execution, use the adapter to generate pinned Nextflow commands and capture trace/report artifacts in the standard envelope: + +```bash +python plugins/ngs-analysis/scripts/run_nfcore_pipeline.py \ + --pipeline rnaseq \ + --sample-sheet samplesheet.csv \ + --profile docker \ + --revision 3.18.0 \ + --genome GRCh38 \ + --bundle-root grch38_core=/refs/GRCh38 \ + --execute +``` + +Supported adapters include `rnaseq`, `scrnaseq`, `sarek`, `atacseq`, `chipseq`, `cutandrun`, `ampliseq`, and `taxprofiler`. Each adapter now writes a run-local resource gate under `resources/` before execution: `resource_plan.json`, `resource_manifest.tsv`, `resource_env.sh`, `resource_readiness.md`, `resource_setup_plan.json`, `resource_setup_plan.tsv`, `resource_setup_plan.md`, and `resource_setup_commands.sh`. 
Missing required reference or database bundles block the run envelope from being marked ready; use `--bundle-root bundle=/path`, `--genome-build`, and `--include-optional-resources` to make the readiness state explicit. Use `--skip-resource-plan` only for command-shape review, not for execution-readiness claims. + +## References And Databases + +Reference/database readiness is tracked separately from executable preflight: + +```bash +python plugins/ngs-analysis/scripts/ngs_reference_manager.py list +python plugins/ngs-analysis/scripts/ngs_reference_manager.py check --kind reference --bundle grch38_core --root /refs/GRCh38 +python plugins/ngs-analysis/scripts/ngs_reference_manager.py explain-missing --kind database --bundle kraken2_standard --root /db/kraken2/standard +python plugins/ngs-analysis/scripts/ngs_reference_manager.py plan --pipeline shotgun_metagenomics --include-optional --outdir resource_readiness/shotgun +python plugins/ngs-analysis/scripts/ngs_reference_manager.py setup-plan --pipeline shotgun_metagenomics --include-optional --outdir resource_readiness/shotgun_setup +python plugins/ngs-analysis/scripts/ngs_reference_manager.py plan --pipeline atacseq --genome-build GRCh38 --bundle-root grch38_core=/refs/GRCh38 --outdir resource_readiness/atac +python plugins/ngs-analysis/scripts/ngs_reference_manager.py inventory --outdir resource_readiness/inventory +python plugins/ngs-analysis/scripts/ngs_reference_manager.py lock --outdir resource_readiness/lock --include-checksums +python plugins/ngs-analysis/scripts/ngs_reference_manager.py verify-lock --lockfile resource_readiness/lock/resource_lock.json --outdir resource_readiness/lock_verify --fail-on-mismatch +python plugins/ngs-analysis/scripts/ngs_reference_manager.py check-all --kind database --output resource_readiness/database_audit.json +``` + +The `plan` command resolves pipeline-specific required and optional bundles, checks configured roots or `--bundle-root` overrides, and writes 
`resource_plan.json`, `resource_manifest.tsv`, `resource_env.sh`, `resource_readiness.md`, plus setup-plan artifacts. This gives each analysis a concrete reference/database gate before execution. The companion `setup-plan` command writes only the operator setup bundle: `resource_setup_plan.json`, `resource_setup_plan.tsv`, `resource_setup_plan.md`, and `resource_setup_commands.sh`. The shell skeleton keeps setup hints commented by default so large database/reference downloads stay deliberate. The registries live in `references/reference-registry.json` and `references/database-registry.json`; they encode expected files, root environment variables, source notes, setup hints, size caveats, and license/version caveats. + +The `inventory` command is the project-level resource surface. It checks every known reference/database bundle, accepts repeated `--bundle-root bundle=/path` overrides, and writes `resource_inventory.json`, `resource_inventory.tsv`, `resource_env.sh`, and `resource_dashboard.md`. Use it before broad multi-lane runs to see missing bundle files, env vars, setup hints, license notes, and which pipelines each bundle gates. + +The `lock` command snapshots the current inventory into `resource_lock.json`, `resource_lock.tsv`, and `resource_lock.md`; add `--include-checksums` for SHA256 hashes on files below the checksum threshold. Use `verify-lock` before reruns or handoffs to prove that locked resources are still present and unchanged, or to produce a concrete drift report. + +## BCL To FASTQ Execution + +The BCL package validates `RunInfo.xml`, optional `RunParameters.xml`, `Data/Intensities/BaseCalls`, and the sample sheet. With `--execute`, it runs `bcl-convert` when installed, falls back to `bcl2fastq` when available, and otherwise records the converter blocker without auto-downloading proprietary software. 
+ +Successful executions now emit a demultiplexing QC summary under `qc/demux_qc_summary.json`, include generated FASTQs and BCL reports in `artifact_index.json`, and surface Docker-wrapper readiness in `validation/runtime_preflight.json` when the local converter is a Docker-backed wrapper. + +```bash +python plugins/ngs-analysis/scripts/run_bcl_to_fastq.py \ + --run-folder /path/to/run \ + --sample-sheet SampleSheet.csv \ + --output-directory fastq_out \ + --execute +``` + +## Local Plugin Install + +The shareable unit is a marketplace root containing both: + +```text +.agents/plugins/marketplace.json +plugins/ngs-analysis/ +``` + +Unzip or clone that root anywhere on the recipient machine, then add the root directory as a Codex local marketplace. Install `Life Sciences NGS Analysis` from the `Life Sciences NGS Analysis Local Plugins` marketplace. After installation, invoke it by asking for NGS routing or by referencing skills such as `ngs-analysis-router`, `ngs-fastq-qc`, `ngs-dna-somatic-variants`, or `ngs-scrna-seq`. + +Do not distribute only the `plugins/ngs-analysis/` directory unless the recipient already knows how to register a Codex marketplace entry for it. + +## Local Execution Profile + +When Docker, registry egress, or Nextflow process containers are unavailable, route compact local execution runs through the `local_light` profile. This profile uses Snakemake or direct shell commands with isolated conda/mamba environments and avoids containers by default. + +The default local lanes are FASTQ QC with FastQC/MultiQC, Salmon quantification for bulk RNA-seq inputs, bulk RNA-seq count-matrix differential expression, samtools/bcftools variant checks where suitable references exist, post-count single-cell QC, FASTQ-level epigenomics/amplicon/shotgun packages, and BCL run-folder validation/conversion when an Illumina converter is installed. 
+ +## Public-First Boundary + +The default registry favors public packages and workflows such as nf-core, FastQC, MultiQC, Cutadapt, fastp, STAR, Salmon, GATK4, DeepVariant, samtools/bcftools, QIIME2, DADA2, Kraken2, and HUMAnN. + +Some common tools are public to download but not fully open-source or may require EULA/license acceptance. Examples include Illumina BCL Convert, 10x Cell Ranger, DRAGEN, and Sentieon. The skills surface those boundaries before use. diff --git a/plugins/ngs-analysis/assets/app-icon.png b/plugins/ngs-analysis/assets/app-icon.png new file mode 100644 index 0000000000000000000000000000000000000000..c9fad9910b725e6c953e9a7dc67decd44c2ac325 GIT binary patch literal 39140 zcmdpdWm}tF({*rnio3fMcXxM}0>$0kOL2FCySsaFch?JwTY&;a^Mw2T7jKT_OFks$ z-m~|bwbsl;sVd8$AQB(~000y@SxI#O0K)q70}l)S#`oyk3ILEAiUw;03hM zYJJ#z1sW8(Wv&m!a!6oCkU`3;Lr5WnW{yOjWi4^^mJ4%4%>4_Ui;afxM=XM`2u`A^ z2o6tTLq=B2i(fJNZ`W=1f& z$UCQyG393o3JP=%K=K=D1}`JU!5%pp8d`!(XlB-mwwIYU;Mr_oXh>(C1dWuBBZe0a z07WcuFK{AqtxCXeiXZF)pqUvTKc3JE>awRGB9XfRV4Upnp`oeFlXTyq{Y=BOa!~o! 
z5cH=+C#c3{bAinp-@MRa`M%`(zyJP2*fh5S zEzl81axENummzi9d(Cr`!S!URPCyBZbpsGF!mtI4hPlgKh(p}hZ#mzu>xg)+>VAI^ z5tQ@u_6~hq|Dq~Cq|eduR$-Nm1{#~A3B5(F(wCTy*4(x#BOCYhO2X1L@2D2Y6d1IS^2LRF79R;|(j~BT0A|C(I zdrlokWt1; zV6y1x<5nF`Bs$~K79j~qUV%_s3UI2b*=@nD|6>X+;f#MoIaVf`E^%Cp0**Ybm0gmG z{M&F$sIvYG4tSmvs-azxUdss8>#Xl?A7jNw!CvrXU2<;i5txHoXf1cXd;G}imSjkH&p z)B6%5ar5yqLYQFe49Ua&crfhU5ZG6$Abal7*oEr1)`$gVPs~G#<+38CEd;7aD!M+DOJVF}Vv5jL# z*jqzn40m-~5yHl(YloBj_*m}{xOt~>8Nr1C$1pD4LJWYFH9LY$f~=X4&2|kJB~w+m z#X68TsQJ>^040RvJ_q_JlAB%CnMIqZkOwkX&*m-mKqZK~+>-BXH@B#Fzk5}Eb$v<) zqo9f`@IPFmUecDa;vxH>n-t?$(u>`?zrxjwBj4$U|D5q#2QQkoadYVrB^r**i5U(#>lmF?NK)s~-K>}hZR z$Z7YuXjGER*-@s=L&&->5OoX3@Z#!TNBJtP1%2#a**Wops%P?=Z_=)v4Jc_&>SM3o zqj}b%QJ_8S{`+PBWdO5dJfMW2*lu}m7j(&-QtkLabp?iMv{`h+p3BoiTH(9GDTW1E z2!LCGYysI>c)@T=5Pm+*+E^2R+78RX!6U3%6^v_L@%a1V(m+3G>^|bA_oHc9Vzd8M z;6n+qu`THA-q`6WV2X#pkx3mAhNY#d&Jf<}*LYf2*sO1Whxg7!9@}>s8uQyuka)NW z)!+3LShZvKt9RayQ<*xFGp^iAL;v2COw%Ya3iJa-qYSNS$q7^WCNfz7<{CdFnKF>l zIpNIq&rgw;%m4i9L0t62-mZa>LRP%=)w7NPe}1yo!O+=RZHP6N*oBHVUk%HX*GL%x zIe1MaViR`Vsb%R^=Kt_pCkRP3LYyPNbam~%k>WaKhOyLfGeOS$0^UK4XIs9>Eq0Bk zgkY^L^M^sO(bWR+w34Ma;Y(5(!uhogry@$y<_zKgS_Siz# z0Hb<=ONY9{BfZ}nl_)Gj>0mPl#0#j{OV3osP^D&sme=MBzt5$98^SQJKowTx^Vi!Vz>OYAch68m&a0C>x7-Y8zNT3Jg(bG zt9OL}d)@<48(!R34q9duwlS*_z&L-RN7OS$D}Sh^)eNcy88(}Tbgs>jzf=#rjUXS$ z4Li4Qf*2s%%Eb`QWS{372apymO08Auk_TbsFNri9I7c>1_C{4}I{i3W`($p>EUe zmCwbXzVNUki`p9yD`EyEm8~Id*h7((L6=dpzB;SNy)tZKwTwCP(~rYwG5zg5qdalU z3{$R)Dd7tZ54X;>AaTFNdCKKTZsXRr>sY_RpH>y8nB5>b*T~I(Kd%1yc~CVCuJ)dD z$*biVPDi15&gsZ_zfA<(@X;j>8;DOndq8nNjAwPK1bV7Vb(fT1eUccSa|bpcSB!Gt zmJByZ?>;?VJpo-QRg>N%LOD_}i&GV=i;6xy?`(o2EmD-T^MQTP9Hh|6Puu|SoynLW zK&c?#%`)sy-e7CnL*{-Sjf?A~f??+a^{=Z$tc=qiJ4F0glH0c(8NGXi9-pKKMGJ6H zL#*#S{4Ir5=;8gL@!OQxqN(z|UJ4(q3?EzSsX*HaofZPtkr0zZ2X(xgC@U z(#ZSCFUGwWk)go~GhD_D`u+7WK59!^((`E2+Dpqfzc;6C+} zVPc%qYu-J}!RbH3W&6J4pA8GC(9}}NOt>^Lq4k$*Vl+8Vru_+&=SlrWbspkf6Gdc{2H3HFi|p(t@7f)!UkaPn znoUXzx!$afK0~LAhkeo|a59&&*1r$*^%z#`(>qfifo+oa^~3H&b~!y&b^BTDL1_~B 
z6l3REB@Ib?u$6-rmK`Q<7v$9!ADV8?rRAJR*J4-FeDsU{(7q9LyMu%#0D)epDRcIj z<&tcKC5C7CKECAR8LjgrRB4M!+HAH_VRmVP6y+!oGewE^gd7~y7q7`qhNH(rU6yT0 zU;a#B{rAT?IpSc(+cN;e0b=}gytQ8~Zh=|Z4yS36r9J0iZjjrsc0Dm3W-h?)Yuin~ zs*sqUV6rjc=glJV`46F9zUoA>A}N!ITm<3N)mSfmL5|$snGDoQ3{%VuI~)v;uoy8(Ue*t?^Ht{=@3`&O_xC=Q-S@-A z*UG==yf*~a*yJ42>W?LzQ&pAj^?ccLf;4q#3@T{98BcK)CjF#gB+?Vjm?xL>p_K78 zf;amo1RQ4_nyTW6;taPwp0@*@Uy13WZZYJDw5W$o7bn8W_bKd3B`QCUz;lQ~|69cJ z+))1(nimRjJ0Hy$$kV~awFy!b%#^pp7S%7f#r<;s`0pKqsoL#$<^?+`Je}cP#b4TLehurXH6UjQ3_?RzN@u_eQe79(qd!u#Ikr-= zNe8gW->_})%?danek=vH*rE8N;8B<<1ccenqOrdGI~~7j$GoWvuqqlG1xPDa*uK^6 z!QOh8v$fqE7K|q$Qo?;f#_7T(2KLx}WIlnH(Xgz8P1+M6lke);YMKSqZIojSt4zy` zBAd?Avequ(%gCfbpbBM1Mm~RDbLhDJVC;K3vozH5EGJ&4Yj?2g|Cz`Oi4g^bd6w_y zFck<=WGr3oLYujJvD|D(n-aL5QN!=PMI+9I`Q2OCe&IAsU=<3#<3gG}tTX=^tTGrX zD<$xXcX+Q(b-=tQSfNEH^%%Zv24MnL<7?oAe*sJbjM}-^^_xMyUsyx2(y*ESJ0!%0<=t>8C#Bt@Wa}1i&}vvP3N7ZuF{5lCq81s8)h=JPr#=O5D9VTds)*t zQ%V?mrU6gUU(j2I)Wv8`p;lMxUj4I3k~EV_|&*D~7ob3n<9c4Pnxw0w%~Dr^#?cM4pn3wsK(bXdCkhH23==s& zaA-@z=%E-<@r9hPF}`xq)I=_YU}3`P;x^MR#Qt}cta4=uD^_uc6GLxg7@>~97W(aP zNC`xtnQPZ~a<=esiJAqnij{L?4+YSS?n-_!DhJxan;)P#NRPy3PHy(YdW4Li65e);le)=C0Y^!a9)8W@Pdqu z7&vXy`0%Y$s%s7uR7A~1##g0qdhM1GAQ^}8&w_H3jQW3s^$98%Fe zUVA03Nen%KaRnzIc6}(%Rh}{%a-?<^$3t3rm<+!3aHkW)N!tZ&6Ou*sw6BnZMt}7; zAkPLX7$WyA-;w{7udMB#=fu_CHf>)VCx?wC)vG`)esEjaHUCQjF9S_d&xFfcv4<9v z`7j@r&Yd%Q#oz9s=^Ctrz1UG3nS5&s-dep%NJi{TK2;_dOXwPOfts%{*dP-#<)?3| zU8GQ^GQUdDO<34ypN}sQQ^6CHlm*lO(*G8(hp24< zK>2!1De1aa@>GtqcJ4B!=z1@Ci58gAYU}O#1L-QeZ+pZq5*Sh(%r(?Gd8;*!*ToP6 z^p@&}UU4I9NA97rjGL$tc`LzY2IW^N`~dgllhTwpaX3_%RfOTuFYIuLKYr_#KJ>*f zC6+C~7qSfrM;dwt<-QUVjqYweUl>|^lT&S=)aP!};${rjjp9LT#TN1v)6;9fM2 zlXB=f;Lx>w5YGsalfUw- z=iw8ZgDZP=K{3hBS!9@ZuM*&^Cm{mtEhug-=-Ljj{b*W^k*^+jn5@({?g;3|z=hEsUelK9mV zHkE7NA&eHJ#S6QD>ON2*7iGH%5=DBJ(s8M}b%*!D*ES^&c(GP=ECY(HQbLQUtWO<+ zHf8+0`{elcW+fczQx$VzqtfHR-n{JV+`(_z8{mh=rj!5f0(V1Pmm&P|!BgM$^pX>G zNcFhE84;`tc-V+GfrCMWPL1H;G$5*MkfFcML^2Hs#8~}gCbO=!ZnNoHR#xw4e#C1P z${`SrHg*UHe%@bA3N~jXa2TuU2Yp4;oDNMNy#X5kZ&-` 
zE_Y3s;;YPq5?y%_3NqHv>m_JZ_HH{@?HVc~p%*v9vI|jHqFjuWz8&i#W{WxFx(bGMijIj1{_Mc!h@(Y3TsU zjGqlfi~L+3BmnzoVR{k(S)R^-z*+!5SKn<#v@Si02jn(t+m`NFnemy z+aXI8bcDQTdUNXkONK5WA=K*+9G1B5j|XxuRP6db9S5+{n(!mJs!c{is#!ss$$pw z9B6hQqp?F&KC==Xb_W+&`7?aRlb#hxGbkWR)o@wyX}o;J-fee`nEaSe1ashAi&2)Y zz}Q5Rc_PCv*Y_rC8w#%ZBBhXE9 zy0&;Xfeq+(K;kQ|#8jMOGZ}|3$Y&aj@zP;+Rw0u5=^F53N8k|K{A!p;-Y6;H+#Lh5p>64ugl+o>mC9pLL0I6;o)MWP@u3OaOpofUX*===%%^A(K#cy+zU zbJd{R7*Ygaj32K-5DLZYpfEp0wlPW^KM7V7Y_wYz$nv+IjUBgBlhSEI2S^YE*cZJF~B4aj=7@ z?v=Algr5HaX|1V^1hhM^ctXa)q_8xr9prbM5p3h^O?^>-2&QaZne>sVc0e-`^LxXh ztBj9#w(;@;1}s_=+|dfVeyzpLy-;yABP!3W5QqEMSM4Ncv}rLH9qKCUJ9DS|?22QB zRHd7o_ntn8#~WQ9+NLe$r~dc_%;z&nApyBZFfrx)@?oK{z*f$Hk@zw%sTvqQIC4=) z7;+jyZ8bkYd`SlxL4LL^D(u6;p=J2~0k>=y|_1ne~bE{LBM zhim8CJn~!M&|1On>~Y*g^ZCrQo+~`IdLCN95H;*bc2C1=3}-)e4F!XM(A%rNdz<3l$_^q8s}8LH6_0}-j3x69 z2QYFFx~sfep_^2uPbo8tleqo|lZr@ZRzE}>fNfVGC;H=+fl^~(J^d%QThmEu9>r># zvhFw_ug0->+V}p0q4kC zQoldRpd9$0JQ$H}b1iw+ekSm(LOR*se>c7NwZv5_Ku|r4V+hRq@fOiN_EWSDNig?{ zS9CmQ=1sn9S7%96)Ep&_Wx28CZ z>s%M6DP_JmfI~_JJwgVt?=QameE#R}Abq7vd0+0{`+)PJj9g)`24^-PMWThoFi&5& zzw!yQ5ZHs>nBCzI>kEn4y#Pp}#$!|NPv`sES(94Qt7rj9A3k~F1-ivWgt7!3q#Qsl zlE-ZhJ5761?}G9>X$(A4aXdK_@7w-F(uy7RJcQIJz_d07LLche(uf6*gcP%N0#bgb z=ydmXX#(>or{o32jAkw25;}rAnE3ya4qzZ4Ob}>l4zC^!RFDU@zANN=9si-P|F<@} z-0%%1*}F~Nbxlex0WL4u3h_ns6QibT6;gQUpMb1-9K(0N}g*_O_$gSsRYL)Vj)lCmCnyK;K ziP_Ht6|roWR+4E?&MnViMZ1kTNlqaPL(J}DXXd)DxmfB$`tc;@Hhg_-}lM&4oC21^W)8l`rg z?Xb>#w@0E2?oq_l{r96mlx`~l=g9AE4Lsmd@4-%frRR;8{?M{Ip~hDLu1Dt&f5I}H z`$86t>!y;7lncYa{I~bTF?kXmzj`K7#Kee7KFD8<>M{vfveL60nv>6V@OZ@Qc0h10Mc@>v1V2OkbyFj#{ zfcG6=GS1LU#%1$?!-z-d%>3eY+Os=U`pjl3p9?Pkz4iH*L3~}O?RUGf`y-et#`gmZ zbMqV<9M`UJ*Fr<0+~=4OL_>J~gtvX_w0MbxDh(8ScVJNFTQdsdZq((uU0Xq5$a~IYs|jeutuXhE zd|vJH=x-1Ox?CDGae*Gt4Z*+&b68p#zM+{0ywXj02?%@r>OW*wzpyIO^R-eFUG8X8 zk}E=CLo-1fW-~%KN2KNNTt|WVu!A(a7!0jrilme)t}+-~4Ym&hqH~_V&(+kthD;eq zJaiBnP`C5Gt(0-qd3fGfmDo<;!{Wj#ix*EI$YDSys3@-jdj%>ryqZ*07@QAW&kq2y7bmv;k0S^VPI27slcv!SLgZ(xoJE 
zP;1q@@%xP0wAQ37H>`5Qv}&%Ly;?P`N_Pp*E6?&J(-^R{x5Q;_Zs~U->SybPU%~iz z{ACS#!W8%<5u=D35TXHZecgl$fAB`? zdwAgbUhnyXhK>4rktLx|+PHHI3Cb{a9!n0`NoQo0S*+7V0btkXF}m-ZxgEvp$_sss zkv&NrqX;4y7MOO~;If2DE+mn7p}2OTH_PCzur&l3%mP6yHK0iAM7uP>UtLsMdNtUK za&bAF8+4Bp4mQO+$$np{8<7)eb(5B2iI1w5Fu)+#y}_SXvAI)v38xUitMYY~B> zJ5FX>8!Rmi!@m?|=Vrq@3Z9g7GVTWKTKn8$P4SrkL2OgP8O)?>9dHW75uEsJY}gf@NE5Y)L-hZyCGAC=1UwE^5v4W|mI-d@Q&kZa zKv{41zs&I<0*h-?IKW}OTFJmsVD%R(?xUwNM;YSVTF@$3A-v8c=s!V4U-*kG=>k}g zNLOE6mv{JjZY=_7Z#FMI@wZ%Y3mr1u=BM|50ZIf;b2NnB(wrhV zk~@ZHRik+(`1a1NT^ryLaeV_mIu9=|Xw8s5tm(euqD3%Z+EAjv zsB8KgD3pw}jA+|X!IHomks-oIzso8nVMs0M>z-;*{ESzjER{BqKn63a`m5M9^VG9L zf{rQ;a_L=9SpLoqUyvBl_zN9B3=RfG>4CFetqo5*0kv6#n`42w^|$xn_WmeZS|iWc zGE#+)NG)uvuavK(weQwqV#Tk6CLhBCRA$cqYi9a`rD1Q1yc4+9Di62V*D#h8oC~E# z=!ZxTgR44q#cY&9-`Fa=!*P>o)lUM6Vfk?SqTY-%i{~<$&~)6jH%A;_Rab*u zw64p!BES*>P9T(4U_04eJ&nX3&6Mp zXRFK8UX$=`Q&SMz2~tc3W9P9cKRS}?3I(jRz=!7u>BqET)@DbW_tNl+JRJIPfVHb! z8nA^68Zy4*M~NJ#B#+F~xFkBf(hn^JW#3glaLWzQAK@Rv%YV~K*j7J}^V}nZ_J>aj zjN4U9%&wY!to=!$tGpO$l-}OGL?X8c`+ULd_qG9{2qLlfP$lC+oSgl}iVVFYxT#HA zU&@W@s`o^YFYI8<%^#2zd7mU&TE#CHH z5YUE{pV|ddE#R;MnpWsS!sQFG%E}e@yIy1@FYa75hlu{mZ<`yR{Y13DLT*jtc;B5? zqKuk$rP-?n@m>_h2fsN`lL*$YynLsVM%uvECd8qMNg^h`8qdo>xGr03DGWC%vJ=tS zky>PJbX-D&@->ZXd!C)2KuX>VhN&*NCjykq)2jaWc+EmlbaeKcHwVw=cbmfm3T27v zyz8+mVrVJih?JCs^wo&SRtYy1{u$t(bkl-|(-j8SyflA8?#qO(* zitwz^qiH-#h1s{+Akb+YO0RBF85I)E3@h}p0m0JP4=y~K{(k0}8yz=R)Zp>e+h)%aXSiJV8MXg=^4+a;KwshK{TW2+mSa=UkU#eMq zJeQTlv<`piJuwZ360-ce4$$d)T3Mg`gK#y!yiXZcI`?n8)Q%)*vC`IJ-6GY_lI&Zv z5%iTRgW6j5jF+;}eG_dsOZ0xJ2iV3Rpe&v?E0x%SpAzZ5^c&&n|1-3pIURXddEdi8${3WGaOBXi^% zv;j1W)rG;<8|QxYJ9ouFem+EyTkF8xtr0fDtMg9)t_Di zL8KfzJj_5TAY=7xe5XwBzm|e9SkLBG(i6n@S3%yVQ;n7N4ybbSAC{Ktjw0$x2Be$# z1qG%3%TvDh8N&RwgeYdEZ`RRZ*pP`W%7qExHUdZbugKGKyA#6~-}_ZTdIVbexL6>E zQcF@6JOO}i+t6zPh}bEuWB1GR?E2Uk1zmorr8Qidj&3_-dai%jl5rUxLNo!D_+JY( z>-%*5bf=S$r9Qm9c*n?5E}YB)9{tQDzf?(Pjbwuq$_Q~T2X#0)M~s`}ZBucA^NG>j zkkS;~Q?axGo!#w!B0h@wVb};^PpIZgFx4Ro2x0|z>G~6%GV!zCK{4VGzF`pQzSAB6sE!? 
z%XZr&Sbq!)1D`p@|m;=!2G2L5r`?J0fGlNAQ>PEQF*%An1MWsXm_| z0}?Md(#8mR8q1}tlsUowyHbQ0&65@TTCf0-cah&kkZc69Gr@S$K0^&oK7%n8`3(u zL6x!cE9wUpA$$$ax~M6_gz$uevstZ6x|zPi20+&w$Tix!OWQXyk|wl2+e_@fMw97E zu8g2_H~&-3b%mp=TXW20PQVImkh3JJ?_wi-cEtLnEtqP9tGU)sJV7rDbzVJh7h20T zR4yA9yvDK$$h$dMQOIAHT|1AgCX?Ddi*WGIJJ<~Vr%ruF6jLQ(UMWAbAw6STDJ^&azrUgt;eg`dFw zO0$DWgI%q07)AT?@phYv>b5VlhHT$-Q3TY3niF_;)qCx| zYFGUeO%OrD41#Wfr8<#%FM$SbQfuONK2Nj+Kn^F{OsW(i)gloD8tzJB?SK7;z`;$B z^QN0q$!fuf=x6hV4z9#|o0~w-plFAYB58x`I=8F5BZ%im1py8n=7ni)uo$LRbu&;^ zm+k&sK?IZ~HK+UP<@1iTOZ4#QB{JO=H0$alv!YS7w+b&LXCRT43Csy7``3jZ#TV4= z_;75>>{dD*SN(CnQixQ%;f5o$52q#Q`SaF^*9tC!)B{`iJ<4vhyeb~q4kHSIr#2gl zVpqSyULmh=*oK(mK;G}+7AUgq_b|%ZbMzZ5_p)uqE#L(6vB@0EXj4$L{-V|JLf5}M z$C-=iic;3C#^l~N>_;H=dF-X#1iWi&R9X2IqEUXfDX2K||E(kyU3Z4xq#@YC75I*G zBpuk#`E?OL?3zXBCZbxn{jm(KCm}NJUyGwRD=N4da7B&KXFmj{^%<(w{7GOm4xf?# z+Q{N}jnV2Xx~pk)lgN+MlBkMjx28);vL-APLXsxuvz|ipyv>e}=zKf`35wEeoAveh zxMZa^f^GR>!)k|3)jm+yg|FZ7_0C|-iKn#S_QM!bf7|UGsZae6x5Qf3$Ru?{V8^}? z5uw}LtK@dkknlrk(xJ(lFrkv1pyTd`t5NkFtc&Z$H*Ho_%K=H8A4S)?0ZR|XYrUZF z@dLki&i!G)eX+cM&?74pj~3s&f{xPRNcZ}AvScL>5QZKZvhKJ`vaO8y9Yy}eaY+3= z>ud+_ZG_vNul$U%(sfg#Ers$HDR|_;kE_V`JJ<%@Gva`e0#dS}5kF2i*?Uhmn@k;2 z!j?QB|GKZ$nY?-}X^+Ctpfhu>;kF5U*pwJ!sMfhbpuGBQ$+vdplwm<$c|mDEJyh}b zu%<+^o`m~VX7%xVQ(bxmd9b-tO^6(VuR&^u6Nkx=Kj`$doYS%Mjy4I2u#)*ynbd|> zo2&pwi-tnS=`#Mm^3)32$*Sg_)3cK|_jjy(9~_m~8O2#Wt)4Xb&i5Sl?wPDU*M5eN z=cW3Mp2ruFBprv(!ZH*svfkPGI8^mhHiO$?)>t08Ruv8bGT~F|1bl(@3%7z=9wsN( zk*~cP50(Od2DASgf{KG-kd)$L&GPKc>u7?EWK@gA6Gh4LdL_r{#kJ2hZeVNR^DXV| zaApW9Csk$m{qJhJJUVPg2fG$5_g9B+l&(;2p?!8N&j(!w{dX{C#F4s(pnWipzVy)s zJC0m!=f``j=l+vwt+an{Xen34G0LeFP?<{<#brUQlecsxLOfgK*?5HE0mOF>TiNV*1Udpfo1^RAliT^6;& ztF*0iIx@hAhGYt}|LSmp>ADI`l=lp((~7H{W@a^N04?Q7Fy4Ha_jb8J(Gv@vb9L5m z0of&x;&~T^4y$!ynehAt6IAf5xpr10U03eyNr>TS12+@Rp2NqzH5w9sfc%+5CDY?FOAymdv#0g7oK-f*$H^~dp&vd~#;w?X z&cjY|ivSK?!)7qOd$CBrhfZU#CX;5IPWu|lxaWEP=X8fLB~fqB>r+nAoy*i`H~U_S zdHq9}Sw@oO zqw~f>Q7Q#onfIOG5Pr>7QVBSGWo%U|2Vx;aE1nhQxtWgK>iTx4r(sPHbq*&{$*gx4 
zCqoBE*{iGtU-vx6fIjy3Wp_mojz*cY7_$W;PIjgN*O=RlSXtfEt?F+<9z8B}_ zyyP;_!171nc;!EL=A+&CZQ@(nhJDkH&O{)Z+eHTy4ST4t_4+D5OlR>+SiN#TEuo(v zzq~y3Xzjh;z3T)Vi5vvB(^`RDRBufcYK;43k-C@XU(~cfT|fUDb%F0F2<&{|QHMcM zLOG7=OH5j+DvR?_qE1>7OukTbLYvIdyD?%xrxA9m`N{73S}mTsm@gy(jkv(|7#6+2 zM}w(3?|Z9mZQw^$$cyRNztb6>>sa}0x(lA1!cN-U^StNMd9^23OiWS95#ejs)C)RF10l=t z{1lZ;Cwk2esup@T-y^wF>#fpmg-0Z~`VFMrZuzP#yCznwx73|g4j)C%T?hQ>*!w&5 zwa)*>P810A@E(Z(7u~#L6Ws*~w!tRTfBOc)en(*mHzJ8uY~u2iI`rM1$_>etNKIR{ z(+s4CM3hmU+iM$9b)-g*duWC)M_z=P70d^=V-tG7D--YEYVMCJ^@O?G)~Ug=vi7P4 zeXjYgAh)+WZ|rPYPtA-O`9Zy{&l1!vP$kI9zrI|g+c84wC&jPn;iwc@{-n;dpI*sW z&c7M$x3np^ie%B4P;u&>kB;nE{Gae6fat?f-mVcmmrn!cRx|S$BlJvIp!<-Nr$1Ph?XZ*V+93WLN^V-@b66KkNdZ?;mi3_F*DosUy8zKQU;X z%0^+@wxsJiq_IBm65sFL{I4apZVU%ARcXC_Hi-nxB$iYsKa^Op2~8dh%SyyX{T(i@vfcqP~KxsFAHgw;5NcJZ-!2Yw0{36)9K0Rjbz0dlbq?C~B zg0eCkzB1fc{e_Mmw;6TajA3(NTHpIKJJ&k{QA*w?Q~ z>tCE?6l_-hVqt_tUp}%zi$j~u%tKfStMvbEy5M0ExGwPxX#R#S0E^Vxd+koFs^)gM zx>eRL7}~KS#$Veb6&p3bRQ%NSpkvYxwzB)=51bJ1uH8QyzxCy^b(N&d%|F{gn~!&0 zZ}R#5c%3oxbK03;9aWadt84C=(Q_~OGh4c}9xCNt_l@r};`&}WGFSsfPx=<{|9Zr9 z5LcGj&=69)GzlSv@VgXV7`?>K1@wB}ov?0i7oWH}OjZ?C7!aRQZAF^9Dx;Kym&i?1 zWFeN7J|t;wM8Hr{qyw+6vq&#JF6uvC`xy6LrO_E7hjnH@AJ6hy4(nc$SFr7In@8Af zpSEA1y}w>U;E-bdufLGv_m;<82g2KI8aXHCm#^xgC8FQ8BZpUZ!na(HQ;bEPj7<7` ztp_p2E=9R&A$XI$*Ck{}!cT!YwIgD~x66gimSCGhg)ZM`A)kljttSYdgEGE2J2(wf zUV1$$r3zTa_j<&7VyM>KCiCK^{|yUjGYJppjB#cmz$@Tt>B;DMw>>O1)VnnTt_O9~ zobXs)(taefvAgd5`E7Lj!?|A~o_#UWN;MUoYNp1MU9WlC*?6wmj+YYBkVQz)5d%Gh zo^w_gKG=`OTGk*Wnur7Tm_e?nK{dt!T)e(lD42AGt=1Uur$35kx`l1^D=2f!Cck6ygZrp3-mY| zo)LK?MZy?v`lYK+eh`5}@@+)bWEJJe=y1|VMf{F_? 
zB>*V10JeR-EAvX#ZPwQ{V5sW!Fk*eX9kQzJekeZ)@c=k=+P02Db5V_TGD~xaGg~FGn(G7+uVrMtP2*n`& z%=i3^%=KxXfFrU)Ev#kklvhZY+e3H#%SQ4+(f1#-0gntJSKF_w-PXTPNd3R>C@Nuu z9NfBEak)3FHChFkp8p03I=MHz@e*n=VKr09e>aOqvX*Bs?%+0A8UC8%`r*IbCAWBI zon^vlW@uHOCvZlx-0#10c5(fB<y@Z|uYTG<>kiI(Xn4|*zW7ixIvG9lO7k3nE@Fady3 zPz<4^PSIg|UASroJR+8Ou(Wp@ciWry@wuu?w-*q`xw5==%Tr&xPO!=Mo_ z+%ISX!DxGSQbL@N)lKd=*=JkEYKBtfFz#QCXt8ysJFD2th5tx=Ud0QCY?oF3kVIQi2 zmjntwcL?lQ&I66hfC@%i&hmi)Y$j@0c8R*|vzzKRb)Re~Ki>Ew!b*jc+fn%*>76tk zT&_&Wm028SaQ(*Hb*BSrZzviX{ZQc_bT%TVLV}YP-eXIy$z8aHqY^izCfnksaBzK+ z<6UzB?ur8pqJRhz_*;HAlzyoG3sbMvYNbsdZT*lGavq@pXtJA;b zv9-7V+?=AQV%5>GL2yU^84!_l)0{_%IUm4+9Sr|lp_^V)B>8(I7}#v=(2^c|JG~An z+~BkUJ;-vR@6hw9Z5-M2%dY)@`y!#$LrRjQ%}8#l~+3x^Y)sHB~+L zYN-5MOGt2cMTpD(4vowUY3Mc|zA3fMjk|d6+BY*1nvYKPzA9G>qRQZbAK08%tt>gw zWGk$HKFwLJ?fEC8^eR%n{O1q2M~{W)L#K=TZlHJYrcOBMdb?OVJm%!aYJETt(`;Tb zje}i(0{{O=ItRzNx+aWo)TFU(+fLKiwr$&L*r2g(+h&8twi??>&+!_-6imNWJDb`*8WBcSQ@ zDd))dp9C3n*J(6H`V4bE1#OvVO6rp)*E(kVNrrlhe1!$ckMK;P4u-uT)JOI-0ICyZ zL-CyM=Ih_E%t^CaK9@U)fDZ5~H&%|l{|@7k-;1f{YH(D;BOm|ni0EUItneKN=fN?k zRqU`G?ID?3vC|$h=iP^8m;SV4N^H@@i0%@$7&G$)kqE&k2IS9ZAy}hv>A!7y#R^;u zIiG4UZLwcDL6diwgIi4@l#zFvN`rSnLh?oPWq!xCDzkS{q_}8H90Y?V2Itw!YEeDc zFABSSe~umT|0uzdmF0I7peWz|KpOtt%KeS->s^5S)W#;rfJNJNTYY4&)o}jfWuXW{ zux7>@b#hA1!JoQQW=Q$3QqjtVG>TVB3LGd6gd8=Jo5{q13XW=ih#b5YvckwyOSe&NNakIv&15 z^Mh0KnyS%Aj3He{x7!&zq_=vfOCJR|FZo{oOD-mytt+V!*+mKZukmraPxYu5Q0-Ny z^1A;Lce$J*DKoYLB}Z`*P`ufM#85kt&e{sAz5qod8YgpH-yZvuQ}qGH$tC`;0!Sb^ z={X|}gVqo?5ICW7;+D(bx)uDqSZ2Q3o_1lb3G~rx8SuWYPOQ{`zKdWr{pPanZT`*n z`*COJ{P!1wJslRzYBnPCFA`b+5XL=x>f1Cmq|m%*MTz7e#{R1?F)^5PrEdOyzV5CF*R>IKA&i4pJ}yN@VOTFEJlO`3B!Z+aapHMp4nPBlacWwUJ5w?L)2> zw|b+O(rx&TsO@Jhu(mWIz^pEvv3P$6SDO2L!yuDFpc!pF05lD#Mnx8CaTM;&U3FXB zmKeMnOtW0QJU#VK^ZQKv0{oyb#?a{F{Xk1u;UYVnUy)L{twwZ;D3fR9cZu>TcVs=y zuH!;W2*gCe9w}}(?RB_=2wnd;t3FfLe~a>E?f4Ag2Ub3BEXzwAV(p*aACIIoXx0LT zP&yQ6C}9!sVHgSL=-k}P=j_m0Q+hv%s5%*phI-z9_Tp~7hVPY}X4>iE*~Cu~ zV#v}wA9ewgl<_~AOtU-9`M&eK8&AM7Y!Z)0rM|-pON$%ORenR 
zK)+30T>>g-yH3w`yj3m5!}!^Oxpwz3I}|0Zb%H<5T+~{ENtnBoSXz}yv`$3cL|vzA!)l+;VZi3F z0|X12s4sak=*LDP?K7a{iQ^hh&l$0NI>%poA2t{F4DM_9(n8W^kfCr%;}+n^2vAW* z!{lzaa3`lkw543sMMo$}vdk!9Rd2Du!%^lC=qe;07u$9j->=ujuR4!?>WSR*i8&j@V?2lRu+d5-mi6XbP7|ZI6avTORp06!~(dS(y zJS}K$qZGtrH^#$pvTy6cf_0P^iS3?#G!+1@Yh-#VYB1%^Q zho_Xsn#nbBJUxnlg?I^;htYNNX!j5=ptARIOr_1@L)T+Tf{}35VzwVuUd&hohX7w2 zAOGWCV}fath68PQK0@rj==?2m03n{57}WD{v8L)hhXBSJ%LVx0CP!tK;JwiAPF5$G z@Nt8K431xas*E=o>dI05%S(vl&7hP_5-n+?W(A(wdO@O_T)w9Q6|HYO!n_|0N56=C z_M?7fgX36$m66A1$7OahvnBoZn}^Qo7WDHH%IUdGJ_kH`U8&hyxF zG0%InkHfRdK5q1r+q3r?=%DbaP$o-n!2{w!j=1FC_FKhy+mVvL{uasueuQNDW2p^| z6nC{S8CvV9C1|nRYB;M%F6!vqNCxq}kvU{dx}kzojqKx>8;K9Q^M08gHqW_Af!=>@ zM_mQQ)?WaJ8v`ctK@$7;5ZZ3H1wL2pcDog|y_H&WuUk!^9m8K)oKLcZ-Uh@*-DFKi z6TNM;#wZcgA&e6zQ!QeOz@s3ukZTeDidb)gi8W5J-y+;*yGfjahU7iNha9>L-Ee!{ zT*msi?+H*l$SeIQP{`q$6T34!EB{ScoU{1Xb%2>`r2si|eQ29Qoc#=pvG6rR=ELOz zcxT@hUN58iqJ;Q;Z44y2v;Q7G;J5@ze>M5vd#AHEJhsVjRWj+}G?OA9oUB6ESSM3k zVeQsprNwtAD>!2gVv=#l;ib&nwYpMRS_)+zE9L**68Xb;yid1xQ+3_{&Qt(?>cx6( znZ>M=synL&(YP#_WZO#*Ha;c6_SOlPrK%pQw^swNv*Q=M+6O`c!fSpYSVOxw-c$;K z!11th1IOTf^iS8NSq-mwT9bT}z1xZ{3Z`<)9s%7GgRM!oda!WVbVb>ikt*L^=LGUQ2zE6Z+4r2 zK@1KsVm6$N8=lQ5-dxYM)jJs%*hG)*YYAwZy{#q%)f-TjY=79Oq@GQ1r|@)~$_Xkt ze6>P1Hq37tD{=&vtmv|gOiHxIWq=EKZV5AeF_RK+zB`l@{FY`(4;y${nQ#Uwjq7*c zG~9J4ct87a<~qN0o~b0p;o<;z)qP+xpc?msRNKs`<8KL~iYiJ(4^q*`5CP@JTnxZK zArPZ~eZv(4>=^$VSsn6Hp#Yub-*(U8o!sS@w_Y)(xIY`>MRNX~i~A*#86l@Nh6?0| zWKr0dNGg+*iaAPeBv__EEzEWNpeDXDg#MA`q!<$%z}U*QBB&B|LLbV zqTjAnwDuxt!Qr-#KZ&xl9tNr45L|Tq25+jCq}7}!+c!!k4ivnM(RR4h1LAsKim zU`sr41;)#|mpbo#7opqTuV#1}{#Tje1%LXqU+$)RSrj_%2Y0(;!yu&xwStfjpcYPq ziA!%ZSQ{5zTA0%-OVnggO_cSpR`V&%S(|nm%oKE=+NiRZ>g_M&lE>CK^w~fJl)ar( zOG^|Bpq|fU^zM7W$1sG!W67xFV0|M%DTNM$Y2T?*TjVw|a87vYMz5Topl&s3s%SPp z-dXEGYILgor1X>D`x#IF8%x8Bech5Iy%3p3ZS4%C|qPH{dZ-oK&Pj5%5bcTvh2!V42S5Dzj@Z z;fevf+=1rXrIf@nZzO?nGzBE&66Dv!Hu`ia<}{Zn6G4h;u?!6V@l;v=y3q9gVKt?> z%j4e)2BI!uxQ|=TYWoA>U1PsN@r#l82r!j4G#=w$;!oGK419tK{fe_8t6`nG$9Cg3w7LyPvfnJBe3 
zxKg7GK+YA>yTGy{4QqjLVBA=O>8K6!Dwt#`tdmP#7$C zu|MtW)1(BJ?oFTX5^)MX+8!_vn;ye}#kY9wOg>wk5J|(WgH4Vid8wA5HRMm6YP_?+5#o&lc&wFt3NlAz3jSEocdOvwY(tUWJ4q=2%+@>oZq5;4I zqw07e+t$D5o1OvR^|H1Wl%W(cp%UHWM|iToLArRN%`Mez{;fZ_vC54-X817eS&!b= zWh9ojJ=NZ_vK|=z%NpL$F!M!`k3bEgSoPeGQKlGm34(UT@Fr`3kfjD^(sU2@&=kkahFT)Ogv{P*KI)3%8dH>+@{xpRqZ*ERaIrx=c_@NUpeT| z4VXKZG(BPa{+Jy-(sPlm@_-@(`Gu_&8IJ386@M#h=v8Vg}*`GDm6Uz2G~W!x+_P?h&(NCb(<>!7}HABdpK zRMBL1=!ffdItiod-ai*N^AjDQMWg1T`Ua4QIGy73nOqR(Dc@%MLGPF8x-s>4|3-H| zOH}z>zwD^C=Y`g1>qG){Yq%4v0u+=$yXZE$GYLFt0)k*dVhV&EgeZD9-59@JOIe2D zkXjU&Dl^T2IAPDA{z!N+*QS?|mwYLv8YwHUmMwerMf5i%w&oH;inrY8XsQ#P^4y2jKsZjBxT`-lVyN~O5 zB!{kRxoKQr$4RV;!*of%y2U728BAiy45o52WF?)L6#1x68rV4jCZbOnZ8>hqp;YRl*(T0|2F73ZZ`Ow7aW-p z4tE)pe(e61l5Zo(UK;tOs=`O1-4+p zEY+zgb40i5MJh;n-FlyVLQh9ivzvj&SOXUz8Uxva0Hu+ejc#_$nxSXX8Sc^kk+iLm z(+kY{LhdMF+V|z04Akmz7=tilkWbXC-*q->l9C|=dQ{tojSfRBz#L&Du`8bwg1+Zy zGM$`iGH7@D=2AU$J*a57c}x+ZT0~P>fu5I5V`^&ca_U)RNM*X$8Th;q?VM(5LBxmq zt}RTI)KrW-jSh`MDI2~KiY}APqiE<6JW7ASjH^sadSbxz9e+b@m+QTC0k!i_;~a+7 zdt<(5OWDe`i?Z0@v|E!IFdP1h<|H{* zOk!0)*-TZHPB)LS@)u*}DFgXYzVb3DNb=A=PTIr3tzzhFnckPqk3bBCYhZQ(vXZuh zfjDR>Dbp_xL}956nnDj1U|4P&OUyU9aAqx-5wnWJJ?$y3(wgn}8U-7|#-x39!)?Y= z2<*&S9Zv+cU7!w{IZ&uXW(;L9@sB=3sa$t16xyFdDsm3o%g`@TL25$abqTUQ{%tBTOOTrS8@F0fb)M5{-clYr36d}OlsP~BmRJQOVpATQhAWC*LvXX}`7ybzW zu(ymmZh|5~5-TPrMI=S6x(AYwxT^Udqem_tQhVR)OzNtTEv3)Jbi!0o(LO~(m4CD` zrwR$KdUET62^+OEBMecCs_|_QAe=6LE+4!D$uO7})l^o%wULu$=W+BF`m*ov>Fs&b zWu}{HUA?6|CsspCX{x**QsANzL`!mS4kah)y$F7!8j?V`0B7I!ccVN?Uceh@3(ZUv zO1L0elf<1gskHA!1G*qcTaW?hYQ?v4+t?Ne;cAw6(LPMRBjY|jx5XO&(BxiByw40IwhJP>z3P z9)W-R5OHV+t%er!(hAXu6(K%(r(HqO#aD;!mZOi!$K!TAuyn~kqRPi-@IXJDE>8FJ zoXoD(-B0Xe+WJ?tw1PU#aToZGmVF1gvJ5U3{k z7|3(^&ct_ooe_c0CS}_f-=j9)3rF56oo+@$o6+Mg6`EF*u2t2nQoOq{w~Hpvu<`r~ z_uJAsGJF)N5hQw)=$-x!_k+~~oiEp8l!)qa^D$i9tlLO$?j|Iwk7qe)@L;5FzC{f6 zec`+o95U51CozXZcemIG6!x^+p2sRj0-pM@VpNyy{`i=@&F6!C1$%U3(vcUESYCMI z_XurKn%-_f`{_XspyxfWYUR*>8vew^go-&uq^)t366VfK`%a-x;KV93zX#E7jkh!e 
z!7}Q`ps>0WP3wNv{c+p6z2URNmxJ+|-xT2U--STQ+_z{qyaLPjKmzUc`*RV^>Se4#kT4`DVsuWg_s3;O&LD{=mw)(*GE%Hd+Pt@MqItSwn@xw=*1IA>py7WV zjimcJ6sx6a_2P4`yDK{lAD+ev-KhtKLb4=ec1O^PY5(+_!Uc(UzbPW7J-99(tgpT` z>5FEtq|nNc9SN;0mBmo?vQko2we=5DfMG+lrS*1qZd)1RuiEEz2dNVP-#h<3hzKW1 z$H_jJ@libOV|S3`;n$(GZ5pm7C{=_mHMLqsaS_t6Qm)Qcw6a2ta31r{4!Qr!tvCTU z&3UhQeEYG&t@Y!+tIc}*p!5i7`y_xh1=M?V=K45!>*@K8XhpBzlp8ew14dIs0=YC) zc=aqMibb025?A;u&qM?zVqL6Aa54$&Ko(sM$@exwcI4Ut z=FqT)6RUpA4+dx$#DEvJD6UOUuj&q`V2{2#;8(I#vK3^WVQIqZy&tF)m#Imt4vjOG zHZIYge8}{+7-a>&<594B$Won`wGz)2YOjLUaL}Z|>U~>?zu~*eBPTtZeZ7oCX3@ds z@*b)6?%laLtKB;G`-c~nuv2PdN7 zU$NnQ(~ddyY!6&jvHW`FzbBB7yryjW3hL+6mYj_}jz9an`spWQSYH>$!I={F%4v1esB~P9L1t&r?~2|KJxvh z-}6B-QTM;5igMMtlrfXjNlZZ1{ZmHoyOC1%z-geu_tDiq%$?+U>VkhMX-c)lsE8pO z@$)y{oY^r42RLD?wcTfw4i-P3bnNy61K1(H{zv0>@;d^`^!OWJHkkZf8ah_v=3rw? z((Mnovv?N{CZZdpd5(IJ?6o76t6VUbnL{r_`)9JT^jdBTJovd>joNIIaFGrcNI3QV z)1BykxG8Ha{e~+$|Z|C-C_cFUP zV&nM9Lcx)TwQ-;5@ZIs`RiBWgp%a>C#EOQ&-5JRuIX#Ub8n|PLjuv>>B?z6AH90Wl z!;=)(0;blmdD;ou@#m-VQq3udDN)gQv^CGubyfoUj|oyge`%KM(K3K`cwly=2j&rG z_EW?a;?-kOp`P1BU~_>c?vF0}^fH>-!U>1q5~M8UIYbg@XgmzQBPh75r(CB0w2PBU z8`p80VdSFTu)%P=-lpKbZPw|#m!Mek*;BRLCe+`uyJ()8POl=#^h zf|Qm;L%V4kxe7-3xUJ6|<`;^numr^Fy!2nRj>^Xw3L+l*_rZ$-KFgJYyx~8)v(Js^ z604bbhg%$>=K5zMj@}f^K2`ML^Qv&o3wsU=iS%1zVyZQsZcKOGTf|K(B~Sx3 zBM{C&3I++TZ?B6)Ebe0{$=Fk1OG69@>6=yd!|TE44K6e676tX*w|5r!+Ahu3n7__I zHL=a+X0UtoFJGSTuXI_f`UJfL{Zzxgel+7vgxNp|mDN1|J8u1VVHg%|Gsh14fw3qn%g{Hma ztrj_YD>at3B#O)(Jc=NDy73b1mi&w(cAg-(-uQRQQ>-xwL$wO~#>M zX}{91bTC_;Zp2(*% z4{RS!5r0L^RE+pZ2_S9~4F;p!?l8?G^gRpE>wd0mPa-??K0C->+QG)6f4zU~@!jS! 
zc(0uk7Zw#IB?a+~024y0p^9-``nzG}$PAfxZU0nP|BZRg8s)y&K0IM{w>BUsmnZ|u zCHU^&&WO`y5d_;)bP6}8jJYJHZbs5^-V-(3UI4s6fOU2*PUkg&MPS4`APN|=H<5&i zl}G&k1^f(V&g0d}Ur34RBWIgnSColJvvy%KYm%&*M?hfK1lM;VUICAGG9j17#DoCH zg%@of$$>;BFFiVWXw;iB;KBpP1a#f7S@gQx5Cn8M0dF@SmMKDMUw$A922!UJ8|9UC zf)QtMjBaRk9@jN)G5e0uZ7r8oc`&o3y=8_xu!AMq!IxS5bz!q33aT3zW-NPd9?S>< zvuKf?m_*{@D2Sx`HZLS>Ty_d`KDeXcIXc530n}85_3FN-E#&{Es_;B-$u->w(DUqb z$+`Ns`YI`khKXO+qHGO?i?m=&y~ zOCCGf2N9SBUNC#GHY4=VNo(tQ;PC5CIm+BU^W z>e9up)Ry2n;KRWuu+ccVy0>rXCUmhy7_qw0BBFv*hC$euG@rNeqA?dXmcUG-Q{m#g zs4tZG+wny%cC(6bi6k;!DQ7XXEA3Z#^R`65a#3pfSeLF0EDRiAE-v*vHcv)GjlyNu zki;=B&!yBj>#jZXJvOFfGe^JFrr=OAJPT}rtgN48t?P=|c2Vfz2(|$+ncsZv*i{W05`MAYPJ3## z(rR5DaUJJTiuSZ8ZzW*l`fwKR`-d4a3Xc_RZRXf7(jR~&(?mjWfV^o-uSWv6ozv}0 zt)KJdT;f!jE7NhLA`}>PJ_G^W5cb60|P$cmoc2mw6&MW`4HB+!^b9&6gndOj3v~)dEI?7R6%b;Kv((Yn^*DU3n!Oh>i zGHL5ODv3sROP=I!U~eKGv+1_vAIWQu6>6lmORrRg!AY1s9(A202xY|BAf|M^So7g{7vQ3CnLX(L==!ZzWKw#Zew)nv*S=hq~G!v zra}WPsH6p1S*mHO9G&;3PR0St6uLWnMSrt!tk-**Za{7MlafRZjg&RMI#Yg84?t{0 z+D$#^44<8#9A2grKCApz$|=0A@azmSuZY(ju)Y*Q>N+mX7M?kD`KxDN%Oj?5>I0AWBaH2b06Gb8RiX$% ze5Z3~2|42!0|S{M*^8)O^gp6)UOt3pm~+4TeAbVha!9S!E{nYa8&pP|yBefZZcN|a zfDYsYGR4(hNi4Mzdt&R3X|wmJm3(>S2XR(@?AC&>wjw$(U({~;EBzbo4~7KNP9A~$ zak9IhWaWKPa}EJ*h#mXNP&s11$h_ zH2n10Q$edymD~UzMmYDAJG?yf_&*Jeqp!EqP}}tiFlqi>Pt}XqEG44403LA%MYD}B z_c~J!?MkXLKHT{gnM$4*LS13hBgd;x(ruGh8nv%g3pmr{nrLn7l+(W-SO|_f0A9b# zcMvE!RkjeQ&UdX6SyN>~mwi7|B8u<+9hNZ;WY9c1CtfWbre4ZouqU{QOH9#Bx=`X) z4hwjU1?G#!6McqhIC*}XfNs%T2bNiKfM5))K5ip5W#Tc4yg?BJ^j6KxL`%{1-_#Ug z?-9z=n4MqR?+%&#fi4PrJJvoUa-=_GVFc38l~oVZeQ! 
z3|`E@I%l_g1F%|>v~R98@eH4&myYT5%+>N4-*biI*i#EEHoMBy&@5| z^?ms#{4c3*1!f8Xsk=k<{D0k$|MaCrRQ7y0M5;MQ4%TT4C4GT2tB$$Ruu%;A`1Ya!tY-R3>GJzrod4Dlm zv6^Q6{(HLmN@_klH}P77`ZNl5uI(k0=*DIw)^Ohz6?LZV<><4vaalMa{`M(qnc@se zN;U;>AZBY>*?jhq@ zE6X%zNjH&~XHw(1M|Hmx5bPE?J#YzRgjRhEE9e(YlJy?BQSA(NWpy=D?RojR z4XX(#G$&U>+^e=bXzQMw0GKv0*Htxo)Y@N08q7P{iVD(JHD#aOUNnQ@wlwCbXfa?Itb`gTTz3zV+e#1p^LFfgH3}Y z&%=axTu}}}BWN-ee@|+ULS5mN=qT&B?0MaCqhtCodYj4jTM!Kxov;A6@*V@4xddEJ zDX`w02jsz;r4*Mp3cVj+-k?MT>p+oOQm)M0AKu()Dq@}EmS5_-?A`KRFE0MeK9(jy zlj=j#vyh6;xyuh6VG~VN1o!7H2}DZp;;A+%=T_R7v#VIbWM?eC}IIVZ-( z%7tooFL%J?na!6bj3|w|L?`$KWe`@5?x*m9 z8aLV5Z@Z5)yt|866{sSy(P7Fk;1%+sdjyP%(B1Ew-Um$6T*jO9L|_pv@*-H?sYFhF zH7`Yald!26-g5jKb4v8iEBFmqLvjiBBuD9AG}8c7g`t6$ms?`7B5s=|2E+-*PA-dw$Ip??lwmYV-4arm(r$=l=rn9*0=ngwru*jWS753T3-!J z5E{y;>!4KgVV=7jgG5SElZn1ypwUi8kGW9Cnsw8WJI?XUU*HhF7^8XaatA50K6dC{ z@uV{rg-lS)2wEa9FjPeire}ntp$nq9&YT-ZNd@Gcbm&>*R*u3iJ=e2y>7`f~OXsZW zr0d?Q4K(8U#LEzE3u-1x-@a725L>{S`1J1TW!vx8T=zuvd{M@oTdPn+M+UgZ%XA3A zD?wT!^sTJ&PLYxVn~z8BcZ}_>r+xii?u~AMKW!&rVnDJ7BOzn_KYUH=p1(6gi1~K0 zV3cJEL4F%bgUfo`;(z+pzj}F_5nSo$`Y+VC0Y#(F^$wumfDTP0-OP-2Fa7r(FhxFd zGWv}pu|sFB1i-I;!rRB?gP4XdU^@C+gs%1=Nu({tgzd&IrsGEP_Yv4UgGn^D{9j zB_rZJAjc_~FN5VQ36U<0!+h$d)^l$d0{X}(5kV~;j?-09woX2e`|_@mym})HkscCm zRI>dYOHp`*^D)PYK;g$}Q>%)3S%ytzR&cfIRR28_ms%+NJ-wV>{TmC2jk^qAX4uf{ zQR_VuekY#bUe~mOEHgJ2N2xZVpB%BzD0N!HnZVf+`K+E0nGV1PI|470L`?{>YTvkZ znfk(0-4)?FKCKfB9I1ylVo34YVP$Hcul)y)VN9@;g9Sc%eB;Az$QCUJXy6t>db@+5g*-I8{_$>vFNp#-$*PkerrEA5LlPqs#9gMX=IJ?s zqN<+HN}q)h&N}~xoy>+b+-d$#e>wT&y0lI*Hb;ayPF!+zMtrBO8%vN%693lu($nbK zJUleiQYcLEl(N<4c1^9l-Yf|!Ix%Aib6-3}sLt0Exf*-j_d5d!i{43WtV1$%nTqCA zTXRj51*g!Mmx;63nRd=F7Dj!k#b3K5!VhPC)kXMrg$F2dHb_#nguBA^NGP;9IUtw; zI3z7~j0Yw!+fLTLZxD^k`k!Ze_B|lJV-lR5lvHWV3Za_AbpZ@bH2r=6dtN-tyYs`k ztjeM*08myXw54WEFYbfvtQ&Ah_sb;9QJPsig^7O12cdk zD%^JClq0rj**Bqc+3{A9Kka`tlDYL+ft865onZx?BD{n#*6h9l9;$Aq>+bVm6M{Rc zB^<8=+zokn_SW&Mat>~gjd3ny-TFush!COmz$W^mX7{h| z>G>I6#7Mi;jJ}aFn@=4-p&BG4SrPbYHtcdoJfHof)Y`v$fiGZ!W&@`Yoga5Dz-+P- 
zjC2%kFmYky52rBTH{dWFBajkGSyyjw-a+A+rH*_pXPFxQc_MDM_oz$oouqp zOv%1w8)2v5`rKQn+-QkF4XC2rbOI-6TW(>gU#)l=LV)!{G7GA_jTqeuv0yN{Q7~L^ z;Hw*7UA;P%x7$t5SwP#rL!_?H+0SIo?i<^|v|aw~H*u%!&&Oh5e~(Osc_0$@Kxnc4 zvbU#oC=^B0stKuL@aBjdS>6TDJICK7fq;)?d zZltS_r%Q{~f}wT^Qo)fFqMK8L_S)&yz(@mA^i@2_I=E$IAlBns@ieYN^TmGdQU>4y-SsQYLcN?MN516B-z?J{ z!}rlKBwyxlTphU}4;tVkhE};JrubXD<9D+F*zL~{4p%x4lT4{!koY7iFMa>^0EO=v zb}Y`gQaUz-HH`q3+$~x-7B#18!#1ZKAUzJGJe>qxwL? zFCdcv$eXJxCYZ0jt(KQ$AJ_r`cy{)VR_>6m$fD}kvEMBoC^D*T7PMaA&L z(-TyXRvEIlShs&=JEs(m=|Kg+^sh&IpYLKy`~#y5rmBl*wiKR7(2RoRYR7|g4$`Ml zY@Z=`a;z-c+^I_5+z4yjz_OipG4c7ZAM@+Sci=k(1=0S&jt%w4jun0O791#a!6c|i z;kgIO*5-7iSbi{0Pe~7zgyso89ozT+x+}jLc=&Qkq>%-+tZOcm?gC1RCP}1D8}~)Q z;EjT;u|LBu%N6a0YmnYi-oB!PYB*Re+~_KY6hcXG5pru|Nj6MJN2s>50q3#p1AI7} zL_)OzY(N`Dpk93elAxOSFn1MKACz^+56bt$w-G6?%(=m!rjwmfD+-FvN#V2+b@bV? zXt6H-WVZH!rtwiXZ~z2$Z5~eD-a0-g8J#o2B`S;FnM8ZIflabM>?OrQ&7{XQx7H#> zE`Ra_E^9cz!R6&p)I9PJ2}kY(A?aeWm78fo9kNrqOW;JgK|5b9=bP>3dds3ArvDF% z;NvCJ$qtG*1^l(G^8Ml@-S_7;W*9DP!x)3A#z-$;wC+8MHDLmmd|f^IT-6!#hwwgM#ku(BdY6}c`}~}iLib?{4!76Npfw7Q$8v(r7T7S=k5HO35H_Q2_i^^3^r;h(a3|3o&mutQgMixn?5NYIuT@JoAIC}Ysea;O|j6)55rp^4i^qPLo321 z#hPmy!S5D~ao;>oHG?P36;71;#IcB+;En(3nn@X1Me&B*pBPVwkWMexQ9EWVv;BzndHTiG0OS%ncvzpjI6Dd>w(s`q-IM zB;{N89m;bQujE+}E#u#mly+qiP5{ka1F4gs-Sfu#zzCO07oN*ZMoE$g zOE^y*iX4Rq5}0c~04=Nzz4<)25K9mP(dVY&lYraup*S~yAq|I7VaDmqI0#99tt>Wb zm)Nl?ToP(O#B{3*!u!7OjbSGYI$~E{GA2`Nu)Y64f?qHDsrgR4v-2k*T&@pAnQ$Ad zGt`>)LskOE?D_l#Q1E4Lr3jkI^F)>G4h~pEc-&B`;UG61n*8VFKgmU5o*Mi07uF>Y z0IW_0%A!C{QEBcaoU~6!6mo%Bk>$@H;z-H_KUrl&F61Tdf2s1U6|!+58RGHmK{Tsx zhiv+dPcb%(-oVCDDk>dCO*28hlLF*K6dN=E=FByG7-!NcJEt_Rh+}TK+m?a+*$9Eop<9hh#UK#&@0xJXZc1vf6ygGHQ zV{rF5>FYn6OB1KbhgT_wsiL->+2kAhwd4HIPIGU{Qne;unwzuGN2@UIZQ*_*irxPt zGpS)=IY1%@?FrZ-O=8ro?t_kk%P;-XwB<5V%DRVfNw&1()W7sDjIDB_BLRllR@mVz z^rqaz5Piy~)v(!OFm&;a`&AU0bV>zP2u7nkG6N}sHw8v8bf`F>VbG23dl?3YSqihK zPff9=6eUp+3@3m8EbXtDb-1a;^Cn^M8CFv}4eW-X!?BoUWqec(O1;d{jI<|p;+KZZ zHk~z$kD8Rm(ugv=ZGP}2I=3D?@3+3a)A-MVlNv=bC6a&b6OH=;M!`nQ6%M=FQayzr 
zBDC3gIt45LB}gIkk0Iw00wB)iupBBp{?6O^GV^$F<>$M2dd3)49LP4d z(`L4*2&stp!?-lPWYjymnYf^So$=HE=zDFz(RBmVxMI4XjF(hr-F{wm7L@JP2iryF zcC4KZBn3cC5m||t^$)2Nc4NXVJ={;!w(m*eBMw$$=+zERpwt73@D34h-XWknkFrF0 z24uC0Q6gSrj;23xDKwXRA!hUSV$BwAHZ8|GJhv|pt%w#65wx#1scK`%>h$6sOQq*a zlA)=8@g{yppS@^}RwiRwCEY8Bb^oEyIB6T9f7- zKGkDrX|;ASd%b-ris^O?(vhZePXvbp2k1q^aq9V?r4T%#i|)>0^l>-QIOHXHx*}8A z#b~bXWHrP3b@-25`F8=!I7e5JG;Qg-O)#?HQNxP@pWY+sRD^8OUuc5@ql#8KiI3af zgNPhpe{z|8Irte4oJ$Mrod9iQ201)H<t>~S`TWb@v2Ps>fxaalEKH&|)nDJ+tbtA9 zHz$b!orrLvmVtp5p5*&rwpgu%><;x5vamB9-h+7yg3Z2F{r+NsG-RmrzB^# z-oceu)(9ZTak^J8sfI}q>UuRQ+<}Nvj;QP?(Lln4%}xLI7V!`O-Xifi%(mX~(Cnuu^8`HS zlzN2Q7234FNkhT@CnaHdEW1?FNx*=WL)z)Zg_vfmqHt62w%!0NDX!u!_%c;DUQLHH zP;3oNM{)fQqOKw7MtE}aV+{-r*r8-4m49mp*@E41zx|>Rw zxnV4-r(t|Uyz!hmWEJ@?&1xe(Ws?|L=!NhPrntGE{Y$F%QP*4p;{b~9G240}De)Ab zO@kzXVF#vo@NZjpMXwxQB|lFPc_%H6ORiPhSz5u4b% zcI{mhLF`exc867pnkA}K>>7gse9)1jr;e)V23T3U{g?-+||)T<&tUTn>GP)iIY1= zk>U8o_*}Edj42{ktnORib)?ubq7qIUmA3JoJ46FiyI?MI{&Rd-`hA8^2_B}kZ#yLd zTe8^-SG)t%BzZfE-LAwafyt5PE3K8+RD`S!h8YMb-Z@>J96+ky*3B)-;`|NF{QT6t z!7E|uZk|2en73zs3!B2Vt&36@<*pLhY+Sfih46lKVIQ1EUb}*(t0|aNmZ0W}Uf{Eq zg5D59CDzyv#!F+*yg$93@Sq5rg<`cP*up|@B}XqrB+FIDbDJLZtYz##v?+dsviMa0 z4Qway=$Z8qCYL-&2eAMgNUBJpzXL26z@mnjgok}p*8C@TA$Q8`0GS_`z2+7 zo_Xoz2P3!M?`OE?M24mJM%VhUtf(LZcE9Zgg%_BUMEXQ(;Tt!(wIC9QZhB8Qo$kPC z#!U055QhqV_X3J&$g}5y`}u1 zX`&Uwm6O$2N=?NcrEyjDsD0aw zrK5~wwzhvj@s*QA8=-xUrj>5g->?)3?&ZSjezgb~>gOxXCH&0X#>*eGwJ`(L69g04 znI>3HPKeGyg?i(eHpfT96QC^J^wgplNi!q1DBX(nCl7%A*?1e{?UKws-gxr6!95M| z1O7bqadVWs_RU98#XDv3UlZj2ow-U$>r8!6ct&JK6`V9cstysC*yMnni2nP}Xkh5P zhlp@yfZ|ggJdS_jp=uH6$cKLCumS-TkXmf?XS(z~(g}`k);)5s+=MK@arWA|d&|T> zWrz1RU^t6ew7V%RriH*9(f_wU0sMtguGGPIXVWIYZfFjXM`DMg+(WzsaLb-&A(D=c zfvHRZciX+20@Rsg1|CrF7v(Ih+jr9h>(jFRI+tYPHk*Ij)zHnxcKv&~-a%{XN_;@! 
zYLl0X<=sX$`$^vRQ;{0foE-REg7qH?kL=*BT;sT=y-yLR4B-cvh}|j_)99HbKY%xG zI$E4EK8^gzu6QE@A0kaK$^ticEwSnBzH9b6gkKKme!B!$pir8<<5jq9EuIz50x-G> zB7mbitA%KK`#)r(7-0yG%u)Hy_nj<@yl0zDwyr;0&;G|3?ufkTqYk{lL-Kla_sfLz z<*44V18U+w5UfOwc##R7u0<#dT0PH<9vdE3tU|SY_q98<0+nFktV1U+AR6dfHmp=^ ziR#N{Pf)i_0jQ3xHLc$c>YBoQ|7V@&`c;?1mzz_9dqA>idHCVhm+$(sJJSw1t5Cr7 zZcV7b4amS&Y+YeWiRP%@qJ(i(o$xjj&tuOT`xra@=|Ra$nEg?H1gSk8HuPEueM`dL z@}?2gb!l}JiGTfLc%+vfU88mDCB$acIsHNH50+(AO+^I~>6_9_5p?jx;(Nf!+dBYy zGCPpR;|5VA-=bH^PAi8kIbcRCG3K5*y_gnDj=xf!pv!25W*yx`&KG3$1aXW?>e9k^ z_#Dzexj1`we@HJ=>~SktY4Zw;nJ2XfGYteN;Dh1VmeAc=@Rrjwl#HzC&bA8-h7j9u zaTW`pvM(L|ib_OLZTn-N zepF=p&8Yc1$2&V0azl`dr4zzDp*b*6Sa^j0?BdT{bsd6AVv_!Cmn=_VX_nYFhYK_5K(8t8 zZ>e1Oq8dx-EaQyRF1{BILXk`YLN7f?du zA3&&6MikLgNoH`2uib72B6Y?)7f$&lsu+58uQbW)>;Unz=ZR^pmNoW`NyH0$e2vptoMs>!~xNT^DOZhbU(N@X@5Rs$_u?$#gPQc^9m;hyi&RdtOn z({1o&D)j<1=rVO4dXbu#fTuIO?R}T&(Rw%ihC}So^||l=D<_{cdagsQH+ni^2BKCQ z)#Q;3+fG2Df+7LrV}9zSOq(Rnx8^3!Vq`9hhbNYd#dLyDzDseUFjs^AGY%c-R#5A^ zkeqmBB>U-KbzX*hlySzy>e}JU|VA+T|YM~^GRh5dAgz9fs%W{R@v3bBKjO(ybj`mih|Th z<|#beN}Lx4ZJGE7FXq1%PVnO<&-Q%_(v(#;BtvJ56fjbQ?NY*RzLm30%Xc{l)LTMDEZA~%uPFM`Sv6t{>sw3wPTuVT=OQ}0QSn7E%V%zh#8LOF z1({uikikWY_pV&0-=BUe#V_Q``g(8kT=8d2J-3N6R7l8hce8S|l8)6N!XdG|BE??Dk> znTtRCgv|_(Jj!GZv7aBfYT5~Sj>!+!bd}~OWa@9uo9yIo${%zkIfVU%_Ed^YjwKk8 ztgUJRJCU!&t+>gb8gg!cUiFrr{Mv1JZj!{J?G+)61l|Cl;Rd-Voez+N@4q~^@)|Ce zG@d={{Wm3Ku-CPr_9Ef2bIICFiGR?)64`%o1EdsQq4k)1y3Ph@G~_v^?N|dF6gIP0 ztYl$f|+Mn>g;72RJC(@^9cxMigs zyP1wBL2W~}mZKHU!zzKn@$6R3+3BywL+X9=#3vW?it5}LJ-HUQR{FJQdZV;bA(eIbT#U^4(03E>wcE>o*=RCC|!a zq!nGKFNbq7$odQhrN_Eo90=vl2x%>wh=~*vlV}Qe2DVoD^KPH>YuZDAO*hxK+Yy-N zs?$H~Cl!`*Tco!rn?jfxP`cZ2dv8~X|5S7gD^oIXHBWu)*at?HSd3qXtoW-Sj$3oP zoQnz{ez}2r&;dcM^YGwJY(d8{@CToZOoU{(wB^>Ep#9tpQc*dxEoox`9xMfH$sA)V z(&7I-FQ;hvZ!e7y1G{e;GqieoJJEktC6&WiLk|T-Wz~LcH%PmJ`yJbh`4R4(`9Q zqa3Ay8LPy3bTQ0N)*`N0zE__YYdpZ~KR2;qBJw_8gIn)D6ePRxh(330&TEKt$6T`7 zOGsKpd|lCae@2NVqb=+- zpA|H9876^#EVXo-9FRr<_kG&YfILPE3byQ`3hvye>mFAcs$wXWz^8I$PV|A8>w;Oh 
zXOVCl5^i1I(+!{rwRF6kXU~ZzVeV26ESqk9k9{|hS`45=m78Zo)>`Lrd8hl^iHEd& zo0oyFuT!Q1XC2BzXUO)yzZ;uH3oB-}i+oKgfw6_ZbtIQU8!nD62i+zgMvD?p1E)jI z+cA`Pm6X7z-L_vzUAfA<#24l6m`3vshS9XjH-I+6-lb&kK~SniuP)V@dkwU`{xNxCYwBG$!< zqqss3@{x8fWH}UAdVA_Q>$e!jXvT_*6+v7b)asJc9@a>Z;(L!3_NJjGl_A_cS>(id zcDqz*=klioZTO2_mgb!u-OljgI>VM5Tvnl&Ty4+0qsWdyBp~m#mWY+gm|f*!*x%93 zvp~6^t4mUO0_D(S&JIRV3BV0vn3n`qE+hEJaopflXhpeqYr3nxlhyq{$dOlR`Cm5Y zqpjW4OS`?s`Gv~M?!Pr`*qr@&xk-?yY7~aCIv0rpo&Y1C&=pu2j%VM;UUj6^{Tz~P zT};;zl)~Fc41VA(I~o%ghIUGcHN?rt>GE0d5JOy-p!DkyDN_u$ogayn0d&GbOle=* zXEcCY-=fS?M+DDeiST6S1^fbMDY$BX0@G&H@z4qri0gWt#wY`$5OX$ zinTMEp?^$GOn7&A{d(J@V$`c?>Qj(r{eTKagurX{GMIL$xo!)X&q54Ke#nL#A_=x3 zL2?_pH*SD%wl! zASDFEPKid~=%b9A++T4qQ*^V84FLm@=bDM8oBc1%1axYecKtYp{(dl zlKbTb1}9u7+Kj31vOe8@uP!Zp)_d_p1avdhP{rh)O)NWiZA!vd!KdZ=g!+gpyvlQI zh@VNB@Gxp{e61&_Ray#w)cZ+c}@lE^w-Gm@5`f4O&BU=$i* zs1VF|FbDv&70IY@J8Tc#TmD00Vh~9!*8Z8U1jVp&w)rkDHGrl_6!8DEaFf$lw_4pB VT;GRV{Jq{%Q%z5`O4%;@{{Upu>~a7A literal 0 HcmV?d00001 diff --git a/plugins/ngs-analysis/references/database-registry.json b/plugins/ngs-analysis/references/database-registry.json new file mode 100644 index 0000000..737dff7 --- /dev/null +++ b/plugins/ngs-analysis/references/database-registry.json @@ -0,0 +1,100 @@ +{ + "schema_version": "0.1.0", + "databases": { + "silva_138_amplicon": { + "display_name": "SILVA 138 marker-gene taxonomy database", + "kind": "taxonomy_database", + "database_family": "SILVA", + "version": "138", + "root_env": "NGS_DB_SILVA_138_ROOT", + "source": "SILVA release files converted for the selected amplicon backend.", + "license_note": "Respect SILVA distribution and citation requirements.", + "estimated_size": "medium; depends on release and classifier representation", + "suggested_setup": [ + "mkdir -p \"$NGS_DB_SILVA_138_ROOT\"", + "Place release-matched taxonomy.tsv and sequences.fasta under \"$NGS_DB_SILVA_138_ROOT\".", + "For QIIME2 classify-sklearn, also track the trained .qza classifier path used for the run." 
+ ], + "required_files": [ + "taxonomy.tsv", + "sequences.fasta" + ] + }, + "gtdb_release": { + "display_name": "GTDB taxonomy database bundle", + "kind": "taxonomy_database", + "database_family": "GTDB", + "version": "user_selected", + "root_env": "NGS_DB_GTDB_ROOT", + "source": "GTDB release bundle prepared for the selected classifier.", + "license_note": "Track GTDB release and classifier-specific conversion details.", + "estimated_size": "medium to large; depends on GTDB release and classifier representation", + "suggested_setup": [ + "mkdir -p \"$NGS_DB_GTDB_ROOT\"", + "Place release-matched taxonomy.tsv and sequences.fasta under \"$NGS_DB_GTDB_ROOT\".", + "Record the GTDB release, classifier backend, and conversion command in the run resource manifest." + ], + "required_files": [ + "taxonomy.tsv", + "sequences.fasta" + ] + }, + "kraken2_standard": { + "display_name": "Kraken2 standard database", + "kind": "metagenomics_database", + "database_family": "Kraken2", + "version": "user_selected", + "root_env": "NGS_DB_KRAKEN2_ROOT", + "source": "Kraken2 standard database or lab-curated equivalent.", + "license_note": "Large database downloads should be deliberate and checksum-tracked.", + "estimated_size": "large; Kraken2 standard databases can require substantial disk space", + "suggested_setup": [ + "mkdir -p \"$NGS_DB_KRAKEN2_ROOT\"", + "kraken2-build --standard --db \"$NGS_DB_KRAKEN2_ROOT\" --threads <threads>", + "kraken2-inspect --db \"$NGS_DB_KRAKEN2_ROOT\" > \"$NGS_DB_KRAKEN2_ROOT\"/inspect.txt" + ], + "required_files": [ + "hash.k2d", + "opts.k2d", + "taxo.k2d" + ] + }, + "bracken_standard": { + "display_name": "Bracken database paired to Kraken2", + "kind": "metagenomics_database", + "database_family": "Bracken", + "version": "user_selected", + "root_env": "NGS_DB_BRACKEN_ROOT", + "source": "Bracken files generated from the exact Kraken2 database used for classification.", + "license_note": "Bracken abundance estimates are only valid when read length and Kraken 
database match the generated kmer distribution.", + "estimated_size": "small to medium relative to the paired Kraken2 database", + "suggested_setup": [ + "Use the exact Kraken2 database root used for classification.", + "bracken-build -d \"$NGS_DB_BRACKEN_ROOT\" -t <threads> -k 35 -l <read_length>", + "Keep one kmer distribution per read length when workflows mix read lengths." + ], + "required_files": [ + "database100mers.kmer_distrib" + ] + }, + "humann_uniref90": { + "display_name": "HUMAnN UniRef90 and ChocoPhlAn database bundle", + "kind": "functional_profile_database", + "database_family": "HUMAnN", + "version": "user_selected", + "root_env": "NGS_DB_HUMANN_ROOT", + "source": "HUMAnN utility downloads or local mirrored database bundle.", + "license_note": "HUMAnN databases are large and should be versioned with the HUMAnN software release.", + "estimated_size": "large; ChocoPhlAn plus UniRef databases can require substantial disk space", + "suggested_setup": [ + "mkdir -p \"$NGS_DB_HUMANN_ROOT\"", + "humann_databases --download chocophlan full \"$NGS_DB_HUMANN_ROOT\"", + "humann_databases --download uniref uniref90_diamond \"$NGS_DB_HUMANN_ROOT\"" + ], + "required_files": [ + "chocophlan", + "uniref" + ] + } + } +} diff --git a/plugins/ngs-analysis/references/intake-schema.json b/plugins/ngs-analysis/references/intake-schema.json new file mode 100644 index 0000000..5371db6 --- /dev/null +++ b/plugins/ngs-analysis/references/intake-schema.json @@ -0,0 +1,135 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "NGS analysis intake", + "type": "object", + "additionalProperties": false, + "required": ["input_type", "assay", "objective"], + "properties": { + "input_type": { + "type": "string", + "enum": ["bcl_run_folder", "fastq", "bam_or_cram", "count_matrix", "vcf", "unknown"] + }, + "assay": { + "type": "string", + "enum": [ + "wgs", + "wes", + "targeted_panel", + "bulk_rnaseq", + "scrnaseq", + "snrnaseq", + "atacseq", + "chipseq", + "cutandrun", + 
"cutandtag", + "amplicon_microbiome", + "shotgun_metagenomics", + "fastq_qc_only", + "unknown" + ] + }, + "objective": { + "type": "array", + "items": { + "type": "string", + "enum": [ + "demultiplex", + "qc", + "trim", + "align", + "variant_calling", + "germline_variant_calling", + "somatic_variant_calling", + "umi_panel_variant_calling", + "variant_annotation", + "expression_counts", + "differential_expression", + "cell_matrix", + "cell_qc", + "cell_annotation", + "post_count_qc", + "peak_calling", + "differential_accessibility", + "differential_binding", + "taxonomic_profile", + "functional_profile", + "report_only" + ] + }, + "minItems": 1, + "uniqueItems": true + }, + "organism": {"type": "string"}, + "reference": { + "type": "object", + "additionalProperties": true, + "properties": { + "genome": {"type": "string"}, + "fasta": {"type": "string"}, + "gtf": {"type": "string"}, + "bed": {"type": "string"}, + "known_sites": {"type": "array", "items": {"type": "string"}} + } + }, + "library": { + "type": "object", + "additionalProperties": true, + "properties": { + "paired_end": {"type": "boolean"}, + "strandedness": { + "type": "string", + "enum": ["forward", "reverse", "unstranded", "unknown"] + }, + "umi": {"type": "boolean"}, + "chemistry": {"type": "string"}, + "read_length": {"type": "string"}, + "adapters_or_primers": {"type": "array", "items": {"type": "string"}} + } + }, + "study_design": { + "type": "object", + "additionalProperties": true, + "properties": { + "samplesheet": {"type": "string"}, + "contrasts": {"type": "array", "items": {"type": "string"}}, + "tumor_normal": {"type": "boolean"}, + "tumor_only": {"type": "boolean"}, + "trio": {"type": "boolean"}, + "sample_model": { + "type": "string", + "enum": ["singleton", "cohort", "duo", "trio", "family", "tumor_only", "tumor_normal", "umi_panel", "unknown"] + }, + "replicates": {"type": "boolean"}, + "batches": {"type": "array", "items": {"type": "string"}} + } + }, + "runtime": { + "type": "object", 
+ "additionalProperties": true, + "properties": { + "executor": { + "type": "string", + "enum": ["local", "slurm", "sge", "aws", "gcp", "azure", "unknown"] + }, + "container_engine": { + "type": "string", + "enum": ["docker", "singularity", "apptainer", "none", "unknown"] + }, + "can_install": {"type": "boolean"}, + "allow_network": {"type": "boolean"}, + "allow_cloud_upload": {"type": "boolean"}, + "gpu_available": {"type": "boolean"} + } + }, + "routing_decision": { + "type": "object", + "additionalProperties": true, + "properties": { + "pipeline_key": {"type": "string"}, + "pipeline_name": {"type": "string"}, + "confidence": {"type": "string", "enum": ["high", "medium", "low"]}, + "missing_essentials": {"type": "array", "items": {"type": "string"}} + } + } + } +} diff --git a/plugins/ngs-analysis/references/pipeline-registry.json b/plugins/ngs-analysis/references/pipeline-registry.json new file mode 100644 index 0000000..a05d40a --- /dev/null +++ b/plugins/ngs-analysis/references/pipeline-registry.json @@ -0,0 +1,762 @@ +{ + "schema_version": "0.1.0", + "tools": { + "nextflow": { + "executables": ["nextflow"], + "kind": "workflow_runner", + "install": {"conda": "bioconda::nextflow"}, + "notes": "Required for nf-core workflows." + }, + "snakemake": { + "executables": ["snakemake"], + "python_modules": ["snakemake"], + "kind": "workflow_runner", + "install": {"conda": "bioconda::snakemake", "pip": "snakemake"}, + "notes": "Preferred local workflow runner for file-based local or devbox execution without Docker." + }, + "mamba": { + "executables": ["mamba"], + "kind": "environment_manager", + "notes": "Preferred environment manager for local conda environments when available." + }, + "micromamba": { + "executables": ["micromamba"], + "kind": "environment_manager", + "notes": "Conda-compatible environment manager; useful on devboxes where system conda is absent." 
+ }, + "fastqc": { + "executables": ["fastqc"], + "kind": "qc", + "install": {"conda": "bioconda::fastqc"} + }, + "multiqc": { + "executables": ["multiqc"], + "python_modules": ["multiqc"], + "kind": "reporting", + "install": {"conda": "bioconda::multiqc", "pip": "multiqc"} + }, + "fastp": { + "executables": ["fastp"], + "kind": "fastq_preprocessor", + "install": {"conda": "bioconda::fastp"} + }, + "cutadapt": { + "executables": ["cutadapt"], + "python_modules": ["cutadapt"], + "kind": "adapter_trimming", + "install": {"conda": "bioconda::cutadapt", "pip": "cutadapt"} + }, + "seqkit": { + "executables": ["seqkit"], + "kind": "fastq_util", + "install": {"conda": "bioconda::seqkit"} + }, + "samtools": { + "executables": ["samtools"], + "kind": "hts_util", + "install": {"conda": "bioconda::samtools"} + }, + "bcftools": { + "executables": ["bcftools"], + "kind": "variant_util", + "install": {"conda": "bioconda::bcftools"} + }, + "fgbio": { + "executables": ["fgbio"], + "kind": "umi_consensus", + "install": {"conda": "bioconda::fgbio"}, + "notes": "Useful for UMI-aware targeted sequencing workflows when a lab protocol does not provide its own consensus step." + }, + "bwa-mem2": { + "executables": ["bwa-mem2"], + "kind": "aligner", + "install": {"conda": "bioconda::bwa-mem2"} + }, + "bowtie2": { + "executables": ["bowtie2"], + "kind": "aligner", + "install": {"conda": "bioconda::bowtie2"} + }, + "bedtools": { + "executables": ["bedtools"], + "kind": "interval_util", + "install": {"conda": "bioconda::bedtools"} + }, + "gatk": { + "executables": ["gatk"], + "kind": "variant_calling", + "install": {"conda": "bioconda::gatk4"}, + "notes": "GATK4 is open source; best-practice resource bundles are large and should be downloaded deliberately." + }, + "deepvariant": { + "executables": ["run_deepvariant"], + "kind": "variant_calling", + "container_images": ["google/deepvariant:latest"], + "notes": "Commonly run by Docker or Singularity rather than a local executable." 
+ }, + "star": { + "executables": ["STAR"], + "kind": "rna_aligner", + "install": {"conda": "bioconda::star"} + }, + "salmon": { + "executables": ["salmon"], + "kind": "rna_quantification", + "install": {"conda": "bioconda::salmon"} + }, + "subread": { + "executables": ["featureCounts"], + "kind": "rna_counting", + "install": {"conda": "bioconda::subread"} + }, + "rscript": { + "executables": ["Rscript"], + "kind": "statistical_runtime", + "install": {"conda": "conda-forge::r-base"}, + "notes": "R/Bioconductor package checks are workflow-specific; this preflight only verifies that an R runtime exists." + }, + "scanpy": { + "python_modules": ["scanpy"], + "kind": "single_cell_analysis", + "install": {"conda": "conda-forge::scanpy", "pip": "scanpy"} + }, + "kb-python": { + "executables": ["kb"], + "python_modules": ["kb_python"], + "kind": "single_cell_counting", + "install": {"pip": "kb-python"} + }, + "macs2": { + "executables": ["macs2"], + "kind": "peak_calling", + "install": {"conda": "bioconda::macs2"} + }, + "deeptools": { + "executables": ["bamCoverage", "computeMatrix", "plotProfile", "plotHeatmap"], + "kind": "signal_qc", + "install": {"conda": "bioconda::deeptools"} + }, + "homer": { + "executables": ["findMotifsGenome.pl"], + "kind": "motif_enrichment", + "install": {"conda": "bioconda::homer"}, + "notes": "Optional motif enrichment backend for ATAC, ChIP-seq, CUT&RUN, and CUT&Tag peak sets." + }, + "qiime2": { + "executables": ["qiime"], + "kind": "amplicon_microbiome", + "notes": "QIIME2 installation is best done with its published environment file or container for the target release." + }, + "dada2": { + "kind": "amplicon_denoising", + "install": {"conda": "bioconda::bioconductor-dada2"}, + "notes": "R/Bioconductor DADA2 backend for ASV inference. The plugin runner checks this as an R package at execution time." 
+ }, + "kraken2": { + "executables": ["kraken2"], + "kind": "taxonomic_classification", + "install": {"conda": "bioconda::kraken2"}, + "notes": "Databases are large and should be selected before download." + }, + "bracken": { + "executables": ["bracken"], + "kind": "taxonomic_abundance", + "install": {"conda": "bioconda::bracken"} + }, + "metaphlan": { + "executables": ["metaphlan"], + "kind": "taxonomic_profile", + "install": {"conda": "bioconda::metaphlan"} + }, + "kneaddata": { + "executables": ["kneaddata"], + "kind": "host_depletion", + "install": {"conda": "bioconda::kneaddata"}, + "notes": "Optional shotgun metagenomics host-depletion backend. Requires a prepared host reference database and should be treated as required when --host-reference is supplied." + }, + "humann": { + "executables": ["humann"], + "kind": "functional_profile", + "install": {"conda": "bioconda::humann"} + }, + "bcl-convert": { + "executables": ["bcl-convert"], + "kind": "bcl_conversion", + "license": "free_proprietary", + "notes": "Illumina BCL Convert is free for local use but proprietary and distributed as Illumina RPM installers. Do not auto-download without explicit user approval." + }, + "bcl2fastq": { + "executables": ["bcl2fastq"], + "kind": "bcl_conversion", + "license": "legacy_proprietary", + "notes": "Legacy Illumina converter. Use only when BCL Convert is unavailable or the run requires legacy compatibility." + }, + "cellranger": { + "executables": ["cellranger"], + "kind": "single_cell_vendor_pipeline", + "license": "eula", + "notes": "10x Cell Ranger requires EULA acceptance. Prefer public alternatives unless the user explicitly wants vendor-standard output and has accepted the license." 
+ } + }, + "profiles": { + "local_light": { + "display_name": "Local execution profile", + "runner": "snakemake_or_direct_shell", + "environment": "mamba_or_micromamba_conda_envs", + "containers": "disabled_by_default", + "required_tools": ["snakemake"], + "preferred_tools": ["fastqc", "multiqc", "fastp", "seqkit", "salmon", "samtools", "bcftools"], + "optional_tools": ["cutadapt", "bwa-mem2", "bowtie2", "bedtools", "deeptools", "subread", "scanpy", "star", "macs2", "kraken2", "bracken", "kneaddata", "humann", "bcl-convert", "bcl2fastq"], + "first_lanes": ["fastq_qc", "bulk_rnaseq_counts_qc", "bulk_rnaseq_differential_expression", "dna_variant_calling", "scrnaseq_post_count_qc", "epigenomics_peaks", "amplicon_microbiome", "shotgun_metagenomics", "bcl_to_fastq"], + "notes": "Use when Docker, Nextflow, or container registry access is unavailable or unstable. This profile runs local workflows over staged or user-provided data." + }, + "production_nfcore": { + "display_name": "nf-core execution profile", + "runner": "nextflow", + "environment": "docker_singularity_conda_or_site_profile", + "required_tools": ["nextflow"], + "preferred_tools": ["multiqc"], + "first_lanes": ["bulk_rnaseq", "scrnaseq", "dna_variant_calling", "dna_germline_variants", "dna_somatic_variants", "atacseq_peaks_qc", "chip_cutrun_peaks_qc", "amplicon_microbiome", "shotgun_metagenomics"], + "adapter": "plugins/ngs-analysis/scripts/run_nfcore_pipeline.py", + "notes": "Use when the user wants pinned nf-core execution with Nextflow reports, trace, timeline, DAG, and published results captured in a standard run envelope." 
+ } + }, + "pipelines": { + "bcl_to_fastq": { + "display_name": "BCL to FASTQ conversion", + "route_when": ["bcl_run_folder", "demultiplex"], + "local_executor": "plugins/ngs-analysis/scripts/run_bcl_to_fastq.py", + "preferred_tools": ["bcl-convert"], + "optional_tools": ["bcl2fastq"], + "local_light_tools": ["bcl-convert", "bcl2fastq"], + "run_envelope_schema": "plugins/ngs-analysis/references/run-envelope-schema.json", + "local_outputs": [ + "run_manifest.json", + "validation/runinfo.json", + "validation/runparameters.json", + "validation/samplesheet_summary.json", + "validation/runtime_preflight.json", + "commands.sh", + "logs/bcl_conversion.log", + "qc/demux_qc_summary.json", + "artifact_index.json", + "summary.md" + ], + "public_boundary": "BCL Convert is public to download and free for local use, but proprietary; do not auto-download.", + "essential_questions": [ + "Where is the Illumina run folder containing RunInfo.xml?", + "Which SampleSheet.csv should be used?", + "Should lanes be split or combined?", + "Are UMI bases present in reads or index reads?", + "What output directory should receive FASTQs and reports?" + ] + }, + "fastq_qc": { + "display_name": "FASTQ QC and trimming", + "route_when": ["fastq", "qc", "trim"], + "local_executor": "plugins/ngs-analysis/scripts/run_fastq_qc.py", + "preferred_tools": ["fastqc", "multiqc", "fastp", "cutadapt", "seqkit"], + "local_light_tools": ["snakemake", "fastqc", "multiqc", "fastp", "seqkit"], + "essential_questions": [ + "Are reads paired-end or single-end?", + "Is there a local sample sheet, or should a single sample be run from explicit R1/R2 paths?", + "Are adapters or primer sequences known?", + "Is trimming requested or QC-only?", + "Which output directory should receive the timestamped run envelope?", + "Should outputs preserve the original FASTQs?" 
+ ] + }, + "dna_variant_calling": { + "display_name": "DNA variant calling with nf-core/sarek", + "route_when": ["wgs", "wes", "targeted_panel", "variant_calling"], + "preferred_workflow": "nf-core/sarek", + "preferred_tools": ["nextflow", "samtools", "bcftools"], + "optional_tools": ["bwa-mem2", "gatk", "deepvariant"], + "local_executor": "plugins/ngs-analysis/scripts/run_dna_variant_calling.py", + "local_light_workflow": "direct_samtools_bcftools_bam_to_vcf", + "local_light_tools": ["samtools", "gatk", "bcftools"], + "run_envelope_schema": "plugins/ngs-analysis/references/run-envelope-schema.json", + "local_outputs": [ + "run_manifest.json", + "validation/samples.normalized.tsv", + "qc/*.flagstat.txt", + "qc/*.idxstats.tsv", + "variants/*.vcf.gz", + "variants/*.bcftools_stats.txt", + "visualizations/index.html", + "visualizations/visualization_manifest.json", + "notebooks/vcf_review.marimo.py", + "artifact_index.json", + "summary.md" + ], + "essential_questions": [ + "Is this WGS, WES, or targeted panel data?", + "Is the analysis germline, tumor-only, tumor-normal, or trio?", + "Which reference genome and known-sites resources should be used?", + "For WES/panel, where is the target BED file?", + "Are UMIs present?", + "Should variants be annotated with VEP or SnpEff?" 
+ ] + }, + "dna_germline_variants": { + "display_name": "Germline DNA variant calling", + "route_when": ["germline", "singleton", "cohort", "trio", "family", "inherited_panel"], + "preferred_workflow": "nf-core/sarek", + "preferred_tools": ["nextflow", "samtools", "bcftools"], + "optional_tools": ["bwa-mem2", "gatk", "deepvariant"], + "local_executor": "plugins/ngs-analysis/scripts/run_dna_germline_variants.py", + "local_light_workflow": "gatk_bqsr_haplotypecaller_joint_genotyping", + "local_light_tools": ["snakemake", "bwa-mem2", "samtools", "bcftools"], + "local_outputs": [ + "run_manifest.json", + "validation/samples.normalized.tsv", + "qc/*.flagstat.txt", + "qc/*.idxstats.tsv", + "recal/*.recal.table", + "recal/*.recal.bam", + "gvcf/*.g.vcf.gz", + "joint/cohort.joint.vcf.gz", + "visualizations/index.html", + "visualizations/visualization_manifest.json", + "notebooks/vcf_review.marimo.py", + "artifact_index.json", + "summary.md" + ], + "essential_questions": [ + "Is this WGS, WES, or targeted inherited-panel data?", + "Is the sample model singleton, cohort, duo, trio, or family?", + "Which reference build, known-sites resources, and annotation cache should be used?", + "For WES/panel, where are the target and bait BED files?", + "Should outputs be per-sample VCFs, gVCFs, or a jointly called cohort VCF?" 
+ ] + }, + "dna_somatic_variants": { + "display_name": "Somatic DNA variant calling", + "route_when": ["somatic", "tumor_normal", "tumor_only", "cancer_panel"], + "preferred_workflow": "nf-core/sarek", + "preferred_tools": ["nextflow", "gatk", "samtools", "bcftools"], + "optional_tools": ["bwa-mem2", "deepvariant"], + "local_executor": "plugins/ngs-analysis/scripts/run_dna_somatic_variants.py", + "local_light_workflow": "gatk_mutect2_tumor_normal_or_tumor_only", + "local_light_tools": ["gatk", "samtools", "bcftools"], + "local_outputs": [ + "run_manifest.json", + "validation/pairs.normalized.tsv", + "workflow/somatic_command_plan.json", + "qc/somatic_qc_summary.json", + "qc/somatic_filter_reasons.tsv", + "variants/*.unfiltered.vcf.gz", + "variants/*.filtered.vcf.gz", + "visualizations/index.html", + "visualizations/visualization_manifest.json", + "notebooks/vcf_review.marimo.py", + "artifact_index.json", + "summary.md" + ], + "essential_questions": [ + "Is this tumor-normal, tumor-only, relapse-baseline, or another cancer design?", + "Where is the tumor-normal pairing table?", + "Which reference build, germline resource, panel-of-normals, and annotation cache should be used?", + "For WES/panel, where is the target BED file?", + "What allele-fraction, contamination, and tumor-purity constraints should be documented?" 
+ ] + }, + "dna_umi_panel_variants": { + "display_name": "UMI-aware targeted DNA panel variant calling", + "route_when": ["umi_panel", "duplex_panel", "molecular_barcode", "low_frequency_panel"], + "preferred_tools": ["fastqc", "multiqc", "samtools", "bcftools"], + "optional_tools": ["fgbio", "bwa-mem2", "gatk"], + "local_executor": "plugins/ngs-analysis/scripts/run_dna_umi_panel_variants.py", + "local_light_workflow": "fgbio_consensus_plus_bcftools_panel_calling", + "local_light_tools": ["fgbio", "samtools", "bcftools"], + "local_outputs": [ + "run_manifest.json", + "validation/samples.normalized.tsv", + "workflow/umi_panel_command_plan.json", + "qc/umi_consensus_plan.json", + "qc/umi_family_size_summary.tsv", + "consensus/*.bam", + "variants/*.consensus.vcf.gz", + "visualizations/index.html", + "visualizations/visualization_manifest.json", + "notebooks/vcf_review.marimo.py", + "artifact_index.json", + "summary.md" + ], + "essential_questions": [ + "Which panel or capture kit and target BED should be used?", + "Where are the UMIs encoded: read bases, index reads, single UMI, or duplex UMI?", + "Have consensus reads already been generated?", + "What minimum allele fraction and intended sensitivity should be documented?", + "Which controls or spike-ins should be carried through QC?" 
+ ] + }, + "bulk_rnaseq": { + "display_name": "Bulk RNA-seq with nf-core/rnaseq", + "route_when": ["bulk_rnaseq", "expression_counts", "differential_expression"], + "preferred_workflow": "nf-core/rnaseq", + "preferred_tools": ["nextflow", "fastqc", "multiqc"], + "optional_tools": ["star", "salmon", "subread"], + "local_executor": "plugins/ngs-analysis/scripts/run_bulk_rnaseq_counts_qc.py", + "secondary_local_executor": "plugins/ngs-analysis/scripts/run_bulk_rnaseq_de.py", + "local_light_workflow": "snakemake_salmon_quant_plus_r_de", + "local_light_tools": ["snakemake", "fastqc", "multiqc", "salmon", "rscript"], + "run_envelope_schema": "plugins/ngs-analysis/references/run-envelope-schema.json", + "essential_questions": [ + "What organism, genome FASTA, and GTF annotation should be used?", + "Is the library stranded, reverse-stranded, unstranded, or unknown?", + "Are reads paired-end or single-end?", + "Is the goal counts only or differential expression too?", + "If differential expression is needed, what is the sample metadata and contrast design?" 
+ ] + }, + "bulk_rnaseq_counts_qc": { + "display_name": "Bulk RNA-seq count generation and QC", + "route_when": ["bulk_rnaseq_counts", "fastq_to_counts", "rnaseq_qc"], + "preferred_workflow": "nf-core/rnaseq", + "preferred_tools": ["nextflow", "fastqc", "multiqc"], + "optional_tools": ["star", "salmon", "subread"], + "local_executor": "plugins/ngs-analysis/scripts/run_bulk_rnaseq_counts_qc.py", + "local_workflow": "plugins/ngs-analysis/workflows/bulk_rnaseq_counts_qc/Snakefile.smk", + "local_light_workflow": "snakemake_salmon_quant", + "local_light_tools": ["snakemake", "fastqc", "multiqc", "salmon"], + "run_envelope_schema": "plugins/ngs-analysis/references/run-envelope-schema.json", + "local_outputs": [ + "run_manifest.json", + "validation/input_summary.json", + "validation/validation_summary.json", + "logs/snakemake_dry_run.log", + "logs/snakemake_execute.log", + "fastqc/multiqc/multiqc_browser_helper.html", + "rnaseq_salmon/multiqc/multiqc_browser_helper.html", + "visualizations/localhost_launch_hint.txt", + "rnaseq_salmon/matrices/tpm.tsv", + "rnaseq_salmon/matrices/num_reads.tsv", + "rnaseq_salmon/matrices/effective_length.tsv", + "rnaseq_salmon/matrices/samples.tsv", + "artifact_index.json", + "summary.md" + ], + "essential_questions": [ + "What organism, genome FASTA, and GTF annotation should be used?", + "Is strandedness known, unknown, or should it be inferred?", + "Are reads paired-end or single-end?", + "Should quantification produce gene counts, transcript estimates, or both?", + "Where is the sample metadata table that must carry into downstream analysis?" 
+ ] + }, + "bulk_rnaseq_differential_expression": { + "display_name": "Bulk RNA-seq differential expression", + "route_when": ["differential_expression", "rnaseq_de", "counts_to_de"], + "preferred_tools": ["rscript"], + "local_executor": "plugins/ngs-analysis/scripts/run_bulk_rnaseq_de.py", + "local_workflow": "plugins/ngs-analysis/workflows/bulk_rnaseq_differential_expression/run_bulk_de.R", + "local_tools": ["Rscript"], + "local_r_packages": ["DESeq2", "edgeR", "limma"], + "run_envelope_schema": "plugins/ngs-analysis/references/run-envelope-schema.json", + "local_outputs": [ + "run_manifest.json", + "validation/input_summary.json", + "validation/validation_summary.json", + "logs/validation_dry_run.log", + "logs/rscript_execute.log", + "manifest/contrast_status.tsv", + "results/normalized_counts.tsv", + "results/log2_expression_matrix.tsv", + "results/<contrast>.tsv", + "qc/library_sizes.png", + "qc/pca.png", + "qc/sample_distance_heatmap.png", + "plots/<contrast>_volcano.png", + "plots/<contrast>_ma.png", + "visualizations/index.html", + "visualizations/visualization_manifest.json", + "notebooks/bulk_rnaseq_de_review.marimo.py", + "notebooks/marimo_server.json", + "artifact_index.json", + "summary.md" + ], + "essential_questions": [ + "Where are the raw count matrix and sample metadata?", + "What are the biological replicates, batch variables, donor pairing, and covariates?", + "What design formula and contrasts should be run?", + "Which statistical framework should be used: DESeq2, edgeR, limma-voom, or lab standard?", + "Which plots and result tables are required?" 
+ ] + }, + "scrnaseq": { + "display_name": "Single-cell RNA-seq with public alternatives", + "route_when": ["scrnaseq_fastq", "snrnaseq_fastq", "single_cell_count_generation"], + "local_executor": "plugins/ngs-analysis/scripts/run_scrnaseq_fastq_to_count.py", + "preferred_workflow": "nf-core/scrnaseq", + "preferred_tools": ["nextflow"], + "local_workflow": "plugins/ngs-analysis/workflows/scrnaseq_fastq_to_count/Snakefile.smk", + "local_tools": ["snakemake", "star"], + "optional_tools": ["kb-python", "star", "cellranger"], + "run_envelope_schema": "plugins/ngs-analysis/references/run-envelope-schema.json", + "local_outputs": [ + "run_manifest.json", + "manifest/lineage.tsv", + "manifest/working_samplesheet.csv", + "manifest/inputs_manifest.tsv", + "validation/input_summary.json", + "validation/validation_summary.json", + "validation/tool_preflight.json", + "versions/software_versions.json", + "counts/*/Solo.out/Gene/raw/*", + "counts/*/Solo.out/Gene/filtered/*", + "artifact_index.json", + "summary.md" + ], + "essential_questions": [ + "Is the input raw FASTQ requiring count generation, or a post-count object that should route to scrna-seq-qc?", + "Which chemistry or barcode/UMI layout was used?", + "Is this single-cell or single-nucleus?", + "What organism and reference should be used?", + "Should the output stop at a count matrix, or continue to QC, clustering, annotation, and UMAPs?" 
+ ] + }, + "scrnaseq_post_count_qc": { + "display_name": "scRNA-seq post-count QC, annotation, and UMAP", + "route_when": ["h5ad", "matrix", "cellranger_output", "single_cell_qc", "single_cell_annotation", "umap"], + "preferred_skill": "scrna-seq-qc", + "local_executor": "plugins/ngs-analysis/scripts/run_scrnaseq_post_count_qc.py", + "preferred_tools": ["scanpy"], + "local_light_tools": ["scanpy"], + "run_envelope_schema": "plugins/ngs-analysis/references/run-envelope-schema.json", + "local_outputs": [ + "run_manifest.json", + "manifest/lineage.tsv", + "validation/input_summary.json", + "validation/tool_preflight.json", + "versions/software_versions.json", + "qc/threshold_justification.png", + "qc/cell_qc_metrics.csv", + "tables/cell_qc_summary.tsv", + "annotation/cell_labels.csv", + "embeddings/umap_coords.csv", + "plots/umap_global.png", + "plots/umap_by_coarse_label.png", + "visualizations/index.html", + "visualizations/visualization_manifest.json", + "notebooks/scrna_qc_review.marimo.py", + "notebooks/marimo_server.json", + "analysis_with_flags.h5ad", + "filtered_view.h5ad", + "provenance/analysis_status.json", + "summary.md", + "artifact_index.json" + ], + "essential_questions": [ + "Where is the count matrix, h5ad, h5, rds, or Cell Ranger-style output?", + "Are raw counts preserved?", + "What organism, tissue, assay type, chemistry, and sample/channel metadata are available?", + "Should the endpoint include QC only, annotation, clustering, UMAPs, or downstream differential summaries?", + "Is there a matched reference atlas or should marker-based fallback annotation be used?" 
+ ] + }, + "epigenomics_peaks": { + "display_name": "Epigenomics peak calling", + "route_when": ["atacseq", "chipseq", "cutandrun", "cutandtag", "peak_calling"], + "preferred_workflows": ["nf-core/atacseq", "nf-core/chipseq", "nf-core/cutandrun"], + "preferred_tools": ["nextflow", "fastqc", "multiqc", "macs2", "bedtools"], + "local_executor": "plugins/ngs-analysis/scripts/run_fastq_assay_package.py", + "local_executor_lane": "epigenomics_peaks", + "local_light_tools": ["seqkit", "fastqc", "multiqc"], + "run_envelope_schema": "plugins/ngs-analysis/references/run-envelope-schema.json", + "local_outputs": [ + "run_manifest.json", + "manifest/lineage.tsv", + "validation/samples.normalized.tsv", + "qc/seqkit_stats.tsv", + "fastqc/multiqc/multiqc_browser_helper.html", + "visualizations/localhost_launch_hint.txt", + "peak_calling_readiness.json", + "qc_verdict.json after successful execution", + "visualizations/index.html", + "visualizations/visualization_manifest.json", + "artifact_index.json", + "summary.md" + ], + "essential_questions": [ + "Is this ATAC-seq, ChIP-seq, CUT&RUN, or CUT&Tag?", + "What genome and blacklist should be used?", + "Are controls available, such as input DNA or IgG?", + "Are there biological replicates?", + "Is the desired output peaks, bigWigs, QC only, or differential accessibility/binding?" 
+ ] + }, + "atacseq_peaks_qc": { + "display_name": "ATAC-seq QC and peak calling", + "route_when": ["atacseq", "accessibility", "differential_accessibility"], + "preferred_workflow": "nf-core/atacseq", + "preferred_tools": ["nextflow", "fastqc", "multiqc", "macs2", "bedtools", "deeptools"], + "optional_tools": ["homer"], + "local_executor": "plugins/ngs-analysis/scripts/run_atacseq_peaks_qc.py", + "read_qc_executor": "plugins/ngs-analysis/scripts/run_fastq_assay_package.py --lane epigenomics_peaks", + "local_light_workflow": "bowtie2_samtools_macs2_bedtools_deeptools", + "local_light_tools": ["samtools", "bowtie2", "macs2", "bedtools", "deeptools"], + "local_outputs": [ + "run_manifest.json", + "validation/samples.normalized.tsv", + "workflow/atacseq_command_plan.json", + "qc/atac_qc_contract.json", + "qc/atacseq_qc_summary.tsv", + "qc/atacseq_qc_summary.json", + "qc/*.flagstat.txt", + "qc/*.frip_reads.txt", + "qc/*.insert_sizes.txt", + "qc/*.tss_matrix.gz", + "qc/*.tss_profile.png", + "qc/*.tss_heatmap.png", + "peaks/*.narrowPeak", + "peaks/consensus_peaks.bed", + "tracks/*.bw", + "tracks/browser_tracks.tsv", + "tracks/ucsc_track_lines.txt", + "tracks/igv_session.xml", + "motifs/motif_summary.tsv", + "visualizations/index.html", + "artifact_index.json", + "summary.md" + ], + "essential_questions": [ + "What genome build, blacklist, and mitochondrial contig names should be used?", + "Are there biological replicates, conditions, and batches?", + "Should outputs include QC only, peaks, consensus peaks, bigWigs, or differential accessibility?", + "Should Tn5 shifting be handled by the workflow?", + "Which QC gates are required: TSS enrichment, FRiP, insert-size periodicity, blacklist overlap, or replicate concordance?" 
+ ] + }, + "chip_cutrun_peaks_qc": { + "display_name": "ChIP-seq, CUT&RUN, and CUT&Tag QC and peak calling", + "route_when": ["chipseq", "cutandrun", "cutandtag", "differential_binding"], + "preferred_workflows": ["nf-core/chipseq", "nf-core/cutandrun"], + "preferred_tools": ["nextflow", "fastqc", "multiqc", "macs2", "bedtools", "deeptools"], + "optional_tools": ["homer"], + "local_executor": "plugins/ngs-analysis/scripts/run_chip_cutrun_peaks_qc.py", + "read_qc_executor": "plugins/ngs-analysis/scripts/run_fastq_assay_package.py --lane epigenomics_peaks", + "local_light_workflow": "bowtie2_samtools_macs2_bedtools_deeptools", + "local_light_tools": ["samtools", "bowtie2", "macs2", "bedtools", "deeptools"], + "local_outputs": [ + "run_manifest.json", + "validation/samples.normalized.tsv", + "workflow/chip_cutrun_command_plan.json", + "qc/chip_cutrun_qc_contract.json", + "qc/chip_cutrun_qc_summary.tsv", + "qc/chip_cutrun_qc_summary.json", + "qc/*.flagstat.txt", + "qc/*.frip_reads.txt", + "qc/*.insert_sizes.txt", + "peaks/*Peak", + "peaks/consensus_peaks.bed", + "tracks/*.bw", + "tracks/browser_tracks.tsv", + "tracks/ucsc_track_lines.txt", + "tracks/igv_session.xml", + "motifs/motif_enrichment_plan.json", + "motifs/motif_summary.tsv", + "visualizations/index.html", + "artifact_index.json", + "summary.md" + ], + "essential_questions": [ + "Is this ChIP-seq, CUT&RUN, or CUT&Tag?", + "What target class is being profiled: TF, histone mark, chromatin regulator, or custom?", + "Are input DNA, IgG, no-antibody, or spike-in controls available?", + "Should peaks be called in narrow or broad mode?", + "Should outputs include peaks, bigWigs, consensus peaks, count matrices, or differential binding?" 
+ ] + }, + "amplicon_microbiome": { + "display_name": "Amplicon microbiome analysis", + "route_when": ["amplicon_microbiome", "taxonomic_profile"], + "preferred_workflow": "nf-core/ampliseq", + "preferred_tools": ["nextflow"], + "optional_tools": ["qiime2", "dada2", "cutadapt"], + "local_executor": "plugins/ngs-analysis/scripts/run_amplicon_microbiome.py", + "read_qc_executor": "plugins/ngs-analysis/scripts/run_fastq_assay_package.py --lane amplicon_microbiome", + "local_light_tools": ["qiime2", "dada2", "cutadapt"], + "run_envelope_schema": "plugins/ngs-analysis/references/run-envelope-schema.json", + "local_outputs": [ + "run_manifest.json", + "manifest/lineage.tsv", + "validation/samples.normalized.tsv", + "qc/seqkit_stats.tsv", + "fastqc/multiqc/multiqc_browser_helper.html", + "visualizations/localhost_launch_hint.txt", + "amplicon_analysis_status.json", + "qc_verdict.json after successful execution", + "qc_interpretation.json after successful execution", + "visualizations/index.html", + "visualizations/visualization_manifest.json", + "methods/amplicon_methods.json", + "workflow/amplicon_backend_status.json", + "workflow/amplicon_backend_plan.json", + "workflow/amplicon_backend_command_plan.json", + "methods/amplicon_backend_methods.json", + "workflow/qiime2_manifest.tsv", + "qiime2/table.qza when --backend qiime2 executes", + "qiime2/taxonomy.qza when a taxonomy classifier is provided", + "dada2/dada2_backend_state.rds when --backend dada2 executes", + "tables/alpha_diversity.tsv when --asv-table is provided", + "tables/asv_table.tsv or exported ASV table after backend execution", + "tables/representative_sequences.fasta when --backend dada2 executes", + "visualizations/beta_diversity_pcoa_bray_curtis.png when --asv-table has at least two matched samples or --allow-synthetic-diversity is set", + "visualizations/taxa_barplot_<rank>.png when --taxonomy-table is provided", + "artifact_index.json", + "summary.md" + ], + "essential_questions": [ + "Which marker was 
sequenced: 16S, 18S, ITS, COI, or another amplicon?", + "What primer sequences and orientation were used?", + "Are reads paired-end and should they be merged?", + "Which taxonomy database should be used?", + "Is the goal ASV table only or diversity/statistical analysis too?" + ] + }, + "shotgun_metagenomics": { + "display_name": "Shotgun metagenomics profiling", + "route_when": ["shotgun_metagenomics", "taxonomic_profile", "functional_profile"], + "preferred_workflow": "nf-core/taxprofiler", + "preferred_tools": ["nextflow", "kraken2", "bracken"], + "optional_tools": ["kneaddata", "metaphlan", "humann"], + "local_executor": "plugins/ngs-analysis/scripts/run_shotgun_metagenomics.py", + "read_qc_executor": "plugins/ngs-analysis/scripts/run_fastq_assay_package.py --lane shotgun_metagenomics", + "local_light_tools": ["kraken2", "bracken", "kneaddata", "humann"], + "run_envelope_schema": "plugins/ngs-analysis/references/run-envelope-schema.json", + "local_outputs": [ + "run_manifest.json", + "manifest/lineage.tsv", + "validation/samples.normalized.tsv", + "qc/seqkit_stats.tsv", + "fastqc/multiqc/multiqc_browser_helper.html", + "visualizations/localhost_launch_hint.txt", + "qc_verdict.json after successful execution", + "qc_interpretation.json after successful execution", + "taxonomic_classification_status.json", + "workflow/shotgun_backend_command_plan.json", + "qc/metagenomics_database_status.json", + "host_depletion/ when --host-reference executes", + "taxonomic_classification/*.kraken.report", + "taxonomic_classification/*.bracken.tsv", + "functional_profile/ when --run-humann executes", + "visualizations/index.html", + "visualizations/visualization_manifest.json", + "visualizations/kraken_top_taxa_barplot.png when Kraken reports are available", + "visualizations/bracken_relative_abundance_heatmap.png when Bracken tables are provided", + "visualizations/humann_pathway_heatmap.png when HUMAnN pathabundance is provided", + "visualizations/humann_gene_family_heatmap.png 
when HUMAnN genefamilies is provided", + "artifact_index.json", + "summary.md" + ], + "essential_questions": [ + "Should host reads be removed, and what host reference should be used?", + "Is the goal taxonomic profiling, functional profiling, assembly, or all of these?", + "Which database family should be used: Kraken2/Bracken, MetaPhlAn, HUMAnN, or custom?", + "Are reads paired-end or single-end?", + "Are there negative controls or batch variables that should be carried into QC?" + ] + } + } +} diff --git a/plugins/ngs-analysis/references/reference-registry.json b/plugins/ngs-analysis/references/reference-registry.json new file mode 100644 index 0000000..d7e6455 --- /dev/null +++ b/plugins/ngs-analysis/references/reference-registry.json @@ -0,0 +1,71 @@ +{ + "schema_version": "0.1.0", + "references": { + "grch38_core": { + "display_name": "GRCh38 core alignment and variant-calling bundle", + "kind": "genome_reference", + "genome_build": "GRCh38", + "root_env": "NGS_REF_GRCH38_ROOT", + "source": "User-provided local bundle, typically GATK resource bundle plus aligner indexes.", + "license_note": "Genome FASTA and known-sites resources should be downloaded from their authoritative providers and tracked with checksums.", + "estimated_size": "large; depends on FASTA, annotation, aligner indexes, and known-sites resources", + "suggested_setup": [ + "mkdir -p \"$NGS_REF_GRCH38_ROOT\"/known_sites \"$NGS_REF_GRCH38_ROOT\"/blacklists", + "samtools faidx \"$NGS_REF_GRCH38_ROOT\"/genome.fa", + "gatk CreateSequenceDictionary -R \"$NGS_REF_GRCH38_ROOT\"/genome.fa -O \"$NGS_REF_GRCH38_ROOT\"/genome.dict", + "Record source URLs, release versions, and checksums in a project resource manifest before analysis runs." 
+ ], + "required_files": [ + "genome.fa", + "genome.fa.fai", + "genome.dict", + "annotation.gtf", + "known_sites/dbsnp.vcf.gz", + "known_sites/dbsnp.vcf.gz.tbi", + "known_sites/mills_and_1000g_gold_standard.indels.vcf.gz", + "known_sites/mills_and_1000g_gold_standard.indels.vcf.gz.tbi", + "blacklists/encode_blacklist.bed" + ] + }, + "grcm39_core": { + "display_name": "GRCm39 core mouse reference bundle", + "kind": "genome_reference", + "genome_build": "GRCm39", + "root_env": "NGS_REF_GRCM39_ROOT", + "source": "User-provided local FASTA, GTF, indexes, and blacklist resources.", + "license_note": "Use an annotation/source pair that matches the FASTA build.", + "estimated_size": "large; depends on FASTA, annotation, indexes, and optional known-sites resources", + "suggested_setup": [ + "mkdir -p \"$NGS_REF_GRCM39_ROOT\"/blacklists", + "samtools faidx \"$NGS_REF_GRCM39_ROOT\"/genome.fa", + "gatk CreateSequenceDictionary -R \"$NGS_REF_GRCM39_ROOT\"/genome.fa -O \"$NGS_REF_GRCM39_ROOT\"/genome.dict", + "Keep FASTA, GTF, blacklist, and index files from the same genome-build/release family." + ], + "required_files": [ + "genome.fa", + "genome.fa.fai", + "genome.dict", + "annotation.gtf", + "blacklists/blacklist.bed" + ] + }, + "reduced_micro_genome": { + "display_name": "Small local reference bundle", + "kind": "reduced_reference", + "genome_build": "reduced_local", + "root_env": "NGS_REF_REDUCED_ROOT", + "source": "Local synthetic or reduced-size test bundle.", + "license_note": "Reduced references are for runner validation and should not be used for biological interpretation.", + "estimated_size": "small", + "suggested_setup": [ + "Use only for runner validation.", + "Create genome.fa, genome.fa.fai, and annotation.gtf together so reduced coordinates remain internally consistent." 
+ ], + "required_files": [ + "genome.fa", + "genome.fa.fai", + "annotation.gtf" + ] + } + } +} diff --git a/plugins/ngs-analysis/references/run-envelope-schema.json b/plugins/ngs-analysis/references/run-envelope-schema.json new file mode 100644 index 0000000..a29e202 --- /dev/null +++ b/plugins/ngs-analysis/references/run-envelope-schema.json @@ -0,0 +1,55 @@ +{ + "schema_version": "0.4.0", + "description": "Common top-level run envelope for plugin-owned NGS execution lanes.", + "required_top_level_files": [ + "run_manifest.json", + "config.json", + "validation/input_summary.json", + "validation/validation_summary.json", + "validation/tool_preflight.json", + "logs/", + "versions/software_versions.json", + "manifest/lineage.tsv", + "artifact_index.json", + "summary.md" + ], + "manifest_required_fields": [ + "schema_version", + "run_id", + "created_at", + "lane", + "workflow", + "run_dir", + "status", + "execute_requested", + "validation_ok", + "tool_preflight_ok", + "ready_to_execute", + "dry_run_performed", + "dry_run_ok", + "execution_ok", + "inputs", + "outputs", + "method", + "audit" + ], + "status_values": [ + "prepared", + "validated", + "blocked", + "failed", + "completed" + ], + "notes": [ + "Preflight validation should precede execution when a workflow supports it.", + "Raw sequencing inputs must be treated as read-only unless the user explicitly requests otherwise.", + "Assay-specific primary outputs live under lane-specific directories such as fastqc/, rnaseq_salmon/, qc/, results/, and plots/.", + "Native review bundles should use visualizations/index.html and visualizations/visualization_manifest.json when the runner generates plots or report links.", + "Notebook review surfaces, when present, belong under notebooks/ and should wrap generated artifacts rather than replacing the run envelope.", + "run_manifest.json may include full dry_run_result and execution_result objects with started_at, finished_at, returncode, and command detail.", + 
"artifact_index.json should include per-file SHA256 checksums and modification timestamps for provenance.", + "audit.parameter_sha256 should hash the declared inputs, outputs, lane, workflow, method, and execute_requested fields.", + "manifest/lineage.tsv should describe the declared input-to-output lineage with existence and checksum columns when the referenced files exist.", + "Assay-specific methods manifests and backend handoff bundles may live under methods/ and workflow/ and should be listed in outputs when present." + ] +} diff --git a/plugins/ngs-analysis/references/runtime-install-guidance.md b/plugins/ngs-analysis/references/runtime-install-guidance.md new file mode 100644 index 0000000..0b56a91 --- /dev/null +++ b/plugins/ngs-analysis/references/runtime-install-guidance.md @@ -0,0 +1,85 @@ +# Runtime Install And Existence Checks + +Use this guidance before downloading or installing anything. + +## Default Policy + +1. Check whether the executable already exists on `PATH`. +2. Check whether a Python module can be imported when the tool is Python-backed. +3. Check the active environment with `conda list`, `mamba list`, `micromamba list`, or `pip show` when available. +4. If the tool is missing, emit an install plan first. +5. Only query package indexes or container registries when network checks are allowed. +6. Only install when the user explicitly asked for installation or execution that requires installation. + +Avoid modifying system Python. Prefer an isolated conda/mamba/micromamba environment, a Nextflow container profile, Docker, Singularity, or Apptainer. 
+ +## Package Existence Checks + +Local checks: + +```bash +command -v fastqc +command -v nextflow +command -v snakemake +python -c "import scanpy" +python -m pip show multiqc +conda list fastqc +``` + +Network checks: + +```bash +conda search -c bioconda fastqc +python -m pip index versions multiqc +docker manifest inspect google/deepvariant:latest +nextflow info nf-core/rnaseq +``` + +Network checks can be slow and may hit rate limits, so they should be explicit. + +## Install Planning + +Prefer one of these patterns: + +```bash +mamba create -n ngs-qc -c conda-forge -c bioconda fastqc multiqc fastp cutadapt seqkit +mamba create -n ngs-nextflow -c conda-forge -c bioconda nextflow +mamba create -n ngs-local -c conda-forge -c bioconda snakemake fastqc multiqc fastp seqkit salmon samtools bcftools +python -m pip install --user multiqc +``` + +For nf-core workflows, prefer containerized execution: + +```bash +nextflow run nf-core/rnaseq -profile test,docker --outdir results/rnaseq_test +``` + +When Docker, registry egress, or Nextflow process containers are unstable, use the local execution profile instead of forcing a full nf-core run: + +```bash +python plugins/ngs-analysis/scripts/ngs_preflight.py --profile local_light --emit-install-plan +``` + +For approval handoff, persist the executable/package plan: + +```bash +python plugins/ngs-analysis/scripts/ngs_preflight.py --pipeline shotgun_metagenomics --manager micromamba --install-plan-outdir runtime_readiness/shotgun_install +``` + +`install_plan.json` is the canonical artifact for Codex/user review. `install_commands.sh` is generated from that JSON and is guarded: by default it prints the plan path and exits without installing. Execute it only after explicit user approval by setting `NGS_RUN_INSTALL_COMMANDS=1`. + +Then use a plugin-owned runner when the selected lane has one, such as `run_fastq_qc.py` or `run_scrnaseq_fastq_to_count.py`. 
For lanes that do not yet have dedicated runners, prepare an assay-specific workflow envelope before execution. + +The local execution profile is meant for compact, auditable runs. When a site has a validated nf-core, WDL/Cromwell, or lab pipeline, preserve that pipeline's parameters and acceptance criteria. + +For large reference databases, do not auto-download. First estimate size, target path, and whether the database already exists. + +## Proprietary Or Credentialed Boundaries + +Public package routing does not mean every useful tool is open-source or credential-free. + +- Illumina BCL Convert: public/free local installer, proprietary, RPM-based. +- 10x Cell Ranger: public download with EULA; use only when the user has accepted the license or explicitly requests it. +- DRAGEN and Sentieon: commercial/licensed tools. +- BaseSpace, Terra, DNAnexus: account, permission, billing, and cloud-upload constraints. +- COSMIC, HGMD Professional, controlled human data repositories: licensing or authorization may be required. 
def parse_first_int(value: str) -> int | None:
    """Return the first whitespace-delimited token of *value* as an ``int``.

    Returns ``None`` when *value* contains no tokens or when the first token
    is not an integer literal.
    """
    tokens = str(value).split()
    if not tokens:
        return None
    try:
        return int(tokens[0])
    except ValueError:
        return None


def parse_flagstat(path: Path) -> dict[str, int | None]:
    """Extract total, mapped, and duplicate read counts from a flagstat report.

    The report is presumably ``samtools flagstat`` output (one metric per
    line, count first). Every metric is ``None`` when *path* does not exist
    or when the matching line is absent.

    Args:
        path: Location of the flagstat text file.

    Returns:
        A dict with the keys ``total_reads``, ``mapped_reads`` and
        ``duplicate_reads``.
    """
    metrics: dict[str, int | None] = {
        "total_reads": None,
        "mapped_reads": None,
        "duplicate_reads": None,
    }
    if not path.exists():
        return metrics
    for line in path.read_text(encoding="utf-8", errors="replace").splitlines():
        # Newer reports add "primary mapped (" and "primary duplicates" lines
        # after the overall ones; without this guard they would overwrite the
        # overall counts and no longer be comparable with "in total".
        if " primary " in line:
            continue
        if " in total " in line:
            metrics["total_reads"] = parse_first_int(line)
        elif " mapped (" in line and " mate mapped" not in line:
            metrics["mapped_reads"] = parse_first_int(line)
        elif " duplicates" in line:
            metrics["duplicate_reads"] = parse_first_int(line)
    return metrics


def read_int(path: Path) -> int | None:
    """Read the first token of a one-value text file as an integer.

    The token is parsed as a float first, so ``"12.0"`` yields ``12``.
    Returns ``None`` when the file is missing, empty, non-numeric, or holds a
    non-finite value such as ``inf``.
    """
    if not path.exists():
        return None
    try:
        token = path.read_text(encoding="utf-8", errors="replace").strip().split()[0]
        return int(float(token))
    except (ValueError, IndexError, OverflowError):
        # OverflowError: int(float("inf")) cannot be represented as an int.
        return None


def count_bed_rows(path: Path) -> int | None:
    """Count the feature rows of a BED-like file.

    Blank lines, ``#`` comments, and ``track`` / ``browser`` header lines are
    not features and are skipped. The file is streamed line by line so large
    peak files are not loaded into memory at once.

    Returns:
        The number of feature rows, or ``None`` when *path* does not exist.
    """
    if not path.exists():
        return None
    count = 0
    with path.open(encoding="utf-8", errors="replace") as handle:
        for line in handle:
            stripped = line.strip()
            if not stripped or stripped.startswith("#"):
                continue
            if stripped.split(None, 1)[0] in ("track", "browser"):
                continue
            count += 1
    return count


def parse_insert_sizes(path: Path) -> dict[str, float | int | None]:
    """Summarize a file holding one insert size per line.

    Values are taken as absolute numbers; unparsable lines and zero values
    are ignored. All metrics stay ``None`` when the file is missing or holds
    no usable value.

    Returns:
        A dict with ``insert_size_count``, ``insert_size_median``,
        ``insert_size_mean`` (both rounded to 3 decimals) and
        ``nucleosome_free_fraction`` (share of values below 100, rounded to
        4 decimals).
    """
    metrics: dict[str, float | int | None] = {
        "insert_size_count": None,
        "insert_size_median": None,
        "insert_size_mean": None,
        "nucleosome_free_fraction": None,
    }
    if not path.exists():
        return metrics
    values: list[float] = []
    for line in path.read_text(encoding="utf-8", errors="replace").splitlines():
        try:
            value = abs(float(line.strip()))
        except ValueError:
            continue
        if value > 0:
            values.append(value)
    if not values:
        return metrics
    metrics["insert_size_count"] = len(values)
    metrics["insert_size_median"] = round(float(statistics.median(values)), 3)
    metrics["insert_size_mean"] = round(float(sum(values) / len(values)), 3)
    metrics["nucleosome_free_fraction"] = round(
        sum(1 for value in values if value < 100) / len(values), 4
    )
    return metrics


def _rel(path: Path, run_dir: Path) -> str:
    """Return *path* relative to *run_dir*, or unchanged when it lies outside."""
    try:
        return str(path.relative_to(run_dir))
    except ValueError:
        return str(path)


def _to_float(value: Any) -> float | None:
    """Convert *value* to ``float``, returning ``None`` when that is impossible.

    ``None``, the empty string, non-numeric strings, and objects ``float()``
    rejects (including unhashable ones such as lists) all yield ``None``.
    """
    if value is None:
        return None
    try:
        return float(value)
    except (TypeError, ValueError):
        # float("") raises ValueError; float([1]) raises TypeError.
        return None
= [] + for row in rows: + frip = _to_float(row.get("frip")) + peak_count = _to_float( + row.get("blacklist_filtered_peak_count") or row.get("raw_peak_count") + ) + if frip is not None or peak_count is not None: + values.append( + {"sample": str(row.get("sample", "")), "frip": frip, "peak_count": peak_count} + ) + if not values: + _write_svg_message( + path, + f"{title} FRiP And Peak Overview", + "FRiP and peak-count metrics will populate after peak calling outputs are present.", + ) + return _rel(path, run_dir) + + width = 980 + row_height = 44 + height = 96 + row_height * len(values) + max_peak = max((item["peak_count"] or 0 for item in values), default=1) or 1 + lines = [ + f'', + '', + f'{html.escape(title)} FRiP And Peak Overview', + 'FRiP', + 'Peak count', + ] + for index, item in enumerate(values): + y = 104 + index * row_height + sample = item["sample"] + frip = item["frip"] or 0.0 + peak_count = item["peak_count"] or 0.0 + frip_width = max(2, min(220, frip * 220)) + peak_width = max(2, min(280, peak_count / max_peak * 280)) + lines.extend( + [ + f'{html.escape(sample)}', + f'', + f'', + f'{frip:.4g}', + f'', + f'', + f'{int(peak_count)}', + ] + ) + lines.append("\n") + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("\n".join(lines), encoding="utf-8") + return _rel(path, run_dir) + + +def write_insert_size_distribution_svg( + run_dir: Path, samples: list[dict[str, str]], output_prefix: str, title: str +) -> str: + path = run_dir / "qc" / f"{output_prefix}_insert_size_distribution.svg" + bins = [(0, 100), (100, 200), (200, 400), (400, 800), (800, 2000)] + counts = [0 for _ in bins] + total = 0 + for sample in samples: + insert_path = run_dir / "qc" / f"{sample['sample']}.insert_sizes.txt" + if not insert_path.exists(): + continue + for line in insert_path.read_text(encoding="utf-8", errors="replace").splitlines(): + value = _to_float(line.strip()) + if value is None: + continue + total += 1 + abs_value = abs(value) + for index, (start, end) 
in enumerate(bins): + if start <= abs_value < end: + counts[index] += 1 + break + if not total: + _write_svg_message( + path, + f"{title} Insert-Size Distribution", + "Insert-size bars will populate after paired-read alignment metrics are available.", + ) + return _rel(path, run_dir) + + width = 900 + height = 300 + max_count = max(counts) or 1 + chart_x = 82 + chart_y = 66 + chart_h = 160 + bar_w = 110 + gap = 28 + lines = [ + f'', + '', + f'{html.escape(title)} Insert-Size Distribution', + f'Total fragments parsed: {total}', + ] + for index, ((start, end), count) in enumerate(zip(bins, counts)): + x = chart_x + index * (bar_w + gap) + bar_h = max(2, count / max_count * chart_h) + y = chart_y + chart_h - bar_h + label = f"{start}-{end}" + lines.extend( + [ + f'', + f'', + f'{label}', + f'{count}', + ] + ) + lines.append("\n") + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("\n".join(lines), encoding="utf-8") + return _rel(path, run_dir) + + +def write_browser_track_preview(run_dir: Path, track_summary: dict[str, Any], title: str) -> str: + path = run_dir / "tracks" / "browser_track_preview.html" + rows = [] + for row in track_summary.get("tracks", []): + rows.append( + "" + f"{html.escape(str(row.get('sample', '')))}" + f"{html.escape(str(row.get('exists', '')))}" + f"{html.escape(str(row.get('bigwig', '')))}" + f"{html.escape(str(row.get('track_line', '')))}" + "" + ) + if not rows: + rows.append('No track rows were available.') + body = f""" + + + + {html.escape(title)} Browser Tracks + + + +

{html.escape(title)} Browser Tracks

+

Use these rows as a handoff for IGV/UCSC-style review. Relative bigWig paths require serving the run directory or replacing them with hosted URLs.

+ + + {"".join(rows)} +
SampleExistsbigWigUCSC Track Line
+ + +""" + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(body, encoding="utf-8") + return _rel(path, run_dir) + + +def write_epigenomics_dashboard( + run_dir: Path, + rows: list[dict[str, Any]], + *, + output_prefix: str, + title: str, + visual_paths: dict[str, str], +) -> str: + path = run_dir / "qc" / f"{output_prefix}_dashboard.html" + headers = [ + "sample", + "status", + "frip", + "raw_peak_count", + "blacklist_filtered_peak_count", + "insert_size_median", + "nucleosome_free_fraction", + "bigwig_exists", + "notes", + ] + row_html = [] + for row in rows: + row_html.append( + "" + + "".join(f"{html.escape(str(row.get(header, '')))}" for header in headers) + + "" + ) + if not row_html: + row_html.append( + f'No sample rows were available.' + ) + links = "".join( + f'
  • {html.escape(label)}
  • ' + for label, rel in [ + ("FRiP and peak overview", visual_paths.get("frip_peak_overview", "")), + ("Insert-size distribution", visual_paths.get("insert_size_distribution", "")), + ("Browser track preview", visual_paths.get("browser_track_preview", "")), + ] + if rel + ) + body = f""" + + + + {html.escape(title)} QC Dashboard + + + +

    {html.escape(title)} QC Dashboard

    +

    Compact native review of FRiP, peak counts, insert-size metrics, signal-track state, and remaining caveats parsed from the run directory.

    +
      {links}
    + + {"".join(f"" for header in headers)} + {"".join(row_html)} +
    {html.escape(header)}
    + + +""" + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(body, encoding="utf-8") + return _rel(path, run_dir) + + +def peak_paths(run_dir: Path, sample: str, peak_mode: str) -> tuple[Path, Path]: + raw_suffix = "narrowPeak" if peak_mode == "narrow" else "broadPeak" + raw = run_dir / "peaks" / f"{sample}_peaks.{raw_suffix}" + filtered = run_dir / "peaks" / f"{sample}.blacklist_filtered.{peak_mode}Peak" + return raw, filtered + + +def write_track_outputs( + run_dir: Path, samples: list[dict[str, str]], *, title: str +) -> dict[str, Any]: + rows = [] + resources = [] + for sample in samples: + name = sample["sample"] + bw = run_dir / "tracks" / f"{name}.bw" + exists = bw.exists() + row = { + "sample": name, + "bigwig": _rel(bw, run_dir), + "exists": str(exists).lower(), + "track_line": f'track type=bigWig name="{name}" description="{title} {name}" bigDataUrl={_rel(bw, run_dir)} visibility=full autoScale=on', + } + rows.append(row) + if exists: + resources.append( + f' ' + ) + write_tsv( + run_dir / "tracks" / "browser_tracks.tsv", + rows, + ["sample", "bigwig", "exists", "track_line"], + ) + write_text( + run_dir / "tracks" / "ucsc_track_lines.txt", + "\n".join(row["track_line"] for row in rows) + "\n", + ) + igv = [ + '', + f'', + " ", + ] + igv.extend(resources) + igv.extend([" ", "", ""]) + write_text(run_dir / "tracks" / "igv_session.xml", "\n".join(igv)) + summary = { + "status": "created" if any(row["exists"] == "true" for row in rows) else "not_available", + "tracks": rows, + "outputs": { + "browser_tracks": "tracks/browser_tracks.tsv", + "ucsc_track_lines": "tracks/ucsc_track_lines.txt", + "igv_session": "tracks/igv_session.xml", + }, + "note": "UCSC track lines use relative bigDataUrl values; serve the run directory over HTTP or edit URLs for a genome browser.", + } + write_json(run_dir / "tracks" / "track_manifest.json", summary) + return summary + + +def summarize_motif_outputs(run_dir: Path, samples: list[dict[str, str]]) -> 
dict[str, Any]: + rows = [] + for sample in samples: + name = sample["sample"] + known = run_dir / "motifs" / name / "knownResults.txt" + homer = run_dir / "motifs" / name / "homerResults.html" + row = { + "sample": name, + "known_results": _rel(known, run_dir), + "known_results_exists": str(known.exists()).lower(), + "homer_results": _rel(homer, run_dir), + "homer_results_exists": str(homer.exists()).lower(), + "top_motif": "", + "top_p_value": "", + } + if known.exists(): + try: + with known.open(newline="", encoding="utf-8", errors="replace") as handle: + reader = csv.DictReader(handle, delimiter="\t") + first = next(reader, None) + if first: + row["top_motif"] = ( + first.get("Motif Name") or first.get("Name") or first.get("motif") or "" + ) + row["top_p_value"] = ( + first.get("P-value") or first.get("p-value") or first.get("Pvalue") or "" + ) + except Exception: + row["top_motif"] = "" + rows.append(row) + write_tsv( + run_dir / "motifs" / "motif_summary.tsv", + rows, + [ + "sample", + "known_results", + "known_results_exists", + "homer_results", + "homer_results_exists", + "top_motif", + "top_p_value", + ], + ) + summary = { + "status": "created" + if any( + row["known_results_exists"] == "true" or row["homer_results_exists"] == "true" + for row in rows + ) + else "not_available", + "samples": rows, + "outputs": {"motif_summary": "motifs/motif_summary.tsv"}, + } + write_json(run_dir / "motifs" / "motif_summary.json", summary) + return summary + + +def summarize_epigenomics_outputs( + run_dir: Path, + samples: list[dict[str, str]], + *, + peak_mode: str, + output_prefix: str, + title: str, +) -> dict[str, Any]: + rows: list[dict[str, Any]] = [] + consensus_count = count_bed_rows(run_dir / "peaks" / "consensus_peaks.bed") + motif_summary = summarize_motif_outputs(run_dir, samples) + track_summary = write_track_outputs(run_dir, samples, title=title) + for sample in samples: + name = sample["sample"] + is_control = str(sample.get("is_control", "")).lower() == 
"true" + filtered_bam = run_dir / "alignment" / f"{name}.filtered.bam" + flagstat = parse_flagstat(run_dir / "qc" / f"{name}.flagstat.txt") + insert = parse_insert_sizes(run_dir / "qc" / f"{name}.insert_sizes.txt") + frip_reads = read_int(run_dir / "qc" / f"{name}.frip_reads.txt") + filtered_reads = read_int(run_dir / "qc" / f"{name}.filtered_reads.txt") + raw_peak, filtered_peak = peak_paths(run_dir, name, peak_mode) + raw_peak_count = count_bed_rows(raw_peak) + filtered_peak_count = count_bed_rows(filtered_peak) + bw = run_dir / "tracks" / f"{name}.bw" + tss_matrix = run_dir / "qc" / f"{name}.tss_matrix.gz" + tss_profile = run_dir / "qc" / f"{name}.tss_profile.png" + tss_heatmap = run_dir / "qc" / f"{name}.tss_heatmap.png" + motif_known = run_dir / "motifs" / name / "knownResults.txt" + observed = [ + filtered_bam.exists(), + flagstat["total_reads"] is not None, + raw_peak_count is not None, + bw.exists(), + frip_reads is not None and filtered_reads is not None, + ] + notes = [] + if not is_control and (frip_reads is None or filtered_reads in {None, 0}): + notes.append("FRiP inputs not found") + if not is_control and raw_peak_count is None: + notes.append("peak file not found") + if not bw.exists(): + notes.append("bigWig track not found") + if insert["insert_size_count"] is None: + notes.append("insert-size distribution not found") + if is_control: + notes.append("control sample: peak and FRiP outputs are not expected") + frip = "" + if not is_control and frip_reads is not None and filtered_reads: + frip = round(frip_reads / filtered_reads, 5) + status = ( + "created" + if all(observed[:2]) and (is_control or observed[2]) + else ("partial" if any(observed) else "not_executed") + ) + rows.append( + { + "sample": name, + "layout": sample.get("layout", ""), + "is_control": str(is_control).lower(), + "control_sample": sample.get("control_sample", ""), + "status": status, + "filtered_bam": _rel(filtered_bam, run_dir), + "filtered_bam_exists": 
str(filtered_bam.exists()).lower(), + "total_filtered_reads": filtered_reads if filtered_reads is not None else "", + "mapped_reads": flagstat["mapped_reads"] + if flagstat["mapped_reads"] is not None + else "", + "duplicate_reads": flagstat["duplicate_reads"] + if flagstat["duplicate_reads"] is not None + else "", + "frip_reads": frip_reads if frip_reads is not None else "", + "frip": frip, + "raw_peak_count": raw_peak_count if raw_peak_count is not None else "", + "blacklist_filtered_peak_count": filtered_peak_count + if filtered_peak_count is not None + else "", + "consensus_peak_count": consensus_count if consensus_count is not None else "", + "insert_size_count": insert["insert_size_count"] + if insert["insert_size_count"] is not None + else "", + "insert_size_median": insert["insert_size_median"] + if insert["insert_size_median"] is not None + else "", + "insert_size_mean": insert["insert_size_mean"] + if insert["insert_size_mean"] is not None + else "", + "nucleosome_free_fraction": insert["nucleosome_free_fraction"] + if insert["nucleosome_free_fraction"] is not None + else "", + "bigwig": _rel(bw, run_dir), + "bigwig_exists": str(bw.exists()).lower(), + "tss_matrix_exists": str(tss_matrix.exists()).lower(), + "tss_profile_exists": str(tss_profile.exists()).lower(), + "tss_heatmap_exists": str(tss_heatmap.exists()).lower(), + "motif_summary_exists": str(motif_known.exists()).lower(), + "notes": "; ".join(notes), + } + ) + write_tsv(run_dir / "qc" / f"{output_prefix}_summary.tsv", rows, EPIGENOMICS_SUMMARY_FIELDS) + visual_paths = { + "frip_peak_overview": write_frip_peak_overview_svg(run_dir, rows, output_prefix, title), + "insert_size_distribution": write_insert_size_distribution_svg( + run_dir, samples, output_prefix, title + ), + "browser_track_preview": write_browser_track_preview(run_dir, track_summary, title), + } + visual_paths["dashboard"] = write_epigenomics_dashboard( + run_dir, + rows, + output_prefix=output_prefix, + title=title, + 
def detect_delimiter(path: Path) -> str:
    """Return a tab for ``.tsv`` / ``.tab`` files (case-insensitive), else a comma."""
    if path.suffix.lower() in {".tsv", ".tab"}:
        return "\t"
    return ","


def read_table(path: Path) -> tuple[list[dict[str, str]], list[str]]:
    """Read a delimited table into stripped row dicts plus its header list.

    The delimiter is chosen by :func:`detect_delimiter`. Missing cells become
    empty strings. Cells beyond the header width (for example from a trailing
    delimiter) have no column name and are dropped. A leading UTF-8
    byte-order mark is ignored so it cannot leak into the first header name.

    Args:
        path: Table file to read.

    Returns:
        ``(rows, fieldnames)``; both are empty for an empty file.
    """
    # "utf-8-sig" decodes plain UTF-8 as well, and strips a BOM when present.
    with path.open(newline="", encoding="utf-8-sig") as handle:
        reader = csv.DictReader(handle, delimiter=detect_delimiter(path))
        rows = [
            {
                key: (value or "").strip()
                for key, value in row.items()
                # DictReader stores surplus cells as a list under the None key.
                if key is not None
            }
            for row in reader
        ]
        # Must be read while the file is open: for an empty file the header
        # has not been consumed yet and DictReader would try to read again.
        fieldnames = list(reader.fieldnames or [])
    return rows, fieldnames


def resolve_path(raw: str | None, base: Path) -> Path | None:
    """Resolve *raw* to an absolute path, treating relative paths as under *base*.

    ``~`` is expanded first. Returns ``None`` when *raw* is ``None`` or empty.
    """
    if not raw:
        return None
    path = Path(raw).expanduser()
    if not path.is_absolute():
        path = base / path
    return path.resolve()


def write_tsv(path: Path, rows: list[dict[str, Any]], fieldnames: list[str]) -> None:
    """Write *rows* as a tab-separated file with a header row.

    Parent directories are created as needed. Keys not listed in *fieldnames*
    are ignored, and missing keys are written as empty cells. Lines end with
    a bare ``\\n``; the ``csv`` default of ``\\r\\n`` would leave a carriage
    return in the last column for line-oriented Unix tools.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(
            handle,
            fieldnames=fieldnames,
            delimiter="\t",
            extrasaction="ignore",
            lineterminator="\n",
        )
        writer.writeheader()
        writer.writerows(rows)


def shell_join(cmd: list[str | Path]) -> str:
    """Return *cmd* as one shell-quoted command string."""
    return shlex.join(str(item) for item in cmd)


def write_command_script(
    path: Path, commands: list[str], *, header: list[str] | None = None
) -> None:
    """Write a bash script that runs *commands* under ``set -euo pipefail``.

    Optional *header* lines are placed between the strict-mode line and the
    commands. The file is written through the project helper ``write_text``.
    """
    lines = ["#!/usr/bin/env bash", "set -euo pipefail"]
    if header:
        lines.extend(header)
    lines.extend(commands)
    write_text(path, "\n".join(lines) + "\n")


def command_plan_entry(
    name: str, command: list[str | Path] | str, *, outputs: list[str] | None = None
) -> dict[str, Any]:
    """Build one command-plan record.

    A list *command* is shell-quoted with :func:`shell_join`; a string is
    stored unchanged. ``outputs`` defaults to an empty list.
    """
    command_string = command if isinstance(command, str) else shell_join(command)
    return {"name": name, "command": command_string, "outputs": outputs or []}


def normalize_sample_name(value: str | None, fallback: str) -> str:
    """Return a file-name-safe version of *value*, or *fallback* if nothing remains.

    Surrounding whitespace is stripped, every character that is not
    alphanumeric, ``_``, ``-`` or ``.`` becomes ``_``, and leading/trailing
    underscores are removed.
    """
    value = (value or "").strip()
    if not value:
        return fallback
    safe = "".join(
        char if char.isalnum() or char in {"_", "-", "."} else "_" for char in value
    )
    return safe.strip("_") or fallback
importlib.util +import json +import shlex +import shutil +import subprocess +import sys +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +from ngs_run_utils import command_path + +DEFAULT_REGISTRY = Path(__file__).resolve().parents[1] / "references" / "pipeline-registry.json" + + +def load_registry(path: Path) -> dict[str, Any]: + with path.open("r", encoding="utf-8") as handle: + return json.load(handle) + + +def executable_status(name: str) -> dict[str, Any]: + resolved = command_path(name) + return {"name": name, "present": resolved is not None, "path": resolved} + + +def module_status(name: str) -> dict[str, Any]: + spec = importlib.util.find_spec(name) + return {"name": name, "present": spec is not None} + + +def executable_uses_docker(path: str | None) -> bool: + if not path: + return False + candidate = Path(path) + if not candidate.exists(): + return False + try: + head = candidate.read_text(encoding="utf-8", errors="ignore")[:2000] + except OSError: + return False + return "docker run" in head + + +def run_probe(cmd: list[str], timeout: int = 30) -> dict[str, Any]: + try: + result = subprocess.run( + cmd, + check=False, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + timeout=timeout, + ) + except FileNotFoundError: + return {"cmd": cmd, "present": False, "ok": False, "detail": "command not found"} + except subprocess.TimeoutExpired: + return {"cmd": cmd, "present": True, "ok": False, "detail": "timeout"} + detail = (result.stdout or result.stderr).strip().splitlines() + return { + "cmd": cmd, + "present": True, + "ok": result.returncode == 0, + "returncode": result.returncode, + "detail": detail[:5], + } + + +def check_index(tool_name: str, tool: dict[str, Any], network_checks: bool) -> dict[str, Any]: + checks: dict[str, Any] = { + "tool": tool_name, + "executables": [executable_status(item) for item in tool.get("executables", [])], + "python_modules": [module_status(item) for item in 
tool.get("python_modules", [])], + "network": [], + "install": tool.get("install", {}), + "notes": tool.get("notes"), + "license": tool.get("license", "public_or_open"), + } + if tool_name == "bcl-convert": + resolved = next((item["path"] for item in checks["executables"] if item["present"]), None) + if executable_uses_docker(resolved): + checks["runtime"] = { + "docker_backed_wrapper": True, + "docker_daemon": run_probe(["docker", "info"], timeout=30), + } + if not network_checks: + return checks + + install = tool.get("install", {}) + conda_spec = install.get("conda") + pip_spec = install.get("pip") + + conda_cmd = shutil.which("mamba") or shutil.which("conda") or shutil.which("micromamba") + if conda_spec and conda_cmd: + package = conda_spec.split("::", 1)[-1] + channel = conda_spec.split("::", 1)[0] if "::" in conda_spec else "bioconda" + channels = ( + ["-c", "conda-forge", "-c", "bioconda"] if channel == "bioconda" else ["-c", channel] + ) + checks["network"].append(run_probe([conda_cmd, "search", *channels, package], timeout=60)) + + if pip_spec: + checks["network"].append( + run_probe([sys.executable, "-m", "pip", "index", "versions", pip_spec], timeout=60) + ) + + docker_cmd = shutil.which("docker") or shutil.which("podman") + if docker_cmd: + for image in tool.get("container_images", []): + checks["network"].append( + run_probe([docker_cmd, "manifest", "inspect", image], timeout=60) + ) + + return checks + + +def tool_is_present(status: dict[str, Any]) -> bool: + exe_ok = any(item["present"] for item in status.get("executables", [])) + module_ok = any(item["present"] for item in status.get("python_modules", [])) + return exe_ok or module_ok + + +def missing_by_profile_role(profile: dict[str, Any], missing: list[str]) -> dict[str, list[str]]: + missing_set = set(missing) + return { + role: [name for name in profile.get(field, []) if name in missing_set] + for role, field in [ + ("required", "required_tools"), + ("preferred", "preferred_tools"), + 
("optional", "optional_tools"), + ] + } + + +def missing_by_pipeline_role(pipeline: dict[str, Any], missing: list[str]) -> dict[str, list[str]]: + missing_set = set(missing) + return { + role: [name for name in pipeline.get(field, []) if name in missing_set] + for role, field in [ + ("preferred", "preferred_tools"), + ("optional", "optional_tools"), + ("local_light", "local_light_tools"), + ] + } + + +def install_command(tool_name: str, tool: dict[str, Any], manager: str) -> list[str] | None: + del tool_name + install = tool.get("install", {}) + if manager in {"conda", "mamba", "micromamba"} and "conda" in install: + spec = install["conda"].split("::", 1) + if len(spec) == 2: + channel, package = spec + if channel == "bioconda": + return [manager, "install", "-y", "-c", "conda-forge", "-c", "bioconda", package] + return [manager, "install", "-y", "-c", channel, package] + return [manager, "install", "-y", install["conda"]] + if manager == "pip" and "pip" in install: + return [sys.executable, "-m", "pip", "install", install["pip"]] + return None + + +def install_plan_entries( + missing: list[str], registry: dict[str, Any], manager: str +) -> list[dict[str, Any]]: + entries = [] + for name in missing: + tool = registry["tools"][name] + cmd = install_command(name, tool, manager) + entries.append( + { + "tool": name, + "manager": manager, + "command": cmd, + "command_display": shlex.join(cmd) if cmd else None, + "install": tool.get("install", {}), + "executables": tool.get("executables", []), + "python_modules": tool.get("python_modules", []), + "notes": tool.get("notes"), + "license": tool.get("license", "public_or_open"), + "requires_user_approval": cmd is not None, + } + ) + return entries + + +def build_install_artifact( + *, + args: argparse.Namespace, + statuses: list[dict[str, Any]], + missing: list[str], + runtime_missing: list[str], + blocking_missing: list[str], + plan_entries: list[dict[str, Any]], +) -> dict[str, Any]: + return { + "schema_version": "1.0", 
+ "generated_at": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"), + "selection": { + "tool": args.tool, + "pipeline": args.pipeline, + "profile": args.profile, + }, + "manager": args.manager, + "network_checks_requested": args.network_checks, + "permission_model": { + "requires_explicit_user_approval": bool(plan_entries), + "install_script_default_mode": "review_only", + "execution_opt_in": "Set NGS_RUN_INSTALL_COMMANDS=1 before running install_commands.sh.", + "does_not_install_by_itself": True, + }, + "missing": missing, + "runtime_missing": runtime_missing, + "blocking_missing": blocking_missing, + "install_plan": plan_entries, + "checked": statuses, + } + + +def render_install_commands(plan: dict[str, Any]) -> str: + lines = [ + "#!/usr/bin/env bash", + "set -euo pipefail", + "", + "# Generated by ngs_preflight.py from install_plan.json.", + "# Review install_plan.json before executing package installs.", + "# This script is review-only unless NGS_RUN_INSTALL_COMMANDS=1 is set.", + "", + 'PLAN_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"', + 'echo "Install plan: ${PLAN_DIR}/install_plan.json"', + 'if [[ "${NGS_RUN_INSTALL_COMMANDS:-}" != "1" ]]; then', + ' echo "Review-only mode. 
Set NGS_RUN_INSTALL_COMMANDS=1 to execute these commands."', + " exit 0", + "fi", + "", + "run_cmd() {", + ' printf "+ "', + ' printf "%q " "$@"', + ' printf "\\n"', + ' "$@"', + "}", + "", + ] + + commands = [entry for entry in plan.get("install_plan", []) if entry.get("command")] + if not commands: + lines.extend(['echo "No package install commands are required for this selection."', ""]) + return "\n".join(lines) + + for entry in plan.get("install_plan", []): + lines.append(f"# tool: {entry['tool']}") + if entry.get("notes"): + lines.append(f"# notes: {entry['notes']}") + if entry.get("license"): + lines.append(f"# license: {entry['license']}") + cmd = entry.get("command") + if cmd: + lines.append("run_cmd " + " ".join(shlex.quote(str(part)) for part in cmd)) + else: + lines.append( + f"# No {entry.get('manager', 'selected manager')} install command is registered for {entry['tool']}." + ) + lines.append("") + return "\n".join(lines) + + +def write_install_artifacts(plan: dict[str, Any], outdir: Path) -> dict[str, str]: + outdir.mkdir(parents=True, exist_ok=True) + plan_path = outdir / "install_plan.json" + commands_path = outdir / "install_commands.sh" + plan_path.write_text(json.dumps(plan, indent=2) + "\n", encoding="utf-8") + commands_path.write_text(render_install_commands(plan), encoding="utf-8") + commands_path.chmod(0o755) + return {"install_plan_json": str(plan_path), "install_commands_sh": str(commands_path)} + + +def selected_tools( + registry: dict[str, Any], tool: str | None, pipeline: str | None, profile: str | None +) -> list[str]: + if tool: + if tool not in registry["tools"]: + raise SystemExit(f"Unknown tool: {tool}") + return [tool] + if profile: + profiles = registry.get("profiles", {}) + if profile not in profiles: + raise SystemExit(f"Unknown profile: {profile}") + entry = profiles[profile] + names: list[str] = [] + for field in ("required_tools", "preferred_tools", "optional_tools"): + names.extend(entry.get(field, [])) + return [name for 
name in dict.fromkeys(names) if name in registry["tools"]] + if pipeline: + pipelines = registry["pipelines"] + if pipeline not in pipelines: + raise SystemExit(f"Unknown pipeline: {pipeline}") + entry = pipelines[pipeline] + names: list[str] = [] + for field in ("preferred_tools", "optional_tools", "local_light_tools"): + names.extend(entry.get(field, [])) + return [name for name in dict.fromkeys(names) if name in registry["tools"]] + raise SystemExit("Provide --tool, --pipeline, --profile, or --list") + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--registry", type=Path, default=DEFAULT_REGISTRY) + parser.add_argument("--tool") + parser.add_argument("--pipeline") + parser.add_argument("--profile", help="Check a named runtime profile such as local_light.") + parser.add_argument("--list", action="store_true") + parser.add_argument("--network-checks", action="store_true") + parser.add_argument("--emit-install-plan", action="store_true") + parser.add_argument( + "--install-plan-outdir", + type=Path, + help="Write install_plan.json and guarded install_commands.sh artifacts to this directory.", + ) + parser.add_argument( + "--manager", choices=["conda", "mamba", "micromamba", "pip"], default="mamba" + ) + parser.add_argument("--install-missing", action="store_true") + parser.add_argument("--yes", action="store_true", help="Required with --install-missing.") + args = parser.parse_args() + + registry = load_registry(args.registry) + + if args.list: + print( + json.dumps( + { + "pipelines": sorted(registry["pipelines"]), + "profiles": sorted(registry.get("profiles", {})), + "tools": sorted(registry["tools"]), + }, + indent=2, + ) + ) + return 0 + + names = selected_tools(registry, args.tool, args.pipeline, args.profile) + statuses = [check_index(name, registry["tools"][name], args.network_checks) for name in names] + missing = [status["tool"] for status in statuses if not tool_is_present(status)] + runtime_missing = 
[ + f"{status['tool']}:docker_daemon" + for status in statuses + if status.get("runtime", {}).get("docker_backed_wrapper") + and not status["runtime"]["docker_daemon"].get("ok", False) + ] + + output: dict[str, Any] = { + "checked": statuses, + "missing": missing, + "runtime_missing": runtime_missing, + } + blocking_missing = missing + runtime_missing + + if args.profile: + profile = registry.get("profiles", {})[args.profile] + by_role = missing_by_profile_role(profile, missing) + output["profile"] = { + "name": args.profile, + "missing_by_role": by_role, + "blocking_missing": by_role["required"], + } + blocking_missing = by_role["required"] + elif args.pipeline: + pipeline = registry.get("pipelines", {})[args.pipeline] + by_role = missing_by_pipeline_role(pipeline, missing) + output["pipeline"] = { + "name": args.pipeline, + "missing_by_role": by_role, + "blocking_missing": by_role["preferred"] + runtime_missing, + } + blocking_missing = by_role["preferred"] + runtime_missing + + if args.emit_install_plan or args.install_plan_outdir: + plan_entries = install_plan_entries(missing, registry, args.manager) + output["install_plan"] = plan_entries + install_artifact = build_install_artifact( + args=args, + statuses=statuses, + missing=missing, + runtime_missing=runtime_missing, + blocking_missing=blocking_missing, + plan_entries=plan_entries, + ) + if args.install_plan_outdir: + output["install_artifacts"] = write_install_artifacts( + install_artifact, args.install_plan_outdir + ) + + print(json.dumps(output, indent=2)) + + if args.install_missing: + if not args.yes: + raise SystemExit("--install-missing requires --yes") + for name in missing: + cmd = install_command(name, registry["tools"][name], args.manager) + if not cmd: + print(f"No {args.manager} install command registered for {name}", file=sys.stderr) + continue + subprocess.run(cmd, check=True) + + return 1 if blocking_missing else 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git 
a/plugins/ngs-analysis/scripts/ngs_reference_manager.py b/plugins/ngs-analysis/scripts/ngs_reference_manager.py new file mode 100644 index 0000000..6de2c2b --- /dev/null +++ b/plugins/ngs-analysis/scripts/ngs_reference_manager.py @@ -0,0 +1,1616 @@ +#!/usr/bin/env python3 +"""Inspect and verify local NGS reference and database bundles.""" + +from __future__ import annotations + +import argparse +import json +import os +from pathlib import Path +from typing import Any + +from ngs_run_utils import now_iso, sha256_file, write_json + +PLUGIN_ROOT = Path(__file__).resolve().parents[1] +DEFAULT_REFERENCE_REGISTRY = PLUGIN_ROOT / "references" / "reference-registry.json" +DEFAULT_DATABASE_REGISTRY = PLUGIN_ROOT / "references" / "database-registry.json" +MAX_CHECKSUM_BYTES = 512 * 1024 * 1024 +PIPELINE_RESOURCE_REQUIREMENTS: dict[str, list[dict[str, Any]]] = { + "bulk_rnaseq_counts_qc": [ + { + "kind": "reference", + "selector": "genome_core", + "required": True, + "purpose": "genome FASTA, annotation GTF, and aligner/quantification indexes", + } + ], + "scrnaseq_fastq_to_count": [ + { + "kind": "reference", + "selector": "genome_core", + "required": True, + "purpose": "single-cell count-generation reference FASTA and annotation", + } + ], + "dna_variant_calling": [ + { + "kind": "reference", + "selector": "genome_core", + "required": True, + "purpose": "reference FASTA, sequence dictionary, indexes, and known-sites resources", + } + ], + "dna_germline_variants": [ + { + "kind": "reference", + "selector": "genome_core", + "required": True, + "purpose": "GATK/DeepVariant germline reference and known-sites resources", + } + ], + "dna_somatic_variants": [ + { + "kind": "reference", + "selector": "genome_core", + "required": True, + "purpose": "Mutect2 reference, indexes, blacklist, and optional cancer resources", + } + ], + "dna_umi_panel_variants": [ + { + "kind": "reference", + "selector": "genome_core", + "required": True, + "purpose": "panel reference FASTA/indexes and 
target-coverage context", + } + ], + "atacseq_peaks_qc": [ + { + "kind": "reference", + "selector": "genome_core", + "required": True, + "purpose": "alignment FASTA/indexes, blacklist, TSS annotation, and genome-build context", + } + ], + "chip_cutrun_peaks_qc": [ + { + "kind": "reference", + "selector": "genome_core", + "required": True, + "purpose": "alignment FASTA/indexes, blacklist, and genome-build context", + } + ], + "amplicon_microbiome": [ + { + "kind": "database", + "bundle": "silva_138_amplicon", + "required": True, + "purpose": "marker-gene taxonomy assignment", + }, + { + "kind": "database", + "bundle": "gtdb_release", + "required": False, + "purpose": "optional alternate taxonomy database", + }, + ], + "shotgun_metagenomics": [ + { + "kind": "database", + "bundle": "kraken2_standard", + "required": True, + "purpose": "Kraken2 taxonomic classification", + }, + { + "kind": "database", + "bundle": "bracken_standard", + "required": False, + "purpose": "Bracken abundance estimation paired to the Kraken2 database", + }, + { + "kind": "database", + "bundle": "humann_uniref90", + "required": False, + "purpose": "HUMAnN functional profiling", + }, + ], +} +PIPELINE_ALIASES = { + "bulk_rnaseq": "bulk_rnaseq_counts_qc", + "rnaseq": "bulk_rnaseq_counts_qc", + "scrna": "scrnaseq_fastq_to_count", + "scrnaseq": "scrnaseq_fastq_to_count", + "scrnaseq_fastq": "scrnaseq_fastq_to_count", + "sarek": "dna_variant_calling", + "germline": "dna_germline_variants", + "somatic": "dna_somatic_variants", + "umi": "dna_umi_panel_variants", + "atacseq": "atacseq_peaks_qc", + "chipseq": "chip_cutrun_peaks_qc", + "cutandrun": "chip_cutrun_peaks_qc", + "cutandtag": "chip_cutrun_peaks_qc", + "ampliseq": "amplicon_microbiome", + "taxprofiler": "shotgun_metagenomics", +} +GENOME_BUILD_TO_REFERENCE = { + "grch38": "grch38_core", + "hg38": "grch38_core", + "human": "grch38_core", + "grcm39": "grcm39_core", + "mm39": "grcm39_core", + "mouse": "grcm39_core", + "reduced": 
"reduced_micro_genome", + "reduced_local": "reduced_micro_genome", + "synthetic": "reduced_micro_genome", +} + + +def read_json(path: Path) -> dict[str, Any]: + return json.loads(path.read_text(encoding="utf-8")) + + +def expand_root(value: str | None) -> Path | None: + if not value: + return None + return Path(os.path.expandvars(value)).expanduser().resolve() + + +def load_registries( + reference_registry: Path = DEFAULT_REFERENCE_REGISTRY, + database_registry: Path = DEFAULT_DATABASE_REGISTRY, +) -> dict[str, Any]: + references = read_json(reference_registry) + databases = read_json(database_registry) + return { + "schema_version": { + "references": references.get("schema_version"), + "databases": databases.get("schema_version"), + }, + "references": references.get("references", {}), + "databases": databases.get("databases", {}), + } + + +def bundle_root(bundle: dict[str, Any], override_root: Path | None = None) -> Path | None: + if override_root: + return override_root.expanduser().resolve() + for key in ("root", "root_env"): + value = bundle.get(key) + if key == "root_env" and value: + expanded = expand_root(os.environ.get(str(value))) + else: + expanded = expand_root(value) + if expanded: + return expanded + return None + + +def env_assignment(bundle: dict[str, Any], root: str | None) -> dict[str, Any] | None: + root_env = bundle.get("root_env") + if not root_env: + return None + return {"name": str(root_env), "value": root} + + +def check_expected_files( + *, + bundle_name: str, + bundle: dict[str, Any], + override_root: Path | None = None, + include_checksums: bool = False, +) -> dict[str, Any]: + root = bundle_root(bundle, override_root) + expected = bundle.get("required_files", []) + records: list[dict[str, Any]] = [] + missing: list[str] = [] + for item in expected: + rel_path = str(item) + resolved = (root / rel_path).resolve() if root else None + exists = bool(resolved and resolved.exists()) + record: dict[str, Any] = { + "path": rel_path, + 
"resolved_path": str(resolved) if resolved else None, + "exists": exists, + "bytes": resolved.stat().st_size + if exists and resolved and resolved.is_file() + else None, + "sha256": None, + } + if not exists: + missing.append(rel_path) + elif include_checksums and resolved and resolved.is_file(): + size = resolved.stat().st_size + if size <= MAX_CHECKSUM_BYTES: + record["sha256"] = sha256_file(resolved) + else: + record["sha256_skipped_reason"] = ( + f"file exceeds {MAX_CHECKSUM_BYTES} byte checksum threshold" + ) + records.append(record) + return { + "bundle": bundle_name, + "display_name": bundle.get("display_name", bundle_name), + "kind": bundle.get("kind"), + "root": str(root) if root else None, + "ok": not missing and root is not None, + "missing": missing, + "files": records, + "metadata": { + "genome_build": bundle.get("genome_build"), + "database_family": bundle.get("database_family"), + "version": bundle.get("version"), + "source": bundle.get("source"), + "license_note": bundle.get("license_note"), + }, + } + + +def check_named_bundle( + name: str, + *, + kind: str = "reference", + root: Path | None = None, + include_checksums: bool = False, + registries: dict[str, Any] | None = None, +) -> dict[str, Any]: + registries = registries or load_registries() + collection_name = "references" if kind == "reference" else "databases" + collection = registries.get(collection_name, {}) + if name not in collection: + return { + "bundle": name, + "kind": kind, + "ok": False, + "missing": [], + "error": f"unknown {kind} bundle: {name}", + "available": sorted(collection), + } + return check_expected_files( + bundle_name=name, + bundle=collection[name], + override_root=root, + include_checksums=include_checksums, + ) + + +def normalize_pipeline_name(value: str) -> str: + key = value.strip().lower().replace("-", "_").replace("/", "_") + return PIPELINE_ALIASES.get(key, key) + + +def reference_bundle_for_genome(genome_build: str | None) -> str: + if not genome_build: + 
return "grch38_core" + return GENOME_BUILD_TO_REFERENCE.get(genome_build.strip().lower(), genome_build) + + +def parse_bundle_roots(values: list[str] | None) -> dict[str, Path]: + roots: dict[str, Path] = {} + for raw in values or []: + if "=" not in raw: + raise SystemExit(f"--bundle-root must be formatted as bundle=/path, got: {raw}") + name, value = raw.split("=", 1) + name = name.strip() + if not name: + raise SystemExit(f"--bundle-root is missing bundle name: {raw}") + roots[name] = Path(value).expanduser().resolve() + return roots + + +def resource_requirements_for_pipeline( + pipeline: str, + *, + genome_build: str | None = None, + include_optional: bool = False, +) -> tuple[str, list[dict[str, Any]]]: + normalized = normalize_pipeline_name(pipeline) + if normalized not in PIPELINE_RESOURCE_REQUIREMENTS: + raise SystemExit( + f"Unknown pipeline resource contract: {pipeline}. " + f"Known pipelines: {', '.join(sorted(PIPELINE_RESOURCE_REQUIREMENTS))}" + ) + resolved: list[dict[str, Any]] = [] + for requirement in PIPELINE_RESOURCE_REQUIREMENTS[normalized]: + if not requirement.get("required", True) and not include_optional: + continue + item = dict(requirement) + if item.get("selector") == "genome_core": + item["bundle"] = reference_bundle_for_genome(genome_build) + item.pop("selector", None) + resolved.append(item) + return normalized, resolved + + +def plan_pipeline_resources( + pipeline: str, + *, + genome_build: str | None = None, + bundle_roots: dict[str, Path] | None = None, + include_optional: bool = False, + include_checksums: bool = False, + registries: dict[str, Any] | None = None, +) -> dict[str, Any]: + registries = registries or load_registries() + bundle_roots = bundle_roots or {} + normalized, requirements = resource_requirements_for_pipeline( + pipeline, + genome_build=genome_build, + include_optional=include_optional, + ) + resources = [] + for requirement in requirements: + bundle = requirement["bundle"] + kind = requirement["kind"] + root = 
bundle_roots.get(bundle) + check = check_named_bundle( + bundle, + kind=kind, + root=root, + include_checksums=include_checksums, + registries=registries, + ) + collection_name = "references" if kind == "reference" else "databases" + bundle_payload = registries.get(collection_name, {}).get(bundle, {}) + resources.append( + { + "kind": kind, + "bundle": bundle, + "required": bool(requirement.get("required", True)), + "purpose": requirement.get("purpose"), + "ok": bool(check.get("ok")), + "blocking": bool(requirement.get("required", True)) and not bool(check.get("ok")), + "root": check.get("root"), + "env": env_assignment(bundle_payload, check.get("root")), + "check": check, + "setup": { + "source": bundle_payload.get("source"), + "license_note": bundle_payload.get("license_note"), + "suggested_setup": bundle_payload.get("suggested_setup", []), + "estimated_size": bundle_payload.get("estimated_size"), + }, + } + ) + missing_required = [ + { + "kind": item["kind"], + "bundle": item["bundle"], + "root": item["root"], + "missing": item["check"].get("missing", []), + "error": item["check"].get("error"), + } + for item in resources + if item["blocking"] + ] + return { + "created_at": now_iso(), + "pipeline": normalized, + "requested_pipeline": pipeline, + "genome_build": genome_build, + "include_optional": include_optional, + "ok": not missing_required, + "missing_required": missing_required, + "resources": resources, + } + + +def resource_manifest_rows(plan: dict[str, Any]) -> list[dict[str, Any]]: + rows = [] + for item in plan.get("resources", []): + check = item.get("check", {}) + metadata = check.get("metadata", {}) + env = item.get("env") or {} + rows.append( + { + "pipeline": plan.get("pipeline", ""), + "kind": item.get("kind", ""), + "bundle": item.get("bundle", ""), + "required": str(item.get("required", False)).lower(), + "ok": str(item.get("ok", False)).lower(), + "blocking": str(item.get("blocking", False)).lower(), + "root": item.get("root") or "", + 
"env_var": env.get("name", ""), + "purpose": item.get("purpose") or "", + "missing_count": len(check.get("missing", [])), + "missing_files": ";".join(check.get("missing", [])), + "source": metadata.get("source") or "", + "license_note": metadata.get("license_note") or "", + } + ) + return rows + + +def write_tsv(path: Path, rows: list[dict[str, Any]], fieldnames: list[str]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as handle: + handle.write("\t".join(fieldnames) + "\n") + for row in rows: + handle.write( + "\t".join(str(row.get(field, "")).replace("\n", " ") for field in fieldnames) + "\n" + ) + + +def validation_command_for_resource(action: dict[str, Any]) -> str: + root = action.get("root") + root_env = action.get("root_env") + if root: + root_arg = json.dumps(str(root)) + elif root_env: + root_arg = f'"${{{root_env}:-/path/to/{action.get("bundle")}}}"' + else: + root_arg = f'"/path/to/{action.get("bundle")}"' + return ( + "python plugins/ngs-analysis/scripts/ngs_reference_manager.py check " + f"--kind {action.get('kind')} --bundle {action.get('bundle')} --root {root_arg}" + ) + + +def resource_setup_action(plan: dict[str, Any], item: dict[str, Any]) -> dict[str, Any]: + check = item.get("check", {}) + env = item.get("env") or {} + setup = item.get("setup") or {} + missing_files = list(check.get("missing", [])) + root = item.get("root") or check.get("root") + action = { + "pipeline": plan.get("pipeline"), + "requested_pipeline": plan.get("requested_pipeline"), + "genome_build": plan.get("genome_build"), + "kind": item.get("kind"), + "bundle": item.get("bundle"), + "display_name": check.get("display_name") or item.get("bundle"), + "required": bool(item.get("required")), + "blocking": bool(item.get("blocking")), + "ok": bool(item.get("ok")), + "root": root, + "root_env": env.get("name", ""), + "purpose": item.get("purpose") or "", + "missing_files": missing_files, + "missing_count": len(missing_files), + 
"error": check.get("error"), + "source": setup.get("source"), + "license_note": setup.get("license_note"), + "estimated_size": setup.get("estimated_size"), + "suggested_setup": list(setup.get("suggested_setup") or []), + } + next_actions: list[str] = [] + if not root: + if action["root_env"]: + next_actions.append( + f"Choose or create a local bundle root and export {action['root_env']}=/path/to/{action['bundle']}." + ) + else: + next_actions.append( + f"Choose or create a local bundle root for {action['bundle']} and pass it with --bundle-root." + ) + elif missing_files: + next_actions.append( + f"Complete the bundle under {root} by adding the missing contract files." + ) + if action.get("error"): + next_actions.append(str(action["error"])) + if missing_files: + next_actions.append("Missing files: " + ", ".join(missing_files)) + if action["suggested_setup"]: + next_actions.append( + "Review and adapt the registry setup hints before downloading or generating large resources." + ) + action["validation_command"] = validation_command_for_resource(action) + next_actions.append("Re-run the validation command after setup.") + action["next_actions"] = next_actions + action["ready_after"] = { + "root_configured": "yes" if root else "pending", + "missing_files": "none" if not missing_files else "all missing files present", + "validation_command": action["validation_command"], + } + return action + + +def setup_plan_from_resource_plan( + plan: dict[str, Any], *, include_ready: bool = False +) -> dict[str, Any]: + actions = [ + resource_setup_action(plan, item) + for item in plan.get("resources", []) + if include_ready or not item.get("ok") + ] + return { + "schema_version": "ngs_resource_setup_plan/v0.1", + "created_at": now_iso(), + "pipeline": plan.get("pipeline"), + "requested_pipeline": plan.get("requested_pipeline"), + "genome_build": plan.get("genome_build"), + "include_optional": bool(plan.get("include_optional")), + "include_ready": include_ready, + 
"resource_plan_ok": bool(plan.get("ok")), + "ok": not any(action.get("blocking") for action in actions), + "action_count": len(actions), + "blocking_count": sum(1 for action in actions if action.get("blocking")), + "actions": actions, + } + + +def resource_setup_plan_rows(setup_plan: dict[str, Any]) -> list[dict[str, Any]]: + rows = [] + for action in setup_plan.get("actions", []): + rows.append( + { + "pipeline": setup_plan.get("pipeline", ""), + "kind": action.get("kind", ""), + "bundle": action.get("bundle", ""), + "display_name": action.get("display_name", ""), + "required": str(action.get("required", False)).lower(), + "blocking": str(action.get("blocking", False)).lower(), + "ok": str(action.get("ok", False)).lower(), + "root": action.get("root") or "", + "root_env": action.get("root_env") or "", + "purpose": action.get("purpose") or "", + "missing_count": action.get("missing_count", 0), + "missing_files": ";".join(action.get("missing_files", [])), + "estimated_size": action.get("estimated_size") or "", + "source": action.get("source") or "", + "license_note": action.get("license_note") or "", + "suggested_setup": "; ".join( + str(item) for item in action.get("suggested_setup", []) + ), + "validation_command": action.get("validation_command") or "", + } + ) + return rows + + +def write_resource_setup_markdown(setup_plan: dict[str, Any], path: Path) -> None: + lines = [ + "# NGS Resource Setup Plan", + "", + f"Created: `{setup_plan.get('created_at')}`", + f"Pipeline: `{setup_plan.get('pipeline')}`", + f"Resource plan ready: `{str(setup_plan.get('resource_plan_ok')).lower()}`", + f"Setup actions: `{setup_plan.get('action_count')}`", + f"Blocking setup actions: `{setup_plan.get('blocking_count')}`", + "", + "This file is a setup checklist. 
def write_resource_setup_commands(setup_plan: dict[str, Any], path: Path) -> None:
    """Write a review-first bash checklist for the setup actions in *setup_plan*.

    Apart from the shebang and ``set -euo pipefail``, every emitted line is
    either blank or a ``#`` comment: the registry hints are suggestions to be
    reviewed and uncommented by hand, never commands that run by default.

    Args:
        setup_plan: Setup plan whose ``actions`` list describes each bundle.
        path: Destination script; parent directories are created as needed.
    """
    # Function-scope import keeps this block self-contained.
    import shlex

    def commented(text: object) -> list[str]:
        # Prefix every physical line. A value with an embedded newline would
        # otherwise leave its tail outside the comment, i.e. as a live command.
        return [f"# {part}" for part in (str(text).splitlines() or [""])]

    lines = [
        "#!/usr/bin/env bash",
        "set -euo pipefail",
        "",
        "# Review and edit placeholder paths before running.",
        "# Large reference/database setup may require license, quota, and disk-space review.",
        "# Registry setup hints are commented by default to avoid accidental large downloads.",
        "",
    ]
    actions = setup_plan.get("actions", [])
    if not actions:
        lines.append("# No setup actions selected.")
    for action in actions:
        lines.extend(commented(f"=== {action.get('bundle')} ({action.get('kind')}) ==="))
        lines.extend(commented(f"Purpose: {action.get('purpose') or 'not specified'}"))
        lines.extend(commented(f"Required: {str(action.get('required')).lower()}"))
        lines.extend(commented(f"Blocking: {str(action.get('blocking')).lower()}"))
        lines.extend(commented(f"Root: {action.get('root') or 'not configured'}"))
        if action.get("root_env"):
            root_value = action.get("root") or f"/path/to/{action.get('bundle')}"
            # shlex.quote yields a literal shell word: no `$`/backtick expansion
            # and no JSON-style \uXXXX escapes once the line is uncommented.
            quoted_root = shlex.quote(str(root_value))
            lines.extend(commented(f"export {action['root_env']}={quoted_root}"))
        if action.get("estimated_size"):
            lines.extend(commented(f"Estimated size: {action['estimated_size']}"))
        if action.get("license_note"):
            lines.extend(commented(f"License/source note: {action['license_note']}"))
        if action.get("missing_files"):
            lines.append("# Missing files:")
            for item in action["missing_files"]:
                lines.extend(commented(f"- {item}"))
        if action.get("suggested_setup"):
            lines.append("# Suggested setup hints:")
            for command in action["suggested_setup"]:
                lines.extend(commented(command))
        lines.extend(commented(f"Validate after setup: {action.get('validation_command')}"))
        lines.append("")
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(lines).rstrip() + "\n", encoding="utf-8")
+ "validation_command", + ] + write_tsv(outdir / "resource_setup_plan.tsv", resource_setup_plan_rows(setup_plan), fieldnames) + write_resource_setup_commands(setup_plan, outdir / "resource_setup_commands.sh") + write_resource_setup_markdown(setup_plan, outdir / "resource_setup_plan.md") + return { + "resource_setup_plan": str(outdir / "resource_setup_plan.json"), + "resource_setup_plan_tsv": str(outdir / "resource_setup_plan.tsv"), + "resource_setup_commands": str(outdir / "resource_setup_commands.sh"), + "resource_setup_summary": str(outdir / "resource_setup_plan.md"), + } + + +def write_resource_plan_outputs(plan: dict[str, Any], outdir: Path) -> dict[str, str]: + outdir.mkdir(parents=True, exist_ok=True) + write_json(outdir / "resource_plan.json", plan) + fieldnames = [ + "pipeline", + "kind", + "bundle", + "required", + "ok", + "blocking", + "root", + "env_var", + "purpose", + "missing_count", + "missing_files", + "source", + "license_note", + ] + write_tsv(outdir / "resource_manifest.tsv", resource_manifest_rows(plan), fieldnames) + env_lines = [ + "#!/usr/bin/env bash", + "# Source this file after editing placeholder paths, then rerun resource checks.", + ] + for item in plan.get("resources", []): + env = item.get("env") + if not env: + continue + name = env["name"] + value = env.get("value") + if value: + env_lines.append(f"export {name}={json.dumps(value)}") + else: + env_lines.append(f"# export {name}=/path/to/{item['bundle']}") + (outdir / "resource_env.sh").write_text("\n".join(env_lines) + "\n", encoding="utf-8") + missing_lines = [ + "# NGS Resource Readiness", + "", + f"Pipeline: `{plan.get('pipeline')}`", + f"Ready: `{str(plan.get('ok')).lower()}`", + "", + ] + for item in plan.get("missing_required", []): + missing_lines.append(f"## Missing {item['kind']} `{item['bundle']}`") + missing_lines.append("") + missing_lines.append(f"Root: `{item.get('root') or 'not configured'}`") + if item.get("error"): + missing_lines.append(f"Error: {item['error']}") + 
for missing in item.get("missing", []): + missing_lines.append(f"- `{missing}`") + missing_lines.append("") + if not plan.get("missing_required"): + missing_lines.append("All required bundles are present.") + (outdir / "resource_readiness.md").write_text( + "\n".join(missing_lines).rstrip() + "\n", encoding="utf-8" + ) + setup_outputs = write_resource_setup_plan_outputs(setup_plan_from_resource_plan(plan), outdir) + return { + "resource_plan": str(outdir / "resource_plan.json"), + "resource_manifest": str(outdir / "resource_manifest.tsv"), + "resource_env": str(outdir / "resource_env.sh"), + "resource_readiness": str(outdir / "resource_readiness.md"), + **setup_outputs, + } + + +def check_all_bundles( + *, + kind: str = "all", + bundle_roots: dict[str, Path] | None = None, + include_checksums: bool = False, + registries: dict[str, Any] | None = None, +) -> dict[str, Any]: + registries = registries or load_registries() + bundle_roots = bundle_roots or {} + checks = [] + if kind in {"all", "reference"}: + for name in sorted(registries.get("references", {})): + checks.append( + check_named_bundle( + name, + kind="reference", + root=bundle_roots.get(name), + include_checksums=include_checksums, + registries=registries, + ) + ) + if kind in {"all", "database"}: + for name in sorted(registries.get("databases", {})): + checks.append( + check_named_bundle( + name, + kind="database", + root=bundle_roots.get(name), + include_checksums=include_checksums, + registries=registries, + ) + ) + missing = [item for item in checks if not item.get("ok")] + return { + "created_at": now_iso(), + "kind": kind, + "ok": not missing, + "checked_count": len(checks), + "ready_count": len(checks) - len(missing), + "missing_count": len(missing), + "checks": checks, + } + + +def pipeline_usage_by_bundle(registries: dict[str, Any]) -> dict[str, dict[str, list[str]]]: + usage: dict[str, dict[str, set[str]]] = {} + all_bundles = set(registries.get("references", {})) | 
def iter_bundle_payloads(registries: dict[str, Any], kind: str = "all"):
    """Yield ``(kind, name, payload)`` for the registered bundles selected by *kind*.

    References are yielded before databases, each group ordered by bundle
    name. ``kind`` may be ``"all"``, ``"reference"`` or ``"database"``; any
    other value yields nothing.
    """
    sections = (("reference", "references"), ("database", "databases"))
    for label, registry_key in sections:
        if kind not in ("all", label):
            continue
        section = registries.get(registry_key, {})
        for name in sorted(section):
            yield label, name, section[name]
def write_resource_inventory_outputs(inventory: dict[str, Any], outdir: Path) -> dict[str, str]:
    """Write the inventory artifacts for *inventory* into *outdir*.

    Produces ``resource_inventory.json``, ``resource_inventory.tsv``,
    ``resource_env.sh`` and ``resource_dashboard.md``.

    Returns:
        Mapping of artifact key to the written path (as a string).
    """
    # Function-scope import keeps this block self-contained.
    import shlex

    outdir.mkdir(parents=True, exist_ok=True)
    write_json(outdir / "resource_inventory.json", inventory)
    fieldnames = [
        "kind",
        "bundle",
        "display_name",
        "ok",
        "root",
        "root_env",
        "required_file_count",
        "missing_count",
        "missing_files",
        "pipelines_required",
        "pipelines_optional",
        "estimated_size",
        "source",
        "license_note",
    ]
    write_tsv(outdir / "resource_inventory.tsv", inventory_manifest_rows(inventory), fieldnames)

    env_lines = [
        "#!/usr/bin/env bash",
        "# Source this file after editing placeholder paths, then rerun ngs_reference_manager.py inventory.",
    ]
    for item in inventory.get("bundles", []):
        env = item.get("env")
        root_env = item.get("root_env")
        if env and env.get("value"):
            # This file is meant to be sourced, so the value must be a literal
            # shell word. json.dumps leaves `$`/backticks active inside double
            # quotes and turns non-ASCII characters into \uXXXX text that bash
            # never decodes; shlex.quote avoids both.
            env_lines.append(f"export {env['name']}={shlex.quote(str(env['value']))}")
        elif root_env:
            # No resolved value: leave a commented placeholder to edit.
            env_lines.append(f"# export {root_env}=/path/to/{item['bundle']}")
    (outdir / "resource_env.sh").write_text("\n".join(env_lines) + "\n", encoding="utf-8")

    write_resource_dashboard(inventory, outdir / "resource_dashboard.md")
    return {
        "resource_inventory": str(outdir / "resource_inventory.json"),
        "resource_inventory_tsv": str(outdir / "resource_inventory.tsv"),
        "resource_env": str(outdir / "resource_env.sh"),
        "resource_dashboard": str(outdir / "resource_dashboard.md"),
    }
kind=item.get("kind", ""), + bundle=item.get("bundle", ""), + display_name=item.get("display_name", ""), + ok=str(item.get("ok", False)).lower(), + root=item.get("root") or "not configured", + missing_count=item.get("missing_count", 0), + required=", ".join(item.get("pipelines_required", [])) or "", + optional=", ".join(item.get("pipelines_optional", [])) or "", + ) + ) + missing_items = [item for item in inventory.get("bundles", []) if not item.get("ok")] + if missing_items: + lines.extend(["", "## Missing Bundle Details", ""]) + for item in missing_items: + lines.extend( + [ + f"### `{item.get('bundle')}`", + "", + f"Root: `{item.get('root') or 'not configured'}`", + f"Env var: `{item.get('root_env') or 'none'}`", + ] + ) + missing = item.get("missing", []) + if missing: + lines.append("Missing files:") + lines.extend(f"- `{entry}`" for entry in missing) + else: + lines.append("Missing files: root not configured or bundle contract unavailable.") + if item.get("license_note"): + lines.append(f"License/source note: {item['license_note']}") + setup = item.get("suggested_setup", []) + if setup: + lines.extend(["", "Suggested setup:", "", "```bash"]) + lines.extend(str(command) for command in setup) + lines.append("```") + lines.append("") + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("\n".join(lines).rstrip() + "\n", encoding="utf-8") + + +def resource_lock_from_inventory(inventory: dict[str, Any]) -> dict[str, Any]: + bundles: list[dict[str, Any]] = [] + for item in inventory.get("bundles", []): + check = item.get("check", {}) + bundles.append( + { + "kind": item.get("kind"), + "bundle": item.get("bundle"), + "display_name": item.get("display_name"), + "ok": bool(item.get("ok")), + "root": item.get("root"), + "root_env": item.get("root_env"), + "required_file_count": item.get("required_file_count", 0), + "missing": list(item.get("missing", [])), + "files": list(check.get("files", [])), + "metadata": check.get("metadata", {}), + "source": 
def resource_lock_rows(lock: dict[str, Any]) -> list[dict[str, Any]]:
    """Flatten a resource lock into one table row per locked file.

    Bundle-level fields are repeated on each of the bundle's file rows;
    bundles with no ``files`` entries contribute no rows. Booleans are
    rendered as lowercase ``"true"``/``"false"`` strings and pipeline lists
    are joined with ``;``.
    """
    rows: list[dict[str, Any]] = []
    for bundle in lock.get("bundles", []):
        for file_record in bundle.get("files", []):
            size = file_record.get("bytes")
            rows.append(
                {
                    "kind": bundle.get("kind", ""),
                    "bundle": bundle.get("bundle", ""),
                    "display_name": bundle.get("display_name", ""),
                    "bundle_ok": str(bundle.get("ok", False)).lower(),
                    "root": bundle.get("root") or "",
                    "path": file_record.get("path", ""),
                    "resolved_path": file_record.get("resolved_path") or "",
                    "exists": str(file_record.get("exists", False)).lower(),
                    # Only an absent size becomes an empty cell. `size or ""`
                    # would also blank out a real size of 0 (an empty file).
                    "bytes": "" if size is None else size,
                    "sha256": file_record.get("sha256") or "",
                    "sha256_skipped_reason": file_record.get("sha256_skipped_reason") or "",
                    "pipelines_required": ";".join(bundle.get("pipelines_required", [])),
                    "pipelines_optional": ";".join(bundle.get("pipelines_optional", [])),
                }
            )
    return rows
+ "sha256_skipped_reason", + "pipelines_required", + "pipelines_optional", + ] + write_tsv(outdir / "resource_lock.tsv", resource_lock_rows(lock), fieldnames) + write_resource_lock_summary(lock, outdir / "resource_lock.md") + return { + "resource_lock": str(outdir / "resource_lock.json"), + "resource_lock_tsv": str(outdir / "resource_lock.tsv"), + "resource_lock_summary": str(outdir / "resource_lock.md"), + } + + +def write_resource_lock_summary(lock: dict[str, Any], path: Path) -> None: + lines = [ + "# NGS Resource Lockfile", + "", + f"Created: `{lock.get('created_at')}`", + f"Schema: `{lock.get('schema_version')}`", + f"Ready bundles: `{lock.get('ready_count')}/{lock.get('bundle_count')}`", + f"All ready at lock time: `{str(lock.get('ok')).lower()}`", + "", + "| Kind | Bundle | Ready | Root | Files | Missing | Checksummed |", + "|---|---|---:|---|---:|---:|---:|", + ] + for bundle in lock.get("bundles", []): + files = bundle.get("files", []) + checksummed = sum(1 for item in files if item.get("sha256")) + missing = len(bundle.get("missing", [])) + lines.append( + "| {kind} | `{bundle}` | `{ok}` | `{root}` | {files} | {missing} | {checksummed} |".format( + kind=bundle.get("kind", ""), + bundle=bundle.get("bundle", ""), + ok=str(bundle.get("ok", False)).lower(), + root=bundle.get("root") or "not configured", + files=len(files), + missing=missing, + checksummed=checksummed, + ) + ) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("\n".join(lines).rstrip() + "\n", encoding="utf-8") + + +def locked_file_path( + bundle: dict[str, Any], file_record: dict[str, Any], bundle_roots: dict[str, Path] +) -> Path | None: + bundle_name = str(bundle.get("bundle") or "") + if bundle_name in bundle_roots: + return (bundle_roots[bundle_name] / str(file_record.get("path", ""))).expanduser().resolve() + resolved = file_record.get("resolved_path") + if resolved: + return Path(str(resolved)).expanduser().resolve() + root = bundle.get("root") + if root: + return 
def verify_resource_lock(
    lock: dict[str, Any],
    *,
    bundle_roots: dict[str, Path] | None = None,
    verify_checksums: bool = True,
) -> dict[str, Any]:
    """Compare every file recorded in *lock* with the current filesystem.

    Each locked file is classified with the first issue that applies:

    * ``missing_now`` - recorded as existing, absent now;
    * ``was_missing_now_present`` - recorded as absent, present now;
    * ``bytes_changed`` - a recorded size differs from the current size;
    * ``sha256_changed`` - only evaluated when sizes did not differ, the lock
      holds a checksum, *verify_checksums* is true and the path is a file.

    Args:
        lock: Lock payload with a ``bundles`` list of file records.
        bundle_roots: Optional per-bundle root overrides passed to
            ``locked_file_path``.
        verify_checksums: When false, SHA256 values are never recomputed.

    Returns:
        Verification report with one row per locked file, the mismatching
        rows, and ``ok`` set only when the lock itself was ``ok`` and no row
        mismatched.
    """
    roots = bundle_roots or {}
    rows: list[dict[str, Any]] = []
    for bundle in lock.get("bundles", []):
        bundle_name = str(bundle.get("bundle") or "")
        for file_record in bundle.get("files", []):
            path = locked_file_path(bundle, file_record, roots)
            present_now = bool(path and path.exists())
            present_at_lock = bool(file_record.get("exists"))
            # Sizes and checksums only make sense for regular files.
            is_regular_file = bool(present_now and path.is_file())
            size_now = path.stat().st_size if is_regular_file else None
            size_at_lock = file_record.get("bytes")
            sha_at_lock = file_record.get("sha256")
            sha_now = None

            issue = ""
            if present_at_lock != present_now:
                issue = "missing_now" if present_at_lock else "was_missing_now_present"
            elif present_at_lock:
                # Present both at lock time and now: compare content.
                if size_at_lock is not None and size_now != size_at_lock:
                    issue = "bytes_changed"
                elif sha_at_lock and verify_checksums and is_regular_file:
                    sha_now = sha256_file(path)
                    if sha_now != sha_at_lock:
                        issue = "sha256_changed"

            rows.append(
                {
                    "bundle": bundle_name,
                    "kind": bundle.get("kind", ""),
                    "path": file_record.get("path", ""),
                    "resolved_path": str(path) if path else "",
                    "expected_exists": str(present_at_lock).lower(),
                    "current_exists": str(present_now).lower(),
                    "expected_bytes": size_at_lock if size_at_lock is not None else "",
                    "current_bytes": size_now if size_now is not None else "",
                    "expected_sha256": sha_at_lock or "",
                    "current_sha256": sha_now or "",
                    "status": "mismatch" if issue else "matched",
                    "issue": issue,
                }
            )

    mismatches = [row for row in rows if row["status"] != "matched"]
    original_ok = bool(lock.get("ok"))
    return {
        "created_at": now_iso(),
        "schema_version": "ngs_resource_lock_verification/v0.1",
        "lock_created_at": lock.get("created_at"),
        "lock_schema_version": lock.get("schema_version"),
        "original_lock_ok": original_ok,
        "ok": original_ok and not mismatches,
        "verified_file_count": len(rows),
        "mismatch_count": len(mismatches),
        "mismatches": mismatches,
        "rows": rows,
    }
"", + ] + if verification.get("mismatches"): + lines.extend( + ["## Mismatches", "", "| Bundle | Path | Issue | Current Path |", "|---|---|---|---|"] + ) + for item in verification["mismatches"]: + lines.append( + f"| `{item.get('bundle')}` | `{item.get('path')}` | `{item.get('issue')}` | `{item.get('resolved_path')}` |" + ) + else: + lines.append("All locked files match the lockfile state.") + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("\n".join(lines).rstrip() + "\n", encoding="utf-8") + + +def list_bundles(registries: dict[str, Any]) -> dict[str, Any]: + return { + "created_at": now_iso(), + "references": { + name: { + "display_name": payload.get("display_name", name), + "genome_build": payload.get("genome_build"), + "root": payload.get("root") or f"${payload.get('root_env')}" + if payload.get("root_env") + else payload.get("root"), + "required_files": payload.get("required_files", []), + } + for name, payload in sorted(registries.get("references", {}).items()) + }, + "databases": { + name: { + "display_name": payload.get("display_name", name), + "database_family": payload.get("database_family"), + "version": payload.get("version"), + "root": payload.get("root") or f"${payload.get('root_env')}" + if payload.get("root_env") + else payload.get("root"), + "required_files": payload.get("required_files", []), + } + for name, payload in sorted(registries.get("databases", {}).items()) + }, + } + + +def explain_missing(check: dict[str, Any]) -> str: + if check.get("ok"): + return f"{check['bundle']} is present under {check.get('root')}." 
+ lines = [ + f"{check.get('bundle')} is not ready.", + f"Root: {check.get('root') or 'not configured'}", + ] + if check.get("error"): + lines.append(f"Error: {check['error']}") + if check.get("missing"): + lines.append("Missing files:") + lines.extend(f"- {item}" for item in check["missing"]) + license_note = check.get("metadata", {}).get("license_note") + if license_note: + lines.append(f"License/source note: {license_note}") + return "\n".join(lines) + "\n" + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--reference-registry", type=Path, default=DEFAULT_REFERENCE_REGISTRY) + parser.add_argument("--database-registry", type=Path, default=DEFAULT_DATABASE_REGISTRY) + sub = parser.add_subparsers(dest="command", required=True) + + sub.add_parser("list", help="List known reference and database bundle contracts.") + + check = sub.add_parser("check", help="Check one named bundle.") + check.add_argument("--kind", choices=["reference", "database"], required=True) + check.add_argument("--bundle", required=True) + check.add_argument("--root", type=Path) + check.add_argument("--include-checksums", action="store_true") + check.add_argument("--output", type=Path) + + explain = sub.add_parser( + "explain-missing", help="Print a human-readable missing-resource explanation." + ) + explain.add_argument("--kind", choices=["reference", "database"], required=True) + explain.add_argument("--bundle", required=True) + explain.add_argument("--root", type=Path) + + plan = sub.add_parser("plan", help="Create a pipeline-aware reference/database readiness plan.") + plan.add_argument("--pipeline", required=True) + plan.add_argument( + "--genome-build", + help="Genome build or alias for genome-backed pipelines, e.g. GRCh38, hg38, GRCm39, mm39, or a configured local alias.", + ) + plan.add_argument( + "--bundle-root", + action="append", + default=[], + help="Override a bundle root as bundle=/path. 
May be repeated.", + ) + plan.add_argument( + "--include-optional", + action="store_true", + help="Include optional databases such as Bracken/HUMAnN or alternate taxonomy bundles.", + ) + plan.add_argument("--include-checksums", action="store_true") + plan.add_argument( + "--outdir", + type=Path, + help="Write resource_plan.json, resource_manifest.tsv, resource_env.sh, resource_readiness.md, and setup-plan artifacts.", + ) + + setup_plan = sub.add_parser( + "setup-plan", help="Create an actionable setup checklist for missing pipeline resources." + ) + setup_plan.add_argument("--pipeline", required=True) + setup_plan.add_argument( + "--genome-build", + help="Genome build or alias for genome-backed pipelines, e.g. GRCh38, hg38, GRCm39, mm39, or a configured local alias.", + ) + setup_plan.add_argument( + "--bundle-root", + action="append", + default=[], + help="Override a bundle root as bundle=/path. May be repeated.", + ) + setup_plan.add_argument( + "--include-optional", + action="store_true", + help="Include optional databases such as Bracken/HUMAnN or alternate taxonomy bundles.", + ) + setup_plan.add_argument("--include-checksums", action="store_true") + setup_plan.add_argument( + "--include-ready", + action="store_true", + help="Include already-ready bundles in the setup plan for documentation.", + ) + setup_plan.add_argument( + "--outdir", + type=Path, + required=True, + help="Write resource_setup_plan.json, .tsv, .md, and resource_setup_commands.sh.", + ) + setup_plan.add_argument( + "--fail-on-blocking", + action="store_true", + help="Exit non-zero when required resources need setup.", + ) + + check_all = sub.add_parser("check-all", help="Check every known bundle contract.") + check_all.add_argument("--kind", choices=["all", "reference", "database"], default="all") + check_all.add_argument( + "--bundle-root", + action="append", + default=[], + help="Override a bundle root as bundle=/path. 
May be repeated.", + ) + check_all.add_argument("--include-checksums", action="store_true") + check_all.add_argument("--output", type=Path) + + inventory = sub.add_parser( + "inventory", help="Write a project-level reference/database inventory dashboard." + ) + inventory.add_argument("--kind", choices=["all", "reference", "database"], default="all") + inventory.add_argument( + "--bundle-root", + action="append", + default=[], + help="Override a bundle root as bundle=/path. May be repeated.", + ) + inventory.add_argument("--include-checksums", action="store_true") + inventory.add_argument( + "--outdir", + type=Path, + help="Write resource_inventory.json, resource_inventory.tsv, resource_env.sh, and resource_dashboard.md.", + ) + inventory.add_argument( + "--fail-on-missing", + action="store_true", + help="Exit non-zero when any inventoried bundle is incomplete.", + ) + + lock = sub.add_parser( + "lock", help="Create a reproducible resource lockfile from the current inventory." + ) + lock.add_argument("--kind", choices=["all", "reference", "database"], default="all") + lock.add_argument( + "--bundle-root", + action="append", + default=[], + help="Override a bundle root as bundle=/path. May be repeated.", + ) + lock.add_argument( + "--include-checksums", + action="store_true", + help="Record SHA256 checksums for files below the checksum threshold.", + ) + lock.add_argument( + "--outdir", + type=Path, + required=True, + help="Write resource_lock.json, resource_lock.tsv, and resource_lock.md.", + ) + lock.add_argument( + "--fail-on-missing", + action="store_true", + help="Exit non-zero when the lock captures incomplete bundles.", + ) + + verify_lock = sub.add_parser( + "verify-lock", help="Verify a resource lockfile against the current filesystem." + ) + verify_lock.add_argument("--lockfile", type=Path, required=True) + verify_lock.add_argument( + "--bundle-root", + action="append", + default=[], + help="Override a locked bundle root as bundle=/path. 
May be repeated.", + ) + verify_lock.add_argument( + "--skip-checksums", + action="store_true", + help="Skip SHA256 comparison even when the lockfile contains checksums.", + ) + verify_lock.add_argument( + "--outdir", type=Path, help="Write resource_lock_verification.json, .tsv, and .md." + ) + verify_lock.add_argument( + "--fail-on-mismatch", + action="store_true", + help="Exit non-zero when the lock is incomplete or files no longer match.", + ) + + return parser.parse_args() + + +def main() -> int: + args = parse_args() + registries = load_registries(args.reference_registry, args.database_registry) + if args.command == "list": + print(json.dumps(list_bundles(registries), indent=2, sort_keys=True)) + return 0 + if args.command == "check": + result = check_named_bundle( + args.bundle, + kind=args.kind, + root=args.root, + include_checksums=args.include_checksums, + registries=registries, + ) + if args.output: + write_json(args.output, result) + print(json.dumps(result, indent=2, sort_keys=True)) + return 0 if result.get("ok") else 1 + if args.command == "explain-missing": + result = check_named_bundle( + args.bundle, kind=args.kind, root=args.root, registries=registries + ) + print(explain_missing(result), end="") + return 0 if result.get("ok") else 1 + if args.command == "plan": + result = plan_pipeline_resources( + args.pipeline, + genome_build=args.genome_build, + bundle_roots=parse_bundle_roots(args.bundle_root), + include_optional=args.include_optional, + include_checksums=args.include_checksums, + registries=registries, + ) + if args.outdir: + result["outputs"] = write_resource_plan_outputs( + result, args.outdir.expanduser().resolve() + ) + print(json.dumps(result, indent=2, sort_keys=True)) + return 0 if result.get("ok") else 1 + if args.command == "setup-plan": + resource_plan = plan_pipeline_resources( + args.pipeline, + genome_build=args.genome_build, + bundle_roots=parse_bundle_roots(args.bundle_root), + include_optional=args.include_optional, + 
include_checksums=args.include_checksums, + registries=registries, + ) + result = setup_plan_from_resource_plan(resource_plan, include_ready=args.include_ready) + result["resource_plan"] = resource_plan + result["outputs"] = write_resource_setup_plan_outputs( + result, args.outdir.expanduser().resolve() + ) + print(json.dumps(result, indent=2, sort_keys=True)) + return 1 if args.fail_on_blocking and result.get("blocking_count") else 0 + if args.command == "check-all": + result = check_all_bundles( + kind=args.kind, + bundle_roots=parse_bundle_roots(args.bundle_root), + include_checksums=args.include_checksums, + registries=registries, + ) + if args.output: + write_json(args.output, result) + print(json.dumps(result, indent=2, sort_keys=True)) + return 0 if result.get("ok") else 1 + if args.command == "inventory": + result = inventory_resources( + kind=args.kind, + bundle_roots=parse_bundle_roots(args.bundle_root), + include_checksums=args.include_checksums, + registries=registries, + ) + if args.outdir: + result["outputs"] = write_resource_inventory_outputs( + result, args.outdir.expanduser().resolve() + ) + print(json.dumps(result, indent=2, sort_keys=True)) + return 1 if args.fail_on_missing and not result.get("ok") else 0 + if args.command == "lock": + inventory = inventory_resources( + kind=args.kind, + bundle_roots=parse_bundle_roots(args.bundle_root), + include_checksums=args.include_checksums, + registries=registries, + ) + result = resource_lock_from_inventory(inventory) + result["outputs"] = write_resource_lock_outputs(result, args.outdir.expanduser().resolve()) + print(json.dumps(result, indent=2, sort_keys=True)) + return 1 if args.fail_on_missing and not result.get("ok") else 0 + if args.command == "verify-lock": + lock_payload = read_json(args.lockfile.expanduser().resolve()) + result = verify_resource_lock( + lock_payload, + bundle_roots=parse_bundle_roots(args.bundle_root), + verify_checksums=not args.skip_checksums, + ) + if args.outdir: + 
def relative_outputs(outputs: dict[str, str], run_dir: Path) -> dict[str, str]:
    """Rewrite output paths so they are relative to the run directory.

    Args:
        outputs: Mapping of logical output name to a filesystem path.
        run_dir: Run directory the paths should be expressed against.

    Returns:
        A new mapping with the same keys. Paths inside ``run_dir`` become
        run-relative strings; a path that resolves outside ``run_dir`` (for
        example through a symlink) is kept as its resolved absolute path
        instead of raising ``ValueError``.
    """
    run_root = run_dir.resolve()
    relative: dict[str, str] = {}
    for key, value in outputs.items():
        resolved = Path(value).resolve()
        try:
            relative[key] = str(resolved.relative_to(run_root))
        except ValueError:
            # Not under the run directory: an absolute path is still usable,
            # whereas an exception would abort the whole resource gate.
            relative[key] = str(resolved)
    return relative
def merge_resource_status(
    validation: dict[str, Any],
    resource_plan: dict[str, Any] | None,
    *,
    required: bool = False,
) -> dict[str, Any]:
    """Fold a resource readiness plan into a validation result.

    Args:
        validation: Existing validation payload; it is copied, not mutated.
        resource_plan: Plan produced by the resource gate, or ``None`` when
            the readiness check was skipped.
        required: When true, an unready plan adds errors and fails ``ok``;
            otherwise its messages are recorded as advisory warnings.

    Returns:
        A copy of ``validation`` with ``resource_plan_*`` fields, extended
        ``errors``/``warnings`` lists and a recomputed ``ok`` flag.
    """
    combined = dict(validation)
    error_list = list(combined.get("errors", []))
    warning_list = list(combined.get("warnings", []))
    plan_blocks_run = False

    if resource_plan is not None:
        plan_ok = bool(resource_plan.get("ok"))
        fallback_mode = "required" if required else "advisory"
        combined["resource_plan_ok"] = plan_ok
        combined["resource_plan_skipped"] = False
        combined["resource_plan_mode"] = resource_plan.get("gate_mode", fallback_mode)
        combined["resource_plan_path"] = resource_plan.get("outputs", {}).get("resource_plan")
        combined["missing_required_resources"] = resource_plan.get("missing_required", [])
        problems = resource_messages(resource_plan)
        if required:
            error_list.extend(problems)
        else:
            warning_list.extend(f"advisory resource check: {problem}" for problem in problems)
        # Only a required gate can turn an unready plan into a failed run.
        plan_blocks_run = required and not plan_ok
    else:
        combined["resource_plan_ok"] = None
        combined["resource_plan_skipped"] = True
        warning_list.append(
            "resource readiness plan was skipped; reference/database bundle contents were not checked"
        )

    combined["errors"] = error_list
    combined["warnings"] = warning_list
    combined["ok"] = bool(validation.get("ok")) and not plan_blocks_run
    return combined
def resource_summary_lines(resource_plan: dict[str, Any] | None) -> list[str]:
    """Render the "Resource Readiness" section of a run summary as Markdown lines.

    Args:
        resource_plan: Plan produced by the resource gate, or ``None`` when
            the readiness check was skipped.

    Returns:
        An empty list when there is no plan; otherwise the section heading,
        four status lines, one bullet per entry in ``resources`` and a
        trailing blank line.
    """
    if resource_plan is None:
        return []

    mode = resource_plan.get("gate_mode", "required")
    ready = str(resource_plan.get("ok")).lower()
    contract = resource_plan.get("pipeline")
    setup_plan = resource_plan.get("outputs", {}).get(
        "resource_setup_summary", "resources/resource_setup_plan.md"
    )
    summary = [
        "## Resource Readiness",
        "",
        f"Mode: `{mode}`",
        f"Ready: `{ready}`",
        f"Resource contract: `{contract}`",
        f"Setup plan: `{setup_plan}`",
    ]
    for resource in resource_plan.get("resources", []):
        necessity = "required" if resource.get("required") else "optional"
        readiness = "ready" if resource.get("ok") else "missing"
        summary.append(
            f"- `{resource.get('bundle')}` ({resource.get('kind')}, {necessity}): {readiness}"
        )
    summary.append("")
    return summary
def slug_timestamp(label: str) -> str:
    """Return a filesystem-friendly ``<timestamp>-<label>`` run identifier.

    Args:
        label: Free-text label; surrounding whitespace is stripped and
            underscores become hyphens.

    Returns:
        The local time formatted as ``YYYY-MM-DDTHH-MM-SS`` followed by a
        hyphen and the normalized label.
    """
    safe_label = label.strip().replace("_", "-")
    # Format the timestamp first and append the label afterwards: passing the
    # label through strftime() would expand any "%" it contains as a directive.
    stamp = datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
    return f"{stamp}-{safe_label}"
def run_cmd_stdout_to_file(
    cmd: list[str],
    cwd: Path,
    stdout_path: Path,
    timeout: int | None = None,
) -> dict[str, Any]:
    """Run ``cmd`` with stdout streamed to a file and stderr captured.

    Args:
        cmd: Argument vector to execute (no shell).
        cwd: Working directory for the child process.
        stdout_path: File that receives the child's stdout; parent
            directories are created as needed.
        timeout: Optional timeout in seconds.

    Returns:
        A result record with ``cmd``, ``cwd``, ``started_at``,
        ``finished_at``, ``returncode``, ``ok``, ``stdout_path``,
        ``stdout_tail`` and ``stderr_tail``. On timeout ``returncode`` is
        ``None``, ``ok`` is false and an ``error`` message is added.
    """

    def _text_tail(stream: Any) -> str:
        # TimeoutExpired carries captured output as bytes even with text=True,
        # so decode instead of discarding it.
        if isinstance(stream, bytes):
            stream = stream.decode("utf-8", errors="replace")
        return stream[-12000:] if isinstance(stream, str) else ""

    def _stdout_preview() -> str:
        if not stdout_path.exists():
            return ""
        # The child writes raw bytes straight to the file descriptor, so the
        # content is not guaranteed to be valid UTF-8.
        return stdout_path.read_text(encoding="utf-8", errors="replace")[-12000:]

    started = now_iso()
    stdout_path.parent.mkdir(parents=True, exist_ok=True)
    try:
        with stdout_path.open("w", encoding="utf-8") as handle:
            result = subprocess.run(
                cmd,
                cwd=cwd,
                check=False,
                stdout=handle,
                stderr=subprocess.PIPE,
                text=True,
                timeout=timeout,
                env=_effective_env(),
            )
    except subprocess.TimeoutExpired as exc:
        preview = _stdout_preview()
        return {
            "cmd": cmd,
            "cwd": str(cwd),
            "started_at": started,
            "finished_at": now_iso(),
            "returncode": None,
            "ok": False,
            "stdout_path": str(stdout_path),
            "error": f"TimeoutExpired: exceeded {timeout}s",
            "stdout_tail": preview,
            "stderr_tail": _text_tail(exc.stderr),
        }
    preview = _stdout_preview()
    return {
        "cmd": cmd,
        "cwd": str(cwd),
        "started_at": started,
        "finished_at": now_iso(),
        "returncode": result.returncode,
        "ok": result.returncode == 0,
        "stdout_path": str(stdout_path),
        "stdout_tail": preview,
        "stderr_tail": _text_tail(result.stderr),
    }
{"name": name, "present": module_present(name)} + + +def tool_preflight(required: list[str], optional: list[str] | None = None) -> dict[str, Any]: + optional = optional or [] + checked = [] + for name in required + optional: + checked.append(executable_status(name)) + missing_required = [item["name"] for item in checked[: len(required)] if not item["present"]] + return { + "ok": not missing_required, + "required": required, + "optional": optional, + "checked": checked, + "missing_required": missing_required, + } + + +def software_versions(commands: dict[str, list[str]]) -> dict[str, str | None]: + versions: dict[str, str | None] = {} + for name, cmd in commands.items(): + if not command_path(cmd[0]): + versions[name] = None + continue + result = run_cmd(cmd, Path.cwd(), timeout=30) + detail = result.get("stdout_tail") or result.get("error") or "" + versions[name] = "\n".join(str(detail).splitlines()[:3]).strip() or None + return versions + + +def _iter_existing_paths(value: Any) -> list[Path]: + paths: list[Path] = [] + if isinstance(value, dict): + for item in value.values(): + paths.extend(_iter_existing_paths(item)) + elif isinstance(value, (list, tuple, set)): + for item in value: + paths.extend(_iter_existing_paths(item)) + elif isinstance(value, str): + candidate = Path(value).expanduser() + if candidate.exists() and candidate.is_file(): + paths.append(candidate.resolve()) + return paths + + +def input_checksums(inputs: dict[str, Any] | None) -> list[dict[str, Any]]: + seen: set[Path] = set() + checksums: list[dict[str, Any]] = [] + for path in _iter_existing_paths(inputs or {}): + if path in seen: + continue + seen.add(path) + size = path.stat().st_size + record: dict[str, Any] = {"path": str(path), "bytes": size} + if size <= MAX_AUTO_CHECKSUM_BYTES: + record["sha256"] = sha256_file(path) + else: + record["sha256"] = None + record["sha256_skipped_reason"] = ( + f"file exceeds {MAX_AUTO_CHECKSUM_BYTES} bytes auto-checksum threshold" + ) + 
checksums.append(record) + return sorted(checksums, key=lambda item: item["path"]) + + +def _flatten_declared_paths(value: Any, prefix: str = "") -> list[dict[str, str]]: + rows: list[dict[str, str]] = [] + if isinstance(value, dict): + for key, item in value.items(): + next_prefix = f"{prefix}.{key}" if prefix else str(key) + rows.extend(_flatten_declared_paths(item, next_prefix)) + elif isinstance(value, (list, tuple, set)): + for index, item in enumerate(value): + rows.extend(_flatten_declared_paths(item, f"{prefix}[{index}]")) + elif isinstance(value, str) and value.strip(): + rows.append({"logical_name": prefix or "value", "path": value}) + return rows + + +def _resolve_run_relative_path(run_dir: Path, raw_path: str) -> Path | None: + if any(char in raw_path for char in "*?[]{}"): + return None + candidate = Path(raw_path).expanduser() + if candidate.is_absolute(): + return candidate + return (run_dir / candidate).resolve() + + +def write_io_lineage( + run_dir: Path, inputs: dict[str, Any] | None, outputs: dict[str, Any] | None +) -> Path: + input_records = _flatten_declared_paths(inputs or {}) + output_records = _flatten_declared_paths(outputs or {}) + source_inputs = ";".join(sorted(record["logical_name"] for record in input_records)) + + lines = [ + "\t".join( + [ + "record_type", + "logical_name", + "path", + "exists", + "bytes", + "sha256", + "source_inputs", + ] + ) + ] + for record in input_records: + resolved = _resolve_run_relative_path(run_dir, record["path"]) + exists = bool(resolved and resolved.exists() and resolved.is_file()) + size = str(resolved.stat().st_size) if exists and resolved else "" + checksum = ( + sha256_file(resolved) + if exists and resolved and resolved.stat().st_size <= MAX_AUTO_CHECKSUM_BYTES + else "" + ) + lines.append( + "\t".join( + [ + "input", + record["logical_name"], + record["path"], + "true" if exists else "false", + size, + checksum, + "", + ] + ) + ) + for record in output_records: + resolved = 
def plugin_metadata() -> dict[str, Any]:
    """Describe the plugin that owns this script, for run provenance.

    Every field is best effort: anything that cannot be determined stays
    ``None`` rather than failing the caller.

    Returns:
        A dict with ``name``, ``version`` and ``repository`` (read from
        ``.codex-plugin/plugin.json``), ``source_root`` (the plugin root
        directory) and ``git_commit`` (``git rev-parse HEAD`` of the
        enclosing checkout).
    """
    root = _find_plugin_root()
    metadata: dict[str, Any] = {
        "name": None,
        "version": None,
        "repository": None,
        "source_root": str(root) if root else None,
        "git_commit": None,
    }
    if root is None:
        return metadata

    plugin_json = root / ".codex-plugin" / "plugin.json"
    try:
        payload = json.loads(plugin_json.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # ValueError covers both JSONDecodeError and UnicodeDecodeError.
        payload = None
    if isinstance(payload, dict):  # valid JSON that is not an object has no fields
        for field in ("name", "version", "repository"):
            metadata[field] = payload.get(field)

    git_root = next(
        (candidate for candidate in [root, *root.parents] if (candidate / ".git").exists()), None
    )
    if git_root is not None:
        try:
            revision = run_cmd(["git", "rev-parse", "HEAD"], git_root, timeout=15)
        except OSError:
            # git is not installed or not executable; run_cmd only converts
            # timeouts into a result record.
            revision = {}
        if revision.get("ok") and revision.get("stdout_tail"):
            metadata["git_commit"] = str(revision["stdout_tail"]).splitlines()[0].strip()
    return metadata
def build_artifact_index(
    run_dir: Path,
    patterns: list[str] | None = None,
    extra_roots: dict[str, Path] | None = None,
) -> dict[str, Any]:
    """Index the files of a run directory with size, mtime and checksum.

    Args:
        run_dir: Run directory whose files are globbed with ``patterns``.
        patterns: Glob patterns relative to ``run_dir``; defaults to the
            standard run-envelope layout listed below.
        extra_roots: Optional ``label -> directory`` mapping. Every file under
            an existing root is indexed with its path prefixed by ``label/``.

    Returns:
        A dict with ``created_at``, ``checksum_algorithm`` and ``artifacts``
        (records sorted by ``path``). Files larger than
        ``MAX_AUTO_CHECKSUM_BYTES`` get an empty ``sha256`` and a
        ``sha256_skipped_reason``.
    """
    patterns = patterns or [
        "config*",
        "commands.sh",
        "run_manifest.json",
        "summary.md",
        "artifact_index.json",
        "validation/**/*",
        "logs/**/*",
        "versions/**/*",
        "workflow/**/*",
        "fastqc/**/*",
        "multiqc/**/*",
        "rnaseq_salmon/**/*",
        "qc/**/*",
        "results/**/*",
        "plots/**/*",
        "visualizations/**/*",
        "tables/**/*",
        "notebooks/**/*",
        "variants/**/*",
        "alignment/**/*",
        "peaks/**/*",
        "tracks/**/*",
        "motifs/**/*",
        "consensus/**/*",
        "f1r2/**/*",
        "functional_profile/**/*",
        "taxonomic_classification/**/*",
        "bcl/**/*",
        "demux/**/*",
        "methods/**/*",
        "manifest/**/*",
        "resources/**/*",
        "*.json",
    ]
    artifacts: list[dict[str, Any]] = []
    seen: set[Path] = set()

    def collect(root_label: str | None, root: Path, root_patterns: list[str]) -> None:
        prefix = "" if not root_label else f"{root_label}/"
        for pattern in root_patterns:
            for path in root.glob(pattern):
                if not path.is_file() or path in seen:
                    continue
                seen.add(path)
                # One stat() per file so the reported size, the mtime and the
                # checksum decision all describe the same snapshot.
                info = path.stat()
                within_limit = info.st_size <= MAX_AUTO_CHECKSUM_BYTES
                artifacts.append(
                    {
                        "path": f"{prefix}{path.relative_to(root)}",
                        "bytes": info.st_size,
                        "modified_at": datetime.fromtimestamp(info.st_mtime)
                        .astimezone()
                        .isoformat(timespec="seconds"),
                        "sha256": sha256_file(path) if within_limit else "",
                        "sha256_skipped_reason": (
                            None
                            if within_limit
                            else f"file exceeds {MAX_AUTO_CHECKSUM_BYTES} bytes auto-checksum threshold"
                        ),
                    }
                )

    collect(None, run_dir, patterns)
    if extra_roots:
        for label, root in extra_roots.items():
            if root.exists():
                collect(label, root, ["**/*"])
    return {
        "created_at": now_iso(),
        "checksum_algorithm": "sha256",
        "artifacts": sorted(artifacts, key=lambda item: item["path"]),
    }
def localhost_url_for_path(
    relative_path: str | Path, *, port: int = 8765, host: str = "127.0.0.1"
) -> str:
    """Build the local HTTP URL that serves a run-relative file.

    Args:
        relative_path: File path relative to the served directory; leading
            slashes are dropped. Expected to be a plain, not yet
            percent-encoded path.
        port: Port of the local HTTP server.
        host: Host the server is bound to.

    Returns:
        ``http://<host>:<port>/<path>`` with the path percent-encoded so that
        names containing spaces or other reserved characters form a valid URL.
    """
    from urllib.parse import quote

    rel = Path(relative_path).as_posix().lstrip("/")
    # quote() keeps "/" by default, so only the segment contents are encoded.
    return f"http://{host}:{port}/{quote(rel)}"
def write_localhost_launch_hint(
    run_dir: Path,
    *,
    report_entries: list[tuple[str, str | Path]],
    port: int = 8765,
    host: str = "127.0.0.1",
) -> Path:
    """Write a text file explaining how to serve the run directory locally.

    Args:
        run_dir: Run directory to serve.
        report_entries: ``(label, run-relative path)`` pairs; only entries
            whose file exists are listed with their localhost URL.
        port: Port for the suggested ``http.server`` command.
        host: Bind address for the suggested ``http.server`` command.

    Returns:
        Path of ``visualizations/localhost_launch_hint.txt`` inside ``run_dir``.
    """
    python_cmd = preferred_http_server_python()
    # Quote both paths so the hint can be pasted into a shell even when the
    # run directory or interpreter path contains spaces.
    lines = [
        f"cd {shlex.quote(str(run_dir))}",
        f"{shlex.quote(python_cmd)} -m http.server {port} --bind {host}",
        "",
    ]
    report_lines = [
        f"{label}: {localhost_url_for_path(rel_path, port=port, host=host)}"
        for label, rel_path in report_entries
        if (run_dir / rel_path).exists()
    ]
    if report_lines:
        lines.extend(report_lines)
    else:
        lines.append("No MultiQC reports were generated for this run.")
    path = run_dir / "visualizations" / "localhost_launch_hint.txt"
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(lines) + "\n", encoding="utf-8")
    return path
def wait_for_http_ready(url: str, *, timeout_seconds: float = 12.0) -> bool:
    """Poll ``url`` until a server answers or the timeout elapses.

    Any response with a status below 500 counts as ready, including 4xx: the
    server is up even if that particular path is not served.

    Args:
        url: URL to request.
        timeout_seconds: Total time budget for polling.

    Returns:
        True as soon as the server answers with a status below 500, False if
        that never happens within ``timeout_seconds``.
    """
    # A monotonic clock keeps the deadline stable across wall-clock changes.
    deadline = time.monotonic() + timeout_seconds
    while time.monotonic() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=1.5) as response:  # noqa: S310
                if response.status < 500:
                    return True
        except urllib.error.HTTPError as exc:
            # urlopen() raises for 4xx/5xx instead of returning the response,
            # so the status has to be read from the exception.
            status = exc.code
            exc.close()
            if status < 500:
                return True
        except (urllib.error.URLError, TimeoutError, OSError):
            pass
        time.sleep(0.4)
    return False
stderr=subprocess.STDOUT, + start_new_session=True, + ) + ready = wait_for_http_ready(url, timeout_seconds=12.0) + result = { + "ok": ready, + "url": url, + "port": port, + "pid": process.pid, + "log_path": str(log_path), + "command": cmd, + "xdg_cache_home": str(xdg_cache_home), + "xdg_state_home": str(xdg_state_home), + } + if not ready: + result["error"] = "Marimo review app did not become ready before timeout." + return result + + +def write_visualization_index( + run_dir: Path, + *, + title: str, + description: str, + entries: list[dict[str, Any]], + notes: list[str] | None = None, + analysis_intent: str = "real_analysis", + provenance_summary: dict[str, Any] | None = None, +) -> Path: + index_dir = run_dir / "visualizations" + index_dir.mkdir(parents=True, exist_ok=True) + notes = notes or [] + manifest_path = index_dir / "visualization_manifest.json" + write_json( + manifest_path, + { + "title": title, + "description": description, + "analysis_intent": analysis_intent, + "provenance_summary": provenance_summary or {}, + "entries": entries, + "notes": notes, + }, + ) + + rows = [] + for entry in entries: + target = entry.get("path") + if target: + target_str = str(target) + if _is_url(target_str): + href = target_str + else: + href = ( + rel_link(index_dir, run_dir / target) + if not Path(target).is_absolute() + else rel_link(index_dir, Path(target)) + ) + link = f'{html.escape(str(target))}' + else: + link = "" + rows.append( + "" + f"{html.escape(entry.get('title', ''))}" + f'{html.escape(entry.get("status", "unknown"))}' + f"{html.escape(entry.get('kind', ''))}" + f"{link}" + f"{html.escape(entry.get('description', ''))}" + "" + ) + note_items = "\n".join(f"
  • {html.escape(note)}
  • " for note in notes) + body = f""" + + + + {html.escape(title)} + + + +

    {html.escape(title)}

    +

    {html.escape(description)}

    +

    Analysis intent: {html.escape(analysis_intent)}

    + + + + {"".join(rows)} + +
    ArtifactStatusKindPathDescription
    +
    +

    Notes

    +
      {note_items}
    +
    + + +""" + index_path = index_dir / "index.html" + index_path.write_text(body, encoding="utf-8") + return index_path + + +def _read_tsv_rows(path: Path) -> tuple[list[str], list[dict[str, str]]]: + with path.open(newline="", encoding="utf-8", errors="replace") as handle: + reader = csv.DictReader(handle, delimiter="\t") + rows = [{key: (value or "").strip() for key, value in row.items()} for row in reader] + return list(reader.fieldnames or []), rows + + +def _render_html_table( + headers: list[str], rows: list[dict[str, str]], *, limit: int | None = None +) -> str: + if not headers: + return "

    No table data were available.

    " + limited_rows = rows[:limit] if limit is not None else rows + head_html = "".join(f"{html.escape(column)}" for column in headers) + row_html = [] + for row in limited_rows: + row_html.append( + "" + + "".join(f"{html.escape(str(row.get(column, '')))}" for column in headers) + + "" + ) + return ( + '
    ' + "" + f"{head_html}" + f"{''.join(row_html)}" + "
    " + "
    " + ) + + +def _summarize_status_columns( + rows: list[dict[str, str]], headers: list[str] +) -> list[dict[str, Any]]: + summaries: list[dict[str, Any]] = [] + valid = {"pass", "warn", "fail"} + for column in headers: + counts = {"pass": 0, "warn": 0, "fail": 0} + for row in rows: + value = (row.get(column) or "").strip().lower() + if value in valid: + counts[value] += 1 + if any(counts.values()): + summaries.append({"module": column, **counts}) + return sorted(summaries, key=lambda item: (-item["fail"], -item["warn"], item["module"])) + + +def write_multiqc_browser_helper( + run_dir: Path, + *, + report_path: str | Path, + title: str, + localhost_port: int = 8765, +) -> Path | None: + report_rel = Path(report_path) + report_abs = report_rel if report_rel.is_absolute() else run_dir / report_rel + if not report_abs.exists(): + return None + + multiqc_dir = report_abs.parent + helper_path = multiqc_dir / "multiqc_browser_helper.html" + data_dir = multiqc_dir / "multiqc_data" + general_stats_headers: list[str] = [] + general_stats_rows: list[dict[str, str]] = [] + if (data_dir / "multiqc_general_stats.txt").exists(): + general_stats_headers, general_stats_rows = _read_tsv_rows( + data_dir / "multiqc_general_stats.txt" + ) + + fastqc_headers: list[str] = [] + fastqc_rows: list[dict[str, str]] = [] + if (data_dir / "multiqc_fastqc.txt").exists(): + fastqc_headers, fastqc_rows = _read_tsv_rows(data_dir / "multiqc_fastqc.txt") + module_summaries = _summarize_status_columns(fastqc_rows, fastqc_headers) if fastqc_rows else [] + + try: + report_rel_to_run_dir = report_abs.relative_to(run_dir) + except ValueError: + report_rel_to_run_dir = Path(report_abs.name) + localhost_url = reachable_localhost_url_for_path(report_rel_to_run_dir, port=localhost_port) + python_cmd = preferred_http_server_python() + serve_cmd = "\n".join( + [ + f"cd {shlex.quote(str(run_dir))}", + f"{python_cmd} -m http.server {localhost_port} --bind 127.0.0.1", + ] + ) + localhost_link_html = ( + 
f'Open full MultiQC over localhost' + if localhost_url + else "The localhost review URL is not live yet. Start the server below, then reload this helper." + ) + body = f""" + + + + {html.escape(title)} + + + +

    {html.escape(title)}

    +

    This helper is a browser-safe review surface for MultiQC outputs. The raw interactive MultiQC HTML can stall under file:// in the Codex in-app browser even when the report itself is valid.

    +
    + Recommended path: use this helper for quick review. If you need the full interactive report, start the local HTTP server below and then refresh this page. +
    + + +

    Serve Locally

    +

    Run this in a terminal if the full interactive report is stuck on “Loading report..”:

    +
    {html.escape(serve_cmd)}
    + +

    General Statistics

    +

    Embedded from multiqc_data/multiqc_general_stats.txt. This table is self-contained and does not depend on the raw MultiQC app booting correctly.

    + {_render_html_table(general_stats_headers, general_stats_rows, limit=50)} + +

    FastQC Module Status Summary

    +

    Aggregated from the embedded multiqc_fastqc.txt table when present.

    + {_render_html_table(["module", "fail", "warn", "pass"], module_summaries, limit=50) if module_summaries else "

    No FastQC module-status summary was available for this report.

    "} + + +""" + helper_path.write_text(body, encoding="utf-8") + return helper_path + + +def write_marimo_review_notebook( + notebook_path: Path, + *, + title: str, + run_dir: Path, + image_items: list[tuple[str, str]], + table_items: list[tuple[str, str]], + object_items: list[tuple[str, str]] | None = None, +) -> Path: + object_items = object_items or [] + notebook_path.parent.mkdir(parents=True, exist_ok=True) + image_literal = repr(image_items) + table_literal = repr(table_items) + object_literal = repr(object_items) + source = f'''import marimo + +__generated_with = "0.13.0" +app = marimo.App(width="full") + + +@app.cell +def _(): + import marimo as mo + from pathlib import Path + import pandas as pd + ROOT = Path({str(run_dir)!r}) + return ROOT, Path, mo, pd + + +@app.cell +def _(ROOT, mo): + mo.md(""" +# {title} + +Run directory: `{{}}` +""".format(ROOT)) + return + + +@app.cell +def _(ROOT, mo): + _items = [] + for _title, _rel_path in {image_literal}: + _path = ROOT / _rel_path + if _path.exists(): + _items.append(mo.md(f"## {{_title}}")) + _items.append(mo.image(src=str(_path))) + else: + _items.append(mo.md(f"## {{_title}}\\nMissing: `{{_rel_path}}`")) + mo.vstack(_items) + return + + +@app.cell +def _(ROOT, mo, pd): + _items = [] + for _title, _rel_path in {table_literal}: + _path = ROOT / _rel_path + if _path.exists(): + _items.append(mo.md(f"## {{_title}}")) + _sep = "\\t" if _path.suffix in {{".tsv", ".tab"}} else "," + _items.append(mo.ui.table(pd.read_csv(_path, sep=_sep).head(200))) + else: + _items.append(mo.md(f"## {{_title}}\\nMissing: `{{_rel_path}}`")) + mo.vstack(_items) + return + + +@app.cell +def _(ROOT, mo): + _lines = ["## Analysis Objects"] + for _title, _rel_path in {object_literal}: + _path = ROOT / _rel_path + _state = "present" if _path.exists() else "missing" + _lines.append(f"- {{_title}}: `{{_rel_path}}` ({{_state}})") + mo.md("\\n".join(_lines)) + return + + +if __name__ == "__main__": + app.run() +''' + 
def discover_vcf_artifacts(
    run_dir: Path,
    *,
    search_roots: list[str] | None = None,
) -> list[tuple[str, str]]:
    """Return display labels and run-relative paths for output VCF/gVCF artifacts.

    Each search root is scanned recursively for ``*.vcf.gz`` files. Files under
    ``variants/``, ``gvcf/`` and ``joint/`` get a short ``"<kind>: <file name>"``
    label; anything else is labelled with its run-relative path.

    Labels are guaranteed to be unique: when two files would share a short
    label (same file name in different sub-folders), the run-relative path is
    appended to each of them. Consumers that look an artifact up by its label
    can therefore reach every discovered file.

    Args:
        run_dir: Root directory of the run.
        search_roots: Directory names, relative to ``run_dir``, to scan. A
            missing or empty value falls back to
            ``["variants", "gvcf", "joint", "results"]``.

    Returns:
        ``(label, posix_relative_path)`` tuples in discovery order (search-root
        order, then sorted paths), without duplicate paths.
    """
    search_roots = search_roots or ["variants", "gvcf", "joint", "results"]
    kind_by_prefix = {
        "variants/": "Variant VCF",
        "gvcf/": "GVCF",
        "joint/": "Joint VCF",
    }
    seen: set[str] = set()
    items: list[tuple[str, str]] = []
    for root_name in search_roots:
        root = run_dir / root_name
        if not root.exists():
            continue
        for path in sorted(root.rglob("*.vcf.gz")):
            # rglob also matches directories; only real files are artifacts.
            if not path.is_file():
                continue
            rel = path.relative_to(run_dir).as_posix()
            if rel in seen:
                continue
            seen.add(rel)
            label = rel
            for prefix, kind in kind_by_prefix.items():
                if rel.startswith(prefix):
                    label = f"{kind}: {path.name}"
                    break
            items.append((label, rel))

    # Same-named files in different sub-folders would otherwise be
    # indistinguishable in a label-keyed selector.
    label_counts: dict[str, int] = {}
    for label, _ in items:
        label_counts[label] = label_counts.get(label, 0) + 1
    return [
        (label if label_counts[label] == 1 else f"{label} ({rel})", rel)
        for label, rel in items
    ]
mo.md("## VCF Artifacts\\nNo `.vcf.gz` artifacts were discovered in the run envelope.") + else: + _labels = [item[0] for item in _vcf_items] + selector = mo.ui.dropdown(options=_labels, value=_labels[0], label="VCF artifact", full_width=False) + _component = mo.hstack([selector], justify="start") + _component + return (selector,) + + +@app.cell +def _(ROOT, mo, pd, subprocess): + _rows = [] + for _label, _rel_path in {vcf_literal}: + _abs = ROOT / _rel_path + _row = {{"artifact": _label, "path": _rel_path}} + if _abs.exists(): + _result = subprocess.run( + ["bcftools", "stats", str(_abs)], + check=True, + capture_output=True, + text=True, + ) + for _line in _result.stdout.splitlines(): + if not _line.startswith("SN\\t"): + continue + _, _, _key, _value = _line.split("\\t", 3) + if _key == "number of records:": + _row["record_count"] = int(_value) + elif _key == "number of SNPs:": + _row["snp_count"] = int(_value) + elif _key == "number of indels:": + _row["indel_count"] = int(_value) + else: + _row["error"] = "missing VCF" + _rows.append(_row) + _df = pd.DataFrame(_rows).fillna("") + mo.vstack([mo.md("## bcftools stats summary"), mo.ui.table(_df)]) + return + + +@app.cell +def _(ROOT, mo, pd): + _items = [] + for _title, _rel_path in {table_literal}: + _path = ROOT / _rel_path + if _path.exists(): + _items.append(mo.md(f"## {{_title}}")) + _sep = "\\t" if _path.suffix in {{".tsv", ".tab"}} else "," + _items.append(mo.ui.table(pd.read_csv(_path, sep=_sep).head(200))) + else: + _items.append(mo.md(f"## {{_title}}\\nMissing: `{{_rel_path}}`")) + if _items: + _component = mo.vstack(_items) + else: + _component = mo.md("## Tables\\nNo table artifacts were configured for this review.") + _component + return + + +@app.cell +def _(ROOT, mo, subprocess, selector): + _vcf_items = {vcf_literal} + if not _vcf_items or selector is None: + _component = mo.md("## Selected VCF\\nNo selectable VCF artifacts were discovered.") + else: + _selected_label = selector.value + 
_selected_rel = next(_rel for _label, _rel in _vcf_items if _label == _selected_label) + _selected_abs = ROOT / _selected_rel + _header = subprocess.run( + ["bcftools", "view", "-h", str(_selected_abs)], + check=True, + capture_output=True, + text=True, + ).stdout.splitlines() + _body = subprocess.run( + ["bcftools", "view", "-H", str(_selected_abs)], + check=True, + capture_output=True, + text=True, + ).stdout.splitlines() + _header_preview = "\\n".join(_header[-20:]) + _body_preview = "\\n".join(_body[:25]) if _body else "# no variant rows" + _component = mo.vstack( + [ + mo.md(f"## Selected VCF: `{{_selected_label}}`"), + mo.md(f"`{{_selected_rel}}`"), + mo.md("### Header preview"), + mo.md(f"```text\\n{{_header_preview}}\\n```"), + mo.md("### Variant rows"), + mo.md(f"```text\\n{{_body_preview}}\\n```"), + ] + ) + _component + return + + +@app.cell +def _(ROOT, mo): + _lines = ["## Analysis Objects"] + for _title, _rel_path in {object_literal}: + _path = ROOT / _rel_path + _state = "present" if _path.exists() else "missing" + _lines.append(f"- {{_title}}: `{{_rel_path}}` ({{_state}})") + mo.md("\\n".join(_lines)) + return + + +if __name__ == "__main__": + app.run() +''' + notebook_path.write_text(textwrap.dedent(source), encoding="utf-8") + return notebook_path + + +def add_vcf_review_notebook_entry( + run_dir: Path, + entries: list[dict[str, Any]], + *, + title: str, + notebook_filename: str = "vcf_review.marimo.py", + table_items: list[tuple[str, str]] | None = None, + object_items: list[tuple[str, str]] | None = None, +) -> dict[str, str]: + """Append a generic VCF review notebook artifact entry when VCF outputs exist.""" + vcf_items = discover_vcf_artifacts(run_dir) + if not vcf_items: + entries.append( + artifact_entry( + artifact_id="vcf_review_notebook", + title="VCF Review Notebook", + path=None, + kind="notebook", + status="not_available", + description="No output VCF/gVCF artifacts were present in this run, so the generic VCF review notebook was not 
created.", + ) + ) + return {} + notebook_path = write_vcf_review_notebook( + run_dir / "notebooks" / notebook_filename, + title=title, + run_dir=run_dir, + vcf_items=vcf_items, + table_items=table_items, + object_items=object_items, + ) + rel = notebook_path.relative_to(run_dir) + entries.append( + artifact_entry( + artifact_id="vcf_review_notebook", + title="VCF Review Notebook", + path=rel, + kind="notebook", + status="created", + description="Generic Marimo notebook that prepopulates any VCF/gVCF artifacts found in the run envelope.", + ) + ) + return {"review_notebook": str(rel)} diff --git a/plugins/ngs-analysis/scripts/run_amplicon_microbiome.py b/plugins/ngs-analysis/scripts/run_amplicon_microbiome.py new file mode 100644 index 0000000..d0ccef8 --- /dev/null +++ b/plugins/ngs-analysis/scripts/run_amplicon_microbiome.py @@ -0,0 +1,1208 @@ +#!/usr/bin/env python3 +"""Run or plan amplicon ASV, taxonomy, diversity, and visualization backends.""" + +from __future__ import annotations + +import argparse +import html +import math +import subprocess +from pathlib import Path +from typing import Any + +import ngs_resource_gate +from ngs_planner_utils import ( + command_plan_entry, + normalize_sample_name, + read_table, + resolve_path, + write_command_script, + write_tsv, +) +from ngs_run_utils import ( + build_artifact_index, + run_cmd, + slug_timestamp, + software_versions, + tool_preflight, + write_json, + write_standard_manifest, + write_text, +) +from ngs_visualization_utils import artifact_entry, write_visualization_index + +WORKSPACE_ROOT = Path.cwd() +PLUGIN_ROOT = Path(__file__).resolve().parents[1] +DEFAULT_RUN_ROOT = WORKSPACE_ROOT / "ngs_runs" / "amplicon_microbiome_backend" +DADA2_BACKEND_SCRIPT = PLUGIN_ROOT / "workflows" / "amplicon_microbiome" / "run_dada2_backend.R" + + +def validate_inputs(args: argparse.Namespace) -> tuple[dict[str, Any], list[dict[str, str]]]: + sample_sheet = args.sample_sheet.expanduser().resolve() + errors: list[str] = [] + 
warnings: list[str] = [] + rows: list[dict[str, str]] = [] + columns: list[str] = [] + samples: list[dict[str, str]] = [] + if not sample_sheet.exists(): + errors.append(f"sample sheet does not exist: {sample_sheet}") + else: + rows, columns = read_table(sample_sheet) + if not args.primer_forward or not args.primer_reverse: + errors.append( + "both --primer-forward and --primer-reverse are required for full amplicon backend execution" + ) + if args.taxonomy_classifier: + classifier = args.taxonomy_classifier.expanduser().resolve() + if not classifier.exists(): + errors.append(f"taxonomy classifier/database does not exist: {classifier}") + else: + warnings.append( + "no --taxonomy-classifier was provided; ASV generation can be planned but taxonomy assignment is blocked" + ) + if args.metadata: + metadata = args.metadata.expanduser().resolve() + if not metadata.exists(): + errors.append(f"metadata file does not exist: {metadata}") + else: + warnings.append( + "no sample metadata was provided; PERMANOVA and metadata-colored diversity plots will be limited" + ) + for row_index, row in enumerate(rows, start=2): + sample = normalize_sample_name( + first_present(row, ["sample", "sample_id", "sampleID"]), f"row_{row_index}" + ) + r1 = resolve_path( + first_present(row, ["r1", "fastq_1", "forwardReads", "read1"]), sample_sheet.parent + ) + r2 = resolve_path( + first_present(row, ["r2", "fastq_2", "reverseReads", "read2"]), sample_sheet.parent + ) + if not r1: + errors.append(f"row {row_index}: r1/fastq_1/forwardReads is required") + continue + if not r1.exists(): + errors.append(f"row {row_index}: R1 FASTQ does not exist: {r1}") + if r2 and not r2.exists(): + errors.append(f"row {row_index}: R2 FASTQ does not exist: {r2}") + samples.append( + { + "sample": sample, + "r1": str(r1), + "r2": str(r2) if r2 else "", + "row_index": str(row_index), + } + ) + if not samples: + errors.append("no usable amplicon samples found") + validation = { + "ok": not errors, + "sample_sheet": 
def build_qiime2_plan(
    args: argparse.Namespace, samples: list[dict[str, str]]
) -> list[dict[str, Any]]:
    """Build the ordered QIIME 2 command plan for the amplicon backend.

    The plan imports the FASTQ manifest, trims primers, denoises with DADA2,
    optionally assigns taxonomy and computes core diversity metrics, and then
    exports the feature table, taxonomy and denoising stats under ``tables/``.

    The read layout is paired-end when any sample row has a non-empty ``r2``.
    Option names differ between layouts: the ``-f``/``-r`` suffixed primer and
    truncation options exist only on ``trim-paired`` / ``denoise-paired``,
    while ``trim-single`` takes ``--p-front`` and ``denoise-single`` takes
    ``--p-trunc-len``. Single-end plans therefore use only the forward primer
    and the forward truncation length.

    Args:
        args: Parsed CLI namespace. Reads ``primer_forward``,
            ``primer_reverse``, ``trunc_len_f``, ``trunc_len_r``,
            ``taxonomy_classifier``, ``metadata`` and ``sampling_depth``.
        samples: Normalized sample rows with at least an ``r2`` key.

    Returns:
        Plan entries created by ``command_plan_entry``, in execution order.
    """
    paired = any(row["r2"] for row in samples)
    manifest = "workflow/qiime2_manifest.tsv"
    if paired:
        layout = "paired"
        sample_type = "SampleData[PairedEndSequencesWithQuality]"
        manifest_format = "PairedEndFastqManifestPhred33V2"
        primer_options = [
            "--p-front-f",
            args.primer_forward,
            "--p-front-r",
            args.primer_reverse,
        ]
        trunc_options = [
            "--p-trunc-len-f",
            str(args.trunc_len_f or 0),
            "--p-trunc-len-r",
            str(args.trunc_len_r or 0),
        ]
    else:
        layout = "single"
        sample_type = "SampleData[SequencesWithQuality]"
        manifest_format = "SingleEndFastqManifestPhred33V2"
        # The single-end actions reject the paired-only -f/-r option names.
        primer_options = ["--p-front", args.primer_forward]
        trunc_options = ["--p-trunc-len", str(args.trunc_len_f or 0)]

    def export_step(name: str, artifact: str, destination: str) -> dict[str, Any]:
        # Every export shares the same `qiime tools export` shape.
        return command_plan_entry(
            name,
            [
                "qiime",
                "tools",
                "export",
                "--input-path",
                artifact,
                "--output-path",
                destination,
            ],
            outputs=[destination],
        )

    plan = [
        command_plan_entry(
            "import FASTQs",
            [
                "qiime",
                "tools",
                "import",
                "--type",
                sample_type,
                "--input-path",
                manifest,
                "--output-path",
                "qiime2/demux.qza",
                "--input-format",
                manifest_format,
            ],
            outputs=["qiime2/demux.qza"],
        ),
        command_plan_entry(
            "trim primers",
            [
                "qiime",
                "cutadapt",
                f"trim-{layout}",
                "--i-demultiplexed-sequences",
                "qiime2/demux.qza",
                *primer_options,
                "--o-trimmed-sequences",
                "qiime2/trimmed.qza",
            ],
            outputs=["qiime2/trimmed.qza"],
        ),
        command_plan_entry(
            "DADA2 denoise",
            [
                "qiime",
                "dada2",
                f"denoise-{layout}",
                "--i-demultiplexed-seqs",
                "qiime2/trimmed.qza",
                *trunc_options,
                "--o-table",
                "qiime2/table.qza",
                "--o-representative-sequences",
                "qiime2/rep-seqs.qza",
                "--o-denoising-stats",
                "qiime2/denoising-stats.qza",
            ],
            # The stats artifact is written by this step and exported below.
            outputs=[
                "qiime2/table.qza",
                "qiime2/rep-seqs.qza",
                "qiime2/denoising-stats.qza",
            ],
        ),
    ]
    if args.taxonomy_classifier:
        plan.append(
            command_plan_entry(
                "assign taxonomy",
                [
                    "qiime",
                    "feature-classifier",
                    "classify-sklearn",
                    "--i-classifier",
                    args.taxonomy_classifier.expanduser().resolve(),
                    "--i-reads",
                    "qiime2/rep-seqs.qza",
                    "--o-classification",
                    "qiime2/taxonomy.qza",
                ],
                outputs=["qiime2/taxonomy.qza"],
            )
        )
    if args.metadata:
        plan.append(
            command_plan_entry(
                "core diversity metrics",
                [
                    "qiime",
                    "diversity",
                    "core-metrics",
                    "--i-table",
                    "qiime2/table.qza",
                    "--p-sampling-depth",
                    str(args.sampling_depth),
                    "--m-metadata-file",
                    args.metadata.expanduser().resolve(),
                    "--output-dir",
                    "qiime2/core-metrics",
                ],
                outputs=["qiime2/core-metrics"],
            )
        )
    plan.append(
        export_step("export feature table", "qiime2/table.qza", "tables/asv_table_export")
    )
    if args.taxonomy_classifier:
        plan.append(
            export_step("export taxonomy", "qiime2/taxonomy.qza", "tables/taxonomy_export")
        )
    plan.append(
        export_step(
            "export denoising stats",
            "qiime2/denoising-stats.qza",
            "tables/denoising_stats_export",
        )
    )
    return plan
def build_plan(args: argparse.Namespace, samples: list[dict[str, str]]) -> list[dict[str, Any]]:
    """Return the command plan for the backend selected by ``args.backend``.

    ``"qiime2"`` and ``"nf-core/ampliseq"`` are delegated to their dedicated
    builders; any other backend value produces a single-step plan that runs
    the DADA2 R script with the run directory (``.``) as its output directory.

    Args:
        args: Parsed CLI namespace. The R branch reads ``sample_sheet``,
            ``primer_forward``, ``primer_reverse``, ``threads``,
            ``trunc_len_f``, ``trunc_len_r`` and ``taxonomy_classifier``.
        samples: Normalized sample rows; only the QIIME 2 builder uses them.

    Returns:
        Plan entries in execution order.
    """
    backend = args.backend
    if backend == "qiime2":
        return build_qiime2_plan(args, samples)
    if backend == "nf-core/ampliseq":
        return build_nfcore_plan(args)

    rscript_cmd: list[str | Path] = [
        "Rscript",
        DADA2_BACKEND_SCRIPT,
        "--sample-sheet",
        args.sample_sheet.expanduser().resolve(),
        "--outdir",
        ".",
        "--primer-forward",
        args.primer_forward,
        "--primer-reverse",
        args.primer_reverse,
        "--threads",
        str(args.threads),
    ]
    # Truncation lengths are forwarded only when explicitly set (0 is valid).
    optional_lengths = (
        ("--trunc-len-f", args.trunc_len_f),
        ("--trunc-len-r", args.trunc_len_r),
    )
    for flag, length in optional_lengths:
        if length is not None:
            rscript_cmd += [flag, str(length)]
    classifier = args.taxonomy_classifier
    if classifier:
        rscript_cmd += ["--taxonomy-classifier", classifier.expanduser().resolve()]

    expected_tables = [
        "tables/asv_table.tsv",
        "tables/taxonomy.tsv",
        "tables/read_retention.tsv",
        "tables/representative_sequences.fasta",
    ]
    return [command_plan_entry("DADA2 R backend", rscript_cmd, outputs=expected_tables)]
def merge_tool_status(
    tool_status: dict[str, Any], extra_status: dict[str, Any] | None
) -> dict[str, Any]:
    """Fold an R-package preflight result into a tool preflight status.

    Args:
        tool_status: Existing tool status; it is never mutated.
        extra_status: Package preflight result, or a falsy value for "none".

    Returns:
        ``tool_status`` itself when ``extra_status`` is falsy. Otherwise a
        shallow copy in which ``runtime_missing`` is extended with one
        ``"R package:<name>"`` entry per missing package, ``r_packages`` holds
        ``extra_status``, and ``ok`` is true only if both inputs were ok.
    """
    if not extra_status:
        return tool_status
    missing_packages = [f"R package:{name}" for name in extra_status.get("missing", [])]
    combined = {**tool_status}
    combined["runtime_missing"] = [*combined.get("runtime_missing", []), *missing_packages]
    combined["r_packages"] = extra_status
    combined["ok"] = bool(combined.get("ok") and extra_status.get("ok"))
    return combined
def _data_lines(path: Path) -> list[str]:
    """Return the header and data lines of an exported TSV, minus comment rows.

    The file is read as UTF-8 with undecodable bytes replaced. Lines are
    handled as follows:

    * blank lines are dropped;
    * ``#`` lines without a tab are treated as free-text comments and dropped;
    * ``#q2:`` lines (for example the ``#q2:types`` row of QIIME 2
      metadata-style exports) are directives, not data, and are dropped so
      they cannot show up as a bogus first-column value;
    * any other ``#`` line containing tabs is a header such as ``#OTU ID`` and
      is kept with the leading ``#`` removed.

    Args:
        path: TSV file to read.

    Returns:
        The remaining lines in file order, without line terminators.
    """
    lines = []
    for line in path.read_text(encoding="utf-8", errors="replace").splitlines():
        if not line.strip():
            continue
        if line.startswith("#"):
            if "\t" not in line or line.lower().startswith("#q2:"):
                continue
            line = line[1:]
        lines.append(line)
    return lines
asv_source != asv_output: + rows = normalize_generic_tsv( + asv_source, + asv_output, + "feature_id", + {"#otu id": "feature_id", "otu id": "feature_id"}, + ) + summary["notes"].append(f"normalized {rows} ASV/features from exported TSV") + summary["outputs"]["asv_table"] = "tables/asv_table.tsv" + elif biom_source.exists(): + summary["inputs"]["asv_biom"] = str(biom_source) + summary["notes"].append( + "QIIME2 exported a BIOM feature table; convert it to TSV before native visualization." + ) + summary["conversion_commands"].append( + "biom convert -i tables/asv_table_export/feature-table.biom -o tables/asv_table_export/feature-table.tsv --to-tsv" + ) + + taxonomy_output = tables_dir / "taxonomy.tsv" + taxonomy_source = first_existing( + [ + taxonomy_output, + tables_dir / "taxonomy_export" / "taxonomy.tsv", + tables_dir / "taxonomy_export" / "taxonomy.txt", + ] + ) + if taxonomy_source: + summary["inputs"]["taxonomy"] = str(taxonomy_source) + if taxonomy_source != taxonomy_output: + rows = normalize_generic_tsv( + taxonomy_source, + taxonomy_output, + "feature_id", + {"feature id": "feature_id", "taxon": "taxonomy", "confidence": "confidence"}, + ) + summary["notes"].append(f"normalized {rows} taxonomy rows from QIIME2 export") + summary["outputs"]["taxonomy_table"] = "tables/taxonomy.tsv" + + retention_output = tables_dir / "read_retention.tsv" + retention_source = first_existing( + [ + retention_output, + tables_dir / "denoising_stats_export" / "stats.tsv", + tables_dir / "denoising_stats_export" / "stats.txt", + ] + ) + if retention_source: + summary["inputs"]["read_retention"] = str(retention_source) + if retention_source != retention_output: + rows = normalize_generic_tsv( + retention_source, + retention_output, + "sample", + {"sample id": "sample", "sample-id": "sample"}, + ) + summary["notes"].append( + f"normalized {rows} denoising/read-retention rows from QIIME2 export" + ) + summary["outputs"]["read_retention"] = "tables/read_retention.tsv" + + 
def parse_numeric(value: Any) -> float:
    """Convert a table cell to a finite float, falling back to ``0.0``.

    The value is stringified, thousands separators (``,``) are removed and
    surrounding whitespace is trimmed before conversion.

    Args:
        value: Raw cell content; any object is accepted.

    Returns:
        The parsed number, or ``0.0`` when the cell cannot be parsed or parses
        to a non-finite value. ``"nan"``, ``"inf"`` and overflowing literals are
        accepted by ``float()``, and a single such cell would otherwise turn
        every sum computed from the table into NaN or infinity.
    """
    try:
        number = float(str(value).replace(",", "").strip())
    except (TypeError, ValueError):
        return 0.0
    return number if math.isfinite(number) else 0.0
exist_ok=True) + if not values: + body = f""" + + {html.escape(title)} + {html.escape(subtitle)} + +""" + path.write_text(body, encoding="utf-8") + return str(path) + width = 980 + row_height = 38 + height = 92 + len(values) * row_height + max_value = max(value for _, value in values) or 1.0 + lines = [ + f'', + '', + f'{html.escape(title)}', + f'{html.escape(subtitle)}', + ] + for index, (label, value) in enumerate(values): + y = 84 + index * row_height + bar_width = max(2.0, min(460.0, value / max_value * 460.0)) + short_label = label if len(label) < 42 else label[:39] + "..." + lines.extend( + [ + f'{html.escape(short_label)}', + f'', + f'', + f'{value:.5g}', + ] + ) + lines.append("\n") + path.write_text("\n".join(lines), encoding="utf-8") + return str(path) + + +def write_amplicon_review_outputs(run_dir: Path) -> dict[str, Any]: + samples, matrix = read_feature_matrix(run_dir / "tables" / "asv_table.tsv") + taxonomy = taxonomy_label_map(run_dir / "tables" / "taxonomy.tsv") + outputs: dict[str, str] = {} + status = "not_available" + notes: list[str] = [] + + if not matrix: + notes.append( + "ASV table is not available; backend-derived diversity and taxa plots were not generated from real tables." 
+ ) + else: + status = "created" + alpha_rows: list[dict[str, Any]] = [] + for sample in samples: + values = [feature_values.get(sample, 0.0) for feature_values in matrix.values()] + total = sum(values) + observed = sum(1 for value in values if value > 0) + shannon = 0.0 + if total: + for value in values: + if value > 0: + proportion = value / total + shannon -= proportion * math.log(proportion) + alpha_rows.append( + { + "sample": sample, + "total_reads": round(total, 6), + "observed_features": observed, + "shannon": round(shannon, 6), + } + ) + write_tsv( + run_dir / "tables" / "alpha_diversity.tsv", + alpha_rows, + ["sample", "total_reads", "observed_features", "shannon"], + ) + outputs["alpha_diversity"] = "tables/alpha_diversity.tsv" + + distance_rows = [] + for left in samples: + row: dict[str, Any] = {"sample": left} + for right in samples: + numerator = sum( + abs(values.get(left, 0.0) - values.get(right, 0.0)) + for values in matrix.values() + ) + denominator = sum( + values.get(left, 0.0) + values.get(right, 0.0) for values in matrix.values() + ) + row[right] = f"{(numerator / denominator if denominator else 0.0):.8g}" + distance_rows.append(row) + write_tsv( + run_dir / "tables" / "bray_curtis_distance.tsv", distance_rows, ["sample", *samples] + ) + outputs["bray_curtis_distance"] = "tables/bray_curtis_distance.tsv" + + grouped: dict[str, dict[str, float]] = {} + for feature, values in matrix.items(): + label = taxonomy.get(feature, feature) + grouped.setdefault(label, {sample: 0.0 for sample in samples}) + for sample in samples: + grouped[label][sample] += values.get(sample, 0.0) + taxa_rows = [] + for label, values in sorted( + grouped.items(), key=lambda item: sum(item[1].values()), reverse=True + ): + row: dict[str, Any] = { + "taxon_or_feature": label, + "total_abundance": round(sum(values.values()), 6), + } + row.update({sample: round(values.get(sample, 0.0), 6) for sample in samples}) + taxa_rows.append(row) + write_tsv( + run_dir / "tables" / 
"top_taxa_or_features.tsv", + taxa_rows[:25], + ["taxon_or_feature", "total_abundance", *samples], + ) + outputs["top_taxa_or_features"] = "tables/top_taxa_or_features.tsv" + + write_bar_svg( + run_dir / "visualizations" / "amplicon_sample_depth.svg", + "Amplicon Sample Depth", + [(row["sample"], float(row["total_reads"])) for row in alpha_rows], + subtitle="Total feature-table counts per sample.", + ) + write_bar_svg( + run_dir / "visualizations" / "amplicon_alpha_diversity.svg", + "Amplicon Alpha Diversity", + [(row["sample"], float(row["shannon"])) for row in alpha_rows], + subtitle="Shannon diversity computed from the normalized ASV/feature table.", + ) + write_bar_svg( + run_dir / "visualizations" / "amplicon_top_taxa.svg", + "Amplicon Top Taxa Or Features", + [(row["taxon_or_feature"], float(row["total_abundance"])) for row in taxa_rows[:15]], + subtitle="Aggregated abundance across samples; taxonomy labels are used when available.", + ) + outputs.update( + { + "sample_depth_plot": "visualizations/amplicon_sample_depth.svg", + "alpha_diversity_plot": "visualizations/amplicon_alpha_diversity.svg", + "top_taxa_plot": "visualizations/amplicon_top_taxa.svg", + } + ) + if not taxonomy: + notes.append("Taxonomy table is not available; top-taxa plot uses feature IDs.") + + dashboard_rows = [] + for output_label, output_path in outputs.items(): + href = ( + output_path.replace("visualizations/", "", 1) + if output_path.startswith("visualizations/") + else f"../{output_path}" + ) + dashboard_rows.append( + f'{html.escape(output_label)}{html.escape(output_path)}' + ) + if not dashboard_rows: + dashboard_rows.append( + 'No backend-derived amplicon review outputs are available yet.' + ) + dashboard = f""" + + + + Amplicon Backend Dashboard + + + +

    Amplicon Backend Dashboard

    +

    Native review of normalized backend ASV, diversity, taxonomy, and read-retention artifacts. Outputs remain absent until real backend tables are present.

    + {"".join(dashboard_rows)}
    ArtifactPath
    +

    Notes

    +
      {"".join(f"
    • {html.escape(note)}
    • " for note in notes)}
    + + +""" + dashboard_path = run_dir / "visualizations" / "amplicon_backend_dashboard.html" + dashboard_path.parent.mkdir(parents=True, exist_ok=True) + dashboard_path.write_text(dashboard, encoding="utf-8") + outputs["dashboard"] = "visualizations/amplicon_backend_dashboard.html" + summary = { + "status": status, + "outputs": outputs, + "notes": notes, + "sample_count": len(samples), + "feature_count": len(matrix), + } + write_json(run_dir / "tables" / "amplicon_diversity_summary.json", summary) + return summary + + +def write_qiime2_manifest(run_dir: Path, samples: list[dict[str, str]]) -> None: + paired = any(row["r2"] for row in samples) + fieldnames = ( + ["sample-id", "forward-absolute-filepath", "reverse-absolute-filepath"] + if paired + else ["sample-id", "absolute-filepath"] + ) + rows = [] + for row in samples: + if paired: + rows.append( + { + "sample-id": row["sample"], + "forward-absolute-filepath": row["r1"], + "reverse-absolute-filepath": row["r2"], + } + ) + else: + rows.append({"sample-id": row["sample"], "absolute-filepath": row["r1"]}) + write_tsv(run_dir / "workflow" / "qiime2_manifest.tsv", rows, fieldnames) + + +def execute_plan(run_dir: Path, plan: list[dict[str, Any]]) -> dict[str, Any]: + for dirname in ["qiime2", "tables", "logs", "workflow"]: + (run_dir / dirname).mkdir(parents=True, exist_ok=True) + result: dict[str, Any] = {"ok": True, "steps": []} + for index, item in enumerate(plan, start=1): + step = run_cmd(["bash", "-c", item["command"]], run_dir, timeout=7200) + safe = item["name"].replace(":", "").replace(" ", "_").replace("/", "_") + write_json(run_dir / "logs" / f"{index:02d}_{safe}.json", step) + result["steps"].append({"name": item["name"], "ok": step.get("ok")}) + result["ok"] = bool(result["ok"] and step.get("ok")) + if not step.get("ok"): + break + return result + + +def write_visuals( + run_dir: Path, + status: str, + validation: dict[str, Any], + resource_plan: dict[str, Any] | None = None, +) -> dict[str, str]: + 
entries = [ + artifact_entry( + artifact_id="samples", + title="Amplicon Samples", + path="validation/samples.normalized.tsv", + kind="table", + status="created", + description="Normalized sample FASTQ manifest.", + ), + artifact_entry( + artifact_id="command_plan", + title="Backend Command Plan", + path="workflow/amplicon_backend_command_plan.json", + kind="json", + status="created", + description="QIIME2, DADA2, or nf-core/ampliseq command contract.", + ), + artifact_entry( + artifact_id="methods", + title="Backend Methods", + path="methods/amplicon_backend_methods.json", + kind="json", + status="created", + description="Primer, marker, taxonomy, and diversity method contract.", + ), + artifact_entry( + artifact_id="backend_summary", + title="Backend Export Summary", + path="tables/amplicon_backend_summary.json", + kind="json", + status="created", + description="Records which QIIME2/DADA2/nf-core exported tables were normalized for review.", + ), + artifact_entry( + artifact_id="diversity_summary", + title="Diversity Summary", + path="tables/amplicon_diversity_summary.json", + kind="json", + status="created", + description="Backend-derived alpha diversity, Bray-Curtis, and taxa-plot availability summary.", + ), + artifact_entry( + artifact_id="backend_dashboard", + title="Backend Dashboard", + path="visualizations/amplicon_backend_dashboard.html", + kind="html", + status="created", + description="Native review dashboard for ASV depth, alpha diversity, taxa/features, and backend caveats.", + ), + artifact_entry( + artifact_id="sample_depth_plot", + title="Sample Depth Plot", + path="visualizations/amplicon_sample_depth.svg", + kind="svg", + status="created" + if (run_dir / "visualizations" / "amplicon_sample_depth.svg").exists() + else "not_available", + description="Per-sample feature-table count plot.", + ), + artifact_entry( + artifact_id="alpha_diversity_plot", + title="Alpha Diversity Plot", + path="visualizations/amplicon_alpha_diversity.svg", + kind="svg", 
+ status="created" + if (run_dir / "visualizations" / "amplicon_alpha_diversity.svg").exists() + else "not_available", + description="Shannon diversity plot computed from the normalized ASV/feature table.", + ), + artifact_entry( + artifact_id="top_taxa_plot", + title="Top Taxa Or Features Plot", + path="visualizations/amplicon_top_taxa.svg", + kind="svg", + status="created" + if (run_dir / "visualizations" / "amplicon_top_taxa.svg").exists() + else "not_available", + description="Top taxa/features aggregated across samples.", + ), + artifact_entry( + artifact_id="asv_table", + title="ASV Table", + path="tables/asv_table.tsv", + kind="table", + status="created" + if (run_dir / "tables" / "asv_table.tsv").exists() + else "not_available", + description="ASV/feature table after backend execution or export.", + ), + artifact_entry( + artifact_id="taxonomy_table", + title="Taxonomy Table", + path="tables/taxonomy.tsv", + kind="table", + status="created" if (run_dir / "tables" / "taxonomy.tsv").exists() else "not_available", + description="Feature taxonomy table normalized from backend output.", + ), + artifact_entry( + artifact_id="alpha_diversity", + title="Alpha Diversity Table", + path="tables/alpha_diversity.tsv", + kind="table", + status="created" + if (run_dir / "tables" / "alpha_diversity.tsv").exists() + else "not_available", + description="Observed features, total reads, and Shannon diversity per sample.", + ), + artifact_entry( + artifact_id="bray_curtis", + title="Bray-Curtis Distance Matrix", + path="tables/bray_curtis_distance.tsv", + kind="table", + status="created" + if (run_dir / "tables" / "bray_curtis_distance.tsv").exists() + else "not_available", + description="Pairwise Bray-Curtis distances computed from normalized feature counts.", + ), + artifact_entry( + artifact_id="read_retention", + title="Read Retention", + path="tables/read_retention.tsv", + kind="table", + status="created" + if (run_dir / "tables" / "read_retention.tsv").exists() + else 
"not_available", + description="Denoising or read-retention summary normalized from backend output.", + ), + ] + entries.extend(ngs_resource_gate.resource_visual_entries(resource_plan)) + index = write_visualization_index( + run_dir, + title="Amplicon Microbiome Backend Review", + description="Review surface for ASV, taxonomy, diversity, and backend provenance artifacts.", + entries=entries, + notes=[ + *validation.get("warnings", []), + *ngs_resource_gate.resource_messages(resource_plan), + ], + analysis_intent="real_analysis" if status != "blocked" else "blocked_preflight", + provenance_summary={ + "status": status, + "backend": validation.get("backend"), + "sample_count": validation.get("sample_count", 0), + "resource_plan_ok": validation.get("resource_plan_ok"), + }, + ) + return { + "visualization_index": str(index.relative_to(run_dir)), + "visualization_manifest": "visualizations/visualization_manifest.json", + } + + +def write_summary( + run_dir: Path, + status: str, + validation: dict[str, Any], + resource_plan: dict[str, Any] | None = None, + tool_status: dict[str, Any] | None = None, +) -> None: + lines = [ + "# Amplicon Microbiome Backend Run Summary", + "", + f"Status: `{status}`", + f"Backend: `{validation.get('backend')}`", + f"Samples parsed: `{validation.get('sample_count', 0)}`", + "", + "## Key Artifacts", + "", + "- `workflow/amplicon_backend_command_plan.json`", + "- `workflow/qiime2_manifest.tsv` for QIIME2 runs", + "- `methods/amplicon_backend_methods.json`", + "- `tables/` ASV, taxonomy, and diversity outputs when executed/exported", + "- `tables/amplicon_backend_summary.json`", + "- `tables/amplicon_diversity_summary.json`, `tables/alpha_diversity.tsv`, and `tables/bray_curtis_distance.tsv` when ASV tables are available", + "- `visualizations/amplicon_backend_dashboard.html` and backend-derived SVG plots", + "- `resources/resource_plan.json`, `resource_manifest.tsv`, `resource_env.sh`, `resource_readiness.md`, and resource setup-plan 
artifacts", + "- `visualizations/index.html`", + "- `run_manifest.json` and `artifact_index.json`", + "", + ] + if validation.get("warnings"): + lines.extend(["## Warnings", ""]) + lines.extend(f"- {item}" for item in validation["warnings"]) + lines.append("") + if tool_status and not tool_status.get("ok"): + lines.extend(["## Runtime Blockers", ""]) + for item in tool_status.get("missing_required", []): + lines.append(f"- missing executable: `{item}`") + for item in tool_status.get("runtime_missing", []): + lines.append(f"- missing runtime dependency: `{item}`") + lines.append("") + lines.extend(ngs_resource_gate.resource_summary_lines(resource_plan)) + if validation.get("errors"): + lines.extend(["## Blockers", ""]) + lines.extend(f"- {item}" for item in validation["errors"]) + write_text(run_dir / "summary.md", "\n".join(lines) + "\n") + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--sample-sheet", type=Path, required=True) + parser.add_argument( + "--backend", choices=["qiime2", "dada2", "nf-core/ampliseq"], default="qiime2" + ) + parser.add_argument("--marker", default="16S") + parser.add_argument("--primer-forward", required=True) + parser.add_argument("--primer-reverse", required=True) + parser.add_argument("--taxonomy-classifier", type=Path) + parser.add_argument("--metadata", type=Path) + parser.add_argument("--trunc-len-f", type=int) + parser.add_argument("--trunc-len-r", type=int) + parser.add_argument("--sampling-depth", type=int, default=1000) + parser.add_argument("--threads", type=int, default=4) + parser.add_argument("--profile") + parser.add_argument( + "--bundle-root", + action="append", + default=[], + help="Resource bundle override formatted as bundle=/path. 
May be repeated.", + ) + parser.add_argument( + "--include-optional-resources", + action="store_true", + help="Include optional taxonomy databases such as GTDB in readiness checks.", + ) + parser.add_argument("--resource-checksums", action="store_true") + parser.add_argument( + "--require-resource-plan", + action="store_true", + help="Treat missing registered taxonomy/reference bundles as blocking for this direct runner.", + ) + parser.add_argument( + "--skip-resource-plan", + action="store_true", + help="Skip registered taxonomy database readiness checks.", + ) + parser.add_argument("--outdir", type=Path) + parser.add_argument("--run-id", default=slug_timestamp("amplicon-microbiome-backend")) + parser.add_argument("--execute", action="store_true") + return parser.parse_args() + + +def serializable_args(args: argparse.Namespace) -> dict[str, Any]: + return { + key: str(value) if isinstance(value, Path) else value for key, value in vars(args).items() + } + + +def main() -> int: + args = parse_args() + run_dir = (args.outdir or (DEFAULT_RUN_ROOT / args.run_id)).expanduser().resolve() + if run_dir.exists(): + raise FileExistsError(f"run directory already exists: {run_dir}") + run_dir.mkdir(parents=True) + (run_dir / "logs").mkdir(parents=True, exist_ok=True) + input_validation, samples = validate_inputs(args) + resource_plan = ngs_resource_gate.write_pipeline_resource_plan( + run_dir=run_dir, + pipeline="amplicon_microbiome", + bundle_roots=args.bundle_root, + include_optional=args.include_optional_resources, + include_checksums=args.resource_checksums, + skip=args.skip_resource_plan, + required=args.require_resource_plan, + ) + validation = ngs_resource_gate.merge_resource_status( + input_validation, resource_plan, required=args.require_resource_plan + ) + required = {"qiime2": ["qiime"], "dada2": ["Rscript"], "nf-core/ampliseq": ["nextflow"]}[ + args.backend + ] + tool_status = tool_preflight(required, optional=["cutadapt", "multiqc"]) + if args.backend == "dada2": 
+ package_status = r_package_preflight(["dada2"]) + tool_status = merge_tool_status(tool_status, package_status) + plan = build_plan(args, samples) + write_qiime2_manifest(run_dir, samples) + write_json(run_dir / "config.json", {**serializable_args(args), "run_dir": str(run_dir)}) + write_json(run_dir / "validation" / "input_validation_summary.json", input_validation) + write_json(run_dir / "validation" / "validation_summary.json", validation) + write_json(run_dir / "validation" / "tool_preflight.json", tool_status) + write_json( + run_dir / "versions" / "software_versions.json", + software_versions( + { + "qiime": ["qiime", "--version"], + "Rscript": ["Rscript", "--version"], + "nextflow": ["nextflow", "-version"], + } + ), + ) + write_outputs(run_dir, validation, samples, plan) + dry_run = { + "ok": validation["ok"] and tool_status["ok"], + "detail": "amplicon backend inputs, primers, taxonomy resources, and tools validated", + } + write_json(run_dir / "logs" / "validation_dry_run.json", dry_run) + status = "blocked" if not dry_run["ok"] else "validated" + execution = None + if args.execute and dry_run["ok"]: + execution = execute_plan(run_dir, plan) + status = "completed" if execution.get("ok") else "failed" + normalize_backend_exports(run_dir) + write_amplicon_review_outputs(run_dir) + visuals = write_visuals(run_dir, status, validation, resource_plan) + resource_outputs = ngs_resource_gate.resource_output_paths(resource_plan) + write_standard_manifest( + run_dir, + run_id=args.run_id, + lane="amplicon_microbiome", + workflow=f"backend_{args.backend}", + status=status, + execute_requested=args.execute, + validation=validation, + tool_preflight_result=tool_status, + dry_run=dry_run, + execution=execution, + inputs={ + "sample_sheet": str(args.sample_sheet.expanduser().resolve()), + "metadata": str(args.metadata.expanduser().resolve()) if args.metadata else None, + "taxonomy_classifier": str(args.taxonomy_classifier.expanduser().resolve()) + if 
args.taxonomy_classifier + else None, + **( + {"resource_plan": resource_outputs.get("resource_plan")} if resource_outputs else {} + ), + }, + outputs={ + "sample_table": "validation/samples.normalized.tsv", + "command_plan": "workflow/amplicon_backend_command_plan.json", + "methods": "methods/amplicon_backend_methods.json", + "backend_summary": "tables/amplicon_backend_summary.json", + "diversity_summary": "tables/amplicon_diversity_summary.json", + "asv_table": "tables/asv_table.tsv", + "taxonomy_table": "tables/taxonomy.tsv", + "alpha_diversity": "tables/alpha_diversity.tsv", + "bray_curtis_distance": "tables/bray_curtis_distance.tsv", + "top_taxa_or_features": "tables/top_taxa_or_features.tsv", + "read_retention": "tables/read_retention.tsv", + "backend_dashboard": "visualizations/amplicon_backend_dashboard.html", + "sample_depth_plot": "visualizations/amplicon_sample_depth.svg", + "alpha_diversity_plot": "visualizations/amplicon_alpha_diversity.svg", + "top_taxa_plot": "visualizations/amplicon_top_taxa.svg", + **resource_outputs, + **visuals, + }, + method={ + "backend": args.backend, + "marker": args.marker, + "primer_forward": args.primer_forward, + "primer_reverse": args.primer_reverse, + "resource_plan": resource_plan, + }, + audit={"resource_readiness": resource_plan} if resource_plan else None, + review_bundle=visuals, + ) + write_summary(run_dir, status, validation, resource_plan, tool_status) + write_json(run_dir / "artifact_index.json", build_artifact_index(run_dir)) + print(run_dir) + return 1 if status in {"blocked", "failed"} else 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/plugins/ngs-analysis/scripts/run_atacseq_peaks_qc.py b/plugins/ngs-analysis/scripts/run_atacseq_peaks_qc.py new file mode 100644 index 0000000..f292020 --- /dev/null +++ b/plugins/ngs-analysis/scripts/run_atacseq_peaks_qc.py @@ -0,0 +1,732 @@ +#!/usr/bin/env python3 +"""Run or plan local ATAC-seq alignment, QC, peak, signal, and FRiP artifacts.""" + 
+from __future__ import annotations + +import argparse +from pathlib import Path +from typing import Any + +import ngs_resource_gate +from ngs_epigenomics_utils import summarize_epigenomics_outputs +from ngs_planner_utils import ( + command_plan_entry, + normalize_sample_name, + read_table, + resolve_path, + shell_join, + write_command_script, + write_tsv, +) +from ngs_run_utils import ( + build_artifact_index, + run_cmd, + slug_timestamp, + software_versions, + tool_preflight, + write_json, + write_standard_manifest, + write_text, +) +from ngs_visualization_utils import artifact_entry, write_visualization_index + +WORKSPACE_ROOT = Path.cwd() +DEFAULT_RUN_ROOT = WORKSPACE_ROOT / "ngs_runs" / "atacseq_peaks_qc" + + +def validate_inputs(args: argparse.Namespace) -> tuple[dict[str, Any], list[dict[str, str]]]: + sample_sheet = args.sample_sheet.expanduser().resolve() + errors: list[str] = [] + warnings: list[str] = [] + rows: list[dict[str, str]] = [] + columns: list[str] = [] + samples: list[dict[str, str]] = [] + if not sample_sheet.exists(): + errors.append(f"sample sheet does not exist: {sample_sheet}") + else: + rows, columns = read_table(sample_sheet) + if not args.bam_only and not args.bowtie2_index: + warnings.append( + "no --bowtie2-index was provided; FASTQ rows can only be planned, not aligned" + ) + if not args.genome_size: + errors.append( + "--genome-size is required for MACS2 peak calling, e.g. 
hs, mm, or an effective genome size" + ) + blacklist = args.blacklist_bed.expanduser().resolve() if args.blacklist_bed else None + if blacklist and not blacklist.exists(): + errors.append(f"blacklist BED does not exist: {blacklist}") + tss_bed = args.tss_bed.expanduser().resolve() if args.tss_bed else None + if tss_bed and not tss_bed.exists(): + warnings.append( + f"TSS BED does not exist; TSS enrichment commands will be skipped: {tss_bed}" + ) + if getattr(args, "run_motifs", False) and not getattr(args, "motif_genome", None): + errors.append( + "--run-motifs requires --motif-genome, for example hg38, mm10, or a HOMER genome identifier" + ) + + for row_index, row in enumerate(rows, start=2): + sample = normalize_sample_name( + row.get("sample") or row.get("sample_id"), f"row_{row_index}" + ) + bam = resolve_path(row.get("bam") or row.get("alignment"), sample_sheet.parent) + r1 = resolve_path(row.get("r1") or row.get("fastq_1"), sample_sheet.parent) + r2 = resolve_path(row.get("r2") or row.get("fastq_2"), sample_sheet.parent) + if bam: + if not bam.exists(): + errors.append(f"row {row_index}: BAM does not exist: {bam}") + layout = "bam" + elif r1: + if not r1.exists(): + errors.append(f"row {row_index}: R1 FASTQ does not exist: {r1}") + if r2 and not r2.exists(): + errors.append(f"row {row_index}: R2 FASTQ does not exist: {r2}") + layout = "fastq_pe" if r2 else "fastq_se" + else: + errors.append(f"row {row_index}: provide bam/alignment or r1/fastq_1") + continue + samples.append( + { + "sample": sample, + "condition": row.get("condition", ""), + "replicate": row.get("replicate", ""), + "layout": layout, + "bam": str(bam) if bam else "", + "r1": str(r1) if r1 else "", + "r2": str(r2) if r2 else "", + "row_index": str(row_index), + } + ) + if not samples: + errors.append("no usable ATAC-seq samples found") + validation = { + "ok": not errors, + "sample_sheet": str(sample_sheet), + "columns": columns, + "sample_count": len(samples), + "blacklist_bed": str(blacklist) 
if blacklist else None, + "tss_bed": str(tss_bed) if tss_bed else None, + "genome_size": args.genome_size, + "run_motifs": getattr(args, "run_motifs", False), + "motif_genome": getattr(args, "motif_genome", None), + "motif_size": getattr(args, "motif_size", None), + "errors": errors, + "warnings": warnings, + } + return validation, samples + + +def sample_bam_path(sample: dict[str, str]) -> str: + return ( + sample["bam"] if sample["layout"] == "bam" else f"alignment/{sample['sample']}.sorted.bam" + ) + + +def build_plan(args: argparse.Namespace, samples: list[dict[str, str]]) -> list[dict[str, Any]]: + plan: list[dict[str, Any]] = [] + for sample in samples: + name = sample["sample"] + bam = sample_bam_path(sample) + filtered_bam = f"alignment/{name}.filtered.bam" + if sample["layout"].startswith("fastq"): + bowtie = [ + "bowtie2", + "-x", + args.bowtie2_index or "MISSING_BOWTIE2_INDEX", + "-p", + str(args.threads), + ] + if sample["r2"]: + bowtie.extend(["-1", sample["r1"], "-2", sample["r2"]]) + else: + bowtie.extend(["-U", sample["r1"]]) + plan.append( + command_plan_entry( + f"{name}: align and sort", + f"{shell_join(bowtie)} | {shell_join(['samtools', 'sort', '-@', str(args.threads), '-o', bam, '-'])}", + outputs=[bam], + ) + ) + plan.append( + command_plan_entry(f"{name}: index aligned BAM", ["samtools", "index", bam]) + ) + plan.append( + command_plan_entry( + f"{name}: filter alignment", + [ + "samtools", + "view", + "-b", + "-q", + str(args.min_mapq), + "-F", + "1804", + "-o", + filtered_bam, + bam, + ], + outputs=[filtered_bam], + ) + ) + plan.append( + command_plan_entry(f"{name}: index filtered BAM", ["samtools", "index", filtered_bam]) + ) + plan.append( + command_plan_entry( + f"{name}: flagstat", + f"{shell_join(['samtools', 'flagstat', filtered_bam])} > {shell_join([f'qc/{name}.flagstat.txt'])}", + outputs=[f"qc/{name}.flagstat.txt"], + ) + ) + plan.append( + command_plan_entry( + f"{name}: insert sizes", + f"{shell_join(['samtools', 'view', '-f', 
'2', filtered_bam])} | awk '{{t=$9; if (t<0) t=-t; if (t>0) print t}}' > {shell_join([f'qc/{name}.insert_sizes.txt'])}", + outputs=[f"qc/{name}.insert_sizes.txt"], + ) + ) + peak_cmd: list[str | Path] = [ + "macs2", + "callpeak", + "-t", + filtered_bam, + "-f", + "BAMPE", + "-g", + args.genome_size, + "-n", + name, + "--outdir", + "peaks", + "--keep-dup", + "all", + ] + plan.append( + command_plan_entry( + f"{name}: MACS2 peaks", peak_cmd, outputs=[f"peaks/{name}_peaks.narrowPeak"] + ) + ) + if args.blacklist_bed: + plan.append( + command_plan_entry( + f"{name}: blacklist-filter peaks", + f"{shell_join(['bedtools', 'intersect', '-v', '-a', f'peaks/{name}_peaks.narrowPeak', '-b', args.blacklist_bed.expanduser().resolve()])} > {shell_join([f'peaks/{name}.blacklist_filtered.narrowPeak'])}", + outputs=[f"peaks/{name}.blacklist_filtered.narrowPeak"], + ) + ) + plan.append( + command_plan_entry( + f"{name}: FRiP numerator", + f"{shell_join(['bedtools', 'intersect', '-u', '-abam', filtered_bam, '-b', f'peaks/{name}_peaks.narrowPeak'])} | {shell_join(['samtools', 'view', '-c', '-'])} > {shell_join([f'qc/{name}.frip_reads.txt'])}", + outputs=[f"qc/{name}.frip_reads.txt"], + ) + ) + plan.append( + command_plan_entry( + f"{name}: total filtered reads", + f"{shell_join(['samtools', 'view', '-c', filtered_bam])} > {shell_join([f'qc/{name}.filtered_reads.txt'])}", + outputs=[f"qc/{name}.filtered_reads.txt"], + ) + ) + plan.append( + command_plan_entry( + f"{name}: bigWig signal", + [ + "bamCoverage", + "-b", + filtered_bam, + "-o", + f"tracks/{name}.bw", + "--numberOfProcessors", + str(args.threads), + ], + outputs=[f"tracks/{name}.bw"], + ) + ) + if args.tss_bed: + plan.append( + command_plan_entry( + f"{name}: TSS enrichment matrix", + [ + "computeMatrix", + "reference-point", + "-S", + f"tracks/{name}.bw", + "-R", + args.tss_bed.expanduser().resolve(), + "--referencePoint", + "TSS", + "-b", + "2000", + "-a", + "2000", + "-o", + f"qc/{name}.tss_matrix.gz", + ], + 
outputs=[f"qc/{name}.tss_matrix.gz"], + ) + ) + plan.append( + command_plan_entry( + f"{name}: TSS enrichment profile", + [ + "plotProfile", + "-m", + f"qc/{name}.tss_matrix.gz", + "-out", + f"qc/{name}.tss_profile.png", + "--plotTitle", + f"{name} TSS enrichment", + ], + outputs=[f"qc/{name}.tss_profile.png"], + ) + ) + plan.append( + command_plan_entry( + f"{name}: TSS enrichment heatmap", + [ + "plotHeatmap", + "-m", + f"qc/{name}.tss_matrix.gz", + "-out", + f"qc/{name}.tss_heatmap.png", + "--plotTitle", + f"{name} TSS enrichment", + ], + outputs=[f"qc/{name}.tss_heatmap.png"], + ) + ) + if getattr(args, "run_motifs", False): + motif_genome = getattr(args, "motif_genome", None) or "MISSING_MOTIF_GENOME" + motif_size = str(getattr(args, "motif_size", "given")) + motif_peak = ( + f"peaks/{name}.blacklist_filtered.narrowPeak" + if args.blacklist_bed + else f"peaks/{name}_peaks.narrowPeak" + ) + plan.append( + command_plan_entry( + f"{name}: motif enrichment", + [ + "findMotifsGenome.pl", + motif_peak, + motif_genome, + f"motifs/{name}", + "-size", + motif_size, + ], + outputs=[f"motifs/{name}/knownResults.txt", f"motifs/{name}/homerResults.html"], + ) + ) + plan.append( + command_plan_entry( + "consensus peak merge", + f"cat peaks/*_peaks.narrowPeak 2>/dev/null | sort -k1,1 -k2,2n | {shell_join(['bedtools', 'merge', '-i', '-'])} > peaks/consensus_peaks.bed", + outputs=["peaks/consensus_peaks.bed"], + ) + ) + return plan + + +def write_outputs( + run_dir: Path, + validation: dict[str, Any], + samples: list[dict[str, str]], + plan: list[dict[str, Any]], +) -> None: + write_tsv( + run_dir / "validation" / "samples.normalized.tsv", + samples, + ["sample", "condition", "replicate", "layout", "bam", "r1", "r2", "row_index"], + ) + write_json(run_dir / "workflow" / "atacseq_command_plan.json", {"commands": plan}) + write_command_script(run_dir / "commands.sh", [item["command"] for item in plan]) + write_json( + run_dir / "qc" / "atac_qc_contract.json", + { + 
"required_review_metrics": [ + "alignment_rate", + "duplicate_rate", + "mitochondrial_fraction", + "insert_size_periodicity", + "TSS_enrichment", + "FRiP", + "blacklist_overlap", + "replicate_concordance", + ], + "available_after_execution": [ + "qc/*.flagstat.txt", + "qc/*.insert_sizes.txt", + "qc/*.frip_reads.txt", + "qc/*.filtered_reads.txt", + "peaks/*.narrowPeak", + "tracks/*.bw", + ], + "warnings": validation.get("warnings", []), + }, + ) + summarize_epigenomics_outputs( + run_dir, samples, peak_mode="narrow", output_prefix="atacseq_qc", title="ATAC-seq" + ) + + +def execute_plan(run_dir: Path, plan: list[dict[str, Any]]) -> dict[str, Any]: + for dirname in ["alignment", "qc", "peaks", "tracks", "logs", "motifs"]: + (run_dir / dirname).mkdir(parents=True, exist_ok=True) + result: dict[str, Any] = {"ok": True, "steps": []} + for index, item in enumerate(plan, start=1): + step = run_cmd(["bash", "-c", item["command"]], run_dir, timeout=7200) + safe = item["name"].replace(":", "").replace(" ", "_").replace("/", "_") + write_json(run_dir / "logs" / f"{index:02d}_{safe}.json", step) + result["steps"].append({"name": item["name"], "ok": step.get("ok")}) + result["ok"] = bool(result["ok"] and step.get("ok")) + if not step.get("ok"): + break + return result + + +def write_visuals( + run_dir: Path, + status: str, + validation: dict[str, Any], + resource_plan: dict[str, Any] | None = None, +) -> dict[str, str]: + entries = [ + artifact_entry( + artifact_id="samples", + title="ATAC Samples", + path="validation/samples.normalized.tsv", + kind="table", + status="created", + description="Normalized ATAC sample table.", + ), + artifact_entry( + artifact_id="command_plan", + title="ATAC Command Plan", + path="workflow/atacseq_command_plan.json", + kind="json", + status="created", + description="Alignment, filtering, peak, FRiP, signal, and TSS command plan.", + ), + artifact_entry( + artifact_id="qc_contract", + title="ATAC QC Contract", + path="qc/atac_qc_contract.json", + 
kind="json", + status="created", + description="Metrics required for interpretation and artifacts expected after execution.", + ), + artifact_entry( + artifact_id="qc_summary", + title="ATAC QC Summary", + path="qc/atacseq_qc_summary.tsv", + kind="table", + status="created", + description="Parsed per-sample alignment, insert-size, FRiP, peak, TSS, motif, and track state.", + ), + artifact_entry( + artifact_id="qc_dashboard", + title="ATAC QC Dashboard", + path="qc/atacseq_qc_dashboard.html", + kind="html", + status="created", + description="Native dashboard summarizing FRiP, peak counts, insert sizes, track state, and caveats.", + ), + artifact_entry( + artifact_id="frip_peak_overview", + title="FRiP And Peak Plot", + path="qc/atacseq_qc_frip_peak_overview.svg", + kind="svg", + status="created", + description="Compact FRiP and peak-count plot generated from parsed run artifacts.", + ), + artifact_entry( + artifact_id="insert_size_distribution", + title="Insert-Size Plot", + path="qc/atacseq_qc_insert_size_distribution.svg", + kind="svg", + status="created", + description="Native insert-size distribution plot generated from parsed fragment sizes.", + ), + artifact_entry( + artifact_id="browser_tracks", + title="Browser Track Manifest", + path="tracks/browser_tracks.tsv", + kind="table", + status="created", + description="bigWig track lines and IGV/UCSC browser handoff metadata.", + ), + artifact_entry( + artifact_id="browser_track_preview", + title="Browser Track Preview", + path="tracks/browser_track_preview.html", + kind="html", + status="created", + description="HTML preview of bigWig track paths and UCSC track lines.", + ), + artifact_entry( + artifact_id="consensus_peaks", + title="Consensus Peaks", + path="peaks/consensus_peaks.bed", + kind="bed", + status="created" + if (run_dir / "peaks" / "consensus_peaks.bed").exists() + else "not_available", + description="Merged consensus peak set after execution.", + ), + artifact_entry( + artifact_id="motif_summary", + 
title="Motif Summary", + path="motifs/motif_summary.tsv", + kind="table", + status="created", + description="Motif-enrichment output summary when motif backend outputs are present.", + ), + ] + entries.extend(ngs_resource_gate.resource_visual_entries(resource_plan)) + index = write_visualization_index( + run_dir, + title="ATAC-seq Peaks QC Review", + description="Review surface for ATAC-seq alignment, peak, FRiP, TSS, and track outputs.", + entries=entries, + notes=[ + *validation.get("warnings", []), + *ngs_resource_gate.resource_messages(resource_plan), + ], + analysis_intent="real_analysis" if status != "blocked" else "blocked_preflight", + provenance_summary={ + "status": status, + "sample_count": validation.get("sample_count", 0), + "resource_plan_ok": validation.get("resource_plan_ok"), + }, + ) + return { + "visualization_index": str(index.relative_to(run_dir)), + "visualization_manifest": "visualizations/visualization_manifest.json", + } + + +def write_summary( + run_dir: Path, + status: str, + validation: dict[str, Any], + resource_plan: dict[str, Any] | None = None, +) -> None: + lines = [ + "# ATAC-seq Peaks QC Run Summary", + "", + f"Status: `{status}`", + f"Samples parsed: `{validation.get('sample_count', 0)}`", + "", + "## Key Artifacts", + "", + "- `workflow/atacseq_command_plan.json`", + "- `qc/atac_qc_contract.json`", + "- `qc/atacseq_qc_summary.tsv` and `qc/atacseq_qc_summary.json`", + "- `qc/atacseq_qc_dashboard.html`, `qc/atacseq_qc_frip_peak_overview.svg`, and `qc/atacseq_qc_insert_size_distribution.svg`", + "- `peaks/*.narrowPeak`, `peaks/consensus_peaks.bed`, and `tracks/*.bw` when executed", + "- `tracks/browser_tracks.tsv`, `tracks/browser_track_preview.html`, `tracks/ucsc_track_lines.txt`, and `tracks/igv_session.xml`", + "- `motifs/motif_summary.tsv` when motif outputs are generated", + "- `resources/resource_plan.json`, `resource_manifest.tsv`, `resource_env.sh`, `resource_readiness.md`, and resource setup-plan artifacts", + "- 
`visualizations/index.html`", + "- `run_manifest.json` and `artifact_index.json`", + "", + ] + if validation.get("warnings"): + lines.extend(["## Warnings", ""]) + lines.extend(f"- {item}" for item in validation["warnings"]) + lines.append("") + lines.extend(ngs_resource_gate.resource_summary_lines(resource_plan)) + if validation.get("errors"): + lines.extend(["## Blockers", ""]) + lines.extend(f"- {item}" for item in validation["errors"]) + write_text(run_dir / "summary.md", "\n".join(lines) + "\n") + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--sample-sheet", type=Path, required=True) + parser.add_argument("--bowtie2-index") + parser.add_argument("--bam-only", action="store_true") + parser.add_argument("--genome-size", required=True) + parser.add_argument("--blacklist-bed", type=Path) + parser.add_argument("--tss-bed", type=Path) + parser.add_argument( + "--genome-build", + help="Genome build or registry alias for resource readiness, e.g. GRCh38, mm39, or a configured local alias.", + ) + parser.add_argument( + "--bundle-root", + action="append", + default=[], + help="Resource bundle override formatted as bundle=/path. 
def serializable_args(args: argparse.Namespace) -> dict[str, Any]:
    """Return the parsed CLI arguments as a plain dict suitable for JSON output.

    ``Path`` values are converted to strings; every other value is passed
    through unchanged.
    """
    payload: dict[str, Any] = {}
    for option, raw in vars(args).items():
        payload[option] = str(raw) if isinstance(raw, Path) else raw
    return payload
any(row["layout"].startswith("fastq") for row in samples) + required_tools = ( + ["samtools", "macs2", "bedtools", "bamCoverage"] + (["bowtie2"] if needs_alignment else []) + if args.execute + else [] + ) + if args.execute and args.tss_bed: + required_tools.extend(["computeMatrix", "plotProfile", "plotHeatmap"]) + if args.execute and args.run_motifs: + required_tools.append("findMotifsGenome.pl") + optional_tools = [ + name + for name in [ + "samtools", + "macs2", + "bedtools", + "bamCoverage", + "bowtie2", + "computeMatrix", + "plotProfile", + "plotHeatmap", + "findMotifsGenome.pl", + "multiqc", + ] + if name not in required_tools + ] + tool_status = tool_preflight(required_tools, optional=optional_tools) + plan = build_plan(args, samples) + write_json(run_dir / "config.json", {**serializable_args(args), "run_dir": str(run_dir)}) + write_json(run_dir / "validation" / "input_validation_summary.json", input_validation) + write_json(run_dir / "validation" / "validation_summary.json", validation) + write_json(run_dir / "validation" / "tool_preflight.json", tool_status) + write_json( + run_dir / "versions" / "software_versions.json", + software_versions( + { + "samtools": ["samtools", "--version"], + "macs2": ["macs2", "--version"], + "bedtools": ["bedtools", "--version"], + "bowtie2": ["bowtie2", "--version"], + "bamCoverage": ["bamCoverage", "--version"], + } + ), + ) + write_outputs(run_dir, validation, samples, plan) + dry_run = { + "ok": validation["ok"] and (tool_status["ok"] if args.execute else True), + "detail": "ATAC sample, metadata, and backend tool validation completed", + } + write_json(run_dir / "logs" / "validation_dry_run.json", dry_run) + status = "blocked" if not dry_run["ok"] else "validated" + execution = None + if args.execute and dry_run["ok"]: + execution = execute_plan(run_dir, plan) + status = "completed" if execution.get("ok") else "failed" + summarize_epigenomics_outputs( + run_dir, samples, peak_mode="narrow", output_prefix="atacseq_qc", 
title="ATAC-seq" + ) + visuals = write_visuals(run_dir, status, validation, resource_plan) + resource_outputs = ngs_resource_gate.resource_output_paths(resource_plan) + write_standard_manifest( + run_dir, + run_id=args.run_id, + lane="atacseq_peaks_qc", + workflow="local_light_atacseq_alignment_peaks_qc", + status=status, + execute_requested=args.execute, + validation=validation, + tool_preflight_result=tool_status, + dry_run=dry_run, + execution=execution, + inputs={ + "sample_sheet": str(args.sample_sheet.expanduser().resolve()), + "blacklist_bed": str(args.blacklist_bed.expanduser().resolve()) + if args.blacklist_bed + else None, + "tss_bed": str(args.tss_bed.expanduser().resolve()) if args.tss_bed else None, + **( + {"resource_plan": resource_outputs.get("resource_plan")} if resource_outputs else {} + ), + }, + outputs={ + "sample_table": "validation/samples.normalized.tsv", + "command_plan": "workflow/atacseq_command_plan.json", + "qc_contract": "qc/atac_qc_contract.json", + "qc_summary": "qc/atacseq_qc_summary.tsv", + "qc_summary_json": "qc/atacseq_qc_summary.json", + "qc_dashboard": "qc/atacseq_qc_dashboard.html", + "frip_peak_overview": "qc/atacseq_qc_frip_peak_overview.svg", + "insert_size_distribution": "qc/atacseq_qc_insert_size_distribution.svg", + "peaks": "peaks/*.narrowPeak", + "consensus_peaks": "peaks/consensus_peaks.bed", + "tracks": "tracks/*.bw", + "browser_tracks": "tracks/browser_tracks.tsv", + "browser_track_preview": "tracks/browser_track_preview.html", + "igv_session": "tracks/igv_session.xml", + "motif_summary": "motifs/motif_summary.tsv", + **resource_outputs, + **visuals, + }, + method={ + "peak_caller": "MACS2", + "frip": "bedtools intersect + samtools count", + "tss_enrichment": "deepTools computeMatrix/plotProfile/plotHeatmap when --tss-bed is supplied", + "motif_enrichment": "HOMER findMotifsGenome.pl when --run-motifs is supplied", + "resource_plan": resource_plan, + }, + audit={"resource_readiness": resource_plan} if resource_plan 
def parse_runinfo(path: Path) -> dict[str, Any]:
    """Parse an Illumina ``RunInfo.xml`` into a validation-friendly summary.

    Args:
        path: Location of ``RunInfo.xml``.

    Returns:
        A dict with ``path``, ``exists``, ``ok``, ``reads`` and ``errors``.
        When the file parses, it also carries ``run_id``, ``flowcell`` and
        ``instrument`` (if a ``Run`` element is present) plus
        ``indexed_reads`` and ``sequencing_reads``. ``ok`` is true only when
        no error was recorded. Problems are reported through ``errors``
        rather than raised, so callers can fold them into their own
        validation result.
    """
    result: dict[str, Any] = {
        "path": str(path),
        "exists": path.exists(),
        "ok": False,
        "reads": [],
        "errors": [],
    }
    if not path.exists():
        result["errors"].append("RunInfo.xml is missing")
        return result
    try:
        root = ET.parse(path).getroot()
    except ET.ParseError as exc:
        result["errors"].append(f"RunInfo.xml parse failed: {exc}")
        return result

    run = root.find(".//Run")
    if run is not None:
        result["run_id"] = run.attrib.get("Id")
        # RunInfo.xml stores Flowcell/Instrument as child elements of <Run>;
        # fall back to same-named attributes for any non-standard producer.
        for key, tag in (("flowcell", "Flowcell"), ("instrument", "Instrument")):
            text = (run.findtext(tag) or "").strip()
            result[key] = text or run.attrib.get(tag)

    read_nodes = root.findall(".//Reads/Read")
    reads = []
    for node in read_nodes:
        try:
            number = int(node.attrib.get("Number", "0") or 0)
            num_cycles = int(node.attrib.get("NumCycles", "0") or 0)
        except ValueError:
            # A malformed attribute is a validation failure, not a crash.
            result["errors"].append(
                f"RunInfo.xml Read entry has a non-integer Number/NumCycles: {dict(node.attrib)}"
            )
            continue
        reads.append(
            {
                "number": number,
                "num_cycles": num_cycles,
                "is_indexed_read": node.attrib.get("IsIndexedRead", "").lower() == "y",
            }
        )

    result["reads"] = sorted(reads, key=lambda item: item["number"])
    result["indexed_reads"] = [item for item in result["reads"] if item["is_indexed_read"]]
    result["sequencing_reads"] = [item for item in result["reads"] if not item["is_indexed_read"]]
    if not read_nodes:
        result["errors"].append("RunInfo.xml does not contain Reads/Read entries")
    result["ok"] = not result["errors"]
    return result
encoding="utf-8-sig") as handle: + parsed = list(csv.reader(handle)) + + current_section: str | None = None + data_header: list[str] | None = None + in_data = False + for raw_row in parsed: + row = [item.strip() for item in raw_row] + if not row or not any(row): + continue + first = normalize_header(row[0]) + if first.startswith("[") and first.endswith("]"): + current_section = first.strip("[]") + result["sections"].append(current_section) + in_data = current_section.lower().endswith("data") or current_section.lower() == "data" + data_header = None + continue + if current_section is None: + if data_header is None: + data_header = [normalize_header(item) for item in row] + result["data_columns"] = data_header + in_data = True + else: + values = row + [""] * (len(data_header) - len(row)) + result["data_rows"].append(dict(zip(data_header, values))) + continue + section_key = current_section.lower() + if section_key == "reads": + try: + result["reads"].append(int(row[0])) + except ValueError: + result["warnings"].append(f"could not parse [Reads] row: {','.join(row)}") + continue + if section_key in {"settings", "header"} and len(row) >= 2: + result["settings"][row[0]] = row[1] + continue + if in_data: + if data_header is None: + data_header = [normalize_header(item) for item in row] + result["data_columns"] = data_header + else: + values = row + [""] * (len(data_header) - len(row)) + result["data_rows"].append(dict(zip(data_header, values))) + + data_rows = result["data_rows"] + if not data_rows: + result["errors"].append("sample sheet does not contain data rows") + return result + + duplicate_keys: set[tuple[str, str, str]] = set() + for index, row in enumerate(data_rows, start=1): + sample = ( + row.get("Sample_ID") + or row.get("SampleID") + or row.get("Sample_Name") + or row.get("sample") + or "" + ) + i7 = row.get("index") or row.get("Index") or row.get("I7_Index_ID") or "" + i5 = row.get("index2") or row.get("Index2") or row.get("I5_Index_ID") or "" + lane = 
def validate_index_lengths(runinfo: dict[str, Any], sample_sheet: dict[str, Any]) -> list[str]:
    """Cross-check sample-sheet read/index lengths against RunInfo cycles.

    Args:
        runinfo: Parsed RunInfo summary; ``indexed_reads`` and
            ``sequencing_reads`` entries are read for their ``num_cycles``.
        sample_sheet: Parsed sample-sheet summary; ``reads``,
            ``index_lengths`` and ``index2_lengths`` are read.

    Returns:
        Warning strings (possibly empty). Index checks are skipped entirely
        when RunInfo lists no indexed reads.
    """
    notes: list[str] = []
    index_cycles = [read.get("num_cycles", 0) for read in runinfo.get("indexed_reads", [])]
    template_cycles = [read.get("num_cycles", 0) for read in runinfo.get("sequencing_reads", [])]

    declared_reads = sample_sheet.get("reads") or []
    if declared_reads and template_cycles:
        # Only the leading RunInfo reads are compared, one per declared read.
        if declared_reads != template_cycles[: len(declared_reads)]:
            notes.append(
                f"sample sheet [Reads] values do not match RunInfo sequencing reads: sample sheet {declared_reads}, RunInfo {template_cycles}"
            )

    if not index_cycles:
        return notes

    i7_lengths = sample_sheet.get("index_lengths") or []
    i5_lengths = sample_sheet.get("index2_lengths") or []

    if i7_lengths and max(i7_lengths) > index_cycles[0]:
        notes.append(
            f"i7 index length exceeds RunInfo index read cycles: sample sheet {i7_lengths}, RunInfo {index_cycles[0]}"
        )

    if i5_lengths:
        if len(index_cycles) >= 2:
            if max(i5_lengths) > index_cycles[1]:
                notes.append(
                    f"i5 index length exceeds RunInfo index read cycles: sample sheet {i5_lengths}, RunInfo {index_cycles[1]}"
                )
        else:
            notes.append(
                "sample sheet has index2 values but RunInfo.xml has fewer than two indexed reads"
            )
    return notes
def conversion_command(converter: str, args: argparse.Namespace) -> list[str]:
    """Build the argv list for the selected BCL converter.

    ``bcl2fastq`` (matched on the executable's base name) uses
    ``--runfolder-dir``/``--output-dir``; any other converter gets the
    bcl-convert flags ``--bcl-input-directory``/``--output-directory``.
    All three paths are expanded and resolved to absolute strings.

    Args:
        converter: Converter executable name or path.
        args: Parsed CLI arguments providing ``run_folder``,
            ``sample_sheet`` and ``output_directory`` paths.

    Returns:
        The command as a list of strings, converter first.
    """
    run_folder = str(args.run_folder.expanduser().resolve())
    sample_sheet = str(args.sample_sheet.expanduser().resolve())
    output_directory = str(args.output_directory.expanduser().resolve())

    # A bare "bcl2fastq" has itself as base name, so one check covers both forms.
    if Path(converter).name == "bcl2fastq":
        input_flag, output_flag = "--runfolder-dir", "--output-dir"
    else:
        input_flag, output_flag = "--bcl-input-directory", "--output-directory"

    return [
        converter,
        input_flag,
        run_folder,
        output_flag,
        output_directory,
        "--sample-sheet",
        sample_sheet,
    ]
def _lane_and_read_from_fastq_name(file_name: str) -> tuple[str | None, str | None]:
    """Return the ``(lane, read)`` tokens found in a FASTQ file name, or ``None`` for each one absent."""
    lane: str | None = None
    read: str | None = None
    stem = file_name.split(".", 1)[0]
    # The last matching token wins, so an "R1"-like fragment inside a sample
    # name is overridden by the real read token near the end of the name.
    for token in stem.split("_"):
        digits = token[1:]
        if not (digits.isascii() and digits.isdigit()):
            continue
        if len(token) == 4 and token[0] == "L":
            lane = str(int(digits))
        elif len(token) == 2 and token[0] in "RI":
            read = token
    return lane, read


def summarize_fastq_outputs(
    output_directory: Path, demux_rows: list[dict[str, str]]
) -> list[dict[str, Any]]:
    """List the FASTQ files produced by a conversion with size and read counts.

    Files come from two places: the ``Read1File``/``Read2File`` columns of
    ``Reports/fastq_list.csv`` and any ``Undetermined*.fastq.gz`` directly
    inside ``output_directory``.

    Args:
        output_directory: Converter output directory.
        demux_rows: Rows of the demultiplexing statistics table; ``SampleID``,
            ``Lane`` and ``# Reads`` are read from each row.

    Returns:
        One dict per FASTQ file with ``sample``, ``lane``, ``read``, ``path``,
        ``bytes`` and ``read_pairs``. ``read_pairs`` is the per-lane
        ``# Reads`` value when the statistics table has a row for that
        sample and lane, otherwise the sample's total across all rows.
    """
    per_lane: dict[tuple[str, str], int | None] = {}
    per_sample: dict[str, int | None] = {}
    for row in demux_rows:
        sample = row.get("SampleID")
        if not sample:
            continue
        count = parse_int(row.get("# Reads"))
        lane = (row.get("Lane") or "").strip()
        if lane:
            per_lane[(sample, lane)] = count
        # Sum across lanes instead of letting the last lane's row win.
        if count is None:
            per_sample.setdefault(sample, None)
        else:
            per_sample[sample] = count + (per_sample.get(sample) or 0)

    def read_count(sample: str | None, lane: str | None) -> int | None:
        key = (sample or "", (lane or "").strip())
        if key in per_lane:
            return per_lane[key]
        return per_sample.get(sample or "")

    rows = []
    reports_dir = output_directory / "Reports"
    for entry in read_csv_rows(reports_dir / "fastq_list.csv"):
        for read_key in ("Read1File", "Read2File"):
            path_value = entry.get(read_key)
            if not path_value:
                continue
            path = normalize_report_path(path_value, output_directory)
            rows.append(
                {
                    "sample": entry.get("RGSM"),
                    "lane": entry.get("Lane"),
                    "read": "R1" if read_key == "Read1File" else "R2",
                    "path": str(path),
                    "bytes": path.stat().st_size if path.exists() else None,
                    "read_pairs": read_count(entry.get("RGSM"), entry.get("Lane")),
                }
            )

    for path in sorted(output_directory.glob("Undetermined*.fastq.gz")):
        name_lane, name_read = _lane_and_read_from_fastq_name(path.name)
        rows.append(
            {
                "sample": "Undetermined",
                # Names without a lane token keep the historical default of "1".
                "lane": name_lane or "1",
                # Index-read files (I1/I2) are reported as such, not as R2.
                "read": name_read or ("R1" if "_R1_" in path.name else "R2"),
                "path": str(path),
                "bytes": path.stat().st_size,
                "read_pairs": read_count("Undetermined", name_lane),
            }
        )
    return rows
row.get("index"), + "index2": row.get("index2"), + "reads": parse_int(row.get("# Reads")), + "fraction_of_all_reads": parse_float(row.get("% of All Reads")), + } + for row in unknown_rows[:5] + ] + quality_by_sample = {} + for row in quality_rows: + sample = row.get("SampleID", "unknown") + quality_by_sample.setdefault(sample, {})[f"read_{row.get('ReadNumber')}"] = { + "yield": parse_int(row.get("Yield")), + "q30_fraction": parse_float(row.get("% Q30")), + "mean_quality_pf": parse_float(row.get("Mean Quality Score (PF)")), + } + issues = [] + assessment = "pass" + if ( + undetermined_fraction is not None + and undetermined_fraction >= DEMUX_QC_THRESHOLDS["fail_undetermined_fraction"] + ): + assessment = "fail" + issues.append( + f"undetermined reads are {undetermined_fraction:.2%}, above the fail threshold of {DEMUX_QC_THRESHOLDS['fail_undetermined_fraction']:.0%}" + ) + elif ( + undetermined_fraction is not None + and undetermined_fraction >= DEMUX_QC_THRESHOLDS["warn_undetermined_fraction"] + ): + assessment = "warning" + issues.append( + f"undetermined reads are {undetermined_fraction:.2%}, above the warning threshold of {DEMUX_QC_THRESHOLDS['warn_undetermined_fraction']:.0%}" + ) + top_unknown_fraction = max( + ( + item["fraction_of_all_reads"] + for item in top_unknown + if item["fraction_of_all_reads"] is not None + ), + default=None, + ) + if ( + top_unknown_fraction is not None + and top_unknown_fraction >= DEMUX_QC_THRESHOLDS["warn_top_unknown_fraction"] + ): + assessment = "warning" if assessment == "pass" else assessment + issues.append( + f"top unknown barcode accounts for {top_unknown_fraction:.2%} of all reads, at or above the warning threshold of {DEMUX_QC_THRESHOLDS['warn_top_unknown_fraction']:.0%}" + ) + return { + "output_directory": str(output_directory), + "report_directory": str(reports_dir), + "assigned_reads": assigned_reads, + "undetermined_reads": undetermined_reads, + "total_reads": total_reads, + "assigned_fraction": assigned_fraction, + 
"undetermined_fraction": undetermined_fraction, + "assessment": assessment, + "issues": issues, + "quality_by_sample": quality_by_sample, + "top_unknown_barcodes": top_unknown, + "fastq_outputs": summarize_fastq_outputs(output_directory, demux_rows), + } + + +def write_commands(run_dir: Path, args: argparse.Namespace, converter: str | None) -> None: + lines = ["#!/usr/bin/env bash", "set -euo pipefail"] + if converter: + lines.append(shlex.join(conversion_command(converter, args))) + else: + lines.append("# bcl-convert or bcl2fastq is required before execution.") + lines.append( + "# " + + shlex.join( + [ + "bcl-convert", + "--bcl-input-directory", + str(args.run_folder.expanduser().resolve()), + "--output-directory", + str(args.output_directory.expanduser().resolve()), + "--sample-sheet", + str(args.sample_sheet.expanduser().resolve()), + ] + ) + ) + write_text(run_dir / "commands.sh", "\n".join(lines) + "\n") + + +def execute_conversion( + run_dir: Path, args: argparse.Namespace, converter: str, runtime_preflight: dict[str, Any] +) -> dict[str, Any]: + output_directory = args.output_directory.expanduser().resolve() + output_directory.parent.mkdir(parents=True, exist_ok=True) + command = conversion_command(converter, args) + attempts = [] + result = run_cmd(command, run_dir, timeout=args.timeout_seconds) + attempts.append(result) + if ( + runtime_preflight.get("uses_docker_wrapper") + and not result.get("ok") + and docker_daemon_error(result) + ): + daemon_wait = wait_for_docker_daemon() + if daemon_wait.get("ok"): + retry = run_cmd(command, run_dir, timeout=args.timeout_seconds) + retry["retry_reason"] = "docker_daemon_ready_after_wait" + attempts.append(retry) + result = retry + payload = { + "ok": result.get("ok"), + "converter": converter, + "output_directory": str(output_directory), + "command": result.get("cmd"), + "attempts": attempts, + } + write_json(run_dir / "logs" / "bcl_conversion.json", payload) + write_text(run_dir / "logs" / "bcl_conversion.log", 
def _format_fraction(value: float | None) -> str:
    """Render a 0-1 fraction as a percentage, or ``n/a`` when it is unknown."""
    return "n/a" if value is None else f"{value:.2%}"


def write_summary(
    run_dir: Path,
    status: str,
    validation: dict[str, Any],
    converter: str | None,
    runtime_preflight: dict[str, Any],
    report_bundle: dict[str, Any] | None,
) -> None:
    """Write the Markdown run summary for a BCL-to-FASTQ run.

    The summary is written to ``<run_dir>/summary.md``. It covers the run
    status and inputs, the Docker wrapper state when one is in use, the
    demultiplexing QC section when ``report_bundle`` is provided, the key
    artifact list, and any blockers and warnings.

    Args:
        run_dir: Run directory that receives ``summary.md``.
        status: Run status string rendered verbatim.
        validation: Validation payload (``sample_sheet_summary``, ``runinfo``,
            ``run_folder``, ``errors`` and ``warnings`` are read).
        converter: Selected converter, or ``None`` when none is installed.
        runtime_preflight: Runtime preflight payload (``uses_docker_wrapper``,
            ``converter_path``, ``docker_daemon_ok`` and ``warnings`` are read).
        report_bundle: Parsed demux report bundle, or ``None`` when absent.
    """
    sheet = validation.get("sample_sheet_summary", {})
    runinfo = validation.get("runinfo", {})
    lines = [
        "# BCL To FASTQ Run Summary",
        "",
        f"Status: `{status}`",
        f"Converter: `{converter or 'not installed'}`",
        f"Run folder: `{validation.get('run_folder')}`",
        f"Samples parsed: `{sheet.get('sample_count', 0)}`",
        f"Read structure: `{runinfo.get('reads', [])}`",
    ]
    if runtime_preflight.get("uses_docker_wrapper"):
        lines.extend(
            [
                f"Docker-backed wrapper: `{runtime_preflight.get('converter_path')}`",
                f"Docker daemon ready: `{runtime_preflight.get('docker_daemon_ok')}`",
            ]
        )
    if report_bundle:
        # Fractions are None when the bundle reports zero reads; formatting
        # None with ".2%" would raise TypeError, so render "n/a" instead.
        assigned_pct = _format_fraction(report_bundle["assigned_fraction"])
        undetermined_pct = _format_fraction(report_bundle["undetermined_fraction"])
        lines.extend(
            [
                "",
                "## Demux QC",
                "",
                f"Assessment: `{report_bundle['assessment']}`",
                f"Assigned reads: `{report_bundle['assigned_reads']}` (`{assigned_pct}`)",
                f"Undetermined reads: `{report_bundle['undetermined_reads']}` (`{undetermined_pct}`)",
            ]
        )
        if report_bundle["issues"]:
            lines.extend(f"- {issue}" for issue in report_bundle["issues"])
        printable_unknown = [
            item
            for item in report_bundle["top_unknown_barcodes"][:3]
            if item["reads"] is not None and item["fraction_of_all_reads"] is not None
        ]
        # Emit the heading only when at least one entry will follow it.
        if printable_unknown:
            lines.extend(["", "Top unknown barcodes:"])
            lines.extend(
                f"- {item['index']}-{item['index2']}: {item['reads']} reads ({item['fraction_of_all_reads']:.2%} of all reads)"
                for item in printable_unknown
            )
        lines.extend(["", "FASTQ outputs:"])
        lines.extend(
            f"- `{Path(item['path']).name}`: {item['read_pairs']} read pairs, {item['bytes']} bytes"
            for item in report_bundle["fastq_outputs"]
        )
    lines.extend(
        [
            "",
            "## Key Artifacts",
            "",
            "- `validation/runinfo.json`",
            "- `validation/samplesheet_summary.json`",
            "- `qc/demux_qc_summary.json` when conversion succeeds",
            "- `logs/bcl_conversion.log` when conversion executes",
            "- `run_manifest.json` and `artifact_index.json`",
            "",
        ]
    )
    if validation.get("errors"):
        lines.extend(["## Blockers", ""])
        lines.extend(f"- {error}" for error in validation["errors"])
    combined_warnings = [*validation.get("warnings", []), *runtime_preflight.get("warnings", [])]
    if combined_warnings:
        lines.extend(["", "## Warnings", ""])
        lines.extend(f"- {warning}" for warning in combined_warnings)
    write_text(run_dir / "summary.md", "\n".join(lines) + "\n")
tool_status["ok"] = False + tool_status["runtime_errors"] = runtime_preflight["errors"] + + write_json( + run_dir / "config.json", + { + "run_folder": str(args.run_folder.expanduser().resolve()), + "sample_sheet": str(args.sample_sheet.expanduser().resolve()), + "output_directory": str(args.output_directory.expanduser().resolve()), + }, + ) + write_json( + run_dir / "validation" / "input_summary.json", + { + "run_folder": validation["run_folder"], + "sample_sheet": validation["sample_sheet"], + "output_directory": validation["output_directory"], + }, + ) + write_json(run_dir / "validation" / "validation_summary.json", validation) + write_json(run_dir / "validation" / "runtime_preflight.json", runtime_preflight) + write_json(run_dir / "validation" / "runinfo.json", validation["runinfo"]) + write_json(run_dir / "validation" / "runparameters.json", validation["runparameters"]) + write_json( + run_dir / "validation" / "samplesheet_summary.json", validation["sample_sheet_summary"] + ) + write_json(run_dir / "validation" / "tool_preflight.json", tool_status) + write_commands(run_dir, args, converter) + write_json( + run_dir / "versions" / "software_versions.json", + software_versions( + {"bcl-convert": ["bcl-convert", "--version"], "bcl2fastq": ["bcl2fastq", "--version"]} + ), + ) + + dry_run = { + "ok": validation["ok"] and (converter is not None or not args.execute), + "detail": "run folder and sample sheet validation completed", + } + write_json(run_dir / "logs" / "validation_dry_run.json", dry_run) + execution = None + report_bundle = None + status = "blocked" if not dry_run["ok"] else "validated" + if args.execute and validation["ok"] and converter: + execution = execute_conversion(run_dir, args, converter, runtime_preflight) + status = "completed" if execution.get("ok") else "failed" + if execution.get("ok"): + report_bundle = parse_report_bundle(args.output_directory.expanduser().resolve()) + if report_bundle: + write_json(run_dir / "qc" / "demux_qc_summary.json", 
report_bundle) + elif args.execute and not converter: + execution = {"ok": False, "reason": "bcl-convert or bcl2fastq is not installed"} + status = "blocked" + + write_standard_manifest( + run_dir, + run_id=args.run_id, + lane="bcl_to_fastq", + workflow="local_bcl_convert_or_bcl2fastq", + status=status, + execute_requested=args.execute, + validation=validation, + tool_preflight_result=tool_status, + dry_run=dry_run, + execution=execution, + inputs={"run_folder": validation["run_folder"], "sample_sheet": validation["sample_sheet"]}, + outputs={ + "output_directory": validation["output_directory"], + "conversion_logs": "logs/bcl_conversion.log", + }, + method={ + "converter": converter, + "converter_selection": "bcl-convert preferred, bcl2fastq fallback", + }, + review_bundle={"demux_qc": report_bundle} if report_bundle else {}, + ) + write_summary(run_dir, status, validation, converter, runtime_preflight, report_bundle) + extra_roots = ( + {"output_directory": args.output_directory.expanduser().resolve()} + if report_bundle + else None + ) + write_json( + run_dir / "artifact_index.json", build_artifact_index(run_dir, extra_roots=extra_roots) + ) + print(run_dir) + return 1 if status in {"blocked", "failed"} else 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/plugins/ngs-analysis/scripts/run_bulk_rnaseq_counts_qc.py b/plugins/ngs-analysis/scripts/run_bulk_rnaseq_counts_qc.py new file mode 100644 index 0000000..abbb0a5 --- /dev/null +++ b/plugins/ngs-analysis/scripts/run_bulk_rnaseq_counts_qc.py @@ -0,0 +1,1055 @@ +#!/usr/bin/env python3 +"""Run local bulk RNA-seq counts/QC with Salmon, FastQC, MultiQC, and matrices.""" + +from __future__ import annotations + +import argparse +import csv +import gzip +import json +import math +import re +import shlex +import shutil +import statistics +from pathlib import Path +from typing import Any +from urllib.parse import urlparse + +import ngs_resource_gate +from ngs_run_utils import ( + 
def load_yaml_module() -> Any | None:
    """Import PyYAML lazily and hand back the module object.

    Returns:
        The imported ``yaml`` module, or ``None`` when the ``yaml`` module
        itself cannot be found.

    Raises:
        ModuleNotFoundError: when the import fails because some module other
            than ``yaml`` is missing; that error is propagated unchanged.
    """
    try:
        import yaml as pyyaml  # type: ignore[import-not-found]
    except ModuleNotFoundError as missing:
        # Only a missing top-level ``yaml`` counts as "PyYAML not installed".
        if missing.name != "yaml":
            raise
        return None
    else:
        return pyyaml
def resolve_existing_path(raw: str, base: Path, roots: list[Path]) -> Path | None:
    """Resolve a sample-sheet FASTQ reference to an existing local path.

    Args:
        raw: Path or URI text taken from the sample sheet.
        base: Directory that relative ``raw`` paths are joined to.
        roots: Directories searched for a file with the same basename when
            ``raw`` is a remote URI or does not exist as written.

    Returns:
        The resolved path, or ``None`` when ``raw`` is empty or nothing matches.

    Raises:
        FileExistsError: when the basename is found at more than one distinct
            location under ``roots``.
    """
    if not raw:
        return None
    if raw.startswith(("http://", "https://", "s3://", "gs://")):
        basename = filename_from_uri(raw)
    else:
        candidate = Path(raw).expanduser()
        if not candidate.is_absolute():
            candidate = base / candidate
        if candidate.exists():
            return candidate.resolve()
        basename = candidate.name

    # An empty basename (e.g. a URI ending in "/") would make ``root / basename``
    # equal to the root directory itself, which always "exists".
    if not basename:
        return None

    matches: list[Path] = []
    for root in roots:
        direct = root / basename
        if not direct.exists():
            continue
        resolved = direct.resolve()
        # The same directory may be listed several times in ``roots``; identical
        # hits are one match, not an ambiguity.
        if resolved not in matches:
            matches.append(resolved)
    if len(matches) > 1:
        raise FileExistsError(f"ambiguous FASTQ basename {basename}: {matches}")
    return matches[0] if matches else None
def read_samplesheet(
    path: Path, fastq_roots: list[Path], quick: bool, max_records: int
) -> tuple[dict[str, Any], list[dict[str, str]], dict[str, Any]]:
    """Parse the sample sheet CSV, resolve FASTQ paths, and validate every row.

    Args:
        path: Sample sheet CSV; must have ``sample``, ``fastq_1`` and
            ``strandedness`` columns (``fastq_2`` is optional).
        fastq_roots: Directories searched by basename when a FASTQ path does
            not exist as written.
        quick: Forwarded to ``check_fastq``.
        max_records: Forwarded to ``check_fastq``.

    Returns:
        A 3-tuple of
        * a validation summary (``ok``, ``errors``, ``warnings``,
          ``fastq_checks``, ``sample_count``),
        * one normalized row dict per accepted sample-sheet row,
        * config fragments ``rnaseq_salmon_samples`` (grouped per sample) and
          ``fastq_files`` (one entry per FASTQ unit).

    Raises:
        ValueError: when a required column is missing from the header.
    """
    rows: list[dict[str, str]] = []
    grouped: dict[str, dict[str, Any]] = {}
    fastq_files: dict[str, dict[str, str]] = {}
    errors: list[str] = []
    warnings: list[str] = []
    fastq_checks: list[dict[str, Any]] = []
    required = {"sample", "fastq_1", "strandedness"}

    # utf-8-sig drops a leading byte-order mark (common in spreadsheet exports)
    # so the first header is read as "sample" rather than "\ufeffsample".
    with path.open(newline="", encoding="utf-8-sig") as handle:
        reader = csv.DictReader(handle)
        observed = set(reader.fieldnames or [])
        missing = sorted(required - observed)
        if missing:
            raise ValueError(f"sample sheet missing required columns: {', '.join(missing)}")
        # Data rows start on physical line 2 (line 1 is the header).
        for row_index, row in enumerate(reader, start=2):
            sample = (row.get("sample") or "").strip()
            fastq_1_raw = (row.get("fastq_1") or "").strip()
            fastq_2_raw = (row.get("fastq_2") or "").strip()
            strandedness = (row.get("strandedness") or "").strip().lower()
            if not sample or not fastq_1_raw:
                errors.append(f"row {row_index}: sample and fastq_1 are required")
                continue
            if not SAMPLE_RE.match(sample):
                errors.append(f"row {row_index}: sample contains unsupported characters: {sample}")
            if strandedness not in {"forward", "reverse", "unstranded", "auto", "unknown"}:
                errors.append(f"row {row_index}: unsupported strandedness value: {strandedness}")

            try:
                r1 = resolve_existing_path(fastq_1_raw, path.parent, fastq_roots)
                r2 = (
                    resolve_existing_path(fastq_2_raw, path.parent, fastq_roots)
                    if fastq_2_raw
                    else None
                )
            except FileExistsError as exc:
                # An ambiguous basename is a problem with this row only; report
                # it alongside the other validation errors instead of aborting.
                errors.append(f"row {row_index}: {exc}")
                continue
            if not r1:
                errors.append(f"row {row_index}: could not resolve fastq_1 path {fastq_1_raw}")
                continue
            if fastq_2_raw and not r2:
                errors.append(f"row {row_index}: could not resolve fastq_2 path {fastq_2_raw}")
                continue

            layout = "PE" if r2 else "SE"
            libtype, libtype_source = salmon_libtype(layout, strandedness)
            # The first row seen for a sample fixes its layout/strandedness;
            # later rows are compared against it below.
            grouped.setdefault(
                sample,
                {
                    "sample": sample,
                    "layout": layout,
                    "strandedness": strandedness,
                    "salmon_libtype": libtype,
                    "salmon_libtype_source": libtype_source,
                    "r1": [],
                    "r2": [],
                    "row_indices": [],
                },
            )
            entry = grouped[sample]
            if entry["layout"] != layout:
                errors.append(f"sample {sample} mixes PE and SE rows")
            if entry["strandedness"] != strandedness:
                errors.append(f"sample {sample} mixes strandedness values")
            if entry["salmon_libtype"] != libtype:
                errors.append(f"sample {sample} mixes Salmon library types")
            entry["r1"].append(str(r1))
            if r2:
                entry["r2"].append(str(r2))
            entry["row_indices"].append(row_index)

            for read_label, read_path in [("r1", r1), ("r2", r2)]:
                if read_path is None:
                    continue
                unit = f"{sample}__row{row_index}__{read_label}"
                fastq_files[unit] = {"sample": sample, "read": read_label, "path": str(read_path)}
                stats = check_fastq(read_path, quick=quick, max_records=max_records)
                stats["unit"] = unit
                fastq_checks.append(stats)
                if stats["errors"]:
                    errors.extend(f"{unit}: {error}" for error in stats["errors"])

            rows.append(
                {
                    "sample": sample,
                    "row_index": str(row_index),
                    "fastq_1": fastq_1_raw,
                    "fastq_2": fastq_2_raw,
                    "resolved_fastq_1": str(r1),
                    "resolved_fastq_2": str(r2) if r2 else "",
                    "layout": layout,
                    "strandedness": strandedness,
                    "salmon_libtype": libtype,
                }
            )

    if not grouped:
        errors.append("no valid samples found in sample sheet")
    return (
        {
            "ok": not errors,
            "errors": errors,
            "warnings": warnings,
            "fastq_checks": fastq_checks,
            "sample_count": len(grouped),
        },
        rows,
        {"rnaseq_salmon_samples": grouped, "fastq_files": fastq_files},
    )
def compute_qc_verdict(run_dir: Path, config: dict[str, Any]) -> dict[str, Any]:
    """Derive per-sample and overall QC statuses from the run's MultiQC tables.

    Reads the FastQC and Salmon ``multiqc_general_stats.txt`` tables under
    ``run_dir`` plus each sample's ``lib_format_counts.json``, and grades
    mapping rate, duplication, library-type agreement and strand bias against
    ``QC_THRESHOLDS``.

    Args:
        run_dir: Run directory holding the ``fastqc`` and ``rnaseq_salmon`` outputs.
        config: Run config; ``rnaseq_salmon_samples`` and ``fastq_files`` are read.

    Returns:
        A dict with ``overall_status`` (``pass``/``warn``/``fail``),
        ``thresholds``, ``outlier_samples``, ``missing_samples`` (configured
        samples with no row in the Salmon table), ``samples`` and
        ``de_readiness``. A mapping rate that is missing or not a finite number
        is reported as ``None`` and graded ``warn``.
    """
    fastq_stats = read_tsv_table(
        run_dir / "fastqc" / "multiqc" / "multiqc_data" / "multiqc_general_stats.txt"
    )
    salmon_stats = read_tsv_table(
        run_dir / "rnaseq_salmon" / "multiqc" / "multiqc_data" / "multiqc_general_stats.txt"
    )
    samples = config.get("rnaseq_salmon_samples", {})
    fastq_files = config.get("fastq_files", {})

    # FastQC rows are per FASTQ file; collect them under the owning sample.
    per_sample_duplication: dict[str, list[float]] = {}
    for row in fastq_stats:
        grouped = fastqc_sample_to_group(row.get("Sample", ""), fastq_files)
        if not grouped:
            continue
        try:
            duplication = float(row.get("fastqc-percent_duplicates", ""))
        except (TypeError, ValueError):
            continue
        if not math.isfinite(duplication):
            continue
        per_sample_duplication.setdefault(grouped, []).append(duplication)

    sample_rows: list[dict[str, Any]] = []
    mapping_rates: list[float] = []
    for row in salmon_stats:
        sample = row.get("Sample", "")
        if sample not in samples:
            continue
        try:
            parsed_rate = float(row.get("salmon-percent_mapped", ""))
        except (TypeError, ValueError):
            parsed_rate = math.nan
        # NaN compares false against every threshold, which would otherwise
        # grade an unknown mapping rate as "pass"; carry "unknown" as None.
        mapping_rate = parsed_rate if math.isfinite(parsed_rate) else None
        if mapping_rate is not None:
            mapping_rates.append(mapping_rate)
        expected_libtype = samples[sample].get("salmon_libtype", "A")
        lib_format_path = run_dir / "rnaseq_salmon" / "quant" / sample / "lib_format_counts.json"
        observed_format = None
        strand_bias = None
        if lib_format_path.exists():
            payload = json.loads(lib_format_path.read_text(encoding="utf-8"))
            observed_format = payload.get("expected_format")
            strand_bias = payload.get("strand_mapping_bias")
        duplication_values = per_sample_duplication.get(sample, [])
        duplication = statistics.mean(duplication_values) if duplication_values else None

        if mapping_rate is None:
            mapping_status = "warn"
        elif mapping_rate < QC_THRESHOLDS["mapping_rate_fail"]:
            mapping_status = "fail"
        elif mapping_rate < QC_THRESHOLDS["mapping_rate_warn"]:
            mapping_status = "warn"
        else:
            mapping_status = "pass"

        duplication_status = "pass"
        if duplication is not None:
            if duplication > QC_THRESHOLDS["duplication_fail"]:
                duplication_status = "fail"
            elif duplication > QC_THRESHOLDS["duplication_warn"]:
                duplication_status = "warn"

        # A configured library type of "A" is never compared with the
        # reported format.
        libtype_status = "pass"
        if expected_libtype != "A" and observed_format and observed_format != expected_libtype:
            libtype_status = "fail"

        strand_bias_status = "pass"
        if strand_bias is not None and strand_bias > QC_THRESHOLDS["strand_bias_warn"]:
            strand_bias_status = "warn"

        sample_rows.append(
            {
                "sample": sample,
                "mapping_rate_percent": mapping_rate,
                "duplication_percent": duplication,
                "configured_libtype": expected_libtype,
                "observed_libtype": observed_format,
                "strand_bias": strand_bias,
                "mapping_rate_status": mapping_status,
                "duplication_status": duplication_status,
                "libtype_status": libtype_status,
                "strand_bias_status": strand_bias_status,
            }
        )

    # Flag samples whose mapping rate sits well below the cohort median.
    median_mapping = statistics.median(mapping_rates) if mapping_rates else None
    outlier_samples: list[str] = []
    if median_mapping is not None:
        outlier_cutoff = median_mapping - QC_THRESHOLDS["outlier_delta_warn"]
        for row in sample_rows:
            rate = row["mapping_rate_percent"]
            if rate is None or rate > outlier_cutoff:
                continue
            outlier_samples.append(row["sample"])
            if row["mapping_rate_status"] == "pass":
                row["mapping_rate_status"] = "warn"

    overall = "pass"
    for row in sample_rows:
        statuses = [
            row["mapping_rate_status"],
            row["duplication_status"],
            row["libtype_status"],
            row["strand_bias_status"],
        ]
        if "fail" in statuses:
            overall = "fail"
            break
        if "warn" in statuses:
            overall = "warn"

    # Configured samples with no Salmon row were never graded; do not let the
    # verdict read as a clean pass in that case.
    graded = {row["sample"] for row in sample_rows}
    missing_samples = [sample for sample in samples if sample not in graded]
    if missing_samples and overall == "pass":
        overall = "warn"

    return {
        "overall_status": overall,
        "thresholds": QC_THRESHOLDS,
        "outlier_samples": outlier_samples,
        "missing_samples": missing_samples,
        "samples": sample_rows,
        "de_readiness": {
            "status": "caution",
            "reason": "Gene-level expected counts are derived from transcript-level Salmon quantification and are suitable for QC and exploratory review, but downstream DE should validate assumptions and metadata before model fitting.",
            "gene_level_counts": "rnaseq_salmon/matrices/gene_num_reads.tsv",
            "tx2gene_provenance": "rnaseq_salmon/matrices/tx2gene.tsv",
        },
    }
def generate_visualizations(
    run_dir: Path,
    validation: dict[str, Any],
    resource_plan: dict[str, Any] | None = None,
) -> dict[str, str]:
    """Assemble the review bundle for a counts/QC run.

    Builds one artifact entry per MultiQC report/helper and per result table,
    writes the marimo review notebook and the localhost launch hint, then
    writes the visualization index over all entries.

    Args:
        run_dir: Run directory; artifact paths are checked and reported
            relative to it.
        validation: Validation summary; its ``warnings`` (if any) are appended
            to the index notes.
        resource_plan: Optional resource plan passed through to the
            ``ngs_resource_gate`` helpers.

    Returns:
        Run-directory-relative locations of the visualization index, the
        visualization manifest, the review notebook and the launch hint.
    """
    entries: list[dict[str, Any]] = []
    notes = [
        "artifact_index.json now includes per-file SHA256 and modification timestamps for provenance.",
        "Use the MultiQC browser helpers when the raw MultiQC HTML stalls under file:// in the Codex browser.",
        "Serve the run directory over localhost and open the browser helpers for a stable in-app review path.",
    ]
    if validation.get("warnings"):
        notes.extend(str(warning) for warning in validation["warnings"])
    # Each spec: (artifact id, title, path recorded in the entry,
    # run-dir-relative file checked for existence, description, kind).
    multiqc_specs = [
        (
            "fastqc_multiqc_localhost",
            "FastQC MultiQC Localhost URL",
            reachable_localhost_url_for_path("fastqc/multiqc/multiqc_report.html"),
            "fastqc/multiqc/multiqc_report.html",
            "Live localhost URL for the full FastQC MultiQC report when the run directory is already being served.",
            "localhost_app",
        ),
        (
            "fastqc_multiqc_helper",
            "FastQC MultiQC Browser Helper",
            "fastqc/multiqc/multiqc_browser_helper.html",
            "fastqc/multiqc/multiqc_browser_helper.html",
            "Browser-safe review page for the FastQC MultiQC report.",
            "html_report",
        ),
        (
            "salmon_multiqc_localhost",
            "Salmon MultiQC Localhost URL",
            reachable_localhost_url_for_path("rnaseq_salmon/multiqc/multiqc_report.html"),
            "rnaseq_salmon/multiqc/multiqc_report.html",
            "Live localhost URL for the full Salmon MultiQC report when the run directory is already being served.",
            "localhost_app",
        ),
        (
            "salmon_multiqc_helper",
            "Salmon MultiQC Browser Helper",
            "rnaseq_salmon/multiqc/multiqc_browser_helper.html",
            "rnaseq_salmon/multiqc/multiqc_browser_helper.html",
            "Browser-safe review page for the Salmon MultiQC report.",
            "html_report",
        ),
    ]
    for artifact_id, title, entry_path, source_rel_path, description, kind in multiqc_specs:
        source_path = run_dir / source_rel_path
        # Localhost entries count as available when the URL helper returned a
        # truthy value; file-backed entries when the file exists on disk.
        available = bool(entry_path) if kind == "localhost_app" else source_path.exists()
        entries.append(
            artifact_entry(
                artifact_id=artifact_id,
                title=title,
                path=entry_path if available else None,
                kind=kind,
                status="created" if available else "not_available",
                description=description,
            )
        )
    # Result tables: (artifact id, title, run-dir-relative path, description).
    for artifact_id, title, rel_path, description in [
        (
            "sample_table",
            "Resolved Sample Table",
            "rnaseq_salmon/matrices/samples.tsv",
            "Grouped samples with layout, strandedness, and row provenance.",
        ),
        (
            "tpm_matrix",
            "TPM Matrix",
            "rnaseq_salmon/matrices/tpm.tsv",
            "Transcript-by-sample TPM matrix from Salmon quantification.",
        ),
        (
            "num_reads_matrix",
            "Num Reads Matrix",
            "rnaseq_salmon/matrices/num_reads.tsv",
            "Transcript-by-sample expected fragment counts.",
        ),
        (
            "effective_length_matrix",
            "Effective Length Matrix",
            "rnaseq_salmon/matrices/effective_length.tsv",
            "Transcript-by-sample effective lengths.",
        ),
        (
            "gene_num_reads_matrix",
            "Gene Num Reads Matrix",
            "rnaseq_salmon/matrices/gene_num_reads.tsv",
            "Gene-by-sample expected counts aggregated from Salmon transcripts.",
        ),
        (
            "gene_tpm_matrix",
            "Gene TPM Matrix",
            "rnaseq_salmon/matrices/gene_tpm.tsv",
            "Gene-by-sample TPM values aggregated from Salmon transcripts.",
        ),
        (
            "tx2gene_map",
            "Transcript-to-Gene Map",
            "rnaseq_salmon/matrices/tx2gene.tsv",
            "tx2gene provenance derived from the provided GTF.",
        ),
        (
            "qc_verdict",
            "QC Verdict",
            "qc/qc_verdict.json",
            "Compact pass/warn/fail summary over mapping rate, duplication, library-type agreement, and outliers.",
        ),
        (
            "normalized_samplesheet",
            "Normalized Sample Sheet",
            "validation/samplesheet.normalized.csv",
            "Resolved FASTQ paths and grouped sample layout.",
        ),
    ]:
        path = run_dir / rel_path
        # Files that were not produced are still listed, as "not_available".
        entries.append(
            artifact_entry(
                artifact_id=artifact_id,
                title=title,
                path=rel_path if path.exists() else None,
                kind="table",
                status="created" if path.exists() else "not_available",
                description=description,
            )
        )
    notebook_path = write_marimo_review_notebook(
        run_dir / "notebooks" / "bulk_rnaseq_counts_qc_review.marimo.py",
        title="Bulk RNA-seq Counts/QC Review",
        run_dir=run_dir,
        image_items=[],
        table_items=[
            ("Resolved Sample Table", "rnaseq_salmon/matrices/samples.tsv"),
            ("TPM Matrix", "rnaseq_salmon/matrices/tpm.tsv"),
            ("Num Reads Matrix", "rnaseq_salmon/matrices/num_reads.tsv"),
            ("Effective Length Matrix", "rnaseq_salmon/matrices/effective_length.tsv"),
        ],
        object_items=[
            ("FastQC MultiQC Browser Helper", "fastqc/multiqc/multiqc_browser_helper.html"),
            ("Salmon MultiQC Browser Helper", "rnaseq_salmon/multiqc/multiqc_browser_helper.html"),
            ("QC Verdict", "qc/qc_verdict.json"),
            ("Localhost Launch Hint", "visualizations/localhost_launch_hint.txt"),
        ],
    )
    entries.append(
        artifact_entry(
            artifact_id="counts_qc_review_notebook",
            title="Counts/QC Review Notebook",
            path=notebook_path.relative_to(run_dir),
            kind="notebook",
            status="created",
            description="Marimo review notebook over the key counts/QC tables and report helpers.",
        )
    )
    launch_hint = write_localhost_launch_hint(
        run_dir,
        report_entries=[
            ("FastQC MultiQC", "fastqc/multiqc/multiqc_report.html"),
            ("Salmon MultiQC", "rnaseq_salmon/multiqc/multiqc_report.html"),
        ],
    )
    entries.append(
        artifact_entry(
            artifact_id="localhost_launch_hint",
            title="Localhost Launch Hint",
            path=launch_hint.relative_to(run_dir),
            kind="text",
            status="created",
            description="Command and URLs for serving the run directory over localhost and opening browser-safe report helpers.",
        )
    )
    entries.extend(ngs_resource_gate.resource_visual_entries(resource_plan))
    index = write_visualization_index(
        run_dir,
        title="Bulk RNA-seq Counts/QC Review Bundle",
        description="Human-readable review surface for the counts/QC lane, with links to the key reports and matrices.",
        entries=entries,
        notes=[*notes, *ngs_resource_gate.resource_messages(resource_plan)],
    )
    # NOTE(review): the manifest path is a fixed string here, presumably written
    # by write_visualization_index — confirm against that helper.
    return {
        "visualization_index": str(index.relative_to(run_dir)),
        "visualization_manifest": "visualizations/visualization_manifest.json",
        "review_notebook": str(notebook_path.relative_to(run_dir)),
        "localhost_launch_hint": str(launch_hint.relative_to(run_dir)),
    }
def main() -> int:
    """Prepare, optionally execute, and document one bulk RNA-seq counts/QC run.

    Creates a fresh run directory, validates the sample sheet and references,
    writes config/workflow/command files, runs the Snakemake dry run (unless
    ``--no-dry-run``) and the real run (only with ``--execute``), then writes
    the QC verdict, review bundle, summary, manifest and artifact index.

    Returns:
        ``1`` when the final status is ``blocked`` or ``failed`` (including the
        early exit when PyYAML is missing), otherwise ``0``.

    Raises:
        FileExistsError: when the run directory already exists.
        FileNotFoundError: when the sample sheet does not exist.
    """
    args = parse_args()
    run_dir = (args.outdir or (DEFAULT_RUN_ROOT / args.run_id)).expanduser().resolve()
    if run_dir.exists():
        raise FileExistsError(f"run directory already exists: {run_dir}")
    run_dir.mkdir(parents=True)
    (run_dir / "logs").mkdir(parents=True, exist_ok=True)
    yaml_module = load_yaml_module()
    if yaml_module is None:
        # config.yaml cannot be written without PyYAML: record a blocked run
        # with its own manifest and stop before any validation.
        yaml_status = yaml_dependency_status()
        write_json(run_dir / "validation" / "tool_preflight.json", yaml_status)
        write_text(
            run_dir / "summary.md",
            "Python dependency preflight failed: PyYAML is required to write config.yaml.\n",
        )
        write_standard_manifest(
            run_dir,
            run_id=args.run_id,
            lane="bulk_rnaseq_counts_qc",
            workflow="local_light_snakemake_salmon",
            status="blocked",
            execute_requested=args.execute,
            validation={"ok": True, "errors": [], "warnings": []},
            tool_preflight_result=yaml_status,
            dry_run={"ok": False, "detail": "Python dependency preflight failed"},
            execution={"ok": False, "detail": "execution not attempted"},
            inputs={},
            outputs={
                "summary": "summary.md",
                "tool_preflight": "validation/tool_preflight.json",
            },
            method={"quantifier": "salmon"},
        )
        write_json(run_dir / "artifact_index.json", build_artifact_index(run_dir))
        print(run_dir)
        return 1

    sample_sheet = args.sample_sheet.expanduser().resolve()
    fastq_roots = [root.expanduser().resolve() for root in args.fastq_root]
    fastq_roots.extend([sample_sheet.parent, Path.cwd()])
    # The sample-sheet directory, the working directory and user-supplied roots
    # can coincide; keep each directory once, in first-seen order.
    fastq_roots = list(dict.fromkeys(fastq_roots))
    if not sample_sheet.exists():
        raise FileNotFoundError(f"sample sheet does not exist: {sample_sheet}")

    validation, normalized_rows, config_parts = read_samplesheet(
        sample_sheet, fastq_roots, args.quick_validation, args.fastq_record_check
    )
    reference_validation = validate_references(args)
    combined_validation = {
        "ok": validation["ok"] and reference_validation["ok"],
        "sample_sheet": str(sample_sheet),
        "errors": validation.get("errors", []) + reference_validation.get("errors", []),
        "warnings": validation.get("warnings", []) + reference_validation.get("warnings", []),
        "sample_count": validation.get("sample_count", 0),
        "fastq_checks": validation.get("fastq_checks", []),
        "reference_validation": reference_validation,
    }
    resource_plan = ngs_resource_gate.write_pipeline_resource_plan(
        run_dir=run_dir,
        pipeline="bulk_rnaseq_counts_qc",
        genome_build=args.genome_build,
        bundle_roots=args.bundle_root,
        include_optional=args.include_optional_resources,
        include_checksums=args.resource_checksums,
        skip=args.skip_resource_plan,
        required=args.require_resource_plan,
    )
    combined_validation = ngs_resource_gate.merge_resource_status(
        combined_validation,
        resource_plan,
        required=args.require_resource_plan,
    )
    tool_status = tool_preflight(["snakemake", "fastqc", "multiqc", "salmon"], optional=[])

    config = {
        "threads": args.threads,
        "references": reference_validation["references"],
        "salmon": {"kmer": args.kmer},
        **config_parts,
    }
    write_json(run_dir / "config.json", config)
    write_text(run_dir / "config.yaml", yaml_module.safe_dump(config, sort_keys=True))
    write_normalized_samplesheet(
        run_dir / "validation" / "samplesheet.normalized.csv", normalized_rows
    )
    write_json(
        run_dir / "validation" / "input_summary.json",
        {"sample_sheet": str(sample_sheet), **config_parts},
    )
    write_json(run_dir / "validation" / "validation_summary.json", combined_validation)
    write_json(run_dir / "validation" / "tool_preflight.json", tool_status)
    write_workflow(run_dir)
    write_commands(run_dir, args.threads)
    write_json(
        run_dir / "versions" / "software_versions.json",
        software_versions(
            {
                "snakemake": ["snakemake", "--version"],
                "fastqc": ["fastqc", "--version"],
                "multiqc": ["multiqc", "--version"],
                "salmon": ["salmon", "--no-version-check", "--version"],
            }
        ),
    )

    # Status progression: blocked | prepared -> failed (dry run or execution)
    # | validated (no --execute) | completed.
    dry_run: dict[str, Any] | None = None
    execution: dict[str, Any] | None = None
    blocked = not combined_validation["ok"] or not tool_status["ok"]
    status = "blocked" if blocked else "prepared"
    if not blocked and not args.no_dry_run:
        dry_run = run_cmd(snakemake_cmd(run_dir, args.threads, dry_run=True), run_dir, timeout=600)
        write_json(run_dir / "logs" / "snakemake_dry_run.json", dry_run)
        write_text(run_dir / "logs" / "snakemake_dry_run.log", dry_run.get("stdout_tail", ""))
        if not dry_run.get("ok"):
            # A failed dry run also prevents the real execution below.
            blocked = True
            status = "failed"
    elif not blocked:
        write_json(
            run_dir / "logs" / "snakemake_dry_run_skipped.json",
            {"ok": True, "reason": "--no-dry-run was requested"},
        )
    if args.execute and not blocked:
        execution = run_cmd(
            snakemake_cmd(run_dir, args.threads, dry_run=False), run_dir, timeout=86400
        )
        write_json(run_dir / "logs" / "snakemake_execute.json", execution)
        write_text(run_dir / "logs" / "snakemake_execute.log", execution.get("stdout_tail", ""))
        status = "completed" if execution.get("ok") else "failed"
    elif not args.execute and status == "prepared":
        status = "validated"

    write_multiqc_browser_helper(
        run_dir,
        report_path="fastqc/multiqc/multiqc_report.html",
        title="FastQC MultiQC Browser Helper",
    )
    write_multiqc_browser_helper(
        run_dir,
        report_path="rnaseq_salmon/multiqc/multiqc_report.html",
        title="Salmon MultiQC Browser Helper",
    )
    # The QC verdict is only computed for a completed execution; every other
    # outcome gets an explicit "not_available" placeholder.
    qc_verdict = (
        compute_qc_verdict(run_dir, config)
        if args.execute and status == "completed"
        else {
            "overall_status": "not_available",
            "thresholds": QC_THRESHOLDS,
            "outlier_samples": [],
            "samples": [],
            "de_readiness": {
                "status": "not_available",
                "reason": "Execution did not complete, so QC verdict and DE-readiness assessment were not computed.",
            },
        }
    )
    write_json(run_dir / "qc" / "qc_verdict.json", qc_verdict)
    review_bundle = generate_visualizations(run_dir, combined_validation, resource_plan)
    review_bundle.update(
        {
            "suggested_localhost_port": 8765,
            "localhost_report_examples": {
                "fastqc_report": reachable_localhost_url_for_path(
                    "fastqc/multiqc/multiqc_report.html"
                ),
                "salmon_report": reachable_localhost_url_for_path(
                    "rnaseq_salmon/multiqc/multiqc_report.html"
                ),
            },
        }
    )

    outputs = {
        "quant_glob": "rnaseq_salmon/quant/*/quant.sf",
        "tpm_matrix": "rnaseq_salmon/matrices/tpm.tsv",
        "num_reads_matrix": "rnaseq_salmon/matrices/num_reads.tsv",
        "effective_length_matrix": "rnaseq_salmon/matrices/effective_length.tsv",
        "gene_num_reads_matrix": "rnaseq_salmon/matrices/gene_num_reads.tsv",
        "gene_tpm_matrix": "rnaseq_salmon/matrices/gene_tpm.tsv",
        "tx2gene_map": "rnaseq_salmon/matrices/tx2gene.tsv",
        "sample_table": "rnaseq_salmon/matrices/samples.tsv",
        "qc_verdict": "qc/qc_verdict.json",
        "fastq_multiqc_localhost": reachable_localhost_url_for_path(
            "fastqc/multiqc/multiqc_report.html"
        )
        if (run_dir / "fastqc/multiqc/multiqc_report.html").exists()
        else None,
        "fastq_multiqc_helper": "fastqc/multiqc/multiqc_browser_helper.html",
        "salmon_multiqc_localhost": reachable_localhost_url_for_path(
            "rnaseq_salmon/multiqc/multiqc_report.html"
        )
        if (run_dir / "rnaseq_salmon/multiqc/multiqc_report.html").exists()
        else None,
        "salmon_multiqc_helper": "rnaseq_salmon/multiqc/multiqc_browser_helper.html",
        "visualization_index": review_bundle["visualization_index"],
        "visualization_manifest": review_bundle["visualization_manifest"],
        "review_notebook": review_bundle["review_notebook"],
        "localhost_launch_hint": review_bundle["localhost_launch_hint"],
    }
    resource_outputs = ngs_resource_gate.resource_output_paths(resource_plan)
    outputs.update(resource_outputs)
    write_summary(run_dir, status, combined_validation, reference_validation, resource_plan)
    write_standard_manifest(
        run_dir,
        run_id=args.run_id,
        lane="bulk_rnaseq_counts_qc",
        workflow="local_light_snakemake_salmon",
        status=status,
        execute_requested=args.execute,
        validation=combined_validation,
        tool_preflight_result=tool_status,
        dry_run=dry_run,
        execution=execution,
        inputs={
            "sample_sheet": str(sample_sheet),
            "references": reference_validation["references"],
            **(
                {"resource_plan": resource_outputs.get("resource_plan")} if resource_outputs else {}
            ),
        },
        outputs=outputs,
        method={
            "quantifier": "salmon",
            "alignment": "transcriptome_pseudoalignment",
            "gene_level_counts": "salmon_tx_aggregation_with_tx2gene",
            "strandedness_policy": "respect_input_or_infer_when_unknown",
            "resource_plan": resource_plan,
        },
        audit={
            "qc_verdict_path": "qc/qc_verdict.json",
            **({"resource_readiness": resource_plan} if resource_plan else {}),
        },
        review_bundle=review_bundle,
    )
    # Written once, after every other artifact exists (same as the early-exit
    # path above).
    write_json(run_dir / "artifact_index.json", build_artifact_index(run_dir))

    print(run_dir)
    if status in {"blocked", "failed"}:
        return 1
    return 0
str) -> bool: + if not command_path("Rscript"): + return False + result = run_cmd( + ["Rscript", "-e", f"cat(requireNamespace('{package}', quietly=TRUE))"], + Path.cwd(), + timeout=60, + ) + return result.get("ok") and "TRUE" in str(result.get("stdout_tail", "")) + + +def parse_count_matrix(path: Path) -> tuple[list[str], list[dict[str, str]], dict[str, Any]]: + with path.open(newline="", encoding="utf-8") as handle: + reader = csv.DictReader(handle, delimiter="\t") + columns = reader.fieldnames or [] + if "gene_id" not in columns: + raise ValueError("count matrix must include a gene_id column") + sample_cols = [column for column in columns if column not in {"gene_id", "gene_name"}] + rows = list(reader) + errors = [] + integer_like = True + finite_values = True + min_value = math.inf + max_value = -math.inf + for row_index, row in enumerate(rows, start=2): + for sample in sample_cols: + try: + value = float(row[sample]) + except ValueError: + errors.append(f"row {row_index} sample {sample}: non-numeric expression value") + continue + if not math.isfinite(value): + finite_values = False + errors.append(f"row {row_index} sample {sample}: non-finite expression value") + if abs(value - round(value)) > 1e-8: + integer_like = False + min_value = min(min_value, value) + max_value = max(max_value, value) + return ( + sample_cols, + rows, + { + "errors": errors, + "integer_like": integer_like, + "finite_values": finite_values, + "gene_count": len(rows), + "min_value": None if min_value == math.inf else min_value, + "max_value": None if max_value == -math.inf else max_value, + }, + ) + + +def infer_input_mode( + requested: str, matrix_status: dict[str, Any], warnings: list[str], errors: list[str] +) -> str: + integer_like = bool(matrix_status.get("integer_like")) + min_value = matrix_status.get("min_value") + if requested != "auto": + if requested == "raw_counts" and not integer_like: + errors.append("raw_counts input mode requires an integer-like matrix") + return 
requested + if integer_like: + return "raw_counts" + if min_value is not None and min_value < 0: + warnings.append( + "Auto-detected input_mode=log_expression because the matrix contains negative values. " + "Override with --input-mode if the matrix scale is known explicitly." + ) + return "log_expression" + warnings.append( + "Auto-detected input_mode=normalized_expression for a non-integer, non-negative matrix. " + "Override with --input-mode if the input is already log-transformed." + ) + return "normalized_expression" + + +def build_fit_formula(metadata_rows: list[dict[str, str]], selected_method: str) -> str: + batch_values = [row.get("batch", "") for row in metadata_rows if row.get("batch", "")] + has_batch = len(set(batch_values)) > 1 + if selected_method in {"limma_log2", "edgeR"}: + return "~ 0 + condition + batch" if has_batch else "~ 0 + condition" + return "~ batch + condition" if has_batch else "~ condition" + + +def build_outputs_map(input_mode: str) -> dict[str, str]: + outputs = { + "contrast_status": "manifest/contrast_status.tsv", + "qc_plots": "qc/*.png", + "de_tables": "results/*.tsv", + "contrast_plots": "plots/*.png", + "design_diagnostics": "qc/design_diagnostics.tsv", + "outlier_metrics": "qc/sample_outlier_metrics.tsv", + "statistical_summary": "qc/statistical_summary.tsv", + "statistical_warnings": "qc/statistical_warnings.tsv", + } + if input_mode == "raw_counts": + outputs.update( + { + "raw_counts": "results/raw_counts.tsv", + "normalized_expression": "results/normalized_expression_matrix.tsv", + "log_expression": "results/log2_expression_matrix.tsv", + } + ) + elif input_mode == "normalized_expression": + outputs.update( + { + "input_normalized_expression": "results/input_normalized_expression_matrix.tsv", + "log_expression": "results/log2_expression_matrix.tsv", + } + ) + else: + outputs.update( + { + "input_log_expression": "results/input_log_expression_matrix.tsv", + "modeling_expression": "results/modeling_expression_matrix.tsv", + 
} + ) + return outputs + + +def validate_inputs(args: argparse.Namespace) -> tuple[dict[str, Any], dict[str, Any]]: + count_matrix = args.count_matrix.expanduser().resolve() + sample_metadata = args.sample_metadata.expanduser().resolve() + contrasts = args.contrasts.expanduser().resolve() + errors = [] + warnings = [] + for label, path in [ + ("count_matrix", count_matrix), + ("sample_metadata", sample_metadata), + ("contrasts", contrasts), + ]: + if not path.exists(): + errors.append(f"{label} does not exist: {path}") + if errors: + return {"ok": False, "errors": errors, "warnings": warnings}, {} + + sample_cols, _, matrix_status = parse_count_matrix(count_matrix) + errors.extend(matrix_status["errors"]) + metadata_rows = read_tsv(sample_metadata) + contrast_rows = read_tsv(contrasts) + + metadata_samples = [row.get("sample_id", "") for row in metadata_rows] + if len(metadata_samples) != len(set(metadata_samples)): + errors.append("sample metadata contains duplicate sample_id values") + if set(sample_cols) != set(metadata_samples): + errors.append("count matrix sample columns and metadata sample_id values do not match") + if "condition" not in (metadata_rows[0].keys() if metadata_rows else []): + errors.append("sample metadata must include a condition column") + + condition_counts: dict[str, int] = {} + for row in metadata_rows: + condition = row.get("condition", "") + if not condition: + errors.append(f"sample {row.get('sample_id', '')} has no condition") + condition_counts[condition] = condition_counts.get(condition, 0) + 1 + + contrast_status = [] + required_contrast_cols = {"contrast", "numerator_condition", "denominator_condition"} + if contrast_rows and not required_contrast_cols.issubset(contrast_rows[0].keys()): + errors.append( + "contrasts file must include contrast, numerator_condition, and denominator_condition columns" + ) + for row in contrast_rows: + numerator = row.get("numerator_condition", "") + denominator = row.get("denominator_condition", "") 
+ numerator_n = condition_counts.get(numerator, 0) + denominator_n = condition_counts.get(denominator, 0) + status = "valid" if numerator_n >= 2 and denominator_n >= 2 else "insufficient_replicates" + if status == "valid" and numerator_n == 2 and denominator_n == 2: + warnings.append( + f"Contrast {row.get('contrast', '')} is minimally powered (2 vs 2 replicates); treat p-values and effect sizes as exploratory and review QC plots carefully." + ) + contrast_status.append( + { + "contrast": row.get("contrast", ""), + "numerator_condition": numerator, + "denominator_condition": denominator, + "numerator_replicates": numerator_n, + "denominator_replicates": denominator_n, + "status": status, + "expected_status": row.get("expected_status", ""), + "notes": row.get("notes", ""), + } + ) + if not contrast_status: + errors.append("no contrasts were provided") + + package_status = { + package: r_package_available(package) for package in ["DESeq2", "edgeR", "limma"] + } + input_mode = infer_input_mode(args.input_mode, matrix_status, warnings, errors) + selected_method = select_method(args.method, input_mode, package_status, errors) + if selected_method == "limma_log2" and not package_status["limma"]: + errors.append("limma is required for limma_log2 execution but is not installed") + fit_formula = build_fit_formula(metadata_rows, selected_method) + + method_decision = { + "requested_method": args.method, + "selected_method": selected_method, + "requested_input_mode": args.input_mode, + "input_mode": input_mode, + "matrix_integer_like": matrix_status["integer_like"], + "r_packages": package_status, + "condition_counts": condition_counts, + "contrast_status": contrast_status, + "fit_formula": fit_formula, + } + validation = { + "ok": not errors, + "errors": errors, + "warnings": warnings, + "count_matrix": str(count_matrix), + "sample_metadata": str(sample_metadata), + "contrasts": str(contrasts), + "sample_count": len(sample_cols), + "gene_count": matrix_status["gene_count"], 
+ "matrix_integer_like": matrix_status["integer_like"], + "contrast_status": contrast_status, + "method_decision": method_decision, + } + return validation, method_decision + + +def select_method( + requested: str, input_mode: str, packages: dict[str, bool], errors: list[str] +) -> str: + if requested != "auto": + if requested in {"DESeq2", "edgeR"} and input_mode != "raw_counts": + errors.append( + f"{requested} requires raw integer-like counts; input_mode={input_mode} is not compatible" + ) + if requested != "limma_log2" and not packages.get(requested, False): + errors.append(f"{requested} was requested but the R package is not installed") + return requested + if input_mode == "raw_counts" and packages.get("DESeq2", False): + return "DESeq2" + if input_mode == "raw_counts" and packages.get("edgeR", False): + return "edgeR" + return "limma_log2" + + +def write_workflow(run_dir: Path) -> None: + scripts_dir = run_dir / "workflow" / "scripts" + scripts_dir.mkdir(parents=True, exist_ok=True) + shutil.copy2(WORKFLOW_ROOT / "run_bulk_de.R", scripts_dir / "run_bulk_de.R") + + +def r_command(args: argparse.Namespace, run_dir: Path, method: dict[str, Any]) -> list[str]: + return [ + "Rscript", + "workflow/scripts/run_bulk_de.R", + str(args.count_matrix.expanduser().resolve()), + str(args.sample_metadata.expanduser().resolve()), + str(args.contrasts.expanduser().resolve()), + str(method.get("selected_method", "limma_log2")), + str(method.get("input_mode", "normalized_expression")), + str(method.get("fit_formula", "~ 0 + condition")), + str(run_dir), + ] + + +def write_commands(run_dir: Path, cmd: list[str]) -> None: + write_text( + run_dir / "commands.sh", "#!/usr/bin/env bash\nset -euo pipefail\n" + shlex.join(cmd) + "\n" + ) + + +def write_summary( + run_dir: Path, + status: str, + validation: dict[str, Any], + method: dict[str, Any], + review_app_info: dict[str, Any] | None = None, +) -> None: + outputs = build_outputs_map(str(method.get("input_mode", 
"normalized_expression"))) + lines = [ + "# Bulk RNA-seq Differential Expression Run Summary", + "", + f"Status: `{status}`", + f"Selected method: `{method.get('selected_method')}`", + f"Input mode: `{method.get('input_mode')}`", + f"Matrix integer-like: `{method.get('matrix_integer_like')}`", + f"Fit formula: `{method.get('fit_formula')}`", + f"Review app URL: `{review_app_info.get('url') if review_app_info and review_app_info.get('ok') else 'not started'}`", + "", + "## Contrast Status", + "", + ] + for contrast in validation.get("contrast_status", []): + lines.append( + f"- `{contrast['contrast']}`: {contrast['status']} " + f"({contrast['numerator_condition']} n={contrast['numerator_replicates']} vs " + f"{contrast['denominator_condition']} n={contrast['denominator_replicates']})" + ) + lines.extend( + [ + "", + "## Key Artifacts", + "", + "- `manifest/contrast_status.tsv`", + ] + ) + for artifact in [ + outputs.get("raw_counts"), + outputs.get("input_normalized_expression"), + outputs.get("normalized_expression"), + outputs.get("input_log_expression"), + outputs.get("modeling_expression"), + outputs.get("log_expression"), + ]: + if artifact: + lines.append(f"- `{artifact}`") + lines.extend( + [ + "- `qc/pca.png`", + "- `qc/sample_distance_heatmap.png`", + "- `qc/design_diagnostics.tsv`", + "- `qc/sample_outlier_metrics.tsv`", + "- `qc/statistical_warnings.tsv`", + "- `plots/*_volcano.png` and `plots/*_ma.png` for executed limma contrasts", + "- `notebooks/marimo_server.json`", + "- `visualizations/index.html`", + "- `artifact_index.json`", + "", + ] + ) + if validation.get("warnings"): + lines.extend(["## Warnings", ""]) + lines.extend(f"- {warning}" for warning in validation["warnings"]) + lines.append("") + if validation.get("errors"): + lines.extend(["## Blockers", ""]) + lines.extend(f"- {error}" for error in validation["errors"]) + lines.append("") + write_text(run_dir / "summary.md", "\n".join(lines)) + + +def generate_visualizations( + run_dir: Path, + 
validation: dict[str, Any], + review_app_info: dict[str, Any] | None = None, +) -> dict[str, str]: + entries: list[dict[str, Any]] = [] + notes = [ + "artifact_index.json includes per-file SHA256 and modification timestamps for provenance.", + "Use qc/statistical_warnings.tsv and manifest/contrast_status.tsv together when deciding whether a contrast is interpretable.", + ] + if validation.get("warnings"): + notes.extend(str(warning) for warning in validation["warnings"]) + + for artifact_id, title, rel_path, kind, description in [ + ( + "pca_plot", + "PCA Plot", + "qc/pca.png", + "plot", + "PCA on the modeling matrix with variance explained and condition colors.", + ), + ( + "sample_distance_heatmap", + "Sample Distance Heatmap", + "qc/sample_distance_heatmap.png", + "plot", + "Clustered sample-to-sample Euclidean distances.", + ), + ( + "mean_variance_trend", + "Mean-Variance Trend", + "qc/mean_variance_trend.png", + "plot", + "Method-specific mean-variance diagnostic.", + ), + ( + "library_sizes", + "Library Sizes", + "qc/library_sizes.png", + "plot", + "Per-sample total expression values from the supplied matrix.", + ), + ( + "contrast_status", + "Contrast Status", + "manifest/contrast_status.tsv", + "table", + "Executed and blocked contrasts with replicate counts.", + ), + ( + "design_diagnostics", + "Design Diagnostics", + "qc/design_diagnostics.tsv", + "table", + "Design rank and model structure checks.", + ), + ( + "statistical_warnings", + "Statistical Warnings", + "qc/statistical_warnings.tsv", + "table", + "Human-readable statistical UX warnings emitted by the runner.", + ), + ( + "sample_outliers", + "Sample Outlier Metrics", + "qc/sample_outlier_metrics.tsv", + "table", + "Mean distance and z-score outlier screen.", + ), + ]: + path = run_dir / rel_path + entries.append( + artifact_entry( + artifact_id=artifact_id, + title=title, + path=rel_path if path.exists() else None, + kind=kind, + status="created" if path.exists() else "not_available", + 
description=description, + ) + ) + for result_path in sorted((run_dir / "results").glob("*.tsv")): + rel_path = str(result_path.relative_to(run_dir)) + entries.append( + artifact_entry( + artifact_id=f"table_{result_path.stem}", + title=f"Result Table: {result_path.stem}", + path=rel_path, + kind="table", + status="created", + description="Differential expression table or blocked-contrast stub emitted by the selected method.", + ) + ) + for plot_path in sorted((run_dir / "plots").glob("*.png")): + rel_path = str(plot_path.relative_to(run_dir)) + entries.append( + artifact_entry( + artifact_id=f"plot_{plot_path.stem}", + title=f"Contrast Plot: {plot_path.stem}", + path=rel_path, + kind="plot", + status="created", + description="Per-contrast volcano or MA plot for an executed comparison.", + ) + ) + notebook_path = write_marimo_review_notebook( + run_dir / "notebooks" / "bulk_rnaseq_de_review.marimo.py", + title="Bulk RNA-seq Differential Expression Review", + run_dir=run_dir, + image_items=[ + ("PCA", "qc/pca.png"), + ("Sample Distance Heatmap", "qc/sample_distance_heatmap.png"), + ("Mean-Variance Trend", "qc/mean_variance_trend.png"), + ("Library Sizes", "qc/library_sizes.png"), + ] + + [ + (f"Contrast Plot: {path.stem}", str(path.relative_to(run_dir))) + for path in sorted((run_dir / "plots").glob("*.png")) + ], + table_items=[ + ("Contrast Status", "manifest/contrast_status.tsv"), + ("Design Diagnostics", "qc/design_diagnostics.tsv"), + ("Statistical Warnings", "qc/statistical_warnings.tsv"), + ("Sample Outlier Metrics", "qc/sample_outlier_metrics.tsv"), + ] + + [ + (f"Result Table: {path.stem}", str(path.relative_to(run_dir))) + for path in sorted((run_dir / "results").glob("*.tsv")) + ], + ) + entries.append( + artifact_entry( + artifact_id="de_review_notebook", + title="DE Review Notebook", + path=notebook_path.relative_to(run_dir), + kind="notebook", + status="created", + description="Marimo review notebook over the key DE plots, diagnostics, and result 
tables.", + ) + ) + if review_app_info: + entries.append( + artifact_entry( + artifact_id="de_review_launch", + title="DE Review App", + path=review_app_info.get("url"), + kind="localhost_app", + status="created" if review_app_info.get("ok") else "blocked", + description="Auto-launched localhost Marimo review app for the generated DE notebook.", + source="notebooks/marimo_server.json", + ) + ) + if review_app_info.get("ok"): + notes.append(f"Review app auto-launched at {review_app_info['url']}.") + else: + notes.append( + "Review app auto-launch did not become ready. See notebooks/marimo_server.json and logs/marimo_server.log." + ) + index = write_visualization_index( + run_dir, + title="Bulk RNA-seq Differential Expression Review Bundle", + description="Human-readable review surface for the DE lane, with an auto-launched Marimo review app and explicit statistical context.", + entries=entries, + notes=notes, + ) + return { + "visualization_index": str(index.relative_to(run_dir)), + "visualization_manifest": "visualizations/visualization_manifest.json", + "review_notebook": str(notebook_path.relative_to(run_dir)), + } + + +def maybe_launch_review_app( + args: argparse.Namespace, run_dir: Path, notebook_path: Path +) -> dict[str, Any]: + info_path = run_dir / "notebooks" / "marimo_server.json" + if not args.launch_review_app: + info = {"ok": False, "error": "Review app auto-launch disabled by CLI flag."} + write_json(info_path, info) + return info + if not importlib.util.find_spec("marimo"): + info = {"ok": False, "error": "marimo is not installed in the current Python environment."} + write_json(info_path, info) + return info + try: + info = launch_marimo_review_app( + notebook_path=notebook_path, + run_dir=run_dir, + start_port=args.review_app_port, + python_executable=os.environ.get("PYTHON_EXECUTABLE_OVERRIDE"), + ) + except Exception as exc: # noqa: BLE001 + info = {"ok": False, "error": str(exc)} + write_json(info_path, info) + return info + + +def parse_args() 
-> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--count-matrix", type=Path, required=True) + parser.add_argument("--sample-metadata", type=Path, required=True) + parser.add_argument("--contrasts", type=Path, required=True) + parser.add_argument( + "--input-mode", + choices=["auto", "raw_counts", "normalized_expression", "log_expression"], + default="auto", + help="Explicit matrix scale. Auto infers from integer-likeness and sign.", + ) + parser.add_argument( + "--method", choices=["auto", "DESeq2", "edgeR", "limma_log2"], default="auto" + ) + parser.add_argument( + "--outdir", + type=Path, + help="Run directory. Defaults to ngs_runs/bulk_rnaseq_de/.", + ) + parser.add_argument("--run-id", default=slug_timestamp("bulk-rnaseq-de")) + parser.add_argument("--execute", action="store_true") + parser.add_argument( + "--launch-review-app", + action=argparse.BooleanOptionalAction, + default=True, + help="Auto-launch the generated Marimo review app on localhost and record its URL in the run envelope.", + ) + parser.add_argument( + "--review-app-port", + type=int, + default=2718, + help="Starting port to use when auto-launching the Marimo review app.", + ) + return parser.parse_args() + + +def main() -> int: + args = parse_args() + run_dir = (args.outdir or (DEFAULT_RUN_ROOT / args.run_id)).expanduser().resolve() + if run_dir.exists(): + raise FileExistsError(f"run directory already exists: {run_dir}") + run_dir.mkdir(parents=True) + (run_dir / "logs").mkdir(parents=True, exist_ok=True) + + validation, method_decision = validate_inputs(args) + tool_status = tool_preflight(["Rscript"], optional=[]) + write_json( + run_dir / "config.json", + { + "method": method_decision, + "inputs": { + "count_matrix": str(args.count_matrix.expanduser().resolve()), + "sample_metadata": str(args.sample_metadata.expanduser().resolve()), + "contrasts": str(args.contrasts.expanduser().resolve()), + }, + }, + ) + write_json(run_dir / 
"validation" / "input_summary.json", {"inputs": validation}) + write_json(run_dir / "validation" / "validation_summary.json", validation) + write_json(run_dir / "validation" / "tool_preflight.json", tool_status) + write_json( + run_dir / "versions" / "software_versions.json", + software_versions({"Rscript": ["Rscript", "--version"]}), + ) + write_workflow(run_dir) + + cmd = r_command(args, run_dir, method_decision) + write_commands(run_dir, cmd) + dry_run = { + "ok": validation.get("ok") and tool_status.get("ok"), + "detail": "input/method validation completed", + } + write_json(run_dir / "logs" / "validation_dry_run.json", dry_run) + write_text(run_dir / "logs" / "validation_dry_run.log", dry_run["detail"] + "\n") + execution: dict[str, Any] | None = None + status = "blocked" if not dry_run["ok"] else "validated" + if args.execute and dry_run["ok"]: + execution = run_cmd(cmd, run_dir, timeout=86400) + write_json(run_dir / "logs" / "rscript_execute.json", execution) + write_text(run_dir / "logs" / "rscript_execute.log", execution.get("stdout_tail", "")) + status = "completed" if execution.get("ok") else "failed" + + outputs = build_outputs_map(str(method_decision.get("input_mode", "normalized_expression"))) + review_bundle = generate_visualizations(run_dir, validation) + review_notebook_path = run_dir / review_bundle["review_notebook"] + review_app_info = ( + maybe_launch_review_app(args, run_dir, review_notebook_path) + if args.execute and status == "completed" and review_notebook_path.exists() + else None + ) + review_bundle = generate_visualizations(run_dir, validation, review_app_info=review_app_info) + outputs["review_app_record"] = "notebooks/marimo_server.json" + outputs.update(review_bundle) + write_summary(run_dir, status, validation, method_decision, review_app_info=review_app_info) + write_standard_manifest( + run_dir, + run_id=args.run_id, + lane="bulk_rnaseq_differential_expression", + workflow="r_bioconductor_bulk_de", + status=status, + 
execute_requested=args.execute, + validation=validation, + tool_preflight_result=tool_status, + dry_run=dry_run, + execution=execution, + inputs={ + "count_matrix": str(args.count_matrix.expanduser().resolve()), + "sample_metadata": str(args.sample_metadata.expanduser().resolve()), + "contrasts": str(args.contrasts.expanduser().resolve()), + }, + outputs=outputs, + method=method_decision, + review_bundle={**review_bundle, "review_app": review_app_info}, + ) + write_json(run_dir / "artifact_index.json", build_artifact_index(run_dir)) + + print(run_dir) + if status in {"blocked", "failed"}: + return 1 + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/plugins/ngs-analysis/scripts/run_chip_cutrun_peaks_qc.py b/plugins/ngs-analysis/scripts/run_chip_cutrun_peaks_qc.py new file mode 100644 index 0000000..6b62ba7 --- /dev/null +++ b/plugins/ngs-analysis/scripts/run_chip_cutrun_peaks_qc.py @@ -0,0 +1,781 @@ +#!/usr/bin/env python3 +"""Run or plan local ChIP-seq, CUT&RUN, or CUT&Tag peak/QC artifacts.""" + +from __future__ import annotations + +import argparse +from pathlib import Path +from typing import Any + +import ngs_resource_gate +from ngs_epigenomics_utils import summarize_epigenomics_outputs +from ngs_planner_utils import ( + command_plan_entry, + normalize_sample_name, + read_table, + resolve_path, + shell_join, + write_command_script, + write_tsv, +) +from ngs_run_utils import ( + build_artifact_index, + run_cmd, + slug_timestamp, + software_versions, + tool_preflight, + write_json, + write_standard_manifest, + write_text, +) +from ngs_visualization_utils import artifact_entry, write_visualization_index + +WORKSPACE_ROOT = Path.cwd() +DEFAULT_RUN_ROOT = WORKSPACE_ROOT / "ngs_runs" / "chip_cutrun_peaks_qc" + + +def is_control_row(target: str | None, condition: str | None, sample: str | None) -> bool: + labels = { + str(target or "").strip().lower(), + str(condition or "").strip().lower(), + str(sample or "").strip().lower(), + } + 
return any( + label in {"input", "igg", "control", "no_antibody", "no-antibody"} for label in labels + ) + + +def validate_inputs(args: argparse.Namespace) -> tuple[dict[str, Any], list[dict[str, str]]]: + sample_sheet = args.sample_sheet.expanduser().resolve() + errors: list[str] = [] + warnings: list[str] = [] + rows: list[dict[str, str]] = [] + columns: list[str] = [] + samples: list[dict[str, str]] = [] + if not sample_sheet.exists(): + errors.append(f"sample sheet does not exist: {sample_sheet}") + else: + rows, columns = read_table(sample_sheet) + if not args.bam_only and not args.bowtie2_index: + warnings.append( + "no --bowtie2-index was provided; FASTQ rows can only be planned, not aligned" + ) + if not args.genome_size: + errors.append("--genome-size is required for MACS2 peak calling") + blacklist = args.blacklist_bed.expanduser().resolve() if args.blacklist_bed else None + if blacklist and not blacklist.exists(): + errors.append(f"blacklist BED does not exist: {blacklist}") + if getattr(args, "run_motifs", False) and not getattr(args, "motif_genome", None): + errors.append( + "--run-motifs requires --motif-genome, for example hg38, mm10, or a HOMER genome identifier" + ) + for row_index, row in enumerate(rows, start=2): + sample = normalize_sample_name( + row.get("sample") or row.get("sample_id"), f"row_{row_index}" + ) + condition = row.get("condition", "") + target = row.get("target") or args.target_class + control_sample = ( + normalize_sample_name( + row.get("control") or row.get("control_sample") or row.get("negative_control"), "" + ) + or "" + ) + bam = resolve_path(row.get("bam") or row.get("alignment"), sample_sheet.parent) + r1 = resolve_path(row.get("r1") or row.get("fastq_1"), sample_sheet.parent) + r2 = resolve_path(row.get("r2") or row.get("fastq_2"), sample_sheet.parent) + control_bam = resolve_path( + row.get("control_bam") or row.get("input_bam") or row.get("igg_bam"), + sample_sheet.parent, + ) + if bam: + if not bam.exists(): + 
errors.append(f"row {row_index}: BAM does not exist: {bam}") + layout = "bam" + elif r1: + if not r1.exists(): + errors.append(f"row {row_index}: R1 FASTQ does not exist: {r1}") + if r2 and not r2.exists(): + errors.append(f"row {row_index}: R2 FASTQ does not exist: {r2}") + layout = "fastq_pe" if r2 else "fastq_se" + else: + errors.append(f"row {row_index}: provide bam/alignment or r1/fastq_1") + continue + if control_bam and not control_bam.exists(): + warnings.append(f"row {row_index}: control BAM does not exist: {control_bam}") + samples.append( + { + "sample": sample, + "condition": condition, + "replicate": row.get("replicate", ""), + "target": target, + "layout": layout, + "bam": str(bam) if bam else "", + "r1": str(r1) if r1 else "", + "r2": str(r2) if r2 else "", + "control_bam": str(control_bam) if control_bam else "", + "control_sample": control_sample, + "is_control": str(is_control_row(target, condition, sample)).lower(), + "row_index": str(row_index), + } + ) + sample_names = {sample["sample"] for sample in samples} + for sample in samples: + control_sample = sample.get("control_sample", "") + if control_sample and control_sample not in sample_names: + errors.append( + f"sample {sample['sample']}: referenced control sample does not exist in sample sheet: {control_sample}" + ) + if ( + args.assay == "chipseq" + and sample["is_control"] != "true" + and not control_sample + and not sample["control_bam"] + ): + warnings.append( + f"row {sample['row_index']}: ChIP-seq usually needs input/IgG control for robust peak calling" + ) + if not samples: + errors.append("no usable ChIP/CUT&RUN samples found") + validation = { + "ok": not errors, + "sample_sheet": str(sample_sheet), + "assay": args.assay, + "columns": columns, + "sample_count": len(samples), + "blacklist_bed": str(blacklist) if blacklist else None, + "genome_size": args.genome_size, + "peak_mode": args.peak_mode, + "run_motifs": getattr(args, "run_motifs", False), + "motif_genome": getattr(args, 
"motif_genome", None), + "motif_size": getattr(args, "motif_size", None), + "errors": errors, + "warnings": warnings, + } + return validation, samples + + +def aligned_bam(sample: dict[str, str]) -> str: + return ( + sample["bam"] if sample["layout"] == "bam" else f"alignment/{sample['sample']}.sorted.bam" + ) + + +def build_plan(args: argparse.Namespace, samples: list[dict[str, str]]) -> list[dict[str, Any]]: + plan: list[dict[str, Any]] = [] + samples_by_name = {sample["sample"]: sample for sample in samples} + + for sample in samples: + name = sample["sample"] + bam = aligned_bam(sample) + filtered_bam = f"alignment/{name}.filtered.bam" + if sample["layout"].startswith("fastq"): + bowtie = [ + "bowtie2", + "-x", + args.bowtie2_index or "MISSING_BOWTIE2_INDEX", + "-p", + str(args.threads), + ] + if sample["r2"]: + bowtie.extend(["-1", sample["r1"], "-2", sample["r2"]]) + else: + bowtie.extend(["-U", sample["r1"]]) + plan.append( + command_plan_entry( + f"{name}: align and sort", + f"{shell_join(bowtie)} | {shell_join(['samtools', 'sort', '-@', str(args.threads), '-o', bam, '-'])}", + outputs=[bam], + ) + ) + plan.append( + command_plan_entry(f"{name}: index aligned BAM", ["samtools", "index", bam]) + ) + plan.append( + command_plan_entry( + f"{name}: filter alignment", + [ + "samtools", + "view", + "-b", + "-q", + str(args.min_mapq), + "-F", + "1804", + "-o", + filtered_bam, + bam, + ], + outputs=[filtered_bam], + ) + ) + plan.append( + command_plan_entry(f"{name}: index filtered BAM", ["samtools", "index", filtered_bam]) + ) + plan.append( + command_plan_entry( + f"{name}: flagstat", + f"{shell_join(['samtools', 'flagstat', filtered_bam])} > {shell_join([f'qc/{name}.flagstat.txt'])}", + outputs=[f"qc/{name}.flagstat.txt"], + ) + ) + plan.append( + command_plan_entry( + f"{name}: insert sizes", + f"{shell_join(['samtools', 'view', '-f', '2', filtered_bam])} | awk '{{t=$9; if (t<0) t=-t; if (t>0) print t}}' > {shell_join([f'qc/{name}.insert_sizes.txt'])}", + 
outputs=[f"qc/{name}.insert_sizes.txt"], + ) + ) + plan.append( + command_plan_entry( + f"{name}: total filtered reads", + f"{shell_join(['samtools', 'view', '-c', filtered_bam])} > {shell_join([f'qc/{name}.filtered_reads.txt'])}", + outputs=[f"qc/{name}.filtered_reads.txt"], + ) + ) + plan.append( + command_plan_entry( + f"{name}: bigWig signal", + [ + "bamCoverage", + "-b", + filtered_bam, + "-o", + f"tracks/{name}.bw", + "--numberOfProcessors", + str(args.threads), + ], + outputs=[f"tracks/{name}.bw"], + ) + ) + + for sample in samples: + if sample.get("is_control") == "true": + continue + + name = sample["sample"] + filtered_bam = f"alignment/{name}.filtered.bam" + peak_name = name + peak_cmd: list[str | Path] = [ + "macs2", + "callpeak", + "-t", + filtered_bam, + "-f", + "BAMPE", + "-g", + args.genome_size, + "-n", + peak_name, + "--outdir", + "peaks", + ] + control_bam = sample["control_bam"] + if not control_bam and sample.get("control_sample"): + control_sample = samples_by_name.get(sample["control_sample"]) + if control_sample: + control_bam = f"alignment/{control_sample['sample']}.filtered.bam" + if control_bam: + peak_cmd.extend(["-c", control_bam]) + if args.peak_mode == "broad": + peak_cmd.extend(["--broad"]) + plan.append( + command_plan_entry( + f"{name}: MACS2 peaks", + peak_cmd, + outputs=[ + f"peaks/{name}_peaks.narrowPeak" + if args.peak_mode == "narrow" + else f"peaks/{name}_peaks.broadPeak" + ], + ) + ) + peak_path = ( + f"peaks/{name}_peaks.narrowPeak" + if args.peak_mode == "narrow" + else f"peaks/{name}_peaks.broadPeak" + ) + if args.blacklist_bed: + plan.append( + command_plan_entry( + f"{name}: blacklist-filter peaks", + f"{shell_join(['bedtools', 'intersect', '-v', '-a', peak_path, '-b', args.blacklist_bed.expanduser().resolve()])} > {shell_join([f'peaks/{name}.blacklist_filtered.{args.peak_mode}Peak'])}", + outputs=[f"peaks/{name}.blacklist_filtered.{args.peak_mode}Peak"], + ) + ) + plan.append( + command_plan_entry( + f"{name}: FRiP 
def write_outputs(
    run_dir: Path,
    validation: dict[str, Any],
    samples: list[dict[str, str]],
    plan: list[dict[str, Any]],
) -> None:
    """Write the planning-stage artifacts of a ChIP/CUT&RUN run under *run_dir*.

    Artifacts are written in this order: the normalized sample table, the JSON
    command plan, ``commands.sh``, the QC contract, the motif-enrichment plan,
    and finally the epigenomics QC summary.

    Args:
        run_dir: Root directory of the run.
        validation: Validation summary; ``warnings``, ``run_motifs``,
            ``motif_genome``, ``motif_size`` and ``peak_mode`` are read from it.
        samples: Normalized sample rows.
        plan: Command-plan entries; each entry's ``"command"`` goes into the script.
    """
    # Column order of validation/samples.normalized.tsv.
    sample_columns = [
        "sample",
        "condition",
        "replicate",
        "target",
        "layout",
        "bam",
        "r1",
        "r2",
        "control_bam",
        "control_sample",
        "is_control",
        "row_index",
    ]
    write_tsv(run_dir / "validation" / "samples.normalized.tsv", samples, sample_columns)

    write_json(run_dir / "workflow" / "chip_cutrun_command_plan.json", {"commands": plan})
    shell_commands = [entry["command"] for entry in plan]
    write_command_script(run_dir / "commands.sh", shell_commands)

    qc_contract = {
        "required_review_metrics": [
            "alignment_rate",
            "duplicate_rate",
            "FRiP",
            "peak_count",
            "blacklist_overlap",
            "control_use",
            "replicate_concordance",
            "signal_tracks",
            "motif_enrichment_if_requested",
        ],
        "available_after_execution": [
            "qc/*.flagstat.txt",
            "qc/*.insert_sizes.txt",
            "qc/*.frip_reads.txt",
            "qc/*.filtered_reads.txt",
            "peaks/*Peak",
            "tracks/*.bw",
            "tracks/browser_tracks.tsv",
            "motifs/motif_summary.tsv",
        ],
        "warnings": validation.get("warnings", []),
    }
    write_json(run_dir / "qc" / "chip_cutrun_qc_contract.json", qc_contract)

    motif_plan = {
        "status": "planned",
        "note": "Motif enrichment requires a motif backend such as HOMER, MEME, or chromVAR and a genome/motif database selected by the user.",
        "enabled": validation.get("run_motifs", False),
        "motif_genome": validation.get("motif_genome"),
        "motif_size": validation.get("motif_size"),
        "input_peak_glob": "peaks/*Peak",
    }
    write_json(run_dir / "motifs" / "motif_enrichment_plan.json", motif_plan)

    summarize_epigenomics_outputs(
        run_dir,
        samples,
        peak_mode=validation.get("peak_mode", "narrow"),
        output_prefix="chip_cutrun_qc",
        title="ChIP/CUT&RUN",
    )
artifact_id="command_plan", + title="Peak Calling Command Plan", + path="workflow/chip_cutrun_command_plan.json", + kind="json", + status="created", + description="Alignment, control-aware peak calling, FRiP, and signal-track commands.", + ), + artifact_entry( + artifact_id="qc_contract", + title="QC Contract", + path="qc/chip_cutrun_qc_contract.json", + kind="json", + status="created", + description="Metrics required before interpreting enrichment peaks.", + ), + artifact_entry( + artifact_id="qc_summary", + title="Epigenomics QC Summary", + path="qc/chip_cutrun_qc_summary.tsv", + kind="table", + status="created", + description="Parsed per-sample alignment, insert-size, FRiP, peak, motif, and track state.", + ), + artifact_entry( + artifact_id="qc_dashboard", + title="Epigenomics QC Dashboard", + path="qc/chip_cutrun_qc_dashboard.html", + kind="html", + status="created", + description="Native dashboard summarizing FRiP, peak counts, insert sizes, track state, control caveats, and motifs.", + ), + artifact_entry( + artifact_id="frip_peak_overview", + title="FRiP And Peak Plot", + path="qc/chip_cutrun_qc_frip_peak_overview.svg", + kind="svg", + status="created", + description="Compact FRiP and peak-count plot generated from parsed run artifacts.", + ), + artifact_entry( + artifact_id="insert_size_distribution", + title="Insert-Size Plot", + path="qc/chip_cutrun_qc_insert_size_distribution.svg", + kind="svg", + status="created", + description="Native insert-size distribution plot generated from parsed fragment sizes.", + ), + artifact_entry( + artifact_id="browser_tracks", + title="Browser Track Manifest", + path="tracks/browser_tracks.tsv", + kind="table", + status="created", + description="bigWig track lines and IGV/UCSC browser handoff metadata.", + ), + artifact_entry( + artifact_id="browser_track_preview", + title="Browser Track Preview", + path="tracks/browser_track_preview.html", + kind="html", + status="created", + description="HTML preview of bigWig track 
def write_summary(
    run_dir: Path,
    status: str,
    validation: dict[str, Any],
    resource_plan: dict[str, Any] | None = None,
) -> None:
    """Render ``summary.md`` for a ChIP/CUT&RUN peaks QC run.

    The document holds a status header, the fixed list of key artifacts, an
    optional warnings section, the resource-gate summary lines, and an
    optional blockers section built from ``validation["errors"]``.

    Args:
        run_dir: Root directory of the run; ``summary.md`` is written there.
        status: Run status shown in the header.
        validation: Validation summary (``assay``, ``sample_count``,
            ``warnings`` and ``errors`` are read).
        resource_plan: Resource plan passed to the resource gate, or ``None``.
    """
    header = [
        "# ChIP/CUT&RUN Peaks QC Run Summary",
        "",
        f"Status: `{status}`",
        f"Assay: `{validation.get('assay')}`",
        f"Samples parsed: `{validation.get('sample_count', 0)}`",
        "",
    ]
    key_artifacts = [
        "## Key Artifacts",
        "",
        "- `workflow/chip_cutrun_command_plan.json`",
        "- `qc/chip_cutrun_qc_contract.json`",
        "- `qc/chip_cutrun_qc_summary.tsv` and `qc/chip_cutrun_qc_summary.json`",
        "- `qc/chip_cutrun_qc_dashboard.html`, `qc/chip_cutrun_qc_frip_peak_overview.svg`, and `qc/chip_cutrun_qc_insert_size_distribution.svg`",
        "- `peaks/*Peak`, `peaks/consensus_peaks.bed`, and `tracks/*.bw` when executed",
        "- `tracks/browser_tracks.tsv`, `tracks/browser_track_preview.html`, `tracks/ucsc_track_lines.txt`, and `tracks/igv_session.xml`",
        "- `motifs/motif_enrichment_plan.json`",
        "- `motifs/motif_summary.tsv` when motif outputs are generated",
        "- `resources/resource_plan.json`, `resource_manifest.tsv`, `resource_env.sh`, `resource_readiness.md`, and resource setup-plan artifacts",
        "- `visualizations/index.html`",
        "- `run_manifest.json` and `artifact_index.json`",
        "",
    ]
    body = header + key_artifacts

    warnings = validation.get("warnings")
    if warnings:
        body += ["## Warnings", ""]
        body += [f"- {message}" for message in warnings]
        body.append("")

    body.extend(ngs_resource_gate.resource_summary_lines(resource_plan))

    blockers = validation.get("errors")
    if blockers:
        body += ["## Blockers", ""]
        body += [f"- {message}" for message in blockers]

    write_text(run_dir / "summary.md", "\n".join(body) + "\n")
def _jsonable(value: Any) -> Any:
    """Return *value* with every ``Path`` (also inside containers) as ``str``."""
    if isinstance(value, Path):
        return str(value)
    if isinstance(value, list):
        return [_jsonable(item) for item in value]
    if isinstance(value, tuple):
        return tuple(_jsonable(item) for item in value)
    if isinstance(value, dict):
        return {key: _jsonable(item) for key, item in value.items()}
    return value


def serializable_args(args: argparse.Namespace) -> dict[str, Any]:
    """Convert parsed CLI arguments into a JSON-friendly dictionary.

    ``Path`` values are rendered as strings. The conversion also reaches into
    lists, tuples and dicts, so options declared with ``type=Path`` and
    ``action="append"`` serialize cleanly instead of failing in ``json.dumps``.
    All other values are passed through unchanged.

    Args:
        args: Namespace returned by ``argparse``.

    Returns:
        A new dict keyed by argument name.
    """
    return {key: _jsonable(value) for key, value in vars(args).items()}
any(row["layout"].startswith("fastq") for row in samples) + required_tools = ( + ["samtools", "macs2", "bedtools", "bamCoverage"] + (["bowtie2"] if needs_alignment else []) + if args.execute + else [] + ) + if args.execute and args.run_motifs: + required_tools.append("findMotifsGenome.pl") + optional_tools = [ + name + for name in [ + "samtools", + "macs2", + "bedtools", + "bamCoverage", + "bowtie2", + "findMotifsGenome.pl", + "multiqc", + ] + if name not in required_tools + ] + tool_status = tool_preflight(required_tools, optional=optional_tools) + plan = build_plan(args, samples) + write_json(run_dir / "config.json", {**serializable_args(args), "run_dir": str(run_dir)}) + write_json(run_dir / "validation" / "input_validation_summary.json", input_validation) + write_json(run_dir / "validation" / "validation_summary.json", validation) + write_json(run_dir / "validation" / "tool_preflight.json", tool_status) + write_json( + run_dir / "versions" / "software_versions.json", + software_versions( + { + "samtools": ["samtools", "--version"], + "macs2": ["macs2", "--version"], + "bedtools": ["bedtools", "--version"], + "bowtie2": ["bowtie2", "--version"], + "bamCoverage": ["bamCoverage", "--version"], + } + ), + ) + write_outputs(run_dir, validation, samples, plan) + dry_run = { + "ok": validation["ok"] and (tool_status["ok"] if args.execute else True), + "detail": "ChIP/CUT&RUN sample, control, metadata, and backend tool validation completed", + } + write_json(run_dir / "logs" / "validation_dry_run.json", dry_run) + status = "blocked" if not dry_run["ok"] else "validated" + execution = None + if args.execute and dry_run["ok"]: + execution = execute_plan(run_dir, plan) + status = "completed" if execution.get("ok") else "failed" + summarize_epigenomics_outputs( + run_dir, + samples, + peak_mode=args.peak_mode, + output_prefix="chip_cutrun_qc", + title="ChIP/CUT&RUN", + ) + visuals = write_visuals(run_dir, status, validation, resource_plan) + resource_outputs = 
ngs_resource_gate.resource_output_paths(resource_plan) + write_standard_manifest( + run_dir, + run_id=args.run_id, + lane="chip_cutrun_peaks_qc", + workflow="local_light_chip_cutrun_alignment_peaks_qc", + status=status, + execute_requested=args.execute, + validation=validation, + tool_preflight_result=tool_status, + dry_run=dry_run, + execution=execution, + inputs={ + "sample_sheet": str(args.sample_sheet.expanduser().resolve()), + "blacklist_bed": str(args.blacklist_bed.expanduser().resolve()) + if args.blacklist_bed + else None, + **( + {"resource_plan": resource_outputs.get("resource_plan")} if resource_outputs else {} + ), + }, + outputs={ + "sample_table": "validation/samples.normalized.tsv", + "command_plan": "workflow/chip_cutrun_command_plan.json", + "qc_contract": "qc/chip_cutrun_qc_contract.json", + "qc_summary": "qc/chip_cutrun_qc_summary.tsv", + "qc_summary_json": "qc/chip_cutrun_qc_summary.json", + "qc_dashboard": "qc/chip_cutrun_qc_dashboard.html", + "frip_peak_overview": "qc/chip_cutrun_qc_frip_peak_overview.svg", + "insert_size_distribution": "qc/chip_cutrun_qc_insert_size_distribution.svg", + "peaks": "peaks/*Peak", + "consensus_peaks": "peaks/consensus_peaks.bed", + "tracks": "tracks/*.bw", + "browser_tracks": "tracks/browser_tracks.tsv", + "browser_track_preview": "tracks/browser_track_preview.html", + "igv_session": "tracks/igv_session.xml", + "motif_plan": "motifs/motif_enrichment_plan.json", + "motif_summary": "motifs/motif_summary.tsv", + **resource_outputs, + **visuals, + }, + method={ + "assay": args.assay, + "peak_caller": "MACS2", + "peak_mode": args.peak_mode, + "frip": "bedtools intersect + samtools count", + "motif_enrichment": "HOMER findMotifsGenome.pl when --run-motifs is supplied", + "resource_plan": resource_plan, + }, + audit={"resource_readiness": resource_plan} if resource_plan else None, + review_bundle=visuals, + ) + write_summary(run_dir, status, validation, resource_plan) + write_json(run_dir / "artifact_index.json", 
def detect_delimiter(path: Path) -> str:
    """Return the field delimiter implied by the extension of *path*.

    ``.tsv`` and ``.tab`` (case-insensitive) select a tab; anything else is
    treated as comma-separated.
    """
    return "\t" if path.suffix.lower() in {".tsv", ".tab"} else ","


def read_samples(path: Path) -> tuple[list[dict[str, str]], list[str]]:
    """Read a delimited sample sheet into whitespace-stripped row dicts.

    Args:
        path: Sample sheet; the delimiter is chosen by ``detect_delimiter``.

    Returns:
        ``(rows, columns)`` where ``rows`` maps header name to stripped cell
        text (missing cells become ``""``) and ``columns`` is the stripped
        header in file order.
    """
    # utf-8-sig drops a leading BOM (common in spreadsheet exports) so the
    # first header is "sample" rather than "\ufeffsample"; BOM-less UTF-8
    # decodes exactly as before.
    with path.open(newline="", encoding="utf-8-sig") as handle:
        reader = csv.DictReader(handle, delimiter=detect_delimiter(path))
        rows = [
            {
                key.strip(): (value or "").strip()
                for key, value in row.items()
                # DictReader stores surplus cells as a list under key None;
                # they have no column name, so they are dropped rather than
                # crashing on list.strip().
                if key is not None
            }
            for row in reader
        ]
        columns = [name.strip() for name in reader.fieldnames or []]
    return rows, columns


def bqsr_enabled(args: argparse.Namespace) -> bool:
    """Return whether base-quality recalibration should run.

    ``--bqsr-mode off`` disables it, ``force`` enables it, and any other mode
    enables it only when known-sites resources were supplied.
    """
    if args.bqsr_mode == "off":
        return False
    if args.bqsr_mode == "force":
        return True
    return bool(args.known_sites)


def use_gvcf(args: argparse.Namespace, sample_count: int) -> bool:
    """Return whether HaplotypeCaller should emit per-sample gVCFs.

    True when gVCF output or joint calling was requested, or when more than
    one sample is being processed.
    """
    return bool(args.emit_gvcf or args.joint_call or sample_count > 1)
= args.sample_sheet.expanduser().resolve() + reference = args.reference_fasta.expanduser().resolve() + target_bed = args.target_bed.expanduser().resolve() if args.target_bed else None + rows, columns = read_samples(sample_sheet) + errors: list[str] = [] + warnings: list[str] = [] + normalized: list[dict[str, str]] = [] + + if not reference.exists(): + errors.append(f"reference FASTA does not exist: {reference}") + if not (Path(str(reference) + ".fai")).exists(): + warnings.append( + f"reference FASTA index is missing and may be created by samtools faidx: {reference}.fai" + ) + if not (reference.with_suffix(".dict")).exists(): + warnings.append( + f"reference sequence dictionary is missing and may be created by GATK: {reference.with_suffix('.dict')}" + ) + if target_bed and not target_bed.exists(): + errors.append(f"target BED does not exist: {target_bed}") + if args.bqsr_mode == "force" and not args.known_sites: + errors.append("BQSR was forced but no --known-sites VCFs were provided") + + known_sites: list[str] = [] + for item in args.known_sites: + resource = item.expanduser().resolve() + known_sites.append(str(resource)) + if not resource.exists(): + errors.append(f"known-sites VCF does not exist: {resource}") + if ( + not (Path(str(resource) + ".tbi")).exists() + and not (Path(str(resource) + ".csi")).exists() + ): + warnings.append( + f"known-sites VCF index is missing and may be required by GATK: {resource}.tbi" + ) + + for row_index, row in enumerate(rows, start=2): + sample = row.get("sample") or row.get("sample_id") or f"row_{row_index}" + bam_raw = row.get("bam") or row.get("cram") or "" + if not bam_raw: + errors.append(f"row {row_index}: bam or cram column is required") + continue + bam = Path(bam_raw).expanduser() + if not bam.is_absolute(): + bam = sample_sheet.parent / bam + bam = bam.resolve() + if not bam.exists(): + errors.append(f"row {row_index}: alignment file does not exist: {bam}") + if bam.suffix == ".bam" and not (Path(str(bam) + 
".bai")).exists(): + warnings.append( + f"row {row_index}: BAM index is missing and may be created by samtools index: {bam}.bai" + ) + normalized.append({"sample": sample, "alignment": str(bam), "row_index": str(row_index)}) + + if not normalized: + errors.append("no usable alignment rows found") + if args.joint_call and len(normalized) < 2: + warnings.append( + "joint calling was requested with fewer than two samples; GenotypeGVCFs can still run, but a cohort VCF may not add value" + ) + + validation = { + "ok": not errors, + "sample_sheet": str(sample_sheet), + "reference_fasta": str(reference), + "target_bed": str(target_bed) if target_bed else None, + "known_sites": known_sites, + "columns": columns, + "sample_count": len(normalized), + "sample_model": args.sample_model, + "bqsr_enabled": bqsr_enabled(args), + "emit_gvcf": use_gvcf(args, len(normalized)), + "joint_call": args.joint_call, + "errors": errors, + "warnings": warnings, + } + return validation, normalized + + +def write_normalized_samples(run_dir: Path, rows: list[dict[str, str]]) -> None: + path = run_dir / "validation" / "samples.normalized.tsv" + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", newline="", encoding="utf-8") as handle: + writer = csv.DictWriter( + handle, fieldnames=["sample", "alignment", "row_index"], delimiter="\t" + ) + writer.writeheader() + writer.writerows(rows) + + +def write_commands(run_dir: Path, args: argparse.Namespace, rows: list[dict[str, str]]) -> None: + reference = args.reference_fasta.expanduser().resolve() + lines = [ + "#!/usr/bin/env bash", + "set -euo pipefail", + shlex.join(["samtools", "faidx", str(reference)]), + ] + if bqsr_enabled(args): + for row in rows: + sample = row["sample"] + known_sites_args = [] + for resource in args.known_sites: + known_sites_args.extend(["--known-sites", str(resource.expanduser().resolve())]) + lines.append( + shlex.join( + [ + "gatk", + "BaseRecalibrator", + "-R", + str(reference), + "-I", + 
def execute(run_dir: Path, args: argparse.Namespace, rows: list[dict[str, str]]) -> dict[str, Any]:
    """Run the germline calling workflow and report per-step success.

    Steps: create the FASTA index and sequence dictionary when missing; per
    sample, optionally run BaseRecalibrator/ApplyBQSR and index the
    recalibrated BAM, then quickcheck, flagstat, idxstats and HaplotypeCaller;
    finally CombineGVCFs + GenotypeGVCFs when joint calling was requested.
    Every command result is written to ``logs/`` as JSON.

    Args:
        run_dir: Root directory of the run; outputs go to its subdirectories.
        args: Parsed CLI arguments.
        rows: Normalized sample rows with ``sample`` and ``alignment`` keys.

    Returns:
        ``{"ok": bool, "steps": [{"name": ..., "ok": ...}, ...]}``; ``ok`` is
        false as soon as any recorded step fails.
    """
    reference = args.reference_fasta.expanduser().resolve()
    logs_dir = run_dir / "logs"
    results: dict[str, Any] = {"ok": True, "steps": []}
    for dirname in ("qc", "recal", "gvcf", "variants", "joint"):
        (run_dir / dirname).mkdir(parents=True, exist_ok=True)

    def record(name: str, ok: Any) -> None:
        """Append a step outcome and fold it into the overall status."""
        results["steps"].append({"name": name, "ok": ok})
        results["ok"] = bool(results["ok"] and ok)

    if not (Path(str(reference) + ".fai")).exists():
        faidx = run_cmd(["samtools", "faidx", str(reference)], run_dir, timeout=600)
        write_json(logs_dir / "samtools_faidx.json", faidx)
        record("samtools_faidx", faidx.get("ok"))

    reference_dict = reference.with_suffix(".dict")
    if not reference_dict.exists():
        create_dict = run_cmd(
            ["gatk", "CreateSequenceDictionary", "-R", str(reference), "-O", str(reference_dict)],
            run_dir,
            timeout=600,
        )
        write_json(logs_dir / "gatk_create_dict.json", create_dict)
        record("gatk_create_dict", create_dict.get("ok"))

    # These depend only on the CLI arguments, so compute them once instead of
    # once per sample.
    recalibrate = bqsr_enabled(args)
    gvcf_mode = use_gvcf(args, len(rows))
    known_sites_args: list[str] = []
    if recalibrate:
        for resource in args.known_sites:
            known_sites_args.extend(["--known-sites", str(resource.expanduser().resolve())])
    interval_args = (
        ["-L", str(args.target_bed.expanduser().resolve())] if args.target_bed else []
    )

    gvcfs: list[Path] = []
    for row in rows:
        sample = row["sample"]
        input_bam = Path(row["alignment"])
        sample_steps: list[dict[str, Any]] = []

        if recalibrate:
            recal_table = run_dir / "recal" / f"{sample}.recal.table"
            recal_bam = run_dir / "recal" / f"{sample}.recal.bam"
            recal = run_cmd(
                [
                    "gatk",
                    "BaseRecalibrator",
                    "-R",
                    str(reference),
                    "-I",
                    str(input_bam),
                    *known_sites_args,
                    "-O",
                    str(recal_table),
                ],
                run_dir,
                timeout=3600,
            )
            write_json(logs_dir / f"{sample}.base_recalibrator.json", recal)
            apply_bqsr = run_cmd(
                [
                    "gatk",
                    "ApplyBQSR",
                    "-R",
                    str(reference),
                    "-I",
                    str(input_bam),
                    "--bqsr-recal-file",
                    str(recal_table),
                    "-O",
                    str(recal_bam),
                ],
                run_dir,
                timeout=3600,
            )
            write_json(logs_dir / f"{sample}.apply_bqsr.json", apply_bqsr)
            input_bam = recal_bam
            bam_index = run_cmd(["samtools", "index", str(input_bam)], run_dir, timeout=600)
            write_json(logs_dir / f"{sample}.recal_index.json", bam_index)
            # Recalibration outcomes count towards the sample status, so a
            # failed BQSR step cannot be masked by later steps.
            sample_steps.extend([recal, apply_bqsr, bam_index])

        quickcheck = run_cmd(["samtools", "quickcheck", "-v", str(input_bam)], run_dir, timeout=300)
        write_json(logs_dir / f"{sample}.quickcheck.json", quickcheck)
        flagstat = run_cmd(["samtools", "flagstat", str(input_bam)], run_dir, timeout=600)
        write_json(logs_dir / f"{sample}.flagstat.json", flagstat)
        # NOTE(review): the QC files are built from "stdout_tail"; confirm that
        # run_cmd does not truncate it for long idxstats output.
        write_text(run_dir / "qc" / f"{sample}.flagstat.txt", flagstat.get("stdout_tail", ""))
        idxstats = run_cmd(["samtools", "idxstats", str(input_bam)], run_dir, timeout=600)
        write_json(logs_dir / f"{sample}.idxstats.json", idxstats)
        write_text(run_dir / "qc" / f"{sample}.idxstats.tsv", idxstats.get("stdout_tail", ""))

        if gvcf_mode:
            output_vcf = run_dir / "gvcf" / f"{sample}.g.vcf.gz"
        else:
            output_vcf = run_dir / "variants" / f"{sample}.vcf.gz"
        haplotype_caller_cmd = [
            "gatk",
            "HaplotypeCaller",
            "-R",
            str(reference),
            "-I",
            str(input_bam),
            *interval_args,
        ]
        if gvcf_mode:
            haplotype_caller_cmd.extend(["-ERC", "GVCF"])
        haplotype_caller_cmd.extend(["-O", str(output_vcf)])
        hc = run_cmd(haplotype_caller_cmd, run_dir, timeout=7200)
        write_json(logs_dir / f"{sample}.haplotypecaller.json", hc)
        sample_steps.extend([quickcheck, flagstat, idxstats, hc])

        record(sample, all(bool(step.get("ok")) for step in sample_steps))
        if gvcf_mode:
            gvcfs.append(output_vcf)

    if args.joint_call and gvcfs:
        combined_gvcf = run_dir / "joint" / "cohort.combined.g.vcf.gz"
        combine_cmd = ["gatk", "CombineGVCFs", "-R", str(reference)]
        for item in gvcfs:
            combine_cmd.extend(["-V", str(item)])
        combine_cmd.extend(["-O", str(combined_gvcf)])
        combine = run_cmd(combine_cmd, run_dir, timeout=7200)
        write_json(logs_dir / "cohort.combine_gvcfs.json", combine)
        genotype = run_cmd(
            [
                "gatk",
                "GenotypeGVCFs",
                "-R",
                str(reference),
                "-V",
                str(combined_gvcf),
                "-O",
                str(run_dir / "joint" / "cohort.joint.vcf.gz"),
            ],
            run_dir,
            timeout=7200,
        )
        write_json(logs_dir / "cohort.genotype_gvcfs.json", genotype)
        record("joint_call", bool(combine.get("ok") and genotype.get("ok")))
    return results
str, + validation: dict[str, Any], + resource_plan: dict[str, Any] | None = None, +) -> dict[str, str]: + entries = [ + artifact_entry( + artifact_id="sample_table", + title="Resolved Sample Table", + path="validation/samples.normalized.tsv", + kind="table", + status="created", + description="Resolved sample table with absolute alignment paths used by the germline runner.", + ), + ] + review_outputs = add_vcf_review_notebook_entry( + run_dir, + entries, + title="Germline DNA VCF Review", + table_items=[("Resolved Sample Table", "validation/samples.normalized.tsv")], + object_items=[("Run Summary", "summary.md"), ("Artifact Index", "artifact_index.json")], + ) + entries.extend(ngs_resource_gate.resource_visual_entries(resource_plan)) + index = write_visualization_index( + run_dir, + title="Germline DNA Review Bundle", + description="Review surface for the GATK germline lane, including generic VCF/gVCF notebook previews when variant artifacts are present.", + entries=entries, + notes=[ + *validation.get("warnings", []), + *ngs_resource_gate.resource_messages(resource_plan), + ], + analysis_intent="real_analysis" if status != "blocked" else "blocked_preflight", + provenance_summary={ + "status": status, + "sample_count": validation.get("sample_count", 0), + "resource_plan_ok": validation.get("resource_plan_ok"), + }, + ) + return { + "visualization_index": str(index.relative_to(run_dir)), + "visualization_manifest": "visualizations/visualization_manifest.json", + **review_outputs, + } + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--sample-sheet", type=Path, required=True) + parser.add_argument("--reference-fasta", type=Path, required=True) + parser.add_argument( + "--known-sites", + type=Path, + action="append", + default=[], + help="Repeat for each known-sites VCF used by BaseRecalibrator.", + ) + parser.add_argument("--target-bed", type=Path) + parser.add_argument( + "--sample-model", + 
def main() -> int:
    """Validate inputs, optionally execute, and write all run artifacts.

    Creates a fresh run directory, validates the sample sheet and reference
    inputs, merges the resource-plan status, preflights tools, writes the
    config/validation/command artifacts, runs the workflow when ``--execute``
    is set and validation passed, and finally writes the manifest, summary
    and artifact index.

    Returns:
        Process exit code: ``1`` when the run is blocked or failed, else ``0``.

    Raises:
        FileExistsError: If the run directory already exists.
    """
    args = parse_args()
    run_dir = (args.outdir or (DEFAULT_RUN_ROOT / args.run_id)).expanduser().resolve()
    if run_dir.exists():
        raise FileExistsError(f"run directory already exists: {run_dir}")
    run_dir.mkdir(parents=True)
    (run_dir / "logs").mkdir(parents=True, exist_ok=True)

    # Resolve user-supplied paths once; config.json and the manifest share them.
    sample_sheet = str(args.sample_sheet.expanduser().resolve())
    reference_fasta = str(args.reference_fasta.expanduser().resolve())
    known_sites = [str(item.expanduser().resolve()) for item in args.known_sites]
    target_bed = str(args.target_bed.expanduser().resolve()) if args.target_bed else None

    input_validation, rows = validate_inputs(args)
    resource_plan = ngs_resource_gate.write_pipeline_resource_plan(
        run_dir=run_dir,
        pipeline="dna_germline_variants",
        genome_build=args.genome_build,
        bundle_roots=args.bundle_root,
        include_optional=args.include_optional_resources,
        include_checksums=args.resource_checksums,
        skip=args.skip_resource_plan,
        required=args.require_resource_plan,
    )
    validation = ngs_resource_gate.merge_resource_status(
        input_validation,
        resource_plan,
        required=args.require_resource_plan,
    )
    # Tools are only mandatory when commands will actually be executed.
    required_tools = ["samtools", "gatk"] if args.execute else []
    optional_tools = [
        name
        for name in ["samtools", "gatk", "bcftools", "deepvariant"]
        if name not in required_tools
    ]
    tool_status = tool_preflight(required_tools, optional=optional_tools)
    write_json(
        run_dir / "config.json",
        {
            "reference_fasta": reference_fasta,
            "known_sites": known_sites,
            "target_bed": target_bed,
            "sample_model": args.sample_model,
            "bqsr_mode": args.bqsr_mode,
            "emit_gvcf": args.emit_gvcf,
            "joint_call": args.joint_call,
        },
    )
    write_json(run_dir / "validation" / "input_summary.json", {"samples": rows})
    write_json(run_dir / "validation" / "input_validation_summary.json", input_validation)
    write_json(run_dir / "validation" / "validation_summary.json", validation)
    write_json(run_dir / "validation" / "tool_preflight.json", tool_status)
    write_normalized_samples(run_dir, rows)
    write_commands(run_dir, args, rows)
    write_json(
        run_dir / "versions" / "software_versions.json",
        software_versions(
            {
                "samtools": ["samtools", "--version"],
                "gatk": ["gatk", "--version"],
                "bcftools": ["bcftools", "--version"],
            }
        ),
    )

    dry_run_ok = validation["ok"] and (tool_status["ok"] if args.execute else True)
    dry_run = {"ok": dry_run_ok, "detail": "input, resource, and tool validation completed"}
    write_json(run_dir / "logs" / "validation_dry_run.json", dry_run)
    execution = None
    status = "blocked" if not dry_run["ok"] else "validated"
    if args.execute and dry_run["ok"]:
        execution = execute(run_dir, args, rows)
        status = "completed" if execution.get("ok") else "failed"

    gvcf_mode = use_gvcf(args, len(rows))
    visuals = write_visuals(run_dir, status, validation, resource_plan)
    resource_outputs = ngs_resource_gate.resource_output_paths(resource_plan)
    write_standard_manifest(
        run_dir,
        run_id=args.run_id,
        lane="dna_germline_variants",
        workflow="gatk_bqsr_haplotypecaller_joint_genotyping",
        status=status,
        execute_requested=args.execute,
        validation=validation,
        tool_preflight_result=tool_status,
        dry_run=dry_run,
        execution=execution,
        inputs={
            "sample_sheet": sample_sheet,
            "reference_fasta": reference_fasta,
            "known_sites": known_sites,
            "target_bed": target_bed,
            **(
                {"resource_plan": resource_outputs.get("resource_plan")} if resource_outputs else {}
            ),
        },
        outputs={
            "flagstat_glob": "qc/*.flagstat.txt",
            "idxstats_glob": "qc/*.idxstats.tsv",
            "recal_table_glob": "recal/*.recal.table",
            "recal_bam_glob": "recal/*.recal.bam",
            "gvcf_glob": "gvcf/*.g.vcf.gz",
            # Without gVCF mode HaplotypeCaller writes plain VCFs under
            # variants/; list them so the final calls appear in the manifest.
            "vcf_glob": None if gvcf_mode else "variants/*.vcf.gz",
            "joint_vcf_glob": "joint/cohort.joint.vcf.gz" if args.joint_call else None,
            **resource_outputs,
            **visuals,
        },
        method={
            "sample_model": args.sample_model,
            "bqsr_mode": args.bqsr_mode,
            "emit_gvcf": gvcf_mode,
            "joint_call": args.joint_call,
            "resource_plan": resource_plan,
        },
        audit={"resource_readiness": resource_plan} if resource_plan else None,
        review_bundle=visuals,
    )
    write_summary(run_dir, status, validation, resource_plan)
    write_json(run_dir / "artifact_index.json", build_artifact_index(run_dir))
    print(run_dir)
    return 1 if status in {"blocked", "failed"} else 0
b/plugins/ngs-analysis/scripts/run_dna_somatic_variants.py @@ -0,0 +1,844 @@ +#!/usr/bin/env python3 +"""Run or plan local somatic SNV/indel calling with GATK Mutect2.""" + +from __future__ import annotations + +import argparse +from pathlib import Path +from typing import Any + +import ngs_resource_gate +from ngs_planner_utils import ( + command_plan_entry, + normalize_sample_name, + read_table, + resolve_path, + shell_join, + write_command_script, + write_tsv, +) +from ngs_run_utils import ( + build_artifact_index, + run_cmd, + slug_timestamp, + software_versions, + tool_preflight, + write_json, + write_standard_manifest, + write_text, +) +from ngs_visualization_utils import ( + add_vcf_review_notebook_entry, + artifact_entry, + write_visualization_index, +) + +WORKSPACE_ROOT = Path.cwd() +DEFAULT_RUN_ROOT = WORKSPACE_ROOT / "ngs_runs" / "dna_somatic_variants" +SOMATIC_PAIR_REVIEW_FIELDS = [ + "pair_id", + "design", + "tumor_sample", + "normal_sample", + "filtered_vcf", + "filtered_vcf_exists", + "bcftools_stats", + "variant_records", + "snp_count", + "indel_count", + "contamination_table", + "contamination_table_exists", + "panel_of_normals_ready", + "germline_resource_ready", + "orientation_bias_model_requested", + "status", + "notes", +] + + +def parse_first_int(value: str) -> int | None: + try: + return int(str(value).strip().split()[0]) + except (ValueError, IndexError): + return None + + +def parse_bcftools_stats(path: Path) -> dict[str, int | None]: + metrics: dict[str, int | None] = { + "variant_records": None, + "snp_count": None, + "indel_count": None, + } + if not path.exists(): + return metrics + for line in path.read_text(encoding="utf-8", errors="replace").splitlines(): + if not line.startswith("SN\t0\t"): + continue + fields = line.split("\t") + if len(fields) < 4: + continue + value = parse_first_int(fields[3]) + if fields[2] == "number of records:": + metrics["variant_records"] = value + elif fields[2] == "number of SNPs:": + metrics["snp_count"] 
def optional_existing_path(
    raw: str | None,
    base: Path,
    errors: list[str],
    warnings: list[str],
    label: str,
    *,
    required: bool = False,
) -> Path | None:
    """Resolve *raw* against *base* and record a message if it is unusable.

    Args:
        raw: Path text to resolve; handed to ``resolve_path`` unchanged.
        base: Directory passed to ``resolve_path`` as the resolution base.
        errors: List that receives a message when a *required* path is
            missing or does not exist.
        warnings: List that receives a message when an optional path does
            not exist.
        label: Human-readable name used as the prefix of every message.
        required: Whether an absent or non-existent path is an error.

    Returns:
        The resolved path (even when it does not exist on disk), or ``None``
        when ``resolve_path`` yields ``None``.
    """
    resolved = resolve_path(raw, base)
    if resolved is None:
        if required:
            errors.append(f"{label} is required")
        return None
    if resolved.exists():
        return resolved
    # A missing file blocks the run only for required inputs; for optional
    # inputs it is reported as a warning. The path is returned either way.
    sink = errors if required else warnings
    sink.append(f"{label} does not exist: {resolved}")
    return resolved
"panel-of-normals VCF", + ) + germline_resource = optional_existing_path( + str(args.germline_resource) if args.germline_resource else None, + sample_sheet.parent, + errors, + warnings, + "germline resource VCF", + ) + annotation_vcf = optional_existing_path( + str(args.annotation_vcf) if args.annotation_vcf else None, + sample_sheet.parent, + errors, + warnings, + "annotation VCF", + ) + + for row_index, row in enumerate(rows, start=2): + pair_id = normalize_sample_name( + row.get("pair_id") + or row.get("case_id") + or row.get("sample") + or row.get("tumor_sample"), + f"row_{row_index}", + ) + tumor_sample = normalize_sample_name( + row.get("tumor_sample") or row.get("sample") or pair_id, f"{pair_id}_tumor" + ) + normal_sample = ( + normalize_sample_name(row.get("normal_sample"), f"{pair_id}_normal") + if row.get("normal_sample") + else "" + ) + tumor_bam = optional_existing_path( + row.get("tumor_bam") or row.get("tumor_cram") or row.get("bam") or row.get("cram"), + sample_sheet.parent, + errors, + warnings, + f"row {row_index} tumor BAM/CRAM", + required=True, + ) + normal_bam = optional_existing_path( + row.get("normal_bam") or row.get("normal_cram"), + sample_sheet.parent, + errors, + warnings, + f"row {row_index} normal BAM/CRAM", + ) + if tumor_bam and tumor_bam.suffix == ".bam" and not Path(str(tumor_bam) + ".bai").exists(): + warnings.append( + f"row {row_index}: tumor BAM index is missing and may be created by samtools index: {tumor_bam}.bai" + ) + if ( + normal_bam + and normal_bam.suffix == ".bam" + and not Path(str(normal_bam) + ".bai").exists() + ): + warnings.append( + f"row {row_index}: normal BAM index is missing and may be created by samtools index: {normal_bam}.bai" + ) + if tumor_bam: + design = "tumor_normal" if normal_bam else "tumor_only" + if design == "tumor_only": + warnings.append( + f"row {row_index}: tumor-only somatic calling requires stronger germline filtering caveats" + ) + pairs.append( + { + "pair_id": pair_id, + "design": 
def mutect2_plan(args: argparse.Namespace, pairs: list[dict[str, str]]) -> list[dict[str, Any]]:
    """Build the ordered command plan for every tumor(/normal) pair.

    Per pair, the plan contains, in this order: the Mutect2 call, the
    optional read-orientation model, the optional pileup/contamination steps
    (only when a germline resource was given), FilterMutectCalls, the
    optional bcftools annotation, and bcftools stats on the filtered VCF.
    Output locations are relative paths (``variants/``, ``f1r2/``, ``qc/``).

    Args:
        args: Parsed CLI namespace; reads ``reference_fasta``,
            ``germline_resource``, ``panel_of_normals``, ``target_bed``,
            ``annotation_vcf`` and ``f1r2_orientation_model``.
        pairs: Normalized pair records as produced by ``validate_inputs``.

    Returns:
        The list of entries returned by ``command_plan_entry``.
    """

    def absolute(path: Path) -> Path:
        # User-expanded, fully resolved form of a CLI-supplied path.
        return path.expanduser().resolve()

    reference = absolute(args.reference_fasta)
    plan: list[dict[str, Any]] = []
    for pair in pairs:
        pair_id = pair["pair_id"]
        tumor_bam = Path(pair["tumor_alignment"])
        unfiltered_vcf = f"variants/{pair_id}.unfiltered.vcf.gz"
        filtered_vcf = f"variants/{pair_id}.filtered.vcf.gz"

        # --- Mutect2 call -------------------------------------------------
        call_cmd: list[str | Path] = [
            "gatk",
            "Mutect2",
            "-R",
            reference,
            "-I",
            tumor_bam,
            "-tumor",
            pair["tumor_sample"],
        ]
        normal_alignment = pair["normal_alignment"]
        if normal_alignment:
            call_cmd += ["-I", Path(normal_alignment), "-normal", pair["normal_sample"]]
        if args.germline_resource:
            call_cmd += ["--germline-resource", absolute(args.germline_resource)]
        if args.panel_of_normals:
            call_cmd += ["-pon", absolute(args.panel_of_normals)]
        if args.target_bed:
            call_cmd += ["-L", absolute(args.target_bed)]
        f1r2_archive = f"f1r2/{pair_id}.f1r2.tar.gz"
        orientation_model = f"f1r2/{pair_id}.read-orientation-model.tar.gz"
        if args.f1r2_orientation_model:
            call_cmd += ["--f1r2-tar-gz", f1r2_archive]
        call_cmd += ["-O", unfiltered_vcf]
        plan.append(command_plan_entry(f"{pair_id}: mutect2", call_cmd, outputs=[unfiltered_vcf]))

        # --- Optional orientation-bias model ------------------------------
        if args.f1r2_orientation_model:
            plan.append(
                command_plan_entry(
                    f"{pair_id}: learn read orientation model",
                    [
                        "gatk",
                        "LearnReadOrientationModel",
                        "-I",
                        f1r2_archive,
                        "-O",
                        orientation_model,
                    ],
                    outputs=[orientation_model],
                )
            )

        # --- Optional contamination estimate (needs a germline resource) ---
        contamination_table = f"qc/{pair_id}.contamination.table"
        contamination_flags: list[str | Path] = []
        if args.germline_resource:
            # Pileups are restricted to the target BED when one is given,
            # otherwise to the germline resource sites themselves.
            if args.target_bed:
                pileup_intervals = absolute(args.target_bed)
            else:
                pileup_intervals = absolute(args.germline_resource)
            tumor_pileups = f"qc/{pair_id}.tumor.pileups.table"
            plan.append(
                command_plan_entry(
                    f"{pair_id}: tumor pileup summaries",
                    [
                        "gatk",
                        "GetPileupSummaries",
                        "-I",
                        tumor_bam,
                        "-V",
                        absolute(args.germline_resource),
                        "-L",
                        pileup_intervals,
                        "-O",
                        tumor_pileups,
                    ],
                    outputs=[tumor_pileups],
                )
            )
            contamination_flags += ["--contamination-table", contamination_table]
            estimate_cmd: list[str | Path] = [
                "gatk",
                "CalculateContamination",
                "-I",
                tumor_pileups,
            ]
            if normal_alignment:
                # A matched normal contributes its own pileups to the estimate.
                normal_pileups = f"qc/{pair_id}.normal.pileups.table"
                plan.append(
                    command_plan_entry(
                        f"{pair_id}: normal pileup summaries",
                        [
                            "gatk",
                            "GetPileupSummaries",
                            "-I",
                            Path(normal_alignment),
                            "-V",
                            absolute(args.germline_resource),
                            "-L",
                            pileup_intervals,
                            "-O",
                            normal_pileups,
                        ],
                        outputs=[normal_pileups],
                    )
                )
                estimate_cmd += ["-matched", normal_pileups]
            estimate_cmd += ["-O", contamination_table]
            plan.append(
                command_plan_entry(
                    f"{pair_id}: contamination estimate",
                    estimate_cmd,
                    outputs=[contamination_table],
                )
            )

        # --- Filtering ------------------------------------------------------
        filter_cmd: list[str | Path] = [
            "gatk",
            "FilterMutectCalls",
            "-R",
            reference,
            "-V",
            unfiltered_vcf,
            "-O",
            filtered_vcf,
        ]
        filter_cmd += contamination_flags
        if args.f1r2_orientation_model:
            filter_cmd += ["--ob-priors", orientation_model]
        plan.append(
            command_plan_entry(
                f"{pair_id}: filter mutect calls", filter_cmd, outputs=[filtered_vcf]
            )
        )

        # --- Optional annotation ---------------------------------------------
        if args.annotation_vcf:
            annotated_vcf = f"variants/{pair_id}.filtered.annotated.vcf.gz"
            plan.append(
                command_plan_entry(
                    f"{pair_id}: annotate filtered VCF",
                    [
                        "bcftools",
                        "annotate",
                        "-a",
                        absolute(args.annotation_vcf),
                        "-c",
                        "ID,INFO/AF",
                        "-O",
                        "z",
                        "-o",
                        annotated_vcf,
                        filtered_vcf,
                    ],
                    outputs=[annotated_vcf],
                )
            )

        # --- Stats (always on the filtered, un-annotated VCF) ------------------
        stats_report = f"variants/{pair_id}.bcftools_stats.txt"
        stats_command = (
            f"{shell_join(['bcftools', 'stats', filtered_vcf])} > {shell_join([stats_report])}"
        )
        plan.append(
            command_plan_entry(
                f"{pair_id}: bcftools stats",
                stats_command,
                outputs=[stats_report],
            )
        )
    return plan
status = "created" if all(observed[:2]) else ("partial" if any(observed) else "planned") + notes: list[str] = [] + if pair["design"] == "tumor_only": + notes.append("tumor-only design; matched-normal evidence unavailable") + if not validation.get("germline_resource"): + notes.append("germline resource not provided") + if not validation.get("panel_of_normals"): + notes.append("panel-of-normals not provided") + if not args.f1r2_orientation_model: + notes.append("orientation-bias model not requested") + if stats["variant_records"] is None: + notes.append("variant stats not found") + rows.append( + { + "pair_id": pair_id, + "design": pair["design"], + "tumor_sample": pair["tumor_sample"], + "normal_sample": pair["normal_sample"], + "filtered_vcf": str(filtered_vcf), + "filtered_vcf_exists": str(filtered_vcf.exists()).lower(), + "bcftools_stats": str(stats_path), + "variant_records": stats["variant_records"] + if stats["variant_records"] is not None + else "", + "snp_count": stats["snp_count"] if stats["snp_count"] is not None else "", + "indel_count": stats["indel_count"] if stats["indel_count"] is not None else "", + "contamination_table": str(contamination_path), + "contamination_table_exists": str(contamination_path.exists()).lower(), + "panel_of_normals_ready": str(bool(validation.get("panel_of_normals"))).lower(), + "germline_resource_ready": str(bool(validation.get("germline_resource"))).lower(), + "orientation_bias_model_requested": str(bool(args.f1r2_orientation_model)).lower(), + "status": status, + "notes": "; ".join(notes), + } + ) + write_tsv(run_dir / "qc" / "somatic_pair_review.tsv", rows, SOMATIC_PAIR_REVIEW_FIELDS) + write_json( + run_dir / "qc" / "somatic_pair_review.json", + { + "pairs": rows, + "pair_count": len(rows), + "tumor_only_count": sum(1 for row in rows if row["design"] == "tumor_only"), + "pairs_with_filtered_vcf": sum( + 1 for row in rows if row["filtered_vcf_exists"] == "true" + ), + "pairs_with_variant_stats": sum(1 for row in rows if 
def execute_plan(
    run_dir: Path,
    plan: list[dict[str, Any]],
    *,
    timeout: int = 7200,
) -> dict[str, Any]:
    """Run the planned commands in order, stopping at the first failure.

    Each plan entry's ``"command"`` is executed through ``bash -c`` via
    ``run_cmd`` with *run_dir* as its second argument, and the returned step
    record is written to ``logs/NN_<name>.json``.

    Args:
        run_dir: Run directory; the ``variants``, ``qc``, ``logs`` and
            ``f1r2`` sub-directories are created if missing.
        plan: Command-plan entries with at least ``"name"`` and ``"command"``.
        timeout: Per-step timeout forwarded to ``run_cmd`` (presumably
            seconds — confirm against ``run_cmd``). The default keeps the
            previously hard-coded value; long-running callers (for example
            whole-genome inputs) can now raise it.

    Returns:
        ``{"ok": bool, "steps": [{"name": ..., "ok": ...}, ...]}`` where
        ``ok`` is true only if every executed step reported success.
    """
    for dirname in ("variants", "qc", "logs", "f1r2"):
        (run_dir / dirname).mkdir(parents=True, exist_ok=True)
    result: dict[str, Any] = {"ok": True, "steps": []}
    for index, item in enumerate(plan, start=1):
        step = run_cmd(["bash", "-c", item["command"]], run_dir, timeout=timeout)
        # Step names look like "<pair>: <action>"; make them filename-safe.
        safe_name = item["name"].replace(":", "").replace(" ", "_").replace("/", "_")
        write_json(run_dir / "logs" / f"{index:02d}_{safe_name}.json", step)
        step_ok = step.get("ok")
        result["steps"].append({"name": item["name"], "ok": step_ok})
        result["ok"] = bool(result["ok"] and step_ok)
        if not step_ok:
            # Later steps consume this step's outputs, so stop here.
            break
    return result
def write_visuals(
    run_dir: Path,
    status: str,
    validation: dict[str, Any],
    resource_plan: dict[str, Any] | None = None,
) -> dict[str, str]:
    """Write the visualization index for a somatic run and return its outputs.

    Registers the four fixed review artifacts (pairing table, command plan,
    QC summary, pair review), lets ``add_vcf_review_notebook_entry`` add its
    entry, appends the resource-gate entries, and writes the index.

    Returns:
        A mapping with ``visualization_index`` (relative to *run_dir*),
        ``visualization_manifest``, plus whatever keys
        ``add_vcf_review_notebook_entry`` returned.
    """
    # (artifact_id, title, path, kind, description) for the always-present artifacts.
    fixed_artifacts = (
        (
            "pairing_table",
            "Tumor/Normal Pairing Table",
            "validation/pairs.normalized.tsv",
            "table",
            "Normalized tumor-normal or tumor-only sample design used by the runner.",
        ),
        (
            "command_plan",
            "Somatic Command Plan",
            "workflow/somatic_command_plan.json",
            "json",
            "Executable Mutect2, contamination, filtering, and optional annotation commands.",
        ),
        (
            "somatic_qc_summary",
            "Somatic QC Summary",
            "qc/somatic_qc_summary.json",
            "json",
            "Pairing, resource, and tumor-only caveat summary.",
        ),
        (
            "somatic_pair_review",
            "Somatic Pair Review",
            "qc/somatic_pair_review.tsv",
            "table",
            "Per-pair review of design, matched-normal state, resource caveats, filtered VCF status, contamination table, and variant stats.",
        ),
    )
    entries = [
        artifact_entry(
            artifact_id=artifact_id,
            title=title,
            path=path,
            kind=kind,
            status="created",
            description=description,
        )
        for artifact_id, title, path, kind, description in fixed_artifacts
    ]
    review_outputs = add_vcf_review_notebook_entry(
        run_dir,
        entries,
        title="Somatic DNA VCF Review",
        table_items=[
            ("Tumor/Normal Pairing Table", "validation/pairs.normalized.tsv"),
            ("Somatic Pair Review", "qc/somatic_pair_review.tsv"),
        ],
        object_items=[
            ("Somatic QC Summary", "qc/somatic_qc_summary.json"),
            ("Run Summary", "summary.md"),
        ],
    )
    entries.extend(ngs_resource_gate.resource_visual_entries(resource_plan))

    # Validation warnings come first, followed by resource-gate messages.
    notes = [
        *validation.get("warnings", []),
        *ngs_resource_gate.resource_messages(resource_plan),
    ]
    if status == "blocked":
        intent = "blocked_preflight"
    else:
        intent = "real_analysis"
    provenance = {
        "status": status,
        "pair_count": validation.get("pair_count", 0),
        "resource_plan_ok": validation.get("resource_plan_ok"),
    }
    index = write_visualization_index(
        run_dir,
        title="Somatic DNA Variant Review",
        description="Review surface for tumor-normal/tumor-only Mutect2 planning and execution artifacts.",
        entries=entries,
        notes=notes,
        analysis_intent=intent,
        provenance_summary=provenance,
    )
    return {
        "visualization_index": str(index.relative_to(run_dir)),
        "visualization_manifest": "visualizations/visualization_manifest.json",
        **review_outputs,
    }
def serializable_args(args: argparse.Namespace) -> dict[str, Any]:
    """Return the parsed arguments as a dict with top-level paths stringified.

    Only values that are themselves ``Path`` instances are converted with
    ``str()``; every other value (including containers) is passed through
    unchanged.
    """
    plain: dict[str, Any] = {}
    for option, value in vars(args).items():
        if isinstance(value, Path):
            plain[option] = str(value)
        else:
            plain[option] = value
    return plain
pairs) + write_json(run_dir / "config.json", {**serializable_args(args), "run_dir": str(run_dir)}) + write_json(run_dir / "validation" / "input_validation_summary.json", input_validation) + write_json(run_dir / "validation" / "validation_summary.json", validation) + write_json(run_dir / "validation" / "tool_preflight.json", tool_status) + write_json( + run_dir / "versions" / "software_versions.json", + software_versions( + { + "gatk": ["gatk", "--version"], + "samtools": ["samtools", "--version"], + "bcftools": ["bcftools", "--version"], + } + ), + ) + write_outputs(run_dir, validation, pairs, plan, args) + dry_run = { + "ok": validation["ok"] and (tool_status["ok"] if args.execute else True), + "detail": "input, pairing, resource, and tool validation completed", + } + write_json(run_dir / "logs" / "validation_dry_run.json", dry_run) + status = "blocked" if not dry_run["ok"] else "validated" + execution = None + if args.execute and dry_run["ok"]: + execution = execute_plan(run_dir, plan) + status = "completed" if execution.get("ok") else "failed" + summarize_somatic_artifacts(run_dir, validation, pairs, args) + visuals = write_visuals(run_dir, status, validation, resource_plan) + resource_outputs = ngs_resource_gate.resource_output_paths(resource_plan) + write_standard_manifest( + run_dir, + run_id=args.run_id, + lane="dna_somatic_variants", + workflow="local_light_gatk_mutect2", + status=status, + execute_requested=args.execute, + validation=validation, + tool_preflight_result=tool_status, + dry_run=dry_run, + execution=execution, + inputs={ + "sample_sheet": str(args.sample_sheet.expanduser().resolve()), + "reference_fasta": str(args.reference_fasta.expanduser().resolve()), + "target_bed": str(args.target_bed.expanduser().resolve()) if args.target_bed else None, + "panel_of_normals": str(args.panel_of_normals.expanduser().resolve()) + if args.panel_of_normals + else None, + "germline_resource": str(args.germline_resource.expanduser().resolve()) + if 
args.germline_resource + else None, + "annotation_vcf": str(args.annotation_vcf.expanduser().resolve()) + if args.annotation_vcf + else None, + **( + {"resource_plan": resource_outputs.get("resource_plan")} if resource_outputs else {} + ), + }, + outputs={ + "pairing_table": "validation/pairs.normalized.tsv", + "command_plan": "workflow/somatic_command_plan.json", + "qc_summary": "qc/somatic_qc_summary.json", + "pair_review": "qc/somatic_pair_review.tsv", + "pair_review_json": "qc/somatic_pair_review.json", + "filter_reasons": "qc/somatic_filter_reasons.tsv", + "filtered_vcf_glob": "variants/*.filtered.vcf.gz", + **resource_outputs, + **visuals, + }, + method={ + "caller": "GATK Mutect2", + "filter": "GATK FilterMutectCalls", + "tumor_normal_designs": validation.get("designs", []), + "orientation_bias_model_requested": args.f1r2_orientation_model, + "resource_plan": resource_plan, + }, + audit={"resource_readiness": resource_plan} if resource_plan else None, + review_bundle=visuals, + ) + write_summary(run_dir, status, validation, resource_plan) + write_json(run_dir / "artifact_index.json", build_artifact_index(run_dir)) + print(run_dir) + return 1 if status in {"blocked", "failed"} else 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/plugins/ngs-analysis/scripts/run_dna_umi_panel_variants.py b/plugins/ngs-analysis/scripts/run_dna_umi_panel_variants.py new file mode 100644 index 0000000..76562e9 --- /dev/null +++ b/plugins/ngs-analysis/scripts/run_dna_umi_panel_variants.py @@ -0,0 +1,997 @@ +#!/usr/bin/env python3 +"""Run or plan UMI-aware targeted panel variant calling from consensus or raw BAMs.""" + +from __future__ import annotations + +import argparse +import csv +import shutil +import statistics +import subprocess +from pathlib import Path +from typing import Any + +import ngs_resource_gate +from ngs_planner_utils import ( + command_plan_entry, + normalize_sample_name, + read_table, + resolve_path, + shell_join, + 
write_command_script, + write_tsv, +) +from ngs_run_utils import ( + build_artifact_index, + run_cmd, + slug_timestamp, + software_versions, + tool_preflight, + write_json, + write_standard_manifest, + write_text, +) +from ngs_visualization_utils import ( + add_vcf_review_notebook_entry, + artifact_entry, + write_visualization_index, +) + +WORKSPACE_ROOT = Path.cwd() +DEFAULT_RUN_ROOT = WORKSPACE_ROOT / "ngs_runs" / "dna_umi_panel_variants" +UMI_POSTRUN_FIELDS = [ + "sample", + "consensus_state", + "consensus_bam", + "consensus_bam_exists", + "total_consensus_reads", + "mapped_consensus_reads", + "mean_target_depth", + "target_bases_covered", + "variant_records", + "snp_count", + "indel_count", + "median_family_size", + "duplex_fraction", + "status", + "notes", +] +UMI_EVIDENCE_FIELDS = [ + "sample", + "umi_mode", + "consensus_state", + "min_af", + "min_reads_per_molecule", + "consensus_bam", + "consensus_bam_exists", + "family_metrics_path", + "family_metrics_exists", + "variant_vcf", + "variant_vcf_exists", + "variant_stats_path", + "variant_stats_exists", + "hotspot_vcf", + "hotspot_review", + "duplex_review", + "low_af_review_status", + "notes", +] +UMI_SAMPLE_FIELDS = [ + "sample", + "raw_alignment", + "consensus_alignment", + "consensus_state", + "fgbio_readiness", + "raw_umi_tag_status", + "mate_tag_status", + "row_index", +] + + +def maybe_path(raw: str | None, base: Path) -> Path | None: + return resolve_path(raw, base) if raw else None + + +def inspect_alignment_tags( + path: Path, required_tags: tuple[str, ...] 
def validate_inputs(args: argparse.Namespace) -> tuple[dict[str, Any], list[dict[str, str]]]:
    """Validate CLI inputs and normalize the sample sheet into per-sample records.

    Checks that the sample sheet, reference FASTA, and optional target BED
    exist (errors), that the FASTA index and optional hotspot VCF exist
    (warnings), and classifies every sample-sheet row by its raw/consensus
    alignment state and per-read tag readiness.

    Args:
        args: Parsed CLI namespace. Reads ``sample_sheet``, ``reference_fasta``,
            ``target_bed``, ``hotspot_vcf``, ``umi_mode``, ``umi_tag`` and
            ``min_af``.

    Returns:
        ``(validation, samples)``. ``validation`` is a summary dict whose
        ``ok`` flag is ``True`` only when no errors were collected; ``samples``
        is the list of normalized row dicts (values are all strings).
    """
    sample_sheet = args.sample_sheet.expanduser().resolve()
    errors: list[str] = []
    warnings: list[str] = []
    rows: list[dict[str, str]] = []
    columns: list[str] = []
    samples: list[dict[str, str]] = []
    if not sample_sheet.exists():
        errors.append(f"sample sheet does not exist: {sample_sheet}")
    else:
        rows, columns = read_table(sample_sheet)

    reference = args.reference_fasta.expanduser().resolve()
    if not reference.exists():
        errors.append(f"reference FASTA does not exist: {reference}")
    if not Path(str(reference) + ".fai").exists():
        warnings.append(
            f"reference FASTA index is missing and may be created by samtools faidx: {reference}.fai"
        )
    target_bed = args.target_bed.expanduser().resolve() if args.target_bed else None
    if target_bed and not target_bed.exists():
        errors.append(f"target BED does not exist: {target_bed}")
    hotspot_vcf = args.hotspot_vcf.expanduser().resolve() if args.hotspot_vcf else None
    if hotspot_vcf and not hotspot_vcf.exists():
        warnings.append(f"hotspot VCF does not exist: {hotspot_vcf}")

    # Inspect the same UMI tag that the command plan passes to fgbio
    # (``args.umi_tag or "RX"``), so readiness reflects the tag actually used.
    umi_tag = args.umi_tag or "RX"
    required_tags = (umi_tag, "MQ")

    # Data rows start at line 2 of the sheet (line 1 is the header).
    for row_index, row in enumerate(rows, start=2):
        sample = normalize_sample_name(
            row.get("sample") or row.get("sample_id"), f"row_{row_index}"
        )
        raw_bam = maybe_path(
            row.get("bam") or row.get("raw_bam") or row.get("cram"), sample_sheet.parent
        )
        consensus_bam = maybe_path(
            row.get("consensus_bam") or row.get("duplex_bam") or row.get("simplex_bam"),
            sample_sheet.parent,
        )
        tag_status: dict[str, Any] = {
            "inspectable": False,
            "reason": "",
            "tags": {tag: False for tag in required_tags},
            "all_present": False,
        }
        if not raw_bam and not consensus_bam:
            errors.append(f"row {row_index}: provide bam/raw_bam/cram or consensus_bam")
            continue
        if raw_bam and not raw_bam.exists():
            errors.append(f"row {row_index}: raw alignment does not exist: {raw_bam}")
        elif raw_bam:
            tag_status = inspect_alignment_tags(raw_bam, required_tags=required_tags)
        if consensus_bam and not consensus_bam.exists():
            # Only a warning: the consensus BAM may be produced by a later step.
            warnings.append(f"row {row_index}: consensus BAM does not exist yet: {consensus_bam}")
        if not consensus_bam and not args.umi_tag:
            warnings.append(
                f"row {row_index}: no consensus BAM and no --umi-tag was supplied; consensus generation is not fully specified"
            )
        raw_umi_tag_status = "unknown"
        mate_tag_status = "unknown"
        fgbio_readiness = "not_applicable"
        if raw_bam:
            if tag_status["inspectable"]:
                raw_umi_tag_status = "present" if tag_status["tags"].get(umi_tag) else "missing"
                mate_tag_status = "present" if tag_status["tags"].get("MQ") else "missing"
                fgbio_readiness = "ready" if tag_status["all_present"] else "review_contract_only"
                if fgbio_readiness == "review_contract_only" and not consensus_bam:
                    missing = [tag for tag, present in tag_status["tags"].items() if not present]
                    warnings.append(
                        f"row {row_index}: raw alignment lacks required UMI tags ({','.join(missing)}); "
                        "treat as a review-contract input unless a consensus BAM is already provided"
                    )
            else:
                fgbio_readiness = "unknown"
                warnings.append(
                    f"row {row_index}: could not verify {umi_tag}/MQ tags on raw alignment; "
                    "end-to-end fgbio readiness remains unconfirmed"
                )
        if consensus_bam:
            consensus_state = "provided"
        elif fgbio_readiness == "review_contract_only":
            consensus_state = "review_contract_only"
        else:
            consensus_state = "needs_generation"
        samples.append(
            {
                "sample": sample,
                "raw_alignment": str(raw_bam) if raw_bam else "",
                "consensus_alignment": str(consensus_bam) if consensus_bam else "",
                "consensus_state": consensus_state,
                "fgbio_readiness": fgbio_readiness,
                "raw_umi_tag_status": raw_umi_tag_status,
                "mate_tag_status": mate_tag_status,
                "row_index": str(row_index),
            }
        )
    if not samples:
        errors.append("no usable UMI panel samples found")
    if args.min_af < 0 or args.min_af > 1:
        errors.append("--min-af must be between 0 and 1")
    validation = {
        "ok": not errors,
        "sample_sheet": str(sample_sheet),
        "reference_fasta": str(reference),
        "target_bed": str(target_bed) if target_bed else None,
        "hotspot_vcf": str(hotspot_vcf) if hotspot_vcf else None,
        "columns": columns,
        "sample_count": len(samples),
        "umi_mode": args.umi_mode,
        "umi_tag": args.umi_tag,
        "min_af": args.min_af,
        "errors": errors,
        "warnings": warnings,
    }
    return validation, samples
def parse_first_int(value: str) -> int | None:
    """Return the first whitespace-delimited token of ``value`` as an int.

    Returns ``None`` when the value is empty or its first token is not an
    integer.
    """
    try:
        return int(str(value).strip().split()[0])
    except (ValueError, IndexError):
        return None


def parse_flagstat(path: Path) -> dict[str, int | None]:
    """Extract total and mapped read counts from ``samtools flagstat`` text.

    Args:
        path: Location of the flagstat output; a missing file is tolerated.

    Returns:
        Dict with ``total_reads`` and ``mapped_reads``; either is ``None``
        when the file or the corresponding line is absent.
    """
    metrics: dict[str, int | None] = {"total_reads": None, "mapped_reads": None}
    if not path.exists():
        return metrics
    for line in path.read_text(encoding="utf-8", errors="replace").splitlines():
        if " in total " in line:
            metrics["total_reads"] = parse_first_int(line)
        elif (
            " mapped (" in line
            # Newer samtools also prints "N + M primary mapped (...)"; without
            # this guard that later line would overwrite the overall count.
            and " primary mapped (" not in line
            and " mate mapped" not in line
        ):
            metrics["mapped_reads"] = parse_first_int(line)
    return metrics
def _weighted_median(weights: dict[float, int]) -> float | None:
    """Median of a value -> multiplicity histogram without expanding it.

    Matches ``statistics.median`` on the expanded data: for an even number of
    observations the two middle values are averaged.
    """
    total = sum(weights.values())
    if total <= 0:
        return None
    lower_rank, upper_rank = (total - 1) // 2, total // 2
    lower: float | None = None
    seen = 0
    for value in sorted(weights):
        seen += weights[value]
        if lower is None and seen > lower_rank:
            lower = value
        if seen > upper_rank:
            return (lower + value) / 2
    return None


def parse_family_metrics(path: Path) -> dict[str, float | None]:
    """Summarize a tab-separated UMI family-size histogram.

    Recognized columns: size from ``family_size``/``size``/``umi_family_size``;
    count from ``count``/``families``/``n`` (defaults to 1 when absent); family
    type from ``family_type``/``type``/``strand``.

    Args:
        path: Histogram file; a missing file is tolerated.

    Returns:
        Dict with ``median_family_size`` (count-weighted median, rounded to 3
        places) and ``duplex_fraction`` (share of counts whose type contains
        ``"duplex"``, rounded to 4 places). Either is ``None`` when it cannot
        be computed.
    """
    metrics: dict[str, float | None] = {"median_family_size": None, "duplex_fraction": None}
    if not path.exists():
        return metrics
    size_weights: dict[float, int] = {}
    duplex_total = 0.0
    total = 0.0
    with path.open(newline="", encoding="utf-8", errors="replace") as handle:
        reader = csv.DictReader(handle, delimiter="\t")
        for row in reader:
            size_value = row.get("family_size") or row.get("size") or row.get("umi_family_size")
            count_value = row.get("count") or row.get("families") or row.get("n")
            family_type = (
                row.get("family_type") or row.get("type") or row.get("strand") or ""
            ).lower()
            try:
                size = float(size_value) if size_value not in {None, ""} else None
                count = float(count_value) if count_value not in {None, ""} else 1.0
                # Each row counts at least once. int() raises on NaN/inf, so
                # such rows are skipped here instead of crashing later.
                weight = max(1, int(count))
            except (ValueError, OverflowError):
                continue
            if size is not None:
                # Accumulate the full multiplicity: capping it (as an expanded
                # list would require) biases the median on large histograms.
                size_weights[size] = size_weights.get(size, 0) + weight
            total += count
            if "duplex" in family_type:
                duplex_total += count
    median = _weighted_median(size_weights)
    metrics["median_family_size"] = round(float(median), 3) if median is not None else None
    metrics["duplex_fraction"] = round(duplex_total / total, 4) if total else None
    return metrics
def first_existing_family_metrics(run_dir: Path, sample: str) -> Path:
    """Locate the family-size metrics file for ``sample`` inside ``run_dir``.

    Candidates are probed in priority order; the first one present on disk is
    returned. When none exists, the highest-priority candidate path is
    returned so callers still have a path to report.
    """
    qc_dir = run_dir / "qc"
    search_order = (
        qc_dir / f"{sample}.family_size.tsv",
        qc_dir / f"{sample}.umi_family_size.tsv",
        run_dir / "consensus" / f"{sample}.family_size.tsv",
    )
    return next((path for path in search_order if path.exists()), search_order[0])
args: argparse.Namespace, +) -> list[dict[str, Any]]: + rows: list[dict[str, Any]] = [] + for sample in samples: + name = sample["sample"] + consensus_bam = sample["consensus_alignment"] or f"consensus/{name}.consensus.bam" + consensus_path = Path(consensus_bam) + if not consensus_path.is_absolute(): + consensus_path = run_dir / consensus_path + family_metrics = first_existing_family_metrics(run_dir, name) + variant_vcf = run_dir / "variants" / f"{name}.consensus.vcf.gz" + variant_stats = run_dir / "variants" / f"{name}.bcftools_stats.txt" + hotspot_vcf = str(validation.get("hotspot_vcf") or "") + notes: list[str] = [] + if sample["consensus_state"] == "needs_generation": + notes.append("consensus BAM must be generated before variant evidence review") + elif sample["consensus_state"] == "review_contract_only": + notes.append( + "raw BAM lacks RX/MQ tags; treat as a review-contract input or start from raw UMI FASTQs before evidence review" + ) + if not family_metrics.exists(): + notes.append("family-size or molecule-support metrics not found") + if not variant_stats.exists(): + notes.append("variant stats not found") + if args.umi_mode == "duplex" and not family_metrics.exists(): + notes.append("duplex fraction cannot be reviewed without family metrics") + if not hotspot_vcf: + notes.append("hotspot VCF not provided") + evidence_ready = ( + consensus_path.exists() + and family_metrics.exists() + and variant_vcf.exists() + and variant_stats.exists() + ) + rows.append( + { + "sample": name, + "umi_mode": validation.get("umi_mode"), + "consensus_state": sample["consensus_state"], + "min_af": validation.get("min_af"), + "min_reads_per_molecule": args.min_reads_per_molecule, + "consensus_bam": str(consensus_path), + "consensus_bam_exists": str(consensus_path.exists()).lower(), + "family_metrics_path": str(family_metrics), + "family_metrics_exists": str(family_metrics.exists()).lower(), + "variant_vcf": str(variant_vcf), + "variant_vcf_exists": 
str(variant_vcf.exists()).lower(), + "variant_stats_path": str(variant_stats), + "variant_stats_exists": str(variant_stats.exists()).lower(), + "hotspot_vcf": hotspot_vcf, + "hotspot_review": "available" if hotspot_vcf else "not_configured", + "duplex_review": "required" if args.umi_mode == "duplex" else "optional", + "low_af_review_status": "ready_for_review" if evidence_ready else "planned", + "notes": "; ".join(notes), + } + ) + write_tsv(run_dir / "qc" / "umi_molecular_evidence_contract.tsv", rows, UMI_EVIDENCE_FIELDS) + write_json( + run_dir / "qc" / "umi_molecular_evidence_contract.json", + { + "samples": rows, + "sample_count": len(rows), + "ready_for_review_count": sum( + 1 for row in rows if row["low_af_review_status"] == "ready_for_review" + ), + "duplex_review_required_count": sum( + 1 for row in rows if row["duplex_review"] == "required" + ), + "hotspot_review_available_count": sum( + 1 for row in rows if row["hotspot_review"] == "available" + ), + }, + ) + return rows + + +def write_outputs( + run_dir: Path, + validation: dict[str, Any], + samples: list[dict[str, str]], + plan: list[dict[str, Any]], + args: argparse.Namespace, +) -> None: + write_tsv(run_dir / "validation" / "samples.normalized.tsv", samples, UMI_SAMPLE_FIELDS) + write_json( + run_dir / "qc" / "umi_consensus_plan.json", + { + "umi_mode": validation.get("umi_mode"), + "umi_tag": validation.get("umi_tag"), + "min_af": validation.get("min_af"), + "samples_needing_consensus": [ + row["sample"] for row in samples if row["consensus_state"] == "needs_generation" + ], + "review_contract_only_samples": [ + row["sample"] + for row in samples + if row.get("fgbio_readiness") == "review_contract_only" + ], + "fgbio_ready_samples": [ + row["sample"] for row in samples if row.get("fgbio_readiness") == "ready" + ], + "warnings": validation.get("warnings", []), + }, + ) + summarize_postrun_artifacts(run_dir, samples) + write_molecular_evidence_contract(run_dir, validation, samples, args) + 
def execute_plan(run_dir: Path, plan: list[dict[str, Any]]) -> dict[str, Any]:
    """Run the planned shell commands in order, stopping at the first failure.

    Creates the ``variants``, ``qc``, ``logs`` and ``consensus`` directories
    under ``run_dir``, runs each plan item's ``command`` through bash with
    ``run_dir`` passed to ``run_cmd``, and writes one JSON log per step to
    ``logs/``.

    Args:
        run_dir: Run directory that receives outputs and per-step logs.
        plan: Plan entries; each must provide ``name`` and ``command``.

    Returns:
        ``{"ok": bool, "steps": [{"name": ..., "ok": ...}, ...]}`` where
        ``ok`` is ``True`` only if every executed step succeeded.
    """
    for dirname in ["variants", "qc", "logs", "consensus"]:
        (run_dir / dirname).mkdir(parents=True, exist_ok=True)
    result: dict[str, Any] = {"ok": True, "steps": []}
    for index, item in enumerate(plan, start=1):
        # pipefail: plan commands include pipelines such as
        # `bcftools mpileup ... | bcftools call ...`; without it bash reports
        # only the last stage's exit status and an upstream failure is hidden.
        step = run_cmd(
            ["bash", "-o", "pipefail", "-c", item["command"]], run_dir, timeout=7200
        )
        # Step names look like "<sample>: <action>"; make them filename-safe.
        safe = item["name"].replace(":", "").replace(" ", "_").replace("/", "_")
        write_json(run_dir / "logs" / f"{index:02d}_{safe}.json", step)
        result["steps"].append({"name": item["name"], "ok": step.get("ok")})
        result["ok"] = bool(result["ok"] and step.get("ok"))
        if not step.get("ok"):
            break
    return result
def write_visuals(
    run_dir: Path,
    status: str,
    validation: dict[str, Any],
    resource_plan: dict[str, Any] | None = None,
) -> dict[str, str]:
    """Build the visualization index for a UMI panel run.

    Registers the fixed set of run artifacts (sample table, consensus plan,
    post-run summary, molecular evidence contract, command plan), asks
    ``add_vcf_review_notebook_entry`` to add its review entry, appends the
    resource-gate entries, and writes the index via
    ``write_visualization_index``.

    Args:
        run_dir: Run directory; the returned index path is relative to it.
        status: Run status; ``"blocked"`` selects the ``blocked_preflight``
            analysis intent, anything else ``real_analysis``.
        validation: Validation summary; ``warnings``, ``sample_count`` and
            ``resource_plan_ok`` are read from it.
        resource_plan: Optional resource plan forwarded to ``ngs_resource_gate``.

    Returns:
        Mapping with ``visualization_index``, ``visualization_manifest`` and
        whatever keys ``add_vcf_review_notebook_entry`` returned.
    """
    # Every entry is registered with status "created" regardless of run status.
    entries = [
        artifact_entry(
            artifact_id="samples",
            title="UMI Panel Samples",
            path="validation/samples.normalized.tsv",
            kind="table",
            status="created",
            description="Normalized sample table with raw/consensus alignment state.",
        ),
        artifact_entry(
            artifact_id="consensus_plan",
            title="Consensus Plan",
            path="qc/umi_consensus_plan.json",
            kind="json",
            status="created",
            description="UMI grouping, consensus, and low-frequency calling settings.",
        ),
        artifact_entry(
            artifact_id="postrun_summary",
            title="UMI Post-run Summary",
            path="qc/umi_postrun_summary.tsv",
            kind="table",
            status="created",
            description="Consensus-read, target-coverage, variant-count, and family-size summary parsed from run artifacts.",
        ),
        artifact_entry(
            artifact_id="molecular_evidence_contract",
            title="Molecular Evidence Contract",
            path="qc/umi_molecular_evidence_contract.tsv",
            kind="table",
            status="created",
            description="Per-sample evidence requirements for low-AF review: consensus BAM, family metrics, variant stats, hotspot review, and duplex review.",
        ),
        artifact_entry(
            artifact_id="command_plan",
            title="UMI Panel Command Plan",
            path="workflow/umi_panel_command_plan.json",
            kind="json",
            status="created",
            description="Executable consensus and consensus-BAM variant-calling commands.",
        ),
    ]
    # NOTE(review): `entries` is passed in and presumably extended in place by
    # this helper; confirm against ngs_visualization_utils.
    review_outputs = add_vcf_review_notebook_entry(
        run_dir,
        entries,
        title="UMI Panel VCF Review",
        table_items=[
            ("Resolved Sample Table", "validation/samples.normalized.tsv"),
            ("UMI Post-run Summary", "qc/umi_postrun_summary.tsv"),
            ("Molecular Evidence Contract", "qc/umi_molecular_evidence_contract.tsv"),
        ],
        object_items=[("Run Summary", "summary.md")],
    )
    entries.extend(ngs_resource_gate.resource_visual_entries(resource_plan))
    index = write_visualization_index(
        run_dir,
        title="UMI Panel Variant Review",
        description="Review surface for molecular consensus, panel coverage, and low-frequency variant calling artifacts.",
        entries=entries,
        # Validation warnings and resource-gate messages are shown as notes.
        notes=[
            *validation.get("warnings", []),
            *ngs_resource_gate.resource_messages(resource_plan),
        ],
        analysis_intent="real_analysis" if status != "blocked" else "blocked_preflight",
        provenance_summary={
            "status": status,
            "sample_count": validation.get("sample_count", 0),
            "resource_plan_ok": validation.get("resource_plan_ok"),
        },
    )
    return {
        "visualization_index": str(index.relative_to(run_dir)),
        "visualization_manifest": "visualizations/visualization_manifest.json",
        **review_outputs,
    }
def serializable_args(args: argparse.Namespace) -> dict[str, Any]:
    """Return the parsed arguments as a dict that is safe to dump as JSON.

    Top-level ``Path`` values are converted to strings; every other value is
    passed through unchanged.
    """
    converted: dict[str, Any] = {}
    for key, value in vars(args).items():
        if isinstance(value, Path):
            value = str(value)
        converted[key] = value
    return converted
genome_build=args.genome_build, + bundle_roots=args.bundle_root, + include_optional=args.include_optional_resources, + include_checksums=args.resource_checksums, + skip=args.skip_resource_plan, + required=args.require_resource_plan, + ) + validation = ngs_resource_gate.merge_resource_status( + input_validation, resource_plan, required=args.require_resource_plan + ) + needs_consensus = any(row["consensus_state"] == "needs_generation" for row in samples) + required_tools = ( + (["samtools", "bcftools"] + (["fgbio"] if needs_consensus else [])) if args.execute else [] + ) + optional_tools = [ + name for name in ["samtools", "bcftools", "fgbio", "gatk"] if name not in required_tools + ] + tool_status = tool_preflight(required_tools, optional=optional_tools) + plan = build_plan(args, samples) + write_json(run_dir / "config.json", {**serializable_args(args), "run_dir": str(run_dir)}) + write_json(run_dir / "validation" / "input_validation_summary.json", input_validation) + write_json(run_dir / "validation" / "validation_summary.json", validation) + write_json(run_dir / "validation" / "tool_preflight.json", tool_status) + write_json( + run_dir / "versions" / "software_versions.json", + software_versions( + { + "fgbio": ["fgbio", "--version"], + "samtools": ["samtools", "--version"], + "bcftools": ["bcftools", "--version"], + } + ), + ) + write_outputs(run_dir, validation, samples, plan, args) + dry_run = { + "ok": validation["ok"] and (tool_status["ok"] if args.execute else True), + "detail": "input, UMI, target, and tool validation completed", + } + write_json(run_dir / "logs" / "validation_dry_run.json", dry_run) + status = "blocked" if not dry_run["ok"] else "validated" + execution = None + if args.execute and dry_run["ok"]: + execution = execute_plan(run_dir, plan) + status = "completed" if execution.get("ok") else "failed" + summarize_postrun_artifacts(run_dir, samples) + write_molecular_evidence_contract(run_dir, validation, samples, args) + visuals = 
write_visuals(run_dir, status, validation, resource_plan) + resource_outputs = ngs_resource_gate.resource_output_paths(resource_plan) + write_standard_manifest( + run_dir, + run_id=args.run_id, + lane="dna_umi_panel_variants", + workflow="local_light_umi_consensus_panel", + status=status, + execute_requested=args.execute, + validation=validation, + tool_preflight_result=tool_status, + dry_run=dry_run, + execution=execution, + inputs={ + "sample_sheet": str(args.sample_sheet.expanduser().resolve()), + "reference_fasta": str(args.reference_fasta.expanduser().resolve()), + "target_bed": str(args.target_bed.expanduser().resolve()) if args.target_bed else None, + "hotspot_vcf": str(args.hotspot_vcf.expanduser().resolve()) + if args.hotspot_vcf + else None, + **( + {"resource_plan": resource_outputs.get("resource_plan")} if resource_outputs else {} + ), + }, + outputs={ + "sample_table": "validation/samples.normalized.tsv", + "command_plan": "workflow/umi_panel_command_plan.json", + "consensus_plan": "qc/umi_consensus_plan.json", + "postrun_summary": "qc/umi_postrun_summary.tsv", + "postrun_summary_json": "qc/umi_postrun_summary.json", + "molecular_evidence_contract": "qc/umi_molecular_evidence_contract.tsv", + "molecular_evidence_contract_json": "qc/umi_molecular_evidence_contract.json", + "consensus_bam_glob": "consensus/*.bam", + "vcf_glob": "variants/*.consensus.vcf.gz", + **resource_outputs, + **visuals, + }, + method={ + "umi_mode": args.umi_mode, + "umi_tag": args.umi_tag, + "grouping_strategy": args.grouping_strategy, + "min_reads_per_molecule": args.min_reads_per_molecule, + "min_af": args.min_af, + "resource_plan": resource_plan, + }, + audit={"resource_readiness": resource_plan} if resource_plan else None, + review_bundle=visuals, + ) + write_summary(run_dir, status, validation, resource_plan) + write_json(run_dir / "artifact_index.json", build_artifact_index(run_dir)) + print(run_dir) + return 1 if status in {"blocked", "failed"} else 0 + + +if __name__ == 
def detect_delimiter(path: Path) -> str:
    """Pick the field delimiter from the file extension.

    ``.tsv`` and ``.tab`` (case-insensitive) are tab-delimited; everything
    else is treated as comma-delimited.
    """
    if path.suffix.lower() in {".tsv", ".tab"}:
        return "\t"
    return ","


def read_samples(path: Path) -> tuple[list[dict[str, str]], list[str]]:
    """Read a delimited sample sheet.

    Args:
        path: Sample sheet; the delimiter is chosen by ``detect_delimiter``.

    Returns:
        ``(rows, columns)`` where each row maps column name to its stripped
        cell text (missing cells become ``""``) and ``columns`` is the header
        in file order. An empty file yields ``([], [])``.
    """
    # utf-8-sig strips a leading BOM (common in spreadsheet exports) so the
    # first column is not read as "\ufeffsample"; plain UTF-8 is unaffected.
    with path.open(newline="", encoding="utf-8-sig") as handle:
        reader = csv.DictReader(handle, delimiter=detect_delimiter(path))
        rows = [{key: (value or "").strip() for key, value in row.items()} for row in reader]
        # Read the header while the file is still open: for an empty file
        # DictReader retries the read lazily, which fails on a closed handle.
        columns = list(reader.fieldnames or [])
    return rows, columns
def load_reference_contigs(reference: Path) -> dict[str, int]:
    """Return ``{contig: length}`` read from the FASTA index of *reference*.

    The index is looked up at ``<reference>.fai``. An empty mapping is
    returned when that file does not exist. Rows with fewer than two
    tab-separated fields, or with a non-integer length column, are skipped
    so a damaged index degrades to "contig unknown" instead of aborting
    input validation with a traceback.
    """
    contigs: dict[str, int] = {}
    fai = Path(str(reference) + ".fai")
    if not fai.exists():
        return contigs
    with fai.open("r", encoding="utf-8") as handle:
        for line in handle:
            fields = line.rstrip().split("\t")
            if len(fields) < 2:
                continue
            try:
                contigs[fields[0]] = int(fields[1])
            except ValueError:
                # Malformed length column: ignore the row rather than crash.
                continue
    return contigs


def parse_region(region: str) -> tuple[str, int, int]:
    """Split a ``contig:start-end`` (or ``contig:pos``) string into its parts.

    Thousands separators (``,``) in the coordinates are ignored. A single
    position is returned as a one-base interval.

    Raises:
        ValueError: when the string has no ``:``, the contig is empty, the
            coordinates are not integers, or they are < 1 / reversed. Every
            message contains the offending region so callers can surface it
            verbatim.
    """
    if ":" not in region:
        raise ValueError(f"region must use contig:start-end syntax, got: {region}")
    contig, coords = region.split(":", 1)
    if not contig.strip():
        raise ValueError(f"region is missing a contig name: {region}")
    start_s, dash, end_s = coords.partition("-")
    if not dash:
        # "contig:pos" means the single base at pos.
        end_s = start_s
    try:
        start = int(start_s.replace(",", ""))
        end = int(end_s.replace(",", ""))
    except ValueError:
        # Replace int()'s generic message with one that names the region.
        raise ValueError(f"region coordinates must be integers, got: {region}") from None
    if start < 1 or end < 1 or end < start:
        raise ValueError(f"region has invalid coordinates: {region}")
    return contig, start, end


def alt_contig_name(contig: str, available: dict[str, int]) -> str | None:
    """Suggest the other common spelling of *contig* if *available* has it.

    Tries adding and removing the ``chr`` prefix first, then the
    mitochondrial aliases (``MT`` / ``chrM``). Returns ``None`` when no
    alternative spelling is present.
    """
    candidates = [f"chr{contig}", contig.removeprefix("chr")]
    if contig in {"MT", "M", "chrMT"}:
        candidates.append("chrM")
    if contig in {"chrM", "chrMT", "M"}:
        candidates.append("MT")
    for item in candidates:
        if item != contig and item in available:
            return item
    return None


def normalize_region(region: str | None, reference_contigs: dict[str, int]) -> dict[str, Any]:
    """Validate *region* against the reference contigs and normalise it.

    Returns a dict that always has ``requested``, ``normalized``, ``errors``
    and ``warnings``; once the region parses and its contig is accepted it
    also has ``contig``, ``start``, ``end`` and ``contig_length``.
    ``normalized`` is ``None`` whenever ``errors`` is non-empty.

    * No region requested -> everything empty.
    * ``reference_contigs`` empty (no index loaded) -> the contig cannot be
      checked, so the parsed region is accepted with a warning instead of
      being reported as "not found in the reference".
    * Unknown contig -> error, with a "did you mean" hint when the
      alternative spelling exists.
    * Start beyond the contig -> error.
    * Only the end beyond the contig -> warning, end clipped to the length.
    """
    if not region:
        return {"requested": None, "normalized": None, "errors": [], "warnings": []}
    errors: list[str] = []
    warnings: list[str] = []
    try:
        contig, start, end = parse_region(region)
    except ValueError as exc:
        return {"requested": region, "normalized": None, "errors": [str(exc)], "warnings": []}

    if not reference_contigs:
        # Without contig lengths nothing can be verified; do not block the run.
        warnings.append(
            f"region contig '{contig}' could not be checked because no reference contig index is available"
        )
        return {
            "requested": region,
            "normalized": f"{contig}:{start}-{end}",
            "contig": contig,
            "start": start,
            "end": end,
            "contig_length": None,
            "errors": errors,
            "warnings": warnings,
        }

    if contig not in reference_contigs:
        suggestion = alt_contig_name(contig, reference_contigs)
        if suggestion:
            errors.append(
                f"region contig '{contig}' was not found in the reference; did you mean '{suggestion}:{start}-{end}'?"
            )
        else:
            errors.append(f"region contig '{contig}' was not found in the reference")
        return {"requested": region, "normalized": None, "errors": errors, "warnings": warnings}

    contig_length = reference_contigs[contig]
    if start > contig_length:
        errors.append(f"region start {start} exceeds contig length {contig_length} for {contig}")
    elif end > contig_length:
        # Clipping only makes sense while the start is still inside the contig.
        warnings.append(
            f"region end {end} exceeds contig length {contig_length} for {contig}; clipping to {contig_length}"
        )
        end = contig_length
    normalized = f"{contig}:{start}-{end}" if not errors else None
    return {
        "requested": region,
        "normalized": normalized,
        "contig": contig,
        "start": start,
        "end": end,
        "contig_length": contig_length,
        "errors": errors,
        "warnings": warnings,
    }
warnings.extend(region_summary.get("warnings", [])) + if args.annotation_vcf: + annotation_vcf = args.annotation_vcf.expanduser().resolve() + if not annotation_vcf.exists(): + errors.append(f"annotation VCF does not exist: {annotation_vcf}") + if ( + not (Path(str(annotation_vcf) + ".tbi")).exists() + and not (Path(str(annotation_vcf) + ".csi")).exists() + ): + warnings.append( + f"annotation VCF index is missing and may be required by bcftools annotate: {annotation_vcf}.tbi" + ) + sample_names: set[str] = set() + duplicate_names: set[str] = set() + for row_index, row in enumerate(rows, start=2): + sample = row.get("sample") or row.get("sample_id") or f"row_{row_index}" + bam_raw = row.get("bam") or row.get("cram") or "" + if sample in sample_names: + duplicate_names.add(sample) + sample_names.add(sample) + if not bam_raw: + errors.append(f"row {row_index}: bam or cram column is required") + continue + bam = Path(bam_raw).expanduser() + if not bam.is_absolute(): + bam = sample_sheet.parent / bam + bam = bam.resolve() + if not bam.exists(): + errors.append(f"row {row_index}: alignment file does not exist: {bam}") + if bam.suffix == ".bam" and not (Path(str(bam) + ".bai")).exists(): + warnings.append( + f"row {row_index}: BAM index is missing and may be created by samtools index: {bam}.bai" + ) + normalized.append({"sample": sample, "alignment": str(bam), "row_index": str(row_index)}) + + if not normalized: + errors.append("no usable alignment rows found") + if duplicate_names: + warnings.append( + f"duplicate sample names detected in sample sheet: {', '.join(sorted(duplicate_names))}" + ) + validation = { + "ok": not errors, + "sample_sheet": str(sample_sheet), + "reference_fasta": str(reference), + "region": region_summary.get("normalized") or args.region, + "region_requested": args.region, + "region_summary": region_summary, + "columns": columns, + "sample_count": len(normalized), + "run_class": "targeted_region_check" + if region_summary.get("normalized") + else 
"alignment_wide_local_light", + "errors": errors, + "warnings": warnings, + } + return validation, normalized + + +def write_normalized_samples(run_dir: Path, rows: list[dict[str, str]]) -> None: + path = run_dir / "validation" / "samples.normalized.tsv" + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", newline="", encoding="utf-8") as handle: + writer = csv.DictWriter( + handle, fieldnames=["sample", "alignment", "row_index"], delimiter="\t" + ) + writer.writeheader() + writer.writerows(rows) + + +def bcftools_call_command( + reference: Path, alignment: Path, output: Path, region: str | None +) -> str: + mpileup = ["bcftools", "mpileup", "-Ou", "-f", str(reference)] + if region: + mpileup.extend(["-r", region]) + mpileup.append(str(alignment)) + call = ["bcftools", "call", "-mv", "-Oz", "-o", str(output)] + return f"{shlex.join(mpileup)} | {shlex.join(call)}" + + +def detect_annotation_columns(run_dir: Path, annotation_vcf: Path, requested: str | None) -> str: + if requested: + return requested + tags_to_probe = "|".join(DEFAULT_ANNOTATION_INFO_TAGS) + probe = run_cmd( + [ + "bash", + "-lc", + f"bcftools view -h {shlex.quote(str(annotation_vcf))} | rg '^##INFO= tuple[Path, dict[str, Any]]: + if not annotation_vcf: + return input_vcf, {"ok": True, "changed": False, "reason": "annotation not requested"} + columns = detect_annotation_columns(run_dir, annotation_vcf, annotation_columns) + output_vcf = input_vcf.parent / f"{input_vcf.name.removesuffix('.vcf.gz')}.annotated.vcf.gz" + annotate = run_cmd( + [ + "bcftools", + "annotate", + "-a", + str(annotation_vcf), + "-c", + columns, + "--pair-logic", + "exact", + "-O", + "z", + "-o", + str(output_vcf), + str(input_vcf), + ], + run_dir, + timeout=3600, + ) + result: dict[str, Any] = { + "ok": bool(annotate.get("ok")), + "changed": bool(annotate.get("ok")), + "reason": "annotated from resource VCF" + if annotate.get("ok") + else "bcftools annotate failed", + "annotation_vcf": str(annotation_vcf), + 
"annotation_columns": columns, + "annotate": annotate, + } + return (output_vcf if annotate.get("ok") else input_vcf), result + + +def normalize_mq_header(run_dir: Path, sample: str, vcf: Path) -> dict[str, Any]: + """Rewrite INFO/MQ as Float so bcftools stats does not warn on the emitted header.""" + header = run_cmd(["bcftools", "view", "-h", str(vcf)], run_dir, timeout=600) + result: dict[str, Any] = {"ok": bool(header.get("ok")), "changed": False} + if not header.get("ok"): + result["reason"] = "failed to read VCF header" + return result + + header_lines = [ + line + for line in str(header.get("stdout_tail", "")).splitlines() + if line.startswith(("##", "#CHROM")) + ] + header_text = "\n".join(header_lines) + ("\n" if header_lines else "") + if MQ_INFO_INTEGER_HEADER not in header_text: + result["reason"] = "no MQ Integer header present" + return result + + normalized_header = header_text.replace(MQ_INFO_INTEGER_HEADER, MQ_INFO_FLOAT_HEADER) + header_path = run_dir / "logs" / f"{sample}.mq_header.normalized.hdr" + temp_vcf = vcf.parent / f"{vcf.name.removesuffix('.vcf.gz')}.reheader.vcf.gz" + write_text(header_path, normalized_header) + reheader = run_cmd( + ["bcftools", "reheader", "-h", str(header_path), "-o", str(temp_vcf), str(vcf)], + run_dir, + timeout=600, + ) + result["reheader"] = reheader + if not reheader.get("ok"): + result["ok"] = False + result["reason"] = "bcftools reheader failed" + return result + + temp_vcf.replace(vcf) + result["changed"] = True + result["reason"] = "rewrote INFO/MQ header to Float" + return result + + +def summarize_depth_file(depth_path: Path, callable_min_depth: int) -> dict[str, Any]: + positions = 0 + callable_positions = 0 + zero_depth_positions = 0 + depth_values: list[int] = [] + for line in depth_path.read_text(encoding="utf-8").splitlines(): + fields = line.split("\t") + if len(fields) < 3: + continue + depth = int(fields[2]) + positions += 1 + depth_values.append(depth) + if depth >= callable_min_depth: + 
def summarize_depth_file(depth_path: Path, callable_min_depth: int) -> dict[str, Any]:
    """Summarise a tab-separated per-base depth table.

    The third tab-separated column of every row is read as the depth. Rows
    with fewer than three columns, or whose third column is not an integer
    (for example a header row), are ignored. The file is streamed line by
    line so only the depth values are held in memory.

    Returns the number of positions, how many reach *callable_min_depth*,
    the callable fraction, the number of zero-depth positions, and the
    mean / median / max depth (all zero for an empty table).
    """
    depth_values: list[int] = []
    with depth_path.open("r", encoding="utf-8") as handle:
        for line in handle:
            fields = line.rstrip("\r\n").split("\t")
            if len(fields) < 3:
                continue
            try:
                depth_values.append(int(fields[2]))
            except ValueError:
                # Non-numeric depth column (e.g. a header line): skip the row.
                continue

    positions = len(depth_values)
    callable_positions = sum(1 for depth in depth_values if depth >= callable_min_depth)
    zero_depth_positions = depth_values.count(0)
    mean_depth = sum(depth_values) / positions if positions else 0.0
    median_depth = statistics.median(depth_values) if depth_values else 0.0
    return {
        "positions": positions,
        "callable_min_depth": callable_min_depth,
        "callable_positions": callable_positions,
        "callable_fraction": (callable_positions / positions) if positions else 0.0,
        "zero_depth_positions": zero_depth_positions,
        "mean_depth": round(mean_depth, 3),
        "median_depth": round(float(median_depth), 3),
        "max_depth": max(depth_values) if depth_values else 0,
    }


def parse_variant_stats(stats_text: str) -> dict[str, Any]:
    """Extract record / SNP / indel counts from ``SN\\t0\\t...`` summary rows.

    Only the three counters reported in the summary are converted to
    integers; other ``SN`` rows, rows with too few columns and rows whose
    value is not an integer are ignored instead of raising. Counters that
    never appear stay ``None`` and the interpretation stays
    ``"variant stats unavailable"``.
    """
    summary: dict[str, Any] = {
        "record_count": None,
        "snp_count": None,
        "indel_count": None,
        "interpretation": "variant stats unavailable",
    }
    wanted = {
        "number of records:": "record_count",
        "number of SNPs:": "snp_count",
        "number of indels:": "indel_count",
    }
    for line in stats_text.splitlines():
        if not line.startswith("SN\t0\t"):
            continue
        fields = line.split("\t", 3)
        if len(fields) < 4:
            continue
        target = wanted.get(fields[2])
        if target is None:
            continue
        try:
            summary[target] = int(fields[3])
        except ValueError:
            continue
    record_count = summary.get("record_count")
    if record_count == 0:
        summary["interpretation"] = "no variant records were emitted in the queried region"
    elif isinstance(record_count, int) and record_count > 0:
        summary["interpretation"] = "variant records were emitted in the queried region"
    return summary


def run_region_qc(
    run_dir: Path,
    sample: str,
    alignment: Path,
    region: str | None,
    callable_min_depth: int,
) -> dict[str, Any]:
    """Run ``samtools coverage`` and, for a region, ``samtools depth -aa``.

    Coverage is written to ``qc/<sample>.coverage.tsv``. Per-base depth is
    written to ``qc/<sample>.depth.tsv`` and summarised only when *region*
    is given; otherwise the depth step is reported as skipped and the
    callability summary is an all-zero placeholder with an explanatory
    ``note``.

    Returns ``{"coverage": <log>, "depth": <log>, "callability": <summary>}``
    where the logs are whatever ``run_cmd_stdout_to_file`` returned.
    """
    qc_dir = run_dir / "qc"
    coverage_path = qc_dir / f"{sample}.coverage.tsv"
    coverage_cmd = ["samtools", "coverage"]
    if region:
        coverage_cmd.extend(["-r", region])
    coverage_cmd.append(str(alignment))
    coverage = run_cmd_stdout_to_file(coverage_cmd, run_dir, coverage_path, timeout=600)

    depth_summary: dict[str, Any] = {
        "positions": 0,
        "callable_min_depth": callable_min_depth,
        "callable_positions": 0,
        "callable_fraction": 0.0,
        "zero_depth_positions": 0,
        "mean_depth": 0.0,
        "median_depth": 0.0,
        "max_depth": 0,
        "note": "per-base depth was omitted because no region was provided",
    }
    depth_log: dict[str, Any] = {"ok": True, "skipped": True, "reason": depth_summary["note"]}
    if region:
        depth_path = qc_dir / f"{sample}.depth.tsv"
        depth_cmd = ["samtools", "depth", "-aa", "-r", region, str(alignment)]
        depth_log = run_cmd_stdout_to_file(depth_cmd, run_dir, depth_path, timeout=600)
        if depth_log.get("ok"):
            depth_summary = summarize_depth_file(depth_path, callable_min_depth)
        else:
            # A region WAS provided; do not leave the "no region" note in the
            # placeholder summary when the depth command itself failed.
            depth_summary["note"] = "per-base depth is unavailable because samtools depth failed"

    return {"coverage": coverage, "depth": depth_log, "callability": depth_summary}


def filter_vcf(
    run_dir: Path,
    input_vcf: Path,
    min_qual: float | None,
    min_site_dp: int | None,
) -> tuple[Path, dict[str, Any]]:
    """Soft-filter *input_vcf* with ``bcftools filter -s LOW_SUPPORT``.

    Records matching ``QUAL<min_qual`` or ``INFO/DP<min_site_dp`` (whichever
    thresholds are not ``None``) are tagged, not removed. The output is
    written next to the input as ``<name>.filtered.vcf.gz``.

    Returns ``(vcf_to_use, log)``. The input path is returned unchanged when
    no threshold is given (``changed`` is ``False``) or when the command
    fails (``ok`` is ``False``).
    """
    expressions: list[str] = []
    if min_qual is not None:
        expressions.append(f"QUAL<{min_qual}")
    if min_site_dp is not None:
        expressions.append(f"INFO/DP<{min_site_dp}")
    if not expressions:
        return input_vcf, {"ok": True, "changed": False, "reason": "filtering not requested"}
    output_vcf = input_vcf.parent / f"{input_vcf.name.removesuffix('.vcf.gz')}.filtered.vcf.gz"
    expr = " || ".join(expressions)
    result = run_cmd(
        [
            "bcftools",
            "filter",
            "-s",
            "LOW_SUPPORT",
            "-e",
            expr,
            "-O",
            "z",
            "-o",
            str(output_vcf),
            str(input_vcf),
        ],
        run_dir,
        timeout=3600,
    )
    payload = {
        "ok": bool(result.get("ok")),
        "changed": bool(result.get("ok")),
        "reason": "soft-filtered VCF emitted" if result.get("ok") else "bcftools filter failed",
        "expression": expr,
        "filter": result,
    }
    return (output_vcf if result.get("ok") else input_vcf), payload
pipefail", + shlex.join(["samtools", "faidx", str(reference)]), + ] + for row in rows: + sample = row["sample"] + alignment = Path(row["alignment"]) + lines.append(shlex.join(["samtools", "quickcheck", "-v", str(alignment)])) + lines.append( + shlex.join(["samtools", "flagstat", str(alignment)]) + f" > qc/{sample}.flagstat.txt" + ) + lines.append( + shlex.join(["samtools", "idxstats", str(alignment)]) + f" > qc/{sample}.idxstats.tsv" + ) + lines.append( + shlex.join(["samtools", "coverage", "-r", args.region, str(alignment)]) + + f" > qc/{sample}.coverage.tsv" + if args.region + else shlex.join(["samtools", "coverage", str(alignment)]) + + f" > qc/{sample}.coverage.tsv" + ) + if args.region: + lines.append( + shlex.join(["samtools", "depth", "-aa", "-r", args.region, str(alignment)]) + + f" > qc/{sample}.depth.tsv" + ) + lines.append( + bcftools_call_command( + reference, alignment, Path("variants") / f"{sample}.vcf.gz", args.region + ) + ) + lines.append( + f"# The runner may normalize INFO/MQ in variants/{sample}.vcf.gz before bcftools stats." 
+ ) + if args.annotation_vcf: + annotation_vcf = args.annotation_vcf.expanduser().resolve() + columns = detect_annotation_columns(run_dir, annotation_vcf, args.annotation_columns) + lines.append(shlex.join(["bcftools", "index", "-t", f"variants/{sample}.vcf.gz"])) + lines.append( + shlex.join( + [ + "bcftools", + "annotate", + "-a", + str(annotation_vcf), + "-c", + columns, + "--pair-logic", + "exact", + "-O", + "z", + "-o", + f"variants/{sample}.annotated.vcf.gz", + f"variants/{sample}.vcf.gz", + ] + ) + ) + lines.append( + shlex.join(["bcftools", "index", "-t", f"variants/{sample}.annotated.vcf.gz"]) + ) + if args.filter_min_qual is not None or args.filter_min_site_dp is not None: + expr = " || ".join( + [ + item + for item in [ + f"QUAL<{args.filter_min_qual}" + if args.filter_min_qual is not None + else None, + f"INFO/DP<{args.filter_min_site_dp}" + if args.filter_min_site_dp is not None + else None, + ] + if item + ] + ) + lines.append( + shlex.join( + [ + "bcftools", + "filter", + "-s", + "LOW_SUPPORT", + "-e", + expr, + "-O", + "z", + "-o", + f"variants/{sample}.annotated.filtered.vcf.gz", + f"variants/{sample}.annotated.vcf.gz", + ] + ) + ) + lines.append( + shlex.join( + ["bcftools", "index", "-t", f"variants/{sample}.annotated.filtered.vcf.gz"] + ) + ) + lines.append( + shlex.join( + ["bcftools", "stats", f"variants/{sample}.annotated.filtered.vcf.gz"] + ) + + f" > variants/{sample}.filtered.bcftools_stats.txt" + ) + else: + lines.append( + shlex.join(["bcftools", "stats", f"variants/{sample}.annotated.vcf.gz"]) + + f" > variants/{sample}.bcftools_stats.txt" + ) + else: + lines.append(shlex.join(["bcftools", "index", "-t", f"variants/{sample}.vcf.gz"])) + if args.filter_min_qual is not None or args.filter_min_site_dp is not None: + expr = " || ".join( + [ + item + for item in [ + f"QUAL<{args.filter_min_qual}" + if args.filter_min_qual is not None + else None, + f"INFO/DP<{args.filter_min_site_dp}" + if args.filter_min_site_dp is not None + else None, + ] 
+ if item + ] + ) + lines.append( + shlex.join( + [ + "bcftools", + "filter", + "-s", + "LOW_SUPPORT", + "-e", + expr, + "-O", + "z", + "-o", + f"variants/{sample}.filtered.vcf.gz", + f"variants/{sample}.vcf.gz", + ] + ) + ) + lines.append( + shlex.join(["bcftools", "index", "-t", f"variants/{sample}.filtered.vcf.gz"]) + ) + lines.append( + shlex.join(["bcftools", "stats", f"variants/{sample}.filtered.vcf.gz"]) + + f" > variants/{sample}.filtered.bcftools_stats.txt" + ) + else: + lines.append( + shlex.join(["bcftools", "stats", f"variants/{sample}.vcf.gz"]) + + f" > variants/{sample}.bcftools_stats.txt" + ) + write_text(run_dir / "commands.sh", "\n".join(lines) + "\n") + + +def execute(run_dir: Path, args: argparse.Namespace, rows: list[dict[str, str]]) -> dict[str, Any]: + reference = args.reference_fasta.expanduser().resolve() + annotation_vcf = args.annotation_vcf.expanduser().resolve() if args.annotation_vcf else None + results: dict[str, Any] = {"ok": True, "steps": []} + (run_dir / "qc").mkdir(parents=True, exist_ok=True) + (run_dir / "variants").mkdir(parents=True, exist_ok=True) + if not (Path(str(reference) + ".fai")).exists(): + faidx = run_cmd(["samtools", "faidx", str(reference)], run_dir, timeout=600) + write_json(run_dir / "logs" / "samtools_faidx.json", faidx) + results["steps"].append({"name": "samtools_faidx", "ok": faidx.get("ok")}) + results["ok"] = bool(results["ok"] and faidx.get("ok")) + + for row in rows: + sample = row["sample"] + alignment = Path(row["alignment"]) + quickcheck = run_cmd(["samtools", "quickcheck", "-v", str(alignment)], run_dir, timeout=300) + write_json(run_dir / "logs" / f"{sample}.quickcheck.json", quickcheck) + flagstat = run_cmd(["samtools", "flagstat", str(alignment)], run_dir, timeout=600) + write_json(run_dir / "logs" / f"{sample}.flagstat.json", flagstat) + write_text(run_dir / "qc" / f"{sample}.flagstat.txt", flagstat.get("stdout_tail", "")) + idxstats = run_cmd(["samtools", "idxstats", str(alignment)], run_dir, 
timeout=600) + write_json(run_dir / "logs" / f"{sample}.idxstats.json", idxstats) + write_text(run_dir / "qc" / f"{sample}.idxstats.tsv", idxstats.get("stdout_tail", "")) + region_qc = run_region_qc(run_dir, sample, alignment, args.region, args.callable_min_depth) + write_json(run_dir / "logs" / f"{sample}.coverage.json", region_qc["coverage"]) + write_json(run_dir / "logs" / f"{sample}.depth.json", region_qc["depth"]) + write_json(run_dir / "qc" / f"{sample}.callability.json", region_qc["callability"]) + vcf = run_dir / "variants" / f"{sample}.vcf.gz" + call = run_cmd( + ["bash", "-c", bcftools_call_command(reference, alignment, vcf, args.region)], + run_dir, + timeout=3600, + ) + write_json(run_dir / "logs" / f"{sample}.bcftools_call.json", call) + write_text(run_dir / "logs" / f"{sample}.bcftools_call.log", call.get("stdout_tail", "")) + mq_header_fix = ( + normalize_mq_header(run_dir, sample, vcf) + if call.get("ok") + else {"ok": False, "skipped": True} + ) + write_json(run_dir / "logs" / f"{sample}.mq_header_fix.json", mq_header_fix) + pre_annotation_index = ( + run_cmd(["bcftools", "index", "-t", str(vcf)], run_dir, timeout=600) + if call.get("ok") and mq_header_fix.get("ok") + else {"ok": False, "skipped": True} + ) + write_json(run_dir / "logs" / f"{sample}.pre_annotation_index.json", pre_annotation_index) + final_vcf, annotation_result = ( + annotate_vcf(run_dir, sample, vcf, annotation_vcf, args.annotation_columns) + if pre_annotation_index.get("ok") + else (vcf, {"ok": False, "skipped": True}) + ) + write_json(run_dir / "logs" / f"{sample}.annotation.json", annotation_result) + filtered_vcf, filter_result = ( + filter_vcf(run_dir, final_vcf, args.filter_min_qual, args.filter_min_site_dp) + if call.get("ok") and mq_header_fix.get("ok") and annotation_result.get("ok") + else (final_vcf, {"ok": False, "skipped": True}) + ) + write_json(run_dir / "logs" / f"{sample}.filter.json", filter_result) + if call.get("ok") and mq_header_fix.get("ok") and 
annotation_result.get("ok"): + if filtered_vcf == vcf and pre_annotation_index.get("ok"): + index = { + **pre_annotation_index, + "reused": True, + "reason": "reused pre-annotation index for unannotated VCF", + } + else: + index = run_cmd( + ["bcftools", "index", "-t", str(filtered_vcf)], run_dir, timeout=600 + ) + else: + index = {"ok": False, "skipped": True} + write_json(run_dir / "logs" / f"{sample}.bcftools_index.json", index) + stats = ( + run_cmd(["bcftools", "stats", str(filtered_vcf)], run_dir, timeout=600) + if call.get("ok") + and mq_header_fix.get("ok") + and annotation_result.get("ok") + and filter_result.get("ok") + else {"ok": False, "skipped": True} + ) + write_json(run_dir / "logs" / f"{sample}.bcftools_stats.json", stats) + write_text( + run_dir / "variants" / f"{sample}.bcftools_stats.txt", stats.get("stdout_tail", "") + ) + write_json( + run_dir / "qc" / f"{sample}.variant_summary.json", + parse_variant_stats(str(stats.get("stdout_tail", ""))), + ) + sample_ok = bool( + quickcheck.get("ok") + and flagstat.get("ok") + and idxstats.get("ok") + and region_qc["coverage"].get("ok") + and region_qc["depth"].get("ok") + and call.get("ok") + and mq_header_fix.get("ok") + and annotation_result.get("ok") + and filter_result.get("ok") + and index.get("ok") + and stats.get("ok") + ) + results["steps"].append({"name": sample, "ok": sample_ok}) + results["ok"] = bool(results["ok"] and sample_ok) + return results + + +def write_summary( + run_dir: Path, + status: str, + validation: dict[str, Any], + annotation_enabled: bool, + filtering_enabled: bool, + resource_plan: dict[str, Any] | None = None, +) -> None: + sample_name = next(iter((run_dir / "qc").glob("*.variant_summary.json")), None) + variant_summary = {} + if sample_name: + variant_summary = json.loads(sample_name.read_text(encoding="utf-8")) + lines = [ + "# DNA Variant Calling Run Summary", + "", + f"Status: `{status}`", + f"Samples parsed: `{validation.get('sample_count', 0)}`", + f"Region: 
`{validation.get('region') or 'whole input alignment'}`", + f"Run class: `{validation.get('run_class')}`", + "", + "## Key Artifacts", + "", + "- `qc/*.flagstat.txt`", + "- `qc/*.idxstats.tsv`", + "- `qc/*.coverage.tsv`", + "- `qc/*.depth.tsv` when `--region` is provided", + "- `qc/*.callability.json` and `qc/*.variant_summary.json`", + "- `variants/*.vcf.gz`", + "- `variants/*.annotated.vcf.gz`" if annotation_enabled else None, + "- `variants/*.filtered.vcf.gz`" if filtering_enabled else None, + "- `variants/*.bcftools_stats.txt`", + "- `visualizations/index.html` and `visualizations/visualization_manifest.json`", + "- `notebooks/vcf_review.marimo.py` when output VCF/gVCF artifacts are present", + "- `resources/resource_plan.json`, `resource_manifest.tsv`, `resource_env.sh`, `resource_readiness.md`, and resource setup-plan artifacts", + "- `run_manifest.json` and `artifact_index.json`", + "", + ] + lines = [line for line in lines if line is not None] + if variant_summary: + lines.extend( + [ + "## Interpretation", + "", + f"- Record count: `{variant_summary.get('record_count')}`", + f"- SNP count: `{variant_summary.get('snp_count')}`", + f"- Indel count: `{variant_summary.get('indel_count')}`", + f"- Interpretation: {variant_summary.get('interpretation')}", + "", + ] + ) + lines.extend( + [ + "## Guardrails", + "", + "- This local lane is a targeted verification and audit envelope; use subtype lanes for full germline or somatic workflow requirements.", + "- When no annotation VCF or filter thresholds are provided, interpretation is limited to raw bcftools calls in the queried region.", + "- Use the germline/somatic subtype lanes for BQSR, cohort logic, or richer annotation/reporting.", + "", + ] + ) + if validation.get("warnings"): + lines.extend(["## Warnings", ""]) + lines.extend(f"- {warning}" for warning in validation["warnings"]) + lines.append("") + lines.extend(ngs_resource_gate.resource_summary_lines(resource_plan)) + if validation.get("errors"): + 
lines.extend(["## Blockers", ""]) + lines.extend(f"- {error}" for error in validation["errors"]) + write_text(run_dir / "summary.md", "\n".join(lines) + "\n") + + +def write_visuals( + run_dir: Path, + status: str, + validation: dict[str, Any], + resource_plan: dict[str, Any] | None = None, +) -> dict[str, str]: + first_variant_summary = next( + iter(sorted((run_dir / "qc").glob("*.variant_summary.json"))), None + ) + entries = [ + artifact_entry( + artifact_id="sample_table", + title="Resolved Sample Table", + path="validation/samples.normalized.tsv", + kind="table", + status="created", + description="Resolved sample table with absolute BAM/CRAM alignment paths.", + ), + artifact_entry( + artifact_id="variant_summary", + title="Variant Summary", + path=str(first_variant_summary.relative_to(run_dir)) if first_variant_summary else None, + kind="json", + status="created" if first_variant_summary else "not_available", + description="Per-sample variant counts parsed from bcftools stats.", + ), + ] + review_outputs = add_vcf_review_notebook_entry( + run_dir, + entries, + title="DNA Variant VCF Review", + table_items=[("Resolved Sample Table", "validation/samples.normalized.tsv")], + object_items=[("Run Summary", "summary.md"), ("Artifact Index", "artifact_index.json")], + ) + entries.extend(ngs_resource_gate.resource_visual_entries(resource_plan)) + index = write_visualization_index( + run_dir, + title="DNA Variant Review Bundle", + description="Review surface for the local DNA variant lane, including VCF/gVCF notebook previews when variant artifacts are present.", + entries=entries, + notes=[ + *validation.get("warnings", []), + *ngs_resource_gate.resource_messages(resource_plan), + ], + analysis_intent="real_analysis" if status != "blocked" else "blocked_preflight", + provenance_summary={ + "status": status, + "sample_count": validation.get("sample_count", 0), + "resource_plan_ok": validation.get("resource_plan_ok"), + }, + ) + return { + "visualization_index": 
str(index.relative_to(run_dir)), + "visualization_manifest": "visualizations/visualization_manifest.json", + **review_outputs, + } + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--sample-sheet", type=Path, required=True) + parser.add_argument("--reference-fasta", type=Path, required=True) + parser.add_argument("--region") + parser.add_argument( + "--annotation-vcf", + type=Path, + help="Optional bgzip/tabix-indexed VCF used to annotate called variants.", + ) + parser.add_argument( + "--annotation-columns", + help="Optional bcftools annotate -c column list. Defaults to ID plus available AF/AC/AN tags from the resource VCF.", + ) + parser.add_argument( + "--filter-min-qual", + type=float, + help="Optional QUAL threshold for soft-filtering emitted variants.", + ) + parser.add_argument( + "--filter-min-site-dp", + type=int, + help="Optional INFO/DP threshold for soft-filtering emitted variants.", + ) + parser.add_argument( + "--callable-min-depth", + type=int, + default=DEFAULT_CALLABLE_MIN_DEPTH, + help="Minimum depth used to mark a locus callable in region-level depth summaries.", + ) + parser.add_argument( + "--genome-build", + help="Genome build or registry alias for resource readiness, e.g. GRCh38, mm39, or a configured local alias.", + ) + parser.add_argument( + "--bundle-root", + action="append", + default=[], + help="Resource bundle override formatted as bundle=/path. 
May be repeated.", + ) + parser.add_argument("--include-optional-resources", action="store_true") + parser.add_argument("--resource-checksums", action="store_true") + parser.add_argument( + "--require-resource-plan", + action="store_true", + help="Treat missing registered reference bundles as blocking for this local runner.", + ) + parser.add_argument( + "--skip-resource-plan", + action="store_true", + help="Skip registered reference bundle readiness checks.", + ) + parser.add_argument("--outdir", type=Path) + parser.add_argument("--run-id", default=slug_timestamp("dna-variant-calling")) + parser.add_argument("--execute", action="store_true") + return parser.parse_args() + + +def main() -> int: + args = parse_args() + run_dir = (args.outdir or (DEFAULT_RUN_ROOT / args.run_id)).expanduser().resolve() + if run_dir.exists(): + raise FileExistsError(f"run directory already exists: {run_dir}") + run_dir.mkdir(parents=True) + (run_dir / "logs").mkdir(parents=True, exist_ok=True) + + input_validation, rows = validate_inputs(args) + resource_plan = ngs_resource_gate.write_pipeline_resource_plan( + run_dir=run_dir, + pipeline="dna_variant_calling", + genome_build=args.genome_build, + bundle_roots=args.bundle_root, + include_optional=args.include_optional_resources, + include_checksums=args.resource_checksums, + skip=args.skip_resource_plan, + required=args.require_resource_plan, + ) + validation = ngs_resource_gate.merge_resource_status( + input_validation, + resource_plan, + required=args.require_resource_plan, + ) + tool_status = tool_preflight(["samtools", "bcftools"], optional=[]) + write_json( + run_dir / "config.json", + { + "reference_fasta": str(args.reference_fasta.expanduser().resolve()), + "region": validation.get("region"), + "region_requested": args.region, + "filter_min_qual": args.filter_min_qual, + "filter_min_site_dp": args.filter_min_site_dp, + "callable_min_depth": args.callable_min_depth, + "run_class": validation.get("run_class"), + }, + ) + 
write_json(run_dir / "validation" / "input_summary.json", {"samples": rows}) + write_json(run_dir / "validation" / "input_validation_summary.json", input_validation) + write_json(run_dir / "validation" / "validation_summary.json", validation) + write_json(run_dir / "validation" / "tool_preflight.json", tool_status) + write_normalized_samples(run_dir, rows) + write_commands(run_dir, args, rows) + write_json( + run_dir / "versions" / "software_versions.json", + software_versions( + {"samtools": ["samtools", "--version"], "bcftools": ["bcftools", "--version"]} + ), + ) + + dry_run = { + "ok": validation["ok"] and tool_status["ok"], + "detail": "input and tool validation completed", + } + write_json(run_dir / "logs" / "validation_dry_run.json", dry_run) + execution = None + status = "blocked" if not dry_run["ok"] else "validated" + if args.execute and dry_run["ok"]: + execution = execute(run_dir, args, rows) + status = "completed" if execution.get("ok") else "failed" + + visuals = write_visuals(run_dir, status, validation, resource_plan) + resource_outputs = ngs_resource_gate.resource_output_paths(resource_plan) + write_standard_manifest( + run_dir, + run_id=args.run_id, + lane="dna_variant_calling", + workflow="local_light_samtools_bcftools", + status=status, + execute_requested=args.execute, + validation=validation, + tool_preflight_result=tool_status, + dry_run=dry_run, + execution=execution, + inputs={ + "sample_sheet": str(args.sample_sheet.expanduser().resolve()), + "reference_fasta": str(args.reference_fasta.expanduser().resolve()), + "annotation_vcf": str(args.annotation_vcf.expanduser().resolve()) + if args.annotation_vcf + else None, + **( + {"resource_plan": resource_outputs.get("resource_plan")} if resource_outputs else {} + ), + }, + outputs={ + "vcf_glob": "variants/*.vcf.gz", + "annotated_vcf_glob": "variants/*.annotated.vcf.gz" if args.annotation_vcf else None, + "filtered_vcf_glob": "variants/*.filtered.vcf.gz" + if args.filter_min_qual is not None or 
args.filter_min_site_dp is not None + else None, + "flagstat_glob": "qc/*.flagstat.txt", + "idxstats_glob": "qc/*.idxstats.tsv", + "coverage_glob": "qc/*.coverage.tsv", + "depth_glob": "qc/*.depth.tsv" if validation.get("region") else None, + "callability_glob": "qc/*.callability.json", + **resource_outputs, + **visuals, + }, + method={ + "caller": "bcftools mpileup/call", + "region": validation.get("region"), + "annotation_columns": args.annotation_columns, + "filter_min_qual": args.filter_min_qual, + "filter_min_site_dp": args.filter_min_site_dp, + "callable_min_depth": args.callable_min_depth, + "run_class": validation.get("run_class"), + "resource_plan": resource_plan, + }, + audit={"resource_readiness": resource_plan} if resource_plan else None, + review_bundle=visuals, + ) + write_summary( + run_dir, + status, + validation, + bool(args.annotation_vcf), + bool(args.filter_min_qual is not None or args.filter_min_site_dp is not None), + resource_plan, + ) + write_json(run_dir / "artifact_index.json", build_artifact_index(run_dir)) + print(run_dir) + return 1 if status in {"blocked", "failed"} else 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/plugins/ngs-analysis/scripts/run_fastq_assay_package.py b/plugins/ngs-analysis/scripts/run_fastq_assay_package.py new file mode 100644 index 0000000..4ade77c --- /dev/null +++ b/plugins/ngs-analysis/scripts/run_fastq_assay_package.py @@ -0,0 +1,3137 @@ +#!/usr/bin/env python3 +"""Run FASTQ-based assay packages for epigenomics, amplicon, and metagenomics lanes.""" + +from __future__ import annotations + +import argparse +import csv +import gzip +import json +import math +import shlex +import shutil +from pathlib import Path +from typing import Any +from urllib.parse import urlparse + +try: + import matplotlib + + matplotlib.use("Agg") + import matplotlib.pyplot as plt # type: ignore + import numpy as np # type: ignore +except Exception: # pragma: no cover - optional plotting dependencies + plt = 
None + np = None + +from ngs_run_utils import ( + build_artifact_index, + command_path, + now_iso, + run_cmd, + sha256_file, + slug_timestamp, + software_versions, + tool_preflight, + write_json, + write_standard_manifest, + write_text, +) +from ngs_visualization_utils import ( + artifact_entry, + reachable_localhost_url_for_path, + write_localhost_launch_hint, + write_multiqc_browser_helper, + write_visualization_index, +) + +WORKSPACE_ROOT = Path.cwd() +DEFAULT_RUN_ROOT = WORKSPACE_ROOT / "ngs_runs" / "fastq_assay_package" +FASTQ_EXTENSIONS = (".fastq", ".fq", ".fastq.gz", ".fq.gz") +LANES = { + "epigenomics_peaks": { + "display": "Epigenomics peaks/QC", + "required": ["seqkit"], + "optional": ["fastqc", "multiqc", "cutadapt", "macs2"], + }, + "amplicon_microbiome": { + "display": "Amplicon microbiome QC", + "required": ["seqkit"], + "optional": ["fastqc", "multiqc", "cutadapt"], + }, + "shotgun_metagenomics": { + "display": "Shotgun metagenomics QC", + "required": ["seqkit"], + "optional": ["fastqc", "multiqc", "kraken2", "bracken", "metaphlan", "humann"], + }, +} +LANE_THRESHOLDS = { + "epigenomics_peaks": { + "min_reads_for_qc": 1_000_000, + "recommended_replicates": 2, + "short_read_max_avg_len": 300, + "expected_layout": "PE", + }, + "amplicon_microbiome": { + "min_reads_for_qc": 10_000, + "recommended_replicates": 1, + "short_read_max_avg_len": 350, + "expected_layout": "PE_or_SE", + }, + "shotgun_metagenomics": { + "min_reads_for_qc": 1_000_000, + "recommended_replicates": 1, + "short_read_max_avg_len": 350, + "expected_layout": "PE_or_SE", + }, +} +SYNTHETIC_MARKERS = ("synthetic", "simulated", "reduced") + + +def filename_from_uri(value: str) -> str: + if value.startswith(("http://", "https://", "s3://", "gs://")): + return Path(urlparse(value).path).name + return Path(value).name + + +def resolve_existing_path(raw: str, base: Path, roots: list[Path]) -> Path | None: + if not raw: + return None + if raw.startswith(("http://", "https://", "s3://", 
"gs://")): + basename = filename_from_uri(raw) + else: + candidate = Path(raw).expanduser() + if not candidate.is_absolute(): + candidate = base / candidate + if candidate.exists(): + return candidate.resolve() + basename = candidate.name + + matches = [] + for root in roots: + direct = root / basename + if direct.exists(): + matches.append(direct.resolve()) + if len(matches) == 1: + return matches[0] + if len(matches) > 1: + raise FileExistsError(f"ambiguous FASTQ basename {basename}: {matches}") + return None + + +def open_fastq_text(path: Path): + if path.name.endswith(".gz"): + return gzip.open(path, "rt", encoding="utf-8", errors="replace") + return path.open("rt", encoding="utf-8", errors="replace") + + +def detect_delimiter(path: Path) -> str: + if path.suffix.lower() in {".tsv", ".tab"}: + return "\t" + try: + with path.open(encoding="utf-8", errors="replace") as handle: + first_line = handle.readline() + if "\t" in first_line and "," not in first_line: + return "\t" + except OSError: + pass + return "," + + +def read_table(path: Path) -> tuple[list[dict[str, str]], list[str]]: + with path.open(newline="", encoding="utf-8") as handle: + reader = csv.DictReader(handle, delimiter=detect_delimiter(path)) + rows = [{key: (value or "").strip() for key, value in row.items()} for row in reader] + return rows, list(reader.fieldnames or []) + + +def first_present(row: dict[str, str], names: list[str]) -> str: + for name in names: + if row.get(name): + return row[name] + return "" + + +def check_fastq(path: Path, max_records: int) -> dict[str, Any]: + result: dict[str, Any] = { + "path": str(path), + "exists": path.exists(), + "records_checked": 0, + "errors": [], + } + if not path.exists(): + result["errors"].append("file does not exist") + return result + if not path.name.endswith(FASTQ_EXTENSIONS): + result["errors"].append("file extension is not a recognized FASTQ extension") + try: + with open_fastq_text(path) as handle: + for index in range(max_records): + 
header = handle.readline() + if not header: + break + sequence = handle.readline() + plus = handle.readline() + quality = handle.readline() + if not quality: + result["errors"].append(f"incomplete FASTQ record after record {index}") + break + result["records_checked"] += 1 + if not header.startswith("@"): + result["errors"].append(f"record {index + 1} header does not start with @") + if not plus.startswith("+"): + result["errors"].append(f"record {index + 1} separator does not start with +") + if len(sequence.rstrip()) != len(quality.rstrip()): + result["errors"].append( + f"record {index + 1} sequence and quality lengths differ" + ) + except OSError as exc: + result["errors"].append(f"read failed: {exc}") + return result + + +def truthy(value: str) -> bool: + return str(value).strip().lower() in {"1", "true", "yes", "y", "negative", "blank", "control"} + + +def summarize_input_context(args: argparse.Namespace, rows: list[dict[str, str]]) -> dict[str, Any]: + assays = sorted({row.get("assay", "").strip() for row in rows if row.get("assay", "").strip()}) + platforms = sorted( + { + row.get("instrument_platform", "").strip() + for row in rows + if row.get("instrument_platform", "").strip() + } + ) + layouts = sorted( + {row.get("layout", "").strip() for row in rows if row.get("layout", "").strip()} + ) + host_present = any(row.get("host_organism", "").strip() for row in rows) + host_depletion_present = any(row.get("host_depletion", "").strip() for row in rows) + negative_controls_present = any(truthy(row.get("control", "")) for row in rows) + control_metadata_present = any(row.get("control", "").strip() for row in rows) + batches_present = any(row.get("batch", "").strip() for row in rows) + replicate_metadata_present = any(row.get("replicate", "").strip() for row in rows) + markers_present = sorted( + {row.get("marker", "").strip() for row in rows if row.get("marker", "").strip()} + ) + genome_build_present = any(row.get("genome_build", "").strip() for row in rows) + 
blacklist_present = any(row.get("blacklist", "").strip() for row in rows) + peak_type_present = any(row.get("peak_type", "").strip() for row in rows) + primer_forward_present = any(row.get("primer_forward", "").strip() for row in rows) + primer_reverse_present = any(row.get("primer_reverse", "").strip() for row in rows) + primer_orientation_present = any(row.get("primer_orientation", "").strip() for row in rows) + merge_strategy_present = any(row.get("merge_reads", "").strip() for row in rows) + taxonomy_database_present = any(row.get("taxonomy_database", "").strip() for row in rows) + taxonomy_database_version_present = any( + row.get("taxonomy_database_version", "").strip() for row in rows + ) + sample_metadata_present = any(row.get("sample_metadata", "").strip() for row in rows) + return { + "assays": assays, + "instrument_platforms": platforms, + "layouts": layouts, + "host_organism_present": host_present, + "host_depletion_present": host_depletion_present, + "negative_controls_present": negative_controls_present, + "control_metadata_present": control_metadata_present, + "batch_metadata_present": batches_present, + "replicate_metadata_present": replicate_metadata_present, + "markers_present": markers_present, + "genome_build_present": genome_build_present, + "blacklist_present": blacklist_present, + "peak_type_present": peak_type_present, + "primer_forward_present": primer_forward_present, + "primer_reverse_present": primer_reverse_present, + "primer_sequences_present": primer_forward_present and primer_reverse_present, + "primer_orientation_present": primer_orientation_present, + "merge_strategy_present": merge_strategy_present, + "taxonomy_database_present": taxonomy_database_present, + "taxonomy_database_version_present": taxonomy_database_version_present, + "sample_metadata_present": sample_metadata_present, + "mixed_layouts": len(layouts) > 1, + "likely_short_read_platform": any( + platform.upper().startswith("ILLUMINA") for platform in platforms + ), + } 
def metadata_warnings(args: argparse.Namespace, rows: list[dict[str, str]]) -> list[str]:
    """Return lane-specific warnings about metadata missing from the sample sheet.

    Args:
        args: Parsed CLI namespace; only ``args.lane`` is read here.
        rows: Normalized sample rows (as produced by ``normalize_samples``).

    Returns:
        Human-readable warning strings, in a stable order. The list is empty
        when nothing is missing for the selected lane.
    """
    context = summarize_input_context(args, rows)
    warnings: list[str] = []
    if context["mixed_layouts"]:
        warnings.append(
            "Input sample sheet mixes SE and PE layouts; downstream comparisons should verify that this is intentional."
        )
    if args.lane == "shotgun_metagenomics":
        if not context["host_organism_present"]:
            warnings.append(
                "Host organism is not declared in the sample sheet, so host-depletion decisions and privacy review remain unresolved."
            )
        if not context["host_depletion_present"]:
            warnings.append(
                "Host-depletion intent is not declared in the sample sheet, so this run should be treated as readiness-only rather than analysis-ready."
            )
        if not context["negative_controls_present"]:
            warnings.append(
                "No negative controls are flagged in the sample sheet, which weakens contamination interpretation for metagenomics."
            )
    if args.lane == "epigenomics_peaks":
        if not any(row.get("replicate", "").strip() for row in rows):
            warnings.append(
                "Replicate metadata are missing, so peak-level statistical comparisons cannot be validated from the sample sheet alone."
            )
        if not context["host_organism_present"]:
            warnings.append(
                "Organism metadata are missing, so genome-build selection and blacklist choice are not yet audit-ready."
            )
        if not context["genome_build_present"]:
            warnings.append(
                "Genome build is missing from the sample sheet, so alignment, TSS enrichment, FRiP, and track generation remain metadata-blocked."
            )
        if not context["blacklist_present"]:
            warnings.append(
                "Blacklist BED/path is not declared, so blacklist-overlap QC and final peak filtering are not yet reproducible."
            )
        if not context["control_metadata_present"]:
            warnings.append(
                "Control/input pairing is not declared, so ChIP/CUT&RUN-style background handling remains ambiguous even though FASTQ QC can still run."
            )
        if not context["peak_type_present"]:
            warnings.append(
                "Peak type is not declared, so downstream peak-caller settings remain ambiguous."
            )
    if args.lane == "amplicon_microbiome":
        if not any(row.get("marker", "").strip() for row in rows):
            warnings.append(
                "Amplicon marker/region is missing from the sample sheet, which weakens primer and taxonomy interpretation."
            )
        if not context["primer_sequences_present"]:
            warnings.append(
                "Primer sequences are not declared in the sample sheet, so full ASV inference remains blocked even if read-level QC passes."
            )
        if not context["primer_orientation_present"]:
            warnings.append(
                "Primer orientation is not declared in the sample sheet, so trimming and read-merging settings remain ambiguous."
            )
        if (
            not context["taxonomy_database_present"]
            or not context["taxonomy_database_version_present"]
        ):
            warnings.append(
                "Taxonomy database and version are not fully declared in the sample sheet, so taxa-level interpretation is not yet audit-ready."
            )
        if not context["sample_metadata_present"]:
            warnings.append(
                "Sample metadata are not declared in the sample sheet, so diversity and differential-abundance interpretation would be incomplete."
            )
    return warnings


def normalize_samples(
    args: argparse.Namespace,
) -> tuple[dict[str, Any], list[dict[str, str]], list[Path]]:
    """Read the sample sheet, resolve its inputs, and validate the FASTQ files.

    Args:
        args: Parsed CLI namespace. Reads ``sample_sheet``, ``fastq_root``,
            ``fastq_record_check`` and ``lane``.

    Returns:
        A ``(validation, normalized_rows, fastq_paths)`` tuple. ``validation``
        carries ``ok``, the collected ``errors``/``warnings``, per-file
        ``fastq_checks`` and the ``input_context`` summary. ``normalized_rows``
        holds one dict per usable sheet row with canonical column names.
        ``fastq_paths`` lists every resolved read file in row order.

    Raises:
        FileExistsError: Propagated from ``resolve_existing_path`` when a
            basename matches files in more than one search root.
    """
    sample_sheet = args.sample_sheet.expanduser().resolve()
    rows, columns = read_table(sample_sheet)
    roots = [root.expanduser().resolve() for root in args.fastq_root]
    roots.extend([sample_sheet.parent, Path.cwd()])
    # The same directory can appear more than once (sample sheet living in the
    # cwd, or a repeated --fastq-root). Searching it twice makes one file look
    # like two matches, which resolve_existing_path reports as ambiguous.
    roots = list(dict.fromkeys(roots))

    # Canonical column -> accepted header aliases, in output-column order.
    optional_aliases: tuple[tuple[str, list[str]], ...] = (
        ("instrument_platform", ["instrument_platform", "platform"]),
        ("host_organism", ["host_organism", "host", "host_species", "organism"]),
        (
            "genome_build",
            ["genome_build", "genome", "assembly", "reference", "reference_genome"],
        ),
        ("blacklist", ["blacklist", "blacklist_bed", "blacklist_file"]),
        ("peak_type", ["peak_type", "peak_style", "peak_calling_mode"]),
        (
            "host_depletion",
            ["host_depletion", "host_depletion_applied", "host_removal", "hostremoval"],
        ),
        ("primer_forward", ["primer_forward", "forward_primer", "fw_primer", "fwd_primer"]),
        ("primer_reverse", ["primer_reverse", "reverse_primer", "rv_primer", "rev_primer"]),
        ("primer_orientation", ["primer_orientation", "orientation"]),
        ("merge_reads", ["merge_reads", "read_merge", "merge_policy"]),
        ("taxonomy_database", ["taxonomy_database", "taxonomy_db", "classifier_db"]),
        (
            "taxonomy_database_version",
            ["taxonomy_database_version", "taxonomy_db_version", "classifier_db_version"],
        ),
        (
            "sample_metadata",
            ["sample_metadata", "metadata", "sample_metadata_file", "metadata_file"],
        ),
        ("batch", ["batch", "batch_id"]),
        ("replicate", ["replicate", "replicate_id"]),
        ("control", ["control", "control_sample", "negative_control"]),
    )

    normalized: list[dict[str, str]] = []
    fastq_paths: list[Path] = []
    errors: list[str] = []
    warnings: list[str] = []
    fastq_checks = []

    # start=2: row 1 of the sheet is the header line.
    for row_index, row in enumerate(rows, start=2):
        sample = (
            first_present(row, ["sample", "sample_id", "sampleID", "run_accession"])
            or f"row_{row_index}"
        )
        r1_raw = first_present(row, ["fastq_1", "forwardReads", "r1", "read1"])
        r2_raw = first_present(row, ["fastq_2", "reverseReads", "r2", "read2"])
        fasta_raw = first_present(row, ["fasta"])
        if not r1_raw and not fasta_raw:
            errors.append(f"row {row_index}: fastq_1/forwardReads or fasta is required")
            continue
        r1 = resolve_existing_path(r1_raw, sample_sheet.parent, roots) if r1_raw else None
        r2 = resolve_existing_path(r2_raw, sample_sheet.parent, roots) if r2_raw else None
        fasta = resolve_existing_path(fasta_raw, sample_sheet.parent, roots) if fasta_raw else None
        if r1_raw and not r1:
            errors.append(f"row {row_index}: could not resolve read 1 path {r1_raw}")
        if r2_raw and not r2:
            errors.append(f"row {row_index}: could not resolve read 2 path {r2_raw}")
        if fasta_raw and not fasta:
            errors.append(f"row {row_index}: could not resolve fasta path {fasta_raw}")
        for read_label, read_path in (("r1", r1), ("r2", r2)):
            if read_path is None:
                continue
            fastq_paths.append(read_path)
            check = check_fastq(read_path, args.fastq_record_check)
            check["sample"] = sample
            check["read"] = read_label
            fastq_checks.append(check)
            if check["errors"]:
                errors.extend(f"{sample} {read_label}: {error}" for error in check["errors"])
        entry = {
            "sample": sample,
            "row_index": str(row_index),
            "fastq_1": str(r1) if r1 else "",
            "fastq_2": str(r2) if r2 else "",
            "fasta": str(fasta) if fasta else "",
            "layout": "PE" if r2 else ("SE" if r1 else "FASTA"),
            "marker": first_present(row, ["marker", "target", "region"]),
            "assay": first_present(row, ["assay", "library_strategy"]) or args.lane,
        }
        for column, aliases in optional_aliases:
            entry[column] = first_present(row, aliases)
        normalized.append(entry)

    if not normalized:
        errors.append("no usable rows found in sample sheet")
    warnings.extend(metadata_warnings(args, normalized))
    validation = {
        "ok": not errors,
        "lane": args.lane,
        "sample_sheet": str(sample_sheet),
        "columns": columns,
        "sample_count": len({row["sample"] for row in normalized}),
        "row_count": len(normalized),
        "fastq_count": len(fastq_paths),
        "errors": errors,
        "warnings": warnings,
        "fastq_checks": fastq_checks,
        "input_context": summarize_input_context(args, normalized),
    }
    return validation, normalized, fastq_paths


def write_normalized_samples(run_dir: Path, rows: list[dict[str, str]]) -> None:
    """Write ``rows`` to ``validation/samples.normalized.tsv`` under ``run_dir``.

    An empty file is written when ``rows`` is empty. Otherwise the columns are
    taken from the first row's keys.
    """
    path = run_dir / "validation" / "samples.normalized.tsv"
    if not rows:
        write_text(path, "")
        return
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=list(rows[0]), delimiter="\t")
        writer.writeheader()
        writer.writerows(rows)


def write_resolved_sample_sheet(
    run_dir: Path, source_path: Path, rows: list[dict[str, str]]
) -> Path:
    """Write the resolved rows next to the staged sample sheet and return the path.

    The file is ``inputs/sample_sheet/<source stem>.resolved.tsv`` under
    ``run_dir``; it is written empty when ``rows`` is empty.
    """
    destination = run_dir / "inputs" / "sample_sheet" / f"{source_path.stem}.resolved.tsv"
    if not rows:
        write_text(destination, "")
        return destination
    write_tsv(destination, rows, list(rows[0].keys()))
    return destination


def _supplemental_sample_name(
    sample_names: list[str], item_count: int, index: int, fallback: str
) -> str:
    """Pick the sample name to attach to the ``index``-th supplemental file.

    Args:
        sample_names: Sample names from the sample sheet, in sheet order.
        item_count: Number of supplemental files supplied for this input kind.
        index: Position of the current file among those ``item_count`` files.
        fallback: Name to use when no positional pairing is possible.

    Returns:
        ``sample_names[index]`` when there is exactly one file per sample,
        otherwise ``fallback``.
    """
    # Pair by position only when files and samples line up one-to-one. A single
    # sample with several files must not give every file the same name, or the
    # staged copies would overwrite each other.
    if len(sample_names) == item_count and 0 <= index < len(sample_names):
        return sample_names[index]
    return fallback


def _copy_input(source: Path, destination: Path) -> Path:
    """Copy ``source`` to ``destination`` (creating parent dirs) and return ``destination``.

    The copy is skipped when both paths resolve to the same file.
    """
    destination.parent.mkdir(parents=True, exist_ok=True)
    if source.resolve() != destination.resolve():
        shutil.copy2(source, destination)
    return destination


def _rewrite_humann_headers(source: Path, destination: Path, sample_names: list[str]) -> Path:
    """Stage a HUMAnN-style table, renaming its sample columns where they line up.

    The first non-empty line that does not start with ``#`` is treated as the
    header. Its sample columns (everything after the first column) are replaced
    with ``sample_names`` when the counts match. If no such line exists the
    file is copied unchanged.

    Returns:
        ``destination``.
    """
    lines = source.read_text(encoding="utf-8", errors="replace").splitlines()
    # NOTE(review): tables whose real header row starts with "#" would have that
    # row skipped here and the first data row rewritten instead - confirm the
    # expected input format against the downstream reader.
    header_index = next(
        (index for index, line in enumerate(lines) if line and not line.startswith("#")), None
    )
    if header_index is None:
        return _copy_input(source, destination)
    header = lines[header_index].split("\t")
    provided = header[1:]
    if len(sample_names) == 1 and len(provided) == 1:
        header[1] = sample_names[0]
    elif len(sample_names) == len(provided):
        header[1:] = sample_names
    lines[header_index] = "\t".join(header)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text("\n".join(lines) + "\n", encoding="utf-8")
    return destination


def _provenance_record(run_dir: Path, source_path: Path, destination: Path) -> dict[str, str]:
    """Describe one staged file: original path, run-relative staged path, checksum."""
    return {
        "original_path": str(source_path),
        "staged_path": str(destination.relative_to(run_dir)),
        "sha256": sha256_file(destination),
    }


def _stage_report_files(
    run_dir: Path,
    sources: list[Path],
    sample_names: list[str],
    subdir: str,
    suffix: str,
) -> tuple[list[Path], list[dict[str, str]]]:
    """Copy per-sample report files into ``inputs/<subdir>`` and describe each copy."""
    staged: list[Path] = []
    records: list[dict[str, str]] = []
    for index, source in enumerate(sources):
        source_path = source.expanduser().resolve()
        sample_name = _supplemental_sample_name(
            sample_names, len(sources), index, sample_stem(source_path.name)
        )
        destination = _copy_input(
            source_path, run_dir / "inputs" / subdir / f"{sample_name}{suffix}"
        )
        staged.append(destination)
        records.append(_provenance_record(run_dir, source_path, destination))
    return staged, records


def _stage_humann_table(
    run_dir: Path, source: Path, sample_names: list[str], suffix: str
) -> tuple[Path, dict[str, str]]:
    """Stage one HUMAnN table into ``inputs/humann`` and describe the staged copy."""
    source_path = source.expanduser().resolve()
    sample_name = _supplemental_sample_name(sample_names, 1, 0, sample_stem(source_path.name))
    destination = _rewrite_humann_headers(
        source_path,
        run_dir / "inputs" / "humann" / f"{sample_name}{suffix}",
        sample_names,
    )
    return destination, _provenance_record(run_dir, source_path, destination)


def stage_analysis_inputs(
    run_dir: Path, args: argparse.Namespace, rows: list[dict[str, str]]
) -> dict[str, Any]:
    """Copy the sample sheet and supplemental tables into ``run_dir/inputs``.

    Side effect: ``args.kraken_report``, ``args.bracken_table``,
    ``args.humann_pathabundance`` and ``args.humann_genefamilies`` are
    repointed at the staged copies.

    Args:
        run_dir: Run directory that receives the ``inputs/`` tree.
        args: Parsed CLI namespace (mutated as described above).
        rows: Normalized sample rows; their ``sample`` values name staged files.

    Returns:
        A provenance dict recording original paths, staged paths (relative to
        ``run_dir``) and SHA-256 digests.
    """
    sample_names = [row["sample"] for row in rows if row.get("sample")]
    sample_sheet_path = args.sample_sheet.expanduser().resolve()
    sample_sheet_copy = _copy_input(
        sample_sheet_path, run_dir / "inputs" / "sample_sheet" / sample_sheet_path.name
    )
    resolved_sheet = write_resolved_sample_sheet(run_dir, sample_sheet_path, rows)
    supplemental: dict[str, Any] = {
        "kraken_reports": [],
        "bracken_tables": [],
        "humann_pathabundance": None,
        "humann_genefamilies": None,
    }
    provenance: dict[str, Any] = {
        "analysis_intent": "real_analysis",
        "sample_sheet": {
            "original_path": str(sample_sheet_path),
            "copied_path": str(sample_sheet_copy.relative_to(run_dir)),
            "resolved_path": str(resolved_sheet.relative_to(run_dir)),
            "sha256": sha256_file(sample_sheet_copy),
        },
        "supplemental_inputs": supplemental,
    }

    args.kraken_report, supplemental["kraken_reports"] = _stage_report_files(
        run_dir, list(args.kraken_report), sample_names, "kraken_reports", ".report.txt"
    )
    args.bracken_table, supplemental["bracken_tables"] = _stage_report_files(
        run_dir, list(args.bracken_table), sample_names, "bracken_tables", ".bracken.tsv"
    )
    if args.humann_pathabundance:
        args.humann_pathabundance, supplemental["humann_pathabundance"] = _stage_humann_table(
            run_dir, args.humann_pathabundance, sample_names, ".pathabundance.tsv"
        )
    if args.humann_genefamilies:
        args.humann_genefamilies, supplemental["humann_genefamilies"] = _stage_humann_table(
            run_dir, args.humann_genefamilies, sample_names, ".genefamilies.tsv"
        )
    return provenance


def build_replay_command(args: argparse.Namespace, sample_sheet_path: Path) -> list[str]:
    """Build the argv list that re-runs this script with the current options.

    Args:
        args: Parsed CLI namespace whose options are echoed back as flags.
        sample_sheet_path: Sample sheet to pass via ``--sample-sheet``.

    Returns:
        The command as a list of strings, starting with ``"python"`` and this
        file's resolved path. Kraken/Bracken/HUMAnN paths are emitted exactly
        as they currently appear on ``args``.
    """
    command = [
        "python",
        str(Path(__file__).resolve()),
        "--lane",
        args.lane,
        "--sample-sheet",
        str(sample_sheet_path),
        "--threads",
        str(args.threads),
        "--fastq-record-check",
        str(args.fastq_record_check),
    ]
    if args.execute:
        command.append("--execute")
    if args.kraken_db:
        command.extend(["--kraken-db", str(args.kraken_db.expanduser().resolve())])
    if args.asv_table:
        command.extend(["--asv-table", str(args.asv_table.expanduser().resolve())])
    if args.taxonomy_table:
        command.extend(["--taxonomy-table", str(args.taxonomy_table.expanduser().resolve())])
    if args.synthetic_downstream_inputs:
        command.append("--synthetic-downstream-inputs")
    if args.allow_synthetic_diversity:
        command.append("--allow-synthetic-diversity")
    for path in args.kraken_report:
        command.extend(["--kraken-report", str(path)])
    for path in args.bracken_table:
        command.extend(["--bracken-table", str(path)])
    if args.humann_pathabundance:
        command.extend(["--humann-pathabundance", str(args.humann_pathabundance)])
    if args.humann_genefamilies:
        command.extend(["--humann-genefamilies", str(args.humann_genefamilies)])
    return command


def write_commands(
    run_dir: Path, args: argparse.Namespace, fastq_paths: list[Path], sample_sheet_path: Path
) -> None:
    """Write ``commands.sh`` under ``run_dir`` listing the QC commands for this run.

    The script always records the full runner invocation as a comment. The
    ``seqkit stats`` and ``fastqc`` lines are added only when ``fastq_paths``
    is non-empty; the ``multiqc`` line is always present.
    """
    lines = ["#!/usr/bin/env bash", "set -euo pipefail"]
    lines.append("# Full runner invocation for this bundle:")
    lines.append(f"# {shlex.join(build_replay_command(args, sample_sheet_path))}")
    if fastq_paths:
        fastq_args = [str(path) for path in fastq_paths]
        lines.append(shlex.join(["seqkit", "stats", "-T", *fastq_args]) + " > qc/seqkit_stats.tsv")
        lines.append(
            shlex.join(["fastqc", "-t", str(args.threads), "-o", "fastqc/raw", *fastq_args])
        )
    lines.append(
        shlex.join(["multiqc", "--no-version-check", "fastqc/raw", "-o", "fastqc/multiqc"])
    )
    write_text(run_dir / "commands.sh", "\n".join(lines) + "\n")
parse_float(value: str) -> float: + text = str(value).strip().replace(",", "") + if not text or text in {"-", "NA", "nan"}: + return 0.0 + try: + return float(text) + except ValueError: + return 0.0 + + +def write_tsv(path: Path, rows: list[dict[str, Any]], fieldnames: list[str] | None = None) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + if fieldnames is None: + keys: list[str] = [] + for row in rows: + for key in row: + if key not in keys: + keys.append(key) + fieldnames = keys + with path.open("w", newline="", encoding="utf-8") as handle: + writer = csv.DictWriter(handle, fieldnames=fieldnames, delimiter="\t") + writer.writeheader() + writer.writerows(rows) + + +def path_has_synthetic_marker(path: Path | None) -> bool: + if path is None: + return False + name = path.name.lower() + return any(marker in name for marker in SYNTHETIC_MARKERS) + + +def fig_caption(fig: Any, caption: str | None) -> None: + if caption: + fig.text(0.99, 0.01, caption, ha="right", va="bottom", fontsize=8, color="#666666") + + +def read_seqkit_stats_file(path: Path) -> list[dict[str, Any]]: + if not path.exists() or path.stat().st_size == 0: + return [] + rows, _ = read_table(path) + parsed = [] + for row in rows: + raw_file = row.get("file", "").strip() + if not raw_file: + continue + file_path = Path(raw_file) + if not file_path.exists(): + continue + parsed.append( + { + "file": raw_file, + "num_seqs": parse_float(row.get("num_seqs", "")), + "sum_len": parse_float(row.get("sum_len", "")), + "min_len": parse_float(row.get("min_len", "")), + "avg_len": parse_float(row.get("avg_len", "")), + "max_len": parse_float(row.get("max_len", "")), + } + ) + return parsed + + +def read_multiqc_table(path: Path) -> list[dict[str, str]]: + if not path.exists() or path.stat().st_size == 0: + return [] + rows, _ = read_table(path) + return rows + + +def infer_read_technology(stats_rows: list[dict[str, Any]]) -> str: + avg_len = max((float(row.get("avg_len", 0.0)) for row in stats_rows), 
default=0.0) + max_len = max((float(row.get("max_len", 0.0)) for row in stats_rows), default=0.0) + if avg_len >= 1000 or max_len >= 5000: + return "long_read_like" + if avg_len >= 350 or max_len >= 1500: + return "mixed_or_long_insert" + return "short_read_like" + + +def sample_stem(text: str) -> str: + name = Path(text).name + for suffix in [".fastq.gz", ".fq.gz", ".fastq", ".fq", ".report.txt", ".report"]: + if name.endswith(suffix): + return name[: -len(suffix)] + return name + + +def summarize_fastqc_modules(multiqc_rows: list[dict[str, str]]) -> dict[str, dict[str, int]]: + excluded = { + "Sample", + "Filename", + "File type", + "Encoding", + "Total Sequences", + "Total Bases", + "Sequences flagged as poor quality", + "Sequence length", + "%GC", + "total_deduplicated_percentage", + "avg_sequence_length", + "median_sequence_length", + } + module_summary: dict[str, dict[str, int]] = {} + for row in multiqc_rows: + for column, value in row.items(): + if column in excluded: + continue + status = value.strip().lower() + if status not in {"pass", "warn", "fail"}: + continue + counts = module_summary.setdefault(column, {"pass": 0, "warn": 0, "fail": 0}) + counts[status] += 1 + return dict( + sorted( + module_summary.items(), key=lambda item: (-item[1]["fail"], -item[1]["warn"], item[0]) + ) + ) + + +def build_fastq_assay_qc_verdict( + run_dir: Path, args: argparse.Namespace, validation: dict[str, Any] +) -> dict[str, Any]: + context = validation.get("input_context", {}) + thresholds = LANE_THRESHOLDS[args.lane] + seqkit_rows = read_seqkit_stats_file(run_dir / "qc" / "seqkit_stats.tsv") + multiqc_rows = read_multiqc_table( + run_dir / "fastqc" / "multiqc" / "multiqc_data" / "multiqc_fastqc.txt" + ) + general_stats_rows = read_multiqc_table( + run_dir / "fastqc" / "multiqc" / "multiqc_data" / "multiqc_general_stats.txt" + ) + module_summary = summarize_fastqc_modules(multiqc_rows) + warnings: list[str] = list(validation.get("warnings", [])) + reason_codes: list[str] = 
[] + recommendations: list[str] = [] + per_sample: list[dict[str, Any]] = [] + + technology = infer_read_technology(seqkit_rows) + sample_count = int(validation.get("sample_count", 0) or 0) + replicate_count = sample_count if context.get("replicate_metadata_present") else 0 + + # Collect per-sample metrics and heuristic flags. + min_reads_observed = None + max_percent_fails = 0.0 + for row in seqkit_rows: + file_key = sample_stem(str(row.get("file", ""))) + sample_fastqc = next( + (item for item in multiqc_rows if sample_stem(item.get("Filename", "")) == file_key), {} + ) + sample_general = next( + ( + item + for item in general_stats_rows + if sample_stem(item.get("Sample", "")) == file_key + ), + {}, + ) + num_seqs = int(row.get("num_seqs", 0) or 0) + min_reads_observed = ( + num_seqs if min_reads_observed is None else min(min_reads_observed, num_seqs) + ) + percent_fails = parse_float(sample_general.get("fastqc-percent_fails", "")) + max_percent_fails = max(max_percent_fails, percent_fails) + per_sample.append( + { + "file": file_key, + "num_reads": num_seqs, + "avg_read_length": float(row.get("avg_len", 0.0) or 0.0), + "max_read_length": float(row.get("max_len", 0.0) or 0.0), + "fastqc_percent_fails": percent_fails, + "fastqc_duplication_percent": parse_float( + sample_general.get("fastqc-percent_duplicates", "") + ), + } + ) + + if min_reads_observed is not None and min_reads_observed < thresholds["min_reads_for_qc"]: + reason_codes.append("read_depth_below_recommended_minimum") + recommendations.append( + f"Observed read depth is below the lane heuristic minimum of {thresholds['min_reads_for_qc']:,} reads; treat this run as QC/readiness rather than interpretation-ready." 
+ ) + if context.get("likely_short_read_platform") and technology != "short_read_like": + reason_codes.append("platform_read_length_mismatch") + warnings.append( + "Read-length statistics do not match the declared short-read platform, so FastQC module pass/fail calls should be interpreted cautiously." + ) + recommendations.append( + "Confirm instrument metadata and, if needed, apply technology-specific QC rather than relying on short-read FastQC expectations." + ) + if max_percent_fails >= 30.0: + reason_codes.append("fastqc_failure_rate_high") + recommendations.append( + "Inspect the highest-failing FastQC modules before advancing to downstream interpretation; a high module fail rate should block interpretation until reviewed." + ) + + if args.lane == "epigenomics_peaks": + per_base_fail_count = module_summary.get("per_base_sequence_content", {}).get("fail", 0) + adapter_fail_count = module_summary.get("adapter_content", {}).get("fail", 0) + max_duplication = max( + (sample.get("fastqc_duplication_percent", 0.0) for sample in per_sample), default=0.0 + ) + if any(layout != "PE" for layout in context.get("layouts", [])): + reason_codes.append("paired_end_layout_expected") + recommendations.append( + "ATAC/epigenomics inputs are usually paired-end for robust fragment metrics; confirm that SE layout is intentional." + ) + if sample_count < thresholds["recommended_replicates"]: + reason_codes.append("replicate_count_below_recommended_minimum") + recommendations.append( + "Provide at least two biological replicates before using this plugin surface to justify peak-level statistical comparisons." + ) + if not context.get("host_organism_present"): + reason_codes.append("organism_metadata_missing") + recommendations.append( + "Add organism metadata so the downstream assay-specific workflow can pin the correct reference bundle and TSS annotation." 
+ ) + if not context.get("genome_build_present"): + reason_codes.append("genome_build_missing") + recommendations.append( + "Record the genome build in the sample sheet before aligning reads or generating tracks, FRiP, and TSS enrichment metrics." + ) + if not context.get("blacklist_present"): + reason_codes.append("blacklist_missing") + recommendations.append( + "Provide a blacklist BED path before treating blacklist overlap and final peaks as reproducible." + ) + if not context.get("control_metadata_present"): + reason_codes.append("control_metadata_missing") + recommendations.append( + "Declare control/input metadata so background-aware peak calling is auditable for ChIP, CUT&RUN, or CUT&Tag studies." + ) + if not context.get("peak_type_present"): + reason_codes.append("peak_type_missing") + recommendations.append( + "Declare whether downstream peaks are narrow, broad, or accessibility-style to keep peak-caller parameters explicit." + ) + if command_path("macs2") is None: + reason_codes.append("peak_caller_backend_missing") + recommendations.append( + "Install MACS2 or run a full nf-core backend before expecting peak-calling outputs from this lane." + ) + if per_base_fail_count: + warnings.append( + "FastQC flagged per-base sequence content; for ATAC/CUT&RUN/CUT&Tag libraries this can be assay-expected and should not be treated as an automatic trimming failure." + ) + if adapter_fail_count: + warnings.append( + "FastQC flagged adapter content; confirm trimming policy before alignment, but do not infer failed peak calling from this flag alone." + ) + if max_duplication >= 25.0: + warnings.append( + f"Duplicate estimates reach {max_duplication:.1f}% in the current MultiQC summary. For epigenomics libraries this is not necessarily fatal, but library complexity should be reassessed after alignment with mitochondrial fraction, FRiP, and TSS enrichment." 
+ ) + recommendations.append( + "Compute mitochondrial fraction, insert-size periodicity, TSS enrichment, FRiP, blacklist overlap, and replicate concordance after alignment before making biological claims." + ) + elif args.lane == "amplicon_microbiome": + if not context.get("markers_present"): + reason_codes.append("marker_metadata_missing") + recommendations.append( + "Declare the marker region and primer pair in the sample sheet so trimming and taxonomy interpretation are auditable." + ) + if not context.get("primer_sequences_present"): + reason_codes.append("primer_sequences_missing") + recommendations.append( + "Provide forward and reverse primer sequences before treating this lane as a full amplicon analysis rather than QC/readiness." + ) + if not context.get("primer_orientation_present"): + reason_codes.append("primer_orientation_missing") + recommendations.append( + "Declare primer orientation so trimming and merging settings are reproducible." + ) + if not context.get("taxonomy_database_present"): + reason_codes.append("taxonomy_database_missing") + recommendations.append( + "Choose a taxonomy database before expecting taxa-level plots or assignments." + ) + if not context.get("taxonomy_database_version_present"): + reason_codes.append("taxonomy_database_version_missing") + recommendations.append( + "Record the taxonomy database version so taxa-level interpretation is audit-ready." + ) + if not context.get("sample_metadata_present"): + reason_codes.append("sample_metadata_missing") + recommendations.append( + "Provide sample metadata before treating diversity or differential-abundance outputs as interpretable." + ) + if command_path("cutadapt") is None: + reason_codes.append("primer_trimming_backend_missing") + recommendations.append( + "Install cutadapt before treating this lane as primer-trimming-ready." 
+ ) + reason_codes.append("taxonomy_backend_required") + recommendations.append( + "Provide an ASV table and taxonomy resource or a QIIME2/DADA2 backend before treating the run as analysis-complete." + ) + elif args.lane == "shotgun_metagenomics": + classification_status = {} + status_path = run_dir / "taxonomic_classification_status.json" + if status_path.exists(): + classification_status = json.loads(status_path.read_text(encoding="utf-8")) + if not context.get("host_organism_present"): + reason_codes.append("host_metadata_missing") + recommendations.append( + "Add host organism and host-depletion intent to the sample sheet before treating metagenomics outputs as interpretation-ready." + ) + if not context.get("negative_controls_present"): + reason_codes.append("negative_controls_not_flagged") + recommendations.append( + "Flag negative controls explicitly in the sample sheet to make contamination interpretation auditable." + ) + if not classification_status.get("executed"): + reason_codes.append("classification_backend_not_executed") + recommendations.append( + "Provide a Kraken2 database or precomputed Kraken/Bracken/HUMAnN tables to emit taxonomic/functional interpretation artifacts." 
def select_epigenomics_backend(context: dict[str, Any]) -> str:
    """Choose the nf-core pipeline name matching the declared assay text.

    The strings under ``context["assays"]`` are joined and lower-cased, then
    searched for assay keywords in priority order: ATAC first, then the
    CUT&RUN / CUT&Tag spellings, then ChIP.

    Args:
        context: Validation context; only the ``"assays"`` key is read.

    Returns:
        One of ``"nf-core/atacseq"``, ``"nf-core/cutandrun"`` or
        ``"nf-core/chipseq"``. ``"nf-core/atacseq"`` is also the fallback when
        no keyword matches.
    """
    assay_text = " ".join(context.get("assays", [])).lower()
    if "atac" in assay_text:
        return "nf-core/atacseq"
    if any(token in assay_text for token in ("cut&run", "cutrun", "cut&tag", "cuttag")):
        return "nf-core/cutandrun"
    if "chip" in assay_text:
        return "nf-core/chipseq"
    return "nf-core/atacseq"


def build_epigenomics_follow_on_commands(
    args: argparse.Namespace, run_dir: Path, context: dict[str, Any]
) -> list[dict[str, str]]:
    """Build the suggested follow-on shell commands for the epigenomics lane.

    Args:
        args: Parsed CLI namespace; only ``args.sample_sheet`` (a path) is read.
        run_dir: Run directory; backend output is placed under
            ``run_dir / "backend" / <backend name with "/" replaced by "_">``.
        context: Validation context forwarded to ``select_epigenomics_backend``.

    Returns:
        Two dicts with ``id``, ``description`` and ``command`` keys: the
        backend alignment/peak-calling command and the local re-render command.
    """
    sample_sheet = str(args.sample_sheet.expanduser().resolve())
    backend = select_epigenomics_backend(context)
    backend_outdir = str((run_dir / "backend" / backend.replace("/", "_")).resolve())
    # The genome and blacklist values are not known here. Emit quoted
    # placeholders so each flag visibly needs a value; a bare
    # "--genome --blacklist" would make "--blacklist" the genome argument.
    genome_placeholder = shlex.quote("<GENOME_BUILD>")
    blacklist_placeholder = shlex.quote("<BLACKLIST_BED>")
    return [
        {
            "id": "epigenomics_backend_alignment_and_peaks",
            "description": "Run the assay-specific backend with explicit genome, blacklist, and control metadata to generate aligned BAMs, tracks, and peaks.",
            "command": (
                f"nextflow run {backend} "
                f"-profile docker --input {shlex.quote(sample_sheet)} "
                f"--genome {genome_placeholder} --blacklist {blacklist_placeholder} "
                f"--outdir {shlex.quote(backend_outdir)}"
            ),
        },
        {
            "id": "render_epigenomics_qc_after_alignment",
            "description": "Re-run the local lane after alignment/peak calling artifacts exist so the review bundle can include final readiness metrics and track links.",
            "command": (
                "python plugins/ngs-analysis/scripts/run_fastq_assay_package.py "
                f"--lane epigenomics_peaks --sample-sheet {shlex.quote(sample_sheet)} "
                "--execute"
            ),
        },
    ]
= [ + { + "id": "alignment", + "status": "ready" if not alignment_missing else "missing_metadata", + "requires_alignment": True, + "required_inputs": ["FASTQs", "genome_build", "aligner", "blacklist_bed"], + "missing_metadata": alignment_missing, + "note": "Coordinate-sorted, filtered BAMs are the prerequisite for all downstream epigenomics metrics.", + }, + { + "id": "mitochondrial_fraction", + "status": "requires_alignment", + "requires_alignment": True, + "required_inputs": ["filtered BAM", "genome_build"], + "missing_metadata": alignment_missing, + "note": "Mitochondrial fraction is measured on aligned reads and cannot be inferred from FASTQ QC alone.", + }, + { + "id": "fragment_periodicity", + "status": "requires_alignment", + "requires_alignment": True, + "required_inputs": ["paired-end BAM"], + "missing_metadata": [field for field in missing_metadata if field == "genome_build"], + "note": "Insert-size periodicity is an alignment-derived ATAC-seq quality metric.", + }, + { + "id": "tss_enrichment", + "status": "requires_alignment", + "requires_alignment": True, + "required_inputs": ["filtered BAM", "TSS annotation BED/GTF", "genome_build"], + "missing_metadata": [ + field for field in missing_metadata if field in {"organism", "genome_build"} + ], + "note": "TSS enrichment depends on aligned read pileups around annotated TSS loci.", + }, + { + "id": "frip", + "status": "requires_alignment_and_peaks", + "requires_alignment": True, + "required_inputs": ["filtered BAM", "called peaks"], + "missing_metadata": peak_missing, + "note": "FRiP is computed after peak calling and should be interpreted together with duplication and mitochondrial fraction.", + }, + { + "id": "blacklist_overlap", + "status": "requires_alignment", + "requires_alignment": True, + "required_inputs": ["filtered BAM", "blacklist_bed"], + "missing_metadata": [field for field in missing_metadata if field == "blacklist_bed"], + "note": "Blacklist overlap requires the chosen genome build and 
blacklist resource.", + }, + { + "id": "peaks", + "status": "requires_alignment_and_backend", + "requires_alignment": True, + "required_inputs": [ + "filtered BAM", + "peak_caller", + "peak_type", + "controls (if applicable)", + ], + "missing_metadata": peak_missing, + "note": "Peak calling needs aligned BAMs plus an explicit backend such as MACS2 or nf-core.", + }, + { + "id": "tracks", + "status": "requires_alignment", + "requires_alignment": True, + "required_inputs": ["filtered BAM", "genome_sizes", "normalization choice"], + "missing_metadata": [field for field in missing_metadata if field == "genome_build"], + "note": "Browser tracks are derived from aligned reads and require explicit normalization settings.", + }, + ] + payload = { + "schema_version": "2.0", + "created_at": now_iso(), + "lane": args.lane, + "review_surface_ok": ( + run_dir / "fastqc" / "multiqc" / "multiqc_browser_helper.html" + ).exists(), + "ready_for_alignment_handoff": not alignment_missing, + "macs2_present": command_path("macs2") is not None, + "alignment_required": True, + "missing_metadata": missing_metadata, + "metadata_context": context, + "checklist": checklist, + "note": "This package validates and summarizes epigenomics FASTQs. 
def build_amplicon_follow_on_commands(
    args: argparse.Namespace, run_dir: Path
) -> list[dict[str, str]]:
    """Build the suggested follow-on shell commands for the amplicon lane.

    Primer and taxonomy-database values are taken from ``args`` when present
    (``primer_forward``, ``primer_reverse``, ``taxonomy_database``). Anything
    missing or empty is rendered as a shell-quoted ``<PLACEHOLDER>`` so the
    emitted command never contains a flag without a value.

    Args:
        args: Parsed CLI namespace. ``args.sample_sheet`` (a path) is required;
            the primer/database attributes are optional.
        run_dir: Run directory; backend outputs are expected under
            ``run_dir / "backend" / "ampliseq"``.

    Returns:
        Two dicts with ``id``, ``description`` and ``command`` keys: the
        nf-core/ampliseq backend command and the local re-render command.
    """

    def value_or_placeholder(attribute: str, placeholder: str) -> str:
        # getattr with a default keeps namespaces that lack the attribute working.
        value = getattr(args, attribute, None)
        return shlex.quote(str(value) if value else placeholder)

    sample_sheet = str(args.sample_sheet.expanduser().resolve())
    backend_outdir = str((run_dir / "backend" / "ampliseq").resolve())
    asv_table = str((run_dir / "backend" / "ampliseq" / "feature-table.tsv").resolve())
    taxonomy_table = str((run_dir / "backend" / "ampliseq" / "taxonomy.tsv").resolve())
    forward_primer = value_or_placeholder("primer_forward", "<FW_PRIMER>")
    reverse_primer = value_or_placeholder("primer_reverse", "<RV_PRIMER>")
    database = value_or_placeholder("taxonomy_database", "<TAXONOMY_DATABASE>")
    return [
        {
            "id": "nfcore_ampliseq_backend",
            "description": "Generate ASV and taxonomy tables from the same sample sheet once primer sequences, orientation, and taxonomy DB are chosen.",
            "command": (
                "nextflow run nf-core/ampliseq "
                f"-profile docker --input {shlex.quote(sample_sheet)} "
                f"--FW_primer {forward_primer} --RV_primer {reverse_primer} "
                f"--database {database} "
                f"--outdir {shlex.quote(backend_outdir)}"
            ),
        },
        {
            "id": "render_amplicon_visuals",
            "description": "Re-render plugin-native diversity and taxa plots after the backend emits ASV and taxonomy tables.",
            "command": (
                "python plugins/ngs-analysis/scripts/run_fastq_assay_package.py "
                f"--lane amplicon_microbiome --sample-sheet {shlex.quote(sample_sheet)} "
                f"--asv-table {shlex.quote(asv_table)} "
                f"--taxonomy-table {shlex.quote(taxonomy_table)} "
                "--execute"
            ),
        },
    ]
"qc" / "seqkit_stats.tsv") + multiqc_rows = read_multiqc_table( + run_dir / "fastqc" / "multiqc" / "multiqc_data" / "multiqc_fastqc.txt" + ) + general_stats_rows = read_multiqc_table( + run_dir / "fastqc" / "multiqc" / "multiqc_data" / "multiqc_general_stats.txt" + ) + classification_status = {} + status_path = run_dir / "taxonomic_classification_status.json" + if status_path.exists(): + classification_status = json.loads(status_path.read_text(encoding="utf-8")) + + technology = infer_read_technology(seqkit_rows) + conclusions: list[str] = [] + warnings: list[str] = list(validation.get("warnings", [])) + recommendations: list[str] = [] + if not context.get("host_organism_present") or not context.get("host_depletion_present"): + conclusions.append("host_depletion_context_missing") + recommendations.append( + "Add host organism and host-depletion intent/reference metadata before treating the run as analysis-ready." + ) + if not classification_status.get("executed"): + conclusions.append("classification_blocked") + recommendations.append( + "Provide Kraken/Bracken/HUMAnN inputs or a Kraken2 database before expecting taxonomic or functional interpretation artifacts." + ) + if context.get("likely_short_read_platform") and technology != "short_read_like": + conclusions.append("technology_mismatch") + warnings.append( + "Read-length statistics look long-read-like while the sample metadata declares a short-read platform; FastQC modules are less interpretable under this mismatch." + ) + recommendations.append( + "Confirm the sequencing platform and consider long-read-aware QC before relying on FastQC module pass/fail calls." + ) + elif technology != "short_read_like": + warnings.append( + "Read lengths are long-read-like, so FastQC module pass/fail calls should be interpreted cautiously." + ) + recommendations.append( + "Supplement FastQC with technology-aware QC if this dataset is truly long-read or mixed-read." 
+ ) + + percent_fails = max( + (parse_float(row.get("fastqc-percent_fails", "")) for row in general_stats_rows), + default=0.0, + ) + if percent_fails >= 20.0: + warnings.append( + f"FastQC modules show a high fail rate ({percent_fails:.1f}%), which should be interpreted in the context of read length and platform." + ) + + readiness = "analysis_ready_for_taxonomic_profiling" + if conclusions: + readiness = "readiness_only" + + per_sample = [] + for row in seqkit_rows: + name = Path(str(row.get("file", ""))).name + sample_fastqc = next( + ( + item + for item in multiqc_rows + if item.get("Filename", "").startswith( + name.replace(".fastq.gz", "").replace(".fq.gz", "") + ) + ), + {}, + ) + sample_general = next( + ( + item + for item in general_stats_rows + if item.get("Sample", "").startswith( + name.replace(".fastq.gz", "").replace(".fq.gz", "") + ) + ), + {}, + ) + per_sample.append( + { + "file": name, + "num_seqs": int(row.get("num_seqs", 0) or 0), + "avg_len": float(row.get("avg_len", 0.0) or 0.0), + "max_len": float(row.get("max_len", 0.0) or 0.0), + "fastqc_percent_fails": parse_float(sample_general.get("fastqc-percent_fails", "")), + "fastqc_sequence_length_distribution": sample_fastqc.get( + "sequence_length_distribution", "" + ), + } + ) + return { + "analysis_readiness": readiness, + "conclusions": conclusions, + "warnings": warnings, + "recommendations": recommendations, + "technology_inference": technology, + "metadata_context": context, + "classification_status": classification_status, + "samples": per_sample, + } + + +def save_barplot( + labels: list[str], + values: list[float], + out_path: Path, + *, + title: str, + ylabel: str, + caption: str | None = None, +) -> bool: + if plt is None or not labels: + return False + width = max(7, min(16, len(labels) * 0.65 + 3)) + fig, ax = plt.subplots(figsize=(width, 5.2)) + ax.bar(range(len(labels)), values, color="#3b6ea8") + ax.set_xticks(range(len(labels))) + ax.set_xticklabels(labels, rotation=45, 
def save_stacked_barplot(
    samples: list[str],
    categories: list[str],
    values_by_category: dict[str, list[float]],
    out_path: Path,
    *,
    title: str,
    ylabel: str,
    caption: str | None = None,
) -> bool:
    """Render a stacked bar chart (one bar per sample) and save it as a PNG.

    Args:
        samples: Bar labels, one per x position.
        categories: Stack order; each category becomes one legend entry.
        values_by_category: Per-category values aligned with ``samples``. A
            category missing from the mapping is drawn as zeros.
        out_path: Destination file; parent directories are created.
        title: Axes title.
        ylabel: Y-axis label.
        caption: Passed to ``fig_caption`` (presumably drawn on the figure;
            the helper is defined elsewhere in the file).

    Returns:
        ``False`` without writing anything when matplotlib or numpy is
        unavailable or when ``samples``/``categories`` is empty, otherwise
        ``True`` after the figure has been saved.
    """
    if plt is None or np is None or not samples or not categories:
        return False
    width = max(8, min(18, len(samples) * 0.7 + 4))
    fig, ax = plt.subplots(figsize=(width, 5.8))
    try:
        # Use numeric positions, as save_barplot does. Passing the sample names
        # as x-values makes matplotlib treat them as categories, so duplicate
        # names would be drawn on top of each other.
        positions = range(len(samples))
        bottom = np.zeros(len(samples))
        cmap = plt.get_cmap("tab20")
        for index, category in enumerate(categories):
            values = np.asarray(
                values_by_category.get(category, [0.0] * len(samples)), dtype=float
            )
            # tab20 has 20 colours; wrap around for longer category lists.
            ax.bar(positions, values, bottom=bottom, label=category, color=cmap(index % 20))
            bottom = bottom + values
        ax.set_xticks(positions)
        ax.set_xticklabels(samples, rotation=45, ha="right")
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        ax.legend(loc="upper left", bbox_to_anchor=(1.01, 1.0), frameon=False)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        fig_caption(fig, caption)
        fig.tight_layout()
        out_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(out_path, dpi=160)
    finally:
        # Close even when plotting or saving fails; pyplot keeps figures alive
        # until they are closed explicitly.
        plt.close(fig)
    return True
def save_scatter(
    x_values: list[float],
    y_values: list[float],
    labels: list[str],
    out_path: Path,
    *,
    title: str,
    caption: str | None = None,
    xlabel: str = "PCoA1",
    ylabel: str = "PCoA2",
) -> bool:
    """Render a labelled 2-D scatter plot and save it as a PNG.

    Args:
        x_values: X coordinate per point.
        y_values: Y coordinate per point.
        labels: Text drawn next to each point; must match the coordinate
            lists in length.
        out_path: Destination file; parent directories are created.
        title: Axes title.
        caption: Passed to ``fig_caption`` (presumably drawn on the figure;
            the helper is defined elsewhere in the file).
        xlabel: X-axis label; defaults to the previously hard-coded "PCoA1".
        ylabel: Y-axis label; defaults to the previously hard-coded "PCoA2".

    Returns:
        ``False`` without writing anything when matplotlib is unavailable or
        ``labels`` is empty, otherwise ``True`` after the figure is saved.

    Raises:
        ValueError: If ``labels``, ``x_values`` and ``y_values`` differ in
            length (from ``zip(..., strict=True)`` or from matplotlib).
    """
    if plt is None or not labels:
        return False
    fig, ax = plt.subplots(figsize=(7.2, 6.2))
    try:
        ax.scatter(x_values, y_values, color="#4f7d4a", s=55)
        for label, x_value, y_value in zip(labels, x_values, y_values, strict=True):
            ax.text(x_value, y_value, label, fontsize=8, ha="left", va="bottom")
        # Light reference lines through the origin.
        ax.axhline(0, color="#ddd", linewidth=0.8)
        ax.axvline(0, color="#ddd", linewidth=0.8)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        fig_caption(fig, caption)
        fig.tight_layout()
        out_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(out_path, dpi=160)
    finally:
        # Close even on a length mismatch or save failure so the figure does
        # not stay registered with pyplot.
        plt.close(fig)
    return True
feature table: {path}") + features = [row.get(feature_col, f"feature_{index}") for index, row in enumerate(rows, start=1)] + matrix = np.asarray( + [[parse_float(row.get(sample, "")) for sample in numeric_sample_columns] for row in rows], + dtype=float, + ) + return features, numeric_sample_columns, matrix + + +def read_asv_sample_columns(path: Path | None) -> list[str]: + if path is None: + return [] + resolved = path.expanduser().resolve() + if not resolved.exists(): + return [] + try: + _, samples, _ = read_feature_count_table(resolved) + return samples + except Exception: + return [] + + +def shannon(counts: Any) -> float: + if np is None: + return 0.0 + total = float(np.asarray(counts, dtype=float).sum()) + if total <= 0: + return 0.0 + proportions = np.asarray(counts, dtype=float) / total + proportions = proportions[proportions > 0] + return float(-(proportions * np.log(proportions)).sum()) + + +def bray_curtis(sample_by_feature: Any) -> Any: + if np is None: + raise RuntimeError("numpy is required for beta-diversity visualizations") + sample_by_feature = np.asarray(sample_by_feature, dtype=float) + n_samples = sample_by_feature.shape[0] + matrix = np.zeros((n_samples, n_samples), dtype=float) + for i in range(n_samples): + for j in range(n_samples): + denominator = float(sample_by_feature[i].sum() + sample_by_feature[j].sum()) + matrix[i, j] = ( + 0.0 + if denominator == 0 + else float(np.abs(sample_by_feature[i] - sample_by_feature[j]).sum() / denominator) + ) + return matrix + + +def pcoa(distance_matrix: Any) -> tuple[Any, list[float]]: + if np is None: + raise RuntimeError("numpy is required for PCoA visualizations") + distances = np.asarray(distance_matrix, dtype=float) + n_samples = distances.shape[0] + if n_samples < 2: + return np.zeros((n_samples, 2)), [0.0, 0.0] + centering = np.eye(n_samples) - np.ones((n_samples, n_samples)) / n_samples + gram = -0.5 * centering @ (distances**2) @ centering + eigenvalues, eigenvectors = np.linalg.eigh(gram) + 
def expected_rarefied_features(counts: Any, depth: int) -> float:
    """Return the expected number of features observed at a subsampling depth.

    For each feature with a positive count, adds the probability that at
    least one of its reads is drawn when ``depth`` reads are sampled without
    replacement from the pooled total.

    Args:
        counts: Iterable of per-feature counts; values are truncated with
            ``int`` and non-positive entries are ignored.
        depth: Number of reads to subsample; clamped to the total count.

    Returns:
        The expected feature count, or ``0.0`` when ``depth`` or the total
        count is not positive.
    """
    positive_counts = [int(value) for value in counts if int(value) > 0]
    total = sum(positive_counts)
    if depth <= 0 or total <= 0:
        return 0.0
    depth = min(depth, total)

    def log_choose(n: int, k: int) -> float:
        # log of the binomial coefficient via lgamma, to avoid huge integers.
        if k < 0 or k > n:
            return float("-inf")
        return math.lgamma(n + 1) - math.lgamma(k + 1) - math.lgamma(n - k + 1)

    denominator = log_choose(total, depth)
    expected = 0.0
    for count in positive_counts:
        if total - count < depth:
            # Too few other reads to fill the sample: the feature must appear.
            expected += 1.0
            continue
        missing_probability = math.exp(log_choose(total - count, depth) - denominator)
        expected += 1.0 - missing_probability
    return expected


def taxonomy_label(raw_taxonomy: str, rank: str) -> str:
    """Extract the label for one taxonomic rank from a lineage string.

    The lineage is split on ``;`` (``|`` is accepted as an alias). Lookup
    order:

    1. The first element carrying the rank's prefix (``p__``, ``g__``, ...).
    2. The element at the rank's conventional position in a seven-level
       lineage (kingdom=0 ... species=6).
    3. The last element of the lineage.

    Args:
        raw_taxonomy: Lineage string, prefixed or unprefixed.
        rank: Rank name, matched case-insensitively. Recognised values are
            kingdom, domain, phylum, class, order, family, genus and species.
            Any other value goes straight to step 3.

    Returns:
        The label with any ``x__`` prefix removed, or ``"Unassigned"`` when
        the lineage or the selected element is empty.
    """
    text = raw_taxonomy.strip()
    if not text:
        return "Unassigned"
    rank_key = (rank or "").strip().lower()
    # Kingdom and domain share a slot; both prefix spellings are accepted.
    rank_prefixes = {
        "kingdom": ("k__", "d__"),
        "domain": ("d__", "k__"),
        "phylum": ("p__",),
        "class": ("c__",),
        "order": ("o__",),
        "family": ("f__",),
        "genus": ("g__",),
        "species": ("s__",),
    }
    rank_offsets = {
        "kingdom": 0,
        "domain": 0,
        "phylum": 1,
        "class": 2,
        "order": 3,
        "family": 4,
        "genus": 5,
        "species": 6,
    }
    parts = [part.strip() for part in text.replace("|", ";").split(";")]
    prefixes = rank_prefixes.get(rank_key, ())
    if prefixes:
        for part in parts:
            if part.lower().startswith(prefixes):
                # Every prefix is three characters long ("x__").
                label = part[len(prefixes[0]) :].strip()
                return label or "Unassigned"
    offset = rank_offsets.get(rank_key)
    if offset is not None and offset < len(parts):
        label = parts[offset].split("__")[-1].strip()
        return label or "Unassigned"
    # Unknown rank, or lineage shallower than the requested rank: fall back
    # to the deepest element.
    return parts[-1].split("__")[-1].strip() or "Unassigned"
def build_amplicon_downstream_context(
    args: argparse.Namespace,
    rows: list[dict[str, str]],
    asv_samples: list[str] | None = None,
) -> dict[str, Any]:
    """Summarize whether downstream amplicon tables match the sample sheet.

    Downstream inputs are flagged as synthetic/review-only when any of these
    holds: the explicit CLI flag is set, ``path_has_synthetic_marker`` reports
    a marker on the ASV or taxonomy table path, or the ASV table has sample
    columns that the sample sheet does not list.

    Args:
        args: Parsed CLI namespace; reads ``synthetic_downstream_inputs``,
            ``asv_table`` and ``taxonomy_table``.
        rows: Sample-sheet rows; the ``"sample"`` value of each is used.
        asv_samples: Sample column names found in the ASV table, if any.

    Returns:
        A dict holding the sample lists and counts, the synthetic-input
        reasons, and the derived ``beta_diversity_allowed`` / ``review_only``
        flags.
    """
    sheet_names = {row.get("sample", "").strip() for row in rows}
    sheet_names.discard("")
    real_samples = sorted(sheet_names)
    asv_samples = [name for name in (asv_samples or []) if name]
    extra_table_samples = sorted(set(asv_samples) - sheet_names)

    synthetic_reasons: list[str] = []
    if args.synthetic_downstream_inputs:
        synthetic_reasons.append("synthetic_downstream_inputs flag was supplied explicitly.")

    asv_path = args.asv_table.expanduser().resolve() if args.asv_table else None
    if path_has_synthetic_marker(asv_path):
        synthetic_reasons.append("ASV table filename contains a synthetic or reduced-data marker.")

    taxonomy_path = args.taxonomy_table.expanduser().resolve() if args.taxonomy_table else None
    if path_has_synthetic_marker(taxonomy_path):
        synthetic_reasons.append(
            "Taxonomy table filename contains a synthetic or reduced-data marker."
        )

    if extra_table_samples:
        synthetic_reasons.append(
            "ASV table contains sample columns not present in the sample sheet: "
            + ", ".join(extra_table_samples)
        )

    synthetic_detected = len(synthetic_reasons) > 0
    # Beta diversity needs at least two real samples and no synthetic inputs.
    beta_diversity_allowed = not synthetic_detected and len(real_samples) >= 2
    return {
        "real_samples": real_samples,
        "real_sample_count": len(real_samples),
        "asv_samples": asv_samples,
        "asv_sample_count": len(asv_samples),
        "extra_table_samples": extra_table_samples,
        "synthetic_downstream_inputs": synthetic_detected,
        "synthetic_reasons": synthetic_reasons,
        "beta_diversity_allowed": beta_diversity_allowed,
        "review_only": synthetic_detected,
    }
def write_amplicon_backend_bundle(
    run_dir: Path,
    args: argparse.Namespace,
    methods_manifest_path: Path,
) -> dict[str, Any]:
    """Write the nf-core/ampliseq handoff bundle under ``run_dir / "workflow"``.

    Three files are written: ``amplicon_backend_command.sh`` (the assembled
    command as a bash script), ``amplicon_backend_status.json`` and
    ``amplicon_backend_plan.json``.

    Args:
        run_dir: Run directory; recorded paths are relative to it.
        args: Parsed CLI namespace; reads the primer, truncation and taxonomy
            database attributes, ``sample_sheet`` and ``amplicon_backend``.
        methods_manifest_path: Path of the methods manifest, which must be
            located under ``run_dir``.

    Returns:
        The status dict that was written to ``amplicon_backend_status.json``.
    """
    required_attributes = (
        "primer_forward",
        "primer_reverse",
        "taxonomy_database",
        "taxonomy_database_version",
    )
    missing_required: list[str] = [
        attribute for attribute in required_attributes if not getattr(args, attribute)
    ]

    backend_dir = run_dir / "workflow"
    backend_dir.mkdir(parents=True, exist_ok=True)
    result_dir = backend_dir / "ampliseq_results"

    command = ["nextflow", "run", "nf-core/ampliseq", "-profile", "docker"]
    command += ["--input", str(args.sample_sheet.expanduser().resolve())]
    command += ["--outdir", str(result_dir)]
    # Primer flags are added only when a non-empty value is set; truncation
    # flags are added whenever a value is present, including 0.
    if args.primer_forward:
        command += ["--FW_primer", args.primer_forward]
    if args.primer_reverse:
        command += ["--RV_primer", args.primer_reverse]
    if args.trunc_len_f is not None:
        command += ["--trunclenf", str(args.trunc_len_f)]
    if args.trunc_len_r is not None:
        command += ["--trunclenr", str(args.trunc_len_r)]

    command_path_file = backend_dir / "amplicon_backend_command.sh"
    script_text = "\n".join(
        [
            "#!/usr/bin/env bash",
            "set -euo pipefail",
            "# Review the command below against your nf-core/ampliseq release before execution.",
            shlex.join(command),
        ]
    )
    write_text(command_path_file, script_text + "\n")

    backend_status = {
        "created_at": now_iso(),
        "workflow": args.amplicon_backend,
        "ready_to_run": not missing_required,
        "missing_required_inputs": missing_required,
        "command_path": str(command_path_file.relative_to(run_dir)),
        "methods_manifest_path": str(methods_manifest_path.relative_to(run_dir)),
        "notes": [
            "This bundle captures a concrete backend handoff for real ASV/taxonomy generation.",
            "Review primer and truncation parameters against the target nf-core/ampliseq release before execution.",
        ],
    }
    write_json(backend_dir / "amplicon_backend_status.json", backend_status)

    backend_plan = {
        "workflow": args.amplicon_backend,
        "command": command,
        "missing_required_inputs": missing_required,
        "result_dir": str(result_dir.relative_to(run_dir)),
    }
    write_json(backend_dir / "amplicon_backend_plan.json", backend_plan)
    return backend_status
notes.append( + "Review-only downstream inputs were detected; generated amplicon tables and plots are for runner verification and should not be used for biological interpretation." + ) + notes.extend(downstream_context["synthetic_reasons"]) + + sample_by_feature = feature_by_sample.T + review_caption = ( + "REVIEW ONLY - synthetic downstream inputs detected" + if downstream_context["review_only"] + else None + ) + alpha_rows = [] + for sample, counts in zip(samples, sample_by_feature, strict=True): + alpha_rows.append( + { + "sample": sample, + "total_reads": int(counts.sum()), + "observed_features": int((counts > 0).sum()), + "shannon": f"{shannon(counts):.6g}", + } + ) + alpha_path = run_dir / "tables" / "alpha_diversity.tsv" + write_tsv(alpha_path, alpha_rows, ["sample", "total_reads", "observed_features", "shannon"]) + entries.append( + artifact_entry( + artifact_id="alpha_diversity_table", + title="Alpha Diversity Table", + path=alpha_path.relative_to(run_dir), + kind="table", + status="created", + description="Observed feature counts and Shannon diversity per sample.", + source=str(asv_table), + ) + ) + + shannon_plot = run_dir / "visualizations" / "alpha_diversity_shannon.png" + observed_plot = run_dir / "visualizations" / "alpha_diversity_observed_features.png" + if save_barplot( + samples, + [parse_float(row["shannon"]) for row in alpha_rows], + shannon_plot, + title="Shannon Diversity", + ylabel="Shannon index", + caption=review_caption, + ): + entries.append( + artifact_entry( + artifact_id="alpha_shannon_plot", + title="Alpha Diversity: Shannon", + path=shannon_plot.relative_to(run_dir), + kind="plot", + status="created", + description="Per-sample Shannon diversity from the provided ASV table.", + ) + ) + if save_barplot( + samples, + [parse_float(row["observed_features"]) for row in alpha_rows], + observed_plot, + title="Observed Features", + ylabel="Observed features", + caption=review_caption, + ): + entries.append( + artifact_entry( + 
artifact_id="alpha_observed_plot", + title="Alpha Diversity: Observed Features", + path=observed_plot.relative_to(run_dir), + kind="plot", + status="created", + description="Per-sample observed ASV/feature counts.", + ) + ) + + beta_diversity_allowed = ( + downstream_context["beta_diversity_allowed"] or args.allow_synthetic_diversity + ) + if len(samples) >= 2 and beta_diversity_allowed: + distance = bray_curtis(sample_by_feature) + distance_rows = [] + for sample, values in zip(samples, distance, strict=True): + row = {"sample": sample} + row.update( + {other: f"{float(value):.6g}" for other, value in zip(samples, values, strict=True)} + ) + distance_rows.append(row) + distance_path = run_dir / "tables" / "beta_bray_curtis_distance.tsv" + write_tsv(distance_path, distance_rows, ["sample", *samples]) + coords, variance = pcoa(distance) + pcoa_rows = [ + {"sample": sample, "PCoA1": f"{float(coord[0]):.6g}", "PCoA2": f"{float(coord[1]):.6g}"} + for sample, coord in zip(samples, coords, strict=True) + ] + pcoa_path = run_dir / "tables" / "beta_pcoa.tsv" + write_tsv(pcoa_path, pcoa_rows, ["sample", "PCoA1", "PCoA2"]) + pcoa_plot = run_dir / "visualizations" / "beta_diversity_pcoa_bray_curtis.png" + if save_scatter( + [float(row["PCoA1"]) for row in pcoa_rows], + [float(row["PCoA2"]) for row in pcoa_rows], + samples, + pcoa_plot, + title=f"Bray-Curtis PCoA ({variance[0]:.1%}, {variance[1]:.1%})", + caption=review_caption, + ): + entries.append( + artifact_entry( + artifact_id="beta_pcoa_plot", + title="Beta Diversity: Bray-Curtis PCoA", + path=pcoa_plot.relative_to(run_dir), + kind="plot", + status="created", + description="PCoA from Bray-Curtis distances computed from the provided ASV table.", + ) + ) + entries.append( + artifact_entry( + artifact_id="beta_distance_table", + title="Beta Diversity Distance Matrix", + path=distance_path.relative_to(run_dir), + kind="table", + status="created", + description="Bray-Curtis distance matrix.", + ) + ) + elif len(samples) < 2: 
+ notes.append("Beta diversity PCoA requires at least two samples.") + else: + entries.append( + artifact_entry( + artifact_id="beta_pcoa_plot", + title="Beta Diversity: Bray-Curtis PCoA", + path=None, + kind="plot", + status="blocked", + description="Beta-diversity and PCoA are blocked because downstream inputs are marked synthetic or review-only. Pass --allow-synthetic-diversity to override for visualization-only review runs.", + ) + ) + notes.append( + "Beta-diversity and PCoA were blocked because downstream inputs were marked synthetic or review-only. Pass --allow-synthetic-diversity to override for visualization-only review runs." + ) + + rarefaction_rows: list[dict[str, Any]] = [] + max_depth = int(max((counts.sum() for counts in sample_by_feature), default=0)) + if max_depth > 0: + depths = sorted({max(1, int(max_depth * fraction / 10)) for fraction in range(1, 11)}) + for sample, counts in zip(samples, sample_by_feature, strict=True): + for depth in depths: + rarefaction_rows.append( + { + "sample": sample, + "depth": depth, + "expected_observed_features": f"{expected_rarefied_features(counts, depth):.6g}", + } + ) + rarefaction_path = run_dir / "tables" / "rarefaction.tsv" + write_tsv(rarefaction_path, rarefaction_rows, ["sample", "depth", "expected_observed_features"]) + if plt is not None and rarefaction_rows: + fig, ax = plt.subplots(figsize=(7.8, 5.5)) + for sample in samples: + sample_rows = [row for row in rarefaction_rows if row["sample"] == sample] + ax.plot( + [int(row["depth"]) for row in sample_rows], + [float(row["expected_observed_features"]) for row in sample_rows], + marker="o", + label=sample, + ) + ax.set_title("Rarefaction Curves") + ax.set_xlabel("Subsampled reads") + ax.set_ylabel("Expected observed features") + ax.legend(frameon=False, fontsize=8) + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) + fig_caption(fig, review_caption) + fig.tight_layout() + rarefaction_plot = run_dir / "visualizations" / 
"rarefaction_curves.png" + fig.savefig(rarefaction_plot, dpi=160) + plt.close(fig) + entries.append( + artifact_entry( + artifact_id="rarefaction_plot", + title="Rarefaction Curves", + path=rarefaction_plot.relative_to(run_dir), + kind="plot", + status="created", + description="Expected observed features across subsampling depths.", + ) + ) + + if args.taxonomy_table: + taxonomy_path = args.taxonomy_table.expanduser().resolve() + taxonomy_map = ( + read_taxonomy_map(taxonomy_path, args.taxonomy_rank) if taxonomy_path.exists() else {} + ) + if taxonomy_map: + taxa = sorted({taxonomy_map.get(feature, "Unassigned") for feature in features}) + abundance = {sample: {taxon: 0.0 for taxon in taxa} for sample in samples} + for feature, counts in zip(features, feature_by_sample, strict=True): + taxon = taxonomy_map.get(feature, "Unassigned") + for sample, count in zip(samples, counts, strict=True): + abundance[sample][taxon] += float(count) + totals_by_taxon = { + taxon: sum(abundance[sample][taxon] for sample in samples) for taxon in taxa + } + top_taxa = [ + taxon + for taxon, _ in sorted( + totals_by_taxon.items(), key=lambda item: item[1], reverse=True + )[: args.top_n_taxa] + ] + if len(taxa) > len(top_taxa): + top_taxa.append("Other") + rows = [] + values_by_taxon = {taxon: [] for taxon in top_taxa} + for sample in samples: + sample_total = sum(abundance[sample].values()) or 1.0 + other = 0.0 + for taxon in taxa: + value = abundance[sample][taxon] / sample_total + if taxon in values_by_taxon: + values_by_taxon[taxon].append(value) + rows.append( + {"sample": sample, "taxon": taxon, "relative_abundance": f"{value:.6g}"} + ) + else: + other += value + if "Other" in values_by_taxon: + values_by_taxon["Other"].append(other) + rows.append( + {"sample": sample, "taxon": "Other", "relative_abundance": f"{other:.6g}"} + ) + taxa_table = run_dir / "tables" / f"taxa_abundance_{args.taxonomy_rank}.tsv" + write_tsv(taxa_table, rows, ["sample", "taxon", "relative_abundance"]) + 
def parse_kraken_report(path: Path, rank_filter: set[str]) -> list[dict[str, Any]]:
    """Parse a Kraken-style report into one dict per retained taxon row.

    Both the standard six-column layout (percent, clade reads, taxon reads,
    rank code, taxid, name) and the eight-column layout written by
    ``kraken2 --report-minimizer-data`` (two extra minimizer counts before the
    rank code) are accepted.

    Args:
        path: Report file to read; decoded as UTF-8 with undecodable bytes
            replaced.
        rank_filter: Rank codes to keep. An empty set keeps every rank.

    Returns:
        A list of dicts with keys ``sample``, ``percent``, ``clade_reads``,
        ``taxon_reads``, ``rank``, ``taxid`` and ``name``. Lines with too few
        fields are skipped.
    """
    # Strip only a trailing report suffix. Chained str.replace() would also
    # delete ".report" from the middle of a sample name.
    sample = path.name
    for suffix in (".report.txt", ".report"):
        if sample.endswith(suffix):
            sample = sample[: -len(suffix)]
            break

    rows: list[dict[str, Any]] = []
    for line in path.read_text(encoding="utf-8", errors="replace").splitlines():
        stripped = line.strip()
        parts = stripped.split(maxsplit=5)
        if len(parts) < 6:
            continue
        if parts[3].isdigit():
            # Rank codes are letters (optionally digit-suffixed), never pure
            # digits, so a numeric 4th field means the minimizer layout: drop
            # the two minimizer columns so the remaining fields line up.
            parts = stripped.split(maxsplit=7)
            if len(parts) < 8:
                continue
            del parts[3:5]
        percent, clade_reads, taxon_reads, rank, taxid, name = parts
        if rank_filter and rank not in rank_filter:
            continue
        rows.append(
            {
                "sample": sample,
                "percent": parse_float(percent),
                "clade_reads": parse_float(clade_reads),
                "taxon_reads": parse_float(taxon_reads),
                "rank": rank,
                "taxid": taxid,
                "name": name.strip(),
            }
        )
    return rows
def read_humann_table(path: Path) -> tuple[list[str], list[str], Any]:
    """Read a tab-separated HUMAnN abundance table into a feature x sample matrix.

    HUMAnN writes its column header as a ``#``-prefixed line (for example
    ``# Pathway<TAB>sample_Abundance``). The last tab-containing comment line
    that precedes the data is therefore used as the header. When no such line
    exists, the first non-comment line is the header.

    Args:
        path: Table to read; decoded as UTF-8 with undecodable bytes replaced.

    Returns:
        ``(features, sample_names, matrix)`` where ``matrix`` is a float numpy
        array of shape ``(len(features), len(sample_names))``.

    Raises:
        RuntimeError: If numpy is unavailable.
        ValueError: If the table has no header, has no feature rows, or a row
            carries fewer values than the header declares.
    """
    if np is None:
        raise RuntimeError("numpy is required for HUMAnN visualizations")
    lines = [
        line
        for line in path.read_text(encoding="utf-8", errors="replace").splitlines()
        if line.strip()
    ]
    # Index of the first non-comment line (== len(lines) when there is none).
    first_plain = next(
        (index for index, line in enumerate(lines) if not line.startswith("#")),
        len(lines),
    )
    commented_headers = [line for line in lines[:first_plain] if "\t" in line]
    if commented_headers:
        header_line = commented_headers[-1]
        body = lines[first_plain:]
    elif first_plain < len(lines):
        header_line = lines[first_plain]
        body = lines[first_plain + 1 :]
    else:
        raise ValueError(f"HUMAnN table is empty: {path}")

    sample_names = header_line.split("\t")[1:]
    features: list[str] = []
    values: list[list[float]] = []
    for line in body:
        if line.startswith("#"):
            continue
        parts = line.split("\t")
        if len(parts) < 2:
            continue
        raw_values = parts[1 : len(sample_names) + 1]
        if len(raw_values) != len(sample_names):
            # Fail here with a clear message rather than let a ragged matrix
            # surface as an opaque numpy/zip error in the caller.
            raise ValueError(
                f"HUMAnN row {parts[0]!r} has {len(raw_values)} value(s) but the header "
                f"declares {len(sample_names)} sample column(s): {path}"
            )
        features.append(parts[0])
        values.append([parse_float(value) for value in raw_values])
    if not features:
        raise ValueError(f"HUMAnN table has no feature rows: {path}")
    # reshape pins the matrix to 2-D even when there are zero sample columns.
    matrix = np.asarray(values, dtype=float).reshape(len(features), len(sample_names))
    return features, sample_names, matrix
len(kraken_reports)) + ] + write_tsv( + kraken_table, + top_rows, + ["sample", "percent", "clade_reads", "taxon_reads", "rank", "taxid", "name"], + ) + samples = sorted({row["sample"] for row in top_rows}) + taxa = [ + taxon + for taxon, _ in sorted( + { + row["name"]: sum(r["percent"] for r in top_rows if r["name"] == row["name"]) + for row in top_rows + }.items(), + key=lambda item: item[1], + reverse=True, + )[: args.top_n_taxa] + ] + values_by_taxon = {taxon: [] for taxon in taxa} + for sample in samples: + total = sum(row["percent"] for row in top_rows if row["sample"] == sample) or 1.0 + for taxon in taxa: + values_by_taxon[taxon].append( + sum( + row["percent"] + for row in top_rows + if row["sample"] == sample and row["name"] == taxon + ) + / total + ) + kraken_plot = run_dir / "visualizations" / "kraken_top_taxa_barplot.png" + if save_stacked_barplot( + samples, + taxa, + values_by_taxon, + kraken_plot, + title="Kraken Top Taxa", + ylabel="Relative share of displayed taxa", + ): + entries.append( + artifact_entry( + artifact_id="kraken_top_taxa", + title="Kraken Top Taxa", + path=kraken_plot.relative_to(run_dir), + kind="plot", + status="created", + description="Stacked barplot from Kraken report files.", + ) + ) + entries.append( + artifact_entry( + artifact_id="kraken_top_taxa_table", + title="Kraken Top Taxa Table", + path=kraken_table.relative_to(run_dir), + kind="table", + status="created", + description="Parsed top Kraken report rows.", + ) + ) + else: + entries.append( + artifact_entry( + artifact_id="kraken_top_taxa", + title="Kraken Top Taxa", + path=None, + kind="plot", + status="not_available", + description="Provide --kraken-report or run Kraken2 with --kraken-db to generate taxonomic plots.", + ) + ) + + bracken_rows = [] + for path in [item.expanduser().resolve() for item in args.bracken_table]: + if path.exists(): + bracken_rows.extend(read_bracken_table(path)) + if bracken_rows: + bracken_table = run_dir / "tables" / 
"bracken_relative_abundance.tsv" + write_tsv(bracken_table, bracken_rows, ["sample", "name", "reads", "fraction"]) + samples = sorted({row["sample"] for row in bracken_rows}) + top_taxa = [ + taxon + for taxon, _ in sorted( + { + row["name"]: sum( + r["fraction"] or r["reads"] + for r in bracken_rows + if r["name"] == row["name"] + ) + for row in bracken_rows + }.items(), + key=lambda item: item[1], + reverse=True, + )[: args.top_n_taxa] + ] + matrix = [] + for taxon in top_taxa: + matrix.append( + [ + sum( + row["fraction"] or row["reads"] + for row in bracken_rows + if row["sample"] == sample and row["name"] == taxon + ) + for sample in samples + ] + ) + bracken_heatmap = run_dir / "visualizations" / "bracken_relative_abundance_heatmap.png" + if save_heatmap( + matrix, + top_taxa, + samples, + bracken_heatmap, + title="Bracken Relative Abundance", + colorbar_label="Fraction or reads", + ): + entries.append( + artifact_entry( + artifact_id="bracken_heatmap", + title="Bracken Relative Abundance Heatmap", + path=bracken_heatmap.relative_to(run_dir), + kind="plot", + status="created", + description="Top Bracken taxa across samples.", + ) + ) + else: + entries.append( + artifact_entry( + artifact_id="bracken_heatmap", + title="Bracken Relative Abundance Heatmap", + path=None, + kind="plot", + status="not_available", + description="Provide --bracken-table to generate Bracken relative-abundance plots.", + ) + ) + + humann_inputs = [ + ("pathway", args.humann_pathabundance, "--humann-pathabundance"), + ("gene_family", args.humann_genefamilies, "--humann-genefamilies"), + ] + for label, path, option_name in humann_inputs: + if not path: + entries.append( + artifact_entry( + artifact_id=f"humann_{label}_heatmap", + title=f"HUMAnN {label.replace('_', ' ').title()} Heatmap", + path=None, + kind="plot", + status="not_available", + description=f"Provide {option_name} to generate this HUMAnN visual layer.", + ) + ) + continue + resolved = path.expanduser().resolve() + if not 
def add_read_qc_visualizations(
    run_dir: Path, entries: list[dict[str, Any]], notes: list[str]
) -> None:
    """Add read-count and average-read-length bar plots from the seqkit stats table.

    Reads ``qc/seqkit_stats.tsv`` under ``run_dir``. When the file is missing
    or empty, a ``not_available`` placeholder entry is appended instead. When
    it exists but yields no rows, a note is appended and nothing else happens.

    Args:
        run_dir: Run directory holding ``qc/`` and ``visualizations/``.
        entries: Artifact entry list, extended in place.
        notes: Note list, extended in place.
    """
    stats_path = run_dir / "qc" / "seqkit_stats.tsv"
    stats_usable = stats_path.exists() and stats_path.stat().st_size != 0
    if not stats_usable:
        entries.append(
            artifact_entry(
                artifact_id="read_count_plot",
                title="Read Counts",
                path=None,
                kind="plot",
                status="not_available",
                description="Run with --execute and seqkit available to generate read-count plots.",
            )
        )
        return

    rows, columns = read_table(stats_path)
    if not rows:
        notes.append("seqkit stats file exists but no rows could be parsed.")
        return

    def pick_column(exact: str, matches: Any) -> str:
        # Prefer the exact seqkit column name; otherwise fall back to the
        # first column whose lowercased name satisfies the predicate.
        if exact in columns:
            return exact
        for candidate in columns:
            if matches(candidate.lower()):
                return candidate
        return ""

    file_col = "file" if "file" in columns else columns[0]
    count_col = pick_column("num_seqs", lambda lowered: "seq" in lowered)
    avg_len_col = pick_column("avg_len", lambda lowered: "avg" in lowered and "len" in lowered)

    labels = []
    for position, record in enumerate(rows, start=1):
        labels.append(Path(record.get(file_col, f"read_{position}")).name)

    plot_specs = (
        (
            count_col,
            "read_counts.png",
            "read_count_plot",
            "Read Counts",
            "Reads",
            "Read counts parsed from seqkit stats.",
        ),
        (
            avg_len_col,
            "average_read_lengths.png",
            "average_read_length_plot",
            "Average Read Lengths",
            "Bases",
            "Average read lengths parsed from seqkit stats.",
        ),
    )
    for column, filename, artifact_id, title, ylabel, description in plot_specs:
        if not column:
            continue
        target = run_dir / "visualizations" / filename
        heights = [parse_float(record.get(column, "")) for record in rows]
        if not save_barplot(labels, heights, target, title=title, ylabel=ylabel):
            continue
        entries.append(
            artifact_entry(
                artifact_id=artifact_id,
                title=title,
                path=target.relative_to(run_dir),
                kind="plot",
                status="created",
                description=description,
            )
        )
multiqc_report_exists = (run_dir / "fastqc" / "multiqc" / "multiqc_report.html").exists() + localhost_report = reachable_localhost_url_for_path("fastqc/multiqc/multiqc_report.html") + multiqc_helper = write_multiqc_browser_helper( + run_dir, + report_path="fastqc/multiqc/multiqc_report.html", + title="FastQC MultiQC Browser Helper", + ) + launch_hint = write_localhost_launch_hint( + run_dir, + report_entries=[("FastQC MultiQC", "fastqc/multiqc/multiqc_report.html")], + ) + + entries.append( + artifact_entry( + artifact_id="multiqc_localhost", + title="FastQC MultiQC Localhost URL", + path=localhost_report if localhost_report else None, + kind="localhost_app", + status="created" if localhost_report else "not_available", + description="Live review surface for the full interactive MultiQC report when the run directory is already being served over localhost.", + ) + ) + entries.append( + artifact_entry( + artifact_id="multiqc_browser_helper", + title="FastQC MultiQC Browser Helper", + path=str(multiqc_helper.relative_to(run_dir)) if multiqc_helper else None, + kind="html_report", + status="created" if multiqc_helper else "not_available", + description="Browser-safe MultiQC helper with embedded tables and localhost instructions for the full interactive report.", + ) + ) + entries.append( + artifact_entry( + artifact_id="localhost_launch_hint", + title="Localhost Launch Hint", + path=str(launch_hint.relative_to(run_dir)), + kind="text", + status="created", + description="Command and localhost URL for serving the run directory and opening the full MultiQC report.", + ) + ) + entries.append( + artifact_entry( + artifact_id="qc_verdict", + title="QC Verdict", + path="qc_verdict.json" if (run_dir / "qc_verdict.json").exists() else None, + kind="json", + status="created" if (run_dir / "qc_verdict.json").exists() else "not_available", + description="Machine-readable QC/readiness verdict with thresholds, reason codes, and next-step recommendations.", + ) + ) + entries.append( + 
artifact_entry( + artifact_id="qc_interpretation", + title="QC Interpretation", + path="qc_interpretation.json" + if (run_dir / "qc_interpretation.json").exists() + else None, + kind="json", + status="created" if (run_dir / "qc_interpretation.json").exists() else "not_available", + description="Lane-specific interpretation alias for user-facing review surfaces that expect a stable qc_interpretation.json path.", + ) + ) + add_read_qc_visualizations(run_dir, entries, notes) + if args.lane == "amplicon_microbiome": + add_amplicon_visualizations(run_dir, args, entries, notes, rows) + elif args.lane == "shotgun_metagenomics": + add_shotgun_visualizations(run_dir, args, entries, notes) + elif args.lane == "epigenomics_peaks": + entries.append( + artifact_entry( + artifact_id="peak_calling_readiness", + title="Peak Calling Readiness", + path="peak_calling_readiness.json" + if (run_dir / "peak_calling_readiness.json").exists() + else None, + kind="json", + status="created" + if (run_dir / "peak_calling_readiness.json").exists() + else "not_available", + description="FASTQ-stage readiness record for the alignment and peak-calling handoff.", + ) + ) + if validation.get("warnings"): + notes.extend(validation["warnings"]) + if validation.get("errors"): + notes.extend(validation["errors"]) + if interpretation and interpretation.get("warnings"): + notes.extend(interpretation["warnings"]) + notes = list(dict.fromkeys(notes)) + analysis_intent = "real_analysis" + provenance_summary: dict[str, Any] = { + "sample_sheet_resolved": input_provenance.get("sample_sheet", {}).get("resolved_path"), + "supplemental_input_count": sum( + len(value) if isinstance(value, list) else int(bool(value)) + for value in input_provenance.get("supplemental_inputs", {}).values() + ), + } + if args.lane == "amplicon_microbiome": + downstream_context = build_amplicon_downstream_context( + args, rows, asv_samples=read_asv_sample_columns(args.asv_table) + ) + provenance_summary.update( + { + 
"synthetic_downstream_inputs": downstream_context["synthetic_downstream_inputs"], + "synthetic_reasons": downstream_context["synthetic_reasons"], + "real_sample_count": downstream_context["real_sample_count"], + "asv_sample_count": downstream_context["asv_sample_count"], + } + ) + if provenance_summary.get("supplemental_input_count"): + notes.append( + "Supplemental taxonomy/function inputs were copied under inputs/ and checksummed in run_manifest audit." + ) + notes = list(dict.fromkeys(notes)) + index = write_visualization_index( + run_dir, + title=f"{LANES[args.lane]['display']} Visualizations", + description="Native artifact bundle generated by the Life Sciences NGS Analysis plugin for Codex review and handoff.", + entries=entries, + notes=notes, + analysis_intent=analysis_intent, + provenance_summary=provenance_summary, + ) + return { + "visualization_index": str(index.relative_to(run_dir)), + "visualization_manifest": "visualizations/visualization_manifest.json", + "localhost_launch_hint": str(launch_hint.relative_to(run_dir)), + "fastqc_multiqc_localhost": localhost_report if multiqc_report_exists else None, + } + + +def execute_package( + run_dir: Path, + args: argparse.Namespace, + fastq_paths: list[Path], + tool_status: dict[str, Any], + validation: dict[str, Any], +) -> dict[str, Any]: + results: dict[str, Any] = {"ok": True, "steps": []} + if not fastq_paths: + return results + (run_dir / "qc").mkdir(parents=True, exist_ok=True) + + seqkit = run_cmd(["seqkit", "stats", "-T", *map(str, fastq_paths)], run_dir, timeout=600) + write_json(run_dir / "logs" / "seqkit_stats.json", seqkit) + write_text(run_dir / "qc" / "seqkit_stats.tsv", seqkit.get("stdout_tail", "")) + results["steps"].append({"name": "seqkit_stats", "ok": seqkit.get("ok")}) + results["ok"] = bool(results["ok"] and seqkit.get("ok")) + + if command_path("fastqc") and command_path("multiqc"): + (run_dir / "fastqc" / "raw").mkdir(parents=True, exist_ok=True) + fastqc = run_cmd( + ["fastqc", "-t", 
str(args.threads), "-o", "fastqc/raw", *map(str, fastq_paths)], + run_dir, + timeout=3600, + ) + write_json(run_dir / "logs" / "fastqc.json", fastqc) + write_text(run_dir / "logs" / "fastqc.log", fastqc.get("stdout_tail", "")) + multiqc = run_cmd( + ["multiqc", "--no-version-check", "fastqc/raw", "-o", "fastqc/multiqc"], + run_dir, + timeout=600, + ) + write_json(run_dir / "logs" / "multiqc.json", multiqc) + write_text(run_dir / "logs" / "multiqc.log", multiqc.get("stdout_tail", "")) + results["steps"].extend( + [ + {"name": "fastqc", "ok": fastqc.get("ok")}, + {"name": "multiqc", "ok": multiqc.get("ok")}, + ] + ) + results["ok"] = bool(results["ok"] and fastqc.get("ok") and multiqc.get("ok")) + + if args.lane == "shotgun_metagenomics": + status = { + "analysis_intent": "real_analysis", + "requested_local_backend": bool(args.kraken_db), + "executed": False, + "reason": None, + "supplemental_reports_present": bool(args.kraken_report), + } + if args.kraken_db and command_path("kraken2"): + kraken_dir = run_dir / "taxonomic_classification" + kraken_dir.mkdir(parents=True, exist_ok=True) + for fastq in fastq_paths: + out = ( + kraken_dir + / f"{fastq.stem.replace('.fastq', '').replace('.fq', '')}.kraken2.txt" + ) + report = ( + kraken_dir / f"{fastq.stem.replace('.fastq', '').replace('.fq', '')}.report.txt" + ) + kraken = run_cmd( + ["kraken2", "--db", str(args.kraken_db), "--report", str(report), str(fastq)], + run_dir, + timeout=3600, + ) + write_text(out, kraken.get("stdout_tail", "")) + status["executed"] = bool(status["executed"] or kraken.get("ok")) + elif args.kraken_db: + status["reason"] = "kraken2 is not installed" + elif args.kraken_report: + status["reason"] = ( + "local Kraken2 classification was not run; supplied Kraken reports were used for downstream visualization." 
+ ) + else: + status["reason"] = "no Kraken2 database was provided" + write_json(run_dir / "taxonomic_classification_status.json", status) + + if args.lane == "epigenomics_peaks": + write_json( + run_dir / "peak_calling_readiness.json", + build_epigenomics_readiness(run_dir, args, validation), + ) + if args.lane == "amplicon_microbiome": + downstream_context = build_amplicon_downstream_context(args, []) + write_json( + run_dir / "amplicon_analysis_status.json", + { + "primer_trimming_ready": command_path("cutadapt") is not None, + "taxonomy_backend_required": True, + "synthetic_downstream_inputs_detected": downstream_context[ + "synthetic_downstream_inputs" + ], + "beta_diversity_allowed": downstream_context["beta_diversity_allowed"] + or args.allow_synthetic_diversity, + "backend_status_path": "workflow/amplicon_backend_status.json", + "note": "This package validates amplicon reads and summarizes read content; ASV/taxonomy assignment remains database/backend gated.", + }, + ) + return results + + +def write_summary( + run_dir: Path, + args: argparse.Namespace, + status: str, + validation: dict[str, Any], + interpretation: dict[str, Any] | None = None, +) -> None: + lines = [ + f"# {LANES[args.lane]['display']} Run Summary", + "", + f"Status: `{status}`", + f"Rows parsed: `{validation.get('row_count', 0)}`", + f"FASTQs parsed: `{validation.get('fastq_count', 0)}`", + "", + "## Key Artifacts", + "", + "- `validation/samples.normalized.tsv`", + "- `qc/seqkit_stats.tsv`", + "- `visualizations/localhost_launch_hint.txt` for the preferred localhost MultiQC link", + "- `fastqc/multiqc/multiqc_browser_helper.html` when FastQC/MultiQC execute", + "- `visualizations/index.html` and `visualizations/visualization_manifest.json`", + "- `methods/amplicon_methods.json` and `workflow/amplicon_backend_status.json` for amplicon provenance/handoff", + "- `tables/` for optional ASV, taxonomy, Kraken, Bracken, or HUMAnN-derived summaries", + "- lane-specific readiness/status JSON", + 
"- `run_manifest.json` and `artifact_index.json`", + "", + ] + if validation.get("warnings"): + lines.extend(["## Warnings", ""]) + lines.extend(f"- {warning}" for warning in validation["warnings"]) + lines.append("") + if interpretation: + lines.extend(["## Interpretation", ""]) + lines.append(f"- Verdict: `{interpretation.get('verdict', 'unknown')}`") + lines.append( + f"- Analysis readiness: `{interpretation.get('analysis_readiness', 'unknown')}`" + ) + for reason in interpretation.get("reason_codes", []): + lines.append(f"- Reason code: `{reason}`") + for recommendation in interpretation.get("recommendations", []): + lines.append(f"- Recommendation: {recommendation}") + for follow_on in interpretation.get("follow_on_commands", []): + lines.append( + f"- Follow-on command ({follow_on.get('id', 'next')}): `{follow_on.get('command', '')}`" + ) + lines.append("") + if args.lane == "epigenomics_peaks": + readiness_path = run_dir / "peak_calling_readiness.json" + if readiness_path.exists(): + readiness = json.loads(readiness_path.read_text(encoding="utf-8")) + lines.extend(["## Epigenomics Readiness", ""]) + lines.append(f"- Review surface OK: `{readiness.get('review_surface_ok')}`") + lines.append( + f"- Alignment handoff ready: `{readiness.get('ready_for_alignment_handoff')}`" + ) + for missing in readiness.get("missing_metadata", []): + lines.append(f"- Missing metadata: `{missing}`") + for item in readiness.get("checklist", []): + lines.append(f"- {item.get('id')}: `{item.get('status')}`") + lines.append("") + methods_path = run_dir / "methods" / "amplicon_methods.json" + if methods_path.exists(): + methods_payload = json.loads(methods_path.read_text(encoding="utf-8")) + if methods_payload.get("downstream_inputs", {}).get("synthetic_detected"): + lines.extend(["## Review Guardrail", ""]) + lines.append( + "- Downstream ASV/taxonomy inputs were flagged as synthetic or review-only; diversity plots are emitted for runner verification, not biological interpretation." 
def parse_args() -> argparse.Namespace:
    """Build the command-line parser and parse ``sys.argv``.

    Returns:
        The parsed namespace. ``kraken_rank`` is always a non-empty list: the
        ranks given on the command line, or ``["S"]`` when none were given.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--lane", choices=sorted(LANES), required=True)
    parser.add_argument("--sample-sheet", type=Path, required=True)
    parser.add_argument("--fastq-root", type=Path, action="append", default=[])
    parser.add_argument("--kraken-db", type=Path)
    parser.add_argument(
        "--asv-table",
        type=Path,
        help="Optional amplicon feature/ASV count table for diversity visualizations.",
    )
    parser.add_argument(
        "--taxonomy-table", type=Path, help="Optional feature taxonomy table paired to --asv-table."
    )
    parser.add_argument("--taxonomy-rank", default="genus", choices=["phylum", "genus", "species"])
    parser.add_argument(
        "--synthetic-downstream-inputs",
        action="store_true",
        help="Mark ASV/taxonomy inputs as synthetic or review-only so downstream interpretation is blocked or labeled.",
    )
    parser.add_argument(
        "--allow-synthetic-diversity",
        dest="allow_synthetic_diversity",
        action="store_true",
        help="Allow beta-diversity/PCoA even when downstream inputs are synthetic or review-only.",
    )
    parser.add_argument(
        "--primer-forward",
        default=None,
        help="Forward primer sequence for backend handoff and methods manifest.",
    )
    parser.add_argument(
        "--primer-reverse",
        default=None,
        help="Reverse primer sequence for backend handoff and methods manifest.",
    )
    parser.add_argument(
        "--primer-orientation",
        default=None,
        help="Primer orientation for backend handoff and methods manifest.",
    )
    parser.add_argument(
        "--merge-reads",
        default="auto",
        choices=["auto", "yes", "no"],
        help="Read-merging policy recorded in the methods manifest.",
    )
    parser.add_argument(
        "--trunc-len-f",
        type=int,
        default=None,
        help="Forward read truncation length for backend handoff.",
    )
    parser.add_argument(
        "--trunc-len-r",
        type=int,
        default=None,
        help="Reverse read truncation length for backend handoff.",
    )
    parser.add_argument(
        "--denoiser",
        default="dada2",
        choices=["dada2", "qiime2-dada2", "deblur"],
        help="Denoiser recorded in the methods manifest.",
    )
    parser.add_argument(
        "--taxonomy-database",
        default=None,
        help="Taxonomy database name for methods manifest and backend handoff.",
    )
    parser.add_argument(
        "--taxonomy-database-version",
        default=None,
        help="Taxonomy database version for methods manifest and backend handoff.",
    )
    parser.add_argument(
        "--normalization",
        default="relative_abundance",
        choices=["relative_abundance", "rarefy", "none"],
        help="Normalization policy recorded in the methods manifest.",
    )
    parser.add_argument(
        "--rarefaction-depth",
        type=int,
        default=None,
        help="Rarefaction depth recorded in the methods manifest.",
    )
    parser.add_argument(
        "--amplicon-backend",
        default="nf-core/ampliseq",
        choices=["nf-core/ampliseq", "qiime2", "dada2"],
        help="Backend workflow captured in the amplicon handoff bundle.",
    )
    parser.add_argument(
        "--kraken-report",
        type=Path,
        action="append",
        default=[],
        help="Optional Kraken report file; may be repeated.",
    )
    # With action="append", argparse appends command-line values to a
    # non-empty default, so default=["S"] would make "--kraken-rank G" yield
    # ["S", "G"]. Default to None and fill in "S" after parsing instead.
    parser.add_argument(
        "--kraken-rank",
        action="append",
        default=None,
        help="Kraken rank code to plot, e.g. S, G, P. May be repeated. Defaults to S when omitted.",
    )
    parser.add_argument(
        "--bracken-table",
        type=Path,
        action="append",
        default=[],
        help="Optional Bracken abundance table; may be repeated.",
    )
    parser.add_argument(
        "--humann-pathabundance", type=Path, help="Optional HUMAnN pathabundance table."
    )
    parser.add_argument(
        "--humann-genefamilies", type=Path, help="Optional HUMAnN genefamilies table."
    )
    parser.add_argument(
        "--top-n-taxa",
        type=int,
        default=12,
        help="Maximum taxa/features shown in stacked bars and heatmaps.",
    )
    parser.add_argument("--outdir", type=Path)
    parser.add_argument("--run-id", default=None)
    parser.add_argument("--threads", type=int, default=4)
    parser.add_argument("--execute", action="store_true")
    parser.add_argument("--fastq-record-check", type=int, default=200)
    args = parser.parse_args()
    if not args.kraken_rank:
        args.kraken_rank = ["S"]
    return args
"taxonomy_database": args.taxonomy_database, + "taxonomy_database_version": args.taxonomy_database_version, + "normalization": args.normalization, + "rarefaction_depth": args.rarefaction_depth, + "amplicon_backend": args.amplicon_backend, + "kraken_reports": [str(path.expanduser().resolve()) for path in args.kraken_report], + "bracken_tables": [str(path.expanduser().resolve()) for path in args.bracken_table], + "humann_pathabundance": str(args.humann_pathabundance.expanduser().resolve()) + if args.humann_pathabundance + else None, + "humann_genefamilies": str(args.humann_genefamilies.expanduser().resolve()) + if args.humann_genefamilies + else None, + }, + ) + write_json(run_dir / "validation" / "input_summary.json", {"samples": rows}) + write_json(run_dir / "validation" / "validation_summary.json", validation) + write_json(run_dir / "validation" / "tool_preflight.json", tool_status) + write_normalized_samples(run_dir, rows) + write_json(run_dir / "inputs" / "input_provenance.json", input_provenance) + write_commands(run_dir, args, fastq_paths, replay_sample_sheet) + write_json( + run_dir / "versions" / "software_versions.json", + software_versions( + { + "seqkit": ["seqkit", "version"], + "fastqc": ["fastqc", "--version"], + "multiqc": ["multiqc", "--version"], + "cutadapt": ["cutadapt", "--version"], + "macs2": ["macs2", "--version"], + "kraken2": ["kraken2", "--version"], + } + ), + ) + + dry_run = { + "ok": validation["ok"] and tool_status["ok"], + "detail": "input and tool validation completed", + } + write_json(run_dir / "logs" / "validation_dry_run.json", dry_run) + execution = None + interpretation: dict[str, Any] | None = None + methods_manifest: dict[str, Any] | None = None + methods_manifest_path: Path | None = None + backend_status: dict[str, Any] | None = None + if args.lane == "amplicon_microbiome": + downstream_context = build_amplicon_downstream_context( + args, rows, asv_samples=read_asv_sample_columns(args.asv_table) + ) + methods_manifest, 
methods_manifest_path = build_amplicon_methods_manifest( + run_dir, args, rows, downstream_context + ) + backend_status = write_amplicon_backend_bundle(run_dir, args, methods_manifest_path) + status = "blocked" if not dry_run["ok"] else "validated" + if args.execute and dry_run["ok"]: + execution = execute_package(run_dir, args, fastq_paths, tool_status, validation) + status = "completed" if execution.get("ok") else "failed" + if ( + args.lane == "amplicon_microbiome" + and (run_dir / "amplicon_analysis_status.json").exists() + ): + status_payload = json.loads( + (run_dir / "amplicon_analysis_status.json").read_text(encoding="utf-8") + ) + status_payload.update( + { + "synthetic_downstream_inputs_detected": bool( + methods_manifest + and methods_manifest.get("downstream_inputs", {}).get("synthetic_detected") + ), + "beta_diversity_allowed": bool( + methods_manifest + and methods_manifest.get("downstream_inputs", {}).get( + "beta_diversity_allowed" + ) + ) + or args.allow_synthetic_diversity, + "backend_ready_to_run": bool( + backend_status and backend_status.get("ready_to_run") + ), + } + ) + write_json(run_dir / "amplicon_analysis_status.json", status_payload) + if execution.get("ok"): + interpretation = build_fastq_assay_qc_verdict(run_dir, args, validation) + write_json(run_dir / "qc_verdict.json", interpretation) + if args.lane in {"shotgun_metagenomics", "amplicon_microbiome", "epigenomics_peaks"}: + write_json(run_dir / "qc_interpretation.json", interpretation) + if args.lane == "epigenomics_peaks": + write_json( + run_dir / "peak_calling_readiness.json", + build_epigenomics_readiness(run_dir, args, validation, interpretation), + ) + if args.lane == "amplicon_microbiome": + status_payload = json.loads( + (run_dir / "amplicon_analysis_status.json").read_text(encoding="utf-8") + ) + status_payload.update( + { + "analysis_readiness": interpretation.get("analysis_readiness"), + "verdict": interpretation.get("verdict"), + "missing_analysis_context": [ + code + for 
code in interpretation.get("reason_codes", []) + if code.endswith("_missing") + ], + "follow_on_commands": interpretation.get("follow_on_commands", []), + "synthetic_downstream_inputs_detected": bool( + methods_manifest + and methods_manifest.get("downstream_inputs", {}).get( + "synthetic_detected" + ) + ), + "beta_diversity_allowed": bool( + methods_manifest + and methods_manifest.get("downstream_inputs", {}).get( + "beta_diversity_allowed" + ) + ) + or args.allow_synthetic_diversity, + "backend_ready_to_run": bool( + backend_status and backend_status.get("ready_to_run") + ), + } + ) + write_json(run_dir / "amplicon_analysis_status.json", status_payload) + + visualization_outputs = generate_visualizations( + run_dir, args, validation, rows, input_provenance, interpretation + ) + if args.lane == "epigenomics_peaks" and args.execute and dry_run["ok"]: + write_json( + run_dir / "peak_calling_readiness.json", + build_epigenomics_readiness(run_dir, args, validation, interpretation), + ) + review_bundle = {**visualization_outputs} + outputs = { + "sample_table": "validation/samples.normalized.tsv", + "seqkit_stats": "qc/seqkit_stats.tsv", + "fastqc_multiqc_helper": "fastqc/multiqc/multiqc_browser_helper.html", + **visualization_outputs, + } + if methods_manifest_path: + outputs["amplicon_methods_manifest"] = str(methods_manifest_path.relative_to(run_dir)) + outputs["amplicon_backend_status"] = "workflow/amplicon_backend_status.json" + outputs["amplicon_backend_plan"] = "workflow/amplicon_backend_plan.json" + outputs["amplicon_backend_command"] = "workflow/amplicon_backend_command.sh" + if interpretation: + outputs["qc_verdict"] = "qc_verdict.json" + if args.lane in {"shotgun_metagenomics", "amplicon_microbiome", "epigenomics_peaks"}: + outputs["qc_interpretation"] = "qc_interpretation.json" + review_bundle["verdict"] = interpretation.get("verdict") + helper_path = run_dir / outputs["fastqc_multiqc_helper"] + review_bundle["review_surface_ok"] = helper_path.exists() + 
review_bundle["preferred_review_surface"] = ( + outputs["fastqc_multiqc_localhost"] + if helper_path.exists() + else outputs.get("visualization_index") + ) + write_standard_manifest( + run_dir, + run_id=run_id, + lane=args.lane, + analysis_intent="real_analysis", + workflow="local_light_fastq_assay_package", + status=status, + execute_requested=args.execute, + validation=validation, + tool_preflight_result=tool_status, + dry_run=dry_run, + execution=execution, + inputs={ + "sample_sheet": str(args.sample_sheet.expanduser().resolve()), + "resolved_sample_sheet": str(replay_sample_sheet), + "input_provenance_path": "inputs/input_provenance.json", + "kraken_reports": [str(path) for path in args.kraken_report], + "bracken_tables": [str(path) for path in args.bracken_table], + "humann_pathabundance": str(args.humann_pathabundance) + if args.humann_pathabundance + else None, + "humann_genefamilies": str(args.humann_genefamilies) + if args.humann_genefamilies + else None, + }, + outputs=outputs, + method=methods_manifest + if methods_manifest + else { + "package": LANES[args.lane]["display"], + "taxonomy_database": args.taxonomy_database + if args.lane == "amplicon_microbiome" + else (str(args.kraken_db) if args.kraken_db else None), + }, + audit={ + "resolved_executables": tool_status.get("checked", []), + "software_versions_path": "versions/software_versions.json", + "review_only": False, + "backend_status": backend_status, + "input_provenance_path": "inputs/input_provenance.json", + }, + review_bundle=review_bundle, + ) + write_summary(run_dir, args, status, validation, interpretation) + write_json(run_dir / "artifact_index.json", build_artifact_index(run_dir)) + print(run_dir) + return 1 if status in {"blocked", "failed"} else 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/plugins/ngs-analysis/scripts/run_fastq_qc.py b/plugins/ngs-analysis/scripts/run_fastq_qc.py new file mode 100755 index 0000000..86be9ae --- /dev/null +++ 
def module_present(name: str) -> bool:
    """Return True when module *name* can be located by the import system.

    Args:
        name: Absolute module name, e.g. ``"snakemake"``.

    Returns:
        ``True`` if ``importlib.util.find_spec`` finds a spec, else ``False``.
        Lookup failures are reported as ``False`` rather than raised, so the
        function is safe to use as a pure presence probe.
    """
    try:
        return importlib.util.find_spec(name) is not None
    except (ImportError, ValueError):
        # find_spec imports the parent package of a dotted name and raises
        # ModuleNotFoundError when that parent is missing; it raises ValueError
        # when an already-imported module has ``__spec__`` set to None.
        return False
def normalize_read_name(header: str) -> str:
    """Reduce a FASTQ header line to a read name comparable across mates.

    The leading ``@`` is dropped, only the first whitespace-delimited token is
    kept, and a trailing ``/1`` or ``/2`` mate suffix is removed.

    Args:
        header: Raw header line, with or without the trailing newline.

    Returns:
        The normalized read name, or ``""`` when the header carries no name
        (blank line or a lone ``@``).
    """
    name = header.strip()
    if name.startswith("@"):
        name = name[1:]
    tokens = name.split()
    if not tokens:
        # A blank or "@"-only header has no name token; indexing [0] here
        # would raise IndexError instead of letting validation report it.
        return ""
    name = tokens[0]
    if name.endswith(("/1", "/2")):
        name = name[:-2]
    return name
def parse_samples(args: argparse.Namespace) -> tuple[list[dict[str, Any]], list[str]]:
    """Build the sample list from ``--sample-sheet`` or from ``--r1``/``--r2``.

    Args:
        args: Parsed options; reads ``sample_sheet`` and, when no sheet is
            given, ``r1``, ``r2`` and ``sample``.

    Returns:
        ``(samples, errors)``. Each sample is a dict with the keys ``sample``,
        ``original_sample``, ``r1``, ``r2`` and ``layout``. A missing sheet or
        missing required columns returns ``([], [message])`` immediately;
        row-level problems are collected in ``errors`` and the row is skipped.
    """
    errors: list[str] = []
    samples: list[dict[str, Any]] = []
    if args.sample_sheet:
        sheet = Path(args.sample_sheet).expanduser().resolve()
        if not sheet.exists():
            return [], [f"sample sheet does not exist: {sheet}"]
        sample_counts: dict[str, int] = {}
        # utf-8-sig drops a leading byte-order mark (common in spreadsheet CSV
        # exports). Read as plain utf-8, the BOM sticks to the first header
        # name and the sample column is not recognised.
        with sheet.open(newline="", encoding="utf-8-sig") as handle:
            reader = csv.DictReader(handle)
            columns = set(reader.fieldnames or [])
            sample_col = (
                "sample" if "sample" in columns else "sample_id" if "sample_id" in columns else None
            )
            r1_col = "fastq_1" if "fastq_1" in columns else "r1" if "r1" in columns else None
            r2_col = "fastq_2" if "fastq_2" in columns else "r2" if "r2" in columns else None
            if not sample_col or not r1_col:
                return [], ["sample sheet must include sample/sample_id and fastq_1/r1 columns"]
            # start=2: row 1 of the file is the header.
            for index, row in enumerate(reader, start=2):
                sample = (row.get(sample_col) or "").strip()
                r1 = (row.get(r1_col) or "").strip()
                r2 = (row.get(r2_col) or "").strip() if r2_col else ""
                if not sample or not r1:
                    errors.append(f"row {index}: sample and fastq_1 are required")
                    continue
                if is_remote_uri(r1) or (r2 and is_remote_uri(r2)):
                    errors.append(
                        f"row {index}: remote FASTQ URLs are not supported by local execution; download or stage files first"
                    )
                    continue
                sample_counts[sample] = sample_counts.get(sample, 0) + 1
                # Repeated sample names become distinct units tagged by row.
                unit = sample if sample_counts[sample] == 1 else f"{sample}__row{index}"
                samples.append(
                    {
                        "sample": unit,
                        "original_sample": sample,
                        # Relative FASTQ paths are resolved against the sheet's directory.
                        "r1": str(resolve_path(r1, sheet.parent)),
                        "r2": str(resolve_path(r2, sheet.parent)) if r2 else None,
                        "layout": "paired" if r2 else "single",
                    }
                )
    elif args.r1:
        sample = args.sample or Path(args.r1).name.split(".")[0]
        samples.append(
            {
                "sample": sample,
                "original_sample": sample,
                "r1": str(Path(args.r1).expanduser().resolve()),
                "r2": str(Path(args.r2).expanduser().resolve()) if args.r2 else None,
                "layout": "paired" if args.r2 else "single",
            }
        )
    else:
        errors.append("provide --sample-sheet or --r1")

    for sample in samples:
        name = sample["sample"]
        if not SAMPLE_RE.match(name):
            errors.append(f"sample name {name!r} must match {SAMPLE_RE.pattern}")
    return samples, errors
def tool_preflight(trim_mode: str) -> dict[str, Any]:
    """Report which command-line tools the run needs and which are missing.

    Args:
        trim_mode: Trimming mode; ``"fastp"`` and ``"cutadapt"`` each add the
            tool of the same name to the required set.

    Returns:
        A dict with ``created_at``, ``required``, ``tools`` (name mapped to
        the resolved command or ``None``; also probes ``seqkit``, which is not
        required), ``missing`` and ``ok``.
    """
    required = ["snakemake", "fastqc", "multiqc"]
    if trim_mode in ("fastp", "cutadapt"):
        required.append(trim_mode)

    tools: dict[str, str | None] = {}
    for tool_name in sorted({*required, "seqkit"}):
        tools[tool_name] = shell_tool_command(tool_name)

    missing = [tool_name for tool_name in required if not tools.get(tool_name)]
    report: dict[str, Any] = {
        "created_at": now_iso(),
        "required": required,
        "tools": tools,
        "missing": missing,
        "ok": not missing,
    }
    return report
html="trimmed/{{sample}}/{{sample}}.fastp.html", + json="trimmed/{{sample}}/{{sample}}.fastp.json", + threads: THREADS + shell: + "mkdir -p trimmed/{{wildcards.sample}} && " + "fastp -i {{input.r1}} -I {{input.r2}} -o {{output.r1}} -O {{output.r2}} " + "--html {{output.html}} --json {{output.json}} --thread {{threads}}" + + +rule fastp_single: + input: + r1=lambda wildcards: SAMPLES[wildcards.sample]["r1"], + output: + r1="trimmed/{{sample}}/{{sample}}.fastq.gz", + html="trimmed/{{sample}}/{{sample}}.fastp.html", + json="trimmed/{{sample}}/{{sample}}.fastp.json", + threads: THREADS + shell: + "mkdir -p trimmed/{{wildcards.sample}} && " + "fastp -i {{input.r1}} -o {{output.r1}} --html {{output.html}} " + "--json {{output.json}} --thread {{threads}}" +""" + elif trim_mode == "cutadapt": + trim_rules = """ + + +rule cutadapt_paired: + input: + r1=lambda wildcards: SAMPLES[wildcards.sample]["r1"], + r2=lambda wildcards: SAMPLES[wildcards.sample]["r2"], + output: + r1="trimmed/{{sample}}/{{sample}}_R1.fastq.gz", + r2="trimmed/{{sample}}/{{sample}}_R2.fastq.gz", + log="trimmed/{{sample}}/{{sample}}.cutadapt.log", + params: + a=lambda wildcards: config.get("adapter_r1", ""), + A=lambda wildcards: config.get("adapter_r2", ""), + shell: + "mkdir -p trimmed/{{wildcards.sample}} && " + "{{CUTADAPT}} -a {{params.a}} -A {{params.A}} -o {{output.r1}} -p {{output.r2}} " + "{{input.r1}} {{input.r2}} > {{output.log}}" + + +rule cutadapt_single: + input: + r1=lambda wildcards: SAMPLES[wildcards.sample]["r1"], + output: + r1="trimmed/{{sample}}/{{sample}}.fastq.gz", + log="trimmed/{{sample}}/{{sample}}.cutadapt.log", + params: + a=lambda wildcards: config.get("adapter_r1", ""), + shell: + "mkdir -p trimmed/{{wildcards.sample}} && " + "{{CUTADAPT}} -a {{params.a}} -o {{output.r1}} {{input.r1}} > {{output.log}}" +""" + else: + raise ValueError(f"unsupported trim mode: {trim_mode}") + + trim_qc_rules = """ + + +rule fastqc_trimmed_paired: + input: + 
def write_commands(run_dir: Path, cores: int) -> None:
    """Write an executable ``commands.sh`` that replays the Snakemake run.

    The script changes into ``run_dir``, then runs the Snakemake dry run
    followed by the real run, both built by ``snakemake_cmd``.

    Args:
        run_dir: Run directory; the script is written to ``run_dir/commands.sh``.
        cores: Core count passed through to ``snakemake_cmd``.
    """
    commands = [
        "#!/usr/bin/env bash",
        "set -euo pipefail",
        # shlex.quote is real shell quoting. json.dumps escapes non-ASCII path
        # characters to \uXXXX and leaves $ and backticks live inside the
        # double quotes, so `cd` could target the wrong directory or fail.
        f"cd {shlex.quote(str(run_dir))}",
        shlex.join(snakemake_cmd(run_dir, cores, dry_run=True)),
        shlex.join(snakemake_cmd(run_dir, cores, dry_run=False)),
        "",
    ]
    path = run_dir / "commands.sh"
    path.write_text("\n".join(commands), encoding="utf-8")
    path.chmod(0o755)
"workflow/Snakefile", + "--configfile", + "config.json", + "--cores", + str(cores), + "--shared-fs-usage", + "input-output", + "persistence", + "software-deployment", + "software-deployment-cache", + "sources", + "storage-local-copies", + ] + if dry_run: + cmd.append("--dry-run") + return cmd + + +def _safe_float(value: str | None) -> float | None: + if value is None: + return None + try: + return float(value) + except (TypeError, ValueError): + return None + + +def _safe_int(value: str | None) -> int | None: + if value is None: + return None + try: + return int(float(value)) + except (TypeError, ValueError): + return None + + +def _poly_n_fraction(sequence: str) -> float: + if not sequence: + return 0.0 + n_count = sum(1 for base in sequence.upper() if base == "N") + return n_count / len(sequence) + + +def _extract_fastqc_metrics(data_text: str) -> tuple[dict[str, str], dict[str, Any]]: + sections: dict[str, list[str]] = {} + current_section: str | None = None + for line in data_text.splitlines(): + if line.startswith(">>END_MODULE"): + current_section = None + continue + if line.startswith(">>"): + parts = line[2:].split("\t") + current_section = parts[0] + sections[current_section] = [] + continue + if current_section is not None: + sections[current_section].append(line) + + basic_stats: dict[str, str] = {} + metrics: dict[str, Any] = { + "total_sequences": None, + "sequence_length": None, + "gc_percent": None, + "deduplicated_percent": None, + "duplicate_percent": None, + "overrepresented_top_percent": 0.0, + "overrepresented_adapter_like_percent": 0.0, + "overrepresented_no_hit_percent": 0.0, + "overrepresented_poly_n_percent": 0.0, + "adapter_content_max_percent": 0.0, + } + + for line in sections.get("Basic Statistics", []): + if not line or line.startswith("#"): + continue + key, value = line.split("\t", 1) + basic_stats[key] = value + metrics["total_sequences"] = _safe_int(basic_stats.get("Total Sequences")) + metrics["sequence_length"] = 
def fastqc_zip_summaries(root: Path) -> dict[str, Any]:
    """Summarize every ``*_fastqc.zip`` archive directly under *root*.

    For each archive the module statuses are read from ``summary.txt`` and the
    metrics from ``fastqc_data.txt`` (via ``_extract_fastqc_metrics``).

    Args:
        root: Directory searched (non-recursively) for ``*_fastqc.zip``.

    Returns:
        A dict keyed by archive file name; each value has ``modules``,
        ``basic_statistics`` and ``metrics``. Parsing is best-effort: any
        failure is recorded as an extra ``modules`` entry with status
        ``"ERROR"`` and a non-empty ``detail``, and whatever was parsed before
        the failure is kept.
    """
    summaries: dict[str, Any] = {}
    for zip_path in sorted(root.glob("*_fastqc.zip")):
        modules: list[dict[str, str]] = []
        basic_stats: dict[str, str] = {}
        metrics: dict[str, Any] = {}
        try:
            with zipfile.ZipFile(zip_path) as archive:
                members = archive.namelist()
                summary_name = next(
                    (name for name in members if name.endswith("/summary.txt")), None
                )
                if summary_name is None:
                    # Raise with a message: a bare next() would surface as
                    # StopIteration, whose str() is empty.
                    raise FileNotFoundError("summary.txt not found in FastQC archive")
                for line in (
                    archive.read(summary_name).decode("utf-8", errors="replace").splitlines()
                ):
                    if not line.strip():
                        # A blank line is not a module row.
                        continue
                    status, module, filename = line.split("\t")[:3]
                    modules.append({"status": status, "module": module, "file": filename})
                data_name = next(
                    (name for name in members if name.endswith("/fastqc_data.txt")), None
                )
                if data_name is None:
                    raise FileNotFoundError("fastqc_data.txt not found in FastQC archive")
                data_text = archive.read(data_name).decode("utf-8", errors="replace")
                basic_stats, metrics = _extract_fastqc_metrics(data_text)
        except Exception as exc:  # noqa: BLE001 - deliberate best-effort per archive
            modules.append(
                {
                    "status": "ERROR",
                    "module": "FastQC zip parsing",
                    # Fall back to the exception type when the message is empty.
                    "detail": str(exc) or type(exc).__name__,
                }
            )
        summaries[zip_path.name] = {
            "modules": modules,
            "basic_statistics": basic_stats,
            "metrics": metrics,
        }
    return summaries
+ ) + if adapter_issue: + recommendation_reasons.append( + "Adapter-like signal reached an actionable fraction in raw FastQC metrics." + ) + if poly_n_issue: + recommendation_reasons.append( + "Poly-N overrepresented sequences reached an actionable fraction in raw FastQC metrics." + ) + + if not recommendation_reasons: + recommendation = "no_trimming_by_default" + recommendation_reasons.append( + "Raw FastQC metrics did not show trimming-level adapter or quality degradation." + ) + elif quality_issue: + recommendation = "quality_trim_or_investigate" + elif adapter_issue: + recommendation = "trim_adapters_or_primers" + else: + recommendation = "investigate_low_complexity_or_n_content" + + duplication = module_statuses.get("Sequence Duplication Levels", {}) + if duplication.get("WARN") or duplication.get("FAIL"): + if assay_type in {"rna_seq", "amplicon", "small_rna"}: + context_warnings.append( + f"Sequence duplication is elevated, but that can be expected for {assay_type.replace('_', '-')} libraries and is not sufficient on its own to trigger trimming." + ) + else: + context_warnings.append( + "Sequence duplication is elevated; interpret it as a library-complexity signal before filtering." + ) + base_content = module_statuses.get("Per base sequence content", {}) + if base_content.get("WARN") or base_content.get("FAIL"): + if assay_type in {"rna_seq", "amplicon", "small_rna"}: + context_warnings.append( + f"Per-base sequence content is WARN/FAIL, which often reflects composition bias in {assay_type.replace('_', '-')} rather than removable adapter sequence." + ) + else: + context_warnings.append( + "Per-base sequence content is WARN/FAIL and should be interpreted in assay context rather than treated as an automatic trimming trigger." 
def write_summary(
    run_dir: Path,
    status: str,
    interpretation: dict[str, Any] | None,
    review_outputs: dict[str, str | None],
) -> None:
    """Write the human-readable ``summary.md`` for a FASTQ QC run.

    Args:
        run_dir: Run directory; ``summary.md`` is written directly inside it.
        status: Run status string, echoed verbatim in the summary header.
        interpretation: QC interpretation mapping, or ``None``/empty when no
            interpretation was produced. When truthy it must contain a
            ``"recommendation"`` key; ``"recommendation_reasons"`` and
            ``"context_warnings"`` are optional.
        review_outputs: Mapping of review-artifact keys to a run-relative
            path, an http(s) URL, or ``None``.

    Only artifacts that are URLs or that already exist under *run_dir* at
    call time are listed in the "Key Artifacts" section.
    """
    report = ["# FASTQ QC Run Summary", "", f"Status: `{status}`", ""]

    if interpretation:
        report += [
            f"Recommendation: `{interpretation['recommendation']}`",
            "",
            "## Reasons",
            "",
        ]
        reasons = interpretation.get("recommendation_reasons")
        if not reasons:
            # Fall back to a fixed sentence so the section is never empty.
            reasons = ["No trimming signal was detected in raw FastQC summaries."]
        report += [f"- {reason}" for reason in reasons]
        report.append("")

        context_notes = interpretation.get("context_warnings")
        if context_notes:
            report += ["## Context Warnings", ""]
            report += [f"- {note}" for note in context_notes]
            report.append("")

    report += ["## Key Artifacts", ""]

    # Candidate order defines the order of bullets in the rendered summary.
    review_keys = (
        "visualization_index",
        "multiqc_raw_helper",
        "localhost_launch_hint",
        "multiqc_raw_localhost",
        "multiqc_trimmed_helper",
        "multiqc_trimmed_localhost",
    )
    candidates = [review_outputs.get(key) for key in review_keys]
    candidates.append("qc_interpretation.json" if interpretation else None)
    candidates += [
        "validation/input_summary.json",
        "validation/validation_summary.json",
        "artifact_index.json",
    ]

    for candidate in candidates:
        if not candidate:
            continue
        # URLs are listed unconditionally; paths only when present on disk.
        is_url = candidate.startswith(("http://", "https://"))
        if is_url or (run_dir / candidate).exists():
            report.append(f"- `{candidate}`")

    report += ["", "Raw FASTQs were read-only inputs and were not modified.", ""]
    (run_dir / "summary.md").write_text("\n".join(report), encoding="utf-8")
helper_specs.extend( + [ + ( + "multiqc_trimmed_helper", + "Trimmed MultiQC Browser Helper", + "multiqc/trimmed/multiqc_browser_helper.html", + "Browser-safe view over the trimmed FastQC MultiQC report.", + ), + ] + ) + launch_hint = write_localhost_launch_hint(run_dir, report_entries=localhost_reports) + for label, rel_path in localhost_reports: + path = run_dir / rel_path + path_parts = Path(rel_path).parts + link_key = path_parts[1] if len(path_parts) > 1 else Path(rel_path).stem + live_url = reachable_localhost_url_for_path(rel_path) + entries.append( + artifact_entry( + artifact_id=f"{link_key}_localhost", + title=f"{label} Localhost URL", + path=live_url, + kind="localhost_app", + status="created" if live_url else "not_available", + description=f"Live localhost review URL for the full {label.lower()} interactive report when the run directory is already being served.", + ) + ) + for artifact_id, title, rel_path, description in helper_specs: + path = run_dir / rel_path + entries.append( + artifact_entry( + artifact_id=artifact_id, + title=title, + path=rel_path if path.exists() else None, + kind="html_report", + status="created" if path.exists() else "not_available", + description=description, + ) + ) + entries.append( + artifact_entry( + artifact_id="localhost_launch_hint", + title="Localhost Launch Hint", + path=str(launch_hint.relative_to(run_dir)), + kind="text", + status="created", + description="Command and localhost URLs for serving the run directory and opening the full MultiQC reports.", + ) + ) + for zip_name in sorted((interpretation or {}).get("raw_fastqc_files", {})): + sample_prefix = zip_name.replace("_fastqc.zip", "") + for suffix, kind in [("_fastqc.html", "html_report"), ("_fastqc.zip", "archive")]: + rel_path = f"fastqc/raw/{sample_prefix}{suffix}" + path = run_dir / rel_path + entries.append( + artifact_entry( + artifact_id=f"{sample_prefix}{suffix}".replace(".", "_"), + title=f"{sample_prefix} {suffix.removeprefix('_').replace('_', ' ')}", 
+ path=rel_path if path.exists() else None, + kind=kind, + status="created" if path.exists() else "not_available", + description=f"Raw FastQC {suffix.removeprefix('_')} for {sample_prefix}.", + ) + ) + for artifact_id, title, rel_path, kind, description in [ + ( + "qc_interpretation", + "QC Interpretation JSON", + "qc_interpretation.json", + "json", + "Machine-readable QC interpretation with recommendation, metrics, and context warnings.", + ), + ( + "summary", + "Run Summary", + "summary.md", + "markdown", + "Concise run summary for human review.", + ), + ( + "manifest", + "Run Manifest", + "run_manifest.json", + "json", + "Run-level manifest with inputs, outputs, and execution status.", + ), + ( + "commands", + "Execution Commands", + "commands.sh", + "text", + "Recorded validation and execute commands for reproducibility.", + ), + ]: + path = run_dir / rel_path + entries.append( + artifact_entry( + artifact_id=artifact_id, + title=title, + path=rel_path if path.exists() else None, + kind=kind, + status="created" if path.exists() else "not_available", + description=description, + ) + ) + index = write_visualization_index( + run_dir, + title="FASTQ QC Review Bundle", + description="Human-readable review surface for raw FASTQ QC, with links to the key reports, command envelope, and machine-readable interpretation.", + entries=entries, + notes=notes, + ) + return { + "visualization_index": str(index.relative_to(run_dir)), + "visualization_manifest": "visualizations/visualization_manifest.json", + "multiqc_raw_helper": "multiqc/raw/multiqc_browser_helper.html" + if (run_dir / "multiqc/raw/multiqc_browser_helper.html").exists() + else None, + "localhost_launch_hint": str(launch_hint.relative_to(run_dir)), + "multiqc_raw_localhost": reachable_localhost_url_for_path("multiqc/raw/multiqc_report.html") + if (run_dir / "multiqc/raw/multiqc_report.html").exists() + else None, + "multiqc_trimmed_helper": "multiqc/trimmed/multiqc_browser_helper.html" + if (run_dir / 
"multiqc/trimmed/multiqc_browser_helper.html").exists() + else None, + "multiqc_trimmed_localhost": reachable_localhost_url_for_path( + "multiqc/trimmed/multiqc_report.html" + ) + if (run_dir / "multiqc/trimmed/multiqc_report.html").exists() + else None, + } + + +def build_config(samples: list[dict[str, Any]], args: argparse.Namespace) -> dict[str, Any]: + return { + "samples": { + sample["sample"]: { + "original_sample": sample.get("original_sample", sample["sample"]), + "r1": sample["r1"], + "r2": sample.get("r2"), + "layout": sample["layout"], + } + for sample in samples + }, + "threads": args.threads, + "assay_type": args.assay_type, + "trim_mode": args.trim_mode, + "adapter_r1": args.adapter_r1 or "", + "adapter_r2": args.adapter_r2 or args.adapter_r1 or "", + "commands": { + "multiqc": shell_tool_command("multiqc") or "multiqc", + "cutadapt": shell_tool_command("cutadapt") or "cutadapt", + }, + } + + +def validate_trim_args(args: argparse.Namespace, samples: list[dict[str, Any]]) -> list[str]: + errors: list[str] = [] + if args.trim_mode == "cutadapt" and not args.adapter_r1: + errors.append("--trim-mode cutadapt requires --adapter-r1") + if args.trim_mode != "none" and not samples: + errors.append("trimming requested but no samples were parsed") + return errors + + +def write_manifest( + run_dir: Path, + run_id: str, + args: argparse.Namespace, + status: str, + config: dict[str, Any], + validation: dict[str, Any], + tool_status: dict[str, Any], + dry_run: dict[str, Any] | None, + execution: dict[str, Any] | None, + interpretation: dict[str, Any] | None, + review_outputs: dict[str, str | None], +) -> None: + manifest = { + "run_id": run_id, + "created_at": now_iso(), + "status": status, + "workflow": "fastq_qc_local_snakemake", + "run_dir": str(run_dir), + "execute_requested": args.execute, + "dry_run_performed": dry_run is not None, + "assay_type": args.assay_type, + "trim_mode": args.trim_mode, + "sample_count": len(config["samples"]), + "samples": 
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the FASTQ QC runner.

    Options are declared as a table of ``(flag, add_argument kwargs)`` pairs
    and registered in table order, which is also the order shown by
    ``--help``.

    Returns:
        The namespace produced by ``argparse`` for ``sys.argv``.
    """
    option_specs: list[tuple[str, dict[str, Any]]] = [
        (
            "--sample-sheet",
            {"help": "CSV with sample, fastq_1, and optional fastq_2 columns."},
        ),
        ("--sample", {"help": "Sample name for --r1/--r2 single-sample mode."}),
        ("--r1", {"help": "FASTQ R1 or single-end FASTQ for single-sample mode."}),
        ("--r2", {"help": "FASTQ R2 for single-sample paired mode."}),
        (
            "--outdir",
            {"type": Path, "help": "Run directory. Defaults to ngs_runs/fastq_qc/."},
        ),
        ("--threads", {"type": int, "default": 4}),
        # The default run id is computed once, when the parser is built.
        ("--run-id", {"default": slug_timestamp()}),
        (
            "--execute",
            {
                "action": "store_true",
                "help": "Run Snakemake after validation and workflow validation.",
            },
        ),
        (
            "--no-dry-run",
            {"action": "store_true", "help": "Skip Snakemake workflow validation."},
        ),
        (
            "--quick-validation",
            {
                "action": "store_true",
                "help": "Check only the first N records per FASTQ.",
            },
        ),
        ("--pair-check-reads", {"type": int, "default": 10000}),
        ("--assay-type", {"choices": list(FASTQ_ASSAY_CHOICES), "default": "generic"}),
        ("--trim-mode", {"choices": ["none", "fastp", "cutadapt"], "default": "none"}),
        ("--adapter-r1", {}),
        ("--adapter-r2", {}),
    ]

    cli = argparse.ArgumentParser(description=__doc__)
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    return cli.parse_args()
write_json(run_dir / "logs" / "snakemake_dry_run.json", dry_run) + (run_dir / "logs" / "snakemake_dry_run.log").write_text( + dry_run.get("stdout_tail", ""), encoding="utf-8" + ) + if not dry_run.get("ok"): + status = "failed" + blocked = True + if args.execute and not blocked: + execution = run_cmd( + snakemake_cmd(run_dir, args.threads, dry_run=False), run_dir, timeout=86400 + ) + write_json(run_dir / "logs" / "snakemake_execute.json", execution) + (run_dir / "logs" / "snakemake_execute.log").write_text( + execution.get("stdout_tail", ""), encoding="utf-8" + ) + status = "completed" if execution.get("ok") else "failed" + if execution.get("ok"): + raw_summaries = fastqc_zip_summaries(run_dir / "fastqc" / "raw") + trimmed_summaries = ( + fastqc_zip_summaries(run_dir / "fastqc" / "trimmed") + if (run_dir / "fastqc" / "trimmed").exists() + else None + ) + interpretation = interpret_qc(raw_summaries, trimmed_summaries, args.assay_type) + write_json(run_dir / "qc_interpretation.json", interpretation) + elif not args.execute and status == "prepared": + status = "validated" + + write_multiqc_browser_helper( + run_dir, + report_path="multiqc/raw/multiqc_report.html", + title="FASTQ QC Raw MultiQC Browser Helper", + ) + if args.trim_mode != "none": + write_multiqc_browser_helper( + run_dir, + report_path="multiqc/trimmed/multiqc_report.html", + title="FASTQ QC Trimmed MultiQC Browser Helper", + ) + + review_outputs = { + "multiqc_raw_helper": "multiqc/raw/multiqc_browser_helper.html" + if (run_dir / "multiqc/raw/multiqc_browser_helper.html").exists() + else None, + "multiqc_raw_localhost": reachable_localhost_url_for_path("multiqc/raw/multiqc_report.html") + if (run_dir / "multiqc/raw/multiqc_report.html").exists() + else None, + "localhost_launch_hint": str( + write_localhost_launch_hint( + run_dir, report_entries=[("Raw MultiQC", "multiqc/raw/multiqc_report.html")] + ).relative_to(run_dir) + ), + "visualization_index": None, + "visualization_manifest": None, + } + 
write_summary(run_dir, status, interpretation, review_outputs) + write_manifest( + run_dir, + args.run_id, + args, + status, + config, + validation, + tool_status, + dry_run, + execution, + interpretation, + review_outputs, + ) + review_outputs = build_review_bundle(run_dir, interpretation, args.trim_mode) + write_summary(run_dir, status, interpretation, review_outputs) + write_manifest( + run_dir, + args.run_id, + args, + status, + config, + validation, + tool_status, + dry_run, + execution, + interpretation, + review_outputs, + ) + write_json(run_dir / "artifact_index.json", artifact_index(run_dir)) + + print(run_dir) + if status in {"blocked", "failed"}: + return 1 + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/plugins/ngs-analysis/scripts/run_nfcore_pipeline.py b/plugins/ngs-analysis/scripts/run_nfcore_pipeline.py new file mode 100644 index 0000000..1630329 --- /dev/null +++ b/plugins/ngs-analysis/scripts/run_nfcore_pipeline.py @@ -0,0 +1,529 @@ +#!/usr/bin/env python3 +"""Generate and optionally execute a standardized nf-core pipeline run envelope.""" + +from __future__ import annotations + +import argparse +import shlex +from pathlib import Path +from typing import Any + +import ngs_reference_manager +from ngs_planner_utils import shell_join, write_command_script +from ngs_run_utils import ( + build_artifact_index, + run_cmd, + slug_timestamp, + software_versions, + tool_preflight, + write_json, + write_standard_manifest, + write_text, +) +from ngs_visualization_utils import ( + add_vcf_review_notebook_entry, + artifact_entry, + write_visualization_index, +) + +WORKSPACE_ROOT = Path.cwd() +DEFAULT_RUN_ROOT = WORKSPACE_ROOT / "ngs_runs" / "nfcore" + +NFCORE_PIPELINES: dict[str, dict[str, Any]] = { + "rnaseq": { + "workflow": "nf-core/rnaseq", + "description": "Bulk RNA-seq FASTQ to QC, alignment/pseudoalignment, quantification, and MultiQC.", + "resource_pipeline": "bulk_rnaseq_counts_qc", + }, + "scrnaseq": { + "workflow": 
"nf-core/scrnaseq", + "description": "Single-cell or single-nucleus RNA-seq FASTQ to count matrices and QC outputs.", + "resource_pipeline": "scrnaseq_fastq_to_count", + }, + "sarek": { + "workflow": "nf-core/sarek", + "description": "DNA germline/somatic variant analysis using nf-core/sarek.", + "resource_pipeline": "dna_variant_calling", + }, + "atacseq": { + "workflow": "nf-core/atacseq", + "description": "ATAC-seq alignment, QC, peak calling, consensus peaks, and signal outputs.", + "resource_pipeline": "atacseq_peaks_qc", + }, + "chipseq": { + "workflow": "nf-core/chipseq", + "description": "ChIP-seq alignment, QC, peak calling, consensus peaks, and signal outputs.", + "resource_pipeline": "chip_cutrun_peaks_qc", + }, + "cutandrun": { + "workflow": "nf-core/cutandrun", + "description": "CUT&RUN/CUT&Tag alignment, QC, peak calling, and reporting.", + "resource_pipeline": "chip_cutrun_peaks_qc", + }, + "ampliseq": { + "workflow": "nf-core/ampliseq", + "description": "Marker-gene amplicon denoising, taxonomy, diversity, and reporting.", + "resource_pipeline": "amplicon_microbiome", + }, + "taxprofiler": { + "workflow": "nf-core/taxprofiler", + "description": "Shotgun metagenomics taxonomic and optional functional profiling.", + "resource_pipeline": "shotgun_metagenomics", + }, +} + + +def validate_inputs(args: argparse.Namespace) -> dict[str, Any]: + errors: list[str] = [] + warnings: list[str] = [] + sample_sheet = args.sample_sheet.expanduser().resolve() + if not sample_sheet.exists(): + errors.append(f"sample sheet does not exist: {sample_sheet}") + params_file = args.params_file.expanduser().resolve() if args.params_file else None + if params_file and not params_file.exists(): + errors.append(f"params file does not exist: {params_file}") + if args.pipeline not in NFCORE_PIPELINES: + errors.append(f"unsupported nf-core pipeline: {args.pipeline}") + if not args.profile: + warnings.append( + "no Nextflow profile was provided; nf-core usually needs docker, 
def summarize_resource_blockers(resource_plan: dict[str, Any] | None) -> list[str]:
    """Turn the missing required bundles of a resource plan into messages.

    Args:
        resource_plan: Resource plan mapping, or ``None`` when no plan exists.

    Returns:
        An empty list when there is no plan or the plan's ``"ok"`` value is
        truthy. Otherwise one sentence per entry of ``"missing_required"``,
        in order.
    """
    plan_is_ready = resource_plan is None or resource_plan.get("ok")
    if plan_is_ready:
        return []

    def describe(entry: dict[str, Any]) -> str:
        # Preference order for the detail text: explicit error, then the
        # comma-joined list of missing files, then a generic fallback.
        reason = entry.get("error")
        if not reason:
            reason = ", ".join(entry.get("missing", []))
        if not reason:
            reason = "root not configured"
        return (
            f"required {entry.get('kind')} bundle `{entry.get('bundle')}` "
            f"is not ready: {reason}"
        )

    return [describe(entry) for entry in resource_plan.get("missing_required", [])]
def generated_params(args: argparse.Namespace, run_dir: Path) -> dict[str, Any]:
    """Build the Nextflow params mapping written to ``params.generated.json``.

    Args:
        args: Parsed CLI namespace. Reads ``sample_sheet``, ``genome``,
            ``fasta``, ``gtf`` and ``extra_param``.
        run_dir: Run directory; published results go to ``<run_dir>/results``.

    Returns:
        A dict with absolute ``input`` and ``outdir`` paths, the optional
        ``genome``/``fasta``/``gtf`` entries when provided, and every
        ``--extra-param`` ``key=value`` pair (values are kept as strings).
        Extra params are applied last, so they override generated keys.

    Raises:
        ValueError: If an ``--extra-param`` item has no ``=`` or has an empty
            (or whitespace-only) key.
    """
    params: dict[str, Any] = {
        "input": str(args.sample_sheet.expanduser().resolve()),
        "outdir": str((run_dir / "results").resolve()),
    }
    if args.genome:
        params["genome"] = args.genome
    if args.fasta:
        params["fasta"] = str(args.fasta.expanduser().resolve())
    if args.gtf:
        params["gtf"] = str(args.gtf.expanduser().resolve())

    for item in args.extra_param or []:
        # Split on the first "=" only, so values may themselves contain "=".
        key, separator, value = item.partition("=")
        # "=value" used to slip through and create an empty-string params
        # key; reject it together with items that have no "=" at all.
        if not separator or not key.strip():
            raise ValueError(f"--extra-param must use key=value syntax, got: {item}")
        params[key] = value
    return params
def write_summary(
    run_dir: Path,
    status: str,
    validation: dict[str, Any],
    resource_plan: dict[str, Any] | None,
) -> None:
    """Write ``summary.md`` for an nf-core run envelope.

    Args:
        run_dir: Run directory that receives ``summary.md``.
        status: Run status string shown in the header.
        validation: Validation mapping; reads ``workflow``, ``profile``,
            ``warnings`` and ``errors``.
        resource_plan: Resource readiness plan, or ``None`` when no plan is
            available, in which case the readiness section is omitted.
    """
    profile_label = validation.get("profile") or "not provided"
    out = [
        "# nf-core Pipeline Run Summary",
        "",
        f"Status: `{status}`",
        f"Pipeline: `{validation.get('workflow')}`",
        f"Profile: `{profile_label}`",
        "",
        "## Key Artifacts",
        "",
        "- `workflow/params.generated.json`",
        "- `workflow/nfcore_command.json`",
        "- `resources/resource_plan.json`, `resource_manifest.tsv`, `resource_env.sh`, `resource_readiness.md`, and resource setup-plan artifacts",
        "- `commands.sh`",
        "- `workflow/nextflow_report.html`, `timeline.html`, `trace.txt`, and `dag.html` when executed",
        "- `results/` published outputs when execution completes",
        "- `visualizations/index.html`",
        "- `run_manifest.json` and `artifact_index.json`",
        "",
    ]

    caveats = validation.get("warnings")
    if caveats:
        out += ["## Warnings", ""]
        out += [f"- {caveat}" for caveat in caveats]
        out.append("")

    if resource_plan is not None:
        ready_flag = str(resource_plan.get("ok")).lower()
        setup_plan = resource_plan.get("outputs", {}).get(
            "resource_setup_summary", "resources/resource_setup_plan.md"
        )
        out += [
            "## Resource Readiness",
            "",
            f"Ready: `{ready_flag}`",
            f"Resource contract: `{resource_plan.get('pipeline')}`",
            f"Setup plan: `{setup_plan}`",
        ]
        for bundle in resource_plan.get("resources", []):
            readiness = "ready" if bundle.get("ok") else "missing"
            necessity = "required" if bundle.get("required") else "optional"
            out.append(
                f"- `{bundle.get('bundle')}` ({bundle.get('kind')}, {necessity}): {readiness}"
            )
        out.append("")

    blockers = validation.get("errors")
    if blockers:
        out += ["## Blockers", ""]
        out += [f"- {blocker}" for blocker in blockers]

    write_text(run_dir / "summary.md", "\n".join(out) + "\n")
path="resources/resource_setup_plan.md", + kind="markdown", + status="created", + description="Actionable setup checklist for missing reference/database bundles.", + ), + artifact_entry( + artifact_id="resource_setup_commands", + title="Resource Setup Commands", + path="resources/resource_setup_commands.sh", + kind="script", + status="created", + description="Reviewed shell skeleton with commented setup hints and validation commands.", + ), + ] + ) + review_outputs = add_vcf_review_notebook_entry( + run_dir, + entries, + title="nf-core VCF Review", + object_items=[ + ("Nextflow Report", "workflow/nextflow_report.html"), + ("Run Summary", "summary.md"), + ], + ) + index = write_visualization_index( + run_dir, + title="nf-core Execution Review", + description="Standard review surface for nf-core execution.", + entries=entries, + notes=[*validation.get("warnings", []), *summarize_resource_blockers(resource_plan)], + analysis_intent="real_analysis" if status != "blocked" else "blocked_preflight", + provenance_summary={ + "pipeline": validation.get("workflow"), + "status": status, + "resource_plan_ok": validation.get("resource_plan_ok"), + }, + ) + return { + "visualization_index": str(index.relative_to(run_dir)), + "visualization_manifest": "visualizations/visualization_manifest.json", + **review_outputs, + } + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--pipeline", choices=sorted(NFCORE_PIPELINES), required=True) + parser.add_argument("--sample-sheet", type=Path, required=True) + parser.add_argument("--params-file", type=Path) + parser.add_argument( + "--profile", help="Nextflow profile, e.g. docker, singularity, conda, or a site profile." + ) + parser.add_argument("--revision", help="Pinned nf-core revision/tag/commit.") + parser.add_argument("--genome") + parser.add_argument( + "--genome-build", + help="Genome build/alias for the reference resource plan. 
def serializable_args(args: argparse.Namespace) -> dict[str, Any]:
    """Return the parsed arguments as a dict with ``Path`` values stringified.

    Only top-level ``Path`` values are converted to ``str``; every other
    value (including lists) is passed through unchanged.
    """
    plain: dict[str, Any] = {}
    for option, setting in vars(args).items():
        if isinstance(setting, Path):
            plain[option] = str(setting)
        else:
            plain[option] = setting
    return plain
write_resource_plan(args, run_dir) + validation = merge_resource_status(input_validation, resource_plan) + tool_status = tool_preflight(["nextflow"], optional=[]) + params = generated_params(args, run_dir) + generated_params_path = run_dir / "workflow" / "params.generated.json" + write_json(generated_params_path, params) + command = build_command(args, run_dir, generated_params_path) + write_json( + run_dir / "workflow" / "nfcore_command.json", + {"command": command, "argv_preview": shlex.split(command)}, + ) + write_command_script(run_dir / "commands.sh", [command]) + write_json(run_dir / "config.json", {**serializable_args(args), "run_dir": str(run_dir)}) + write_json(run_dir / "validation" / "input_validation_summary.json", input_validation) + write_json(run_dir / "validation" / "validation_summary.json", validation) + write_json(run_dir / "validation" / "tool_preflight.json", tool_status) + write_json( + run_dir / "versions" / "software_versions.json", + software_versions({"nextflow": ["nextflow", "-version"]}), + ) + dry_run = { + "ok": validation["ok"] and tool_status["ok"], + "detail": "nf-core inputs, params, and Nextflow runtime validated", + } + write_json(run_dir / "logs" / "validation_dry_run.json", dry_run) + status = "blocked" if not dry_run["ok"] else "validated" + execution = None + if args.execute and dry_run["ok"]: + execution = execute_command(run_dir, command) + write_json(run_dir / "logs" / "nextflow_execute.json", execution) + write_text(run_dir / "logs" / "nextflow_execute.log", str(execution.get("stdout_tail", ""))) + status = "completed" if execution.get("ok") else "failed" + visuals = write_visuals(run_dir, status, validation, resource_plan) + resource_outputs = resource_plan.get("outputs", {}) if resource_plan else {} + write_standard_manifest( + run_dir, + run_id=run_id, + lane=f"nfcore_{args.pipeline}", + workflow=NFCORE_PIPELINES[args.pipeline]["workflow"], + status=status, + execute_requested=args.execute, + validation=validation, + 
tool_preflight_result=tool_status, + dry_run=dry_run, + execution=execution, + inputs={ + "sample_sheet": str(args.sample_sheet.expanduser().resolve()), + "params_file": str(args.params_file.expanduser().resolve()) + if args.params_file + else None, + "generated_params": "workflow/params.generated.json", + **( + {"resource_plan": resource_outputs.get("resource_plan")} if resource_outputs else {} + ), + }, + outputs={ + "published_results": "results/", + "nextflow_report": "workflow/nextflow_report.html", + "timeline": "workflow/timeline.html", + "trace": "workflow/trace.txt", + "dag": "workflow/dag.html", + **resource_outputs, + **visuals, + }, + method={ + "adapter": "nf-core", + "pipeline": NFCORE_PIPELINES[args.pipeline], + "resource_plan": resource_plan, + }, + audit={"resource_readiness": resource_plan} if resource_plan else None, + review_bundle=visuals, + ) + write_summary(run_dir, status, validation, resource_plan) + write_json( + run_dir / "artifact_index.json", + build_artifact_index(run_dir, patterns=None, extra_roots={"results": run_dir / "results"}), + ) + print(run_dir) + return 1 if status in {"blocked", "failed"} else 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/plugins/ngs-analysis/scripts/run_scrnaseq_fastq_to_count.py b/plugins/ngs-analysis/scripts/run_scrnaseq_fastq_to_count.py new file mode 100755 index 0000000..fceae21 --- /dev/null +++ b/plugins/ngs-analysis/scripts/run_scrnaseq_fastq_to_count.py @@ -0,0 +1,863 @@ +#!/usr/bin/env python3 +"""Run local scRNA FASTQ-to-count processing with validation, Snakemake execution, and artifacts.""" + +from __future__ import annotations + +import argparse +import csv +import gzip +import importlib.util +import json +import platform +import re +import shlex +import shutil +import statistics +import subprocess +import sys +from datetime import datetime +from pathlib import Path +from typing import Any + +import ngs_resource_gate +from ngs_run_utils import ( + 
def shell_tool_command(name: str) -> str | None:
    """Return a shell-style command string that invokes ``name``, or ``None``.

    Args:
        name: Executable name to look up.

    Returns:
        ``name`` itself when ``command_path`` finds it; for ``"snakemake"``
        only, a ``<python> -m snakemake`` fallback when the module is
        importable; otherwise ``None``.
    """
    if command_path(name):
        return name
    if name == "snakemake" and module_present("snakemake"):
        # Callers re-tokenize this string with shlex.split(), so the
        # interpreter path must be quoted: an unquoted path containing
        # spaces would be split into several bogus argv entries.
        return f"{shlex.quote(sys.executable)} -m snakemake"
    return None
def normalize_read_name(header: str) -> str:
    """Reduce a FASTQ header line to a read name comparable across mates.

    Strips surrounding whitespace, drops a leading ``@``, keeps only the first
    whitespace-delimited token, and removes a trailing ``/1`` or ``/2`` mate
    suffix.

    Args:
        header: Raw header line as read from the FASTQ file.

    Returns:
        The normalized read name, or ``""`` when the header carries no name
        (blank line or a bare ``@``).
    """
    name = header.strip()
    if name.startswith("@"):
        name = name[1:]
    tokens = name.split()
    if not tokens:
        # A blank or "@"-only header has no token to index; return an empty
        # name instead of raising IndexError so the caller can keep reporting
        # per-record format errors.
        return ""
    name = tokens[0]
    if name.endswith(("/1", "/2")):
        name = name[:-2]
    return name
def parse_samples(args: argparse.Namespace) -> tuple[list[dict[str, Any]], list[str]]:
    """Build the list of sample units from a sample sheet or single-sample flags.

    Sample-sheet mode reads a CSV that must provide ``fastq_1`` and ``fastq_2``
    plus either ``sample`` or, when ``group``, ``replicate``, ``fastq_1`` and
    ``fastq_2`` are all present, ``group``. Relative FASTQ paths are resolved
    against the sheet's directory. A sample name seen more than once gets a
    ``__row<N>`` suffix on its later rows so every unit name is unique.

    Single-sample mode uses ``--barcode-fastq`` and ``--cdna-fastq``; the sample
    name defaults to the barcode file name up to its first dot.

    Args:
        args: Parsed command-line arguments.

    Returns:
        ``(samples, errors)``. ``samples`` holds one dict per accepted unit and
        ``errors`` holds human-readable problems; rows with problems are
        skipped rather than aborting the whole parse.
    """
    errors: list[str] = []
    samples: list[dict[str, Any]] = []
    if args.sample_sheet:
        sheet = Path(args.sample_sheet).expanduser().resolve()
        if not sheet.exists():
            return [], [f"sample sheet does not exist: {sheet}"]
        sample_counts: dict[str, int] = {}
        # "utf-8-sig" transparently drops a leading byte-order mark. With plain
        # "utf-8" the BOM stays glued to the first header cell, so a sheet
        # exported by a spreadsheet tool would fail the column check below.
        with sheet.open(newline="", encoding="utf-8-sig") as handle:
            reader = csv.DictReader(handle)
            columns = set(reader.fieldnames or [])
            legacy_mode = {"group", "replicate", "fastq_1", "fastq_2"}.issubset(columns)
            if legacy_mode:
                sample_col = "group"
            elif "sample" in columns:
                sample_col = "sample"
            else:
                sample_col = None
            r1_col = "fastq_1" if "fastq_1" in columns else None
            r2_col = "fastq_2" if "fastq_2" in columns else None
            if not sample_col or not r1_col or not r2_col:
                return [], ["sample sheet must include group/sample and fastq_1/fastq_2 columns"]
            # Data rows start on physical line 2 (line 1 is the header).
            for index, row in enumerate(reader, start=2):
                sample = (row.get(sample_col) or "").strip()
                r1 = (row.get(r1_col) or "").strip()
                r2 = (row.get(r2_col) or "").strip()
                if not sample or not r1 or not r2:
                    errors.append(f"row {index}: group/sample, fastq_1, and fastq_2 are required")
                    continue
                if is_remote_uri(r1) or is_remote_uri(r2):
                    errors.append(
                        f"row {index}: remote FASTQ URLs are not supported by local execution; download or stage files first"
                    )
                    continue
                sample_counts[sample] = sample_counts.get(sample, 0) + 1
                unit = sample if sample_counts[sample] == 1 else f"{sample}__row{index}"
                samples.append(
                    {
                        "sample": unit,
                        "original_sample": sample,
                        "barcode_fastq": str(resolve_path(r1, sheet.parent)),
                        "cdna_fastq": str(resolve_path(r2, sheet.parent)),
                        "expected_cells": (row.get("expected_cells") or "").strip(),
                        "replicate": (row.get("replicate") or "").strip(),
                    }
                )
    elif args.barcode_fastq and args.cdna_fastq:
        sample = args.sample or Path(args.barcode_fastq).name.split(".")[0]
        samples.append(
            {
                "sample": sample,
                "original_sample": sample,
                "barcode_fastq": str(Path(args.barcode_fastq).expanduser().resolve()),
                "cdna_fastq": str(Path(args.cdna_fastq).expanduser().resolve()),
                "expected_cells": args.expected_cells or "",
                "replicate": "",
            }
        )
    else:
        errors.append("provide --sample-sheet or --barcode-fastq with --cdna-fastq")

    # Unit names must satisfy SAMPLE_RE; offending units are reported but kept
    # in the returned list.
    for sample in samples:
        if not SAMPLE_RE.match(sample["sample"]):
            errors.append(f"sample name {sample['sample']!r} must match {SAMPLE_RE.pattern}")
    return samples, errors
def whitelist_stats(path: Path) -> dict[str, Any]:
    """Summarize the barcode lengths found in a whitelist file.

    Each line is stripped of surrounding whitespace; lines that are empty
    after stripping are ignored.

    Args:
        path: Whitelist text file with one barcode per line.

    Returns:
        A dict with the file ``path`` (as a string), the ``count`` of
        barcodes, a ``length_histogram`` mapping barcode length to number of
        occurrences, and the ``dominant_length`` (``None`` when the file holds
        no barcodes).
    """
    histogram: dict[int, int] = {}
    with path.open("r", encoding="utf-8", errors="replace") as stream:
        entries = (raw.strip() for raw in stream)
        for size in (len(entry) for entry in entries if entry):
            histogram[size] = histogram.setdefault(size, 0) + 1
    dominant = max(histogram, key=histogram.get) if histogram else None
    return {
        "path": str(path),
        "count": sum(histogram.values()),
        "length_histogram": histogram,
        "dominant_length": dominant,
    }
def runtime_version_snapshot() -> dict[str, Any]:
    """Collect tool versions and compare them with the pinned runtime.

    Queries Snakemake, Docker and native STAR through ``software_versions``
    and, when a ``docker`` executable is found, inspects the local STAR image
    tag for its repo digests.

    Returns:
        A dict with the raw ``software_versions``, the ``pinned_versions``, the
        ``resolved_star_image_digest`` (``None`` when it could not be
        determined) and a list of ``mismatch_warnings``.
    """
    versions = software_versions(
        {
            "snakemake": ["snakemake", "--version"],
            "docker": ["docker", "version"],
            "star_native": ["STAR", "--version"],
        }
    )
    image_digest = None
    if command_path("docker"):
        inspect = run_cmd(
            [
                "docker",
                "image",
                "inspect",
                DEFAULT_STAR_IMAGE_TAG,
                "--format",
                "{{json .RepoDigests}}",
            ],
            WORKSPACE_ROOT,
            timeout=30,
        )
        if inspect.get("ok") and inspect.get("stdout_tail"):
            try:
                digests = json.loads(str(inspect["stdout_tail"]).splitlines()[-1].strip())
            except json.JSONDecodeError:
                digests = None
            if isinstance(digests, list) and digests:
                # An image can carry several repo digests. Prefer the pinned
                # one wherever it appears in the list so a differently ordered
                # list does not produce a spurious mismatch warning; otherwise
                # report the first digest as before.
                image_digest = DEFAULT_STAR_IMAGE if DEFAULT_STAR_IMAGE in digests else digests[0]
    mismatches = []
    if versions.get("snakemake") and PINNED_SNAKEMAKE_VERSION not in versions["snakemake"]:
        mismatches.append(
            f"snakemake version differs from pinned version {PINNED_SNAKEMAKE_VERSION}"
        )
    if versions.get("star_native") and PINNED_STAR_VERSION not in versions["star_native"]:
        mismatches.append(f"STAR native version differs from pinned version {PINNED_STAR_VERSION}")
    if image_digest and image_digest != DEFAULT_STAR_IMAGE:
        mismatches.append("Docker STAR image digest differs from the pinned plugin default.")
    return {
        "software_versions": versions,
        "pinned_versions": {
            "snakemake": PINNED_SNAKEMAKE_VERSION,
            "star": PINNED_STAR_VERSION,
            "star_image": DEFAULT_STAR_IMAGE,
            "star_image_tag": DEFAULT_STAR_IMAGE_TAG,
        },
        "resolved_star_image_digest": image_digest,
        "mismatch_warnings": mismatches,
    }
def write_working_sample_sheet(run_dir: Path, samples: list[dict[str, Any]]) -> Path:
    """Write the normalized per-unit sample sheet for a run.

    Args:
        run_dir: Run directory; the sheet is written below ``manifest/``,
            which is created when missing.
        samples: Sample unit dicts. ``sample``, ``barcode_fastq`` and
            ``cdna_fastq`` are required; ``original_sample`` falls back to
            ``sample`` and ``expected_cells`` / ``replicate`` fall back to an
            empty string.

    Returns:
        Path of the written ``working_samplesheet.csv``.
    """
    columns = (
        "sample",
        "original_sample",
        "fastq_1",
        "fastq_2",
        "expected_cells",
        "replicate",
    )
    target_dir = run_dir / "manifest"
    target_dir.mkdir(parents=True, exist_ok=True)
    sheet_path = target_dir / "working_samplesheet.csv"
    records = [
        {
            "sample": unit["sample"],
            "original_sample": unit.get("original_sample", unit["sample"]),
            "fastq_1": unit["barcode_fastq"],
            "fastq_2": unit["cdna_fastq"],
            "expected_cells": unit.get("expected_cells", ""),
            "replicate": unit.get("replicate", ""),
        }
        for unit in samples
    ]
    with sheet_path.open("w", newline="", encoding="utf-8") as stream:
        sheet_writer = csv.DictWriter(stream, fieldnames=list(columns))
        sheet_writer.writeheader()
        sheet_writer.writerows(records)
    return sheet_path
def write_commands(run_dir: Path, cores: int, runtime_cache_path: Path) -> None:
    """Write an executable ``commands.sh`` that replays the Snakemake steps.

    The script changes into the run directory, performs a Snakemake dry run and
    then the real run, both with the same Snakefile, config file, runtime
    source cache path and core count.

    Args:
        run_dir: Run directory; the script is written to ``run_dir/commands.sh``.
        cores: Value passed to ``--cores``.
        runtime_cache_path: Value passed to ``--runtime-source-cache-path``.
    """
    snakemake_base = (
        "snakemake --snakefile workflow/Snakefile --configfile config.json "
        f"--runtime-source-cache-path {shlex.quote(str(runtime_cache_path))} "
        f"--cores {cores}"
    )
    commands = [
        "#!/usr/bin/env bash",
        "set -euo pipefail",
        # shlex.quote, not json.dumps: JSON's double quotes still let bash
        # expand "$" and backticks, and JSON escapes non-ASCII characters to
        # \uXXXX, which bash does not decode. This also matches the quoting
        # used for the cache path below.
        f"cd {shlex.quote(str(run_dir))}",
        f"{snakemake_base} --dry-run",
        snakemake_base,
        "",  # trailing empty entry so the joined script ends with a newline
    ]
    path = run_dir / "commands.sh"
    path.write_text("\n".join(commands), encoding="utf-8")
    path.chmod(0o755)
Summary", + "", + f"Status: `{status}`", + "", + "## Key Artifacts", + "", + "- `counts//Solo.out/Gene/raw/matrix.mtx`", + "- `counts//Solo.out/Gene/raw/barcodes.tsv`", + "- `counts//Solo.out/Gene/raw/features.tsv`", + "- `validation/input_summary.json`", + "- `validation/validation_summary.json`", + "- `resources/resource_plan.json`, `resource_manifest.tsv`, `resource_env.sh`, `resource_readiness.md`, and resource setup-plan artifacts", + "- `artifact_index.json`", + "", + ] + if validation.get("warnings"): + lines.extend(["## Validation Warnings", ""]) + lines.extend(f"- {warning}" for warning in validation["warnings"]) + lines.append("") + lines.extend(ngs_resource_gate.resource_summary_lines(resource_plan)) + lines.append("Raw FASTQs were read-only inputs and were not modified.") + lines.append("") + (run_dir / "summary.md").write_text("\n".join(lines), encoding="utf-8") + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--sample-sheet", help="CSV with group/sample, fastq_1, and fastq_2 columns." + ) + parser.add_argument("--sample", help="Sample name for single-sample mode.") + parser.add_argument("--barcode-fastq", help="R1 barcode/UMI FASTQ for single-sample mode.") + parser.add_argument("--cdna-fastq", help="R2 cDNA FASTQ for single-sample mode.") + parser.add_argument("--genome-fasta", required=True) + parser.add_argument("--annotation-gtf", required=True) + parser.add_argument("--cb-whitelist", required=True) + parser.add_argument("--expected-cells") + parser.add_argument( + "--outdir", + type=Path, + help="Run directory. 
Defaults to ngs_runs/scrnaseq_fastq_to_count/.", + ) + parser.add_argument("--threads", type=int, default=4) + parser.add_argument("--run-id", default=slug_timestamp()) + parser.add_argument( + "--execute", + action="store_true", + help="Run Snakemake after validation and workflow validation.", + ) + parser.add_argument( + "--no-dry-run", action="store_true", help="Skip Snakemake workflow validation." + ) + parser.add_argument( + "--quick-validation", action="store_true", help="Check only the first N records per FASTQ." + ) + parser.add_argument("--pair-check-reads", type=int, default=1000) + parser.add_argument("--chemistry", default="10x_v2") + parser.add_argument("--cb-start", type=int, default=1) + parser.add_argument("--cb-len", type=int, default=16) + parser.add_argument("--umi-start", type=int, default=17) + parser.add_argument("--umi-len", type=int, default=10) + parser.add_argument("--sjdb-overhang", type=int, default=99) + parser.add_argument( + "--genome-build", + help="Genome build or registry alias for resource readiness, e.g. GRCh38, mm39, or a configured local alias.", + ) + parser.add_argument( + "--bundle-root", + action="append", + default=[], + help="Resource bundle override formatted as bundle=/path. 
def main() -> int:
    """Prepare, validate and optionally execute one scRNA FASTQ-to-count run.

    Creates a fresh run directory, parses and validates the samples, checks the
    required tools and the resource plan, writes config/validation/workflow
    artifacts, optionally runs the Snakemake dry run and the real execution,
    and finally writes the summary, the standard manifest and the artifact
    index. The run directory path is printed on stdout.

    Returns:
        ``1`` when the final status is ``"blocked"`` or ``"failed"``,
        otherwise ``0``.

    Raises:
        FileExistsError: If the resolved run directory already exists.
    """
    args = parse_args()
    run_dir = (args.outdir or (DEFAULT_RUN_ROOT / args.run_id)).expanduser().resolve()
    if run_dir.exists():
        raise FileExistsError(f"run directory already exists: {run_dir}")
    run_dir.mkdir(parents=True)
    (run_dir / "logs").mkdir(parents=True, exist_ok=True)

    dry_run: dict[str, Any] | None = None
    execution: dict[str, Any] | None = None
    status = "failed"
    # These are only produced when at least one sample parsed. They must still
    # be bound, because the manifest below is written for blocked runs too.
    working_samplesheet: Path | None = None
    inputs_manifest: Path | None = None
    source_cache_path: Path | None = None

    samples, parse_errors = parse_samples(args)
    if not samples and not parse_errors:
        # e.g. a sample sheet with a valid header but no data rows: without an
        # explicit error the run would count as validated with nothing to do.
        parse_errors = ["no samples were parsed from the provided inputs"]
    validation: dict[str, Any] = (
        validate_samples(samples, args.pair_check_reads, args.quick_validation)
        if samples
        else {
            "ok": False,
            "errors": [],
            "warnings": [],
            "samples": [],
            "quick_validation": args.quick_validation,
            "pair_check_reads": args.pair_check_reads,
        }
    )
    validation["errors"] = parse_errors + validation.get("errors", [])
    validation["ok"] = not validation["errors"]
    tool_status: dict[str, Any] = tool_preflight()
    config: dict[str, Any] = build_config(samples, args) if samples else {}
    # Snapshot of the input-only validation before resource status is merged in.
    input_validation = dict(validation)
    resource_plan = ngs_resource_gate.write_pipeline_resource_plan(
        run_dir=run_dir,
        pipeline="scrnaseq",
        genome_build=args.genome_build,
        bundle_roots=args.bundle_root,
        include_optional=args.include_optional_resources,
        include_checksums=args.resource_checksums,
        skip=args.skip_resource_plan,
        required=args.require_resource_plan,
    )
    validation = ngs_resource_gate.merge_resource_status(
        validation,
        resource_plan,
        required=args.require_resource_plan,
    )

    # Validation and preflight results are written even when no sample parsed,
    # so a blocked run still records why it was blocked.
    write_json(run_dir / "validation" / "input_validation_summary.json", input_validation)
    write_json(run_dir / "validation" / "validation_summary.json", validation)
    write_json(run_dir / "validation" / "tool_preflight.json", tool_status)
    if config:
        write_json(run_dir / "config.json", config)
        write_json(
            run_dir / "validation" / "input_summary.json", {"samples": config.get("samples", {})}
        )
        write_json(run_dir / "versions" / "software_versions.json", runtime_version_snapshot())
        working_samplesheet = write_working_sample_sheet(run_dir, samples)
        inputs_manifest = write_inputs_manifest(run_dir, args, samples)
        source_cache_path = runtime_source_cache_path(run_dir)
        write_workflow(run_dir)
        write_commands(run_dir, args.threads, source_cache_path)

    try:
        status = "prepared"
        # Without a prepared workflow (no config) there is nothing to run.
        blocked = not validation["ok"] or not tool_status["ok"] or source_cache_path is None
        if blocked:
            status = "blocked"
        elif not args.no_dry_run:
            dry_run = run_cmd(
                snakemake_cmd(args.threads, dry_run=True, runtime_cache_path=source_cache_path),
                run_dir,
                timeout=600,
            )
            write_json(run_dir / "logs" / "snakemake_dry_run.json", dry_run)
            write_text(run_dir / "logs" / "snakemake_dry_run.log", dry_run.get("stdout_tail", ""))
            if not dry_run.get("ok"):
                status = "failed"
                blocked = True
        if args.execute and not blocked:
            execution = run_cmd(
                snakemake_cmd(args.threads, dry_run=False, runtime_cache_path=source_cache_path),
                run_dir,
                timeout=86400,
            )
            write_json(run_dir / "logs" / "snakemake_execute.json", execution)
            write_text(run_dir / "logs" / "snakemake_execute.log", execution.get("stdout_tail", ""))
            status = "completed" if execution.get("ok") else "failed"
        elif not args.execute and status == "prepared":
            status = "validated"
    except Exception as exc:  # pragma: no cover - defensive manifest completion
        write_text(run_dir / "logs" / "runner_exception.txt", f"{type(exc).__name__}: {exc}\n")
        execution = execution or {"ok": False, "error": f"{type(exc).__name__}: {exc}"}
        status = "failed"

    write_summary(run_dir, status, validation, resource_plan)
    resource_outputs = ngs_resource_gate.resource_output_paths(resource_plan)
    write_standard_manifest(
        run_dir,
        run_id=args.run_id,
        lane="scrnaseq_fastq_to_count",
        workflow="local_snakemake_starsolo",
        status=status,
        execute_requested=args.execute,
        validation=validation,
        tool_preflight_result=tool_status,
        dry_run=dry_run,
        execution=execution,
        inputs={
            "sample_sheet": str(Path(args.sample_sheet).expanduser().resolve())
            if args.sample_sheet
            else None,
            "working_sample_sheet": str(working_samplesheet)
            if working_samplesheet is not None
            else None,
            "inputs_manifest": str(inputs_manifest) if inputs_manifest is not None else None,
            "references": config.get("references", {}),
            "samples": config.get("samples", {}),
            **(
                {"resource_plan": resource_outputs.get("resource_plan")} if resource_outputs else {}
            ),
        },
        outputs={
            "raw_matrix_glob": "counts/*/Solo.out/Gene/raw/*.tsv",
            "raw_matrix_mtx_glob": "counts/*/Solo.out/Gene/raw/*.mtx",
            "filtered_matrix_glob": "counts/*/Solo.out/Gene/filtered/*",
            "star_logs_glob": "counts/*/Log*",
            "versions": "versions/software_versions.json",
            "manifest_glob": "manifest/*",
            **resource_outputs,
        },
        method={
            "runner": "STARsolo",
            "chemistry": config.get("chemistry"),
            "chemistry_detection": config.get("chemistry_detection"),
            "runtime_pins": config.get("runtime_pins"),
            "resource_plan": resource_plan,
        },
        audit={"resource_readiness": resource_plan} if resource_plan else None,
    )
    write_json(run_dir / "artifact_index.json", build_artifact_index(run_dir))

    print(run_dir)
    if status in {"blocked", "failed"}:
        return 1
    return 0
def configure_runtime_environment() -> dict[str, str]:
    """Create the runtime scratch directories and export their locations.

    The root is ``$NGS_ANALYSIS_RUNTIME_ROOT`` (or the system temp directory
    when unset) followed by ``ngs-analysis-runtime/scrnaseq-post-count``.
    Environment variables are set with ``setdefault``, so values that are
    already present in the environment are left untouched.

    Returns:
        Mapping of logical directory name to its path as a string.
    """
    base = os.environ.get("NGS_ANALYSIS_RUNTIME_ROOT", tempfile.gettempdir())
    root = Path(base) / "ngs-analysis-runtime" / "scrnaseq-post-count"
    # (result key, directory name below the root, environment variable)
    layout = (
        ("mplconfig", "matplotlib", "MPLCONFIGDIR"),
        ("xdg_cache", "xdg_cache", "XDG_CACHE_HOME"),
        ("xdg_state", "xdg_state", "XDG_STATE_HOME"),
        ("numba_cache", "numba", "NUMBA_CACHE_DIR"),
    )
    locations: dict[str, Path] = {"runtime_root": root}
    for key, leaf, _ in layout:
        locations[key] = root / leaf
    for directory in locations.values():
        directory.mkdir(parents=True, exist_ok=True)
    for key, _, variable in layout:
        os.environ.setdefault(variable, str(locations[key]))
    os.environ.setdefault("LOKY_MAX_CPU_COUNT", str(os.cpu_count() or 1))
    return {key: str(directory) for key, directory in locations.items()}
# Importable module name -> distribution name used for version lookup.
PYTHON_ANALYSIS_MODULES = {
    "anndata": "anndata",
    "matplotlib": "matplotlib",
    "numpy": "numpy",
    "pandas": "pandas",
    "scanpy": "scanpy",
}
REQUIRED_R_PACKAGES = ("DropletUtils", "scDblFinder", "SoupX")

# Placeholders bound by load_analysis_modules() once the preflight passes.
ad: Any = None
matplotlib: Any = None
plt: Any = None
np: Any = None
pd: Any = None
sc: Any = None


def python_dependency_status() -> dict[str, Any]:
    """Report which Python analysis modules can be located, without importing them.

    Returns:
        Dict with ``ok``, per-module ``python_modules`` records, the
        ``missing`` module names and a list of ``errors`` messages.
    """
    modules: dict[str, dict[str, Any]] = {}
    missing: list[str] = []
    for module_name, package_name in PYTHON_ANALYSIS_MODULES.items():
        try:
            present = importlib.util.find_spec(module_name) is not None
        except (ImportError, AttributeError, ValueError):
            present = False
        modules[module_name] = {"present": present, "package": package_name}
        if not present:
            missing.append(module_name)
    errors = ["Missing Python analysis packages: " + ", ".join(missing)] if missing else []
    return {
        "ok": not missing,
        "python_modules": modules,
        "missing": missing,
        "errors": errors,
    }


def installed_package_version(package_name: str) -> str | None:
    """Return the installed distribution version, or ``None`` when not installed."""
    try:
        return importlib.metadata.version(package_name)
    except importlib.metadata.PackageNotFoundError:
        return None


def load_analysis_modules() -> dict[str, Any]:
    """Import the analysis stack into the module-level placeholders.

    Returns the status from :func:`python_dependency_status`, extended with a
    ``version`` per module on success. When a module is locatable but its
    import fails (for example a broken install), the status is returned with
    ``ok`` set to ``False`` and the failure appended to ``errors`` instead of
    letting the ``ImportError`` escape the preflight.
    """
    status = python_dependency_status()
    if not status["ok"]:
        return status

    global ad, matplotlib, np, pd, plt, sc

    try:
        import anndata as ad_module  # type: ignore[import-not-found]
        import matplotlib as matplotlib_module  # type: ignore[import-not-found]
        import numpy as np_module
        import pandas as pd_module
        import scanpy as sc_module  # type: ignore[import-not-found]

        # Select the non-interactive backend before pyplot is imported.
        matplotlib_module.use("Agg")
        import matplotlib.pyplot as plt_module  # type: ignore[import-not-found]
    except ImportError as exc:
        status["ok"] = False
        status["errors"].append(
            f"Python analysis packages failed to import: {type(exc).__name__}: {exc}"
        )
        return status

    ad = ad_module
    matplotlib = matplotlib_module
    np = np_module
    pd = pd_module
    plt = plt_module
    sc = sc_module

    loaded = {
        "anndata": ad,
        "matplotlib": matplotlib,
        "numpy": np,
        "pandas": pd,
        "scanpy": sc,
    }
    for module_name, loaded_module in loaded.items():
        package_name = PYTHON_ANALYSIS_MODULES[module_name]
        # Prefer distribution metadata; fall back to the module attribute.
        status["python_modules"][module_name]["version"] = installed_package_version(
            package_name
        ) or getattr(loaded_module, "__version__", None)
    return status
def now_iso() -> str:
    """Return the current local time as a timezone-aware ISO-8601 string (seconds)."""
    return datetime.now().astimezone().isoformat(timespec="seconds")


def slug_timestamp() -> str:
    """Return a filesystem-friendly timestamp used as the default run directory name."""
    return datetime.now().strftime("%Y-%m-%dT%H-%M-%S-scrnaseq-post-count-qc")


def write_json(path: Path, value: Any) -> None:
    """Write ``value`` as indented, key-sorted JSON, creating parent directories."""
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(json.dumps(value, indent=2, sort_keys=True) + "\n", encoding="utf-8")


def write_text(path: Path, text: str) -> None:
    """Write ``text`` as UTF-8, creating parent directories."""
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(text, encoding="utf-8")


def command_path(name: str) -> str | None:
    """Return the resolved path of ``name`` on ``PATH``, or ``None``."""
    return shutil.which(name)


def _output_tail(output: str | bytes | None, limit: int = 12000) -> str:
    """Return the last ``limit`` characters of captured output as text."""
    if output is None:
        return ""
    if isinstance(output, bytes):
        # TimeoutExpired may carry bytes even when text mode was requested.
        output = output.decode("utf-8", errors="replace")
    return output[-limit:]


def run_cmd(cmd: list[str], cwd: Path, timeout: int | None) -> dict[str, Any]:
    """Run ``cmd`` and return a JSON-serialisable result record.

    stderr is merged into stdout and only the tail of the output is kept.
    The function does not raise for a non-zero exit, a timeout, or a command
    that cannot be started; each of those is reported with ``ok`` False.

    Args:
        cmd: Argument vector (no shell is involved).
        cwd: Working directory for the child process.
        timeout: Seconds before the child is killed, or ``None`` for no limit.

    Returns:
        Dict with ``cmd``, ``cwd``, ``started_at``, ``finished_at``,
        ``returncode`` (``None`` when the process did not finish), ``ok``,
        ``stdout_tail`` and, on failure to run, ``error``.
    """
    started = now_iso()
    record: dict[str, Any] = {"cmd": cmd, "cwd": str(cwd), "started_at": started}
    try:
        result = subprocess.run(
            cmd,
            cwd=cwd,
            check=False,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired as exc:
        record.update(
            finished_at=now_iso(),
            returncode=None,
            ok=False,
            error=f"TimeoutExpired: exceeded {timeout}s",
            stdout_tail=_output_tail(exc.stdout),
        )
        return record
    except OSError as exc:
        # Missing or non-executable binary, bad cwd, etc.
        record.update(
            finished_at=now_iso(),
            returncode=None,
            ok=False,
            error=f"{type(exc).__name__}: {exc}",
            stdout_tail="",
        )
        return record

    record.update(
        finished_at=now_iso(),
        returncode=result.returncode,
        ok=result.returncode == 0,
        stdout_tail=_output_tail(result.stdout),
    )
    return record


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line options.

    Args:
        argv: Argument list to parse; ``None`` (the default) reads ``sys.argv``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--input-dir",
        type=Path,
        required=True,
        help="Directory containing matrix/, manifest.tsv, dataset_metadata.json.",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        help=(
            "Where to write QC artifacts. "
            "Defaults to <input-dir>/output/<timestamp>-scrnaseq-post-count-qc/"
        ),
    )
    parser.add_argument(
        "--matrix-dir",
        type=Path,
        help="Optional explicit matrix directory with matrix.mtx, barcodes.tsv, genes.tsv.",
    )
    parser.add_argument(
        "--raw-matrix-dir",
        type=Path,
        help="Optional raw droplet matrix directory for emptyDrops-style cell calling.",
    )
    parser.add_argument(
        "--dataset-metadata", type=Path, help="Optional explicit metadata JSON path."
    )
    parser.add_argument(
        "--resolution",
        type=float,
        default=0.5,
        help="Leiden resolution for plot subset clustering.",
    )
    parser.add_argument(
        "--timeout-seconds",
        type=int,
        default=1800,
        help="Timeout in seconds for each R step (scDblFinder, emptyDrops).",
    )
    parser.add_argument(
        "--rscript", type=Path, help="Optional explicit Rscript path for the scDblFinder step."
    )
    parser.add_argument(
        "--launch-review-app",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Auto-launch the generated Marimo review app on localhost and record its URL in the run envelope.",
    )
    parser.add_argument(
        "--review-app-port",
        type=int,
        default=2719,
        help="Starting port to use when auto-launching the Marimo review app.",
    )
    return parser.parse_args(argv)
def matrix_paths(args: argparse.Namespace) -> tuple[Path, Path, Path | None, Path, Path]:
    """Resolve the input locations from the parsed CLI arguments.

    Returns:
        ``(input_dir, matrix_dir, raw_matrix_dir, metadata, manifest)``.
        ``matrix_dir`` and ``metadata`` default to ``matrix/`` and
        ``dataset_metadata.json`` under ``input_dir``; ``raw_matrix_dir`` is
        ``None`` when not supplied; ``manifest`` is always
        ``input_dir / "manifest.tsv"``.
    """
    input_dir = args.input_dir.expanduser().resolve()
    matrix_dir = (args.matrix_dir or input_dir / "matrix").expanduser().resolve()
    raw_matrix_dir = args.raw_matrix_dir.expanduser().resolve() if args.raw_matrix_dir else None
    metadata = (args.dataset_metadata or input_dir / "dataset_metadata.json").expanduser().resolve()
    return input_dir, matrix_dir, raw_matrix_dir, metadata, input_dir / "manifest.tsv"


def validate_inputs(
    matrix_dir: Path, raw_matrix_dir: Path | None, metadata: Path, manifest: Path
) -> dict[str, Any]:
    """Check that the expected input files exist and the matrix looks like MatrixMarket.

    Every file except the manifest is required. A required path that exists
    but is not a regular file (for example a directory) is reported as an
    error rather than failing later when it is opened.

    Returns:
        Dict with ``files`` (path, exists, size per logical name) and
        ``errors`` (empty when validation passed).
    """
    files = {
        "matrix": matrix_dir / "matrix.mtx",
        "barcodes": matrix_dir / "barcodes.tsv",
        "genes": matrix_dir / "genes.tsv",
        "metadata": metadata,
        "manifest": manifest,
    }
    if raw_matrix_dir:
        files["raw_matrix"] = raw_matrix_dir / "matrix.mtx"
        files["raw_barcodes"] = raw_matrix_dir / "barcodes.tsv"
        files["raw_genes"] = raw_matrix_dir / "genes.tsv"

    summary: dict[str, Any] = {"files": {}, "errors": []}
    for name, path in files.items():
        exists = path.exists()
        summary["files"][name] = {
            "path": str(path),
            "exists": exists,
            "size": path.stat().st_size if exists else None,
        }
        if name == "manifest":
            # The manifest is recorded but optional.
            continue
        if not exists:
            summary["errors"].append(f"required file missing: {path}")
        elif not path.is_file():
            summary["errors"].append(f"required path is not a regular file: {path}")
    if summary["errors"]:
        return summary

    # Only the first line is read; the matrix body is not parsed here.
    with (matrix_dir / "matrix.mtx").open("rt", encoding="utf-8", errors="replace") as handle:
        first = handle.readline().strip()
    if not first.startswith("%%MatrixMarket"):
        summary["errors"].append("matrix.mtx does not start with MatrixMarket header")
    return summary
def r_dependency_status(rscript: Path | None) -> dict[str, Any]:
    """Probe the R runtime for the packages in ``REQUIRED_R_PACKAGES``.

    Args:
        rscript: Explicit Rscript path; when ``None`` it is looked up on PATH.

    Returns:
        Dict with ``rscript``, ``packages`` (name -> bool), ``required_packages``,
        ``missing``, ``ok`` and, when a probe was attempted, ``probe`` (the
        command record). When no Rscript is found, ``missing`` also lists
        ``"Rscript"``.
    """
    rscript_cmd = str(rscript) if rscript else command_path("Rscript")
    status: dict[str, Any] = {
        "rscript": rscript_cmd,
        "packages": {},
        "required_packages": list(REQUIRED_R_PACKAGES),
        "missing": [],
        "ok": False,
    }
    if not rscript_cmd:
        status["missing"] = ["Rscript", *REQUIRED_R_PACKAGES]
        return status

    # Build the R vector from the shared constant so the lists cannot drift.
    package_vector = ",".join(f'"{pkg}"' for pkg in REQUIRED_R_PACKAGES)
    probe_code = (
        f"pkgs<-c({package_vector});"
        "st<-sapply(pkgs, requireNamespace, quietly=TRUE);"
        'cat(paste(names(st), st, sep="=", collapse=";"))'
    )
    try:
        probe = run_cmd([rscript_cmd, "-e", probe_code], WORKSPACE_ROOT, timeout=60)
    except OSError as exc:
        # An explicit but unusable Rscript path is a failed probe, not a crash.
        probe = {"ok": False, "error": f"{type(exc).__name__}: {exc}"}
    status["probe"] = probe

    if probe.get("ok"):
        output = str(probe.get("stdout_tail", ""))
        # stderr is merged into the captured output, so look for each
        # "name=TRUE" token instead of assuming the output is only the list.
        parsed = {pkg: f"{pkg}=TRUE" in output for pkg in REQUIRED_R_PACKAGES}
        status["packages"] = parsed
        status["missing"] = [pkg for pkg in REQUIRED_R_PACKAGES if not parsed.get(pkg, False)]
        status["ok"] = not status["missing"]
    else:
        status["missing"] = list(REQUIRED_R_PACKAGES)
    return status


def combined_tool_preflight_status(
    python_dep_status: dict[str, Any], r_dep_status: dict[str, Any]
) -> dict[str, Any]:
    """Merge the Python and R preflight records; ``ok`` only when both are ok."""
    both_ok = bool(python_dep_status.get("ok") and r_dep_status.get("ok"))
    return {
        "ok": both_ok,
        "python_dependencies": python_dep_status,
        "r_dependencies": r_dep_status,
    }


def matrix_nonzero_entries(matrix: Any) -> int:
    """Count non-zero entries, using ``.nnz`` when the object provides it."""
    if hasattr(matrix, "nnz"):
        return int(matrix.nnz)
    return int(np.count_nonzero(np.asarray(matrix)))


def scdbfinder_readiness(adata: ad.AnnData) -> dict[str, Any]:
    """Decide whether the matrix has enough signal to attempt scDblFinder.

    A barcode is "informative" when ``obs["total_counts"] > 0``. Fewer than
    two informative barcodes yields ``ok`` False with a ``reason`` and a
    human-readable ``detail``.
    """
    informative_mask = np.asarray(adata.obs["total_counts"] > 0, dtype=bool)
    informative_cells = int(np.count_nonzero(informative_mask))
    nonzero_entries = matrix_nonzero_entries(adata.layers.get("counts", adata.X))
    counts = {
        "informative_cells": informative_cells,
        "barcodes": int(adata.n_obs),
        "nonzero_entries": nonzero_entries,
    }
    if informative_cells >= 2:
        return {"ok": True, **counts}
    return {
        "ok": False,
        "reason": "too_few_informative_cells",
        "detail": (
            "scDblFinder requires at least 2 informative cells with non-zero counts. "
            f"Observed {informative_cells} informative cells across {adata.n_obs} barcodes "
            f"and {nonzero_entries} non-zero matrix entries."
        ),
        **counts,
    }
def scdbfinder_blocker_reason(dbl_result: dict[str, Any]) -> str:
    """Translate a failed doublet-step record into a one-sentence explanation.

    The ``reason`` key of ``dbl_result`` selects the message; any other value
    (including a missing key) yields the generic "could not run" sentence.
    """
    known_reasons = (
        (
            "too_few_informative_cells",
            "scDblFinder was skipped because the matrix has too few informative cells for doublet modeling.",
        ),
        (
            "missing_r_dependencies",
            "scDblFinder was skipped because required R/Bioconductor packages were unavailable.",
        ),
        (
            "rscript_missing",
            "scDblFinder was skipped because Rscript was not available.",
        ),
    )
    observed = dbl_result.get("reason")
    for code, message in known_reasons:
        if observed == code:
            return message
    return "scDblFinder could not run; final cell set is not doublet-complete."
def robust_lower(values: np.ndarray, use_log: bool = True, z: float = 3.0) -> float:
    """Return a lower outlier bound of ``median - z * MAD``.

    Only finite, strictly positive values are considered. With ``use_log`` the
    statistics are computed on ``log10(x + 1)`` and the bound is mapped back
    to the original scale. The result is never negative.

    Returns:
        ``0.0`` when no usable values remain; the minimum usable value when
        the MAD is zero; otherwise the bound.
    """
    arr = np.asarray(values, dtype=float)
    arr = arr[np.isfinite(arr) & (arr > 0)]
    if arr.size == 0:
        return 0.0
    work = np.log10(arr + 1.0) if use_log else arr
    median = float(np.median(work))
    mad = float(np.median(np.abs(work - median)))
    if mad == 0:
        # No spread to scale by: fall back to the observed minimum.
        return float(np.min(arr))
    bound = median - z * mad
    if use_log:
        return float(max(0.0, 10**bound - 1.0))
    return float(max(0.0, bound))


def robust_upper(values: np.ndarray, use_log: bool = True, z: float = 3.0) -> float:
    """Return an upper outlier bound of ``median + z * MAD``.

    Only finite, strictly positive values are considered. With ``use_log`` the
    statistics are computed on ``log10(x + 1)`` and the bound is mapped back
    to the original scale.

    Returns:
        ``nan`` when no usable values remain; the maximum usable value when
        the MAD is zero; otherwise the bound.
    """
    arr = np.asarray(values, dtype=float)
    arr = arr[np.isfinite(arr) & (arr > 0)]
    if arr.size == 0:
        return float("nan")
    work = np.log10(arr + 1.0) if use_log else arr
    median = float(np.median(work))
    mad = float(np.median(np.abs(work - median)))
    if mad == 0:
        return float(np.max(arr))
    bound = median + z * mad
    if use_log:
        return float(10**bound - 1.0)
    return float(bound)


def threshold_dict(adata: ad.AnnData) -> dict[str, Any]:
    """Derive QC thresholds from ``adata.obs`` using 3-MAD bounds.

    Reads ``n_genes_by_counts``, ``total_counts`` and ``pct_counts_mt``. The
    mitochondrial upper bound is ``None`` when it cannot be computed (no
    finite positive mitochondrial percentages).
    """
    obs = adata.obs
    genes = obs["n_genes_by_counts"].to_numpy()
    counts = obs["total_counts"].to_numpy()
    mito = obs["pct_counts_mt"].to_numpy()

    lower_genes = robust_lower(genes, use_log=True, z=3.0)
    lower_counts = robust_lower(counts, use_log=True, z=3.0)
    upper_mito = robust_upper(mito, use_log=False, z=3.0)
    upper_genes = robust_upper(genes, use_log=True, z=3.0)
    upper_counts = robust_upper(counts, use_log=True, z=3.0)
    return {
        "n_genes_by_counts": {
            "lower": lower_genes,
            "upper_review": upper_genes,
            "method": "log10 median +/- 3 MAD",
        },
        "total_counts": {
            "lower": lower_counts,
            "upper_review": upper_counts,
            "method": "log10 median +/- 3 MAD",
        },
        "pct_counts_mt": {
            "upper": None if not np.isfinite(upper_mito) else upper_mito,
            "method": "median + 3 MAD; disabled when no mitochondrial signal is present",
        },
    }


def plot_thresholds(adata: ad.AnnData, thresholds: dict[str, Any], out_path: Path) -> None:
    """Save a three-panel histogram of the QC metrics with their thresholds.

    Lower bounds are drawn as dashed red lines and upper bounds as dotted
    blue lines. Non-finite values are dropped before plotting because the
    histogram cannot derive a range from NaN.
    """
    fig, axes = plt.subplots(1, 3, figsize=(14, 4.5))
    items = [
        (
            "total_counts",
            "Total counts",
            thresholds["total_counts"]["lower"],
            thresholds["total_counts"]["upper_review"],
        ),
        (
            "n_genes_by_counts",
            "Detected genes",
            thresholds["n_genes_by_counts"]["lower"],
            thresholds["n_genes_by_counts"]["upper_review"],
        ),
        ("pct_counts_mt", "Mito %", None, thresholds["pct_counts_mt"]["upper"]),
    ]
    for ax, (column, title, lower, upper) in zip(axes, items, strict=True):
        values = np.asarray(adata.obs[column].to_numpy(), dtype=float)
        # e.g. pct_counts_mt is NaN for barcodes with zero total counts.
        values = values[np.isfinite(values)]
        ax.hist(values, bins=60, color="#4f7d4a", alpha=0.85)
        if lower is not None:
            ax.axvline(lower, color="#b22222", linestyle="--", linewidth=1.5)
        if upper is not None:
            ax.axvline(upper, color="#204a87", linestyle=":", linewidth=1.5)
        ax.set_title(title)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
    fig.tight_layout()
    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_path, dpi=160)
    plt.close(fig)
def write_tsv(path: Path, rows: list[dict[str, Any]], fieldnames: list[str]) -> None:
    """Write ``rows`` as a tab-separated table with ``fieldnames`` as the columns."""
    path.parent.mkdir(parents=True, exist_ok=True)
    table = pd.DataFrame(rows, columns=fieldnames)
    table.to_csv(path, sep="\t", index=False)


def plot_count_summary(rows: list[dict[str, Any]], out_path: Path, *, title: str) -> None:
    """Save a bar chart with one bar per row (``metric`` label, ``cells`` height)."""
    labels: list[str] = []
    values: list[float] = []
    for row in rows:
        labels.append(str(row["metric"]))
    for row in rows:
        values.append(float(row["cells"]))

    fig, ax = plt.subplots(figsize=(8.5, 5.0))
    ax.bar(labels, values, color="#3b6ea8")
    ax.set_title(title)
    ax.set_ylabel("Cells")
    ax.tick_params(axis="x", rotation=35)
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    fig.tight_layout()
    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_path, dpi=160)
    plt.close(fig)


def plot_simple_embedding(
    coords: np.ndarray, labels: pd.Series, out_path: Path, *, title: str
) -> None:
    """Save a 2-D scatter of ``coords`` with every point annotated by its label.

    The first two columns of ``coords`` are used as the x and y positions and
    each label is drawn slightly offset from its point.
    """
    label_offset = 0.03
    xs = coords[:, 0]
    ys = coords[:, 1]

    fig, ax = plt.subplots(figsize=(7.5, 6.5))
    ax.scatter(xs, ys, s=65, c="#3b6ea8", alpha=0.9)
    texts = labels.astype(str).tolist()
    for position, text in enumerate(texts):
        ax.text(
            coords[position, 0] + label_offset,
            coords[position, 1] + label_offset,
            text,
            fontsize=8,
        )
    ax.set_title(title)
    ax.set_xlabel("UMAP1")
    ax.set_ylabel("UMAP2")
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)
    fig.tight_layout()
    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_path, dpi=160)
    plt.close(fig)
def write_scrna_visual_bundle(
    output_root: Path, review_app_info: dict[str, Any] | None
) -> dict[str, str]:
    """Assemble the visualization bundle (copied plots, table entries, notebook, index).

    Args:
        output_root: Run directory that already holds the generated artifacts;
            all relative paths below are resolved against it.
        review_app_info: Launch record for the review app, or ``None`` when no
            launch record should be listed.

    Returns:
        Relative paths (as strings) for ``visualization_index``,
        ``visualization_manifest`` and ``review_notebook``.
    """
    entries: list[dict[str, Any]] = []
    notes = [
        "The Marimo notebook is a review surface over generated artifacts; the PNG/CSV/H5AD files remain the portable source of truth.",
        "Marker labels are conservative PBMC fallback labels unless a matched reference was provided upstream.",
    ]
    visualizations = output_root / "visualizations"
    visualizations.mkdir(parents=True, exist_ok=True)
    (output_root / "tables").mkdir(parents=True, exist_ok=True)
    (output_root / "notebooks").mkdir(parents=True, exist_ok=True)

    # (artifact_id, title, source path, destination path, description)
    # NOTE(review): "umap_by_cluster" reads plots/umap_by_coarse_label.png while
    # "umap_global" is copied to a file named umap_by_coarse_label.png. The
    # code that writes plots/ is not visible here -- confirm the source file
    # names against it.
    copy_specs = [
        (
            "threshold_justification",
            "QC Threshold Justification",
            "qc/threshold_justification.png",
            "visualizations/threshold_justification.png",
            "Threshold histograms with selected review/filter cutoffs.",
        ),
        (
            "umap_global",
            "UMAP by Coarse Label",
            "plots/umap_global.png",
            "visualizations/umap_by_coarse_label.png",
            "Global UMAP colored by conservative coarse labels.",
        ),
        (
            "umap_by_cluster",
            "UMAP by Leiden Cluster",
            "plots/umap_by_coarse_label.png",
            "visualizations/umap_by_cluster.png",
            "Global UMAP colored by Leiden cluster.",
        ),
        (
            "qc_pass_fail_counts",
            "QC Pass/Fail Counts",
            "qc/qc_pass_fail_counts.png",
            "visualizations/qc_pass_fail_counts.png",
            "Cell counts for major QC inclusion states.",
        ),
    ]
    for artifact_id, title, source_rel, dest_rel, description in copy_specs:
        # A falsy result from copy_visual_asset is recorded as "not_available".
        dest = copy_visual_asset(output_root / source_rel, output_root / dest_rel)
        entries.append(
            artifact_entry(
                artifact_id=artifact_id,
                title=title,
                path=dest.relative_to(output_root) if dest else None,
                kind="plot",
                status="created" if dest else "not_available",
                description=description,
            )
        )

    # (artifact_id, title, path relative to output_root, description)
    table_specs = [
        (
            "cell_qc_summary",
            "Cell QC Summary",
            "tables/cell_qc_summary.tsv",
            "Aggregate cell counts for major QC states.",
        ),
        (
            "cell_qc_metrics",
            "Cell QC Metrics",
            "qc/cell_qc_metrics.csv",
            "Per-cell QC metrics and filter flags.",
        ),
        (
            "cell_labels",
            "Cell Labels",
            "annotation/cell_labels.csv",
            "Per-cell labels, marker scores, and annotation confidence.",
        ),
        (
            "umap_coords",
            "UMAP Coordinates",
            "embeddings/umap_coords.csv",
            "UMAP coordinates for downstream plotting.",
        ),
    ]
    for artifact_id, title, rel_path, description in table_specs:
        # Tables are referenced in place (not copied); missing ones are still listed.
        exists = (output_root / rel_path).exists()
        entries.append(
            artifact_entry(
                artifact_id=artifact_id,
                title=title,
                path=rel_path if exists else None,
                kind="table",
                status="created" if exists else "not_available",
                description=description,
            )
        )

    notebook_path = write_marimo_review_notebook(
        output_root / "notebooks" / "scrna_qc_review.marimo.py",
        title="scRNA QC Review",
        run_dir=output_root,
        image_items=[
            ("QC Threshold Justification", "visualizations/threshold_justification.png"),
            ("QC Pass/Fail Counts", "visualizations/qc_pass_fail_counts.png"),
            ("UMAP by Coarse Label", "visualizations/umap_by_coarse_label.png"),
            ("UMAP by Leiden Cluster", "visualizations/umap_by_cluster.png"),
        ],
        table_items=[
            ("Cell QC Summary", "tables/cell_qc_summary.tsv"),
            ("Cell QC Metrics", "qc/cell_qc_metrics.csv"),
            ("Cell Labels", "annotation/cell_labels.csv"),
            ("UMAP Coordinates", "embeddings/umap_coords.csv"),
        ],
        object_items=[
            ("Analysis with flags", "analysis_with_flags.h5ad"),
            ("Filtered review object", "filtered_view.h5ad"),
        ],
    )
    entries.append(
        artifact_entry(
            artifact_id="scrna_qc_review_notebook",
            title="scRNA QC Review Notebook",
            path=notebook_path.relative_to(output_root),
            kind="marimo_notebook",
            status="created",
            description="Interactive review notebook over the generated QC, UMAP, and annotation artifacts.",
        )
    )
    if review_app_info:
        # Any record without a truthy "ok" is listed as "blocked", whatever the cause.
        entries.append(
            artifact_entry(
                artifact_id="scrna_qc_review_launch",
                title="scRNA QC Review App",
                path=review_app_info.get("url"),
                kind="localhost_app",
                status="created" if review_app_info.get("ok") else "blocked",
                description="Auto-launched localhost Marimo review app for the generated notebook.",
                source="notebooks/marimo_server.json",
            )
        )
        if review_app_info.get("ok"):
            notes.append(f"Review app auto-launched at {review_app_info['url']}.")
        else:
            notes.append(
                "Review app auto-launch did not become ready. See notebooks/marimo_server.json and logs/marimo_server.log."
            )

    index_path = write_visualization_index(
        output_root,
        title="scRNA Post-count QC Visualizations",
        description="Portable scRNA QC artifact bundle with an auto-launched Marimo review app and a notebook backup.",
        entries=entries,
        notes=notes,
    )
    return {
        "visualization_index": str(index_path.relative_to(output_root)),
        "visualization_manifest": "visualizations/visualization_manifest.json",
        "review_notebook": str(notebook_path.relative_to(output_root)),
    }
def _r_string_literal(path: Path) -> str:
    """Return ``path`` (POSIX form) as a double-quoted R string literal.

    Backslashes and double quotes are escaped so the path cannot terminate the
    literal or be read as an escape sequence inside the generated script.
    """
    escaped = path.as_posix().replace("\\", "\\\\").replace('"', '\\"')
    return f'"{escaped}"'


def _run_r_script(r_script: str, timeout: int, rscript: Path | None) -> dict[str, Any]:
    """Write ``r_script`` to a temporary file, run it with Rscript, then delete it."""
    rscript_cmd = str(rscript) if rscript else command_path("Rscript")
    if not rscript_cmd:
        return {"ok": False, "error": "Rscript not found"}
    with tempfile.NamedTemporaryFile("w", suffix=".R", delete=False, encoding="utf-8") as handle:
        handle.write(r_script)
        temp_script = Path(handle.name)
    try:
        return run_cmd([rscript_cmd, str(temp_script)], WORKSPACE_ROOT, timeout=timeout)
    finally:
        temp_script.unlink(missing_ok=True)


def run_scdbfinder(
    matrix_dir: Path, out_csv: Path, timeout: int, rscript: Path | None
) -> dict[str, Any]:
    """Run scDblFinder on a 10x matrix directory and write per-barcode calls.

    Args:
        matrix_dir: Directory passed to ``read10xCounts``.
        out_csv: Destination CSV with ``barcode``, ``scDblFinder_score`` and
            ``scDblFinder_class`` columns.
        timeout: Seconds allowed for the R process.
        rscript: Explicit Rscript path; looked up on PATH when ``None``.

    Returns:
        The command record from ``run_cmd``, or ``{"ok": False, "error": ...}``
        when no Rscript is available.
    """
    r_script = dedent(
        f"""
        suppressPackageStartupMessages({{
          library(DropletUtils)
          library(scDblFinder)
        }})
        sce <- read10xCounts({_r_string_literal(matrix_dir)}, col.names=TRUE)
        sce <- scDblFinder(sce)
        out <- data.frame(
          barcode=colnames(sce),
          scDblFinder_score=colData(sce)$scDblFinder.score,
          scDblFinder_class=colData(sce)$scDblFinder.class,
          row.names=NULL
        )
        write.csv(out, {_r_string_literal(out_csv)}, row.names=FALSE)
        """
    ).strip()
    return _run_r_script(r_script, timeout, rscript)


def run_emptydrops(
    matrix_dir: Path, out_csv: Path, timeout: int, rscript: Path | None
) -> dict[str, Any]:
    """Run DropletUtils ``emptyDrops`` on a raw droplet matrix directory.

    Args:
        matrix_dir: Directory passed to ``read10xCounts``.
        out_csv: Destination CSV with ``barcode``, ``FDR``, ``LogProb`` and
            ``Limited`` columns.
        timeout: Seconds allowed for the R process.
        rscript: Explicit Rscript path; looked up on PATH when ``None``.

    Returns:
        The command record from ``run_cmd``, or ``{"ok": False, "error": ...}``
        when no Rscript is available.
    """
    r_script = dedent(
        f"""
        suppressPackageStartupMessages({{
          library(DropletUtils)
        }})
        sce <- read10xCounts({_r_string_literal(matrix_dir)}, col.names=TRUE)
        ed <- emptyDrops(counts(sce))
        out <- data.frame(
          barcode=rownames(ed),
          FDR=ed$FDR,
          LogProb=ed$LogProb,
          Limited=ed$Limited,
          row.names=NULL
        )
        write.csv(out, {_r_string_literal(out_csv)}, row.names=FALSE)
        """
    ).strip()
    return _run_r_script(r_script, timeout, rscript)


def add_marker_scores(adata: ad.AnnData) -> list[str]:
    """Score each hard-coded marker set and store it in ``adata.obs``.

    A marker set is scored only on the genes present in ``adata.var_names``
    and is skipped entirely when none are present.

    Returns:
        Names of the ``<label>_score`` columns that were created.
    """
    # NOTE(review): the T_cell list previously named LTB twice; the duplicate
    # was dropped. Whether a different fifth marker was intended is unknown.
    marker_sets = {
        "T_cell": ["IL7R", "LTB", "MALAT1", "IL32"],
        "NK_cell": ["NKG7", "GNLY", "PRF1", "CCL5", "GZMB"],
        "B_cell": ["MS4A1", "CD79A", "CD79B", "HLA-DRA", "CD74"],
        "CD14_mono": ["LST1", "S100A8", "S100A9", "FCN1", "LGALS3"],
        "FCGR3A_mono": ["FCGR3A", "MS4A7", "LST1", "IFITM3", "SAT1"],
        "DC": ["FCER1A", "CST3", "HLA-DRA", "CLEC10A", "FCER1G"],
        "Platelet": ["PPBP", "PF4", "SDPR", "NRGN", "GNG11"],
    }
    created: list[str] = []
    for label, genes in marker_sets.items():
        # dict.fromkeys keeps order while guarding against repeated symbols.
        present = [gene for gene in dict.fromkeys(genes) if gene in adata.var_names]
        if not present:
            continue
        score_name = f"{label}_score"
        sc.tl.score_genes(adata, gene_list=present, score_name=score_name, use_raw=False)
        created.append(score_name)
    return created
def assign_labels(adata: ad.AnnData, score_columns: list[str]) -> pd.DataFrame:
    """Assign a coarse label per cell from the marker score columns.

    A cell takes the label of its highest score when that score exceeds 0.1
    and leads the runner-up by more than 0.02; otherwise it is ``"ambiguous"``.
    The runner-up counts as 0.0 when a cell has fewer than two non-missing
    scores. With no score columns every cell is ``"unknown"`` with confidence 0.

    Returns:
        DataFrame indexed by ``adata.obs_names`` with ``coarse_label`` and
        ``label_confidence`` (the non-negative lead over the runner-up).
    """
    if not score_columns:
        result = pd.DataFrame(index=adata.obs_names)
        result["coarse_label"] = "unknown"
        result["label_confidence"] = 0.0
        return result

    scores = adata.obs[score_columns].copy()
    top_label = scores.idxmax(axis=1).str.replace("_score", "", regex=False)
    top_score = scores.max(axis=1)

    # Second-largest non-missing score per row, computed without a Python-level
    # loop over cells: missing values are pushed to the bottom of the sort.
    values = scores.to_numpy(dtype=float)
    valid_counts = (~np.isnan(values)).sum(axis=1)
    if values.shape[1] >= 2:
        ranked = np.sort(np.where(np.isnan(values), -np.inf, values), axis=1)
        second = ranked[:, -2]
    else:
        second = np.zeros(values.shape[0], dtype=float)
    runner_up = pd.Series(np.where(valid_counts >= 2, second, 0.0), index=scores.index)

    delta = top_score - runner_up
    label = np.where((top_score > 0.1) & (delta > 0.02), top_label, "ambiguous")
    confidence = delta.clip(lower=0.0)
    return pd.DataFrame(
        {"coarse_label": label, "label_confidence": confidence}, index=adata.obs_names
    )


def package_versions() -> dict[str, str]:
    """Return installed versions of the analysis packages (``"missing"`` if absent)."""
    packages = [
        "scanpy",
        "anndata",
        "numpy",
        "pandas",
        "matplotlib",
        "scipy",
        "igraph",
        "leidenalg",
        "marimo",
    ]
    versions: dict[str, str] = {}
    for pkg in packages:
        try:
            versions[pkg] = importlib.metadata.version(pkg)
        except importlib.metadata.PackageNotFoundError:
            versions[pkg] = "missing"
    return versions


def _declared_batch_count(dataset_metadata: dict[str, Any]) -> float:
    """Return ``batch_count`` as a number, treating absent or unusable values as 0."""
    raw = dataset_metadata.get("batch_count", 0)
    try:
        return float(raw) if raw is not None else 0.0
    except (TypeError, ValueError):
        return 0.0


def build_analysis_status(
    *,
    dataset_metadata: dict[str, Any],
    dbl_result: dict[str, Any],
    review_app_info: dict[str, Any] | None,
) -> dict[str, Any]:
    """Summarise completion state, blocking issues and warnings for the run.

    Args:
        dataset_metadata: Parsed dataset metadata; ``batch_count`` is read and
            may be missing, ``None`` or non-numeric.
        dbl_result: Record from the doublet step; a falsy ``ok`` is blocking.
        review_app_info: Review-app launch record, or ``None``.

    Returns:
        Dict with ``completion_state`` (``"complete"`` or ``"partial"``),
        ``blocking_issues`` and ``warnings``.
    """
    blocking_issues: list[dict[str, Any]] = []
    warnings: list[str] = []
    if not dbl_result.get("ok"):
        blocking_issues.append(
            {
                "component": "doublet_detection",
                "reason": scdbfinder_blocker_reason(dbl_result),
                "detail": dbl_result.get("stdout_tail") or dbl_result.get("error"),
            }
        )
    if _declared_batch_count(dataset_metadata) <= 1:
        warnings.append(
            "No cell-level batch or channel metadata were provided, so QC and doublet logic could not be partition-aware."
        )
    warnings.append(
        "Ambient RNA modeling was skipped because the bundle contains a filtered matrix without raw droplets."
    )
    warnings.append(
        "Annotation used a conservative PBMC marker fallback because no matched reference atlas was provided."
    )
    if review_app_info and not review_app_info.get("ok"):
        error = review_app_info.get("error")
        if error:
            # Launch disabled, marimo missing, or launcher error: report the
            # recorded cause instead of implying a readiness timeout.
            warnings.append(f"The Marimo review app is not available: {error}")
        else:
            warnings.append(
                "The Marimo review app was generated but did not become ready before timeout; reopen it from the recorded localhost URL if needed."
            )
    return {
        "completion_state": "complete" if not blocking_issues else "partial",
        "blocking_issues": blocking_issues,
        "warnings": warnings,
    }
def write_summary_md(
    output_root: Path,
    *,
    dataset_metadata: dict[str, Any],
    thresholds: dict[str, Any],
    qc_summary_rows: list[dict[str, Any]],
    analysis_status: dict[str, Any],
    review_app_info: dict[str, Any] | None,
) -> None:
    """Render the run summary as Markdown and write it to ``summary.md``.

    The document lists dataset identity, cell counts taken from
    ``qc_summary_rows``, the lower/upper QC thresholds, every warning as a
    note and, when present, the blocking issues.
    """
    cells_by_metric = {row["metric"]: row["cells"] for row in qc_summary_rows}
    blocking = analysis_status["blocking_issues"]

    dataset_id = dataset_metadata.get("dataset_id", "unknown")
    organism = dataset_metadata.get("organism", "unknown")
    assay = dataset_metadata.get("assay", "unknown")
    review_url = review_app_info.get("url") if review_app_info else "not started"
    genes_lower = thresholds["n_genes_by_counts"]["lower"]
    counts_lower = thresholds["total_counts"]["lower"]
    mito_upper = thresholds["pct_counts_mt"]["upper"]

    overview = [
        "# scRNA Post-count QC Summary",
        "",
        f"- Dataset ID: `{dataset_id}`",
        f"- Organism / assay: `{organism}` / `{assay}`",
        f"- Completion state: `{analysis_status['completion_state']}`",
        f"- Input cells: `{cells_by_metric.get('input_cells', 'n/a')}`",
        f"- QC-passing cells: `{cells_by_metric.get('passes_qc', 'n/a')}`",
        f"- Plot-included cells: `{cells_by_metric.get('plot_include', 'n/a')}`",
        f"- Doublet detection complete: `{not bool(blocking)}`",
        f"- Review app URL: `{review_url}`",
    ]
    threshold_section = [
        "",
        "## Thresholds",
        f"- Detected genes lower bound: `{genes_lower:.2f}`",
        f"- Total counts lower bound: `{counts_lower:.2f}`",
        f"- Mitochondrial fraction upper bound: `{mito_upper}`",
    ]
    notes_section = ["", "## Notes"]
    notes_section.extend(f"- {warning}" for warning in analysis_status["warnings"])

    lines = [*overview, *threshold_section, *notes_section]
    if blocking:
        lines += ["", "## Blocking Issues"]
        lines += [f"- {issue['component']}: {issue['reason']}" for issue in blocking]
    write_text(output_root / "summary.md", "\n".join(lines) + "\n")
def maybe_launch_review_app(
    args: argparse.Namespace, output_root: Path, notebook_path: Path
) -> dict[str, Any]:
    """Launch the Marimo review app when allowed and record the outcome.

    The returned record is always also written to
    ``notebooks/marimo_server.json`` under ``output_root``. The launch is
    skipped, with an ``error`` explaining why, when the CLI flag disables it
    or marimo cannot be located; an exception raised by the launcher is
    captured into the record instead of propagating.
    """
    info_path = output_root / "notebooks" / "marimo_server.json"

    def record(info: dict[str, Any]) -> dict[str, Any]:
        write_json(info_path, info)
        return info

    if not args.launch_review_app:
        return record({"ok": False, "error": "Review app auto-launch disabled by CLI flag."})

    marimo_available = importlib.util.find_spec("marimo")
    if not marimo_available:
        return record(
            {"ok": False, "error": "marimo is not installed in the current Python environment."}
        )

    try:
        outcome = launch_marimo_review_app(
            notebook_path=notebook_path,
            run_dir=output_root,
            start_port=args.review_app_port,
            python_executable=os.environ.get("PYTHON_EXECUTABLE_OVERRIDE"),
        )
    except Exception as exc:  # noqa: BLE001
        # Best effort: a failed launch must not abort the QC run.
        outcome = {"ok": False, "error": str(exc)}
    return record(outcome)
dep_status = r_dependency_status(args.rscript) + write_json(output_root / "validation" / "tool_preflight.json", dep_status) + if validation["errors"]: + write_text( + output_root / "summary.md", + "Input validation failed. See validation/input_summary.json for details.\n", + ) + write_standard_manifest( + output_root, + run_id=output_root.name, + lane="scrnaseq_post_count_qc", + workflow="local_light_scanpy_qc", + status="blocked", + execute_requested=True, + validation={"ok": False, **validation}, + tool_preflight_result={"ok": dep_status.get("ok", False), **dep_status}, + dry_run={"ok": False, "detail": "input validation failed"}, + execution={"ok": False, "detail": "execution not attempted"}, + inputs={ + "input_dir": str(input_dir), + "matrix_dir": str(matrix_dir), + "raw_matrix_dir": str(raw_matrix_dir) if raw_matrix_dir else None, + "dataset_metadata": str(metadata_path), + "manifest": str(manifest_path), + }, + outputs={"summary": "summary.md", "validation": "validation/input_summary.json"}, + method={"resolution": args.resolution}, + ) + write_json(output_root / "artifact_index.json", build_artifact_index(output_root)) + raise SystemExit("Input validation failed. See validation/input_summary.json for details.") + + python_dep_status = load_analysis_modules() + write_json(output_root / "validation" / "python_dependency_preflight.json", python_dep_status) + if not python_dep_status["ok"]: + write_text( + output_root / "summary.md", + "Python dependency preflight failed. 
See validation/python_dependency_preflight.json for details.\n", + ) + write_standard_manifest( + output_root, + run_id=output_root.name, + lane="scrnaseq_post_count_qc", + workflow="local_light_scanpy_qc", + status="blocked", + execute_requested=True, + validation={"ok": True, **validation}, + tool_preflight_result=combined_tool_preflight_status(python_dep_status, dep_status), + dry_run={"ok": False, "detail": "Python dependency preflight failed"}, + execution={"ok": False, "detail": "execution not attempted"}, + inputs={ + "input_dir": str(input_dir), + "matrix_dir": str(matrix_dir), + "raw_matrix_dir": str(raw_matrix_dir) if raw_matrix_dir else None, + "dataset_metadata": str(metadata_path), + "manifest": str(manifest_path), + }, + outputs={ + "summary": "summary.md", + "python_dependency_preflight": "validation/python_dependency_preflight.json", + "r_dependency_preflight": "validation/tool_preflight.json", + }, + method={"resolution": args.resolution}, + ) + write_json(output_root / "artifact_index.json", build_artifact_index(output_root)) + raise SystemExit( + "Python dependency preflight failed. See validation/python_dependency_preflight.json for details." 
+ ) + + dataset_metadata = json.loads(metadata_path.read_text(encoding="utf-8")) + adata = sc.read_10x_mtx(matrix_dir, var_names="gene_symbols", cache=False) + adata.var_names_make_unique() + adata.layers["counts"] = adata.X.copy() + adata.uns["input_metadata"] = dataset_metadata + adata.uns["runtime_environment"] = RUNTIME_ENV + adata.obs["barcode"] = adata.obs_names.to_numpy() + adata.var["mt"] = adata.var_names.str.upper().str.startswith("MT-") + sc.pp.calculate_qc_metrics(adata, qc_vars=["mt"], percent_top=None, log1p=False, inplace=True) + + thresholds = threshold_dict(adata) + mito_upper = thresholds["pct_counts_mt"]["upper"] + mito_mask = True if mito_upper is None else adata.obs["pct_counts_mt"] <= mito_upper + adata.obs["passes_qc"] = ( + (adata.obs["n_genes_by_counts"] >= thresholds["n_genes_by_counts"]["lower"]) + & (adata.obs["total_counts"] >= thresholds["total_counts"]["lower"]) + & mito_mask + ) + adata.obs["high_gene_outlier"] = ( + adata.obs["n_genes_by_counts"] > thresholds["n_genes_by_counts"]["upper_review"] + ) + adata.obs["high_count_outlier"] = ( + adata.obs["total_counts"] > thresholds["total_counts"]["upper_review"] + ) + plot_thresholds(adata, thresholds, output_root / "qc" / "threshold_justification.png") + write_json(output_root / "qc" / "thresholds.json", thresholds) + + dbl_out = output_root / "qc" / "doublet_calls.csv" + dbl_readiness = scdbfinder_readiness(adata) + if not dep_status.get("packages", {}).get("scDblFinder", False) or not dep_status.get( + "packages", {} + ).get("DropletUtils", False): + dbl_result = { + "ok": False, + "reason": "missing_r_dependencies", + "error": "scDblFinder and DropletUtils must both be available in the selected R runtime.", + } + elif not args.rscript and not command_path("Rscript"): + dbl_result = {"ok": False, "reason": "rscript_missing", "error": "Rscript not found"} + elif not dbl_readiness["ok"]: + dbl_result = { + "ok": False, + "reason": dbl_readiness["reason"], + "error": 
dbl_readiness["detail"], + "readiness": dbl_readiness, + } + else: + dbl_result = run_scdbfinder( + matrix_dir, dbl_out, timeout=args.timeout_seconds, rscript=args.rscript + ) + if dbl_result["ok"] and dbl_out.exists(): + dbl_df = pd.read_csv(dbl_out) + dbl_df = dbl_df.set_index("barcode").reindex(adata.obs_names) + adata.obs["doublet_score"] = dbl_df["scDblFinder_score"].to_numpy() + adata.obs["doublet_class"] = dbl_df["scDblFinder_class"].fillna("unknown").to_numpy() + else: + adata.obs["doublet_score"] = np.nan + adata.obs["doublet_class"] = "blocked" + write_text( + output_root / "qc" / "doublet_blocker.txt", + f"{scdbfinder_blocker_reason(dbl_result)}\n" + f"Command status: {json.dumps(dbl_result, indent=2, sort_keys=True)}\n", + ) + + emptydrops_result = {"ok": False, "reason": "raw droplet matrix not provided"} + if raw_matrix_dir is None: + write_text( + output_root / "qc" / "emptydrops_skip.txt", + "emptyDrops skipped: no raw droplet matrix directory was provided.\n", + ) + elif not dep_status.get("packages", {}).get("DropletUtils", False): + write_text( + output_root / "qc" / "emptydrops_skip.txt", + "emptyDrops skipped: DropletUtils is not available in the current R runtime.\n", + ) + emptydrops_result = {"ok": False, "reason": "DropletUtils unavailable"} + else: + emptydrops_out = output_root / "qc" / "emptydrops_calls.csv" + emptydrops_result = run_emptydrops( + raw_matrix_dir, emptydrops_out, timeout=args.timeout_seconds, rscript=args.rscript + ) + if emptydrops_result.get("ok") and emptydrops_out.exists(): + ed_df = pd.read_csv(emptydrops_out) + ed_df = ed_df.set_index("barcode").reindex(adata.obs_names) + adata.obs["emptydrops_fdr"] = ed_df["FDR"].to_numpy() + else: + write_text( + output_root / "qc" / "emptydrops_skip.txt", + "emptyDrops attempted but did not complete successfully.\n" + f"Command status: {json.dumps(emptydrops_result, indent=2, sort_keys=True)}\n", + ) + + ambient_note = ( + "Ambient RNA estimation was not executed. 
A raw droplet matrix plus a supported ambient backend are required for robust modeling; " + "ambient-derived hard filters were not applied in this run." + ) + adata.obs["ambient_flag"] = False + write_text(output_root / "qc" / "ambient_limitations.md", ambient_note + "\n") + + adata.obs["plot_include"] = adata.obs["passes_qc"] & adata.obs["doublet_class"].ne("doublet") + qc_metrics = adata.obs[ + [ + "barcode", + "total_counts", + "n_genes_by_counts", + "pct_counts_mt", + "passes_qc", + "high_gene_outlier", + "high_count_outlier", + "doublet_score", + "doublet_class", + "ambient_flag", + "plot_include", + ] + ].copy() + qc_metrics.to_csv(output_root / "qc" / "cell_qc_metrics.csv", index=False) + qc_summary_rows = [ + {"metric": "input_cells", "cells": int(adata.n_obs)}, + {"metric": "passes_qc", "cells": int(adata.obs["passes_qc"].sum())}, + {"metric": "plot_include", "cells": int(adata.obs["plot_include"].sum())}, + {"metric": "high_gene_outlier", "cells": int(adata.obs["high_gene_outlier"].sum())}, + {"metric": "high_count_outlier", "cells": int(adata.obs["high_count_outlier"].sum())}, + { + "metric": "doublet_blocked_or_called", + "cells": int(adata.obs["doublet_class"].ne("unknown").sum()), + }, + ] + write_tsv(output_root / "tables" / "cell_qc_summary.tsv", qc_summary_rows, ["metric", "cells"]) + plot_count_summary( + qc_summary_rows, + output_root / "qc" / "qc_pass_fail_counts.png", + title="scRNA QC Cell Counts", + ) + + plot_view = adata[adata.obs["plot_include"].to_numpy()].copy() + fallback_note = None + if plot_view.n_obs == 0: + nonzero_mask = adata.obs["total_counts"].to_numpy() > 0 + if int(nonzero_mask.sum()) > 0: + plot_view = adata[nonzero_mask].copy() + fallback_note = ( + "No cells passed the default QC mask. Nonzero-count cells were used for visualization-only outputs so the " + "artifact bundle remains reviewable." + ) + else: + plot_view = adata[:1].copy() + fallback_note = "No nonzero-count cells were available. 
A one-cell placeholder view was used for visualization-only outputs." + if fallback_note: + write_text(output_root / "qc" / "plot_view_fallback.txt", fallback_note + "\n") + + if plot_view.n_obs >= 3: + sc.pp.normalize_total(plot_view, target_sum=1e4) + sc.pp.log1p(plot_view) + sc.pp.highly_variable_genes(plot_view, n_top_genes=2000, flavor="seurat", subset=False) + sc.tl.pca(plot_view, use_highly_variable=True, svd_solver="arpack") + sc.pp.neighbors(plot_view, n_neighbors=15, n_pcs=30) + sc.tl.umap(plot_view) + sc.tl.leiden(plot_view, resolution=args.resolution, key_added="leiden") + score_columns = add_marker_scores(plot_view) + labels = assign_labels(plot_view, score_columns) + plot_view.obs["coarse_label"] = labels["coarse_label"].to_numpy() + plot_view.obs["label_confidence"] = labels["label_confidence"].to_numpy() + marker_summary = [] + if score_columns: + for score_name in score_columns: + marker_summary.append( + { + "score_name": score_name, + "n_cells_positive": int((plot_view.obs[score_name] > 0).sum()), + } + ) + reference_manifest = { + "mode": "marker_based_fallback", + "reason": "No matched reference atlas was provided in the input bundle; conservative PBMC marker scoring was used.", + "score_columns": score_columns, + } + + fig, ax = plt.subplots(figsize=(7.5, 6.5)) + sc.pl.umap( + plot_view, + color="coarse_label", + ax=ax, + show=False, + frameon=False, + legend_loc="right margin", + ) + fig.tight_layout() + fig.savefig(output_root / "plots" / "umap_global.png", dpi=160) + plt.close(fig) + + fig, ax = plt.subplots(figsize=(7.5, 6.5)) + sc.pl.umap( + plot_view, color="leiden", ax=ax, show=False, frameon=False, legend_loc="on data" + ) + fig.tight_layout() + fig.savefig(output_root / "plots" / "umap_by_coarse_label.png", dpi=160) + plt.close(fig) + else: + score_columns = [] + plot_view.obs["leiden"] = [str(i) for i in range(plot_view.n_obs)] + plot_view.obs["coarse_label"] = "unknown" + plot_view.obs["label_confidence"] = 0.0 + coords = 
np.zeros((plot_view.n_obs, 2), dtype=float) + if plot_view.n_obs > 1: + coords[:, 0] = np.arange(plot_view.n_obs, dtype=float) + plot_view.obsm["X_umap"] = coords + marker_summary = [] + reference_manifest = { + "mode": "tiny_dataset_fallback", + "reason": "Fewer than 3 cells were available for embedding; simple coordinates were emitted so the portable review bundle stays intact.", + "score_columns": score_columns, + } + plot_simple_embedding( + coords, + plot_view.obs["coarse_label"], + output_root / "plots" / "umap_global.png", + title="UMAP by Coarse Label", + ) + plot_simple_embedding( + coords, + plot_view.obs["leiden"], + output_root / "plots" / "umap_by_coarse_label.png", + title="UMAP by Leiden Cluster", + ) + + plot_view.obs.to_csv(output_root / "annotation" / "cell_labels.csv") + umap_coords = pd.DataFrame( + plot_view.obsm["X_umap"], index=plot_view.obs_names, columns=["UMAP1", "UMAP2"] + ) + umap_coords.to_csv(output_root / "embeddings" / "umap_coords.csv") + pd.DataFrame(marker_summary).to_csv( + output_root / "annotation" / "marker_summary.csv", index=False + ) + write_json(output_root / "annotation" / "reference_manifest.json", reference_manifest) + + adata.write_h5ad(output_root / "analysis_with_flags.h5ad", compression="gzip") + plot_view.write_h5ad(output_root / "filtered_view.h5ad", compression="gzip") + + visualization_outputs = write_scrna_visual_bundle(output_root, None) + review_notebook_path = output_root / visualization_outputs["review_notebook"] + review_app_info = ( + maybe_launch_review_app(args, output_root, review_notebook_path) + if review_notebook_path.exists() + else None + ) + analysis_status = build_analysis_status( + dataset_metadata=dataset_metadata, dbl_result=dbl_result, review_app_info=review_app_info + ) + write_json(output_root / "provenance" / "analysis_status.json", analysis_status) + + versions = package_versions() + write_json(output_root / "versions" / "software_versions.json", versions) + write_json(output_root / 
"provenance" / "package_versions.json", versions) + write_json( + output_root / "provenance" / "run_manifest.json", + { + "started_at": started_at, + "finished_at": now_iso(), + "input_dir": str(input_dir), + "matrix_dir": str(matrix_dir), + "raw_matrix_dir": str(raw_matrix_dir) if raw_matrix_dir else None, + "metadata_path": str(metadata_path), + "thresholds": thresholds, + "resolution": args.resolution, + "doublet_result": dbl_result, + "emptydrops_result": emptydrops_result, + "analysis_status": analysis_status, + "runtime_environment": RUNTIME_ENV, + "review_app": review_app_info, + }, + ) + + visualization_outputs = write_scrna_visual_bundle(output_root, review_app_info) + write_summary_md( + output_root, + dataset_metadata=dataset_metadata, + thresholds=thresholds, + qc_summary_rows=qc_summary_rows, + analysis_status=analysis_status, + review_app_info=review_app_info, + ) + write_standard_manifest( + output_root, + run_id=output_root.name, + lane="scrnaseq_post_count_qc", + workflow="local_light_scanpy_qc", + status="completed", + execute_requested=True, + validation={"ok": True, "errors": [], "warnings": [], **validation}, + tool_preflight_result=combined_tool_preflight_status(python_dep_status, dep_status), + dry_run={"ok": True, "detail": "matrix and metadata validation completed"}, + execution={"ok": True, "detail": "post-count QC completed"}, + inputs={ + "input_dir": str(input_dir), + "matrix_dir": str(matrix_dir), + "raw_matrix_dir": str(raw_matrix_dir) if raw_matrix_dir else None, + "dataset_metadata": str(metadata_path), + "manifest": str(manifest_path), + }, + outputs={ + "analysis_h5ad": "analysis_with_flags.h5ad", + "filtered_h5ad": "filtered_view.h5ad", + "qc_summary": "tables/cell_qc_summary.tsv", + "labels": "annotation/cell_labels.csv", + "visualization_manifest": "visualizations/visualization_manifest.json", + "versions": "versions/software_versions.json", + }, + method={ + "thresholds": thresholds, + "resolution": args.resolution, + 
"doublet_method": "scDblFinder", + "emptydrops_enabled": raw_matrix_dir is not None, + "ambient_step": "explicit_skip_unless_raw_and_backend_available", + }, + review_bundle=visualization_outputs, + ) + write_json(output_root / "artifact_index.json", build_artifact_index(output_root)) + return output_root, { + "cells_input": int(adata.n_obs), + "cells_plot_include": int(plot_view.n_obs), + "doublet_ok": dbl_result["ok"], + "emptydrops_ok": emptydrops_result.get("ok", False), + "review_app_url": review_app_info.get("url") if review_app_info else None, + } + + +def main() -> int: + args = parse_args() + out_dir, summary = run_pipeline(args) + print(json.dumps({"output_dir": str(out_dir), "summary": summary}, indent=2)) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/plugins/ngs-analysis/scripts/run_shotgun_metagenomics.py b/plugins/ngs-analysis/scripts/run_shotgun_metagenomics.py new file mode 100644 index 0000000..bb578fc --- /dev/null +++ b/plugins/ngs-analysis/scripts/run_shotgun_metagenomics.py @@ -0,0 +1,1377 @@ +#!/usr/bin/env python3 +"""Run or plan shotgun metagenomics Kraken2/Bracken/HUMAnN backend artifacts.""" + +from __future__ import annotations + +import argparse +import html +import re +from pathlib import Path +from typing import Any + +import ngs_reference_manager +from ngs_planner_utils import ( + command_plan_entry, + normalize_sample_name, + read_table, + resolve_path, + shell_join, + write_command_script, + write_tsv, +) +from ngs_run_utils import ( + build_artifact_index, + run_cmd, + slug_timestamp, + software_versions, + tool_preflight, + write_json, + write_standard_manifest, + write_text, +) +from ngs_visualization_utils import artifact_entry, write_visualization_index + +WORKSPACE_ROOT = Path.cwd() +DEFAULT_RUN_ROOT = WORKSPACE_ROOT / "ngs_runs" / "shotgun_metagenomics_backend" + + +def parse_float(value: Any) -> float: + text = str(value).strip().replace(",", "") + if not text or text.lower() in {"-", 
"na", "nan", "none"}: + return 0.0 + try: + return float(text) + except ValueError: + return 0.0 + + +def humann_database_paths(root: Path) -> tuple[Path, Path]: + chocophlan = root / "chocophlan" + uniref = root / "uniref" + if chocophlan.is_dir() and uniref.is_dir(): + return chocophlan, uniref + return root, root + + +def resolve_bracken_read_length( + bracken_db: Path, requested_read_length: int +) -> tuple[int, str | None]: + requested_path = bracken_db / f"database{requested_read_length}mers.kmer_distrib" + if requested_path.exists(): + return requested_read_length, None + available_lengths = sorted( + int(match.group(1)) + for path in bracken_db.glob("database*mers.kmer_distrib") + if (match := re.match(r"database(\d+)mers\.kmer_distrib$", path.name)) + ) + if not available_lengths: + return requested_read_length, None + selected = min(available_lengths, key=lambda value: abs(value - requested_read_length)) + if selected == requested_read_length: + return selected, None + return ( + selected, + f"Bracken database lacks database{requested_read_length}mers.kmer_distrib; using available read length {selected} instead.", + ) + + +def validate_inputs(args: argparse.Namespace) -> tuple[dict[str, Any], list[dict[str, str]]]: + sample_sheet = args.sample_sheet.expanduser().resolve() + errors: list[str] = [] + warnings: list[str] = [] + rows: list[dict[str, str]] = [] + columns: list[str] = [] + samples: list[dict[str, str]] = [] + if not sample_sheet.exists(): + errors.append(f"sample sheet does not exist: {sample_sheet}") + else: + rows, columns = read_table(sample_sheet) + kraken_db = args.kraken_db.expanduser().resolve() if args.kraken_db else None + if not kraken_db: + errors.append("--kraken-db is required for taxonomic classification") + elif not kraken_db.exists(): + errors.append(f"Kraken2 database does not exist: {kraken_db}") + bracken_db = args.bracken_db.expanduser().resolve() if args.bracken_db else kraken_db + if args.run_bracken and bracken_db and not 
bracken_db.exists(): + errors.append(f"Bracken database path does not exist: {bracken_db}") + humann_db = args.humann_db.expanduser().resolve() if args.humann_db else None + if args.run_humann and not humann_db: + errors.append("--run-humann requires --humann-db") + if humann_db and not humann_db.exists(): + errors.append(f"HUMAnN database root does not exist: {humann_db}") + if humann_db and humann_db.exists() and args.run_humann: + nucleotide_db, protein_db = humann_database_paths(humann_db) + if nucleotide_db == humann_db and protein_db == humann_db: + warnings.append( + "HUMAnN database root does not expose chocophlan/uniref subdirectories; runner will pass the root directly." + ) + host_reference = args.host_reference.expanduser().resolve() if args.host_reference else None + if host_reference and not host_reference.exists(): + errors.append(f"host reference does not exist: {host_reference}") + if not args.metadata: + warnings.append( + "no metadata table was supplied; diversity and differential-abundance interpretation will be limited" + ) + for row_index, row in enumerate(rows, start=2): + sample = normalize_sample_name( + row.get("sample") or row.get("sample_id"), f"row_{row_index}" + ) + r1 = resolve_path(row.get("r1") or row.get("fastq_1"), sample_sheet.parent) + r2 = resolve_path(row.get("r2") or row.get("fastq_2"), sample_sheet.parent) + if not r1: + errors.append(f"row {row_index}: r1/fastq_1 is required") + continue + if not r1.exists(): + errors.append(f"row {row_index}: R1 FASTQ does not exist: {r1}") + if r2 and not r2.exists(): + errors.append(f"row {row_index}: R2 FASTQ does not exist: {r2}") + samples.append( + { + "sample": sample, + "r1": str(r1), + "r2": str(r2) if r2 else "", + "row_index": str(row_index), + } + ) + if not samples: + errors.append("no usable shotgun samples found") + validation = { + "ok": not errors, + "sample_sheet": str(sample_sheet), + "metadata": str(args.metadata.expanduser().resolve()) if args.metadata else None, + 
"sample_count": len(samples), + "kraken_db": str(kraken_db) if kraken_db else None, + "bracken_db": str(bracken_db) if bracken_db else None, + "humann_db": str(humann_db) if humann_db else None, + "host_reference": str(host_reference) if host_reference else None, + "columns": columns, + "errors": errors, + "warnings": warnings, + } + return validation, samples + + +def _missing_required_resources(resources: list[dict[str, Any]]) -> list[dict[str, Any]]: + return [ + { + "kind": item["kind"], + "bundle": item["bundle"], + "root": item["root"], + "missing": item["check"].get("missing", []), + "error": item["check"].get("error"), + } + for item in resources + if item.get("blocking") + ] + + +def promote_requested_database_steps( + plan: dict[str, Any], args: argparse.Namespace +) -> dict[str, Any]: + requested = { + "bracken_standard": args.run_bracken, + "humann_uniref90": args.run_humann, + } + if not any(requested.values()): + return plan + updated = dict(plan) + resources: list[dict[str, Any]] = [] + for resource in plan.get("resources", []): + item = dict(resource) + if requested.get(str(item.get("bundle"))): + item["required"] = True + item["blocking"] = not bool(item.get("ok")) + resources.append(item) + updated["resources"] = resources + updated["missing_required"] = _missing_required_resources(resources) + updated["ok"] = not updated["missing_required"] + return updated + + +def resource_blockers(resource_plan: dict[str, Any] | None) -> list[str]: + if resource_plan is None or resource_plan.get("ok"): + return [] + blockers = [] + for item in resource_plan.get("missing_required", []): + detail = item.get("error") or ", ".join(item.get("missing", [])) or "root not configured" + blockers.append( + f"required {item.get('kind')} bundle `{item.get('bundle')}` is not ready: {detail}" + ) + return blockers + + +def write_resource_plan(args: argparse.Namespace, run_dir: Path) -> dict[str, Any] | None: + if args.skip_resource_plan: + return None + roots = 
{"kraken2_standard": args.kraken_db.expanduser().resolve()} + if args.run_bracken or args.bracken_db: + roots["bracken_standard"] = (args.bracken_db or args.kraken_db).expanduser().resolve() + if args.run_humann and args.humann_db: + roots["humann_uniref90"] = args.humann_db.expanduser().resolve() + plan = ngs_reference_manager.plan_pipeline_resources( + "shotgun_metagenomics", + bundle_roots=roots, + include_optional=args.include_optional_resources or args.run_bracken or args.run_humann, + include_checksums=args.resource_checksums, + ) + plan = promote_requested_database_steps(plan, args) + run_root = run_dir.resolve() + outputs = ngs_reference_manager.write_resource_plan_outputs(plan, run_root / "resources") + plan["outputs"] = { + key: str(Path(value).resolve().relative_to(run_root)) for key, value in outputs.items() + } + return plan + + +def merge_resource_status( + validation: dict[str, Any], resource_plan: dict[str, Any] | None +) -> dict[str, Any]: + merged = dict(validation) + errors = list(merged.get("errors", [])) + warnings = list(merged.get("warnings", [])) + if resource_plan is None: + merged["resource_plan_ok"] = None + merged["resource_plan_skipped"] = True + warnings.append( + "resource readiness plan was skipped; database roots were not checked against the registry contract" + ) + else: + merged["resource_plan_ok"] = bool(resource_plan.get("ok")) + merged["resource_plan_skipped"] = False + merged["resource_plan_path"] = resource_plan.get("outputs", {}).get("resource_plan") + merged["missing_required_resources"] = resource_plan.get("missing_required", []) + errors.extend(resource_blockers(resource_plan)) + merged["errors"] = errors + merged["warnings"] = warnings + merged["ok"] = bool(validation.get("ok")) and ( + resource_plan is None or bool(resource_plan.get("ok")) + ) + return merged + + +def build_plan(args: argparse.Namespace, samples: list[dict[str, str]]) -> list[dict[str, Any]]: + plan: list[dict[str, Any]] = [] + kraken_db = 
args.kraken_db.expanduser().resolve() if args.kraken_db else "MISSING_KRAKEN_DB" + bracken_db = args.bracken_db.expanduser().resolve() if args.bracken_db else kraken_db + bracken_read_length, _ = ( + resolve_bracken_read_length(bracken_db, args.read_length) + if isinstance(bracken_db, Path) + else (args.read_length, None) + ) + humann_nucleotide_db, humann_protein_db = ( + humann_database_paths(args.humann_db.expanduser().resolve()) + if args.humann_db + else (Path("MISSING_HUMANN_DB"), Path("MISSING_HUMANN_DB")) + ) + for sample in samples: + name = sample["sample"] + current_r1 = sample["r1"] + current_r2 = sample["r2"] + if args.host_reference: + clean_r1, clean_r2 = host_depleted_paths(sample) + plan.append(build_host_depletion_step(args, sample, clean_r1, clean_r2)) + current_r1 = clean_r1 + current_r2 = clean_r2 + kraken_cmd: list[str | Path] = [ + "kraken2", + "--db", + kraken_db, + "--threads", + str(args.threads), + "--report", + f"taxonomic_classification/{name}.kraken.report", + "--output", + f"taxonomic_classification/{name}.kraken.output", + ] + if current_r2: + kraken_cmd.extend(["--paired", current_r1, current_r2]) + else: + kraken_cmd.append(current_r1) + plan.append( + command_plan_entry( + f"{name}: kraken2 classify", + kraken_cmd, + outputs=[ + f"taxonomic_classification/{name}.kraken.report", + f"taxonomic_classification/{name}.kraken.output", + ], + ) + ) + if args.run_bracken: + plan.append( + command_plan_entry( + f"{name}: bracken abundance", + [ + "bracken", + "-d", + bracken_db, + "-i", + f"taxonomic_classification/{name}.kraken.report", + "-o", + f"taxonomic_classification/{name}.bracken.tsv", + "-r", + str(bracken_read_length), + "-l", + args.bracken_level, + ], + outputs=[f"taxonomic_classification/{name}.bracken.tsv"], + ) + ) + if args.run_humann: + humann_input = current_r1 if not current_r2 else f"workflow/{name}.paired.fastq" + if current_r2: + plan.append( + command_plan_entry( + f"{name}: concatenate paired reads for HUMAnN", + 
def host_depleted_paths(sample: dict[str, str]) -> tuple[str, str]:
    """Return the run-relative FASTQ paths expected after host depletion.

    Args:
        sample: Sample record with a ``sample`` name and an optional ``r2``.

    Returns:
        ``(clean_r1, clean_r2)``. For a sample without a truthy ``r2`` the
        second element is an empty string and the first uses the
        ``.clean.fastq`` name.
    """
    prefix = f"host_depletion/{sample['sample']}"
    if not sample.get("r2"):
        return f"{prefix}.clean.fastq", ""
    return f"{prefix}.clean_R1.fastq", f"{prefix}.clean_R2.fastq"
clean_r2, out_dir] + ) + kneaddata_cmd = [ + "kneaddata", + "--input", + sample["r1"], + "--reference-db", + reference, + "--output", + out_dir, + "--output-prefix", + name, + "--threads", + str(args.threads), + ] + command = " && ".join( + [ + f"mkdir -p {shell_join([out_dir])}", + shell_join(kneaddata_cmd), + f"clean=$(find {shell_join([out_dir])} -type f \\( -name '*kneaddata.fastq' -o -name '*kneaddata.fastq.gz' -o -name '*clean.fastq' -o -name '*clean.fastq.gz' \\) | head -n 1)", + 'test -n "$clean"', + f'ln -sf "$PWD/$clean" {shell_join([clean_r1])}', + ] + ) + return command_plan_entry( + f"{name}: KneadData host depletion", command, outputs=[clean_r1, out_dir] + ) + + +def parse_bracken_table(path: Path, sample: str | None = None) -> list[dict[str, Any]]: + rows, columns = read_table(path) + sample_name = sample or path.name.replace(".bracken", "").replace(".tsv", "").replace( + ".txt", "" + ) + if not columns: + return [] + lower_columns = {column.lower(): column for column in columns} + name_col = ( + lower_columns.get("name") + or lower_columns.get("taxonomy") + or lower_columns.get("taxon") + or columns[0] + ) + taxid_col = ( + lower_columns.get("taxonomy_id") + or lower_columns.get("taxid") + or lower_columns.get("taxon_id") + or "" + ) + rank_col = ( + lower_columns.get("taxonomy_lvl") + or lower_columns.get("rank") + or lower_columns.get("level") + or "" + ) + reads_col = ( + lower_columns.get("new_est_reads") + or lower_columns.get("reads") + or lower_columns.get("kraken_assigned_reads") + or "" + ) + fraction_col = ( + lower_columns.get("fraction_total_reads") + or lower_columns.get("fraction") + or lower_columns.get("relative_abundance") + or "" + ) + parsed = [] + for row in rows: + taxon = (row.get(name_col) or "").strip() + if not taxon: + continue + parsed.append( + { + "sample": sample_name, + "taxon": taxon, + "taxonomy_id": (row.get(taxid_col) or "").strip() if taxid_col else "", + "taxonomy_lvl": (row.get(rank_col) or "").strip() if 
def write_matrix(
    path: Path,
    matrix: dict[tuple[str, str, str], dict[str, float]],
    samples: list[str],
    value_label: str,
) -> int:
    """Write a taxon-by-sample matrix as TSV, largest row total first.

    Args:
        path: Destination TSV path.
        matrix: Maps ``(taxon, taxonomy_id, taxonomy_lvl)`` to per-sample values.
        samples: Sample columns to emit, in order; absent values become ``0``.
        value_label: Accepted for interface compatibility; not used here.

    Returns:
        Number of rows; nothing is written when the matrix is empty.
    """
    ranked = sorted(matrix.items(), key=lambda entry: sum(entry[1].values()), reverse=True)
    rows = []
    for (taxon, taxid, rank), per_sample in ranked:
        row: dict[str, Any] = {"taxon": taxon, "taxonomy_id": taxid, "taxonomy_lvl": rank}
        row.update({sample: f"{per_sample.get(sample, 0.0):.8g}" for sample in samples})
        rows.append(row)
    if rows:
        write_tsv(path, rows, ["taxon", "taxonomy_id", "taxonomy_lvl", *samples])
    return len(rows)


def merge_bracken_outputs(run_dir: Path, samples: list[dict[str, str]]) -> dict[str, Any]:
    """Merge per-sample Bracken tables into read and fraction matrices.

    For each sample the first existing file of ``<sample>.bracken.tsv`` and
    ``<sample>.bracken.txt`` under ``taxonomic_classification/`` is parsed.
    The summary is always written to ``tables/bracken_summary.json``.

    Args:
        run_dir: Run directory holding ``taxonomic_classification/``.
        samples: Sample records; only the ``sample`` name is read.

    Returns:
        The summary mapping, with status ``created`` or ``not_available``.
    """
    table_dir = run_dir / "taxonomic_classification"
    sample_names = [row["sample"] for row in samples]
    observed: list[dict[str, Any]] = []
    parsed_tables: list[Path] = []
    for sample in sample_names:
        for candidate in (table_dir / f"{sample}.bracken.tsv", table_dir / f"{sample}.bracken.txt"):
            if candidate.exists():
                observed.extend(parse_bracken_table(candidate, sample=sample))
                parsed_tables.append(candidate)
                break

    if not observed:
        summary = {
            "status": "not_available",
            "input_tables": [],
            "taxa": 0,
            "samples": sample_names,
            "outputs": {},
            "note": "No Bracken tables were found under taxonomic_classification/*.bracken.tsv.",
        }
        write_json(run_dir / "tables" / "bracken_summary.json", summary)
        return summary

    read_matrix: dict[tuple[str, str, str], dict[str, float]] = {}
    fraction_matrix: dict[tuple[str, str, str], dict[str, float]] = {}
    for row in observed:
        key = (row["taxon"], row["taxonomy_id"], row["taxonomy_lvl"])
        sample = row["sample"]
        reads = read_matrix.setdefault(key, {})
        reads[sample] = reads.get(sample, 0.0) + float(row["est_reads"])
        fractions = fraction_matrix.setdefault(key, {})
        fractions[sample] = fractions.get(sample, 0.0) + float(row["fraction_total_reads"])

    read_count = write_matrix(
        run_dir / "tables" / "bracken_est_reads_matrix.tsv", read_matrix, sample_names, "est_reads"
    )
    fraction_count = write_matrix(
        run_dir / "tables" / "bracken_relative_abundance_matrix.tsv",
        fraction_matrix,
        sample_names,
        "fraction_total_reads",
    )
    summary = {
        "status": "created",
        # Report the files that were actually read. Rebuilding the name from
        # the sample always produced a ".bracken.tsv" path, even when the
        # ".bracken.txt" candidate was the one parsed.
        "input_tables": sorted({str(table) for table in parsed_tables}),
        "taxa": max(read_count, fraction_count),
        "samples": sample_names,
        "outputs": {
            "est_reads_matrix": "tables/bracken_est_reads_matrix.tsv",
            "relative_abundance_matrix": "tables/bracken_relative_abundance_matrix.tsv",
        },
    }
    write_json(run_dir / "tables" / "bracken_summary.json", summary)
    return summary


def infer_humann_sample(path: Path, sample_names: list[str]) -> str:
    """Guess which sample a HUMAnN table belongs to from its location.

    The parent directory name is tried first, then the file stem. Within each,
    an exact sample name wins; otherwise the longest sample name that prefixes
    it is chosen, so ``S10_pathabundance`` maps to ``S10`` rather than ``S1``.

    Args:
        path: Path of the HUMAnN table.
        sample_names: Known sample names.

    Returns:
        The matched sample name. Without a match, the file stem with the
        ``_pathabundance``/``_genefamilies``/``_abundance`` markers removed,
        or the parent directory name when that leaves nothing.
    """
    for part in (path.parent.name, path.stem):
        if part in sample_names:
            return part
        prefixes = [sample for sample in sample_names if part.startswith(sample)]
        if prefixes:
            return max(prefixes, key=len)
    stem = path.stem
    for suffix in ["_pathabundance", "_genefamilies", "_abundance"]:
        stem = stem.replace(suffix, "")
    return stem or path.parent.name
def find_humann_tables(run_dir: Path, label: str) -> list[Path]:
    """Locate HUMAnN output tables of one kind under ``functional_profile/``.

    Args:
        run_dir: Run directory.
        label: ``"pathabundance"`` or ``"genefamilies"``.

    Returns:
        Sorted, de-duplicated file paths matching the label's name patterns at
        any depth; an empty list when the directory does not exist.

    Raises:
        KeyError: If the directory exists and ``label`` is not a known kind.
    """
    root = run_dir / "functional_profile"
    if not root.exists():
        return []
    patterns = {
        "pathabundance": ["*pathabundance*.tsv", "*path_abundance*.tsv"],
        "genefamilies": ["*genefamilies*.tsv", "*gene_families*.tsv"],
    }[label]
    # A set drops files matched by more than one pattern.
    matches = {path for pattern in patterns for path in root.rglob(pattern) if path.is_file()}
    return sorted(matches)


def write_humann_matrix(path: Path, matrix: dict[str, dict[str, float]], samples: list[str]) -> int:
    """Write a feature-by-sample matrix as TSV, largest row total first.

    Args:
        path: Destination TSV path.
        matrix: Maps feature name to per-sample values.
        samples: Preferred sample column order. Samples that appear only in
            ``matrix`` are appended after these, in sorted order.

    Returns:
        Number of rows; nothing is written when the matrix is empty.
    """
    observed_samples = sorted({sample for values in matrix.values() for sample in values})
    # Work on a copy: appending to ``samples`` itself would silently grow the
    # caller's list, which is reused for the next matrix and for the summary.
    columns = list(samples) if samples else observed_samples
    columns.extend([sample for sample in observed_samples if sample not in columns])

    ranked = sorted(matrix.items(), key=lambda entry: sum(entry[1].values()), reverse=True)
    rows = []
    for feature, values in ranked:
        row: dict[str, Any] = {"feature": feature}
        row.update({sample: f"{values.get(sample, 0.0):.8g}" for sample in columns})
        rows.append(row)
    if rows:
        write_tsv(path, rows, ["feature", *columns])
    return len(rows)
def summarize_backend_outputs(run_dir: Path, samples: list[dict[str, str]]) -> dict[str, Any]:
    """Merge the Bracken and HUMAnN outputs and return both summaries.

    The result has a ``bracken`` key and a ``humann`` key, produced in that
    order by ``merge_bracken_outputs`` and ``merge_humann_outputs``.
    """
    bracken_summary = merge_bracken_outputs(run_dir, samples)
    humann_summary = merge_humann_outputs(run_dir, samples)
    return {"bracken": bracken_summary, "humann": humann_summary}


def read_abundance_matrix(
    path: Path, feature_column: str = "feature"
) -> tuple[list[str], list[dict[str, Any]]]:
    """Load a merged abundance matrix and add a per-row total.

    Args:
        path: Matrix TSV; its first column holds the feature name.
        feature_column: Key under which the feature name is stored in each row.

    Returns:
        ``(sample_columns, rows)``. ``sample_columns`` is every column after
        the first except ``taxonomy_id`` and ``taxonomy_lvl``. Each row holds
        the feature, both taxonomy fields (empty string when absent),
        ``total_abundance`` and one parsed value per sample. Both lists are
        empty when the file is missing or has no columns.
    """
    if not path.exists():
        return [], []
    rows, columns = read_table(path)
    if not columns:
        return [], []

    first_col = columns[0]
    taxonomy_columns = {"taxonomy_id", "taxonomy_lvl"}
    sample_columns = [column for column in columns[1:] if column not in taxonomy_columns]

    parsed: list[dict[str, Any]] = []
    for row in rows:
        # Guard against a None cell the same way the Bracken table parser in
        # this file does, rather than failing on ``None.strip()``.
        # NOTE(review): assumes read_table can yield None for a short row.
        feature = (row.get(first_col) or "").strip()
        if not feature:
            continue
        values = {sample: parse_float(row.get(sample, "")) for sample in sample_columns}
        parsed.append(
            {
                feature_column: feature,
                "taxonomy_id": row.get("taxonomy_id", ""),
                "taxonomy_lvl": row.get("taxonomy_lvl", ""),
                "total_abundance": sum(values.values()),
                **values,
            }
        )
    return sample_columns, parsed
fieldnames = [ + key, + "total_abundance", + *( + [ + column + for column in ["taxonomy_id", "taxonomy_lvl"] + if any(row.get(column) for row in top) + ] + ), + *sample_columns, + ] + write_tsv(path, top, fieldnames) + return len(top) + + +def write_backend_bar_svg( + path: Path, title: str, rows: list[dict[str, Any]], key: str, *, empty_message: str +) -> str: + path.parent.mkdir(parents=True, exist_ok=True) + values = [ + (str(row.get(key, "")), float(row.get("total_abundance", 0.0))) + for row in rows[:15] + if float(row.get("total_abundance", 0.0)) > 0 + ] + if not values: + body = f""" + + {html.escape(title)} + {html.escape(empty_message)} + +""" + path.write_text(body, encoding="utf-8") + return str(path) + width = 980 + row_height = 38 + height = 92 + row_height * len(values) + max_value = max(value for _, value in values) or 1.0 + lines = [ + f'', + '', + f'{html.escape(title)}', + ] + for index, (label, value) in enumerate(values): + y = 78 + index * row_height + width_value = max(2.0, min(470.0, value / max_value * 470.0)) + short_label = label if len(label) < 44 else label[:41] + "..." + lines.extend( + [ + f'{html.escape(short_label)}', + f'', + f'', + f'{value:.5g}', + ] + ) + lines.append("\n") + path.write_text("\n".join(lines), encoding="utf-8") + return str(path) + + +def write_shotgun_review_outputs(run_dir: Path) -> dict[str, Any]: + outputs: dict[str, str] = {} + notes: list[str] = [] + status = "not_available" + + def add_output(label: str, rel_path: str) -> None: + if (run_dir / rel_path).exists(): + outputs[label] = rel_path + + # Surface staged backend-like inputs and normalized matrices so the dashboard + # does not hide Kraken/Bracken/HUMAnN layers when they were supplied rather + # than executed in this environment. 
+ for path in sorted((run_dir / "taxonomic_classification").glob("*.kraken.report")): + outputs[f"kraken_report:{path.stem}"] = str(path.relative_to(run_dir)) + for path in sorted((run_dir / "taxonomic_classification").glob("*.bracken.tsv")): + outputs[f"bracken_table:{path.stem}"] = str(path.relative_to(run_dir)) + for path in sorted((run_dir / "functional_profile").rglob("*pathabundance*.tsv")): + outputs[f"humann_pathabundance:{path.parent.name}"] = str(path.relative_to(run_dir)) + for path in sorted((run_dir / "functional_profile").rglob("*genefamilies*.tsv")): + outputs[f"humann_genefamilies:{path.parent.name}"] = str(path.relative_to(run_dir)) + + add_output("kraken_top_taxa_table", "tables/kraken_top_taxa.tsv") + add_output("kraken_top_taxa_plot", "visualizations/kraken_top_taxa_barplot.png") + add_output("bracken_summary", "tables/bracken_summary.json") + add_output("bracken_est_reads_matrix", "tables/bracken_est_reads_matrix.tsv") + add_output("bracken_relative_abundance_matrix", "tables/bracken_relative_abundance_matrix.tsv") + add_output("humann_summary", "tables/humann_summary.json") + add_output("humann_pathabundance_matrix", "tables/humann_pathabundance_matrix.tsv") + add_output("humann_genefamilies_matrix", "tables/humann_genefamilies_matrix.tsv") + + bracken_samples, bracken_rows = read_abundance_matrix( + run_dir / "tables" / "bracken_relative_abundance_matrix.tsv", feature_column="taxon" + ) + if bracken_rows: + status = "created" + write_top_rows( + run_dir / "tables" / "top_bracken_taxa.tsv", bracken_rows, "taxon", bracken_samples + ) + write_backend_bar_svg( + run_dir / "visualizations" / "shotgun_top_taxa.svg", + "Shotgun Top Bracken Taxa", + sorted( + bracken_rows, key=lambda row: float(row.get("total_abundance", 0.0)), reverse=True + ), + "taxon", + empty_message="Bracken abundance matrix is not available.", + ) + outputs["top_bracken_taxa"] = "tables/top_bracken_taxa.tsv" + outputs["top_taxa_plot"] = "visualizations/shotgun_top_taxa.svg" + 
else: + notes.append( + "Bracken relative-abundance matrix is not available; top taxa plot remains unavailable." + ) + + pathway_samples, pathway_rows = read_abundance_matrix( + run_dir / "tables" / "humann_pathabundance_matrix.tsv", feature_column="feature" + ) + if pathway_rows: + status = "created" + write_top_rows( + run_dir / "tables" / "top_humann_pathways.tsv", pathway_rows, "feature", pathway_samples + ) + write_backend_bar_svg( + run_dir / "visualizations" / "shotgun_top_pathways.svg", + "Shotgun Top HUMAnN Pathways", + sorted( + pathway_rows, key=lambda row: float(row.get("total_abundance", 0.0)), reverse=True + ), + "feature", + empty_message="HUMAnN pathway matrix is not available.", + ) + outputs["top_humann_pathways"] = "tables/top_humann_pathways.tsv" + outputs["top_pathways_plot"] = "visualizations/shotgun_top_pathways.svg" + else: + notes.append("HUMAnN pathway matrix is not available.") + + gene_samples, gene_rows = read_abundance_matrix( + run_dir / "tables" / "humann_genefamilies_matrix.tsv", feature_column="feature" + ) + if gene_rows: + status = "created" + write_top_rows( + run_dir / "tables" / "top_humann_gene_families.tsv", gene_rows, "feature", gene_samples + ) + write_backend_bar_svg( + run_dir / "visualizations" / "shotgun_top_gene_families.svg", + "Shotgun Top HUMAnN Gene Families", + sorted(gene_rows, key=lambda row: float(row.get("total_abundance", 0.0)), reverse=True), + "feature", + empty_message="HUMAnN gene-family matrix is not available.", + ) + outputs["top_humann_gene_families"] = "tables/top_humann_gene_families.tsv" + outputs["top_gene_families_plot"] = "visualizations/shotgun_top_gene_families.svg" + else: + notes.append("HUMAnN gene-family matrix is not available.") + + if any( + label.startswith( + ("kraken_report:", "bracken_table:", "humann_pathabundance:", "humann_genefamilies:") + ) + for label in outputs + ): + notes.append( + "Dashboard rows include staged support inputs when Kraken/Bracken/HUMAnN outputs were 
supplied rather than executed locally." + ) + + dashboard_rows = [] + for label, rel_path in outputs.items(): + href = ( + rel_path.replace("visualizations/", "", 1) + if rel_path.startswith("visualizations/") + else f"../{rel_path}" + ) + dashboard_rows.append( + f'{html.escape(label)}{html.escape(rel_path)}' + ) + if not dashboard_rows: + dashboard_rows.append( + 'No database-derived shotgun review outputs are available yet.' + ) + dashboard = f""" + + + + Shotgun Metagenomics Backend Dashboard + + + +

    Shotgun Metagenomics Backend Dashboard

    +

    Native review of Kraken, Bracken, and HUMAnN inputs plus normalized downstream outputs. When local backends are unavailable, supplied support tables are surfaced alongside the derived matrices and plots.

    + {"".join(dashboard_rows)}
    ArtifactPath
    +

    Notes

    +
      {"".join(f"
    • {html.escape(note)}
    • " for note in notes)}
    + + +""" + dashboard_path = run_dir / "visualizations" / "shotgun_backend_dashboard.html" + dashboard_path.parent.mkdir(parents=True, exist_ok=True) + dashboard_path.write_text(dashboard, encoding="utf-8") + outputs["dashboard"] = "visualizations/shotgun_backend_dashboard.html" + summary = { + "status": status, + "outputs": outputs, + "notes": notes, + "bracken_taxa": len(bracken_rows), + "humann_pathways": len(pathway_rows), + "humann_gene_families": len(gene_rows), + } + write_json(run_dir / "tables" / "metagenomics_backend_review.json", summary) + return summary + + +def write_outputs( + run_dir: Path, + validation: dict[str, Any], + samples: list[dict[str, str]], + plan: list[dict[str, Any]], +) -> None: + write_tsv( + run_dir / "validation" / "samples.normalized.tsv", + samples, + ["sample", "r1", "r2", "row_index"], + ) + write_json(run_dir / "workflow" / "shotgun_backend_command_plan.json", {"commands": plan}) + write_command_script(run_dir / "commands.sh", [item["command"] for item in plan]) + write_json( + run_dir / "qc" / "metagenomics_database_status.json", + { + "kraken_db": validation.get("kraken_db"), + "bracken_db": validation.get("bracken_db"), + "humann_db": validation.get("humann_db"), + "host_reference": validation.get("host_reference"), + "warnings": validation.get("warnings", []), + }, + ) + summarize_backend_outputs(run_dir, samples) + write_shotgun_review_outputs(run_dir) + + +def execute_plan(run_dir: Path, plan: list[dict[str, Any]]) -> dict[str, Any]: + for dirname in ["taxonomic_classification", "functional_profile", "tables", "logs", "workflow"]: + (run_dir / dirname).mkdir(parents=True, exist_ok=True) + result: dict[str, Any] = {"ok": True, "steps": []} + for index, item in enumerate(plan, start=1): + step = run_cmd(["bash", "-c", item["command"]], run_dir, timeout=7200) + safe = item["name"].replace(":", "").replace(" ", "_").replace("/", "_") + write_json(run_dir / "logs" / f"{index:02d}_{safe}.json", step) + 
result["steps"].append({"name": item["name"], "ok": step.get("ok")}) + result["ok"] = bool(result["ok"] and step.get("ok")) + if not step.get("ok"): + break + return result + + +def write_visuals( + run_dir: Path, + status: str, + validation: dict[str, Any], + resource_plan: dict[str, Any] | None = None, +) -> dict[str, str]: + entries = [ + artifact_entry( + artifact_id="samples", + title="Shotgun Samples", + path="validation/samples.normalized.tsv", + kind="table", + status="created", + description="Normalized shotgun sample FASTQ manifest.", + ), + artifact_entry( + artifact_id="command_plan", + title="Backend Command Plan", + path="workflow/shotgun_backend_command_plan.json", + kind="json", + status="created", + description="Kraken2, Bracken, and HUMAnN execution commands.", + ), + artifact_entry( + artifact_id="database_status", + title="Database Status", + path="qc/metagenomics_database_status.json", + kind="json", + status="created", + description="Resolved database and host-reference paths.", + ), + artifact_entry( + artifact_id="host_depletion", + title="Host Depletion Outputs", + path="host_depletion", + kind="directory", + status="created" if (run_dir / "host_depletion").exists() else "not_available", + description="KneadData cleaned reads when a host reference is supplied.", + ), + artifact_entry( + artifact_id="kraken_reports", + title="Kraken Reports", + path="taxonomic_classification", + kind="directory", + status="created" + if (run_dir / "taxonomic_classification").exists() + else "not_available", + description="Taxonomic classification outputs after execution.", + ), + artifact_entry( + artifact_id="bracken_matrix", + title="Bracken Relative Abundance Matrix", + path="tables/bracken_relative_abundance_matrix.tsv", + kind="table", + status="created" + if (run_dir / "tables" / "bracken_relative_abundance_matrix.tsv").exists() + else "not_available", + description="Merged Bracken relative abundance by taxon and sample.", + ), + artifact_entry( + 
artifact_id="humann_pathabundance", + title="HUMAnN Pathway Matrix", + path="tables/humann_pathabundance_matrix.tsv", + kind="table", + status="created" + if (run_dir / "tables" / "humann_pathabundance_matrix.tsv").exists() + else "not_available", + description="Merged HUMAnN pathway abundance by feature and sample.", + ), + artifact_entry( + artifact_id="humann_genefamilies", + title="HUMAnN Gene Family Matrix", + path="tables/humann_genefamilies_matrix.tsv", + kind="table", + status="created" + if (run_dir / "tables" / "humann_genefamilies_matrix.tsv").exists() + else "not_available", + description="Merged HUMAnN gene-family abundance by feature and sample.", + ), + artifact_entry( + artifact_id="backend_summaries", + title="Backend Output Summaries", + path="tables", + kind="directory", + status="created", + description="JSON summaries documenting which Bracken/HUMAnN backend artifacts were found and normalized.", + ), + artifact_entry( + artifact_id="backend_review", + title="Backend Review Summary", + path="tables/metagenomics_backend_review.json", + kind="json", + status="created", + description="Native review summary for normalized Bracken/HUMAnN tables and plots.", + ), + artifact_entry( + artifact_id="backend_dashboard", + title="Backend Dashboard", + path="visualizations/shotgun_backend_dashboard.html", + kind="html", + status="created", + description="Native dashboard for taxonomic and functional backend outputs.", + ), + artifact_entry( + artifact_id="top_taxa_plot", + title="Top Taxa Plot", + path="visualizations/shotgun_top_taxa.svg", + kind="svg", + status="created" + if (run_dir / "visualizations" / "shotgun_top_taxa.svg").exists() + else "not_available", + description="Top Bracken taxa plot from normalized relative abundance matrix.", + ), + artifact_entry( + artifact_id="top_pathways_plot", + title="Top Pathways Plot", + path="visualizations/shotgun_top_pathways.svg", + kind="svg", + status="created" + if (run_dir / "visualizations" / 
"shotgun_top_pathways.svg").exists() + else "not_available", + description="Top HUMAnN pathway plot from normalized pathabundance matrix.", + ), + artifact_entry( + artifact_id="top_gene_families_plot", + title="Top Gene Families Plot", + path="visualizations/shotgun_top_gene_families.svg", + kind="svg", + status="created" + if (run_dir / "visualizations" / "shotgun_top_gene_families.svg").exists() + else "not_available", + description="Top HUMAnN gene-family plot from normalized genefamilies matrix.", + ), + ] + if resource_plan is not None: + entries.extend( + [ + artifact_entry( + artifact_id="resource_readiness", + title="Resource Readiness", + path="resources/resource_readiness.md", + kind="markdown", + status="created", + description="Database readiness gate for Kraken2, Bracken, and HUMAnN bundles.", + ), + artifact_entry( + artifact_id="resource_manifest", + title="Resource Manifest", + path="resources/resource_manifest.tsv", + kind="table", + status="created", + description="Resolved database roots, expected files, and missing-file counts.", + ), + artifact_entry( + artifact_id="resource_plan", + title="Resource Plan", + path="resources/resource_plan.json", + kind="json", + status="created", + description="Structured database readiness plan used to gate this run.", + ), + artifact_entry( + artifact_id="resource_setup_plan", + title="Resource Setup Plan", + path="resources/resource_setup_plan.md", + kind="markdown", + status="created", + description="Actionable setup checklist for missing Kraken2, Bracken, and HUMAnN bundles.", + ), + artifact_entry( + artifact_id="resource_setup_commands", + title="Resource Setup Commands", + path="resources/resource_setup_commands.sh", + kind="script", + status="created", + description="Reviewed shell skeleton with commented setup hints and validation commands.", + ), + ] + ) + index = write_visualization_index( + run_dir, + title="Shotgun Metagenomics Backend Review", + description="Review surface for taxonomic 
classification, Bracken abundance, HUMAnN functional profiles, and database provenance.", + entries=entries, + notes=[*validation.get("warnings", []), *resource_blockers(resource_plan)], + analysis_intent="real_analysis" if status != "blocked" else "blocked_preflight", + provenance_summary={ + "status": status, + "sample_count": validation.get("sample_count", 0), + "resource_plan_ok": validation.get("resource_plan_ok"), + }, + ) + return { + "visualization_index": str(index.relative_to(run_dir)), + "visualization_manifest": "visualizations/visualization_manifest.json", + } + + +def write_summary( + run_dir: Path, + status: str, + validation: dict[str, Any], + resource_plan: dict[str, Any] | None = None, +) -> None: + lines = [ + "# Shotgun Metagenomics Backend Run Summary", + "", + f"Status: `{status}`", + f"Samples parsed: `{validation.get('sample_count', 0)}`", + "", + "## Key Artifacts", + "", + "- `workflow/shotgun_backend_command_plan.json`", + "- `qc/metagenomics_database_status.json`", + "- `resources/resource_plan.json`, `resource_manifest.tsv`, `resource_env.sh`, `resource_readiness.md`, and resource setup-plan artifacts", + "- `host_depletion/` cleaned FASTQs when `--host-reference` is supplied", + "- `taxonomic_classification/*.kraken.report` and `*.bracken.tsv` when executed", + "- `tables/bracken_*_matrix.tsv` when Bracken outputs are available", + "- `tables/humann_*_matrix.tsv` when HUMAnN outputs are available", + "- `tables/metagenomics_backend_review.json`, `tables/top_*`, and `visualizations/shotgun_*` native backend review files", + "- `visualizations/index.html`", + "- `run_manifest.json` and `artifact_index.json`", + "", + ] + if validation.get("warnings"): + lines.extend(["## Warnings", ""]) + lines.extend(f"- {item}" for item in validation["warnings"]) + lines.append("") + if resource_plan is not None: + lines.extend(["## Resource Readiness", ""]) + lines.append(f"Ready: `{str(resource_plan.get('ok')).lower()}`") + lines.append( + f"Setup 
def parse_args() -> argparse.Namespace:
    """Parse the command-line options of the shotgun metagenomics backend runner."""
    # Declared as data so the option set reads as one table; the order is the
    # order in which the options are registered with argparse.
    option_specs: list[tuple[str, dict[str, Any]]] = [
        ("--sample-sheet", {"type": Path, "required": True}),
        ("--kraken-db", {"type": Path, "required": True}),
        ("--bracken-db", {"type": Path}),
        ("--run-bracken", {"action": argparse.BooleanOptionalAction, "default": True}),
        ("--bracken-level", {"default": "S"}),
        ("--read-length", {"type": int, "default": 150}),
        ("--run-humann", {"action": "store_true"}),
        ("--humann-db", {"type": Path}),
        ("--host-reference", {"type": Path}),
        ("--metadata", {"type": Path}),
        ("--threads", {"type": int, "default": 4}),
        (
            "--include-optional-resources",
            {
                "action": "store_true",
                "help": "Include optional database bundles in readiness output even if their analysis steps are not requested.",
            },
        ),
        (
            "--resource-checksums",
            {
                "action": "store_true",
                "help": "Compute checksums for database files below the reference-manager checksum threshold.",
            },
        ),
        (
            "--skip-resource-plan",
            {
                "action": "store_true",
                "help": "Skip registry-level database readiness checks and rely only on path/tool validation.",
            },
        ),
        ("--outdir", {"type": Path}),
        ("--run-id", {"default": slug_timestamp("shotgun-metagenomics-backend")}),
        ("--execute", {"action": "store_true"}),
    ]
    cli = argparse.ArgumentParser(description=__doc__)
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli.parse_args()


def serializable_args(args: argparse.Namespace) -> dict[str, Any]:
    """Return the parsed options as a dict with ``Path`` values turned into strings."""
    payload: dict[str, Any] = {}
    for option, raw in vars(args).items():
        payload[option] = str(raw) if isinstance(raw, Path) else raw
    return payload
validation, samples, plan) + dry_run = { + "ok": validation["ok"] and tool_status["ok"], + "detail": "shotgun sample, database, and tool validation completed", + } + write_json(run_dir / "logs" / "validation_dry_run.json", dry_run) + status = "blocked" if not dry_run["ok"] else "validated" + execution = None + if args.execute and dry_run["ok"]: + execution = execute_plan(run_dir, plan) + status = "completed" if execution.get("ok") else "failed" + summarize_backend_outputs(run_dir, samples) + write_shotgun_review_outputs(run_dir) + visuals = write_visuals(run_dir, status, validation, resource_plan) + resource_outputs = resource_plan.get("outputs", {}) if resource_plan else {} + write_standard_manifest( + run_dir, + run_id=args.run_id, + lane="shotgun_metagenomics", + workflow="backend_kraken2_bracken_humann", + status=status, + execute_requested=args.execute, + validation=validation, + tool_preflight_result=tool_status, + dry_run=dry_run, + execution=execution, + inputs={ + "sample_sheet": str(args.sample_sheet.expanduser().resolve()), + "kraken_db": str(args.kraken_db.expanduser().resolve()), + "bracken_db": str(args.bracken_db.expanduser().resolve()) if args.bracken_db else None, + "humann_db": str(args.humann_db.expanduser().resolve()) if args.humann_db else None, + "metadata": str(args.metadata.expanduser().resolve()) if args.metadata else None, + **( + {"resource_plan": resource_outputs.get("resource_plan")} if resource_outputs else {} + ), + }, + outputs={ + "sample_table": "validation/samples.normalized.tsv", + "command_plan": "workflow/shotgun_backend_command_plan.json", + "database_status": "qc/metagenomics_database_status.json", + "host_depletion": "host_depletion/" if args.host_reference else None, + "kraken_reports": "taxonomic_classification/*.kraken.report", + "bracken_tables": "taxonomic_classification/*.bracken.tsv", + "bracken_est_reads_matrix": "tables/bracken_est_reads_matrix.tsv", + "bracken_relative_abundance_matrix": 
"tables/bracken_relative_abundance_matrix.tsv", + "humann_pathabundance_matrix": "tables/humann_pathabundance_matrix.tsv", + "humann_genefamilies_matrix": "tables/humann_genefamilies_matrix.tsv", + "backend_summaries": ["tables/bracken_summary.json", "tables/humann_summary.json"], + "backend_review": "tables/metagenomics_backend_review.json", + "top_bracken_taxa": "tables/top_bracken_taxa.tsv", + "top_humann_pathways": "tables/top_humann_pathways.tsv", + "top_humann_gene_families": "tables/top_humann_gene_families.tsv", + "backend_dashboard": "visualizations/shotgun_backend_dashboard.html", + "top_taxa_plot": "visualizations/shotgun_top_taxa.svg", + "top_pathways_plot": "visualizations/shotgun_top_pathways.svg", + "top_gene_families_plot": "visualizations/shotgun_top_gene_families.svg", + **resource_outputs, + **visuals, + }, + method={ + "taxonomic_classifier": "Kraken2", + "host_depletion": "KneadData" if args.host_reference else None, + "host_reference": str(args.host_reference.expanduser().resolve()) + if args.host_reference + else None, + "abundance_estimator": "Bracken" if args.run_bracken else None, + "functional_profiler": "HUMAnN" if args.run_humann else None, + "resource_plan": resource_plan, + }, + audit={"resource_readiness": resource_plan} if resource_plan else None, + review_bundle=visuals, + ) + write_summary(run_dir, status, validation, resource_plan) + write_json(run_dir / "artifact_index.json", build_artifact_index(run_dir)) + print(run_dir) + return 1 if status in {"blocked", "failed"} else 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/plugins/ngs-analysis/skills/ngs-amplicon-microbiome/SKILL.md b/plugins/ngs-analysis/skills/ngs-amplicon-microbiome/SKILL.md new file mode 100644 index 0000000..950abfc --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-amplicon-microbiome/SKILL.md @@ -0,0 +1,99 @@ +--- +name: ngs-amplicon-microbiome +description: Kick off public 16S, 18S, ITS, COI, or other marker-gene amplicon 
microbiome workflows using nf-core/ampliseq, QIIME2, DADA2, and Cutadapt. +--- + +# Amplicon Microbiome + +Use this skill for marker-gene microbiome analysis from amplicon FASTQs. + +## Essential Inputs + +Confirm: + +- marker region: 16S, 18S, ITS, COI, or custom +- primer sequences and orientation +- paired-end or single-end reads +- whether reads should be merged +- taxonomy database and version +- sample metadata +- endpoint: ASV table, taxonomy, diversity, differential abundance, or plots + +## Public Defaults + +Prefer `nf-core/ampliseq` for reproducible end-to-end runs. Use QIIME2 or DADA2 directly when the user wants notebook-level control or an existing lab protocol requires it. + +## Preflight + +```bash +python plugins/ngs-analysis/scripts/ngs_preflight.py --pipeline amplicon_microbiome --emit-install-plan +``` + +## Local Execution Package + +For FASTQ intake/QC before primer, ASV, and taxonomy decisions, use: + +```bash +python plugins/ngs-analysis/scripts/run_fastq_assay_package.py \ + --lane amplicon_microbiome \ + --sample-sheet amplicon_samples.tsv \ + --execute +``` + +This validates read paths and structure, runs seqkit stats and FastQC/MultiQC when available, and writes `amplicon_analysis_status.json`. The runner now also emits `methods/amplicon_methods.json` plus a concrete backend handoff bundle under `workflow/` so primer, denoiser, truncation, normalization, and taxonomy choices are machine-readable even before a full backend is run. + +If the user asks for a full amplicon analysis rather than QC/readiness, do not treat FASTQs alone as sufficient. Require primer sequences, primer orientation, taxonomy database plus version, and sample metadata before presenting the run as analysis-ready. Without that context, run the local execution package and describe the result as a read-QC/readiness bundle only. 
+ +For backend ASV/taxonomy/diversity execution when primers, metadata, and taxonomy resources are available, use: + +```bash +python plugins/ngs-analysis/scripts/run_amplicon_microbiome.py \ + --sample-sheet amplicon_samples.tsv \ + --backend qiime2 \ + --primer-forward GTGYCAGCMGCCGCGGTAA \ + --primer-reverse GGACTACNVGGGTWTCTAAT \ + --taxonomy-classifier silva-138-classifier.qza \ + --metadata sample_metadata.tsv \ + --execute +``` + +Use `--backend dada2` for a direct R/Bioconductor ASV path. The plugin includes `workflows/amplicon_microbiome/run_dada2_backend.R`; the runner checks for `Rscript` and the `dada2` R package before execution, then writes normalized ASV, representative-sequence, read-retention, and optional taxonomy tables under `tables/`. + +For nf-core execution, use `plugins/ngs-analysis/scripts/run_nfcore_pipeline.py --pipeline ampliseq`. + +The direct backend runner also emits `resources/resource_plan.json`, `resource_manifest.tsv`, `resource_env.sh`, and `resource_readiness.md`. The resource check is advisory by default when a QIIME classifier is supplied directly; add `--bundle-root silva_138_amplicon=<path>`, `--include-optional-resources`, and `--require-resource-plan` when missing registered taxonomy databases should block readiness. + +The backend runner writes native normalized tables when QIIME2/DADA2/nf-core outputs are present: + +- `tables/asv_table.tsv` +- `tables/representative_sequences.fasta` for direct DADA2 runs +- `tables/taxonomy.tsv` +- `tables/read_retention.tsv` +- `tables/amplicon_backend_summary.json` +- `tables/alpha_diversity.tsv`, `tables/bray_curtis_distance.tsv`, and `tables/top_taxa_or_features.tsv` when a normalized ASV/feature table is available + +QIIME2 BIOM-only feature-table exports are recorded as requiring conversion, with a `biom convert` command in the backend summary. Do not claim diversity or taxonomy interpretation unless these normalized tables or equivalent supplied inputs exist. 
+ +## Kickoff Pattern + +nf-core preflight run: + +```bash +nextflow run nf-core/ampliseq \ + -profile test,docker \ + --outdir results/ampliseq_test +``` + +Before a real run, verify primer trimming and truncation choices from read-quality profiles. + +## Visualization Outputs + +The local FASTQ package always writes `visualizations/index.html` and `visualizations/visualization_manifest.json`. With only FASTQs, this is a read-QC/readiness bundle. If an ASV/feature table is available, pass it to the runner with `--asv-table` to generate alpha diversity, Bray-Curtis PCoA, and rarefaction artifacts. If a feature taxonomy table is available, pass `--taxonomy-table` to generate taxa barplots. When downstream tables are labeled synthetic or contain sample columns that are not present in the real sample sheet, the runner marks the run review-only and blocks beta-diversity/PCoA unless `--allow-synthetic-diversity` is set explicitly. + +The run also emits `qc_verdict.json` and, for amplicon runs, `qc_interpretation.json` with machine-readable reason codes, a readiness verdict, and follow-on command templates for generating ASV/taxonomy tables and re-rendering plugin-native plots. Backend runs additionally write `tables/amplicon_backend_summary.json` so exported ASV, taxonomy, read-retention, and BIOM-conversion status are auditable. When a normalized ASV/feature table is available, the backend runner also writes `tables/amplicon_diversity_summary.json`, `visualizations/amplicon_backend_dashboard.html`, and SVG plots for sample depth, Shannon diversity, and top taxa/features. If the ASV table is absent, these outputs remain explicitly unavailable rather than inferred from FASTQ QC. + +## Guardrails + +- Do not choose truncation lengths before looking at quality distributions. +- Do not mix taxonomy database versions without recording them. +- Preserve negative controls and extraction blanks in metadata. 
diff --git a/plugins/ngs-analysis/skills/ngs-amplicon-microbiome/agents/openai.yaml b/plugins/ngs-analysis/skills/ngs-amplicon-microbiome/agents/openai.yaml new file mode 100644 index 0000000..5bf249a --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-amplicon-microbiome/agents/openai.yaml @@ -0,0 +1,4 @@ +interface: + display_name: "Amplicon Microbiome" + short_description: "Plan 16S/ITS/amplicon microbiome analysis" + default_prompt: "Inspect my amplicon FASTQs and metadata, resolve primers and taxonomy database choices, and prepare a public pipeline preflight run." diff --git a/plugins/ngs-analysis/skills/ngs-analysis-router/SKILL.md b/plugins/ngs-analysis/skills/ngs-analysis-router/SKILL.md new file mode 100644 index 0000000..1b3cac1 --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-analysis-router/SKILL.md @@ -0,0 +1,97 @@ +--- +name: ngs-analysis-router +description: Route BCL, FASTQ, BAM/CRAM, count-matrix, or VCF sequencing requests to the right public NGS analysis skill and ask only the missing assay-specific setup questions. +--- + +# Life Sciences NGS Analysis Router + +Use this skill as the top-level entrypoint for ambiguous or broad sequencing-analysis requests. + +## Start Here + +Inspect the available inputs before asking the user questions. Look for: + +- Illumina run-folder files: `RunInfo.xml`, `RunParameters.xml`, `SampleSheet.csv`, `Data/Intensities/BaseCalls` +- FASTQs: `*.fastq`, `*.fq`, `*.fastq.gz`, `*.fq.gz` +- BAM/CRAM/VCF: `*.bam`, `*.cram`, `*.vcf`, `*.vcf.gz` +- count matrices: `matrix.mtx`, `features.tsv`, `barcodes.tsv`, `*.h5`, `*.h5ad`, `*.rds` +- metadata: sample sheets, design files, target BEDs, reference FASTA/GTF, primer files + +Read `references/intake-schema.json` and `references/pipeline-registry.json` when forming the route. + +## Intake Rules + +Ask the smallest set of missing questions needed to choose a defensible pipeline. Do not ask the full questionnaire if file inspection already answers a field. 
+ +Always resolve: + +- input type +- assay type +- desired output +- organism/reference +- paired-end vs single-end when FASTQs are involved +- any assay-specific design file or metadata required for the requested result +- runtime constraints: local/HPC/cloud, container availability, and whether installs are allowed + +For human data, ask whether cloud upload is allowed before suggesting BaseSpace, Terra, DNAnexus, or any cloud path. + +## Routing + +Route to one leaf skill: + +- BCL run folder or demultiplexing: `ngs-bcl-to-fastq` +- QC/trimming only: `ngs-fastq-qc` +- WGS/WES/panel variants: `ngs-dna-variant-calling`, then a subtype skill when the analysis model is clear +- germline WGS/WES/panel variants: `ngs-dna-germline-variants` +- tumor-normal or tumor-only somatic variants: `ngs-dna-somatic-variants` +- UMI, duplex, or low-frequency targeted panels: `ngs-dna-umi-panel-variants` +- bulk RNA-seq kickoff: `ngs-bulk-rnaseq` +- bulk RNA-seq FASTQ-to-count QC: `ngs-bulk-rnaseq-counts-qc` +- bulk RNA-seq differential expression from counts: `ngs-bulk-rnaseq-differential-expression` +- single-cell or single-nucleus FASTQ-to-matrix kickoff: `ngs-scrna-seq` +- single-cell or single-nucleus post-count QC/annotation/UMAP: `scrna-seq-qc` +- epigenomics kickoff: `ngs-epigenomics-peaks` +- ATAC-seq QC/peaks/accessibility: `ngs-atacseq-peaks-qc` +- ChIP-seq, CUT&RUN, or CUT&Tag QC/peaks: `ngs-chip-cutrun-peaks-qc` +- 16S/18S/ITS/COI amplicons: `ngs-amplicon-microbiome` +- shotgun metagenomics: `ngs-shotgun-metagenomics` +- runtime/package setup only: `ngs-runtime-env` + +Prefer public, runtime-installable packages and nf-core workflows. Surface license/EULA/account boundaries before using proprietary or cloud tools. 
+ +## Preflight + +Before proposing installation or execution, run a preflight plan from the repo root: + +```bash +python plugins/ngs-analysis/scripts/ngs_preflight.py --pipeline <pipeline_id> --emit-install-plan +``` + +When the user needs an approval-ready install handoff, write persistent install artifacts: + +```bash +python plugins/ngs-analysis/scripts/ngs_preflight.py --pipeline <pipeline_id> --manager micromamba --install-plan-outdir runtime_readiness/_install +``` + +Treat `install_plan.json` as the canonical review artifact. `install_commands.sh` is generated from the same plan and stays review-only unless the user explicitly approves execution with `NGS_RUN_INSTALL_COMMANDS=1`. + +For reference- or database-heavy pipelines, also create a resource plan before saying the workflow is runnable: + +```bash +python plugins/ngs-analysis/scripts/ngs_reference_manager.py plan --pipeline <pipeline_id> --genome-build <genome_build> --outdir resource_readiness/ +``` + +Use `--include-optional` for shotgun, amplicon, or motif-enabled epigenomics runs when optional databases materially affect the requested output. + +Use `--network-checks` only when the user allows network checks. Use `--install-missing --yes` only when the user explicitly asks to install. + +## Output Contract + +Return: + +1. the routed analysis type and confidence +2. missing essential parameters, if any +3. recommended public pipeline or package family +4. local tool preflight summary +5. preflight-first command or next concrete action +6.
caveats around licenses, cloud upload, database size, and reference data diff --git a/plugins/ngs-analysis/skills/ngs-analysis-router/agents/openai.yaml b/plugins/ngs-analysis/skills/ngs-analysis-router/agents/openai.yaml new file mode 100644 index 0000000..40d09b8 --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-analysis-router/agents/openai.yaml @@ -0,0 +1,4 @@ +interface: + display_name: "NGS Router" + short_description: "Route sequencing inputs to the right public analysis workflow" + default_prompt: "Inspect my sequencing inputs, ask only the missing essential questions, choose the right public pipeline, and produce a preflight-first analysis plan." diff --git a/plugins/ngs-analysis/skills/ngs-atacseq-peaks-qc/SKILL.md b/plugins/ngs-analysis/skills/ngs-atacseq-peaks-qc/SKILL.md new file mode 100644 index 0000000..4aac45a --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-atacseq-peaks-qc/SKILL.md @@ -0,0 +1,79 @@ +--- +name: ngs-atacseq-peaks-qc +description: Run or plan ATAC-seq QC, alignment, TSS enrichment, fragment-size, blacklist, peak-calling, consensus peak, and differential accessibility workflows. +--- + +# ATAC-seq Peaks QC + +Use this skill for ATAC-seq accessibility analysis from FASTQ or BAM. If the assay is ChIP-seq, CUT&RUN, CUT&Tag, or antibody-targeted enrichment, use `ngs-chip-cutrun-peaks-qc`. + +## Essential Inputs + +Confirm: + +- FASTQ/BAM inputs and paired-end status +- organism, genome build, blacklist, and mitochondrial contig names +- biological replicates, conditions, batches, and sample metadata +- whether the target is QC only, peaks, consensus peaks, bigWigs, or differential accessibility +- whether Tn5 shifting is handled by the chosen workflow +- desired peak caller and downstream matrix generation + +## Route + +Prefer `nf-core/atacseq` for full reproducible processing. Use direct MACS2 only when BAMs are already aligned, duplicate/blacklist handling is known, and the user wants focused peak calling. 
+ +Preflight command: + +```bash +python plugins/ngs-analysis/scripts/ngs_preflight.py --pipeline atacseq_peaks_qc --emit-install-plan +``` + +For compact read-level intake/QC, use the shared epigenomics execution package: + +```bash +python plugins/ngs-analysis/scripts/run_fastq_assay_package.py \ + --lane epigenomics_peaks \ + --sample-sheet atac_samples.csv \ + --execute +``` + +For local-light ATAC alignment, peaks, FRiP, TSS, bigWig tracks, and consensus peaks from FASTQ or prepared BAMs, use the dedicated ATAC runner: + +```bash +python plugins/ngs-analysis/scripts/run_atacseq_peaks_qc.py \ + --sample-sheet atac_samples.csv \ + --bowtie2-index /refs/GRCh38/bowtie2/genome \ + --genome-size hs \ + --blacklist-bed /refs/GRCh38/blacklists/encode_blacklist.bed \ + --tss-bed /refs/GRCh38/tss.bed \ + --execute +``` + +This runner emits `qc/atacseq_qc_summary.{tsv,json}`, `qc/atacseq_qc_dashboard.html`, native SVG FRiP/peak and insert-size plots, browser-track handoff files under `tracks/`, and TSS profile/heatmap commands when `--tss-bed` is supplied. Add `--run-motifs --motif-genome <genome>` when HOMER motif enrichment should be part of the backend run. + +It also emits `resources/resource_plan.json`, `resource_manifest.tsv`, `resource_env.sh`, and `resource_readiness.md`. The resource check is advisory by default for local-light runs; add `--genome-build`, `--bundle-root <genome_build>=<path>`, and `--require-resource-plan` when missing registered reference bundles should block readiness. + +For nf-core execution, use `plugins/ngs-analysis/scripts/run_nfcore_pipeline.py --pipeline atacseq`.
+ +## QC Gates + +Review before biological interpretation: + +- read depth, alignment rate, duplicate rate, and mitochondrial fraction +- insert-size periodicity/nucleosome pattern +- TSS enrichment and FRiP score when available +- blacklist overlap and peak count per sample +- replicate concordance and consensus peak support + +Do not proceed to differential accessibility if replicate quality or metadata is insufficient. + +## Outputs + +Produce: + +- sample sheet and workflow command/profile +- QC summary and failed-sample flags +- narrowPeak/BED peak sets, consensus peaks, bigWigs, browser-track manifests, browser-track preview HTML, native QC dashboard/SVG plots, TSS plots, and peak-count matrix when requested +- motif summary files when a motif backend is requested +- differential-accessibility design and contrasts if applicable +- caveats for low TSS enrichment, high mitochondrial reads, weak replicate concordance, or poor FRiP diff --git a/plugins/ngs-analysis/skills/ngs-atacseq-peaks-qc/agents/openai.yaml b/plugins/ngs-analysis/skills/ngs-atacseq-peaks-qc/agents/openai.yaml new file mode 100644 index 0000000..c6a854b --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-atacseq-peaks-qc/agents/openai.yaml @@ -0,0 +1,4 @@ +interface: + display_name: "ATAC-seq Peaks QC" + short_description: "Deep ATAC-seq QC, peak, and accessibility planning" + default_prompt: "Plan an ATAC-seq QC and peak-calling workflow, check replicate and TSS/FRiP requirements, and prepare a preflight-first command." 
diff --git a/plugins/ngs-analysis/skills/ngs-bcl-to-fastq/SKILL.md b/plugins/ngs-analysis/skills/ngs-bcl-to-fastq/SKILL.md new file mode 100644 index 0000000..6983d72 --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-bcl-to-fastq/SKILL.md @@ -0,0 +1,91 @@ +--- +name: ngs-bcl-to-fastq +description: Validate Illumina BCL run folders and sample sheets, plan demultiplexing, review index/UMI/lane choices, run BCL-to-FASTQ conversion, and interpret demux metrics while surfacing license/download boundaries. +--- + +# BCL To FASTQ + +Use this skill when the input is an Illumina BCL run folder or the user asks to demultiplex a sequencing run. This is a deep demultiplexing and run-validation skill, not only a command wrapper. + +## Essential Inputs + +Confirm: + +- run folder path with `RunInfo.xml` +- sample sheet path and format +- output directory +- instrument/run metadata from `RunInfo.xml` and `RunParameters.xml` +- lane handling: split by lane or combine lanes +- index mismatch tolerance +- index read structure and dual-index orientation +- UMI layout, if any +- whether adapter trimming/masking should happen during conversion +- whether undetermined reads and demultiplexing metrics should be reviewed before downstream analysis + +## Public Tool Boundary + +Prefer `bcl-convert` if it is already installed. It is free for local use but proprietary and RPM-distributed by Illumina, so do not auto-download without explicit user approval. + +Legacy `bcl2fastq` may exist in older environments. Use it only when BCL Convert is unavailable or the run requires legacy compatibility. 
+ +## Preflight + +```bash +python plugins/ngs-analysis/scripts/ngs_preflight.py --pipeline bcl_to_fastq --emit-install-plan +``` + +Also check run-folder structure: + +```bash +test -f /path/to/run/RunInfo.xml +test -f /path/to/SampleSheet.csv +find /path/to/run -maxdepth 4 -type d -name BaseCalls +``` + +## Local Execution Package + +Use the plugin-owned runner when the user provides a local run folder and sample sheet: + +```bash +python plugins/ngs-analysis/scripts/run_bcl_to_fastq.py \ + --run-folder /path/to/run \ + --sample-sheet /path/to/SampleSheet.csv \ + --output-directory /path/to/fastq_out +``` + +Add `--execute` only when conversion is requested. The runner validates `RunInfo.xml`, optional `RunParameters.xml`, the BaseCalls directory, sample-sheet rows, duplicate lane/index combinations, and index length compatibility. With `--execute`, it uses installed `bcl-convert`, then legacy `bcl2fastq` if available; if neither exists, it records the blocker instead of downloading proprietary software. + +## Validation Checklist + +Before conversion, validate: + +- `RunInfo.xml` exists and its read structure matches the expected sequencing design. +- `SampleSheet.csv` exists, is the intended version, and has no duplicate sample/index combinations within each lane. +- Index sequence lengths match the index reads and any trimming/masking requested by the sample sheet. +- Dual-index orientation is explicit for the instrument and library prep; do not infer i5 orientation from filenames. +- UMI bases are assigned to the intended read or index read and carried through to FASTQ headers or output metadata as needed. +- Lane-splitting, sample-name normalization, and output directory behavior are agreed before running. +- Disk space is sufficient for output FASTQs, reports, and temporary files. + +## Kickoff Pattern + +First produce a preflight plan with paths and sample sheet validation. 
Then run conversion only after the user confirms: + +```bash +bcl-convert \ + --bcl-input-directory /path/to/run \ + --output-directory /path/to/fastq_out \ + --sample-sheet /path/to/SampleSheet.csv +``` + +## Metrics Review + +After conversion, inspect and report: + +- total clusters, clusters passing filter, and yield by lane +- percent assigned by sample and percent undetermined by lane +- top undetermined index sequences when available +- per-sample FASTQ counts and read-pair consistency +- unexpected index hopping, barcode collision, or sample-sheet mismatch signals + +Record software version, command, sample sheet checksum, run-folder path, output path, and conversion metrics. Do not start downstream analysis until severe demultiplexing anomalies are surfaced. diff --git a/plugins/ngs-analysis/skills/ngs-bcl-to-fastq/agents/openai.yaml b/plugins/ngs-analysis/skills/ngs-bcl-to-fastq/agents/openai.yaml new file mode 100644 index 0000000..e9b9ccd --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-bcl-to-fastq/agents/openai.yaml @@ -0,0 +1,4 @@ +interface: + display_name: "BCL to FASTQ" + short_description: "Validate BCL run folders and plan FASTQ conversion" + default_prompt: "Inspect this Illumina run folder and sample sheet, validate demultiplexing parameters, check BCL conversion tools, and prepare a metrics-aware demux plan." diff --git a/plugins/ngs-analysis/skills/ngs-bulk-rnaseq-counts-qc/SKILL.md b/plugins/ngs-analysis/skills/ngs-bulk-rnaseq-counts-qc/SKILL.md new file mode 100644 index 0000000..da2c07e --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-bulk-rnaseq-counts-qc/SKILL.md @@ -0,0 +1,70 @@ +--- +name: ngs-bulk-rnaseq-counts-qc +description: Run or plan bulk RNA-seq FASTQ-to-count processing with sample-sheet, strandedness, genome annotation, alignment or pseudoalignment, MultiQC, and count-matrix QC checks. +--- + +# Bulk RNA-seq Counts QC + +Use this skill for bulk RNA-seq read processing, quantification, and count-matrix generation. 
If the user already has a count matrix and wants contrasts or statistics, use `ngs-bulk-rnaseq-differential-expression`. + +## Essential Inputs + +Confirm: + +- FASTQ or aligned-read inputs and paired-end/single-end status +- organism, genome build, FASTA, GTF, and gene ID convention +- strandedness or permission to infer strandedness +- sample sheet with biological condition, replicate, batch, and library metadata +- desired quantification: gene counts, transcript estimates, or both +- alignment strategy: `STAR/Salmon`, Salmon-only, featureCounts from BAMs, or existing lab protocol + +## Route + +Prefer `nf-core/rnaseq` for standard processing when a stable container or HPC runtime is available. Use the `local_light` Snakemake/Salmon path for small local/devbox feasibility runs when Docker, registry egress, or Nextflow process containers are the blocker. + +The plugin-owned local runner is: + +```bash +python plugins/ngs-analysis/scripts/run_bulk_rnaseq_counts_qc.py \ + --sample-sheet samplesheet.csv \ + --fastq-root path/to/fastqs \ + --transcriptome-fasta reference/transcriptome.fasta \ + --genome-fasta reference/genome.fa \ + --annotation-gtf reference/genes.gtf \ + --execute +``` + +Omit `--execute` for validation plus Snakemake workflow validation only. Use `--no-dry-run` only when the user wants input validation and run-envelope preparation without workflow graph validation. + +The runner emits a run-local `resources/` readiness bundle with `resource_plan.json`, `resource_manifest.tsv`, `resource_env.sh`, and `resource_readiness.md`. Resource checks are advisory by default for custom or reduced references; add `--genome-build`, `--bundle-root =`, and `--require-resource-plan` when a registered genome bundle must be complete before the run is considered ready. 
+ +Preflight command: + +```bash +python plugins/ngs-analysis/scripts/ngs_preflight.py --pipeline bulk_rnaseq_counts_qc --emit-install-plan +python plugins/ngs-analysis/scripts/ngs_preflight.py --profile local_light --emit-install-plan +``` + +## Decision Points + +- If strandedness is unknown, infer it before final counting; do not lock in a design based on library guesses. +- If strandedness is provided, carry it into the quantification command and flag any disagreement between the configured library type and Salmon's inferred format. +- Keep genome FASTA, GTF, transcriptome, and aligner indexes from the same build/release. +- Inspect per-sample reads, mapping rate, rRNA/mitochondrial fraction when available, duplication, insert size, gene-body bias, and assignment rate. +- Preserve raw counts separately from normalized expression. +- Carry sample metadata forward exactly; downstream DE depends on this table. + +## Outputs + +Produce: + +- sample sheet and command/profile +- reference manifest with genome and GTF release +- MultiQC or equivalent processing summary +- Salmon `quant.sf` outputs, TPM/NumReads/effective-length matrices, and carried-forward sample metadata +- Gene-level expected-count and TPM matrices derived from transcript-level Salmon outputs, plus a `tx2gene` provenance table +- Compact QC verdict JSON covering mapping rate, duplication, library-type agreement, and outlier samples +- Browser-safe MultiQC helper HTML pages and a localhost launch hint for reliable in-app review +- Run-local reference readiness artifacts under `resources/`, including the resource plan, manifest, environment exports, and Markdown readiness summary +- issues that block differential expression, such as missing replicates, mislabeled groups, or severe batch/library failures +- standard run envelope: `run_manifest.json`, `config.json`, `validation/`, `logs/`, `versions/`, `artifact_index.json`, and `summary.md` diff --git 
a/plugins/ngs-analysis/skills/ngs-bulk-rnaseq-counts-qc/agents/openai.yaml b/plugins/ngs-analysis/skills/ngs-bulk-rnaseq-counts-qc/agents/openai.yaml new file mode 100644 index 0000000..82943b5 --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-bulk-rnaseq-counts-qc/agents/openai.yaml @@ -0,0 +1,4 @@ +interface: + display_name: "Bulk RNA-seq Counts" + short_description: "Deep FASTQ-to-count processing and QC for bulk RNA-seq" + default_prompt: "Plan a bulk RNA-seq count-generation workflow, validate strandedness and references, check public tools, and prepare a preflight-first command." diff --git a/plugins/ngs-analysis/skills/ngs-bulk-rnaseq-differential-expression/SKILL.md b/plugins/ngs-analysis/skills/ngs-bulk-rnaseq-differential-expression/SKILL.md new file mode 100644 index 0000000..a2f24e4 --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-bulk-rnaseq-differential-expression/SKILL.md @@ -0,0 +1,74 @@ +--- +name: ngs-bulk-rnaseq-differential-expression +description: Run or plan bulk RNA-seq differential-expression analysis from count matrices with replicate, design formula, contrast, batch, normalization, QC plot, and result-table checks. +--- + +# Bulk RNA-seq Differential Expression + +Use this skill when the user has raw counts or a count-generation output and wants differential expression, contrasts, QC plots, or ranked gene tables. 
+ +## Essential Inputs + +Confirm: + +- raw count matrix path and sample metadata path +- gene ID type and annotation mapping requirement +- biological conditions, replicates, batch variables, donor pairing, covariates, and exclusions +- exact contrasts and baseline levels +- preferred statistical framework: DESeq2, edgeR, limma-voom, or existing lab standard +- output needs: normalized counts, PCA, sample distance, volcano plots, heatmaps, ranked tables, GSEA-ready lists + +## Preconditions + +Do not start differential expression until: + +- raw counts are preserved +- each requested contrast has enough biological replication +- sample metadata row names match count matrix columns +- batch/covariate choices are explicit +- exploratory PCA/sample-distance plots do not reveal obvious swaps or failed libraries + +## Route + +For most count matrices, use DESeq2 or edgeR. Use limma-voom when the study design or lab standard favors it. Keep the analysis in R when using Bioconductor unless the user specifically asks for a Python-only workflow. + +The plugin-owned local runner is: + +```bash +python plugins/ngs-analysis/scripts/run_bulk_rnaseq_de.py \ + --count-matrix count_matrix.tsv \ + --sample-metadata sample_metadata.tsv \ + --contrasts contrasts.tsv \ + --execute +``` + +Use `--method auto` unless the user or lab standard specifies `DESeq2`, `edgeR`, or `limma_log2`. Auto mode uses DESeq2 when integer-like counts and the package are available, falls back to edgeR for integer-like counts, and uses `limma_log2` for non-integer expression matrices. + +Use `--input-mode` to declare whether the matrix is `raw_counts`, `normalized_expression`, or `log_expression`. When `--input-mode auto` is used, the runner infers the mode and records a warning if normalization is skipped because the matrix is already transformed. 
+ +Preflight command: + +```bash +python plugins/ngs-analysis/scripts/ngs_preflight.py --pipeline bulk_rnaseq_differential_expression --emit-install-plan +``` + +## Decision Points + +- Never compare groups without stating the design formula and contrast. +- Treat batch correction in modeling separately from visual batch removal. +- Do not filter genes using post-hoc knowledge of the contrast. +- For paired or repeated-measures designs, model subject/donor explicitly. +- Report genes with effect size, uncertainty, adjusted p-value, and filtering status. + +## Outputs + +Produce: + +- design formula and contrast manifest +- QC plots: library size, detected genes, PCA/sample distance, mean-variance trend, and outlier review +- input-mode-aware matrix exports plus the modeling/log-scale matrix used for DE +- differential-expression tables per contrast +- explicit `.not_tested.tsv` stubs for contrasts blocked by insufficient replication or confounding +- auto-launched localhost Marimo review app recorded in `notebooks/marimo_server.json` +- caveats for small n, confounded designs, failed samples, or batch variables that cannot be estimated +- standard run envelope: `run_manifest.json`, `config.json`, `validation/`, `logs/`, `versions/`, `visualizations/`, `notebooks/`, `artifact_index.json`, and `summary.md` diff --git a/plugins/ngs-analysis/skills/ngs-bulk-rnaseq-differential-expression/agents/openai.yaml b/plugins/ngs-analysis/skills/ngs-bulk-rnaseq-differential-expression/agents/openai.yaml new file mode 100644 index 0000000..d7c2bfb --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-bulk-rnaseq-differential-expression/agents/openai.yaml @@ -0,0 +1,4 @@ +interface: + display_name: "Bulk RNA-seq DE" + short_description: "Deep differential-expression design and result planning" + default_prompt: "Plan bulk RNA-seq differential expression from counts, verify design and contrasts, and produce an auditable analysis plan." 
diff --git a/plugins/ngs-analysis/skills/ngs-bulk-rnaseq/SKILL.md b/plugins/ngs-analysis/skills/ngs-bulk-rnaseq/SKILL.md new file mode 100644 index 0000000..4ab07c2 --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-bulk-rnaseq/SKILL.md @@ -0,0 +1,102 @@ +--- +name: ngs-bulk-rnaseq +description: Dispatch bulk RNA-seq requests to FASTQ-to-count QC or count-matrix differential-expression skills using nf-core/rnaseq, STAR, Salmon, featureCounts, MultiQC, and R/Bioconductor workflows. +--- + +# Bulk RNA-seq + +Use this skill as the bulk RNA-seq dispatcher. Route FASTQ/BAM processing to count-generation QC, and route count-matrix statistical analysis to differential-expression guidance. + +## Essential Inputs + +Confirm: + +- organism and genome build +- FASTA and GTF, or supported nf-core genome key +- paired-end or single-end reads +- strandedness, or whether to infer strandedness +- sample sheet and metadata +- counts-only vs differential expression +- contrasts, covariates, and batch terms for differential expression + +## Dispatch + +- FASTQ or aligned reads to raw counts, transcript estimates, or MultiQC summaries: `ngs-bulk-rnaseq-counts-qc` +- Raw count matrix plus sample metadata to contrasts, plots, and DE result tables: `ngs-bulk-rnaseq-differential-expression` + +If the user asks for both, run count-generation planning first and start differential expression only after the raw count matrix, sample metadata, replicates, design formula, and contrasts are confirmed. + +## Public Default + +Prefer `nf-core/rnaseq` for standardized processing when a stable container or HPC runtime is available. Use the `local_light` Snakemake/Salmon path when Docker, registry egress, or Nextflow process containers are unavailable and a compact local run is appropriate. 
+ +## Plugin-Owned Local Paths + +Use the counts/QC runner for local FASTQ-to-matrix execution: + +```bash +python plugins/ngs-analysis/scripts/run_bulk_rnaseq_counts_qc.py \ + --sample-sheet samplesheet.csv \ + --fastq-root path/to/fastqs \ + --transcriptome-fasta reference/transcriptome.fasta \ + --genome-fasta reference/genome.fa \ + --annotation-gtf reference/genes.gtf \ + --execute +``` + +Use the differential-expression runner when the user already has a count or expression matrix: + +```bash +python plugins/ngs-analysis/scripts/run_bulk_rnaseq_de.py \ + --count-matrix count_matrix.tsv \ + --sample-metadata sample_metadata.tsv \ + --contrasts contrasts.tsv \ + --execute +``` + +## Preflight + +```bash +python plugins/ngs-analysis/scripts/ngs_preflight.py --pipeline bulk_rnaseq --emit-install-plan +python plugins/ngs-analysis/scripts/ngs_preflight.py --pipeline bulk_rnaseq_counts_qc --emit-install-plan +python plugins/ngs-analysis/scripts/ngs_preflight.py --pipeline bulk_rnaseq_differential_expression --emit-install-plan +python plugins/ngs-analysis/scripts/ngs_preflight.py --profile local_light --emit-install-plan +``` + +## Kickoff Pattern + +Preflight run: + +```bash +nextflow run nf-core/rnaseq \ + -profile test,docker \ + --outdir results/rnaseq_test +``` + +Real run skeleton: + +```bash +nextflow run nf-core/rnaseq \ + -profile docker \ + --input samplesheet.csv \ + --outdir results/rnaseq \ + --genome GRCh38 \ + --aligner star_salmon +``` + +If strandedness is unknown, run inference or use the pipeline's strandedness detection before committing to final counts. + +Local execution run: + +```bash +python plugins/ngs-analysis/scripts/run_bulk_rnaseq_counts_qc.py \ + --sample-sheet samplesheet.csv \ + --fastq-root path/to/fastqs \ + --transcriptome-fasta reference/transcriptome.fasta +``` + +The local runners create a standard run envelope with `run_manifest.json`, `config.json`, `validation/`, `logs/`, `versions/`, `artifact_index.json`, and `summary.md`. 
Do not depend on development-only eval harness paths in a shared package. + +## Downstream + +Only start DESeq2/edgeR/limma analysis after confirming biological replicates, design formula, and contrasts. Preserve the raw count matrix and sample metadata. diff --git a/plugins/ngs-analysis/skills/ngs-bulk-rnaseq/agents/openai.yaml b/plugins/ngs-analysis/skills/ngs-bulk-rnaseq/agents/openai.yaml new file mode 100644 index 0000000..a439187 --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-bulk-rnaseq/agents/openai.yaml @@ -0,0 +1,4 @@ +interface: + display_name: "Bulk RNA-seq" + short_description: "Plan bulk RNA-seq counts and DE analysis" + default_prompt: "Inspect my bulk RNA-seq inputs, route count generation versus differential expression, resolve required metadata, check public tools, and prepare a preflight-first plan." diff --git a/plugins/ngs-analysis/skills/ngs-chip-cutrun-peaks-qc/SKILL.md b/plugins/ngs-analysis/skills/ngs-chip-cutrun-peaks-qc/SKILL.md new file mode 100644 index 0000000..54ab4ed --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-chip-cutrun-peaks-qc/SKILL.md @@ -0,0 +1,80 @@ +--- +name: ngs-chip-cutrun-peaks-qc +description: Run or plan ChIP-seq, CUT&RUN, or CUT&Tag QC, control handling, spike-in, peak calling, broad-vs-narrow target selection, replicate, bigWig, and differential binding workflows. +--- + +# ChIP/CUT&RUN Peaks QC + +Use this skill for antibody-targeted enrichment workflows: ChIP-seq, CUT&RUN, or CUT&Tag. Use `ngs-atacseq-peaks-qc` for ATAC-seq. 
+ +## Essential Inputs + +Confirm: + +- assay: ChIP-seq, CUT&RUN, or CUT&Tag +- target class: transcription factor, histone mark, chromatin regulator, or custom target +- FASTQ/BAM inputs and paired-end status +- input DNA, IgG, no-antibody, or spike-in controls +- organism, genome build, blacklist, and spike-in genome if used +- biological replicates, conditions, batches, and sample metadata +- desired endpoint: QC, peaks, bigWigs, consensus peaks, or differential binding + +## Route + +Use `nf-core/chipseq` for ChIP-seq and `nf-core/cutandrun` for CUT&RUN/CUT&Tag when they fit the assay. Use direct MACS2 only for prepared BAMs with known control and duplicate policy. + +Preflight command: + +```bash +python plugins/ngs-analysis/scripts/ngs_preflight.py --pipeline chip_cutrun_peaks_qc --emit-install-plan +``` + +For compact FASTQ intake/QC, use the shared epigenomics execution package: + +```bash +python plugins/ngs-analysis/scripts/run_fastq_assay_package.py \ + --lane epigenomics_peaks \ + --sample-sheet chip_or_cutrun_samples.csv \ + --execute +``` + +It records FASTQ-level QC and peak-calling readiness. + +For local-light alignment, control-aware MACS2 peak calling, FRiP, bigWig tracks, consensus peaks, and motif-handoff artifacts, use the dedicated ChIP/CUT&RUN runner: + +```bash +python plugins/ngs-analysis/scripts/run_chip_cutrun_peaks_qc.py \ + --sample-sheet chip_or_cutrun_samples.csv \ + --assay chipseq \ + --target-class tf \ + --peak-mode narrow \ + --bowtie2-index /refs/GRCh38/bowtie2/genome \ + --genome-size hs \ + --blacklist-bed /refs/GRCh38/blacklists/encode_blacklist.bed \ + --execute +``` + +This runner emits `qc/chip_cutrun_qc_summary.{tsv,json}`, `qc/chip_cutrun_qc_dashboard.html`, native SVG FRiP/peak and insert-size plots, browser-track handoff files under `tracks/`, and `motifs/motif_summary.tsv`. Add `--run-motifs --motif-genome ` when HOMER motif enrichment should be executed instead of only planned. 
+ +It also emits `resources/resource_plan.json`, `resource_manifest.tsv`, `resource_env.sh`, and `resource_readiness.md`. The resource check is advisory by default for local-light runs; add `--genome-build`, `--bundle-root <genome_build>=<path>`, and `--require-resource-plan` when missing registered reference bundles should block readiness. + +For nf-core execution, use `plugins/ngs-analysis/scripts/run_nfcore_pipeline.py --pipeline chipseq` or `--pipeline cutandrun`. + +## Decision Points + +- Choose narrow versus broad peak mode from target biology, not from convenience. +- Preserve control pairing and spike-in metadata through sample sheets. +- For histone marks, expect broad or domain-like signal for many marks; for TFs, expect sharper peaks and stronger replicate checks. +- Review alignment rate, duplicate rate, fragment size, FRiP/peak signal, blacklist overlap, and replicate concordance. +- Keep consensus peak generation and differential binding design separate from raw peak calling. + +## Outputs + +Produce: + +- assay/target/control manifest +- command/profile and sample sheet +- QC summary with replicate/control status +- peaks, bigWigs, browser-track manifests, browser-track preview HTML, native QC dashboard/SVG plots, consensus peaks, and count matrix when requested +- motif summary files when a motif backend is requested +- differential binding design and caveats for missing controls, weak enrichment, or poor replicate concordance diff --git a/plugins/ngs-analysis/skills/ngs-chip-cutrun-peaks-qc/agents/openai.yaml b/plugins/ngs-analysis/skills/ngs-chip-cutrun-peaks-qc/agents/openai.yaml new file mode 100644 index 0000000..173f19e --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-chip-cutrun-peaks-qc/agents/openai.yaml @@ -0,0 +1,4 @@ +interface: + display_name: "ChIP/CUT&RUN Peaks QC" + short_description: "Deep ChIP-seq, CUT&RUN, and CUT&Tag peak workflow planning" + default_prompt: "Plan a ChIP-seq or CUT&RUN/CUT&Tag workflow, validate controls and target type, check
public tools, and prepare a preflight-first command." diff --git a/plugins/ngs-analysis/skills/ngs-dna-germline-variants/SKILL.md b/plugins/ngs-analysis/skills/ngs-dna-germline-variants/SKILL.md new file mode 100644 index 0000000..c2b38b6 --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-dna-germline-variants/SKILL.md @@ -0,0 +1,79 @@ +--- +name: ngs-dna-germline-variants +description: Run or plan deep germline WGS, WES, targeted-panel, cohort, or trio variant-calling workflows with reference-build, known-sites, QC, joint-calling, and annotation checks. +--- + +# Germline DNA Variants + +Use this skill for germline WGS, WES, or inherited-disease panel analysis from FASTQ, BAM, or CRAM. If the request is tumor-only, tumor-normal, or low-frequency molecular-barcode panel calling, use a somatic or UMI-panel skill instead. + +## Essential Inputs + +Confirm: + +- data type: WGS, WES, or targeted panel +- sample model: singleton, cohort, duo, trio, family, or case/control +- input type: FASTQ, BAM, or CRAM +- organism, reference build, FASTA, indexes, and contig naming +- known-sites resources for BQSR, contamination, and annotation +- target BED and bait BED for WES/panel data +- sex/ploidy assumptions and mitochondrial/sex-chromosome requirements +- desired callers, annotation outputs, and final VCF/gVCF expectations + +## Route + +Prefer `nf-core/sarek` for full FASTQ/BAM-to-VCF workflows. Use direct GATK4, DeepVariant, samtools, or bcftools only for focused tasks or a custom workflow. 
+ +Preflight command: + +```bash +python plugins/ngs-analysis/scripts/ngs_preflight.py --pipeline dna_germline_variants --emit-install-plan +``` + +For compact local checks from prepared BAM/CRAM files, use the shared DNA execution package: + +```bash +python plugins/ngs-analysis/scripts/run_dna_variant_calling.py \ + --sample-sheet dna_samples.tsv \ + --reference-fasta reference.fa \ + --execute +``` + +Treat this as a focused samtools/bcftools run envelope, not as a substitute for full cohort, trio, gVCF, BQSR, or annotation workflows. + +For a higher-fidelity local germline run that owns BQSR, per-sample gVCFs, and joint genotyping assumptions, use the germline-specific runner: + +```bash +python plugins/ngs-analysis/scripts/run_dna_germline_variants.py \ + --sample-sheet dna_samples.tsv \ + --reference-fasta reference.fa \ + --known-sites dbsnp.vcf.gz \ + --known-sites mills.vcf.gz \ + --emit-gvcf \ + --joint-call \ + --execute +``` + +This runner still expects reference-matched resources and an available GATK toolchain. It packages the validation state and generated artifacts even when execution is blocked by missing tools or resources. + +It also writes advisory `resources/resource_plan.json`, `resource_manifest.tsv`, `resource_env.sh`, and `resource_readiness.md` artifacts by default. Add `--genome-build`, `--bundle-root bundle=/path`, and `--require-resource-plan` when complete registered reference and known-sites bundles should be mandatory for readiness. + +## Decision Points + +- For cohorts or families, decide whether the endpoint is per-sample VCFs, gVCFs for joint genotyping, or a jointly called cohort VCF. +- For WES/panels, carry the target BED through alignment metrics, calling, and coverage reports; do not call off-target regions by accident. +- Use BQSR only when reference-matched known-sites resources exist. Do not mix GRCh37, hg19, GRCh38, or T2T resources. 
+- Check sample identity, sex concordance, contamination, coverage, duplication, insert size, and transition/transversion where feasible. +- For trios, preserve pedigree metadata and report Mendelian/QC checks separately from variant interpretation. + +## Outputs + +Produce: + +- command or workflow profile and sample sheet +- reference/resource manifest with versions and checksums when available +- QC summary: coverage, duplication, insert size, contamination, sex/relatedness checks when run +- VCF/gVCF path, index path, and annotation path +- limitations: low coverage, missing known-sites, target design gaps, or build mismatches + +Clinical interpretation, pathogenicity classification, and report signing are out of scope unless the user provides a validated clinical workflow. diff --git a/plugins/ngs-analysis/skills/ngs-dna-germline-variants/agents/openai.yaml b/plugins/ngs-analysis/skills/ngs-dna-germline-variants/agents/openai.yaml new file mode 100644 index 0000000..c73e7f9 --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-dna-germline-variants/agents/openai.yaml @@ -0,0 +1,4 @@ +interface: + display_name: "Germline DNA Variants" + short_description: "Deep germline WGS/WES/panel variant workflow planning" + default_prompt: "Plan a germline DNA variant-calling workflow, validate reference/resources and sample model, check public tools, and prepare a preflight-first command." diff --git a/plugins/ngs-analysis/skills/ngs-dna-somatic-variants/SKILL.md b/plugins/ngs-analysis/skills/ngs-dna-somatic-variants/SKILL.md new file mode 100644 index 0000000..17bb917 --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-dna-somatic-variants/SKILL.md @@ -0,0 +1,67 @@ +--- +name: ngs-dna-somatic-variants +description: Run or plan tumor-normal, tumor-only, WGS, WES, or cancer-panel somatic variant workflows with pairing, contamination, panel-of-normals, purity, QC, and annotation checks. 
+--- + +# Somatic DNA Variants + +Use this skill for tumor-normal or tumor-only somatic SNV/indel calling from FASTQ, BAM, or CRAM. If the request is inherited germline calling or family analysis, use `ngs-dna-germline-variants`. + +## Essential Inputs + +Confirm: + +- tumor-normal, tumor-only, relapse-baseline, or multi-tumor design +- WGS, WES, or panel assay and target BED when applicable +- input type and whether reads are already aligned +- tumor/normal pairing table and sample identifiers +- reference build, known-sites, germline resource, and annotation cache +- panel-of-normals availability and matched-normal availability +- tumor purity, contamination expectations, and minimum allele fraction goals +- desired outputs: raw calls, filtered calls, VEP/SnpEff annotation, MAF, CNV/SV handoff + +## Route + +Prefer `nf-core/sarek` for an end-to-end public workflow when its supported callers fit the request. Use direct GATK Mutect2 or bcftools/samtools utilities for focused validation or prepared BAMs. + +Preflight command: + +```bash +python plugins/ngs-analysis/scripts/ngs_preflight.py --pipeline dna_somatic_variants --emit-install-plan +``` + +For compact local checks from prepared tumor/normal BAM/CRAM files, use the dedicated Mutect2 runner: + +```bash +python plugins/ngs-analysis/scripts/run_dna_somatic_variants.py \ + --sample-sheet somatic_pairs.tsv \ + --reference-fasta reference.fa \ + --germline-resource af-only-gnomad.vcf.gz \ + --panel-of-normals pon.vcf.gz \ + --execute +``` + +This produces a tumor-normal/tumor-only pairing table, Mutect2 command plan, contamination/filtering artifacts, somatic QC summary, `qc/somatic_pair_review.{tsv,json}`, visualization index, and filtered VCF outputs when the local GATK resources are available. For nf-core execution, use `plugins/ngs-analysis/scripts/run_nfcore_pipeline.py --pipeline sarek`. 
+ +The direct runner also emits `resources/resource_plan.json`, `resource_manifest.tsv`, `resource_env.sh`, and `resource_readiness.md`. The resource check is advisory by default so custom or reduced references can still be planned; add `--genome-build`, `--bundle-root bundle=/path`, and `--require-resource-plan` when missing registered reference bundles should block readiness. + +## Decision Points + +- Verify tumor-normal pair metadata before execution. A swapped or missing normal changes the biological meaning of the calls. +- For tumor-only analysis, explicitly state the false-positive risk and require a germline resource plus careful filtering. +- Use panel-of-normals when available and reference-matched; do not reuse a PON across incompatible capture kits or genome builds. +- Track contamination, orientation bias, strand artifacts, mapping quality, coverage, tumor purity, and allele-fraction filters. +- Keep germline filtering separate from somatic interpretation; avoid presenting tumor-only calls as confirmed somatic without supporting evidence. + +## Outputs + +Produce: + +- validated pairing/sample sheet +- caller/filter settings and reference/resource manifest +- QC summary: tumor/normal depth, contamination, duplication, insert size, on-target rate for panels/WES +- per-pair review table covering matched-normal state, PON/germline-resource availability, contamination-table status, filtered VCF status, and parsed variant counts +- VCF/MAF/annotation paths and a filtered-vs-raw call count summary +- caveats for tumor-only calls, low-purity tumors, low-depth regions, or missing matched normals + +Clinical actionability and treatment recommendations are out of scope unless the user supplies a validated clinical interpretation workflow. 
diff --git a/plugins/ngs-analysis/skills/ngs-dna-somatic-variants/agents/openai.yaml b/plugins/ngs-analysis/skills/ngs-dna-somatic-variants/agents/openai.yaml new file mode 100644 index 0000000..c33c8af --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-dna-somatic-variants/agents/openai.yaml @@ -0,0 +1,4 @@ +interface: + display_name: "Somatic DNA Variants" + short_description: "Deep tumor-normal or tumor-only somatic variant workflow planning" + default_prompt: "Plan a somatic variant-calling workflow, verify tumor-normal pairing and resources, check public tools, and prepare a preflight-first command." diff --git a/plugins/ngs-analysis/skills/ngs-dna-umi-panel-variants/SKILL.md b/plugins/ngs-analysis/skills/ngs-dna-umi-panel-variants/SKILL.md new file mode 100644 index 0000000..3586245 --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-dna-umi-panel-variants/SKILL.md @@ -0,0 +1,66 @@ +--- +name: ngs-dna-umi-panel-variants +description: Run or plan targeted DNA panel variant workflows that use UMIs, duplex consensus reads, molecular barcodes, low-frequency calling, target coverage, and panel-specific QC. +--- + +# UMI Panel DNA Variants + +Use this skill for targeted DNA panels where molecular barcodes, UMIs, duplex consensus, or low-frequency allele detection are central to the analysis. If the panel is ordinary germline calling without molecular consensus, use `ngs-dna-germline-variants`. + +## Essential Inputs + +Confirm: + +- panel/capture kit name and target BED +- UMI layout: inline read, index read, single UMI, duplex UMI, or unknown +- whether consensus reads have already been generated +- FASTQ/BAM input and pairing convention +- reference build and panel-specific annotation requirements +- minimum allele fraction goal and intended use: screening, research, validation, or exploratory +- positive/negative controls and expected spike-ins when available + +## Route + +Use a lab-validated panel workflow when provided. 
For public-tool planning, combine FASTQ QC, UMI extraction/consensus generation, alignment, target coverage QC, and variant calling as separate audited stages. + +Preflight command: + +```bash +python plugins/ngs-analysis/scripts/ngs_preflight.py --pipeline dna_umi_panel_variants --emit-install-plan +``` + +For compact local checks from prepared consensus or alignment BAM/CRAM files, use the dedicated UMI panel runner: + +```bash +python plugins/ngs-analysis/scripts/run_dna_umi_panel_variants.py \ + --sample-sheet umi_panel_samples.tsv \ + --reference-fasta reference.fa \ + --target-bed panel_targets.bed \ + --umi-mode duplex \ + --umi-tag RX \ + --execute +``` + +This writes the consensus/variant command plan, molecular-consensus state, low-frequency calling settings, visualization index, `qc/umi_postrun_summary.{tsv,json}`, `qc/umi_molecular_evidence_contract.{tsv,json}`, and consensus-BAM VCF outputs when the local fgbio/samtools/bcftools backend is available. The post-run summary parses consensus flagstat, target coverage, bcftools stats, and family-size/duplex files when present; missing metrics stay explicit in the notes column. The molecular evidence contract keeps the low-AF review requirements visible per sample: consensus BAM, family-size or molecule-support metrics, variant stats, hotspot review, and duplex review. + +The direct runner also emits `resources/resource_plan.json`, `resource_manifest.tsv`, `resource_env.sh`, and `resource_readiness.md`. The resource check is advisory by default so custom or reduced references can still be planned; add `--genome-build`, `--bundle-root bundle=/path`, and `--require-resource-plan` when missing registered reference bundles should block readiness. + +## Decision Points + +- Do not trim or discard UMI bases until their layout and destination are known. +- Separate raw read depth from unique molecular depth and consensus depth. 
+- Track on-target rate, coverage uniformity, family size distribution, strand/duplex support, and per-target dropout. +- Low allele fraction calls require stronger artifact review than ordinary germline calls. +- Use panel-specific hotspot/blacklist rules only when their provenance is known. + +## Outputs + +Produce: + +- UMI layout and consensus strategy +- target BED/resource manifest +- raw-depth, molecular-depth, and consensus-depth QC summary +- `qc/umi_postrun_summary.tsv` for consensus reads, target coverage, variant counts, family size, and duplex fraction +- `qc/umi_molecular_evidence_contract.tsv` for low-AF evidence readiness, hotspot review, and duplex review expectations +- variant calls with allele fraction, depth, strand/duplex support, and filtering rationale +- limitations around sensitivity, panel dropout, molecule count, and non-validated interpretation diff --git a/plugins/ngs-analysis/skills/ngs-dna-umi-panel-variants/agents/openai.yaml b/plugins/ngs-analysis/skills/ngs-dna-umi-panel-variants/agents/openai.yaml new file mode 100644 index 0000000..bfa3535 --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-dna-umi-panel-variants/agents/openai.yaml @@ -0,0 +1,4 @@ +interface: + display_name: "UMI Panel Variants" + short_description: "Deep targeted panel and UMI-aware variant workflow planning" + default_prompt: "Plan a UMI-aware targeted DNA panel workflow, preserve barcode handling, check public tools, and prepare an auditable preflight-first plan." 
diff --git a/plugins/ngs-analysis/skills/ngs-dna-variant-calling/SKILL.md b/plugins/ngs-analysis/skills/ngs-dna-variant-calling/SKILL.md new file mode 100644 index 0000000..3defe44 --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-dna-variant-calling/SKILL.md @@ -0,0 +1,88 @@ +--- +name: ngs-dna-variant-calling +description: Dispatch WGS, WES, or targeted DNA variant requests to germline, somatic, or UMI-panel skills, then plan public nf-core/sarek, GATK4, DeepVariant, samtools, or bcftools workflows. +--- + +# DNA Variant Calling + +Use this skill as the DNA variant-calling dispatcher for WGS, WES, or targeted DNA panel analysis from FASTQ, BAM, or CRAM. Once the sample model is clear, hand off to the narrow subtype skill. + +## Essential Inputs + +Confirm: + +- data type: WGS, WES, or panel +- sample model: germline single sample, cohort, trio, tumor-only, or tumor-normal +- input type: FASTQ, BAM, or CRAM +- organism and reference genome +- known-sites resources for BQSR, if required +- target BED for WES or panels +- UMI or duplex handling +- desired callers and annotation outputs + +## Dispatch + +Route by biological/sample model: + +- Germline singleton, cohort, family, trio, WGS, WES, or ordinary inherited panel: `ngs-dna-germline-variants` +- Tumor-normal, tumor-only, relapse-baseline, or other cancer somatic calling: `ngs-dna-somatic-variants` +- UMI, duplex, molecular-barcode, or low-frequency targeted panel calling: `ngs-dna-umi-panel-variants` + +If the request is ambiguous, ask only for the missing sample model and assay design needed to choose among these three. Do not run one generic variant workflow when the request needs subtype-specific assumptions. + +## Public Default + +Prefer `nf-core/sarek` for an end-to-end public workflow. Use direct GATK4, DeepVariant, samtools, or bcftools commands only for smaller, focused tasks or when the user explicitly wants a custom pipeline. 
+ +## Preflight + +```bash +python plugins/ngs-analysis/scripts/ngs_preflight.py --pipeline dna_variant_calling --emit-install-plan +``` + +## Local Execution Package + +For a compact BAM/CRAM-to-VCF run with a matching reference FASTA, use the plugin-owned samtools/bcftools runner: + +```bash +python plugins/ngs-analysis/scripts/run_dna_variant_calling.py \ + --sample-sheet dna_samples.tsv \ + --reference-fasta reference.fa \ + --region chr20:1-100000 \ + --filter-min-qual 30 \ + --filter-min-site-dp 10 \ + --execute +``` + +The sample sheet should include `sample` and `bam` or `cram` columns. When `--region` is provided the runner also emits per-base depth plus a callable-loci summary for that interval, and when filter thresholds are provided it emits a soft-filtered VCF alongside the raw calls. This package is suitable for focused local checks and run-envelope generation; subtype skills still own germline, somatic, UMI, reference-resource, cohort, annotation, and workflow assumptions. + +This compact runner now writes advisory `resources/resource_plan.json`, `resource_manifest.tsv`, `resource_env.sh`, and `resource_readiness.md` artifacts for the selected genome bundle. Use `--require-resource-plan` when missing registered reference resources should block readiness; otherwise the explicit `--reference-fasta` remains enough for focused local checks. + +## Kickoff Pattern + +Preflight-first nf-core pattern: + +```bash +nextflow run nf-core/sarek \ + -profile test,docker \ + --outdir results/sarek_test +``` + +Real run skeleton: + +```bash +nextflow run nf-core/sarek \ + -profile docker \ + --input samplesheet.csv \ + --outdir results/sarek \ + --genome GRCh38 \ + --tools haplotypecaller,vep +``` + +For WES/panel data, include the target BED. For tumor-normal data, verify pair metadata before execution. For UMI panels, preserve barcode handling and molecule-level QC. + +## Guardrails + +- Do not mix genome builds across FASTA, GTF/BED, known sites, and VEP cache. 
+- Do not download large references without confirming disk space and target path. +- Treat clinical interpretation as out of scope unless the user has a validated clinical workflow. diff --git a/plugins/ngs-analysis/skills/ngs-dna-variant-calling/agents/openai.yaml b/plugins/ngs-analysis/skills/ngs-dna-variant-calling/agents/openai.yaml new file mode 100644 index 0000000..23f3d20 --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-dna-variant-calling/agents/openai.yaml @@ -0,0 +1,4 @@ +interface: + display_name: "DNA Variants" + short_description: "Plan WGS/WES/panel variant calling" + default_prompt: "Classify my DNA sequencing run, choose germline, somatic, or UMI-panel routing, collect required parameters, check public tools, and prepare a preflight-first plan." diff --git a/plugins/ngs-analysis/skills/ngs-epigenomics-peaks/SKILL.md b/plugins/ngs-analysis/skills/ngs-epigenomics-peaks/SKILL.md new file mode 100644 index 0000000..c5b562c --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-epigenomics-peaks/SKILL.md @@ -0,0 +1,80 @@ +--- +name: ngs-epigenomics-peaks +description: Dispatch ATAC-seq, ChIP-seq, CUT&RUN, or CUT&Tag requests to assay-specific QC, alignment, signal-track, peak-calling, consensus, and differential peak workflows. +--- + +# Epigenomics Peaks + +Use this skill as the epigenomics dispatcher for ATAC-seq, ChIP-seq, CUT&RUN, or CUT&Tag analysis. Hand off to the assay-specific deep skill once the assay type is known. 
+ +## Essential Inputs + +Confirm: + +- assay type +- FASTQ or BAM input +- organism and genome build +- blacklist file, if available +- control samples: input DNA, IgG, or spike-in +- biological replicates +- peak type: narrow, broad, accessibility, or protocol-specific +- desired outputs: QC report, peaks, consensus peaks, bigWigs, differential peaks + +## Public Defaults + +Choose the workflow by assay: + +- ATAC-seq: `ngs-atacseq-peaks-qc` using `nf-core/atacseq` by default +- ChIP-seq: `ngs-chip-cutrun-peaks-qc` using `nf-core/chipseq` by default +- CUT&RUN or CUT&Tag: `ngs-chip-cutrun-peaks-qc` using `nf-core/cutandrun` by default + +Use direct MACS2 only for focused peak-calling tasks from prepared BAMs. + +## Preflight + +```bash +python plugins/ngs-analysis/scripts/ngs_preflight.py --pipeline epigenomics_peaks --emit-install-plan +``` + +## Local Execution Package + +For FASTQ intake/QC over ATAC-seq, ChIP-seq, CUT&RUN, or CUT&Tag data, use the shared FASTQ assay package: + +```bash +python plugins/ngs-analysis/scripts/run_fastq_assay_package.py \ + --lane epigenomics_peaks \ + --sample-sheet assay_samples.csv \ + --execute +``` + +This validates sample-sheet paths and read structure, runs seqkit stats and FastQC/MultiQC when available, and writes `peak_calling_readiness.json`. Full alignment, signal tracks, TSS/FRiP, consensus peaks, and differential analyses still route through the assay-specific workflow. + +Assay-specific ATAC and ChIP/CUT&RUN runners now also emit native review files alongside TSV/JSON summaries: `qc/*_dashboard.html`, FRiP/peak SVG plots, insert-size SVG plots, browser-track preview HTML, UCSC track lines, and IGV session files. 
+ +## Kickoff Pattern + +ATAC-seq preflight run: + +```bash +nextflow run nf-core/atacseq \ + -profile test,docker \ + --outdir results/atacseq_test +``` + +ChIP-seq preflight run: + +```bash +nextflow run nf-core/chipseq \ + -profile test,docker \ + --outdir results/chipseq_test +``` + +CUT&RUN/CUT&Tag preflight run: + +```bash +nextflow run nf-core/cutandrun \ + -profile test,docker \ + --outdir results/cutandrun_test +``` + +Carry replicate and control metadata through the sample sheet before running real analysis. diff --git a/plugins/ngs-analysis/skills/ngs-epigenomics-peaks/agents/openai.yaml b/plugins/ngs-analysis/skills/ngs-epigenomics-peaks/agents/openai.yaml new file mode 100644 index 0000000..cd88aa3 --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-epigenomics-peaks/agents/openai.yaml @@ -0,0 +1,4 @@ +interface: + display_name: "Epigenomics Peaks" + short_description: "Plan ATAC/ChIP/CUT&RUN/CUT&Tag analysis" + default_prompt: "Classify my epigenomics assay, route ATAC-seq versus ChIP/CUT&RUN/CUT&Tag, check controls and replicates, and prepare a preflight-first peak plan." diff --git a/plugins/ngs-analysis/skills/ngs-fastq-qc/SKILL.md b/plugins/ngs-analysis/skills/ngs-fastq-qc/SKILL.md new file mode 100644 index 0000000..f9c8b7c --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-fastq-qc/SKILL.md @@ -0,0 +1,127 @@ +--- +name: ngs-fastq-qc +description: Validate FASTQ inputs, run local FastQC/MultiQC QC, interpret QC signals, and optionally execute fastp or Cutadapt trimming branches without overwriting raw reads. +--- + +# FASTQ QC + +Use this skill for QC-only, trimming-first, or FASTQ quality interpretation workflows. This skill can execute the plugin-owned local FastQ QC runner when the user approves a local run. It should decide whether trimming or additional investigation is warranted; it should not blindly trim by default. 
+ +## Essential Inputs + +Confirm: + +- FASTQ paths and pairing convention +- whether output should be QC-only or trimmed FASTQs +- known adapter or primer sequences +- organism if contamination screening or host depletion is requested +- output directory +- whether FASTQs are raw, demultiplexed, previously trimmed, or downloaded from an archive +- whether downstream analysis expects original read lengths, UMIs, or inline barcodes + +## Public Tools + +Default tool set: + +- `FastQC` for raw read QC +- `MultiQC` for project-level summary +- `fastp` for all-in-one QC/trimming when acceptable +- `Cutadapt` when primer/adapter handling needs explicit sequences +- `seqkit` for quick counts, stats, and subsampling + +## Preflight + +```bash +python plugins/ngs-analysis/scripts/ngs_preflight.py --pipeline fastq_qc --emit-install-plan +``` + +## Local Execution + +Use the plugin-owned runner for local artifact-producing FASTQ QC: + +```bash +python plugins/ngs-analysis/scripts/run_fastq_qc.py \ + --sample-sheet samplesheet.csv \ + --execute +``` + +Single paired sample: + +```bash +python plugins/ngs-analysis/scripts/run_fastq_qc.py \ + --sample sampleA \ + --r1 sampleA_R1.fastq.gz \ + --r2 sampleA_R2.fastq.gz \ + --execute +``` + +Optional trimming branch: + +```bash +python plugins/ngs-analysis/scripts/run_fastq_qc.py \ + --sample-sheet samplesheet.csv \ + --trim-mode fastp \ + --execute +``` + +For explicit adapters: + +```bash +python plugins/ngs-analysis/scripts/run_fastq_qc.py \ + --sample-sheet samplesheet.csv \ + --trim-mode cutadapt \ + --adapter-r1 AGATCGGAAGAGC \ + --adapter-r2 AGATCGGAAGAGC \ + --execute +``` + +The runner performs pre-execution validation before Snakemake execution. It writes a timestamped run directory with `run_manifest.json`, `config.json`, `validation/`, `workflow/Snakefile`, logs, `artifact_index.json`, `summary.md`, FastQC/MultiQC outputs, and `qc_interpretation.json` after successful execution. 
+ +## Interpretation Rules + +Inspect raw QC before recommending trimming: + +- Per-base quality drop at the read end: consider quality trimming, but preserve enough length for alignment or amplicon merging. +- Adapter or primer signal: use `cutadapt` when explicit sequences matter; use `fastp` only when automatic handling is acceptable. +- Poly-G or patterned-flowcell artifacts: handle with a tool that explicitly supports the artifact and report the assumption. +- Overrepresented sequences: classify adapters, primers, rRNA, PhiX, host contamination, or true biology before filtering. +- Per-tile failures or severe quality shifts: flag possible run-level issues and avoid treating them as ordinary adapter contamination. +- High duplication: interpret by assay; it may be expected for amplicons, targeted panels, or low-input libraries. +- Pairing issues: verify R1/R2 file counts and read-name pairing before any downstream workflow. + +Do not overwrite input FASTQs. Preserve the raw QC reports even when trimmed FASTQs are created. + +## Kickoff Pattern + +QC-only: + +```bash +mkdir -p results/fastqc results/multiqc +fastqc -t 4 -o results/fastqc *.fastq.gz +multiqc results/fastqc -o results/multiqc +``` + +QC plus trimming: + +```bash +fastp \ + -i sample_R1.fastq.gz \ + -I sample_R2.fastq.gz \ + -o results/trimmed/sample_R1.fastq.gz \ + -O results/trimmed/sample_R2.fastq.gz \ + --html results/fastp/sample.html \ + --json results/fastp/sample.json +multiqc results -o results/multiqc +``` + +## Output Review + +Return a short QC interpretation with: + +1. sample/read-pair inventory +2. QC modules that look normal +3. QC modules that require action or user confirmation +4. trimming or no-trimming recommendation with rationale +5. 
downstream caveats such as short reads, contaminated libraries, or failed pairs + +When using the local runner, ground the response in the generated `qc_interpretation.json`, `summary.md`, and MultiQC report instead of relying only on expected artifacts. diff --git a/plugins/ngs-analysis/skills/ngs-fastq-qc/agents/openai.yaml b/plugins/ngs-analysis/skills/ngs-fastq-qc/agents/openai.yaml new file mode 100644 index 0000000..e2de3f5 --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-fastq-qc/agents/openai.yaml @@ -0,0 +1,4 @@ +interface: + display_name: "FASTQ QC" + short_description: "Run public FASTQ QC and trimming workflows" + default_prompt: "Inspect my FASTQs, infer pairings, check QC tools, interpret FastQC/MultiQC signals, and recommend trimming only when warranted." diff --git a/plugins/ngs-analysis/skills/ngs-runtime-env/SKILL.md b/plugins/ngs-analysis/skills/ngs-runtime-env/SKILL.md new file mode 100644 index 0000000..dfb10d9 --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-runtime-env/SKILL.md @@ -0,0 +1,91 @@ +--- +name: ngs-runtime-env +description: Check whether public NGS tools and packages already exist before downloading, installing, or running a sequencing pipeline. +--- + +# NGS Runtime Environment + +Use this skill whenever an NGS workflow needs package checks, install planning, or runtime validation. + +## Existence Check Order + +1. Check executables on `PATH` with `command -v` or `shutil.which`. +2. Check Python imports for Python-backed tools. +3. Check active package managers with `conda list`, `mamba list`, `micromamba list`, or `pip show`. +4. If requested, check package indexes or container registries. +5. Emit an install plan before installing. +6. Install only when explicitly requested by the user. + +Do not modify system Python. Prefer isolated conda/mamba environments or containers. 
+ +## Script + +From the repo root: + +```bash +python plugins/ngs-analysis/scripts/ngs_preflight.py --list +python plugins/ngs-analysis/scripts/ngs_preflight.py --tool fastqc --emit-install-plan +python plugins/ngs-analysis/scripts/ngs_preflight.py --profile local_light --emit-install-plan +python plugins/ngs-analysis/scripts/ngs_preflight.py --pipeline dna_variant_calling --network-checks --emit-install-plan +python plugins/ngs-analysis/scripts/ngs_preflight.py --pipeline shotgun_metagenomics --manager micromamba --install-plan-outdir runtime_readiness/shotgun_install +``` + +Use `--install-plan-outdir` when a user needs a reviewable permission handoff. It writes `install_plan.json` as the canonical machine-readable plan and `install_commands.sh` as a guarded shell companion generated from the same plan. The shell companion is review-only by default; it exits without installing unless `NGS_RUN_INSTALL_COMMANDS=1` is set after explicit user approval. + +Check reference and database bundle readiness separately from executable readiness: + +```bash +python plugins/ngs-analysis/scripts/ngs_reference_manager.py list +python plugins/ngs-analysis/scripts/ngs_reference_manager.py check --kind reference --bundle grch38_core --root /refs/GRCh38 +python plugins/ngs-analysis/scripts/ngs_reference_manager.py explain-missing --kind database --bundle kraken2_standard --root /db/kraken2/standard +python plugins/ngs-analysis/scripts/ngs_reference_manager.py plan --pipeline shotgun_metagenomics --include-optional --outdir resource_readiness/shotgun +python plugins/ngs-analysis/scripts/ngs_reference_manager.py setup-plan --pipeline shotgun_metagenomics --include-optional --outdir resource_readiness/shotgun_setup +python plugins/ngs-analysis/scripts/ngs_reference_manager.py plan --pipeline atacseq --genome-build GRCh38 --bundle-root grch38_core=/refs/GRCh38 --outdir resource_readiness/atac +python plugins/ngs-analysis/scripts/ngs_reference_manager.py inventory --outdir 
resource_readiness/inventory +python plugins/ngs-analysis/scripts/ngs_reference_manager.py lock --outdir resource_readiness/lock --include-checksums +python plugins/ngs-analysis/scripts/ngs_reference_manager.py verify-lock --lockfile resource_readiness/lock/resource_lock.json --outdir resource_readiness/lock_verify --fail-on-mismatch +python plugins/ngs-analysis/scripts/ngs_reference_manager.py check-all --kind database --output resource_readiness/database_audit.json +``` + +Use `plan` before claiming that a reference- or database-heavy workflow is runnable. The plan output writes `resource_plan.json`, `resource_manifest.tsv`, `resource_env.sh`, `resource_readiness.md`, and setup-plan artifacts; missing required bundles are blocking, while optional bundles such as Bracken/HUMAnN or HOMER motif resources should stay explicit. + +Use `setup-plan` when the user needs an actionable resource/database setup checklist without running an assay. It writes `resource_setup_plan.json`, `resource_setup_plan.tsv`, `resource_setup_plan.md`, and `resource_setup_commands.sh`. The shell skeleton keeps setup hints commented by default, so large reference/database downloads remain deliberate and reviewable. + +Use `inventory` when the user needs a broader resource/database audit across the plugin. It writes `resource_inventory.json`, `resource_inventory.tsv`, `resource_env.sh`, and `resource_dashboard.md`, including missing files, env vars, setup hints, license notes, and pipeline usage for every known bundle. + +Use `lock` after resources are ready for a project or handoff. It snapshots the resource inventory into `resource_lock.json`, `resource_lock.tsv`, and `resource_lock.md`; `verify-lock` compares the lockfile against current local paths and writes a drift report before reruns. 
+ +The nf-core adapter performs the same resource gate automatically unless `--skip-resource-plan` is supplied: + +```bash +python plugins/ngs-analysis/scripts/run_nfcore_pipeline.py --pipeline taxprofiler --sample-sheet samples.csv --profile docker --bundle-root kraken2_standard=/db/kraken2/standard --include-optional-resources +``` + +The direct bulk RNA-seq counts/QC, scRNA FASTQ-to-count, generic DNA, germline DNA, somatic DNA, UMI panel, ATAC, ChIP/CUT&RUN, amplicon, and shotgun backend runners also emit run-local `resources/` readiness bundles. These direct runners use advisory resource checks by default so custom or reduced local inputs can still be planned; add `--require-resource-plan` when missing registered bundles should block readiness. + +Use `--install-missing --yes` only after explicit user approval: + +```bash +python plugins/ngs-analysis/scripts/ngs_preflight.py --pipeline fastq_qc --manager mamba --install-missing --yes +``` + +## Install Strategy + +Prefer these patterns: + +- nf-core workflows: install/check `nextflow`; use Docker/Singularity/Apptainer profiles for process tools. +- local execution: install/check `snakemake`; use `mamba` or `micromamba` environments and avoid containers by default. +- small QC tools: install with `mamba` or `micromamba` from `conda-forge` and `bioconda`. +- Python analysis packages: install in a dedicated environment, not global Python. +- large databases and references: estimate size and check existing paths before downloading. +- pipeline resource plans: use `--bundle-root bundle=/path` or the registry `root_env` variables so downstream runs can cite the exact local bundle roots. 
+ +## Report + +Summarize: + +- present tools and paths +- missing tools +- package-index checks, if performed +- suggested install commands +- tools that are proprietary, EULA-bound, cloud-bound, or database-heavy diff --git a/plugins/ngs-analysis/skills/ngs-runtime-env/agents/openai.yaml b/plugins/ngs-analysis/skills/ngs-runtime-env/agents/openai.yaml new file mode 100644 index 0000000..8cbe280 --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-runtime-env/agents/openai.yaml @@ -0,0 +1,4 @@ +interface: + display_name: "NGS Runtime" + short_description: "Check sequencing packages before installing" + default_prompt: "Check which NGS tools already exist locally, then emit an install plan for the missing public packages without installing by default." diff --git a/plugins/ngs-analysis/skills/ngs-scrna-seq/SKILL.md b/plugins/ngs-analysis/skills/ngs-scrna-seq/SKILL.md new file mode 100644 index 0000000..fc23067 --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-scrna-seq/SKILL.md @@ -0,0 +1,88 @@ +--- +name: ngs-scrna-seq +description: Route single-cell or single-nucleus RNA-seq FASTQs to public count-generation workflows and defer post-count matrix QC, annotation, clustering, and UMAP analysis to the embedded scrna-seq-qc skill. +--- + +# Single-cell RNA-seq + +Use this skill for scRNA-seq or snRNA-seq kickoff from FASTQs, Cell Ranger-style outputs, matrices, `.h5`, `.h5ad`, or `.rds`. This skill owns upstream intake and FASTQ-to-count routing; post-count QC, annotation, clustering, and UMAPs must route to the embedded `scrna-seq-qc` skill. 
+ +## Essential Inputs + +Confirm: + +- input type: FASTQ, count matrix, `.h5`, `.h5ad`, or `.rds` +- assay: single-cell or single-nucleus +- chemistry or barcode/UMI layout +- organism and reference +- expected cells per sample when available +- sample, donor, batch, and channel metadata +- desired endpoint: count matrix only, QC, clustering, annotation, UMAP, or differential abundance/expression + +## Public Default + +For FASTQs, prefer public alternatives: + +- `nf-core/scrnaseq` +- STARsolo +- kallisto-bustools via `kb-python` +- alevin-fry + +Use 10x Cell Ranger only when the user explicitly wants vendor-standard output and has accepted the 10x EULA. + +## Implementation Sequence + +Treat scRNA as three ordered rows in the plugin state and execute them sequentially: + +1. FASTQ-to-count: + count matrix generation, barcode and feature tables, chemistry or whitelist choice, and a backend summary. +2. Post-count QC and annotation: + raw-count-preserving objects, QC metrics, threshold plots, doublet and ambient-RNA outputs, clustering, UMAPs, and annotation confidence. +3. Downstream stats: + pseudobulk matrices, differential expression or abundance tables, and per-condition plots. + +Cell Ranger is an optional backend when vendor-standard output is explicitly required. It is not a standalone roadmap row and it is not the default execution target. + +For post-count QC/annotation, use the embedded `skills/scrna-seq-qc` guidance. Route to that skill whenever the requested endpoint starts from a matrix, `.h5`, `.h5ad`, `.rds`, Cell Ranger output, or asks for QC, doublets, ambient RNA, annotation, clustering, UMAPs, or post-count differential summaries. 
+ +## Preflight + +```bash +python plugins/ngs-analysis/scripts/ngs_preflight.py --pipeline scrnaseq --emit-install-plan +``` + +## Kickoff Pattern + +nf-core preflight run: + +```bash +python plugins/ngs-analysis/scripts/run_nfcore_pipeline.py \ + --pipeline scrnaseq \ + --sample-sheet samplesheet.csv \ + --profile docker \ + --genome GRCh38 \ + --bundle-root grch38_core=/refs/GRCh38 +``` + +This adapter captures the generated params, pinned Nextflow command, resource gate, trace/report paths, run manifest, and visualization index in the standard plugin envelope. Add `--revision <tag>` for pinned nf-core execution and `--execute` only when Nextflow plus a container/HPC profile are ready. + +Plugin-owned local execution: + +```bash +python plugins/ngs-analysis/scripts/run_scrnaseq_fastq_to_count.py \ + --sample-sheet samplesheet.csv \ + --genome-fasta reference/genome.fa \ + --annotation-gtf reference/genes.gtf \ + --cb-whitelist reference/whitelist.txt \ + --execute +``` + +The FASTQ-to-count runner emits advisory `resources/resource_plan.json`, `resource_manifest.tsv`, `resource_env.sh`, and `resource_readiness.md` outputs by default. Add `--genome-build`, `--bundle-root <bundle>=<path>`, and `--require-resource-plan` when STARsolo reference bundle completeness should block readiness. + +Matrix-level QC should be handled by `scrna-seq-qc` and must preserve raw counts, per-sample metadata, filter decisions, doublet calls, ambient-RNA handling, and plot outputs. + +## Guardrails + +- Do not assume 10x chemistry from filenames alone. +- Do not silently skip doublet or ambient-RNA assessment when doing QC. +- Do not over-annotate clusters without matched references or clear markers. 
diff --git a/plugins/ngs-analysis/skills/ngs-scrna-seq/agents/openai.yaml b/plugins/ngs-analysis/skills/ngs-scrna-seq/agents/openai.yaml new file mode 100644 index 0000000..bf16454 --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-scrna-seq/agents/openai.yaml @@ -0,0 +1,4 @@ +interface: + display_name: "Single-cell RNA-seq" + short_description: "Plan scRNA-seq counting, QC, and annotation" + default_prompt: "Inspect my single-cell inputs, resolve chemistry/reference needs, choose a public FASTQ-to-count path, or route post-count analysis to scrna-seq-qc." diff --git a/plugins/ngs-analysis/skills/ngs-shotgun-metagenomics/SKILL.md b/plugins/ngs-analysis/skills/ngs-shotgun-metagenomics/SKILL.md new file mode 100644 index 0000000..84d08e9 --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-shotgun-metagenomics/SKILL.md @@ -0,0 +1,103 @@ +--- +name: ngs-shotgun-metagenomics +description: Kick off public shotgun metagenomics QC, host-depletion, taxonomic profiling, and functional profiling workflows using nf-core/taxprofiler, Kraken2, Bracken, MetaPhlAn, and HUMAnN. +--- + +# Shotgun Metagenomics + +Use this skill for shotgun metagenomic FASTQs. + +## Essential Inputs + +Confirm: + +- paired-end or single-end reads +- host organism and host-depletion requirement +- target outputs: taxonomic profile, functional profile, assembly, binning, or QC only +- preferred database family, if any +- database paths or permission to download large databases +- sample metadata, batches, and negative controls + +## Public Defaults + +Prefer `nf-core/taxprofiler` for reproducible taxonomic profiling. Use direct Kraken2/Bracken, MetaPhlAn, or HUMAnN when the user wants a focused path or already has databases installed. + +For direct backend execution, prefer the plugin runner over handwritten shell when possible because it validates database bundle contents and records `resources/resource_plan.json`, `resource_manifest.tsv`, `resource_env.sh`, and `resource_readiness.md`. 
`--run-bracken` and `--run-humann` make those database bundles blocking, not merely optional. + +## Preflight + +```bash +python plugins/ngs-analysis/scripts/ngs_preflight.py --pipeline shotgun_metagenomics --emit-install-plan +``` + +## Local Execution Package + +For FASTQ intake/QC before host-depletion, taxonomic profiling, or functional profiling, use: + +```bash +python plugins/ngs-analysis/scripts/run_fastq_assay_package.py \ + --lane shotgun_metagenomics \ + --sample-sheet shotgun_samples.csv \ + --execute +``` + +This validates read paths and structure, runs seqkit stats and FastQC/MultiQC when available, and writes `taxonomic_classification_status.json`. Add `--kraken-db /path/to/db` only when a local Kraken2 database is available; otherwise the package records the database/tool blocker explicitly. + +For backend taxonomic and functional profiling when databases are available, use: + +```bash +python plugins/ngs-analysis/scripts/run_shotgun_metagenomics.py \ + --sample-sheet shotgun_samples.csv \ + --kraken-db /db/kraken2/standard \ + --host-reference /refs/human_kneaddata_db \ + --run-bracken \ + --run-humann \ + --humann-db /db/humann \ + --metadata sample_metadata.tsv \ + --execute +``` + +For nf-core execution, use `plugins/ngs-analysis/scripts/run_nfcore_pipeline.py --pipeline taxprofiler`. + +When `--host-reference` is supplied, the backend runner adds a KneadData host-depletion step, requires `kneaddata` in tool preflight, writes cleaned FASTQs under `host_depletion/`, and uses those cleaned reads for downstream Kraken2 and HUMAnN steps. Keep the host reference path and host-depletion decision visible because it can change taxonomic and functional abundance conclusions. 
+ +The backend runner writes native matrix artifacts when database tools produce outputs: + +- `tables/bracken_est_reads_matrix.tsv` +- `tables/bracken_relative_abundance_matrix.tsv` +- `tables/humann_pathabundance_matrix.tsv` +- `tables/humann_genefamilies_matrix.tsv` +- `tables/bracken_summary.json` and `tables/humann_summary.json` +- `tables/top_bracken_taxa.tsv`, `tables/top_humann_pathways.tsv`, `tables/top_humann_gene_families.tsv`, and `tables/metagenomics_backend_review.json` when normalized backend matrices are available + +If Kraken2/Bracken/HUMAnN outputs are absent, the summaries and visualization manifest keep those layers `not_available` instead of implying taxonomic or functional interpretation succeeded. + +## Kickoff Pattern + +nf-core preflight run: + +```bash +nextflow run nf-core/taxprofiler \ + -profile test,docker \ + --outdir results/taxprofiler_test +``` + +Direct Kraken2 skeleton: + +```bash +kraken2 \ + --db /path/to/kraken2_db \ + --paired sample_R1.fastq.gz sample_R2.fastq.gz \ + --report results/kraken2/sample.report \ + --output results/kraken2/sample.kraken +``` + +## Visualization Outputs + +The local FASTQ package always writes `visualizations/index.html` and `visualizations/visualization_manifest.json`. With only FASTQs, this is a read-QC/readiness bundle. Provide existing `--kraken-report`, `--bracken-table`, `--humann-pathabundance`, or `--humann-genefamilies` files to generate native taxonomy and functional-profile plots without requiring a Marimo notebook. For full backend runs, `run_shotgun_metagenomics.py` now also merges generated Bracken/HUMAnN outputs into plugin-native tables for the review bundle and writes `visualizations/shotgun_backend_dashboard.html` plus SVG plots for top Bracken taxa, HUMAnN pathways, and HUMAnN gene families when the corresponding matrices are present. + +## Guardrails + +- Do not auto-download large databases without confirming size and destination. 
+- Host depletion choices can change biological conclusions; document the reference and parameters. +- Negative controls should stay visible in QC and interpretation. diff --git a/plugins/ngs-analysis/skills/ngs-shotgun-metagenomics/agents/openai.yaml b/plugins/ngs-analysis/skills/ngs-shotgun-metagenomics/agents/openai.yaml new file mode 100644 index 0000000..ac33f1c --- /dev/null +++ b/plugins/ngs-analysis/skills/ngs-shotgun-metagenomics/agents/openai.yaml @@ -0,0 +1,4 @@ +interface: + display_name: "Shotgun Metagenomics" + short_description: "Plan shotgun metagenomics profiling" + default_prompt: "Inspect my metagenomic FASTQs, resolve host-depletion and database choices, check public tools, and prepare a taxonomic or functional profiling preflight run." diff --git a/plugins/ngs-analysis/skills/scrna-seq-qc/SKILL.md b/plugins/ngs-analysis/skills/scrna-seq-qc/SKILL.md new file mode 100644 index 0000000..4c60e7f --- /dev/null +++ b/plugins/ngs-analysis/skills/scrna-seq-qc/SKILL.md @@ -0,0 +1,97 @@ +--- +name: scrna-seq-qc +description: Process, quality-control, annotate, and visualize single-cell or single-nucleus RNA-seq datasets across tissues and species. Use when Codex needs to build, adapt, or review a general scRNA-seq QC pipeline; choose dataset-appropriate cell-level filters from QC distributions; run required scDblFinder-based doublet and ambient-RNA filtering; annotate cells with matched references or marker-based fallbacks; or generate global and per-group UMAP visualizations for large scRNA-seq datasets. +--- + +# scRNA-seq QC + +## Start Here + +Read `references/qc-annotation-umap-heuristics.md` before picking thresholds, annotation backends, or UMAP feature-selection rules. + +Confirm what inputs exist before writing code: + +- An AnnData object or equivalent with raw counts preserved. +- Per-sample, per-batch, or per-channel metadata, because QC and doublet detection should respect technical partitions. 
+- Organism, tissue, assay type, chemistry, and whether the data are whole-cell or single-nucleus. +- Whether a matched cell atlas or label-transfer reference exists for the tissue and species. + +Preserve provenance in the output: package versions, thresholds, threshold-justification plots, counts removed or flagged at each filter, annotation backend and reference, marker-gene selection heuristic, and any manual cluster exclusions. + +## Workflow + +1. Choose QC thresholds from the data, not from a fixed template. + - Plot detected genes, total UMIs, mitochondrial fraction, and any tissue-specific nuisance signals overall and by batch. + - Inspect all available QC metrics, but default filtering should use only the standard metrics: detected genes, total counts, and `percent.mt`. + - Pick thresholds from the observed distributions and expected biology. + - Save a plot with threshold lines and record why each threshold is appropriate for this dataset. + - If another metric looks important enough to filter on, flag it as a dataset-specific issue, explain why, and consult the user before adding that extra filter. + - Keep QC plots legible: do not overload a single panel with too many batches or categories when faceting, splitting, or summary views would communicate the result more clearly. + +2. Run cell-level QC. + - Remove or flag obvious low-quality barcodes using the chosen thresholds on detected genes, total counts, and `percent.mt`. + - Use `scDblFinder` for doublet detection. Run it per batch or capture channel, and split very large batches before doublet calling. + - Do not skip doublet calling or silently substitute another method. If `scDblFinder` cannot run in the environment, surface the blocker explicitly or get user approval before using a different caller. + - Compute ambient-RNA style metrics and use them for filtering when the dataset and workflow support it. 
+ - Compute any other informative QC metrics when feasible, but do not turn those additional nonstandard metrics into hard filters without explicit user approval unless the user already asked for a stricter policy. + - Prefer adding a `passes_QC` column instead of physically dropping cells when downstream provenance matters. + +3. Build a latent space and inspect residual artifacts. + - Decide whether `scVI` is warranted for this dataset and use case before training it. + - Prefer a standard PCA/Scanpy workflow for smaller, simpler datasets with limited batch structure or when a conventional embedding answers the question cleanly. + - Prefer `scVI` when integration across batches, donors, chemistries, or related datasets is important, or when the dataset is large and noisy enough that a learned latent space is likely to help. + - Record why `scVI` or a conventional PCA workflow was chosen for this dataset. + - Cluster and inspect low-quality, mixed-marker, or ambiguous clusters before downstream visualization. + - Remove or flag artifact clusters only with explicit evidence, and record the rationale. + +4. Annotate cells. + - If a suitable Allen Brain Cell Atlas reference exists and the dataset is a compatible brain tissue and species, use MapMyCells or `cell_type_mapper`. + - If no suitable Allen reference exists, use the closest matched reference for tissue, species, assay, and chemistry with an appropriate mapping tool. + - If no reliable reference exists, annotate conservatively from canonical markers and cluster-level markers. Assign coarse labels first and leave uncertain clusters as unknown or ambiguous rather than overlabeling them. + - Persist annotation confidence or probability fields when available, together with at least one coarse and one fine label. + +5. Choose a general marker panel for global UMAP. + - Do not rely on a perturbation-specific or brain-only marker panel. + - Start from HVGs selected in a batch-aware way. 
+ - Add genes that distinguish major coarse compartments or high-confidence labels, for example top markers per coarse cluster or class. + - Exclude nuisance-dominated genes if they swamp the embedding unless the biology requires them. + - Document how the panel was chosen. + +6. Generate UMAP visualizations. + - For a global UMAP, use the learned latent space or the chosen informative marker panel, depending on which better matches the analytical goal and runtime constraints. + - For per-group UMAPs, subset by a stable coarse label and use the latent representation unless there is a strong reason to rebuild on expression features. + - Keep plotting separate from filtering so visualization choices do not mutate the core analysis object. + - Make every plot legible. Use a reasonable number of categories per panel, prefer coarse labels on overview plots, and split or facet figures when fine labels, batches, or neighborhoods would otherwise make the figure unreadable. + +7. Scale to large datasets without copying. + - Keep matrices sparse whenever possible. + - Avoid densifying whole matrices. + - Avoid whole-object copies of AnnData or Seurat objects; use views, backed mode, chunked operations, and per-batch or per-group manifests instead. + - When crossing Python and R boundaries, pass only the subset and metadata required for the step. + - Write checkpoints after major stages so failures do not require restarting from raw ingest. + +## Deliverables + +When implementing a pipeline, produce an auditable output set: + +- Filtered `.h5ad` or equivalent object with raw counts preserved and QC or annotation fields in metadata. +- QC summary table with input cells, cells removed or flagged by each filter, final cells, and per-batch summaries. +- Threshold-justification plots for detected genes, UMIs, mitochondrial fraction, plus any additional QC metric that was inspected; clearly separate metrics that informed review from metrics that actually drove filtering. 
+- Parameter manifest with thresholds, package versions, annotation backend and reference, marker-panel heuristic, and any manual exclusions. +- UMAP coordinates and plots for global and per-group views when requested, with category counts and panel layouts chosen so the figures remain legible. + +## Embedded Runner + +For 10x-style matrix bundles, a local runner is available: + +```bash +python plugins/ngs-analysis/scripts/run_scrnaseq_post_count_qc.py --input-dir path/to/scrna_bundle +``` + +The input directory should contain `matrix/`, `manifest.tsv`, and `dataset_metadata.json`, unless explicit paths are supplied. Treat the runner as an auditable analysis surface: its marker-based fallback is PBMC-oriented when no matched reference is provided, so tissue-specific annotation and integration choices still require review. + +The runner writes `visualizations/index.html` for portable artifact review, `summary.md` plus `provenance/analysis_status.json` for explicit completeness/blocker reporting, and auto-launches a localhost Marimo review app recorded in `notebooks/marimo_server.json`. It also writes `notebooks/scrna_qc_review.marimo.py` as a notebook backup over the generated PNG/CSV/H5AD outputs. Treat the notebook and review app as review layers, not as the source of truth; the run envelope and generated artifacts remain canonical. + +## Resources + +- `references/qc-annotation-umap-heuristics.md`: Threshold-selection heuristics, annotation fallback strategy, general marker-panel selection rules, and large-dataset memory practices. 
diff --git a/plugins/ngs-analysis/skills/scrna-seq-qc/agents/openai.yaml b/plugins/ngs-analysis/skills/scrna-seq-qc/agents/openai.yaml new file mode 100644 index 0000000..6dda7ac --- /dev/null +++ b/plugins/ngs-analysis/skills/scrna-seq-qc/agents/openai.yaml @@ -0,0 +1,4 @@ +interface: + display_name: "scRNA-seq QC" + short_description: "Help with scRNA-seq QC workflows" + default_prompt: "Build or review a general single-cell RNA-seq QC pipeline and choose dataset-appropriate thresholds, annotation strategy, and visualization outputs." diff --git a/plugins/ngs-analysis/skills/scrna-seq-qc/references/qc-annotation-umap-heuristics.md b/plugins/ngs-analysis/skills/scrna-seq-qc/references/qc-annotation-umap-heuristics.md new file mode 100644 index 0000000..f9edae9 --- /dev/null +++ b/plugins/ngs-analysis/skills/scrna-seq-qc/references/qc-annotation-umap-heuristics.md @@ -0,0 +1,184 @@ +# General scRNA-seq QC Heuristics + +Use this file when choosing dataset-specific QC thresholds, annotation strategy, or UMAP feature-selection rules for a new single-cell or single-nucleus RNA-seq dataset. + +## Choosing QC Thresholds + +Always choose thresholds from the observed data and expected biology. Do not copy a single min-genes or mitochondrial cutoff across tissues, species, assay types, or chemistries. + +Inspect these plots overall and per batch: + +- Histogram or density of detected genes per barcode +- Histogram or density of total UMIs per barcode +- Scatter of total UMIs vs detected genes +- Mitochondrial fraction by batch +- Any tissue-specific nuisance fraction such as ribosomal, hemoglobin, chloroplast, or stress-response signal + +Inspect the full QC panel, but by default only filter on the standard core metrics: + +- detected genes per cell +- total counts or UMIs per cell +- `percent.mt` + +Treat other QC metrics as review signals first, not automatic filters. 
Ambient RNA metrics are the main exception: they may be used for filtering when the dataset and workflow support them. If another nonstandard metric looks important enough to drive filtering for this dataset, surface that explicitly, explain the evidence, and consult the user before adding it as a hard cutoff. + +### Detected genes per cell + +Use one of these patterns: + +- Prefer the local minimum between low-complexity barcodes and the main cell distribution when the histogram shows a clear valley. +- If there is no clear valley, define putative cells broadly, then choose a conservative lower cutoff from a robust outlier rule such as 3 MAD below the median of putative cells. +- Review the cutoff against expected biology so that small but real cell types are not removed by default. + +Starting guesses are allowed only as a first pass: + +- Typical droplet scRNA-seq: often a few hundred to low thousands of genes +- Single-nucleus data: often lower than whole-cell data +- High-complexity tissues or deep sequencing: often higher than shallow droplet datasets + +Always keep the plot that justified the final choice. + +### Total UMIs and upper outliers + +Use total UMIs and detected genes together: + +- Very low UMI outliers usually track low-complexity barcodes. +- Very high UMI or very high gene-count outliers often indicate doublets or multiplets, but some large cell types are real. + +Use conservative upper-tail rules such as: + +- Above a very high percentile +- Several MAD above the median within batch + +Then review whether flagged cells belong to known large-cell populations before removing them. + +### Mitochondrial and nuisance fractions + +There is no universal mitochondrial cutoff. + +- In stressed whole-cell datasets, higher mitochondrial fractions can mark dying cells. +- In nucleus datasets, mitochondrial fraction is often less informative. 
+- Some tissues need additional nuisance metrics, for example hemoglobin in blood-contaminated samples or chloroplast genes in plant data. + +Choose the nuisance thresholds from the observed distributions and include a plot with the selected cutoff. + +Even when additional nuisance metrics are plotted and interpreted, default filtering should still be limited to detected genes, total counts, and `percent.mt`, with ambient RNA filtering allowed as a supported exception. Other extra filters should only be added with explicit user approval. + +## Required Doublet Detection Method + +Use `scDblFinder` as the default and required doublet caller for this skill. + +- Run `scDblFinder` per batch, capture channel, or other technical partition so the caller sees realistic collision structure. +- For very large partitions, split them into smaller chunks before calling `scDblFinder`, then merge the calls back onto the parent object by cell barcode. +- Preserve the doublet score, class call, and the partition used for calling in the output metadata. +- Do not silently replace `scDblFinder` with another method. If the runtime cannot support R or Bioconductor dependencies, report that as a blocker or get explicit approval before using a fallback. + +Why this is the default: + +- It is robust for droplet-style scRNA-seq and snRNA-seq data. +- It supports sample-aware calling, which matters for multiplexed or batched experiments. +- It works well with a provenance-preserving workflow where cells are flagged rather than immediately removed. + +## When to Use scVI + +Choose the latent-space method based on the analytical need, not by habit. 
+ +Prefer a standard PCA or Scanpy workflow when: + +- the dataset is relatively small +- batch structure is minimal or already well controlled +- the main goal is straightforward QC, clustering, or plotting within one coherent dataset +- a conventional embedding is likely to answer the question without extra modeling overhead + +Prefer `scVI` when: + +- integration across batches, donors, chemistries, or studies is an explicit goal +- the dataset is large enough that a learned latent space is likely to improve robustness +- batch effects or technical heterogeneity clearly interfere with biological structure +- downstream annotation or comparison benefits from a shared latent representation + +Be cautious with `scVI` when: + +- the dataset has only a small number of cells overall +- individual batches are tiny +- the model complexity is unlikely to buy much beyond PCA + +Record why `scVI` was used or skipped so the choice is auditable. + +## Annotation When an Allen Brain Atlas Reference Is Not Appropriate + +Choose references in this order of preference: + +1. Same species +2. Same tissue or organ system +3. Same assay type and chemistry +4. Similar biological state, development stage, or disease context + +If a matched reference exists, use a compatible mapping workflow such as: + +- scANVI or scArches when you control or can rebuild the reference +- Seurat or Azimuth-style mapping when a mature Seurat reference exists +- SingleR when a suitable reference expression atlas is available +- CellTypist for immune or blood-rich datasets + +If no strong reference exists: + +- Cluster the dataset +- Call coarse compartments first +- Identify canonical and cluster-specific markers +- Use cluster-level or pseudobulk markers to refine labels +- Leave uncertain clusters as unknown, ambiguous, or mixed + +Prefer conservative hierarchical labels over overconfident fine labels. 
+ +Cross-species mapping can still help, but use it mainly for coarse lineage assignment unless conservation is known to be strong. + +## Marker-Gene Selection for Global UMAP + +For a general-purpose global UMAP, use an informative panel rather than all genes when the dataset is large or when nuisance programs dominate the embedding. + +Recommended heuristic: + +1. Compute HVGs in a batch-aware way. +2. Define coarse groups from high-confidence labels or unsupervised clusters. +3. Add top marker genes per coarse group so all major compartments are represented. +4. Remove nuisance-heavy genes if they dominate the panel: + - mitochondrial genes + - ribosomal genes + - hemoglobin genes + - strong cell-cycle genes when they obscure lineage structure + - stress or dissociation genes when they dominate the signal +5. Keep the panel compact but diverse. + +Reasonable panel sizes are often: + +- Small datasets: roughly 1k to 3k genes +- Medium to large datasets: roughly 2k to 8k genes + +The goal is coverage of major biological structure, not maximum panel size. + +If no stable labels exist yet, use HVGs alone or use the learned latent space directly. + +## Plot Legibility + +All QC and embedding plots should be easy to read. + +- Do not put an excessive number of categories into one panel or legend. +- Use coarse labels for overview plots and reserve fine labels for per-group or faceted plots. +- Split figures by batch, neighborhood, lineage, or major compartment when that improves readability. +- If a categorical legend becomes too large to parse, reduce the categories shown in that panel or make separate figures rather than shrinking text until it is unreadable. +- Favor a smaller set of interpretable plots over a single overloaded summary figure. + +## Large-Dataset Memory Practices + +Treat memory movement as a first-class design constraint. + +- Avoid copying full AnnData or Seurat objects. 
+- Prefer views, backed mode, chunked readers, and subset manifests over duplicated matrices. +- Keep matrices sparse and avoid densifying whole objects. +- Run `scDblFinder` and similar expensive steps per batch or per split subset. +- When passing data between Python and R, pass only the required cells and metadata, not the entire parent object. +- Free intermediates aggressively after large steps. +- Write checkpoints after major QC, annotation, and embedding stages. + +Copying should be avoided not only for count matrices but also for whole analysis objects, temporary exports, and language-bridge handoffs. diff --git a/plugins/ngs-analysis/tests/test_bcl_to_fastq_runner.py b/plugins/ngs-analysis/tests/test_bcl_to_fastq_runner.py new file mode 100644 index 0000000..ac64fd2 --- /dev/null +++ b/plugins/ngs-analysis/tests/test_bcl_to_fastq_runner.py @@ -0,0 +1,164 @@ +import csv +import sys +import tempfile +import unittest +from pathlib import Path +from types import SimpleNamespace +from unittest import mock + +SCRIPT_DIR = Path(__file__).resolve().parents[1] / "scripts" +sys.path.insert(0, str(SCRIPT_DIR)) + +import ngs_run_utils # noqa: E402 +import run_bcl_to_fastq # noqa: E402 + + +def write_csv(path: Path, rows: list[dict[str, str]]) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", newline="", encoding="utf-8") as handle: + writer = csv.DictWriter(handle, fieldnames=list(rows[0].keys())) + writer.writeheader() + writer.writerows(rows) + + +class ParseReportBundleTests(unittest.TestCase): + def test_parse_report_bundle_flags_high_undetermined_fraction(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + output_dir = Path(tmp) + reports_dir = output_dir / "Reports" + write_csv( + reports_dir / "Demultiplex_Stats.csv", + [ + { + "Lane": "1", + "SampleID": "sampleA", + "Index": "AAAA-BBBB", + "# Reads": "100", + "# Perfect Index Reads": "95", + "# One Mismatch Index Reads": "5", + "# Two Mismatch Index Reads": "0", + "% 
class ArtifactIndexTests(unittest.TestCase):
    """Tests for ``ngs_run_utils.build_artifact_index``."""

    def test_build_artifact_index_includes_extra_roots_and_skips_large_checksums(self) -> None:
        """An extra-root file is indexed with an empty checksum when it exceeds the limit.

        The limit ``MAX_AUTO_CHECKSUM_BYTES`` is patched to 8 while a 16-byte
        file sits under the ``output_directory`` extra root. The test expects
        an empty ``sha256`` and a skip reason mentioning the threshold.
        """
        with tempfile.TemporaryDirectory() as scratch:
            base = Path(scratch)
            run_dir = base / "run"
            out_dir = base / "out"
            config_path = run_dir / "config.json"
            fastq_path = out_dir / "sample.fastq.gz"

            config_path.parent.mkdir(parents=True, exist_ok=True)
            config_path.write_text("{}\n", encoding="utf-8")
            fastq_path.parent.mkdir(parents=True, exist_ok=True)
            # 16 bytes of payload: twice the patched threshold used below.
            fastq_path.write_bytes(b"0123456789abcdef")

            lowered_limit = mock.patch.object(ngs_run_utils, "MAX_AUTO_CHECKSUM_BYTES", 8)
            with lowered_limit:
                index = ngs_run_utils.build_artifact_index(
                    run_dir, extra_roots={"output_directory": out_dir}
                )

            wanted_path = "output_directory/sample.fastq.gz"
            entry = next(item for item in index["artifacts"] if item["path"] == wanted_path)
            self.assertEqual(entry["sha256"], "")
            self.assertIn("auto-checksum threshold", entry["sha256_skipped_reason"])
def write_text(path: Path, content: str) -> None:
    """Write ``content`` to ``path`` as UTF-8, creating parent directories first.

    Args:
        path: Destination file; overwritten if it already exists.
        content: Text to store in the file.
    """
    destination_dir = path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)
    path.write_text(content, encoding="utf-8")
class QcVerdictTests(unittest.TestCase):
    """Tests for ``run_bulk_rnaseq_counts_qc.compute_qc_verdict``."""

    def test_compute_qc_verdict_surfaces_mismatch_and_outlier(self) -> None:
        """A two-sample run is expected to fail overall because of ``sample2``.

        The fixture writes FastQC and Salmon MultiQC general-stats tables plus
        one ``lib_format_counts.json`` per sample. ``sample2`` is configured
        as ``ISR`` but its JSON reports ``ISF``, and it has the higher
        duplication and strand-bias values.
        """
        with tempfile.TemporaryDirectory() as scratch:
            root = Path(scratch)
            config = {
                "fastq_files": {
                    "SAMPLE1__r1": {"sample": "sample1", "path": str(root / "S1.fastq.gz")},
                    "SAMPLE2__r1": {"sample": "sample2", "path": str(root / "S2.fastq.gz")},
                },
                "rnaseq_salmon_samples": {
                    "sample1": {"salmon_libtype": "ISR"},
                    "sample2": {"salmon_libtype": "ISR"},
                },
            }

            # MultiQC general-stats tables, written in the same order as before.
            stats_tables = [
                (
                    root / "fastqc" / "multiqc" / "multiqc_data" / "multiqc_general_stats.txt",
                    ["Sample\tfastqc-percent_duplicates", "S1\t65.0", "S2\t85.0"],
                ),
                (
                    root / "rnaseq_salmon" / "multiqc" / "multiqc_data" / "multiqc_general_stats.txt",
                    ["Sample\tsalmon-percent_mapped", "sample1\t80.0", "sample2\t60.0"],
                ),
            ]
            for table_path, table_rows in stats_tables:
                write_text(table_path, "\n".join(table_rows) + "\n")

            # Per-sample Salmon library-format reports.
            quant_dir = root / "rnaseq_salmon" / "quant"
            lib_formats = [
                ("sample1", {"expected_format": "ISR", "strand_mapping_bias": 0.05}),
                ("sample2", {"expected_format": "ISF", "strand_mapping_bias": 0.20}),
            ]
            for sample_name, payload in lib_formats:
                write_text(quant_dir / sample_name / "lib_format_counts.json", json.dumps(payload))

            verdict = run_bulk_rnaseq_counts_qc.compute_qc_verdict(root, config)
            self.assertEqual(verdict["overall_status"], "fail")
            flagged = next(item for item in verdict["samples"] if item["sample"] == "sample2")
            self.assertEqual(flagged["libtype_status"], "fail")
            self.assertEqual(flagged["duplication_status"], "fail")
            self.assertEqual(flagged["strand_bias_status"], "warn")
def write(path: Path, text: str = "") -> Path:
    """Write ``text`` to ``path`` as UTF-8 and return ``path``.

    Parent directories are created when missing, so tests can lay out nested
    fixture files in one call and keep the returned path for later use.

    Args:
        path: Destination file; overwritten if it already exists.
        text: File content; defaults to an empty file.

    Returns:
        The same ``path`` object that was passed in.
    """
    fixture_dir = path.parent
    fixture_dir.mkdir(parents=True, exist_ok=True)
    path.write_text(text, encoding="utf-8")
    return path
def test_genome_pipeline_resource_plan_selects_build_bundle(self) -> None:
    """Planning ``atacseq`` for build ``mm39`` should pick the ``grcm39_core`` bundle.

    The plan is also expected to report the pipeline as ``atacseq_peaks_qc``
    and to be not-ok, since no bundle root is supplied here.
    """
    plan = ngs_reference_manager.plan_pipeline_resources("atacseq", genome_build="mm39")
    self.assertEqual(plan["pipeline"], "atacseq_peaks_qc")
    first_resource = plan["resources"][0]
    self.assertEqual(first_resource["bundle"], "grcm39_core")
    self.assertFalse(plan["ok"])
+ inventory = ngs_reference_manager.inventory_resources( + kind="reference", + bundle_roots={"reduced_bundle": bundle_root}, + registries=registries, + ) + outputs = ngs_reference_manager.write_resource_inventory_outputs( + inventory, root / "inventory" + ) + self.assertTrue(inventory["ok"], inventory) + self.assertEqual(inventory["ready_count"], 1) + self.assertTrue(Path(outputs["resource_dashboard"]).exists()) + dashboard = Path(outputs["resource_dashboard"]).read_text(encoding="utf-8") + self.assertIn("Local bundle", dashboard) + env_text = Path(outputs["resource_env"]).read_text(encoding="utf-8") + self.assertIn("NGS_REF_REDUCED_BUNDLE_ROOT", env_text) + + def test_resource_lockfile_verifies_and_detects_drift(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + bundle_root = root / "reduced_ref" + write(bundle_root / "genome.fa", ">chr1\nACGT\n") + write(bundle_root / "genome.fa.fai", "chr1\t4\t6\t4\t5\n") + registries = { + "references": { + "reduced_bundle": { + "display_name": "Local bundle", + "kind": "reduced_reference", + "root_env": "NGS_REF_REDUCED_BUNDLE_ROOT", + "source": "unit test", + "license_note": "test only", + "required_files": ["genome.fa", "genome.fa.fai"], + } + }, + "databases": {}, + } + inventory = ngs_reference_manager.inventory_resources( + kind="reference", + bundle_roots={"reduced_bundle": bundle_root}, + include_checksums=True, + registries=registries, + ) + lock = ngs_reference_manager.resource_lock_from_inventory(inventory) + outputs = ngs_reference_manager.write_resource_lock_outputs(lock, root / "lock") + self.assertTrue(lock["ok"], lock) + self.assertTrue(Path(outputs["resource_lock"]).exists()) + verification = ngs_reference_manager.verify_resource_lock(lock) + self.assertTrue(verification["ok"], verification) + + (bundle_root / "genome.fa.fai").unlink() + drifted = ngs_reference_manager.verify_resource_lock(lock) + self.assertFalse(drifted["ok"], drifted) + 
class DnaSubtypePlannerTests(unittest.TestCase):
    """Tests for the VCF review notebook helper.

    NOTE(review): despite the class name, both tests here exercise
    ``ngs_visualization_utils.add_vcf_review_notebook_entry``.
    """

    def test_vcf_review_notebook_helper_discovers_vcfs_and_writes_notebook(self) -> None:
        """With a VCF under ``variants/``, a notebook is written and recorded as created."""
        with tempfile.TemporaryDirectory() as scratch:
            run_root = Path(scratch)
            write(run_root / "variants" / "S1.vcf.gz", "vcf")
            recorded: list[dict[str, object]] = []
            tables = [("Sample Table", "validation/samples.normalized.tsv")]
            review = ngs_visualization_utils.add_vcf_review_notebook_entry(
                run_root,
                recorded,
                title="Unit Test VCF Review",
                table_items=tables,
            )
            self.assertEqual(review["review_notebook"], "notebooks/vcf_review.marimo.py")
            notebook_file = run_root / "notebooks" / "vcf_review.marimo.py"
            self.assertTrue(notebook_file.exists())
            # The helper appends to the list it is given; inspect the newest entry.
            self.assertEqual(recorded[-1]["kind"], "notebook")
            self.assertEqual(recorded[-1]["status"], "created")

    def test_vcf_review_notebook_helper_marks_not_available_when_no_vcf(self) -> None:
        """Without any VCF, the helper returns ``{}`` and records ``not_available``."""
        with tempfile.TemporaryDirectory() as scratch:
            run_root = Path(scratch)
            recorded: list[dict[str, object]] = []
            review = ngs_visualization_utils.add_vcf_review_notebook_entry(
                run_root, recorded, title="Unit Test VCF Review"
            )
            self.assertEqual(review, {})
            self.assertEqual(recorded[-1]["status"], "not_available")
"bam") + sheet = write( + root / "pairs.tsv", + "\t".join(["pair_id", "tumor_sample", "tumor_bam", "normal_sample", "normal_bam"]) + + "\n" + + "\t".join(["P1", "T", "tumor.bam", "N", "normal.bam"]) + + "\n", + ) + args = SimpleNamespace( + sample_sheet=sheet, + reference_fasta=reference, + target_bed=None, + panel_of_normals=None, + germline_resource=None, + annotation_vcf=None, + f1r2_orientation_model=True, + ) + validation, pairs = run_dna_somatic_variants.validate_inputs(args) + plan = run_dna_somatic_variants.mutect2_plan(args, pairs) + self.assertTrue(validation["ok"], validation) + self.assertEqual(pairs[0]["design"], "tumor_normal") + self.assertTrue(any("Mutect2" in item["command"] for item in plan)) + self.assertTrue(any("FilterMutectCalls" in item["command"] for item in plan)) + + def test_somatic_pair_review_parses_postrun_stats(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + reference = write(root / "ref.fa", ">chr1\nACGT\n") + write(root / "tumor.bam", "bam") + sheet = write( + root / "pairs.tsv", "pair_id\ttumor_sample\ttumor_bam\nP1\tT\ttumor.bam\n" + ) + args = SimpleNamespace( + sample_sheet=sheet, + reference_fasta=reference, + target_bed=None, + panel_of_normals=None, + germline_resource=None, + annotation_vcf=None, + f1r2_orientation_model=False, + ) + validation, pairs = run_dna_somatic_variants.validate_inputs(args) + write(root / "variants" / "P1.filtered.vcf.gz", "vcf") + write( + root / "variants" / "P1.bcftools_stats.txt", + "SN\t0\tnumber of records:\t4\nSN\t0\tnumber of SNPs:\t3\nSN\t0\tnumber of indels:\t1\n", + ) + rows = run_dna_somatic_variants.summarize_somatic_artifacts( + root, validation, pairs, args + ) + self.assertEqual(rows[0]["status"], "created") + self.assertEqual(rows[0]["design"], "tumor_only") + self.assertEqual(rows[0]["variant_records"], 4) + self.assertTrue((root / "qc" / "somatic_pair_review.tsv").exists()) + + def 
def test_umi_plan_generates_consensus_and_variant_steps(self) -> None:
    """A raw-BAM sample sheet should plan both consensus calling and variant calling.

    The sample is expected to be marked ``needs_generation``. The plan must
    contain a ``CallMolecularConsensusReads`` command and a
    ``bcftools mpileup`` command.
    """
    with tempfile.TemporaryDirectory() as scratch:
        workdir = Path(scratch)
        reference = write(workdir / "ref.fa", ">chr1\nACGT\n")
        write(workdir / "raw.bam", "bam")
        sheet = write(workdir / "samples.tsv", "sample\traw_bam\nS1\traw.bam\n")
        options = {
            "sample_sheet": sheet,
            "reference_fasta": reference,
            "target_bed": None,
            "hotspot_vcf": None,
            "umi_mode": "duplex",
            "umi_tag": "RX",
            "grouping_strategy": "adjacency",
            "umi_edits": 1,
            "min_reads_per_molecule": 2,
            "min_af": 0.005,
        }
        args = SimpleNamespace(**options)
        validation, samples = run_dna_umi_panel_variants.validate_inputs(args)
        plan = run_dna_umi_panel_variants.build_plan(args, samples)

        def plan_mentions(fragment: str) -> bool:
            # Lazy scan, so steps after the first match are never inspected.
            return any(fragment in step["command"] for step in plan)

        self.assertTrue(validation["ok"])
        self.assertEqual(samples[0]["consensus_state"], "needs_generation")
        self.assertTrue(plan_mentions("CallMolecularConsensusReads"))
        self.assertTrue(plan_mentions("bcftools mpileup"))
"sample\traw_bam\nS1\traw.bam\n") + args = SimpleNamespace( + sample_sheet=sheet, + reference_fasta=reference, + target_bed=None, + hotspot_vcf=None, + umi_mode="duplex", + umi_tag="RX", + grouping_strategy="adjacency", + umi_edits=1, + min_reads_per_molecule=2, + min_af=0.005, + ) + with mock.patch.object( + run_dna_umi_panel_variants, + "inspect_alignment_tags", + return_value={ + "inspectable": True, + "reason": "", + "records_inspected": 20, + "tags": {"RX": False, "MQ": False}, + "all_present": False, + }, + ): + validation, samples = run_dna_umi_panel_variants.validate_inputs(args) + plan = run_dna_umi_panel_variants.build_plan(args, samples) + self.assertTrue(validation["ok"], validation) + self.assertEqual(samples[0]["fgbio_readiness"], "review_contract_only") + self.assertEqual(samples[0]["consensus_state"], "review_contract_only") + self.assertEqual(plan, []) + self.assertTrue( + any("review-contract input" in warning for warning in validation["warnings"]) + ) + + def test_umi_postrun_summary_parses_execution_artifacts(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + write(root / "consensus" / "S1.consensus.bam", "bam") + write( + root / "qc" / "S1.consensus.flagstat.txt", + "120 + 0 in total (QC-passed reads + QC-failed reads)\n100 + 0 mapped (83.33% : N/A)\n", + ) + write( + root / "qc" / "S1.target_coverage.tsv", + "#rname\tstartpos\tendpos\tnumreads\tcovbases\tcoverage\tmeandepth\tmeanbaseq\tmeanmapq\nchr1\t1\t100\t100\t95\t95\t42.5\t30\t60\n", + ) + write( + root / "variants" / "S1.bcftools_stats.txt", + "SN\t0\tnumber of records:\t3\nSN\t0\tnumber of SNPs:\t2\nSN\t0\tnumber of indels:\t1\n", + ) + write(root / "variants" / "S1.consensus.vcf.gz", "vcf") + write( + root / "qc" / "S1.family_size.tsv", + "family_size\tcount\tfamily_type\n2\t2\tsimplex\n6\t1\tduplex\n", + ) + rows = run_dna_umi_panel_variants.summarize_postrun_artifacts( + root, + [ + { + "sample": "S1", + "consensus_alignment": 
"consensus/S1.consensus.bam", + "consensus_state": "provided", + } + ], + ) + evidence = run_dna_umi_panel_variants.write_molecular_evidence_contract( + root, + {"umi_mode": "duplex", "min_af": 0.005, "hotspot_vcf": None}, + [ + { + "sample": "S1", + "consensus_alignment": "consensus/S1.consensus.bam", + "consensus_state": "provided", + } + ], + SimpleNamespace(umi_mode="duplex", min_reads_per_molecule=2), + ) + self.assertEqual(rows[0]["status"], "created") + self.assertEqual(rows[0]["total_consensus_reads"], 120) + self.assertEqual(rows[0]["mapped_consensus_reads"], 100) + self.assertEqual(rows[0]["variant_records"], 3) + self.assertEqual(rows[0]["median_family_size"], 2.0) + self.assertEqual(evidence[0]["low_af_review_status"], "ready_for_review") + self.assertTrue((root / "qc" / "umi_postrun_summary.tsv").exists()) + self.assertTrue((root / "qc" / "umi_molecular_evidence_contract.tsv").exists()) + + +class BackendPlannerTests(unittest.TestCase): + def test_nfcore_command_captures_report_trace_timeline(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + sheet = write(root / "samples.csv", "sample,fastq_1\nS1,a.fastq.gz\n") + args = SimpleNamespace( + pipeline="rnaseq", + sample_sheet=sheet, + params_file=None, + profile="docker", + revision="3.18.0", + genome=None, + fasta=None, + gtf=None, + extra_param=[], + nextflow_arg=[], + ) + params_path = root / "params.json" + command = run_nfcore_pipeline.build_command(args, root, params_path) + self.assertIn("nf-core/rnaseq", command) + self.assertIn("-with-report", command) + self.assertIn("-with-trace", command) + + def test_nfcore_scrnaseq_adapter_uses_scrnaseq_resource_contract(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + sheet = write( + root / "samples.csv", "sample,fastq_1,fastq_2\nPBMC1,R1.fastq.gz,R2.fastq.gz\n" + ) + reduced_ref = root / "reduced_ref" + write(reduced_ref / "genome.fa", ">chr1\nACGT\n") + write(reduced_ref / "genome.fa.fai", 
"chr1\t4\t6\t4\t5\n") + write( + reduced_ref / "annotation.gtf", + 'chr1\treduced\tgene\t1\t4\t.\t+\t.\tgene_id "g1";\n', + ) + run_dir = root / "run" + run_dir.mkdir() + args = SimpleNamespace( + pipeline="scrnaseq", + sample_sheet=sheet, + params_file=None, + profile="docker", + revision="4.0.0", + genome="reduced_local", + genome_build=None, + fasta=None, + gtf=None, + extra_param=["aligner=star"], + nextflow_arg=[], + bundle_root=[f"reduced_micro_genome={reduced_ref}"], + include_optional_resources=False, + resource_checksums=False, + skip_resource_plan=False, + ) + input_validation = run_nfcore_pipeline.validate_inputs(args) + resource_plan = run_nfcore_pipeline.write_resource_plan(args, run_dir) + validation = run_nfcore_pipeline.merge_resource_status(input_validation, resource_plan) + self.assertTrue(validation["ok"], validation) + self.assertEqual(resource_plan["pipeline"], "scrnaseq_fastq_to_count") + self.assertEqual( + resource_plan["outputs"]["resource_plan"], "resources/resource_plan.json" + ) + self.assertTrue((run_dir / "resources" / "resource_manifest.tsv").exists()) + + def test_nfcore_missing_resource_blocks_adapter_validation(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + sheet = write(root / "samples.csv", "sample,fastq_1\nS1,a.fastq.gz\n") + run_dir = root / "run" + run_dir.mkdir() + args = SimpleNamespace( + pipeline="rnaseq", + sample_sheet=sheet, + params_file=None, + profile="docker", + revision=None, + genome="not_a_registered_bundle", + genome_build=None, + fasta=None, + gtf=None, + extra_param=[], + nextflow_arg=[], + bundle_root=[], + include_optional_resources=False, + resource_checksums=False, + skip_resource_plan=False, + ) + input_validation = run_nfcore_pipeline.validate_inputs(args) + resource_plan = run_nfcore_pipeline.write_resource_plan(args, run_dir) + validation = run_nfcore_pipeline.merge_resource_status(input_validation, resource_plan) + self.assertTrue(input_validation["ok"], 
def test_atac_plan_contains_peak_frip_and_track_steps(self) -> None:
    """A BAM-only ATAC-seq sheet should plan peak calling, FRiP counting and tracks.

    Validation must pass. The plan must contain commands mentioning
    ``macs2 callpeak``, ``frip_reads`` and ``bamCoverage``, checked in that order.
    """
    with tempfile.TemporaryDirectory() as scratch:
        workdir = Path(scratch)
        write(workdir / "sample.bam", "bam")
        sheet = write(workdir / "atac.tsv", "sample\tbam\nA1\tsample.bam\n")
        options = {
            "sample_sheet": sheet,
            "bam_only": True,
            "bowtie2_index": None,
            "genome_size": "hs",
            "blacklist_bed": None,
            "tss_bed": None,
            "min_mapq": 30,
            "threads": 2,
        }
        args = SimpleNamespace(**options)
        validation, samples = run_atacseq_peaks_qc.validate_inputs(args)
        plan = run_atacseq_peaks_qc.build_plan(args, samples)
        self.assertTrue(validation["ok"])
        for fragment in ("macs2 callpeak", "frip_reads", "bamCoverage"):
            self.assertTrue(any(fragment in step["command"] for step in plan))
"bam"}], + peak_mode="narrow", + output_prefix="atacseq_qc", + title="ATAC-seq", + ) + self.assertEqual(summary["status"], "created") + self.assertEqual(summary["samples"][0]["frip"], 0.25) + self.assertEqual(summary["samples"][0]["raw_peak_count"], 2) + self.assertTrue((root / "tracks" / "browser_tracks.tsv").exists()) + self.assertTrue((root / "tracks" / "igv_session.xml").exists()) + self.assertTrue((root / "tracks" / "browser_track_preview.html").exists()) + self.assertTrue((root / "qc" / "atacseq_qc_dashboard.html").exists()) + self.assertTrue((root / "qc" / "atacseq_qc_frip_peak_overview.svg").exists()) + self.assertTrue((root / "qc" / "atacseq_qc_insert_size_distribution.svg").exists()) + self.assertEqual(summary["outputs"]["dashboard"], "qc/atacseq_qc_dashboard.html") + self.assertIn( + "RUNX", (root / "motifs" / "motif_summary.tsv").read_text(encoding="utf-8") + ) + + def test_chip_plan_contains_optional_motif_step(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + write(root / "chip.bam", "bam") + sheet = write(root / "chip.tsv", "sample\tbam\ttarget\nC1\tchip.bam\tCTCF\n") + args = SimpleNamespace( + sample_sheet=sheet, + assay="chipseq", + target_class="tf", + peak_mode="narrow", + bowtie2_index=None, + bam_only=True, + genome_size="hs", + blacklist_bed=None, + min_mapq=30, + threads=2, + run_motifs=True, + motif_genome="hg38", + motif_size="given", + ) + validation, samples = run_chip_cutrun_peaks_qc.validate_inputs(args) + plan = run_chip_cutrun_peaks_qc.build_plan(args, samples) + self.assertTrue(validation["ok"], validation) + self.assertTrue(any("findMotifsGenome.pl" in item["command"] for item in plan)) + + def test_chip_plan_resolves_control_sample_rows(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + write(root / "IP_R1.fastq.gz", "fastq") + write(root / "IP_R2.fastq.gz", "fastq") + write(root / "INPUT_R1.fastq.gz", "fastq") + write(root / "INPUT_R2.fastq.gz", "fastq") + sheet = 
write( + root / "chip.tsv", + "sample\tfastq_1\tfastq_2\ttarget\tcondition\tcontrol\n" + "IP_1\tIP_R1.fastq.gz\tIP_R2.fastq.gz\tSPT5\tT0\tINPUT_1\n" + "INPUT_1\tINPUT_R1.fastq.gz\tINPUT_R2.fastq.gz\tinput\tINPUT\t\n", + ) + args = SimpleNamespace( + sample_sheet=sheet, + assay="chipseq", + target_class="chromatin_regulator", + peak_mode="broad", + bowtie2_index=None, + bam_only=False, + genome_size="12100000", + blacklist_bed=None, + min_mapq=30, + threads=2, + run_motifs=False, + motif_genome=None, + motif_size="given", + ) + validation, samples = run_chip_cutrun_peaks_qc.validate_inputs(args) + plan = run_chip_cutrun_peaks_qc.build_plan(args, samples) + self.assertTrue(validation["ok"], validation) + self.assertFalse( + any("needs input/IgG control" in warning for warning in validation["warnings"]) + ) + ip_peak = next(item for item in plan if item["name"] == "IP_1: MACS2 peaks") + self.assertIn("-c alignment/INPUT_1.filtered.bam", ip_peak["command"]) + self.assertFalse(any(item["name"] == "INPUT_1: MACS2 peaks" for item in plan)) + + def test_chip_plan_preprocesses_control_before_ip_peak_calling(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + write(root / "IP_R1.fastq.gz", "fastq") + write(root / "IP_R2.fastq.gz", "fastq") + write(root / "INPUT_R1.fastq.gz", "fastq") + write(root / "INPUT_R2.fastq.gz", "fastq") + sheet = write( + root / "chip.tsv", + "sample\tfastq_1\tfastq_2\ttarget\tcondition\tcontrol\n" + "IP_1\tIP_R1.fastq.gz\tIP_R2.fastq.gz\tSPT5\tT0\tINPUT_1\n" + "INPUT_1\tINPUT_R1.fastq.gz\tINPUT_R2.fastq.gz\tinput\tINPUT\t\n", + ) + args = SimpleNamespace( + sample_sheet=sheet, + assay="chipseq", + target_class="chromatin_regulator", + peak_mode="broad", + bowtie2_index=None, + bam_only=False, + genome_size="12100000", + blacklist_bed=None, + min_mapq=30, + threads=2, + run_motifs=False, + motif_genome=None, + motif_size="given", + ) + _, samples = run_chip_cutrun_peaks_qc.validate_inputs(args) + plan = 
run_chip_cutrun_peaks_qc.build_plan(args, samples) + step_names = [item["name"] for item in plan] + self.assertLess( + step_names.index("INPUT_1: filter alignment"), + step_names.index("IP_1: MACS2 peaks"), + ) + self.assertLess( + step_names.index("INPUT_1: index filtered BAM"), + step_names.index("IP_1: MACS2 peaks"), + ) + + def test_chip_plan_consensus_glob_matches_peak_mode(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + write(root / "chip.bam", "bam") + sheet = write(root / "chip.tsv", "sample\tbam\ttarget\nC1\tchip.bam\tSPT5\n") + broad_args = SimpleNamespace( + sample_sheet=sheet, + assay="chipseq", + target_class="chromatin_regulator", + peak_mode="broad", + bowtie2_index=None, + bam_only=True, + genome_size="12100000", + blacklist_bed=None, + min_mapq=30, + threads=2, + run_motifs=False, + motif_genome=None, + motif_size="given", + ) + narrow_args = SimpleNamespace(**{**broad_args.__dict__, "peak_mode": "narrow"}) + _, broad_samples = run_chip_cutrun_peaks_qc.validate_inputs(broad_args) + _, narrow_samples = run_chip_cutrun_peaks_qc.validate_inputs(narrow_args) + broad_plan = run_chip_cutrun_peaks_qc.build_plan(broad_args, broad_samples) + narrow_plan = run_chip_cutrun_peaks_qc.build_plan(narrow_args, narrow_samples) + broad_consensus = next( + item for item in broad_plan if item["name"] == "consensus peak merge" + ) + narrow_consensus = next( + item for item in narrow_plan if item["name"] == "consensus peak merge" + ) + self.assertIn("cat peaks/*_peaks.broadPeak", broad_consensus["command"]) + self.assertIn("cat peaks/*_peaks.narrowPeak", narrow_consensus["command"]) + + def test_amplicon_and_shotgun_backend_plans_are_database_aware(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + write(root / "S1_R1.fastq.gz", "fastq") + write(root / "S1_R2.fastq.gz", "fastq") + classifier = write(root / "classifier.qza", "classifier") + amplicon_sheet = write( + root / "amplicon.tsv", 
"sample\tr1\tr2\nS1\tS1_R1.fastq.gz\tS1_R2.fastq.gz\n" + ) + amp_args = SimpleNamespace( + sample_sheet=amplicon_sheet, + backend="qiime2", + marker="16S", + primer_forward="AAA", + primer_reverse="TTT", + taxonomy_classifier=classifier, + metadata=None, + trunc_len_f=None, + trunc_len_r=None, + sampling_depth=1000, + profile=None, + execute=False, + ) + amp_validation, amp_samples = run_amplicon_microbiome.validate_inputs(amp_args) + amp_plan = run_amplicon_microbiome.build_plan(amp_args, amp_samples) + self.assertTrue(amp_validation["ok"]) + self.assertTrue(any("feature-classifier" in item["command"] for item in amp_plan)) + + kraken_db = root / "kraken_db" + kraken_db.mkdir() + shotgun_sheet = write( + root / "shotgun.tsv", "sample\tr1\tr2\nS1\tS1_R1.fastq.gz\tS1_R2.fastq.gz\n" + ) + shotgun_args = SimpleNamespace( + sample_sheet=shotgun_sheet, + kraken_db=kraken_db, + bracken_db=None, + run_bracken=True, + bracken_level="S", + read_length=150, + run_humann=False, + humann_db=None, + host_reference=None, + metadata=None, + threads=2, + ) + shot_validation, shot_samples = run_shotgun_metagenomics.validate_inputs(shotgun_args) + shot_plan = run_shotgun_metagenomics.build_plan(shotgun_args, shot_samples) + self.assertTrue(shot_validation["ok"]) + self.assertTrue(any("kraken2" in item["command"] for item in shot_plan)) + self.assertTrue(any("bracken" in item["command"] for item in shot_plan)) + + def test_amplicon_dada2_plan_uses_real_backend_script(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + write(root / "S1_R1.fastq.gz", "fastq") + classifier = write(root / "silva_train_set.fa.gz", ">ref\nACGT\n") + amplicon_sheet = write( + root / "amplicon.tsv", "sampleID\tforwardReads\nS1\tS1_R1.fastq.gz\n" + ) + args = SimpleNamespace( + sample_sheet=amplicon_sheet, + backend="dada2", + marker="16S", + primer_forward="AAA", + primer_reverse="TTT", + taxonomy_classifier=classifier, + metadata=None, + trunc_len_f=120, + trunc_len_r=None, + 
sampling_depth=1000, + profile=None, + threads=3, + execute=False, + ) + validation, samples = run_amplicon_microbiome.validate_inputs(args) + plan = run_amplicon_microbiome.build_plan(args, samples) + self.assertTrue(validation["ok"], validation) + self.assertTrue(run_amplicon_microbiome.DADA2_BACKEND_SCRIPT.exists()) + self.assertIn("run_dada2_backend.R", plan[0]["command"]) + self.assertIn("--threads 3", plan[0]["command"]) + self.assertIn("--trunc-len-f 120", plan[0]["command"]) + self.assertIn("tables/representative_sequences.fasta", plan[0]["outputs"]) + + def test_amplicon_r_package_preflight_marks_missing_packages_blocking(self) -> None: + base = { + "ok": True, + "required": ["Rscript"], + "optional": [], + "checked": [], + "missing_required": [], + "runtime_missing": [], + } + merged = run_amplicon_microbiome.merge_tool_status( + base, + { + "ok": False, + "missing": ["dada2"], + "checked": [{"package": "dada2", "present": False}], + }, + ) + self.assertFalse(merged["ok"]) + self.assertIn("R package:dada2", merged["runtime_missing"]) + + def test_amplicon_summary_surfaces_runtime_blockers(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + run_amplicon_microbiome.write_summary( + root, + "blocked", + {"backend": "dada2", "sample_count": 1, "warnings": [], "errors": []}, + resource_plan=None, + tool_status={ + "ok": False, + "missing_required": [], + "runtime_missing": ["R package:dada2"], + }, + ) + summary = (root / "summary.md").read_text(encoding="utf-8") + self.assertIn("Runtime Blockers", summary) + self.assertIn("R package:dada2", summary) + + def test_shotgun_merges_bracken_and_humann_backend_outputs(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + write( + root / "taxonomic_classification" / "S1.bracken.tsv", + "name\ttaxonomy_id\ttaxonomy_lvl\tkraken_assigned_reads\tadded_reads\tnew_est_reads\tfraction_total_reads\nEscherichia coli\t562\tS\t10\t5\t15\t0.75\nBacteroides 
fragilis\t817\tS\t2\t3\t5\t0.25\n", + ) + write( + root / "functional_profile" / "S1" / "S1_pathabundance.tsv", + "# Pathway\tS1_Abundance\nPWY-1\t12.5\nPWY-2\t1.5\n", + ) + write( + root / "functional_profile" / "S1" / "S1_genefamilies.tsv", + "# Gene Family\tS1_Abundance\nUniRef90_A\t3\nUniRef90_B\t1\n", + ) + summary = run_shotgun_metagenomics.summarize_backend_outputs(root, [{"sample": "S1"}]) + review = run_shotgun_metagenomics.write_shotgun_review_outputs(root) + self.assertEqual(summary["bracken"]["status"], "created") + self.assertEqual(summary["humann"]["status"], "created") + self.assertEqual(review["status"], "created") + self.assertTrue((root / "tables" / "bracken_relative_abundance_matrix.tsv").exists()) + self.assertTrue((root / "tables" / "top_bracken_taxa.tsv").exists()) + self.assertTrue((root / "visualizations" / "shotgun_backend_dashboard.html").exists()) + self.assertTrue((root / "visualizations" / "shotgun_top_taxa.svg").exists()) + self.assertTrue((root / "visualizations" / "shotgun_top_pathways.svg").exists()) + self.assertIn( + "Escherichia coli", + (root / "tables" / "bracken_est_reads_matrix.tsv").read_text(encoding="utf-8"), + ) + self.assertIn( + "PWY-1", + (root / "tables" / "humann_pathabundance_matrix.tsv").read_text(encoding="utf-8"), + ) + + def test_shotgun_host_depletion_routes_classification_over_clean_reads(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + write(root / "S1_R1.fastq.gz", "fastq") + write(root / "S1_R2.fastq.gz", "fastq") + kraken_db = root / "kraken_db" + kraken_db.mkdir() + host_reference = root / "host_reference" + host_reference.mkdir() + sample_sheet = write( + root / "shotgun.tsv", "sample\tr1\tr2\nS1\tS1_R1.fastq.gz\tS1_R2.fastq.gz\n" + ) + args = SimpleNamespace( + sample_sheet=sample_sheet, + kraken_db=kraken_db, + bracken_db=None, + run_bracken=False, + bracken_level="S", + read_length=150, + run_humann=True, + humann_db=root / "humann_db", + host_reference=host_reference, 
+ metadata=None, + threads=2, + ) + args.humann_db.mkdir() + validation, samples = run_shotgun_metagenomics.validate_inputs(args) + plan = run_shotgun_metagenomics.build_plan(args, samples) + self.assertTrue(validation["ok"], validation) + self.assertIn("KneadData host depletion", plan[0]["name"]) + self.assertIn("kneaddata", plan[0]["command"]) + kraken_command = next( + item["command"] for item in plan if "kraken2 classify" in item["name"] + ) + humann_concat = next( + item["command"] + for item in plan + if "concatenate paired reads for HUMAnN" in item["name"] + ) + self.assertIn("host_depletion/S1.clean_R1.fastq", kraken_command) + self.assertIn("host_depletion/S1.clean_R2.fastq", kraken_command) + self.assertIn("host_depletion/S1.clean_R1.fastq", humann_concat) + self.assertIn("host_depletion/S1.clean_R2.fastq", humann_concat) + + def test_shotgun_resource_plan_promotes_requested_bracken_database_to_blocking(self) -> None: + with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + kraken_db = root / "kraken" + kraken_db.mkdir() + write(kraken_db / "hash.k2d", "hash") + run_dir = root / "run" + run_dir.mkdir() + args = SimpleNamespace( + kraken_db=kraken_db, + bracken_db=None, + run_bracken=True, + run_humann=False, + humann_db=None, + include_optional_resources=False, + resource_checksums=False, + skip_resource_plan=False, + ) + resource_plan = run_shotgun_metagenomics.write_resource_plan(args, run_dir) + validation = run_shotgun_metagenomics.merge_resource_status( + {"ok": True, "errors": [], "warnings": []}, resource_plan + ) + self.assertFalse(resource_plan["ok"]) + self.assertFalse(validation["ok"]) + self.assertIn( + "kraken2_standard", [item["bundle"] for item in resource_plan["missing_required"]] + ) + self.assertIn( + "bracken_standard", [item["bundle"] for item in resource_plan["missing_required"]] + ) + self.assertTrue((run_dir / "resources" / "resource_manifest.tsv").exists()) + + def test_amplicon_normalizes_qiime2_exports(self) -> None: + 
with tempfile.TemporaryDirectory() as tmp: + root = Path(tmp) + write( + root / "tables" / "asv_table_export" / "feature-table.tsv", + "# Constructed from biom file\n#OTU ID\tS1\tS2\nASV1\t10\t0\nASV2\t2\t8\n", + ) + write( + root / "tables" / "taxonomy_export" / "taxonomy.tsv", + "Feature ID\tTaxon\tConfidence\nASV1\tk__Bacteria;g__Escherichia\t0.99\n", + ) + write( + root / "tables" / "denoising_stats_export" / "stats.tsv", + "sample-id\tinput\tfiltered\tdenoised\nS1\t100\t90\t80\n", + ) + summary = run_amplicon_microbiome.normalize_backend_exports(root) + review = run_amplicon_microbiome.write_amplicon_review_outputs(root) + self.assertEqual(summary["status"], "created") + self.assertEqual(review["status"], "created") + self.assertTrue((root / "tables" / "asv_table.tsv").exists()) + self.assertTrue((root / "tables" / "taxonomy.tsv").exists()) + self.assertTrue((root / "tables" / "read_retention.tsv").exists()) + self.assertTrue((root / "tables" / "alpha_diversity.tsv").exists()) + self.assertTrue((root / "tables" / "bray_curtis_distance.tsv").exists()) + self.assertTrue((root / "visualizations" / "amplicon_backend_dashboard.html").exists()) + self.assertTrue((root / "visualizations" / "amplicon_alpha_diversity.svg").exists()) + self.assertIn( + "feature_id\tS1\tS2", + (root / "tables" / "asv_table.tsv").read_text(encoding="utf-8"), + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/plugins/ngs-analysis/tests/test_ngs_preflight.py b/plugins/ngs-analysis/tests/test_ngs_preflight.py new file mode 100644 index 0000000..e7fbcda --- /dev/null +++ b/plugins/ngs-analysis/tests/test_ngs_preflight.py @@ -0,0 +1,77 @@ +import json +import sys +import tempfile +import unittest +from pathlib import Path +from types import SimpleNamespace + +SCRIPT_DIR = Path(__file__).resolve().parents[1] / "scripts" +sys.path.insert(0, str(SCRIPT_DIR)) + +import ngs_preflight # noqa: E402 + + +class InstallPlanArtifactTests(unittest.TestCase): + def 
test_bioconda_install_command_is_noninteractive(self) -> None: + cmd = ngs_preflight.install_command( + "fastqc", + {"install": {"conda": "bioconda::fastqc"}}, + "micromamba", + ) + self.assertEqual( + cmd, ["micromamba", "install", "-y", "-c", "conda-forge", "-c", "bioconda", "fastqc"] + ) + + def test_install_plan_writes_json_and_guarded_shell_script(self) -> None: + args = SimpleNamespace( + tool=None, + pipeline="shotgun_metagenomics", + profile=None, + manager="micromamba", + network_checks=False, + ) + registry = { + "tools": { + "kraken2": { + "executables": ["kraken2"], + "install": {"conda": "bioconda::kraken2"}, + "notes": "Database setup is separate.", + "license": "public_or_open", + } + } + } + entries = ngs_preflight.install_plan_entries(["kraken2"], registry, "micromamba") + plan = ngs_preflight.build_install_artifact( + args=args, + statuses=[ + { + "tool": "kraken2", + "executables": [{"name": "kraken2", "present": False, "path": None}], + } + ], + missing=["kraken2"], + runtime_missing=[], + blocking_missing=["kraken2"], + plan_entries=entries, + ) + with tempfile.TemporaryDirectory() as tmp: + outputs = ngs_preflight.write_install_artifacts(plan, Path(tmp)) + plan_path = Path(outputs["install_plan_json"]) + commands_path = Path(outputs["install_commands_sh"]) + self.assertTrue(plan_path.exists()) + self.assertTrue(commands_path.exists()) + loaded = json.loads(plan_path.read_text(encoding="utf-8")) + self.assertEqual( + loaded["permission_model"]["install_script_default_mode"], "review_only" + ) + self.assertEqual(loaded["install_plan"][0]["tool"], "kraken2") + script = commands_path.read_text(encoding="utf-8") + self.assertIn("NGS_RUN_INSTALL_COMMANDS=1", script) + self.assertIn("Review-only mode", script) + self.assertIn( + "run_cmd micromamba install -y -c conda-forge -c bioconda kraken2", script + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/plugins/ngs-analysis/tests/test_scrnaseq_post_count_qc_runner.py 
b/plugins/ngs-analysis/tests/test_scrnaseq_post_count_qc_runner.py new file mode 100644 index 0000000..3c47674 --- /dev/null +++ b/plugins/ngs-analysis/tests/test_scrnaseq_post_count_qc_runner.py @@ -0,0 +1,68 @@ +import sys +import unittest +from pathlib import Path +from unittest import mock + +SCRIPT_DIR = Path(__file__).resolve().parents[1] / "scripts" +sys.path.insert(0, str(SCRIPT_DIR)) + +try: + import run_scrnaseq_post_count_qc as runner # type: ignore +except ImportError: # pragma: no cover + runner = None + +analysis_stack_ready = False +try: + import anndata as ad # type: ignore + import scipy.sparse as sp # type: ignore + + if runner is not None: + analysis_stack_ready = bool(runner.load_analysis_modules().get("ok")) +except ImportError: # pragma: no cover + ad = None + sp = None + + +@unittest.skipIf(runner is None, "runner unavailable") +class ScrnaPythonDependencyTests(unittest.TestCase): + def test_python_dependency_status_reports_missing_module_without_importing_stack(self) -> None: + def fake_find_spec(name: str) -> object | None: + return None if name == "scanpy" else object() + + with mock.patch.object(runner.importlib.util, "find_spec", side_effect=fake_find_spec): + status = runner.python_dependency_status() + self.assertFalse(status["ok"]) + self.assertEqual(status["missing"], ["scanpy"]) + + def test_r_dependency_status_fails_when_required_package_is_missing(self) -> None: + probe = {"ok": True, "stdout_tail": "DropletUtils=TRUE;scDblFinder=FALSE;SoupX=TRUE"} + with ( + mock.patch.object(runner, "command_path", return_value="/usr/bin/Rscript"), + mock.patch.object(runner, "run_cmd", return_value=probe), + ): + status = runner.r_dependency_status(None) + self.assertFalse(status["ok"]) + self.assertEqual(status["missing"], ["scDblFinder"]) + + def test_combined_tool_preflight_status_preserves_failed_r_status(self) -> None: + status = runner.combined_tool_preflight_status( + {"ok": True, "python_modules": {}}, + {"ok": False, "missing": 
["scDblFinder"]}, + ) + self.assertFalse(status["ok"]) + self.assertEqual(status["r_dependencies"]["missing"], ["scDblFinder"]) + + +@unittest.skipIf( + runner is None or ad is None or sp is None or not analysis_stack_ready, + "scanpy/anndata stack unavailable", +) +class ScrnaPostCountRunnerTests(unittest.TestCase): + def test_scdbfinder_readiness_flags_tiny_sparse_matrix(self) -> None: + adata = ad.AnnData(X=sp.csr_matrix([[1], [0]])) + adata.obs["total_counts"] = [1, 0] + readiness = runner.scdbfinder_readiness(adata) + self.assertFalse(readiness["ok"]) + self.assertEqual(readiness["reason"], "too_few_informative_cells") + self.assertEqual(readiness["informative_cells"], 1) + self.assertEqual(readiness["nonzero_entries"], 1) diff --git a/plugins/ngs-analysis/workflows/amplicon_microbiome/run_dada2_backend.R b/plugins/ngs-analysis/workflows/amplicon_microbiome/run_dada2_backend.R new file mode 100644 index 0000000..6e0c031 --- /dev/null +++ b/plugins/ngs-analysis/workflows/amplicon_microbiome/run_dada2_backend.R @@ -0,0 +1,223 @@ +#!/usr/bin/env Rscript + +parse_args <- function(argv) { + args <- list( + outdir = ".", + threads = 1, + trunc_len_f = 0, + trunc_len_r = 0, + taxonomy_classifier = "" + ) + i <- 1 + while (i <= length(argv)) { + key <- argv[[i]] + if (!startsWith(key, "--")) { + stop(sprintf("unexpected argument: %s", key)) + } + name <- gsub("-", "_", substring(key, 3), fixed = TRUE) + if (i == length(argv)) { + stop(sprintf("missing value for %s", key)) + } + args[[name]] <- argv[[i + 1]] + i <- i + 2 + } + required <- c("sample_sheet", "primer_forward", "primer_reverse") + missing <- required[!nzchar(vapply(required, function(name) args[[name]] %||% "", character(1)))] + if (length(missing)) { + stop(sprintf("missing required argument(s): %s", paste(missing, collapse = ", "))) + } + args$threads <- as.integer(args$threads) + args$trunc_len_f <- as.integer(args$trunc_len_f) + args$trunc_len_r <- as.integer(args$trunc_len_r) + args +} + +`%||%` <- 
function(left, right) { + if (is.null(left)) { + right + } else { + left + } +} + +detect_sep <- function(path) { + if (grepl("\\.(tsv|tab)$", path, ignore.case = TRUE)) "\t" else "," +} + +resolve_path <- function(raw, base_dir) { + if (is.na(raw) || !nzchar(raw)) { + return("") + } + raw <- path.expand(raw) + if (startsWith(raw, "/")) { + return(normalizePath(raw, mustWork = FALSE)) + } + normalizePath(file.path(base_dir, raw), mustWork = FALSE) +} + +count_fastq <- function(path) { + con <- if (grepl("\\.gz$", path, ignore.case = TRUE)) gzfile(path, "rt") else file(path, "rt") + on.exit(close(con), add = TRUE) + lines <- 0 + repeat { + chunk <- readLines(con, n = 400000, warn = FALSE) + if (!length(chunk)) { + break + } + lines <- lines + length(chunk) + } + as.integer(lines / 4) +} + +as_sample_sheet <- function(path) { + table <- read.table(path, sep = detect_sep(path), header = TRUE, stringsAsFactors = FALSE, check.names = FALSE, quote = "", comment.char = "") + names(table) <- trimws(names(table)) + lower_names <- tolower(names(table)) + sample_col <- match(TRUE, lower_names %in% tolower(c("sample", "sample_id", "sampleID"))) + r1_col <- match(TRUE, lower_names %in% tolower(c("r1", "fastq_1", "forwardReads", "read1"))) + r2_col <- match(TRUE, lower_names %in% tolower(c("r2", "fastq_2", "reverseReads", "read2"))) + if (is.na(sample_col) || is.na(r1_col)) { + stop("sample sheet must contain sample/sample_id/sampleID and r1/fastq_1/forwardReads columns") + } + base_dir <- dirname(normalizePath(path, mustWork = TRUE)) + data.frame( + sample = make.names(table[[sample_col]], unique = TRUE), + r1 = vapply(table[[r1_col]], resolve_path, character(1), base_dir = base_dir), + r2 = if (is.na(r2_col)) "" else vapply(table[[r2_col]], resolve_path, character(1), base_dir = base_dir), + stringsAsFactors = FALSE + ) +} + +write_table <- function(path, table) { + dir.create(dirname(path), recursive = TRUE, showWarnings = FALSE) + write.table(table, path, sep = "\t", quote = 
FALSE, row.names = FALSE, na = "") +} + +write_fasta <- function(path, ids, sequences) { + dir.create(dirname(path), recursive = TRUE, showWarnings = FALSE) + con <- file(path, "wt") + on.exit(close(con), add = TRUE) + for (i in seq_along(ids)) { + writeLines(sprintf(">%s", ids[[i]]), con) + writeLines(gsub("(.{1,80})", "\\1\n", sequences[[i]], perl = TRUE), con) + } +} + +taxonomy_to_table <- function(taxa, ids) { + ranks <- as.data.frame(taxa, stringsAsFactors = FALSE) + ranks[is.na(ranks)] <- "" + ranks$taxonomy <- apply(ranks, 1, function(row) paste(row[nzchar(row)], collapse = ";")) + data.frame(feature_id = ids, ranks, stringsAsFactors = FALSE, check.names = FALSE) +} + +if (!requireNamespace("dada2", quietly = TRUE)) { + stop("R package 'dada2' is required. Install with mamba install -c conda-forge -c bioconda bioconductor-dada2.") +} + +args <- parse_args(commandArgs(trailingOnly = TRUE)) +outdir <- normalizePath(args$outdir, mustWork = FALSE) +samples <- as_sample_sheet(args$sample_sheet) +paired <- any(nzchar(samples$r2)) + +if (paired && any(!nzchar(samples$r2))) { + stop("paired DADA2 run requires r2/fastq_2 for every sample") +} + +dir.create(file.path(outdir, "dada2", "filtered"), recursive = TRUE, showWarnings = FALSE) +dir.create(file.path(outdir, "tables"), recursive = TRUE, showWarnings = FALSE) +dir.create(file.path(outdir, "logs"), recursive = TRUE, showWarnings = FALSE) + +fnFs <- samples$r1 +filtFs <- file.path(outdir, "dada2", "filtered", paste0(samples$sample, "_F_filt.fastq.gz")) +names(fnFs) <- samples$sample +names(filtFs) <- samples$sample + +if (paired) { + fnRs <- samples$r2 + filtRs <- file.path(outdir, "dada2", "filtered", paste0(samples$sample, "_R_filt.fastq.gz")) + names(fnRs) <- samples$sample + names(filtRs) <- samples$sample + trunc_len <- c(args$trunc_len_f, args$trunc_len_r) + filtered <- dada2::filterAndTrim( + fnFs, + filtFs, + fnRs, + filtRs, + truncLen = trunc_len, + maxN = 0, + maxEE = c(2, 2), + truncQ = 2, + rm.phix = 
TRUE, + compress = TRUE, + multithread = args$threads + ) + errF <- dada2::learnErrors(filtFs, multithread = args$threads) + errR <- dada2::learnErrors(filtRs, multithread = args$threads) + dadaFs <- dada2::dada(filtFs, err = errF, multithread = args$threads) + dadaRs <- dada2::dada(filtRs, err = errR, multithread = args$threads) + mergers <- dada2::mergePairs(dadaFs, filtFs, dadaRs, filtRs) + seqtab <- dada2::makeSequenceTable(mergers) + denoised <- vapply(dadaFs, dada2::getN, integer(1)) + merged <- vapply(mergers, dada2::getN, integer(1)) +} else { + filtered <- dada2::filterAndTrim( + fnFs, + filtFs, + truncLen = args$trunc_len_f, + maxN = 0, + maxEE = 2, + truncQ = 2, + rm.phix = TRUE, + compress = TRUE, + multithread = args$threads + ) + errF <- dada2::learnErrors(filtFs, multithread = args$threads) + dadaFs <- dada2::dada(filtFs, err = errF, multithread = args$threads) + seqtab <- dada2::makeSequenceTable(dadaFs) + denoised <- vapply(dadaFs, dada2::getN, integer(1)) + merged <- denoised +} + +seqtab_nochim <- dada2::removeBimeraDenovo(seqtab, method = "consensus", multithread = args$threads) +sequences <- colnames(seqtab_nochim) +asv_ids <- paste0("ASV", seq_along(sequences)) +colnames(seqtab_nochim) <- asv_ids + +asv_table <- data.frame(feature_id = asv_ids, t(seqtab_nochim), check.names = FALSE) +write_table(file.path(outdir, "tables", "asv_table.tsv"), asv_table) +write_fasta(file.path(outdir, "tables", "representative_sequences.fasta"), asv_ids, sequences) + +filtered_out <- if (is.null(dim(filtered))) filtered else filtered[, "reads.out"] +retention <- data.frame( + sample = samples$sample, + input = vapply(fnFs, count_fastq, integer(1)), + filtered = as.integer(filtered_out), + denoised = as.integer(denoised[samples$sample]), + merged = as.integer(merged[samples$sample]), + nonchim = as.integer(rowSums(seqtab_nochim)[samples$sample]), + stringsAsFactors = FALSE +) +write_table(file.path(outdir, "tables", "read_retention.tsv"), retention) + +if 
(nzchar(args$taxonomy_classifier)) { + classifier <- normalizePath(args$taxonomy_classifier, mustWork = TRUE) + if (grepl("\\.qza$", classifier, ignore.case = TRUE)) { + writeLines("DADA2 backend skipped taxonomy assignment because .qza classifiers are QIIME2 artifacts.", file.path(outdir, "logs", "dada2_taxonomy_skipped.txt")) + } else { + taxa <- dada2::assignTaxonomy(seqtab_nochim, classifier, multithread = args$threads) + write_table(file.path(outdir, "tables", "taxonomy.tsv"), taxonomy_to_table(taxa, asv_ids)) + } +} + +saveRDS( + list( + samples = samples, + sequence_table = seqtab_nochim, + retention = retention, + primer_forward = args$primer_forward, + primer_reverse = args$primer_reverse + ), + file.path(outdir, "dada2", "dada2_backend_state.rds") +) + +writeLines("DADA2 backend completed", file.path(outdir, "logs", "dada2_backend_status.txt")) diff --git a/plugins/ngs-analysis/workflows/bulk_rnaseq_counts_qc/Snakefile.smk b/plugins/ngs-analysis/workflows/bulk_rnaseq_counts_qc/Snakefile.smk new file mode 100644 index 0000000..d0aa63f --- /dev/null +++ b/plugins/ngs-analysis/workflows/bulk_rnaseq_counts_qc/Snakefile.smk @@ -0,0 +1,110 @@ +"""Plugin-owned local-light bulk RNA-seq counts/QC workflow.""" + +import shlex + + +FASTQ_FILES = config.get("fastq_files", {}) +RNASEQ_SALMON = config.get("rnaseq_salmon_samples", {}) +REFERENCES = config.get("references", {}) +SALMON_CONFIG = config.get("salmon", {}) +THREADS = int(config.get("threads", 4)) + + +def _shell_join(paths): + return " ".join(shlex.quote(path) for path in paths) + + +rule all: + input: + "fastqc/multiqc/multiqc_report.html", + "rnaseq_salmon/multiqc/multiqc_report.html", + "rnaseq_salmon/matrices/tpm.tsv", + "rnaseq_salmon/matrices/num_reads.tsv", + "rnaseq_salmon/matrices/effective_length.tsv", + "rnaseq_salmon/matrices/samples.tsv" + + +rule fastqc_raw: + input: + lambda wildcards: FASTQ_FILES[wildcards.unit]["path"] + output: + touch("fastqc/raw/{unit}.done") + threads: THREADS + shell: + 
"mkdir -p fastqc/raw && fastqc -t {threads} -o fastqc/raw {input:q}" + + +rule multiqc_fastq: + input: + expand("fastqc/raw/{unit}.done", unit=FASTQ_FILES.keys()) + output: + "fastqc/multiqc/multiqc_report.html" + shell: + "mkdir -p fastqc/multiqc && multiqc --no-version-check fastqc/raw -o fastqc/multiqc" + + +rule salmon_index: + input: + transcriptome=lambda wildcards: REFERENCES["transcriptome_fasta"] + output: + directory("rnaseq_salmon/index") + params: + kmer=lambda wildcards: int(SALMON_CONFIG.get("kmer", 31)) + shell: + "salmon --no-version-check index -t {input.transcriptome:q} -i {output:q} -k {params.kmer}" + + +rule salmon_quant: + input: + index="rnaseq_salmon/index" + output: + "rnaseq_salmon/quant/{sample}/quant.sf" + threads: THREADS + params: + layout=lambda wildcards: RNASEQ_SALMON[wildcards.sample]["layout"], + libtype=lambda wildcards: RNASEQ_SALMON[wildcards.sample]["salmon_libtype"], + r1=lambda wildcards: _shell_join(RNASEQ_SALMON[wildcards.sample]["r1"]), + r2=lambda wildcards: _shell_join(RNASEQ_SALMON[wildcards.sample].get("r2", [])), + outdir=lambda wildcards: f"rnaseq_salmon/quant/{wildcards.sample}", + shell: + r""" + if [ "{params.layout}" = "PE" ]; then + salmon --no-version-check quant -i {input.index} -l {params.libtype} \ + -1 {params.r1} -2 {params.r2} \ + -p {threads} --validateMappings -o {params.outdir} + else + salmon --no-version-check quant -i {input.index} -l {params.libtype} \ + -r {params.r1} \ + -p {threads} --validateMappings -o {params.outdir} + fi + """ + + +rule multiqc_salmon: + input: + expand("rnaseq_salmon/quant/{sample}/quant.sf", sample=RNASEQ_SALMON.keys()) + output: + "rnaseq_salmon/multiqc/multiqc_report.html" + shell: + "mkdir -p rnaseq_salmon/multiqc && multiqc --no-version-check rnaseq_salmon/quant -o rnaseq_salmon/multiqc" + + +rule salmon_aggregate: + input: + expand("rnaseq_salmon/quant/{sample}/quant.sf", sample=RNASEQ_SALMON.keys()), + config_path="config.json" + output: + 
tpm="rnaseq_salmon/matrices/tpm.tsv", + num_reads="rnaseq_salmon/matrices/num_reads.tsv", + effective_length="rnaseq_salmon/matrices/effective_length.tsv", + samples="rnaseq_salmon/matrices/samples.tsv" + params: + quant_args=lambda wildcards: " ".join( + f"--quant {sample}=rnaseq_salmon/quant/{sample}/quant.sf" for sample in sorted(RNASEQ_SALMON.keys()) + ) + shell: + "mkdir -p rnaseq_salmon/matrices && " + "python workflow/scripts/aggregate_salmon_quant.py " + "--config {input.config_path} " + "--outdir rnaseq_salmon/matrices " + "{params.quant_args}" diff --git a/plugins/ngs-analysis/workflows/bulk_rnaseq_counts_qc/aggregate_salmon_quant.py b/plugins/ngs-analysis/workflows/bulk_rnaseq_counts_qc/aggregate_salmon_quant.py new file mode 100644 index 0000000..f460b95 --- /dev/null +++ b/plugins/ngs-analysis/workflows/bulk_rnaseq_counts_qc/aggregate_salmon_quant.py @@ -0,0 +1,178 @@ +#!/usr/bin/env python3 +"""Aggregate Salmon quant.sf files into transcript- and gene-level matrices.""" + +from __future__ import annotations + +import argparse +import csv +import json +from pathlib import Path + + +def parse_gtf_attributes(raw: str) -> dict[str, str]: + values: dict[str, str] = {} + for chunk in raw.strip().split(";"): + part = chunk.strip() + if not part or " " not in part: + continue + key, value = part.split(" ", 1) + values[key] = value.strip().strip('"') + return values + + +def tx2gene_from_gtf(path: Path | None) -> dict[str, dict[str, str]]: + if not path or not path.exists(): + return {} + mapping: dict[str, dict[str, str]] = {} + with path.open("rt", encoding="utf-8", errors="replace") as handle: + for line in handle: + if not line or line.startswith("#"): + continue + fields = line.rstrip("\n").split("\t") + if len(fields) < 9 or fields[2] != "transcript": + continue + attrs = parse_gtf_attributes(fields[8]) + transcript_id = attrs.get("transcript_id") + gene_id = attrs.get("gene_id") + if not transcript_id or not gene_id: + continue + mapping[transcript_id] 
def parse_args() -> argparse.Namespace:
    """Parse the command line of the quantification merge step.

    Returns:
        Namespace with ``config`` (path of the JSON run configuration),
        ``outdir`` (directory the merged tables are written to) and ``quant``
        (one ``sample=/path/to/quant.sf`` string per ``--quant`` occurrence).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--config", required=True)
    parser.add_argument("--outdir", required=True)
    parser.add_argument("--quant", action="append", default=[], help="sample=/path/to/quant.sf")
    return parser.parse_args()


def read_quant_sf(path: Path) -> dict[str, dict[str, str]]:
    """Load a tab-separated quantification table keyed by its ``Name`` column.

    Args:
        path: Location of the table; the first row is the header.

    Returns:
        Mapping from the ``Name`` value of each row to the full row (all
        values kept as strings).

    Raises:
        KeyError: If the table has a header that lacks a ``Name`` column.
    """
    with path.open(newline="", encoding="utf-8") as handle:
        reader = csv.DictReader(handle, delimiter="\t")
        fieldnames = reader.fieldnames
        # An empty file has no header at all and simply yields no rows.
        if fieldnames is not None and "Name" not in fieldnames:
            raise KeyError(f"{path}: quantification table has no 'Name' column")
        return {row["Name"]: row for row in reader}


def write_matrix(path: Path, header: list[str], rows: list[list[str]]) -> None:
    """Write ``header`` followed by ``rows`` to ``path`` as a tab-separated file."""
    with path.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.writer(handle, delimiter="\t")
        writer.writerow(header)
        writer.writerows(rows)


def _parse_quant_items(items: list[str]) -> dict[str, Path]:
    """Turn ``sample=/path`` strings into a sample -> path mapping.

    A later item for the same sample replaces an earlier one.

    Raises:
        ValueError: If an item is not of the form ``sample=/path``.
    """
    sample_to_quant: dict[str, Path] = {}
    for item in items:
        sample, separator, raw_path = item.partition("=")
        if not separator or not sample or not raw_path:
            raise ValueError(f"--quant expects sample=/path/to/quant.sf, got {item!r}")
        sample_to_quant[sample] = Path(raw_path)
    return sample_to_quant


def _transcript_matrix(
    transcript_ids: list[str],
    sample_names: list[str],
    per_sample: dict[str, dict[str, dict[str, str]]],
    column: str,
) -> list[list[str]]:
    """Build one row per transcript holding ``column`` for every sample.

    Samples that do not report a transcript get an empty cell.
    """
    rows: list[list[str]] = []
    for transcript_id in transcript_ids:
        row = [transcript_id]
        for sample in sample_names:
            record = per_sample[sample].get(transcript_id)
            row.append(record[column] if record else "")
        rows.append(row)
    return rows


def _aggregate_by_gene(
    transcript_ids: list[str],
    sample_names: list[str],
    per_sample: dict[str, dict[str, dict[str, str]]],
    tx2gene: dict[str, dict[str, str]],
) -> tuple[list[list[str]], dict[str, list[float]], dict[str, list[float]]]:
    """Sum ``NumReads`` and ``TPM`` per gene and sample.

    Transcripts without an entry in ``tx2gene`` are treated as their own gene,
    using the transcript id as both gene id and gene name.

    Returns:
        ``(tx2gene_rows, gene_num_reads, gene_tpm)`` where the two dicts map a
        gene id to one running total per sample, in ``sample_names`` order.
    """
    tx2gene_rows: list[list[str]] = []
    gene_num_reads: dict[str, list[float]] = {}
    gene_tpm: dict[str, list[float]] = {}
    for transcript_id in transcript_ids:
        gene_record = tx2gene.get(transcript_id)
        gene_id = gene_record["gene_id"] if gene_record else transcript_id
        gene_name = gene_record["gene_name"] if gene_record else transcript_id
        tx2gene_rows.append([transcript_id, gene_id, gene_name])
        reads_totals = gene_num_reads.setdefault(gene_id, [0.0] * len(sample_names))
        tpm_totals = gene_tpm.setdefault(gene_id, [0.0] * len(sample_names))
        for idx, sample in enumerate(sample_names):
            record = per_sample[sample].get(transcript_id)
            if not record:
                continue
            reads_totals[idx] += float(record["NumReads"])
            tpm_totals[idx] += float(record["TPM"])
    return tx2gene_rows, gene_num_reads, gene_tpm


def _format_gene_rows(totals: dict[str, list[float]]) -> list[list[str]]:
    """Render per-gene totals as rows sorted by gene id, six decimals per value."""
    return [
        [gene_id, *[f"{value:.6f}" for value in totals[gene_id]]]
        for gene_id in sorted(totals)
    ]


def main() -> int:
    """Merge per-sample quantification tables into transcript and gene matrices.

    Writes ``tpm.tsv``, ``num_reads.tsv``, ``effective_length.tsv``,
    ``tx2gene.tsv``, ``gene_num_reads.tsv``, ``gene_tpm.tsv`` and
    ``samples.tsv`` into the output directory.

    Returns:
        Process exit status (``0`` on success).

    Raises:
        ValueError: If a ``--quant`` item is not of the form ``sample=/path``.
        KeyError: If a sample is missing from ``rnaseq_salmon_samples`` in the
            configuration, or a quantification table lacks a required column.
    """
    args = parse_args()
    config = json.loads(Path(args.config).read_text(encoding="utf-8"))
    outdir = Path(args.outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    sample_to_quant = _parse_quant_items(args.quant)
    sample_names = sorted(sample_to_quant)
    per_sample = {sample: read_quant_sf(path) for sample, path in sample_to_quant.items()}
    transcript_ids = sorted({tx for table in per_sample.values() for tx in table})

    # The annotation is optional; tx2gene_from_gtf receives None when it is absent.
    raw_gtf = config.get("references", {}).get("annotation_gtf")
    gtf_path = Path(raw_gtf) if raw_gtf else None
    tx2gene = tx2gene_from_gtf(gtf_path)

    transcript_header = ["transcript_id", *sample_names]
    for filename, column in (
        ("tpm.tsv", "TPM"),
        ("num_reads.tsv", "NumReads"),
        ("effective_length.tsv", "EffectiveLength"),
    ):
        write_matrix(
            outdir / filename,
            transcript_header,
            _transcript_matrix(transcript_ids, sample_names, per_sample, column),
        )

    tx2gene_rows, gene_num_reads, gene_tpm = _aggregate_by_gene(
        transcript_ids, sample_names, per_sample, tx2gene
    )
    write_matrix(outdir / "tx2gene.tsv", ["transcript_id", "gene_id", "gene_name"], tx2gene_rows)
    gene_header = ["gene_id", *sample_names]
    write_matrix(outdir / "gene_num_reads.tsv", gene_header, _format_gene_rows(gene_num_reads))
    write_matrix(outdir / "gene_tpm.tsv", gene_header, _format_gene_rows(gene_tpm))

    sample_rows = []
    for sample in sample_names:
        info = config["rnaseq_salmon_samples"][sample]
        sample_rows.append(
            [
                sample,
                info["layout"],
                info["strandedness"],
                info.get("salmon_libtype", ""),
                info.get("salmon_libtype_source", ""),
                ",".join(str(index) for index in info["row_indices"]),
                str(len(info["r1"])),
                str(len(info.get("r2", []))),
            ]
        )
    write_matrix(
        outdir / "samples.tsv",
        [
            "sample",
            "layout",
            "strandedness",
            "salmon_libtype",
            "salmon_libtype_source",
            "technical_replicate_rows",
            "fastq_1_files",
            "fastq_2_files",
        ],
        sample_rows,
    )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
#!/usr/bin/env Rscript

# Bulk differential-expression runner: setup, helper functions and input loading.
# Takes seven positional arguments (see the usage message below) and writes all
# artifacts below <outdir>.

suppressPackageStartupMessages({
  library(limma)
})

args <- commandArgs(trailingOnly = TRUE)
if (length(args) != 7) {
  # Placeholders are listed in the order the arguments are read below.
  stop(paste(
    "usage: run_bulk_de.R",
    "<count_matrix.tsv> <sample_metadata.tsv> <contrasts.tsv>",
    "<method> <input_mode> <fit_formula> <outdir>"
  ))
}

count_path <- args[[1]]
metadata_path <- args[[2]]
contrasts_path <- args[[3]]
method <- args[[4]]
input_mode <- args[[5]]
fit_formula <- args[[6]]
outdir <- args[[7]]

dir.create(outdir, recursive = TRUE, showWarnings = FALSE)
for (child in c("config", "manifest", "logs", "qc", "results", "plots", "versions")) {
  dir.create(file.path(outdir, child), recursive = TRUE, showWarnings = FALSE)
}

# Replace every run of characters outside [A-Za-z0-9_.-] with "_" so the value
# can be used as a file name.
safe_name <- function(x) {
  gsub("[^A-Za-z0-9_.-]+", "_", x)
}

# Write `values` (one row per gene) as a TSV prefixed with the gene_id column of
# `counts` and the supplied `gene_name` vector.
write_matrix_artifact <- function(path, values, counts, gene_name) {
  write.table(
    data.frame(gene_id = counts$gene_id, gene_name = gene_name, values, check.names = FALSE),
    file = path,
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
  )
}

# For a square distance matrix, return each row's sum divided by (n - 1), i.e.
# the mean distance to the other samples when the diagonal is zero.
mean_without_self <- function(mat) {
  if (ncol(mat) < 2) {
    # A single sample has no other sample to compare against; keep its name so
    # downstream tables still carry a sample_id.
    return(setNames(rep(0, ncol(mat)), colnames(mat)))
  }
  apply(mat, 1, function(x) sum(x) / (length(x) - 1))
}

# Append one space-separated line to <outdir>/logs/run.log.
write_log <- function(...) {
  write(paste(...), file = file.path(outdir, "logs", "run.log"), append = TRUE)
}

# Range of the finite `values`, widened on both sides by `frac` of its span.
# A zero or non-finite span falls back to max(|range|, 1) so the axis never
# collapses to a single point.
pad_range <- function(values, frac = 0.08) {
  rng <- range(values, finite = TRUE)
  span <- diff(rng)
  if (!is.finite(span) || span == 0) {
    span <- max(abs(rng), 1)
  }
  c(rng[1] - span * frac, rng[2] + span * frac)
}

# Named colour vector with one entry per distinct condition, in order of first
# appearance.
condition_palette <- function(conditions) {
  levs <- unique(as.character(conditions))
  cols <- c("#3b6ea8", "#d95f02", "#1b9e77", "#7570b3", "#e7298a", "#66a61e")
  if (length(levs) > length(cols)) {
    # More conditions than base colours: interpolate instead of returning NA
    # colours, which would leave bars and points undrawn.
    cols <- colorRampPalette(cols)(length(levs))
  }
  setNames(cols[seq_along(levs)], levs)
}

# Label the `n` points with the smallest `rank_metric` on the current plot.
# Points with non-finite coordinates or empty labels are never labelled.
label_top_points <- function(x, y, labels, rank_metric, n = 5, cex = 0.8) {
  keep <- is.finite(x) & is.finite(y) & nzchar(labels)
  if (!any(keep)) {
    return(invisible(NULL))
  }
  ord <- order(rank_metric[keep], decreasing = FALSE)
  idx <- which(keep)[head(ord, min(n, length(ord)))]
  text(x[idx], y[idx], labels = labels[idx], pos = 3, cex = cex, xpd = NA)
}

write_log("started_at", format(Sys.time(), "%Y-%m-%dT%H:%M:%S%z"))
write_log("count_matrix", normalizePath(count_path))
write_log("sample_metadata", normalizePath(metadata_path))
write_log("contrasts", normalizePath(contrasts_path))
write_log("method", method)

counts <- read.delim(count_path, check.names = FALSE)
metadata <- read.delim(metadata_path, check.names = FALSE, stringsAsFactors = FALSE)
contrast_manifest <- read.delim(contrasts_path, check.names = FALSE, stringsAsFactors = FALSE)

# Every column other than gene_id / gene_name is treated as a sample.
sample_cols <- setdiff(colnames(counts), c("gene_id", "gene_name"))
if (!setequal(sample_cols, metadata$sample_id)) {
  stop("count matrix columns and metadata sample_id values do not match")
}
# Reorder the metadata rows to follow the column order of the count matrix.
metadata <- metadata[match(sample_cols, metadata$sample_id), ]

# drop = FALSE keeps a one-sample selection as a data frame so the column name
# survives the conversion to a matrix.
expr <- as.matrix(counts[, sample_cols, drop = FALSE])
mode(expr) <- "numeric"
rownames(expr) <- counts$gene_id
gene_name <- if ("gene_name" %in% colnames(counts)) counts$gene_name else counts$gene_id
is_integer_like <- all(abs(expr - round(expr)) < 1e-8)
# ---- Replicate counts and contrast status -----------------------------------
condition_counts <- as.data.frame(table(metadata$condition), stringsAsFactors = FALSE)
colnames(condition_counts) <- c("condition", "n_replicates")

# Number of samples per requested condition. A condition that does not occur in
# the metadata has zero replicates; leaving it NA would make `status` NA and the
# logical subsetting below would then inject all-NA rows into both the valid and
# the blocked contrast sets.
replicates_for <- function(conditions) {
  n <- condition_counts$n_replicates[match(conditions, condition_counts$condition)]
  n[is.na(n)] <- 0L
  n
}
contrast_manifest$numerator_replicates <- replicates_for(contrast_manifest$numerator_condition)
contrast_manifest$denominator_replicates <- replicates_for(contrast_manifest$denominator_condition)
contrast_manifest$status <- ifelse(
  contrast_manifest$numerator_replicates >= 2 & contrast_manifest$denominator_replicates >= 2,
  "valid",
  "insufficient_replicates"
)
contrast_manifest$executed <- FALSE
contrast_manifest$execution_method <- NA_character_
contrast_manifest$stub_result <- NA_character_
design_formula <- fit_formula

# ---- Manifest and configuration records -------------------------------------
write.table(metadata, file = file.path(outdir, "manifest", "sample_metadata.aligned.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)
write.table(contrast_manifest, file = file.path(outdir, "manifest", "contrast_status.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)
write.table(
  data.frame(
    role = c("count_matrix", "sample_metadata", "contrasts"),
    path = c(normalizePath(count_path), normalizePath(metadata_path), normalizePath(contrasts_path))
  ),
  file = file.path(outdir, "manifest", "input_files.tsv"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
write.table(
  data.frame(
    key = c("design_formula", "method", "count_matrix_integer_like"),
    value = c(design_formula, method, is_integer_like)
  ),
  file = file.path(outdir, "config", "method_decision.tsv"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

# ---- Input-mode handling ----------------------------------------------------
# Default modeling matrix is log2(x + 1); log_expression input is used as is.
model_expr <- log2(expr + 1)
normalization_warning <- NULL
if (input_mode == "raw_counts") {
  write_matrix_artifact(file.path(outdir, "results", "raw_counts.tsv"), expr, counts, gene_name)
} else if (input_mode == "normalized_expression") {
  normalization_warning <- paste(
    "Normalization skipped because input_mode=normalized_expression.",
    "The runner preserved the supplied matrix and generated log2(x+1) only for modeling/QC."
  )
  write_matrix_artifact(file.path(outdir, "results", "input_normalized_expression_matrix.tsv"), expr, counts, gene_name)
} else if (input_mode == "log_expression") {
  normalization_warning <- paste(
    "Normalization and log transformation skipped because input_mode=log_expression.",
    "The runner used the supplied matrix directly for modeling/QC."
  )
  model_expr <- expr
  write_matrix_artifact(file.path(outdir, "results", "input_log_expression_matrix.tsv"), expr, counts, gene_name)
} else {
  stop(paste("unsupported input_mode:", input_mode))
}
if (!is.null(normalization_warning)) {
  writeLines(normalization_warning, con = file.path(outdir, "qc", "input_mode_warning.txt"))
}
normalized_counts <- expr

# ---- Method-specific normalization ------------------------------------------
if (method == "edgeR") {
  suppressPackageStartupMessages(library(edgeR))
  dge <- DGEList(counts = round(expr), group = metadata$condition)
  dge <- calcNormFactors(dge)
  normalized_counts <- cpm(dge, normalized.lib.sizes = TRUE)
  model_expr <- cpm(dge, log = TRUE, prior.count = 1)
} else if (method == "DESeq2") {
  suppressPackageStartupMessages(library(DESeq2))
  metadata$condition <- factor(metadata$condition)
  dds <- DESeqDataSetFromMatrix(countData = round(expr), colData = metadata, design = as.formula(design_formula))
  dds <- DESeq(dds, quiet = TRUE)
  normalized_counts <- counts(dds, normalized = TRUE)
  model_expr <- assay(vst(dds, blind = TRUE))
}

if (input_mode == "raw_counts") {
  write_matrix_artifact(file.path(outdir, "results", "normalized_expression_matrix.tsv"), normalized_counts, counts, gene_name)
  write_matrix_artifact(file.path(outdir, "results", "log2_expression_matrix.tsv"), model_expr, counts, gene_name)
} else if (input_mode == "normalized_expression") {
  write_matrix_artifact(file.path(outdir, "results", "log2_expression_matrix.tsv"), model_expr, counts, gene_name)
} else {
  write_matrix_artifact(file.path(outdir, "results", "modeling_expression_matrix.tsv"), model_expr, counts, gene_name)
}

# ---- QC: library sizes ------------------------------------------------------
lib_sizes <- data.frame(sample_id = sample_cols, library_size = colSums(expr), condition = metadata$condition)
write.table(lib_sizes, file = file.path(outdir, "qc", "library_sizes.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)
condition_cols <- condition_palette(metadata$condition)
bar_cols <- unname(condition_cols[as.character(lib_sizes$condition)])
png(file.path(outdir, "qc", "library_sizes.png"), width = 1300, height = 900, res = 160)
par(mar = c(10, 5, 4, 2) + 0.1)
barplot(
  lib_sizes$library_size,
  names.arg = lib_sizes$sample_id,
  las = 2,
  col = bar_cols,
  main = "Library Sizes",
  ylab = "Sum of provided expression values"
)
legend("topright", legend = names(condition_cols), fill = unname(condition_cols), bty = "n")
dev.off()

# ---- QC: PCA ----------------------------------------------------------------
# Constant rows carry no information for the PCA; fall back to the full matrix
# when fewer than two variable rows remain.
pca_input <- model_expr[apply(model_expr, 1, var, na.rm = TRUE) > 0, , drop = FALSE]
if (nrow(pca_input) < 2) {
  pca_input <- model_expr
}
pca <- prcomp(t(pca_input), center = TRUE, scale. = FALSE)
pca_df <- data.frame(sample_id = rownames(pca$x), PC1 = pca$x[, 1], PC2 = pca$x[, 2], condition = metadata$condition)
write.table(pca_df, file = file.path(outdir, "qc", "pca_scores.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)
pc_var <- (pca$sdev^2 / sum(pca$sdev^2)) * 100
png(file.path(outdir, "qc", "pca.png"), width = 1300, height = 900, res = 160)
par(mar = c(5, 5, 4, 5) + 0.1)
plot(
  pca_df$PC1,
  pca_df$PC2,
  pch = 19,
  col = unname(condition_cols[as.character(pca_df$condition)]),
  xlab = sprintf("PC1 (%.1f%% variance)", pc_var[1]),
  ylab = sprintf("PC2 (%.1f%% variance)", pc_var[2]),
  main = "PCA on modeling expression",
  xlim = pad_range(pca_df$PC1),
  ylim = pad_range(pca_df$PC2)
)
text(pca_df$PC1, pca_df$PC2, labels = pca_df$sample_id, pos = 3, cex = 0.8, xpd = NA)
legend("topright", inset = c(-0.2, 0), legend = names(condition_cols), fill = unname(condition_cols), bty = "n", xpd = NA)
dev.off()

# ---- QC: sample distances and outlier metrics -------------------------------
dist_mat <- as.matrix(dist(t(model_expr)))
write.table(
  cbind(sample_id = rownames(dist_mat), as.data.frame(dist_mat, check.names = FALSE)),
  file = file.path(outdir, "qc", "sample_distance.tsv"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
png(file.path(outdir, "qc", "sample_distance_heatmap.png"), width = 1400, height = 1100, res = 170)
heatmap(
  dist_mat,
  symm = TRUE,
  scale = "none",
  margins = c(12, 12),
  cexRow = 0.95,
  cexCol = 0.95,
  col = colorRampPalette(c("#fff7bc", "#fec44f", "#fe9929", "#d95f0e", "#993404"))(256),
  main = "Sample Distance"
)
dev.off()

outlier_mean_distance <- mean_without_self(dist_mat)
outlier_z <- as.numeric(scale(outlier_mean_distance))
if (all(is.na(outlier_z))) {
  # scale() yields NA/NaN when the mean distances have no spread.
  outlier_z <- rep(0, length(outlier_mean_distance))
}
sample_outliers <- data.frame(
  sample_id = names(outlier_mean_distance),
  mean_distance = as.numeric(outlier_mean_distance),
  z_score = outlier_z,
  flag_high_distance = outlier_z >= 2
)
write.table(sample_outliers, file = file.path(outdir, "qc", "sample_outlier_metrics.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)

# ---- Design matrix and diagnostics ------------------------------------------
design <- model.matrix(as.formula(design_formula), metadata)
write.table(
  cbind(sample_id = metadata$sample_id, as.data.frame(design, check.names = FALSE)),
  file = file.path(outdir, "qc", "design_matrix.tsv"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
design_rank <- qr(design)$rank
design_full_rank <- design_rank == ncol(design)
design_diagnostics <- data.frame(
  key = c("design_formula", "input_mode", "sample_count", "design_columns", "design_rank", "design_full_rank"),
  value = c(design_formula, input_mode, nrow(metadata), ncol(design), design_rank, design_full_rank)
)
write.table(design_diagnostics, file = file.path(outdir, "qc", "design_diagnostics.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)
if ("batch" %in% colnames(metadata)) {
  batch_condition_table <- as.data.frame.matrix(table(metadata$batch, metadata$condition))
  batch_condition_table <- cbind(batch = rownames(batch_condition_table), batch_condition_table)
  rownames(batch_condition_table) <- NULL
  write.table(batch_condition_table, file = file.path(outdir, "qc", "condition_by_batch.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)
}
# The diagnostics are written first so they are available when this stops.
if (!design_full_rank) {
  stop("design matrix is rank deficient; see qc/design_diagnostics.tsv and qc/condition_by_batch.tsv")
}

# ---- Statistical warnings and summary ---------------------------------------
valid_contrasts <- contrast_manifest[contrast_manifest$status == "valid", , drop = FALSE]
blocked_contrasts <- contrast_manifest[contrast_manifest$status != "valid", , drop = FALSE]
minimal_replicates <- valid_contrasts$numerator_replicates == 2 & valid_contrasts$denominator_replicates == 2

# Append one (severity, message) row to a warnings table.
add_warning <- function(df, severity, message) {
  rbind(df, data.frame(severity = severity, message = message, stringsAsFactors = FALSE))
}
warnings_df <- data.frame(severity = character(), message = character(), stringsAsFactors = FALSE)
if (nrow(valid_contrasts) == 0) {
  warnings_df <- add_warning(warnings_df, "error", "No contrasts were executable after replicate checks.")
}
if (nrow(blocked_contrasts) > 0) {
  warnings_df <- add_warning(warnings_df, "warn", sprintf("%d contrast(s) were blocked due to insufficient biological replication.", nrow(blocked_contrasts)))
}
if (any(minimal_replicates)) {
  warnings_df <- add_warning(warnings_df, "warn", "At least one executed contrast is minimally powered (2 vs 2 replicates); interpret effect sizes and p-values as exploratory.")
}
if (input_mode != "raw_counts") {
  warnings_df <- add_warning(warnings_df, "warn", sprintf("Input mode is %s; normalization and/or transformation was preserved from the supplied matrix rather than re-derived from raw counts.", input_mode))
}
write.table(warnings_df, file = file.path(outdir, "qc", "statistical_warnings.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)
write.table(
  data.frame(
    key = c("sample_count", "gene_count", "valid_contrasts", "blocked_contrasts", "minimal_replicate_contrasts"),
    value = c(nrow(metadata), nrow(counts), nrow(valid_contrasts), nrow(blocked_contrasts), sum(minimal_replicates))
  ),
  file = file.path(outdir, "qc", "statistical_summary.tsv"),
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)

# ---- Stub results for blocked contrasts -------------------------------------
if (nrow(blocked_contrasts) > 0) {
  for (i in seq_len(nrow(blocked_contrasts))) {
    out_name <- safe_name(blocked_contrasts$contrast[i])
    stub_path <- file.path(outdir, "results", paste0(out_name, ".not_tested.tsv"))
    stub <- data.frame(
      contrast = blocked_contrasts$contrast[i],
      status = blocked_contrasts$status[i],
      reason = "Insufficient biological replication for at least one condition",
      numerator_condition = blocked_contrasts$numerator_condition[i],
      denominator_condition = blocked_contrasts$denominator_condition[i],
      numerator_replicates = blocked_contrasts$numerator_replicates[i],
      denominator_replicates = blocked_contrasts$denominator_replicates[i],
      input_mode = input_mode,
      fit_formula = design_formula
    )
    write.table(stub, file = stub_path, sep = "\t", quote = FALSE, row.names = FALSE)
    contrast_manifest$stub_result[contrast_manifest$contrast == blocked_contrasts$contrast[i]] <- basename(stub_path)
  }
}

# ---- Differential expression for valid contrasts ----------------------------
if (nrow(valid_contrasts) > 0) {
  metadata$condition <- factor(metadata$condition)
  if (method == "limma_log2") {
    fit <- lmFit(model_expr, design)
    # Contrasts are expressed on design columns named "condition<level>".
    contrast_defs <- setNames(
      paste0("condition", valid_contrasts$numerator_condition, " - condition", valid_contrasts$denominator_condition),
      valid_contrasts$contrast
    )
    contrast_matrix <- makeContrasts(contrasts = unname(contrast_defs), levels = design)
    colnames(contrast_matrix) <- names(contrast_defs)
    fit2 <- eBayes(contrasts.fit(fit, contrast_matrix), trend = TRUE, robust = TRUE)
    png(file.path(outdir, "qc", "mean_variance_trend.png"), width = 1000, height = 850, res = 150)
    plotSA(fit2, main = "Mean-variance trend")
    dev.off()
    for (contrast_name in colnames(contrast_matrix)) {
      table <- topTable(fit2, coef = contrast_name, number = Inf, sort.by = "P")
      table$gene_id <- rownames(table)
      table$gene_name <- gene_name[match(rownames(table), counts$gene_id)]
      table <- table[, c("gene_id", "gene_name", "logFC", "AveExpr", "t", "P.Value", "adj.P.Val", "B")]
      out_name <- safe_name(contrast_name)
      write.table(table, file = file.path(outdir, "results", paste0(out_name, ".tsv")), sep = "\t", quote = FALSE, row.names = FALSE)
      sig <- !is.na(table$adj.P.Val) & table$adj.P.Val < 0.05
      png(file.path(outdir, "plots", paste0(out_name, "_volcano.png")), width = 1200, height = 900, res = 160)
      par(mar = c(5, 5, 4, 2) + 0.1)
      plot(
        table$logFC,
        -log10(table$P.Value),
        pch = 19,
        cex = 0.7,
        col = ifelse(sig, "#b22222", "#444444"),
        xlab = "log2 fold-change",
        ylab = "-log10(P)",
        main = paste("Volcano:", contrast_name),
        xlim = pad_range(table$logFC),
        ylim = pad_range(-log10(table$P.Value))
      )
      abline(h = -log10(0.05), lty = 2, col = "#1f78b4")
      abline(v = c(-1, 1), lty = 3, col = "#9e9e9e")
      label_top_points(table$logFC, -log10(table$P.Value), table$gene_name, table$adj.P.Val, n = 6)
      dev.off()
      png(file.path(outdir, "plots", paste0(out_name, "_ma.png")), width = 1200, height = 900, res = 160)
      par(mar = c(5, 5, 4, 2) + 0.1)
      plot(
        table$AveExpr,
        table$logFC,
        pch = 19,
        cex = 0.7,
        col = ifelse(sig, "#b22222", "#444444"),
        xlab = "Average expression",
        ylab = "log2 fold-change",
        main = paste("MA:", contrast_name),
        xlim = pad_range(table$AveExpr),
        ylim = pad_range(table$logFC)
      )
      abline(h = 0, col = "red")
      label_top_points(table$AveExpr, table$logFC, table$gene_name, table$adj.P.Val, n = 6)
      dev.off()
      contrast_manifest$executed[contrast_manifest$contrast == contrast_name] <- TRUE
      contrast_manifest$execution_method[contrast_manifest$contrast == contrast_name] <- method
    }
  } else if (method == "edgeR") {
    # `dge` already holds the rounded counts and normalization factors computed
    # in the normalization step above; only the dispersion fit is added here.
    dge <- estimateDisp(dge, design)
    fit <- glmQLFit(dge, design)
    png(file.path(outdir, "qc", "mean_variance_trend.png"), width = 1000, height = 850, res = 150)
    plotBCV(dge, main = "edgeR mean-variance trend")
    dev.off()
    for (i in seq_len(nrow(valid_contrasts))) {
      contrast_name <- valid_contrasts$contrast[i]
      numerator_coef <- paste0("condition", valid_contrasts$numerator_condition[i])
      denominator_coef <- paste0("condition", valid_contrasts$denominator_condition[i])
      # Assigning to a name that is not in the vector would silently lengthen
      # it, so fail with an explicit message instead.
      missing_coefs <- setdiff(c(numerator_coef, denominator_coef), colnames(design))
      if (length(missing_coefs) > 0) {
        stop(sprintf(
          "design matrix has no column(s) %s; every contrasted condition needs its own 'condition<level>' coefficient",
          paste(missing_coefs, collapse = ", ")
        ))
      }
      contrast_vec <- rep(0, ncol(design))
      names(contrast_vec) <- colnames(design)
      contrast_vec[numerator_coef] <- 1
      contrast_vec[denominator_coef] <- -1
      qlf <- glmQLFTest(fit, contrast = contrast_vec)
      table <- topTags(qlf, n = Inf)$table
      table$gene_id <- rownames(table)
      table$gene_name <- gene_name[match(rownames(table), counts$gene_id)]
      out_name <- safe_name(contrast_name)
      write.table(table, file = file.path(outdir, "results", paste0(out_name, ".tsv")), sep = "\t", quote = FALSE, row.names = FALSE)
      contrast_manifest$executed[contrast_manifest$contrast == contrast_name] <- TRUE
      contrast_manifest$execution_method[contrast_manifest$contrast == contrast_name] <- method
    }
  } else if (method == "DESeq2") {
    # `dds` was fitted in the normalization step above.
    png(file.path(outdir, "qc", "mean_variance_trend.png"), width = 1000, height = 850, res = 150)
    plotDispEsts(dds, main = "DESeq2 dispersion estimates")
    dev.off()
    for (i in seq_len(nrow(valid_contrasts))) {
      contrast_name <- valid_contrasts$contrast[i]
      res <- results(dds, contrast = c("condition", valid_contrasts$numerator_condition[i], valid_contrasts$denominator_condition[i]))
      table <- as.data.frame(res)
      table$gene_id <- rownames(table)
      table$gene_name <- gene_name[match(rownames(table), counts$gene_id)]
      out_name <- safe_name(contrast_name)
      write.table(table, file = file.path(outdir, "results", paste0(out_name, ".tsv")), sep = "\t", quote = FALSE, row.names = FALSE)
      contrast_manifest$executed[contrast_manifest$contrast == contrast_name] <- TRUE
      contrast_manifest$execution_method[contrast_manifest$contrast == contrast_name] <- method
    }
  }
}

# ---- Final manifest, session info and summary -------------------------------
# Rewrites the status table written earlier, now including execution details.
write.table(contrast_manifest, file = file.path(outdir, "manifest", "contrast_status.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)
writeLines(capture.output(sessionInfo()), con = file.path(outdir, "versions", "sessionInfo.txt"))

summary_lines <- c(
  paste("Design formula:", design_formula),
  paste("Selected method:", method),
  paste("Input mode:", input_mode),
  "",
  "Replicates by condition:"
)
summary_lines <- c(summary_lines, apply(condition_counts, 1, function(x) paste(" -", x[["condition"]], ":", x[["n_replicates"]])))
summary_lines <- c(summary_lines, "", "Contrast status:")
summary_lines <- c(summary_lines, apply(contrast_manifest, 1, function(x) paste(" -", x[["contrast"]], ":", x[["status"]], "| executed:", x[["executed"]])))
if (!is.null(normalization_warning)) {
  summary_lines <- c(summary_lines, "", "Warnings:", paste(" -", normalization_warning))
}
writeLines(summary_lines, con = file.path(outdir, "summary.md"))

write_log("finished_at", format(Sys.time(), "%Y-%m-%dT%H:%M:%S%z"))
"""Plugin-owned local scRNA FASTQ-to-count workflow."""

SAMPLES = config["samples"]
REFS = config["references"]
CHEM = config["chemistry"]
THREADS = int(config.get("threads", 4))

# The feature type handed to STAR via --soloFeatures also names the directory
# the count matrices are written to (Solo.out/<feature>/raw), so the declared
# outputs must follow the configured value instead of assuming "Gene".
# NOTE(review): assumes a single feature type is configured - confirm.
FEATURES_MODE = str(CHEM.get("features_mode", "Gene"))
# "{sample}" is kept as a literal placeholder: it is a Snakemake wildcard in the
# rule outputs and is filled with str.format() in count_targets().
RAW_DIR = f"counts/{{sample}}/Solo.out/{FEATURES_MODE}/raw"


def count_targets():
    """Return every file the workflow must produce: the index plus per-sample counts."""
    targets = ["references/star_index"]
    for sample in SAMPLES:
        base = RAW_DIR.format(sample=sample)
        targets.extend(
            [
                f"{base}/matrix.mtx",
                f"{base}/barcodes.tsv",
                f"{base}/features.tsv",
                f"counts/{sample}/Log.final.out",
            ]
        )
    return targets


rule all:
    input:
        count_targets()


# Build the STAR genome index from the configured FASTA and GTF.
rule starsolo_index:
    input:
        fasta=lambda wildcards: REFS["genome_fasta"],
        gtf=lambda wildcards: REFS["annotation_gtf"],
    output:
        directory("references/star_index")
    threads: THREADS
    params:
        sjdb_overhang=lambda wildcards: int(CHEM.get("sjdb_overhang", 99))
    script:
        "run_star_genome_generate.py"


# Run STARsolo for one sample; chemistry settings fall back to the defaults
# given in each params lambda when the config does not set them.
rule starsolo_count:
    input:
        index="references/star_index",
        whitelist=lambda wildcards: REFS["cb_whitelist"],
        barcode_fastq=lambda wildcards: SAMPLES[wildcards.sample]["barcode_fastq"],
        cdna_fastq=lambda wildcards: SAMPLES[wildcards.sample]["cdna_fastq"],
    output:
        matrix=RAW_DIR + "/matrix.mtx",
        barcodes=RAW_DIR + "/barcodes.tsv",
        features=RAW_DIR + "/features.tsv",
        log="counts/{sample}/Log.final.out",
    threads: THREADS
    params:
        cb_start=lambda wildcards: int(CHEM.get("cb_start", 1)),
        cb_len=lambda wildcards: int(CHEM.get("cb_len", 16)),
        umi_start=lambda wildcards: int(CHEM.get("umi_start", 17)),
        umi_len=lambda wildcards: int(CHEM.get("umi_len", 10)),
        solo_type=lambda wildcards: CHEM.get("solo_type", "CB_UMI_Simple"),
        solo_cell_filter=lambda wildcards: CHEM.get("solo_cell_filter", "CellRanger2.2 3000 0.99 10"),
        features_mode=lambda wildcards: FEATURES_MODE,
    script:
        "run_starsolo.py"
"""Build the STAR genome index, natively or inside a Docker container."""

import os
import subprocess
from pathlib import Path

DEFAULT_STAR_IMAGE = (
    "josousa/star@sha256:2683d370b9c91a2e497d776d9b0dff2ddcc01dfec5029103ffa66b2a8da7b0c2"
)


def build_genome_generate_command(
    *,
    star_runner: str,
    star_image: str,
    output_dir: Path,
    fasta: Path,
    gtf: Path,
    threads: int,
    sjdb_overhang: int,
) -> list[str]:
    """Return the argv list that generates the STAR genome index.

    Args:
        star_runner: ``"docker"`` wraps STAR in ``docker run``; any other value
            runs the ``STAR`` executable directly.
        star_image: Image reference used when ``star_runner`` is ``"docker"``.
        output_dir: Absolute index directory. In docker mode its grandparent is
            mounted as ``/work`` and the index path is given relative to it.
        fasta: Absolute path of the genome FASTA.
        gtf: Absolute path of the annotation GTF.
        threads: Value for ``--runThreadN``.
        sjdb_overhang: Value for ``--sjdbOverhang``.
    """
    if star_runner == "docker":
        run_root = output_dir.parents[1]
        # Inputs are mounted read-only by parent directory and addressed by
        # file name inside the container.
        prefix = [
            "docker",
            "run",
            "--rm",
            "--platform",
            "linux/amd64",
            "-u",
            f"{os.getuid()}:{os.getgid()}",
            "-w",
            "/work",
            "-v",
            f"{run_root}:/work",
            "-v",
            f"{fasta.parent}:/fasta_ro:ro",
            "-v",
            f"{gtf.parent}:/gtf_ro:ro",
            star_image,
        ]
        genome_dir = f"/work/{output_dir.relative_to(run_root)}"
        fasta_arg = f"/fasta_ro/{fasta.name}"
        gtf_arg = f"/gtf_ro/{gtf.name}"
    else:
        prefix = []
        genome_dir = str(output_dir)
        fasta_arg = str(fasta)
        gtf_arg = str(gtf)

    # The STAR invocation itself is identical for both runners; only the paths
    # it sees differ.
    return [
        *prefix,
        "STAR",
        "--runThreadN",
        str(threads),
        "--runMode",
        "genomeGenerate",
        "--genomeDir",
        genome_dir,
        "--genomeFastaFiles",
        fasta_arg,
        "--sjdbGTFfile",
        gtf_arg,
        "--sjdbOverhang",
        str(sjdb_overhang),
    ]


def main(smk) -> None:
    """Create the index directory and run STAR for the given Snakemake job object."""
    execution = smk.config.get("execution", {})
    output_dir = Path(str(smk.output[0])).resolve()
    output_dir.mkdir(parents=True, exist_ok=True)
    cmd = build_genome_generate_command(
        star_runner=execution.get("star_runner", "native"),
        star_image=execution.get("star_image", DEFAULT_STAR_IMAGE),
        output_dir=output_dir,
        fasta=Path(str(smk.input.fasta)).resolve(),
        gtf=Path(str(smk.input.gtf)).resolve(),
        threads=smk.threads,
        sjdb_overhang=smk.params.sjdb_overhang,
    )
    subprocess.run(cmd, check=True)


if __name__ == "__main__":
    # `snakemake` is injected by the preamble Snakemake prepends to scripts run
    # through the `script:` directive.
    main(snakemake)  # noqa: F821
str(Path(str(snakemake.input.fasta)).resolve()), + "--sjdbGTFfile", + str(Path(str(snakemake.input.gtf)).resolve()), + "--sjdbOverhang", + str(snakemake.params.sjdb_overhang), + ] + +subprocess.run(cmd, check=True) diff --git a/plugins/ngs-analysis/workflows/scrnaseq_fastq_to_count/run_starsolo.py b/plugins/ngs-analysis/workflows/scrnaseq_fastq_to_count/run_starsolo.py new file mode 100644 index 0000000..1bcb8be --- /dev/null +++ b/plugins/ngs-analysis/workflows/scrnaseq_fastq_to_count/run_starsolo.py @@ -0,0 +1,125 @@ +import gzip +import os +import shutil +import subprocess +from pathlib import Path + +execution = snakemake.config.get("execution", {}) +star_runner = execution.get("star_runner", "native") +star_image = execution.get( + "star_image", + "josousa/star@sha256:2683d370b9c91a2e497d776d9b0dff2ddcc01dfec5029103ffa66b2a8da7b0c2", +) + + +def materialize_fastq(source_path: str, output_dir: Path) -> str: + source = Path(source_path).resolve() + if not source.exists(): + raise FileNotFoundError(source) + output_dir.mkdir(parents=True, exist_ok=True) + if source.suffix != ".gz": + return str(source) + destination = (output_dir / source.stem).resolve() + with gzip.open(source, "rb") as input_handle, destination.open("wb") as output_handle: + shutil.copyfileobj(input_handle, output_handle) + return str(destination) + + +sample_dir = Path(str(snakemake.output.log)).resolve().parent +sample_dir.mkdir(parents=True, exist_ok=True) +if star_runner == "docker": + run_root = sample_dir.parents[1] + cdna_fastq = Path(str(snakemake.input.cdna_fastq)).resolve() + barcode_fastq = Path(str(snakemake.input.barcode_fastq)).resolve() + whitelist = Path(str(snakemake.input.whitelist)).resolve() + read_files_command = "zcat" if cdna_fastq.suffix == ".gz" else "cat" + cmd = [ + "docker", + "run", + "--rm", + "--platform", + "linux/amd64", + "-u", + f"{os.getuid()}:{os.getgid()}", + "-w", + "/work", + "-v", + f"{run_root}:/work", + "-v", + f"{cdna_fastq.parent}:/cdna_ro:ro", + 
"-v", + f"{barcode_fastq.parent}:/barcode_ro:ro", + "-v", + f"{whitelist.parent}:/wl_ro:ro", + star_image, + "STAR", + "--genomeDir", + f"/work/{Path(str(snakemake.input.index)).resolve().relative_to(run_root)}", + "--runThreadN", + str(snakemake.threads), + "--readFilesIn", + f"/cdna_ro/{cdna_fastq.name}", + f"/barcode_ro/{barcode_fastq.name}", + "--readFilesCommand", + read_files_command, + "--outFileNamePrefix", + f"/work/{sample_dir.relative_to(run_root)}/", + "--soloType", + str(snakemake.params.solo_type), + "--soloCBwhitelist", + f"/wl_ro/{whitelist.name}", + "--soloCBstart", + str(snakemake.params.cb_start), + "--soloCBlen", + str(snakemake.params.cb_len), + "--soloUMIstart", + str(snakemake.params.umi_start), + "--soloUMIlen", + str(snakemake.params.umi_len), + "--soloBarcodeReadLength", + "0", + "--soloFeatures", + str(snakemake.params.features_mode), + "--soloCellFilter", + *str(snakemake.params.solo_cell_filter).split(), + "--outSAMtype", + "None", + ] +else: + scratch_dir = sample_dir / "_inputs" + cdna_input = materialize_fastq(str(snakemake.input.cdna_fastq), scratch_dir / "cdna") + barcode_input = materialize_fastq(str(snakemake.input.barcode_fastq), scratch_dir / "barcode") + cmd = [ + "STAR", + "--genomeDir", + str(Path(str(snakemake.input.index)).resolve()), + "--runThreadN", + str(snakemake.threads), + "--readFilesIn", + cdna_input, + barcode_input, + "--outFileNamePrefix", + str(sample_dir) + "/", + "--soloType", + str(snakemake.params.solo_type), + "--soloCBwhitelist", + str(Path(str(snakemake.input.whitelist)).resolve()), + "--soloCBstart", + str(snakemake.params.cb_start), + "--soloCBlen", + str(snakemake.params.cb_len), + "--soloUMIstart", + str(snakemake.params.umi_start), + "--soloUMIlen", + str(snakemake.params.umi_len), + "--soloBarcodeReadLength", + "0", + "--soloFeatures", + str(snakemake.params.features_mode), + "--soloCellFilter", + *str(snakemake.params.solo_cell_filter).split(), + "--outSAMtype", + "None", + ] + 
+subprocess.run(cmd, check=True)