From 72f088143c5b1204d463a257362d3a630d55f777 Mon Sep 17 00:00:00 2001 From: Ehsan ESTAJI <71376358+ehsanestaji@users.noreply.github.com> Date: Tue, 26 May 2026 13:12:10 +0200 Subject: [PATCH 1/3] feat: add v0.2 evidence pack workflow --- README.md | 6 + docs/benchmarking.md | 27 ++ docs/evidence/fastaguard-v0.2-evidence.md | 100 +++++++ docs/evidence/public_assemblies.json | 19 ++ docs/tool-landscape.md | 4 +- scripts/collect_evidence.py | 340 ++++++++++++++++++++++ tests/python/test_adoption_assets.py | 121 ++++++++ 7 files changed, 616 insertions(+), 1 deletion(-) create mode 100644 docs/evidence/fastaguard-v0.2-evidence.md create mode 100644 docs/evidence/public_assemblies.json create mode 100755 scripts/collect_evidence.py diff --git a/README.md b/README.md index f8cbd6c..a7554ea 100644 --- a/README.md +++ b/README.md @@ -166,6 +166,7 @@ FastaGuard catches FASTA-level assembly problems before expensive assembly QC. - [Adoption plan](docs/adoption-plan.md) - [LLM and tooling vision](docs/llm-tooling-vision.md) - [Benchmarking](docs/benchmarking.md) +- [v0.2 evidence pack](docs/evidence/fastaguard-v0.2-evidence.md) - [Packaging](docs/packaging.md) - [v0.2.0 release notes](docs/releases/v0.2.0.md) - [v0.1.1 release notes](docs/releases/v0.1.1.md) @@ -179,3 +180,8 @@ v0.2.0 is published on GitHub with Linux and macOS release binaries. The Bioconda v0.2.0 recipe metadata is ready with the published source archive SHA; Bioconda may still serve v0.1.1 until the upstream recipe update is merged. BioContainers image availability is still pending confirmation. + +The next internal milestone is the +[v0.2 evidence pack](docs/evidence/fastaguard-v0.2-evidence.md): reproducible +local and public FASTA runs that document runtime, verdicts, and top findings +before new biological profiles are added. 
diff --git a/docs/benchmarking.md b/docs/benchmarking.md index c889e9b..d422f22 100644 --- a/docs/benchmarking.md +++ b/docs/benchmarking.md @@ -117,3 +117,30 @@ For each run, record: - whether downstream tools would have been blocked or recommended This evidence matters more than synthetic speed alone because it shows the wedge: cheap FASTA preflight before expensive downstream QC. + +## Evidence Pack Workflow + +The v0.2 evidence workflow is documented in +`docs/evidence/fastaguard-v0.2-evidence.md`. + +CI-safe local run: + +```bash +python3 scripts/collect_evidence.py \ + --binary target/release/fastaguard \ + --out-dir target/evidence/local-smoke \ + --local-only +``` + +Public NCBI run: + +```bash +python3 scripts/collect_evidence.py \ + --binary target/release/fastaguard \ + --out-dir target/evidence/v0.2 +``` + +The public run uses NCBI Datasets commands such as +`datasets download genome accession ACCESSION --include genome --filename ZIP_PATH`. +It writes compact `evidence_summary.json` and `evidence_summary.tsv` files while +leaving downloaded FASTA files and full reports under `target/`. diff --git a/docs/evidence/fastaguard-v0.2-evidence.md b/docs/evidence/fastaguard-v0.2-evidence.md new file mode 100644 index 0000000..057488d --- /dev/null +++ b/docs/evidence/fastaguard-v0.2-evidence.md @@ -0,0 +1,100 @@ +# FastaGuard v0.2 Evidence Pack + +This page records the evidence workflow for FastaGuard v0.2. It is intended to +make the preflight claim inspectable before adding new biological profiles. + +FastaGuard is a FASTA preflight tool. It is not biological completeness +analysis, not assembly correctness analysis, and not contamination confirmation. +Passing FastaGuard means the FASTA-level contract is sane enough to route into +downstream tools such as QUAST, BUSCO, BlobToolKit, CheckM, or annotation. 
+ +## Local Evidence Run + +Build the release binary: + +```bash +cargo build --release --locked +``` + +Run the CI-safe local evidence path: + +```bash +python3 scripts/collect_evidence.py \ + --binary target/release/fastaguard \ + --out-dir target/evidence/local-smoke \ + --local-only +``` + +Local-only mode does not require network access or the NCBI Datasets CLI. It +runs: + +- a deterministic synthetic FASTA +- `testdata/problem_assembly.fa` +- a gzipped copy of `testdata/valid_assembly.fa` + +## Public NCBI Evidence Run + +Install the NCBI Datasets CLI, then run: + +```bash +python3 scripts/collect_evidence.py \ + --binary target/release/fastaguard \ + --out-dir target/evidence/v0.2 +``` + +The public workflow downloads genomic FASTA packages with commands shaped like: + +```bash +datasets download genome accession GCF_000005845.2 --include genome --filename target/evidence/v0.2/ecoli_k12_mg1655/ncbi_dataset.zip +``` + +If `datasets` is not installed, the script exits before running any cases. +Use `--local-only` for offline smoke tests. + +The default public manifest is: + +```text +docs/evidence/public_assemblies.json +``` + +It currently includes: + +- `GCF_000005845.2`: Escherichia coli K-12 MG1655 +- `GCF_000182925.2`: Neurospora crassa OR74A + +## Outputs + +Each case writes FastaGuard artifacts under `target/evidence/RUN/CASE_ID/`: + +- `fastaguard.json` +- `fastaguard.tsv` +- `fastaguard_report.html` +- `fastaguard_mqc.json` + +The workflow also writes compact summaries: + +- `evidence_summary.json` +- `evidence_summary.tsv` + +The summary records the command used, FastaGuard version, git commit, platform, +date, input size, sequence count, elapsed seconds, verdict, and top findings. +Commit the evidence page and compact summaries when useful. Do not commit +downloaded FASTA files, NCBI zip archives, or full generated reports. 
+ +## Interpretation + +Use this evidence to answer practical adoption questions: + +- how quickly does FastaGuard produce a preflight report? +- does it catch duplicate IDs, invalid symbols, high-N records, and composition outliers? +- does it produce JSON, TSV, HTML, and MultiQC-ready output before heavier tools run? + +Use QUAST, BUSCO, BlobToolKit, CheckM, sourmash, Kraken, or other tools for +deeper biological interpretation after FastaGuard has checked the FASTA-level +contract. + +## References + +- [NCBI Datasets genome download reference](https://www.ncbi.nlm.nih.gov/datasets/docs/v2/reference-docs/command-line/datasets/download/genome/) +- [NCBI Datasets genome download guide](https://www.ncbi.nlm.nih.gov/datasets/docs/v2/how-tos/genomes/download-genome/) +- [Neurospora crassa OR74A BioProject](https://www.ncbi.nlm.nih.gov/bioproject/132) diff --git a/docs/evidence/public_assemblies.json b/docs/evidence/public_assemblies.json new file mode 100644 index 0000000..80aeaf7 --- /dev/null +++ b/docs/evidence/public_assemblies.json @@ -0,0 +1,19 @@ +{ + "schema_version": 1, + "assemblies": [ + { + "id": "ecoli_k12_mg1655", + "accession": "GCF_000005845.2", + "label": "Escherichia coli K-12 MG1655", + "category": "bacterial", + "source_url": "https://www.ncbi.nlm.nih.gov/datasets/genome/GCF_000005845.2/" + }, + { + "id": "neurospora_crassa_or74a", + "accession": "GCF_000182925.2", + "label": "Neurospora crassa OR74A", + "category": "fungal", + "source_url": "https://www.ncbi.nlm.nih.gov/bioproject/132" + } + ] +} diff --git a/docs/tool-landscape.md b/docs/tool-landscape.md index 37ee343..e99f38b 100644 --- a/docs/tool-landscape.md +++ b/docs/tool-landscape.md @@ -60,10 +60,12 @@ Current product evidence: - A native MultiQC plugin starter exists under `integrations/multiqc/`. - Bioconda recipe mirror exists under `packaging/bioconda/`. - nf-core, Nextflow, and Snakemake starters exist under `examples/`. 
+- The v0.2 evidence workflow is documented in + `docs/evidence/fastaguard-v0.2-evidence.md`. Evidence still needed: -- benchmarks on public assemblies +- committed benchmark summaries from public assemblies - user feedback from real pipeline authors - BioContainers image/tag confirmation - official MultiQC module or packaged plugin diff --git a/scripts/collect_evidence.py b/scripts/collect_evidence.py new file mode 100755 index 0000000..5385c3a --- /dev/null +++ b/scripts/collect_evidence.py @@ -0,0 +1,340 @@ +#!/usr/bin/env python3 +"""Collect reproducible FastaGuard evidence runs.""" + +from __future__ import annotations + +import argparse +import csv +import gzip +import json +import platform +import shutil +import subprocess +import sys +import time +import zipfile +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + + +ROOT = Path(__file__).resolve().parents[1] +LOCAL_SUCCESS_CODES = {0, 1, 2} +SUMMARY_COLUMNS = [ + "id", + "label", + "category", + "source", + "accession", + "input_bytes", + "elapsed_seconds", + "exit_code", + "verdict", + "sequence_count", + "total_length", + "n50", + "n90", + "finding_count", + "top_findings", +] + + +def main() -> int: + args = parse_args() + binary = args.binary.resolve() + out_dir = args.out_dir.resolve() + manifest_path = args.manifest.resolve() + + if not binary.exists(): + raise SystemExit(f"FastaGuard binary not found: {binary}") + + if not args.local_only and shutil.which("datasets") is None: + raise SystemExit( + "NCBI Datasets CLI not found. Install `datasets` or rerun with --local-only." 
+ ) + + out_dir.mkdir(parents=True, exist_ok=True) + cases = local_cases(out_dir) + if not args.local_only: + cases.extend(public_cases(manifest_path, out_dir)) + + summary = { + "schema_version": 1, + "generated_at": datetime.now(timezone.utc).isoformat(timespec="seconds"), + "fastaguard_version": fastaguard_version(binary), + "git_commit": git_commit(), + "platform": platform.platform(), + "python": platform.python_version(), + "command": " ".join(sys.argv), + "local_only": args.local_only, + "cases": [], + } + + for case in cases: + if case["source"] == "public_ncbi": + prepare_public_input(case) + result = run_case(binary, case) + summary["cases"].append(result) + + write_summary(out_dir, summary) + print(json.dumps(summary, indent=2, sort_keys=True)) + return 0 + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Collect FastaGuard evidence runs for local fixtures and public assemblies." + ) + parser.add_argument( + "--binary", + type=Path, + default=Path("target/release/fastaguard"), + help="Path to the FastaGuard binary to run.", + ) + parser.add_argument( + "--out-dir", + type=Path, + default=Path("target/evidence/v0.2"), + help="Directory for evidence outputs and summaries.", + ) + parser.add_argument( + "--manifest", + type=Path, + default=Path("docs/evidence/public_assemblies.json"), + help="Public assembly manifest.", + ) + parser.add_argument( + "--local-only", + action="store_true", + help="Skip public NCBI downloads and run only local evidence cases.", + ) + return parser.parse_args() + + +def local_cases(out_dir: Path) -> list[dict[str, Any]]: + synthetic_dir = out_dir / "synthetic_valid" + synthetic_path = synthetic_dir / "synthetic.fa" + synthetic_dir.mkdir(parents=True, exist_ok=True) + write_synthetic_fasta(synthetic_path) + + gzip_dir = out_dir / "gzipped_valid" + gzip_path = gzip_dir / "valid_assembly.fa.gz" + gzip_dir.mkdir(parents=True, exist_ok=True) + gzip_fasta(ROOT / "testdata" / 
"valid_assembly.fa", gzip_path) + + return [ + { + "id": "synthetic_valid", + "label": "Deterministic synthetic FASTA", + "category": "synthetic", + "source": "local", + "accession": None, + "input_path": synthetic_path, + "case_dir": synthetic_dir, + }, + { + "id": "problem_fixture", + "label": "Problem assembly fixture", + "category": "fixture", + "source": "local", + "accession": None, + "input_path": ROOT / "testdata" / "problem_assembly.fa", + "case_dir": out_dir / "problem_fixture", + }, + { + "id": "gzipped_valid", + "label": "Gzipped valid assembly fixture", + "category": "fixture", + "source": "local", + "accession": None, + "input_path": gzip_path, + "case_dir": gzip_dir, + }, + ] + + +def public_cases(manifest_path: Path, out_dir: Path) -> list[dict[str, Any]]: + manifest = json.loads(manifest_path.read_text()) + cases = [] + for assembly in manifest["assemblies"]: + case_dir = out_dir / assembly["id"] + cases.append( + { + **assembly, + "source": "public_ncbi", + "input_path": case_dir / "genomic.fna", + "case_dir": case_dir, + } + ) + return cases + + +def write_synthetic_fasta(path: Path) -> None: + records = [ + ("synthetic_0001", "ACGT" * 40), + ("synthetic_0002", "TGCA" * 40), + ("synthetic_0003", "GATTACA" * 20), + ("synthetic_0004", "CCGGTTAA" * 18), + ] + with path.open("w", encoding="utf-8", newline="\n") as handle: + for record_id, sequence in records: + handle.write(f">{record_id}\n") + for offset in range(0, len(sequence), 80): + handle.write(sequence[offset : offset + 80]) + handle.write("\n") + + +def gzip_fasta(source: Path, destination: Path) -> None: + with source.open("rb") as src, gzip.open(destination, "wb") as dst: + shutil.copyfileobj(src, dst) + + +def prepare_public_input(case: dict[str, Any]) -> None: + case_dir = case["case_dir"] + case_dir.mkdir(parents=True, exist_ok=True) + zip_path = case_dir / "ncbi_dataset.zip" + command = [ + "datasets", + "download", + "genome", + "accession", + case["accession"], + "--include", + 
"genome", + "--filename", + str(zip_path), + ] + completed = subprocess.run(command, capture_output=True, text=True, check=False) + if completed.returncode != 0: + raise SystemExit( + "NCBI Datasets download failed for " + f"{case['accession']} with exit code {completed.returncode}\n" + f"STDOUT:\n{completed.stdout}\nSTDERR:\n{completed.stderr}" + ) + + fasta_member = find_genomic_fasta(zip_path) + with zipfile.ZipFile(zip_path) as archive: + with archive.open(fasta_member) as src, case["input_path"].open("wb") as dst: + shutil.copyfileobj(src, dst) + + +def find_genomic_fasta(zip_path: Path) -> str: + with zipfile.ZipFile(zip_path) as archive: + candidates = [ + name + for name in archive.namelist() + if name.endswith((".fna", ".fa", ".fasta")) + and not name.endswith("/") + and "/data/" in name + ] + if not candidates: + raise SystemExit(f"No genomic FASTA found in {zip_path}") + return sorted(candidates)[0] + + +def run_case(binary: Path, case: dict[str, Any]) -> dict[str, Any]: + case_dir = case["case_dir"] + case_dir.mkdir(parents=True, exist_ok=True) + json_path = case_dir / "fastaguard.json" + html_path = case_dir / "fastaguard_report.html" + tsv_path = case_dir / "fastaguard.tsv" + multiqc_path = case_dir / "fastaguard_mqc.json" + command = [ + str(binary), + str(case["input_path"]), + "--profile", + "assembly", + "--min-contig-length", + "1", + "--out", + str(html_path), + "--json", + str(json_path), + "--tsv", + str(tsv_path), + "--multiqc", + str(multiqc_path), + ] + + started = time.perf_counter() + completed = subprocess.run(command, capture_output=True, text=True, check=False) + elapsed = time.perf_counter() - started + + if completed.returncode not in LOCAL_SUCCESS_CODES: + raise SystemExit( + "FastaGuard evidence run failed for " + f"{case['id']} with exit code {completed.returncode}\n" + f"Command: {' '.join(command)}\n" + f"STDOUT:\n{completed.stdout}\nSTDERR:\n{completed.stderr}" + ) + + report = json.loads(json_path.read_text()) + summary = 
report["summary"] + findings = report.get("findings", []) + top_findings = [finding.get("id", "unknown") for finding in findings[:5]] + + return { + "id": case["id"], + "label": case["label"], + "category": case["category"], + "source": case["source"], + "accession": case.get("accession"), + "input_path": str(case["input_path"]), + "input_bytes": case["input_path"].stat().st_size, + "elapsed_seconds": round(elapsed, 4), + "exit_code": completed.returncode, + "verdict": report["verdict"]["status"], + "sequence_count": summary["sequence_count"], + "total_length": summary["total_length"], + "n50": summary["n50"], + "n90": summary["n90"], + "finding_count": len(findings), + "top_findings": top_findings, + "command": " ".join(command), + "artifacts": { + "json": str(json_path), + "html": str(html_path), + "tsv": str(tsv_path), + "multiqc": str(multiqc_path), + }, + } + + +def write_summary(out_dir: Path, summary: dict[str, Any]) -> None: + json_path = out_dir / "evidence_summary.json" + tsv_path = out_dir / "evidence_summary.tsv" + json_path.write_text(json.dumps(summary, indent=2, sort_keys=True) + "\n") + + with tsv_path.open("w", encoding="utf-8", newline="") as handle: + writer = csv.DictWriter(handle, fieldnames=SUMMARY_COLUMNS, delimiter="\t") + writer.writeheader() + for case in summary["cases"]: + row = {key: case.get(key) for key in SUMMARY_COLUMNS} + row["top_findings"] = ",".join(case["top_findings"]) + writer.writerow(row) + + +def fastaguard_version(binary: Path) -> str: + completed = subprocess.run( + [str(binary), "--version"], capture_output=True, text=True, check=False + ) + if completed.returncode != 0: + return "unknown" + return completed.stdout.strip() or "unknown" + + +def git_commit() -> str: + completed = subprocess.run( + ["git", "rev-parse", "--short", "HEAD"], + cwd=ROOT, + capture_output=True, + text=True, + check=False, + ) + if completed.returncode != 0: + return "unknown" + return completed.stdout.strip() or "unknown" + + +if __name__ == 
"__main__": + raise SystemExit(main()) diff --git a/tests/python/test_adoption_assets.py b/tests/python/test_adoption_assets.py index d52ff07..b9aeba3 100644 --- a/tests/python/test_adoption_assets.py +++ b/tests/python/test_adoption_assets.py @@ -1,4 +1,5 @@ import json +import subprocess import sys import types import unittest @@ -295,6 +296,126 @@ def test_benchmarking_docs_include_v0_2_evidence_topics(self): self.assertIn("BUSCO", text) self.assertIn("BlobToolKit", text) + def test_public_evidence_manifest_declares_default_assemblies(self): + manifest = json.loads( + (ROOT / "docs" / "evidence" / "public_assemblies.json").read_text() + ) + + self.assertEqual(manifest["schema_version"], 1) + cases = manifest["assemblies"] + self.assertGreaterEqual(len(cases), 2) + accessions = {case["accession"] for case in cases} + self.assertIn("GCF_000005845.2", accessions) + self.assertIn("GCF_000182925.2", accessions) + + for case in cases: + with self.subTest(case=case): + self.assertEqual( + set(case), + {"id", "accession", "label", "category", "source_url"}, + ) + self.assertRegex(case["id"], r"^[a-z0-9][a-z0-9_-]+$") + self.assertRegex(case["accession"], r"^GC[AF]_[0-9]+\.[0-9]+$") + self.assertTrue(case["label"]) + self.assertIn(case["category"], {"bacterial", "fungal"}) + self.assertTrue(case["source_url"].startswith("https://")) + + def test_evidence_docs_reference_local_and_public_workflows(self): + evidence = (ROOT / "docs" / "evidence" / "fastaguard-v0.2-evidence.md") + benchmarking = ROOT / "docs" / "benchmarking.md" + readme = ROOT / "README.md" + landscape = ROOT / "docs" / "tool-landscape.md" + + evidence_text = evidence.read_text() + self.assertIn("python3 scripts/collect_evidence.py", evidence_text) + self.assertIn("--local-only", evidence_text) + self.assertIn("datasets download genome accession", evidence_text) + self.assertIn("evidence_summary.json", evidence_text) + self.assertIn("not biological completeness", evidence_text) + self.assertIn("not 
contamination confirmation", evidence_text) + + for path in (benchmarking, readme, landscape): + with self.subTest(path=path): + self.assertIn( + "docs/evidence/fastaguard-v0.2-evidence.md", path.read_text() + ) + + def test_collect_evidence_local_only_smoke_does_not_require_network(self): + with TemporaryDirectory() as temp_dir: + temp_path = Path(temp_dir) + fake_binary = temp_path / "fake_fastaguard.py" + fake_binary.write_text( + """#!/usr/bin/env python3 +import json +import sys +from pathlib import Path + +args = sys.argv[1:] +input_path = Path(args[0]) + +def option_path(flag): + try: + return Path(args[args.index(flag) + 1]) + except ValueError: + return None + +json_path = option_path("--json") +html_path = option_path("--out") +tsv_path = option_path("--tsv") +multiqc_path = option_path("--multiqc") +summary = { + "sequence_count": 1, + "total_length": input_path.stat().st_size, + "n50": input_path.stat().st_size, + "n90": input_path.stat().st_size, +} +report = { + "tool": {"name": "fastaguard", "version": "test"}, + "verdict": {"status": "PASS"}, + "summary": summary, + "findings": [], +} +json_path.write_text(json.dumps(report)) +html_path.write_text("fake") +tsv_path.write_text("metric\\tvalue\\n") +multiqc_path.write_text(json.dumps({"id": "fastaguard", "data": {}})) +""" + ) + fake_binary.chmod(fake_binary.stat().st_mode | 0o111) + out_dir = temp_path / "evidence" + + completed = subprocess.run( + [ + sys.executable, + str(ROOT / "scripts" / "collect_evidence.py"), + "--binary", + str(fake_binary), + "--out-dir", + str(out_dir), + "--local-only", + ], + cwd=ROOT, + capture_output=True, + text=True, + check=False, + ) + + self.assertEqual(completed.returncode, 0, completed.stderr) + self.assertNotIn("datasets download", completed.stdout) + summary_path = out_dir / "evidence_summary.json" + self.assertTrue(summary_path.exists()) + summary = json.loads(summary_path.read_text()) + case_ids = {case["id"] for case in summary["cases"]} + self.assertEqual( 
+ case_ids, + {"synthetic_valid", "problem_fixture", "gzipped_valid"}, + ) + self.assertTrue((out_dir / "evidence_summary.tsv").exists()) + for case in summary["cases"]: + self.assertEqual(case["verdict"], "PASS") + self.assertGreater(case["elapsed_seconds"], 0) + self.assertIn("command", case) + def test_snakemake_wrapper_declares_bioconda_environment(self): environment = ( ROOT / "examples" / "snakemake" / "wrapper" / "environment.yaml" From 1025747f37a1ba43948efdac7baa109185bf7c64 Mon Sep 17 00:00:00 2001 From: Ehsan ESTAJI <71376358+ehsanestaji@users.noreply.github.com> Date: Wed, 27 May 2026 14:53:56 +0200 Subject: [PATCH 2/3] docs: add long-term FastaGuard vision --- AGENTS.md | 38 +++++ README.md | 1 + docs/roadmap.md | 54 ++++--- docs/vision-plan.md | 219 +++++++++++++++++++++++++++ tests/python/test_adoption_assets.py | 26 ++++ 5 files changed, 320 insertions(+), 18 deletions(-) create mode 100644 docs/vision-plan.md diff --git a/AGENTS.md b/AGENTS.md index 759cf02..9b69d86 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -87,6 +87,44 @@ next: richer evidence tables for additional profiles and compare mode later: MCP/tool-agent interface and optional local summaries ``` +## Deep Release Vision + +Durable vision document: + +```text +docs/vision-plan.md +``` + +FastaGuard should become the FASTA preflight operating system for modern +bioinformatics pipelines: validate the FASTA, explain red flags, emit a stable +contract, and route to the right downstream tools. 
+ +The release strategy is evidence before expansion: + +```text +v0.3: evidence pack + assembly gate + provenance checksums +v0.4: compare mode for many FASTA files +v0.5: transcriptome profile +v0.6: protein profile +v0.7: reference-panel profile +later: MCP/tool-agent interface and optional local summaries +``` + +Default product boundaries: + +- stay fast and database-free by default +- keep JSON as the source of truth +- keep HTML as a human view +- make findings machine-actionable with stable IDs, severity, evidence, thresholds, actions, and scope +- keep optional generated summaries local-metrics-only and traceable back to structured fields +- never claim to replace QUAST, BUSCO, BlobToolKit, CheckM, seqkit, MultiQC, or annotation workflows + +Recommended next big release: + +```text +v0.3 should make FastaGuard credible as the default assembly gate before adding broad new biological profiles. +``` + ## Collaboration Preference When moving the project forward, provide a clear recommendation first, then proceed when the user approves or explicitly asks to continue. The default recommendation should favor boring, stable contracts over flashy AI features. diff --git a/README.md b/README.md index a7554ea..fd3650d 100644 --- a/README.md +++ b/README.md @@ -160,6 +160,7 @@ FastaGuard catches FASTA-level assembly problems before expensive assembly QC. 
- [Example reports](examples/reports/README.md) - [Product thesis](docs/product-thesis.md) +- [Vision plan](docs/vision-plan.md) - [MVP spec](docs/mvp-spec.md) - [Output contract](docs/output-contract.md) - [Tool landscape](docs/tool-landscape.md) diff --git a/docs/roadmap.md b/docs/roadmap.md index 551b6b8..f4e1aff 100644 --- a/docs/roadmap.md +++ b/docs/roadmap.md @@ -49,7 +49,40 @@ Capabilities: - ready Bioconda recipe metadata for the v0.2.0 update - benchmark evidence guidance for adoption decisions -## v0.3: Transcriptome Profile +## v0.3: Evidence And Assembly Gate + +Goal: + +```text +Make FastaGuard credible enough for pipeline authors to add as a default assembly gate. +``` + +Potential additions: + +- public evidence pack from local fixtures and public assemblies +- Bioconda and BioContainers v0.2 availability documented +- input checksum provenance +- clearer machine-readable threshold metadata +- assembly gate preset for common pipeline behavior +- clearer blocking vs follow-up recommendations + +## v0.4: Compare Mode + +Goal: + +```text +Make many FASTA files easy to rank, filter, and route. 
+``` + +Potential additions: + +- cross-file metrics table +- cohort-level outliers +- sample-to-sample summary +- batch pipeline reports +- combined JSON, TSV, HTML, and MultiQC outputs + +## v0.5: Transcriptome Profile Potential additions: @@ -59,7 +92,7 @@ Potential additions: - extreme GC outliers - isoform-heavy warning heuristics -## v0.4: Protein Profile +## v0.6: Protein Profile Potential additions: @@ -69,7 +102,7 @@ Potential additions: - low-complexity regions - suspicious nucleotide-looking proteins -## v0.5: Reference Panel Profile +## v0.7: Reference Panel Profile Potential additions: @@ -79,21 +112,6 @@ Potential additions: - panel consistency summaries - submission-readiness warnings -## v0.6: Compare Mode - -Example: - -```bash -fastaguard compare *.fa --profile assembly --out cohort_report.html -``` - -Potential additions: - -- cross-file metrics table -- cohort-level outliers -- sample-to-sample summary -- batch pipeline reports - ## Pipeline Integration Maturity Potential additions: diff --git a/docs/vision-plan.md b/docs/vision-plan.md new file mode 100644 index 0000000..9be159a --- /dev/null +++ b/docs/vision-plan.md @@ -0,0 +1,219 @@ +# FastaGuard Vision Plan + +## North Star + +FastaGuard should become the FASTA preflight operating system for modern +bioinformatics pipelines. + +That does not mean replacing FastQC, QUAST, BUSCO, BlobToolKit, CheckM, seqkit, +MultiQC, or annotation workflows. It means owning the layer before them: + +```text +Validate the FASTA. +Explain the red flags. +Emit a stable contract. +Route to the right downstream tools. +``` + +The mature product should feel boringly reliable to pipeline authors and +surprisingly useful to scientists: one fast command that tells them whether a +FASTA file is valid, sane, interpretable, and ready for heavier analysis. + +## Strategic Release Principle + +The release strategy is evidence before expansion. 
+ +FastaGuard should not rush into many biological profiles until the assembly +preflight contract is trusted. The product should earn adoption in this order: + +1. **Trust:** reproducible evidence, stable schemas, clear exit codes, installable packages. +2. **Integration:** Bioconda, BioContainers, MultiQC, Nextflow, Snakemake, Galaxy. +3. **Scale:** compare mode for many FASTA files and batch pipeline reports. +4. **Breadth:** transcriptome, protein, and reference-panel profiles. +5. **Intelligence:** optional local-metrics-only summaries, MCP/tool-agent interfaces, and workflow routing. + +This keeps the project from becoming a bag of heuristics. Each release should +make the contract more useful, more trusted, or more integrated. + +## Big Release Direction + +### v0.3: Evidence And Assembly Gate + +Goal: + +```text +Make FastaGuard credible enough for pipeline authors to add as a default assembly gate. +``` + +Priorities: + +- publish a small evidence pack from local fixtures and public assemblies +- document Bioconda and BioContainers v0.2 availability +- add input checksums to provenance +- add clearer machine-readable threshold metadata +- add an assembly gate preset for common pipeline behavior +- improve report sections that explain what should block downstream tools + +The v0.3 promise should be: + +```text +FastaGuard gives assembly pipelines a fast, explainable PASS/WARN/FAIL gate before expensive QC. +``` + +### v0.4: Compare Mode + +Goal: + +```text +Make many FASTA files easy to rank, filter, and route. +``` + +Compare mode should support: + +- cohort-level table across many FASTA files +- sample-to-sample summaries +- batch outlier detection +- combined HTML report +- combined JSON/TSV/MultiQC output +- stable machine-actionable ranking fields + +This is more strategically important than adding many profile-specific checks +too early, because pipeline authors often need to triage batches, not one file. 
+ +### v0.5: Transcriptome Profile + +Goal: + +```text +Extend the FASTA preflight contract to transcriptome assemblies. +``` + +Initial transcriptome checks should stay lightweight: + +- very short transcripts +- excessive duplicate transcript sequences +- polyA/polyT tail summaries +- GC outliers +- isoform-heavy warning heuristics + +FastaGuard should not claim transcriptome biological completeness. It should +route users to transcriptome-specific completeness and annotation tools when +needed. + +### v0.6: Protein Profile + +Goal: + +```text +Validate protein FASTA files before annotation, clustering, search, or database submission. +``` + +Initial protein checks: + +- invalid amino-acid symbols +- internal stop codons +- terminal stop codons +- low-complexity summaries +- suspicious nucleotide-looking proteins + +Protein mode should be strict about alphabet validity and careful about biology: +it should flag preflight problems, not infer functional correctness. + +### v0.7: Reference-Panel Profile + +Goal: + +```text +Make curated reference FASTA panels safer to maintain and distribute. +``` + +Initial reference-panel checks: + +- stricter ID normalization +- naming convention reports +- sequence uniqueness +- panel consistency summaries +- submission-readiness warnings + +This is useful for labs, databases, and core facilities maintaining reference +sets that many downstream workflows depend on. + +## Machine-Actionable Vision + +FastaGuard should be designed for humans, pipelines, and future tool-using +agents. + +Principles: + +- JSON remains the source of truth. +- HTML remains a human view. +- Machines should never scrape HTML or logs. +- Finding IDs must remain stable and documented. +- Every finding should expose severity, evidence, thresholds, actions, and scope. +- Optional generated summaries must be local-metrics-only and traceable back to structured fields. +- MCP or tool-server support should come after the CLI contract is stable. 
+ +The long-term machine-actionable direction: + +```text +fastaguard run sample.fa --json report.json +fastaguard compare *.fa --json cohort.json +agent reads schema + finding catalog + report +agent routes safely to QUAST, BUSCO, BlobToolKit, CheckM, seqkit, or annotation +``` + +The agent should know what FastaGuard can conclude, what it cannot conclude, +and which downstream tool is appropriate next. + +## Product Boundaries + +FastaGuard should remain fast and database-free by default. + +Do not make default FastaGuard depend on: + +- taxonomy databases +- large marker-gene databases +- internet access +- GPU inference +- external aligners + +Optional integrations can exist later, but the default product should stay a +single reliable preflight binary that works in constrained pipelines. + +## Adoption Strategy + +The project should optimize for maintainers and workflow authors. + +Required adoption qualities: + +- one-command install through Bioconda +- generated BioContainers image +- stable JSON schema +- deterministic outputs +- clear exit codes +- MultiQC compatibility +- Nextflow, nf-core, Snakemake, and Galaxy examples +- small public evidence pack +- clear release notes and migration notes + +The best way to become frequent in bioinformatics pipelines is not flashy AI. +It is being the boring, dependable first QC gate that saves expensive downstream +time. + +## Current Recommendation + +The next big release should not be a huge biology expansion yet. + +Recommended sequence: + +```text +v0.3: evidence pack + assembly gate + provenance checksums +v0.4: compare mode for many FASTA files +v0.5: transcriptome profile +v0.6: protein profile +v0.7: reference-panel profile +later: MCP/tool-agent interface and optional local summaries +``` + +This path gives FastaGuard the best chance to become a default tool: prove the +assembly gate first, then scale to batches, then expand profiles. 
diff --git a/tests/python/test_adoption_assets.py b/tests/python/test_adoption_assets.py index b9aeba3..f23ae28 100644 --- a/tests/python/test_adoption_assets.py +++ b/tests/python/test_adoption_assets.py @@ -416,6 +416,32 @@ def option_path(flag): self.assertGreater(case["elapsed_seconds"], 0) self.assertIn("command", case) + def test_deep_release_vision_is_documented_and_memorized(self): + vision = (ROOT / "docs" / "vision-plan.md").read_text() + memory = (ROOT / "AGENTS.md").read_text() + readme = (ROOT / "README.md").read_text() + + required_phrases = [ + "FASTA preflight operating system", + "evidence before expansion", + "assembly gate", + "compare mode", + "transcriptome", + "protein", + "reference-panel", + "MCP", + "machine-actionable", + "local-metrics-only", + ] + + for phrase in required_phrases: + with self.subTest(phrase=phrase): + self.assertIn(phrase, vision) + + self.assertIn("Deep Release Vision", memory) + self.assertIn("FASTA preflight operating system", memory) + self.assertIn("docs/vision-plan.md", readme) + def test_snakemake_wrapper_declares_bioconda_environment(self): environment = ( ROOT / "examples" / "snakemake" / "wrapper" / "environment.yaml" From 3668402b71551a4dab30abe732fecc7b8715b7cf Mon Sep 17 00:00:00 2001 From: Ehsan ESTAJI <71376358+ehsanestaji@users.noreply.github.com> Date: Wed, 27 May 2026 15:08:33 +0200 Subject: [PATCH 3/3] docs: mark v0.2 packaging live --- README.md | 19 +++++++++++------ docs/adoption-plan.md | 14 ++++++------- docs/packaging.md | 21 ++++++++++--------- docs/releases/v0.2.0.md | 9 ++++++-- docs/tool-landscape.md | 8 ++++--- examples/nf-core/README.md | 9 ++++++-- .../nf-core/modules/local/fastaguard/main.nf | 1 + examples/snakemake/wrapper/README.md | 6 ++++++ examples/snakemake/wrapper/environment.yaml | 2 +- packaging/bioconda/README.md | 13 +++++++----- tests/python/test_adoption_assets.py | 15 +++++++++++-- tests/python/test_release_metadata.py | 4 ++-- 12 files changed, 81 insertions(+), 40 
deletions(-) diff --git a/README.md b/README.md index fd3650d..160ae19 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,8 @@ tar -xzf fastaguard-v0.2.0-x86_64-unknown-linux-gnu.tar.gz ``` The v0.2.0 GitHub release binaries and source archive are published. Bioconda -may still serve v0.1.1 until the upstream v0.2.0 recipe update is merged. +serves v0.2.0 for Linux x86_64, Linux ARM64, macOS Intel, and macOS Apple +Silicon. Local development build: @@ -67,7 +68,7 @@ fastaguard --finding-catalog fastaguard --explain-finding high_n_rate ``` -Build and run the Docker image: +Build and run the local Docker image: ```bash docker build -t fastaguard:local . @@ -79,6 +80,12 @@ docker run --rm -v "$PWD:/data" fastaguard:local /data/sample.fa \ --multiqc /data/fastaguard_mqc.json ``` +Use the generated BioContainers image in workflow engines: + +```bash +docker pull quay.io/biocontainers/fastaguard:0.2.0--hfa8f182_0 +``` + Exit codes: ```text @@ -177,10 +184,10 @@ FastaGuard catches FASTA-level assembly problems before expensive assembly QC. ## Status -v0.2.0 is published on GitHub with Linux and macOS release binaries. The -Bioconda v0.2.0 recipe metadata is ready with the published source archive SHA; -Bioconda may still serve v0.1.1 until the upstream recipe update is merged. -BioContainers image availability is still pending confirmation. +v0.2.0 is published on GitHub with Linux and macOS release binaries. Bioconda +serves v0.2.0 for `linux-64`, `linux-aarch64`, `osx-64`, and `osx-arm64`. +BioContainers also publishes the pinned workflow image +`quay.io/biocontainers/fastaguard:0.2.0--hfa8f182_0`. The next internal milestone is the [v0.2 evidence pack](docs/evidence/fastaguard-v0.2-evidence.md): reproducible diff --git a/docs/adoption-plan.md b/docs/adoption-plan.md index 4b960a6..3623bdc 100644 --- a/docs/adoption-plan.md +++ b/docs/adoption-plan.md @@ -8,21 +8,21 @@ adding many new biological heuristics. 
Priority: ```text -Bioconda published -> BioContainers confirmation -> MultiQC plugin -> public benchmarks -> upstream workflow examples +Bioconda published -> BioContainers available -> MultiQC plugin -> public benchmarks -> upstream workflow examples ``` ## Phase 1: Package Goal: make installation natural for bioinformatics users. -Status: Bioconda is live for FastaGuard v0.1.1, and the v0.2.0 recipe update -is ready with the published GitHub source archive SHA. +Status: Bioconda is live for FastaGuard v0.2.0 on Linux and macOS x86_64/ARM64 +platforms. BioContainers publishes the pinned workflow image +`quay.io/biocontainers/fastaguard:0.2.0--hfa8f182_0`. - Keep GitHub release binaries working. - Keep Docker smoke tests passing. - Keep `packaging/bioconda/` aligned with the upstream Bioconda recipe. -- Confirm BioContainers image/tag publication after the Bioconda merge; - current candidate tags are not yet visible in the registry. +- Keep workflow examples pinned to the confirmed BioContainers image tag. Done when: @@ -31,8 +31,8 @@ mamba install -c conda-forge -c bioconda fastaguard fastaguard --schema ``` -works in a clean environment. This has been verified locally on macOS for -v0.1.1; repeat this check for v0.2.0 after the Bioconda update merges. +works in a clean environment, and workflow engines can pull the pinned +BioContainers image. ## Phase 2: Aggregate diff --git a/docs/packaging.md b/docs/packaging.md index 950c0b9..5d9140c 100644 --- a/docs/packaging.md +++ b/docs/packaging.md @@ -9,10 +9,9 @@ Bioconda -> GitHub release binaries -> Docker image -> BioContainers -> Homebrew ``` FastaGuard v0.2.0 is published on GitHub with Linux and macOS release binaries. -Bioconda currently publishes v0.1.1 until the upstream v0.2.0 recipe update is -merged. Docker remains useful for local smoke tests and pipeline containers, -while BioContainers should be confirmed after the Bioconda publication pipeline -catches up. 
+Bioconda serves v0.2.0 on Linux and macOS x86_64/ARM64 platforms. Docker remains +useful for local smoke tests, while BioContainers now provides the pinned +workflow image generated from the Bioconda package. ## Bioconda @@ -38,7 +37,7 @@ fastaguard --finding-catalog Current published package: -- Version: `0.1.1` +- Version: `0.2.0` - Platforms: `linux-64`, `linux-aarch64`, `osx-64`, `osx-arm64` - Package page: [anaconda.org/bioconda/fastaguard](https://anaconda.org/bioconda/fastaguard) @@ -141,12 +140,14 @@ The Docker image should stay boring: That makes it easy to run in Nextflow, Snakemake, Galaxy, and CI systems. -The Bioconda recipe has merged upstream. BioContainers should be confirmed -separately by checking the generated registry image/tag once it appears. That -path is preferable to maintaining a separate BioContainers Dockerfile unless -automated container publication proves unavailable. +The Bioconda recipe has merged upstream and generated a BioContainers image. +Use the pinned tag in workflow examples: -BioContainers image availability is still pending confirmation. +```bash +docker pull quay.io/biocontainers/fastaguard:0.2.0--hfa8f182_0 +``` + +That path is preferable to maintaining a separate BioContainers Dockerfile. ## MultiQC diff --git a/docs/releases/v0.2.0.md b/docs/releases/v0.2.0.md index 99de762..54e4c45 100644 --- a/docs/releases/v0.2.0.md +++ b/docs/releases/v0.2.0.md @@ -17,14 +17,19 @@ pipelines. ## Install -After the v0.2.0 Bioconda update merges, install it with: +Install the v0.2.0 Bioconda package with: ```bash mamba install -c conda-forge -c bioconda fastaguard ``` The v0.2.0 GitHub release binaries and source archive are published. Bioconda -may still serve v0.1.1 until the upstream recipe update is merged. +serves v0.2.0 for Linux and macOS x86_64/ARM64 platforms. 
BioContainers also +publishes the pinned workflow image: + +```bash +docker pull quay.io/biocontainers/fastaguard:0.2.0--hfa8f182_0 +``` ## Positioning diff --git a/docs/tool-landscape.md b/docs/tool-landscape.md index e99f38b..ad82794 100644 --- a/docs/tool-landscape.md +++ b/docs/tool-landscape.md @@ -51,9 +51,11 @@ Current product evidence: - Docker build and smoke test pass. - GitHub release workflow builds Linux and macOS binaries. - FastaGuard v0.2.0 is published on GitHub with Linux and macOS binaries. -- FastaGuard v0.1.1 is published on Bioconda for Linux and macOS platforms, - with the v0.2.0 recipe update ready for upstream merge. +- FastaGuard v0.2.0 is published on Bioconda for `linux-64`, + `linux-aarch64`, `osx-64`, and `osx-arm64`. - Clean Bioconda install has been smoke-tested with `fastaguard --schema`. +- BioContainers publishes + `quay.io/biocontainers/fastaguard:0.2.0--hfa8f182_0`. - JSON Schema validates committed golden reports. - Reports include bounded evidence records and suggested actions. - MultiQC custom-content JSON is emitted as `fastaguard_mqc.json`. @@ -67,7 +69,7 @@ Evidence still needed: - committed benchmark summaries from public assemblies - user feedback from real pipeline authors -- BioContainers image/tag confirmation +- broader public assembly evidence runs - official MultiQC module or packaged plugin - comparison examples showing what FastaGuard catches before QUAST/BUSCO/BlobToolKit diff --git a/examples/nf-core/README.md b/examples/nf-core/README.md index 961011b..8be540a 100644 --- a/examples/nf-core/README.md +++ b/examples/nf-core/README.md @@ -16,13 +16,18 @@ Emitted outputs: - `mqc` - `versions` -The module assumes `fastaguard` is available on `PATH`. The recommended install is now: +The module assumes `fastaguard` is available on `PATH` when run without a +container. 
The recommended install is: ```bash mamba install -c conda-forge -c bioconda fastaguard ``` -Once a BioContainers image is confirmed, the module can add a pinned container directive. +The local module also includes the pinned BioContainers image: + +```text +quay.io/biocontainers/fastaguard:0.2.0--hfa8f182_0 +``` Example include: diff --git a/examples/nf-core/modules/local/fastaguard/main.nf b/examples/nf-core/modules/local/fastaguard/main.nf index 9b6318a..b4726f8 100644 --- a/examples/nf-core/modules/local/fastaguard/main.nf +++ b/examples/nf-core/modules/local/fastaguard/main.nf @@ -1,6 +1,7 @@ process FASTAGUARD { tag "$meta.id" label 'process_low' + container 'quay.io/biocontainers/fastaguard:0.2.0--hfa8f182_0' input: tuple val(meta), path(fasta) diff --git a/examples/snakemake/wrapper/README.md b/examples/snakemake/wrapper/README.md index 81e44f8..3c63780 100644 --- a/examples/snakemake/wrapper/README.md +++ b/examples/snakemake/wrapper/README.md @@ -20,6 +20,12 @@ The wrapper also includes a Conda environment: snakemake -s Snakefile --cores 1 --use-conda ``` +For containerized workflow runs, use the pinned BioContainers image: + +```text +quay.io/biocontainers/fastaguard:0.2.0--hfa8f182_0 +``` + The wrapper emits: - `fastaguard_report.html` diff --git a/examples/snakemake/wrapper/environment.yaml b/examples/snakemake/wrapper/environment.yaml index 1f2903b..0c32f64 100644 --- a/examples/snakemake/wrapper/environment.yaml +++ b/examples/snakemake/wrapper/environment.yaml @@ -2,4 +2,4 @@ channels: - conda-forge - bioconda dependencies: - - fastaguard=0.1.1 + - fastaguard=0.2.0 diff --git a/packaging/bioconda/README.md b/packaging/bioconda/README.md index c39b0b0..d9f3876 100644 --- a/packaging/bioconda/README.md +++ b/packaging/bioconda/README.md @@ -1,6 +1,6 @@ # Bioconda Recipe -Upstream Bioconda currently publishes FastaGuard v0.1.1. +Upstream Bioconda currently publishes FastaGuard v0.2.0. 
The recipe has been merged into `bioconda/bioconda-recipes` as `recipes/fastaguard/`, and the current published package is available from @@ -10,11 +10,14 @@ Bioconda: mamba install -c conda-forge -c bioconda fastaguard ``` -BioContainers image/tag availability should be confirmed separately after -Bioconda publication. +BioContainers publishes the pinned workflow image: -This local recipe directory is ready for the FastaGuard v0.2.0 Bioconda update. -The v0.2.0 GitHub source archive is published and `meta.yaml` includes the real +```bash +docker pull quay.io/biocontainers/fastaguard:0.2.0--hfa8f182_0 +``` + +This local recipe directory mirrors the FastaGuard v0.2.0 Bioconda recipe. The +v0.2.0 GitHub source archive is published and `meta.yaml` includes the real archive SHA256. ## Local Checks diff --git a/tests/python/test_adoption_assets.py b/tests/python/test_adoption_assets.py index f23ae28..c4fd604 100644 --- a/tests/python/test_adoption_assets.py +++ b/tests/python/test_adoption_assets.py @@ -272,6 +272,9 @@ def test_bioconda_recipe_declares_binary_and_contract_tests(self): def test_workflow_docs_reference_bioconda_and_container_status(self): nfcore_readme = (ROOT / "examples" / "nf-core" / "README.md").read_text() + nfcore_module = ( + ROOT / "examples" / "nf-core" / "modules" / "local" / "fastaguard" / "main.nf" + ).read_text() snakemake_readme = ( ROOT / "examples" / "snakemake" / "wrapper" / "README.md" ).read_text() @@ -280,9 +283,17 @@ def test_workflow_docs_reference_bioconda_and_container_status(self): self.assertIn(install, nfcore_readme) self.assertIn(install, snakemake_readme) self.assertIn( - "Once a BioContainers image is confirmed, the module can add a pinned container directive.", + "quay.io/biocontainers/fastaguard:0.2.0--hfa8f182_0", nfcore_readme, ) + self.assertIn( + "quay.io/biocontainers/fastaguard:0.2.0--hfa8f182_0", + nfcore_module, + ) + self.assertIn( + "quay.io/biocontainers/fastaguard:0.2.0--hfa8f182_0", + snakemake_readme, + ) def 
test_benchmarking_docs_include_v0_2_evidence_topics(self): text = (ROOT / "docs" / "benchmarking.md").read_text() @@ -457,7 +468,7 @@ def test_snakemake_wrapper_declares_bioconda_environment(self): " - conda-forge", " - bioconda", "dependencies:", - " - fastaguard=0.1.1", + " - fastaguard=0.2.0", ], ) self.assertIn('conda: "environment.yaml"', snakefile.read_text()) diff --git a/tests/python/test_release_metadata.py b/tests/python/test_release_metadata.py index 98981cf..77ebefd 100644 --- a/tests/python/test_release_metadata.py +++ b/tests/python/test_release_metadata.py @@ -25,9 +25,9 @@ def test_v0_2_0_release_notes_exist(self): self.assertIn("FastaGuard v0.2.0", text) self.assertIn("Assembly Trust", text) self.assertIn("Pipeline Adoption", text) - self.assertIn("After the v0.2.0 Bioconda update merges", text) + self.assertIn("Install the v0.2.0 Bioconda package", text) self.assertIn("v0.2.0 GitHub release binaries and source archive", text) - self.assertIn("may still serve v0.1.1", text) + self.assertIn("quay.io/biocontainers/fastaguard:0.2.0--hfa8f182_0", text) def test_bioconda_recipe_has_publishable_v0_2_0_source_sha(self): cargo = tomllib.loads((ROOT / "Cargo.toml").read_text())