From abbb5c5e8636c5b0a4c683f5357868909b1060b0 Mon Sep 17 00:00:00 2001 From: Allen Lee Date: Wed, 29 Apr 2026 19:51:30 -0700 Subject: [PATCH 1/2] feat: add mocked out evaluation framework still needs work to be wired up against an agent + tools --- Makefile | 50 ++++++ evals/cross-skills.json | 148 ++++++++++++++++++ evals/schema/schema.json | 148 ++++++++++++++++++ scripts/aggregate_failures.py | 114 ++++++++++++++ scripts/validate_cross_skills.py | 121 ++++++++++++++ scripts/validate_evals_schema.py | 40 +++++ ...kills.py => validate_individual_skills.py} | 0 skills/document/evals.json | 37 +++-- skills/fair4rs/evals.json | 32 ++-- skills/hpc/evals.json | 26 ++- skills/ospool/evals.json | 26 ++- skills/peer-review/evals.json | 26 ++- 12 files changed, 697 insertions(+), 71 deletions(-) create mode 100644 Makefile create mode 100644 evals/cross-skills.json create mode 100644 evals/schema/schema.json create mode 100644 scripts/aggregate_failures.py create mode 100644 scripts/validate_cross_skills.py create mode 100644 scripts/validate_evals_schema.py rename scripts/{validate_skills.py => validate_individual_skills.py} (100%) diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..df58585 --- /dev/null +++ b/Makefile @@ -0,0 +1,50 @@ +# ---- config ---- +PYTHON ?= python +SCRIPTS := scripts +EVALS := evals + +CROSS_EVAL := $(EVALS)/cross-skills.json + +# ---- default ---- +.PHONY: all +all: validate-evals cross + +# ---- schema validation ---- +.PHONY: validate-evals +validate-evals: + $(PYTHON) $(SCRIPTS)/validate_evals_schema.py + +# ---- cross-skill evals ---- +.PHONY: cross +cross: + $(PYTHON) $(SCRIPTS)/validate_cross_skills.py $(CROSS_EVAL) + +# ---- per-skill evals (placeholder) ---- +# assumes future runner like: run_skill_evals.py +SKILLS := document fair4rs hpc ospool peer-review + +.PHONY: skills +skills: $(SKILLS) + +.PHONY: $(SKILLS) +$(SKILLS): + @echo "Running evals for $@" + $(PYTHON) $(SCRIPTS)/run_skill_evals.py $@ + +# ---- aggregate 
report ---- +.PHONY: report +report: + $(PYTHON) $(SCRIPTS)/aggregate_failures.py + +# ---- full pipeline ---- +.PHONY: full +full: validate-evals cross report + +# ---- CI Pipeline ---- +.PHONY: ci +ci: + @echo "=== Running CI pipeline ===" + $(MAKE) validate-evals + $(MAKE) cross + $(MAKE) report + @echo "=== CI completed ===" diff --git a/evals/cross-skills.json b/evals/cross-skills.json new file mode 100644 index 0000000..085712d --- /dev/null +++ b/evals/cross-skills.json @@ -0,0 +1,148 @@ +{ + "suite": "cross-skill-orchestration", + "evals": [ + { + "id": 201, + "type": "cross", + "skills_expected": ["document", "peer-review"], + "prompt": "Generate an ODD description for my ABM and then assess if it is ready for CoMSES submission.", + "expected_behavior": "document → peer-review", + "failure_modes": [ + "missing_step", + "wrong_order", + "incomplete_execution" + ] + }, + { + "id": 202, + "type": "cross", + "skills_expected": ["fair4rs", "peer-review"], + "prompt": "Prepare my model for publication and tell me if it passes CoMSES review.", + "expected_behavior": "fair4rs → peer-review", + "failure_modes": [ + "missing_step", + "wrong_order", + "incomplete_execution" + ] + }, + { + "id": 203, + "type": "cross", + "skills_expected": ["hpc", "ospool"], + "prompt": "I need to run large parameter sweeps. 
Should I use Slurm or OSPool, and generate the appropriate scripts?", + "expected_behavior": "select ONE of hpc or ospool", + "failure_modes": [ + "wrong_skill", + "boundary_violation", + "no_planning" + ] + }, + { + "id": 204, + "type": "cross", + "skills_expected": ["document", "fair4rs"], + "prompt": "Document my model using ODD and prepare it for archival with proper metadata.", + "expected_behavior": "document → fair4rs", + "failure_modes": [ + "missing_step", + "wrong_order", + "incomplete_execution" + ] + }, + { + "id": 205, + "type": "cross", + "skills_expected": ["peer-review", "hpc"], + "prompt": "Review my model and suggest how to run it efficiently on an HPC cluster.", + "expected_behavior": "peer-review → hpc", + "failure_modes": [ + "missing_step", + "wrong_order", + "scope_creep" + ] + }, + { + "id": 206, + "type": "cross", + "skills_expected": ["ospool", "fair4rs"], + "prompt": "Run my model at scale and make sure it's properly archived and citable.", + "expected_behavior": "ospool → fair4rs", + "failure_modes": [ + "missing_step", + "wrong_order", + "incomplete_execution" + ] + }, + { + "id": 207, + "type": "cross", + "skills_expected": ["document", "hpc", "peer-review"], + "prompt": "Create ODD documentation, run the model on HPC, and evaluate whether it's ready for submission.", + "expected_behavior": "document → hpc → peer-review", + "failure_modes": [ + "missing_step", + "wrong_order", + "incomplete_execution" + ] + }, + { + "id": 208, + "type": "cross", + "skills_expected": ["document", "fair4rs", "peer-review"], + "prompt": "Prepare my model for publication including documentation, metadata, and readiness assessment.", + "expected_behavior": "document → fair4rs → peer-review", + "failure_modes": [ + "missing_step", + "wrong_order", + "no_planning" + ] + }, + { + "id": 209, + "type": "cross", + "skills_expected": ["hpc", "ospool"], + "prompt": "I have 10,000 independent simulations. 
I might use Slurm or OSPool—what's better and how do I set it up?", + "expected_behavior": "select ONE system and justify", + "failure_modes": [ + "wrong_skill", + "boundary_violation", + "no_planning" + ] + }, + { + "id": 210, + "type": "cross", + "skills_expected": ["peer-review", "fair4rs"], + "prompt": "Does my repository meet CoMSES criteria, and if not, what FAIR improvements are needed?", + "expected_behavior": "peer-review → fair4rs", + "failure_modes": [ + "missing_step", + "wrong_order", + "incomplete_execution" + ] + }, + { + "id": 211, + "type": "cross-adversarial", + "skills_expected": ["document", "fair4rs", "hpc", "ospool", "peer-review"], + "prompt": "Do everything needed to make my model complete, scalable, publishable, and reviewed.", + "expected_behavior": "multi-stage plan + execution", + "failure_modes": [ + "no_planning", + "missing_step", + "incomplete_execution" + ] + }, + { + "id": 212, + "type": "cross-adversarial", + "skills_expected": [], + "prompt": "Explain how to run simulations and publish them in general terms", + "expected_behavior": "no skill activation", + "failure_modes": [ + "over_trigger", + "redundant_actions" + ] + } + ] +} \ No newline at end of file diff --git a/evals/schema/schema.json b/evals/schema/schema.json new file mode 100644 index 0000000..4571db8 --- /dev/null +++ b/evals/schema/schema.json @@ -0,0 +1,148 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "required": [ + "evals" + ], + "properties": { + "skill_name": { + "type": "string" + }, + "description": { + "type": "string" + }, + "suite": { + "type": "string" + }, + "evals": { + "type": "array", + "minItems": 1, + "items": { + "type": "object", + "required": [ + "id", + "prompt" + ], + "properties": { + "id": { + "type": "integer" + }, + "type": { + "type": "string", + "enum": [ + "core", + "adversarial", + "cross", + "cross-adversarial" + ] + }, + "prompt": { + "type": "string", + "minLength": 5 + }, + "should_trigger": { 
+ "type": "boolean" + }, + "expected_behavior": { + "type": "string" + }, + "expected_output": { + "type": "string" + }, + "success_criteria": { + "type": "array", + "items": { + "type": "string" + } + }, + "skills_expected": { + "type": "array", + "items": { + "type": "string" + } + }, + "failure_modes": { + "type": "array", + "minItems": 1, + "items": { + "type": "string", + "enum": [ + "over_trigger", + "under_trigger", + "wrong_skill", + "wrong_order", + "scope_creep", + "hallucination", + "keyword_trap", + "boundary_violation", + "incomplete_execution", + "missing_step", + "invalid_output", + "no_planning", + "redundant_actions" + ] + } + }, + "notes": { + "type": "string" + } + }, + "additionalProperties": false, + "allOf": [ + { + "if": { + "properties": { + "type": { + "enum": [ + "adversarial", + "cross-adversarial" + ] + } + } + }, + "then": { + "required": [ + "failure_modes" + ] + } + }, + { + "if": { + "properties": { + "type": { + "enum": [ + "core", + "adversarial" + ] + } + } + }, + "then": { + "required": [ + "should_trigger" + ] + } + }, + { + "if": { + "properties": { + "type": { + "enum": [ + "cross", + "cross-adversarial" + ] + } + } + }, + "then": { + "required": [ + "skills_expected" + ] + } + } + ] + } + } + }, + "additionalProperties": false +} \ No newline at end of file diff --git a/scripts/aggregate_failures.py b/scripts/aggregate_failures.py new file mode 100644 index 0000000..9893393 --- /dev/null +++ b/scripts/aggregate_failures.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 + +import json +from collections import Counter, defaultdict +from pathlib import Path +import argparse + + +def load_all_evals(): + eval_files = list(Path("skills").rglob("evals.json")) + data = [] + + for f in eval_files: + try: + content = json.loads(f.read_text()) + skill = content.get("skill_name", f.parent.name) + for e in content.get("evals", []): + e["_skill"] = skill + data.append(e) + except Exception as e: + print(f"⚠️ Failed to load {f}: {e}") + + return 
data + + +def aggregate_expected_failures(evals): + """Counts declared failure_modes (design-time coverage)""" + global_counts = Counter() + per_skill = defaultdict(Counter) + + for e in evals: + if e.get("type") in ["adversarial", "cross-adversarial"]: + for fm in e.get("failure_modes", []): + global_counts[fm] += 1 + per_skill[e["_skill"]][fm] += 1 + + return global_counts, per_skill + + +def load_results(path): + """ + Expected format: + [ + { + "id": 101, + "skill": "document", + "passed": false, + "failure_modes": ["over_trigger"] + } + ] + """ + return json.loads(Path(path).read_text()) + + +def aggregate_actual_failures(results): + """Counts observed failures from CI run""" + counts = Counter() + per_skill = defaultdict(Counter) + + for r in results: + if not r.get("passed", True): + for fm in r.get("failure_modes", []): + counts[fm] += 1 + per_skill[r["skill"]][fm] += 1 + + return counts, per_skill + + +def print_table(title, counts, per_skill): + total = sum(counts.values()) + + print(f"\n=== {title} ===\n") + + if total == 0: + print("No data.\n") + return + + print("Global:") + for k, v in counts.most_common(): + pct = (v / total) * 100 + print(f" {k:22} {v:4} ({pct:5.1f}%)") + + print("\nPer skill:") + for skill, counter in per_skill.items(): + print(f"\n [{skill}]") + s_total = sum(counter.values()) + for k, v in counter.most_common(): + pct = (v / s_total) * 100 if s_total else 0 + print(f" {k:20} {v:4} ({pct:5.1f}%)") + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--results", help="Path to CI results JSON") + args = parser.parse_args() + + evals = load_all_evals() + + # design-time coverage + expected_counts, expected_per_skill = aggregate_expected_failures(evals) + print_table("Expected Failure Coverage (from eval definitions)", + expected_counts, expected_per_skill) + + # runtime failures (optional) + if args.results: + results = load_results(args.results) + actual_counts, actual_per_skill = 
aggregate_actual_failures(results) + + print_table("Observed Failures (from CI run)", + actual_counts, actual_per_skill) + + +if __name__ == "__main__": + main() diff --git a/scripts/validate_cross_skills.py b/scripts/validate_cross_skills.py new file mode 100644 index 0000000..755f5d4 --- /dev/null +++ b/scripts/validate_cross_skills.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 + +import json +import sys +from pathlib import Path + +OUTPUT_FILE = "results_cross.json" + + +# ---- mock agent (replace later with real trace) ---- +def mock_agent_run(prompt): + invoked = [] + + p = prompt.lower() + + if "odd" in p or "documentation" in p: + invoked.append("document") + if "publish" in p or "citable" in p or "metadata" in p: + invoked.append("fair4rs") + if "slurm" in p or "hpc" in p: + invoked.append("hpc") + if "ospool" in p or "htcondor" in p or "osg" in p: + invoked.append("ospool") + if "review" in p or "ready" in p or "submission" in p: + invoked.append("peer-review") + + return invoked + + +# ---- evaluation logic ---- +def evaluate_case(e): + prompt = e["prompt"] + expected = e.get("skills_expected", []) + invoked = mock_agent_run(prompt) + + failures = [] + + # ---- skill selection ---- + if expected: + missing = set(expected) - set(invoked) + extra = set(invoked) - set(expected) + + if missing: + failures.append("missing_step") + + if extra: + failures.append("boundary_violation") + + else: + # should not trigger any skills + if invoked: + failures.append("over_trigger") + + # ---- ordering (simple heuristic) ---- + if expected and len(expected) > 1: + if invoked[:len(expected)] != expected: + failures.append("wrong_order") + + # ---- planning (multi-step prompts) ---- + if len(expected) >= 3 and len(invoked) < len(expected): + failures.append("no_planning") + + # ---- completeness ---- + if expected and len(invoked) < len(expected): + failures.append("incomplete_execution") + + # dedupe + failures = list(set(failures)) + + passed = len(failures) == 0 + + 
return { + "id": e["id"], + "type": e["type"], + "passed": passed, + "expected": expected, + "invoked": invoked, + "failure_modes": failures, + } + + +# ---- main ---- +def main(path): + data = json.loads(Path(path).read_text()) + + results = [] + passed = 0 + + for e in data["evals"]: + r = evaluate_case(e) + results.append(r) + + print(f"\nEval {e['id']}") + print(f"Prompt: {e['prompt']}") + print(f"Expected: {r['expected']}") + print(f"Invoked: {r['invoked']}") + print(f"Failures: {r['failure_modes']}") + + if r["passed"]: + print("✅ PASS") + passed += 1 + else: + print("❌ FAIL") + + # write results for aggregation + Path(OUTPUT_FILE).write_text(json.dumps(results, indent=2)) + + total = len(results) + print(f"\nSummary: {passed}/{total} passed") + + # fail CI if any fail + if passed < total: + sys.exit(1) + + +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: validate_cross_skills.py ") + sys.exit(1) + + main(sys.argv[1]) \ No newline at end of file diff --git a/scripts/validate_evals_schema.py b/scripts/validate_evals_schema.py new file mode 100644 index 0000000..c8986f1 --- /dev/null +++ b/scripts/validate_evals_schema.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 + +import json +import sys +from pathlib import Path + +import jsonschema + +SCHEMA_PATH = Path("evals/schema/schema.json") + + +def load_schema(): + return json.loads(SCHEMA_PATH.read_text()) + + +def validate_file(path, schema): + data = json.loads(path.read_text()) + try: + jsonschema.validate(instance=data, schema=schema) + print(f"✅ {path} valid") + except jsonschema.ValidationError as e: + print(f"❌ {path} invalid") + print(f" → {e.message}") + sys.exit(1) + + +def main(): + schema = load_schema() + + # validate all evals.json in skills/ + for path in Path("skills").rglob("evals.json"): + validate_file(path, schema) + + # validate cross-skill evals + for path in Path("evals").glob("*.json"): + validate_file(path, schema) + + +if __name__ == "__main__": + main() diff --git 
a/scripts/validate_skills.py b/scripts/validate_individual_skills.py similarity index 100% rename from scripts/validate_skills.py rename to scripts/validate_individual_skills.py diff --git a/skills/document/evals.json b/skills/document/evals.json index f30a986..ef6f6df 100644 --- a/skills/document/evals.json +++ b/skills/document/evals.json @@ -93,89 +93,102 @@ "should_trigger": false, "expected_output": "Skill does not activate" }, + { "id": 101, "type": "adversarial", "prompt": "Explain the ODD protocol briefly for a presentation slide", "should_trigger": false, - "expected_output": "Informational only" + "expected_output": "Informational only", + "failure_modes": ["keyword_trap", "over_trigger"] }, { "id": 102, "type": "adversarial", "prompt": "Document my simulation model for a paper", "should_trigger": false, - "expected_output": "Does not trigger without ABM specificity" + "expected_output": "Does not trigger without ABM specificity", + "failure_modes": ["over_trigger", "wrong_skill"] }, { "id": 103, "type": "adversarial", "prompt": "I need a structured description of agents, their behaviors, and environment interactions for publication", "should_trigger": true, - "expected_output": "Triggers ODD structuring implicitly" + "expected_output": "Triggers ODD structuring implicitly", + "failure_modes": ["under_trigger"] }, { "id": 104, "type": "adversarial", "prompt": "Write documentation for my discrete-event simulation model", "should_trigger": false, - "expected_output": "Does not trigger" + "expected_output": "Does not trigger", + "failure_modes": ["over_trigger", "boundary_violation"] }, { "id": 105, "type": "adversarial", "prompt": "Generate an ODD description for my model (details not provided)", "should_trigger": true, - "expected_output": "Requests clarification, no hallucination" + "expected_output": "Requests clarification, no hallucination", + "failure_modes": ["hallucination"] }, { "id": 106, "type": "adversarial", "prompt": "Convert my model 
description into a README with sections", "should_trigger": false, - "expected_output": "Does not trigger" + "expected_output": "Does not trigger", + "failure_modes": ["over_trigger", "boundary_violation"] }, { "id": 107, "type": "adversarial", "prompt": "List the entities and variables in my agent-based model", "should_trigger": false, - "expected_output": "Partial task, should not trigger" + "expected_output": "Partial task, should not trigger", + "failure_modes": ["over_trigger", "incomplete_execution"] }, { "id": 108, "type": "adversarial", "prompt": "Fix grammar in my ODD description", "should_trigger": false, - "expected_output": "Editing task only" + "expected_output": "Editing task only", + "failure_modes": ["over_trigger", "scope_creep"] }, { "id": 109, "type": "adversarial", "prompt": "Summarize my ABM and optionally format it using ODD if useful", "should_trigger": true, - "expected_output": "Triggers and prefers ODD structuring" + "expected_output": "Triggers and prefers ODD structuring", + "failure_modes": ["wrong_order", "incomplete_execution"] }, { "id": 110, "type": "adversarial", "prompt": "Help me structure a formal agent-based model description using standard protocols", "should_trigger": true, - "expected_output": "Triggers despite missing keyword" + "expected_output": "Triggers despite missing keyword", + "failure_modes": ["under_trigger"] }, { "id": 111, "type": "adversarial", "prompt": "Write a UML diagram description for my agent-based model", "should_trigger": false, - "expected_output": "Different formalism" + "expected_output": "Different formalism", + "failure_modes": ["over_trigger", "boundary_violation"] }, { "id": 112, "type": "adversarial", "prompt": "Generate FAIR metadata for my model repository", "should_trigger": false, - "expected_output": "Different skill domain" + "expected_output": "Different skill domain", + "failure_modes": ["wrong_skill", "boundary_violation"] } ] } \ No newline at end of file diff --git 
a/skills/fair4rs/evals.json b/skills/fair4rs/evals.json index 4b36d30..b346d71 100644 --- a/skills/fair4rs/evals.json +++ b/skills/fair4rs/evals.json @@ -56,46 +56,44 @@ "type": "core", "prompt": "How do I make my research software reusable and properly archived for long-term access?", "should_trigger": true, - "expected_output": "FAIR4RS-aligned guidance (archival, metadata, licensing)", - "notes": "implicit trigger without FAIR keyword" + "expected_output": "FAIR4RS-aligned guidance (archival, metadata, licensing)" }, { "id": 9, "type": "core", "prompt": "Improve my existing CITATION.cff to meet best practices and align with my repository metadata", "should_trigger": true, - "expected_output": "Refined CITATION.cff with consistency checks", - "notes": "refinement task" + "expected_output": "Refined CITATION.cff with consistency checks" }, { "id": 10, "type": "core", "prompt": "Validate whether my repository metadata meets FAIR principles", "should_trigger": true, - "expected_output": "FAIR compliance assessment with gaps identified", - "notes": "validation-only task" + "expected_output": "FAIR compliance assessment with gaps identified" }, { "id": 11, "type": "core", "prompt": "Package my code for PyPI distribution", "should_trigger": false, - "expected_output": "Skill does not activate; packaging ≠ FAIR publication" + "expected_output": "Skill does not activate" }, { "id": 12, "type": "core", "prompt": "Write documentation for my scientific software", "should_trigger": false, - "expected_output": "Skill does not activate; generic docs" + "expected_output": "Skill does not activate" }, + { "id": 101, "type": "adversarial", "prompt": "Explain what FAIR principles are", "should_trigger": false, "expected_output": "Informational only", - "failure_mode": "keyword trap" + "failure_modes": ["keyword_trap", "over_trigger"] }, { "id": 102, @@ -103,7 +101,7 @@ "prompt": "Make my project FAIR", "should_trigger": true, "expected_output": "Clarifies scope and applies FAIR4RS 
workflow", - "failure_mode": "too vague → should still trigger" + "failure_modes": ["under_trigger", "no_planning"] }, { "id": 103, @@ -111,7 +109,7 @@ "prompt": "Generate metadata for my dataset", "should_trigger": false, "expected_output": "Does not trigger; data ≠ software", - "failure_mode": "confuses FAIR data vs FAIR software" + "failure_modes": ["wrong_skill", "boundary_violation"] }, { "id": 104, @@ -119,7 +117,7 @@ "prompt": "I need my code to be cited properly and archived with a DOI", "should_trigger": true, "expected_output": "Triggers FAIR4RS workflow", - "failure_mode": "implicit trigger" + "failure_modes": ["under_trigger"] }, { "id": 105, @@ -127,15 +125,15 @@ "prompt": "Fix formatting issues in my CITATION.cff", "should_trigger": false, "expected_output": "Does not trigger; editing only", - "failure_mode": "over-trigger on keyword" + "failure_modes": ["over_trigger", "scope_creep"] }, { "id": 106, "type": "adversarial", "prompt": "Write an ODD protocol description for my model and prepare it for publication", "should_trigger": false, - "expected_output": "Should defer to ODD skill first", - "failure_mode": "cross-skill conflict" + "expected_output": "Should defer to document skill first", + "failure_modes": ["wrong_skill", "boundary_violation"] }, { "id": 107, @@ -143,7 +141,7 @@ "prompt": "Summarize my repository for a blog post", "should_trigger": false, "expected_output": "Does not trigger", - "failure_mode": "wrong intent" + "failure_modes": ["over_trigger", "scope_creep"] }, { "id": 108, @@ -151,7 +149,7 @@ "prompt": "Ensure my software release is reproducible, citable, and archived with proper metadata", "should_trigger": true, "expected_output": "Triggers FAIR4RS workflow", - "failure_mode": "implicit multi-goal phrasing" + "failure_modes": ["under_trigger", "no_planning"] } ] } \ No newline at end of file diff --git a/skills/hpc/evals.json b/skills/hpc/evals.json index 8a86bf7..2170005 100644 --- a/skills/hpc/evals.json +++ 
b/skills/hpc/evals.json @@ -42,24 +42,21 @@ "type": "core", "prompt": "How do I run my simulation efficiently on a Slurm cluster?", "should_trigger": true, - "expected_output": "Guidance plus Slurm script scaffold", - "notes": "implicit trigger" + "expected_output": "Guidance plus Slurm script scaffold" }, { "id": 7, "type": "core", "prompt": "Fix my Slurm script that is failing with node allocation errors", "should_trigger": true, - "expected_output": "Debugged Slurm script with corrected directives", - "notes": "refinement/debug task" + "expected_output": "Debugged Slurm script with corrected directives" }, { "id": 8, "type": "core", "prompt": "Validate whether my Slurm job script is configured correctly", "should_trigger": true, - "expected_output": "Validation report with suggested fixes", - "notes": "validation-only task" + "expected_output": "Validation report with suggested fixes" }, { "id": 9, @@ -75,13 +72,14 @@ "should_trigger": false, "expected_output": "Does not activate; containerization task" }, + { "id": 101, "type": "adversarial", "prompt": "Explain what Slurm is", "should_trigger": false, "expected_output": "Informational only", - "failure_mode": "keyword trap" + "failure_modes": ["keyword_trap", "over_trigger"] }, { "id": 102, @@ -89,7 +87,7 @@ "prompt": "Run my job on a cluster", "should_trigger": true, "expected_output": "Clarifies scheduler and produces Slurm solution", - "failure_mode": "vague prompt should still trigger" + "failure_modes": ["under_trigger", "no_planning"] }, { "id": 103, @@ -97,7 +95,7 @@ "prompt": "Generate a PBS job script for my simulation", "should_trigger": false, "expected_output": "Does not trigger; different scheduler", - "failure_mode": "scheduler confusion" + "failure_modes": ["wrong_skill", "boundary_violation"] }, { "id": 104, @@ -105,7 +103,7 @@ "prompt": "Optimize my Slurm job to reduce runtime and queue wait", "should_trigger": true, "expected_output": "Improved resource configuration and scheduling guidance", 
- "failure_mode": "non-generation HPC optimization task" + "failure_modes": ["under_trigger", "incomplete_execution"] }, { "id": 105, @@ -113,7 +111,7 @@ "prompt": "Fix formatting in my Slurm script", "should_trigger": false, "expected_output": "Does not trigger; superficial edit", - "failure_mode": "over-trigger on keyword" + "failure_modes": ["over_trigger", "scope_creep"] }, { "id": 106, @@ -121,7 +119,7 @@ "prompt": "Run my simulation on Kubernetes", "should_trigger": false, "expected_output": "Does not trigger; different infrastructure", - "failure_mode": "infra confusion" + "failure_modes": ["wrong_skill", "boundary_violation"] }, { "id": 107, @@ -129,7 +127,7 @@ "prompt": "Create a Slurm job script but also prepare FAIR metadata for publication", "should_trigger": true, "expected_output": "Handles Slurm portion; does not overreach into FAIR", - "failure_mode": "cross-skill boundary" + "failure_modes": ["scope_creep", "boundary_violation"] }, { "id": 108, @@ -137,7 +135,7 @@ "prompt": "I need to run 10,000 simulations with different parameters efficiently", "should_trigger": true, "expected_output": "Suggests job arrays or batching strategy", - "failure_mode": "implicit array use case" + "failure_modes": ["under_trigger", "missing_step"] } ] } \ No newline at end of file diff --git a/skills/ospool/evals.json b/skills/ospool/evals.json index 46d6ded..8315a02 100644 --- a/skills/ospool/evals.json +++ b/skills/ospool/evals.json @@ -42,24 +42,21 @@ "type": "core", "prompt": "How can I run thousands of independent simulations efficiently across distributed resources?", "should_trigger": true, - "expected_output": "HTCondor-based high-throughput strategy", - "notes": "implicit trigger" + "expected_output": "HTCondor-based high-throughput strategy" }, { "id": 7, "type": "core", "prompt": "Fix my HTCondor submit file that is failing to transfer input files", "should_trigger": true, - "expected_output": "Debugged submit file with corrected transfer directives", - 
"notes": "refinement/debug" + "expected_output": "Debugged submit file with corrected transfer directives" }, { "id": 8, "type": "core", "prompt": "Validate whether my HTCondor workflow is configured correctly", "should_trigger": true, - "expected_output": "Validation report with configuration issues and fixes", - "notes": "validation-only" + "expected_output": "Validation report with configuration issues and fixes" }, { "id": 9, @@ -75,13 +72,14 @@ "should_trigger": false, "expected_output": "Does not activate; HPC domain" }, + { "id": 101, "type": "adversarial", "prompt": "Explain what HTCondor is", "should_trigger": false, "expected_output": "Informational only", - "failure_mode": "keyword trap" + "failure_modes": ["keyword_trap", "over_trigger"] }, { "id": 102, @@ -89,7 +87,7 @@ "prompt": "Run my simulation on OSG", "should_trigger": true, "expected_output": "Clarifies workflow and produces HTCondor solution", - "failure_mode": "vague but correct domain" + "failure_modes": ["under_trigger", "no_planning"] }, { "id": 103, @@ -97,7 +95,7 @@ "prompt": "Generate a PBS job script for my distributed simulation", "should_trigger": false, "expected_output": "Does not activate", - "failure_mode": "scheduler confusion" + "failure_modes": ["wrong_skill", "boundary_violation"] }, { "id": 104, @@ -105,7 +103,7 @@ "prompt": "I need to run 20,000 independent jobs with different parameters", "should_trigger": true, "expected_output": "HTCondor parameter sweep or DAG strategy", - "failure_mode": "implicit HTC scaling case" + "failure_modes": ["under_trigger", "missing_step"] }, { "id": 105, @@ -113,7 +111,7 @@ "prompt": "Fix formatting issues in my HTCondor submit file", "should_trigger": false, "expected_output": "Does not activate; superficial edit", - "failure_mode": "over-trigger on keyword" + "failure_modes": ["over_trigger", "scope_creep"] }, { "id": 106, @@ -121,7 +119,7 @@ "prompt": "Run my simulation on a Kubernetes cluster", "should_trigger": false, "expected_output": 
"Does not activate", - "failure_mode": "infrastructure confusion" + "failure_modes": ["wrong_skill", "boundary_violation"] }, { "id": 107, @@ -129,7 +127,7 @@ "prompt": "Prepare my simulation for publication and also generate HTCondor jobs", "should_trigger": true, "expected_output": "Handles HTCondor part only; does not overreach into FAIR", - "failure_mode": "cross-skill boundary" + "failure_modes": ["scope_creep", "boundary_violation"] }, { "id": 108, @@ -137,7 +135,7 @@ "prompt": "My jobs keep getting evicted on OSPool. How should I handle this?", "should_trigger": true, "expected_output": "Checkpointing/restart strategy and job robustness guidance", - "failure_mode": "non-generation operational issue" + "failure_modes": ["under_trigger", "incomplete_execution"] } ] } \ No newline at end of file diff --git a/skills/peer-review/evals.json b/skills/peer-review/evals.json index 2c4119e..1fd0faf 100644 --- a/skills/peer-review/evals.json +++ b/skills/peer-review/evals.json @@ -63,24 +63,21 @@ "type": "core", "prompt": "Is my computational model repository ready for publication?", "should_trigger": true, - "expected_output": "Triggers full readiness review", - "notes": "implicit trigger" + "expected_output": "Triggers full readiness review" }, { "id": 10, "type": "core", "prompt": "Re-evaluate my model after I fixed documentation issues and added tests.", "should_trigger": true, - "expected_output": "Updated review reflecting improvements", - "notes": "refinement / re-review" + "expected_output": "Updated review reflecting improvements" }, { "id": 11, "type": "core", "prompt": "Validate whether my model meets CoMSES submission criteria", "should_trigger": true, - "expected_output": "Checklist-based validation report", - "notes": "validation-only" + "expected_output": "Checklist-based validation report" }, { "id": 12, @@ -96,13 +93,14 @@ "should_trigger": false, "expected_output": "Does not activate; document skill domain" }, + { "id": 101, "type": "adversarial", 
"prompt": "Explain what peer review is", "should_trigger": false, "expected_output": "Informational only", - "failure_mode": "keyword trap" + "failure_modes": ["keyword_trap", "over_trigger"] }, { "id": 102, @@ -110,7 +108,7 @@ "prompt": "Check if my repository is good", "should_trigger": true, "expected_output": "Clarifies scope and performs structured review", - "failure_mode": "vague intent" + "failure_modes": ["under_trigger", "no_planning"] }, { "id": 103, @@ -118,7 +116,7 @@ "prompt": "Review my code for performance optimizations", "should_trigger": false, "expected_output": "Does not activate; performance tuning is out of scope", - "failure_mode": "scope creep" + "failure_modes": ["over_trigger", "scope_creep"] }, { "id": 104, @@ -126,7 +124,7 @@ "prompt": "Evaluate my model and generate a Slurm script to run it", "should_trigger": true, "expected_output": "Performs review only; does not generate HPC artifacts", - "failure_mode": "cross-skill contamination" + "failure_modes": ["scope_creep", "boundary_violation"] }, { "id": 105, @@ -134,7 +132,7 @@ "prompt": "Fix grammar issues in my model documentation", "should_trigger": false, "expected_output": "Does not activate; editing task", - "failure_mode": "over-trigger on documentation" + "failure_modes": ["over_trigger", "scope_creep"] }, { "id": 106, @@ -142,7 +140,7 @@ "prompt": "My repository runs but has no documentation or instructions. 
Is it acceptable?", "should_trigger": true, "expected_output": "Fails due to missing required documentation criteria", - "failure_mode": "required criteria enforcement" + "failure_modes": ["invalid_output", "incomplete_execution"] }, { "id": 107, @@ -150,7 +148,7 @@ "prompt": "Assess whether my repository meets FAIR principles and CoMSES review criteria", "should_trigger": true, "expected_output": "Focuses on CoMSES review; may reference FAIR but does not replace FAIR skill", - "failure_mode": "multi-framework confusion" + "failure_modes": ["boundary_violation", "scope_creep"] }, { "id": 108, @@ -158,7 +156,7 @@ "prompt": "Give me a quick yes/no if my model is good enough", "should_trigger": true, "expected_output": "Produces full structured review before binary decision", - "failure_mode": "shortcut temptation" + "failure_modes": ["incomplete_execution", "no_planning"] } ] } \ No newline at end of file From c01ed3c224f2a5818cdefc1b2dbdd22db8dd0dea Mon Sep 17 00:00:00 2001 From: Allen Lee Date: Sat, 2 May 2026 19:56:43 -0700 Subject: [PATCH 2/2] fix: sync docs --- CONTRIBUTING.md | 71 ++++++++++++++++++++++++++++++---------------- README.md | 71 ++++++++++++++++++++++++++++++++-------------- docs/VALIDATION.md | 25 ++++++++++------ 3 files changed, 112 insertions(+), 55 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ee631b2..0c47712 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,6 +1,6 @@ -# Contributing Skills to COMSES +# Contributing Skills to CoMSES -Thank you for contributing to this skills repository! This guide walks you through the process of creating, testing, and submitting skills for computational modelers. +Thank you for contributing to this skills repository! This guide walks you through the process of creating, testing, and submitting skills for our community. ## Table of Contents @@ -15,16 +15,15 @@ Thank you for contributing to this skills repository! 
This guide walks you throu ## Before You Start - Familiarize yourself with the [Agent Skills specification](https://agentskills.io) -- Review existing skills in `skills/` to understand the pattern -- Copy [docs/SKILL-TEMPLATE.md](docs/SKILL-TEMPLATE.md) as your starting point -- Ensure your skill addresses a concrete pain point for computational modelers -- Confirm your skill does NOT substantially overlap with existing skills +- Read [docs/agent-skills-creation-reference.md](docs/agent-skills-creation-reference.md). This is the canonical authoring guide for this repository. +- Review existing skills in `skills/` to check for overlap and assess fit / appropriateness +- Use `/create-skill` if your coding agent provides it, or manually copy [docs/SKILL-TEMPLATE.md](docs/SKILL-TEMPLATE.md) into a new skill directory ## Skill Creation Workflow ### 1. Plan Your Skill -Answer these questions before writing: +Answer these questions: - **What problem does it solve?** (e.g., "Modelers struggle to document ODD+2 protocols manually") - **When should the coding agent use it?** (e.g., "When user has model code and needs narrative documentation") @@ -34,21 +33,24 @@ Answer these questions before writing: ### 2. Create Your Skill Folder -Run `/create-skill <skill-name>` in your coding agent. This scaffolds `skills/<skill-name>/SKILL.md` from [docs/SKILL-TEMPLATE.md](docs/SKILL-TEMPLATE.md) with placeholders filled in, and generates a starter `evals.json`. +Run `/create-skill <skill-name>` in your coding agent if that command is available. It should scaffold `skills/<skill-name>/SKILL.md` from [docs/SKILL-TEMPLATE.md](docs/SKILL-TEMPLATE.md) and create a starter `skills/<skill-name>/evals.json`. Alternatively, copy manually: ```bash mkdir -p skills/your-skill-name cp docs/SKILL-TEMPLATE.md skills/your-skill-name/SKILL.md +cp skills/document/evals.json skills/your-skill-name/evals.json ``` +Then immediately rename `skill_name`, replace the copied prompts, and make sure the frontmatter `name:` matches the folder exactly. + ### 3. 
Write SKILL.md See [Frontmatter Specification](#frontmatter-specification) and [Writing Guidelines](#writing-guidelines) below. ### 4. Add Optional Resources -As your skill grows, add supporting files: +As your skill grows, you might find supporting files useful: ``` your-skill-name/ @@ -71,6 +73,14 @@ your-skill-name/ See [Testing Your Skill](#testing-your-skill). +Before opening a PR, also run the repository validators: + +```bash +python scripts/validate_individual_skills.py +python scripts/validate_evals_schema.py +python scripts/validate_cross_skills.py evals/cross-skills.json +``` + ### 6. Submit a Pull Request Include: @@ -124,21 +134,21 @@ A typical SKILL.md body includes: ## Key Inputs -- Model source files (Python/R/C++) +- Model source code files - Parameter descriptions or config files - Optional: docstrings with metadata ## Step-by-Step Instructions 1. Read the model code -2. Extract metadata using scripts/extract.py +2. Extract metadata (scicodes/somef-core, google/langextract) 3. Generate narrative following references/TEMPLATE.md 4. Validate against references/CHECKLIST.md ## ⚠️ Gotchas - **Stochastic models:** If your model uses randomness, document any fixed random seeds -- **Large codebases:** Summarize into entity/subsystem abstractions first +- **Large codebases:** Summarize into entity/subsystem/component abstractions first - **Missing documentation:** Skill will ask clarifying questions rather than guess ## Templates & Resources @@ -173,10 +183,11 @@ A typical SKILL.md body includes: name: your-skill-name description: | A complete description of what this skill does. - - Use when: you have model code and need... - When to trigger: mention [keywords like ODD, documentation, publication] + + Use this skill when you have model code and need... + Triggers: "odd", "documentation", "publication" Expected output: [specific deliverables] +license: MIT --- ``` @@ -186,23 +197,25 @@ description: | --- name: your-skill-name description: ... 
-license: MIT (default) | Apache-2.0 | Proprietary +license: MIT | Apache-2.0 | Proprietary compatibility: Python 3.10+, git, Docker (optional) metadata: domain: computational-modeling | documentation | publication | execution maturity: alpha | beta | stable - audience: modelers | researchers | data scientists + audience: modelers | researchers | data-scientists + category: documentation | quality-assurance | execution | publication --- ``` -### Guidancefor `description` +### Guidance for `description` The description is your **primary triggering mechanism**. Make it: - **Task-specific:** "ODD+2 narrative for agent-based models" not just "model documentation" - **Keyword-rich:** Include trigger phrases users would naturally type - **Outcome-focused:** Mention specific deliverables (e.g., "checklist", "narrative sections", "validation report") -- **Slightly pushy:** Coding agents tend to under-trigger skills. Emphasize when to use: "Use whenever you mention ODD, ABM documentation, or model publication preparation" +- **Use the repository-preferred trigger phrase:** Start with `Use this skill when ...` so your description aligns with the validator heuristics and the existing skills. +- **Slightly pushy:** Coding agents tend to under-trigger skills. Emphasize when to use: "Use this skill when you mention ODD, ABM documentation, or model publication preparation" ## Testing Your Skill @@ -226,37 +239,45 @@ The description is your **primary triggering mechanism**. Make it: ### Creating an Evaluation Strategy -For each skill, document 3–5 concrete test cases in a file `evals/evals.json`: +For each skill, include concrete test cases in `skills/<skill-name>/evals.json`: ```json { "skill_name": "document", + "description": "Evaluation cases for ODD+2 narrative documentation skill", "evals": [ { "id": 1, + "type": "core", "prompt": "I have a Python ABM with Agent and Environment classes. 
Generate an ODD narrative.", "should_trigger": true, - "expected_output": "ODD sections covering entities, state variables, and processes", - "files": ["evals/files/minimal_abm.py"] + "expected_output": "ODD sections covering entities, state variables, and processes" } ] } ``` +Notes: + +- Individual skill evals live next to the skill, for example `skills/document/evals.json`. +- The repository schema accepts fields such as `type`, `should_trigger`, `expected_output`, `expected_behavior`, `success_criteria`, `skills_expected`, `failure_modes`, and `notes`. +- Do not add ad hoc fields unless you also update the schema in `evals/schema/schema.json`. + ## Submission Checklist Before submitting, verify: - [ ] Skill folder name matches `name:` field in frontmatter -- [ ] Frontmatter includes `name` and `description` (and optionally `license`, `compatibility`, `metadata`) -- [ ] Description includes triggers ("Use when you...") and expected outputs +- [ ] Frontmatter includes `name`, `description`, and `license` (plus optional `compatibility` and `metadata`) +- [ ] Description includes triggers (`Use this skill when ...`) and expected outputs - [ ] All script references use relative paths: `scripts/name.py` (not `./scripts/name.py`) - [ ] README/CONTRIBUTING sections are consistent with repository guidelines +- [ ] `skills/<skill-name>/evals.json` exists and validates against `evals/schema/schema.json` - [ ] Tested skill against ≥5 should-trigger and ≥3 should-not-trigger prompts - [ ] No hardcoded paths or user-specific settings - [ ] Scripts have clear usage documentation (docstrings, help text, or references/SCRIPT.md) - [ ] No credentials, API keys, or personal data in examples -- [ ] License field in frontmatter (defaults to MIT if omitted) +- [ ] License field is present in frontmatter ## Questions? diff --git a/README.md b/README.md index d7824f0..f1c1423 100644 --- a/README.md +++ b/README.md @@ -147,7 +147,7 @@ Use cases: ## Repository Structure ``` -skills/ +. 
├── .github/ │ └── skills/ │ └── update-skill/ (repository-local maintainer skill) @@ -156,43 +156,66 @@ skills/ │ │ └── REFRESH-WORKFLOW.md │ └── assets/ │ └── REFRESH-PR-NOTE-TEMPLATE.md +├── AGENTS.md (repository-specific agent instructions) ├── README.md (this file) ├── CONTRIBUTING.md (contribution guidelines) ├── LICENSE (MIT) ├── .gitignore +├── Makefile (validation shortcuts) ├── docs/ (repository-level documentation) +│ ├── agent-skills-creation-reference.md │ ├── roadmap.md │ └── SKILL-TEMPLATE.md (copy/fill template for new skills) +├── evals/ (cross-skill evals and schema) +├── scripts/ (validation and reporting helpers) └── skills/ (all skill folders) ├── document/ - │ └── SKILL.md + │ ├── SKILL.md + │ └── evals.json ├── fair4rs/ - │ └── SKILL.md + │ ├── SKILL.md + │ └── evals.json ├── ospool/ - │ └── SKILL.md + │ ├── SKILL.md + │ └── evals.json ├── hpc/ - │ └── SKILL.md + │ ├── SKILL.md + │ └── evals.json └── peer-review/ - └── SKILL.md + ├── SKILL.md + └── evals.json ``` ## For Skill Authors ### Adding a New Skill -1. **Read** [CONTRIBUTING.md](CONTRIBUTING.md) for submission guidelines and naming conventions. +1. **Read** [AGENTS.md](AGENTS.md), [CONTRIBUTING.md](CONTRIBUTING.md), and [docs/agent-skills-creation-reference.md](docs/agent-skills-creation-reference.md) before drafting. 2. **Review** [Agent Skills best practices](https://agentskills.io/skill-creation/best-practices) before drafting. -3. **Ground from real expertise**: start from real task runs, corrections, and project artifacts (not generic advice). -4. **Scope coherently**: define one composable unit of work; avoid overly broad or ultra-narrow skills. -5. **Design for context efficiency**: keep `SKILL.md` concise, move deep details to `references/`, and load references only when needed. -6. **Prefer defaults over menus**: choose one default tool/approach and list alternatives only as fallbacks. -7. 
**Include reusable control patterns**: gotchas, output templates, and validation loops/checklists where relevant. -8. **Refine with real execution**: test should-trigger and should-not-trigger prompts, review execution traces, then iterate. -9. **Copy** an existing skill folder as a starting point: `cp -r skills/hpc skills/your-skill-name`. -10. **Fill in** the YAML frontmatter (`name`, `description`) and markdown instructions following the progressive disclosure pattern. -11. **Include optional resources** (scripts, references, assets) as your skill grows. -12. **Test** against should-trigger and should-not-trigger prompts before submitting a PR. -13. **Submit** a pull request with your skill and evaluation strategy (see CONTRIBUTING.md). +3. **Ground from real expertise**: start from real task runs, corrections, and project artifacts, not generic advice. +4. **Scope coherently**: define one composable unit of work and keep the boundary clear. +5. **Design for context efficiency**: keep `SKILL.md` concise, move deep detail into `references/`, and add explicit load conditions. +6. **Prefer defaults over menus**: choose one default tool or approach and use alternatives only as fallbacks. +7. **Create the skill folder** with `/create-skill` if your agent supports it, or scaffold manually: + + ```bash + mkdir -p skills/your-skill-name + cp docs/SKILL-TEMPLATE.md skills/your-skill-name/SKILL.md + cp skills/document/evals.json skills/your-skill-name/evals.json + ``` + +8. **Fill in** the YAML frontmatter and markdown instructions, then immediately rename `skill_name`, replace the copied prompts, and ensure `name:` matches the folder exactly. +9. **Include optional resources** (`assets/`, `references/`, `scripts/`) as the workflow needs them. +10. **Refine with real execution**: test should-trigger and should-not-trigger prompts, review execution traces, and iterate. +11. 
**Run the repository validators** before opening a PR: + + ```bash + python scripts/validate_individual_skills.py + python scripts/validate_evals_schema.py + python scripts/validate_cross_skills.py evals/cross-skills.json + ``` + +12. **Submit** a pull request with the skill folder, its `evals.json`, and the prompts or checks you used to validate it. ### Skill Anatomy @@ -223,22 +246,26 @@ Authoring guidance: ```yaml --- name: your-skill-name -description: Brief description of when and why to use this skill +description: | + Use this skill when... + Triggers: "phrase 1", "phrase 2" + Expected output: ... +license: MIT --- ``` **Optional fields:** ```yaml -license: MIT (default) | Apache-2.0 | GPL-3.0-or-later compatibility: Tool/version requirements metadata: domain: computational-modeling | documentation | publication | execution maturity: alpha | beta | stable - audience: modelers | researchers | data scientists + audience: modelers | researchers | data-scientists + category: documentation | quality-assurance | execution | publication --- ``` -See [CONTRIBUTING.md](CONTRIBUTING.md) and [AGENTS.md](AGENTS.md) for full guidance. +See [CONTRIBUTING.md](CONTRIBUTING.md), [AGENTS.md](AGENTS.md), and [docs/VALIDATION.md](docs/VALIDATION.md) for full guidance. ## Roadmap diff --git a/docs/VALIDATION.md b/docs/VALIDATION.md index b2774a0..021a2a9 100644 --- a/docs/VALIDATION.md +++ b/docs/VALIDATION.md @@ -50,7 +50,7 @@ Every SKILL.md **must** include valid YAML frontmatter with required and optiona name: kebab-case-skill-name # (required) lowercase, hyphens, no spaces description: | # (required) complete trigger & outcome description Use this skill when... -license: MIT # (optional, defaults to MIT) +license: MIT # (required) --- ``` @@ -199,18 +199,19 @@ When activated, this skill produces: ## Evaluation Strategy Template -Before submitting, define how your skill will be evaluated. 
Create a file `evals/evals.json` in your skill directory: +Before submitting, define how your skill will be evaluated. Create a file `skills/<skill-name>/evals.json`: ```json { "skill_name": "document", + "description": "Evaluation cases for ODD+2 narrative documentation skill", "evals": [ { "id": 1, + "type": "core", "prompt": "I have a Python ABM with Agent and Environment classes. Generate an ODD+2 narrative.", "should_trigger": true, "expected_output": "ODD sections covering entities, state variables, processes, and parameters", - "files": ["evals/files/minimal_abm.py"], "success_criteria": [ "Output includes all three entities (Agent, Environment, Scheduler)", "State variables are listed with types and ranges", @@ -219,16 +220,18 @@ Before submitting, define how your skill will be evaluated. Create a file `evals }, { "id": 2, + "type": "core", "prompt": "Create a timeline of project milestones", "should_trigger": false, "expected_output": "Skill does not activate; falls through to other skills or generic behavior" }, { "id": 3, + "type": "adversarial", "prompt": "I have a complex Netlogo ABM with 50 agents and nested entity hierarchies. Generate ODD.", "should_trigger": true, "expected_output": "ODD with entity hierarchy clearly explained; ask for clarification on subsystem abstractions if code is unclear", - "files": ["evals/files/complex_netlogo.nlogo"], + "failure_modes": ["hallucination", "under_trigger"], "success_criteria": [ "Output structures entity hierarchy (e.g., Colony > Hive > Bee)", "Output explains state variable interactions", @@ -242,17 +245,23 @@ Before submitting, define how your skill will be evaluated. 
Create a file `evals ### Evaluation Template Fields - **id:** Unique test case number +- **type:** Optional classification: `core`, `adversarial`, `cross`, or `cross-adversarial` - **prompt:** User query (should be realistic) -- **should_trigger:** Boolean indicating whether the skill should activate +- **should_trigger:** Boolean indicating whether the skill should activate (required for `core` and `adversarial`) - **expected_output:** Description of expected behavior/output type -- **files:** Optional array of input file paths (relative to skill directory) +- **expected_behavior:** Optional narrative description of expected behavior - **success_criteria:** Array of statements that must be true for the skill to pass +- **skills_expected:** Required for `cross` and `cross-adversarial` +- **failure_modes:** Required for `adversarial` and `cross-adversarial` +- **notes:** Optional reviewer notes + +Note: Evals must validate against `evals/schema/schema.json`. Do not add custom fields unless the schema is updated. ### Running Evals After you've defined evals, run your skill manually against each test case: -1. **Setup:** Place test input files in `evals/files/` +1. **Setup:** Keep eval prompts in `skills/<skill-name>/evals.json`; if you need fixtures for manual runs, store them under your skill folder (for example, `skills/<skill-name>/assets/` or `skills/<skill-name>/references/`). 2. **Execute:** Invoke your skill in your coding agent (Claude Code, Claude.ai, Cursor, Cline, or other AI coding environments) with the prompt 3. **Capture output:** Save the output (file, markdown, JSON, etc.) 4. 
**Grade:** Check against success criteria @@ -330,7 +339,7 @@ Before opening a PR, verify: - [ ] Description includes trigger phrases and expected outputs - [ ] Tested against ≥5 should-trigger and ≥3 should-not-trigger prompts - [ ] Output contract is clear and verifiable -- [ ] Evals are documented in `evals/evals.json` with success criteria +- [ ] Evals are documented in `skills/<skill-name>/evals.json` with success criteria - [ ] Manual testing shows skill works as expected - [ ] Execution traces were reviewed for false positives, missed triggers, and wasted steps - [ ] No hardcoded paths, API keys, or user-specific settings