From abbb5c5e8636c5b0a4c683f5357868909b1060b0 Mon Sep 17 00:00:00 2001
From: Allen Lee <alee@users.noreply.github.com>
Date: Wed, 29 Apr 2026 19:51:30 -0700
Subject: [PATCH 1/2] feat: add mocked out evaluation framework

still needs work to be wired up against an agent + tools
---
 Makefile                                      |  50 ++++++
 evals/cross-skills.json                       | 148 ++++++++++++++++++
 evals/schema/schema.json                      | 148 ++++++++++++++++++
 scripts/aggregate_failures.py                 | 114 ++++++++++++++
 scripts/validate_cross_skills.py              | 121 ++++++++++++++
 scripts/validate_evals_schema.py              |  40 +++++
 ...kills.py => validate_individual_skills.py} |   0
 skills/document/evals.json                    |  37 +++--
 skills/fair4rs/evals.json                     |  32 ++--
 skills/hpc/evals.json                         |  26 ++-
 skills/ospool/evals.json                      |  26 ++-
 skills/peer-review/evals.json                 |  26 ++-
 12 files changed, 697 insertions(+), 71 deletions(-)
 create mode 100644 Makefile
 create mode 100644 evals/cross-skills.json
 create mode 100644 evals/schema/schema.json
 create mode 100644 scripts/aggregate_failures.py
 create mode 100644 scripts/validate_cross_skills.py
 create mode 100644 scripts/validate_evals_schema.py
 rename scripts/{validate_skills.py => validate_individual_skills.py} (100%)
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..df58585
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,50 @@
+# ---- config (override on the command line, e.g. `make PYTHON=python3.12`) ----
+PYTHON ?= python3
+SCRIPTS := scripts
+EVALS := evals
+
+CROSS_EVAL := $(EVALS)/cross-skills.json
+
+# ---- default: schema validation plus the cross-skill evals ----
+.PHONY: all
+all: validate-evals cross
+
+# ---- schema validation of skills/**/evals.json and evals/*.json ----
+.PHONY: validate-evals
+validate-evals:
+	$(PYTHON) $(SCRIPTS)/validate_evals_schema.py
+
+# ---- cross-skill evals (the script exits non-zero if any eval fails) ----
+.PHONY: cross
+cross:
+	$(PYTHON) $(SCRIPTS)/validate_cross_skills.py $(CROSS_EVAL)
+
+# ---- per-skill evals (placeholder) ----
+# assumes a future runner like: run_skill_evals.py <skill> (not added by this patch)
+SKILLS := document fair4rs hpc ospool peer-review
+
+.PHONY: skills
+skills: $(SKILLS)
+
+.PHONY: $(SKILLS)
+$(SKILLS):
+	@echo "Running evals for $@"
+	$(PYTHON) $(SCRIPTS)/run_skill_evals.py $@
+
+# ---- aggregate report (declared failure-mode coverage of the skill evals) ----
+.PHONY: report
+report:
+	$(PYTHON) $(SCRIPTS)/aggregate_failures.py
+
+# ---- full pipeline ----
+.PHONY: full
+full: validate-evals cross report
+
+# ---- CI pipeline: same stages as `full`, run strictly in sequence ----
+.PHONY: ci
+ci:
+	@echo "=== Running CI pipeline ==="
+	$(MAKE) validate-evals
+	$(MAKE) cross
+	$(MAKE) report
+	@echo "=== CI completed ==="
diff --git a/evals/cross-skills.json b/evals/cross-skills.json
new file mode 100644
index 0000000..085712d
--- /dev/null
+++ b/evals/cross-skills.json
@@ -0,0 +1,148 @@
+{
+  "suite": "cross-skill-orchestration",
+  "evals": [
+    {
+      "id": 201,
+      "type": "cross",
+      "skills_expected": ["document", "peer-review"],
+      "prompt": "Generate an ODD description for my ABM and then assess if it is ready for CoMSES submission.",
+      "expected_behavior": "document → peer-review",
+      "failure_modes": [
+        "missing_step",
+        "wrong_order",
+        "incomplete_execution"
+      ]
+    },
+    {
+      "id": 202,
+      "type": "cross",
+      "skills_expected": ["fair4rs", "peer-review"],
+      "prompt": "Prepare my model for publication and tell me if it passes CoMSES review.",
+      "expected_behavior": "fair4rs → peer-review",
+      "failure_modes": [
+        "missing_step",
+        "wrong_order",
+        "incomplete_execution"
+      ]
+    },
+    {
+      "id": 203,
+      "type": "cross",
+      "skills_expected": ["hpc", "ospool"],
+      "prompt": "I need to run large parameter sweeps. Should I use Slurm or OSPool, and generate the appropriate scripts?",
+      "expected_behavior": "select ONE of hpc or ospool",
+      "failure_modes": [
+        "wrong_skill",
+        "boundary_violation",
+        "no_planning"
+      ]
+    },
+    {
+      "id": 204,
+      "type": "cross",
+      "skills_expected": ["document", "fair4rs"],
+      "prompt": "Document my model using ODD and prepare it for archival with proper metadata.",
+      "expected_behavior": "document → fair4rs",
+      "failure_modes": [
+        "missing_step",
+        "wrong_order",
+        "incomplete_execution"
+      ]
+    },
+    {
+      "id": 205,
+      "type": "cross",
+      "skills_expected": ["peer-review", "hpc"],
+      "prompt": "Review my model and suggest how to run it efficiently on an HPC cluster.",
+      "expected_behavior": "peer-review → hpc",
+      "failure_modes": [
+        "missing_step",
+        "wrong_order",
+        "scope_creep"
+      ]
+    },
+    {
+      "id": 206,
+      "type": "cross",
+      "skills_expected": ["ospool", "fair4rs"],
+      "prompt": "Run my model at scale and make sure it's properly archived and citable.",
+      "expected_behavior": "ospool → fair4rs",
+      "failure_modes": [
+        "missing_step",
+        "wrong_order",
+        "incomplete_execution"
+      ]
+    },
+    {
+      "id": 207,
+      "type": "cross",
+      "skills_expected": ["document", "hpc", "peer-review"],
+      "prompt": "Create ODD documentation, run the model on HPC, and evaluate whether it's ready for submission.",
+      "expected_behavior": "document → hpc → peer-review",
+      "failure_modes": [
+        "missing_step",
+        "wrong_order",
+        "incomplete_execution"
+      ]
+    },
+    {
+      "id": 208,
+      "type": "cross",
+      "skills_expected": ["document", "fair4rs", "peer-review"],
+      "prompt": "Prepare my model for publication including documentation, metadata, and readiness assessment.",
+      "expected_behavior": "document → fair4rs → peer-review",
+      "failure_modes": [
+        "missing_step",
+        "wrong_order",
+        "no_planning"
+      ]
+    },
+    {
+      "id": 209,
+      "type": "cross",
+      "skills_expected": ["hpc", "ospool"],
+      "prompt": "I have 10,000 independent simulations. I might use Slurm or OSPool—what's better and how do I set it up?",
+      "expected_behavior": "select ONE system and justify",
+      "failure_modes": [
+        "wrong_skill",
+        "boundary_violation",
+        "no_planning"
+      ]
+    },
+    {
+      "id": 210,
+      "type": "cross",
+      "skills_expected": ["peer-review", "fair4rs"],
+      "prompt": "Does my repository meet CoMSES criteria, and if not, what FAIR improvements are needed?",
+      "expected_behavior": "peer-review → fair4rs",
+      "failure_modes": [
+        "missing_step",
+        "wrong_order",
+        "incomplete_execution"
+      ]
+    },
+    {
+      "id": 211,
+      "type": "cross-adversarial",
+      "skills_expected": ["document", "fair4rs", "hpc", "ospool", "peer-review"],
+      "prompt": "Do everything needed to make my model complete, scalable, publishable, and reviewed.",
+      "expected_behavior": "multi-stage plan + execution",
+      "failure_modes": [
+        "no_planning",
+        "missing_step",
+        "incomplete_execution"
+      ]
+    },
+    {
+      "id": 212,
+      "type": "cross-adversarial",
+      "skills_expected": [],
+      "prompt": "Explain how to run simulations and publish them in general terms",
+      "expected_behavior": "no skill activation",
+      "failure_modes": [
+        "over_trigger",
+        "redundant_actions"
+      ]
+    }
+  ]
+}
\ No newline at end of file
diff --git a/evals/schema/schema.json b/evals/schema/schema.json
new file mode 100644
index 0000000..4571db8
--- /dev/null
+++ b/evals/schema/schema.json
@@ -0,0 +1,148 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "type": "object",
+  "required": [
+    "evals"
+  ],
+  "properties": {
+    "skill_name": {
+      "type": "string"
+    },
+    "description": {
+      "type": "string"
+    },
+    "suite": {
+      "type": "string"
+    },
+    "evals": {
+      "type": "array",
+      "minItems": 1,
+      "items": {
+        "type": "object",
+        "required": [
+          "id",
+          "prompt"
+        ],
+        "properties": {
+          "id": {
+            "type": "integer"
+          },
+          "type": {
+            "type": "string",
+            "enum": [
+              "core",
+              "adversarial",
+              "cross",
+              "cross-adversarial"
+            ]
+          },
+          "prompt": {
+            "type": "string",
+            "minLength": 5
+          },
+          "should_trigger": {
+            "type": "boolean"
+          },
+          "expected_behavior": {
+            "type": "string"
+          },
+          "expected_output": {
+            "type": "string"
+          },
+          "success_criteria": {
+            "type": "array",
+            "items": {
+              "type": "string"
+            }
+          },
+          "skills_expected": {
+            "type": "array",
+            "items": {
+              "type": "string"
+            }
+          },
+          "failure_modes": {
+            "type": "array",
+            "minItems": 1,
+            "items": {
+              "type": "string",
+              "enum": [
+                "over_trigger",
+                "under_trigger",
+                "wrong_skill",
+                "wrong_order",
+                "scope_creep",
+                "hallucination",
+                "keyword_trap",
+                "boundary_violation",
+                "incomplete_execution",
+                "missing_step",
+                "invalid_output",
+                "no_planning",
+                "redundant_actions"
+              ]
+            }
+          },
+          "notes": {
+            "type": "string"
+          }
+        },
+        "additionalProperties": false,
+        "allOf": [
+          {
+            "if": {
+              "required": [
+                "type"
+              ],
+              "properties": {
+                "type": {
+                  "enum": ["adversarial", "cross-adversarial"]
+                }
+              }
+            },
+            "then": {
+              "required": [
+                "failure_modes"
+              ]
+            }
+          },
+          {
+            "if": {
+              "required": [
+                "type"
+              ],
+              "properties": {
+                "type": {
+                  "enum": ["core", "adversarial"]
+                }
+              }
+            },
+            "then": {
+              "required": [
+                "should_trigger"
+              ]
+            }
+          },
+          {
+            "if": {
+              "required": [
+                "type"
+              ],
+              "properties": {
+                "type": {
+                  "enum": ["cross", "cross-adversarial"]
+                }
+              }
+            },
+            "then": {
+              "required": [
+                "skills_expected"
+              ]
+            }
+          }
+        ]
+      }
+    }
+  },
+  "additionalProperties": false
+}
\ No newline at end of file
diff --git a/scripts/aggregate_failures.py b/scripts/aggregate_failures.py
new file mode 100644
index 0000000..9893393
--- /dev/null
+++ b/scripts/aggregate_failures.py
@@ -0,0 +1,114 @@
+#!/usr/bin/env python3
+
+import json
+from collections import Counter, defaultdict
+from pathlib import Path
+import argparse
+
+
+def load_all_evals():
+    """Load every skills/**/evals.json and return its evals, each tagged with `_skill`.
+    Files are read as UTF-8 in sorted path order; a file that fails is reported, not fatal."""
+    data = []
+    for f in sorted(Path("skills").rglob("evals.json")):
+        try:
+            content = json.loads(f.read_text(encoding="utf-8"))
+            skill = content.get("skill_name", f.parent.name)
+            for entry in content.get("evals", []):
+                entry["_skill"] = skill
+                data.append(entry)
+        except Exception as exc:  # best effort: one bad file must not abort the report
+            print(f"⚠️ Failed to load {f}: {exc}")
+
+    return data
+
+
+def aggregate_expected_failures(evals):
+    """Tally the failure_modes declared by adversarial evals, overall and per skill."""
+    totals = Counter()
+    by_skill = defaultdict(Counter)
+    adversarial_types = ("adversarial", "cross-adversarial")
+    for item in evals:
+        if item.get("type") not in adversarial_types:
+            continue  # only adversarial cases count toward design-time coverage
+        for mode in item.get("failure_modes", []):
+            totals[mode] += 1
+            by_skill[item["_skill"]][mode] += 1
+    return totals, by_skill
+
+
+def load_results(path):
+    """Read a CI results file and return the parsed JSON.
+
+    The file is expected to hold a list of records shaped like:
+      {
+        "id": 101,
+        "skill": "document",
+        "passed": false,
+        "failure_modes": ["over_trigger"]
+      }
+    """
+    raw_text = Path(path).read_text()
+    return json.loads(raw_text)
+
+
+def aggregate_actual_failures(results):
+    """Tally failure_modes of failed result records, overall and per skill.
+    Records lacking a "skill" key (e.g. cross-skill results) are grouped under "unknown".
+    """
+    counts = Counter()
+    per_skill = defaultdict(Counter)
+    for r in results:
+        if not r.get("passed", True):
+            for fm in r.get("failure_modes", []):
+                counts[fm] += 1
+                per_skill[r.get("skill", "unknown")][fm] += 1
+    return counts, per_skill
+
+
+def print_table(title, counts, per_skill):
+    """Print a failure-mode table: global counts first, then one section per skill.
+    Percentages are shares of the respective total; an all-zero table prints "No data."
+    """
+    grand_total = sum(counts.values())
+    print(f"\n=== {title} ===\n")
+    if not grand_total:
+        print("No data.\n")
+        return
+
+    print("Global:")
+    for mode, hits in counts.most_common():
+        print(f"  {mode:22} {hits:4} ({(hits / grand_total) * 100:5.1f}%)")
+
+    print("\nPer skill:")
+    for skill, tally in per_skill.items():
+        print(f"\n  [{skill}]")
+        skill_total = sum(tally.values())
+        for mode, hits in tally.most_common():
+            share = (hits / skill_total) * 100 if skill_total else 0
+            print(f"    {mode:20} {hits:4} ({share:5.1f}%)")
+
+
+def main():
+    """CLI entry point: print declared failure coverage, then observed failures if given.
+
+    The optional --results flag names a CI results JSON file to summarize.
+    """
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--results", help="Path to CI results JSON")
+    opts = cli.parse_args()
+
+    # design-time coverage, derived from the eval definitions on disk
+    declared = aggregate_expected_failures(load_all_evals())
+    print_table("Expected Failure Coverage (from eval definitions)", *declared)
+
+    # runtime failures are reported only when a results file was supplied
+    if not opts.results:
+        return
+
+    observed = aggregate_actual_failures(load_results(opts.results))
+    print_table("Observed Failures (from CI run)", *observed)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/validate_cross_skills.py b/scripts/validate_cross_skills.py
new file mode 100644
index 0000000..755f5d4
--- /dev/null
+++ b/scripts/validate_cross_skills.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+
+import json
+import sys
+from pathlib import Path
+
+OUTPUT_FILE = "results_cross.json"
+
+
+# ---- mock agent (replace later with real trace) ----
+def mock_agent_run(prompt):
+    """Stand-in agent: return the skills whose keywords occur in the prompt.
+
+    Matching is a case-insensitive substring search; skills come back in the fixed
+    order listed below, not in the order they are mentioned in the prompt.
+    """
+    keyword_map = (
+        ("document", ("odd", "documentation")),
+        ("fair4rs", ("publish", "citable", "metadata")),
+        ("hpc", ("slurm", "hpc")),
+        ("ospool", ("ospool", "htcondor", "osg")),
+        ("peer-review", ("review", "ready", "submission")),
+    )
+
+    text = prompt.lower()
+
+    return [skill for skill, words in keyword_map if any(w in text for w in words)]
+
+
+# ---- evaluation logic ----
+def evaluate_case(e):
+    """Score one eval against the mock agent and return a result record.
+
+    Args:
+        e: eval dict with "id" and "prompt"; "type" and "skills_expected" are
+            optional (a missing or empty "skills_expected" means "no skill").
+
+    Returns:
+        dict with "id", "type", "passed", "expected", "invoked" and
+        "failure_modes". The failure modes are sorted so that the results file
+        is identical from run to run.
+    """
+    expected = e.get("skills_expected", [])
+    invoked = mock_agent_run(e["prompt"])
+    # a set records each failure mode at most once
+    failures = set()
+
+    # ---- skill selection ----
+    if expected:
+        if set(expected) - set(invoked):
+            failures.add("missing_step")
+        if set(invoked) - set(expected):
+            failures.add("boundary_violation")
+    elif invoked:
+        # an empty expectation means no skill should trigger at all
+        failures.add("over_trigger")
+
+    # ---- ordering (simple heuristic): leading invocations must equal `expected` ----
+    if len(expected) > 1 and invoked[:len(expected)] != expected:
+        failures.add("wrong_order")
+
+    # ---- planning (multi-step prompts) ----
+    if len(expected) >= 3 and len(invoked) < len(expected):
+        failures.add("no_planning")
+
+    # ---- completeness ----
+    if expected and len(invoked) < len(expected):
+        failures.add("incomplete_execution")
+
+    return {
+        "id": e["id"],
+        # "type" is optional in the schema, so a missing value must not crash the run
+        "type": e.get("type"),
+        "passed": not failures,
+        "expected": expected,
+        "invoked": invoked,
+        # sorted(): plain set iteration order varies with string hash randomization
+        "failure_modes": sorted(failures),
+    }
+
+
+# ---- main ----
+def main(path):
+    """Run every eval in the JSON file at `path`, print a report and write OUTPUT_FILE.
+    Exits with status 1 when at least one eval fails, so that CI marks the run as failed.
+    """
+    # eval files contain non-ASCII text (e.g. "→"): never depend on the locale encoding
+    data = json.loads(Path(path).read_text(encoding="utf-8"))
+    results = []
+    passed = 0
+    for e in data["evals"]:
+        r = evaluate_case(e)
+        results.append(r)
+        print(f"\nEval {e['id']}")
+        print(f"Prompt: {e['prompt']}")
+        print(f"Expected: {r['expected']}")
+        print(f"Invoked: {r['invoked']}")
+        print(f"Failures: {r['failure_modes']}")
+        if r["passed"]:
+            print("✅ PASS")
+            passed += 1
+        else:
+            print("❌ FAIL")
+
+    # write results for aggregation
+    Path(OUTPUT_FILE).write_text(json.dumps(results, indent=2), encoding="utf-8")
+
+    total = len(results)
+    print(f"\nSummary: {passed}/{total} passed")
+
+    # fail CI if any fail
+    if passed < total:
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) == 2:  # the single argument is the eval file to run
+        main(sys.argv[1])
+    else:
+        print("Usage: validate_cross_skills.py <evals.json>")
+        sys.exit(1)
\ No newline at end of file
diff --git a/scripts/validate_evals_schema.py b/scripts/validate_evals_schema.py
new file mode 100644
index 0000000..c8986f1
--- /dev/null
+++ b/scripts/validate_evals_schema.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+
+import json
+import sys
+from pathlib import Path
+
+import jsonschema
+
+SCHEMA_PATH = Path("evals/schema/schema.json")
+
+
+def load_schema():  # parse the JSON Schema that every eval file is validated against
+    return json.loads(SCHEMA_PATH.read_text(encoding="utf-8"))  # explicit: not locale-dependent
+
+
+def validate_file(path, schema):
+    """Validate the JSON file at `path` against `schema`; report and exit(1) on failure."""
+    try:  # parsing is inside so malformed JSON is reported, not raised as a traceback
+        data = json.loads(path.read_text(encoding="utf-8"))
+        jsonschema.validate(instance=data, schema=schema)
+        print(f"✅ {path} valid")
+    except (ValueError, jsonschema.ValidationError) as e:
+        print(f"❌ {path} invalid\n   → {getattr(e, 'message', e)}")
+        sys.exit(1)
+
+
+def main():
+    """Validate every skills/**/evals.json and every top-level evals/*.json file.
+    Stops at the first invalid file, because validate_file exits the process."""
+    schema = load_schema()
+
+    targets = list(Path("skills").rglob("evals.json"))  # per-skill eval files
+    targets += Path("evals").glob("*.json")  # cross-skill eval files
+
+    for target in targets:
+        validate_file(target, schema)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/validate_skills.py b/scripts/validate_individual_skills.py
similarity index 100%
rename from scripts/validate_skills.py
rename to scripts/validate_individual_skills.py
diff --git a/skills/document/evals.json b/skills/document/evals.json
index f30a986..ef6f6df 100644
--- a/skills/document/evals.json
+++ b/skills/document/evals.json
@@ -93,89 +93,102 @@
       "should_trigger": false,
       "expected_output": "Skill does not activate"
     },
+
     {
       "id": 101,
       "type": "adversarial",
       "prompt": "Explain the ODD protocol briefly for a presentation slide",
       "should_trigger": false,
-      "expected_output": "Informational only"
+      "expected_output": "Informational only",
+      "failure_modes": ["keyword_trap", "over_trigger"]
     },
     {
       "id": 102,
       "type": "adversarial",
       "prompt": "Document my simulation model for a paper",
       "should_trigger": false,
-      "expected_output": "Does not trigger without ABM specificity"
+      "expected_output": "Does not trigger without ABM specificity",
+      "failure_modes": ["over_trigger", "wrong_skill"]
     },
     {
       "id": 103,
       "type": "adversarial",
       "prompt": "I need a structured description of agents, their behaviors, and environment interactions for publication",
       "should_trigger": true,
-      "expected_output": "Triggers ODD structuring implicitly"
+      "expected_output": "Triggers ODD structuring implicitly",
+      "failure_modes": ["under_trigger"]
     },
     {
       "id": 104,
       "type": "adversarial",
       "prompt": "Write documentation for my discrete-event simulation model",
       "should_trigger": false,
-      "expected_output": "Does not trigger"
+      "expected_output": "Does not trigger",
+      "failure_modes": ["over_trigger", "boundary_violation"]
     },
     {
       "id": 105,
       "type": "adversarial",
       "prompt": "Generate an ODD description for my model (details not provided)",
       "should_trigger": true,
-      "expected_output": "Requests clarification, no hallucination"
+      "expected_output": "Requests clarification, no hallucination",
+      "failure_modes": ["hallucination"]
     },
     {
       "id": 106,
       "type": "adversarial",
       "prompt": "Convert my model description into a README with sections",
       "should_trigger": false,
-      "expected_output": "Does not trigger"
+      "expected_output": "Does not trigger",
+      "failure_modes": ["over_trigger", "boundary_violation"]
     },
     {
       "id": 107,
       "type": "adversarial",
       "prompt": "List the entities and variables in my agent-based model",
       "should_trigger": false,
-      "expected_output": "Partial task, should not trigger"
+      "expected_output": "Partial task, should not trigger",
+      "failure_modes": ["over_trigger", "incomplete_execution"]
     },
     {
       "id": 108,
       "type": "adversarial",
       "prompt": "Fix grammar in my ODD description",
       "should_trigger": false,
-      "expected_output": "Editing task only"
+      "expected_output": "Editing task only",
+      "failure_modes": ["over_trigger", "scope_creep"]
     },
     {
       "id": 109,
       "type": "adversarial",
       "prompt": "Summarize my ABM and optionally format it using ODD if useful",
       "should_trigger": true,
-      "expected_output": "Triggers and prefers ODD structuring"
+      "expected_output": "Triggers and prefers ODD structuring",
+      "failure_modes": ["wrong_order", "incomplete_execution"]
     },
     {
       "id": 110,
       "type": "adversarial",
       "prompt": "Help me structure a formal agent-based model description using standard protocols",
       "should_trigger": true,
-      "expected_output": "Triggers despite missing keyword"
+      "expected_output": "Triggers despite missing keyword",
+      "failure_modes": ["under_trigger"]
     },
     {
       "id": 111,
       "type": "adversarial",
       "prompt": "Write a UML diagram description for my agent-based model",
       "should_trigger": false,
-      "expected_output": "Different formalism"
+      "expected_output": "Different formalism",
+      "failure_modes": ["over_trigger", "boundary_violation"]
     },
     {
       "id": 112,
       "type": "adversarial",
       "prompt": "Generate FAIR metadata for my model repository",
       "should_trigger": false,
-      "expected_output": "Different skill domain"
+      "expected_output": "Different skill domain",
+      "failure_modes": ["wrong_skill", "boundary_violation"]
     }
   ]
 }
\ No newline at end of file
diff --git a/skills/fair4rs/evals.json b/skills/fair4rs/evals.json
index 4b36d30..b346d71 100644
--- a/skills/fair4rs/evals.json
+++ b/skills/fair4rs/evals.json
@@ -56,46 +56,44 @@
       "type": "core",
       "prompt": "How do I make my research software reusable and properly archived for long-term access?",
       "should_trigger": true,
-      "expected_output": "FAIR4RS-aligned guidance (archival, metadata, licensing)",
-      "notes": "implicit trigger without FAIR keyword"
+      "expected_output": "FAIR4RS-aligned guidance (archival, metadata, licensing)"
     },
     {
       "id": 9,
       "type": "core",
       "prompt": "Improve my existing CITATION.cff to meet best practices and align with my repository metadata",
       "should_trigger": true,
-      "expected_output": "Refined CITATION.cff with consistency checks",
-      "notes": "refinement task"
+      "expected_output": "Refined CITATION.cff with consistency checks"
     },
     {
       "id": 10,
       "type": "core",
       "prompt": "Validate whether my repository metadata meets FAIR principles",
       "should_trigger": true,
-      "expected_output": "FAIR compliance assessment with gaps identified",
-      "notes": "validation-only task"
+      "expected_output": "FAIR compliance assessment with gaps identified"
     },
     {
       "id": 11,
       "type": "core",
       "prompt": "Package my code for PyPI distribution",
       "should_trigger": false,
-      "expected_output": "Skill does not activate; packaging ≠ FAIR publication"
+      "expected_output": "Skill does not activate"
     },
     {
       "id": 12,
       "type": "core",
       "prompt": "Write documentation for my scientific software",
       "should_trigger": false,
-      "expected_output": "Skill does not activate; generic docs"
+      "expected_output": "Skill does not activate"
     },
+
     {
       "id": 101,
       "type": "adversarial",
       "prompt": "Explain what FAIR principles are",
       "should_trigger": false,
       "expected_output": "Informational only",
-      "failure_mode": "keyword trap"
+      "failure_modes": ["keyword_trap", "over_trigger"]
     },
     {
       "id": 102,
@@ -103,7 +101,7 @@
       "prompt": "Make my project FAIR",
       "should_trigger": true,
       "expected_output": "Clarifies scope and applies FAIR4RS workflow",
-      "failure_mode": "too vague → should still trigger"
+      "failure_modes": ["under_trigger", "no_planning"]
     },
     {
       "id": 103,
@@ -111,7 +109,7 @@
       "prompt": "Generate metadata for my dataset",
       "should_trigger": false,
       "expected_output": "Does not trigger; data ≠ software",
-      "failure_mode": "confuses FAIR data vs FAIR software"
+      "failure_modes": ["wrong_skill", "boundary_violation"]
     },
     {
       "id": 104,
@@ -119,7 +117,7 @@
       "prompt": "I need my code to be cited properly and archived with a DOI",
       "should_trigger": true,
       "expected_output": "Triggers FAIR4RS workflow",
-      "failure_mode": "implicit trigger"
+      "failure_modes": ["under_trigger"]
     },
     {
       "id": 105,
@@ -127,15 +125,15 @@
       "prompt": "Fix formatting issues in my CITATION.cff",
       "should_trigger": false,
       "expected_output": "Does not trigger; editing only",
-      "failure_mode": "over-trigger on keyword"
+      "failure_modes": ["over_trigger", "scope_creep"]
     },
     {
       "id": 106,
       "type": "adversarial",
       "prompt": "Write an ODD protocol description for my model and prepare it for publication",
       "should_trigger": false,
-      "expected_output": "Should defer to ODD skill first",
-      "failure_mode": "cross-skill conflict"
+      "expected_output": "Should defer to document skill first",
+      "failure_modes": ["wrong_skill", "boundary_violation"]
     },
     {
       "id": 107,
@@ -143,7 +141,7 @@
       "prompt": "Summarize my repository for a blog post",
       "should_trigger": false,
       "expected_output": "Does not trigger",
-      "failure_mode": "wrong intent"
+      "failure_modes": ["over_trigger", "scope_creep"]
     },
     {
       "id": 108,
@@ -151,7 +149,7 @@
       "prompt": "Ensure my software release is reproducible, citable, and archived with proper metadata",
       "should_trigger": true,
       "expected_output": "Triggers FAIR4RS workflow",
-      "failure_mode": "implicit multi-goal phrasing"
+      "failure_modes": ["under_trigger", "no_planning"]
     }
   ]
 }
\ No newline at end of file
diff --git a/skills/hpc/evals.json b/skills/hpc/evals.json
index 8a86bf7..2170005 100644
--- a/skills/hpc/evals.json
+++ b/skills/hpc/evals.json
@@ -42,24 +42,21 @@
       "type": "core",
       "prompt": "How do I run my simulation efficiently on a Slurm cluster?",
       "should_trigger": true,
-      "expected_output": "Guidance plus Slurm script scaffold",
-      "notes": "implicit trigger"
+      "expected_output": "Guidance plus Slurm script scaffold"
     },
     {
       "id": 7,
       "type": "core",
       "prompt": "Fix my Slurm script that is failing with node allocation errors",
       "should_trigger": true,
-      "expected_output": "Debugged Slurm script with corrected directives",
-      "notes": "refinement/debug task"
+      "expected_output": "Debugged Slurm script with corrected directives"
     },
     {
       "id": 8,
       "type": "core",
       "prompt": "Validate whether my Slurm job script is configured correctly",
       "should_trigger": true,
-      "expected_output": "Validation report with suggested fixes",
-      "notes": "validation-only task"
+      "expected_output": "Validation report with suggested fixes"
     },
     {
       "id": 9,
@@ -75,13 +72,14 @@
       "should_trigger": false,
       "expected_output": "Does not activate; containerization task"
     },
+
     {
       "id": 101,
       "type": "adversarial",
       "prompt": "Explain what Slurm is",
       "should_trigger": false,
       "expected_output": "Informational only",
-      "failure_mode": "keyword trap"
+      "failure_modes": ["keyword_trap", "over_trigger"]
     },
     {
       "id": 102,
@@ -89,7 +87,7 @@
       "prompt": "Run my job on a cluster",
       "should_trigger": true,
       "expected_output": "Clarifies scheduler and produces Slurm solution",
-      "failure_mode": "vague prompt should still trigger"
+      "failure_modes": ["under_trigger", "no_planning"]
     },
     {
       "id": 103,
@@ -97,7 +95,7 @@
       "prompt": "Generate a PBS job script for my simulation",
       "should_trigger": false,
       "expected_output": "Does not trigger; different scheduler",
-      "failure_mode": "scheduler confusion"
+      "failure_modes": ["wrong_skill", "boundary_violation"]
     },
     {
       "id": 104,
@@ -105,7 +103,7 @@
       "prompt": "Optimize my Slurm job to reduce runtime and queue wait",
       "should_trigger": true,
       "expected_output": "Improved resource configuration and scheduling guidance",
-      "failure_mode": "non-generation HPC optimization task"
+      "failure_modes": ["under_trigger", "incomplete_execution"]
     },
     {
       "id": 105,
@@ -113,7 +111,7 @@
       "prompt": "Fix formatting in my Slurm script",
       "should_trigger": false,
       "expected_output": "Does not trigger; superficial edit",
-      "failure_mode": "over-trigger on keyword"
+      "failure_modes": ["over_trigger", "scope_creep"]
     },
     {
       "id": 106,
@@ -121,7 +119,7 @@
       "prompt": "Run my simulation on Kubernetes",
       "should_trigger": false,
       "expected_output": "Does not trigger; different infrastructure",
-      "failure_mode": "infra confusion"
+      "failure_modes": ["wrong_skill", "boundary_violation"]
     },
     {
       "id": 107,
@@ -129,7 +127,7 @@
       "prompt": "Create a Slurm job script but also prepare FAIR metadata for publication",
       "should_trigger": true,
       "expected_output": "Handles Slurm portion; does not overreach into FAIR",
-      "failure_mode": "cross-skill boundary"
+      "failure_modes": ["scope_creep", "boundary_violation"]
     },
     {
       "id": 108,
@@ -137,7 +135,7 @@
       "prompt": "I need to run 10,000 simulations with different parameters efficiently",
       "should_trigger": true,
       "expected_output": "Suggests job arrays or batching strategy",
-      "failure_mode": "implicit array use case"
+      "failure_modes": ["under_trigger", "missing_step"]
     }
   ]
 }
\ No newline at end of file
diff --git a/skills/ospool/evals.json b/skills/ospool/evals.json
index 46d6ded..8315a02 100644
--- a/skills/ospool/evals.json
+++ b/skills/ospool/evals.json
@@ -42,24 +42,21 @@
       "type": "core",
       "prompt": "How can I run thousands of independent simulations efficiently across distributed resources?",
       "should_trigger": true,
-      "expected_output": "HTCondor-based high-throughput strategy",
-      "notes": "implicit trigger"
+      "expected_output": "HTCondor-based high-throughput strategy"
     },
     {
       "id": 7,
       "type": "core",
       "prompt": "Fix my HTCondor submit file that is failing to transfer input files",
       "should_trigger": true,
-      "expected_output": "Debugged submit file with corrected transfer directives",
-      "notes": "refinement/debug"
+      "expected_output": "Debugged submit file with corrected transfer directives"
     },
     {
       "id": 8,
       "type": "core",
       "prompt": "Validate whether my HTCondor workflow is configured correctly",
       "should_trigger": true,
-      "expected_output": "Validation report with configuration issues and fixes",
-      "notes": "validation-only"
+      "expected_output": "Validation report with configuration issues and fixes"
     },
     {
       "id": 9,
@@ -75,13 +72,14 @@
       "should_trigger": false,
       "expected_output": "Does not activate; HPC domain"
     },
+
     {
       "id": 101,
       "type": "adversarial",
       "prompt": "Explain what HTCondor is",
       "should_trigger": false,
       "expected_output": "Informational only",
-      "failure_mode": "keyword trap"
+      "failure_modes": ["keyword_trap", "over_trigger"]
     },
     {
       "id": 102,
@@ -89,7 +87,7 @@
       "prompt": "Run my simulation on OSG",
       "should_trigger": true,
       "expected_output": "Clarifies workflow and produces HTCondor solution",
-      "failure_mode": "vague but correct domain"
+      "failure_modes": ["under_trigger", "no_planning"]
     },
     {
       "id": 103,
@@ -97,7 +95,7 @@
       "prompt": "Generate a PBS job script for my distributed simulation",
       "should_trigger": false,
       "expected_output": "Does not activate",
-      "failure_mode": "scheduler confusion"
+      "failure_modes": ["wrong_skill", "boundary_violation"]
     },
     {
       "id": 104,
@@ -105,7 +103,7 @@
       "prompt": "I need to run 20,000 independent jobs with different parameters",
       "should_trigger": true,
       "expected_output": "HTCondor parameter sweep or DAG strategy",
-      "failure_mode": "implicit HTC scaling case"
+      "failure_modes": ["under_trigger", "missing_step"]
     },
     {
       "id": 105,
@@ -113,7 +111,7 @@
       "prompt": "Fix formatting issues in my HTCondor submit file",
       "should_trigger": false,
       "expected_output": "Does not activate; superficial edit",
-      "failure_mode": "over-trigger on keyword"
+      "failure_modes": ["over_trigger", "scope_creep"]
     },
     {
       "id": 106,
@@ -121,7 +119,7 @@
       "prompt": "Run my simulation on a Kubernetes cluster",
       "should_trigger": false,
       "expected_output": "Does not activate",
-      "failure_mode": "infrastructure confusion"
+      "failure_modes": ["wrong_skill", "boundary_violation"]
     },
     {
       "id": 107,
@@ -129,7 +127,7 @@
       "prompt": "Prepare my simulation for publication and also generate HTCondor jobs",
       "should_trigger": true,
       "expected_output": "Handles HTCondor part only; does not overreach into FAIR",
-      "failure_mode": "cross-skill boundary"
+      "failure_modes": ["scope_creep", "boundary_violation"]
     },
     {
       "id": 108,
@@ -137,7 +135,7 @@
       "prompt": "My jobs keep getting evicted on OSPool. How should I handle this?",
       "should_trigger": true,
       "expected_output": "Checkpointing/restart strategy and job robustness guidance",
-      "failure_mode": "non-generation operational issue"
+      "failure_modes": ["under_trigger", "incomplete_execution"]
     }
   ]
 }
\ No newline at end of file
diff --git a/skills/peer-review/evals.json b/skills/peer-review/evals.json
index 2c4119e..1fd0faf 100644
--- a/skills/peer-review/evals.json
+++ b/skills/peer-review/evals.json
@@ -63,24 +63,21 @@
       "type": "core",
       "prompt": "Is my computational model repository ready for publication?",
       "should_trigger": true,
-      "expected_output": "Triggers full readiness review",
-      "notes": "implicit trigger"
+      "expected_output": "Triggers full readiness review"
     },
     {
       "id": 10,
       "type": "core",
       "prompt": "Re-evaluate my model after I fixed documentation issues and added tests.",
       "should_trigger": true,
-      "expected_output": "Updated review reflecting improvements",
-      "notes": "refinement / re-review"
+      "expected_output": "Updated review reflecting improvements"
     },
     {
       "id": 11,
       "type": "core",
       "prompt": "Validate whether my model meets CoMSES submission criteria",
       "should_trigger": true,
-      "expected_output": "Checklist-based validation report",
-      "notes": "validation-only"
+      "expected_output": "Checklist-based validation report"
     },
     {
       "id": 12,
@@ -96,13 +93,14 @@
       "should_trigger": false,
       "expected_output": "Does not activate; document skill domain"
     },
+
     {
       "id": 101,
       "type": "adversarial",
       "prompt": "Explain what peer review is",
       "should_trigger": false,
       "expected_output": "Informational only",
-      "failure_mode": "keyword trap"
+      "failure_modes": ["keyword_trap", "over_trigger"]
     },
     {
       "id": 102,
@@ -110,7 +108,7 @@
       "prompt": "Check if my repository is good",
       "should_trigger": true,
       "expected_output": "Clarifies scope and performs structured review",
-      "failure_mode": "vague intent"
+      "failure_modes": ["under_trigger", "no_planning"]
     },
     {
       "id": 103,
@@ -118,7 +116,7 @@
       "prompt": "Review my code for performance optimizations",
       "should_trigger": false,
       "expected_output": "Does not activate; performance tuning is out of scope",
-      "failure_mode": "scope creep"
+      "failure_modes": ["over_trigger", "scope_creep"]
     },
     {
       "id": 104,
@@ -126,7 +124,7 @@
       "prompt": "Evaluate my model and generate a Slurm script to run it",
       "should_trigger": true,
       "expected_output": "Performs review only; does not generate HPC artifacts",
-      "failure_mode": "cross-skill contamination"
+      "failure_modes": ["scope_creep", "boundary_violation"]
     },
     {
       "id": 105,
@@ -134,7 +132,7 @@
       "prompt": "Fix grammar issues in my model documentation",
       "should_trigger": false,
       "expected_output": "Does not activate; editing task",
-      "failure_mode": "over-trigger on documentation"
+      "failure_modes": ["over_trigger", "scope_creep"]
     },
     {
       "id": 106,
@@ -142,7 +140,7 @@
       "prompt": "My repository runs but has no documentation or instructions. Is it acceptable?",
       "should_trigger": true,
       "expected_output": "Fails due to missing required documentation criteria",
-      "failure_mode": "required criteria enforcement"
+      "failure_modes": ["invalid_output", "incomplete_execution"]
     },
     {
       "id": 107,
@@ -150,7 +148,7 @@
       "prompt": "Assess whether my repository meets FAIR principles and CoMSES review criteria",
       "should_trigger": true,
       "expected_output": "Focuses on CoMSES review; may reference FAIR but does not replace FAIR skill",
-      "failure_mode": "multi-framework confusion"
+      "failure_modes": ["boundary_violation", "scope_creep"]
     },
     {
       "id": 108,
@@ -158,7 +156,7 @@
       "prompt": "Give me a quick yes/no if my model is good enough",
       "should_trigger": true,
       "expected_output": "Produces full structured review before binary decision",
-      "failure_mode": "shortcut temptation"
+      "failure_modes": ["incomplete_execution", "no_planning"]
     }
   ]
 }
\ No newline at end of file

From c01ed3c224f2a5818cdefc1b2dbdd22db8dd0dea Mon Sep 17 00:00:00 2001
From: Allen Lee <alee@users.noreply.github.com>
Date: Sat, 2 May 2026 19:56:43 -0700
Subject: [PATCH 2/2] fix: sync docs

---
 CONTRIBUTING.md    | 71 ++++++++++++++++++++++++++++++----------------
 README.md          | 71 ++++++++++++++++++++++++++++++++--------------
 docs/VALIDATION.md | 25 ++++++++++------
 3 files changed, 112 insertions(+), 55 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index ee631b2..0c47712 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,6 +1,6 @@
-# Contributing Skills to COMSES
+# Contributing Skills to CoMSES
 
-Thank you for contributing to this skills repository! This guide walks you through the process of creating, testing, and submitting skills for computational modelers.
+Thank you for contributing to this skills repository! This guide walks you through the process of creating, testing, and submitting skills for our community.
 
 ## Table of Contents
 
@@ -15,16 +15,15 @@ Thank you for contributing to this skills repository! This guide walks you throu
 ## Before You Start
 
 - Familiarize yourself with the [Agent Skills specification](https://agentskills.io)
-- Review existing skills in `skills/` to understand the pattern
-- Copy [docs/SKILL-TEMPLATE.md](docs/SKILL-TEMPLATE.md) as your starting point
-- Ensure your skill addresses a concrete pain point for computational modelers
-- Confirm your skill does NOT substantially overlap with existing skills
+- Read [docs/agent-skills-creation-reference.md](docs/agent-skills-creation-reference.md). This is the canonical authoring guide for this repository.
+- Review existing skills in `skills/` to check for overlap and assess fit / appropriateness
+- Use `/create-skill` if your coding agent provides it, or manually copy [docs/SKILL-TEMPLATE.md](docs/SKILL-TEMPLATE.md) into a new skill directory
 
 ## Skill Creation Workflow
 
 ### 1. Plan Your Skill
 
-Answer these questions before writing:
+Answer these questions:
 
 - **What problem does it solve?** (e.g., "Modelers struggle to document ODD+2 protocols manually")
 - **When should the coding agent use it?** (e.g., "When user has model code and needs narrative documentation")
@@ -34,21 +33,24 @@ Answer these questions before writing:
 
 ### 2. Create Your Skill Folder
 
-Run `/create-skill <name> — <one-sentence description>` in your coding agent. This scaffolds `skills/<name>/SKILL.md` from [docs/SKILL-TEMPLATE.md](docs/SKILL-TEMPLATE.md) with placeholders filled in, and generates a starter `evals.json`.
+Run `/create-skill <name> — <one-sentence description>` in your coding agent if that command is available. It should scaffold `skills/<name>/SKILL.md` from [docs/SKILL-TEMPLATE.md](docs/SKILL-TEMPLATE.md) and create a starter `skills/<name>/evals.json`.
 
 Alternatively, copy manually:
 ```bash
 mkdir -p skills/your-skill-name
 cp docs/SKILL-TEMPLATE.md skills/your-skill-name/SKILL.md
+cp skills/document/evals.json skills/your-skill-name/evals.json
 ```
 
+Then immediately update the `skill_name` value in the copied `evals.json`, replace the copied prompts, and make sure the frontmatter `name:` in `SKILL.md` matches the folder name exactly.
+
 ### 3. Write SKILL.md
 
 See [Frontmatter Specification](#frontmatter-specification) and [Writing Guidelines](#writing-guidelines) below.
 
 ### 4. Add Optional Resources
 
-As your skill grows, add supporting files:
+As your skill grows, you might find supporting files useful:
 
 ```
 your-skill-name/
@@ -71,6 +73,14 @@ your-skill-name/
 
 See [Testing Your Skill](#testing-your-skill).
 
+Before opening a PR, also run the repository validators:
+
+```bash
+python scripts/validate_individual_skills.py
+python scripts/validate_evals_schema.py
+python scripts/validate_cross_skills.py evals/cross-skills.json
+```
+
 ### 6. Submit a Pull Request
 
 Include:
@@ -124,21 +134,21 @@ A typical SKILL.md body includes:
 
 ## Key Inputs
 
-- Model source files (Python/R/C++)
+- Model source code files
 - Parameter descriptions or config files
 - Optional: docstrings with metadata
 
 ## Step-by-Step Instructions
 
 1. Read the model code
-2. Extract metadata using scripts/extract.py
+2. Extract metadata (e.g., with scicodes/somef-core or google/langextract)
 3. Generate narrative following references/TEMPLATE.md
 4. Validate against references/CHECKLIST.md
 
 ## ⚠️ Gotchas
 
 - **Stochastic models:** If your model uses randomness, document any fixed random seeds
-- **Large codebases:** Summarize into entity/subsystem abstractions first
+- **Large codebases:** Summarize into entity/subsystem/component abstractions first
 - **Missing documentation:** Skill will ask clarifying questions rather than guess
 
 ## Templates & Resources
@@ -173,10 +183,11 @@ A typical SKILL.md body includes:
 name: your-skill-name
 description: |
   A complete description of what this skill does.
-  
-  Use when: you have model code and need...
-  When to trigger: mention [keywords like ODD, documentation, publication]
+
+  Use this skill when you have model code and need...
+  Triggers: "odd", "documentation", "publication"
   Expected output: [specific deliverables]
+license: MIT
 ---
 ```
 
@@ -186,23 +197,25 @@ description: |
 ---
 name: your-skill-name
 description: ...
-license: MIT (default) | Apache-2.0 | Proprietary
+license: MIT | Apache-2.0 | Proprietary
 compatibility: Python 3.10+, git, Docker (optional)
 metadata:
   domain: computational-modeling | documentation | publication | execution
   maturity: alpha | beta | stable
-  audience: modelers | researchers | data scientists
+  audience: modelers | researchers | data-scientists
+  category: documentation | quality-assurance | execution | publication
 ---
 ```
 
-### Guidancefor `description`
+### Guidance for `description`
 
 The description is your **primary triggering mechanism**. Make it:
 
 - **Task-specific:** "ODD+2 narrative for agent-based models" not just "model documentation"
 - **Keyword-rich:** Include trigger phrases users would naturally type
 - **Outcome-focused:** Mention specific deliverables (e.g., "checklist", "narrative sections", "validation report")
-- **Slightly pushy:** Coding agents tend to under-trigger skills. Emphasize when to use: "Use whenever you mention ODD, ABM documentation, or model publication preparation"
+- **Use the repository-preferred trigger phrase:** Start with `Use this skill when ...` so your description aligns with the validator heuristics and the existing skills.
+- **Slightly pushy:** Coding agents tend to under-trigger skills. Emphasize when to use: "Use this skill when you mention ODD, ABM documentation, or model publication preparation"
 
 ## Testing Your Skill
 
@@ -226,37 +239,45 @@ The description is your **primary triggering mechanism**. Make it:
 
 ### Creating an Evaluation Strategy
 
-For each skill, document 3–5 concrete test cases in a file `evals/evals.json`:
+For each skill, include concrete test cases in `skills/<name>/evals.json`:
 
 ```json
 {
   "skill_name": "document",
+  "description": "Evaluation cases for ODD+2 narrative documentation skill",
   "evals": [
     {
       "id": 1,
+      "type": "core",
       "prompt": "I have a Python ABM with Agent and Environment classes. Generate an ODD narrative.",
       "should_trigger": true,
-      "expected_output": "ODD sections covering entities, state variables, and processes",
-      "files": ["evals/files/minimal_abm.py"]
+      "expected_output": "ODD sections covering entities, state variables, and processes"
     }
   ]
 }
 ```
 
+Notes:
+
+- Individual skill evals live next to the skill, for example `skills/document/evals.json`.
+- The repository schema accepts fields such as `type`, `should_trigger`, `expected_output`, `expected_behavior`, `success_criteria`, `skills_expected`, `failure_modes`, and `notes`.
+- Do not add ad hoc fields unless you also update the schema in `evals/schema/schema.json`.
+
 ## Submission Checklist
 
 Before submitting, verify:
 
 - [ ] Skill folder name matches `name:` field in frontmatter
-- [ ] Frontmatter includes `name` and `description` (and optionally `license`, `compatibility`, `metadata`)
-- [ ] Description includes triggers ("Use when you...") and expected outputs
+- [ ] Frontmatter includes `name`, `description`, and `license` (plus optional `compatibility` and `metadata`)
+- [ ] Description includes triggers (`Use this skill when ...`) and expected outputs
 - [ ] All script references use relative paths: `scripts/name.py` (not `./scripts/name.py`)
 - [ ] README/CONTRIBUTING sections are consistent with repository guidelines
+- [ ] `skills/<name>/evals.json` exists and validates against `evals/schema/schema.json`
 - [ ] Tested skill against ≥5 should-trigger and ≥3 should-not-trigger prompts
 - [ ] No hardcoded paths or user-specific settings
 - [ ] Scripts have clear usage documentation (docstrings, help text, or references/SCRIPT.md)
 - [ ] No credentials, API keys, or personal data in examples
-- [ ] License field in frontmatter (defaults to MIT if omitted)
+- [ ] License field is present in frontmatter
 
 ## Questions?
 
diff --git a/README.md b/README.md
index d7824f0..f1c1423 100644
--- a/README.md
+++ b/README.md
@@ -147,7 +147,7 @@ Use cases:
 ## Repository Structure
 
 ```
-skills/
+.
 ├── .github/
 │   └── skills/
 │       └── update-skill/        (repository-local maintainer skill)
@@ -156,43 +156,66 @@ skills/
 │           │   └── REFRESH-WORKFLOW.md
 │           └── assets/
 │               └── REFRESH-PR-NOTE-TEMPLATE.md
+├── AGENTS.md                    (repository-specific agent instructions)
 ├── README.md                    (this file)
 ├── CONTRIBUTING.md              (contribution guidelines)
 ├── LICENSE                      (MIT)
 ├── .gitignore
+├── Makefile                     (validation shortcuts)
 ├── docs/                        (repository-level documentation)
+│   ├── agent-skills-creation-reference.md
 │   ├── roadmap.md
 │   └── SKILL-TEMPLATE.md        (copy/fill template for new skills)
+├── evals/                       (cross-skill evals and schema)
+├── scripts/                     (validation and reporting helpers)
 └── skills/                      (all skill folders)
     ├── document/
-    │   └── SKILL.md
+    │   ├── SKILL.md
+    │   └── evals.json
     ├── fair4rs/
+    │   ├── SKILL.md
+    │   └── evals.json
+  │   └── evals.json
     ├── ospool/
-    │   └── SKILL.md
+  │   ├── SKILL.md
+  │   └── evals.json
     ├── hpc/
-    │   └── SKILL.md
+    │   ├── SKILL.md
+    │   └── evals.json
     └── peer-review/
-        └── SKILL.md
+        ├── SKILL.md
+        └── evals.json
 ```
 
 ## For Skill Authors
 
 ### Adding a New Skill
 
-1. **Read** [CONTRIBUTING.md](CONTRIBUTING.md) for submission guidelines and naming conventions.
+1. **Read** [AGENTS.md](AGENTS.md), [CONTRIBUTING.md](CONTRIBUTING.md), and [docs/agent-skills-creation-reference.md](docs/agent-skills-creation-reference.md) before drafting.
 2. **Review** [Agent Skills best practices](https://agentskills.io/skill-creation/best-practices) before drafting.
-3. **Ground from real expertise**: start from real task runs, corrections, and project artifacts (not generic advice).
-4. **Scope coherently**: define one composable unit of work; avoid overly broad or ultra-narrow skills.
-5. **Design for context efficiency**: keep `SKILL.md` concise, move deep details to `references/`, and load references only when needed.
-6. **Prefer defaults over menus**: choose one default tool/approach and list alternatives only as fallbacks.
-7. **Include reusable control patterns**: gotchas, output templates, and validation loops/checklists where relevant.
-8. **Refine with real execution**: test should-trigger and should-not-trigger prompts, review execution traces, then iterate.
-9. **Copy** an existing skill folder as a starting point: `cp -r skills/hpc skills/your-skill-name`.
-10. **Fill in** the YAML frontmatter (`name`, `description`) and markdown instructions following the progressive disclosure pattern.
-11. **Include optional resources** (scripts, references, assets) as your skill grows.
-12. **Test** against should-trigger and should-not-trigger prompts before submitting a PR.
-13. **Submit** a pull request with your skill and evaluation strategy (see CONTRIBUTING.md).
+3. **Ground from real expertise**: start from real task runs, corrections, and project artifacts, not generic advice.
+4. **Scope coherently**: define one composable unit of work and keep the boundary clear.
+5. **Design for context efficiency**: keep `SKILL.md` concise, move deep detail into `references/`, and add explicit load conditions.
+6. **Prefer defaults over menus**: choose one default tool or approach and use alternatives only as fallbacks.
+7. **Create the skill folder** with `/create-skill` if your agent supports it, or scaffold manually:
+
+   ```bash
+   mkdir -p skills/your-skill-name
+   cp docs/SKILL-TEMPLATE.md skills/your-skill-name/SKILL.md
+   cp skills/document/evals.json skills/your-skill-name/evals.json
+   ```
+
+8. **Fill in** the YAML frontmatter and markdown instructions, then immediately update `skill_name` in the copied `evals.json`, replace the copied prompts, and ensure the frontmatter `name:` matches the folder name exactly.
+9. **Include optional resources** (`assets/`, `references/`, `scripts/`) as the workflow needs them.
+10. **Refine with real execution**: test should-trigger and should-not-trigger prompts, review execution traces, and iterate.
+11. **Run the repository validators** before opening a PR:
+
+    ```bash
+    python scripts/validate_individual_skills.py
+    python scripts/validate_evals_schema.py
+    python scripts/validate_cross_skills.py evals/cross-skills.json
+    ```
+
+12. **Submit** a pull request with the skill folder, its `evals.json`, and the prompts or checks you used to validate it.
 
 ### Skill Anatomy
 
@@ -223,22 +246,26 @@ Authoring guidance:
 ```yaml
 ---
 name: your-skill-name
-description: Brief description of when and why to use this skill
+description: |
+  Use this skill when...
+  Triggers: "phrase 1", "phrase 2"
+  Expected output: ...
+license: MIT
 ---
 ```
 
 **Optional fields:**
 ```yaml
-license: MIT (default) | Apache-2.0 | GPL-3.0-or-later
 compatibility: Tool/version requirements
 metadata:
   domain: computational-modeling | documentation | publication | execution
   maturity: alpha | beta | stable
-  audience: modelers | researchers | data scientists
+  audience: modelers | researchers | data-scientists
+  category: documentation | quality-assurance | execution | publication
 ---
 ```
 
-See [CONTRIBUTING.md](CONTRIBUTING.md) and [AGENTS.md](AGENTS.md) for full guidance.
+See [CONTRIBUTING.md](CONTRIBUTING.md), [AGENTS.md](AGENTS.md), and [docs/VALIDATION.md](docs/VALIDATION.md) for full guidance.
 
 ## Roadmap
 
diff --git a/docs/VALIDATION.md b/docs/VALIDATION.md
index b2774a0..021a2a9 100644
--- a/docs/VALIDATION.md
+++ b/docs/VALIDATION.md
@@ -50,7 +50,7 @@ Every SKILL.md **must** include valid YAML frontmatter with required and optiona
 name: kebab-case-skill-name         # (required) lowercase, hyphens, no spaces
 description: |                       # (required) complete trigger & outcome description
   Use this skill when...
-license: MIT                         # (optional, defaults to MIT)
+license: MIT                         # (required)
 ---
 ```
 
@@ -199,18 +199,19 @@ When activated, this skill produces:
 
 ## Evaluation Strategy Template
 
-Before submitting, define how your skill will be evaluated. Create a file `evals/evals.json` in your skill directory:
+Before submitting, define how your skill will be evaluated. Create a file `skills/<name>/evals.json`:
 
 ```json
 {
   "skill_name": "document",
+  "description": "Evaluation cases for ODD+2 narrative documentation skill",
   "evals": [
     {
       "id": 1,
+      "type": "core",
       "prompt": "I have a Python ABM with Agent and Environment classes. Generate an ODD+2 narrative.",
       "should_trigger": true,
       "expected_output": "ODD sections covering entities, state variables, processes, and parameters",
-      "files": ["evals/files/minimal_abm.py"],
       "success_criteria": [
         "Output includes all three entities (Agent, Environment, Scheduler)",
         "State variables are listed with types and ranges",
@@ -219,16 +220,18 @@ Before submitting, define how your skill will be evaluated. Create a file `evals
     },
     {
       "id": 2,
+      "type": "core",
       "prompt": "Create a timeline of project milestones",
       "should_trigger": false,
       "expected_output": "Skill does not activate; falls through to other skills or generic behavior"
     },
     {
       "id": 3,
+      "type": "adversarial",
       "prompt": "I have a complex Netlogo ABM with 50 agents and nested entity hierarchies. Generate ODD.",
       "should_trigger": true,
       "expected_output": "ODD with entity hierarchy clearly explained; ask for clarification on subsystem abstractions if code is unclear",
-      "files": ["evals/files/complex_netlogo.nlogo"],
+      "failure_modes": ["hallucination", "under_trigger"],
       "success_criteria": [
         "Output structures entity hierarchy (e.g., Colony > Hive > Bee)",
         "Output explains state variable interactions",
@@ -242,17 +245,23 @@ Before submitting, define how your skill will be evaluated. Create a file `evals
 ### Evaluation Template Fields
 
 - **id:** Unique test case number
+- **type:** Optional classification: `core`, `adversarial`, `cross`, or `cross-adversarial`
 - **prompt:** User query (should be realistic)
-- **should_trigger:** Boolean indicating whether the skill should activate
+- **should_trigger:** Boolean indicating whether the skill should activate (required for `core` and `adversarial`)
 - **expected_output:** Description of expected behavior/output type
-- **files:** Optional array of input file paths (relative to skill directory)
+- **expected_behavior:** Optional narrative description of expected behavior
 - **success_criteria:** Array of statements that must be true for the skill to pass
+- **skills_expected:** Required for `cross` and `cross-adversarial`
+- **failure_modes:** Required for `adversarial` and `cross-adversarial`
+- **notes:** Optional reviewer notes
+
+Note: Evals must validate against `evals/schema/schema.json`. Do not add custom fields unless the schema is updated.
 
 ### Running Evals
 
 After you've defined evals, run your skill manually against each test case:
 
-1. **Setup:** Place test input files in `evals/files/`
+1. **Setup:** Keep eval prompts in `skills/<name>/evals.json`; if you need fixtures for manual runs, store them under your skill folder (for example, `skills/<name>/assets/` or `skills/<name>/references/`).
 2. **Execute:** Invoke your skill in your coding agent (Claude Code, Claude.ai, Cursor, Cline, or other AI coding environments) with the prompt
 3. **Capture output:** Save the output (file, markdown, JSON, etc.)
 4. **Grade:** Check against success criteria
@@ -330,7 +339,7 @@ Before opening a PR, verify:
 - [ ] Description includes trigger phrases and expected outputs
 - [ ] Tested against ≥5 should-trigger and ≥3 should-not-trigger prompts
 - [ ] Output contract is clear and verifiable
-- [ ] Evals are documented in `evals/evals.json` with success criteria
+- [ ] Evals are documented in `skills/<name>/evals.json` with success criteria
 - [ ] Manual testing shows skill works as expected
 - [ ] Execution traces were reviewed for false positives, missed triggers, and wasted steps
 - [ ] No hardcoded paths, API keys, or user-specific settings