NousResearch · steezkelly · Apr 25, 2026 · Apr 25, 2026
diff --git a/evolution/core/config.py b/evolution/core/config.py
@@ -23,7 +23,7 @@ class EvolutionConfig:
     judge_model: str = "openai/gpt-4.1"  # Model for dataset generation
 
     # Constraints
-    max_skill_size: int = 15_000  # 15KB default
+    max_skill_size: int = 50_000  # 50KB default — evolved skills may include few-shot examples
     max_tool_desc_size: int = 500  # chars
     max_param_desc_size: int = 200  # chars
     max_prompt_growth: float = 0.2  # 20% max growth over baseline

diff --git a/evolution/core/constraints.py b/evolution/core/constraints.py
@@ -4,6 +4,7 @@
 considered valid. Failed constraints = immediate rejection.
 """
 
+import re
 import subprocess
 from pathlib import Path
 from dataclasses import dataclass
@@ -148,27 +149,65 @@ def _check_non_empty(self, text: str) -> ConstraintResult:
             )
 
     def _check_skill_structure(self, text: str) -> ConstraintResult:
-        """Check that a skill file has valid YAML frontmatter and markdown body."""
+        """Check that a skill file has valid YAML frontmatter AND a substantive body.
+
+        Frontmatter validation (YAML between --- markers):
+        - Must start with --- and be closed by a second --- marker
+        - Must contain 'name:' and 'description:' in the first 500 characters
+
+        Body validation (markdown after the closing --- marker):
+        - Must have at least 2 of 3: headings, procedural content, substantial length
+        This allows varied skill formats while ensuring meaningful content.
+
+        Returns a ConstraintResult named "skill_structure"; when it fails, the
+        message lists every missing frontmatter piece and failed body check.
+        """
         has_frontmatter = text.strip().startswith("---")
         has_name = "name:" in text[:500] if has_frontmatter else False
         has_description = "description:" in text[:500] if has_frontmatter else False
 
-        if has_frontmatter and has_name and has_description:
+        # Split off the body. An unterminated frontmatter block is invalid and
+        # leaves no body, so frontmatter lines can never satisfy the body checks.
+        body = text
+        has_closing = False
+        if has_frontmatter:
+            parts = text.split("---", 2)
+            has_closing = len(parts) >= 3
+            body = parts[2].strip() if has_closing else ""
+        frontmatter_ok = has_frontmatter and has_closing and has_name and has_description
+
+        # Body must have ≥2 of 3: headings, procedural content, substantial length
+        lowered = body.lower()
+        markers = ("step", "1.", "procedure", "how to", "instructions")
+        body_checks = {
+            "headings": bool(re.search(r"^#+\s", body, re.MULTILINE)),
+            "procedural content": any(marker in lowered for marker in markers),
+            "substantial content": len(body.strip()) > 100,
+        }
+        body_passed = sum(body_checks.values()) >= 2
+
+        if frontmatter_ok and body_passed:
             return ConstraintResult(
                 passed=True,
                 constraint_name="skill_structure",
-                message="Skill has valid frontmatter (name + description)",
-            )
-        else:
-            missing = []
-            if not has_frontmatter:
-                missing.append("YAML frontmatter (---)")
-            if not has_name:
-                missing.append("name field")
-            if not has_description:
-                missing.append("description field")
-            return ConstraintResult(
-                passed=False,
-                constraint_name="skill_structure",
-                message=f"Skill missing: {', '.join(missing)}",
+                message="Skill has valid frontmatter (name + description) and substantive body",
             )
+
+        missing = []
+        if not has_frontmatter:
+            missing.append("YAML frontmatter (---)")
+        elif not has_closing:
+            missing.append("closing frontmatter delimiter (---)")
+        if not has_name:
+            missing.append("name field")
+        if not has_description:
+            missing.append("description field")
+        if not body_passed:
+            failed_checks = [k for k, v in body_checks.items() if not v]
+            missing.append(f"body lacks: {', '.join(failed_checks)}")
+
+        return ConstraintResult(
+            passed=False,
+            constraint_name="skill_structure",
+            message=f"Skill missing: {', '.join(missing)}",
+        )
diff --git a/evolution/core/dataset_builder.py b/evolution/core/dataset_builder.py
@@ -6,17 +6,96 @@
 C) Golden sets — hand-curated JSONL files
 """
 
+import ast
 import json
 import random
+import re
 from pathlib import Path
 from dataclasses import dataclass, field
 from typing import Optional
 
 import dspy
+import os
 
 from evolution.core.config import EvolutionConfig
 
 
+def _try_parse_json(text: str) -> Optional[list]:
+    """Parse a list out of LLM output, trying progressively looser repairs.
+
+    LLMs frequently produce malformed JSON: trailing commas, single quotes,
+    text wrapped in markdown fences, etc. Each repaired variant of the text
+    is parsed as a whole and, failing that, via the outermost ``[{...}]``
+    span inside it, so repairs also work when prose surrounds the array.
+
+    Args:
+        text: Raw model output expected to contain a JSON array of objects.
+            None or an empty string is treated as unparseable.
+
+    Returns:
+        The first list recovered, or None when nothing parses. As a last
+        resort, every flat ``{...}`` object that is valid JSON on its own is
+        collected into a list, so the result may be a partial salvage of a
+        truncated or badly broken array.
+
+    Example:
+        >>> _try_parse_json("Sure! [{'q': 'hi',}] Hope that helps.")
+        [{'q': 'hi'}]
+    """
+    text = (text or "").strip()
+    if not text:
+        return None
+
+    def _first_list(candidate: str) -> Optional[list]:
+        # Try the whole candidate, then the array span embedded in it.
+        attempts = [candidate]
+        span = re.search(r'\[\s*\{.*\}\s*\]', candidate, re.DOTALL)
+        if span:
+            attempts.append(span.group())
+        for attempt in attempts:
+            # Strict JSON first; literal_eval then covers Python-style
+            # output (single quotes, True/False/None) without executing it.
+            for parser in (json.loads, ast.literal_eval):
+                try:
+                    result = parser(attempt)
+                except (ValueError, SyntaxError, TypeError,
+                        MemoryError, RecursionError):
+                    # literal_eval is documented to raise all of these.
+                    continue
+                if isinstance(result, list):
+                    return result
+        return None
+
+    # Variants ordered from untouched to most heavily rewritten, so a clean
+    # payload is never altered by a repair regex it did not need.
+    # Drop commas that directly precede a closing } or ].
+    no_trailing_commas = re.sub(r',\s*([}\]])', r'\1', text)
+    # Heuristic: requote single-quoted keys/values as JSON strings.
+    requoted = re.sub(
+        r"(?<!')\'([^']+?)'(?=\s*[:,\]\}])", r'"\1"', no_trailing_commas
+    )
+    # Markdown code fences removed from the untouched text.
+    unfenced = re.sub(r'^```(?:json)?\s*', '', text, flags=re.MULTILINE)
+    unfenced = re.sub(r'\s*```$', '', unfenced)
+
+    for variant in (text, no_trailing_commas, requoted, unfenced):
+        parsed = _first_list(variant)
+        if parsed is not None:
+            return parsed
+
+    # Last resort: salvage flat {...} objects one by one. Each parses to a
+    # dict, so they are gathered into a list rather than type-checked as one.
+    salvaged = []
+    for block_match in re.finditer(r'\{[^{}]*\}', text):
+        try:
+            obj = json.loads(block_match.group())
+        except json.JSONDecodeError:
+            continue
+        if isinstance(obj, dict):
+            salvaged.append(obj)
+    return salvaged or None
+
+
 @dataclass
 class EvalExample:
     """A single evaluation example."""
@@ -123,7 +202,7 @@ def generate(
         n = num_cases or self.config.eval_dataset_size
 
         # Configure DSPy to use the judge model for generation
-        lm = dspy.LM(self.config.judge_model)
+        lm = dspy.LM(self.config.judge_model, api_base=os.getenv("OPENROUTER_BASE_URL")) if os.getenv("OPENROUTER_BASE_URL") else dspy.LM(self.config.judge_model)
 
         with dspy.context(lm=lm):
             result = self.generator(
@@ -132,17 +211,10 @@ def generate(
                 num_cases=n,
             )
 
-        # Parse the generated test cases
-        try:
-            cases_raw = json.loads(result.test_cases)
-        except json.JSONDecodeError:
-            # Try to extract JSON from the response
-            import re
-            match = re.search(r'\[.*\]', result.test_cases, re.DOTALL)
-            if match:
-                cases_raw = json.loads(match.group())
-            else:
-                raise ValueError(f"Could not parse test cases from LLM output: {result.test_cases[:200]}")
+        # Parse the generated test cases using robust multi-strategy parser
+        cases_raw = _try_parse_json(result.test_cases)
+        if cases_raw is None:
+            raise ValueError(f"Could not parse test cases from LLM output: {result.test_cases[:500]}")
 
         examples = [
             EvalExample(

diff --git a/evolution/core/external_importers.py b/evolution/core/external_importers.py
@@ -30,6 +30,7 @@
 
 import click
 import dspy
+import os
 from rich.console import Console
 from rich.progress import Progress
 
@@ -490,7 +491,7 @@ def filter_and_score(
         # Stage 2: LLM relevance scoring
         examples = []
         errors = 0
-        lm = dspy.LM(self.model)
+        lm = dspy.LM(self.model, api_base=os.getenv("OPENROUTER_BASE_URL")) if os.getenv("OPENROUTER_BASE_URL") else dspy.LM(self.model)
 
         with Progress() as progress:
             task = progress.add_task("Scoring relevance...", total=len(candidates))

diff --git a/evolution/core/fitness.py b/evolution/core/fitness.py
@@ -5,6 +5,7 @@
 """
 
 import dspy
+import os
 from dataclasses import dataclass
 from typing import Optional
 
@@ -72,7 +73,7 @@ def score(
     ) -> FitnessScore:
         """Score an agent output using LLM-as-judge."""
 
-        lm = dspy.LM(self.config.eval_model)
+        lm = dspy.LM(self.config.eval_model, api_base=os.getenv("OPENROUTER_BASE_URL")) if os.getenv("OPENROUTER_BASE_URL") else dspy.LM(self.config.eval_model)
 
         with dspy.context(lm=lm):
             result = self.judge(
@@ -104,10 +105,10 @@ def score(
         )
 
 
-def skill_fitness_metric(example: dspy.Example, prediction: dspy.Prediction, trace=None) -> float:
+def skill_fitness_metric(example: dspy.Example, prediction: dspy.Prediction, trace=None, pred_name=None, pred_trace=None) -> float:
     """DSPy-compatible metric function for skill optimization.
 
-    This is what gets passed to dspy.GEPA(metric=...).
+    Accepts 5 args for GEPA compatibility: (gold, pred, trace, pred_name, pred_trace).
     Returns a float 0-1 score.
     """
     # The prediction should have an 'output' field with the agent's response