Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion evolution/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class EvolutionConfig:
judge_model: str = "openai/gpt-4.1" # Model for dataset generation

# Constraints
max_skill_size: int = 15_000 # 15KB default
max_skill_size: int = 50_000 # 50KB default — evolved skills may include few-shot examples
max_tool_desc_size: int = 500 # chars
max_param_desc_size: int = 200 # chars
max_prompt_growth: float = 0.2 # 20% max growth over baseline
Expand Down
71 changes: 55 additions & 16 deletions evolution/core/constraints.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
considered valid. Failed constraints = immediate rejection.
"""

import re
import subprocess
from pathlib import Path
from dataclasses import dataclass
Expand Down Expand Up @@ -148,27 +149,65 @@ def _check_non_empty(self, text: str) -> ConstraintResult:
)

def _check_skill_structure(self, text: str) -> ConstraintResult:
"""Check that a skill file has valid YAML frontmatter and markdown body."""
"""Check that a skill file has valid YAML frontmatter AND a substantive body.

Frontmatter validation (YAML between --- markers):
- Must start with ---
- Must contain 'name:' field
- Must contain 'description:' field

Body validation (markdown after frontmatter):
- Must have at least 2 of 3: headings, procedural content, substantial length
This allows varied skill formats while ensuring meaningful content.
"""
has_frontmatter = text.strip().startswith("---")
has_name = "name:" in text[:500] if has_frontmatter else False
has_description = "description:" in text[:500] if has_frontmatter else False

if has_frontmatter and has_name and has_description:
frontmatter_ok = has_frontmatter and has_name and has_description

# Separate body from frontmatter for body validation
body = text
if has_frontmatter:
parts = text.split("---", 2)
if len(parts) >= 3:
body = parts[2].strip()

# Body must have ≥2 of 3: headings, procedural content, substantial length
has_headings = bool(re.search(r"^#+\s", body, re.MULTILINE))
has_steps = any(
marker in body.lower()
for marker in ["step", "1.", "procedure", "how to", "instructions"]
)
has_content = len(body.strip()) > 100

body_checks = {
"headings": has_headings,
"procedural content": has_steps,
"substantial content": has_content,
}
body_passed = sum(body_checks.values()) >= 2

if frontmatter_ok and body_passed:
return ConstraintResult(
passed=True,
constraint_name="skill_structure",
message="Skill has valid frontmatter (name + description)",
)
else:
missing = []
if not has_frontmatter:
missing.append("YAML frontmatter (---)")
if not has_name:
missing.append("name field")
if not has_description:
missing.append("description field")
return ConstraintResult(
passed=False,
constraint_name="skill_structure",
message=f"Skill missing: {', '.join(missing)}",
message="Skill has valid frontmatter (name + description) and substantive body",
)

missing = []
if not has_frontmatter:
missing.append("YAML frontmatter (---)")
if not has_name:
missing.append("name field")
if not has_description:
missing.append("description field")
if not body_passed:
failed_checks = [k for k, v in body_checks.items() if not v]
missing.append(f"body lacks: {', '.join(failed_checks)}")

return ConstraintResult(
passed=False,
constraint_name="skill_structure",
message=f"Skill missing: {', '.join(missing)}",
)
96 changes: 84 additions & 12 deletions evolution/core/dataset_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,96 @@
C) Golden sets — hand-curated JSONL files
"""

import ast
import json
import random
import re
from pathlib import Path
from dataclasses import dataclass, field
from typing import Optional

import dspy
import os

from evolution.core.config import EvolutionConfig


def _try_parse_json(text: str) -> list:
"""Parse JSON with multiple fallback strategies for LLM output.

LLMs frequently produce malformed JSON: trailing commas, single quotes,
text wrapped in markdown fences, etc. This tries progressively more
aggressive fixes before giving up.
"""
text = text.strip()

# Strategy 1: Direct parse
try:
result = json.loads(text)
if isinstance(result, list):
return result
except json.JSONDecodeError:
pass

# Strategy 2: Python literal_eval — handles single-quoted dicts/strings
try:
result = ast.literal_eval(text)
if isinstance(result, list):
return result
except (ValueError, SyntaxError):
pass

# Strategy 3: Extract JSON array from surrounding text
match = re.search(r'\[\s*\{.*\}\s*\]', text, re.DOTALL)
if match:
try:
result = json.loads(match.group())
if isinstance(result, list):
return result
except json.JSONDecodeError:
pass

# Strategy 4: Try literal_eval on extracted candidate
if match:
try:
result = ast.literal_eval(match.group())
if isinstance(result, list):
return result
except (ValueError, SyntaxError):
pass

# Strategy 5: Fix trailing commas, then parse
fixed = re.sub(r',\s*([}\]])', r'\1', text)
fixed = re.sub(r"(?<!')\'([^']+?)'(?=\s*[:,\]\}])", r'"\1"', fixed)
try:
result = json.loads(fixed)
if isinstance(result, list):
return result
except json.JSONDecodeError:
pass

# Strategy 6: Strip markdown code fences
stripped = re.sub(r'^```(?:json)?\s*', '', text, flags=re.MULTILINE)
stripped = re.sub(r'\s*```$', '', stripped)
try:
result = json.loads(stripped)
if isinstance(result, list):
return result
except json.JSONDecodeError:
pass

# Last resort: extract all {...} blocks and try each
for block_match in re.finditer(r'\{[^{}]*\}', text):
try:
result = json.loads(block_match.group())
if isinstance(result, list):
return result
except json.JSONDecodeError:
continue

return None


@dataclass
class EvalExample:
"""A single evaluation example."""
Expand Down Expand Up @@ -123,7 +202,7 @@ def generate(
n = num_cases or self.config.eval_dataset_size

# Configure DSPy to use the judge model for generation
lm = dspy.LM(self.config.judge_model)
lm = dspy.LM(self.config.judge_model, api_base=os.getenv("OPENROUTER_BASE_URL")) if os.getenv("OPENROUTER_BASE_URL") else dspy.LM(self.config.judge_model)

with dspy.context(lm=lm):
result = self.generator(
Expand All @@ -132,17 +211,10 @@ def generate(
num_cases=n,
)

# Parse the generated test cases
try:
cases_raw = json.loads(result.test_cases)
except json.JSONDecodeError:
# Try to extract JSON from the response
import re
match = re.search(r'\[.*\]', result.test_cases, re.DOTALL)
if match:
cases_raw = json.loads(match.group())
else:
raise ValueError(f"Could not parse test cases from LLM output: {result.test_cases[:200]}")
# Parse the generated test cases using robust multi-strategy parser
cases_raw = _try_parse_json(result.test_cases)
if cases_raw is None:
raise ValueError(f"Could not parse test cases from LLM output: {result.test_cases[:500]}")

examples = [
EvalExample(
Expand Down
3 changes: 2 additions & 1 deletion evolution/core/external_importers.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@

import click
import dspy
import os
from rich.console import Console
from rich.progress import Progress

Expand Down Expand Up @@ -490,7 +491,7 @@ def filter_and_score(
# Stage 2: LLM relevance scoring
examples = []
errors = 0
lm = dspy.LM(self.model)
lm = dspy.LM(self.model, api_base=os.getenv("OPENROUTER_BASE_URL")) if os.getenv("OPENROUTER_BASE_URL") else dspy.LM(self.model)

with Progress() as progress:
task = progress.add_task("Scoring relevance...", total=len(candidates))
Expand Down
7 changes: 4 additions & 3 deletions evolution/core/fitness.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"""

import dspy
import os
from dataclasses import dataclass
from typing import Optional

Expand Down Expand Up @@ -72,7 +73,7 @@ def score(
) -> FitnessScore:
"""Score an agent output using LLM-as-judge."""

lm = dspy.LM(self.config.eval_model)
lm = dspy.LM(self.config.eval_model, api_base=os.getenv("OPENROUTER_BASE_URL")) if os.getenv("OPENROUTER_BASE_URL") else dspy.LM(self.config.eval_model)

with dspy.context(lm=lm):
result = self.judge(
Expand Down Expand Up @@ -104,10 +105,10 @@ def score(
)


def skill_fitness_metric(example: dspy.Example, prediction: dspy.Prediction, trace=None) -> float:
def skill_fitness_metric(example: dspy.Example, prediction: dspy.Prediction, trace=None, pred_name=None, pred_trace=None) -> float:
"""DSPy-compatible metric function for skill optimization.

This is what gets passed to dspy.GEPA(metric=...).
Accepts 5 args for GEPA compatibility: (gold, pred, trace, pred_name, pred_trace).
Returns a float 0-1 score.
"""
# The prediction should have an 'output' field with the agent's response
Expand Down
Loading