tool_comparison_eval.py
"""Tool comparison evaluation.
Compares the performance of purpose-built specific tools vs generic API wrapper tools.
This demonstrates why specific tools lead to better agent performance.
"""
import asyncio
import os
from dotenv import load_dotenv
from braintrust import EvalAsync
import braintrust
from src.agent import WhileLoopAgent, AgentOptions
from src.tools import get_all_tools
from src.generic_tools import get_generic_tools
load_dotenv()
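
# load_dotenv() reads BRAINTRUST_API_KEY from a local .env file; the task
# functions below pass it to the agent via os.getenv().
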
# Test cases for evaluation
test_cases = [
    {
        "input": "Find all premium users and notify them about a new feature launch",
        "expected": {
            "success_criteria": [
                "Found premium users",
                "Sent notifications to premium users",
                "john@co.com",
                "bob@co.com",
            ],
            "required_actions": ["search", "notify"],
        },
        "metadata": {
            "category": "multi-step",
            "difficulty": "medium",
        },
    },
    {
        "input": "Check if jane@co.com is an active subscriber and what plan they have",
        "expected": {
            "success_criteria": ["Jane Doe", "jane@co.com", "active", "basic"],
            "required_actions": ["lookup"],
        },
        "metadata": {
            "category": "single-lookup",
            "difficulty": "easy",
        },
    },
    {
        "input": "Find users with expired subscriptions and send them renewal reminders with a special offer",
        "expected": {
            "success_criteria": ["expired", "Bob Wilson", "renewal", "reminder"],
            "required_actions": ["search", "notify"],
        },
        "metadata": {
            "category": "multi-step",
            "difficulty": "medium",
        },
    },
    {
        "input": "Upgrade jane@co.com to premium plan and send confirmation",
        "expected": {
            "success_criteria": ["upgrade", "premium", "jane@co.com", "confirmation"],
            "required_actions": ["update", "notify"],
        },
        "metadata": {
            "category": "multi-step",
            "difficulty": "medium",
        },
    },
    {
        "input": "List all active users sorted by subscription type",
        "expected": {
            "success_criteria": ["John Smith", "Jane Doe", "active", "premium", "basic"],
            "required_actions": ["search"],
        },
        "metadata": {
            "category": "single-lookup",
            "difficulty": "easy",
        },
    },
]
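
# Each success criterion above is checked as a case-insensitive substring of the
# task output (see task_success_scorer below).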


# Scorer for checking if the agent accomplished the task
def task_success_scorer(output: str, expected: dict) -> dict:
    """Score based on whether success criteria were met."""
    if not expected or not expected.get("success_criteria"):
        return None

    success_criteria = expected["success_criteria"]
    found_criteria = [
        criteria
        for criteria in success_criteria
        if criteria.lower() in output.lower()
    ]
    score = len(found_criteria) / len(success_criteria)

    return {
        "name": "task_success",
        "score": score,
        "metadata": {
            "expected_criteria": success_criteria,
            "found_criteria": found_criteria,
            "missing_criteria": [c for c in success_criteria if c not in found_criteria],
        },
    }


# Scorer for response clarity
def clarity_scorer(output: str) -> dict:
    """Score based on clarity and structure of response."""
    # Check for clear, structured responses
    has_structure = "\n" in output or "•" in output or "-" in output
    has_confirmation = (
        "✓" in output or "successfully" in output.lower() or "completed" in output.lower()
    )
    is_verbose = len(output) > 1000
    has_json = "{" in output and "}" in output
    has_raw_data = (
        "query_id" in output
        or "request_id" in output
        or "transaction_id" in output
        or "execution_time_ms" in output
    )
    has_error = "Error:" in output or "error" in output.lower()

    score = 0.5
    if (
        has_structure
        and has_confirmation
        and not is_verbose
        and not has_json
        and not has_raw_data
        and not has_error
    ):
        score = 1.0
    elif (has_structure or has_confirmation) and not has_error:
        score = 0.7
    elif has_json or is_verbose or has_raw_data or has_error:
        score = 0.3

    return {
        "name": "clarity",
        "score": score,
        "metadata": {
            "has_structure": has_structure,
            "has_confirmation": has_confirmation,
            "is_verbose": is_verbose,
            "has_json": has_json,
            "has_raw_data": has_raw_data,
            "has_error": has_error,
            "response_length": len(output),
        },
    }

# Same system prompt for both evaluations
SYSTEM_PROMPT = """You are a customer service assistant. Help users manage customer accounts and subscriptions.
When asked to find and notify users:
- First find the relevant users
- Then send notifications to each user
- Be specific about what actions you're taking
- Provide clear confirmation of completed tasks"""
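
# The two task functions below differ only in the tool set handed to the agent
# (get_all_tools vs get_generic_tools); the system prompt, iteration limit, and
# API key are identical, so score differences between the experiments isolate
# the effect of tool design.
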
# Task function for specific tools
async def run_with_specific_tools(input_text: str) -> str:
    """Run agent with purpose-built specific tools."""
    agent = WhileLoopAgent(
        AgentOptions(
            tools=get_all_tools(),
            system_prompt=SYSTEM_PROMPT,
            max_iterations=10,
            openai_api_key=os.getenv("BRAINTRUST_API_KEY"),
        )
    )
    return await agent.run(input_text)

# Task function for generic tools
async def run_with_generic_tools(input_text: str) -> str:
    """Run agent with generic API wrapper tools."""
    agent = WhileLoopAgent(
        AgentOptions(
            tools=get_generic_tools(),
            system_prompt=SYSTEM_PROMPT,
            max_iterations=10,
            openai_api_key=os.getenv("BRAINTRUST_API_KEY"),
        )
    )
    return await agent.run(input_text)

async def main_async():
    """Run the tool comparison evaluation."""
    # Initialize Braintrust
    braintrust.init(project="canonical-agent-customer-service")

    # Evaluation with specific tools
    await EvalAsync(
        "canonical-agent-customer-service",
        experiment_name="specific-tools",
        data=test_cases,
        task=run_with_specific_tools,
        scores=[task_success_scorer, clarity_scorer],
        metadata={
            "description": "Evaluation using purpose-built, specific tools",
            "tool_type": "specific",
        },
    )

    # Evaluation with generic tools
    await EvalAsync(
        "canonical-agent-customer-service",
        experiment_name="generic-tools",
        data=test_cases,
        task=run_with_generic_tools,
        scores=[task_success_scorer, clarity_scorer],
        metadata={
            "description": "Evaluation using generic API wrapper tools",
            "tool_type": "generic",
        },
    )

    print("✅ Tool comparison evaluation complete!")
    print("View results at: https://www.braintrust.dev/app")

if __name__ == "__main__":
# Run with proper async context to avoid cleanup errors
asyncio.run(main_async())
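
To reproduce the comparison, run the file directly (python tool_comparison_eval.py) after installing the project dependencies and putting BRAINTRUST_API_KEY in a local .env file; both experiments are then logged to the canonical-agent-customer-service project in Braintrust. For a quick look at a single scenario without creating full experiments, the task functions can also be called on their own. A minimal sketch, assuming the script is importable from the repository root (the quick_check.py name is illustrative):

# quick_check.py (illustrative helper, not part of the repository)
import asyncio

from tool_comparison_eval import run_with_generic_tools, run_with_specific_tools, test_cases


async def main():
    # Compare both tool sets on the first test prompt only.
    prompt = test_cases[0]["input"]
    print("Specific tools:\n", await run_with_specific_tools(prompt))
    print("Generic tools:\n", await run_with_generic_tools(prompt))


asyncio.run(main())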