Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions agents/__tests__/evaluation/conversationRunner.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { executeChatGraphWithStream, IStreamCallback } from '@/agents/graph/chatGraph';
import { ISearchResult } from '@/agents/graph/state';
import { IConversationTurn } from './evaluator';

export interface IConversationScenario {
Expand Down Expand Up @@ -32,22 +33,26 @@ export const runConversation = async (
const callbacks = createNoopCallbacks();

const messages: Array<{ role: string; content: string }> = [];
let lastSearchResults: ISearchResult | null = null;

for (const turn of scenario.turns) {
messages.push({ role: 'user', content: turn.userMessage });
conversation.push({ role: 'user', content: turn.userMessage });

const response = await executeChatGraphWithStream(
const result = await executeChatGraphWithStream(
sessionId,
scenario.locale,
messages,
callbacks
callbacks,
lastSearchResults
);

messages.push({ role: 'assistant', content: response });
conversation.push({ role: 'assistant', content: response });
lastSearchResults = result.lastSearchResults;

if (turn.validateResponse && !turn.validateResponse(response)) {
messages.push({ role: 'assistant', content: result.response });
conversation.push({ role: 'assistant', content: result.response });

if (turn.validateResponse && !turn.validateResponse(result.response)) {
return {
scenario,
conversation,
Expand Down
23 changes: 23 additions & 0 deletions agents/__tests__/evaluation/evaluator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -164,3 +164,26 @@ export const defaultChatCriteria: IEvaluationCriteria[] = [
weight: 1,
},
];

/**
 * Evaluation criteria used when grading product-details conversations.
 *
 * Four criteria are defined: Accuracy (weight 3), Completeness (weight 2),
 * Reference Understanding (weight 3) and Natural Language (weight 1).
 * Each compact `[name, description, weight]` row below is expanded into a
 * `{ name, description, weight }` object.
 */
export const defaultProductDetailsCriteria: IEvaluationCriteria[] = ((): IEvaluationCriteria[] => {
  const rows: Array<[string, string, number]> = [
    ['Accuracy', 'Does the assistant provide accurate product details?', 3],
    ['Completeness', 'Does the response include relevant specifications?', 2],
    [
      'Reference Understanding',
      'Does the assistant correctly identify which product the user is asking about?',
      3,
    ],
    ['Natural Language', 'Is the response natural and easy to understand?', 1],
  ];

  return rows.map(([name, description, weight]) => ({ name, description, weight }));
})();
183 changes: 183 additions & 0 deletions agents/__tests__/evaluation/productDetails.e2e.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
import { describe, it, expect, beforeAll, afterAll } from 'vitest';
import {
evaluateConversation,
defaultProductDetailsCriteria,
IEvaluationResult,
IConversationTurn,
} from './evaluator';
import { runConversation, IConversationScenario } from './conversationRunner';
import { clearLastRunDirectory, saveFailedTest } from './testResultsReporter';
import { setupTestProducts, teardownTestProducts } from './testFixtures';

/** Lowest evaluation score a scenario may receive and still be treated as passing. */
const MINIMUM_PASSING_SCORE = 3.5;

// File-level setup: runs once before any suite in this file. It calls
// clearLastRunDirectory() and then waits for setupTestProducts(); the hook is
// allowed up to 60 seconds.
beforeAll(async function prepareEvaluationRun() {
  clearLastRunDirectory();
  await setupTestProducts();
}, 60_000);

// File-level cleanup: waits for teardownTestProducts() once every suite has
// finished; the hook is allowed up to 30 seconds.
afterAll(async function cleanUpEvaluationRun() {
  await teardownTestProducts();
}, 30_000);

/**
 * Scenarios in which the user asks for details about a product: by list
 * position, by full name, in Polish, for a product name the scenario expects
 * not to be found, and by partial name.
 *
 * Each scenario is written as (name, locale, user messages, expected
 * behavior); the local builder turns every message into a `{ userMessage }` turn.
 */
const productDetailsScenarios: IConversationScenario[] = ((): IConversationScenario[] => {
  const buildScenario = (
    name: string,
    locale: IConversationScenario['locale'],
    userMessages: string[],
    expectedBehavior: string
  ): IConversationScenario => ({
    name,
    locale,
    turns: userMessages.map((userMessage) => ({ userMessage })),
    expectedBehavior,
  });

  return [
    buildScenario(
      'Product details by position',
      'en',
      ['Show me laptops', 'What are the specs of the first one?'],
      'After showing laptops, the assistant should provide detailed specifications of the first laptop including RAM, processor, storage from attributes or description.'
    ),
    buildScenario(
      'Product details by name',
      'en',
      ['Tell me about Gaming Laptop Pro X1'],
      'The assistant should provide detailed information about the Gaming Laptop Pro X1 including specifications like RAM, GPU, and storage.'
    ),
    buildScenario(
      'Product details in Polish',
      'pl',
      ['Pokaż laptopy', 'Jaki procesor ma pierwszy?'],
      'The assistant should provide processor details of the first laptop in Polish language.'
    ),
    buildScenario(
      'Non-existent product',
      'en',
      ['Tell me about SuperPhone 3000'],
      'The assistant should indicate that the product was not found or ask for more information.'
    ),
    buildScenario(
      'Product details by partial name',
      'en',
      ['What specs does the iPhone have?'],
      'The assistant should provide details about the iPhone 15 Pro Max including processor and storage.'
    ),
  ];
})();

/**
 * Runs each entry of `productDetailsScenarios` through `runConversation`,
 * grades the resulting transcript with `evaluateConversation` against
 * `defaultProductDetailsCriteria`, and asserts on the evaluation.
 *
 * The conversation and evaluation happen once per scenario in `beforeAll`,
 * so both `it` blocks share a single graded run.
 */
describe('Product Details E2E Evaluation', () => {
  describe.each(productDetailsScenarios)('Scenario: $name', (scenario) => {
    let evaluationResult: IEvaluationResult;
    let conversation: IConversationTurn[];

    beforeAll(async () => {
      const conversationResult = await runConversation(scenario);
      conversation = conversationResult.conversation;

      // Print the transcript before asserting so it is visible even when the
      // conversation itself is reported as unsuccessful.
      console.log(`\n=== Conversation: ${scenario.name} ===`);
      conversation.forEach((turn) => {
        console.log(`${turn.role.toUpperCase()}: ${turn.content}`);
      });

      expect(conversationResult.success).toBe(true);

      evaluationResult = await evaluateConversation(
        conversation,
        defaultProductDetailsCriteria,
        scenario.expectedBehavior
      );

      console.log(`\nEvaluation Score: ${evaluationResult.score}`);
      console.log(`Reasoning: ${evaluationResult.reasoning}\n`);

      // Save every run the assertions below will reject: either the score is
      // under the threshold or the evaluator did not mark the run as passed.
      // Checking only the score would drop runs that fail on `passed` alone.
      if (evaluationResult.score < MINIMUM_PASSING_SCORE || !evaluationResult.passed) {
        saveFailedTest(scenario, conversation, evaluationResult);
      }
    }, 180000);

    // The title interpolates the threshold so it cannot drift from the constant.
    it(`should pass LLM evaluation with score >= ${MINIMUM_PASSING_SCORE}`, () => {
      expect(evaluationResult.score).toBeGreaterThanOrEqual(MINIMUM_PASSING_SCORE);
      expect(evaluationResult.passed).toBe(true);
    });

    it('should have valid reasoning', () => {
      expect(evaluationResult.reasoning).toBeTruthy();
      expect(evaluationResult.reasoning.length).toBeGreaterThan(10);
    });
  });
});

/** Minimum score applied to the three-turn scenario below (lower than `MINIMUM_PASSING_SCORE`). */
const MULTI_TURN_COMPLEX_MIN_SCORE = 3.0;

/**
 * Multi-turn scenarios, each paired with the minimum score it must reach.
 *
 * NOTE(review): the scenario named 'Search then compare' asks only for the RAM
 * of the first laptop and never requests a comparison; the name is kept as-is
 * here, consider renaming it.
 */
const multiTurnDetailsScenarios: Array<{
  scenario: IConversationScenario;
  minScore: number;
}> = ((): Array<{ scenario: IConversationScenario; minScore: number }> => {
  // Wraps plain user messages into the `{ userMessage }` turn shape.
  const toTurns = (...userMessages: string[]) =>
    userMessages.map((userMessage) => ({ userMessage }));

  const multipleProducts: IConversationScenario = {
    name: 'Search then ask for multiple products',
    locale: 'en',
    turns: toTurns(
      'Show me smartphones',
      'Tell me more about the first one',
      'What about the second one?'
    ),
    expectedBehavior:
      'The assistant should show smartphones first, then provide details for the first smartphone, then provide details for the second smartphone. Each product should have specifications.',
  };

  const ramOfFirstLaptop: IConversationScenario = {
    name: 'Search then compare',
    locale: 'en',
    turns: toTurns('I need a laptop', 'How much RAM does the first one have?'),
    expectedBehavior:
      'The assistant should first show laptops, then provide the RAM specification for the first laptop when asked.',
  };

  return [
    { scenario: multipleProducts, minScore: MULTI_TURN_COMPLEX_MIN_SCORE },
    { scenario: ramOfFirstLaptop, minScore: MINIMUM_PASSING_SCORE },
  ];
})();

/**
 * For every entry of `multiTurnDetailsScenarios`: plays the scenario through
 * `runConversation`, grades the transcript with `evaluateConversation` using
 * `defaultProductDetailsCriteria`, and checks the score against that entry's
 * own `minScore`.
 */
describe('Multi-Turn Product Details E2E Evaluation', () => {
  describe.each(multiTurnDetailsScenarios)('Scenario: $scenario.name', (entry) => {
    const { scenario, minScore } = entry;

    // Filled in once by the hook below and read by both tests.
    let transcript: IConversationTurn[];
    let verdict: IEvaluationResult;

    beforeAll(async () => {
      const outcome = await runConversation(scenario);
      transcript = outcome.conversation;

      // Log the whole exchange before any assertion can abort the hook.
      console.log(`\n=== Multi-Turn: ${scenario.name} ===`);
      for (const turn of transcript) {
        console.log(`${turn.role.toUpperCase()}: ${turn.content}`);
      }

      expect(outcome.success).toBe(true);

      verdict = await evaluateConversation(
        transcript,
        defaultProductDetailsCriteria,
        scenario.expectedBehavior
      );

      console.log(`\nEvaluation Score: ${verdict.score}`);
      console.log(`Reasoning: ${verdict.reasoning}\n`);

      // Only runs scoring below this scenario's threshold are saved.
      const belowThreshold = verdict.score < minScore;
      if (belowThreshold) {
        saveFailedTest(scenario, transcript, verdict);
      }
    }, 240000);

    it(`should pass LLM evaluation with score >= ${minScore}`, () => {
      expect(verdict.score).toBeGreaterThanOrEqual(minScore);
    });

    it('should have valid reasoning', () => {
      const { reasoning } = verdict;
      expect(reasoning).toBeTruthy();
      expect(reasoning.length).toBeGreaterThan(10);
    });
  });
});
54 changes: 54 additions & 0 deletions agents/__tests__/evaluation/testFixtures.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,12 @@ export const TEST_PRODUCTS: IProductCreateInput[] = [
stock: 15,
category: 'Laptops',
isActive: true,
attributes: [
{ name: 'RAM', value: '32', unit: 'GB' },
{ name: 'GPU', value: 'RTX 4080' },
{ name: 'Storage', value: '1', unit: 'TB SSD' },
{ name: 'Processor', value: 'Intel Core i9-13900HX' },
],
},
{
name: 'Business Laptop Elite',
Expand All @@ -22,6 +28,12 @@ export const TEST_PRODUCTS: IProductCreateInput[] = [
stock: 25,
category: 'Laptops',
isActive: true,
attributes: [
{ name: 'RAM', value: '16', unit: 'GB' },
{ name: 'Processor', value: 'Intel Core i7-1365U' },
{ name: 'Storage', value: '512', unit: 'GB SSD' },
{ name: 'Weight', value: '1.3', unit: 'kg' },
],
},
{
name: 'Budget Laptop Basic',
Expand All @@ -31,6 +43,11 @@ export const TEST_PRODUCTS: IProductCreateInput[] = [
stock: 50,
category: 'Laptops',
isActive: true,
attributes: [
{ name: 'RAM', value: '8', unit: 'GB' },
{ name: 'Processor', value: 'Intel Core i5-1235U' },
{ name: 'Storage', value: '256', unit: 'GB SSD' },
],
},
{
name: 'Samsung Galaxy S24 Ultra',
Expand All @@ -40,6 +57,12 @@ export const TEST_PRODUCTS: IProductCreateInput[] = [
stock: 30,
category: 'Smartphones',
isActive: true,
attributes: [
{ name: 'RAM', value: '12', unit: 'GB' },
{ name: 'Storage', value: '512', unit: 'GB' },
{ name: 'Camera', value: '200', unit: 'MP' },
{ name: 'Display', value: '6.8', unit: 'inch' },
],
},
{
name: 'iPhone 15 Pro Max',
Expand All @@ -49,6 +72,12 @@ export const TEST_PRODUCTS: IProductCreateInput[] = [
stock: 20,
category: 'Smartphones',
isActive: true,
attributes: [
{ name: 'Processor', value: 'A17 Pro' },
{ name: 'Storage', value: '256', unit: 'GB' },
{ name: 'Display', value: '6.7', unit: 'inch' },
{ name: 'Material', value: 'Titanium' },
],
},
{
name: 'Xiaomi 14 Pro',
Expand All @@ -58,6 +87,11 @@ export const TEST_PRODUCTS: IProductCreateInput[] = [
stock: 40,
category: 'Smartphones',
isActive: true,
attributes: [
{ name: 'Processor', value: 'Snapdragon 8 Gen 3' },
{ name: 'Storage', value: '256', unit: 'GB' },
{ name: 'Camera', value: 'Leica' },
],
},
{
name: 'Mechanical Gaming Keyboard RGB',
Expand All @@ -67,6 +101,11 @@ export const TEST_PRODUCTS: IProductCreateInput[] = [
stock: 100,
category: 'Gaming Peripherals',
isActive: true,
attributes: [
{ name: 'Switch Type', value: 'Cherry MX' },
{ name: 'Backlight', value: 'RGB' },
{ name: 'Keys', value: '104' },
],
},
{
name: 'Gaming Mouse Pro',
Expand All @@ -76,6 +115,11 @@ export const TEST_PRODUCTS: IProductCreateInput[] = [
stock: 80,
category: 'Gaming Peripherals',
isActive: true,
attributes: [
{ name: 'DPI', value: '25000' },
{ name: 'Buttons', value: '8' },
{ name: 'Lighting', value: 'RGB' },
],
},
{
name: 'Sony WH-1000XM5 Headphones',
Expand All @@ -85,6 +129,11 @@ export const TEST_PRODUCTS: IProductCreateInput[] = [
stock: 35,
category: 'Audio',
isActive: true,
attributes: [
{ name: 'Battery Life', value: '30', unit: 'hours' },
{ name: 'Noise Cancellation', value: 'Active' },
{ name: 'Connection', value: 'Wireless Bluetooth' },
],
},
{
name: 'AirPods Pro 2',
Expand All @@ -94,6 +143,11 @@ export const TEST_PRODUCTS: IProductCreateInput[] = [
stock: 45,
category: 'Audio',
isActive: true,
attributes: [
{ name: 'Noise Cancellation', value: 'Active' },
{ name: 'Audio', value: 'Spatial Audio' },
{ name: 'Type', value: 'Wireless Earbuds' },
],
},
];

Expand Down
Loading