Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions agenteval-bom/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,27 @@
<artifactId>agenteval-redteam</artifactId>
<version>${project.version}</version>
</dependency>

<!-- Contract Testing (optional) -->
<dependency>
<groupId>org.byteveda.agenteval</groupId>
<artifactId>agenteval-contracts</artifactId>
<version>${project.version}</version>
</dependency>

<!-- Statistical Analysis (optional) -->
<dependency>
<groupId>org.byteveda.agenteval</groupId>
<artifactId>agenteval-statistics</artifactId>
<version>${project.version}</version>
</dependency>

<!-- Chaos Engineering (optional) -->
<dependency>
<groupId>org.byteveda.agenteval</groupId>
<artifactId>agenteval-chaos</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
</dependencyManagement>
</project>
40 changes: 40 additions & 0 deletions agenteval-chaos/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Module POM for agenteval-chaos: chaos engineering and resilience testing for AI agents. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<!-- This module inherits from the shared agenteval-parent POM; groupId and version come from there. -->
<parent>
<groupId>org.byteveda.agenteval</groupId>
<artifactId>agenteval-parent</artifactId>
<version>0.1.0-SNAPSHOT</version>
</parent>

<artifactId>agenteval-chaos</artifactId>
<name>AgentEval Chaos Engineering</name>
<description>Chaos engineering and resilience testing for AI agents</description>

<!--
No dependency below declares its own version element.
NOTE(review): versions are presumably supplied by dependency management in agenteval-parent; confirm there.
-->
<dependencies>
<!-- Sibling AgentEval modules this module builds on. -->
<dependency>
<groupId>org.byteveda.agenteval</groupId>
<artifactId>agenteval-core</artifactId>
</dependency>
<dependency>
<groupId>org.byteveda.agenteval</groupId>
<artifactId>agenteval-judge</artifactId>
</dependency>
<!-- Third-party libraries: JSON data binding and the SLF4J logging API. -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</dependency>
<!-- Test scope only: available to this module's tests, not to its consumers. -->
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package org.byteveda.agenteval.chaos;

/**
 * The kinds of failure that a chaos scenario can inject into an agent
 * evaluation.
 *
 * <p>Each constant names one family of fault; scenarios are grouped and
 * scored by these categories.</p>
 */
public enum ChaosCategory {

    /** A tool or API call fails outright. */
    TOOL_FAILURE,

    /** Retrieval context is damaged: removed, made contradictory, or reordered. */
    CONTEXT_CORRUPTION,

    /** Tools answer, but only after a long delay. */
    LATENCY,

    /** A tool's response arrives in an unexpected shape. */
    SCHEMA_MUTATION,

    /** A failure spreads across several tools. */
    CASCADING_FAILURE,

    /** A resource budget runs out, such as a token limit or a rate limit. */
    RESOURCE_EXHAUSTION
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package org.byteveda.agenteval.chaos;

import org.byteveda.agenteval.core.model.AgentTestCase;

/**
 * Strategy contract for applying one simulated failure mode to a test case.
 *
 * <p>The interface is sealed, so only the injector types listed in the
 * {@code permits} clause can implement it. An implementation receives an
 * {@link AgentTestCase} and hands back a variant with the failure applied,
 * which lets the evaluation measure how resilient the agent is.</p>
 */
public sealed interface ChaosInjector
        permits ToolFailureInjector,
                ContextCorruptionInjector,
                LatencyInjector,
                SchemaMutationInjector {

    /**
     * Applies this injector's failure mode to a test case.
     *
     * @param testCase the unmodified test case to start from
     * @return a modified copy of {@code testCase} carrying the injected chaos
     */
    AgentTestCase inject(AgentTestCase testCase);

    /**
     * Describes what this injector does, in human-readable form.
     *
     * @return a description of the injector's behavior
     */
    String description();
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package org.byteveda.agenteval.chaos;

import java.util.ArrayList;
import java.util.Collections;
import java.util.EnumMap;
import java.util.List;
import java.util.Map;

/**
 * Results from a chaos engineering evaluation suite.
 *
 * <p>The collection components are snapshotted at construction time and
 * exposed as unmodifiable views, so a result cannot be altered after it has
 * been created, either through the accessors or through the collections
 * originally passed in. A {@code null} collection is stored as an empty
 * one, so the accessors never return {@code null}.</p>
 *
 * @param overallScore overall resilience score (0.0-1.0)
 * @param categoryScores per-category average resilience scores
 * @param results individual scenario results
 * @param totalScenarios total number of scenarios executed
 * @param resilientCount number of scenarios where the agent was resilient
 */
public record ChaosResult(
        double overallScore,
        Map<ChaosCategory, Double> categoryScores,
        List<ScenarioResult> results,
        int totalScenarios,
        int resilientCount
) {
    /**
     * Takes defensive, unmodifiable copies of the collection components.
     */
    public ChaosResult {
        // Copy into an EnumMap so iteration follows the category declaration
        // order no matter which Map implementation the caller supplied.
        EnumMap<ChaosCategory, Double> scoresCopy = new EnumMap<>(ChaosCategory.class);
        if (categoryScores != null) {
            scoresCopy.putAll(categoryScores);
        }
        categoryScores = Collections.unmodifiableMap(scoresCopy);

        // ArrayList-based copy keeps the caller's ordering and stays tolerant
        // of null elements, unlike List.copyOf.
        results = (results == null)
                ? List.of()
                : Collections.unmodifiableList(new ArrayList<>(results));
    }

    /**
     * Returns the resilience rate as a fraction in the range 0.0-1.0
     * (not a 0-100 percentage).
     *
     * @return {@code resilientCount / totalScenarios}, or {@code 1.0} when no
     *         scenarios were executed
     */
    public double resilienceRate() {
        if (totalScenarios == 0) {
            return 1.0;
        }
        return (double) resilientCount / totalScenarios;
    }

    /**
     * Individual scenario result from chaos evaluation.
     *
     * @param category the chaos category
     * @param scenarioName name of the scenario
     * @param input the input sent to the agent
     * @param response the agent's response
     * @param score resilience score (0.0-1.0)
     * @param reason explanation from the judge
     * @param resilient whether the agent handled the failure gracefully
     */
    public record ScenarioResult(
            ChaosCategory category,
            String scenarioName,
            String input,
            String response,
            double score,
            String reason,
            boolean resilient
    ) {}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
package org.byteveda.agenteval.chaos;

import java.util.Objects;

/**
 * One chaos engineering test scenario: a task for the agent plus the
 * failure to inject while it runs. Every component is mandatory.
 *
 * @param name short name identifying the scenario
 * @param category the chaos category this scenario belongs to
 * @param description human-readable description of the failure being simulated
 * @param taskInput the input/task to send to the agent under test
 * @param injector the chaos injector to apply before evaluation
 */
public record ChaosScenario(
        String name,
        ChaosCategory category,
        String description,
        String taskInput,
        ChaosInjector injector
) {
    /**
     * Rejects any missing component, checking them in declaration order.
     *
     * @throws NullPointerException if any component is {@code null}
     */
    public ChaosScenario {
        requirePresent(name, "name");
        requirePresent(category, "category");
        requirePresent(description, "description");
        requirePresent(taskInput, "taskInput");
        requirePresent(injector, "injector");
    }

    /** Throws a {@link NullPointerException} naming {@code component} when {@code value} is null. */
    private static void requirePresent(Object value, String component) {
        Objects.requireNonNull(value, component + " must not be null");
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
package org.byteveda.agenteval.chaos;

import java.util.ArrayList;
import java.util.EnumMap;
import java.util.List;
import java.util.Map;

/**
* Provides built-in chaos engineering scenarios for each {@link ChaosCategory}.
*
* <p>Scenarios are hardcoded (not loaded from JSON) because they include
* injector instances that cannot be serialized.</p>
*/
public final class ChaosScenarioLibrary {

private static final Map<ChaosCategory, List<ChaosScenario>> SCENARIOS;

static {
SCENARIOS = new EnumMap<>(ChaosCategory.class);

// Tool failure scenarios
List<ChaosScenario> toolFailures = new ArrayList<>();
toolFailures.add(new ChaosScenario(
"tool-unavailable",
ChaosCategory.TOOL_FAILURE,
"All tools return unavailable errors",
"Look up the weather forecast for New York City",
new ToolFailureInjector("ERROR: Tool unavailable")));
toolFailures.add(new ChaosScenario(
"tool-timeout",
ChaosCategory.TOOL_FAILURE,
"All tools return timeout errors",
"Search the database for customer order #12345",
new ToolFailureInjector("ERROR: Connection timeout")));
toolFailures.add(new ChaosScenario(
"tool-server-error",
ChaosCategory.TOOL_FAILURE,
"All tools return 500 server errors",
"Calculate the total revenue for Q3",
new ToolFailureInjector(
"ERROR: Service returned 500 Internal Server Error")));
toolFailures.add(new ChaosScenario(
"tool-auth-failure",
ChaosCategory.TOOL_FAILURE,
"All tools return authentication errors",
"Retrieve the user profile for user@example.com",
new ToolFailureInjector("ERROR: Authentication failed")));
SCENARIOS.put(ChaosCategory.TOOL_FAILURE, List.copyOf(toolFailures));

// Context corruption scenarios
List<ChaosScenario> contextCorruptions = new ArrayList<>();
contextCorruptions.add(new ChaosScenario(
"context-missing",
ChaosCategory.CONTEXT_CORRUPTION,
"All retrieval context is removed",
"Based on the provided documents, summarize the key findings",
new ContextCorruptionInjector(
ContextCorruptionInjector.CorruptionMode.MISSING)));
contextCorruptions.add(new ChaosScenario(
"context-contradictory",
ChaosCategory.CONTEXT_CORRUPTION,
"Contradictory information is injected into context",
"What does the policy document say about refund eligibility?",
new ContextCorruptionInjector(
ContextCorruptionInjector.CorruptionMode.CONTRADICTORY)));
contextCorruptions.add(new ChaosScenario(
"context-shuffled",
ChaosCategory.CONTEXT_CORRUPTION,
"Context entries are shuffled out of order",
"Follow the step-by-step instructions from the manual",
new ContextCorruptionInjector(
ContextCorruptionInjector.CorruptionMode.SHUFFLED)));
SCENARIOS.put(ChaosCategory.CONTEXT_CORRUPTION,
List.copyOf(contextCorruptions));

// Latency scenarios
List<ChaosScenario> latencyScenarios = new ArrayList<>();
latencyScenarios.add(new ChaosScenario(
"high-latency",
ChaosCategory.LATENCY,
"Tool calls experience 5-second delays",
"Fetch the latest stock price for AAPL",
new LatencyInjector(5000)));
latencyScenarios.add(new ChaosScenario(
"extreme-latency",
ChaosCategory.LATENCY,
"Tool calls experience 30-second delays",
"Run the data analysis pipeline on the uploaded dataset",
new LatencyInjector(30000)));
SCENARIOS.put(ChaosCategory.LATENCY, List.copyOf(latencyScenarios));

// Schema mutation scenarios
List<ChaosScenario> schemaMutations = new ArrayList<>();
schemaMutations.add(new ChaosScenario(
"schema-envelope",
ChaosCategory.SCHEMA_MUTATION,
"Tool results wrapped in unexpected JSON envelope",
"Get the current exchange rate for USD to EUR",
new SchemaMutationInjector(
SchemaMutationInjector.MutationType.WRAP_IN_ENVELOPE)));
schemaMutations.add(new ChaosScenario(
"schema-truncated",
ChaosCategory.SCHEMA_MUTATION,
"Tool results are truncated mid-response",
"List all active subscriptions for account A-9876",
new SchemaMutationInjector(
SchemaMutationInjector.MutationType.TRUNCATE)));
schemaMutations.add(new ChaosScenario(
"schema-nested",
ChaosCategory.SCHEMA_MUTATION,
"Tool results nested in unexpected data structure",
"Retrieve the shipping status for order #55443",
new SchemaMutationInjector(
SchemaMutationInjector.MutationType.NEST_IN_DATA)));
SCENARIOS.put(ChaosCategory.SCHEMA_MUTATION,
List.copyOf(schemaMutations));

// Cascading failure scenarios (use tool failure with multiple errors)
List<ChaosScenario> cascading = new ArrayList<>();
cascading.add(new ChaosScenario(
"cascading-primary-down",
ChaosCategory.CASCADING_FAILURE,
"Primary service failure causing dependent tool failures",
"Generate a sales report using data from CRM and billing",
new ToolFailureInjector(
"ERROR: Upstream service unavailable "
+ "(cascading failure from primary)")));
SCENARIOS.put(ChaosCategory.CASCADING_FAILURE, List.copyOf(cascading));

// Resource exhaustion scenarios
List<ChaosScenario> resourceExhaustion = new ArrayList<>();
resourceExhaustion.add(new ChaosScenario(
"rate-limited",
ChaosCategory.RESOURCE_EXHAUSTION,
"All tools return rate limit errors",
"Process the batch of 100 customer records",
new ToolFailureInjector("ERROR: Rate limit exceeded")));
SCENARIOS.put(ChaosCategory.RESOURCE_EXHAUSTION,
List.copyOf(resourceExhaustion));
}

private ChaosScenarioLibrary() {}

/**
* Returns pre-built scenarios for the specified category.
*
* @param category the chaos category
* @return list of scenarios (empty if none defined for the category)
*/
public static List<ChaosScenario> getScenarios(ChaosCategory category) {
return SCENARIOS.getOrDefault(category, List.of());
}

/**
* Returns all pre-built scenarios across all categories.
*/
public static List<ChaosScenario> getAllScenarios() {
return SCENARIOS.values().stream()
.flatMap(List::stream)
.toList();
}
}
Loading
Loading