Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 6 additions & 20 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -273,26 +273,6 @@ Optional modules for automatic capture with popular frameworks:

---

## Build & CI/CD Plugins

### Maven Plugin

```xml
<plugin>
<groupId>org.byteveda.agenteval</groupId>
<artifactId>agenteval-maven-plugin</artifactId>
<version>0.1.0-SNAPSHOT</version>
<executions>
<execution>
<goals><goal>evaluate</goal></goals>
</execution>
</executions>
</plugin>
```

```bash
mvn agenteval:evaluate
```

### Gradle Plugin

Expand Down Expand Up @@ -350,6 +330,12 @@ agenteval-langchain4j/ — LangChain4j auto-capture (optional)
agenteval-langgraph4j/ — LangGraph4j graph execution capture (optional)
agenteval-mcp/ — MCP Java SDK tool call capture (optional)
agenteval-redteam/ — Adversarial testing, 20 attack templates
agenteval-contracts/ — Contract testing, behavioral invariant verification
agenteval-statistics/ — Statistical rigor: confidence intervals, significance tests
agenteval-chaos/ — Chaos engineering, agent resilience testing
agenteval-replay/ — Deterministic record & replay for $0 regression tests
agenteval-mutation/ — Prompt mutation testing, eval quality verification
agenteval-fingerprint/ — Agent capability profiling across 8 dimensions
agenteval-maven-plugin/ — Maven build integration
agenteval-gradle-plugin/ — Gradle build integration
agenteval-github-actions/ — GitHub Actions composite action
Expand Down
21 changes: 21 additions & 0 deletions agenteval-bom/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,27 @@
<artifactId>agenteval-chaos</artifactId>
<version>${project.version}</version>
</dependency>

<!-- Deterministic Replay (optional) -->
<dependency>
<groupId>org.byteveda.agenteval</groupId>
<artifactId>agenteval-replay</artifactId>
<version>${project.version}</version>
</dependency>

<!-- Mutation Testing (optional) -->
<dependency>
<groupId>org.byteveda.agenteval</groupId>
<artifactId>agenteval-mutation</artifactId>
<version>${project.version}</version>
</dependency>

<!-- Capability Fingerprinting (optional) -->
<dependency>
<groupId>org.byteveda.agenteval</groupId>
<artifactId>agenteval-fingerprint</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
</dependencyManagement>
</project>
44 changes: 44 additions & 0 deletions agenteval-fingerprint/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the agenteval-fingerprint module. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<!-- Inherits build configuration from the shared agenteval-parent POM. -->
<parent>
<groupId>org.byteveda.agenteval</groupId>
<artifactId>agenteval-parent</artifactId>
<version>0.1.0-SNAPSHOT</version>
</parent>

<artifactId>agenteval-fingerprint</artifactId>
<name>AgentEval Fingerprint</name>
<description>Capability profiling and fingerprinting for AI agents</description>

<!--
None of the dependencies below declares a <version>; presumably the versions
are managed by the parent POM's dependencyManagement section (TODO confirm).
-->
<dependencies>
<!-- Sibling AgentEval modules this module builds on. -->
<dependency>
<groupId>org.byteveda.agenteval</groupId>
<artifactId>agenteval-core</artifactId>
</dependency>
<dependency>
<groupId>org.byteveda.agenteval</groupId>
<artifactId>agenteval-metrics</artifactId>
</dependency>
<dependency>
<groupId>org.byteveda.agenteval</groupId>
<artifactId>agenteval-judge</artifactId>
</dependency>
<!-- Third-party libraries: Jackson data binding and the SLF4J logging API. -->
<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</dependency>
<!-- Test-only dependency: Mockito is restricted to the test classpath. -->
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
package org.byteveda.agenteval.fingerprint;

import java.util.ArrayList;
import java.util.EnumMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Set;

/**
 * Utility for comparing two {@link CapabilityProfile} instances.
 *
 * <p>This class only exposes static behavior and cannot be instantiated.</p>
 */
public final class CapabilityComparison {

    /** Score assumed for a dimension that one of the two profiles does not contain. */
    private static final double MISSING_SCORE = 0.0;

    private CapabilityComparison() {}

    /**
     * Compares two capability profiles and returns a comparison result.
     *
     * <p>For every dimension scored in at least one of the profiles, computes the
     * delta (B minus A). A dimension that is absent from one profile is treated as
     * having a score of {@code 0.0} in that profile, so a dimension that only B
     * covers yields B's score as the delta, and a dimension that only A covers
     * yields the negated score of A. Positive deltas indicate improvement in
     * profile B; negative deltas indicate regression; a delta of exactly zero is
     * recorded in the deltas but counted as neither.</p>
     *
     * <p>Dimensions are visited in enum declaration order, so the improvement and
     * regression lists have a stable order regardless of how the profiles' score
     * maps iterate.</p>
     *
     * @param profileA the baseline profile
     * @param profileB the profile to compare against the baseline
     * @return the comparison result
     * @throws NullPointerException if either profile is {@code null}
     */
    public static CapabilityComparisonResult compare(
            CapabilityProfile profileA, CapabilityProfile profileB) {
        Objects.requireNonNull(profileA, "profileA must not be null");
        Objects.requireNonNull(profileB, "profileB must not be null");

        Map<CapabilityDimension, ProfileScore> scoresA = profileA.scores();
        Map<CapabilityDimension, ProfileScore> scoresB = profileB.scores();
        Set<CapabilityDimension> dimensionsInA = scoresA.keySet();
        Set<CapabilityDimension> dimensionsInB = scoresB.keySet();

        Map<CapabilityDimension, Double> deltas = new EnumMap<>(CapabilityDimension.class);
        List<CapabilityDimension> improvements = new ArrayList<>();
        List<CapabilityDimension> regressions = new ArrayList<>();

        // Walk the enum rather than either key set: this covers dimensions that
        // exist on only one side and keeps the output order deterministic.
        for (CapabilityDimension dim : CapabilityDimension.values()) {
            boolean inA = dimensionsInA.contains(dim);
            boolean inB = dimensionsInB.contains(dim);
            if (!inA && !inB) {
                continue;
            }

            double baseline = inA ? scoresA.get(dim).score() : MISSING_SCORE;
            double candidate = inB ? scoresB.get(dim).score() : MISSING_SCORE;
            double delta = candidate - baseline;
            deltas.put(dim, delta);

            // Classify strictly by sign so an unchanged (or zero-scored new)
            // dimension is never reported as an improvement or a regression.
            if (delta > 0.0) {
                improvements.add(dim);
            } else if (delta < 0.0) {
                regressions.add(dim);
            }
        }

        return new CapabilityComparisonResult(
                profileA, profileB, deltas, improvements, regressions
        );
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package org.byteveda.agenteval.fingerprint;

import java.util.List;
import java.util.Map;
import java.util.Objects;

/**
 * Immutable outcome of comparing two capability profiles.
 *
 * <p>The canonical constructor rejects {@code null} profiles and a {@code null}
 * delta map, stores unmodifiable copies of the collections it is given, and
 * substitutes an empty list for a {@code null} improvement or regression list.</p>
 *
 * @param profileA the first profile
 * @param profileB the second profile
 * @param deltas score differences per dimension (B minus A)
 * @param improvements dimensions where B scored higher than A
 * @param regressions dimensions where B scored lower than A
 */
public record CapabilityComparisonResult(
        CapabilityProfile profileA,
        CapabilityProfile profileB,
        Map<CapabilityDimension, Double> deltas,
        List<CapabilityDimension> improvements,
        List<CapabilityDimension> regressions
) {

    public CapabilityComparisonResult {
        Objects.requireNonNull(profileA, "profileA must not be null");
        Objects.requireNonNull(profileB, "profileB must not be null");
        Objects.requireNonNull(deltas, "deltas must not be null");
        deltas = Map.copyOf(deltas);
        improvements = immutableOrEmpty(improvements);
        regressions = immutableOrEmpty(regressions);
    }

    /**
     * Computes how much the overall score changed from profile A to profile B.
     *
     * @return the overall score of B minus the overall score of A
     */
    public double overallDelta() {
        double candidate = profileB.overallScore();
        double baseline = profileA.overallScore();
        return candidate - baseline;
    }

    /** Returns an unmodifiable copy of {@code source}, or an empty list when it is {@code null}. */
    private static List<CapabilityDimension> immutableOrEmpty(List<CapabilityDimension> source) {
        if (source == null) {
            return List.of();
        }
        return List.copyOf(source);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
package org.byteveda.agenteval.fingerprint;

/**
 * The axes along which an agent's capabilities are profiled.
 *
 * <p>Every constant carries a human-readable display name and a one-sentence
 * description of the aspect of agent behavior it stands for.</p>
 */
public enum CapabilityDimension {

    /** Correctness of responses. */
    ACCURACY(
            "Accuracy",
            "Correctness and factual precision of agent responses"),

    /** How well responses address the query. */
    RELEVANCY(
            "Relevancy",
            "How well the agent's responses address the user's query"),

    /** Adherence to the provided context. */
    FAITHFULNESS(
            "Faithfulness",
            "Adherence to provided context without fabrication"),

    /** Consistency and readability of responses. */
    COHERENCE(
            "Coherence",
            "Logical consistency and readability of responses"),

    /** Avoidance of harmful content. */
    SAFETY(
            "Safety",
            "Avoidance of toxic, biased, or harmful content"),

    /** Tool selection and invocation. */
    TOOL_USE(
            "Tool Use",
            "Accuracy and appropriateness of tool selection and invocation"),

    /** Accomplishing assigned tasks. */
    TASK_COMPLETION(
            "Task Completion",
            "Ability to fully accomplish assigned tasks"),

    /** Use of retrieval context and provided information. */
    CONTEXT_UTILIZATION(
            "Context Utilization",
            "Effective use of retrieval context and provided information");

    private final String label;
    private final String summary;

    CapabilityDimension(String label, String summary) {
        this.label = label;
        this.summary = summary;
    }

    /**
     * Gives the name of this dimension in a form suitable for display.
     *
     * @return the display name
     */
    public String displayName() {
        return this.label;
    }

    /**
     * Gives a short explanation of what this dimension measures.
     *
     * @return the description
     */
    public String description() {
        return this.summary;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
package org.byteveda.agenteval.fingerprint;

import java.util.List;
import java.util.Map;
import java.util.Objects;

/**
 * Complete capability profile for an agent, containing scores across all dimensions.
 *
 * <p>The score map is defensively copied with {@link Map#copyOf(Map)}, so the
 * profile is unmodifiable and rejects {@code null} keys and values. Because that
 * copy makes no promise about iteration order, the list-returning queries on this
 * record sort their results into enum declaration order.</p>
 *
 * @param agentName the name of the profiled agent
 * @param scores scores keyed by dimension
 * @param durationMs total profiling time in milliseconds
 */
public record CapabilityProfile(
        String agentName,
        Map<CapabilityDimension, ProfileScore> scores,
        long durationMs
) {

    /** Threshold used by {@link #strengths()}. */
    private static final double DEFAULT_STRENGTH_THRESHOLD = 0.8;

    /** Threshold used by {@link #weaknesses()}. */
    private static final double DEFAULT_WEAKNESS_THRESHOLD = 0.5;

    /**
     * Validates the components and stores an unmodifiable copy of the scores.
     *
     * @throws NullPointerException if {@code agentName} or {@code scores} is
     *         {@code null}, or if {@code scores} contains a {@code null} key or value
     */
    public CapabilityProfile {
        Objects.requireNonNull(agentName, "agentName must not be null");
        Objects.requireNonNull(scores, "scores must not be null");
        scores = Map.copyOf(scores);
    }

    /**
     * Returns the overall score as the average across all dimensions.
     *
     * @return the average score, or 0.0 if no scores
     */
    public double overallScore() {
        // average() is empty for an empty stream, which covers the no-scores case.
        return scores.values().stream()
                .mapToDouble(ProfileScore::score)
                .average()
                .orElse(0.0);
    }

    /**
     * Returns dimensions where the score is at or above the given threshold.
     *
     * @param threshold the minimum score to qualify as a strength
     * @return unmodifiable list of strong dimensions, in enum declaration order
     */
    public List<CapabilityDimension> strengths(double threshold) {
        return scores.entrySet().stream()
                .filter(e -> e.getValue().score() >= threshold)
                .map(Map.Entry::getKey)
                // Enum natural order is declaration order; sorting makes the
                // result independent of the score map's iteration order.
                .sorted()
                .toList();
    }

    /**
     * Returns dimensions with strengths at or above 0.8.
     *
     * @return unmodifiable list of strong dimensions, in enum declaration order
     */
    public List<CapabilityDimension> strengths() {
        return strengths(DEFAULT_STRENGTH_THRESHOLD);
    }

    /**
     * Returns dimensions where the score is below the given threshold.
     *
     * @param threshold the score below which a dimension is considered weak
     * @return unmodifiable list of weak dimensions, in enum declaration order
     */
    public List<CapabilityDimension> weaknesses(double threshold) {
        return scores.entrySet().stream()
                .filter(e -> e.getValue().score() < threshold)
                .map(Map.Entry::getKey)
                // Same ordering guarantee as strengths(double).
                .sorted()
                .toList();
    }

    /**
     * Returns dimensions with weaknesses below 0.5.
     *
     * @return unmodifiable list of weak dimensions, in enum declaration order
     */
    public List<CapabilityDimension> weaknesses() {
        return weaknesses(DEFAULT_WEAKNESS_THRESHOLD);
    }
}
Loading