diff --git a/Braintrust.Sdk.sln b/Braintrust.Sdk.sln
index b9f05d0..e7c2dfb 100644
--- a/Braintrust.Sdk.sln
+++ b/Braintrust.Sdk.sln
@@ -19,6 +19,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "OpenAIInstrumentation", "ex
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "EvalExample", "examples\EvalExample\EvalExample.csproj", "{DFAA25AA-72B1-4246-BAB9-A10CCF115406}"
EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ClassifiersExample", "examples\ClassifiersExample\ClassifiersExample.csproj", "{0A934BA7-BEBB-4EF0-88A6-9A5355E6D0BB}"
+EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "TraceScoring", "examples\TraceScoring\TraceScoring.csproj", "{66D24AFB-3541-429D-9402-72A344D99115}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Braintrust.Sdk.OpenAI", "src\Braintrust.Sdk.OpenAI\Braintrust.Sdk.OpenAI.csproj", "{B3C7D1A2-4E5F-6789-ABCD-EF0123456789}"
@@ -72,6 +74,10 @@ Global
{DFAA25AA-72B1-4246-BAB9-A10CCF115406}.Debug|Any CPU.Build.0 = Debug|Any CPU
{DFAA25AA-72B1-4246-BAB9-A10CCF115406}.Release|Any CPU.ActiveCfg = Release|Any CPU
{DFAA25AA-72B1-4246-BAB9-A10CCF115406}.Release|Any CPU.Build.0 = Release|Any CPU
+ {0A934BA7-BEBB-4EF0-88A6-9A5355E6D0BB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {0A934BA7-BEBB-4EF0-88A6-9A5355E6D0BB}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {0A934BA7-BEBB-4EF0-88A6-9A5355E6D0BB}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {0A934BA7-BEBB-4EF0-88A6-9A5355E6D0BB}.Release|Any CPU.Build.0 = Release|Any CPU
{66D24AFB-3541-429D-9402-72A344D99115}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{66D24AFB-3541-429D-9402-72A344D99115}.Debug|Any CPU.Build.0 = Debug|Any CPU
{66D24AFB-3541-429D-9402-72A344D99115}.Release|Any CPU.ActiveCfg = Release|Any CPU
@@ -127,6 +133,7 @@ Global
{5A09E90C-6BCB-440C-AC03-5212B2AAE6C2} = {A1BDA853-65BE-4CC8-8070-CCBA22069A7A}
{929EDD10-7B06-4C4F-B70F-E4E51072A724} = {A1BDA853-65BE-4CC8-8070-CCBA22069A7A}
{DFAA25AA-72B1-4246-BAB9-A10CCF115406} = {A1BDA853-65BE-4CC8-8070-CCBA22069A7A}
+ {0A934BA7-BEBB-4EF0-88A6-9A5355E6D0BB} = {A1BDA853-65BE-4CC8-8070-CCBA22069A7A}
{66D24AFB-3541-429D-9402-72A344D99115} = {A1BDA853-65BE-4CC8-8070-CCBA22069A7A}
{A8A1C23E-7D6F-47FE-9959-B90E9CEF7B2C} = {6530DEC3-1D19-4854-80AC-2D6D02BEAECC}
{446D2C4A-41D6-4E4F-AC4C-6809E2416A98} = {A1BDA853-65BE-4CC8-8070-CCBA22069A7A}
diff --git a/examples/ClassifiersExample/ClassifiersExample.csproj b/examples/ClassifiersExample/ClassifiersExample.csproj
new file mode 100644
index 0000000..4cc28eb
--- /dev/null
+++ b/examples/ClassifiersExample/ClassifiersExample.csproj
@@ -0,0 +1,14 @@
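+<Project Sdk="Microsoft.NET.Sdk">
+
+  <ItemGroup>
+    <ProjectReference Include="..\..\src\Braintrust.Sdk\Braintrust.Sdk.csproj" />
+  </ItemGroup>
+
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <TargetFramework>net8.0</TargetFramework>
+    <Nullable>enable</Nullable>
+    <ImplicitUsings>enable</ImplicitUsings>
+  </PropertyGroup>
+
+</Project>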
diff --git a/examples/ClassifiersExample/Program.cs b/examples/ClassifiersExample/Program.cs
new file mode 100644
index 0000000..4727ddd
--- /dev/null
+++ b/examples/ClassifiersExample/Program.cs
@@ -0,0 +1,152 @@
+using Braintrust.Sdk.Eval;
+
+namespace Braintrust.Sdk.Examples.ClassifiersExample;
+
+// Example: Classifiers
+//
+// Classifiers categorize and label eval outputs. Unlike scorers (which return
+// numeric 0-1 values), classifiers return structured Classification items —
+// each with an Id, an optional Label, and optional Metadata.
+//
+// Results are stored as a dictionary keyed by classifier name:
+//
+// { "sentiment": [{ id: "positive", label: "Positive" }] }
+//
+// Three patterns are shown:
+//
+// 1. Inline single-label FunctionClassifier
+// 2. Inline multi-label FunctionClassifier (returns IReadOnlyList<Classification>)
+// 3. Class-based classifier implementing IClassifier
+//
+// Classifiers and scorers run independently. You can use both together, or
+// use only classifiers when you don't need numeric scores.
+
+/// <summary>
+/// Class-based classifier that labels a generated response as <c>no_response</c>,
+/// <c>too_short</c>, <c>action_oriented</c> or <c>informational</c>.
+/// </summary>
+sealed class ResponseQualityClassifier : IClassifier<string, string>
+{
+    // Phrases (matched case-insensitively) that mark a response as action-oriented.
+    private static readonly string[] ActionPhrases = { "immediately", "right away", "look into" };
+
+    public string Name => "response_quality";
+
+    /// <summary>
+    /// Classifies the task output by word count and by the presence of action phrases.
+    /// </summary>
+    /// <param name="taskResult">Task result whose <c>Result</c> text is inspected.</param>
+    /// <returns>A single classification whose metadata carries <c>word_count</c>.</returns>
+    public Task<IReadOnlyList<Classification>> Classify(TaskResult<string, string> taskResult)
+    {
+        var output = taskResult.Result;
+        var isBlank = string.IsNullOrWhiteSpace(output);
+
+        // A null separator makes Split break on every whitespace character, so words
+        // separated by tabs or newlines are counted too (splitting on ' ' alone missed them).
+        var wordCount = isBlank
+            ? 0
+            : output.Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries).Length;
+
+        string id;
+        if (isBlank)
+        {
+            id = "no_response";
+        }
+        else if (wordCount < 5)
+        {
+            id = "too_short";
+        }
+        else if (ActionPhrases.Any(phrase => output.Contains(phrase, StringComparison.OrdinalIgnoreCase)))
+        {
+            id = "action_oriented";
+        }
+        else
+        {
+            id = "informational";
+        }
+
+        // "too_short" -> "Too short": capitalize the first letter, turn underscores into spaces.
+        var label = char.ToUpperInvariant(id[0]) + id[1..].Replace('_', ' ');
+
+        IReadOnlyList<Classification> results = new[]
+        {
+            new Classification(
+                id,
+                Label: label,
+                Metadata: new Dictionary<string, object> { ["word_count"] = wordCount })
+        };
+        return Task.FromResult(results);
+    }
+}
+
+/// <summary>
+/// Entry point of the classifiers example. Builds three classifiers (inline single-label,
+/// inline multi-label, class-based), runs them over a small set of support messages and
+/// prints the resulting report.
+/// </summary>
+class Program
+{
+    // (customer message, expected intent) pairs used as the eval dataset.
+    private static readonly (string Input, string Expected)[] Messages =
+    {
+        ("Hi! I just wanted to say thank you, the product is amazing!", "praise"),
+        ("I've been waiting 2 weeks for my order. This is unacceptable!", "follow_up"),
+        ("How do I reset my password? I can't find the option anywhere.", "how_to"),
+        ("The item arrived damaged. I need a refund immediately.", "complaint"),
+        ("Just checking in — any update on my ticket #4821?", "follow_up")
+    };
+
+    /// <summary>
+    /// Case-insensitive regex test. Uses the static <c>Regex.IsMatch</c>, which reuses cached
+    /// compiled patterns instead of allocating a new <c>Regex</c> instance on every call.
+    /// </summary>
+    static bool Matches(string input, string pattern)
+        => System.Text.RegularExpressions.Regex.IsMatch(
+            input,
+            pattern,
+            System.Text.RegularExpressions.RegexOptions.IgnoreCase);
+
+    /// <summary>
+    /// The task under evaluation: returns a canned support reply chosen by keyword.
+    /// </summary>
+    static string GenerateResponse(string message)
+    {
+        if (Matches(message, "thank"))
+        {
+            return "You're welcome! So glad you're enjoying it.";
+        }
+        if (Matches(message, "waiting|order"))
+        {
+            return "I sincerely apologise for the delay. Let me look into this right away.";
+        }
+        if (Matches(message, "password|reset"))
+        {
+            return "To reset your password, go to Settings > Account > Reset Password.";
+        }
+        if (Matches(message, "damaged|refund"))
+        {
+            return "I'm sorry to hear that. I'll process your refund immediately.";
+        }
+        return "Thanks for reaching out! Let me check on that for you.";
+    }
+
+    static async Task Main()
+    {
+        var braintrust = Braintrust.Get();
+
+        // Pattern 1: inline single-label classifier
+        var intentClassifier = new FunctionClassifier<string, string>(
+            "intent",
+            taskResult =>
+            {
+                var input = taskResult.DatasetCase.Input;
+                // First matching rule wins; the order mirrors the priority of the intents.
+                string id =
+                    Matches(input, "thank") ? "praise" :
+                    Matches(input, "waiting|order|update") ? "follow_up" :
+                    Matches(input, "password|reset|find") ? "how_to" :
+                    Matches(input, "damaged|refund") ? "complaint" :
+                    "other";
+
+                return new Classification(
+                    id,
+                    Label: char.ToUpperInvariant(id[0]) + id[1..].Replace('_', ' '));
+            });
+
+        // Pattern 2: inline multi-label classifier — returns a list
+        var toneClassifier = new FunctionClassifier<string, string>(
+            "tone",
+            taskResult =>
+            {
+                var input = taskResult.DatasetCase.Input;
+                var labels = new List<Classification>();
+                if (Matches(input, "immediately|unacceptable|waiting"))
+                {
+                    labels.Add(new Classification("urgent", Label: "Urgent"));
+                }
+                if (Matches(input, "please|thank|just checking"))
+                {
+                    labels.Add(new Classification("polite", Label: "Polite"));
+                }
+                if (Matches(input, "unacceptable|damaged|waiting"))
+                {
+                    labels.Add(new Classification("frustrated", Label: "Frustrated"));
+                }
+                // Fall back to a single "neutral" label when no tone rule matched.
+                if (labels.Count == 0)
+                {
+                    labels.Add(new Classification("neutral", Label: "Neutral"));
+                }
+                return (IReadOnlyList<Classification>)labels;
+            });
+
+        // Pattern 3: class-based classifier (see ResponseQualityClassifier above)
+        var qualityClassifier = new ResponseQualityClassifier();
+
+        var cases = Messages
+            .Select(m => DatasetCase.Of(m.Input, m.Expected))
+            .ToArray();
+
+        // The timestamp suffix gives every run a distinct eval name.
+        var eval = await braintrust
+            .EvalBuilder<string, string>()
+            .Name($"dotnet-classifiers-example-{DateTimeOffset.UtcNow.ToUnixTimeMilliseconds()}")
+            .Tags("classifiers-example", "dotnet-sdk")
+            .Cases(cases)
+            .TaskFunction(GenerateResponse)
+            .Classifiers(intentClassifier, toneClassifier, qualityClassifier)
+            .BuildAsync();
+
+        var result = await eval.RunAsync();
+        Console.WriteLine($"\n\n{result.CreateReportString()}");
+    }
+}
diff --git a/src/Braintrust.Sdk/Eval/Classification.cs b/src/Braintrust.Sdk/Eval/Classification.cs
new file mode 100644
index 0000000..a69b7a6
--- /dev/null
+++ b/src/Braintrust.Sdk/Eval/Classification.cs
@@ -0,0 +1,14 @@
+namespace Braintrust.Sdk.Eval;
+
+/// <summary>
+/// A structured label produced by a classifier.
+/// </summary>
+/// <param name="Id">Stable identifier for filtering and grouping. Required.</param>
+/// <param name="Name">Grouping key in the per-case classifications dictionary. If null, empty, or whitespace, the runner defaults this to the classifier's resolved name.</param>
+/// <param name="Label">Optional display label. Consumers may fall back to <paramref name="Id"/> when omitted.</param>
+/// <param name="Metadata">Optional arbitrary metadata associated with this classification.</param>
+public readonly record struct Classification(
+ string Id,
+ string? Name = null,
+ string? Label = null,
+ IReadOnlyDictionary? Metadata = null);
diff --git a/src/Braintrust.Sdk/Eval/Eval.cs b/src/Braintrust.Sdk/Eval/Eval.cs
index fbe6672..8afa9d4 100644
--- a/src/Braintrust.Sdk/Eval/Eval.cs
+++ b/src/Braintrust.Sdk/Eval/Eval.cs
@@ -32,6 +32,7 @@ public sealed class Eval
private readonly IDataset _dataset;
private readonly ITask _task;
private readonly IReadOnlyList> _scorers;
+ private readonly IReadOnlyList> _classifiers;
private readonly IReadOnlyList? _experimentTags;
private readonly IReadOnlyDictionary? _experimentMetadata;
private readonly int? _maxConcurrency;
@@ -49,6 +50,7 @@ private Eval(Builder builder, OrganizationAndProjectInfo orgAndProject, RepoInfo
_dataset = builder._dataset ?? throw new ArgumentNullException(nameof(builder._dataset));
_task = builder._task ?? throw new ArgumentNullException(nameof(builder._task));
_scorers = builder._scorers.ToList();
+ _classifiers = builder._classifiers.ToList();
_experimentTags = builder._experimentTags;
_experimentMetadata = builder._experimentMetadata;
_maxConcurrency = builder._maxConcurrency;
@@ -165,12 +167,13 @@ private async Task EvalOne(string experimentId, DatasetCase dat
}
if (taskException == null)
{
- // Task succeeded — record output and run all scorers in parallel, each in their own span
+ // Task succeeded — record output and run all scorers and classifiers in parallel, each in their own span
rootActivity.SetTag("braintrust.output_json", ToJson(new { output = taskResult!.Value.Result }));
- // Flush OTel spans to Braintrust before scoring so traced scorers can access them
- var hasTracedScorers = _scorers.OfType>().Any();
- if (hasTracedScorers)
+ // Flush OTel spans to Braintrust before scoring so traced scorers/classifiers can access them
+ var needsTraceFlush = _scorers.OfType>().Any()
+ || _classifiers.OfType>().Any();
+ if (needsTraceFlush)
{
BraintrustTracing.ForceFlush();
}
@@ -179,7 +182,8 @@ private async Task EvalOne(string experimentId, DatasetCase dat
var rootSpanId = rootActivity.TraceId.ToHexString();
var trace = new EvalTrace(ct => _btqlClient.QuerySpansAsync(experimentId, rootSpanId, ct));
- await RunScorers(experimentId, rootActivity, taskResult!.Value, trace).ConfigureAwait(false);
+ await RunScorersAndClassifiers(experimentId, rootActivity, taskResult!.Value, trace, datasetCase.Metadata)
+ .ConfigureAwait(false);
}
else
{
@@ -234,19 +238,28 @@ private async Task RunSingleScorerForTaskException(
}
///
- /// Runs all scorers for a successful task result, each in their own score span.
- /// Calls (or
- /// for traced scorers) and falls back to on error.
+ /// Runs all scorers and classifiers for a successful task result in parallel, each in their own span.
+ /// After completion, aggregates classifier results onto the root span as <c>braintrust.classifications</c>
+ /// and merges any classifier errors into the root span's <c>braintrust.metadata</c> under
+ /// <c>classifier_errors</c>.
///
- private async Task RunScorers(
+ private async Task RunScorersAndClassifiers(
string experimentId,
Activity rootActivity,
TaskResult taskResult,
- EvalTrace trace)
+ EvalTrace trace,
+ IReadOnlyDictionary caseMetadata)
{
var scorerTasks = _scorers.Select(scorer =>
RunSingleScorer(experimentId, rootActivity, scorer, taskResult, trace));
- await Task.WhenAll(scorerTasks).ConfigureAwait(false);
+
+ var classifierOutcomes = new ClassifierOutcome?[_classifiers.Count];
+ var classifierTasks = _classifiers.Select((classifier, index) =>
+ RunSingleClassifier(experimentId, rootActivity, classifier, index, taskResult, trace, classifierOutcomes));
+
+ await Task.WhenAll(scorerTasks.Concat(classifierTasks)).ConfigureAwait(false);
+
+ AggregateClassifierOutcomes(rootActivity, caseMetadata, classifierOutcomes);
}
private async Task RunSingleScorer(
@@ -327,6 +340,214 @@ private static void RecordScores(
}
}
+ /// <summary>
+ /// Per-classifier outcome captured after running. Either a successful list of normalized items
+ /// (already grouped by resolved name) or an error message.
+ /// </summary>
+ private sealed class ClassifierOutcome
+ {
+ // Resolved name of the classifier that produced this outcome.
+ public string ClassifierName { get; }
+ // (grouping name, item) pairs; null when the outcome was created via Error.
+ public IReadOnlyList<(string Name, Dictionary Item)>? Items { get; }
+ // Failure message; null when the outcome was created via Success.
+ public string? ErrorMessage { get; }
+
+ // Private so instances can only be built through the Success/Error factories below.
+ private ClassifierOutcome(
+ string classifierName,
+ IReadOnlyList<(string Name, Dictionary Item)>? items,
+ string? errorMessage)
+ {
+ ClassifierName = classifierName;
+ Items = items;
+ ErrorMessage = errorMessage;
+ }
+
+ // Creates an outcome carrying items and no error message.
+ public static ClassifierOutcome Success(
+ string classifierName,
+ IReadOnlyList<(string Name, Dictionary Item)> items)
+ => new(classifierName, items, null);
+
+ // Creates an outcome carrying an error message and no items.
+ public static ClassifierOutcome Error(string classifierName, string errorMessage)
+ => new(classifierName, null, errorMessage);
+ }
+
+    /// <summary>
+    /// Runs a single classifier inside its own span and stores the outcome in
+    /// <paramref name="outcomes"/> at <paramref name="classifierIndex"/>.
+    /// </summary>
+    /// <remarks>
+    /// Exceptions thrown by the classifier, or raised while validating its results, are recorded
+    /// on the classifier span (error status plus exception event) and stored as an error outcome
+    /// instead of being rethrown. The span is always stopped, even when span setup itself throws.
+    /// </remarks>
+    /// <param name="experimentId">Experiment id used for the span parent tag and the Braintrust context.</param>
+    /// <param name="rootActivity">Root eval span. Not read by this method.</param>
+    /// <param name="classifier">Classifier to run; the traced overload is used when it implements the traced interface.</param>
+    /// <param name="classifierIndex">Position of the classifier; used for the fallback name and the outcome slot.</param>
+    /// <param name="taskResult">Successful task result handed to the classifier.</param>
+    /// <param name="trace">Trace handed to traced classifiers.</param>
+    /// <param name="outcomes">Shared array; each classifier writes only its own slot.</param>
+    private async Task RunSingleClassifier(
+        string experimentId,
+        Activity rootActivity,
+        IClassifier<TInput, TOutput> classifier,
+        int classifierIndex,
+        TaskResult<TInput, TOutput> taskResult,
+        EvalTrace trace,
+        ClassifierOutcome?[] outcomes)
+    {
+        // Blank classifier names fall back to a positional name.
+        var resolvedName = string.IsNullOrWhiteSpace(classifier.Name)
+            ? $"classifier_{classifierIndex}"
+            : classifier.Name;
+
+        var classifierActivity = _activitySource.StartActivity(resolvedName);
+        try
+        {
+            // Span setup lives inside the try so that a failure while tagging or serializing
+            // the input cannot leave a started span unstopped.
+            classifierActivity?.SetTag(BraintrustTracing.ParentKey, $"experiment_id:{experimentId}");
+            classifierActivity?.SetTag(
+                "braintrust.span_attributes",
+                ToJson(new { type = "classifier", name = resolvedName, purpose = "scorer" }));
+
+            var datasetCase = taskResult.DatasetCase;
+            classifierActivity?.SetTag(
+                "braintrust.input_json",
+                ToJson(new
+                {
+                    input = datasetCase.Input,
+                    expected = datasetCase.Expected,
+                    output = taskResult.Result,
+                    metadata = datasetCase.Metadata
+                }));
+
+            using var classifierScope = BraintrustContext.OfExperiment(experimentId).MakeCurrent();
+
+            List<(string Name, Dictionary<string, object> Item)> normalized;
+            try
+            {
+                // A null result is treated the same as an empty list.
+                IReadOnlyList<Classification> rawResults =
+                    (classifier is ITracedClassifier<TInput, TOutput> tracedClassifier
+                        ? await tracedClassifier.Classify(taskResult, trace).ConfigureAwait(false)
+                        : await classifier.Classify(taskResult).ConfigureAwait(false))
+                    ?? Array.Empty<Classification>();
+
+                // Normalize: resolve name + validate, build storage items (no Name key).
+                normalized = new List<(string Name, Dictionary<string, object> Item)>(rawResults.Count);
+                foreach (var classification in rawResults)
+                {
+                    // NOTE(review): the check is on Id, while the message speaks of a
+                    // "non-empty object"; the text is kept unchanged in case callers match on it.
+                    if (string.IsNullOrEmpty(classification.Id))
+                    {
+                        throw new InvalidOperationException(
+                            "When returning structured classifier results, each classification must be a non-empty object.");
+                    }
+
+                    var groupingName = string.IsNullOrWhiteSpace(classification.Name)
+                        ? resolvedName
+                        : classification.Name!;
+
+                    var item = new Dictionary<string, object> { ["id"] = classification.Id };
+                    if (classification.Label != null)
+                    {
+                        item["label"] = classification.Label;
+                    }
+                    if (classification.Metadata != null && classification.Metadata.Count > 0)
+                    {
+                        item["metadata"] = classification.Metadata;
+                    }
+
+                    normalized.Add((groupingName, item));
+                }
+            }
+            catch (Exception ex)
+            {
+                // Classifier failures are non-fatal: mark the span and record an error outcome.
+                classifierActivity?.SetStatus(ActivityStatusCode.Error, ex.Message);
+                classifierActivity?.AddEvent(CreateExceptionEvent(ex));
+                outcomes[classifierIndex] = ClassifierOutcome.Error(resolvedName, ex.Message);
+                return;
+            }
+
+            // Build output_json keyed by resolved name for the classifier span.
+            if (normalized.Count > 0)
+            {
+                var outputByName = new Dictionary<string, List<Dictionary<string, object>>>();
+                foreach (var (name, item) in normalized)
+                {
+                    if (!outputByName.TryGetValue(name, out var list))
+                    {
+                        list = new List<Dictionary<string, object>>();
+                        outputByName[name] = list;
+                    }
+                    list.Add(item);
+                }
+                classifierActivity?.SetTag("braintrust.output_json", ToJson(outputByName));
+            }
+
+            outcomes[classifierIndex] = ClassifierOutcome.Success(resolvedName, normalized);
+        }
+        finally
+        {
+            classifierActivity?.Stop();
+        }
+    }
+
+ /// <summary>
+ /// Aggregates per-classifier outcomes onto the root span:
+ /// <list type="bullet">
+ /// <item>Sets <c>braintrust.classifications</c> when any classifications were produced.</item>
+ /// <item>Merges any classifier errors into <c>braintrust.metadata</c> under <c>classifier_errors</c>.</item>
+ /// </list>
+ /// </summary>
+ private static void AggregateClassifierOutcomes(
+ Activity rootActivity,
+ IReadOnlyDictionary caseMetadata,
+ ClassifierOutcome?[] outcomes)
+ {
+ // No classifiers configured: leave the root span untouched.
+ if (outcomes.Length == 0)
+ {
+ return;
+ }
+
+ // Items grouped by their grouping name, and error messages keyed by classifier name.
+ var classifications = new Dictionary>>();
+ var classifierErrors = new Dictionary();
+
+ foreach (var outcome in outcomes)
+ {
+ // Slots that were never filled are skipped.
+ if (outcome == null)
+ {
+ continue;
+ }
+
+ // An error outcome contributes only to classifier_errors; a later outcome with the
+ // same classifier name overwrites an earlier entry.
+ if (outcome.ErrorMessage != null)
+ {
+ classifierErrors[outcome.ClassifierName] = outcome.ErrorMessage;
+ continue;
+ }
+
+ if (outcome.Items == null)
+ {
+ continue;
+ }
+
+ // Items from different classifiers that share a grouping name end up in the same list,
+ // in the order the outcomes appear in the array.
+ foreach (var (name, item) in outcome.Items)
+ {
+ if (!classifications.TryGetValue(name, out var list))
+ {
+ list = new List>();
+ classifications[name] = list;
+ }
+ list.Add(item);
+ }
+ }
+
+ // The tag is only written when at least one classification exists.
+ if (classifications.Count > 0)
+ {
+ rootActivity.SetTag("braintrust.classifications", ToJson(classifications));
+ }
+
+ if (classifierErrors.Count > 0)
+ {
+ // Copy the case metadata and add classifier_errors; an existing key of that name in
+ // the case metadata is replaced by the initializer.
+ var merged = new Dictionary(caseMetadata)
+ {
+ ["classifier_errors"] = classifierErrors
+ };
+ rootActivity.SetTag("braintrust.metadata", ToJson(merged));
+ }
+ }
+
private static string ToJson(object obj)
{
return JsonSerializer.Serialize(obj, JsonOptions);
@@ -388,6 +609,7 @@ public sealed class Builder
internal IDataset? _dataset;
internal ITask? _task;
internal List> _scorers = new();
+ internal List> _classifiers = new();
internal IReadOnlyList? _experimentTags;
internal IReadOnlyDictionary? _experimentMetadata;
internal int? _maxConcurrency = 10;
@@ -406,9 +628,9 @@ public async Task> BuildAsync()
_apiClient ??= BraintrustApiClient.Of(_config);
_btqlClient ??= new BtqlClient(_config);
- if (_scorers.Count == 0)
+ if (_scorers.Count == 0 && _classifiers.Count == 0)
{
- throw new InvalidOperationException("Must provide at least one scorer");
+ throw new InvalidOperationException("Must provide at least one scorer or classifier");
}
if (_dataset == null)
@@ -561,6 +783,16 @@ public Builder Scorers(params IScorer[] scorers)
return this;
}
+ /// <summary>
+ /// Set the classifiers.
+ /// At least one of <see cref="Scorers"/> or <see cref="Classifiers"/> must be provided.
+ /// </summary>
+ public Builder Classifiers(params IClassifier[] classifiers)
+ {
+ // Copies the arguments into a new list, replacing any previously configured classifiers.
+ _classifiers = classifiers.ToList();
+ return this;
+ }
+
///
/// Set the experiment-level tags.
/// These tags are applied to the experiment itself, not individual cases.
diff --git a/src/Braintrust.Sdk/Eval/FunctionClassifier.cs b/src/Braintrust.Sdk/Eval/FunctionClassifier.cs
new file mode 100644
index 0000000..5aeb473
--- /dev/null
+++ b/src/Braintrust.Sdk/Eval/FunctionClassifier.cs
@@ -0,0 +1,73 @@
+namespace Braintrust.Sdk.Eval;
+
+/// <summary>
+/// Implementation of a classifier from a function.
+/// Supports synchronous and asynchronous functions returning either a single
+/// <see cref="Classification"/> or a list. Returning null means "no classifications for this case".
+/// </summary>
+/// <typeparam name="TInput">Type of the input data</typeparam>
+/// <typeparam name="TOutput">Type of the output data</typeparam>
+public class FunctionClassifier<TInput, TOutput> : IClassifier<TInput, TOutput>
+    where TInput : notnull
+    where TOutput : notnull
+{
+    // Shared empty result returned whenever the wrapped function yields null.
+    private static readonly IReadOnlyList<Classification> Empty = Array.Empty<Classification>();
+
+    // Every constructor adapts its delegate to this single async, list-returning shape.
+    private readonly Func<TaskResult<TInput, TOutput>, Task<IReadOnlyList<Classification>>> _classifierFn;
+
+    /// <summary>
+    /// Create a classifier from a synchronous function returning a single classification (or null).
+    /// </summary>
+    /// <exception cref="ArgumentNullException"><paramref name="classifierFn"/> is null.</exception>
+    public FunctionClassifier(string name, Func<TaskResult<TInput, TOutput>, Classification?> classifierFn)
+    {
+        // Fail fast here rather than with a NullReferenceException on the first Classify call.
+        if (classifierFn == null)
+        {
+            throw new ArgumentNullException(nameof(classifierFn));
+        }
+
+        Name = name;
+        _classifierFn = taskResult =>
+        {
+            var result = classifierFn(taskResult);
+            return Task.FromResult<IReadOnlyList<Classification>>(
+                result.HasValue ? new[] { result.Value } : Empty);
+        };
+    }
+
+    /// <summary>
+    /// Create a classifier from a synchronous function returning a list of classifications (or null).
+    /// </summary>
+    /// <exception cref="ArgumentNullException"><paramref name="classifierFn"/> is null.</exception>
+    public FunctionClassifier(string name, Func<TaskResult<TInput, TOutput>, IReadOnlyList<Classification>?> classifierFn)
+    {
+        if (classifierFn == null)
+        {
+            throw new ArgumentNullException(nameof(classifierFn));
+        }
+
+        Name = name;
+        _classifierFn = taskResult =>
+        {
+            var result = classifierFn(taskResult);
+            return Task.FromResult<IReadOnlyList<Classification>>(result ?? Empty);
+        };
+    }
+
+    /// <summary>
+    /// Create a classifier from an asynchronous function returning a single classification (or null).
+    /// </summary>
+    /// <exception cref="ArgumentNullException"><paramref name="classifierFn"/> is null.</exception>
+    public FunctionClassifier(string name, Func<TaskResult<TInput, TOutput>, Task<Classification?>> classifierFn)
+    {
+        if (classifierFn == null)
+        {
+            throw new ArgumentNullException(nameof(classifierFn));
+        }
+
+        Name = name;
+        _classifierFn = async taskResult =>
+        {
+            var result = await classifierFn(taskResult).ConfigureAwait(false);
+            return result.HasValue ? new[] { result.Value } : Empty;
+        };
+    }
+
+    /// <summary>
+    /// Create a classifier from an asynchronous function returning a list of classifications (or null).
+    /// </summary>
+    /// <exception cref="ArgumentNullException"><paramref name="classifierFn"/> is null.</exception>
+    public FunctionClassifier(string name, Func<TaskResult<TInput, TOutput>, Task<IReadOnlyList<Classification>?>> classifierFn)
+    {
+        if (classifierFn == null)
+        {
+            throw new ArgumentNullException(nameof(classifierFn));
+        }
+
+        Name = name;
+        _classifierFn = async taskResult =>
+        {
+            var result = await classifierFn(taskResult).ConfigureAwait(false);
+            return result ?? Empty;
+        };
+    }
+
+    /// <summary>
+    /// Name passed to the constructor, returned unchanged.
+    /// </summary>
+    public string Name { get; }
+
+    /// <summary>
+    /// Invokes the wrapped function; a null result from it is returned as an empty list.
+    /// </summary>
+    public Task<IReadOnlyList<Classification>> Classify(TaskResult<TInput, TOutput> taskResult)
+        => _classifierFn(taskResult);
+}
diff --git a/src/Braintrust.Sdk/Eval/IClassifier.cs b/src/Braintrust.Sdk/Eval/IClassifier.cs
new file mode 100644
index 0000000..f85f502
--- /dev/null
+++ b/src/Braintrust.Sdk/Eval/IClassifier.cs
@@ -0,0 +1,30 @@
+namespace Braintrust.Sdk.Eval;
+
+/// <summary>
+/// A classifier categorizes and labels eval outputs.
+/// Unlike <see cref="IScorer{TInput, TOutput}"/> (which returns numeric 0-1 values),
+/// classifiers return structured <see cref="Classification"/> items with an id and optional label and metadata.
+/// </summary>
+/// <remarks>
+/// Implementations must be thread-safe as classifiers may be executed concurrently.
+/// Classifier failures are non-fatal: an exception thrown by <see cref="Classify"/> is recorded
+/// under <c>classifier_errors</c> in the eval span's metadata and does not abort the evaluation.
+/// </remarks>
+/// <typeparam name="TInput">Type of the input data</typeparam>
+/// <typeparam name="TOutput">Type of the output data</typeparam>
+public interface IClassifier
+ where TInput : notnull
+ where TOutput : notnull
+{
+ /// <summary>
+ /// Gets the name of this classifier. Used as the classifier span name and as the
+ /// default grouping key when a returned <see cref="Classification"/> has no Name.
+ /// </summary>
+ string Name { get; }
+
+ /// <summary>
+ /// Classify the task result and return zero or more classifications.
+ /// Return an empty list to indicate no classifications for this case.
+ /// </summary>
+ Task> Classify(TaskResult taskResult);
+}
diff --git a/src/Braintrust.Sdk/Eval/ITracedClassifier.cs b/src/Braintrust.Sdk/Eval/ITracedClassifier.cs
new file mode 100644
index 0000000..97fd6d7
--- /dev/null
+++ b/src/Braintrust.Sdk/Eval/ITracedClassifier.cs
@@ -0,0 +1,24 @@
+namespace Braintrust.Sdk.Eval;
+
+/// <summary>
+/// A classifier that receives access to the distributed trace (spans) of the task that was evaluated.
+/// This allows classifiers to inspect intermediate LLM calls and tool-use chains, not just the final output.
+///
+/// Implement this interface when your classifier needs to examine multi-turn conversations or tool-use chains
+/// (e.g. classifying a conversation pattern as "single-turn", "tool-heavy", or "clarification-loop").
+/// When a classifier implements this interface, the trace-aware <c>Classify(taskResult, trace)</c> overload
+/// is called instead of <c>IClassifier.Classify(taskResult)</c>.
+/// Backward-compatible: classifiers that only implement <see cref="IClassifier{TInput, TOutput}"/> continue to work without change.
+/// </summary>
+/// <typeparam name="TInput">The type of input data for the evaluation</typeparam>
+/// <typeparam name="TOutput">The type of output produced by the task</typeparam>
+public interface ITracedClassifier : IClassifier
+ where TInput : notnull
+ where TOutput : notnull
+{
+ /// <summary>
+ /// Classify the task result using the distributed trace for additional context.
+ /// Called instead of <c>IClassifier.Classify(taskResult)</c> when trace is available.
+ /// </summary>
+ Task> Classify(TaskResult taskResult, EvalTrace trace);
+}
diff --git a/tests/Braintrust.Sdk.Tests/Eval/ClassifierTest.cs b/tests/Braintrust.Sdk.Tests/Eval/ClassifierTest.cs
new file mode 100644
index 0000000..b790130
--- /dev/null
+++ b/tests/Braintrust.Sdk.Tests/Eval/ClassifierTest.cs
@@ -0,0 +1,573 @@
+using System.Diagnostics;
+using System.Text.Json;
+using Braintrust.Sdk.Config;
+using Braintrust.Sdk.Eval;
+
+namespace Braintrust.Sdk.Tests.Eval;
+
+[Collection("BraintrustGlobals")]
+public class ClassifierTest : IDisposable
+{
+ private readonly ActivityListener _activityListener;
+
+ public ClassifierTest()
+ {
+ Braintrust.ResetForTest();
+ _activityListener = new ActivityListener
+ {
+ ShouldListenTo = source => source.Name == "braintrust-dotnet",
+ Sample = (ref ActivityCreationOptions _) => ActivitySamplingResult.AllDataAndRecorded
+ };
+ ActivitySource.AddActivityListener(_activityListener);
+ }
+
+ public void Dispose()
+ {
+ _activityListener?.Dispose();
+ Braintrust.ResetForTest();
+ }
+
+ // =====================================================================
+ // FunctionClassifier shape normalization
+ // =====================================================================
+
+ [Fact]
+ public async Task FunctionClassifierReturnsSingleClassification()
+ {
+ var classifier = new FunctionClassifier(
+ "category",
+ _ => new Classification("greeting", Label: "Greeting"));
+
+ var taskResult = MakeTaskResult("hello", "hi");
+ var results = await classifier.Classify(taskResult);
+
+ Assert.Single(results);
+ Assert.Equal("greeting", results[0].Id);
+ Assert.Equal("Greeting", results[0].Label);
+ }
+
+ [Fact]
+ public async Task FunctionClassifierReturnsList()
+ {
+ var classifier = new FunctionClassifier(
+ "sentiment",
+ _ => (IReadOnlyList)new[]
+ {
+ new Classification("positive", Label: "Positive"),
+ new Classification("enthusiastic", Label: "Enthusiastic")
+ });
+
+ var results = await classifier.Classify(MakeTaskResult("great!", ""));
+
+ Assert.Equal(2, results.Count);
+ Assert.Equal("positive", results[0].Id);
+ Assert.Equal("enthusiastic", results[1].Id);
+ }
+
+ [Fact]
+ public async Task FunctionClassifierNullReturnsEmptyList()
+ {
+ var classifier = new FunctionClassifier(
+ "maybe",
+ _ => (Classification?)null);
+
+ var results = await classifier.Classify(MakeTaskResult("hello", "hi"));
+ Assert.Empty(results);
+ }
+
+ [Fact]
+ public async Task FunctionClassifierNullListReturnsEmptyList()
+ {
+ var classifier = new FunctionClassifier(
+ "maybe",
+ _ => (IReadOnlyList?)null);
+
+ var results = await classifier.Classify(MakeTaskResult("hello", "hi"));
+ Assert.Empty(results);
+ }
+
+ [Fact]
+ public async Task FunctionClassifierAsyncSingle()
+ {
+ var classifier = new FunctionClassifier(
+ "category",
+ _ => Task.FromResult(new Classification("greeting")));
+
+ var results = await classifier.Classify(MakeTaskResult("hello", "hi"));
+ Assert.Single(results);
+ Assert.Equal("greeting", results[0].Id);
+ }
+
+ [Fact]
+ public async Task FunctionClassifierAsyncList()
+ {
+ var classifier = new FunctionClassifier(
+ "category",
+ _ => Task.FromResult?>(new[]
+ {
+ new Classification("a"),
+ new Classification("b")
+ }));
+
+ var results = await classifier.Classify(MakeTaskResult("hello", "hi"));
+ Assert.Equal(2, results.Count);
+ }
+
+ // =====================================================================
+ // Builder validation
+ // =====================================================================
+
+ [Fact]
+ public async Task EvalRequiresAtLeastScorersOrClassifiers()
+ {
+ var config = BraintrustConfig.Of(("BRAINTRUST_API_KEY", "test-key"));
+ var mockClient = new MockBraintrustApiClient();
+
+ var ex = await Assert.ThrowsAsync(() =>
+ Eval.NewBuilder()
+ .Name("test-eval")
+ .Config(config)
+ .ApiClient(mockClient)
+ .Cases(DatasetCase.Of("input", "expected"))
+ .TaskFunction(x => x)
+ .BuildAsync());
+
+ Assert.Contains("at least one scorer or classifier", ex.Message);
+ }
+
+ [Fact]
+ public async Task EvalBuildsWithClassifiersOnly()
+ {
+ var config = BraintrustConfig.Of(
+ ("BRAINTRUST_API_KEY", "test-key"),
+ ("BRAINTRUST_APP_URL", "https://braintrust.dev"),
+ ("BRAINTRUST_DEFAULT_PROJECT_NAME", "test-project"));
+ var mockClient = new MockBraintrustApiClient();
+
+ var eval = await Eval.NewBuilder()
+ .Name("test-eval")
+ .Config(config)
+ .ApiClient(mockClient)
+ .Cases(DatasetCase.Of("hello", "hi"))
+ .TaskFunction(x => x)
+ .Classifiers(new FunctionClassifier(
+ "category",
+ _ => new Classification("greeting")))
+ .BuildAsync();
+
+ var result = await eval.RunAsync();
+ Assert.NotNull(result.ExperimentUrl);
+ }
+
+ // =====================================================================
+ // Runner — classifier results on the eval span
+ // =====================================================================
+
+ [Fact]
+ public async Task RunnerWritesClassificationsToEvalSpan()
+ {
+ var (rootSpans, classifierSpans) = await RunEval(
+ cases: new[] { DatasetCase.Of("hello", "hi") },
+ taskFn: x => x,
+ classifiers: new IClassifier[]
+ {
+ new FunctionClassifier(
+ "category",
+ _ => new Classification("greeting", Label: "Greeting"))
+ });
+
+ var root = Assert.Single(rootSpans);
+ var classifications = ReadClassifications(root);
+ Assert.NotNull(classifications);
+ Assert.True(classifications.RootElement.TryGetProperty("category", out var categoryItems));
+ Assert.Equal(1, categoryItems.GetArrayLength());
+ Assert.Equal("greeting", categoryItems[0].GetProperty("id").GetString());
+ Assert.Equal("Greeting", categoryItems[0].GetProperty("label").GetString());
+
+ // Single classifier span produced
+ Assert.Single(classifierSpans);
+ }
+
+ [Fact]
+ public async Task RunnerWritesNoClassificationsTagWhenAllNull()
+ {
+ var (rootSpans, _) = await RunEval(
+ cases: new[] { DatasetCase.Of("hello", "hi") },
+ taskFn: x => x,
+ classifiers: new IClassifier[]
+ {
+ new FunctionClassifier("maybe", _ => (Classification?)null)
+ });
+
+ var root = Assert.Single(rootSpans);
+ Assert.Null(root.GetTagItem("braintrust.classifications"));
+ }
+
+ [Fact]
+ public async Task RunnerCombinesScorersAndClassifiers()
+ {
+ var (rootSpans, _) = await RunEval(
+ cases: new[] { DatasetCase.Of("hello", "hi") },
+ taskFn: x => x,
+ scorers: new IScorer[]
+ {
+ new FunctionScorer("exact", (e, a) => e == a ? 1.0 : 0.0)
+ },
+ classifiers: new IClassifier[]
+ {
+ new FunctionClassifier("category", _ => new Classification("greeting"))
+ });
+
+ var root = Assert.Single(rootSpans);
+ Assert.NotNull(root.GetTagItem("braintrust.classifications"));
+ // The eval span does not store scores itself; verify the classification path was hit
+ // independently from the scorer path. Score span coverage is in EvalTest.
+ }
+
+ [Fact]
+ public async Task RunnerHandlesClassifierExceptionWithoutAbortingEval()
+ {
+ var (rootSpans, classifierSpans) = await RunEval(
+ cases: new[] { DatasetCase.Of("hello", "hi") },
+ taskFn: x => x,
+ classifiers: new IClassifier[]
+ {
+ new ThrowingClassifier("broken", "classifier boom"),
+ new FunctionClassifier("working", _ => new Classification("ok"))
+ });
+
+ var root = Assert.Single(rootSpans);
+
+ // Classifier errors merged into braintrust.metadata under classifier_errors
+ var metadataJson = root.GetTagItem("braintrust.metadata") as string;
+ Assert.NotNull(metadataJson);
+ using var doc = JsonDocument.Parse(metadataJson);
+ Assert.True(doc.RootElement.TryGetProperty("classifier_errors", out var errors));
+ Assert.Equal("classifier boom", errors.GetProperty("broken").GetString());
+
+ // The working classifier still wrote its classification
+ var classifications = ReadClassifications(root);
+ Assert.NotNull(classifications);
+ Assert.True(classifications.RootElement.TryGetProperty("working", out _));
+
+ // The broken classifier span has error status + exception event
+ var brokenSpan = classifierSpans.First(s => s.DisplayName == "broken");
+ Assert.Equal(ActivityStatusCode.Error, brokenSpan.Status);
+ Assert.NotEmpty(brokenSpan.Events);
+
+ // The eval (root) span itself is not marked Error by a classifier failure
+ Assert.Equal(ActivityStatusCode.Unset, root.Status);
+ }
+
+ /// <summary>
+ /// Checks the single classifier span: its display name and the <c>type</c>, <c>name</c> and
+ /// <c>purpose</c> fields of its <c>braintrust.span_attributes</c> JSON.
+ /// </summary>
+ [Fact]
+ public async Task RunnerWritesClassifierSpanAttributes()
+ {
+     var classifiers = new IClassifier[]
+     {
+         new FunctionClassifier("my_classifier", _ => new Classification("foo"))
+     };
+
+     var (_, classifierSpans) = await RunEval(
+         cases: new[] { DatasetCase.Of("hello", "hi") },
+         taskFn: x => x,
+         classifiers: classifiers);
+
+     var classifierSpan = Assert.Single(classifierSpans);
+     Assert.Equal("my_classifier", classifierSpan.DisplayName);
+
+     var rawAttributes = classifierSpan.GetTagItem("braintrust.span_attributes") as string;
+     Assert.NotNull(rawAttributes);
+
+     using var parsed = JsonDocument.Parse(rawAttributes);
+     var attributes = parsed.RootElement;
+     Assert.Equal("classifier", attributes.GetProperty("type").GetString());
+     Assert.Equal("my_classifier", attributes.GetProperty("name").GetString());
+     Assert.Equal("scorer", attributes.GetProperty("purpose").GetString());
+ }
+
+ /// <summary>
+ /// A classifier returning two classifications must produce a two-element array under its name
+ /// in <c>braintrust.classifications</c>, in the order the classifier returned them.
+ /// </summary>
+ [Fact]
+ public async Task RunnerMultiLabelResultPreservesOrder()
+ {
+     var (rootSpans, _) = await RunEval(
+         cases: new[] { DatasetCase.Of("great!", "hi") },
+         taskFn: x => x,
+         classifiers: new IClassifier[]
+         {
+             new FunctionClassifier(
+                 "sentiment",
+                 _ => (IReadOnlyList)new[]
+                 {
+                     new Classification("positive", Label: "Positive"),
+                     new Classification("enthusiastic", Label: "Enthusiastic")
+                 })
+         });
+
+     var root = Assert.Single(rootSpans);
+
+     // JsonDocument is IDisposable; dispose the parsed classifications when the test ends.
+     using var classifications = ReadClassifications(root);
+     Assert.NotNull(classifications);
+
+     var items = classifications.RootElement.GetProperty("sentiment");
+     Assert.Equal(2, items.GetArrayLength());
+     Assert.Equal("positive", items[0].GetProperty("id").GetString());
+     Assert.Equal("enthusiastic", items[1].GetProperty("id").GetString());
+ }
+
+ /// <summary>
+ /// When a classification is created without a name, the root span's classifications must be
+ /// keyed by the classifier's own name.
+ /// </summary>
+ [Fact]
+ public async Task RunnerClassificationNameDefaultsToClassifierName()
+ {
+     var (rootSpans, _) = await RunEval(
+         cases: new[] { DatasetCase.Of("hello", "hi") },
+         taskFn: x => x,
+         classifiers: new IClassifier[]
+         {
+             // Classification has no Name set
+             new FunctionClassifier(
+                 "my_classifier",
+                 _ => new Classification("foo"))
+         });
+
+     var root = Assert.Single(rootSpans);
+
+     // JsonDocument is IDisposable; dispose the parsed classifications when the test ends.
+     using var classifications = ReadClassifications(root);
+     Assert.NotNull(classifications);
+     Assert.True(classifications.RootElement.TryGetProperty("my_classifier", out _));
+ }
+
+ /// <summary>
+ /// When a classification sets <c>Name</c> explicitly, the root span's classifications must be
+ /// keyed by that name and must not contain the classifier's own name.
+ /// </summary>
+ [Fact]
+ public async Task RunnerClassificationExplicitNameOverridesClassifierName()
+ {
+     var (rootSpans, _) = await RunEval(
+         cases: new[] { DatasetCase.Of("hello", "hi") },
+         taskFn: x => x,
+         classifiers: new IClassifier[]
+         {
+             new FunctionClassifier(
+                 "my_classifier",
+                 _ => new Classification("foo", Name: "override_name"))
+         });
+
+     var root = Assert.Single(rootSpans);
+
+     // JsonDocument is IDisposable; dispose the parsed classifications when the test ends.
+     using var classifications = ReadClassifications(root);
+     Assert.NotNull(classifications);
+     Assert.True(classifications.RootElement.TryGetProperty("override_name", out _));
+     Assert.False(classifications.RootElement.TryGetProperty("my_classifier", out _));
+ }
+
+ /// <summary>
+ /// A classifier returning <c>default(Classification)</c> must be reported under
+ /// <c>classifier_errors</c> in the root span metadata, and its span must carry error status.
+ /// </summary>
+ [Fact]
+ public async Task RunnerEmptyClassificationItemIsRecordedAsError()
+ {
+     var classifiers = new IClassifier[]
+     {
+         // default(Classification) has a null/empty Id, so it is expected to fail validation.
+         new FunctionClassifier(
+             "bad",
+             _ => (Classification?)default(Classification))
+     };
+
+     var (rootSpans, classifierSpans) = await RunEval(
+         cases: new[] { DatasetCase.Of("hello", "hi") },
+         taskFn: x => x,
+         classifiers: classifiers);
+
+     var evalSpan = Assert.Single(rootSpans);
+     var rawMetadata = evalSpan.GetTagItem("braintrust.metadata") as string;
+     Assert.NotNull(rawMetadata);
+
+     using var metadata = JsonDocument.Parse(rawMetadata);
+     var classifierErrors = metadata.RootElement.GetProperty("classifier_errors");
+     var message = classifierErrors.GetProperty("bad").GetString();
+     Assert.NotNull(message);
+     Assert.Contains("each classification must be a non-empty object", message);
+
+     var failedSpan = Assert.Single(classifierSpans);
+     Assert.Equal(ActivityStatusCode.Error, failedSpan.Status);
+ }
+
+ /// <summary>
+ /// Runs three cases through one classifier and checks that each of the three root spans
+ /// carries a <c>category</c> entry in its classifications.
+ /// </summary>
+ [Fact]
+ public async Task RunnerAccumulatesClassificationsAcrossCases()
+ {
+     var (rootSpans, _) = await RunEval(
+         cases: new[]
+         {
+             DatasetCase.Of("hi", "x"),
+             DatasetCase.Of("hello", "x"),
+             DatasetCase.Of("ok", "x")
+         },
+         taskFn: x => x,
+         classifiers: new IClassifier[]
+         {
+             new FunctionClassifier(
+                 "category",
+                 tr => new Classification(tr.Result.Length > 3 ? "long" : "short"))
+         });
+
+     Assert.Equal(3, rootSpans.Count);
+     foreach (var root in rootSpans)
+     {
+         // One JsonDocument is parsed per span; dispose each at the end of its iteration.
+         using var classifications = ReadClassifications(root);
+         Assert.NotNull(classifications);
+         Assert.True(classifications.RootElement.TryGetProperty("category", out _));
+     }
+ }
+
+ /// <summary>
+ /// Checks that the classifier span's <c>braintrust.input_json</c> contains the case input,
+ /// the expected value, the task output, and the case metadata.
+ /// </summary>
+ [Fact]
+ public async Task RunnerClassifierInputContainsAllScoringArgs()
+ {
+     var cases = new[]
+     {
+         DatasetCase.Of(
+             "hello", "hi",
+             new List(),
+             new Dictionary { ["k"] = "v" })
+     };
+     var classifiers = new IClassifier[]
+     {
+         new FunctionClassifier("category", _ => new Classification("greeting"))
+     };
+
+     var (_, classifierSpans) = await RunEval(
+         cases: cases,
+         taskFn: x => x,
+         classifiers: classifiers);
+
+     var classifierSpan = Assert.Single(classifierSpans);
+     var rawInput = classifierSpan.GetTagItem("braintrust.input_json") as string;
+     Assert.NotNull(rawInput);
+
+     using var parsed = JsonDocument.Parse(rawInput);
+     var payload = parsed.RootElement;
+     Assert.Equal("hello", payload.GetProperty("input").GetString());
+     Assert.Equal("hi", payload.GetProperty("expected").GetString());
+     // The task function is the identity, so the output equals the input.
+     Assert.Equal("hello", payload.GetProperty("output").GetString());
+     Assert.True(payload.TryGetProperty("metadata", out var metadata));
+     Assert.Equal("v", metadata.GetProperty("k").GetString());
+ }
+
+ // =====================================================================
+ // ITracedClassifier
+ // =====================================================================
+
+ /// <summary>
+ /// A trace-aware classifier must receive a trace from which it can fetch the "llm" spans
+ /// served by the mock BTQL client; exactly one span and one BTQL query are expected.
+ /// </summary>
+ [Fact]
+ public async Task TracedClassifierReceivesEvalTrace()
+ {
+     var llmInput = new { messages = new[] { new { role = "user", content = "hi" } } };
+     var llmOutput = new { choices = new[] { new { message = new { role = "assistant", content = "hello" } } } };
+     var storedSpans = new[]
+     {
+         MockBtqlClient.MakeSpan("llm", input: llmInput, output: llmOutput)
+     };
+     var btql = new MockBtqlClient(storedSpans);
+
+     // Stays at -1 unless the trace-aware Classify overload actually runs.
+     var observedSpanCount = -1;
+     var traceInspector = new TracedClassifier(
+         "trace_inspector",
+         async (_, trace) =>
+         {
+             var llmSpans = await trace.GetSpansAsync("llm");
+             observedSpanCount = llmSpans.Count;
+             return new[] { new Classification("multi_turn") };
+         });
+
+     var (rootSpans, _) = await RunEval(
+         cases: new[] { DatasetCase.Of("hello", "hi") },
+         taskFn: x => x,
+         classifiers: new IClassifier[] { traceInspector },
+         btqlClient: btql);
+
+     Assert.Single(rootSpans);
+     Assert.Equal(1, observedSpanCount);
+     Assert.Equal(1, btql.QueryCount);
+ }
+
+ // =====================================================================
+ // Helpers
+ // =====================================================================
+
+ /// <summary>
+ /// Builds a task result whose first constructor argument is <paramref name="output"/> and whose
+ /// dataset case is created from <paramref name="input"/> and an empty string.
+ /// </summary>
+ private static TaskResult MakeTaskResult(string input, string output)
+ {
+     var datasetCase = new DatasetCase(input, "");
+     return new(output, datasetCase);
+ }
+
+ /// <summary>
+ /// Parses the <c>braintrust.classifications</c> tag of <paramref name="span"/> as JSON.
+ /// </summary>
+ /// <param name="span">Activity whose tag is read.</param>
+ /// <returns>
+ /// The parsed document, or <c>null</c> when the tag is missing or is not a string.
+ /// The returned <see cref="JsonDocument"/> is disposable.
+ /// </returns>
+ private static JsonDocument? ReadClassifications(Activity span)
+ {
+     if (span.GetTagItem("braintrust.classifications") is not string payload)
+     {
+         return null;
+     }
+
+     return JsonDocument.Parse(payload);
+ }
+
+ /// <summary>
+ /// Builds and runs an eval named "classifier-test" against mock API/BTQL clients while an
+ /// <see cref="ActivityListener"/> collects every stopped activity from the "braintrust-dotnet" source.
+ /// </summary>
+ /// <param name="cases">Dataset cases handed to the eval builder.</param>
+ /// <param name="taskFn">Task function handed to the eval builder.</param>
+ /// <param name="scorers">Optional scorers; registered only when non-empty.</param>
+ /// <param name="classifiers">Optional classifiers; registered only when non-empty.</param>
+ /// <param name="btqlClient">Optional BTQL mock; a new <c>MockBtqlClient</c> is created when null.</param>
+ /// <returns>
+ /// The captured activities whose display name is "eval", and the captured activities whose
+ /// <c>braintrust.span_attributes</c> JSON has a top-level <c>type</c> of "classifier".
+ /// </returns>
+ /// <exception cref="InvalidOperationException">Neither a scorer nor a classifier was supplied.</exception>
+ private async Task<(List RootSpans, List ClassifierSpans)> RunEval(
+     DatasetCase[] cases,
+     Func taskFn,
+     IScorer[]? scorers = null,
+     IClassifier[]? classifiers = null,
+     MockBtqlClient? btqlClient = null)
+ {
+     var hasScorers = scorers != null && scorers.Length > 0;
+     var hasClassifiers = classifiers != null && classifiers.Length > 0;
+     if (!hasScorers && !hasClassifiers)
+     {
+         // The validator forbids zero classifiers and zero scorers; tests using RunEval should specify at least one.
+         // Checked up front so a mis-written test fails before any listener or builder is set up.
+         throw new InvalidOperationException("Test setup error: provide at least one scorer or classifier.");
+     }
+
+     var config = BraintrustConfig.Of(
+         ("BRAINTRUST_API_KEY", "test-key"),
+         ("BRAINTRUST_APP_URL", "https://braintrust.dev"),
+         ("BRAINTRUST_DEFAULT_PROJECT_NAME", "test-project"));
+     var mockClient = new MockBraintrustApiClient();
+     btqlClient ??= new MockBtqlClient();
+
+     var captured = new List();
+     using var listener = new ActivityListener
+     {
+         ShouldListenTo = source => source.Name == "braintrust-dotnet",
+         Sample = (ref ActivityCreationOptions _) => ActivitySamplingResult.AllDataAndRecorded,
+         ActivityStopped = captured.Add
+     };
+     ActivitySource.AddActivityListener(listener);
+
+     var builder = Eval.NewBuilder()
+         .Name("classifier-test")
+         .Config(config)
+         .ApiClient(mockClient)
+         .BtqlClient(btqlClient)
+         .Cases(cases)
+         .TaskFunction(taskFn);
+
+     if (hasScorers)
+     {
+         builder.Scorers(scorers);
+     }
+
+     if (hasClassifiers)
+     {
+         builder.Classifiers(classifiers);
+     }
+
+     var eval = await builder.BuildAsync();
+     await eval.RunAsync();
+
+     var rootSpans = captured.Where(a => a.DisplayName == "eval").ToList();
+     var classifierSpans = captured.Where(IsClassifierSpan).ToList();
+     return (rootSpans, classifierSpans);
+
+     // Parses the span attributes instead of substring-matching the raw JSON, so the check does
+     // not depend on how the serializer spaces its output. Non-JSON attributes are treated as
+     // "not a classifier span" rather than failing the helper.
+     static bool IsClassifierSpan(Activity activity)
+     {
+         if (activity.GetTagItem("braintrust.span_attributes") is not string attrs)
+         {
+             return false;
+         }
+
+         try
+         {
+             using var parsed = JsonDocument.Parse(attrs);
+             var attributes = parsed.RootElement;
+             return attributes.ValueKind == JsonValueKind.Object
+                 && attributes.TryGetProperty("type", out var type)
+                 && type.ValueKind == JsonValueKind.String
+                 && type.GetString() == "classifier";
+         }
+         catch (JsonException)
+         {
+             return false;
+         }
+     }
+ }
+
+ /// <summary>
+ /// Test classifier whose <c>Classify</c> always throws an <see cref="InvalidOperationException"/>
+ /// carrying the message supplied at construction.
+ /// </summary>
+ private sealed class ThrowingClassifier : IClassifier
+ {
+     private readonly string _message;
+
+     public ThrowingClassifier(string name, string message)
+     {
+         _message = message;
+         Name = name;
+     }
+
+     public string Name { get; }
+
+     // Not async: the exception is thrown synchronously from the call itself.
+     public Task> Classify(TaskResult taskResult)
+     {
+         throw new InvalidOperationException(_message);
+     }
+ }
+
+ // Test double implementing ITracedClassifier: the trace-aware Classify overload forwards to a
+ // delegate supplied by the test, and the trace-less overload returns an empty result.
+ private sealed class TracedClassifier : ITracedClassifier
+ {
+ // Delegate invoked by the trace-aware overload with the task result and the eval trace.
+ private readonly Func, EvalTrace, Task>> _fn;
+ // Stores the classifier name and the delegate; no validation is performed here.
+ public TracedClassifier(
+ string name,
+ Func, EvalTrace, Task>> fn)
+ {
+ Name = name;
+ _fn = fn;
+ }
+ public string Name { get; }
+
+ // Trace-less overload: returns an already-completed task wrapping an empty array.
+ public Task> Classify(TaskResult taskResult)
+ => Task.FromResult>(Array.Empty());
+
+ // Trace-aware overload: delegates directly to the function passed to the constructor.
+ public Task> Classify(TaskResult taskResult, EvalTrace trace)
+ => _fn(taskResult, trace);
+ }
+}