From e1e162357a6c4c956cfbba7801a0a70053fa083c Mon Sep 17 00:00:00 2001 From: Stephen Belanger Date: Tue, 26 May 2026 00:58:28 +0800 Subject: [PATCH 1/2] feat: add classifier support Adds the classifiers feature from the spec (braintrust-spec/docs/features/classifiers.md). Classifiers return structured Classification items (id, optional label, optional metadata) keyed by name and run in parallel with scorers. At least one of scorers or classifiers is now required. Includes ITracedClassifier (parallel to ITracedScorer) so classifiers can inspect intermediate trace spans, e.g. to label conversation patterns. Co-Authored-By: Claude Opus 4.7 (1M context) --- Braintrust.Sdk.sln | 7 + .../ClassifiersExample.csproj | 14 + examples/ClassifiersExample/Program.cs | 152 +++++ src/Braintrust.Sdk/Eval/Classification.cs | 14 + src/Braintrust.Sdk/Eval/Eval.cs | 258 +++++++- src/Braintrust.Sdk/Eval/FunctionClassifier.cs | 73 +++ src/Braintrust.Sdk/Eval/IClassifier.cs | 30 + src/Braintrust.Sdk/Eval/ITracedClassifier.cs | 24 + .../Eval/ClassifierTest.cs | 566 ++++++++++++++++++ 9 files changed, 1125 insertions(+), 13 deletions(-) create mode 100644 examples/ClassifiersExample/ClassifiersExample.csproj create mode 100644 examples/ClassifiersExample/Program.cs create mode 100644 src/Braintrust.Sdk/Eval/Classification.cs create mode 100644 src/Braintrust.Sdk/Eval/FunctionClassifier.cs create mode 100644 src/Braintrust.Sdk/Eval/IClassifier.cs create mode 100644 src/Braintrust.Sdk/Eval/ITracedClassifier.cs create mode 100644 tests/Braintrust.Sdk.Tests/Eval/ClassifierTest.cs diff --git a/Braintrust.Sdk.sln b/Braintrust.Sdk.sln index b9f05d0..e7c2dfb 100644 --- a/Braintrust.Sdk.sln +++ b/Braintrust.Sdk.sln @@ -19,6 +19,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "OpenAIInstrumentation", "ex EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "EvalExample", "examples\EvalExample\EvalExample.csproj", "{DFAA25AA-72B1-4246-BAB9-A10CCF115406}" EndProject 
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ClassifiersExample", "examples\ClassifiersExample\ClassifiersExample.csproj", "{0A934BA7-BEBB-4EF0-88A6-9A5355E6D0BB}" +EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "TraceScoring", "examples\TraceScoring\TraceScoring.csproj", "{66D24AFB-3541-429D-9402-72A344D99115}" EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Braintrust.Sdk.OpenAI", "src\Braintrust.Sdk.OpenAI\Braintrust.Sdk.OpenAI.csproj", "{B3C7D1A2-4E5F-6789-ABCD-EF0123456789}" @@ -72,6 +74,10 @@ Global {DFAA25AA-72B1-4246-BAB9-A10CCF115406}.Debug|Any CPU.Build.0 = Debug|Any CPU {DFAA25AA-72B1-4246-BAB9-A10CCF115406}.Release|Any CPU.ActiveCfg = Release|Any CPU {DFAA25AA-72B1-4246-BAB9-A10CCF115406}.Release|Any CPU.Build.0 = Release|Any CPU + {0A934BA7-BEBB-4EF0-88A6-9A5355E6D0BB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {0A934BA7-BEBB-4EF0-88A6-9A5355E6D0BB}.Debug|Any CPU.Build.0 = Debug|Any CPU + {0A934BA7-BEBB-4EF0-88A6-9A5355E6D0BB}.Release|Any CPU.ActiveCfg = Release|Any CPU + {0A934BA7-BEBB-4EF0-88A6-9A5355E6D0BB}.Release|Any CPU.Build.0 = Release|Any CPU {66D24AFB-3541-429D-9402-72A344D99115}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {66D24AFB-3541-429D-9402-72A344D99115}.Debug|Any CPU.Build.0 = Debug|Any CPU {66D24AFB-3541-429D-9402-72A344D99115}.Release|Any CPU.ActiveCfg = Release|Any CPU @@ -127,6 +133,7 @@ Global {5A09E90C-6BCB-440C-AC03-5212B2AAE6C2} = {A1BDA853-65BE-4CC8-8070-CCBA22069A7A} {929EDD10-7B06-4C4F-B70F-E4E51072A724} = {A1BDA853-65BE-4CC8-8070-CCBA22069A7A} {DFAA25AA-72B1-4246-BAB9-A10CCF115406} = {A1BDA853-65BE-4CC8-8070-CCBA22069A7A} + {0A934BA7-BEBB-4EF0-88A6-9A5355E6D0BB} = {A1BDA853-65BE-4CC8-8070-CCBA22069A7A} {66D24AFB-3541-429D-9402-72A344D99115} = {A1BDA853-65BE-4CC8-8070-CCBA22069A7A} {A8A1C23E-7D6F-47FE-9959-B90E9CEF7B2C} = {6530DEC3-1D19-4854-80AC-2D6D02BEAECC} {446D2C4A-41D6-4E4F-AC4C-6809E2416A98} = {A1BDA853-65BE-4CC8-8070-CCBA22069A7A} diff --git 
a/examples/ClassifiersExample/ClassifiersExample.csproj b/examples/ClassifiersExample/ClassifiersExample.csproj new file mode 100644 index 0000000..4cc28eb --- /dev/null +++ b/examples/ClassifiersExample/ClassifiersExample.csproj @@ -0,0 +1,14 @@ + + + + + + + + Exe + net8.0 + enable + enable + + + diff --git a/examples/ClassifiersExample/Program.cs b/examples/ClassifiersExample/Program.cs new file mode 100644 index 0000000..4727ddd --- /dev/null +++ b/examples/ClassifiersExample/Program.cs @@ -0,0 +1,152 @@ +using Braintrust.Sdk.Eval; + +namespace Braintrust.Sdk.Examples.ClassifiersExample; + +// Example: Classifiers +// +// Classifiers categorize and label eval outputs. Unlike scorers (which return +// numeric 0-1 values), classifiers return structured Classification items — +// each with an Id, an optional Label, and optional Metadata. +// +// Results are stored as a dictionary keyed by classifier name: +// +// { "sentiment": [{ id: "positive", label: "Positive" }] } +// +// Three patterns are shown: +// +// 1. Inline single-label FunctionClassifier +// 2. Inline multi-label FunctionClassifier (returns IReadOnlyList) +// 3. Class-based classifier implementing IClassifier +// +// Classifiers and scorers run independently. You can use both together, or +// use only classifiers when you don't need numeric scores. 
+ +sealed class ResponseQualityClassifier : IClassifier +{ + public string Name => "response_quality"; + + public Task> Classify(TaskResult taskResult) + { + var output = taskResult.Result; + var wordCount = output.Split(' ', StringSplitOptions.RemoveEmptyEntries).Length; + + string id; + if (string.IsNullOrWhiteSpace(output)) + { + id = "no_response"; + } + else if (wordCount < 5) + { + id = "too_short"; + } + else if (output.Contains("immediately", StringComparison.OrdinalIgnoreCase) + || output.Contains("right away", StringComparison.OrdinalIgnoreCase) + || output.Contains("look into", StringComparison.OrdinalIgnoreCase)) + { + id = "action_oriented"; + } + else + { + id = "informational"; + } + + var label = char.ToUpperInvariant(id[0]) + id[1..].Replace('_', ' '); + + IReadOnlyList results = new[] + { + new Classification( + id, + Label: label, + Metadata: new Dictionary { ["word_count"] = wordCount }) + }; + return Task.FromResult(results); + } +} + +class Program +{ + private static readonly (string Input, string Expected)[] Messages = + { + ("Hi! I just wanted to say thank you, the product is amazing!", "praise"), + ("I've been waiting 2 weeks for my order. This is unacceptable!", "follow_up"), + ("How do I reset my password? I can't find the option anywhere.", "how_to"), + ("The item arrived damaged. I need a refund immediately.", "complaint"), + ("Just checking in — any update on my ticket #4821?", "follow_up") + }; + + static string GenerateResponse(string message) + { + if (Regex("thank").IsMatch(message)) + return "You're welcome! So glad you're enjoying it."; + if (Regex("waiting|order").IsMatch(message)) + return "I sincerely apologise for the delay. Let me look into this right away."; + if (Regex("password|reset").IsMatch(message)) + return "To reset your password, go to Settings > Account > Reset Password."; + if (Regex("damaged|refund").IsMatch(message)) + return "I'm sorry to hear that. 
I'll process your refund immediately."; + return "Thanks for reaching out! Let me check on that for you."; + } + + static System.Text.RegularExpressions.Regex Regex(string pattern) + => new(pattern, System.Text.RegularExpressions.RegexOptions.IgnoreCase); + + static async Task Main() + { + var braintrust = Braintrust.Get(); + + // Pattern 1: inline single-label classifier + var intentClassifier = new FunctionClassifier( + "intent", + taskResult => + { + var input = taskResult.DatasetCase.Input; + string id = + Regex("thank").IsMatch(input) ? "praise" : + Regex("waiting|order|update").IsMatch(input) ? "follow_up" : + Regex("password|reset|find").IsMatch(input) ? "how_to" : + Regex("damaged|refund").IsMatch(input) ? "complaint" : + "other"; + + return new Classification( + id, + Label: char.ToUpperInvariant(id[0]) + id[1..].Replace('_', ' ')); + }); + + // Pattern 2: inline multi-label classifier — returns a list + var toneClassifier = new FunctionClassifier( + "tone", + taskResult => + { + var input = taskResult.DatasetCase.Input; + var labels = new List(); + if (Regex("immediately|unacceptable|waiting").IsMatch(input)) + labels.Add(new Classification("urgent", Label: "Urgent")); + if (Regex("please|thank|just checking").IsMatch(input)) + labels.Add(new Classification("polite", Label: "Polite")); + if (Regex("unacceptable|damaged|waiting").IsMatch(input)) + labels.Add(new Classification("frustrated", Label: "Frustrated")); + if (labels.Count == 0) + labels.Add(new Classification("neutral", Label: "Neutral")); + return (IReadOnlyList)labels; + }); + + // Pattern 3: class-based classifier (see ResponseQualityClassifier above) + var qualityClassifier = new ResponseQualityClassifier(); + + var cases = Messages + .Select(m => DatasetCase.Of(m.Input, m.Expected)) + .ToArray(); + + var eval = await braintrust + .EvalBuilder() + .Name($"dotnet-classifiers-example-{DateTimeOffset.UtcNow.ToUnixTimeMilliseconds()}") + .Tags("classifiers-example", "dotnet-sdk") + .Cases(cases) 
+ .TaskFunction(GenerateResponse) + .Classifiers(intentClassifier, toneClassifier, qualityClassifier) + .BuildAsync(); + + var result = await eval.RunAsync(); + Console.WriteLine($"\n\n{result.CreateReportString()}"); + } +} diff --git a/src/Braintrust.Sdk/Eval/Classification.cs b/src/Braintrust.Sdk/Eval/Classification.cs new file mode 100644 index 0000000..a69b7a6 --- /dev/null +++ b/src/Braintrust.Sdk/Eval/Classification.cs @@ -0,0 +1,14 @@ +namespace Braintrust.Sdk.Eval; + +/// +/// A structured label produced by a classifier. +/// +/// Stable identifier for filtering and grouping. Required. +/// Grouping key in the per-case classifications dictionary. If null or empty, the runner defaults this to the classifier's resolved name. +/// Optional display label. Consumers may fall back to when omitted. +/// Optional arbitrary metadata associated with this classification. +public readonly record struct Classification( + string Id, + string? Name = null, + string? Label = null, + IReadOnlyDictionary? Metadata = null); diff --git a/src/Braintrust.Sdk/Eval/Eval.cs b/src/Braintrust.Sdk/Eval/Eval.cs index fbe6672..ff10611 100644 --- a/src/Braintrust.Sdk/Eval/Eval.cs +++ b/src/Braintrust.Sdk/Eval/Eval.cs @@ -32,6 +32,7 @@ public sealed class Eval private readonly IDataset _dataset; private readonly ITask _task; private readonly IReadOnlyList> _scorers; + private readonly IReadOnlyList> _classifiers; private readonly IReadOnlyList? _experimentTags; private readonly IReadOnlyDictionary? _experimentMetadata; private readonly int? _maxConcurrency; @@ -49,6 +50,7 @@ private Eval(Builder builder, OrganizationAndProjectInfo orgAndProject, RepoInfo _dataset = builder._dataset ?? throw new ArgumentNullException(nameof(builder._dataset)); _task = builder._task ?? 
throw new ArgumentNullException(nameof(builder._task)); _scorers = builder._scorers.ToList(); + _classifiers = builder._classifiers.ToList(); _experimentTags = builder._experimentTags; _experimentMetadata = builder._experimentMetadata; _maxConcurrency = builder._maxConcurrency; @@ -165,12 +167,13 @@ private async Task EvalOne(string experimentId, DatasetCase dat } if (taskException == null) { - // Task succeeded — record output and run all scorers in parallel, each in their own span + // Task succeeded — record output and run all scorers and classifiers in parallel, each in their own span rootActivity.SetTag("braintrust.output_json", ToJson(new { output = taskResult!.Value.Result })); - // Flush OTel spans to Braintrust before scoring so traced scorers can access them - var hasTracedScorers = _scorers.OfType>().Any(); - if (hasTracedScorers) + // Flush OTel spans to Braintrust before scoring so traced scorers/classifiers can access them + var needsTraceFlush = _scorers.OfType>().Any() + || _classifiers.OfType>().Any(); + if (needsTraceFlush) { BraintrustTracing.ForceFlush(); } @@ -179,7 +182,8 @@ private async Task EvalOne(string experimentId, DatasetCase dat var rootSpanId = rootActivity.TraceId.ToHexString(); var trace = new EvalTrace(ct => _btqlClient.QuerySpansAsync(experimentId, rootSpanId, ct)); - await RunScorers(experimentId, rootActivity, taskResult!.Value, trace).ConfigureAwait(false); + await RunScorersAndClassifiers(experimentId, rootActivity, taskResult!.Value, trace, datasetCase.Metadata) + .ConfigureAwait(false); } else { @@ -234,19 +238,28 @@ private async Task RunSingleScorerForTaskException( } /// - /// Runs all scorers for a successful task result, each in their own score span. - /// Calls (or - /// for traced scorers) and falls back to on error. + /// Runs all scorers and classifiers for a successful task result in parallel, each in their own span. 
+ /// After completion, aggregates classifier results onto the root span as braintrust.classifications + /// and merges any classifier errors into the root span's braintrust.metadata under + /// classifier_errors. /// - private async Task RunScorers( + private async Task RunScorersAndClassifiers( string experimentId, Activity rootActivity, TaskResult taskResult, - EvalTrace trace) + EvalTrace trace, + IReadOnlyDictionary caseMetadata) { var scorerTasks = _scorers.Select(scorer => RunSingleScorer(experimentId, rootActivity, scorer, taskResult, trace)); - await Task.WhenAll(scorerTasks).ConfigureAwait(false); + + var classifierOutcomes = new ClassifierOutcome?[_classifiers.Count]; + var classifierTasks = _classifiers.Select((classifier, index) => + RunSingleClassifier(experimentId, rootActivity, classifier, index, taskResult, trace, classifierOutcomes)); + + await Task.WhenAll(scorerTasks.Concat(classifierTasks)).ConfigureAwait(false); + + AggregateClassifierOutcomes(rootActivity, caseMetadata, classifierOutcomes); } private async Task RunSingleScorer( @@ -327,6 +340,214 @@ private static void RecordScores( } } + /// + /// Per-classifier outcome captured after running. Either a successful list of normalized items + /// (already grouped by resolved name) or an error message. + /// + private sealed class ClassifierOutcome + { + public string ClassifierName { get; } + public IReadOnlyList<(string Name, Dictionary Item)>? Items { get; } + public string? ErrorMessage { get; } + + private ClassifierOutcome( + string classifierName, + IReadOnlyList<(string Name, Dictionary Item)>? items, + string? 
errorMessage) + { + ClassifierName = classifierName; + Items = items; + ErrorMessage = errorMessage; + } + + public static ClassifierOutcome Success( + string classifierName, + IReadOnlyList<(string Name, Dictionary Item)> items) + => new(classifierName, items, null); + + public static ClassifierOutcome Error(string classifierName, string errorMessage) + => new(classifierName, null, errorMessage); + } + + private async Task RunSingleClassifier( + string experimentId, + Activity rootActivity, + IClassifier classifier, + int classifierIndex, + TaskResult taskResult, + EvalTrace trace, + ClassifierOutcome?[] outcomes) + { + var resolvedName = string.IsNullOrWhiteSpace(classifier.Name) + ? $"classifier_{classifierIndex}" + : classifier.Name; + + var classifierActivity = _activitySource.StartActivity($"classifier:{resolvedName}"); + classifierActivity?.SetTag(BraintrustTracing.ParentKey, $"experiment_id:{experimentId}"); + classifierActivity?.SetTag( + "braintrust.span_attributes", + ToJson(new { type = "classifier", purpose = "scorer" })); + + var datasetCase = taskResult.DatasetCase; + classifierActivity?.SetTag( + "braintrust.input_json", + ToJson(new + { + input = datasetCase.Input, + expected = datasetCase.Expected, + output = taskResult.Result, + metadata = datasetCase.Metadata + })); + + try + { + using var classifierScope = BraintrustContext.OfExperiment(experimentId).MakeCurrent(); + + IReadOnlyList rawResults; + try + { + rawResults = classifier is ITracedClassifier tracedClassifier + ? 
await tracedClassifier.Classify(taskResult, trace).ConfigureAwait(false) + : await classifier.Classify(taskResult).ConfigureAwait(false); + + if (rawResults == null) + { + rawResults = Array.Empty(); + } + } + catch (Exception ex) + { + classifierActivity?.SetStatus(ActivityStatusCode.Error, ex.Message); + classifierActivity?.AddEvent(CreateExceptionEvent(ex)); + outcomes[classifierIndex] = ClassifierOutcome.Error(resolvedName, ex.Message); + return; + } + + // Normalize: resolve name + validate, build storage items (no Name key). + var normalized = new List<(string Name, Dictionary Item)>(rawResults.Count); + try + { + foreach (var classification in rawResults) + { + if (string.IsNullOrEmpty(classification.Id)) + { + throw new InvalidOperationException( + "When returning structured classifier results, each classification must be a non-empty object."); + } + + var groupingName = string.IsNullOrWhiteSpace(classification.Name) + ? resolvedName + : classification.Name!; + + var item = new Dictionary { ["id"] = classification.Id }; + if (classification.Label != null) + { + item["label"] = classification.Label; + } + if (classification.Metadata != null && classification.Metadata.Count > 0) + { + item["metadata"] = classification.Metadata; + } + + normalized.Add((groupingName, item)); + } + } + catch (Exception ex) + { + classifierActivity?.SetStatus(ActivityStatusCode.Error, ex.Message); + classifierActivity?.AddEvent(CreateExceptionEvent(ex)); + outcomes[classifierIndex] = ClassifierOutcome.Error(resolvedName, ex.Message); + return; + } + + // Build output_json keyed by resolved name for the classifier span. 
+ if (normalized.Count > 0) + { + var outputByName = new Dictionary>>(); + foreach (var (name, item) in normalized) + { + if (!outputByName.TryGetValue(name, out var list)) + { + list = new List>(); + outputByName[name] = list; + } + list.Add(item); + } + classifierActivity?.SetTag("braintrust.output_json", ToJson(outputByName)); + } + + outcomes[classifierIndex] = ClassifierOutcome.Success(resolvedName, normalized); + } + finally + { + classifierActivity?.Stop(); + } + } + + /// + /// Aggregates per-classifier outcomes onto the root span: + /// + /// Sets braintrust.classifications when any classifications were produced. + /// Merges any classifier errors into braintrust.metadata under classifier_errors. + /// + /// + private static void AggregateClassifierOutcomes( + Activity rootActivity, + IReadOnlyDictionary caseMetadata, + ClassifierOutcome?[] outcomes) + { + if (outcomes.Length == 0) + { + return; + } + + var classifications = new Dictionary>>(); + var classifierErrors = new Dictionary(); + + foreach (var outcome in outcomes) + { + if (outcome == null) + { + continue; + } + + if (outcome.ErrorMessage != null) + { + classifierErrors[outcome.ClassifierName] = outcome.ErrorMessage; + continue; + } + + if (outcome.Items == null) + { + continue; + } + + foreach (var (name, item) in outcome.Items) + { + if (!classifications.TryGetValue(name, out var list)) + { + list = new List>(); + classifications[name] = list; + } + list.Add(item); + } + } + + if (classifications.Count > 0) + { + rootActivity.SetTag("braintrust.classifications", ToJson(classifications)); + } + + if (classifierErrors.Count > 0) + { + var merged = new Dictionary(caseMetadata) + { + ["classifier_errors"] = classifierErrors + }; + rootActivity.SetTag("braintrust.metadata", ToJson(merged)); + } + } + private static string ToJson(object obj) { return JsonSerializer.Serialize(obj, JsonOptions); @@ -388,6 +609,7 @@ public sealed class Builder internal IDataset? _dataset; internal ITask? 
_task; internal List> _scorers = new(); + internal List> _classifiers = new(); internal IReadOnlyList? _experimentTags; internal IReadOnlyDictionary? _experimentMetadata; internal int? _maxConcurrency = 10; @@ -406,9 +628,9 @@ public async Task> BuildAsync() _apiClient ??= BraintrustApiClient.Of(_config); _btqlClient ??= new BtqlClient(_config); - if (_scorers.Count == 0) + if (_scorers.Count == 0 && _classifiers.Count == 0) { - throw new InvalidOperationException("Must provide at least one scorer"); + throw new InvalidOperationException("Must provide at least one scorer or classifier"); } if (_dataset == null) @@ -561,6 +783,16 @@ public Builder Scorers(params IScorer[] scorers) return this; } + /// + /// Set the classifiers. + /// At least one of or must be provided. + /// + public Builder Classifiers(params IClassifier[] classifiers) + { + _classifiers = classifiers.ToList(); + return this; + } + /// /// Set the experiment-level tags. /// These tags are applied to the experiment itself, not individual cases. diff --git a/src/Braintrust.Sdk/Eval/FunctionClassifier.cs b/src/Braintrust.Sdk/Eval/FunctionClassifier.cs new file mode 100644 index 0000000..5aeb473 --- /dev/null +++ b/src/Braintrust.Sdk/Eval/FunctionClassifier.cs @@ -0,0 +1,73 @@ +namespace Braintrust.Sdk.Eval; + +/// +/// Implementation of a classifier from a function. +/// Supports synchronous and asynchronous functions returning either a single +/// or a list. Returning null means "no classifications for this case". +/// +public class FunctionClassifier : IClassifier + where TInput : notnull + where TOutput : notnull +{ + private static readonly IReadOnlyList Empty = Array.Empty(); + + private readonly Func, Task>> _classifierFn; + + /// + /// Create a classifier from a synchronous function returning a single classification (or null). 
+ /// + public FunctionClassifier(string name, Func, Classification?> classifierFn) + { + Name = name; + _classifierFn = taskResult => + { + var result = classifierFn(taskResult); + return Task.FromResult>( + result.HasValue ? new[] { result.Value } : Empty); + }; + } + + /// + /// Create a classifier from a synchronous function returning a list of classifications (or null). + /// + public FunctionClassifier(string name, Func, IReadOnlyList?> classifierFn) + { + Name = name; + _classifierFn = taskResult => + { + var result = classifierFn(taskResult); + return Task.FromResult>(result ?? Empty); + }; + } + + /// + /// Create a classifier from an asynchronous function returning a single classification (or null). + /// + public FunctionClassifier(string name, Func, Task> classifierFn) + { + Name = name; + _classifierFn = async taskResult => + { + var result = await classifierFn(taskResult).ConfigureAwait(false); + return result.HasValue ? new[] { result.Value } : Empty; + }; + } + + /// + /// Create a classifier from an asynchronous function returning a list of classifications (or null). + /// + public FunctionClassifier(string name, Func, Task?>> classifierFn) + { + Name = name; + _classifierFn = async taskResult => + { + var result = await classifierFn(taskResult).ConfigureAwait(false); + return result ?? Empty; + }; + } + + public string Name { get; } + + public Task> Classify(TaskResult taskResult) + => _classifierFn(taskResult); +} diff --git a/src/Braintrust.Sdk/Eval/IClassifier.cs b/src/Braintrust.Sdk/Eval/IClassifier.cs new file mode 100644 index 0000000..f85f502 --- /dev/null +++ b/src/Braintrust.Sdk/Eval/IClassifier.cs @@ -0,0 +1,30 @@ +namespace Braintrust.Sdk.Eval; + +/// +/// A classifier categorizes and labels eval outputs. +/// Unlike (which returns numeric 0-1 values), +/// classifiers return structured items with an id and optional label and metadata. +/// +/// +/// Implementations must be thread-safe as classifiers may be executed concurrently. 
+/// Classifier failures are non-fatal: an exception thrown by is recorded +/// under classifier_errors in the eval span's metadata and does not abort the evaluation. +/// +/// Type of the input data +/// Type of the output data +public interface IClassifier + where TInput : notnull + where TOutput : notnull +{ + /// + /// Gets the name of this classifier. Used as the classifier span name and as the + /// default grouping key when a returned has no Name. + /// + string Name { get; } + + /// + /// Classify the task result and return zero or more classifications. + /// Return an empty list to indicate no classifications for this case. + /// + Task> Classify(TaskResult taskResult); +} diff --git a/src/Braintrust.Sdk/Eval/ITracedClassifier.cs b/src/Braintrust.Sdk/Eval/ITracedClassifier.cs new file mode 100644 index 0000000..97fd6d7 --- /dev/null +++ b/src/Braintrust.Sdk/Eval/ITracedClassifier.cs @@ -0,0 +1,24 @@ +namespace Braintrust.Sdk.Eval; + +/// +/// A classifier that receives access to the distributed trace (spans) of the task that was evaluated. +/// This allows classifiers to inspect intermediate LLM calls and tool-use chains, not just the final output. +/// +/// Implement this interface when your classifier needs to examine multi-turn conversations or tool-use chains +/// (e.g. classifying a conversation pattern as "single-turn", "tool-heavy", or "clarification-loop"). +/// When a classifier implements this interface, +/// is called instead of . +/// Backward-compatible: classifiers that only implement continue to work without change. +/// +/// The type of input data for the evaluation +/// The type of output produced by the task +public interface ITracedClassifier : IClassifier + where TInput : notnull + where TOutput : notnull +{ + /// + /// Classify the task result using the distributed trace for additional context. + /// Called instead of when trace is available. 
+ /// + Task> Classify(TaskResult taskResult, EvalTrace trace); +} diff --git a/tests/Braintrust.Sdk.Tests/Eval/ClassifierTest.cs b/tests/Braintrust.Sdk.Tests/Eval/ClassifierTest.cs new file mode 100644 index 0000000..b99bcb7 --- /dev/null +++ b/tests/Braintrust.Sdk.Tests/Eval/ClassifierTest.cs @@ -0,0 +1,566 @@ +using System.Diagnostics; +using System.Text.Json; +using Braintrust.Sdk.Config; +using Braintrust.Sdk.Eval; + +namespace Braintrust.Sdk.Tests.Eval; + +[Collection("BraintrustGlobals")] +public class ClassifierTest : IDisposable +{ + private readonly ActivityListener _activityListener; + + public ClassifierTest() + { + Braintrust.ResetForTest(); + _activityListener = new ActivityListener + { + ShouldListenTo = source => source.Name == "braintrust-dotnet", + Sample = (ref ActivityCreationOptions _) => ActivitySamplingResult.AllDataAndRecorded + }; + ActivitySource.AddActivityListener(_activityListener); + } + + public void Dispose() + { + _activityListener?.Dispose(); + Braintrust.ResetForTest(); + } + + // ===================================================================== + // FunctionClassifier shape normalization + // ===================================================================== + + [Fact] + public async Task FunctionClassifierReturnsSingleClassification() + { + var classifier = new FunctionClassifier( + "category", + _ => new Classification("greeting", Label: "Greeting")); + + var taskResult = MakeTaskResult("hello", "hi"); + var results = await classifier.Classify(taskResult); + + Assert.Single(results); + Assert.Equal("greeting", results[0].Id); + Assert.Equal("Greeting", results[0].Label); + } + + [Fact] + public async Task FunctionClassifierReturnsList() + { + var classifier = new FunctionClassifier( + "sentiment", + _ => (IReadOnlyList)new[] + { + new Classification("positive", Label: "Positive"), + new Classification("enthusiastic", Label: "Enthusiastic") + }); + + var results = await classifier.Classify(MakeTaskResult("great!", "")); + 
+ Assert.Equal(2, results.Count); + Assert.Equal("positive", results[0].Id); + Assert.Equal("enthusiastic", results[1].Id); + } + + [Fact] + public async Task FunctionClassifierNullReturnsEmptyList() + { + var classifier = new FunctionClassifier( + "maybe", + _ => (Classification?)null); + + var results = await classifier.Classify(MakeTaskResult("hello", "hi")); + Assert.Empty(results); + } + + [Fact] + public async Task FunctionClassifierNullListReturnsEmptyList() + { + var classifier = new FunctionClassifier( + "maybe", + _ => (IReadOnlyList?)null); + + var results = await classifier.Classify(MakeTaskResult("hello", "hi")); + Assert.Empty(results); + } + + [Fact] + public async Task FunctionClassifierAsyncSingle() + { + var classifier = new FunctionClassifier( + "category", + _ => Task.FromResult(new Classification("greeting"))); + + var results = await classifier.Classify(MakeTaskResult("hello", "hi")); + Assert.Single(results); + Assert.Equal("greeting", results[0].Id); + } + + [Fact] + public async Task FunctionClassifierAsyncList() + { + var classifier = new FunctionClassifier( + "category", + _ => Task.FromResult?>(new[] + { + new Classification("a"), + new Classification("b") + })); + + var results = await classifier.Classify(MakeTaskResult("hello", "hi")); + Assert.Equal(2, results.Count); + } + + // ===================================================================== + // Builder validation + // ===================================================================== + + [Fact] + public async Task EvalRequiresAtLeastScorersOrClassifiers() + { + var config = BraintrustConfig.Of(("BRAINTRUST_API_KEY", "test-key")); + var mockClient = new MockBraintrustApiClient(); + + var ex = await Assert.ThrowsAsync(() => + Eval.NewBuilder() + .Name("test-eval") + .Config(config) + .ApiClient(mockClient) + .Cases(DatasetCase.Of("input", "expected")) + .TaskFunction(x => x) + .BuildAsync()); + + Assert.Contains("at least one scorer or classifier", ex.Message); + } + + [Fact] 
+    public async Task EvalBuildsWithClassifiersOnly()
+    {
+        // No scorers are registered here: the build must succeed with a classifier alone.
+        var config = BraintrustConfig.Of(
+            ("BRAINTRUST_API_KEY", "test-key"),
+            ("BRAINTRUST_APP_URL", "https://braintrust.dev"),
+            ("BRAINTRUST_DEFAULT_PROJECT_NAME", "test-project"));
+        var mockClient = new MockBraintrustApiClient();
+
+        var eval = await Eval<string, string>.NewBuilder()
+            .Name("test-eval")
+            .Config(config)
+            .ApiClient(mockClient)
+            .Cases(DatasetCase.Of("hello", "hi"))
+            .TaskFunction(x => x)
+            .Classifiers(new FunctionClassifier<string, string>(
+                "category",
+                _ => new Classification("greeting")))
+            .BuildAsync();
+
+        var result = await eval.RunAsync();
+        Assert.NotNull(result.ExperimentUrl);
+    }
+
+    // =====================================================================
+    // Runner — classifier results on the eval span
+    // =====================================================================
+
+    /// <summary>
+    /// A single classification is serialized under the classifier's name in the
+    /// <c>braintrust.classifications</c> tag of the eval span, and one classifier span is emitted.
+    /// </summary>
+    [Fact]
+    public async Task RunnerWritesClassificationsToEvalSpan()
+    {
+        var (rootSpans, classifierSpans) = await RunEval(
+            cases: new[] { DatasetCase.Of("hello", "hi") },
+            taskFn: x => x,
+            classifiers: new IClassifier<string, string>[]
+            {
+                new FunctionClassifier<string, string>(
+                    "category",
+                    _ => new Classification("greeting", Label: "Greeting"))
+            });
+
+        var root = Assert.Single(rootSpans);
+        // JsonDocument rents pooled buffers; dispose it when the test is done with it.
+        using var classifications = ReadClassifications(root);
+        Assert.NotNull(classifications);
+        Assert.True(classifications.RootElement.TryGetProperty("category", out var categoryItems));
+        Assert.Equal(1, categoryItems.GetArrayLength());
+        Assert.Equal("greeting", categoryItems[0].GetProperty("id").GetString());
+        Assert.Equal("Greeting", categoryItems[0].GetProperty("label").GetString());
+
+        // Single classifier span produced
+        Assert.Single(classifierSpans);
+    }
+
+    /// <summary>
+    /// When the only classifier returns null, the eval span carries no
+    /// <c>braintrust.classifications</c> tag at all.
+    /// </summary>
+    [Fact]
+    public async Task RunnerWritesNoClassificationsTagWhenAllNull()
+    {
+        var (rootSpans, _) = await RunEval(
+            cases: new[] { DatasetCase.Of("hello", "hi") },
+            taskFn: x => x,
+            classifiers: new IClassifier<string, string>[]
+            {
+                new FunctionClassifier<string, string>("maybe", _ => (Classification?)null)
+            });
+
+        var root = Assert.Single(rootSpans);
+        Assert.Null(root.GetTagItem("braintrust.classifications"));
+    }
+
+    /// <summary>
+    /// Registering a scorer alongside a classifier still produces the classifications tag.
+    /// </summary>
+    [Fact]
+    public async Task RunnerCombinesScorersAndClassifiers()
+    {
+        var (rootSpans, _) = await RunEval(
+            cases: new[] { DatasetCase.Of("hello", "hi") },
+            taskFn: x => x,
+            scorers: new IScorer<string, string>[]
+            {
+                new FunctionScorer<string, string>("exact", (e, a) => e == a ? 1.0 : 0.0)
+            },
+            classifiers: new IClassifier<string, string>[]
+            {
+                new FunctionClassifier<string, string>("category", _ => new Classification("greeting"))
+            });
+
+        var root = Assert.Single(rootSpans);
+        Assert.NotNull(root.GetTagItem("braintrust.classifications"));
+        // The eval span does not store scores itself; verify the classification path was hit
+        // independently from the scorer path. Score span coverage is in EvalTest.
+    }
+
+    /// <summary>
+    /// A throwing classifier is reported under <c>classifier_errors</c> in the eval span metadata
+    /// and on its own span, while a sibling classifier's result is still recorded.
+    /// </summary>
+    [Fact]
+    public async Task RunnerHandlesClassifierExceptionWithoutAbortingEval()
+    {
+        var (rootSpans, classifierSpans) = await RunEval(
+            cases: new[] { DatasetCase.Of("hello", "hi") },
+            taskFn: x => x,
+            classifiers: new IClassifier<string, string>[]
+            {
+                new ThrowingClassifier("broken", "classifier boom"),
+                new FunctionClassifier<string, string>("working", _ => new Classification("ok"))
+            });
+
+        var root = Assert.Single(rootSpans);
+
+        // Classifier errors merged into braintrust.metadata under classifier_errors
+        var metadataJson = root.GetTagItem("braintrust.metadata") as string;
+        Assert.NotNull(metadataJson);
+        using var doc = JsonDocument.Parse(metadataJson);
+        Assert.True(doc.RootElement.TryGetProperty("classifier_errors", out var errors));
+        Assert.Equal("classifier boom", errors.GetProperty("broken").GetString());
+
+        // The working classifier still wrote its classification
+        using var classifications = ReadClassifications(root);
+        Assert.NotNull(classifications);
+        Assert.True(classifications.RootElement.TryGetProperty("working", out _));
+
+        // The broken classifier span has error status + exception event
+        var brokenSpan = classifierSpans.First(s => s.DisplayName == "classifier:broken");
+        Assert.Equal(ActivityStatusCode.Error, brokenSpan.Status);
+        Assert.NotEmpty(brokenSpan.Events);
+
+        // The eval (root) span itself is not marked Error by a classifier failure
+        Assert.Equal(ActivityStatusCode.Unset, root.Status);
+    }
+
+    /// <summary>
+    /// The classifier span's display name and its <c>braintrust.span_attributes</c> JSON
+    /// (type and purpose) have the expected values.
+    /// </summary>
+    [Fact]
+    public async Task RunnerWritesClassifierSpanAttributes()
+    {
+        var (_, classifierSpans) = await RunEval(
+            cases: new[] { DatasetCase.Of("hello", "hi") },
+            taskFn: x => x,
+            classifiers: new IClassifier<string, string>[]
+            {
+                new FunctionClassifier<string, string>(
+                    "my_classifier",
+                    _ => new Classification("foo"))
+            });
+
+        var span = Assert.Single(classifierSpans);
+        Assert.Equal("classifier:my_classifier", span.DisplayName);
+
+        var attrsJson = span.GetTagItem("braintrust.span_attributes") as string;
+        Assert.NotNull(attrsJson);
+        using var doc = JsonDocument.Parse(attrsJson);
+        Assert.Equal("classifier", doc.RootElement.GetProperty("type").GetString());
+        Assert.Equal("scorer", doc.RootElement.GetProperty("purpose").GetString());
+    }
+
+    /// <summary>
+    /// A classifier returning several items keeps them in the order they were returned.
+    /// </summary>
+    [Fact]
+    public async Task RunnerMultiLabelResultPreservesOrder()
+    {
+        var (rootSpans, _) = await RunEval(
+            cases: new[] { DatasetCase.Of("great!", "hi") },
+            taskFn: x => x,
+            classifiers: new IClassifier<string, string>[]
+            {
+                new FunctionClassifier<string, string>(
+                    "sentiment",
+                    _ => (IReadOnlyList<Classification>)new[]
+                    {
+                        new Classification("positive", Label: "Positive"),
+                        new Classification("enthusiastic", Label: "Enthusiastic")
+                    })
+            });
+
+        var root = Assert.Single(rootSpans);
+        using var classifications = ReadClassifications(root);
+        Assert.NotNull(classifications);
+        var items = classifications.RootElement.GetProperty("sentiment");
+        Assert.Equal(2, items.GetArrayLength());
+        Assert.Equal("positive", items[0].GetProperty("id").GetString());
+        Assert.Equal("enthusiastic", items[1].GetProperty("id").GetString());
+    }
+
+    /// <summary>
+    /// A classification without an explicit name is keyed by the classifier's name.
+    /// </summary>
+    [Fact]
+    public async Task RunnerClassificationNameDefaultsToClassifierName()
+    {
+        var (rootSpans, _) = await RunEval(
+            cases: new[] { DatasetCase.Of("hello", "hi") },
+            taskFn: x => x,
+            classifiers: new IClassifier<string, string>[]
+            {
+                // Classification has no Name set
+                new FunctionClassifier<string, string>(
+                    "my_classifier",
+                    _ => new Classification("foo"))
+            });
+
+        var root = Assert.Single(rootSpans);
+        using var classifications = ReadClassifications(root);
+        Assert.NotNull(classifications);
+        Assert.True(classifications.RootElement.TryGetProperty("my_classifier", out _));
+    }
+
+    /// <summary>
+    /// A classification with an explicit <c>Name</c> is keyed by that name instead of the
+    /// classifier's name.
+    /// </summary>
+    [Fact]
+    public async Task RunnerClassificationExplicitNameOverridesClassifierName()
+    {
+        var (rootSpans, _) = await RunEval(
+            cases: new[] { DatasetCase.Of("hello", "hi") },
+            taskFn: x => x,
+            classifiers: new IClassifier<string, string>[]
+            {
+                new FunctionClassifier<string, string>(
+                    "my_classifier",
+                    _ => new Classification("foo", Name: "override_name"))
+            });
+
+        var root = Assert.Single(rootSpans);
+        using var classifications = ReadClassifications(root);
+        Assert.NotNull(classifications);
+        Assert.True(classifications.RootElement.TryGetProperty("override_name", out _));
+        Assert.False(classifications.RootElement.TryGetProperty("my_classifier", out _));
+    }
+
+    /// <summary>
+    /// Returning <c>default(Classification)</c> is reported as a classifier error and marks
+    /// the classifier span as failed.
+    /// </summary>
+    [Fact]
+    public async Task RunnerEmptyClassificationItemIsRecordedAsError()
+    {
+        var (rootSpans, classifierSpans) = await RunEval(
+            cases: new[] { DatasetCase.Of("hello", "hi") },
+            taskFn: x => x,
+            classifiers: new IClassifier<string, string>[]
+            {
+                // Default(Classification) — Id is null/empty, so should fail validation
+                new FunctionClassifier<string, string>(
+                    "bad",
+                    _ => (Classification?)default(Classification))
+            });
+
+        var root = Assert.Single(rootSpans);
+        var metadataJson = root.GetTagItem("braintrust.metadata") as string;
+        Assert.NotNull(metadataJson);
+        using var doc = JsonDocument.Parse(metadataJson);
+        var errors = doc.RootElement.GetProperty("classifier_errors");
+        var brokenError = errors.GetProperty("bad").GetString();
+        Assert.NotNull(brokenError);
+        Assert.Contains("each classification must be a non-empty object", brokenError);
+
+        var brokenSpan = Assert.Single(classifierSpans);
+        Assert.Equal(ActivityStatusCode.Error, brokenSpan.Status);
+    }
+
+    /// <summary>
+    /// With three dataset cases, every resulting eval span carries a "category" classification.
+    /// </summary>
+    [Fact]
+    public async Task RunnerAccumulatesClassificationsAcrossCases()
+    {
+        var (rootSpans, _) = await RunEval(
+            cases: new[]
+            {
+                DatasetCase.Of("hi", "x"),
+                DatasetCase.Of("hello", "x"),
+                DatasetCase.Of("ok", "x")
+            },
+            taskFn: x => x,
+            classifiers: new IClassifier<string, string>[]
+            {
+                new FunctionClassifier<string, string>(
+                    "category",
+                    tr => new Classification(tr.Result.Length > 3 ? "long" : "short"))
+            });
+
+        Assert.Equal(3, rootSpans.Count);
+        foreach (var root in rootSpans)
+        {
+            // Disposed at the end of each iteration.
+            using var classifications = ReadClassifications(root);
+            Assert.NotNull(classifications);
+            Assert.True(classifications.RootElement.TryGetProperty("category", out _));
+        }
+    }
+
+    /// <summary>
+    /// The classifier span's <c>braintrust.input_json</c> tag contains input, expected, output
+    /// and metadata of the case.
+    /// </summary>
+    [Fact]
+    public async Task RunnerClassifierInputContainsAllScoringArgs()
+    {
+        var (_, classifierSpans) = await RunEval(
+            cases: new[]
+            {
+                // NOTE(review): the generic arguments of List/Dictionary were lost in extraction and are
+                // reconstructed here — confirm against the DatasetCase.Of overload.
+                DatasetCase.Of(
+                    "hello", "hi",
+                    new List<string>(),
+                    new Dictionary<string, object> { ["k"] = "v" })
+            },
+            taskFn: x => x,
+            classifiers: new IClassifier<string, string>[]
+            {
+                new FunctionClassifier<string, string>("category", _ => new Classification("greeting"))
+            });
+
+        var span = Assert.Single(classifierSpans);
+        var inputJson = span.GetTagItem("braintrust.input_json") as string;
+        Assert.NotNull(inputJson);
+        using var doc = JsonDocument.Parse(inputJson);
+        Assert.Equal("hello", doc.RootElement.GetProperty("input").GetString());
+        Assert.Equal("hi", doc.RootElement.GetProperty("expected").GetString());
+        Assert.Equal("hello", doc.RootElement.GetProperty("output").GetString());
+        Assert.True(doc.RootElement.TryGetProperty("metadata", out var md));
+        Assert.Equal("v", md.GetProperty("k").GetString());
+    }
+
+    // =====================================================================
+    // ITracedClassifier
+    // =====================================================================
+
+    /// <summary>
+    /// A traced classifier is handed an <c>EvalTrace</c> whose span query reaches the BTQL client
+    /// exactly once and yields the single mocked span.
+    /// </summary>
+    [Fact]
+    public async Task TracedClassifierReceivesEvalTrace()
+    {
+        var spans = new[]
+        {
+            MockBtqlClient.MakeSpan("llm", input: new { messages = new[] { new { role = "user", content = "hi" } } },
+                output: new { choices = new[] { new { message = new { role = "assistant", content = "hello" } } } })
+        };
+        var mockBtql = new MockBtqlClient(spans);
+
+        // -1 is a sentinel: it only changes if the classifier callback actually runs.
+        var capturedSpanCount = -1;
+        var classifier = new TracedClassifier(
+            "trace_inspector",
+            async (_, trace) =>
+            {
+                var fetched = await trace.GetSpansAsync("llm");
+                capturedSpanCount = fetched.Count;
+                return new[] { new Classification("multi_turn") };
+            });
+
+        var (rootSpans, _) = await RunEval(
+            cases: new[] { DatasetCase.Of("hello", "hi") },
+            taskFn: x => x,
+            classifiers: new IClassifier<string, string>[] { classifier },
+            btqlClient: mockBtql);
+
+        Assert.Single(rootSpans);
+        Assert.Equal(1, capturedSpanCount);
+        Assert.Equal(1, mockBtql.QueryCount);
+    }
+
+    // =====================================================================
+    // Helpers
+    // =====================================================================
+
+    /// <summary>Builds a task result for <paramref name="output"/> on a case with an empty expected value.</summary>
+    private static TaskResult<string, string> MakeTaskResult(string input, string output)
+        => new(output, new DatasetCase<string, string>(input, ""));
+
+    /// <summary>
+    /// Parses the <c>braintrust.classifications</c> tag of <paramref name="span"/>.
+    /// </summary>
+    /// <returns>
+    /// The parsed document, or <c>null</c> when the tag is absent or not a string.
+    /// The caller owns the document and should dispose it.
+    /// </returns>
+    private static JsonDocument? ReadClassifications(Activity span)
+    {
+        var json = span.GetTagItem("braintrust.classifications") as string;
+        return json == null ? null : JsonDocument.Parse(json);
+    }
+
+    /// <summary>
+    /// Builds and runs an eval against mock clients while recording every activity stopped on the
+    /// "braintrust-dotnet" source.
+    /// </summary>
+    /// <param name="cases">Dataset cases to evaluate.</param>
+    /// <param name="taskFn">Task under evaluation.</param>
+    /// <param name="scorers">Optional scorers; skipped when null or empty.</param>
+    /// <param name="classifiers">Optional classifiers; skipped when null or empty.</param>
+    /// <param name="btqlClient">BTQL client to use; a fresh <c>MockBtqlClient</c> when null.</param>
+    /// <returns>The captured spans named "eval" and the captured classifier spans.</returns>
+    /// <exception cref="InvalidOperationException">Neither a scorer nor a classifier was supplied.</exception>
+    private async Task<(List<Activity> RootSpans, List<Activity> ClassifierSpans)> RunEval(
+        DatasetCase<string, string>[] cases,
+        Func<string, string> taskFn,
+        IScorer<string, string>[]? scorers = null,
+        IClassifier<string, string>[]? classifiers = null,
+        MockBtqlClient? btqlClient = null)
+    {
+        var config = BraintrustConfig.Of(
+            ("BRAINTRUST_API_KEY", "test-key"),
+            ("BRAINTRUST_APP_URL", "https://braintrust.dev"),
+            ("BRAINTRUST_DEFAULT_PROJECT_NAME", "test-project"));
+        var mockClient = new MockBraintrustApiClient();
+        btqlClient ??= new MockBtqlClient();
+
+        var captured = new List<Activity>();
+        // Classifiers are described as running in parallel with scorers, so spans may stop on
+        // several threads at once; List<T> does not support concurrent writers.
+        var capturedGate = new object();
+        // NOTE(review): the listener sees every activity on this source while it is registered,
+        // including ones from other tests running concurrently — confirm test parallelization settings.
+        using var listener = new ActivityListener
+        {
+            ShouldListenTo = source => source.Name == "braintrust-dotnet",
+            Sample = (ref ActivityCreationOptions<ActivityContext> _) => ActivitySamplingResult.AllDataAndRecorded,
+            ActivityStopped = activity =>
+            {
+                lock (capturedGate)
+                {
+                    captured.Add(activity);
+                }
+            }
+        };
+        ActivitySource.AddActivityListener(listener);
+
+        var builder = Eval<string, string>.NewBuilder()
+            .Name("classifier-test")
+            .Config(config)
+            .ApiClient(mockClient)
+            .BtqlClient(btqlClient)
+            .Cases(cases)
+            .TaskFunction(taskFn);
+
+        if (scorers != null && scorers.Length > 0)
+        {
+            builder.Scorers(scorers);
+        }
+
+        if (classifiers != null && classifiers.Length > 0)
+        {
+            builder.Classifiers(classifiers);
+        }
+        else if (scorers == null || scorers.Length == 0)
+        {
+            // The validator forbids zero classifiers and zero scorers; tests using RunEval should specify at least one.
+            throw new InvalidOperationException("Test setup error: provide at least one scorer or classifier.");
+        }
+
+        var eval = await builder.BuildAsync();
+        await eval.RunAsync();
+
+        var rootSpans = captured.Where(a => a.DisplayName == "eval").ToList();
+        var classifierSpans = captured.Where(a => a.DisplayName.StartsWith("classifier:")).ToList();
+        return (rootSpans, classifierSpans);
+    }
+
+    /// <summary>Classifier whose <c>Classify</c> always throws <see cref="InvalidOperationException"/>.</summary>
+    private sealed class ThrowingClassifier : IClassifier<string, string>
+    {
+        private readonly string _message;
+        public ThrowingClassifier(string name, string message)
+        {
+            Name = name;
+            _message = message;
+        }
+        public string Name { get; }
+        // Throws synchronously instead of returning a faulted task.
+        public Task<IReadOnlyList<Classification>> Classify(TaskResult<string, string> taskResult)
+            => throw new InvalidOperationException(_message);
+    }
+
+    /// <summary>
+    /// Traced classifier that forwards the trace-aware overload to a delegate; the trace-less
+    /// overload returns an empty list.
+    /// </summary>
+    // NOTE(review): EvalTrace appears without type arguments in the extracted text — confirm
+    // against ITracedClassifier before relying on this signature.
+    private sealed class TracedClassifier : ITracedClassifier<string, string>
+    {
+        private readonly Func<TaskResult<string, string>, EvalTrace, Task<IReadOnlyList<Classification>>> _fn;
+        public TracedClassifier(
+            string name,
+            Func<TaskResult<string, string>, EvalTrace, Task<IReadOnlyList<Classification>>> fn)
+        {
+            Name = name;
+            _fn = fn;
+        }
+        public string Name { get; }
+
+        public Task<IReadOnlyList<Classification>> Classify(TaskResult<string, string> taskResult)
+            => Task.FromResult<IReadOnlyList<Classification>>(Array.Empty<Classification>());
+
+        public Task<IReadOnlyList<Classification>> Classify(TaskResult<string, string> taskResult, EvalTrace trace)
+            => _fn(taskResult, trace);
+    }
+}

From 40212c06aaeb306d788a9b4d5dd15710d62681cc Mon Sep 17 00:00:00 2001
From: Stephen Belanger
Date: Sat, 30 May 2026 01:03:13 +0800
Subject: [PATCH 2/2] Align classifier span name + span_attributes with canonical Ruby spec

The Ruby reference implementation
(sdk-ruby/lib/braintrust/eval/runner.rb:391, 416-420) uses the classifier
name directly as the span name and includes a `name` key in
braintrust.span_attributes. Java already follows this pattern
(Eval.java:290, 297). .NET was using a "classifier:" prefix on the span
name and omitting the name attribute, which prevented consistent
classifier-span discovery downstream.

Update tests that asserted the prefixed display name; the test helper now
identifies classifier spans by their span_attributes.type tag instead of a
name prefix.

Co-Authored-By: Claude Opus 4.7 (1M context) --- src/Braintrust.Sdk/Eval/Eval.cs | 4 ++-- tests/Braintrust.Sdk.Tests/Eval/ClassifierTest.cs | 13 ++++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/src/Braintrust.Sdk/Eval/Eval.cs b/src/Braintrust.Sdk/Eval/Eval.cs index ff10611..8afa9d4 100644 --- a/src/Braintrust.Sdk/Eval/Eval.cs +++ b/src/Braintrust.Sdk/Eval/Eval.cs @@ -382,11 +382,11 @@ private async Task RunSingleClassifier( ? $"classifier_{classifierIndex}" : classifier.Name; - var classifierActivity = _activitySource.StartActivity($"classifier:{resolvedName}"); + var classifierActivity = _activitySource.StartActivity(resolvedName); classifierActivity?.SetTag(BraintrustTracing.ParentKey, $"experiment_id:{experimentId}"); classifierActivity?.SetTag( "braintrust.span_attributes", - ToJson(new { type = "classifier", purpose = "scorer" })); + ToJson(new { type = "classifier", name = resolvedName, purpose = "scorer" })); var datasetCase = taskResult.DatasetCase; classifierActivity?.SetTag( diff --git a/tests/Braintrust.Sdk.Tests/Eval/ClassifierTest.cs b/tests/Braintrust.Sdk.Tests/Eval/ClassifierTest.cs index b99bcb7..b790130 100644 --- a/tests/Braintrust.Sdk.Tests/Eval/ClassifierTest.cs +++ b/tests/Braintrust.Sdk.Tests/Eval/ClassifierTest.cs @@ -251,7 +251,7 @@ public async Task RunnerHandlesClassifierExceptionWithoutAbortingEval() Assert.True(classifications.RootElement.TryGetProperty("working", out _)); // The broken classifier span has error status + exception event - var brokenSpan = classifierSpans.First(s => s.DisplayName == "classifier:broken"); + var brokenSpan = classifierSpans.First(s => s.DisplayName == "broken"); Assert.Equal(ActivityStatusCode.Error, brokenSpan.Status); Assert.NotEmpty(brokenSpan.Events); @@ -273,12 +273,13 @@ public async Task RunnerWritesClassifierSpanAttributes() }); var span = Assert.Single(classifierSpans); - Assert.Equal("classifier:my_classifier", span.DisplayName); + Assert.Equal("my_classifier", 
span.DisplayName); var attrsJson = span.GetTagItem("braintrust.span_attributes") as string; Assert.NotNull(attrsJson); using var doc = JsonDocument.Parse(attrsJson); Assert.Equal("classifier", doc.RootElement.GetProperty("type").GetString()); + Assert.Equal("my_classifier", doc.RootElement.GetProperty("name").GetString()); Assert.Equal("scorer", doc.RootElement.GetProperty("purpose").GetString()); } @@ -528,7 +529,13 @@ private static TaskResult MakeTaskResult(string input, string ou await eval.RunAsync(); var rootSpans = captured.Where(a => a.DisplayName == "eval").ToList(); - var classifierSpans = captured.Where(a => a.DisplayName.StartsWith("classifier:")).ToList(); + var classifierSpans = captured + .Where(a => + { + var attrs = a.GetTagItem("braintrust.span_attributes") as string; + return attrs != null && attrs.Contains("\"type\":\"classifier\""); + }) + .ToList(); return (rootSpans, classifierSpans); }