diff --git a/Braintrust.Sdk.sln b/Braintrust.Sdk.sln index b9f05d0..e7c2dfb 100644 --- a/Braintrust.Sdk.sln +++ b/Braintrust.Sdk.sln @@ -19,6 +19,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "OpenAIInstrumentation", "ex EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "EvalExample", "examples\EvalExample\EvalExample.csproj", "{DFAA25AA-72B1-4246-BAB9-A10CCF115406}" EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ClassifiersExample", "examples\ClassifiersExample\ClassifiersExample.csproj", "{0A934BA7-BEBB-4EF0-88A6-9A5355E6D0BB}" +EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "TraceScoring", "examples\TraceScoring\TraceScoring.csproj", "{66D24AFB-3541-429D-9402-72A344D99115}" EndProject Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Braintrust.Sdk.OpenAI", "src\Braintrust.Sdk.OpenAI\Braintrust.Sdk.OpenAI.csproj", "{B3C7D1A2-4E5F-6789-ABCD-EF0123456789}" @@ -72,6 +74,10 @@ Global {DFAA25AA-72B1-4246-BAB9-A10CCF115406}.Debug|Any CPU.Build.0 = Debug|Any CPU {DFAA25AA-72B1-4246-BAB9-A10CCF115406}.Release|Any CPU.ActiveCfg = Release|Any CPU {DFAA25AA-72B1-4246-BAB9-A10CCF115406}.Release|Any CPU.Build.0 = Release|Any CPU + {0A934BA7-BEBB-4EF0-88A6-9A5355E6D0BB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {0A934BA7-BEBB-4EF0-88A6-9A5355E6D0BB}.Debug|Any CPU.Build.0 = Debug|Any CPU + {0A934BA7-BEBB-4EF0-88A6-9A5355E6D0BB}.Release|Any CPU.ActiveCfg = Release|Any CPU + {0A934BA7-BEBB-4EF0-88A6-9A5355E6D0BB}.Release|Any CPU.Build.0 = Release|Any CPU {66D24AFB-3541-429D-9402-72A344D99115}.Debug|Any CPU.ActiveCfg = Debug|Any CPU {66D24AFB-3541-429D-9402-72A344D99115}.Debug|Any CPU.Build.0 = Debug|Any CPU {66D24AFB-3541-429D-9402-72A344D99115}.Release|Any CPU.ActiveCfg = Release|Any CPU @@ -127,6 +133,7 @@ Global {5A09E90C-6BCB-440C-AC03-5212B2AAE6C2} = {A1BDA853-65BE-4CC8-8070-CCBA22069A7A} {929EDD10-7B06-4C4F-B70F-E4E51072A724} = {A1BDA853-65BE-4CC8-8070-CCBA22069A7A} {DFAA25AA-72B1-4246-BAB9-A10CCF115406} = 
{A1BDA853-65BE-4CC8-8070-CCBA22069A7A} + {0A934BA7-BEBB-4EF0-88A6-9A5355E6D0BB} = {A1BDA853-65BE-4CC8-8070-CCBA22069A7A} {66D24AFB-3541-429D-9402-72A344D99115} = {A1BDA853-65BE-4CC8-8070-CCBA22069A7A} {A8A1C23E-7D6F-47FE-9959-B90E9CEF7B2C} = {6530DEC3-1D19-4854-80AC-2D6D02BEAECC} {446D2C4A-41D6-4E4F-AC4C-6809E2416A98} = {A1BDA853-65BE-4CC8-8070-CCBA22069A7A} diff --git a/examples/ClassifiersExample/ClassifiersExample.csproj b/examples/ClassifiersExample/ClassifiersExample.csproj new file mode 100644 index 0000000..4cc28eb --- /dev/null +++ b/examples/ClassifiersExample/ClassifiersExample.csproj @@ -0,0 +1,14 @@ + + + + + + + + Exe + net8.0 + enable + enable + + + diff --git a/examples/ClassifiersExample/Program.cs b/examples/ClassifiersExample/Program.cs new file mode 100644 index 0000000..4727ddd --- /dev/null +++ b/examples/ClassifiersExample/Program.cs @@ -0,0 +1,152 @@ +using Braintrust.Sdk.Eval; + +namespace Braintrust.Sdk.Examples.ClassifiersExample; + +// Example: Classifiers +// +// Classifiers categorize and label eval outputs. Unlike scorers (which return +// numeric 0-1 values), classifiers return structured Classification items — +// each with an Id, an optional Label, and optional Metadata. +// +// Results are stored as a dictionary keyed by classifier name: +// +// { "sentiment": [{ id: "positive", label: "Positive" }] } +// +// Three patterns are shown: +// +// 1. Inline single-label FunctionClassifier +// 2. Inline multi-label FunctionClassifier (returns IReadOnlyList) +// 3. Class-based classifier implementing IClassifier +// +// Classifiers and scorers run independently. You can use both together, or +// use only classifiers when you don't need numeric scores. 
+ +sealed class ResponseQualityClassifier : IClassifier +{ + public string Name => "response_quality"; + + public Task> Classify(TaskResult taskResult) + { + var output = taskResult.Result; + var wordCount = output.Split(' ', StringSplitOptions.RemoveEmptyEntries).Length; + + string id; + if (string.IsNullOrWhiteSpace(output)) + { + id = "no_response"; + } + else if (wordCount < 5) + { + id = "too_short"; + } + else if (output.Contains("immediately", StringComparison.OrdinalIgnoreCase) + || output.Contains("right away", StringComparison.OrdinalIgnoreCase) + || output.Contains("look into", StringComparison.OrdinalIgnoreCase)) + { + id = "action_oriented"; + } + else + { + id = "informational"; + } + + var label = char.ToUpperInvariant(id[0]) + id[1..].Replace('_', ' '); + + IReadOnlyList results = new[] + { + new Classification( + id, + Label: label, + Metadata: new Dictionary { ["word_count"] = wordCount }) + }; + return Task.FromResult(results); + } +} + +class Program +{ + private static readonly (string Input, string Expected)[] Messages = + { + ("Hi! I just wanted to say thank you, the product is amazing!", "praise"), + ("I've been waiting 2 weeks for my order. This is unacceptable!", "follow_up"), + ("How do I reset my password? I can't find the option anywhere.", "how_to"), + ("The item arrived damaged. I need a refund immediately.", "complaint"), + ("Just checking in — any update on my ticket #4821?", "follow_up") + }; + + static string GenerateResponse(string message) + { + if (Regex("thank").IsMatch(message)) + return "You're welcome! So glad you're enjoying it."; + if (Regex("waiting|order").IsMatch(message)) + return "I sincerely apologise for the delay. Let me look into this right away."; + if (Regex("password|reset").IsMatch(message)) + return "To reset your password, go to Settings > Account > Reset Password."; + if (Regex("damaged|refund").IsMatch(message)) + return "I'm sorry to hear that. 
I'll process your refund immediately."; + return "Thanks for reaching out! Let me check on that for you."; + } + + static System.Text.RegularExpressions.Regex Regex(string pattern) + => new(pattern, System.Text.RegularExpressions.RegexOptions.IgnoreCase); + + static async Task Main() + { + var braintrust = Braintrust.Get(); + + // Pattern 1: inline single-label classifier + var intentClassifier = new FunctionClassifier( + "intent", + taskResult => + { + var input = taskResult.DatasetCase.Input; + string id = + Regex("thank").IsMatch(input) ? "praise" : + Regex("waiting|order|update").IsMatch(input) ? "follow_up" : + Regex("password|reset|find").IsMatch(input) ? "how_to" : + Regex("damaged|refund").IsMatch(input) ? "complaint" : + "other"; + + return new Classification( + id, + Label: char.ToUpperInvariant(id[0]) + id[1..].Replace('_', ' ')); + }); + + // Pattern 2: inline multi-label classifier — returns a list + var toneClassifier = new FunctionClassifier( + "tone", + taskResult => + { + var input = taskResult.DatasetCase.Input; + var labels = new List(); + if (Regex("immediately|unacceptable|waiting").IsMatch(input)) + labels.Add(new Classification("urgent", Label: "Urgent")); + if (Regex("please|thank|just checking").IsMatch(input)) + labels.Add(new Classification("polite", Label: "Polite")); + if (Regex("unacceptable|damaged|waiting").IsMatch(input)) + labels.Add(new Classification("frustrated", Label: "Frustrated")); + if (labels.Count == 0) + labels.Add(new Classification("neutral", Label: "Neutral")); + return (IReadOnlyList)labels; + }); + + // Pattern 3: class-based classifier (see ResponseQualityClassifier above) + var qualityClassifier = new ResponseQualityClassifier(); + + var cases = Messages + .Select(m => DatasetCase.Of(m.Input, m.Expected)) + .ToArray(); + + var eval = await braintrust + .EvalBuilder() + .Name($"dotnet-classifiers-example-{DateTimeOffset.UtcNow.ToUnixTimeMilliseconds()}") + .Tags("classifiers-example", "dotnet-sdk") + .Cases(cases) 
+ .TaskFunction(GenerateResponse) + .Classifiers(intentClassifier, toneClassifier, qualityClassifier) + .BuildAsync(); + + var result = await eval.RunAsync(); + Console.WriteLine($"\n\n{result.CreateReportString()}"); + } +} diff --git a/src/Braintrust.Sdk/Eval/Classification.cs b/src/Braintrust.Sdk/Eval/Classification.cs new file mode 100644 index 0000000..a69b7a6 --- /dev/null +++ b/src/Braintrust.Sdk/Eval/Classification.cs @@ -0,0 +1,14 @@ +namespace Braintrust.Sdk.Eval; + +/// +/// A structured label produced by a classifier. +/// +/// Stable identifier for filtering and grouping. Required. +/// Grouping key in the per-case classifications dictionary. If null or empty, the runner defaults this to the classifier's resolved name. +/// Optional display label. Consumers may fall back to when omitted. +/// Optional arbitrary metadata associated with this classification. +public readonly record struct Classification( + string Id, + string? Name = null, + string? Label = null, + IReadOnlyDictionary? Metadata = null); diff --git a/src/Braintrust.Sdk/Eval/Eval.cs b/src/Braintrust.Sdk/Eval/Eval.cs index fbe6672..8afa9d4 100644 --- a/src/Braintrust.Sdk/Eval/Eval.cs +++ b/src/Braintrust.Sdk/Eval/Eval.cs @@ -32,6 +32,7 @@ public sealed class Eval private readonly IDataset _dataset; private readonly ITask _task; private readonly IReadOnlyList> _scorers; + private readonly IReadOnlyList> _classifiers; private readonly IReadOnlyList? _experimentTags; private readonly IReadOnlyDictionary? _experimentMetadata; private readonly int? _maxConcurrency; @@ -49,6 +50,7 @@ private Eval(Builder builder, OrganizationAndProjectInfo orgAndProject, RepoInfo _dataset = builder._dataset ?? throw new ArgumentNullException(nameof(builder._dataset)); _task = builder._task ?? 
throw new ArgumentNullException(nameof(builder._task)); _scorers = builder._scorers.ToList(); + _classifiers = builder._classifiers.ToList(); _experimentTags = builder._experimentTags; _experimentMetadata = builder._experimentMetadata; _maxConcurrency = builder._maxConcurrency; @@ -165,12 +167,13 @@ private async Task EvalOne(string experimentId, DatasetCase dat } if (taskException == null) { - // Task succeeded — record output and run all scorers in parallel, each in their own span + // Task succeeded — record output and run all scorers and classifiers in parallel, each in their own span rootActivity.SetTag("braintrust.output_json", ToJson(new { output = taskResult!.Value.Result })); - // Flush OTel spans to Braintrust before scoring so traced scorers can access them - var hasTracedScorers = _scorers.OfType>().Any(); - if (hasTracedScorers) + // Flush OTel spans to Braintrust before scoring so traced scorers/classifiers can access them + var needsTraceFlush = _scorers.OfType>().Any() + || _classifiers.OfType>().Any(); + if (needsTraceFlush) { BraintrustTracing.ForceFlush(); } @@ -179,7 +182,8 @@ private async Task EvalOne(string experimentId, DatasetCase dat var rootSpanId = rootActivity.TraceId.ToHexString(); var trace = new EvalTrace(ct => _btqlClient.QuerySpansAsync(experimentId, rootSpanId, ct)); - await RunScorers(experimentId, rootActivity, taskResult!.Value, trace).ConfigureAwait(false); + await RunScorersAndClassifiers(experimentId, rootActivity, taskResult!.Value, trace, datasetCase.Metadata) + .ConfigureAwait(false); } else { @@ -234,19 +238,28 @@ private async Task RunSingleScorerForTaskException( } /// - /// Runs all scorers for a successful task result, each in their own score span. - /// Calls (or - /// for traced scorers) and falls back to on error. + /// Runs all scorers and classifiers for a successful task result in parallel, each in their own span. 
+ /// After completion, aggregates classifier results onto the root span as braintrust.classifications + /// and merges any classifier errors into the root span's braintrust.metadata under + /// classifier_errors. /// - private async Task RunScorers( + private async Task RunScorersAndClassifiers( string experimentId, Activity rootActivity, TaskResult taskResult, - EvalTrace trace) + EvalTrace trace, + IReadOnlyDictionary caseMetadata) { var scorerTasks = _scorers.Select(scorer => RunSingleScorer(experimentId, rootActivity, scorer, taskResult, trace)); - await Task.WhenAll(scorerTasks).ConfigureAwait(false); + + var classifierOutcomes = new ClassifierOutcome?[_classifiers.Count]; + var classifierTasks = _classifiers.Select((classifier, index) => + RunSingleClassifier(experimentId, rootActivity, classifier, index, taskResult, trace, classifierOutcomes)); + + await Task.WhenAll(scorerTasks.Concat(classifierTasks)).ConfigureAwait(false); + + AggregateClassifierOutcomes(rootActivity, caseMetadata, classifierOutcomes); } private async Task RunSingleScorer( @@ -327,6 +340,214 @@ private static void RecordScores( } } + /// + /// Per-classifier outcome captured after running. Either a successful list of normalized items + /// (already grouped by resolved name) or an error message. + /// + private sealed class ClassifierOutcome + { + public string ClassifierName { get; } + public IReadOnlyList<(string Name, Dictionary Item)>? Items { get; } + public string? ErrorMessage { get; } + + private ClassifierOutcome( + string classifierName, + IReadOnlyList<(string Name, Dictionary Item)>? items, + string? 
errorMessage) + { + ClassifierName = classifierName; + Items = items; + ErrorMessage = errorMessage; + } + + public static ClassifierOutcome Success( + string classifierName, + IReadOnlyList<(string Name, Dictionary Item)> items) + => new(classifierName, items, null); + + public static ClassifierOutcome Error(string classifierName, string errorMessage) + => new(classifierName, null, errorMessage); + } + + private async Task RunSingleClassifier( + string experimentId, + Activity rootActivity, + IClassifier classifier, + int classifierIndex, + TaskResult taskResult, + EvalTrace trace, + ClassifierOutcome?[] outcomes) + { + var resolvedName = string.IsNullOrWhiteSpace(classifier.Name) + ? $"classifier_{classifierIndex}" + : classifier.Name; + + var classifierActivity = _activitySource.StartActivity(resolvedName); + classifierActivity?.SetTag(BraintrustTracing.ParentKey, $"experiment_id:{experimentId}"); + classifierActivity?.SetTag( + "braintrust.span_attributes", + ToJson(new { type = "classifier", name = resolvedName, purpose = "scorer" })); + + var datasetCase = taskResult.DatasetCase; + classifierActivity?.SetTag( + "braintrust.input_json", + ToJson(new + { + input = datasetCase.Input, + expected = datasetCase.Expected, + output = taskResult.Result, + metadata = datasetCase.Metadata + })); + + try + { + using var classifierScope = BraintrustContext.OfExperiment(experimentId).MakeCurrent(); + + IReadOnlyList rawResults; + try + { + rawResults = classifier is ITracedClassifier tracedClassifier + ? 
await tracedClassifier.Classify(taskResult, trace).ConfigureAwait(false) + : await classifier.Classify(taskResult).ConfigureAwait(false); + + if (rawResults == null) + { + rawResults = Array.Empty(); + } + } + catch (Exception ex) + { + classifierActivity?.SetStatus(ActivityStatusCode.Error, ex.Message); + classifierActivity?.AddEvent(CreateExceptionEvent(ex)); + outcomes[classifierIndex] = ClassifierOutcome.Error(resolvedName, ex.Message); + return; + } + + // Normalize: resolve name + validate, build storage items (no Name key). + var normalized = new List<(string Name, Dictionary Item)>(rawResults.Count); + try + { + foreach (var classification in rawResults) + { + if (string.IsNullOrEmpty(classification.Id)) + { + throw new InvalidOperationException( + "When returning structured classifier results, each classification must be a non-empty object."); + } + + var groupingName = string.IsNullOrWhiteSpace(classification.Name) + ? resolvedName + : classification.Name!; + + var item = new Dictionary { ["id"] = classification.Id }; + if (classification.Label != null) + { + item["label"] = classification.Label; + } + if (classification.Metadata != null && classification.Metadata.Count > 0) + { + item["metadata"] = classification.Metadata; + } + + normalized.Add((groupingName, item)); + } + } + catch (Exception ex) + { + classifierActivity?.SetStatus(ActivityStatusCode.Error, ex.Message); + classifierActivity?.AddEvent(CreateExceptionEvent(ex)); + outcomes[classifierIndex] = ClassifierOutcome.Error(resolvedName, ex.Message); + return; + } + + // Build output_json keyed by resolved name for the classifier span. 
+ if (normalized.Count > 0) + { + var outputByName = new Dictionary>>(); + foreach (var (name, item) in normalized) + { + if (!outputByName.TryGetValue(name, out var list)) + { + list = new List>(); + outputByName[name] = list; + } + list.Add(item); + } + classifierActivity?.SetTag("braintrust.output_json", ToJson(outputByName)); + } + + outcomes[classifierIndex] = ClassifierOutcome.Success(resolvedName, normalized); + } + finally + { + classifierActivity?.Stop(); + } + } + + /// + /// Aggregates per-classifier outcomes onto the root span: + /// + /// Sets braintrust.classifications when any classifications were produced. + /// Merges any classifier errors into braintrust.metadata under classifier_errors. + /// + /// + private static void AggregateClassifierOutcomes( + Activity rootActivity, + IReadOnlyDictionary caseMetadata, + ClassifierOutcome?[] outcomes) + { + if (outcomes.Length == 0) + { + return; + } + + var classifications = new Dictionary>>(); + var classifierErrors = new Dictionary(); + + foreach (var outcome in outcomes) + { + if (outcome == null) + { + continue; + } + + if (outcome.ErrorMessage != null) + { + classifierErrors[outcome.ClassifierName] = outcome.ErrorMessage; + continue; + } + + if (outcome.Items == null) + { + continue; + } + + foreach (var (name, item) in outcome.Items) + { + if (!classifications.TryGetValue(name, out var list)) + { + list = new List>(); + classifications[name] = list; + } + list.Add(item); + } + } + + if (classifications.Count > 0) + { + rootActivity.SetTag("braintrust.classifications", ToJson(classifications)); + } + + if (classifierErrors.Count > 0) + { + var merged = new Dictionary(caseMetadata) + { + ["classifier_errors"] = classifierErrors + }; + rootActivity.SetTag("braintrust.metadata", ToJson(merged)); + } + } + private static string ToJson(object obj) { return JsonSerializer.Serialize(obj, JsonOptions); @@ -388,6 +609,7 @@ public sealed class Builder internal IDataset? _dataset; internal ITask? 
_task; internal List> _scorers = new(); + internal List> _classifiers = new(); internal IReadOnlyList? _experimentTags; internal IReadOnlyDictionary? _experimentMetadata; internal int? _maxConcurrency = 10; @@ -406,9 +628,9 @@ public async Task> BuildAsync() _apiClient ??= BraintrustApiClient.Of(_config); _btqlClient ??= new BtqlClient(_config); - if (_scorers.Count == 0) + if (_scorers.Count == 0 && _classifiers.Count == 0) { - throw new InvalidOperationException("Must provide at least one scorer"); + throw new InvalidOperationException("Must provide at least one scorer or classifier"); } if (_dataset == null) @@ -561,6 +783,16 @@ public Builder Scorers(params IScorer[] scorers) return this; } + /// + /// Set the classifiers. + /// At least one of or must be provided. + /// + public Builder Classifiers(params IClassifier[] classifiers) + { + _classifiers = classifiers.ToList(); + return this; + } + /// /// Set the experiment-level tags. /// These tags are applied to the experiment itself, not individual cases. diff --git a/src/Braintrust.Sdk/Eval/FunctionClassifier.cs b/src/Braintrust.Sdk/Eval/FunctionClassifier.cs new file mode 100644 index 0000000..5aeb473 --- /dev/null +++ b/src/Braintrust.Sdk/Eval/FunctionClassifier.cs @@ -0,0 +1,73 @@ +namespace Braintrust.Sdk.Eval; + +/// +/// Implementation of a classifier from a function. +/// Supports synchronous and asynchronous functions returning either a single +/// or a list. Returning null means "no classifications for this case". +/// +public class FunctionClassifier : IClassifier + where TInput : notnull + where TOutput : notnull +{ + private static readonly IReadOnlyList Empty = Array.Empty(); + + private readonly Func, Task>> _classifierFn; + + /// + /// Create a classifier from a synchronous function returning a single classification (or null). 
+ /// + public FunctionClassifier(string name, Func, Classification?> classifierFn) + { + Name = name; + _classifierFn = taskResult => + { + var result = classifierFn(taskResult); + return Task.FromResult>( + result.HasValue ? new[] { result.Value } : Empty); + }; + } + + /// + /// Create a classifier from a synchronous function returning a list of classifications (or null). + /// + public FunctionClassifier(string name, Func, IReadOnlyList?> classifierFn) + { + Name = name; + _classifierFn = taskResult => + { + var result = classifierFn(taskResult); + return Task.FromResult>(result ?? Empty); + }; + } + + /// + /// Create a classifier from an asynchronous function returning a single classification (or null). + /// + public FunctionClassifier(string name, Func, Task> classifierFn) + { + Name = name; + _classifierFn = async taskResult => + { + var result = await classifierFn(taskResult).ConfigureAwait(false); + return result.HasValue ? new[] { result.Value } : Empty; + }; + } + + /// + /// Create a classifier from an asynchronous function returning a list of classifications (or null). + /// + public FunctionClassifier(string name, Func, Task?>> classifierFn) + { + Name = name; + _classifierFn = async taskResult => + { + var result = await classifierFn(taskResult).ConfigureAwait(false); + return result ?? Empty; + }; + } + + public string Name { get; } + + public Task> Classify(TaskResult taskResult) + => _classifierFn(taskResult); +} diff --git a/src/Braintrust.Sdk/Eval/IClassifier.cs b/src/Braintrust.Sdk/Eval/IClassifier.cs new file mode 100644 index 0000000..f85f502 --- /dev/null +++ b/src/Braintrust.Sdk/Eval/IClassifier.cs @@ -0,0 +1,30 @@ +namespace Braintrust.Sdk.Eval; + +/// +/// A classifier categorizes and labels eval outputs. +/// Unlike (which returns numeric 0-1 values), +/// classifiers return structured items with an id and optional label and metadata. +/// +/// +/// Implementations must be thread-safe as classifiers may be executed concurrently. 
+/// Classifier failures are non-fatal: an exception thrown by is recorded +/// under classifier_errors in the eval span's metadata and does not abort the evaluation. +/// +/// Type of the input data +/// Type of the output data +public interface IClassifier + where TInput : notnull + where TOutput : notnull +{ + /// + /// Gets the name of this classifier. Used as the classifier span name and as the + /// default grouping key when a returned has no Name. + /// + string Name { get; } + + /// + /// Classify the task result and return zero or more classifications. + /// Return an empty list to indicate no classifications for this case. + /// + Task> Classify(TaskResult taskResult); +} diff --git a/src/Braintrust.Sdk/Eval/ITracedClassifier.cs b/src/Braintrust.Sdk/Eval/ITracedClassifier.cs new file mode 100644 index 0000000..97fd6d7 --- /dev/null +++ b/src/Braintrust.Sdk/Eval/ITracedClassifier.cs @@ -0,0 +1,24 @@ +namespace Braintrust.Sdk.Eval; + +/// +/// A classifier that receives access to the distributed trace (spans) of the task that was evaluated. +/// This allows classifiers to inspect intermediate LLM calls and tool-use chains, not just the final output. +/// +/// Implement this interface when your classifier needs to examine multi-turn conversations or tool-use chains +/// (e.g. classifying a conversation pattern as "single-turn", "tool-heavy", or "clarification-loop"). +/// When a classifier implements this interface, +/// is called instead of . +/// Backward-compatible: classifiers that only implement continue to work without change. +/// +/// The type of input data for the evaluation +/// The type of output produced by the task +public interface ITracedClassifier : IClassifier + where TInput : notnull + where TOutput : notnull +{ + /// + /// Classify the task result using the distributed trace for additional context. + /// Called instead of when trace is available. 
+ /// + Task> Classify(TaskResult taskResult, EvalTrace trace); +} diff --git a/tests/Braintrust.Sdk.Tests/Eval/ClassifierTest.cs b/tests/Braintrust.Sdk.Tests/Eval/ClassifierTest.cs new file mode 100644 index 0000000..b790130 --- /dev/null +++ b/tests/Braintrust.Sdk.Tests/Eval/ClassifierTest.cs @@ -0,0 +1,573 @@ +using System.Diagnostics; +using System.Text.Json; +using Braintrust.Sdk.Config; +using Braintrust.Sdk.Eval; + +namespace Braintrust.Sdk.Tests.Eval; + +[Collection("BraintrustGlobals")] +public class ClassifierTest : IDisposable +{ + private readonly ActivityListener _activityListener; + + public ClassifierTest() + { + Braintrust.ResetForTest(); + _activityListener = new ActivityListener + { + ShouldListenTo = source => source.Name == "braintrust-dotnet", + Sample = (ref ActivityCreationOptions _) => ActivitySamplingResult.AllDataAndRecorded + }; + ActivitySource.AddActivityListener(_activityListener); + } + + public void Dispose() + { + _activityListener?.Dispose(); + Braintrust.ResetForTest(); + } + + // ===================================================================== + // FunctionClassifier shape normalization + // ===================================================================== + + [Fact] + public async Task FunctionClassifierReturnsSingleClassification() + { + var classifier = new FunctionClassifier( + "category", + _ => new Classification("greeting", Label: "Greeting")); + + var taskResult = MakeTaskResult("hello", "hi"); + var results = await classifier.Classify(taskResult); + + Assert.Single(results); + Assert.Equal("greeting", results[0].Id); + Assert.Equal("Greeting", results[0].Label); + } + + [Fact] + public async Task FunctionClassifierReturnsList() + { + var classifier = new FunctionClassifier( + "sentiment", + _ => (IReadOnlyList)new[] + { + new Classification("positive", Label: "Positive"), + new Classification("enthusiastic", Label: "Enthusiastic") + }); + + var results = await classifier.Classify(MakeTaskResult("great!", "")); + 
+ Assert.Equal(2, results.Count); + Assert.Equal("positive", results[0].Id); + Assert.Equal("enthusiastic", results[1].Id); + } + + [Fact] + public async Task FunctionClassifierNullReturnsEmptyList() + { + var classifier = new FunctionClassifier( + "maybe", + _ => (Classification?)null); + + var results = await classifier.Classify(MakeTaskResult("hello", "hi")); + Assert.Empty(results); + } + + [Fact] + public async Task FunctionClassifierNullListReturnsEmptyList() + { + var classifier = new FunctionClassifier( + "maybe", + _ => (IReadOnlyList?)null); + + var results = await classifier.Classify(MakeTaskResult("hello", "hi")); + Assert.Empty(results); + } + + [Fact] + public async Task FunctionClassifierAsyncSingle() + { + var classifier = new FunctionClassifier( + "category", + _ => Task.FromResult(new Classification("greeting"))); + + var results = await classifier.Classify(MakeTaskResult("hello", "hi")); + Assert.Single(results); + Assert.Equal("greeting", results[0].Id); + } + + [Fact] + public async Task FunctionClassifierAsyncList() + { + var classifier = new FunctionClassifier( + "category", + _ => Task.FromResult?>(new[] + { + new Classification("a"), + new Classification("b") + })); + + var results = await classifier.Classify(MakeTaskResult("hello", "hi")); + Assert.Equal(2, results.Count); + } + + // ===================================================================== + // Builder validation + // ===================================================================== + + [Fact] + public async Task EvalRequiresAtLeastScorersOrClassifiers() + { + var config = BraintrustConfig.Of(("BRAINTRUST_API_KEY", "test-key")); + var mockClient = new MockBraintrustApiClient(); + + var ex = await Assert.ThrowsAsync(() => + Eval.NewBuilder() + .Name("test-eval") + .Config(config) + .ApiClient(mockClient) + .Cases(DatasetCase.Of("input", "expected")) + .TaskFunction(x => x) + .BuildAsync()); + + Assert.Contains("at least one scorer or classifier", ex.Message); + } + + [Fact] 
+ public async Task EvalBuildsWithClassifiersOnly() + { + var config = BraintrustConfig.Of( + ("BRAINTRUST_API_KEY", "test-key"), + ("BRAINTRUST_APP_URL", "https://braintrust.dev"), + ("BRAINTRUST_DEFAULT_PROJECT_NAME", "test-project")); + var mockClient = new MockBraintrustApiClient(); + + var eval = await Eval.NewBuilder() + .Name("test-eval") + .Config(config) + .ApiClient(mockClient) + .Cases(DatasetCase.Of("hello", "hi")) + .TaskFunction(x => x) + .Classifiers(new FunctionClassifier( + "category", + _ => new Classification("greeting"))) + .BuildAsync(); + + var result = await eval.RunAsync(); + Assert.NotNull(result.ExperimentUrl); + } + + // ===================================================================== + // Runner — classifier results on the eval span + // ===================================================================== + + [Fact] + public async Task RunnerWritesClassificationsToEvalSpan() + { + var (rootSpans, classifierSpans) = await RunEval( + cases: new[] { DatasetCase.Of("hello", "hi") }, + taskFn: x => x, + classifiers: new IClassifier[] + { + new FunctionClassifier( + "category", + _ => new Classification("greeting", Label: "Greeting")) + }); + + var root = Assert.Single(rootSpans); + var classifications = ReadClassifications(root); + Assert.NotNull(classifications); + Assert.True(classifications.RootElement.TryGetProperty("category", out var categoryItems)); + Assert.Equal(1, categoryItems.GetArrayLength()); + Assert.Equal("greeting", categoryItems[0].GetProperty("id").GetString()); + Assert.Equal("Greeting", categoryItems[0].GetProperty("label").GetString()); + + // Single classifier span produced + Assert.Single(classifierSpans); + } + + [Fact] + public async Task RunnerWritesNoClassificationsTagWhenAllNull() + { + var (rootSpans, _) = await RunEval( + cases: new[] { DatasetCase.Of("hello", "hi") }, + taskFn: x => x, + classifiers: new IClassifier[] + { + new FunctionClassifier("maybe", _ => (Classification?)null) + }); + + var root = 
Assert.Single(rootSpans); + Assert.Null(root.GetTagItem("braintrust.classifications")); + } + + [Fact] + public async Task RunnerCombinesScorersAndClassifiers() + { + var (rootSpans, _) = await RunEval( + cases: new[] { DatasetCase.Of("hello", "hi") }, + taskFn: x => x, + scorers: new IScorer[] + { + new FunctionScorer("exact", (e, a) => e == a ? 1.0 : 0.0) + }, + classifiers: new IClassifier[] + { + new FunctionClassifier("category", _ => new Classification("greeting")) + }); + + var root = Assert.Single(rootSpans); + Assert.NotNull(root.GetTagItem("braintrust.classifications")); + // The eval span does not store scores itself; verify the classification path was hit + // independently from the scorer path. Score span coverage is in EvalTest. + } + + [Fact] + public async Task RunnerHandlesClassifierExceptionWithoutAbortingEval() + { + var (rootSpans, classifierSpans) = await RunEval( + cases: new[] { DatasetCase.Of("hello", "hi") }, + taskFn: x => x, + classifiers: new IClassifier[] + { + new ThrowingClassifier("broken", "classifier boom"), + new FunctionClassifier("working", _ => new Classification("ok")) + }); + + var root = Assert.Single(rootSpans); + + // Classifier errors merged into braintrust.metadata under classifier_errors + var metadataJson = root.GetTagItem("braintrust.metadata") as string; + Assert.NotNull(metadataJson); + using var doc = JsonDocument.Parse(metadataJson); + Assert.True(doc.RootElement.TryGetProperty("classifier_errors", out var errors)); + Assert.Equal("classifier boom", errors.GetProperty("broken").GetString()); + + // The working classifier still wrote its classification + var classifications = ReadClassifications(root); + Assert.NotNull(classifications); + Assert.True(classifications.RootElement.TryGetProperty("working", out _)); + + // The broken classifier span has error status + exception event + var brokenSpan = classifierSpans.First(s => s.DisplayName == "broken"); + Assert.Equal(ActivityStatusCode.Error, brokenSpan.Status); + 
Assert.NotEmpty(brokenSpan.Events); + + // The eval (root) span itself is not marked Error by a classifier failure + Assert.Equal(ActivityStatusCode.Unset, root.Status); + } + + /* Runs one case with a single classifier named "my_classifier" and inspects the one captured classifier span: its DisplayName must equal the classifier name, and its braintrust.span_attributes tag must parse as JSON with type "classifier", name "my_classifier" and purpose "scorer". */ [Fact] + public async Task RunnerWritesClassifierSpanAttributes() + { + var (_, classifierSpans) = await RunEval( + cases: new[] { DatasetCase.Of("hello", "hi") }, + taskFn: x => x, + classifiers: new IClassifier[] + { + new FunctionClassifier( + "my_classifier", + _ => new Classification("foo")) + }); + + var span = Assert.Single(classifierSpans); + Assert.Equal("my_classifier", span.DisplayName); + + var attrsJson = span.GetTagItem("braintrust.span_attributes") as string; + Assert.NotNull(attrsJson); + using var doc = JsonDocument.Parse(attrsJson); + Assert.Equal("classifier", doc.RootElement.GetProperty("type").GetString()); + Assert.Equal("my_classifier", doc.RootElement.GetProperty("name").GetString()); + Assert.Equal("scorer", doc.RootElement.GetProperty("purpose").GetString()); + } + + /* A classifier that returns two Classification items ("positive" then "enthusiastic") must show up under the "sentiment" key of the root span's braintrust.classifications JSON as a two-element array in that same order. */ [Fact] + public async Task RunnerMultiLabelResultPreservesOrder() + { + var (rootSpans, _) = await RunEval( + cases: new[] { DatasetCase.Of("great!", "hi") }, + taskFn: x => x, + classifiers: new IClassifier[] + { + new FunctionClassifier( + "sentiment", + _ => (IReadOnlyList)new[] + { + new Classification("positive", Label: "Positive"), + new Classification("enthusiastic", Label: "Enthusiastic") + }) + }); + + var root = Assert.Single(rootSpans); + var classifications = ReadClassifications(root); + Assert.NotNull(classifications); + var items = classifications.RootElement.GetProperty("sentiment"); + Assert.Equal(2, items.GetArrayLength()); + Assert.Equal("positive", items[0].GetProperty("id").GetString()); + Assert.Equal("enthusiastic", items[1].GetProperty("id").GetString()); + } + + /* When the returned Classification carries no Name, the root span's classifications JSON is expected to contain a property named after the classifier ("my_classifier"). */ [Fact] + public async Task RunnerClassificationNameDefaultsToClassifierName() + { + var (rootSpans, _) = await RunEval( + cases: new[] { DatasetCase.Of("hello", "hi") }, + taskFn: x => x, + classifiers: new IClassifier[] 
+ { + // Classification has no Name set + new FunctionClassifier( + "my_classifier", + _ => new Classification("foo")) + }); + + var root = Assert.Single(rootSpans); + var classifications = ReadClassifications(root); + Assert.NotNull(classifications); + Assert.True(classifications.RootElement.TryGetProperty("my_classifier", out _)); + } + + /* A Classification built with Name: "override_name" must be stored under "override_name"; the classifier's own name ("my_classifier") must not appear as a key. */ [Fact] + public async Task RunnerClassificationExplicitNameOverridesClassifierName() + { + var (rootSpans, _) = await RunEval( + cases: new[] { DatasetCase.Of("hello", "hi") }, + taskFn: x => x, + classifiers: new IClassifier[] + { + new FunctionClassifier( + "my_classifier", + _ => new Classification("foo", Name: "override_name")) + }); + + var root = Assert.Single(rootSpans); + var classifications = ReadClassifications(root); + Assert.NotNull(classifications); + Assert.True(classifications.RootElement.TryGetProperty("override_name", out _)); + Assert.False(classifications.RootElement.TryGetProperty("my_classifier", out _)); + } + + /* The classifier "bad" returns default(Classification). The test expects the root span's braintrust.metadata JSON to hold classifier_errors.bad containing "each classification must be a non-empty object", and the single classifier span to end with ActivityStatusCode.Error. */ [Fact] + public async Task RunnerEmptyClassificationItemIsRecordedAsError() + { + var (rootSpans, classifierSpans) = await RunEval( + cases: new[] { DatasetCase.Of("hello", "hi") }, + taskFn: x => x, + classifiers: new IClassifier[] + { + // Default(Classification) — Id is null/empty, so should fail validation + new FunctionClassifier( + "bad", + _ => (Classification?)default(Classification)) + }); + + var root = Assert.Single(rootSpans); + var metadataJson = root.GetTagItem("braintrust.metadata") as string; + Assert.NotNull(metadataJson); + using var doc = JsonDocument.Parse(metadataJson); + var errors = doc.RootElement.GetProperty("classifier_errors"); + var brokenError = errors.GetProperty("bad").GetString(); + Assert.NotNull(brokenError); + Assert.Contains("each classification must be a non-empty object", brokenError); + + var brokenSpan = Assert.Single(classifierSpans); + Assert.Equal(ActivityStatusCode.Error, brokenSpan.Status); + } + + /* Three cases are evaluated; the test expects three "eval" root spans, each with a "category" entry in its classifications JSON. NOTE(review): only the presence of the "category" key is asserted, not the "long"/"short" id the classifier computes from tr.Result.Length. */ [Fact] + public async Task 
RunnerAccumulatesClassificationsAcrossCases() + { + var (rootSpans, _) = await RunEval( + cases: new[] + { + DatasetCase.Of("hi", "x"), + DatasetCase.Of("hello", "x"), + DatasetCase.Of("ok", "x") + }, + taskFn: x => x, + classifiers: new IClassifier[] + { + new FunctionClassifier( + "category", + tr => new Classification(tr.Result.Length > 3 ? "long" : "short")) + }); + + Assert.Equal(3, rootSpans.Count); + foreach (var root in rootSpans) + { + var classifications = ReadClassifications(root); + Assert.NotNull(classifications); + Assert.True(classifications.RootElement.TryGetProperty("category", out _)); + } + } + + /* Checks the classifier span's braintrust.input_json tag: it must contain input "hello", expected "hi", output "hello" (the task function is the identity x => x) and a metadata object with k = "v" taken from the dataset case. */ [Fact] + public async Task RunnerClassifierInputContainsAllScoringArgs() + { + var (_, classifierSpans) = await RunEval( + cases: new[] + { + DatasetCase.Of( + "hello", "hi", + new List(), + new Dictionary { ["k"] = "v" }) + }, + taskFn: x => x, + classifiers: new IClassifier[] + { + new FunctionClassifier("category", _ => new Classification("greeting")) + }); + + var span = Assert.Single(classifierSpans); + var inputJson = span.GetTagItem("braintrust.input_json") as string; + Assert.NotNull(inputJson); + using var doc = JsonDocument.Parse(inputJson); + Assert.Equal("hello", doc.RootElement.GetProperty("input").GetString()); + Assert.Equal("hi", doc.RootElement.GetProperty("expected").GetString()); + Assert.Equal("hello", doc.RootElement.GetProperty("output").GetString()); + Assert.True(doc.RootElement.TryGetProperty("metadata", out var md)); + Assert.Equal("v", md.GetProperty("k").GetString()); + } + + // ===================================================================== + // ITracedClassifier + // ===================================================================== + + /* Seeds a MockBtqlClient with one "llm" span and runs a TracedClassifier whose callback calls trace.GetSpansAsync("llm") and records the returned count. Asserts one root span, a captured count of 1, and mockBtql.QueryCount == 1. */ [Fact] + public async Task TracedClassifierReceivesEvalTrace() + { + var spans = new[] + { + MockBtqlClient.MakeSpan("llm", input: new { messages = new[] { new { role = "user", content = "hi" } } }, + output: new { choices = new[] { new { message = new { role = "assistant", content = 
"hello" } } } }) + }; + var mockBtql = new MockBtqlClient(spans); + + var capturedSpanCount = -1; + var classifier = new TracedClassifier( + "trace_inspector", + async (_, trace) => + { + var fetched = await trace.GetSpansAsync("llm"); + capturedSpanCount = fetched.Count; + return new[] { new Classification("multi_turn") }; + }); + + var (rootSpans, _) = await RunEval( + cases: new[] { DatasetCase.Of("hello", "hi") }, + taskFn: x => x, + classifiers: new IClassifier[] { classifier }, + btqlClient: mockBtql); + + Assert.Single(rootSpans); + Assert.Equal(1, capturedSpanCount); + Assert.Equal(1, mockBtql.QueryCount); + } + + // ===================================================================== + // Helpers + // ===================================================================== + + /* Builds a TaskResult from the given output and a DatasetCase made of the given input and an empty expected value. */ private static TaskResult MakeTaskResult(string input, string output) + => new(output, new DatasetCase(input, "")); + + /* Reads the span's "braintrust.classifications" tag and parses it as JSON; returns null when the tag is absent or not a string. NOTE(review): JsonDocument is IDisposable and the callers visible here never dispose the returned document - consider "using var" at call sites. */ private static JsonDocument? ReadClassifications(Activity span) + { + var json = span.GetTagItem("braintrust.classifications") as string; + return json == null ? null : JsonDocument.Parse(json); + } + + /* Shared harness. Builds an eval named "classifier-test" from a fixed test config plus mock API/BTQL clients, runs it while an ActivityListener on source "braintrust-dotnet" collects every stopped Activity, and returns (a) activities whose DisplayName is "eval" as root spans and (b) activities whose braintrust.span_attributes string contains the exact substring "type":"classifier" as classifier spans. Throws InvalidOperationException when neither scorers nor classifiers are supplied. NOTE(review): the substring match assumes the attributes JSON is serialized without whitespace - confirm against the serializer. */ private async Task<(List RootSpans, List ClassifierSpans)> RunEval( + DatasetCase[] cases, + Func taskFn, + IScorer[]? scorers = null, + IClassifier[]? classifiers = null, + MockBtqlClient? 
btqlClient = null) + { + var config = BraintrustConfig.Of( + ("BRAINTRUST_API_KEY", "test-key"), + ("BRAINTRUST_APP_URL", "https://braintrust.dev"), + ("BRAINTRUST_DEFAULT_PROJECT_NAME", "test-project")); + var mockClient = new MockBraintrustApiClient(); + btqlClient ??= new MockBtqlClient(); + + var captured = new List(); + using var listener = new ActivityListener + { + ShouldListenTo = source => source.Name == "braintrust-dotnet", + Sample = (ref ActivityCreationOptions _) => ActivitySamplingResult.AllDataAndRecorded, + ActivityStopped = captured.Add + }; + ActivitySource.AddActivityListener(listener); + + var builder = Eval.NewBuilder() + .Name("classifier-test") + .Config(config) + .ApiClient(mockClient) + .BtqlClient(btqlClient) + .Cases(cases) + .TaskFunction(taskFn); + + if (scorers != null && scorers.Length > 0) + { + builder.Scorers(scorers); + } + + if (classifiers != null && classifiers.Length > 0) + { + builder.Classifiers(classifiers); + } + else if (scorers == null || scorers.Length == 0) + { + // The validator forbids zero classifiers and zero scorers; tests using RunEval should specify at least one. 
+ throw new InvalidOperationException("Test setup error: provide at least one scorer or classifier."); + } + + var eval = await builder.BuildAsync(); + await eval.RunAsync(); + + var rootSpans = captured.Where(a => a.DisplayName == "eval").ToList(); + var classifierSpans = captured + .Where(a => + { + var attrs = a.GetTagItem("braintrust.span_attributes") as string; + return attrs != null && attrs.Contains("\"type\":\"classifier\""); + }) + .ToList(); + return (rootSpans, classifierSpans); + } + + /* Test double implementing IClassifier: Classify always throws InvalidOperationException with the message given at construction. The method is not async, so the exception is thrown synchronously from the call instead of being delivered through the returned Task. */ private sealed class ThrowingClassifier : IClassifier + { + private readonly string _message; + public ThrowingClassifier(string name, string message) + { + Name = name; + _message = message; + } + public string Name { get; } + public Task> Classify(TaskResult taskResult) + => throw new InvalidOperationException(_message); + } + + /* Test double implementing ITracedClassifier: the single-argument Classify returns an empty result, while the overload that also receives an EvalTrace forwards both arguments to the delegate supplied at construction. */ private sealed class TracedClassifier : ITracedClassifier + { + private readonly Func, EvalTrace, Task>> _fn; + public TracedClassifier( + string name, + Func, EvalTrace, Task>> fn) + { + Name = name; + _fn = fn; + } + public string Name { get; } + + public Task> Classify(TaskResult taskResult) + => Task.FromResult>(Array.Empty()); + + public Task> Classify(TaskResult taskResult, EvalTrace trace) + => _fn(taskResult, trace); + } +}