Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 34 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ This is the official Ruby SDK for [Braintrust](https://www.braintrust.dev), for
- [Viewing traces](#viewing-traces)
- [Evals](#evals)
- [Datasets](#datasets)
- [Remote scorers](#remote-scorers)
- [Scorers](#scorers)
- [Documentation](#documentation)
- [Troubleshooting](#troubleshooting)
- [Contributing](#contributing)
Expand Down Expand Up @@ -260,7 +260,7 @@ Braintrust::Eval.run(

### Datasets

Load test cases from a Braintrust dataset:
Use test cases from a Braintrust dataset:

```ruby
Braintrust::Eval.run(
Expand All @@ -271,7 +271,22 @@ Braintrust::Eval.run(
)
```

### Remote scorers
Or define test cases inline with metadata and tags:

```ruby
Braintrust::Eval.run(
project: "my-project",
experiment: "classifier-v1",
cases: [
{input: "apple", expected: "fruit", tags: ["produce"], metadata: {difficulty: "easy"}},
{input: "salmon", expected: "protein", tags: ["seafood"], metadata: {difficulty: "medium"}}
],
task: ->(input) { classify(input) },
scorers: [...]
)
```

### Scorers

Use scoring functions defined in Braintrust:

Expand All @@ -281,7 +296,22 @@ Braintrust::Eval.run(
cases: [...],
task: ->(input) { ... },
scorers: [
Braintrust::Scorer.remote("my-project", "accuracy-scorer")
Braintrust::Eval::Functions.scorer(project: "my-project", slug: "accuracy-scorer")
]
)
```

Or define scorers inline with `Eval.scorer`:

```ruby
Braintrust::Eval.run(
project: "my-project",
cases: [...],
task: ->(input) { ... },
scorers: [
Braintrust::Eval.scorer("exact_match") do |input, expected, output|
output == expected ? 1.0 : 0.0
end
]
)
```
Expand Down
23 changes: 22 additions & 1 deletion lib/braintrust/eval.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
require_relative "eval/scorer"
require_relative "eval/runner"
require_relative "internal/experiments"
require_relative "internal/origin"

require "opentelemetry/sdk"
require "json"
Expand Down Expand Up @@ -354,16 +355,36 @@ def resolve_dataset(dataset, project, state)
end

# Filter records to only include Case-compatible fields
# Case accepts: input, expected, tags, metadata
# Case accepts: input, expected, tags, metadata, origin
records.map do |record|
filtered = {}
filtered[:input] = record["input"] if record.key?("input")
filtered[:expected] = record["expected"] if record.key?("expected")
filtered[:tags] = record["tags"] if record.key?("tags")
filtered[:metadata] = record["metadata"] if record.key?("metadata")

origin = build_dataset_origin(record, dataset_id)
filtered[:origin] = origin if origin

filtered
end
end

# Build the origin pointer (as JSON) for a single dataset record.
#
# The origin links an eval span back to the dataset row it came from.
# A record is only linkable when the fetch API returned both its row id
# and its transaction id.
#
# @param record [Hash] Record from dataset fetch API
# @param dataset_id [String] Dataset ID (fallback if not in record)
# @return [String, nil] JSON-serialized origin, or nil if record lacks required fields
def build_dataset_origin(record, dataset_id)
  record_id = record["id"]
  xact_id = record["_xact_id"]
  return nil unless record_id && xact_id

  # Prefer the dataset id embedded in the record; fall back to the id we fetched with.
  source_dataset_id = record["dataset_id"] || dataset_id

  Internal::Origin.to_json(
    object_type: "dataset",
    object_id: source_dataset_id,
    id: record_id,
    xact_id: xact_id,
    created: record["created"]
  )
end
end
end
end
4 changes: 3 additions & 1 deletion lib/braintrust/eval/case.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ module Eval
# @attr expected [Object, nil] The expected output (optional)
# @attr tags [Array<String>, nil] Optional tags for filtering/grouping
# @attr metadata [Hash, nil] Optional metadata for the case
Case = Struct.new(:input, :expected, :tags, :metadata, keyword_init: true)
# @attr origin [Hash, nil] Origin pointer for cases from remote sources (e.g., datasets).
# Contains: object_type, object_id, id, _xact_id, created
Case = Struct.new(:input, :expected, :tags, :metadata, :origin, keyword_init: true)
end
end
3 changes: 3 additions & 0 deletions lib/braintrust/eval/runner.rb
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,9 @@ def run_case(test_case, errors)
set_json_attr(eval_span, "braintrust.input_json", test_case.input)
set_json_attr(eval_span, "braintrust.output_json", output)
set_json_attr(eval_span, "braintrust.expected", test_case.expected) if test_case.expected

# Set origin for cases from remote sources (already JSON-serialized)
eval_span.set_attribute("braintrust.origin", test_case.origin) if test_case.origin
end
end

Expand Down
28 changes: 28 additions & 0 deletions lib/braintrust/internal/origin.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# frozen_string_literal: true

require "json"

module Braintrust
  module Internal
    # Origin provides serialization for source object pointers in Braintrust.
    # Used internally to link spans back to their source records (e.g., dataset rows).
    module Origin
      # Serialize an origin pointer to JSON.
      #
      # All five fields are always emitted; a nil +created+ is serialized as
      # JSON null rather than being omitted.
      #
      # @param object_type [String] Type of source object (e.g., "dataset", "playground_logs")
      # @param object_id [String] ID of the source object
      # @param id [String] ID of the specific record within the source
      # @param xact_id [String] Transaction ID
      # @param created [String, nil] Creation timestamp
      # @return [String] JSON-serialized origin
      def self.to_json(object_type:, object_id:, id:, xact_id:, created:)
        payload = {
          object_type: object_type,
          object_id: object_id,
          id: id,
          _xact_id: xact_id,
          created: created
        }
        JSON.generate(payload)
      end
    end
  end
end
121 changes: 121 additions & 0 deletions test/braintrust/eval_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -750,4 +750,125 @@ def test_eval_run_invalid_parallelism_falls_back_to_sequential
assert result.success?
assert_equal %w[a b], order
end

# ============================================
# Origin tests (for remote dataset linking)
# ============================================
# Origin is automatically generated when using remote datasets.
# It links eval spans back to their source dataset records in the UI.

def test_build_dataset_origin_uses_fallback_dataset_id
  # Some API responses omit dataset_id on the record itself; the helper must
  # then fall back to the dataset id that was used for the fetch.
  api_record = {
    "id" => "record-123",
    "_xact_id" => "1000196022104685824",
    "created" => "2025-10-24T15:29:18.118Z"
  }

  origin_json = Braintrust::Eval.send(:build_dataset_origin, api_record, "fallback-dataset-id")

  assert_equal "fallback-dataset-id", JSON.parse(origin_json)["object_id"]
end

def test_build_dataset_origin_returns_nil_when_missing_required_fields
  # Both "id" and "_xact_id" are required to link back to a dataset row;
  # a record missing either one yields no origin at all.
  unlinkable_records = [
    {"_xact_id" => "1000196022104685824"}, # missing id
    {"id" => "record-123"}                 # missing _xact_id
  ]

  unlinkable_records.each do |record|
    assert_nil Braintrust::Eval.send(:build_dataset_origin, record, "dataset-id")
  end
end

def test_runner_does_not_set_origin_when_case_has_no_origin
  # Inline cases (not sourced from a remote dataset) carry no origin pointer,
  # so the runner must not emit a braintrust.origin span attribute.
  rig = setup_otel_test_rig

  run_test_eval(
    experiment_id: "test-exp-123",
    experiment_name: "test-no-origin",
    project_id: "test-proj-123",
    project_name: "test-project",
    cases: [{input: "hello", expected: "HELLO"}],
    task: ->(input) { input.upcase },
    scorers: [Braintrust::Eval.scorer("exact") { |i, e, o| (o == e) ? 1.0 : 0.0 }],
    state: rig.state,
    tracer_provider: rig.tracer_provider
  )

  eval_span = rig.drain.find { |s| s.name == "eval" }

  assert eval_span, "Expected eval span"
  assert_nil eval_span.attributes["braintrust.origin"]
end

# Integration test: verify real API dataset records result in correct origin on spans.
# HTTP traffic is replayed through the VCR cassette, so the test is deterministic offline.
# Note: Dataset is not deleted after test - relies on idempotent create (same pattern as other dataset tests)
def test_eval_with_remote_dataset_sets_origin_from_api_response
  VCR.use_cassette("eval/dataset_origin") do
    # Set up span capture (uses unit test state internally, but we override state for API calls)
    rig = setup_otel_test_rig
    # Get integration state for real API calls via VCR
    state = get_integration_test_state

    api = Braintrust::API.new(state: state)

    # Create/reuse test dataset (idempotent)
    project_name = "ruby-sdk-test"
    dataset_name = "test-ruby-sdk-dataset-origin"

    result = api.datasets.create(
      name: dataset_name,
      project_name: project_name,
      description: "Test dataset for origin integration"
    )
    dataset_id = result["dataset"]["id"]

    # Insert one record; its id/_xact_id from the API is what the origin must point at
    api.datasets.insert(
      id: dataset_id,
      events: [{input: "origin-test", expected: "ORIGIN-TEST"}]
    )

    # Run eval with remote dataset (cases come from the dataset, not inline)
    task = ->(input) { input.upcase }
    scorer = Braintrust::Eval.scorer("exact") { |i, e, o| (o == e) ? 1.0 : 0.0 }

    eval_result = Braintrust::Eval.run(
      project: project_name,
      experiment: "test-ruby-sdk-exp-origin",
      dataset: dataset_name,
      task: task,
      scorers: [scorer],
      state: state,
      tracer_provider: rig.tracer_provider,
      quiet: true
    )

    assert eval_result.success?

    # Verify origin was set on eval spans
    spans = rig.drain
    eval_spans = spans.select { |s| s.name == "eval" }
    assert eval_spans.any?, "Expected at least one eval span"

    # All eval spans from a dataset should have origin
    eval_spans.each do |span|
      origin_json = span.attributes["braintrust.origin"]
      assert origin_json, "Expected braintrust.origin on eval span"

      # Verify origin structure matches expected format.
      # Exact ids vary per cassette recording, so assert presence rather than values.
      origin = JSON.parse(origin_json)
      assert_equal "dataset", origin["object_type"]
      assert origin["object_id"], "origin.object_id should be present"
      assert origin["id"], "origin.id (record id) should be present"
      assert origin["_xact_id"], "origin._xact_id should be present"
    end
  end
end
end
69 changes: 69 additions & 0 deletions test/braintrust/internal/origin_test.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# frozen_string_literal: true

require "test_helper"
require "braintrust/internal/origin"

class Braintrust::Internal::OriginTest < Minitest::Test
  # Canonical keyword arguments; individual tests override only what they exercise.
  BASE_ARGS = {
    object_type: "dataset",
    object_id: "dataset-123",
    id: "record-456",
    xact_id: "1000196022104685824",
    created: "2025-10-24T15:29:18.118Z"
  }.freeze

  def test_to_json_serializes_all_fields
    parsed = JSON.parse(Braintrust::Internal::Origin.to_json(**BASE_ARGS))

    assert_equal "dataset", parsed["object_type"]
    assert_equal "dataset-123", parsed["object_id"]
    assert_equal "record-456", parsed["id"]
    assert_equal "1000196022104685824", parsed["_xact_id"]
    assert_equal "2025-10-24T15:29:18.118Z", parsed["created"]
  end

  def test_to_json_handles_nil_created
    # A nil created timestamp is still serialized (as JSON null), not dropped.
    parsed = JSON.parse(Braintrust::Internal::Origin.to_json(**BASE_ARGS, created: nil))

    assert_equal "dataset", parsed["object_type"]
    assert_equal "dataset-123", parsed["object_id"]
    assert_equal "record-456", parsed["id"]
    assert_equal "1000196022104685824", parsed["_xact_id"]
    assert_nil parsed["created"]
  end

  def test_to_json_returns_valid_json_string
    result = Braintrust::Internal::Origin.to_json(
      object_type: "dataset",
      object_id: "abc-123",
      id: "def-456",
      xact_id: "789",
      created: "2025-01-01T00:00:00Z"
    )

    assert_instance_of String, result
    JSON.parse(result) # raises JSON::ParserError if the output is malformed
  end

  def test_to_json_with_playground_logs_type
    result = Braintrust::Internal::Origin.to_json(
      object_type: "playground_logs",
      object_id: "playground-123",
      id: "log-456",
      xact_id: "789",
      created: "2025-01-01T00:00:00Z"
    )

    assert_equal "playground_logs", JSON.parse(result)["object_type"]
  end
end
Loading