From e4c9755e4978c4834eb7e167dff0e11c54cb39b3 Mon Sep 17 00:00:00 2001 From: David Elner Date: Wed, 4 Feb 2026 12:06:55 -0500 Subject: [PATCH 1/2] Added: origin to datasets (for remote evals) --- lib/braintrust/eval.rb | 23 +- lib/braintrust/eval/case.rb | 4 +- lib/braintrust/eval/runner.rb | 3 + lib/braintrust/internal/origin.rb | 28 + test/braintrust/eval_test.rb | 121 ++++ test/braintrust/internal/origin_test.rb | 69 +++ .../vcr_cassettes/eval/dataset_origin.yml | 531 ++++++++++++++++++ 7 files changed, 777 insertions(+), 2 deletions(-) create mode 100644 lib/braintrust/internal/origin.rb create mode 100644 test/braintrust/internal/origin_test.rb create mode 100644 test/fixtures/vcr_cassettes/eval/dataset_origin.yml diff --git a/lib/braintrust/eval.rb b/lib/braintrust/eval.rb index cc14076..3e68f6d 100644 --- a/lib/braintrust/eval.rb +++ b/lib/braintrust/eval.rb @@ -3,6 +3,7 @@ require_relative "eval/scorer" require_relative "eval/runner" require_relative "internal/experiments" +require_relative "internal/origin" require "opentelemetry/sdk" require "json" @@ -354,16 +355,36 @@ def resolve_dataset(dataset, project, state) end # Filter records to only include Case-compatible fields - # Case accepts: input, expected, tags, metadata + # Case accepts: input, expected, tags, metadata, origin records.map do |record| filtered = {} filtered[:input] = record["input"] if record.key?("input") filtered[:expected] = record["expected"] if record.key?("expected") filtered[:tags] = record["tags"] if record.key?("tags") filtered[:metadata] = record["metadata"] if record.key?("metadata") + + origin = build_dataset_origin(record, dataset_id) + filtered[:origin] = origin if origin + filtered end end + + # Build origin JSON for a dataset record + # @param record [Hash] Record from dataset fetch API + # @param dataset_id [String] Dataset ID (fallback if not in record) + # @return [String, nil] JSON-serialized origin, or nil if record lacks required fields + def 
build_dataset_origin(record, dataset_id) + return nil unless record["id"] && record["_xact_id"] + + Internal::Origin.to_json( + object_type: "dataset", + object_id: record["dataset_id"] || dataset_id, + id: record["id"], + xact_id: record["_xact_id"], + created: record["created"] + ) + end end end end diff --git a/lib/braintrust/eval/case.rb b/lib/braintrust/eval/case.rb index d549aa1..f798160 100644 --- a/lib/braintrust/eval/case.rb +++ b/lib/braintrust/eval/case.rb @@ -7,6 +7,8 @@ module Eval # @attr expected [Object, nil] The expected output (optional) # @attr tags [Array, nil] Optional tags for filtering/grouping # @attr metadata [Hash, nil] Optional metadata for the case - Case = Struct.new(:input, :expected, :tags, :metadata, keyword_init: true) + # @attr origin [String, nil] JSON-serialized origin pointer for cases from remote sources (e.g., datasets). + # Encodes: object_type, object_id, id, _xact_id, created + Case = Struct.new(:input, :expected, :tags, :metadata, :origin, keyword_init: true) end end diff --git a/lib/braintrust/eval/runner.rb b/lib/braintrust/eval/runner.rb index 6093de4..6040f3d 100644 --- a/lib/braintrust/eval/runner.rb +++ b/lib/braintrust/eval/runner.rb @@ -116,6 +116,9 @@ def run_case(test_case, errors) set_json_attr(eval_span, "braintrust.input_json", test_case.input) set_json_attr(eval_span, "braintrust.output_json", output) set_json_attr(eval_span, "braintrust.expected", test_case.expected) if test_case.expected + + # Set origin for cases from remote sources (already JSON-serialized) + eval_span.set_attribute("braintrust.origin", test_case.origin) if test_case.origin end end diff --git a/lib/braintrust/internal/origin.rb b/lib/braintrust/internal/origin.rb new file mode 100644 index 0000000..935da8b --- /dev/null +++ b/lib/braintrust/internal/origin.rb @@ -0,0 +1,28 @@ +# frozen_string_literal: true + +require "json" + +module Braintrust + module Internal + # Origin provides serialization for source object pointers in Braintrust.
+ # Used internally to link spans back to their source records (e.g., dataset rows). + module Origin + # Serialize an origin pointer to JSON + # @param object_type [String] Type of source object (e.g., "dataset", "playground_logs") + # @param object_id [String] ID of the source object + # @param id [String] ID of the specific record within the source + # @param xact_id [String] Transaction ID + # @param created [String, nil] Creation timestamp + # @return [String] JSON-serialized origin + def self.to_json(object_type:, object_id:, id:, xact_id:, created:) + JSON.dump({ + object_type: object_type, + object_id: object_id, + id: id, + _xact_id: xact_id, + created: created + }) + end + end + end +end diff --git a/test/braintrust/eval_test.rb b/test/braintrust/eval_test.rb index 3896464..ff28e7e 100644 --- a/test/braintrust/eval_test.rb +++ b/test/braintrust/eval_test.rb @@ -750,4 +750,125 @@ def test_eval_run_invalid_parallelism_falls_back_to_sequential assert result.success? assert_equal %w[a b], order end + + # ============================================ + # Origin tests (for remote dataset linking) + # ============================================ + # Origin is automatically generated when using remote datasets. + # It links eval spans back to their source dataset records in the UI. 
+ + def test_build_dataset_origin_uses_fallback_dataset_id + # Some API responses may not include dataset_id in the record itself + record = { + "id" => "record-123", + "_xact_id" => "1000196022104685824", + "created" => "2025-10-24T15:29:18.118Z" + } + + origin = Braintrust::Eval.send(:build_dataset_origin, record, "fallback-dataset-id") + + parsed = JSON.parse(origin) + assert_equal "fallback-dataset-id", parsed["object_id"] + end + + def test_build_dataset_origin_returns_nil_when_missing_required_fields + # Missing id - can't link to a specific record + record_no_id = {"_xact_id" => "1000196022104685824"} + assert_nil Braintrust::Eval.send(:build_dataset_origin, record_no_id, "dataset-id") + + # Missing _xact_id - can't identify the transaction + record_no_xact = {"id" => "record-123"} + assert_nil Braintrust::Eval.send(:build_dataset_origin, record_no_xact, "dataset-id") + end + + def test_runner_does_not_set_origin_when_case_has_no_origin + # Inline cases (not from remote datasets) have no origin + rig = setup_otel_test_rig + + task = ->(input) { input.upcase } + scorer = Braintrust::Eval.scorer("exact") { |i, e, o| (o == e) ? 
1.0 : 0.0 } + + run_test_eval( + experiment_id: "test-exp-123", + experiment_name: "test-no-origin", + project_id: "test-proj-123", + project_name: "test-project", + cases: [{input: "hello", expected: "HELLO"}], + task: task, + scorers: [scorer], + state: rig.state, + tracer_provider: rig.tracer_provider + ) + + spans = rig.drain + eval_span = spans.find { |s| s.name == "eval" } + + assert eval_span, "Expected eval span" + assert_nil eval_span.attributes["braintrust.origin"] + end + + # Integration test: verify real API dataset records result in correct origin on spans + # Note: Dataset is not deleted after test - relies on idempotent create (same pattern as other dataset tests) + def test_eval_with_remote_dataset_sets_origin_from_api_response + VCR.use_cassette("eval/dataset_origin") do + # Set up span capture (uses unit test state internally, but we override state for API calls) + rig = setup_otel_test_rig + # Get integration state for real API calls via VCR + state = get_integration_test_state + + api = Braintrust::API.new(state: state) + + # Create/reuse test dataset (idempotent) + project_name = "ruby-sdk-test" + dataset_name = "test-ruby-sdk-dataset-origin" + + result = api.datasets.create( + name: dataset_name, + project_name: project_name, + description: "Test dataset for origin integration" + ) + dataset_id = result["dataset"]["id"] + + # Insert test record + api.datasets.insert( + id: dataset_id, + events: [{input: "origin-test", expected: "ORIGIN-TEST"}] + ) + + # Run eval with remote dataset + task = ->(input) { input.upcase } + scorer = Braintrust::Eval.scorer("exact") { |i, e, o| (o == e) ? 1.0 : 0.0 } + + eval_result = Braintrust::Eval.run( + project: project_name, + experiment: "test-ruby-sdk-exp-origin", + dataset: dataset_name, + task: task, + scorers: [scorer], + state: state, + tracer_provider: rig.tracer_provider, + quiet: true + ) + + assert eval_result.success? 
+ + # Verify origin was set on eval spans + spans = rig.drain + eval_spans = spans.select { |s| s.name == "eval" } + assert eval_spans.any?, "Expected at least one eval span" + + # All eval spans from a dataset should have origin + eval_spans.each do |span| + origin_json = span.attributes["braintrust.origin"] + assert origin_json, "Expected braintrust.origin on eval span" + + # Verify origin structure matches expected format + origin = JSON.parse(origin_json) + assert_equal "dataset", origin["object_type"] + assert origin["object_id"], "origin.object_id should be present" + assert origin["id"], "origin.id (record id) should be present" + assert origin["_xact_id"], "origin._xact_id should be present" + end + end + end end diff --git a/test/braintrust/internal/origin_test.rb b/test/braintrust/internal/origin_test.rb new file mode 100644 index 0000000..0053484 --- /dev/null +++ b/test/braintrust/internal/origin_test.rb @@ -0,0 +1,69 @@ +# frozen_string_literal: true + +require "test_helper" +require "braintrust/internal/origin" + +class Braintrust::Internal::OriginTest < Minitest::Test + def test_to_json_serializes_all_fields + result = Braintrust::Internal::Origin.to_json( + object_type: "dataset", + object_id: "dataset-123", + id: "record-456", + xact_id: "1000196022104685824", + created: "2025-10-24T15:29:18.118Z" + ) + + parsed = JSON.parse(result) + + assert_equal "dataset", parsed["object_type"] + assert_equal "dataset-123", parsed["object_id"] + assert_equal "record-456", parsed["id"] + assert_equal "1000196022104685824", parsed["_xact_id"] + assert_equal "2025-10-24T15:29:18.118Z", parsed["created"] + end + + def test_to_json_handles_nil_created + result = Braintrust::Internal::Origin.to_json( + object_type: "dataset", + object_id: "dataset-123", + id: "record-456", + xact_id: "1000196022104685824", + created: nil + ) + + parsed = JSON.parse(result) + + assert_equal "dataset", parsed["object_type"] + assert_equal "dataset-123", parsed["object_id"] + 
assert_equal "record-456", parsed["id"] + assert_equal "1000196022104685824", parsed["_xact_id"] + assert_nil parsed["created"] + end + + def test_to_json_returns_valid_json_string + result = Braintrust::Internal::Origin.to_json( + object_type: "dataset", + object_id: "abc-123", + id: "def-456", + xact_id: "789", + created: "2025-01-01T00:00:00Z" + ) + + assert_instance_of String, result + # Should not raise + JSON.parse(result) + end + + def test_to_json_with_playground_logs_type + result = Braintrust::Internal::Origin.to_json( + object_type: "playground_logs", + object_id: "playground-123", + id: "log-456", + xact_id: "789", + created: "2025-01-01T00:00:00Z" + ) + + parsed = JSON.parse(result) + assert_equal "playground_logs", parsed["object_type"] + end +end diff --git a/test/fixtures/vcr_cassettes/eval/dataset_origin.yml b/test/fixtures/vcr_cassettes/eval/dataset_origin.yml new file mode 100644 index 0000000..3cc7849 --- /dev/null +++ b/test/fixtures/vcr_cassettes/eval/dataset_origin.yml @@ -0,0 +1,531 @@ +--- +http_interactions: +- request: + method: post + uri: https://www.braintrust.dev/api/apikey/login + body: + encoding: UTF-8 + string: '' + headers: + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + Host: + - www.braintrust.dev + Authorization: + - Bearer + response: + status: + code: 200 + message: OK + headers: + Access-Control-Allow-Credentials: + - 'true' + Access-Control-Allow-Headers: + - X-CSRF-Token, X-Requested-With, Accept, Accept-Version, Content-Length, Content-MD5, + Content-Type, Date, X-Api-Version + Access-Control-Allow-Methods: + - GET,OPTIONS,PATCH,DELETE,POST,PUT + Access-Control-Allow-Origin: + - "*" + Cache-Control: + - public, max-age=0, must-revalidate + Content-Length: + - '395' + Content-Security-Policy: + - 'script-src ''self'' ''unsafe-eval'' ''wasm-unsafe-eval'' ''strict-dynamic'' + ''nonce-M2ViNThjNGMtZWE3Ny00ZmFkLWI1YTUtZDI5NWM0MGY1MjFl'' *.js.stripe.com + js.stripe.com 
maps.googleapis.com ; style-src ''self'' ''unsafe-inline'' *.braintrust.dev + btcm6qilbbhv4yi1.public.blob.vercel-storage.com fonts.googleapis.com www.gstatic.com + d4tuoctqmanu0.cloudfront.net; font-src ''self'' data: fonts.gstatic.com btcm6qilbbhv4yi1.public.blob.vercel-storage.com + cdn.jsdelivr.net d4tuoctqmanu0.cloudfront.net fonts.googleapis.com mintlify-assets.b-cdn.net + fonts.cdnfonts.com; object-src ''none''; base-uri ''self''; form-action ''self''; + frame-ancestors ''self''; worker-src ''self'' blob:; report-uri https://o4507221741076480.ingest.us.sentry.io/api/4507221754380288/security/?sentry_key=27fa5ac907cf7c6ce4a1ab2a03f805b4&sentry_environment=production&sentry_release=16; + report-to csp-endpoint-0' + Content-Type: + - application/json; charset=utf-8 + Date: + - Wed, 04 Feb 2026 17:03:11 GMT + Etag: + - '"12n7ok4b5phaz"' + Reporting-Endpoints: + - csp-endpoint-0="https://o4507221741076480.ingest.us.sentry.io/api/4507221754380288/security/?sentry_key=27fa5ac907cf7c6ce4a1ab2a03f805b4&sentry_environment=production&sentry_release=16" + Server: + - Vercel + Strict-Transport-Security: + - max-age=63072000 + X-Clerk-Auth-Message: + - Invalid JWT form. A JWT consists of three parts separated by dots. 
(reason=token-invalid, + token-carrier=header) + X-Clerk-Auth-Reason: + - token-invalid + X-Clerk-Auth-Status: + - signed-out + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - SAMEORIGIN + X-Matched-Path: + - "/api/apikey/login" + X-Nonce: + - M2ViNThjNGMtZWE3Ny00ZmFkLWI1YTUtZDI5NWM0MGY1MjFl + X-Vercel-Cache: + - MISS + X-Vercel-Id: + - cle1::iad1::w4qxq-1770224591237-18a9c1334bce + body: + encoding: UTF-8 + string: '{"org_info":[{"id":"5d7c97d7-fef1-4cb7-bda6-7e3756a0ca8e","name":"braintrustdata.com","api_url":"https://staging-api.braintrust.dev","git_metadata":{"fields":["commit","branch","tag","author_name","author_email","commit_message","commit_time","dirty"],"collect":"some"},"is_universal_api":true,"proxy_url":"https://staging-api.braintrust.dev","realtime_url":"wss://realtime.braintrustapi.com"}]}' + recorded_at: Wed, 04 Feb 2026 17:03:11 GMT +- request: + method: post + uri: https://www.braintrust.dev/api/dataset/register + body: + encoding: UTF-8 + string: '{"dataset_name":"test-ruby-sdk-dataset-origin","org_id":"5d7c97d7-fef1-4cb7-bda6-7e3756a0ca8e","project_name":"ruby-sdk-test","description":"Test + dataset for origin integration"}' + headers: + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + Host: + - www.braintrust.dev + Content-Type: + - application/json + Authorization: + - Bearer + response: + status: + code: 200 + message: OK + headers: + Cache-Control: + - public, max-age=0, must-revalidate + Content-Length: + - '650' + Content-Security-Policy: + - 'script-src ''self'' ''unsafe-eval'' ''wasm-unsafe-eval'' ''strict-dynamic'' + ''nonce-MGFiNDY2NTgtMjcyYi00YTQzLTliMGYtOGMyYzk4MmUzOWFk'' *.js.stripe.com + js.stripe.com maps.googleapis.com ; style-src ''self'' ''unsafe-inline'' *.braintrust.dev + btcm6qilbbhv4yi1.public.blob.vercel-storage.com fonts.googleapis.com www.gstatic.com + d4tuoctqmanu0.cloudfront.net; font-src ''self'' data: fonts.gstatic.com 
btcm6qilbbhv4yi1.public.blob.vercel-storage.com + cdn.jsdelivr.net d4tuoctqmanu0.cloudfront.net fonts.googleapis.com mintlify-assets.b-cdn.net + fonts.cdnfonts.com; object-src ''none''; base-uri ''self''; form-action ''self''; + frame-ancestors ''self''; worker-src ''self'' blob:; report-uri https://o4507221741076480.ingest.us.sentry.io/api/4507221754380288/security/?sentry_key=27fa5ac907cf7c6ce4a1ab2a03f805b4&sentry_environment=production&sentry_release=16; + report-to csp-endpoint-0' + Content-Type: + - application/json; charset=utf-8 + Date: + - Wed, 04 Feb 2026 17:03:11 GMT + Etag: + - '"la06xqsiuui2"' + Reporting-Endpoints: + - csp-endpoint-0="https://o4507221741076480.ingest.us.sentry.io/api/4507221754380288/security/?sentry_key=27fa5ac907cf7c6ce4a1ab2a03f805b4&sentry_environment=production&sentry_release=16" + Server: + - Vercel + Strict-Transport-Security: + - max-age=63072000 + X-Clerk-Auth-Message: + - Invalid JWT form. A JWT consists of three parts separated by dots. (reason=token-invalid, + token-carrier=header) + X-Clerk-Auth-Reason: + - token-invalid + X-Clerk-Auth-Status: + - signed-out + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - SAMEORIGIN + X-Matched-Path: + - "/api/dataset/register" + X-Nonce: + - MGFiNDY2NTgtMjcyYi00YTQzLTliMGYtOGMyYzk4MmUzOWFk + X-Vercel-Cache: + - MISS + X-Vercel-Id: + - cle1::iad1::grckn-1770224591416-32f37c83293e + body: + encoding: UTF-8 + string: '{"project":{"id":"ac86d18e-af78-4caf-918d-b89ad108ec67","org_id":"5d7c97d7-fef1-4cb7-bda6-7e3756a0ca8e","name":"ruby-sdk-test","description":null,"created":"2025-10-22T03:32:12.324Z","deleted_at":null,"user_id":"f2ddc4e6-a51a-4a60-9734-9af4ea05c6ef","settings":null},"dataset":{"id":"4f9ef62b-44a4-40c4-9a83-7cdb791e2876","project_id":"ac86d18e-af78-4caf-918d-b89ad108ec67","name":"test-ruby-sdk-dataset-origin","description":"Test + dataset for origin 
integration","created":"2026-02-04T17:03:11.480Z","deleted_at":null,"user_id":"c755328d-f64a-4737-a984-e83c088cd9f7","metadata":null,"url_slug":"test-ruby-sdk-dataset-origin"},"found_existing":false}' + recorded_at: Wed, 04 Feb 2026 17:03:11 GMT +- request: + method: post + uri: https://staging-api.braintrust.dev/v1/dataset/4f9ef62b-44a4-40c4-9a83-7cdb791e2876/insert + body: + encoding: UTF-8 + string: '{"events":[{"input":"origin-test","expected":"ORIGIN-TEST"}]}' + headers: + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + Host: + - staging-api.braintrust.dev + Content-Type: + - application/json + Authorization: + - Bearer + response: + status: + code: 200 + message: OK + headers: + Content-Type: + - application/json; charset=utf-8 + Content-Length: + - '52' + Connection: + - keep-alive + X-Amz-Cf-Pop: + - CMH68-P1 + - CMH68-P1 + Date: + - Wed, 04 Feb 2026 17:03:12 GMT + Access-Control-Allow-Credentials: + - 'true' + X-Amzn-Requestid: + - 1352c1d1-cd55-4a54-a4f7-fcf6238d51e2 + X-Bt-Internal-Trace-Id: + - 69837bcf0000000031efc9a2201a1d98 + X-Amz-Apigw-Id: + - YRBIhHF2oAMEbng= + Vary: + - Origin, Accept-Encoding + Etag: + - W/"34-53NSuFcAopa3dx2Efipnj4vEFUc" + Access-Control-Expose-Headers: + - x-bt-cursor,x-bt-found-existing,x-bt-query-plan + X-Amzn-Trace-Id: + - Root=1-69837bcf-1c08c98a647c0e0606c79a8f;Parent=2647ff1a973e0bcb;Sampled=0;Lineage=1:fc3b4ff1:0 + Via: + - 1.1 1aba603d822d5b3ffcc843f252edb6ea.cloudfront.net (CloudFront), 1.1 7d742df65452f74d1ef6daa93f595db8.cloudfront.net + (CloudFront) + X-Cache: + - Miss from cloudfront + X-Amz-Cf-Id: + - pXzpD-NZ1dSXTs1PKcYpzoKk3dB9tjjIpi3rIza9JKN3vfr2v7gzMQ== + body: + encoding: ASCII-8BIT + string: '{"row_ids":["10cc8e51-4280-4f4b-8951-e2bfd0617e3c"]}' + recorded_at: Wed, 04 Feb 2026 17:03:12 GMT +- request: + method: get + uri: https://staging-api.braintrust.dev/v1/dataset?dataset_name=test-ruby-sdk-dataset-origin&project_name=ruby-sdk-test + body: + 
encoding: US-ASCII + string: '' + headers: + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + Host: + - staging-api.braintrust.dev + Authorization: + - Bearer + response: + status: + code: 200 + message: OK + headers: + Content-Type: + - application/json; charset=utf-8 + Content-Length: + - '363' + Connection: + - keep-alive + X-Amz-Cf-Pop: + - CMH68-P1 + - CMH68-P1 + Date: + - Wed, 04 Feb 2026 17:03:12 GMT + Access-Control-Allow-Credentials: + - 'true' + X-Amzn-Requestid: + - 443bf5a8-1109-4002-a082-a7fd04d0b613 + X-Bt-Internal-Trace-Id: + - 69837bd0000000005567a266111c60a5 + X-Amz-Apigw-Id: + - YRBIlGHjoAMEbgg= + Vary: + - Origin, Accept-Encoding + Etag: + - W/"16b-BK+mmoQKsxWNKLSUq3Nx7g8XBig" + Access-Control-Expose-Headers: + - x-bt-cursor,x-bt-found-existing,x-bt-query-plan + X-Amzn-Trace-Id: + - Root=1-69837bd0-136b81a0023cce093f7f1a21;Parent=5600aaf71154ee3a;Sampled=0;Lineage=1:fc3b4ff1:0 + Via: + - 1.1 1aba603d822d5b3ffcc843f252edb6ea.cloudfront.net (CloudFront), 1.1 e4115573bd297fb3424a2ffc8114fa1c.cloudfront.net + (CloudFront) + X-Cache: + - Miss from cloudfront + X-Amz-Cf-Id: + - hoFTcuFhYlA6sMPFQoBAYzMoJP5RGsErG37tOw_uOhwbEp_WBIKRsg== + body: + encoding: ASCII-8BIT + string: '{"objects":[{"id":"4f9ef62b-44a4-40c4-9a83-7cdb791e2876","project_id":"ac86d18e-af78-4caf-918d-b89ad108ec67","name":"test-ruby-sdk-dataset-origin","description":"Test + dataset for origin integration","created":"2026-02-04T17:03:11.480Z","deleted_at":null,"user_id":"c755328d-f64a-4737-a984-e83c088cd9f7","metadata":null,"url_slug":"test-ruby-sdk-dataset-origin"}]}' + recorded_at: Wed, 04 Feb 2026 17:03:12 GMT +- request: + method: post + uri: https://staging-api.braintrust.dev/btql + body: + encoding: UTF-8 + string: '{"query":{"from":{"op":"function","name":{"op":"ident","name":["dataset"]},"args":[{"op":"literal","value":"4f9ef62b-44a4-40c4-9a83-7cdb791e2876"}]},"select":[{"op":"star"}],"limit":1000},"fmt":"jsonl"}' + 
headers: + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + Host: + - staging-api.braintrust.dev + Content-Type: + - application/json + Authorization: + - Bearer + response: + status: + code: 200 + message: OK + headers: + Content-Type: + - application/json + Content-Length: + - '634' + Connection: + - keep-alive + X-Amz-Cf-Pop: + - CMH68-P1 + - CMH68-P1 + Date: + - Wed, 04 Feb 2026 17:03:12 GMT + Access-Control-Allow-Credentials: + - 'true' + X-Amzn-Requestid: + - 719ce2c3-83eb-4fc7-ad42-f937e28a713e + X-Bt-Internal-Trace-Id: + - 69837bd000000000018003f7ecdcaf3d + X-Amz-Apigw-Id: + - YRBImGJNIAMEJuQ= + Vary: + - Origin + Access-Control-Expose-Headers: + - x-bt-cursor,x-bt-found-existing,x-bt-query-plan + X-Amzn-Trace-Id: + - Root=1-69837bd0-31356f9930877c170fe4e39f;Parent=04996d2505832fac;Sampled=0;Lineage=1:fc3b4ff1:0 + X-Bt-Cursor: + - aYN7z+qKAAA + Via: + - 1.1 1aba603d822d5b3ffcc843f252edb6ea.cloudfront.net (CloudFront), 1.1 ea4a33625617615e13496b292edda6d6.cloudfront.net + (CloudFront) + X-Cache: + - Miss from cloudfront + X-Amz-Cf-Id: + - D0TlN1KhaS9dVmGffAq7K4nN-ju8zSLZ32nGQ8OJ6Vwo0Tm247BDnQ== + body: + encoding: ASCII-8BIT + string: '{"_pagination_key":"p07603056728854888448","_xact_id":"1000196605691816586","audit_data":[{"_xact_id":"1000196605691816586","audit_data":{"action":"upsert"},"metadata":{},"source":"api"}],"classifications":null,"comments":null,"created":"2026-02-04T17:03:11.751Z","dataset_id":"4f9ef62b-44a4-40c4-9a83-7cdb791e2876","expected":"ORIGIN-TEST","facets":null,"id":"10cc8e51-4280-4f4b-8951-e2bfd0617e3c","input":"origin-test","is_root":true,"metadata":null,"origin":null,"project_id":"ac86d18e-af78-4caf-918d-b89ad108ec67","root_span_id":"a588c208-c631-4d0f-bed9-dc732b1fffb5","span_id":"a588c208-c631-4d0f-bed9-dc732b1fffb5","tags":null} + + ' + recorded_at: Wed, 04 Feb 2026 17:03:12 GMT +- request: + method: post + uri: https://staging-api.braintrust.dev/btql + body: + encoding: 
UTF-8 + string: '{"query":{"from":{"op":"function","name":{"op":"ident","name":["dataset"]},"args":[{"op":"literal","value":"4f9ef62b-44a4-40c4-9a83-7cdb791e2876"}]},"select":[{"op":"star"}],"limit":1000,"cursor":"aYN7z+qKAAA"},"fmt":"jsonl"}' + headers: + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + Host: + - staging-api.braintrust.dev + Content-Type: + - application/json + Authorization: + - Bearer + response: + status: + code: 200 + message: OK + headers: + Content-Type: + - application/json + Content-Length: + - '0' + Connection: + - keep-alive + X-Amz-Cf-Pop: + - CMH68-P1 + - CMH68-P1 + Date: + - Wed, 04 Feb 2026 17:03:12 GMT + Access-Control-Allow-Credentials: + - 'true' + X-Amzn-Requestid: + - 8d3db6e4-20ea-489e-ab35-a8557d5334e0 + X-Bt-Internal-Trace-Id: + - 69837bd00000000014d92a3d98c91123 + X-Amz-Apigw-Id: + - YRBIoFtHoAMEF3g= + Vary: + - Origin + Access-Control-Expose-Headers: + - x-bt-cursor,x-bt-found-existing,x-bt-query-plan + X-Amzn-Trace-Id: + - Root=1-69837bd0-44b5818373116fd553cba2ab;Parent=68218d2017a38677;Sampled=0;Lineage=1:fc3b4ff1:0 + Via: + - 1.1 1f25a64e755f195dbccfdacb5a82a7ce.cloudfront.net (CloudFront), 1.1 597391769ad998307dcc74a3c790e7c6.cloudfront.net + (CloudFront) + X-Cache: + - Miss from cloudfront + X-Amz-Cf-Id: + - rfpLJ7Q-o_25h7HnGu3UeCLtUStUAf2pmr6owcZ7J3OuwutPdSR8nQ== + body: + encoding: ASCII-8BIT + string: '' + recorded_at: Wed, 04 Feb 2026 17:03:12 GMT +- request: + method: post + uri: https://staging-api.braintrust.dev/v1/project + body: + encoding: UTF-8 + string: '{"name":"ruby-sdk-test"}' + headers: + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + Host: + - staging-api.braintrust.dev + Content-Type: + - application/json + Authorization: + - Bearer + response: + status: + code: 200 + message: OK + headers: + Content-Type: + - application/json; charset=utf-8 + Content-Length: + - '255' + Connection: + - 
keep-alive + X-Amz-Cf-Pop: + - CMH68-P1 + - CMH68-P1 + Date: + - Wed, 04 Feb 2026 17:03:12 GMT + Access-Control-Allow-Credentials: + - 'true' + X-Amzn-Requestid: + - af677481-4552-4ca7-83ca-4323e90b989f + X-Bt-Internal-Trace-Id: + - 69837bd0000000005ff77c9219450725 + X-Amz-Apigw-Id: + - YRBIpF6WoAMEdsw= + Vary: + - Origin, Accept-Encoding + Etag: + - W/"ff-6vJBGKzQtkVK0GxiGBpsJc6l6u8" + Access-Control-Expose-Headers: + - x-bt-cursor,x-bt-found-existing,x-bt-query-plan + X-Amzn-Trace-Id: + - Root=1-69837bd0-64684dc01db6267b16e40fa1;Parent=62fbb94f36a39c05;Sampled=0;Lineage=1:fc3b4ff1:0 + X-Bt-Found-Existing: + - 'true' + Via: + - 1.1 1f25a64e755f195dbccfdacb5a82a7ce.cloudfront.net (CloudFront), 1.1 6600f36fdbb63d37961eb0d99869f3fa.cloudfront.net + (CloudFront) + X-Cache: + - Miss from cloudfront + X-Amz-Cf-Id: + - sy6pqKcZ-yyLL_S43RltOszkr9V7_a9fHYymsTvSXewU7-zF6cGJfA== + body: + encoding: ASCII-8BIT + string: '{"id":"ac86d18e-af78-4caf-918d-b89ad108ec67","org_id":"5d7c97d7-fef1-4cb7-bda6-7e3756a0ca8e","name":"ruby-sdk-test","description":null,"created":"2025-10-22T03:32:12.324Z","deleted_at":null,"user_id":"f2ddc4e6-a51a-4a60-9734-9af4ea05c6ef","settings":null}' + recorded_at: Wed, 04 Feb 2026 17:03:12 GMT +- request: + method: post + uri: https://staging-api.braintrust.dev/v1/experiment + body: + encoding: UTF-8 + string: '{"project_id":"ac86d18e-af78-4caf-918d-b89ad108ec67","name":"test-ruby-sdk-exp-origin","ensure_new":true}' + headers: + Accept-Encoding: + - gzip;q=1.0,deflate;q=0.6,identity;q=0.3 + Accept: + - "*/*" + User-Agent: + - Ruby + Host: + - staging-api.braintrust.dev + Content-Type: + - application/json + Authorization: + - Bearer + response: + status: + code: 200 + message: OK + headers: + Content-Type: + - application/json; charset=utf-8 + Content-Length: + - '388' + Connection: + - keep-alive + X-Amz-Cf-Pop: + - CMH68-P1 + - CMH68-P1 + Date: + - Wed, 04 Feb 2026 17:03:12 GMT + Access-Control-Allow-Credentials: + - 'true' + X-Amzn-Requestid: + - 
e24e5226-95bd-4b29-a859-32b949fca72c + X-Bt-Internal-Trace-Id: + - 69837bd000000000127ab07cc0170707 + X-Amz-Apigw-Id: + - YRBIrHMTIAMEevQ= + Vary: + - Origin, Accept-Encoding + Etag: + - W/"184-nGHxQXw/TAVG+jG26cSGPF3M/hY" + Access-Control-Expose-Headers: + - x-bt-cursor,x-bt-found-existing,x-bt-query-plan + X-Amzn-Trace-Id: + - Root=1-69837bd0-4b35edb40fdc5d4c342db63d;Parent=75e6651e0b9f01cf;Sampled=0;Lineage=1:fc3b4ff1:0 + Via: + - 1.1 1aba603d822d5b3ffcc843f252edb6ea.cloudfront.net (CloudFront), 1.1 ff8e36e5267cb39e0ce8c3df049957a6.cloudfront.net + (CloudFront) + X-Cache: + - Miss from cloudfront + X-Amz-Cf-Id: + - CWfxpLUMlJuzi9JxGu0T39DoJquyPIjuBsmTG3ANVCAjKrDo-jMKqQ== + body: + encoding: ASCII-8BIT + string: '{"id":"d4663a0c-92ee-4801-8abe-8049f7dae7be","project_id":"ac86d18e-af78-4caf-918d-b89ad108ec67","name":"test-ruby-sdk-exp-origin","description":null,"created":"2026-02-04T17:03:12.838Z","repo_info":null,"commit":null,"base_exp_id":null,"deleted_at":null,"dataset_id":null,"dataset_version":null,"public":false,"user_id":"c755328d-f64a-4737-a984-e83c088cd9f7","metadata":null,"tags":null}' + recorded_at: Wed, 04 Feb 2026 17:03:12 GMT +recorded_with: VCR 6.4.0 From fb0b8b4a34696ea13998c2fb29c4c5e9e90b4e05 Mon Sep 17 00:00:00 2001 From: David Elner Date: Wed, 4 Feb 2026 16:58:20 -0500 Subject: [PATCH 2/2] Fixed: Readme for Evals (inaccurate example) --- README.md | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index ebea5f7..c74b010 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ This is the official Ruby SDK for [Braintrust](https://www.braintrust.dev), for - [Viewing traces](#viewing-traces) - [Evals](#evals) - [Datasets](#datasets) - - [Remote scorers](#remote-scorers) + - [Scorers](#scorers) - [Documentation](#documentation) - [Troubleshooting](#troubleshooting) - [Contributing](#contributing) @@ -260,7 +260,7 @@ Braintrust::Eval.run( ### Datasets -Load test cases 
from a Braintrust dataset: +Use test cases from a Braintrust dataset: ```ruby Braintrust::Eval.run( @@ -271,7 +271,22 @@ Braintrust::Eval.run( ) ``` -### Remote scorers +Or define test cases inline with metadata and tags: + +```ruby +Braintrust::Eval.run( + project: "my-project", + experiment: "classifier-v1", + cases: [ + {input: "apple", expected: "fruit", tags: ["produce"], metadata: {difficulty: "easy"}}, + {input: "salmon", expected: "protein", tags: ["seafood"], metadata: {difficulty: "medium"}} + ], + task: ->(input) { classify(input) }, + scorers: [...] +) +``` + +### Scorers Use scoring functions defined in Braintrust: @@ -281,7 +296,22 @@ Braintrust::Eval.run( cases: [...], task: ->(input) { ... }, scorers: [ - Braintrust::Scorer.remote("my-project", "accuracy-scorer") + Braintrust::Eval::Functions.scorer(project: "my-project", slug: "accuracy-scorer") + ] +) +``` + +Or define scorers inline with `Eval.scorer`: + +```ruby +Braintrust::Eval.run( + project: "my-project", + cases: [...], + task: ->(input) { ... }, + scorers: [ + Braintrust::Eval.scorer("exact_match") do |input, expected, output| + output == expected ? 1.0 : 0.0 + end ] ) ```