87 changes: 87 additions & 0 deletions opik/otel_with_offline_eval_example/README.md
# OTel Tracing + Opik Offline Evaluation

Demonstrates how to use OpenTelemetry for application tracing **alongside**
Opik's offline evaluation workflow. These are complementary tools, not alternatives.

## Architecture

```
┌─────────────────────────────────────┐ ┌───────────────────────────────────────┐
│ llm_app.py │ │ evaluate.py │
│ (your application code) │ │ (offline evaluation script) │
│ │ │ │
│ - OTel for tracing │ │ - Opik Python SDK │
│ - Runs in production/dev │ │ - Runs separately (CI, scheduled) │
│ - Sends spans → Opik OTLP endpoint │ │ - Imports answer_question() from app │
│ │ │ - Runs task against a dataset │
│ │ │ - Creates an Experiment in Opik │
└─────────────────────────────────────┘ └───────────────────────────────────────┘
│ │
└──────────────────────┬───────────────────────┘
┌─────────────────────┐
│ Opik │
│ │
│ Traces (from OTel) │
│ Experiments (from │
│ evaluate()) │
└─────────────────────┘
```

## Key Insight

| | OTel tracing | `opik.evaluate()` |
|---|---|---|
| Purpose | Observe your app in real-time | Test quality offline against a dataset |
| When it runs | During application execution | In a separate evaluation script |
| What you get in Opik | Traces / Spans | Experiments with scored results |
| SDK | OTel SDK | Opik Python SDK (`pip install opik`) |

**There is no OTel-native way to run offline experiments.** The offline evaluation
workflow requires the Opik Python SDK because it needs to:
1. Read items from an Opik Dataset
2. Run your LLM task against each item
3. Score outputs with metrics
4. Record results as an Experiment in Opik
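
Conceptually, the four steps above boil down to the loop the Opik SDK runs for you. Here is a dependency-free sketch (the dataset, task, and metric are illustrative stand-ins, not Opik APIs):

```python
# Plain-Python sketch of the offline-evaluation loop that the Opik SDK
# performs. All names here are illustrative stand-ins.

def exact_match(output: str, reference: str) -> float:
    """Stand-in for an Opik metric such as Equals."""
    return 1.0 if output.strip() == reference.strip() else 0.0

def run_experiment(dataset, task, metric):
    results = []
    for item in dataset:                        # step 1: read dataset items
        task_output = task(item)                # step 2: run the LLM task
        score = metric(task_output["output"],   # step 3: score the output
                       task_output["reference"])
        results.append({**item, "score": score})
    return results                              # step 4: recorded as an Experiment

# Toy dataset and task mirroring the files in this example
dataset = [{"question": "What is the capital of France?", "expected": "Paris"}]
task = lambda item: {"output": "Paris", "reference": item["expected"]}
print(run_experiment(dataset, task, exact_match))
```

The real `evaluate()` adds concurrency, retries, and result upload, but the data flow is the same: dataset item in, task output out, metric score attached.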

## Setup

```bash
pip install opentelemetry-sdk opentelemetry-exporter-otlp-proto-http opik
```

Environment variables for `llm_app.py`:
```bash
export OPIK_OTLP_ENDPOINT="https://your-opik-host/opik/api/v1/private/otel/v1/traces"
export OPIK_API_KEY="your-api-key"
export OPIK_WORKSPACE="your-workspace"
```

Environment variables for `evaluate.py`:
```bash
export OPIK_API_KEY="your-api-key"
export OPIK_WORKSPACE="your-workspace"
export OPIK_URL_OVERRIDE="https://your-opik-host/opik/api"
```

## Running

```bash
# Run your application normally (OTel tracing active)
python llm_app.py

# Run offline evaluation separately
python evaluate.py
```

## Notes for self-hosted Opik

The OTel OTLP endpoint for self-hosted Opik is:
```
https://<your-opik-host>/opik/api/v1/private/otel/v1/traces
```

The Opik SDK base URL override for self-hosted is:
```
https://<your-opik-host>/opik/api
```
65 changes: 65 additions & 0 deletions opik/otel_with_offline_eval_example/evaluate.py
"""
FILE 2: evaluate.py
Separate offline evaluation script — uses the Opik Python SDK.
Import your application function directly and wrap it as an Opik task.

This does NOT replace OTel tracing in your app. It runs offline experiments
against a dataset and records scored results as an Experiment in Opik.

Setup:
pip install opik

Environment variables:
OPIK_API_KEY - your Opik API key
OPIK_WORKSPACE - your Opik workspace name
OPIK_URL_OVERRIDE - your self-hosted Opik URL
(e.g. https://your-opik-host/opik/api)
"""

import opik
from opik.evaluation.metrics import Equals, Hallucination
# Note: Hallucination is an LLM-judge metric — it requires an LLM provider to be
# configured (e.g. set OPENAI_API_KEY). Replace or remove it if you don't have one.

# Import the function from your application code.
# opik.evaluate() will call this function for each dataset item.
from llm_app import answer_question


# --- Step 1: Create (or get) a dataset ---

client = opik.Opik()

dataset = client.get_or_create_dataset("capital-cities-qa")
# insert() deduplicates identical items, so re-running this script
# won't create duplicate dataset rows.
dataset.insert([
    {"question": "What is the capital of France?", "expected": "Paris"},
    {"question": "What is the capital of Japan?", "expected": "Tokyo"},
    {"question": "What is the capital of Germany?", "expected": "Berlin"},
])


# --- Step 2: Define the evaluation task ---
# This is a thin wrapper that maps dataset item fields to your app function.
# The dict keys returned here are used by scoring metrics below.

def evaluation_task(dataset_item: dict) -> dict:
    answer = answer_question(dataset_item["question"])
    return {
        "output": answer,
        "reference": dataset_item["expected"],  # passed to Equals metric
        "input": dataset_item["question"],      # passed to Hallucination metric
    }


# --- Step 3: Run the experiment ---

opik.evaluate(
    dataset=dataset,
    task=evaluation_task,
    scoring_metrics=[
        Equals(name="exact_match"),
        Hallucination(name="hallucination"),
    ],
    experiment_name="capital-cities-baseline",
    project_name="mediatek-llm-eval",
)
79 changes: 79 additions & 0 deletions opik/otel_with_offline_eval_example/llm_app.py
"""
FILE 1: llm_app.py
Your application code — uses OpenTelemetry for tracing.
No Opik SDK required here.

This is the code that runs in production or dev environments.
OTel spans are exported to Opik via the OTLP endpoint.

Setup:
pip install opentelemetry-sdk opentelemetry-exporter-otlp-proto-http

Environment variables:
OPIK_OTLP_ENDPOINT - e.g. https://your-opik-host/opik/api/v1/private/otel/v1/traces
OPIK_API_KEY - your Opik API key
OPIK_WORKSPACE - your Opik workspace name
"""

import os
from opentelemetry import trace
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter


def setup_otel():
    """Configure OTel to export traces to Opik."""
    otlp_endpoint = os.environ["OPIK_OTLP_ENDPOINT"]
    api_key = os.environ["OPIK_API_KEY"]
    workspace = os.environ["OPIK_WORKSPACE"]

    exporter = OTLPSpanExporter(
        endpoint=otlp_endpoint,
        headers={
            # Opik expects the raw API key in the Authorization header
            # (no "Bearer " prefix) alongside the workspace name.
            "Authorization": api_key,
            "Comet-Workspace": workspace,
        },
    )

    provider = TracerProvider()
    provider.add_span_processor(BatchSpanProcessor(exporter))
    trace.set_tracer_provider(provider)


tracer = trace.get_tracer(__name__)


def call_llm(prompt: str) -> str:
    """
    Your LLM call — wrapped in an OTel span for tracing.
    Replace this with your actual model call (OpenAI, vLLM, etc.)
    """
    with tracer.start_as_current_span("llm_call") as span:
        span.set_attribute("input.prompt", prompt)

        # --- replace with your actual LLM call ---
        response = f"[LLM response to: {prompt}]"
        # -----------------------------------------

        span.set_attribute("output.response", response)
        return response


def answer_question(user_question: str) -> str:
    """
    Your application logic — also traced via OTel.
    This is the function you'll reuse in evaluate.py.
    """
    with tracer.start_as_current_span("answer_question") as span:
        span.set_attribute("input.question", user_question)
        answer = call_llm(user_question)
        span.set_attribute("output.answer", answer)
        return answer


if __name__ == "__main__":
    # Set up OTel when running as the application entry point
    setup_otel()
    result = answer_question("What is the capital of France?")
    print(result)