From 1314b2b1eee201d0932e756cae28dafeb6787e69 Mon Sep 17 00:00:00 2001
From: sanjay singh <sanjay.singh360@gmail.com>
Date: Fri, 15 May 2026 21:40:40 +0200
Subject: [PATCH 1/3] chore: upgrade model from gpt-4o-2024-11-20 to gpt-4.1

---
 agents/tech-trends-agent.json | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/agents/tech-trends-agent.json b/agents/tech-trends-agent.json
index 1bb67ce..96db2b9 100644
--- a/agents/tech-trends-agent.json
+++ b/agents/tech-trends-agent.json
@@ -5,7 +5,9 @@
     "model": "${GPT_DEPLOYMENT}",
     "instructions_file": "prompts/tech-trends-agent.md",
     "tools": [
-      { "type": "code_interpreter" }
+      {
+        "type": "code_interpreter"
+      }
     ]
   },
   "eval": {
@@ -14,6 +16,17 @@
     "config": "evals/eval-config.json"
   },
   "_model_history": [
-    { "model": "gpt-4o-2024-11-20", "from": "2025-01-10", "to": null, "reason": "initial" }
+    {
+      "model": "gpt-4o-2024-11-20",
+      "from": "2025-01-10",
+      "to": "2026-05-15",
+      "reason": "initial"
+    },
+    {
+      "model": "gpt-4.1",
+      "from": "2026-05-15",
+      "to": null,
+      "reason": "quality improvement, eval gated"
+    }
   ]
 }

From 7265240571ff82044f92e8f2b5edd8d3c3d3cf4e Mon Sep 17 00:00:00 2001
From: sanjay singh <sanjay.singh360@gmail.com>
Date: Fri, 15 May 2026 21:42:51 +0200
Subject: [PATCH 2/3] fix: sanitize response_preview for GITHUB_OUTPUT format

Strip newlines from output preview before writing to GITHUB_OUTPUT,
as multiline values break the key=value format.
---
 .github/workflows/evaluate.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml
index 3e5dbf5..73334ab 100644
--- a/.github/workflows/evaluate.yml
+++ b/.github/workflows/evaluate.yml
@@ -114,9 +114,10 @@ jobs:
           # Write smoke test result for downstream steps
           gh_output = os.environ.get("GITHUB_OUTPUT", "")
           if gh_output:
+              preview = output[:200].replace('\n', ' ').replace('\r', '')
               with open(gh_output, "a") as f:
                   f.write(f"response_length={len(output)}\n")
-                  f.write(f"response_preview={output[:200]}\n")
+                  f.write(f"response_preview={preview}\n")
           EOF
 
       - name: Run Foundry evaluation

From fe20b5c87e4d24eccf0db811f8b18d089154df95 Mon Sep 17 00:00:00 2001
From: sanjay singh <sanjay.singh360@gmail.com>
Date: Fri, 15 May 2026 21:48:23 +0200
Subject: [PATCH 3/3] feat: implement create-once evaluation pattern

Replace microsoft/ai-agent-evals action with custom run_evaluation.py
that creates one evaluation per agent and adds runs on each pipeline
execution. Run names encode branch/commit for traceability.

- First run: creates evaluation named '{agent-name}-eval'
- Subsequent runs: reuses existing evaluation, adds new run
- Run name format: '{branch}/{commit-sha}'
---
 .github/workflows/evaluate.yml |  17 +--
 scripts/run_evaluation.py      | 208 +++++++++++++++++++++++++++++++++
 2 files changed, 218 insertions(+), 7 deletions(-)
 create mode 100644 scripts/run_evaluation.py

diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml
index 73334ab..735225d 100644
--- a/.github/workflows/evaluate.yml
+++ b/.github/workflows/evaluate.yml
@@ -122,13 +122,23 @@ jobs:
 
       - name: Run Foundry evaluation
         id: eval
-        uses: microsoft/ai-agent-evals@v3-beta
-        with:
-          azure-ai-project-endpoint: ${{ vars.FOUNDRY_TEST_ENDPOINT }}
-          deployment-name:           ${{ vars.GPT_DEPLOYMENT }}
-          agent-ids:                 "tech-trends-agent:${{ steps.deploy.outputs.agent_version }}"
-          data-path:                 "./evals/golden-dataset.json"
-          evaluation-result-view:    "all-scores"
+        # Expression values reach the script through env vars rather than
+        # being interpolated into the shell text. github.head_ref is a branch
+        # name chosen by the PR author, and expressions are expanded before
+        # the shell parses the script, so inlining it allows command injection.
+        env:
+          FOUNDRY_TEST_ENDPOINT: ${{ vars.FOUNDRY_TEST_ENDPOINT }}
+          GPT_DEPLOYMENT:        ${{ vars.GPT_DEPLOYMENT }}
+          AGENT_VERSION:         ${{ steps.deploy.outputs.agent_version }}
+          COMMIT_SHA:            ${{ github.sha }}
+          BRANCH_NAME:           ${{ github.head_ref || github.ref_name }}
+        run: |
+          python scripts/run_evaluation.py \
+            --agent-name tech-trends-agent \
+            --agent-version "$AGENT_VERSION" \
+            --data-path "./evals/golden-dataset.json" \
+            --commit-sha "$COMMIT_SHA" \
+            --branch "$BRANCH_NAME"
 
       - name: Post evaluation summary to PR
         if: always()
diff --git a/scripts/run_evaluation.py b/scripts/run_evaluation.py
new file mode 100644
index 0000000..7564bcf
--- /dev/null
+++ b/scripts/run_evaluation.py
@@ -0,0 +1,280 @@
+"""Run agent evaluation with create-once, run-many pattern.
+
+On first run: creates an evaluation named after the agent.
+On subsequent runs: reuses the existing evaluation and adds a new run.
+Run name encodes the commit SHA and branch for traceability.
+
+Usage:
+    python scripts/run_evaluation.py \
+        --agent-name tech-trends-agent \
+        --agent-version 16 \
+        --data-path evals/golden-dataset.json \
+        --commit-sha abc1234 \
+        --branch feature/my-branch
+"""
+
+import argparse
+import json
+import os
+import time
+
+from azure.ai.projects import AIProjectClient
+from azure.identity import DefaultAzureCredential
+from openai.types.eval_create_params import DataSourceConfigCustom
+
+
+POLLING_INTERVAL = 5
+
+# Run statuses at which wait_for_run stops polling.
+TERMINAL_RUN_STATUSES = ("completed", "failed", "canceled")
+
+
+def find_existing_eval(openai_client, eval_name: str):
+    """Return the evaluation whose name equals ``eval_name``, or ``None``.
+
+    The list result is iterated directly rather than through ``.data``:
+    ``.data`` holds only the first page (at most ``limit`` items), whereas
+    iterating the page object fetches follow-up pages as needed. An
+    evaluation beyond the first 100 entries is therefore still found rather
+    than being re-created under the same name.
+    """
+    for eval_obj in openai_client.evals.list(order="desc", limit=100):
+        if eval_obj.name == eval_name:
+            return eval_obj
+    return None
+
+
+def build_testing_criteria(evaluators: list, deployment_name: str) -> list:
+    """Build one ``azure_ai_evaluator`` testing criterion per evaluator.
+
+    Args:
+        evaluators: Evaluator identifiers; one criterion is built for each.
+        deployment_name: Value passed to every evaluator as its
+            ``deployment_name`` initialization parameter.
+
+    Returns:
+        Criterion dicts in the same order as ``evaluators``. Each display
+        name is the part of the identifier after its last ``.`` (the whole
+        identifier when it contains no dot).
+    """
+    return [
+        {
+            "type": "azure_ai_evaluator",
+            "name": evaluator_name.rsplit(".", 1)[-1],
+            "evaluator_name": evaluator_name,
+            "initialization_parameters": {
+                "deployment_name": deployment_name,
+            },
+            "data_mapping": {
+                "response": "{{sample.output_text}}",
+                "query": "{{item.query}}",
+                "ground_truth": "{{item.ground_truth}}",
+                "tool_calls": "{{sample.tool_calls}}",
+                "tool_definitions": "{{sample.tool_definitions}}",
+            },
+        }
+        for evaluator_name in evaluators
+    ]
+
+
+def create_evaluation(openai_client, eval_name: str, evaluators: list, deployment_name: str):
+    """Create a new evaluation named ``eval_name`` and return it.
+
+    The data source is a custom config whose item schema requires a string
+    ``query`` field and which includes the sample schema; the testing
+    criteria are built from ``evaluators`` and ``deployment_name``.
+    """
+    # NOTE(review): the criteria's data_mapping also reads item.ground_truth,
+    # which this item schema does not declare - confirm that is intended.
+    data_source_config = DataSourceConfigCustom(
+        type="custom",
+        item_schema={
+            "type": "object",
+            "properties": {"query": {"type": "string"}},
+            "required": ["query"],
+        },
+        include_sample_schema=True,
+    )
+
+    eval_obj = openai_client.evals.create(
+        name=eval_name,
+        data_source_config=data_source_config,
+        testing_criteria=build_testing_criteria(evaluators, deployment_name),
+    )
+    print(f"Created new evaluation: {eval_obj.name} (id: {eval_obj.id})")
+    return eval_obj
+
+
+def create_eval_run(openai_client, project_client, eval_id: str, run_name: str,
+                    agent_name: str, agent_version: str, data_path: str):
+    """Upload the dataset and start a run against an existing evaluation.
+
+    Args:
+        openai_client: Client used to create the evaluation run.
+        project_client: Project client used to upload the dataset file.
+        eval_id: ID of the evaluation the run is attached to.
+        run_name: Name given to the run.
+        agent_name: Name of the target agent; also prefixes the dataset name.
+        agent_version: Version of the target agent.
+        data_path: Path to the JSON dataset, converted to JSONL for upload.
+
+    Returns:
+        The run object returned by ``evals.runs.create``.
+    """
+    # Upload dataset, versioned by the current Unix time in whole seconds.
+    jsonl_path = convert_to_jsonl(data_path)
+    dataset = project_client.datasets.upload_file(
+        name=f"{agent_name}-eval-data",
+        version=str(int(time.time())),
+        file_path=jsonl_path,
+    )
+    print(f"Uploaded dataset: {dataset.name} (version: {dataset.version})")
+
+    # Each dataset item's query is sent to the agent as a single user message.
+    data_source = {
+        "type": "azure_ai_target_completions",
+        "source": {
+            "type": "file_id",
+            "id": dataset.id,
+        },
+        "input_messages": {
+            "type": "template",
+            "template": [
+                {"type": "message", "role": "user", "content": "{{item.query}}"}
+            ],
+        },
+        "target": {
+            "type": "azure_ai_agent",
+            "name": agent_name,
+            "version": agent_version,
+        },
+    }
+
+    eval_run = openai_client.evals.runs.create(
+        eval_id=eval_id,
+        name=run_name,
+        data_source=data_source,
+    )
+    print(f"Created evaluation run: {eval_run.id} (name: {run_name})")
+    return eval_run
+
+
+def convert_to_jsonl(data_path: str) -> str:
+    """Write the dataset's ``data`` items as JSONL next to the input file.
+
+    Args:
+        data_path: Path to a JSON file whose top-level object has a ``data``
+            list.
+
+    Returns:
+        Path of the written file: ``data_path`` with its final extension
+        replaced by ``.jsonl``.
+
+    Raises:
+        ValueError: If ``data_path`` already ends in ``.jsonl``, because the
+            output would overwrite the input.
+    """
+    # Swap only the trailing extension. str.replace(".json", ".jsonl") would
+    # rewrite every occurrence in the path (directory names included) and,
+    # for a path without ".json", return the input path itself.
+    root, _ = os.path.splitext(data_path)
+    jsonl_path = root + ".jsonl"
+    if jsonl_path == data_path:
+        raise ValueError(f"Dataset is already a .jsonl file: {data_path}")
+
+    with open(data_path, encoding="utf-8") as f:
+        data = json.load(f)
+
+    with open(jsonl_path, "w", encoding="utf-8") as f:
+        f.writelines(json.dumps(item) + "\n" for item in data["data"])
+
+    return jsonl_path
+
+
+def wait_for_run(openai_client, eval_id: str, run_id: str):
+    """Poll the run every ``POLLING_INTERVAL`` seconds until it is terminal.
+
+    Returns the last retrieved run object, whose status is one of
+    ``TERMINAL_RUN_STATUSES``. ``canceled`` is treated as terminal so that a
+    canceled run ends the wait instead of being polled indefinitely.
+    """
+    print("Waiting for evaluation run to complete...")
+    while True:
+        run = openai_client.evals.runs.retrieve(run_id=run_id, eval_id=eval_id)
+        if run.status in TERMINAL_RUN_STATUSES:
+            print(f"Run finished with status: {run.status}")
+            return run
+        time.sleep(POLLING_INTERVAL)
+
+
+def main():
+    """Find or create the agent's evaluation, run it, and report the result.
+
+    Reads ``FOUNDRY_TEST_ENDPOINT`` and ``GPT_DEPLOYMENT`` from the
+    environment. When ``GITHUB_OUTPUT`` is set, appends the evaluation ID,
+    run ID, run status and report URL to that file. Exits with status 1
+    unless the run finished as ``completed``.
+    """
+    parser = argparse.ArgumentParser(description="Run agent evaluation (create-once pattern)")
+    parser.add_argument("--agent-name", required=True, help="Agent name")
+    parser.add_argument("--agent-version", required=True, help="Agent version")
+    parser.add_argument("--data-path", required=True, help="Path to golden dataset JSON")
+    parser.add_argument("--commit-sha", required=True, help="Git commit SHA (short)")
+    parser.add_argument("--branch", required=True, help="Git branch name")
+    args = parser.parse_args()
+
+    endpoint = os.environ["FOUNDRY_TEST_ENDPOINT"]
+    deployment_name = os.environ["GPT_DEPLOYMENT"]
+
+    credential = DefaultAzureCredential()
+    project_client = AIProjectClient(endpoint=endpoint, credential=credential)
+    openai_client = project_client.get_openai_client()
+
+    # Load evaluators from dataset file; they are only used when the
+    # evaluation has to be created, an existing evaluation is reused as-is.
+    with open(args.data_path, encoding="utf-8") as f:
+        input_data = json.load(f)
+    evaluators = input_data.get("evaluators", [])
+
+    # Evaluation name is based on agent name (stable across runs)
+    eval_name = f"{args.agent_name}-eval"
+
+    # Find or create the evaluation
+    eval_obj = find_existing_eval(openai_client, eval_name)
+    if eval_obj:
+        print(f"Found existing evaluation: {eval_obj.name} (id: {eval_obj.id})")
+    else:
+        print(f"No evaluation found with name '{eval_name}', creating new one...")
+        eval_obj = create_evaluation(openai_client, eval_name, evaluators, deployment_name)
+
+    # Run name encodes commit and branch for traceability
+    run_name = f"{args.branch}/{args.commit_sha}"
+
+    # Create and wait for the evaluation run
+    eval_run = create_eval_run(
+        openai_client, project_client, eval_obj.id, run_name,
+        args.agent_name, args.agent_version, args.data_path,
+    )
+    completed_run = wait_for_run(openai_client, eval_obj.id, eval_run.id)
+
+    # Output results for GitHub Actions
+    gh_output = os.environ.get("GITHUB_OUTPUT", "")
+    if gh_output:
+        # report_url may be absent or None; write an empty value either way.
+        report_url = getattr(completed_run, "report_url", None) or ""
+        with open(gh_output, "a") as f:
+            f.write(f"eval_id={eval_obj.id}\n")
+            f.write(f"eval_run_id={completed_run.id}\n")
+            f.write(f"eval_run_status={completed_run.status}\n")
+            f.write(f"eval_report_url={report_url}\n")
+
+    # Any terminal status other than "completed" must fail the job.
+    if completed_run.status != "completed":
+        print(f"ERROR: Evaluation run {completed_run.status}")
+        raise SystemExit(1)
+
+    print(f"Evaluation complete. Run: {completed_run.id}")
+
+
+if __name__ == "__main__":
+    main()