From aef45fd5b0cf521fbc697259d46d3ffa3bc41f78 Mon Sep 17 00:00:00 2001 From: sanjay singh Date: Fri, 15 May 2026 18:10:28 +0200 Subject: [PATCH 01/11] =?UTF-8?q?feat:=20Phase=202=20=E2=80=94=20add=20cod?= =?UTF-8?q?e=20interpreter=20for=20data=20analysis=20capability?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- agents/tech-trends-agent.json | 7 ++++--- evals/eval-config.json | 12 ++++++------ prompts/tech-trends-agent.md | 12 ++++++++++++ 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/agents/tech-trends-agent.json b/agents/tech-trends-agent.json index 9d6b181..efcf175 100644 --- a/agents/tech-trends-agent.json +++ b/agents/tech-trends-agent.json @@ -1,16 +1,17 @@ { "agent_name": "tech-trends-agent", - "phase": "1", + "phase": "2", "definition": { "model": "${GPT_DEPLOYMENT}", "instructions_file": "prompts/tech-trends-agent.md", "tools": [ - { "type": "web_search" } + { "type": "bing_grounding" }, + { "type": "code_interpreter" } ] }, "eval": { "dataset": "evals/golden-dataset.json", - "phase_filter": "1", + "phase_filter": null, "config": "evals/eval-config.json" }, "_model_history": [ diff --git a/evals/eval-config.json b/evals/eval-config.json index 5cafb0a..832031e 100644 --- a/evals/eval-config.json +++ b/evals/eval-config.json @@ -1,9 +1,9 @@ { "evaluators": [ - "builtin.task_adherence", - "builtin.relevance", - "builtin.groundedness", - "builtin.coherence" + "TaskAdherenceEvaluator", + "RelevanceEvaluator", + "GroundednessEvaluator", + "CoherenceEvaluator" ], "thresholds": { "task_adherence": 0.80, @@ -11,6 +11,6 @@ "groundedness": 0.75, "coherence": 0.80 }, - "phase_filter": "1", - "notes": "Phase 1: Only web search queries evaluated. Phase 2 data analysis queries excluded." + "phase_filter": null, + "notes": "Phase 2: All queries evaluated — both web search (Phase 1) and data analysis (Phase 2)." 
} diff --git a/prompts/tech-trends-agent.md b/prompts/tech-trends-agent.md index e01c7e9..da79c14 100644 --- a/prompts/tech-trends-agent.md +++ b/prompts/tech-trends-agent.md @@ -27,3 +27,15 @@ Always structure responses as: ## Tone Professional, objective, and jargon-aware. Assume the user is a technology professional who does not need basic concepts explained. + +## Data Analysis (Phase 2) +You now have access to a code interpreter. Use it when: +- The user asks you to calculate, compare, or rank numerical data +- You have retrieved structured data (tables, CSVs) and analysis would add value +- You need to produce a formatted comparison table from raw information + +When using code interpreter: +1. First retrieve the data via web search +2. Then write and run Python code to process or compare it +3. Present results with the code output clearly labelled +4. Always show the source of the raw data alongside the computed result From 6b3414f2a41e3ea63f5d524cb2954479bd8036ee Mon Sep 17 00:00:00 2001 From: sanjay singh Date: Fri, 15 May 2026 18:11:15 +0200 Subject: [PATCH 02/11] docs: update README to reflect web_search tool replacing Bing Grounding --- README.md | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 6ec7b2c..a987ecc 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ versioned prompts, tool changes, model upgrades, evaluation gates, and rollback. 
## Agent: Technology Trend Research & Analysis -- **Phase 1:** Web search only (Bing Grounding) +- **Phase 1:** Web search only (`web_search` tool) - **Phase 2:** Web search + Code Interpreter for data analysis ## Repository Structure @@ -60,9 +60,7 @@ This creates: - An App Registration with a Service Principal - 3 federated credentials for GitHub OIDC (main branch, pull requests, tags) - RBAC role assignments (Azure AI User, Cognitive Services OpenAI User) -- 7 GitHub repository variables (`AZURE_CLIENT_ID`, `AZURE_TENANT_ID`, `AZURE_SUBSCRIPTION_ID`, `FOUNDRY_TEST_ENDPOINT`, `FOUNDRY_PROD_ENDPOINT`, `GPT_DEPLOYMENT`, `BING_CONNECTION_NAME`) - -After bootstrap completes, manually configure the Bing Grounding connection in both Foundry projects via the [Azure AI Foundry portal](https://ai.azure.com) (Project > Connections > + New). +- 6 GitHub repository variables (`AZURE_CLIENT_ID`, `AZURE_TENANT_ID`, `AZURE_SUBSCRIPTION_ID`, `FOUNDRY_TEST_ENDPOINT`, `FOUNDRY_PROD_ENDPOINT`, `GPT_DEPLOYMENT`) State is saved to `.bootstrap-state.json` for use by the teardown script. @@ -88,7 +86,7 @@ Three scripts simulate the full agent lifecycle by creating PRs that trigger the ``` - Creates branch `feature/phase1-web-search` -- Configures the agent with Bing Grounding (web search) only +- Configures the agent with the `web_search` tool - Evaluation runs 5 Phase 1 test cases - Opens a PR — `evaluate.yml` triggers, deploys to TEST, runs eval @@ -101,7 +99,7 @@ Three scripts simulate the full agent lifecycle by creating PRs that trigger the ``` - Creates branch `feature/phase2-code-interpreter` from updated `main` -- Adds `code_interpreter` tool alongside `bing_grounding` +- Adds `code_interpreter` tool alongside `web_search` - Extends the system prompt with a `## Data Analysis` section - Evaluation now runs all 8 test cases (Phase 1 + Phase 2) — checks for regressions - Opens a PR @@ -163,7 +161,7 @@ az login # 4. 
Deploy to test source .env # or export vars manually -python scripts/deploy_agent.py --env test --semver 1.0.0 --tools bing_grounding +python scripts/deploy_agent.py --env test --semver 1.0.0 --tools web_search ``` ## CI/CD Workflows @@ -194,7 +192,7 @@ Re-deploys the exact prompt, tools, and model from a saved artifact. ## Model Comparison ```bash -python scripts/compare_models.py --current gpt-4o-2024-11-20 --candidate gpt-4.1 --tools bing_grounding +python scripts/compare_models.py --current gpt-4o-2024-11-20 --candidate gpt-4.1 --tools web_search ``` Deploys both model versions to test for side-by-side evaluation. From 4a12a78e4ce1343307e41cbccccc75f57974c1dc Mon Sep 17 00:00:00 2001 From: sanjay singh Date: Fri, 15 May 2026 18:13:51 +0200 Subject: [PATCH 03/11] =?UTF-8?q?docs:=20fix=20Phase=202=20description=20?= =?UTF-8?q?=E2=80=94=20code=5Finterpreter=20replaces=20web=5Fsearch,=20not?= =?UTF-8?q?=20additive?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a987ecc..4a1c687 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ versioned prompts, tool changes, model upgrades, evaluation gates, and rollback. 
## Agent: Technology Trend Research & Analysis - **Phase 1:** Web search only (`web_search` tool) -- **Phase 2:** Web search + Code Interpreter for data analysis +- **Phase 2:** Code Interpreter only (`code_interpreter` tool) for data analysis ## Repository Structure @@ -99,7 +99,7 @@ Three scripts simulate the full agent lifecycle by creating PRs that trigger the ``` - Creates branch `feature/phase2-code-interpreter` from updated `main` -- Adds `code_interpreter` tool alongside `web_search` +- Replaces `web_search` with `code_interpreter` tool - Extends the system prompt with a `## Data Analysis` section - Evaluation now runs all 8 test cases (Phase 1 + Phase 2) — checks for regressions - Opens a PR From 6d91f99fe6baf14051f4620833d3f10ae0f3fe65 Mon Sep 17 00:00:00 2001 From: sanjay singh Date: Fri, 15 May 2026 18:23:22 +0200 Subject: [PATCH 04/11] feat: add smoke test, fix tools config, model check in bootstrap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - agents/tech-trends-agent.json: replace bing_grounding with code_interpreter (Phase 2 uses only code_interpreter, not both tools) - evaluate.yml: add smoke test step that invokes the agent and validates a response before running the full evaluation suite; remove BING_CONNECTION_NAME - bootstrap.sh: add model availability check (validates current + upgrade target gpt-4.1); remove Bing Grounding references and connection variable - lifecycle/02-phase2-code-interpreter.sh: fix to deploy code_interpreter only - README: document model upgrade (gpt-4o-2024-11-20 → gpt-4.1), note eval naming limitation (action creates new "Agent Evaluation" each run) --- .github/workflows/evaluate.yml | 46 +++++++++- README.md | 14 +++ agents/tech-trends-agent.json | 1 - scripts/bootstrap.sh | 86 ++++++++++++++----- .../lifecycle/02-phase2-code-interpreter.sh | 9 +- 5 files changed, 126 insertions(+), 30 deletions(-) diff --git a/.github/workflows/evaluate.yml 
b/.github/workflows/evaluate.yml index 5e81110..8857b57 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -49,7 +49,6 @@ jobs: env: FOUNDRY_TEST_ENDPOINT: ${{ vars.FOUNDRY_TEST_ENDPOINT }} GPT_DEPLOYMENT: ${{ vars.GPT_DEPLOYMENT }} - BING_CONNECTION_NAME: ${{ vars.BING_CONNECTION_NAME }} run: | TOOLS=$(python3 -c " import json @@ -67,6 +66,47 @@ jobs: --tools "$TOOLS" echo "phase=$PHASE" >> $GITHUB_OUTPUT + - name: Smoke test — invoke agent and verify response + id: smoke + env: + FOUNDRY_TEST_ENDPOINT: ${{ vars.FOUNDRY_TEST_ENDPOINT }} + run: | + python3 << 'EOF' + import os + from azure.ai.projects import AIProjectClient + from azure.identity import DefaultAzureCredential + + client = AIProjectClient( + endpoint=os.environ["FOUNDRY_TEST_ENDPOINT"], + credential=DefaultAzureCredential() + ) + + agent = client.agents.get_agent("tech-trends-agent") + thread = client.agents.threads.create() + client.agents.messages.create( + thread_id=thread.id, + role="user", + content="What are the latest trends in AI?" 
+ ) + run = client.agents.runs.create_and_process( + thread_id=thread.id, + agent_id=agent.id + ) + + if run.status != "completed": + print(f"FAIL: Agent run status = {run.status}") + raise SystemExit(1) + + messages = client.agents.messages.list(thread_id=thread.id) + response = messages.data[0].content[0].text.value + if len(response) < 50: + print(f"FAIL: Response too short ({len(response)} chars)") + raise SystemExit(1) + + print(f"PASS: Agent responded ({len(response)} chars)") + print(f"Preview: {response[:200]}...") + EOF + - name: Run Foundry evaluation id: eval uses: microsoft/ai-agent-evals@v3-beta @@ -82,11 +122,13 @@ jobs: uses: actions/github-script@v7 with: script: | - const fs = require('fs'); let body = '## Agent Evaluation Results\n\n'; + body += `**Agent:** tech-trends-agent:${{ steps.deploy.outputs.agent_version }}\n`; body += `**Phase:** ${{ steps.deploy.outputs.phase }}\n`; body += `**Model:** ${{ vars.GPT_DEPLOYMENT }}\n`; body += `**Commit:** ${context.sha.slice(0,7)}\n\n`; + const smokeStatus = '${{ steps.smoke.outcome }}' === 'success' ? '✅' : '❌'; + body += `**Smoke Test:** ${smokeStatus}\n\n`; body += 'Full results are in the [Actions summary](' + `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}).\n`; github.rest.issues.createComment({ diff --git a/README.md b/README.md index 4a1c687..717a6e9 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,7 @@ This creates: - An App Registration with a Service Principal - 3 federated credentials for GitHub OIDC (main branch, pull requests, tags) - RBAC role assignments (Azure AI User, Cognitive Services OpenAI User) +- Model availability validation (checks current + upgrade target model) - 6 GitHub repository variables (`AZURE_CLIENT_ID`, `AZURE_TENANT_ID`, `AZURE_SUBSCRIPTION_ID`, `FOUNDRY_TEST_ENDPOINT`, `FOUNDRY_PROD_ENDPOINT`, `GPT_DEPLOYMENT`) State is saved to `.bootstrap-state.json` for use by the teardown script. 
@@ -113,10 +114,15 @@ Three scripts simulate the full agent lifecycle by creating PRs that trigger the ``` - Creates branch `chore/model-upgrade-gpt41` +- Upgrades model from `gpt-4o-2024-11-20` (default) to `gpt-4.1` - Updates the `GPT_DEPLOYMENT` GitHub variable to `gpt-4.1` - Adds a model history entry in the agent config - Opens a PR — the eval gate verifies the new model scores at or above thresholds +The bootstrap script validates that both the current model and the upgrade target +(`gpt-4.1`) are available in your chosen Azure region. If `gpt-4.1` is not available, +the script will list alternatives you can use instead. + **After the eval passes, merge the PR.** The full lifecycle demo is complete. ### Lifecycle Flow Diagram @@ -181,6 +187,14 @@ The eval gate uses `microsoft/ai-agent-evals@v3-beta` with four evaluators: - **Groundedness** (threshold: 0.75) - **Coherence** (threshold: 0.80) +A smoke test step runs before evaluation — it invokes the agent with a test query +and verifies a valid response is returned. + +**Note on evaluation naming:** The `ai-agent-evals` action creates a new evaluation +group named "Agent Evaluation" on every run (custom names not yet supported). Each run +is named `Agent tech-trends-agent:`. The PR comment includes the commit SHA +for traceability. 
+ ## Rollback ```bash diff --git a/agents/tech-trends-agent.json b/agents/tech-trends-agent.json index efcf175..1bb67ce 100644 --- a/agents/tech-trends-agent.json +++ b/agents/tech-trends-agent.json @@ -5,7 +5,6 @@ "model": "${GPT_DEPLOYMENT}", "instructions_file": "prompts/tech-trends-agent.md", "tools": [ - { "type": "bing_grounding" }, { "type": "code_interpreter" } ] }, diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 05ef463..9123eeb 100644 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -39,7 +39,6 @@ GPT_MODEL_NAME="gpt-4o" GPT_MODEL_VERSION="2024-11-20" GPT_DEPLOYMENT_NAME="gpt-4o-2024-11-20" GPT_CAPACITY=30 -BING_CONNECTION_NAME="bing-grounding" SKIP_FOUNDRY=false TEST_ENDPOINT="" PROD_ENDPOINT="" @@ -95,7 +94,7 @@ echo "============================================" echo "" # ---------- Step 1: Resource Group ---------- -echo "[1/7] Creating resource group..." +echo "[1/8] Creating resource group..." az group create \ --name "$RESOURCE_GROUP" \ --location "$LOCATION" \ @@ -103,10 +102,10 @@ az group create \ # ---------- Step 2 & 3: Deploy Foundry projects (or skip) ---------- if [[ "$SKIP_FOUNDRY" == true ]]; then - echo "[2/7] Skipping TEST Foundry project (using provided endpoint)" - echo "[3/7] Skipping PROD Foundry project (using provided endpoint)" + echo "[2/8] Skipping TEST Foundry project (using provided endpoint)" + echo "[3/8] Skipping PROD Foundry project (using provided endpoint)" else - echo "[2/7] Deploying TEST Foundry project..." + echo "[2/8] Deploying TEST Foundry project..." TEST_OUTPUT=$(az deployment group create \ --resource-group "$RESOURCE_GROUP" \ --template-file infra/main.bicep \ @@ -122,7 +121,7 @@ else TEST_ENDPOINT=$(echo "$TEST_OUTPUT" | python3 -c "import sys,json; print(json.load(sys.stdin)['properties']['outputs']['projectEndpoint']['value'])") echo " TEST endpoint: $TEST_ENDPOINT" - echo "[3/7] Deploying PROD Foundry project..." + echo "[3/8] Deploying PROD Foundry project..." 
PROD_OUTPUT=$(az deployment group create \ --resource-group "$RESOURCE_GROUP" \ --template-file infra/main.bicep \ @@ -140,7 +139,56 @@ else fi # ---------- Step 4: App Registration + Service Principal ---------- -echo "[4/7] Creating App Registration and Service Principal..." +echo "[4/8] Validating model availability..." +echo " Checking if '$GPT_MODEL_NAME' (version: $GPT_MODEL_VERSION) is available in $LOCATION..." + +AVAILABLE_MODELS=$(az cognitiveservices model list \ + --location "$LOCATION" \ + --query "[?model.name=='$GPT_MODEL_NAME' && model.version=='$GPT_MODEL_VERSION'].model.name" \ + -o tsv 2>/dev/null || echo "") + +if [[ -z "$AVAILABLE_MODELS" ]]; then + echo "" + echo " WARNING: Model '$GPT_MODEL_NAME' version '$GPT_MODEL_VERSION' not found in $LOCATION." + echo " Available GPT models in $LOCATION:" + az cognitiveservices model list \ + --location "$LOCATION" \ + --query "[?model.name.starts_with(@,'gpt')].{name:model.name, version:model.version}" \ + -o table 2>/dev/null || echo " (Could not list models — check permissions)" + echo "" + echo " The model upgrade lifecycle demo (Phase 3) requires a second model." + echo " You can continue, but ensure GPT_DEPLOYMENT points to a valid model." + echo "" + read -rp " Continue anyway? [y/N] " confirm + if [[ "$confirm" != "y" && "$confirm" != "Y" ]]; then + echo "Aborted." + exit 1 + fi +else + echo " ✓ Model '$GPT_MODEL_NAME' version '$GPT_MODEL_VERSION' is available in $LOCATION" +fi + +# Check for upgrade target model (gpt-4.1) availability for Phase 3 demo +UPGRADE_MODEL="gpt-4.1" +UPGRADE_AVAILABLE=$(az cognitiveservices model list \ + --location "$LOCATION" \ + --query "[?model.name=='$UPGRADE_MODEL'].model.name | [0]" \ + -o tsv 2>/dev/null || echo "") + +if [[ -n "$UPGRADE_AVAILABLE" ]]; then + echo " ✓ Upgrade target '$UPGRADE_MODEL' is also available (Phase 3 model upgrade demo ready)" +else + echo " ⚠ Upgrade target '$UPGRADE_MODEL' not found in $LOCATION." 
+ echo " Phase 3 model upgrade demo may need a different target model." + echo " Available models:" + az cognitiveservices model list \ + --location "$LOCATION" \ + --query "[?model.name.starts_with(@,'gpt')].{name:model.name, version:model.version}" \ + -o table 2>/dev/null || true +fi + +# ---------- Step 5: App Registration + Service Principal ---------- +echo "[5/8] Creating App Registration and Service Principal..." APP_ID=$(az ad app create \ --display-name "$APP_DISPLAY_NAME" \ --query appId -o tsv) @@ -149,8 +197,8 @@ SP_OBJ_ID=$(az ad sp create --id "$APP_ID" --query id -o tsv) echo " Client ID: $APP_ID" echo " SP Object ID: $SP_OBJ_ID" -# ---------- Step 5: Federated Credentials ---------- -echo "[5/7] Adding federated credentials..." +# ---------- Step 6: Federated Credentials ---------- +echo "[6/8] Adding federated credentials..." az ad app federated-credential create \ --id "$APP_ID" \ @@ -182,8 +230,8 @@ az ad app federated-credential create \ }" --output none echo " + release tag credential" -# ---------- Step 6: RBAC Role Assignments ---------- -echo "[6/7] Assigning RBAC roles..." +# ---------- Step 7: RBAC Role Assignments ---------- +echo "[7/8] Assigning RBAC roles..." SUBSCRIPTION_ID=$(az account show --query id -o tsv) SCOPE="/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP" @@ -219,8 +267,8 @@ if [[ -n "$TEST_ENDPOINT" ]]; then fi fi -# ---------- Step 7: GitHub Variables ---------- -echo "[7/7] Setting GitHub repository variables..." +# ---------- Step 8: GitHub Variables ---------- +echo "[8/8] Setting GitHub repository variables..." 
TENANT_ID=$(az account show --query tenantId -o tsv) gh variable set AZURE_CLIENT_ID --body "$APP_ID" --repo "$GITHUB_REPO" @@ -229,8 +277,7 @@ gh variable set AZURE_SUBSCRIPTION_ID --body "$SUBSCRIPTION_ID" --repo "$GITH gh variable set FOUNDRY_TEST_ENDPOINT --body "$TEST_ENDPOINT" --repo "$GITHUB_REPO" gh variable set FOUNDRY_PROD_ENDPOINT --body "$PROD_ENDPOINT" --repo "$GITHUB_REPO" gh variable set GPT_DEPLOYMENT --body "$GPT_DEPLOYMENT_NAME" --repo "$GITHUB_REPO" -gh variable set BING_CONNECTION_NAME --body "$BING_CONNECTION_NAME" --repo "$GITHUB_REPO" -echo " Set 7 variables on $GITHUB_REPO" +echo " Set 6 variables on $GITHUB_REPO" # ---------- Summary ---------- echo "" @@ -252,13 +299,11 @@ echo " Tenant ID: $TENANT_ID" echo " Subscription ID: $SUBSCRIPTION_ID" echo "" echo " GitHub ($GITHUB_REPO):" -echo " 7 repository variables set" +echo " 6 repository variables set" echo " 3 federated credentials configured" echo "" echo " Next steps:" -echo " 1. Configure Bing Grounding connection in both Foundry projects" -echo " (Portal: ai.azure.com -> project -> Connections -> + New)" -echo " 2. Run lifecycle scripts in order:" +echo " 1. 
Run lifecycle scripts in order:" echo " ./scripts/lifecycle/01-phase1-web-search.sh" echo " ./scripts/lifecycle/02-phase2-code-interpreter.sh" echo " ./scripts/lifecycle/03-model-upgrade.sh" @@ -301,9 +346,6 @@ FOUNDRY_PROD_ENDPOINT=$PROD_ENDPOINT # Model deployment GPT_DEPLOYMENT=$GPT_DEPLOYMENT_NAME -# Bing Grounding connection name -BING_CONNECTION_NAME=$BING_CONNECTION_NAME - # Resource metadata RESOURCE_GROUP=$RESOURCE_GROUP LOCATION=$LOCATION diff --git a/scripts/lifecycle/02-phase2-code-interpreter.sh b/scripts/lifecycle/02-phase2-code-interpreter.sh index c2df73e..9fad89b 100644 --- a/scripts/lifecycle/02-phase2-code-interpreter.sh +++ b/scripts/lifecycle/02-phase2-code-interpreter.sh @@ -26,7 +26,7 @@ git pull origin main # Create feature branch git checkout -b "$BRANCH" -# --- Agent config: Phase 2, add code_interpreter --- +# --- Agent config: Phase 2, code_interpreter only --- cat > agents/tech-trends-agent.json << 'AGENT_EOF' { "agent_name": "tech-trends-agent", @@ -35,7 +35,6 @@ cat > agents/tech-trends-agent.json << 'AGENT_EOF' "model": "${GPT_DEPLOYMENT}", "instructions_file": "prompts/tech-trends-agent.md", "tools": [ - { "type": "bing_grounding" }, { "type": "code_interpreter" } ] }, @@ -126,14 +125,14 @@ PR_URL=$(gh pr create \ --title "Phase 2: Add Code Interpreter for Data Analysis" \ --body "$(cat <<'PR_EOF' ## Summary -- Adds `code_interpreter` tool alongside existing `bing_grounding` +- Replaces `web_search` tool with `code_interpreter` for data analysis - Extends system prompt with `## Data Analysis` section - Evaluation now runs **all 8 queries** (Phase 1 + Phase 2) ## Changes | File | Change | |---|---| -| `agents/tech-trends-agent.json` | Added `code_interpreter` to tools, phase → `"2"` | +| `agents/tech-trends-agent.json` | Replaced `web_search` with `code_interpreter`, phase → `"2"` | | `prompts/tech-trends-agent.md` | Added `## Data Analysis (Phase 2)` section | | `evals/eval-config.json` | `phase_filter` → `null` (run all cases) | @@ 
-156,7 +155,7 @@ echo " PR created: $PR_URL" echo "============================================" echo "" echo " The evaluate.yml workflow will now:" -echo " 1. Deploy Phase 2 agent to TEST (both tools)" +echo " 1. Deploy Phase 2 agent to TEST (code_interpreter only)" echo " 2. Run ALL 8 eval queries (Phase 1 + Phase 2)" echo " 3. Check for regressions on existing Phase 1 queries" echo "" From 54ec420e8f3ea3f2431e1c30fcc992457bb92045 Mon Sep 17 00:00:00 2001 From: sanjay singh Date: Fri, 15 May 2026 18:28:32 +0200 Subject: [PATCH 05/11] fix: use correct SDK methods for agent operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - get_agent() → agents.get(agent_name=...) - Use OpenAI responses API via get_openai_client() for smoke test invocation - Fix monitor.yml to use agent.latest_version --- .github/workflows/evaluate.yml | 33 +++++++++++++-------------------- .github/workflows/monitor.yml | 4 ++-- 2 files changed, 15 insertions(+), 22 deletions(-) diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index 8857b57..08a46a9 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -81,30 +81,23 @@ jobs: credential=DefaultAzureCredential() ) - agent = client.agents.get_agent("tech-trends-agent") - thread = client.agents.threads.create() - client.agents.messages.create( - thread_id=thread.id, - role="user", - content="What are the latest trends in AI?" 
- ) - run = client.agents.runs.create_and_process( - thread_id=thread.id, - agent_id=agent.id - ) + agent = client.agents.get(agent_name="tech-trends-agent") + print(f"Agent: {agent.name} (latest version: {agent.latest_version})") - if run.status != "completed": - print(f"FAIL: Agent run status = {run.status}") - raise SystemExit(1) + openai_client = client.get_openai_client() + + response = openai_client.responses.create( + input="What are the latest trends in AI?", + extra_body={"agent_reference": {"name": agent.name, "type": "agent_reference"}}, + ) - messages = client.agents.messages.list(thread_id=thread.id) - response = messages.data[0].content[0].text.value - if len(response) < 50: - print(f"FAIL: Response too short ({len(response)} chars)") + output = response.output_text + if len(output) < 50: + print(f"FAIL: Response too short ({len(output)} chars)") raise SystemExit(1) - print(f"PASS: Agent responded ({len(response)} chars)") - print(f"Preview: {response[:200]}...") + print(f"PASS: Agent responded ({len(output)} chars)") + print(f"Preview: {output[:200]}...") EOF - name: Run Foundry evaluation diff --git a/.github/workflows/monitor.yml b/.github/workflows/monitor.yml index cd9bbf3..f3d6e80 100644 --- a/.github/workflows/monitor.yml +++ b/.github/workflows/monitor.yml @@ -41,8 +41,8 @@ jobs: from azure.ai.projects import AIProjectClient from azure.identity import DefaultAzureCredential client = AIProjectClient(endpoint=os.environ['FOUNDRY_PROD_ENDPOINT'], credential=DefaultAzureCredential()) - agent = client.agents.get_agent('tech-trends-agent') - print(agent.version) + agent = client.agents.get(agent_name='tech-trends-agent') + print(agent.latest_version) ") echo "version=$VERSION" >> $GITHUB_OUTPUT From 8bb0fdcb6ce8f8f49e45be3ef471d225062e1add Mon Sep 17 00:00:00 2001 From: sanjay singh Date: Fri, 15 May 2026 18:31:40 +0200 Subject: [PATCH 06/11] fix: remove non-existent latest_version attr from AgentDetails - Smoke test: just print agent name (version 
not needed for invocation) - Monitor: use list_versions() to find the latest version number --- .github/workflows/evaluate.yml | 2 +- .github/workflows/monitor.yml | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index 08a46a9..ba77b76 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -82,7 +82,7 @@ jobs: ) agent = client.agents.get(agent_name="tech-trends-agent") - print(f"Agent: {agent.name} (latest version: {agent.latest_version})") + print(f"Agent: {agent.name}") openai_client = client.get_openai_client() diff --git a/.github/workflows/monitor.yml b/.github/workflows/monitor.yml index f3d6e80..9972715 100644 --- a/.github/workflows/monitor.yml +++ b/.github/workflows/monitor.yml @@ -41,8 +41,9 @@ jobs: from azure.ai.projects import AIProjectClient from azure.identity import DefaultAzureCredential client = AIProjectClient(endpoint=os.environ['FOUNDRY_PROD_ENDPOINT'], credential=DefaultAzureCredential()) - agent = client.agents.get(agent_name='tech-trends-agent') - print(agent.latest_version) + versions = client.agents.list_versions(agent_name='tech-trends-agent') + latest = max(versions, key=lambda v: int(v.version)) + print(latest.version) ") echo "version=$VERSION" >> $GITHUB_OUTPUT From 4458f5dcaba49d80cb4a397768d7064033db82df Mon Sep 17 00:00:00 2001 From: sanjay singh Date: Fri, 15 May 2026 18:36:23 +0200 Subject: [PATCH 07/11] fix: add model param and GPT_DEPLOYMENT env to smoke test The responses.create() call was missing the model parameter, causing a 404 DeploymentNotFound error in Azure OpenAI. 
--- .github/workflows/evaluate.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index ba77b76..42a20c3 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -70,6 +70,7 @@ jobs: id: smoke env: FOUNDRY_TEST_ENDPOINT: ${{ vars.FOUNDRY_TEST_ENDPOINT }} + GPT_DEPLOYMENT: ${{ vars.GPT_DEPLOYMENT }} run: | python3 << 'EOF' import os @@ -87,6 +88,7 @@ jobs: openai_client = client.get_openai_client() response = openai_client.responses.create( + model=os.environ["GPT_DEPLOYMENT"], input="What are the latest trends in AI?", extra_body={"agent_reference": {"name": agent.name, "type": "agent_reference"}}, ) From 16eb58899099cff7739c105b95752febfa85a158 Mon Sep 17 00:00:00 2001 From: sanjay singh Date: Fri, 15 May 2026 19:01:00 +0200 Subject: [PATCH 08/11] pipeline changes --- .github/workflows/evaluate.yml | 262 ++++++++++++++++++++++++++++++--- requirements.txt | 3 +- scripts/bootstrap.sh | 2 +- 3 files changed, 244 insertions(+), 23 deletions(-) diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index 42a20c3..7fc3cd4 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -70,36 +70,52 @@ jobs: id: smoke env: FOUNDRY_TEST_ENDPOINT: ${{ vars.FOUNDRY_TEST_ENDPOINT }} - GPT_DEPLOYMENT: ${{ vars.GPT_DEPLOYMENT }} run: | python3 << 'EOF' import os from azure.ai.projects import AIProjectClient from azure.identity import DefaultAzureCredential - client = AIProjectClient( + # Create the project client + project = AIProjectClient( endpoint=os.environ["FOUNDRY_TEST_ENDPOINT"], credential=DefaultAzureCredential() ) - agent = client.agents.get(agent_name="tech-trends-agent") - print(f"Agent: {agent.name}") + # Get agent metadata for display + agent = project.agents.get(agent_name="tech-trends-agent") + print(f"Agent: {agent.name} (version: {getattr(agent, 'version', 'n/a')})") - openai_client = client.get_openai_client() + # Get the OpenAI client for 
Responses API + openai = project.get_openai_client() - response = openai_client.responses.create( - model=os.environ["GPT_DEPLOYMENT"], + # Invoke the agent using the Responses API with agent_reference + response = openai.responses.create( input="What are the latest trends in AI?", - extra_body={"agent_reference": {"name": agent.name, "type": "agent_reference"}}, + extra_body={ + "agent_reference": { + "name": "tech-trends-agent", + "type": "agent_reference", + } + }, ) output = response.output_text + print(f"Response ID: {response.id}") + if len(output) < 50: print(f"FAIL: Response too short ({len(output)} chars)") raise SystemExit(1) print(f"PASS: Agent responded ({len(output)} chars)") - print(f"Preview: {output[:200]}...") + print(f"Preview: {output[:300]}...") + + # Write smoke test result for downstream steps + gh_output = os.environ.get("GITHUB_OUTPUT", "") + if gh_output: + with open(gh_output, "a") as f: + f.write(f"response_length={len(output)}\n") + f.write(f"response_preview={' '.join(output[:200].split())}\n") EOF - name: Run Foundry evaluation @@ -117,18 +133,222 @@ jobs: uses: actions/github-script@v7 with: script: | - let body = '## Agent Evaluation Results\n\n'; - body += `**Agent:** tech-trends-agent:${{ steps.deploy.outputs.agent_version }}\n`; - body += `**Phase:** ${{ steps.deploy.outputs.phase }}\n`; - body += `**Model:** ${{ vars.GPT_DEPLOYMENT }}\n`; - body += `**Commit:** ${context.sha.slice(0,7)}\n\n`; - const smokeStatus = '${{ steps.smoke.outcome }}' === 'success' ? 
'✅' : '❌'; - body += `**Smoke Test:** ${smokeStatus}\n\n`; - body += 'Full results are in the [Actions summary](' + - `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}).\n`; - github.rest.issues.createComment({ - issue_number: context.issue.number, + const marker = '<!-- agent-eval-report -->'; + const smokeOutcome = '${{ steps.smoke.outcome }}'; + const evalOutcome = '${{ steps.eval.outcome }}'; + const deployOutcome = '${{ steps.deploy.outcome }}'; + const agentVersion = '${{ steps.deploy.outputs.agent_version }}' || 'N/A'; + const phase = '${{ steps.deploy.outputs.phase }}' || 'N/A'; + const model = '${{ vars.GPT_DEPLOYMENT }}' || 'N/A'; + const semver = '${{ steps.version.outputs.semver }}'; + const sha = context.sha.slice(0, 7); + const runUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`; + const timestamp = new Date().toISOString().replace('T', ' ').slice(0, 19) + ' UTC'; + + const icon = (outcome) => outcome === 'success' ? '✅' : outcome === 'failure' ? '❌' : '⚠️'; + const badge = (outcome) => outcome === 'success' + ? 'PASSED' + : outcome === 'failure' + ? 'FAILED' + : 'SKIPPED'; + + const overallStatus = (smokeOutcome === 'success' && evalOutcome === 'success') ? 'PASSED' : 'FAILED'; + const overallIcon = overallStatus === 'PASSED' ? '✅' : '❌'; + const overallColor = overallStatus === 'PASSED' ? 
'#22c55e' : '#ef4444'; + + let body = `${marker}\n`; + body += `## ${overallIcon} Agent Deployment & Evaluation Report\n\n`; + + // Agent Details Table + body += `### 🤖 Agent Details\n\n`; + body += `| Property | Value |\n`; + body += `|----------|-------|\n`; + body += `| **Agent** | \`tech-trends-agent\` |\n`; + body += `| **Version** | \`${agentVersion}\` |\n`; + body += `| **Semver** | \`${semver}\` |\n`; + body += `| **Phase** | ${phase} |\n`; + body += `| **Model** | \`${model}\` |\n`; + body += `| **Commit** | \`${sha}\` |\n`; + body += `| **Timestamp** | ${timestamp} |\n\n`; + + // Pipeline Results + body += `### 📊 Pipeline Results\n\n`; + body += `| Step | Status | Details |\n`; + body += `|------|--------|----------|\n`; + body += `| Deploy to TEST | ${icon(deployOutcome)} ${badge(deployOutcome)} | Agent version \`${agentVersion}\` deployed |\n`; + body += `| Smoke Test | ${icon(smokeOutcome)} ${badge(smokeOutcome)} | Invoked agent via Responses API |\n`; + body += `| Foundry Evaluation | ${icon(evalOutcome)} ${badge(evalOutcome)} | Evaluated with golden dataset |\n\n`; + + // Tools Configuration + body += `### 🛠️ Tools Configuration\n\n`; + body += `| Tool | Enabled |\n`; + body += `|------|----------|\n`; + const toolsInPhase = phase === '2' ? ['code_interpreter'] : phase === '1' ? ['web_search'] : ['web_search', 'code_interpreter']; + const allTools = ['web_search', 'code_interpreter']; + for (const tool of allTools) { + const enabled = toolsInPhase.includes(tool) ? 
'✅' : '—'; + body += `| \`${tool}\` | ${enabled} |\n`; + } + body += '\n'; + + // Links + body += `### 🔗 Links\n\n`; + body += `- [📋 Full Actions Run](${runUrl})\n`; + body += `- [📁 Artifacts](${runUrl}#artifacts)\n\n`; + + // Footer + body += `---\n`; + body += `🤖 Updated automatically by the CI pipeline · ${timestamp}\n`; + + // Find existing comment with marker and update, or create new + const comments = await github.paginate(github.rest.issues.listComments, { /* paginate: a single listComments call only returns the first page of comments */ owner: context.repo.owner, repo: context.repo.repo, - body + issue_number: context.issue.number, }); + + const existingComment = comments.find(c => (c.body || '').includes(marker)); + + if (existingComment) { + await github.rest.issues.updateComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: existingComment.id, + body, + }); + console.log(`Updated existing comment #${existingComment.id}`); + } else { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body, + }); + console.log('Created new evaluation comment'); + } + + - name: Generate HTML report artifact + if: always() + run: | + python3 << 'EOF' + import os, json + from datetime import datetime, timezone + + agent_version = os.environ.get("AGENT_VERSION") or "N/A" + phase = os.environ.get("PHASE") or "N/A" + model = os.environ.get("MODEL") or "N/A" + semver = os.environ.get("SEMVER") or "N/A" + sha = os.environ.get("SHA", "N/A")[:7] + smoke_outcome = os.environ.get("SMOKE_OUTCOME", "unknown") + eval_outcome = os.environ.get("EVAL_OUTCOME", "unknown") + deploy_outcome = os.environ.get("DEPLOY_OUTCOME", "unknown") + run_url = os.environ.get("RUN_URL", "#") + timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC") + + def status_badge(outcome): + colors = {"success": ("#22c55e", "PASSED"), "failure": ("#ef4444", "FAILED")} + color, label = colors.get(outcome, ("#f59e0b", "SKIPPED")) + return f'{label}' + + def status_icon(outcome): + return 
{"success": "✅", "failure": "❌"}.get(outcome, "⚠️") + + overall = "PASSED" if smoke_outcome == "success" and eval_outcome == "success" else "FAILED" + overall_color = "#22c55e" if overall == "PASSED" else "#ef4444" + + html = f""" + + + + Agent Evaluation Report — tech-trends-agent v{semver} + + + +
+
+

🤖 Agent Evaluation Report

+
tech-trends-agent v{semver}
+
{overall}
+
+ +
+

Agent Details

+ + + + + + + + +
Agent Nametech-trends-agent
Foundry Version{agent_version}
Semver{semver}
Phase{phase}
Model{model}
Commit{sha}
Timestamp{timestamp}
+
+ +
+

Pipeline Results

+ + + + + + + +
StepStatusDetails
Deploy to TEST{status_icon(deploy_outcome)} {status_badge(deploy_outcome)}Version {agent_version}
Smoke Test{status_icon(smoke_outcome)} {status_badge(smoke_outcome)}Responses API invocation
Foundry Evaluation{status_icon(eval_outcome)} {status_badge(eval_outcome)}Golden dataset evaluation
+
+ +
+

Tools Configuration

+ + + + + + +
ToolEnabled
web_search{"✅" if phase in ("1", "2", "3") else "—"}
code_interpreter{"✅" if phase in ("2", "3") else "—"}
+
+ + +
+ + """ + + os.makedirs("reports", exist_ok=True) + with open("reports/evaluation-report.html", "w") as f: + f.write(html) + print("Generated reports/evaluation-report.html") + EOF + env: + AGENT_VERSION: ${{ steps.deploy.outputs.agent_version }} + PHASE: ${{ steps.deploy.outputs.phase }} + MODEL: ${{ vars.GPT_DEPLOYMENT }} + SEMVER: ${{ steps.version.outputs.semver }} + SHA: ${{ github.sha }} + SMOKE_OUTCOME: ${{ steps.smoke.outcome }} + EVAL_OUTCOME: ${{ steps.eval.outcome }} + DEPLOY_OUTCOME: ${{ steps.deploy.outcome }} + RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + + - name: Upload HTML report artifact + if: always() + uses: actions/upload-artifact@v4 + with: + name: evaluation-report + path: reports/evaluation-report.html diff --git a/requirements.txt b/requirements.txt index 0e556a8..30104b5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ -azure-ai-projects>=1.0.0b1 +azure-ai-projects>=2.0.0 azure-ai-evaluation>=1.0.0b1 azure-identity>=1.15.0 +openai>=1.66.0 # Dev / Test pytest>=8.0 diff --git a/scripts/bootstrap.sh b/scripts/bootstrap.sh index 9123eeb..c3690c3 100644 --- a/scripts/bootstrap.sh +++ b/scripts/bootstrap.sh @@ -37,7 +37,7 @@ ACCOUNT_NAME="" GITHUB_REPO="san360/agent-devops" GPT_MODEL_NAME="gpt-4o" GPT_MODEL_VERSION="2024-11-20" -GPT_DEPLOYMENT_NAME="gpt-4o-2024-11-20" +GPT_DEPLOYMENT_NAME="gpt-4o" GPT_CAPACITY=30 SKIP_FOUNDRY=false TEST_ENDPOINT="" From 9fd9a9f861f5d697ab5d073aace4f0bb82cdc400 Mon Sep 17 00:00:00 2001 From: sanjay singh Date: Fri, 15 May 2026 21:29:53 +0200 Subject: [PATCH 09/11] fix: use agent.versions instead of agent.version in smoke test AgentDetails object returned by project.agents.get() has 'versions' (list), not 'version' (scalar). Use versions[-1] for the latest version. 
--- .github/workflows/evaluate.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index 7fc3cd4..8c7cc78 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -84,7 +84,8 @@ jobs: # Get agent metadata for display agent = project.agents.get(agent_name="tech-trends-agent") - print(f"Agent: {agent.name} (version: {agent.version})") + latest_version = agent.versions[-1] if agent.versions else "unknown" + print(f"Agent: {agent.name} (version: {latest_version})") # Get the OpenAI client for Responses API openai = project.get_openai_client() From 9b46c101661fed27dee59f48bdf2f1e73b77cdb8 Mon Sep 17 00:00:00 2001 From: sanjay singh Date: Fri, 15 May 2026 21:31:32 +0200 Subject: [PATCH 10/11] fix: convert agent.versions to list before negative indexing SDK model objects don't support negative indexing directly. --- .github/workflows/evaluate.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index 8c7cc78..05f10a9 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -84,7 +84,8 @@ jobs: # Get agent metadata for display agent = project.agents.get(agent_name="tech-trends-agent") - latest_version = agent.versions[-1] if agent.versions else "unknown" + versions = list(agent.versions) if agent.versions else [] + latest_version = versions[-1] if versions else "unknown" print(f"Agent: {agent.name} (version: {latest_version})") # Get the OpenAI client for Responses API From b2f8e4bc1ddf7e95076609cbdae21c9699e59195 Mon Sep 17 00:00:00 2001 From: sanjay singh Date: Fri, 15 May 2026 21:35:04 +0200 Subject: [PATCH 11/11] fix: access agent.versions.latest.version correctly AgentDetails.versions is AgentObjectVersions (not a list). The correct path is agent.versions.latest.version. Verified locally with SDK model objects. 
--- .github/workflows/evaluate.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/evaluate.yml b/.github/workflows/evaluate.yml index 05f10a9..3e5dbf5 100644 --- a/.github/workflows/evaluate.yml +++ b/.github/workflows/evaluate.yml @@ -84,8 +84,7 @@ jobs: # Get agent metadata for display agent = project.agents.get(agent_name="tech-trends-agent") - versions = list(agent.versions) if agent.versions else [] - latest_version = versions[-1] if versions else "unknown" + latest_version = agent.versions.latest.version if agent.versions and agent.versions.latest else "unknown" print(f"Agent: {agent.name} (version: {latest_version})") # Get the OpenAI client for Responses API