Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 97 additions & 23 deletions infra/main.bicep
Original file line number Diff line number Diff line change
Expand Up @@ -291,8 +291,25 @@ module applicationInsights 'br/public:avm/res/insights/component:0.7.0' = if (en
retentionInDays: 365
kind: 'web'
disableIpMasking: false
disableLocalAuth: true
flowType: 'Bluefield'
// WAF-aligned (Well-Architected Framework) configuration: block public ingestion and query when private networking is enabled
publicNetworkAccessForIngestion: enablePrivateNetworking ? 'Disabled' : 'Enabled'
publicNetworkAccessForQuery: enablePrivateNetworking ? 'Disabled' : 'Enabled'
}
}

// ========== Data Collection Endpoint (DCE) ========== //
// Required for Azure Monitor Private Link - provides private ingestion and configuration endpoints
// Per: https://learn.microsoft.com/en-us/azure/azure-monitor/fundamentals/private-link-configure
module dataCollectionEndpoint 'br/public:avm/res/insights/data-collection-endpoint:0.5.0' = if (enableMonitoring && enablePrivateNetworking) {
  // Deployed only when both monitoring and private networking are switched on.
  name: take('avm.res.insights.data-collection-endpoint.${solutionSuffix}', 64)
  params: {
    // Identity and placement
    name: 'dce-${solutionSuffix}'
    location: location
    tags: allTags
    // Endpoint behaviour: public network access is turned off for this endpoint
    kind: 'Windows'
    publicNetworkAccess: 'Disabled'
    // Module usage telemetry follows the template-wide switch
    enableTelemetry: enableTelemetry
  }
}

Expand Down Expand Up @@ -320,6 +337,10 @@ var privateDnsZones = [
'privatelink.vaultcore.azure.net'
'privatelink.blob.${environment().suffixes.storage}'
'privatelink.file.${environment().suffixes.storage}'
'privatelink.monitor.azure.com' // Azure Monitor global endpoints (App Insights, DCE)
'privatelink.oms.opinsights.azure.com' // Log Analytics OMS endpoints
'privatelink.ods.opinsights.azure.com' // Log Analytics ODS ingestion endpoints
'privatelink.agentsvc.azure-automation.net' // Agent service automation endpoints
]

// DNS Zone Index Constants
Expand All @@ -331,6 +352,10 @@ var dnsZoneIndex = {
keyVault: 4
storageBlob: 5
storageFile: 6
monitor: 7
oms: 8
ods: 9
agentSvc: 10
}

// ===================================================
Expand All @@ -356,6 +381,76 @@ module avmPrivateDnsZones 'br/public:avm/res/network/private-dns-zone:0.8.0' = [
}
]

// ========== Azure Monitor Private Link Scope (AMPLS) ========== //
// Step 1: Create AMPLS
// Step 2: Connect Azure Monitor resources (LAW, Application Insights, DCE) to the AMPLS
// Step 3: Connect AMPLS to a private endpoint with required DNS zones
// Per: https://learn.microsoft.com/en-us/azure/azure-monitor/fundamentals/private-link-configure
module azureMonitorPrivateLinkScope 'br/public:avm/res/insights/private-link-scope:0.6.0' = if (enablePrivateNetworking) {
  name: take('avm.res.insights.private-link-scope.${solutionSuffix}', 64)
  // Explicit ordering: the scope is created after the monitoring resources it links
  // and after the virtual network that hosts its private endpoint.
  #disable-next-line no-unnecessary-dependson
  dependsOn: [logAnalyticsWorkspace, applicationInsights, dataCollectionEndpoint, virtualNetwork]
  params: {
    name: 'ampls-${solutionSuffix}'
    location: 'global'
    tags: allTags
    enableTelemetry: enableTelemetry
    // Both ingestion and query access modes are set to private-only.
    accessModeSettings: {
      queryAccessMode: 'PrivateOnly'
      ingestionAccessMode: 'PrivateOnly'
    }
    // Scoped resources: the Log Analytics workspace is always linked; Application
    // Insights and the Data Collection Endpoint are appended only when monitoring
    // is enabled.
    scopedResources: concat([
      {
        name: 'scoped-law'
        linkedResourceId: logAnalyticsWorkspaceResourceId
      }
    ], enableMonitoring ? [
      {
        name: 'scoped-appi'
        linkedResourceId: applicationInsights!.outputs.resourceId
      }
      {
        name: 'scoped-dce'
        linkedResourceId: dataCollectionEndpoint!.outputs.resourceId
      }
    ] : [])
    // A single private endpoint in the private-endpoints subnet, registered in five
    // private DNS zones looked up through dnsZoneIndex:
    //   monitor, oms, ods, agentSvc and storageBlob.
    privateEndpoints: [
      {
        name: 'pep-ampls-${solutionSuffix}'
        subnetResourceId: virtualNetwork!.outputs.pepsSubnetResourceId
        privateDnsZoneGroup: {
          privateDnsZoneGroupConfigs: [
            { privateDnsZoneResourceId: avmPrivateDnsZones[dnsZoneIndex.monitor]!.outputs.resourceId }
            { privateDnsZoneResourceId: avmPrivateDnsZones[dnsZoneIndex.oms]!.outputs.resourceId }
            { privateDnsZoneResourceId: avmPrivateDnsZones[dnsZoneIndex.ods]!.outputs.resourceId }
            { privateDnsZoneResourceId: avmPrivateDnsZones[dnsZoneIndex.agentSvc]!.outputs.resourceId }
            { privateDnsZoneResourceId: avmPrivateDnsZones[dnsZoneIndex.storageBlob]!.outputs.resourceId }
          ]
        }
      }
    ]
  }
}

// Azure Bastion Host
var bastionHostName = 'bas-${solutionSuffix}'
module bastionHost 'br/public:avm/res/network/bastion-host:0.8.0' = if (enablePrivateNetworking) {
Expand Down Expand Up @@ -437,6 +532,7 @@ module windowsVmDataCollectionRules 'br/public:avm/res/insights/data-collection-
location: dataCollectionRulesLocation
dataCollectionRuleProperties: {
kind: 'Windows'
dataCollectionEndpointResourceId: dataCollectionEndpoint!.outputs.resourceId
dataSources: {
performanceCounters: [
{
Expand Down Expand Up @@ -495,26 +591,6 @@ module windowsVmDataCollectionRules 'br/public:avm/res/insights/data-collection-
name: 'perfCounterDataSource60'
}
]
windowsEventLogs: [
{
name: 'SecurityAuditEvents'
streams: [
'Microsoft-WindowsEvent'
]
eventLogName: 'Security'
eventTypes: [
{
eventType: 'Audit Success'
}
{
eventType: 'Audit Failure'
}
]
xPathQueries: [
'Security!*[System[(EventID=4624 or EventID=4625)]]'
]
}
]
}
destinations: {
logAnalytics: [
Expand All @@ -532,8 +608,6 @@ module windowsVmDataCollectionRules 'br/public:avm/res/insights/data-collection-
destinations: [
'la-${dataCollectionRulesResourceName}'
]
transformKql: 'source'
outputStream: 'Microsoft-Perf'
}
]
}
Expand Down
20 changes: 0 additions & 20 deletions src/backend/api/api_routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
# Standard library
import asyncio
import io
import logging
import os
import zipfile
from typing import Optional

Expand All @@ -14,9 +12,6 @@
from api.status_updates import app_connection_manager, close_connection

# Third-party
# Azure Monitor OpenTelemetry integration is currently causing issues with OpenAI calls in process_batch_async, needs further investigation, commenting out for now
# from azure.monitor.opentelemetry import configure_azure_monitor

from common.logger.app_logger import AppLogger
from common.services.batch_service import BatchService

Expand All @@ -40,21 +35,6 @@
router = APIRouter()
logger = AppLogger("APIRoutes")

# Check if the Application Insights Instrumentation Key is set in the environment variables
instrumentation_key = os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING")
if instrumentation_key:
# Configure Application Insights if the Instrumentation Key is found
# commenting below line as configure_azure_monitor is causing issues with OpenAI calls in process_batch_async, needs further investigation
# configure_azure_monitor(connection_string=instrumentation_key)
logging.info(
"Application Insights configured with the provided Instrumentation Key"
)
else:
# Log a warning if the Instrumentation Key is not found
logging.warning(
"No Application Insights Instrumentation Key found. Skipping configuration"
)


def record_exception_to_trace(e):
"""Record exception to the current OpenTelemetry trace span."""
Expand Down
56 changes: 54 additions & 2 deletions src/backend/api/event_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,69 @@
import os

# Third-party
from azure.monitor.events.extension import track_event
from applicationinsights import TelemetryClient
from applicationinsights.channel import SynchronousQueue, SynchronousSender, TelemetryChannel

from dotenv import load_dotenv

load_dotenv()

# Global telemetry client (initialized once)
_telemetry_client = None


def _get_telemetry_client():
    """Return the shared Application Insights ``TelemetryClient``, creating it on first use.

    The client is built from the ``APPLICATIONINSIGHTS_CONNECTION_STRING``
    environment variable and cached in the module-level ``_telemetry_client``.
    When the connection string carries an ``IngestionEndpoint``, telemetry is
    posted to that endpoint instead of the SDK's built-in default, so
    region-specific endpoints are honoured.

    Returns:
        The cached ``TelemetryClient``, or ``None`` when the environment
        variable is unset or empty, when no ``InstrumentationKey`` can be
        extracted, or when client construction raised. Nothing is cached in
        those cases, so the next call retries.
    """
    global _telemetry_client

    if _telemetry_client is not None:
        return _telemetry_client

    connection_string = os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING")
    if not connection_string:
        return None

    try:
        # Format: InstrumentationKey=xxx;IngestionEndpoint=https://...
        # Keys are matched case-insensitively and surrounding whitespace is
        # ignored so minor formatting differences do not break extraction.
        parts = {}
        for part in connection_string.split(';'):
            key, sep, value = part.partition('=')
            if sep:
                parts[key.strip().lower()] = value.strip()

        instrumentation_key = parts.get('instrumentationkey')
        if not instrumentation_key:
            logging.error("Could not extract InstrumentationKey from connection string")
            return None

        # Create a synchronous channel for immediate sending. Use the
        # connection string's ingestion endpoint when present; otherwise fall
        # back to the sender's default endpoint.
        ingestion_endpoint = parts.get('ingestionendpoint')
        if ingestion_endpoint:
            sender = SynchronousSender(
                service_endpoint_uri=ingestion_endpoint.rstrip('/') + '/v2/track'
            )
        else:
            sender = SynchronousSender()
        queue = SynchronousQueue(sender)
        channel = TelemetryChannel(None, queue)

        _telemetry_client = TelemetryClient(instrumentation_key, channel)
        logging.info("Application Insights TelemetryClient initialized successfully")
    except Exception as e:
        # Best effort: telemetry setup must never break the caller.
        logging.error(f"Failed to initialize TelemetryClient: {e}")

    return _telemetry_client


def track_event_if_configured(event_name: str, event_data: dict):
"""Track a custom event to Application Insights customEvents table.

This uses the Application Insights SDK TelemetryClient which properly
sends custom events to the customEvents table in Application Insights.
"""
instrumentation_key = os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING")
if instrumentation_key:
track_event(event_name, event_data)
try:
client = _get_telemetry_client()
if client:
# Convert all values to strings to ensure compatibility
properties = {k: str(v) for k, v in event_data.items()}

# Track the custom event
client.track_event(event_name, properties=properties)
client.flush() # Ensure immediate sending

logging.debug(f"Tracked custom event: {event_name} with data: {event_data}")
else:
logging.warning("TelemetryClient not available, custom event not tracked")
except Exception as e:
logging.error(f"Failed to track event {event_name}: {e}")
else:
logging.warning(
f"Skipping track_event for {event_name} as Application Insights is not configured"
Expand Down
68 changes: 68 additions & 0 deletions src/backend/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

from api.api_routes import router as backend_router

from azure.monitor.opentelemetry.exporter import AzureMonitorLogExporter, AzureMonitorTraceExporter

from common.config.config import app_config
from common.logger.app_logger import AppLogger

Expand All @@ -15,6 +17,14 @@

from helper.azure_credential_utils import get_azure_credential

from opentelemetry import trace
from opentelemetry._logs import set_logger_provider
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.sdk._logs import LoggerProvider, LoggingHandler
from opentelemetry.sdk._logs.export import BatchLogRecordProcessor
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

from semantic_kernel.agents.azure_ai.azure_ai_agent import AzureAIAgent # pylint: disable=E0611

from sql_agents.agent_manager import clear_sql_agents, set_sql_agents
Expand Down Expand Up @@ -46,6 +56,11 @@
for logger_name in AZURE_LOGGING_PACKAGES:
logging.getLogger(logger_name).setLevel(getattr(logging, AZURE_PACKAGE_LOGGING_LEVEL, logging.WARNING))

# Suppress noisy OpenTelemetry and Azure Monitor logs (currently disabled; uncomment the lines below to enable)
# logging.getLogger("opentelemetry.sdk").setLevel(logging.ERROR)
# logging.getLogger("azure.core.pipeline.policies.http_logging_policy").setLevel(logging.WARNING)
# logging.getLogger("azure.monitor.opentelemetry.exporter.export._base").setLevel(logging.WARNING)

logger = AppLogger("app")

# Global variables for agents
Expand Down Expand Up @@ -119,6 +134,59 @@ def create_app() -> FastAPI:
allow_headers=["*"],
)

# Configure Azure Monitor and instrument FastAPI for OpenTelemetry
# This must happen AFTER app creation but BEFORE route registration
instrumentation_key = os.getenv("APPLICATIONINSIGHTS_CONNECTION_STRING")
if instrumentation_key:
# SOLUTION: Use manual telemetry setup instead of configure_azure_monitor
# This gives us precise control over what gets instrumented, avoiding interference
# with Semantic Kernel's async generators while still tracking Azure SDK calls

# Set up Azure Monitor exporter for traces
azure_trace_exporter = AzureMonitorTraceExporter(connection_string=instrumentation_key)

# Create a tracer provider and add the Azure Monitor exporter
tracer_provider = TracerProvider()
tracer_provider.add_span_processor(BatchSpanProcessor(azure_trace_exporter))

# Set the global tracer provider
trace.set_tracer_provider(tracer_provider)

# Set up Azure Monitor exporter for logs (appears in traces table)
azure_log_exporter = AzureMonitorLogExporter(connection_string=instrumentation_key)

# Create a logger provider and add the Azure Monitor exporter
logger_provider = LoggerProvider()
logger_provider.add_log_record_processor(BatchLogRecordProcessor(azure_log_exporter))
set_logger_provider(logger_provider)

# Attach OpenTelemetry handler to Python's root logger
handler = LoggingHandler(logger_provider=logger_provider)
logging.getLogger().addHandler(handler)

# Instrument ONLY FastAPI for HTTP request/response tracing
# This is safe because it only wraps HTTP handlers, not internal async operations
FastAPIInstrumentor.instrument_app(
app,
excluded_urls="socket,ws", # Exclude WebSocket URLs to reduce noise
tracer_provider=tracer_provider
)

# Optional: Add manual spans in your code for Azure SDK operations using:
# from opentelemetry import trace
# tracer = trace.get_tracer(__name__)
# with tracer.start_as_current_span("operation_name"):
# # your Azure SDK call here

logger.logger.info("Application Insights configured with selective instrumentation")
logger.logger.info("✓ FastAPI HTTP tracing enabled")
logger.logger.info("✓ Python logging export to Application Insights enabled")
logger.logger.info("✓ Manual span support enabled for Azure SDK operations")
logger.logger.info("✓ Custom events via OpenTelemetry enabled")
logger.logger.info("✓ Semantic Kernel async generators unaffected")
else:
logger.logger.warning("No Application Insights connection string found. Telemetry disabled.")

# Include routers with /api prefix
app.include_router(backend_router, prefix="/api", tags=["backend"])
# app.include_router(agents_router, prefix="/api/agents", tags=["agents"])
Expand Down
17 changes: 17 additions & 0 deletions src/backend/common/telemetry/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
"""Telemetry utilities for Application Insights integration."""

from common.telemetry.telemetry_helper import (
add_span_attributes,
get_tracer,
trace_context,
trace_operation,
trace_sync_context,
)

# Public API of this package: every name listed here is imported above from
# common.telemetry.telemetry_helper and re-exported unchanged.
__all__ = [
"trace_operation",
"trace_context",
"trace_sync_context",
"get_tracer",
"add_span_attributes",
]
Loading