diff --git a/cleancloud/providers/azure/rules/ai/ai_search_idle.py b/cleancloud/providers/azure/rules/ai/ai_search_idle.py index 9e93cb8..9f74b1d 100644 --- a/cleancloud/providers/azure/rules/ai/ai_search_idle.py +++ b/cleancloud/providers/azure/rules/ai/ai_search_idle.py @@ -1,10 +1,56 @@ +""" +Rule: azure.ai_search.idle + +Intent: + Detect Azure AI Search services that are structurally empty and + operationally inactive over a fixed 90-day observation window, making + them conservative review candidates for deletion or rightsizing. + + This rule intentionally requires BOTH documented activity silence AND + confirmed structural emptiness before emitting. Either condition alone + is not sufficient. + + Review-candidate rule only. Does not prove deletion is safe, that no + future go-live depends on the service, or that a specific monthly saving + exists. + +Exclusions: + - id absent or empty + - name absent or empty + - outside optional region filter (exact lowercase match) + - provisioning_state does not resolve to exactly "succeeded" (SDK+nested, conflict -> skip) + - status does not resolve to exactly "running" (SDK+nested, conflict -> skip) + - sku.name not in supported dedicated billable tiers + (basic / standard / standard2 / standard3 / storage_optimized_l1 / storage_optimized_l2) + - created_at absent, invalid, in the future, or service age < 90 days + - replica_count or partition_count not a known positive integer (SDK+nested, conflict -> skip) + - data-plane client factory returns None (package unavailable -> skip) + - any required object surface (indexes, indexers, data_sources, skillsets, synonym_maps) + fails, is unavailable, or is non-empty + - any optional object surface (aliases, knowledge_sources, agents) that could be fully + enumerated and is non-empty + - any required activity metric cannot be resolved reliably (< 95% daily-bucket coverage) + - any required activity metric is non-zero over 90 days + - per-service SDK retrieval raises an expected 
error + +Cost model (spec 10): + estimated_monthly_cost_usd = None (always) + Risk = MEDIUM (always) + Confidence = HIGH (always, when all conditions met) + +APIs: + - Microsoft.Search/searchServices/read (services.list_by_subscription) + - Microsoft.Insights/metrics/read + - Azure AI Search data-plane object list APIs (RBAC keyless auth, no admin keys) +""" + import math from datetime import datetime, timedelta, timezone -from typing import Any, List, Optional +from enum import Enum +from typing import Callable, List, Optional, Tuple +from azure.core.exceptions import HttpResponseError, ServiceRequestError, ServiceResponseError from azure.mgmt.monitor import MonitorManagementClient - -# Azure SDK (top-level imports for CI fail-fast) from azure.mgmt.search import SearchManagementClient from cleancloud.core.confidence import ConfidenceLevel @@ -12,51 +58,408 @@ from cleancloud.core.finding import Finding from cleancloud.core.risk import RiskLevel +_RULE_ID = "azure.ai_search.idle" +_RESOURCE_TYPE = "azure.ai.search_service" +_IDLE_WINDOW_DAYS = 90 # fixed per spec 6.3 +_MIN_AGE_DAYS = 90 # spec 8.7 +_MIN_COVERAGE = 0.95 # spec 9.5 + +# Supported dedicated billable tiers (spec 9.2) +_SUPPORTED_SKUS = frozenset( + { + "basic", + "standard", + "standard2", + "standard3", + "storage_optimized_l1", + "storage_optimized_l2", + } +) + +# Required object surfaces (spec 9.3): (surface_key, data_plane_method_name) +_REQUIRED_SURFACES: Tuple[Tuple[str, str], ...] = ( + ("indexes", "list_indexes"), + ("indexers", "list_indexers"), + ("data_sources", "list_data_source_connections"), + ("skillsets", "list_skillsets"), + ("synonym_maps", "list_synonym_maps"), +) + +# Optional reinforcing surfaces (spec 9.3.5-9.3.7): enumerated when supported; +# if non-empty, the service still must skip (spec 9.3.7). +_OPTIONAL_SURFACES: Tuple[Tuple[str, str], ...] 
= ( + ("aliases", "list_aliases"), + ("knowledge_sources", "list_knowledge_sources"), + ("agents", "list_agents"), +) + +# Required activity metrics (spec 9.5): (metric_name, aggregation_type) +_REQUIRED_METRICS: Tuple[Tuple[str, str], ...] = ( + ("SearchQueriesPerSecond", "Average"), + ("DocumentsProcessedCount", "Total"), + ("SkillExecutionCount", "Total"), +) + +# SKU alias table: stripped-lowercase SDK variant -> canonical _SUPPORTED_SKUS key +_SKU_ALIASES = { + "storageoptimizedl1": "storage_optimized_l1", + "storageoptimizedl2": "storage_optimized_l2", +} + +_SENTINEL = object() + RULE_METADATA = { - "id": "azure.ai_search.idle", + "id": _RULE_ID, "category": "ai", "service": "search", "cost_impact": "high", } -# Metric names for Azure Monitor queries — constants for easy update if Microsoft renames them -_METRIC_QPS = "SearchQueriesPerSecond" -_METRIC_TOTAL = "TotalSearchRequestCount" - -# Monthly cost per 1 replica × 1 partition (USD). Azure Search bills per unit = replica × partition. -_SKU_COSTS = { - "basic": 73.0, - "standard": 261.0, - "standard2": 523.0, - "standard3": 1047.0, - "storage_optimized_l1": 2014.0, - "storage_optimized_l2": 4028.0, -} -_WATCHED_SKUS = { - "standard", - "standard2", - "standard3", - "storage_optimized_l1", - "storage_optimized_l2", -} +class _MetricResult(Enum): + ACTIVE = "ACTIVE" + ZERO = "ZERO" + UNKNOWN = "UNKNOWN" -# Azure SDK returns SKU names in various formats (camelCase, underscores, etc.). -# Map stripped-lowercase variants to the canonical _SKU_COSTS keys. -_SKU_ALIASES = { - "storageoptimizedl1": "storage_optimized_l1", - "storageoptimizedl2": "storage_optimized_l2", -} + +# --------------------------------------------------------------------------- +# Normalization +# --------------------------------------------------------------------------- def _normalize_sku(raw: str) -> str: - """Normalize an Azure Search SKU name to the canonical key used in _SKU_COSTS. 
def _normalize_sku(raw: Optional[str]) -> str:
    """
    Normalize an Azure Search SKU name to the canonical _SUPPORTED_SKUS key.

    Spec 7: lowercase only -- no character stripping beyond case folding.
    Known SDK camelCase variants (e.g. "StorageOptimizedL1") are then resolved
    via _SKU_ALIASES to their canonical underscore form.

    Args:
        raw: SKU name as read from the service record; may be None.

    Returns:
        The lowercased, alias-resolved name. None, empty, or non-string input
        yields "", which never matches a supported tier, so the caller skips
        the service instead of the scan crashing on ``.lower()``.
    """
    if not isinstance(raw, str):
        return ""
    lowered = raw.lower()
    return _SKU_ALIASES.get(lowered, lowered)


def _norm_location(s: Optional[str]) -> str:
    """Lowercase only -- exact lowercase match per spec 7. Non-strings yield ""."""
    return s.lower() if isinstance(s, str) else ""


def _extract_resource_group(resource_id: Optional[str]) -> Optional[str]:
    """
    Extract the resource group name from an Azure ARM resource ID.

    The segment following the first "resourceGroups" path element
    (case-insensitive) is returned.

    Returns:
        The resource group name, or None when the ID is empty, has no
        "resourceGroups" element, or the element is not followed by a
        non-empty segment.
    """
    if not resource_id:
        return None
    segments = resource_id.split("/")
    for idx, segment in enumerate(segments):
        if segment.lower() == "resourcegroups":
            # Only the first occurrence is considered; an absent or empty
            # follower means the ID is malformed -> report "unknown" (None).
            follower = segments[idx + 1 : idx + 2]
            return follower[0] if follower and follower[0] else None
    return None


# ---------------------------------------------------------------------------
# State resolvers (spec 9.1)
# ---------------------------------------------------------------------------


def _resolve_provisioning_state(svc) -> Optional[str]:
    """
    Resolve the provisioning state, SDK attribute first with nested fallback.

    Reads ``svc.provisioning_state`` and, when ``svc.properties`` exists,
    ``properties.provisioning_state`` / ``properties.provisioningState``.

    Returns:
        The resolved value, or None when both sources are absent or when both
        are present and disagree (conflict -> caller skips). Only exact
        "succeeded" is eligible; the caller skips on anything else.
    """
    sdk_val = getattr(svc, "provisioning_state", None)
    props = getattr(svc, "properties", None)
    nested_val = None
    if props is not None:
        nested_val = getattr(props, "provisioning_state", None)
        if nested_val is None:
            nested_val = getattr(props, "provisioningState", None)
    if sdk_val is not None and nested_val is not None and sdk_val != nested_val:
        return None  # conflict -> skip
    return sdk_val or nested_val
def _resolve_status(svc) -> Optional[str]:
    """
    Resolve the service status, SDK attribute first with nested fallback.

    Reads ``svc.status`` and, when ``svc.properties`` exists,
    ``properties.status``.

    Returns:
        The resolved value, or None when both sources are absent or when both
        are present and disagree (conflict -> caller skips). Only exact
        "running" is eligible; the caller skips on anything else.
    """
    sdk_val = getattr(svc, "status", None)
    props = getattr(svc, "properties", None)
    nested_val = None
    if props is not None:
        nested_val = getattr(props, "status", None)
    if sdk_val is not None and nested_val is not None and sdk_val != nested_val:
        return None  # conflict -> skip
    return sdk_val or nested_val


def _resolve_capacity(svc, sdk_attr: str, nested_snake: str, nested_camel: str) -> Optional[int]:
    """
    Resolve replica_count or partition_count, SDK-first with nested fallback.

    Args:
        svc: Service record to read from.
        sdk_attr: Attribute name on the record itself.
        nested_snake: snake_case attribute name on ``svc.properties``.
        nested_camel: camelCase attribute name on ``svc.properties``
            (consulted only when the snake_case one is absent).

    Returns:
        A known positive integer, or None when the value is absent, the two
        sources conflict, or the value is not a genuine positive integer.
        Booleans, fractional floats, NaN and infinities are rejected rather
        than being coerced by ``int()`` (which would turn True into 1 and
        1.9 into 1, and raise OverflowError on inf).
    """
    sdk_val = getattr(svc, sdk_attr, None)
    props = getattr(svc, "properties", None)
    nested_val = None
    if props is not None:
        nested_val = getattr(props, nested_snake, None)
        if nested_val is None:
            nested_val = getattr(props, nested_camel, None)
    if sdk_val is not None and nested_val is not None and sdk_val != nested_val:
        return None  # conflict -> skip

    val = sdk_val if sdk_val is not None else nested_val
    if val is None or isinstance(val, bool):
        return None
    if isinstance(val, float) and not val.is_integer():
        return None  # fractional, NaN or infinite -> not a known integer
    try:
        count = int(val)
    except (TypeError, ValueError, OverflowError):
        return None
    return count if count > 0 else None
def _iso_for_fromisoformat(text: str) -> str:
    """
    Rewrite an ISO-8601 timestamp so every supported Python can parse it.

    ``datetime.fromisoformat`` before Python 3.11 rejects a trailing "Z" and
    any fractional part that is not exactly 3 or 6 digits, while ARM commonly
    returns 7-digit fractions. The suffix is replaced by "+00:00" and the
    fraction is truncated / zero-padded to 6 digits.
    """
    text = text.strip()
    if text[-1:] in ("Z", "z"):
        text = text[:-1] + "+00:00"
    head, dot, tail = text.partition(".")
    if dot:
        n_digits = len(tail) - len(tail.lstrip("0123456789"))
        if n_digits:
            micros = tail[:n_digits][:6].ljust(6, "0")
            text = f"{head}.{micros}{tail[n_digits:]}"
    return text


def _resolve_created_at(svc) -> Optional[datetime]:
    """
    Resolve the creation timestamp from ``svc.system_data.created_at``.

    Accepts a ``datetime`` or an ISO-8601 string. Naive values are assumed to
    be UTC; aware values are converted to UTC.

    Returns:
        A UTC-aware datetime, or None if the value is absent, of an
        unsupported type, unparseable, or in the future.
    """
    system_data = getattr(svc, "system_data", None)
    if system_data is None:
        return None
    raw = getattr(system_data, "created_at", None)
    if raw is None:
        return None

    if isinstance(raw, datetime):
        ts = raw
    elif isinstance(raw, str):
        try:
            ts = datetime.fromisoformat(_iso_for_fromisoformat(raw))
        except ValueError:
            return None
    else:
        return None

    # Normalize so the documented "UTC-aware" contract always holds.
    if ts.tzinfo is None:
        ts = ts.replace(tzinfo=timezone.utc)
    else:
        ts = ts.astimezone(timezone.utc)

    if ts > datetime.now(timezone.utc):
        return None  # future timestamp -> invalid -> skip
    return ts


# ---------------------------------------------------------------------------
# Structural-emptiness contract (spec 9.3)
# ---------------------------------------------------------------------------
def _probe_surface(dp_client, method_name: str) -> Optional[bool]:
    """
    Probe one data-plane list method.

    Returns:
        True  -- the listing completed and yielded nothing (confirmed empty).
        False -- at least one object was yielded (confirmed non-empty).
        None  -- the method is missing, or it failed before yielding anything.

    The probe stops at the first object: one object already proves the
    surface is non-empty, so further pagination is unnecessary. Only the
    existence of an item is observed; item content is never inspected or
    stored (spec 9.4).
    """
    lister = getattr(dp_client, method_name, None)
    if lister is None:
        return None
    try:
        for _ in lister():
            return False
    except Exception:
        # Deliberately broad: any enumeration failure means "not confirmed".
        return None
    return True


def _check_object_surfaces(
    dp_client,
    *,
    required_surfaces=None,
    optional_surfaces=None,
) -> Optional[dict]:
    """
    Enumerate all required and optional object surfaces per spec 9.3.

    Args:
        dp_client: Data-plane client exposing the list methods.
        required_surfaces: Iterable of (surface_key, method_name) pairs that
            must all be confirmed empty. Defaults to _REQUIRED_SURFACES.
        optional_surfaces: Iterable of (surface_key, method_name) pairs that
            are checked when available. Defaults to _OPTIONAL_SURFACES.

    Returns:
        A dict of {surface_key: 0} for every surface confirmed empty, or None
        if any required surface is unavailable, fails, or is non-empty, or if
        any optional surface is observed to be non-empty. An optional surface
        that is missing or fails before yielding anything is omitted from the
        dict (spec 9.3.6); one that yields an object and then fails still
        counts as non-empty.

    Sensitive content (credentials, keys, connection strings) must never be
    captured; only counts are recorded (spec 9.4).
    """
    if required_surfaces is None:
        required_surfaces = _REQUIRED_SURFACES
    if optional_surfaces is None:
        optional_surfaces = _OPTIONAL_SURFACES

    object_counts: dict = {}

    for surface_key, method_name in required_surfaces:
        if _probe_surface(dp_client, method_name) is not True:
            return None  # unavailable, failed, or non-empty -> skip service
        object_counts[surface_key] = 0

    for surface_key, method_name in optional_surfaces:
        state = _probe_surface(dp_client, method_name)
        if state is False:
            return None  # non-empty optional surface -> skip service (spec 9.3.7)
        if state is True:
            object_counts[surface_key] = 0
        # state is None: unsupported or failed -> omit from counts

    return object_counts


# ---------------------------------------------------------------------------
# Activity-metric contract (spec 9.5)
# ---------------------------------------------------------------------------
def _evaluate_metric(
    monitor_client,
    resource_id: str,
    metric_name: str,
    aggregation: str,
    window_start: datetime,
    window_end: datetime,
) -> _MetricResult:
    """
    Evaluate a single Azure Monitor metric over the observation window (spec 9.5).

    The query is issued without an ``interval`` argument so that source-bucket
    granularity is preserved and short-lived activity is not averaged away
    (spec 9.5.2). Activity is judged on every returned datapoint; timestamps
    are reduced to UTC calendar days only to measure coverage (spec 9.5.3).

    Args:
        monitor_client: Client exposing ``metrics.list(...)``.
        resource_id: ARM resource ID of the service.
        metric_name: Azure Monitor metric name.
        aggregation: Aggregation type; its lowercase form is the datapoint
            attribute that is read (e.g. "Average" -> ``average``).
        window_start: Inclusive start of the window.
        window_end: Exclusive end of the window.

    Returns:
        UNKNOWN when the query fails, the response shape is unusable, or fewer
        than _MIN_COVERAGE of the expected UTC days carry a value;
        ACTIVE when coverage suffices and any in-window value is non-zero;
        ZERO when coverage suffices and every in-window value is zero.

    Fail-closed on unusable response shapes (spec 9.5.6):
      - absent or non-datetime timestamp -> UNKNOWN (entire metric)
      - non-numeric, boolean, NaN or infinite value -> UNKNOWN (entire metric)
      - malformed series element -> UNKNOWN (entire metric)

    Datapoints with no populated aggregation value are ignored, which lowers
    coverage and drives the result toward UNKNOWN. Datapoints outside the
    requested window are filtered out (not fail-closed).
    """
    fmt = "%Y-%m-%dT%H:%M:%SZ"
    timespan = f"{window_start.strftime(fmt)}/{window_end.strftime(fmt)}"

    first_bucket = window_start.replace(hour=0, minute=0, second=0, microsecond=0)
    expected_buckets = math.ceil((window_end - first_bucket).total_seconds() / 86400)
    if expected_buckets <= 0:
        return _MetricResult.UNKNOWN  # empty or inverted window

    try:
        response = monitor_client.metrics.list(
            resource_id,
            metricnames=metric_name,
            timespan=timespan,
            # No interval= parameter: keeps source-bucket granularity (spec 9.5.2).
            aggregation=aggregation,
        )
    except Exception:
        # Deliberately broad: any retrieval failure leaves the metric unresolved.
        return _MetricResult.UNKNOWN

    metrics = getattr(response, "value", None)
    if metrics is None:
        return _MetricResult.UNKNOWN

    agg_attr = aggregation.lower()  # "average" or "total"

    observed_days = set()  # UTC calendar days carrying at least one value
    activity_seen = False  # any non-zero in-window value, across all series

    try:
        for metric in metrics:
            for series in getattr(metric, "timeseries", None) or []:
                for point in getattr(series, "data", None) or []:
                    stamp = point.timestamp
                    if not isinstance(stamp, datetime):
                        return _MetricResult.UNKNOWN  # absent/unparseable -> fail-closed

                    val = getattr(point, agg_attr, None)
                    if val is None:
                        continue  # sparse datapoint -> reduces coverage only
                    if (
                        isinstance(val, bool)
                        or not isinstance(val, (int, float))
                        or not math.isfinite(val)
                    ):
                        # NaN would otherwise compare as "not > 0" and read as idle.
                        return _MetricResult.UNKNOWN

                    stamp_utc = (
                        stamp if stamp.tzinfo is not None else stamp.replace(tzinfo=timezone.utc)
                    )
                    if not (window_start <= stamp_utc < window_end):
                        continue  # outside window -> filter

                    observed_days.add(stamp_utc.astimezone(timezone.utc).date())
                    if val != 0:
                        # Tested per datapoint so opposite-signed values cannot cancel.
                        activity_seen = True
    except (AttributeError, TypeError, ValueError, OverflowError):
        return _MetricResult.UNKNOWN  # malformed response shape -> fail-closed

    if not observed_days:
        return _MetricResult.UNKNOWN
    if len(observed_days) / expected_buckets < _MIN_COVERAGE:
        return _MetricResult.UNKNOWN

    return _MetricResult.ACTIVE if activity_seen else _MetricResult.ZERO


# ---------------------------------------------------------------------------
# Default data-plane factory
# ---------------------------------------------------------------------------
def _make_default_data_plane_factory(credential) -> Callable[[str], Optional[object]]:
    """
    Return a factory that builds Azure Search data-plane clients from a credential.

    Requires the azure-search-documents package. If it is unavailable or client
    construction fails, the factory returns None for that endpoint and the
    caller skips the service (spec 9.3.4).

    The credential is passed straight to the SDK clients; admin keys must not
    be retrieved (spec 6.2).
    """

    def factory(endpoint: str) -> Optional[object]:
        try:
            # Imported lazily so the rule module loads without the optional package.
            from azure.search.documents.indexes import (  # noqa: PLC0415
                SearchIndexClient,
                SearchIndexerClient,
            )

            return _DataPlaneClients(
                SearchIndexClient(endpoint, credential),
                SearchIndexerClient(endpoint, credential),
            )
        except Exception:
            # Best-effort by design: ImportError and construction errors alike
            # mean "data plane unavailable".
            return None

    return factory


class _DataPlaneClients:
    """
    Thin adapter over SearchIndexClient and SearchIndexerClient.

    Exposes only name-listing methods to keep enumeration lightweight and
    avoid capturing sensitive object definitions (spec 9.4).

    azure-search-documents names these ``list_index_names`` on the index
    client but ``get_*_names`` for synonym maps, indexers, data source
    connections and skillsets. The ``list_*_names`` spellings are kept as
    fallbacks for injected clients that expose them.
    """

    def __init__(self, index_client, indexer_client):
        self._ic = index_client
        self._ixer = indexer_client

    @staticmethod
    def _call_first(client, candidates):
        """Call the first method in *candidates* that *client* exposes."""
        for method_name in candidates:
            method = getattr(client, method_name, None)
            if callable(method):
                return method()
        raise AttributeError(
            f"{type(client).__name__} exposes none of: {', '.join(candidates)}"
        )

    def list_indexes(self):
        return self._call_first(self._ic, ("list_index_names",))

    def list_synonym_maps(self):
        return self._call_first(self._ic, ("get_synonym_map_names", "list_synonym_map_names"))

    def list_indexers(self):
        return self._call_first(self._ixer, ("get_indexer_names", "list_indexer_names"))

    def list_data_source_connections(self):
        return self._call_first(
            self._ixer,
            ("get_data_source_connection_names", "list_data_source_connection_names"),
        )

    def list_skillsets(self):
        return self._call_first(self._ixer, ("get_skillset_names", "list_skillset_names"))
+ + IAM permissions: + - Microsoft.Search/searchServices/read + - Microsoft.Insights/metrics/read + - Azure AI Search data-plane RBAC (Search Service Contributor or equivalent) + + data_plane_factory: callable(endpoint: str) -> data-plane client or None. + If the callable returns None for an endpoint, that service is skipped. + The returned client must expose at minimum: + list_indexes(), list_indexers(), list_data_source_connections(), + list_skillsets(), list_synonym_maps(). + Optionally: list_aliases(), list_knowledge_sources(), list_agents(). + Defaults to _make_default_data_plane_factory(credential) which requires + the azure-search-documents package. + """ findings: List[Finding] = [] - now = datetime.now(timezone.utc) - # Instantiate Azure SDK clients (top-level imports ensure CI fails fast if SDKs missing) search_client = client or SearchManagementClient( credential=credential, subscription_id=subscription_id ) @@ -79,258 +498,169 @@ def find_idle_ai_search_services( credential=credential, subscription_id=subscription_id ) - def _norm(s: str) -> str: - return "".join(c for c in (s or "").lower() if c.isalnum()) + if data_plane_factory is None: + data_plane_factory = _make_default_data_plane_factory(credential) - try: - for svc in search_client.services.list_by_subscription(): - sku = getattr(svc, "sku", None) - sku_raw = getattr(sku, "name", None) - if not sku_raw and isinstance(sku, dict): - sku_raw = sku.get("name") - sku_name = _normalize_sku(sku_raw) - if sku_name not in _WATCHED_SKUS: + now = datetime.now(timezone.utc) + window_start = now - timedelta(days=_IDLE_WINDOW_DAYS) + + # Subscription-wide service inventory (spec 12: propagate if this fails) + for svc in search_client.services.list_by_subscription(): + # spec 8.1: id guard + svc_id = getattr(svc, "id", None) + if not svc_id: + continue + + # spec 8.2: name guard + svc_name = getattr(svc, "name", None) + if not svc_name: + continue + + # Per-service: resolve state, enumerate objects, evaluate 
metrics. + # Expected SDK retrieval failures -> skip this service (spec 12). + # HttpResponseError: HTTP-level failure (404, 403, 429, 5xx). + # ServiceRequestError: transport failure before a response. + # ServiceResponseError: transport failure while reading the response. + try: + # spec 8.3: region filter -- exact lowercase match + location = _norm_location(getattr(svc, "location", "") or "") + if region_filter and location != _norm_location(region_filter): continue - location_raw = getattr(svc, "location", "") or "" - if region_filter and _norm(location_raw) != _norm(region_filter): + # spec 8.4 / 9.1: provisioning_state must resolve to exactly "succeeded" + if _resolve_provisioning_state(svc) != "succeeded": continue - # Replica and partition counts - props = getattr(svc, "properties", None) - replica_count = ( - getattr(svc, "replica_count", None) - or getattr(props, "replica_count", None) - or getattr(props, "replicaCount", None) - or 1 - ) - partition_count = ( - getattr(svc, "partition_count", None) - or getattr(props, "partition_count", None) - or getattr(props, "partitionCount", None) - or 1 - ) - - # Age - age_days: Optional[int] = None - created_at = getattr(svc, "system_data", None) and getattr( - svc.system_data, "created_at", None - ) - if created_at is not None: - if created_at.tzinfo is None: - created_at = created_at.replace(tzinfo=timezone.utc) - age_days = max((now - created_at).days, 0) - if age_days < max(idle_days // 2, 3): - continue - - effective_window = min(idle_days, age_days) if age_days is not None else idle_days - if effective_window < 3: + # spec 8.5 / 9.1: status must resolve to exactly "running" + if _resolve_status(svc) != "running": continue - if not svc.id: + # spec 8.6 / 9.2: SKU must be a supported dedicated billable tier + sku_obj = getattr(svc, "sku", None) + sku_raw = getattr(sku_obj, "name", None) if sku_obj else None + if isinstance(sku_obj, dict): + sku_raw = sku_obj.get("name") + sku_name = _normalize_sku(sku_raw) + if 
sku_name not in _SUPPORTED_SKUS: continue - # Metric check over effective_window days. - # Age gates (idle_days // 2 minimum, 75% for MEDIUM, 100% for HIGH) are heuristics: - # they balance catching genuinely idle services early while avoiding false positives - # on recently-deployed services that haven't had time to build query history. - idle_signal = _check_search_queries(mon_client, svc.id, effective_window) - if idle_signal is None or idle_signal[0] == "active": + # spec 8.7: created_at must be present, valid, and service age >= 90 days + created_at = _resolve_created_at(svc) + if created_at is None: + continue # absent or invalid -> skip + age_days = (now - created_at).days + if age_days < _MIN_AGE_DAYS: continue - signal_scope, idle_metric, average_value = idle_signal - - if signal_scope == "no_data": - if age_days is not None and age_days >= idle_days * 2: - confidence = ConfidenceLevel.LOW - signal_scope = "age_only" - else: - continue - elif signal_scope == "metric_zero" and age_days is not None and age_days >= idle_days: - confidence = ConfidenceLevel.HIGH - elif ( - signal_scope == "metric_zero" - and age_days is not None - and age_days >= math.ceil(idle_days * 0.75) - ): - confidence = ConfidenceLevel.MEDIUM - elif signal_scope == "metric_zero" and age_days is None: - confidence = ConfidenceLevel.MEDIUM - else: + # spec 8.8: replica_count and partition_count must be known positive integers + replica_count = _resolve_capacity(svc, "replica_count", "replica_count", "replicaCount") + if replica_count is None: continue - base = _SKU_COSTS.get(sku_name, None) - if base is None: + partition_count = _resolve_capacity( + svc, "partition_count", "partition_count", "partitionCount" + ) + if partition_count is None: continue - replicas = int(replica_count or 1) - partitions = int(partition_count or 1) - est_cost = base * replicas * partitions + resource_group = _extract_resource_group(svc_id) - if est_cost < 100: - continue + # spec 8.9-8.10 / 9.3: data-plane 
structural emptiness + endpoint = f"https://{svc_name}.search.windows.net" + dp_client = data_plane_factory(endpoint) + if dp_client is None: + continue # data-plane unavailable -> skip (spec 9.3.4) + + object_counts = _check_object_surfaces(dp_client) + if object_counts is None: + continue # required surface failed or non-empty -> skip + + # spec 8.11-8.12 / 9.5: all three required activity metrics must be ZERO + metric_outcomes: dict = {} + all_zero = True + for metric_name, aggregation in _REQUIRED_METRICS: + result = _evaluate_metric( + mon_client, svc_id, metric_name, aggregation, window_start, now + ) + metric_outcomes[metric_name] = result + if result != _MetricResult.ZERO: + all_zero = False + break - if est_cost >= 3000: - risk = RiskLevel.CRITICAL - elif est_cost >= 1000: - risk = RiskLevel.HIGH - else: - risk = RiskLevel.MEDIUM + if not all_zero: + continue - signals = [ + # --- Context fields (best-effort; never gate emission) --- + hosting_mode = getattr(svc, "hosting_mode", None) + tags = getattr(svc, "tags", None) or {} # never None in output + + # --- EMIT --- + signals_used = [ + "Provisioning state is 'succeeded'", + "Service status is 'running'", + f"Supported dedicated billable SKU confirmed: '{sku_name}'", + f"Service age is {age_days} days (>= {_MIN_AGE_DAYS} days)", + f"replica_count={replica_count}, partition_count={partition_count} confirmed", + ( + f"All required object surfaces confirmed empty with full pagination exhaustion " + f"({', '.join(k for k, _ in _REQUIRED_SURFACES)})" + ), ( - f"No search traffic detected for {effective_window} days (metric: {idle_metric})" - if signal_scope != "age_only" - else f"No metric data available; service age {age_days} days >= {idle_days * 2} days" + f"All required activity metrics resolved to zero over {_IDLE_WINDOW_DAYS} days " + f"with >= {int(_MIN_COVERAGE * 100)}% daily-bucket coverage " + f"({', '.join(m for m, _ in _REQUIRED_METRICS)})" ), - f"SKU: {sku_name}, replicas: {replicas}, partitions: 
{partitions}", ] - evidence = Evidence( - signals_used=signals, - signals_not_checked=[ - "Indexing-only services with no search queries", - "Services used as failover", - "Scheduled batch re-indexing", - ], - time_window=f"{effective_window} days", - ) - - rg = None - if svc.id: - _parts = svc.id.split("/") - _rg_idx = next( - (i for i, p in enumerate(_parts) if p.lower() == "resourcegroups"), - None, - ) - rg = ( - _parts[_rg_idx + 1] - if _rg_idx is not None and _rg_idx + 1 < len(_parts) - else None - ) - - details = { - "service_name": svc.name, - "resource_group": rg, - "sku": sku_name, - "location": location_raw, - "replica_count": replicas, - "partition_count": partitions, - "age_days": age_days, - "idle_days_threshold": idle_days, - "idle_signal": signal_scope, - "idle_metric": idle_metric or "none", - "estimated_monthly_cost": est_cost, - "cost_source": "heuristic_sku_table" if base is not None else "unknown", - } - - title = f"Idle Azure AI Search Service: {svc.name}" - summary = f"Azure AI Search service '{svc.name}' ({sku_name}) has near-zero search traffic for {effective_window}+ days and continues to incur monthly charges." 
- reason = signals[0] - findings.append( Finding( provider="azure", - rule_id="azure.ai_search.idle", - resource_type="azure.ai.search_service", - resource_id=svc.id, - region=location_raw, - title=title, - summary=summary, - reason=reason, - risk=risk, - confidence=confidence, + rule_id=_RULE_ID, + resource_type=_RESOURCE_TYPE, + resource_id=svc_id, + region=location, + estimated_monthly_cost_usd=None, # spec 10: always None + title=f"Idle Azure AI Search Service: {svc_name}", + summary=( + f"Azure AI Search service '{svc_name}' ({sku_name}) is structurally " + f"empty and has no documented activity over {_IDLE_WINDOW_DAYS} days" + ), + reason=( + f"No configured search objects and no query, indexing, or skill activity " + f"over {_IDLE_WINDOW_DAYS} days; dedicated service continues to incur cost" + ), + risk=RiskLevel.MEDIUM, # spec 11.1: always MEDIUM + confidence=ConfidenceLevel.HIGH, # spec 11.1: always HIGH detected_at=now, - evidence=evidence, - details=details, - estimated_monthly_cost_usd=est_cost, + evidence=Evidence( + signals_used=signals_used, + signals_not_checked=[ + "Future go-live or migration intent", + "Business-owner intent not visible in Azure control plane", + "Premium-feature billing not inferable from baseline surfaces", + ], + time_window=f"{_IDLE_WINDOW_DAYS} days", + ), + details={ + "service_name": svc_name, + "resource_group": resource_group, + "subscription_id": subscription_id, + "sku_name": sku_name, + "replica_count": replica_count, + "partition_count": partition_count, + "hosting_mode": hosting_mode, + "status": "running", + "provisioning_state": "succeeded", + "created_at": created_at.isoformat(), + "idle_window_days": _IDLE_WINDOW_DAYS, + "object_counts": object_counts, + "metrics_used": [m for m, _ in _REQUIRED_METRICS], + "tags": tags, + }, ) ) - except Exception as e: - msg = str(e) - if "AuthorizationFailed" in msg or "Forbidden" in msg or "403" in msg: - raise PermissionError( - "Missing required permissions: 
Microsoft.Search/searchServices/read, Microsoft.Insights/metrics/read" - ) from e - raise + except (HttpResponseError, ServiceRequestError, ServiceResponseError): + continue # per-service retrieval failure -> skip (spec 12) return findings - - -def _check_search_queries(monitor_client: Any, resource_id: str, days: int) -> Optional[tuple]: - now = datetime.now(timezone.utc) - start = now - timedelta(days=max(days, 1)) - fmt = "%Y-%m-%dT%H:%M:%SZ" - timespan = f"{start.strftime(fmt)}/{now.strftime(fmt)}" - - had_successful = False - - # Try SearchQueriesPerSecond (Average) first - try: - response = monitor_client.metrics.list( - resource_id, - metricnames=_METRIC_QPS, - timespan=timespan, - interval="PT24H", - aggregation="Average", - ) - had_successful = True - seen_datapoints = 0 - for metric in getattr(response, "value", []): - for ts in getattr(metric, "timeseries", []): - vals = [ - p.average - for p in getattr(ts, "data", []) - if getattr(p, "average", None) is not None - ] - if vals: - seen_datapoints += len(vals) - avg = sum(vals) / len(vals) - if avg > 0: - return ("active", _METRIC_QPS, avg) - if seen_datapoints > 0: - return ("metric_zero", _METRIC_QPS, 0) - except PermissionError: - raise - except Exception as e: - msg = str(e) - if "AuthorizationFailed" in msg or "Forbidden" in msg or "403" in msg: - raise PermissionError( - "Missing required permissions: Microsoft.Insights/metrics/read" - ) from e - - # Fallback: TotalSearchRequestCount (Total) - try: - response = monitor_client.metrics.list( - resource_id, - metricnames=_METRIC_TOTAL, - timespan=timespan, - interval="PT24H", - aggregation="Total", - ) - had_successful = True - seen_datapoints = 0 - for metric in getattr(response, "value", []): - for ts in getattr(metric, "timeseries", []): - vals = [ - p.total - for p in getattr(ts, "data", []) - if getattr(p, "total", None) is not None - ] - if vals: - seen_datapoints += len(vals) - total = sum(vals) - if total > 0: - return ("active", _METRIC_TOTAL, 
total) - if seen_datapoints > 0: - return ("metric_zero", _METRIC_TOTAL, 0) - # Both metrics called and neither returned datapoints - if had_successful and seen_datapoints == 0: - return ("no_data", None, None) - except PermissionError: - raise - except Exception: - pass - - return None diff --git a/cleancloud/providers/azure/rules/ai/aml_compute_idle.py b/cleancloud/providers/azure/rules/ai/aml_compute_idle.py index ada9a8a..ce311be 100644 --- a/cleancloud/providers/azure/rules/ai/aml_compute_idle.py +++ b/cleancloud/providers/azure/rules/ai/aml_compute_idle.py @@ -1,7 +1,51 @@ +""" +Rule: azure.aml.compute.idle + +Intent: + Detect managed Azure Machine Learning compute clusters (AmlCompute) that retain + billable baseline capacity while showing no observed per-cluster job activity over + a fixed 14-day observation window. + + This rule is deliberately precision-first. It requires BOTH confirmed positive + baseline node allocation (min_node_count > 0 with current nodes allocated) AND + confirmed zero per-cluster activity (Active Nodes metric at zero for the cluster) + before emitting. It is a conservative review-candidate rule only and does not + prove that deleting the cluster is safe. 
+ +Exclusions: + - id absent or empty + - name absent or empty + - workspace.name absent or empty + - outside optional region filter (compute resource location, exact lowercase match; + spaces and hyphens preserved) + - compute_type does not resolve to exactly "AmlCompute" (SDK+nested, conflict -> skip) + - provisioning_state does not resolve to exactly "Succeeded" (SDK+nested, conflict -> skip) + - allocation_state does not resolve to exactly "Steady" (SDK+nested, conflict -> skip) + - created_at absent, invalid, in the future, or cluster age < 14 days + - min_node_count <= 0 or unresolvable + - current_node_count negative, unresolvable, or < min_node_count + - Active Nodes metric cannot be resolved reliably for the target cluster + (no ClusterName-scoped series, < 95% daily-bucket coverage, unusable shape) + - Active Nodes metric is non-zero over the 14-day window + - per-compute record resolution or metric retrieval fails + - per-workspace compute listing fails (skip that workspace) + +Cost model (spec 10): + estimated_monthly_cost_usd = None (always) + Risk = MEDIUM (always) + Confidence = HIGH (always, when all conditions met) + +APIs: + - Microsoft.MachineLearningServices/workspaces/read + - Microsoft.MachineLearningServices/workspaces/computes/read + - Microsoft.Insights/metrics/read +""" + from datetime import datetime, timedelta, timezone -from typing import Any, List, Optional +from enum import Enum +from typing import List, Optional -# Azure SDK (top-level imports for CI fail-fast) +from azure.core.exceptions import HttpResponseError, ServiceRequestError, ServiceResponseError from azure.mgmt.machinelearningservices import AzureMachineLearningWorkspaces from azure.mgmt.monitor import MonitorManagementClient @@ -10,44 +54,347 @@ from cleancloud.core.finding import Finding from cleancloud.core.risk import RiskLevel +_RULE_ID = "azure.aml.compute.idle" +_RESOURCE_TYPE = "azure.aml.compute" +_IDLE_WINDOW_DAYS = 14 # fixed per spec 6.3 +_MIN_AGE_DAYS = 14 # 
spec 8.8 +_MIN_COVERAGE = 0.95 # spec 9.3 + RULE_METADATA = { - "id": "azure.aml.compute.idle", + "id": _RULE_ID, "category": "ai", "service": "machinelearning", "cost_impact": "high", } -# GPU VM size prefixes — significantly more expensive than CPU -_GPU_VM_PREFIXES = ("Standard_NC", "Standard_ND", "Standard_NV") - -# Approximate monthly cost per node (on-demand, East US, 730 h/month) -# Cost for min_node_count nodes that run continuously regardless of job activity -_MONTHLY_COST_PER_NODE = { - "Standard_D2_v2": 130.0, - "Standard_D4_v2": 259.0, - "Standard_D8_v2": 518.0, - "Standard_D2s_v3": 96.0, - "Standard_D4s_v3": 192.0, - "Standard_D8s_v3": 384.0, - "Standard_NC6": 648.0, - "Standard_NC12": 1_296.0, - "Standard_NC24": 2_592.0, - "Standard_NC6s_v3": 2_203.0, - "Standard_NC12s_v3": 4_406.0, - "Standard_NC24s_v3": 8_812.0, - "Standard_ND6s": 2_203.0, - "Standard_ND12s": 4_406.0, - "Standard_ND24s": 8_812.0, - "Standard_ND40rs_v2": 15_862.0, - "Standard_NV6": 1_094.0, - "Standard_NV12": 2_189.0, - "Standard_NV24": 4_378.0, -} -_DEFAULT_MONTHLY_COST_PER_NODE = 200.0 -# Metric names to try in order — Azure ML metrics have drifted across API versions -# and regions. Try all known names before giving up. 
class _MetricResult(Enum):
    """Three-way outcome of the Active Nodes metric evaluation."""

    # At least one accepted bucket had a maximum greater than zero.
    ACTIVE = "ACTIVE"
    # Coverage was sufficient and no accepted bucket exceeded zero.
    ZERO = "ZERO"
    # The metric could not be resolved reliably; callers must not emit.
    UNKNOWN = "UNKNOWN"


# ---------------------------------------------------------------------------
# Normalization
# ---------------------------------------------------------------------------


def _norm_location(s: str) -> str:
    """Lowercase only — exact lowercase match per spec 7.

    Spaces and hyphens are preserved; a falsy input yields "".
    """
    return s.lower() if s else ""


def _extract_resource_group(resource_id: Optional[str]) -> Optional[str]:
    """Extract the resource group name from an Azure ARM resource ID.

    The "resourceGroups" path segment is matched case-insensitively and the
    segment that follows it is returned verbatim. Returns None when the ID is
    empty, has no such segment, or the segment is the last one.
    """
    if not resource_id:
        return None
    parts = resource_id.split("/")
    try:
        idx = next(i for i, p in enumerate(parts) if p.lower() == "resourcegroups")
        return parts[idx + 1]
    except (StopIteration, IndexError):
        return None


# ---------------------------------------------------------------------------
# State resolvers (spec 9.1)
# ---------------------------------------------------------------------------


def _resolve_str_field(obj, snake: str, camel: str) -> Optional[str]:
    """
    Resolve a string field from SDK snake_case then raw camelCase.

    Returns None when *obj* is None, when both attributes are present but
    disagree (conflict -> skip), when neither is present, or when the
    resolved value is not a str.
    """
    if obj is None:
        return None
    sdk_val = getattr(obj, snake, None)
    raw_val = getattr(obj, camel, None)
    if sdk_val is not None and raw_val is not None and sdk_val != raw_val:
        return None  # conflict -> skip
    val = sdk_val if sdk_val is not None else raw_val
    return val if isinstance(val, str) else None


def _resolve_int_field(obj, snake: str, camel: str) -> Optional[int]:
    """
    Resolve an integer field from SDK snake_case then raw camelCase.

    Tries snake first; falls back to camel. Returns the value coerced with
    int(), or None when absent or not coercible.
    Range checks (>0, >=0) are the caller's responsibility.
    """
    if obj is None:
        return None
    val = getattr(obj, snake, None)
    if val is None:
        val = getattr(obj, camel, None)
    if val is None:
        return None
    try:
        return int(val)
    except (TypeError, ValueError):
        return None


def _to_detail_str(v) -> Optional[str]:
    """Serialize any SDK value to a JSON-safe string for finding details.

    None is passed through unchanged; everything else goes through str().
    """
    return str(v) if v is not None else None


def _resolve_compute_type(compute) -> Optional[str]:
    """
    Resolve compute_type from compute.properties (SDK+nested, spec 9.1).
    Only "AmlCompute" is eligible; conflict or absent -> None.
    """
    outer = getattr(compute, "properties", None)
    return _resolve_str_field(outer, "compute_type", "computeType")


def _resolve_provisioning_state(compute) -> Optional[str]:
    """
    Resolve provisioning_state from compute.properties (SDK+nested, spec 9.1).
    Only "Succeeded" is eligible; conflict or absent -> None.
    """
    outer = getattr(compute, "properties", None)
    return _resolve_str_field(outer, "provisioning_state", "provisioningState")


def _resolve_allocation_state(compute) -> Optional[str]:
    """
    Resolve allocation_state from compute.properties.properties (SDK+nested, spec 9.1).
    Only "Steady" is eligible; conflict or absent -> None.
    """
    outer = getattr(compute, "properties", None)
    inner = getattr(outer, "properties", None) if outer is not None else None
    return _resolve_str_field(inner, "allocation_state", "allocationState")


def _normalize_iso_timestamp(text: str) -> str:
    """Rewrite an ISO-8601 string into a form every datetime.fromisoformat accepts.

    - A trailing "Z"/"z" designator becomes an explicit "+00:00" offset.
    - Fractional seconds are truncated or zero-padded to exactly six digits,
      because older fromisoformat implementations only accept 3 or 6 digits.

    The result is not validated here; fromisoformat still raises ValueError
    on anything that is not a timestamp.
    """
    text = text.strip()
    if text[-1:] in ("Z", "z"):
        text = text[:-1] + "+00:00"
    head, dot, tail = text.partition(".")
    if dot:
        n_digits = 0
        while n_digits < len(tail) and tail[n_digits].isdigit():
            n_digits += 1
        if n_digits:
            fraction = tail[:n_digits][:6].ljust(6, "0")
            text = f"{head}.{fraction}{tail[n_digits:]}"
    return text


def _resolve_created_at(compute) -> Optional[datetime]:
    """
    Resolve creation timestamp from compute.properties.created_on (spec 7).

    Tries SDK snake_case first, then raw camelCase ("createdOn"). Accepts a
    datetime or an ISO-8601 string; naive values are taken to be UTC and
    aware values are converted to UTC, so the result is always UTC-aware.

    Returns None if the value is absent, of another type, unparsable, or in
    the future.
    """
    outer = getattr(compute, "properties", None)
    if outer is None:
        return None
    raw = getattr(outer, "created_on", None)
    if raw is None:
        raw = getattr(outer, "createdOn", None)
    if raw is None:
        return None

    if isinstance(raw, datetime):
        ts = raw
    elif isinstance(raw, str):
        try:
            ts = datetime.fromisoformat(_normalize_iso_timestamp(raw))
        except ValueError:
            return None
    else:
        return None

    try:
        if ts.tzinfo is None:
            ts = ts.replace(tzinfo=timezone.utc)
        else:
            # Normalise so downstream isoformat() output is always UTC.
            ts = ts.astimezone(timezone.utc)
    except (OverflowError, ValueError):
        return None  # out-of-range after offset conversion -> invalid -> skip

    if ts > datetime.now(timezone.utc):
        return None  # future timestamp -> invalid -> skip
    return ts


def _resolve_min_node_count(compute) -> Optional[int]:
    """
    Resolve min_node_count from scale_settings (spec 9.2).

    Tries SDK snake_case first, then raw camelCase for both the scale_settings
    container and the min_node_count field itself. Returns a known positive
    integer, or None (absent, zero, negative, invalid).
    """
    outer = getattr(compute, "properties", None)
    inner = getattr(outer, "properties", None) if outer is not None else None
    if inner is None:
        return None
    # scale_settings container: SDK snake_case or raw camelCase
    scale = getattr(inner, "scale_settings", None)
    if scale is None:
        scale = getattr(inner, "scaleSettings", None)
    if scale is None:
        return None
    n = _resolve_int_field(scale, "min_node_count", "minNodeCount")
    return n if n is not None and n > 0 else None


def _resolve_current_node_count(compute) -> Optional[int]:
    """
    Resolve current_node_count from AmlComputeProperties (spec 9.2).

    Tries SDK snake_case first, then raw camelCase. Returns non-negative
    integer, or None (absent, negative, invalid).
    """
    outer = getattr(compute, "properties", None)
    inner = getattr(outer, "properties", None) if outer is not None else None
    if inner is None:
        return None
    n = _resolve_int_field(inner, "current_node_count", "currentNodeCount")
    return n if n is not None and n >= 0 else None


# ---------------------------------------------------------------------------
# Activity-metric contract (spec 9.3)
# ---------------------------------------------------------------------------


def _series_is_cluster_scoped(ts, compute_name: str) -> bool:
    """
    Return True only when timeseries metadata confirms ClusterName == compute_name.

    Spec 9.3.2 requires per-cluster scoping via the documented ClusterName
    dimension. Spec 9.3.3 prohibits workspace-level fallback to prove idleness.
    A series without verified ClusterName metadata cannot be trusted as
    cluster-specific and must be skipped (spec 9.3.7 "no valid series").

    The dimension key is matched case-insensitively ("ClusterName" / "clusterName");
    the dimension value is matched exactly (same case as the compute name).

    Two metadata shapes are tolerated:
      - mv.name.value  (LocalizableString — standard SDK object)
      - mv.name        (plain str — surfaced by some SDK versions / REST responses)
    """
    metadata_values = getattr(ts, "metadata_values", None) or []
    try:
        for mv in metadata_values:
            # Dimension key: try LocalizableString shape first, then plain string fallback.
            name_obj = getattr(mv, "name", None)
            dim_name = getattr(name_obj, "value", None)
            if not isinstance(dim_name, str):
                dim_name = name_obj if isinstance(name_obj, str) else None
            # Dimension value
            dim_value = getattr(mv, "value", None)
            if (
                isinstance(dim_name, str)
                and dim_name.lower() == "clustername"
                and isinstance(dim_value, str)
                and dim_value == compute_name
            ):
                return True
    except (AttributeError, TypeError):
        # Non-iterable or otherwise malformed metadata: treat as unverified.
        pass
    return False


def _evaluate_metric(
    monitor_client,
    workspace_id: str,
    compute_name: str,
    window_start: datetime,
    window_end: datetime,
) -> _MetricResult:
    """
    Evaluate the Active Nodes metric for the target cluster per spec 9.3.

    Queries with ClusterName dimension filter (spec 9.3.2). No unfiltered workspace-
    level fallback permitted (spec 9.3.3). No fixed interval so Azure Monitor auto-
    selects the granularity; activity is evaluated at each returned source bucket
    before any UTC-day normalization (spec 9.3.4). Coverage of at least
    _MIN_COVERAGE (module constant) of the expected UTC-day buckets is required.

    Each returned timeseries is verified against the ClusterName dimension
    metadata before any datapoints are consumed. Series without confirmed
    ClusterName == compute_name metadata are skipped entirely (spec 9.3.2,
    9.3.3). If no series can be verified as cluster-specific, UNKNOWN is
    returned (spec 9.3.7 "no valid series").

    Coverage is evaluated over fully-elapsed UTC day buckets only. Both expected_buckets
    and the datapoint acceptance window are capped symmetrically at midnight(window_end).
    This excludes the current partial UTC day from both sides so it cannot overstate
    coverage and cannot mask a missing complete past day.

    Fail-closed on unusable response shapes (spec 9.3.7):
      - absent or non-datetime timestamp       -> UNKNOWN (entire metric)
      - non-numeric or non-finite (NaN/inf)
        Maximum value                          -> UNKNOWN (entire metric)
      - malformed series element               -> UNKNOWN (entire metric)
      - any exception from the metrics call    -> UNKNOWN

    Datapoints with no Maximum value reduce bucket coverage toward the threshold,
    driving toward UNKNOWN via the coverage check (not fail-close).

    Returns:
        ACTIVE if any accepted bucket maximum is > 0, ZERO if coverage is
        sufficient and none is, UNKNOWN otherwise.
    """
    import math  # function-scope: only needed for the finite-value guard below

    fmt = "%Y-%m-%dT%H:%M:%SZ"
    timespan = f"{window_start.strftime(fmt)}/{window_end.strftime(fmt)}"

    first_bucket = window_start.replace(hour=0, minute=0, second=0, microsecond=0)
    # Both expected_buckets and the datapoint acceptance window are capped at
    # last_complete_midnight so they are consistent. The current partial UTC day
    # (window_end = now, mid-day) is excluded from both sides: including it in
    # expected_buckets would cause spurious UNKNOWN (Azure Monitor may not have
    # emitted today's datapoint yet); including it in observed but not expected
    # would let today's partial bucket mask a missing prior day, allowing a false
    # emit on a rule that must be fail-closed.
    last_complete_midnight = window_end.replace(hour=0, minute=0, second=0, microsecond=0)
    expected_buckets = int((last_complete_midnight - first_bucket).total_seconds() // 86400)
    if expected_buckets <= 0:
        return _MetricResult.UNKNOWN  # empty or inverted window: nothing to cover

    try:
        response = monitor_client.metrics.list(
            workspace_id,
            metricnames="Active Nodes",
            timespan=timespan,
            # No interval= parameter: Azure Monitor auto-selects the
            # granularity. This preserves source-bucket granularity so short-lived
            # activity is not diluted away (spec 9.3.4).
            aggregation="Maximum",
            filter=f"ClusterName eq '{compute_name}'",
        )
    except Exception:
        # Deliberately broad: any retrieval failure means "cannot prove idle".
        return _MetricResult.UNKNOWN

    if not hasattr(response, "value") or response.value is None:
        return _MetricResult.UNKNOWN

    # Per-bucket maximum across all returned timeseries for the target cluster.
    # Coverage is tracked per UTC day bucket (spec 9.3 definitions).
    bucket_max: dict = {}

    try:
        for metric in response.value:
            for ts in getattr(metric, "timeseries", None) or []:
                if not _series_is_cluster_scoped(ts, compute_name):
                    continue  # not verified as cluster-specific; skip per spec 9.3.2/9.3.3
                for data in getattr(ts, "data", None) or []:
                    ts_dt = data.timestamp
                    if ts_dt is None or not isinstance(ts_dt, datetime):
                        return _MetricResult.UNKNOWN  # unparseable timestamp -> fail-closed

                    val = getattr(data, "maximum", None)
                    if val is None:
                        continue  # sparse/missing -> reduces coverage, not fail-close
                    if not isinstance(val, (int, float)):
                        return _MetricResult.UNKNOWN  # non-numeric -> fail-closed
                    if isinstance(val, float) and not math.isfinite(val):
                        # NaN compares False against everything, so it would
                        # otherwise be read as "no activity".
                        return _MetricResult.UNKNOWN

                    # Naive timestamps are taken as UTC; aware ones are converted
                    # so the day key below is always a UTC calendar day.
                    ts_utc = (
                        ts_dt.astimezone(timezone.utc)
                        if ts_dt.tzinfo is not None
                        else ts_dt.replace(tzinfo=timezone.utc)
                    )
                    if not (window_start <= ts_utc < last_complete_midnight):
                        continue  # outside eligible bucket range; today's partial day excluded

                    key = ts_utc.strftime("%Y-%m-%dT00:00:00Z")
                    existing = bucket_max.get(key)
                    bucket_max[key] = max(existing, val) if existing is not None else val
    except (AttributeError, TypeError, ValueError, OverflowError):
        return _MetricResult.UNKNOWN  # malformed response shape -> fail-closed

    observed = len(bucket_max)
    if observed == 0:
        return _MetricResult.UNKNOWN
    if observed / expected_buckets < _MIN_COVERAGE:
        return _MetricResult.UNKNOWN

    # Any positive bucket proves activity; summing could let values cancel out.
    if any(v > 0 for v in bucket_max.values()):
        return _MetricResult.ACTIVE
    return _MetricResult.ZERO
client=None, + monitor_client=None, ) -> List[Finding]: """ - Find Azure ML compute clusters with min_node_count > 0 and no active nodes. - - AML compute clusters with min_node_count > 0 keep instances running continuously - regardless of whether any jobs are submitted — identical billing model to SageMaker - InService endpoints. GPU clusters (NC/ND series) cost $600–$15K/month at minimum - node count. - - Detection logic: - - Compute type is AmlCompute - - min_node_count > 0 (instances always running, always billing) - - Azure Monitor active-node metric maximum is 0 over the effective idle window - - Metric strategy (Azure Monitor metrics are inconsistent across regions/API versions): - - Tries "Active Nodes" first, falls back to "NodeCount" - - For each metric, tries with ComputeName dimension filter first, - then falls back to unfiltered workspace-level query if no timeseries returned - - Confidence: - - HIGH: Zero active nodes over the full idle window (age >= idle_days) - - MEDIUM: Zero active nodes, age >= 75% of idle_days threshold, or age unknown + Find AML compute clusters with min_node_count > 0 and no observed active nodes + for 14 days. 
IAM permissions: - Microsoft.MachineLearningServices/workspaces/read @@ -87,11 +415,7 @@ def find_idle_aml_compute( - Microsoft.Insights/metrics/read """ findings: List[Finding] = [] - now = datetime.now(timezone.utc) - - idle_days = max(idle_days, 3) # effective_window < 3 skips all clusters; clamp to match - # Instantiate Azure SDK clients (top-level imports ensure CI fails fast if SDKs missing) ml_client = client or AzureMachineLearningWorkspaces( credential=credential, subscription_id=subscription_id ) @@ -99,296 +423,178 @@ def find_idle_aml_compute( credential=credential, subscription_id=subscription_id ) - def _norm(s: str) -> str: - return s.lower().replace(" ", "").replace("-", "") + now = datetime.now(timezone.utc) + window_start = now - timedelta(days=_IDLE_WINDOW_DAYS) + + # Subscription-wide workspace inventory (spec 12: propagate if this fails) + for workspace in ml_client.workspaces.list_by_subscription(): + # spec 8.3: workspace name guard + ws_name = getattr(workspace, "name", None) + if not ws_name: + continue + + rg = _extract_resource_group(getattr(workspace, "id", None)) + if not rg: + continue + + try: + for compute in ml_client.machine_learning_compute.list_by_workspace(rg, ws_name): + try: + # spec 8.1: id guard + compute_id = getattr(compute, "id", None) + if not compute_id: + continue - try: - for workspace in ml_client.workspaces.list_by_subscription(): - # Normalise only for filter comparison; preserve original for output - location_raw = workspace.location or "" - if region_filter and _norm(location_raw) != _norm(region_filter): - continue - - rg = _parse_resource_group(workspace.id) - if not rg: - continue - - try: - for compute in ml_client.machine_learning_compute.list_by_workspace( - rg, workspace.name - ): - compute_obj = compute.properties - if ( - not compute_obj - or getattr(compute_obj, "compute_type", None) != "AmlCompute" - ): + # spec 8.2: name guard + compute_name = getattr(compute, "name", None) + if not compute_name: 
continue - # AmlComputeProperties lives under compute_obj.properties - aml_props = getattr(compute_obj, "properties", None) - scale_settings = getattr(aml_props, "scale_settings", None) - min_node_count = getattr(scale_settings, "min_node_count", 0) or 0 - vm_size = getattr(aml_props, "vm_size", None) + # spec 8.4: region filter on compute resource location (not workspace) + compute_location = _norm_location(getattr(compute, "location", "") or "") + if region_filter and compute_location != _norm_location(region_filter): + continue - # Only flag clusters with min_node_count > 0 — those billing continuously - if min_node_count == 0: + # spec 8.5: compute_type must resolve to exactly "AmlCompute" + if _resolve_compute_type(compute) != "AmlCompute": continue - # Age from compute properties — created_on is on AmlCompute (compute.properties), - # not on ComputeResource.system_data as one might expect - age_days: Optional[int] = None - created_at = getattr(compute_obj, "created_on", None) - if created_at is not None: - if created_at.tzinfo is None: - created_at = created_at.replace(tzinfo=timezone.utc) - age_days = (now - created_at).days - # Skip clusters younger than half the idle threshold — - # too new to reliably classify as abandoned - if age_days < max(idle_days // 2, 7): - continue - - # Effective window: cap to age if known; otherwise use full idle_days - effective_window = ( - min(idle_days, age_days) if age_days is not None else idle_days - ) + # spec 8.6: provisioning_state must resolve to exactly "Succeeded" + if _resolve_provisioning_state(compute) != "Succeeded": + continue + + # spec 8.7: allocation_state must resolve to exactly "Steady" + if _resolve_allocation_state(compute) != "Steady": + continue + + # spec 8.8: created_at must be present, valid, and cluster age >= 14 days + created_at = _resolve_created_at(compute) + if created_at is None: + continue + age_days = (now - created_at).days + if age_days < _MIN_AGE_DAYS: + continue + + # spec 8.9: 
min_node_count must be a known positive integer + min_node_count = _resolve_min_node_count(compute) + if min_node_count is None: + continue - if effective_window < 3: + # spec 8.10: current_node_count must be known and >= min_node_count + current_node_count = _resolve_current_node_count(compute) + if current_node_count is None: + continue + if current_node_count < min_node_count: continue - # Check for active nodes over the effective window. - # Returns the metric name that confirmed idle, or None if active/unknown. - idle_metric = _check_active_nodes( + # spec 8.11-8.12: Active Nodes metric must evaluate to ZERO + result = _evaluate_metric( mon_client, - workspace.id, - compute.name, - effective_window, + getattr(workspace, "id", "") or "", + compute_name, + window_start, + now, ) - if idle_metric is None: + if result != _MetricResult.ZERO: continue - # Confidence based on age relative to idle threshold. - # Unknown age -> MEDIUM: we can't rule out a recently-created cluster. - if age_days is not None and age_days >= idle_days: - confidence = ConfidenceLevel.HIGH - elif age_days is not None and age_days >= int(idle_days * 0.75): - confidence = ConfidenceLevel.MEDIUM - elif age_days is None: - confidence = ConfidenceLevel.MEDIUM - else: - continue # too borderline for a confident finding - - is_gpu = bool( - any((vm_size or "").lower().startswith(p.lower()) for p in _GPU_VM_PREFIXES) + # --- Enrichment fields (best-effort; never gate emission) --- + outer = getattr(compute, "properties", None) + inner = getattr(outer, "properties", None) if outer is not None else None + scale = getattr(inner, "scale_settings", None) if inner is not None else None + + vm_size = getattr(inner, "vm_size", None) if inner is not None else None + vm_priority = getattr(inner, "vm_priority", None) if inner is not None else None + max_node_count = ( + getattr(scale, "max_node_count", None) if scale is not None else None ) - if is_gpu and min_node_count >= 2: - risk = RiskLevel.HIGH - elif 
is_gpu or min_node_count >= 2: - risk = RiskLevel.MEDIUM - else: - risk = RiskLevel.LOW - - # Normalize casing for lookup — Azure ML can return "STANDARD_NC6" or "standard_nc6" - vm_size_key = next( - (k for k in _MONTHLY_COST_PER_NODE if k.lower() == (vm_size or "").lower()), - None, + target_node_count = ( + getattr(inner, "target_node_count", None) if inner is not None else None ) - cost_per_node = ( - _MONTHLY_COST_PER_NODE[vm_size_key] - if vm_size_key - else _DEFAULT_MONTHLY_COST_PER_NODE + node_idle_time = ( + getattr(scale, "node_idle_time_before_scale_down", None) + if scale is not None + else None ) - monthly_cost = cost_per_node * min_node_count - - signals = [ - f"Cluster configured with non-zero baseline capacity but no workload observed for {effective_window} days (Azure Monitor: {idle_metric})", - f"Baseline cost driver: min_node_count={min_node_count} (always-on compute — billed continuously)", - "Compute type: AmlCompute", + tags = getattr(compute, "tags", None) or {} # spec 7: never None in output + + signals_used = [ + "Resource is exact compute type 'AmlCompute'", + "Provisioning state is 'Succeeded'", + "Allocation state is 'Steady'", + f"Cluster age is {age_days} days (>= {_MIN_AGE_DAYS} days)", + ( + f"min_node_count={min_node_count} (positive baseline confirmed), " + f"current_node_count={current_node_count} (>= min_node_count)" + ), + ( + f"Active Nodes metric for cluster '{compute_name}' resolved to " + f"no observed active nodes over {_IDLE_WINDOW_DAYS} days with " + f">= {int(_MIN_COVERAGE * 100)}% daily-bucket coverage " + f"(ClusterName dimension, Maximum aggregation)" + ), ] - if age_days is not None: - signals.append(f"Cluster age: {age_days} days") - if vm_size: - signals.append(f"VM size: {vm_size}") - if is_gpu: - signals.append("GPU cluster with no workload — high-cost idle state") - if min_node_count == 1: - signals.append("Single-node baseline — may be intentional for dev/test") - - evidence = Evidence( - signals_used=signals, 
- signals_not_checked=[ - "Scheduled or periodic training jobs", - "Jobs submitted outside the observation window", - "Planned future usage", - "Cluster configured with min_node_count for warm-start latency", - "Cluster reserved for interactive development", - ], - time_window=f"{effective_window} days", - ) - - age_for_details = age_days if age_days is not None else "unknown" findings.append( Finding( provider="azure", - rule_id="azure.aml.compute.idle", - resource_type="azure.aml.compute", - resource_id=compute.id, - region=location_raw, - estimated_monthly_cost_usd=monthly_cost, - title=f"Idle Azure ML Compute Cluster (Baseline Capacity Waste for {effective_window} Days)", + rule_id=_RULE_ID, + resource_type=_RESOURCE_TYPE, + resource_id=compute_id, + region=compute_location, + estimated_monthly_cost_usd=None, # spec 10: always None + title=f"Idle AML Compute Cluster with Retained Baseline Capacity: {compute_name}", summary=( - f"AML compute cluster '{compute.name}' in workspace '{workspace.name}' " - f"is configured to keep {min_node_count} node(s) always running " - f"(min_node_count={min_node_count}) but no workload activity was " - f"observed for {effective_window} days — baseline capacity waste." 
+ f"AML compute cluster '{compute_name}' in workspace '{ws_name}' " + f"is configured to keep {min_node_count} node(s) running " + f"(min_node_count={min_node_count}) with no observed active nodes " + f"over {_IDLE_WINDOW_DAYS} days" ), reason=( - f"AML compute cluster has min_node_count={min_node_count} " - f"with no workload activity for {effective_window} days" + f"Cluster retains {min_node_count} baseline node(s) " + f"(min_node_count={min_node_count}) with no documented job " + f"activity for {_IDLE_WINDOW_DAYS} days; baseline nodes incur " + f"ongoing cost regardless of job activity" ), - risk=risk, - confidence=confidence, + risk=RiskLevel.MEDIUM, # spec 11.1: always MEDIUM + confidence=ConfidenceLevel.HIGH, # spec 11.1: always HIGH detected_at=now, - evidence=evidence, + evidence=Evidence( + signals_used=signals_used, + signals_not_checked=[ + "Future or scheduled training intent", + "Business-owner intent not visible in Azure control plane", + "Warm baseline retained intentionally for startup latency, quota reservation, or sporadic experimentation", + "Exact VM and infrastructure pricing after discounts, reservations, or special commercial terms", + ], + time_window=f"{_IDLE_WINDOW_DAYS} days", + ), details={ - "cluster_name": compute.name, - "workspace_name": workspace.name, + "cluster_name": compute_name, + "workspace_name": ws_name, "resource_group": rg, + "subscription_id": subscription_id, "vm_size": vm_size, + "vm_priority": _to_detail_str(vm_priority), "min_node_count": min_node_count, - "is_gpu": is_gpu, - "age_days": age_for_details, - "idle_window_days": effective_window, - "idle_days_threshold": idle_days, - "estimated_monthly_cost": f"~${monthly_cost:,.0f}/month", - "cost_estimate_type": ("mapped" if vm_size_key else "approximate"), + "max_node_count": max_node_count, + "current_node_count": current_node_count, + "target_node_count": target_node_count, + "allocation_state": "Steady", + "provisioning_state": "Succeeded", + "created_at": 
created_at.isoformat(), + "node_idle_time_before_scale_down": _to_detail_str(node_idle_time), + "idle_window_days": _IDLE_WINDOW_DAYS, + "metrics_used": ["Active Nodes"], + "tags": tags, }, ) ) - except Exception as ws_err: - ws_msg = str(ws_err) - if "AuthorizationFailed" in ws_msg or "Forbidden" in ws_msg or "403" in ws_msg: - raise PermissionError( - "Missing required permissions: " - "Microsoft.MachineLearningServices/workspaces/read, " - "Microsoft.MachineLearningServices/workspaces/computes/read, " - "Microsoft.Insights/metrics/read" - ) from ws_err - continue # skip this workspace on transient error; preserve findings so far - - except Exception as e: - msg = str(e) - if "AuthorizationFailed" in msg or "Forbidden" in msg or "403" in msg: - raise PermissionError( - "Missing required permissions: " - "Microsoft.MachineLearningServices/workspaces/read, " - "Microsoft.MachineLearningServices/workspaces/computes/read, " - "Microsoft.Insights/metrics/read" - ) from e - raise - - return findings - - -def _check_active_nodes( - monitor_client: Any, - workspace_id: str, - compute_name: str, - days: int, -) -> Optional[str]: - """Check whether the cluster had any active nodes in the past `days` days. - - Returns the metric name that confirmed idle (e.g. "Active Nodes"), or None - if the cluster appears active or no reliable per-cluster signal was found. - - Azure Monitor metrics for ML workspaces are inconsistent: the metric name - ("Active Nodes" vs "NodeCount" vs "CurrentNodeCount") and available dimensions - (ComputeName vs ClusterName vs none) vary by API version, region, and workspace type. - - Strategy — for each candidate metric name: - 1. Query with ComputeName dimension filter (only reliable signal) - - active -> return None (skip this cluster) - - idle -> return metric name (confirmed idle — dimension-filtered zero is trustworthy) - - no data -> filter unsupported; fall back to workspace-level query - 2. 
Workspace-level fallback (unfiltered): - - active -> return None (something active somewhere; skip conservatively) - - idle/no data -> UNKNOWN, not idle (one active cluster can hide many idle ones) - 3. If no metric yields a reliable per-cluster signal -> return None (assume active) - - Returns None (assume active) on any API exception to avoid false positives. - """ - now = datetime.now(timezone.utc) - start_time = now - timedelta(days=max(days, 1)) - fmt = "%Y-%m-%dT%H:%M:%SZ" - timespan = f"{start_time.strftime(fmt)}/{now.strftime(fmt)}" - - def _query(metric_name: str, dimension_filter: Optional[str]) -> Optional[bool]: - """Query one metric. - - Returns True -> activity found (cluster was active) - Returns False -> timeseries returned, all values zero (cluster confirmed idle) - Returns None -> no timeseries returned (metric/dimension unavailable) - Raises -> API error (caller handles) - """ - kwargs = dict( - metricnames=metric_name, - timespan=timespan, - interval="P1D", - aggregation="Maximum", - ) - if dimension_filter: - kwargs["filter"] = dimension_filter - response = monitor_client.metrics.list(workspace_id, **kwargs) + except (HttpResponseError, ServiceRequestError, ServiceResponseError): + continue # per-compute retrieval failure -> skip (spec 12) - has_real_data = False - for metric in response.value: - for ts in metric.timeseries: - for data in ts.data: - if data.maximum is not None: - has_real_data = True - if data.maximum > 0: - return True # confirmed active - - # Only treat as confirmed idle when at least one non-None datapoint was seen. - # All-None maximums (metric publishing gap / throttled ingestion) are treated - # as unknown — same as no timeseries — to avoid false positives. 
- return False if has_real_data else None + except (HttpResponseError, ServiceRequestError, ServiceResponseError): + continue # per-workspace compute list failure -> skip workspace (spec 12) - try: - for metric_name in _ACTIVE_NODE_METRICS: - # Step 1: try with ComputeName dimension filter - result = _query(metric_name, f"ComputeName eq '{compute_name}'") - if result is True: - return None # confirmed active - if result is False: - return metric_name # confirmed idle — dimension-filtered zero is trustworthy - - # Step 2: filter returned no timeseries — dimension may not be supported. - # Fall back to unfiltered workspace-level query. - result = _query(metric_name, None) - if result is True: - return None # something active in the workspace — skip conservatively - - # False or None at workspace level is UNKNOWN, not idle: - # one active cluster can mask multiple idle ones — never confirm idle here. - # Continue to the next metric name. - - # No metric returned a reliable per-cluster signal — assume active. - # This avoids flagging clusters whose metrics are simply not published yet. 
- return None - - except Exception: - return None # conservative: assume active if metrics unavailable - - -def _parse_resource_group(resource_id: Optional[str]) -> Optional[str]: - """Extract the resource group name from an Azure resource ID.""" - if not resource_id: - return None - parts = resource_id.split("/") - try: - idx = [p.lower() for p in parts].index("resourcegroups") - return parts[idx + 1] - except (ValueError, IndexError): - return None + return findings diff --git a/cleancloud/providers/azure/rules/ai/aml_compute_instance_idle.py b/cleancloud/providers/azure/rules/ai/aml_compute_instance_idle.py index 70b6487..5ef0b9d 100644 --- a/cleancloud/providers/azure/rules/ai/aml_compute_instance_idle.py +++ b/cleancloud/providers/azure/rules/ai/aml_compute_instance_idle.py @@ -1,7 +1,49 @@ +""" +Rule: azure.ml.compute_instance.idle + +Intent: + Detect Azure Machine Learning compute instances that remain billable in Running + state while showing no recent documented control-plane lifecycle activity over + a conservative review window. + + This rule is deliberately precision-first. It is not a generic "inactive notebook" + rule, not proof that a compute instance is safe to stop or delete, and not proof + that no user is actively connected. It is a conservative review-candidate rule for + compute instances that appear to have been left running without recent documented + lifecycle actions. 
+ +Exclusions: + - id absent or empty + - name absent or empty + - workspace.name absent or empty + - outside optional region filter (compute resource location, exact lowercase match; + spaces and hyphens preserved) + - compute_type does not resolve to exactly "ComputeInstance" (SDK+nested, + conflict -> skip) + - provisioning_state does not resolve to exactly "Succeeded" (SDK+nested, + conflict -> skip) + - state does not resolve to exactly "Running" (SDK+nested, conflict -> skip) + - location unresolvable or conflicting + - created_at absent, invalid, in the future, or instance age < idle_days + - lastOperation.operationTime present but unparsable -> skip + - lastOperation.operationTime == created_at -> no proven post-create signal -> skip + - modifiedOn fallback: only when lastOperation absent or has no operationTime; + skip when modifiedOn absent, unparsable, <= created_at, or in the future + - no lifecycle signal at all -> skip (no age-only fallback; no systemData fallback) + - resolved lifecycle timestamp in the future -> skip + - floored idle_since_days < effective idle_days -> skip + +Cost model (spec 10): + estimated_monthly_cost_usd = None (always) + +APIs: + - Microsoft.MachineLearningServices/workspaces/read + - Microsoft.MachineLearningServices/workspaces/computes/read +""" + from datetime import datetime, timezone from typing import Any, List, Optional -# Azure SDK (top-level imports for CI fail-fast) from azure.mgmt.machinelearningservices import AzureMachineLearningWorkspaces from cleancloud.core.confidence import ConfidenceLevel @@ -9,49 +51,209 @@ from cleancloud.core.finding import Finding from cleancloud.core.risk import RiskLevel +_RULE_ID = "azure.ml.compute_instance.idle" +_RESOURCE_TYPE = "azure.ml.compute_instance" +_DEFAULT_IDLE_DAYS = 14 + RULE_METADATA = { - "id": "azure.ml.compute_instance.idle", + "id": _RULE_ID, "category": "ai", "service": "machinelearning", "cost_impact": "high", } -# GPU VM size prefixes — significantly more 
expensive than CPU +# GPU VM size prefixes — exact case-sensitive prefix matching (spec 7, 9.5) _GPU_VM_PREFIXES = ("Standard_NC", "Standard_ND", "Standard_NV") -# Approximate monthly cost per instance (on-demand, East US, 730 h/month) -# Compute instances are single-VM dev environments — billed per hour while Running -_MONTHLY_COST_BY_SIZE = { - # CPU — general purpose - "Standard_DS2_v2": 130.0, - "Standard_DS3_v2": 260.0, - "Standard_DS4_v2": 519.0, - "Standard_DS11_v2": 174.0, - "Standard_DS12_v2": 349.0, - "Standard_DS13_v2": 699.0, - "Standard_D2s_v3": 96.0, - "Standard_D4s_v3": 192.0, - "Standard_D8s_v3": 384.0, - "Standard_D16s_v3": 768.0, - # GPU — NVIDIA V100 (NC v3 series) - "Standard_NC6s_v3": 2_203.0, - "Standard_NC12s_v3": 4_406.0, - "Standard_NC24s_v3": 8_812.0, - # GPU — NVIDIA K80 (NC series) - "Standard_NC6": 648.0, - "Standard_NC12": 1_296.0, - "Standard_NC24": 2_592.0, - # GPU — NVIDIA P40 (ND series) - "Standard_ND6s": 2_203.0, - "Standard_ND12s": 4_406.0, - "Standard_ND24s": 8_812.0, - "Standard_ND40rs_v2": 15_862.0, - # GPU — NVIDIA M60 (NV series) - "Standard_NV6": 1_094.0, - "Standard_NV12": 2_189.0, - "Standard_NV24": 4_378.0, -} -_DEFAULT_MONTHLY_COST = 200.0 + +# --------------------------------------------------------------------------- +# Normalization +# --------------------------------------------------------------------------- + + +def _norm_location(s: str) -> str: + """Lowercase only — exact lowercase match per spec 7 (spaces and hyphens preserved).""" + return s.lower() if s else "" + + +def _extract_resource_group(resource_id: Optional[str]) -> Optional[str]: + """Extract resource group name from Azure ARM resource ID.""" + if not resource_id: + return None + parts = resource_id.split("/") + try: + idx = next(i for i, p in enumerate(parts) if p.lower() == "resourcegroups") + return parts[idx + 1] + except (StopIteration, IndexError): + return None + + +def _extract_subscription_id(resource_id: Optional[str]) -> Optional[str]: + 
"""Extract subscription ID from Azure ARM resource ID.""" + if not resource_id: + return None + parts = resource_id.split("/") + try: + idx = next(i for i, p in enumerate(parts) if p.lower() == "subscriptions") + return parts[idx + 1] + except (StopIteration, IndexError): + return None + + +# --------------------------------------------------------------------------- +# State resolvers (spec 9.1) +# --------------------------------------------------------------------------- + + +def _resolve_str_field(obj, snake: str, camel: str) -> Optional[str]: + """ + Resolve a string field from SDK snake_case then raw camelCase. + Returns None on conflict or absent. + """ + if obj is None: + return None + sdk_val = getattr(obj, snake, None) + raw_val = getattr(obj, camel, None) + if sdk_val is not None and raw_val is not None and sdk_val != raw_val: + return None # conflict -> skip + val = sdk_val if sdk_val is not None else raw_val + return val if isinstance(val, str) else None + + +def _resolve_compute_type(compute) -> Optional[str]: + """ + Resolve compute_type from compute.properties (SDK+nested, spec 9.1). + Only "ComputeInstance" is eligible; conflict or absent -> None. + """ + outer = getattr(compute, "properties", None) + return _resolve_str_field(outer, "compute_type", "computeType") + + +def _resolve_provisioning_state(compute) -> Optional[str]: + """ + Resolve provisioning_state from compute.properties (SDK+nested, spec 9.1). + Only "Succeeded" is eligible; conflict or absent -> None. + """ + outer = getattr(compute, "properties", None) + return _resolve_str_field(outer, "provisioning_state", "provisioningState") + + +def _resolve_state(compute) -> Optional[str]: + """ + Resolve state from compute.properties.properties (SDK+nested, spec 9.1). + Normalized by surrounding-whitespace trimming per spec 7 before comparison. + Only "Running" is eligible; conflict or absent -> None. 
+ """ + outer = getattr(compute, "properties", None) + inner = getattr(outer, "properties", None) if outer is not None else None + raw = _resolve_str_field(inner, "state", "state") + return raw.strip() if raw is not None else None + + +# --------------------------------------------------------------------------- +# Location contract (spec 9.2) +# --------------------------------------------------------------------------- + + +def _resolve_location(compute) -> Optional[str]: + """ + Resolve compute resource location (spec 9.2) in priority order: + 1. top-level compute.location + 2. compute.properties.compute_location + 3. compute.properties.computeLocation + + Returns normalized (lowercase) or None if unresolvable or materially conflicting. + """ + loc1_raw = getattr(compute, "location", None) + outer = getattr(compute, "properties", None) + loc2_raw = getattr(outer, "compute_location", None) if outer is not None else None + if loc2_raw is None and outer is not None: + loc2_raw = getattr(outer, "computeLocation", None) + + candidates = [loc for loc in (loc1_raw, loc2_raw) if isinstance(loc, str) and loc.strip()] + if not candidates: + return None + normalized = [_norm_location(loc) for loc in candidates] + if len(set(normalized)) > 1: + return None # material conflict -> skip + return normalized[0] + + +# --------------------------------------------------------------------------- +# Timestamp parsing (spec 7, 9.3, 9.4) +# --------------------------------------------------------------------------- + + +def _parse_utc_timestamp(raw) -> Optional[datetime]: + """ + Parse a raw timestamp to a UTC-normalized datetime (spec 9.4 contract). + Naive datetimes are treated as UTC; aware non-UTC datetimes are converted to UTC. + Returns None if absent, invalid type, or unparsable string. 
+ """ + if raw is None: + return None + if isinstance(raw, datetime): + if raw.tzinfo is None: + return raw.replace(tzinfo=timezone.utc) + return raw.astimezone(timezone.utc) + if isinstance(raw, str): + try: + ts = datetime.fromisoformat(raw.rstrip("Z")) + if ts.tzinfo is None: + return ts.replace(tzinfo=timezone.utc) + return ts.astimezone(timezone.utc) + except ValueError: + return None + return None + + +def _resolve_created_at(compute) -> Optional[datetime]: + """ + Resolve creation timestamp from compute.properties.created_on / createdOn (spec 7, 9.3). + Returns UTC-aware datetime or None if absent or unparsable. + """ + outer = getattr(compute, "properties", None) + if outer is None: + return None + raw = getattr(outer, "created_on", None) + if raw is None: + raw = getattr(outer, "createdOn", None) + return _parse_utc_timestamp(raw) + + +def _resolve_modified_at(compute) -> Optional[datetime]: + """ + Resolve modifiedOn from compute.properties (spec 7, 9.4 fallback). + Documented SDK surface: modified_on / modifiedOn. + Returns UTC-aware datetime or None if absent or unparsable. + """ + outer = getattr(compute, "properties", None) + if outer is None: + return None + raw = getattr(outer, "modified_on", None) + if raw is None: + raw = getattr(outer, "modifiedOn", None) + return _parse_utc_timestamp(raw) + + +# --------------------------------------------------------------------------- +# Risk and GPU classification (spec 7, 9.5) +# --------------------------------------------------------------------------- + + +def _is_gpu(vm_size: Optional[str]) -> bool: + """ + GPU classification via exact case-sensitive prefix matching (spec 7, 9.5). + null / absent vm_size is non-GPU. 
+ """ + if not vm_size: + return False + return any(vm_size.startswith(prefix) for prefix in _GPU_VM_PREFIXES) + + +# --------------------------------------------------------------------------- +# Main rule function +# --------------------------------------------------------------------------- def find_idle_aml_compute_instances( @@ -60,41 +262,22 @@ def find_idle_aml_compute_instances( credential, region_filter: str = None, client: Optional[Any] = None, - idle_days: int = 14, + idle_days: int = _DEFAULT_IDLE_DAYS, ) -> List[Finding]: """ - Find Azure ML Compute Instances in Running state with no recent control-plane activity. - - Azure ML Compute Instances are single-VM interactive development environments - (Jupyter, VS Code, RStudio) that bill continuously while Running — regardless of - whether any notebooks or kernels are active. GPU instances (NC/ND/NV series) cost - $600–$15K+/month. Data scientists frequently leave instances Running after a sprint - ends, a project is deprioritised, or when a new instance is provisioned and the old - one is forgotten. - - Detection logic: - - Compute type is ComputeInstance - - Instance state is Running (the only state that incurs compute charges) - - No control-plane activity within the idle threshold: - last_operation.operation_time older than idle_days (primary signal) - system_data.last_modified_at used as fallback if last_operation unavailable - - Why last_operation / last_modified_at: - Azure ML Compute Instances do not publish per-instance utilisation metrics to Azure - Monitor by default. last_operation.operation_time is updated by the Azure ML control - plane on Start, Stop, Restart, and Create operations. An instance with no recent - last_operation has had no control-plane activity — the same approach AWS Cost - Optimisation Hub uses for SageMaker Notebook LastModifiedTime. 
- - Confidence: - - HIGH: idle_signal_source != age_fallback AND idle_since_days >= idle_days AND age >= idle_days - - MEDIUM: idle_since_days >= 75% of idle_days AND age >= 75% of idle_days - (age_fallback findings are capped at MEDIUM — age alone is not evidence of idleness) - - Risk: - - CRITICAL: GPU instance AND idle_ratio >= 2.0 (e.g. 28+ days at default 14-day window) - - HIGH: GPU instance (NC*, ND*, NV*) - - MEDIUM: CPU instance + Find Azure ML Compute Instances in Running state with no recent documented + control-plane lifecycle activity. + + Detection logic (spec 4, 8, 9): + - compute_type resolves exactly to "ComputeInstance" + - provisioning_state resolves exactly to "Succeeded" + - state resolves exactly to "Running" + - instance age >= effective idle_days + - last documented lifecycle activity >= effective idle_days ago: + Primary: lastOperation.operationTime + Fallback: modifiedOn (only when lastOperation absent or has no operationTime, + and modifiedOn > created_at) + No age fallback; no systemData.lastModifiedAt (spec 9.4.12-13) IAM permissions: - Microsoft.MachineLearningServices/workspaces/read @@ -102,228 +285,247 @@ def find_idle_aml_compute_instances( """ findings: List[Finding] = [] now = datetime.now(timezone.utc) + effective_idle_days = max(idle_days, 1) # spec 6.3 - idle_days = max(idle_days, 1) - - # Instantiate Azure SDK client (top-level imports ensure CI fails fast if SDKs missing) ml_client = client or AzureMachineLearningWorkspaces( credential=credential, subscription_id=subscription_id ) - def _norm(s: str) -> str: - return s.lower().replace(" ", "").replace("-", "") + # Subscription-wide workspace inventory (spec 12: propagate if this fails) + for workspace in ml_client.workspaces.list_by_subscription(): + # spec 8.3: workspace name guard + ws_name = getattr(workspace, "name", None) + if not ws_name: + continue + + rg = _extract_resource_group(getattr(workspace, "id", None)) + if not rg: + continue + + try: + for compute in 
ml_client.machine_learning_compute.list_by_workspace(rg, ws_name): + try: + # spec 8.1: id guard + compute_id = getattr(compute, "id", None) + if not compute_id: + continue + + # spec 8.2: name guard + compute_name = getattr(compute, "name", None) + if not compute_name: + continue - try: - for workspace in ml_client.workspaces.list_by_subscription(): - location_raw = workspace.location or "" - if region_filter and _norm(location_raw) != _norm(region_filter): - continue - - rg = _parse_resource_group(workspace.id) - if not rg: - continue - - try: - for compute in ml_client.machine_learning_compute.list_by_workspace( - rg, workspace.name - ): - compute_obj = compute.properties - if ( - not compute_obj - or getattr(compute_obj, "compute_type", None) != "ComputeInstance" - ): + # spec 8.5: compute_type must resolve to exactly "ComputeInstance" + if _resolve_compute_type(compute) != "ComputeInstance": continue - # ComputeInstanceProperties lives under compute_obj.properties - ci_props = getattr(compute_obj, "properties", None) + # spec 8.6: provisioning_state must resolve to exactly "Succeeded" + if _resolve_provisioning_state(compute) != "Succeeded": + continue - # Only flag Running instances — Stopped instances do not incur charges - state = getattr(ci_props, "state", None) - if state != "Running": + # spec 8.7: state must resolve to exactly "Running" + if _resolve_state(compute) != "Running": continue - vm_size = getattr(ci_props, "vm_size", None) + # spec 8.8: location must resolve from compute resource (not workspace) + location = _resolve_location(compute) + if location is None: + continue - # --- Age --- - age_days: Optional[int] = None - created_at = getattr(compute_obj, "created_on", None) - if created_at is not None: - if created_at.tzinfo is None: - created_at = created_at.replace(tzinfo=timezone.utc) - age_days = (now - created_at).days - if age_days < max(idle_days // 2, 7): - continue + # spec 8.4: region filter — exact lowercase equality + if 
region_filter and location != _norm_location(region_filter): + continue - # --- Idle signal: last_operation.operation_time (primary) --- - idle_since_days: Optional[int] = None - - last_op = getattr(ci_props, "last_operation", None) - op_time = getattr(last_op, "operation_time", None) - if op_time is not None: - if op_time.tzinfo is None: - op_time = op_time.replace(tzinfo=timezone.utc) - idle_since_days = (now - op_time).days - - # Fallback: system_data.last_modified_at - if idle_since_days is None: - system_data = getattr(compute, "system_data", None) - last_modified = getattr(system_data, "last_modified_at", None) - if last_modified is not None: - if last_modified.tzinfo is None: - last_modified = last_modified.replace(tzinfo=timezone.utc) - idle_since_days = (now - last_modified).days - - # Second fallback: treat age as idle proxy — caps confidence at MEDIUM - # (age alone is not evidence of idleness; the instance could be in active use) - using_age_fallback = idle_since_days is None - if idle_since_days is None: - idle_since_days = age_days if age_days is not None else idle_days - - # Use effective age for confidence — fall back to idle_days (neutral) if unknown - effective_age = age_days if age_days is not None else idle_days - - threshold_high = idle_days - threshold_medium = int(idle_days * 0.75) - - if ( - not using_age_fallback - and idle_since_days >= threshold_high - and effective_age >= threshold_high - ): - confidence = ConfidenceLevel.HIGH - elif idle_since_days >= threshold_medium and effective_age >= threshold_medium: - confidence = ConfidenceLevel.MEDIUM + # spec 8.9 / 9.3: created_at required; future timestamps invalid + created_at = _resolve_created_at(compute) + if created_at is None: + continue + if created_at > now: + continue # spec 9.3.2: future created_at -> skip + age_days_actual = (now - created_at).days + if age_days_actual < effective_idle_days: + continue # spec 9.3.3: age gate + + # spec 9.4: lifecycle-activity contract + outer = 
getattr(compute, "properties", None) + inner = getattr(outer, "properties", None) if outer is not None else None + + last_op = getattr(inner, "last_operation", None) + if last_op is None: + last_op = getattr(inner, "lastOperation", None) + + if last_op is not None: + # Each field: SDK snake_case first, then raw camelCase fallback + last_op_time_raw = getattr(last_op, "operation_time", None) + if last_op_time_raw is None: + last_op_time_raw = getattr(last_op, "operationTime", None) + + last_op_name = getattr(last_op, "operation_name", None) + if last_op_name is None: + last_op_name = getattr(last_op, "operationName", None) + + last_op_status = getattr(last_op, "operation_status", None) + if last_op_status is None: + last_op_status = getattr(last_op, "operationStatus", None) else: - continue # too borderline for a confident finding + last_op_time_raw = None + last_op_name = None + last_op_status = None + + # Resolve modified_at for fallback and detail reporting + modified_at = _resolve_modified_at(compute) + + lifecycle_activity_at: Optional[datetime] = None + idle_signal_source: Optional[str] = None + + if last_op_time_raw is not None: + # spec 9.4.3: lastOperation.operationTime present — must parse or skip + parsed_op_time = _parse_utc_timestamp(last_op_time_raw) + if parsed_op_time is None: + continue # spec 9.4.4: present but unparsable -> skip + # spec 9.4.7: operationTime == created_at -> no proven post-create signal + if parsed_op_time == created_at: + continue + lifecycle_activity_at = parsed_op_time + idle_signal_source = "last_operation" + else: + # spec 9.4.8: lastOperation absent or has no operationTime -> try modifiedOn + if modified_at is None: + continue # no lifecycle signal -> fail closed (spec 9.4.13) + # spec 9.4.8: only when modified_at > created_at (strict greater than) + if modified_at <= created_at: + continue # spec 9.4.8/9: no proven post-create signal -> skip + # spec 9.4.10: modified_at selected but fails parsing already handled above + 
lifecycle_activity_at = modified_at + idle_signal_source = "modified_on" + + # spec 8.11 / 9.4.11: future lifecycle timestamp -> skip (no clock-skew) + if lifecycle_activity_at > now: + continue + + # spec 8.12 / 9.4.16-17: idle_since_days must be >= effective idle window + idle_since_days = int((now - lifecycle_activity_at).total_seconds() // 86400) + if idle_since_days < effective_idle_days: + continue - vm_size_norm = (vm_size or "").lower() - is_gpu = any(vm_size_norm.startswith(p.lower()) for p in _GPU_VM_PREFIXES) + # --- Enrichment --- + vm_size = getattr(inner, "vm_size", None) if inner is not None else None + is_gpu_instance = _is_gpu(vm_size) + tags = getattr(compute, "tags", None) or {} # spec 7: never None in output - idle_ratio = round(idle_since_days / idle_days, 2) if idle_days > 0 else 0.0 - if is_gpu and idle_ratio >= 2.0: - risk = RiskLevel.CRITICAL - elif is_gpu: - risk = RiskLevel.HIGH - else: - risk = RiskLevel.MEDIUM + # spec 9.5 / 11.2: Risk — HIGH for GPU, MEDIUM otherwise + risk = RiskLevel.HIGH if is_gpu_instance else RiskLevel.MEDIUM - vm_size_key = next( - (k for k in _MONTHLY_COST_BY_SIZE if k.lower() == vm_size_norm), - None, - ) - monthly_cost = ( - _MONTHLY_COST_BY_SIZE[vm_size_key] if vm_size_key else _DEFAULT_MONTHLY_COST + # spec 9.5 / 11.2: Confidence — MEDIUM for last_operation, LOW for modified_on + confidence = ( + ConfidenceLevel.MEDIUM + if idle_signal_source == "last_operation" + else ConfidenceLevel.LOW ) - # Determine idle signal source for evidence + details - if last_op is not None and op_time is not None: - idle_signal_source = "last_operation" - idle_signal_desc = f"Last control-plane operation: {idle_since_days} days ago (last_operation.operation_time)" - op_name = getattr(last_op, "operation_name", None) - if op_name: - idle_signal_desc += f" — last op: {op_name}" - elif ( - system_data is not None - and getattr(system_data, "last_modified_at", None) is not None - ): - idle_signal_source = "last_modified_at" - 
idle_signal_desc = f"Last control-plane activity: {idle_since_days} days ago (last_modified_at fallback)" - else: - idle_signal_source = "age_fallback" - idle_signal_desc = f"Last control-plane activity: {idle_since_days} days ago (age used as proxy — no operation or modified timestamp available)" + # Resolve the lifecycle field label for evidence and detail reporting + lifecycle_field = ( + "lastOperation.operationTime" + if idle_signal_source == "last_operation" + else "modifiedOn" + ) - signals = [ - "Instance state: Running", - f"Age: {effective_age} days", - idle_signal_desc, + # spec 11.3: signals_used + signals_used = [ + "Resource is exact compute type 'ComputeInstance'", + "Provisioning state is 'Succeeded'", + "Runtime state is 'Running'", + ( + f"Instance age is {age_days_actual} days " + f"(>= configured idle window of {effective_idle_days} days)" + ), + ( + f"Last documented control-plane lifecycle activity is " + f"{idle_since_days} days ago " + f"(field: {lifecycle_field}), " + f"older than the configured idle window" + ), ] - if vm_size: - signals.append(f"VM size: {vm_size}") - if is_gpu: - signals.append("GPU instance — high hourly cost") - - evidence = Evidence( - signals_used=signals, - signals_not_checked=[ - "Active Jupyter kernel or notebook sessions", - "VS Code / RStudio remote connections", - "Scheduled jobs running on this instance", - "Assigned user's planned future use", - "Resource tags (e.g. 
keep_alive=true) — use --ignore-tag or cleancloud.yaml exceptions to suppress intentional instances", - ], - time_window=f"{idle_since_days} days", + if idle_signal_source == "last_operation" and last_op_name: + signals_used.append(f"Last operation name: {last_op_name}") + + # last_operation_time detail: always the parsed operationTime when present; + # None when lastOperation was absent or had no operationTime (modified_on path) + last_operation_time_iso = ( + lifecycle_activity_at.isoformat() + if idle_signal_source == "last_operation" + else None ) findings.append( Finding( provider="azure", - rule_id="azure.ml.compute_instance.idle", - resource_type="azure.ml.compute_instance", - resource_id=compute.id, - region=location_raw, - estimated_monthly_cost_usd=monthly_cost, - title=f"Idle Azure ML Compute Instance (No Activity for {idle_since_days} Days)", + rule_id=_RULE_ID, + resource_type=_RESOURCE_TYPE, + resource_id=compute_id, + region=location, + estimated_monthly_cost_usd=None, # spec 10: always None + title=( + f"Idle Azure ML Compute Instance: {compute_name} " + f"({idle_since_days} days without control-plane activity)" + ), summary=( - f"Azure ML Compute Instance '{compute.name}' in workspace " - f"'{workspace.name}' has had no control-plane activity for " - f"{idle_since_days} days but remains Running, incurring continuous " - f"charges (~${monthly_cost:,.0f}/month)." + f"Azure ML Compute Instance '{compute_name}' in workspace " + f"'{ws_name}' has had no documented control-plane lifecycle " + f"activity for {idle_since_days} days but remains in Running " + f"state, continuing to incur compute-hour charges until stopped." 
), reason=( - f"Azure ML Compute Instance has had no control-plane activity " - f"for {idle_since_days} days" + f"Compute instance remains Running with no documented " + f"lifecycle activity for {idle_since_days} days " + f"(signal: {idle_signal_source})" ), risk=risk, confidence=confidence, detected_at=now, - evidence=evidence, + evidence=Evidence( + signals_used=signals_used, + signals_not_checked=[ + "Active Jupyter kernel sessions", + "Active Jupyter terminal sessions", + "Active AML runs or experiments", + "Active VS Code connections", + "Custom applications currently running on the compute", + "Creator or business-owner intent", + "Automatic schedules or shutdown behavior not visible from the rule's read path", + "Exact pricing after discounts, reservations, or special commercial terms", + ], + time_window=f"{idle_since_days} days", + ), details={ - "instance_name": compute.name, - "workspace_name": workspace.name, + "instance_name": compute_name, + "workspace_name": ws_name, "resource_group": rg, + "subscription_id": subscription_id, + "location": location, "vm_size": vm_size, - "state": state, - "is_gpu": is_gpu, - "age_days": effective_age, + "compute_type": "ComputeInstance", + "provisioning_state": "Succeeded", + "state": "Running", + "created_at": created_at.isoformat(), + "modified_at": ( + modified_at.isoformat() if modified_at is not None else None + ), + "last_operation_name": last_op_name, + "last_operation_time": last_operation_time_iso, + "last_operation_status": last_op_status, "idle_since_days": idle_since_days, - "idle_days_threshold": idle_days, - "idle_ratio": idle_ratio, + "idle_days_threshold": effective_idle_days, "idle_signal_source": idle_signal_source, - "estimated_monthly_cost": f"~${monthly_cost:,.0f}/month", - "cost_source": f"approximate_{location_raw}", + "tags": tags, }, ) ) - except Exception as ws_err: - ws_msg = str(ws_err) - if "AuthorizationFailed" in ws_msg or "Forbidden" in ws_msg or "403" in ws_msg: - raise 
PermissionError( - "Missing required permissions: " - "Microsoft.MachineLearningServices/workspaces/read, " - "Microsoft.MachineLearningServices/workspaces/computes/read" - ) from ws_err - continue # skip this workspace on transient error; preserve findings so far - - except Exception as e: - msg = str(e) - if "AuthorizationFailed" in msg or "Forbidden" in msg or "403" in msg: - raise PermissionError( - "Missing required permissions: " - "Microsoft.MachineLearningServices/workspaces/read, " - "Microsoft.MachineLearningServices/workspaces/computes/read" - ) from e - raise - return findings + except Exception: + continue # malformed per-compute record -> skip (spec 12) + except Exception: + continue # skip workspace on any error (spec 12); preserve findings so far -def _parse_resource_group(resource_id: Optional[str]) -> Optional[str]: - """Extract the resource group name from an Azure resource ID.""" - if not resource_id: - return None - parts = resource_id.split("/") - try: - idx = [p.lower() for p in parts].index("resourcegroups") - return parts[idx + 1] - except (ValueError, IndexError): - return None + return findings diff --git a/cleancloud/providers/azure/rules/ai/ml_online_endpoint_idle.py b/cleancloud/providers/azure/rules/ai/ml_online_endpoint_idle.py index cd90b38..690c7fe 100644 --- a/cleancloud/providers/azure/rules/ai/ml_online_endpoint_idle.py +++ b/cleancloud/providers/azure/rules/ai/ml_online_endpoint_idle.py @@ -1,9 +1,41 @@ -import math +""" +Rule: azure.ml.online_endpoint.idle + +Intent: + Detect Azure Machine Learning managed online endpoints that retain billable + deployment baseline instances while RequestsPerMinute stays at zero over a + documented observation window. + + This rule is deliberately precision-first. It is not a generic "quiet workspace" + rule, not proof that deleting an endpoint is safe, and not proof of a specific + monthly saving. 
It is a conservative review-candidate rule for managed online + endpoints that appear to be continuously provisioned but unused. + +Exclusions (spec 8): + - endpoint.id absent or empty + - endpoint.name absent or empty + - workspace.name absent or empty + - region filter set and normalized endpoint location does not match + - managed scope not established per spec 9.1 + - provisioning_state != "Succeeded" (exact case-sensitive) + - created_at absent, invalid, in the future, or age < effective idle_days + - deployment inventory cannot be resolved (listing fails) + - no stable deployment with a known positive baseline instance count + - RequestsPerMinute metric result is not ZERO per spec 9.5 + +Cost model (spec 10): + estimated_monthly_cost_usd = None (always) + +APIs: + - Microsoft.MachineLearningServices/workspaces/read + - Microsoft.MachineLearningServices/workspaces/onlineEndpoints/read + - Microsoft.MachineLearningServices/workspaces/onlineEndpoints/deployments/read + - Microsoft.Insights/metrics/read +""" + from datetime import datetime, timedelta, timezone -from typing import Any, List, Optional +from typing import Any, List, Optional, Tuple -# Azure SDK (top-level imports for CI fail-fast) -from azure.ai.ml import MLClient from azure.core.exceptions import HttpResponseError from azure.mgmt.machinelearningservices import AzureMachineLearningWorkspaces from azure.mgmt.monitor import MonitorManagementClient @@ -13,39 +45,245 @@ from cleancloud.core.finding import Finding from cleancloud.core.risk import RiskLevel +_RULE_ID = "azure.ml.online_endpoint.idle" +_RESOURCE_TYPE = "azure.ml.online_endpoint" +_DEFAULT_IDLE_DAYS = 7 + RULE_METADATA = { - "id": "azure.ml.online_endpoint.idle", + "id": _RULE_ID, "category": "ai", "service": "machinelearningservices", "cost_impact": "high", } -# Metrics to try in order -_REQUEST_METRICS = ("RequestCount", "ModelEndpointRequests") +# GPU VM size prefixes — uppercase-normalized exact prefix matching (spec 7, 9.4) 
+_GPU_VM_PREFIXES = ("STANDARD_NC", "STANDARD_ND", "STANDARD_NV") -# Example VM SKU cost table (monthly per instance). Extend as needed. -_VM_SKU_COSTS = { - "Standard_NC6": 657.0, - "Standard_NC6s_v2": 900.0, - "Standard_NC12": 1300.0, - "Standard_NC24": 2600.0, -} -# Case-insensitive lookup: Azure SDK may return mixed-case SKU names. -_VM_SKU_COSTS_LOWER: dict = {k.lower(): v for k, v in _VM_SKU_COSTS.items()} - -_GPU_FAMILIES = ( - "standard_nc", - "standard_nd", - "standard_nv", - "standard_ncv2", - "standard_ncv3", - "standard_ndv2", - "standard_nd40rs", - "standard_nc4as_t4", - "standard_nc8as_t4", - "standard_nc16as_t4", - "standard_nc64as_t4", -) +_METRIC_NAME = "RequestsPerMinute" +_METRIC_AGGREGATION = "Average" +_METRIC_INTERVAL = "PT1M" + +_COVERAGE_ACCEPTABLE = 0.80 # minimum coverage for acceptable ZERO result +_COVERAGE_HIGH = 0.95 # coverage threshold for HIGH confidence + + +# --------------------------------------------------------------------------- +# Normalization +# --------------------------------------------------------------------------- + + +def _norm_location(s: str) -> str: + """Lowercase only — exact lowercase match per spec 7 (spaces and hyphens preserved).""" + return s.lower() if s else "" + + +def _extract_resource_group(resource_id: Optional[str]) -> Optional[str]: + """Extract resource group name from Azure ARM resource ID.""" + if not resource_id: + return None + parts = resource_id.split("/") + try: + idx = next(i for i, p in enumerate(parts) if p.lower() == "resourcegroups") + return parts[idx + 1] + except (StopIteration, IndexError): + return None + + +def _parse_utc_timestamp(raw) -> Optional[datetime]: + """ + Parse raw timestamp to UTC-normalized datetime. + Naive datetimes are treated as UTC; aware non-UTC datetimes are converted to UTC. 
+ """ + if raw is None: + return None + if isinstance(raw, datetime): + if raw.tzinfo is None: + return raw.replace(tzinfo=timezone.utc) + return raw.astimezone(timezone.utc) + if isinstance(raw, str): + try: + ts = datetime.fromisoformat(raw.rstrip("Z")) + if ts.tzinfo is None: + return ts.replace(tzinfo=timezone.utc) + return ts.astimezone(timezone.utc) + except ValueError: + return None + return None + + +def _is_gpu(instance_type: Optional[str]) -> bool: + """GPU classification: uppercase-normalized exact prefix matching (spec 7, 9.4).""" + if not instance_type: + return False + return any(instance_type.upper().startswith(p) for p in _GPU_VM_PREFIXES) + + +# --------------------------------------------------------------------------- +# Managed scope resolution (spec 9.1) +# --------------------------------------------------------------------------- + + +def _endpoint_scope_signal(endpoint) -> str: + """ + Resolve endpoint-level managed/kubernetes scope signal from documented + endpoint class name or kind attribute (spec 9.1.1). + Returns: "managed", "kubernetes", or "unknown". + """ + cls_name = type(endpoint).__name__ + if cls_name == "ManagedOnlineEndpoint": + return "managed" + if cls_name == "KubernetesOnlineEndpoint": + return "kubernetes" + + kind = getattr(endpoint, "kind", None) + if isinstance(kind, str): + k = kind.lower() + if k == "managed": + return "managed" + if k == "kubernetes": + return "kubernetes" + + return "unknown" + + +def _deployment_scope_signal(deployment) -> str: + """ + Resolve deployment-level managed/kubernetes scope hint from documented + deployment class name (spec 9.1.2). + Returns: "managed", "kubernetes", or "unknown". 
+ """ + cls_name = type(deployment).__name__ + if cls_name == "ManagedOnlineDeployment": + return "managed" + if cls_name == "KubernetesOnlineDeployment": + return "kubernetes" + return "unknown" + + +def _resolve_managed_scope(endpoint, stable_deployments: List) -> Tuple[bool, str]: + """ + Resolve managed scope per spec 9.1 strict priority rules. + Returns (is_managed: bool, managed_scope_source: str). + managed_scope_source: "endpoint", "deployment", or "none". + """ + ep_signal = _endpoint_scope_signal(endpoint) + + # Priority 1: endpoint-level explicit Kubernetes -> out of scope (spec 9.1.5.i) + if ep_signal == "kubernetes": + return (False, "none") + + # Collect deployment-level scope hints from stable deployments + dep_signals = [_deployment_scope_signal(d) for d in stable_deployments] + dep_has_managed = any(s == "managed" for s in dep_signals) + dep_has_kubernetes = any(s == "kubernetes" for s in dep_signals) + + # Priority 2: endpoint-level explicit managed (spec 9.1.5.ii) + if ep_signal == "managed": + # If any stable deployment explicitly identifies Kubernetes -> conflict -> skip (spec 9.1.6) + if dep_has_kubernetes: + return (False, "none") + return (True, "endpoint") + + # Priority 3: no endpoint-level signal; stable-deployment explicit managed (spec 9.1.5.iii) + if dep_has_managed and not dep_has_kubernetes: + return (True, "deployment") + + # Priority 4: out of scope (spec 9.1.5.iv) + return (False, "none") + + +# --------------------------------------------------------------------------- +# Traffic metric (spec 9.5) +# --------------------------------------------------------------------------- + + +def _query_requests_per_minute( + monitor_client: Any, + endpoint_id: str, + effective_idle_days: int, +) -> Tuple[str, Optional[float]]: + """ + Query RequestsPerMinute on the endpoint ARM resource id (spec 9.5). 
+ + Returns: + ("ZERO", coverage_ratio) when coverage >= 80% and every usable bucket Average == 0 + ("ACTIVE", None) when any usable bucket has Average > 0 + ("UNKNOWN", None) on query failure, no usable data, or coverage < 80% + """ + now_utc = datetime.now(timezone.utc) + # floor_to_minute(now_utc - 5 minutes) per spec 9.5 + raw_end = now_utc - timedelta(minutes=5) + metric_end_utc = raw_end.replace(second=0, microsecond=0) + window_start_utc = metric_end_utc - timedelta(days=effective_idle_days) + + fmt = "%Y-%m-%dT%H:%M:%SZ" + timespan = f"{window_start_utc.strftime(fmt)}/{metric_end_utc.strftime(fmt)}" + + # Expected complete minute buckets in [window_start_utc, metric_end_utc) + expected_buckets = int((metric_end_utc - window_start_utc).total_seconds() // 60) + if expected_buckets <= 0: + return ("UNKNOWN", None) + + try: + response = monitor_client.metrics.list( + endpoint_id, + metricnames=_METRIC_NAME, + timespan=timespan, + interval=_METRIC_INTERVAL, + aggregation=_METRIC_AGGREGATION, + ) + except PermissionError: + raise + except HttpResponseError as exc: + if exc.status_code in (401, 403): + raise PermissionError( + "Missing required permissions: Microsoft.Insights/metrics/read" + ) from exc + return ("UNKNOWN", None) + except Exception: + return ("UNKNOWN", None) + + # Count unique complete minute buckets inside [window_start_utc, metric_end_utc). + # A usable datapoint must have a parseable UTC timestamp within the window (spec 9.5). + # Deduplication by bucket prevents duplicate or overlapping series from overstating coverage. 
+ usable_buckets: set = set() + + for metric in response.value or []: + for ts in getattr(metric, "timeseries", None) or []: + for point in getattr(ts, "data", None) or []: + avg = getattr(point, "average", None) + if avg is None: + continue + + # Resolve and parse the datapoint timestamp + raw_ts = getattr(point, "time_stamp", None) + if raw_ts is None: + raw_ts = getattr(point, "timestamp", None) + pt_utc = _parse_utc_timestamp(raw_ts) + if pt_utc is None: + continue # unparseable timestamp -> not a usable datapoint (spec 9.5) + + # Floor to the minute to identify the complete minute bucket + bucket = pt_utc.replace(second=0, microsecond=0) + + # Must be within [window_start_utc, metric_end_utc) (spec 9.5.3) + if bucket < window_start_utc or bucket >= metric_end_utc: + continue # out-of-window -> skip + + usable_buckets.add(bucket) + if avg > 0: + return ("ACTIVE", None) + + coverage_ratio = len(usable_buckets) / expected_buckets + if coverage_ratio < _COVERAGE_ACCEPTABLE: + return ("UNKNOWN", None) + + return ("ZERO", coverage_ratio) + + +# --------------------------------------------------------------------------- +# Main rule function +# --------------------------------------------------------------------------- def find_idle_ml_online_endpoints( @@ -55,17 +293,28 @@ def find_idle_ml_online_endpoints( region_filter: str = None, client: Optional[Any] = None, monitor_client: Optional[Any] = None, - idle_days: int = 7, + idle_days: int = _DEFAULT_IDLE_DAYS, ) -> List[Finding]: - """Find AML managed online endpoints with zero scoring requests over `idle_days`. - - Uses azure-mgmt-machinelearningservices for workspace enumeration (ARM) and - azure-ai-ml MLClient for per-workspace endpoint and deployment listing. - When `client` is injected (tests), it serves as both. + """ + Find Azure ML managed online endpoints with zero RequestsPerMinute while + retaining positive deployment baseline instances. 
+ + Detection logic (spec 4, 8, 9): + - Managed scope established from documented endpoint/deployment surfaces + - Endpoint provisioning_state exactly "Succeeded" + - Endpoint created_at resolves to a known UTC timestamp; age >= effective idle_days + - At least one stable deployment with a known positive baseline instance count + - RequestsPerMinute == 0 across the rolling UTC window defined in spec 9.5 + + IAM permissions: + - Microsoft.MachineLearningServices/workspaces/read + - Microsoft.MachineLearningServices/workspaces/onlineEndpoints/read + - Microsoft.MachineLearningServices/workspaces/onlineEndpoints/deployments/read + - Microsoft.Insights/metrics/read """ findings: List[Finding] = [] now = datetime.now(timezone.utc) - idle_days = max(idle_days, 3) + effective_idle_days = max(idle_days, 1) # spec 6.3: minimum effective 1 arm_client = client or AzureMachineLearningWorkspaces( credential=credential, subscription_id=subscription_id @@ -74,14 +323,13 @@ def find_idle_ml_online_endpoints( credential=credential, subscription_id=subscription_id ) - def _norm(s: str) -> str: - return "".join(c for c in (s or "").lower() if c.isalnum()) - def _ws_client(rg: str, ws_name: str) -> Any: # Tests inject a single mock client that covers all operations. # Production creates a workspace-scoped MLClient for endpoint/deployment ops. 
if client is not None: return client + from azure.ai.ml import MLClient # noqa: PLC0415 + return MLClient( credential=credential, subscription_id=subscription_id, @@ -89,338 +337,245 @@ def _ws_client(rg: str, ws_name: str) -> Any: workspace_name=ws_name, ) - try: - for ws in arm_client.workspaces.list_by_subscription(): - location_raw = getattr(ws, "location", "") or "" - if region_filter and _norm(location_raw) != _norm(region_filter): - continue - - # Resource group: prefer the attribute, fall back to parsing the ARM id - rg = getattr(ws, "resource_group", None) - if not rg and getattr(ws, "id", None): - parts = ws.id.split("/") - rg_idx = next( - (i for i, p in enumerate(parts) if p.lower() == "resourcegroups"), - None, - ) - rg = parts[rg_idx + 1] if rg_idx is not None and rg_idx + 1 < len(parts) else None - if not rg: - continue - - # ARM resource ID for Azure Monitor metrics (workspace scope) - ws_id = getattr(ws, "id", None) or ( - f"/subscriptions/{subscription_id}/resourceGroups/{rg}" - f"/providers/Microsoft.MachineLearningServices/workspaces/{ws.name}" - ) - - ep_client = _ws_client(rg, ws.name) - - try: - for ep in ep_client.online_endpoints.list(): - prov = getattr(ep, "provisioning_state", None) - if (prov or "").lower() != "succeeded": - continue + region_filter_norm = _norm_location(region_filter) if region_filter else None - # Age — azure-ai-ml uses creation_context; fall back to system_data - age_days: Optional[int] = None - ctx = getattr(ep, "creation_context", None) or getattr(ep, "system_data", None) - created_at = getattr(ctx, "created_at", None) if ctx is not None else None - if created_at is not None: - if getattr(created_at, "tzinfo", None) is None: - created_at = created_at.replace(tzinfo=timezone.utc) - age_days = max((now - created_at).days, 0) - if age_days < max(idle_days // 2, 3): - continue + # Subscription-wide workspace inventory: propagate if this fails (spec 12) + for ws in arm_client.workspaces.list_by_subscription(): + # spec 
8.3: workspace name guard + ws_name = getattr(ws, "name", None) + if not ws_name: + continue - effective_window = ( - min(idle_days, age_days) if age_days is not None else idle_days - ) - if effective_window < 3: - continue + # Resolve resource group from workspace + rg = getattr(ws, "resource_group", None) + if not rg: + rg = _extract_resource_group(getattr(ws, "id", None)) + if not rg: + continue - # Metric checks scoped to workspace resource with EndpointName dimension - idle_signal = _check_requests(mon_client, ws_id, ep.name, effective_window) + try: + ep_client = _ws_client(rg, ws_name) - if idle_signal is None or idle_signal[0] == "active": + for ep in ep_client.online_endpoints.list(): + try: + # spec 8.1: endpoint.id guard + ep_id = getattr(ep, "id", None) + if not ep_id: continue - signal_scope, idle_metric = idle_signal + # spec 8.2: endpoint.name guard + ep_name = getattr(ep, "name", None) + if not ep_name: + continue - if signal_scope == "no_data": - if age_days is not None and age_days >= idle_days * 2: - signal_scope = "age_only" - idle_metric = "none" - confidence = ConfidenceLevel.LOW - else: - continue - elif signal_scope == "workspace_level": - # Pass-2 signal: zero traffic at workspace level — endpoint likely idle - # but cannot be confirmed per-endpoint; require age >= idle_days before - # emitting even a LOW-confidence finding to reduce false positives. 
- if age_days is None or age_days < idle_days: - continue - confidence = ConfidenceLevel.LOW - elif ( - signal_scope == "per_endpoint" - and age_days is not None - and age_days >= idle_days - ): - confidence = ConfidenceLevel.HIGH - elif ( - signal_scope == "per_endpoint" - and age_days is not None - and age_days >= math.ceil(idle_days * 0.75) - ): - confidence = ConfidenceLevel.MEDIUM - elif signal_scope == "per_endpoint" and age_days is None: - confidence = ConfidenceLevel.MEDIUM - else: + # Endpoint location (spec 7, 9.2.1: use endpoint resource location, not workspace) + location_raw = getattr(ep, "location", None) or "" + location_norm = _norm_location(location_raw) + if not location_norm: + continue # spec 7: unresolved location -> skip + + # spec 8.4: region filter — exact lowercase equality + if region_filter_norm and location_norm != region_filter_norm: + continue + + # spec 8.6: provisioning_state must be exactly "Succeeded" (case-sensitive) + prov_state = getattr(ep, "provisioning_state", None) + if prov_state != "Succeeded": continue - # Deployment details via online_deployments.list (azure-ai-ml) - instance_type = None - total_instances = 0 - had_instance_data = False - is_gpu = False - deployment_count = 0 + # spec 8.7 / 9.2: created_at from systemData.createdAt + created_at: Optional[datetime] = None + sys_data = getattr(ep, "system_data", None) or getattr(ep, "systemData", None) + if sys_data is not None: + raw_created = getattr(sys_data, "created_at", None) + if raw_created is None: + raw_created = getattr(sys_data, "createdAt", None) + created_at = _parse_utc_timestamp(raw_created) + + if created_at is None: + continue # spec 8.7: required + if created_at > now: + continue # spec 9.2.3: future created_at -> skip + age_days = (now - created_at).days + if age_days < effective_idle_days: + continue # spec 8.7 / 9.2.4: age gate + + # spec 8.8: deployment inventory must resolve successfully + all_deployments: List = [] try: - for d in 
ep_client.online_deployments.list(ep.name): - deployment_count += 1 - it = ( - getattr(d, "instance_type", None) - or getattr(getattr(d, "sku", None), "name", None) - or getattr(getattr(d, "properties", None), "instanceType", None) - ) - if it: - instance_type = instance_type or it # keep first non-None - it_norm = it.lower() - if any( - it_norm.startswith(f) or f in it_norm for f in _GPU_FAMILIES - ): - is_gpu = True - scale = getattr(d, "scale_settings", None) - # Use explicit is-not-None checks: 0 is a valid (scale-to-zero) count - _candidates = [ - ( - getattr(scale, "min_instances", None) - if scale is not None - else None - ), - getattr(d, "instance_count", None), - ( - getattr(scale, "min_replicas", None) - if scale is not None - else None - ), - getattr( - getattr(d, "properties", None), - "minReplicaCount", - None, - ), - ] - cnt = next((v for v in _candidates if v is not None), None) - if cnt is not None: - had_instance_data = True - total_instances += int(cnt) + for dep in ep_client.online_deployments.list(ep_name): + all_deployments.append(dep) except Exception: - pass - - # Scale-to-zero endpoints have no running instances and no cost - if had_instance_data and total_instances == 0: + continue # spec 8.8: listing failure -> skip endpoint + + # Stable deployments: exact provisioning_state == "Succeeded" (spec 9.3.2) + stable_deployments = [ + d + for d in all_deployments + if getattr(d, "provisioning_state", None) == "Succeeded" + ] + + # spec 8.5: managed scope per spec 9.1 + is_managed, managed_scope_source = _resolve_managed_scope( + ep, stable_deployments + ) + if not is_managed: continue - # Cost lookup — only emit a cost when we have a known SKU price; - # guessing an unknown SKU's cost erodes trust in the findings. 
- monthly_cost = None - if instance_type and total_instances: - base = _VM_SKU_COSTS_LOWER.get(instance_type.lower()) - if base is not None: - monthly_cost = base * total_instances + # spec 8.9 / 9.3: billing-relevant deployments + billing_relevant_count = 0 + total_baseline_instances = 0 + first_instance_type: Optional[str] = None + any_gpu = False + + for dep in stable_deployments: + # Baseline instance count resolution order (spec 9.3.4): + # scale_settings.min_instances -> instance_count -> unknown + scale = getattr(dep, "scale_settings", None) + cnt = None + if scale is not None: + cnt = getattr(scale, "min_instances", None) + if cnt is None: + cnt = getattr(dep, "instance_count", None) + + if cnt is None: + continue # unknown -> not billing-relevant (spec 9.3.4-5) + try: + cnt_int = int(cnt) + except (TypeError, ValueError): + continue + if cnt_int <= 0: + continue # not billing-relevant (spec 9.3.5) + + billing_relevant_count += 1 + total_baseline_instances += cnt_int - idle_ratio = ( - (age_days / idle_days) if (age_days is not None and idle_days) else None + it = getattr(dep, "instance_type", None) + if it and first_instance_type is None: + first_instance_type = it + if it and _is_gpu(it): + any_gpu = True + + if billing_relevant_count == 0: + continue # spec 8.9 / 9.3.6: no billing-relevant deployment + + # spec 8.10: traffic metric must resolve to ZERO per spec 9.5 + metric_result, coverage_ratio = _query_requests_per_minute( + mon_client, ep_id, effective_idle_days ) + if metric_result != "ZERO": + continue # ACTIVE or UNKNOWN -> skip (spec 8.10) - # Risk — CRITICAL only on strong per-endpoint signal; LOW-confidence - # signals (workspace_level, age_only) must not escalate beyond HIGH. 
- if ( - is_gpu - and signal_scope == "per_endpoint" - and idle_ratio is not None - and idle_ratio >= 2.0 - ): - risk = RiskLevel.CRITICAL - elif is_gpu: - risk = RiskLevel.HIGH + # spec 9.6: confidence from metric coverage + if coverage_ratio >= _COVERAGE_HIGH: + confidence = ConfidenceLevel.HIGH else: - risk = RiskLevel.MEDIUM + confidence = ConfidenceLevel.MEDIUM - if signal_scope == "age_only": - primary_signal = ( - f"No Azure Monitor metric data available; endpoint age ({age_days} days) " - f"exceeds {idle_days * 2} days" - ) - elif signal_scope == "workspace_level": - primary_signal = ( - f"No observable endpoint-level traffic; workspace metrics show zero " - f"activity for {effective_window} days (Azure Monitor: {idle_metric})" - ) - else: - primary_signal = ( - f"Zero scoring requests for {effective_window} days " - f"(Azure Monitor: {idle_metric})" - ) - signals = [primary_signal, f"Provisioning state: {prov}"] - if age_days is not None: - signals.append(f"Endpoint age: {age_days} days") - if monthly_cost: - signals.append(f"Estimated cost: ~${monthly_cost:,.0f}/month") - - evidence = Evidence( - signals_used=signals, - signals_not_checked=[ - "Batch scoring pipelines or scheduled external callers", - "Failover/shadow deployments", - "A/B test traffic splits", - ], - time_window=f"{effective_window} days", - ) + # spec 9.6: risk from GPU presence + risk = RiskLevel.HIGH if any_gpu else RiskLevel.MEDIUM + + # spec 9.5: idle_since_days = effective idle window (not observational estimate) + idle_since_days = effective_idle_days + + # Endpoint kind for details; tags never None in output (spec 7) + ep_kind = getattr(ep, "kind", None) + tags = getattr(ep, "tags", None) or {} + + # spec 11.2: signals_used + signals_used = [ + f"Managed scope established from {managed_scope_source} surfaces", + "Endpoint provisioning state is 'Succeeded'", + ( + f"Endpoint age is {age_days} days " + f"(>= configured idle window of {effective_idle_days} days)" + ), + ( + 
f"{billing_relevant_count} deployment(s) retain positive configured " + f"baseline instance count (total: {total_baseline_instances})" + ), + ( + f"{_METRIC_NAME} metric result is ZERO with " + f">={_COVERAGE_ACCEPTABLE:.0%} coverage " + f"across a {effective_idle_days}-day rolling UTC window " + f"(coverage: {coverage_ratio:.1%}, aggregation: {_METRIC_AGGREGATION})" + ), + ] details = { - "endpoint_name": ep.name, - "workspace_name": ws.name, + "endpoint_name": ep_name, + "workspace_name": ws_name, "resource_group": rg, - "instance_type": instance_type, - "min_instance_count": total_instances, - "deployment_count": deployment_count, - "is_gpu": is_gpu, - "age_days": age_days, - "idle_days_threshold": idle_days, - "idle_signal_scope": signal_scope, - "estimated_monthly_cost": monthly_cost, - "cost_source": ( - "heuristic_sku_table" - if (instance_type and instance_type.lower() in _VM_SKU_COSTS_LOWER) - else "unknown" - ), + "subscription_id": subscription_id, + "location": location_norm, + "endpoint_kind": ep_kind, + "managed_scope_source": managed_scope_source, + "endpoint_provisioning_state": "Succeeded", + "created_at": created_at.isoformat(), + "billing_relevant_deployment_count": billing_relevant_count, + "deployment_count": len(all_deployments), + "stable_deployment_count": len(stable_deployments), + "instance_type": first_instance_type, + "is_gpu": any_gpu, + "baseline_instance_count_total": total_baseline_instances, + "idle_days_threshold": effective_idle_days, + "idle_since_days": idle_since_days, + "metric_name": _METRIC_NAME, + "metric_aggregation": _METRIC_AGGREGATION, + "metric_coverage_ratio": coverage_ratio, + "tags": tags, } - title = f"Idle Azure ML Online Endpoint: {ep.name}" + title = f"Idle Azure ML Managed Online Endpoint: {ep_name}" summary = ( - f"Azure ML online endpoint '{ep.name}' in workspace '{ws.name}' has received " - f"no scoring requests for {effective_window}+ days and continues to bill per-instance." 
+ f"Azure ML managed online endpoint '{ep_name}' in workspace '{ws_name}' " + f"has received no scoring requests (RequestsPerMinute == 0) for " + f"{effective_idle_days} days while retaining " + f"{total_baseline_instances} positive baseline deployment instance(s), " + f"continuing to incur compute cost." + ) + reason = ( + f"RequestsPerMinute is zero across a {effective_idle_days}-day rolling " + f"window while {billing_relevant_count} deployment(s) retain positive " + f"baseline instances" ) - reason = signals[0] findings.append( Finding( provider="azure", - rule_id="azure.ml.online_endpoint.idle", - resource_type="azure.ml.online_endpoint", - resource_id=ep.id, - region=location_raw, + rule_id=_RULE_ID, + resource_type=_RESOURCE_TYPE, + resource_id=ep_id, + region=location_norm, title=title, summary=summary, reason=reason, risk=risk, confidence=confidence, detected_at=now, - evidence=evidence, + estimated_monthly_cost_usd=None, # spec 10: always None + evidence=Evidence( + signals_used=signals_used, + signals_not_checked=[ + "Future traffic intent or standby usage", + "Autoscale policies or live instance state not visible from deployment configuration", + "Exact endpoint cost after discounts, reservations, or special commercial terms", + "Business-owner intent or rollout plans", + ], + time_window=f"{effective_idle_days} days", + ), details=details, - estimated_monthly_cost_usd=monthly_cost, ) ) - except PermissionError: - raise - except Exception: - continue - - except PermissionError: - raise - except HttpResponseError as e: - if e.status_code in (401, 403): - raise PermissionError( - "Missing required permissions: Microsoft.MachineLearningServices/workspaces/read, " - "Microsoft.MachineLearningServices/workspaces/onlineEndpoints/read, " - "Microsoft.MachineLearningServices/workspaces/onlineEndpoints/deployments/read, " - "Microsoft.Insights/metrics/read" - ) from e - raise - - return findings - - -def _check_requests( - monitor_client: Any, - workspace_id: 
str, - endpoint_name: str, - days: int, -) -> Optional[tuple]: - now = datetime.now(timezone.utc) - start = now - timedelta(days=max(days, 1)) - fmt = "%Y-%m-%dT%H:%M:%SZ" - timespan = f"{start.strftime(fmt)}/{now.strftime(fmt)}" - coverage_threshold = max(int(days * 0.7), 3) - - had_successful_call = False - - for metric_name in _REQUEST_METRICS: - try: - # Pass 1: filter by EndpointName dimension. - # OData single-quote escaping: replace ' with '' per OData spec. - safe_name = endpoint_name.replace("'", "''") - response = monitor_client.metrics.list( - workspace_id, - metricnames=metric_name, - timespan=timespan, - interval="PT24H", - aggregation="Total", - filter=f"EndpointName eq '{safe_name}'", - ) - had_successful_call = True - has_timeseries = False - seen_datapoints = 0 - for metric in response.value: - for ts in metric.timeseries: - has_timeseries = True - for point in ts.data: - if point.total is not None: - seen_datapoints += 1 - if point.total > 0: - return ("active", None) - - if has_timeseries and seen_datapoints >= coverage_threshold: - return ("per_endpoint", metric_name) - - # Pass 2: no filter — dimension may not be emitted for this endpoint - response2 = monitor_client.metrics.list( - workspace_id, - metricnames=metric_name, - timespan=timespan, - interval="PT24H", - aggregation="Total", - ) - seen2 = 0 - for metric in response2.value: - for ts in metric.timeseries: - for point in ts.data: - if point.total is not None: - seen2 += 1 - if point.total > 0: - return ("active", None) - if seen2 >= coverage_threshold: - return ("workspace_level", metric_name) + except PermissionError: + raise + except Exception: + continue # spec 12: malformed or failed per-endpoint record -> skip except PermissionError: raise - except HttpResponseError as e: - if e.status_code in (401, 403): - raise PermissionError( - "Missing required permissions: Microsoft.Insights/metrics/read" - ) from e - continue except Exception: - continue + continue # spec 12: per-workspace 
failure -> skip; preserve findings so far - return ("no_data", None) if had_successful_call else None + return findings diff --git a/cleancloud/providers/azure/rules/ai/openai_provisioned_idle.py b/cleancloud/providers/azure/rules/ai/openai_provisioned_idle.py index eb5b391..ff57344 100644 --- a/cleancloud/providers/azure/rules/ai/openai_provisioned_idle.py +++ b/cleancloud/providers/azure/rules/ai/openai_provisioned_idle.py @@ -1,8 +1,45 @@ -import math +""" +Rule: azure.openai.provisioned_deployment.idle + +Intent: + Detect Azure OpenAI provisioned deployments that retain billable PTU capacity + while showing no observed Azure OpenAI request traffic over a conservative + documented observation window. + + This rule is deliberately precision-first. It is not proof that deleting a + deployment is safe, not proof that a reservation can be canceled without + consequence, and not proof of an exact monthly saving. It is a conservative + review-candidate rule for provisioned Azure OpenAI deployments that appear to + be continuously billed but unused. 
+ +Exclusions (spec 8): + - account.id absent, None, or empty + - account.name absent, None, or empty + - deployment.id absent, None, or empty + - deployment.name absent, None, or empty + - account location unresolved + - region filter set and normalized account location does not match + - account provisioning_state != "Succeeded" (exact case-sensitive) + - deployment provisioning_state != "Succeeded" (exact case-sensitive) + - deployment model_format != "OpenAI" (exact case-sensitive) + - deployment sku_name not in documented provisioned-managed set + - ptu_capacity absent, invalid, zero, or negative + - created_at absent, invalid, in the future, or age < effective idle_days + - AzureOpenAIRequests metric result not ZERO per spec 9.3 + +Cost model (spec 10): + estimated_monthly_cost_usd = None (always) + +APIs: + - Microsoft.CognitiveServices/accounts/read + - Microsoft.CognitiveServices/accounts/deployments/read + - Microsoft.Insights/metrics/read +""" + from datetime import datetime, timedelta, timezone -from typing import Any, List, Optional +from typing import Any, Dict, List, Optional, Tuple -# Azure SDK (top-level imports for CI fail-fast) +from azure.core.exceptions import HttpResponseError from azure.mgmt.cognitiveservices import CognitiveServicesManagementClient from azure.mgmt.monitor import MonitorManagementClient @@ -11,14 +48,18 @@ from cleancloud.core.finding import Finding from cleancloud.core.risk import RiskLevel +_RULE_ID = "azure.openai.provisioned_deployment.idle" +_RESOURCE_TYPE = "azure.openai.provisioned_deployment" +_DEFAULT_IDLE_DAYS = 7 + RULE_METADATA = { - "id": "azure.openai.provisioned_deployment.idle", + "id": _RULE_ID, "category": "ai", "service": "cognitiveservices", "cost_impact": "high", } -# Provisioned SKU names — these bill by PTU regardless of usage +# Provisioned SKU names that bill by PTU (spec 3.2, 9.1.5) _PROVISIONED_SKUS = frozenset( { "ProvisionedManaged", @@ -27,18 +68,208 @@ } ) -# Account kinds that host Azure OpenAI 
deployments -_OPENAI_KINDS = frozenset({"OpenAI", "AIServices"}) +_METRIC_NAME = "AzureOpenAIRequests" +_METRIC_AGGREGATION = "Total" +_METRIC_INTERVAL = "PT1M" -# On-demand PTU cost: $2/PTU/hour × 730 hours/month -# Reserved pricing is lower (~$1,000–$1,200/PTU/month) — we report on-demand as the ceiling. -_PTU_MONTHLY_COST_USD = 1_460.0 +_COVERAGE_ACCEPTABLE = 0.80 # minimum coverage for acceptable ZERO result +_COVERAGE_HIGH = 0.95 # coverage threshold for HIGH confidence +_MAX_IDLE_DAYS = 30 # soft upper bound; avoids multi-month metric queries + + +# --------------------------------------------------------------------------- +# Normalization +# --------------------------------------------------------------------------- + + +def _norm_location(s: str) -> str: + """Lowercase only — exact lowercase match per spec 7 (spaces and hyphens preserved).""" + return s.lower() if s else "" + + +def _extract_resource_group(resource_id: Optional[str]) -> Optional[str]: + """Extract resource group name from Azure ARM resource ID.""" + if not resource_id: + return None + parts = resource_id.split("/") + try: + idx = next(i for i, p in enumerate(parts) if p.lower() == "resourcegroups") + return parts[idx + 1] + except (StopIteration, IndexError): + return None + + +def _parse_utc_timestamp(raw) -> Optional[datetime]: + """ + Parse raw timestamp to UTC-normalized datetime. + Naive datetimes are treated as UTC; aware non-UTC datetimes are converted to UTC. 
+ """ + if raw is None: + return None + if isinstance(raw, datetime): + if raw.tzinfo is None: + return raw.replace(tzinfo=timezone.utc) + return raw.astimezone(timezone.utc) + if isinstance(raw, str): + try: + ts = datetime.fromisoformat(raw.rstrip("Z")) + if ts.tzinfo is None: + return ts.replace(tzinfo=timezone.utc) + return ts.astimezone(timezone.utc) + except ValueError: + return None + return None + + +def _escape_odata_string(value: str) -> str: + """ + Escape a string literal for use in an Azure Monitor OData filter expression. + + Rules applied: + - Single quotes are escaped to '' (OData spec 4.5.2 string literals). + - ASCII control characters (< 0x20, excluding tab 0x09) are stripped because + they cannot appear in a well-formed OData string and can confuse filter parsing. + """ + # Strip ASCII control chars before building the OData filter string. + # Azure Monitor rejects filter expressions that contain raw control chars + # (NUL, CR, LF, ESC, etc.); tab (0x09) is kept as it can appear in valid names. + sanitized = "".join(ch for ch in value if ch == "\t" or ord(ch) >= 0x20) + return sanitized.replace("'", "''") -# Azure Monitor metric names to check for request activity (tried in order) -_REQUEST_METRICS = ( - "AzureOpenAIRequests", - "ProcessedPromptTokens", -) + +# --------------------------------------------------------------------------- +# Traffic metric (spec 9.3) +# --------------------------------------------------------------------------- + + +def _query_openai_requests( + monitor_client: Any, + account_id: str, + deployment_name: str, + effective_idle_days: int, +) -> Tuple[ + str, Optional[float], Optional[int], Optional[int], Optional[datetime], Optional[datetime] +]: + """ + Query AzureOpenAIRequests on the parent account ARM resource id (spec 9.3). + + After applying the ModelDeploymentName deployment scoping filter, sums Total + across all remaining dimension series into per-minute bucket_totals. 
Any + bucket_total > 0 is activity; coverage is based on unique usable minute buckets + after that aggregation (spec 9.3.6–9.3.8). + + Returns a 6-tuple: (result_code, coverage_ratio, expected_buckets, observed_count, + window_start_utc, metric_end_utc) + + result_code values: + "ZERO" coverage >= 80% and every bucket_total == 0 + "ACTIVE" any bucket_total > 0 + "UNKNOWN_QUERY_FAILURE" exception during query or invalid window (expected_buckets <= 0) + "UNKNOWN_NO_DATA" query succeeded but no usable datapoints inside the window + "UNKNOWN_LOW_COVERAGE" had usable data but coverage fell below 80% threshold + + Window timestamps are returned for every non-failure case to let callers record + the exact evaluation window in finding details. + """ + now_utc = datetime.now(timezone.utc) + # floor_to_minute(now_utc - 5 minutes) per spec 9.3 + raw_end = now_utc - timedelta(minutes=5) + metric_end_utc = raw_end.replace(second=0, microsecond=0) + window_start_utc = metric_end_utc - timedelta(days=effective_idle_days) + + fmt = "%Y-%m-%dT%H:%M:%SZ" + timespan = f"{window_start_utc.strftime(fmt)}/{metric_end_utc.strftime(fmt)}" + + # Expected complete minute buckets in [window_start_utc, metric_end_utc) + expected_buckets = int((metric_end_utc - window_start_utc).total_seconds() // 60) + if expected_buckets <= 0: + return ("UNKNOWN_QUERY_FAILURE", None, None, None, None, None) + + try: + response = monitor_client.metrics.list( + account_id, + metricnames=_METRIC_NAME, + timespan=timespan, + interval=_METRIC_INTERVAL, + aggregation=_METRIC_AGGREGATION, + filter=f"ModelDeploymentName eq '{_escape_odata_string(deployment_name)}'", + ) + except PermissionError: + raise + except HttpResponseError as exc: + if exc.status_code in (401, 403): + raise PermissionError( + "Missing required permissions: Microsoft.Insights/metrics/read" + ) from exc + return ("UNKNOWN_QUERY_FAILURE", None, None, None, None, None) + except Exception: + return ("UNKNOWN_QUERY_FAILURE", None, None, None, None, 
None) + + # Per-minute bucket totals: sum Total across all remaining dimension series (spec 9.3.6.v). + # Keyed by floored-to-minute UTC datetime so duplicate timestamps or multiple series + # do not overstate coverage (spec 9.3.8). + usable_bucket_totals: Dict[datetime, float] = {} + + for metric in response.value or []: + for ts in getattr(metric, "timeseries", None) or []: + for point in getattr(ts, "data", None) or []: + total = getattr(point, "total", None) + if total is None: + continue + + # Resolve and parse the datapoint timestamp + raw_ts = getattr(point, "time_stamp", None) + if raw_ts is None: + raw_ts = getattr(point, "timestamp", None) + pt_utc = _parse_utc_timestamp(raw_ts) + if pt_utc is None: + continue # unparseable timestamp -> not a usable datapoint + + # Floor to the minute to identify the complete minute bucket + bucket = pt_utc.replace(second=0, microsecond=0) + + # Must be within [window_start_utc, metric_end_utc) (spec 9.3.4) + if bucket < window_start_utc or bucket >= metric_end_utc: + continue + + usable_bucket_totals[bucket] = usable_bucket_totals.get(bucket, 0.0) + total + + observed_count = len(usable_bucket_totals) + + if observed_count == 0: + return ("UNKNOWN_NO_DATA", None, expected_buckets, 0, window_start_utc, metric_end_utc) + + # Any positive bucket_total means activity (spec 9.3.10) + for bucket_total in usable_bucket_totals.values(): + if bucket_total > 0: + return ("ACTIVE", None, None, None, None, None) + + # Azure Monitor minute coverage can be sparse or arrive late; this rule intentionally + # fails closed — insufficient coverage produces UNKNOWN rather than a false ZERO. 
+ coverage_ratio = observed_count / expected_buckets + if coverage_ratio < _COVERAGE_ACCEPTABLE: + return ( + "UNKNOWN_LOW_COVERAGE", + None, + expected_buckets, + observed_count, + window_start_utc, + metric_end_utc, + ) + + return ( + "ZERO", + coverage_ratio, + expected_buckets, + observed_count, + window_start_utc, + metric_end_utc, + ) + + +# --------------------------------------------------------------------------- +# Main rule function +# --------------------------------------------------------------------------- def find_idle_openai_provisioned_deployments( @@ -48,44 +279,19 @@ def find_idle_openai_provisioned_deployments( region_filter: str = None, client: Optional[Any] = None, monitor_client: Optional[Any] = None, - idle_days: int = 7, + idle_days: int = _DEFAULT_IDLE_DAYS, ) -> List[Finding]: """ - Find Azure OpenAI provisioned deployments (PTUs) with zero API requests. - - Provisioned Throughput Units (PTUs) reserve dedicated model capacity and - bill continuously at ~$1,460/PTU/month (on-demand) regardless of traffic. - A provisioned deployment with zero requests is paying for capacity that is - delivering zero value — typically a forgotten dev/test deployment, a - proof-of-concept that was never decommissioned, or a migration where traffic - moved to a different deployment but the old one was left running. - - This is the Azure equivalent of an idle SageMaker Provisioned endpoint: - same always-on billing model, same abandonment pattern. 
- - Detection logic: - - Account kind is OpenAI or AIServices - - Deployment SKU is ProvisionedManaged, GlobalProvisionedManaged, or - DataZoneProvisionedManaged (capacity-based billing, not token-based) - - Azure Monitor AzureOpenAIRequests (or ProcessedPromptTokens) sum is 0 - over the idle window, scoped to this deployment via dimension filter - - Metric strategy: - - Query AzureOpenAIRequests with ModelDeploymentName dimension filter - (per-deployment signal — most reliable) - - Fall back to ProcessedPromptTokens with same filter if AzureOpenAIRequests - returns no timeseries (metric or dimension unsupported in this region) - - If both per-deployment queries return no data, result is ("no_data", None). - Account-level aggregation is NOT used as a fallback: a zero account total only - covers deployments that emit the metric; deployments that don't are invisible, - making account-level zero an unsafe basis for a finding. - - Conservative: return None (assume active) on any API exception - - Confidence: - - HIGH: Per-deployment metric confirms zero requests, deployment age >= idle_days - - MEDIUM: Per-deployment metric confirms zero, age >= ceil(75% of idle_days) but - < idle_days; OR per-deployment metric confirms zero, age unknown; OR metrics - unavailable and deployment age >= 2× idle_days (age-only fallback) + Find Azure OpenAI provisioned deployments with zero AzureOpenAIRequests while + retaining positive PTU capacity. 
+ + Detection logic (spec 4, 8, 9): + - model_format == "OpenAI" (exact, case-sensitive; establishes OpenAI scope per spec 9.1.4) + - sku_name in documented provisioned-managed set + - Account and deployment provisioning_state both exactly "Succeeded" + - ptu_capacity > 0 (billing-relevant per spec 9.1.7) + - created_at resolves to known UTC timestamp; age >= effective idle_days + - AzureOpenAIRequests == 0 across the rolling UTC window defined in spec 9.3 IAM permissions: - Microsoft.CognitiveServices/accounts/read @@ -94,8 +300,8 @@ def find_idle_openai_provisioned_deployments( """ findings: List[Finding] = [] now = datetime.now(timezone.utc) - - idle_days = max(idle_days, 3) + # spec 6.3: minimum 1; soft upper bound avoids multi-month metric queries + effective_idle_days = min(max(idle_days, 1), _MAX_IDLE_DAYS) cs_client = client or CognitiveServicesManagementClient( credential=credential, subscription_id=subscription_id @@ -104,341 +310,251 @@ def find_idle_openai_provisioned_deployments( credential=credential, subscription_id=subscription_id ) - def _norm(s: str) -> str: - return s.lower().replace(" ", "").replace("-", "") + region_filter_norm = _norm_location(region_filter) if region_filter else None + + # Subscription-wide account inventory: propagate if this fails (spec 12) + for account in cs_client.accounts.list(): + # spec 8.1: account.id guard + account_id = getattr(account, "id", None) + if not account_id: + continue + + # spec 8.2: account.name guard + account_name = getattr(account, "name", None) + if not account_name: + continue + + # spec 8.5: account location must resolve + location_raw = getattr(account, "location", None) or "" + location_norm = _norm_location(location_raw) + if not location_norm: + continue # unresolved location -> skip (spec 7) + + # spec 8.6: region filter — exact lowercase equality + if region_filter_norm and location_norm != region_filter_norm: + continue + + # spec 8.7: account provisioning_state must be exactly "Succeeded" 
+ # Check both snake_case (SDK) and camelCase (raw shape) field names. + acct_props = getattr(account, "properties", None) + account_prov_state = None + if acct_props is not None: + account_prov_state = getattr(acct_props, "provisioning_state", None) + if account_prov_state is None: + account_prov_state = getattr(acct_props, "provisioningState", None) + if account_prov_state != "Succeeded": + continue + + account_kind = getattr(account, "kind", None) + + rg = _extract_resource_group(account_id) + if not rg: + continue - try: - for account in cs_client.accounts.list(): - # Only Azure OpenAI accounts - if getattr(account, "kind", None) not in _OPENAI_KINDS: - continue - - location_raw = account.location or "" - if region_filter and _norm(location_raw) != _norm(region_filter): - continue - - rg = _parse_resource_group(account.id) - if not rg: - continue - - try: - for deployment in cs_client.deployments.list(rg, account.name): - sku_name = getattr(deployment.sku, "name", None) if deployment.sku else None + try: + for deployment in cs_client.deployments.list(rg, account_name): + try: + # spec 8.3: deployment.id guard + dep_id = getattr(deployment, "id", None) + if not dep_id: + continue + + # spec 8.4: deployment.name guard + dep_name = getattr(deployment, "name", None) + if not dep_name: + continue + + dep_props = getattr(deployment, "properties", None) + + # spec 8.8: deployment provisioning_state must be exactly "Succeeded" + # Check both snake_case (SDK) and camelCase (raw shape) field names. 
+ dep_prov_state = None + if dep_props is not None: + dep_prov_state = getattr(dep_props, "provisioning_state", None) + if dep_prov_state is None: + dep_prov_state = getattr(dep_props, "provisioningState", None) + if dep_prov_state != "Succeeded": + continue + + # spec 8.9: model_format must be exactly "OpenAI" (case-sensitive) + model = getattr(dep_props, "model", None) if dep_props is not None else None + model_format = getattr(model, "format", None) if model is not None else None + if model_format != "OpenAI": + continue + + model_name = getattr(model, "name", None) if model is not None else None + model_version = getattr(model, "version", None) if model is not None else None + + # spec 8.10: sku_name must be in documented provisioned-managed set + sku = getattr(deployment, "sku", None) + sku_name = getattr(sku, "name", None) if sku is not None else None if sku_name not in _PROVISIONED_SKUS: continue - ptu_capacity = getattr(deployment.sku, "capacity", None) or 0 - model_name = None - if deployment.properties: - model = getattr(deployment.properties, "model", None) - if model: - model_name = getattr(model, "name", None) + # spec 8.11: ptu_capacity must be a known integer > 0 + ptu_capacity = None + if sku is not None: + raw_capacity = getattr(sku, "capacity", None) + if raw_capacity is not None: + try: + ptu_capacity = int(raw_capacity) + except (TypeError, ValueError): + pass + if ptu_capacity is None or ptu_capacity <= 0: + continue - # Age from system_data - age_days: Optional[int] = None + # spec 8.12 / 9.2: created_at from systemData.createdAt + sys_data = getattr(deployment, "system_data", None) or getattr( + deployment, "systemData", None + ) created_at = None - if deployment.system_data: - created_at = getattr(deployment.system_data, "created_at", None) - if created_at is not None: - if created_at.tzinfo is None: - created_at = created_at.replace(tzinfo=timezone.utc) - age_days = max((now - created_at).days, 0) - if age_days < max(idle_days // 2, 3): - 
continue # too new to classify - - effective_window = ( - min(idle_days, age_days) if age_days is not None else idle_days + if sys_data is not None: + raw_created = getattr(sys_data, "created_at", None) + if raw_created is None: + raw_created = getattr(sys_data, "createdAt", None) + created_at = _parse_utc_timestamp(raw_created) + + if created_at is None: + continue # spec 8.12: required + if created_at > now: + continue # spec 9.2.3: future created_at -> skip + age_days = (now - created_at).days + if age_days < effective_idle_days: + continue # spec 8.12 / 9.2.4: age gate + + # spec 8.13: traffic metric must resolve to ZERO per spec 9.3 + ( + metric_result, + coverage_ratio, + expected_bucket_count, + observed_bucket_count, + metric_window_start_utc, + metric_end_utc, + ) = _query_openai_requests( + mon_client, account_id, dep_name, effective_idle_days ) - if effective_window < 3: - continue + if metric_result != "ZERO": + continue # ACTIVE or UNKNOWN_* -> skip - # Check request activity via Azure Monitor - idle_signal = _check_requests( - mon_client, - account.id, - deployment.name, - effective_window, - ) - # idle_signal: - # ("per_deployment", metric_name) — zero confirmed at deployment level - # ("active", None) — has traffic, skip - # ("no_data", None) — metrics returned no timeseries (unsupported) - # None — all metric calls failed (transient), skip - - if idle_signal is None or idle_signal[0] == "active": - continue # transient error or definitively active — skip - - signal_scope, idle_metric = idle_signal - - if signal_scope == "no_data": - # Age-only fallback: metrics unsupported, but deployment is very old - if age_days is not None and age_days >= idle_days * 2: - signal_scope = "age_only" - idle_metric = "none" - confidence = ConfidenceLevel.MEDIUM - else: - continue # not enough signal - - elif ( - signal_scope == "per_deployment" - and age_days is not None - and age_days >= idle_days - ): + # spec 9.4: confidence from metric coverage + if 
coverage_ratio >= _COVERAGE_HIGH: confidence = ConfidenceLevel.HIGH - elif ( - signal_scope == "per_deployment" - and age_days is not None - and age_days >= math.ceil(idle_days * 0.75) - ): - # 75–100% of idle_days: metric confirms zero requests but the - # deployment hasn't fully cleared the observation window yet. - # Surface as MEDIUM rather than skipping — early waste is still - # waste, but we avoid HIGH until the full window is satisfied. - # ceil ensures "75%" is never rounded down (e.g. idle_days=7 - # gives ceil(5.25)=6, so age=5 is correctly excluded). - confidence = ConfidenceLevel.MEDIUM - elif signal_scope == "per_deployment" and age_days is None: - confidence = ConfidenceLevel.MEDIUM else: - # age_days < ceil(75% of idle_days): too early to be confident. - # Prefer false negatives over false positives here. - continue + confidence = ConfidenceLevel.MEDIUM - monthly_cost = ptu_capacity * _PTU_MONTHLY_COST_USD if ptu_capacity else None + # spec 9.4: risk always HIGH + risk = RiskLevel.HIGH - # Risk scales with PTU cost — PTUs are always significant - if monthly_cost and monthly_cost >= 10_000: - risk = RiskLevel.HIGH - elif monthly_cost and monthly_cost >= 2_000: - risk = RiskLevel.MEDIUM - else: - risk = RiskLevel.MEDIUM # even small PTU allocations are expensive + # spec 9.3.13: idle_since_days = effective idle window + idle_since_days = effective_idle_days - signals = [ + # Tags: deployment tags when present; otherwise {} (spec 7) + dep_tags = getattr(deployment, "tags", None) + tags = dep_tags if isinstance(dep_tags, dict) else {} + + # spec 11.2: signals_used + signals_used = [ + f"Deployment model_format is 'OpenAI' with provisioned SKU '{sku_name}'", + "Account and deployment provisioning states are 'Succeeded'", ( - f"No Azure Monitor metric data available; deployment age ({age_days} days) " - f"exceeds {idle_days * 2} days" - if signal_scope == "age_only" - else f"Zero API requests for {effective_window} days " - f"(Azure Monitor: 
{idle_metric}, scope: {signal_scope.replace('_', ' ')})" + f"Deployment age is {age_days} days " + f"(>= configured idle window of {effective_idle_days} days)" ), - f"Provisioned SKU: {sku_name} — bills continuously regardless of usage", - f"PTU capacity: {ptu_capacity} PTU(s)", - ] - if model_name: - signals.append(f"Model: {model_name}") - if age_days is not None: - signals.append(f"Deployment age: {age_days} days") - if monthly_cost: - signals.append( - f"Estimated cost: ~${monthly_cost:,.0f}/month on-demand " - f"({ptu_capacity} PTU × $1,460/PTU — reserved pricing is lower)" - ) - - evidence = Evidence( - signals_used=signals, - signals_not_checked=[ - "Deployments used as failover/backup capacity only", - "Scheduled batch processing with infrequent job submissions", - "PTU reservation commitment — deleting may forfeit reserved capacity", - "Internal tooling with very low but non-zero request rates", - ], - time_window=f"{effective_window} days", - ) - - _confidence_reasons = { - "age_only": "no_metric_data_deployment_age_only", - "per_deployment": ( - "per_deployment_metric_zero_age_confirmed" - if age_days is not None and age_days >= idle_days - else ( - "per_deployment_metric_zero_age_partial" - if age_days is not None - else "per_deployment_metric_zero_age_unknown" - ) + f"Deployment retains {ptu_capacity} PTU(s) of provisioned capacity — billed hourly while the deployment exists", + ( + f"{_METRIC_NAME} metric result is ZERO with " + f">={_COVERAGE_ACCEPTABLE:.2%} coverage " + f"across a {effective_idle_days}-day rolling UTC window " + f"({observed_bucket_count}/{expected_bucket_count} minute buckets, " + f"coverage: {coverage_ratio:.2%}, aggregation: {_METRIC_AGGREGATION})" ), - } + ] details = { - "account_name": account.name, - "deployment_name": deployment.name, + "account_name": account_name, + "resource_group": rg, + "subscription_id": subscription_id, + "account_location": location_norm, + "account_kind": account_kind, + "deployment_name": dep_name, 
+ "deployment_provisioning_state": dep_prov_state, "sku_name": sku_name, "ptu_capacity": ptu_capacity, - "location": location_raw, - "idle_days_threshold": idle_days, - "idle_signal_scope": signal_scope, - "confidence_reason": _confidence_reasons.get(signal_scope, signal_scope), + "model_format": model_format, + "model_name": model_name, + "model_version": model_version, + "created_at": created_at.isoformat(), + "age_days": age_days, + "idle_days_requested": idle_days, + "idle_days_threshold": effective_idle_days, + "idle_since_days": idle_since_days, + "metric_name": _METRIC_NAME, + "metric_aggregation": _METRIC_AGGREGATION, + "metric_result_reason": metric_result, + "metric_coverage_ratio": coverage_ratio, + "metric_expected_bucket_count": expected_bucket_count, + "metric_observed_bucket_count": observed_bucket_count, + # Window timestamps are always non-None for ZERO results; guarded + # defensively so future code changes can't produce a silent AttributeError. + "metric_window_start_utc": ( + metric_window_start_utc.isoformat() + if metric_window_start_utc is not None + else None + ), + "metric_end_utc": ( + metric_end_utc.isoformat() if metric_end_utc is not None else None + ), + "tags": tags, } - if model_name: - details["model"] = model_name - if age_days is not None: - details["age_days"] = age_days - if account.tags: - details["tags"] = account.tags - - if signal_scope == "age_only": - title = f"Possibly Idle Azure OpenAI Provisioned Deployment ({ptu_capacity} PTU, No Metric Data for {age_days}+ Days)" - summary = ( - f"Azure OpenAI provisioned deployment '{deployment.name}' " - f"({sku_name}, {ptu_capacity} PTU) in account '{account.name}' " - f"has been running for {age_days} days with no Azure Monitor metric data " - f"available to confirm activity. PTU charges continue regardless." 
- ) - reason = ( - f"No Azure Monitor data available after {age_days} days " - f"({ptu_capacity} PTU billed continuously — activity unconfirmed)" - ) - else: - title = f"Idle Azure OpenAI Provisioned Deployment ({ptu_capacity} PTU, No Requests for {effective_window}+ Days)" - summary = ( - f"Azure OpenAI provisioned deployment '{deployment.name}' " - f"({sku_name}, {ptu_capacity} PTU) in account '{account.name}' " - f"has received zero API requests for {effective_window}+ days " - f"but continues to accrue PTU charges." - ) - reason = ( - f"Provisioned deployment has zero API requests for {effective_window}+ days " - f"({ptu_capacity} PTU billed continuously)" - ) + + title = f"Idle Azure OpenAI Provisioned Deployment: {dep_name}" + summary = ( + f"Azure OpenAI provisioned deployment '{dep_name}' " + f"({sku_name}, {ptu_capacity} PTU) in account '{account_name}' " + f"has received zero API requests for {effective_idle_days} days " + f"while retaining positive PTU capacity, continuing to incur hourly PTU charges." 
+ ) + reason = ( + f"AzureOpenAIRequests is zero across a {effective_idle_days}-day rolling " + f"window while deployment retains {ptu_capacity} PTU(s) of provisioned capacity" + ) findings.append( Finding( provider="azure", - rule_id="azure.openai.provisioned_deployment.idle", - resource_type="azure.openai.provisioned_deployment", - resource_id=deployment.id, - region=location_raw, + rule_id=_RULE_ID, + resource_type=_RESOURCE_TYPE, + resource_id=dep_id, + region=location_norm, title=title, summary=summary, reason=reason, risk=risk, confidence=confidence, detected_at=now, - evidence=evidence, + estimated_monthly_cost_usd=None, # spec 10: always None + evidence=Evidence( + signals_used=signals_used, + signals_not_checked=[ + "Business-owner intent or planned future traffic", + "Spillover/failover policy intent beyond observed request activity", + "Reservation coverage, reservation cancellation implications, or other commercial commitments", + "Client-side retries or other application semantics not visible from management and Azure Monitor surfaces", + ], + time_window=f"{effective_idle_days} days", + ), details=details, - estimated_monthly_cost_usd=monthly_cost, ) ) - except PermissionError: - raise # propagate from _check_requests (e.g. 
missing metrics/read) - except Exception as acct_err: - msg = str(acct_err) - if "AuthorizationFailed" in msg or "Forbidden" in msg or "403" in msg: - raise PermissionError( - "Missing required permissions: " - "Microsoft.CognitiveServices/accounts/read, " - "Microsoft.CognitiveServices/accounts/deployments/read, " - "Microsoft.Insights/metrics/read" - ) from acct_err - continue # skip this account on transient error + except PermissionError: + raise + except Exception: + continue # spec 12: malformed or failed per-deployment record -> skip - except PermissionError: - raise - except Exception as e: - msg = str(e) - if "AuthorizationFailed" in msg or "Forbidden" in msg or "403" in msg: - raise PermissionError( - "Missing required permissions: " - "Microsoft.CognitiveServices/accounts/read, " - "Microsoft.CognitiveServices/accounts/deployments/read" - ) from e - raise + except PermissionError: + raise + except Exception: + continue # spec 12: per-account deployment listing failure -> skip account return findings - - -def _check_requests( - monitor_client: Any, - account_id: str, - deployment_name: str, - days: int, -) -> Optional[tuple]: - """Check whether a deployment had any API requests in the past `days` days. - - Returns: - ("per_deployment", metric_name) — deployment-level zero confirmed - ("active", None) — deployment or account has traffic (do not flag) - ("no_data", None) — metrics API responded but returned no timeseries - for any metric. Can mean: metric unsupported in - this region, dimension filter unsupported, or - ingestion lag. Age-only fallback may apply in - the caller for deployments >= 2× idle_days. - None — all metric calls failed with transient errors; - conservative skip (do not flag) - - Account-level aggregation is intentionally NOT used as an idle signal. 
A zero - account-level total only confirms all deployments on the account emitting that - metric are idle — deployments that do not emit the metric are invisible to it, - making it an unsafe basis for a finding. - - Auth errors (403/AuthorizationFailed) are re-raised as PermissionError so the - caller can surface them as a skipped-rule signal rather than silent no-findings. - """ - now = datetime.now(timezone.utc) - start = now - timedelta(days=max(days, 1)) - fmt = "%Y-%m-%dT%H:%M:%SZ" - timespan = f"{start.strftime(fmt)}/{now.strftime(fmt)}" - - had_successful_call = False # tracks whether any metric API call returned without exception - - for metric_name in _REQUEST_METRICS: - try: - # 1. Per-deployment query via ModelDeploymentName dimension filter - response = monitor_client.metrics.list( - account_id, - metricnames=metric_name, - timespan=timespan, - interval="P1D", - aggregation="Total", - filter=f"ModelDeploymentName eq '{deployment_name}'", - ) - had_successful_call = True - has_timeseries = False - seen_datapoints = 0 - for metric in response.value: - for ts in metric.timeseries: - has_timeseries = True - for point in ts.data: - if point.total is not None: - seen_datapoints += 1 - if point.total > 0: - return ("active", None) # deployment has traffic - # Require at least one explicit data point: an all-None timeseries means - # Azure returned the metric structure but ingested no measurements - # (ingestion gap or very new deployment). Treat as no_data rather than - # idle to avoid false positives from empty metric shells. - if has_timeseries and seen_datapoints > 0: - return ( - "per_deployment", - metric_name, - ) # timeseries with explicit zeros confirmed - - # No per-deployment timeseries — dimension filter unsupported for this - # deployment. Do NOT fall back to account-level aggregation: a zero - # account total only covers deployments that emit this metric; deployments - # that do not emit it are invisible and would be falsely flagged as idle. 
- # Fall through to try the next metric instead. - - except Exception as e: - msg = str(e) - if "AuthorizationFailed" in msg or "Forbidden" in msg or "403" in msg: - raise PermissionError( - "Missing required permissions: Microsoft.Insights/metrics/read" - ) from e - continue # transient error — try next metric - - # All metrics tried: distinguish "API responded with no data" from "all calls failed" - return ("no_data", None) if had_successful_call else None - - -def _parse_resource_group(resource_id: str) -> Optional[str]: - """Extract resource group name from an Azure resource ID.""" - if not resource_id: - return None - parts = resource_id.split("/") - try: - rg_index = next(i for i, p in enumerate(parts) if p.lower() == "resourcegroups") - return parts[rg_index + 1] - except (StopIteration, IndexError): - return None diff --git a/docs/rules/azure.md b/docs/rules/azure.md index 08b7bb1..912e6f3 100644 --- a/docs/rules/azure.md +++ b/docs/rules/azure.md @@ -18,11 +18,11 @@ | `azure.sql.database.idle` | Platform | Dedicated single databases with zero activity across all five required metrics over idle window | | `azure.container_registry.unused` | Platform | Container registries with zero pulls and pushes 90+ days | | `azure.resource.untagged` | Governance | Disks and snapshots with zero tags | -| `azure.aml.compute.idle` | AI/ML | AML compute clusters with min_node_count > 0 and no active nodes 14+ days | -| `azure.ml.compute_instance.idle` | AI/ML | Azure ML Compute Instances Running with no activity 14+ days | -| `azure.ml.online_endpoint.idle` | AI/ML | Azure ML managed online endpoints with zero scoring requests 7+ days | -| `azure.ai_search.idle` | AI/ML | Azure AI Search services (Standard+) with zero queries 30+ days | -| `azure.openai.provisioned_deployment.idle` | AI/ML | Azure OpenAI provisioned deployments (PTUs) with zero requests 7+ days | +| `azure.aml.compute.idle` | AI/ML | AML compute clusters with `min_node_count > 0`, confirmed current node 
allocation, and zero per-cluster `Active Nodes` activity 14+ days | +| `azure.ml.compute_instance.idle` | AI/ML | Azure ML Compute Instances in `Running` state with no documented control-plane lifecycle activity for `idle_days` (default 14); uses `lastOperation.operationTime` or `modifiedOn` fallback only — no age-only or undocumented fallbacks | +| `azure.ml.online_endpoint.idle` | AI/ML | Azure ML managed online endpoints retaining positive deployment baseline instances with `RequestsPerMinute == 0` over a rolling `idle_days` window; managed scope required from documented endpoint/deployment surfaces | +| `azure.ai_search.idle` | AI/ML | Azure AI Search services (Basic+) structurally empty with zero query, indexing, and skill activity 90+ days | +| `azure.openai.provisioned_deployment.idle` | AI/ML | Azure OpenAI provisioned deployments (`model_format == "OpenAI"`, provisioned SKU) retaining positive PTU capacity with zero `AzureOpenAIRequests` across a rolling `idle_days` window; `model_format` gate is case-sensitive and based on deployment properties only | --- @@ -205,66 +205,72 @@ ## AI/ML *(opt-in: `--category ai`)* #### `azure.aml.compute.idle` -**Detects:** AML compute clusters with `min_node_count > 0` and zero active nodes for 14+ days +**Detects:** AML compute clusters (`computeType == "AmlCompute"`) with `min_node_count > 0` retaining confirmed baseline node allocation and no observed per-cluster `Active Nodes` activity for 14 days; requires BOTH confirmed positive baseline capacity AND confirmed zero per-cluster activity metric before emitting -**Confidence / Risk:** HIGH (zero nodes, cluster age ≥ 14 days); MEDIUM (zero nodes, age 7–13 days or creation time unavailable) / HIGH (GPU VM sizes: Standard_NC*, Standard_ND*, Standard_NV*); MEDIUM (CPU) +**Confidence / Risk:** HIGH (always, when all required signals resolve) / MEDIUM (always) **Permissions:** `Microsoft.MachineLearningServices/workspaces/read`, 
`Microsoft.MachineLearningServices/workspaces/computes/read`, `Microsoft.Insights/metrics/read` -**Params:** none (14-day threshold is fixed) +**Params:** none (14-day window is fixed) -**Exclusions:** clusters with `min_node_count == 0` (scale-to-zero; no idle cost) +**Exclusions:** `id` or `name` absent/empty; workspace `name` absent/empty; outside optional region filter (exact lowercase match on **compute** resource location; spaces and hyphens preserved); `compute_type` does not resolve to exactly `"AmlCompute"` (SDK+nested, conflict → skip); `provisioning_state` does not resolve to exactly `"Succeeded"` (SDK+nested, conflict → skip); `allocation_state` does not resolve to exactly `"Steady"` (SDK+nested, conflict → skip); `created_at` absent, invalid, in the future, or cluster age < 14 days (no age-only fallback); `min_node_count <= 0` or unresolvable; `current_node_count` negative, unresolvable, or < `min_node_count`; `Active Nodes` metric with `ClusterName` dimension filter cannot be resolved reliably (< 95% daily-bucket coverage, unusable response shape, no per-cluster series); `Active Nodes` metric is non-zero over the 14-day window; per-compute retrieval error (skip that compute); per-workspace compute listing error (skip that workspace) -**Spec:** — +**Spec:** [specs/azure/ai/aml_compute_idle.md](../specs/azure/ai/aml_compute_idle.md) #### `azure.ml.compute_instance.idle` -**Detects:** Azure ML Compute Instances in `Running` state with no control-plane activity for `idle_days` +**Detects:** Azure ML Compute Instances (`computeType == "ComputeInstance"`) in `Running` state with `provisioning_state == "Succeeded"` and no documented control-plane lifecycle activity for `idle_days`; precision-first review-candidate rule — does not claim to observe notebook/kernel/session inactivity -**Confidence / Risk:** HIGH (`last_operation.operation_time` or `last_modified_at` ≥ threshold, age ≥ threshold); MEDIUM (≥ 75% of threshold on both signals, or age-only fallback) 
/ CRITICAL (GPU + `idle_ratio ≥ 2.0`); HIGH (GPU: Standard_NC*, Standard_ND*, Standard_NV*); MEDIUM (CPU) +**Confidence / Risk:** MEDIUM (`lastOperation.operationTime` is the idle signal source); LOW (`modifiedOn` fallback is the idle signal source) / HIGH (GPU: exact case-sensitive prefix match on `Standard_NC`, `Standard_ND`, `Standard_NV`); MEDIUM (all other VM families including null/absent `vm_size`) + +**Cost:** `estimated_monthly_cost_usd = None` always — no hardcoded price tables; rule notes only that a Running instance incurs ongoing compute-hour charges **Permissions:** `Microsoft.MachineLearningServices/workspaces/read`, `Microsoft.MachineLearningServices/workspaces/computes/read` -**Params:** `idle_days` (default: 14) +**Params:** `idle_days` (default: 14, minimum effective value: 1) -**Exclusions:** stopped instances (only `Running` state evaluated) +**Exclusions:** `id` or `name` absent/empty; workspace `name` absent/empty; outside optional region filter (exact lowercase match on **compute** resource location; spaces and hyphens preserved); `compute_type` does not resolve to exactly `"ComputeInstance"` (SDK+nested, conflict → skip); `provisioning_state` does not resolve to exactly `"Succeeded"` (SDK+nested, conflict → skip); `state` does not resolve to exactly `"Running"` (SDK+nested, conflict → skip); location unresolvable or conflicting; `created_at` absent, invalid, or in the future; instance age < `idle_days`; `lastOperation.operationTime` present but unparsable (skip — no silent fallback); `lastOperation.operationTime == created_at` (no proven post-create signal → skip); `modifiedOn` fallback only when `lastOperation` absent or has no `operationTime` — skipped when `modifiedOn` absent, unparsable, `<= created_at`, or in the future; no lifecycle signal resolvable (fail closed — no age-only fallback, no `systemData.lastModifiedAt`); resolved lifecycle timestamp in the future; floored `idle_since_days` < `idle_days`; per-compute record malformed 
(skip that compute); per-workspace compute listing fails (skip that workspace) -**Spec:** — +**Spec:** [specs/azure/ai/aml_compute_instance_idle.md](../specs/azure/ai/aml_compute_instance_idle.md) #### `azure.ml.online_endpoint.idle` -**Detects:** Azure ML managed online endpoints in `Succeeded` provisioning state with zero scoring requests for `idle_days` +**Detects:** Azure ML managed online endpoints with `provisioning_state == "Succeeded"`, at least one stable deployment retaining a known positive baseline instance count, and `RequestsPerMinute == 0` (Average, PT1M) across a rolling UTC window on the **endpoint ARM resource id**; precision-first review-candidate rule — does not claim exact endpoint cost and emits only when all required signals resolve -**Confidence / Risk:** HIGH (per-endpoint `RequestCount` metric confirms zero + age ≥ `idle_days`); MEDIUM (zero confirmed but age < `idle_days`, or metric unavailable + age ≥ 2× `idle_days`) / CRITICAL (GPU + `idle_ratio ≥ 2.0`); HIGH (GPU/accelerator); MEDIUM (CPU) +**Confidence / Risk:** HIGH (`RequestsPerMinute` metric coverage ≥ 95% for a ZERO result); MEDIUM (metric coverage 80–95%) / HIGH (any billing-relevant deployment is GPU — uppercase prefix match on `STANDARD_NC`, `STANDARD_ND`, `STANDARD_NV`); MEDIUM (all other instance families including null/absent) + +**Cost:** `estimated_monthly_cost_usd = None` always — no hardcoded VM price tables; rule notes only that deployments retaining positive baseline instances incur ongoing compute cost **Permissions:** `Microsoft.MachineLearningServices/workspaces/read`, `Microsoft.MachineLearningServices/workspaces/onlineEndpoints/read`, `Microsoft.MachineLearningServices/workspaces/onlineEndpoints/deployments/read`, `Microsoft.Insights/metrics/read` -**Params:** `idle_days` (default: 7) +**Params:** `idle_days` (default: 7, minimum effective value: 1) -**Exclusions:** `provisioning_state != "Succeeded"`; batch endpoints +**Exclusions:** `endpoint.id` or 
`endpoint.name` absent/empty; workspace `name` absent/empty; outside optional region filter (exact lowercase match on **endpoint** resource location; spaces and hyphens preserved); managed scope not established from documented endpoint/deployment surfaces — Kubernetes endpoints (class name or `kind == "Kubernetes"`) always out of scope; `provisioning_state` does not exactly equal `"Succeeded"` (case-sensitive); `created_at` absent from `systemData.createdAt`, unparsable, in the future, or endpoint age < `idle_days`; deployment inventory listing fails (skip endpoint); no stable deployment (`deployment_provisioning_state == "Succeeded"`) resolves to a known positive baseline instance count (`scale_settings.min_instances` → `instance_count`, known integer > 0); `RequestsPerMinute` metric unavailable, coverage below 80%, or result not ZERO; per-endpoint failure (skip that endpoint); per-workspace failure (skip that workspace) -**Spec:** — +**Spec:** [specs/azure/ai/ml_online_endpoint_idle.md](../specs/azure/ai/ml_online_endpoint_idle.md) #### `azure.ai_search.idle` -**Detects:** Azure AI Search services (Standard tier and above) with zero `SearchQueriesPerSecond` for `idle_days` +**Detects:** Azure AI Search services (Basic tier and above) that are structurally empty and have no documented query, indexing, or skill activity over a fixed 90-day window; requires BOTH confirmed zero activity across all three required metrics AND confirmed emptiness of all required object surfaces before emitting -**Confidence / Risk:** HIGH (zero queries confirmed + age ≥ `idle_days`); MEDIUM (zero confirmed but age < `idle_days`, or metric unavailable + age ≥ 2× `idle_days`) / HIGH (estimated cost ≥ $1,000/month); MEDIUM (otherwise) +**Confidence / Risk:** HIGH (always, when all required signals resolve) / MEDIUM (always) -**Permissions:** `Microsoft.Search/searchServices/read`, `Microsoft.Insights/metrics/read` +**Permissions:** `Microsoft.Search/searchServices/read`, 
`Microsoft.Insights/metrics/read`, Azure AI Search data-plane RBAC (`Search Service Contributor` or equivalent; no admin keys) -**Params:** `idle_days` (default: 30) +**Params:** none (90-day window is fixed) -**Exclusions:** Basic tier and below; only `standard`, `standard2`, `standard3`, `storage_optimized_l1`, `storage_optimized_l2` evaluated +**Exclusions:** `id` or `name` absent/empty; outside optional region filter (exact lowercase match; spaces and hyphens preserved); `provisioning_state` does not resolve to exactly `"succeeded"` (SDK+nested, conflict → skip); `status` does not resolve to exactly `"running"` (SDK+nested, conflict → skip); `sku.name` not in supported dedicated billable tiers (`basic`, `standard`, `standard2`, `standard3`, `storage_optimized_l1`, `storage_optimized_l2`) after lowercase normalization and camelCase alias resolution; `systemData.createdAt` absent, invalid, in the future, or service age < 90 days (no age-only fallback); `replica_count` or `partition_count` not a known positive integer (conflict → skip); data-plane client factory returns `None` (azure-search-documents package unavailable → skip); any required object surface (`indexes`, `indexers`, `data_sources`, `skillsets`, `synonym_maps`) fails, is unavailable, or is non-empty; any optional reinforcing surface (`aliases`, `knowledge_sources`, `agents`) that is fully enumerated and non-empty; any of the three required activity metrics (`SearchQueriesPerSecond`/Average, `DocumentsProcessedCount`/Total, `SkillExecutionCount`/Total) below 95% daily-bucket coverage or non-zero over 90 days; non-numeric aggregation values or malformed metric response shapes (fail-closed to UNKNOWN → skip); per-service retrieval raises `HttpResponseError`, `ServiceRequestError`, or `ServiceResponseError` -**Spec:** — +**Spec:** [specs/azure/ai/ai_search_idle.md](../specs/azure/ai/ai_search_idle.md) #### `azure.openai.provisioned_deployment.idle` -**Detects:** Azure OpenAI provisioned deployments (PTUs) with zero
API requests for `idle_days`; bills per PTU per hour regardless of traffic +**Detects:** Azure OpenAI provisioned deployments (`model_format == "OpenAI"`, provisioned SKU) retaining positive PTU capacity with `AzureOpenAIRequests == 0` (Total, PT1M) across a rolling UTC window on the **parent account ARM resource id**; precision-first review-candidate rule — does not claim exact savings and emits only when all required signals resolve + +**Confidence / Risk:** HIGH (`AzureOpenAIRequests` metric coverage ≥ 95% for a ZERO result); MEDIUM (metric coverage ≥ 80% and < 95%) / HIGH (always — every provisioned deployment with positive PTU capacity is inherently a cost candidate) -**Confidence / Risk:** HIGH (per-deployment `AzureOpenAIRequests` metric confirms zero + age ≥ `idle_days`); MEDIUM (per-deployment zero but age < `idle_days`, or account-level zero only) / HIGH (≥ 7 PTUs, ~$10K+/month); MEDIUM (< 7 PTUs) +**Cost:** `estimated_monthly_cost_usd = None` always — no hardcoded PTU price constant; rule notes only that deployed PTUs incur hourly billing while the deployment exists **Permissions:** `Microsoft.CognitiveServices/accounts/read`, `Microsoft.CognitiveServices/accounts/deployments/read`, `Microsoft.Insights/metrics/read` -**Params:** `idle_days` (default: 7) +**Params:** `idle_days` (default: 7, minimum effective value: 1) -**Exclusions:** non-provisioned SKUs; only `ProvisionedManaged`, `GlobalProvisionedManaged`, `DataZoneProvisionedManaged` evaluated +**Exclusions:** `account.id` or `account.name` absent/empty; `deployment.id` or `deployment.name` absent/empty; account location unresolved (spaces and hyphens preserved in normalized form); outside optional region filter (exact lowercase match); `account_provisioning_state` does not exactly equal `"Succeeded"` (case-sensitive); `deployment_provisioning_state` does not exactly equal `"Succeeded"` (case-sensitive); `model_format` does not exactly equal `"OpenAI"` (case-sensitive; account kind is not used to establish
OpenAI scope); `sku_name` not in `{ProvisionedManaged, GlobalProvisionedManaged, DataZoneProvisionedManaged}`; `ptu_capacity` absent, invalid, zero, or negative; `created_at` absent, unparsable, in the future, or deployment age < `idle_days`; `AzureOpenAIRequests` metric unavailable, coverage below 80%, or result not ZERO; no age-only, token-only, utilization-only, or `ProcessedPromptTokens` fallback; per-deployment failure (skip that deployment); per-account deployment listing failure (skip that account) -**Spec:** — +**Spec:** [specs/azure/ai/openai_provisioned_idle.md](../specs/azure/ai/openai_provisioned_idle.md) diff --git a/docs/specs/azure/ai/ai_search_idle.md b/docs/specs/azure/ai/ai_search_idle.md new file mode 100644 index 0000000..599946e --- /dev/null +++ b/docs/specs/azure/ai/ai_search_idle.md @@ -0,0 +1,485 @@ +# Azure Rule Spec — `azure.ai_search.idle` + +## 1. Rule Identity + +- **Rule ID:** `azure.ai_search.idle` +- **Provider:** Azure +- **ARM resource type:** `Microsoft.Search/searchServices` +- **Finding resource_type:** `azure.ai.search_service` + +--- + +## 2. Intent + +Detect **dedicated Azure AI Search services that appear structurally empty and operationally inactive** over a long observation window, making them conservative review candidates for deletion or rightsizing. + +This rule is deliberately **precision-first**. It is **not** a generic "zero queries" rule. It is a review-candidate rule only, not proof that a service is safe to delete, not proof that no future rollout depends on it, and not proof of a specific monthly saving. + +--- + +## 3. Azure Documentation Grounding + +### 3.1 Azure AI Search has ongoing service cost while the service exists + +Microsoft documents that Azure AI Search has: + +1. a fixed base service cost driven by partitions and replicas +2. optional premium-feature usage charges +3. 
no way to temporarily stop billing short of deleting the service + +Sources: + +- *Choose a pricing tier for Azure AI Search* +- *Plan and manage costs for Azure AI Search* + +URLs: + +- https://learn.microsoft.com/en-us/azure/search/search-sku-tier +- https://learn.microsoft.com/en-us/azure/search/search-sku-manage-costs + +Rule consequence: + +1. An unused dedicated service is billing-relevant even if premium features are unused. +2. Flat monthly cost tables must not be hardcoded because pricing varies by region, tier, capacity, and optional premium features. +3. `estimated_monthly_cost_usd` should remain `None`. + +### 3.2 Azure AI Search has both querying and indexing workloads + +Microsoft documents that Azure AI Search supports: + +- querying workloads +- indexing workloads +- push ingestion directly into an index +- pull ingestion through indexers +- AI enrichment during indexing +- agentic retrieval / knowledge-source workflows + +Sources: + +- *What is Azure AI Search?* +- *Import data into a search index* +- *Indexers in Azure AI Search* + +URLs: + +- https://learn.microsoft.com/en-us/azure/search/search-what-is-azure-search +- https://learn.microsoft.com/en-us/azure/search/search-how-to-load-search-index +- https://learn.microsoft.com/en-us/azure/search/search-indexer-overview + +Rule consequence: + +1. Zero query traffic alone is **not** sufficient evidence of overall service idleness. +2. Zero indexer activity alone is **not** sufficient evidence of overall service idleness. +3. A conservative rule should require both **zero documented activity metrics** and **zero configured search-service objects** before emitting. 
+ +### 3.3 Azure Monitor metrics for Azure AI Search + +Microsoft documents the following platform metrics for `Microsoft.Search/searchServices`: + +- `SearchQueriesPerSecond` +- `DocumentsProcessedCount` +- `SkillExecutionCount` +- `SearchLatency` +- `ThrottledSearchQueriesPercentage` + +Source: *Supported metrics for Microsoft.Search/searchServices* +URL: https://learn.microsoft.com/en-us/azure/azure-monitor/reference/supported-metrics/microsoft-search-searchservices-metrics + +Rule consequence: + +1. `SearchQueriesPerSecond` is the documented query-activity signal. +2. `DocumentsProcessedCount` and `SkillExecutionCount` are documented indexing / enrichment signals. +3. If any required activity metric cannot be resolved reliably, the service must be skipped. + +### 3.4 Search service control-plane state + +Microsoft documents search service fields including: + +- `properties.provisioningState` +- `properties.status` +- `properties.replicaCount` +- `properties.partitionCount` +- `properties.hostingMode` +- `systemData.createdAt` +- `sku.name` + +Sources: + +- *Services - Get (Search Management REST API)* +- *azure.mgmt.search.models.SearchService* + +URLs: + +- https://learn.microsoft.com/en-us/rest/api/searchmanagement/services/get?view=rest-searchmanagement-2025-05-01 +- https://learn.microsoft.com/en-us/python/api/azure-mgmt-search/azure.mgmt.search.models.searchservice?view=azure-python + +Microsoft further documents that degraded, disabled, and error states can still be chargeable for dedicated services. + +Rule consequence: + +1. The rule should evaluate only **stable** services: exact `provisioningState == "succeeded"` and exact `status == "running"`. +2. The rule should skip transitional or impaired service states rather than infer idleness from them. 
+ +### 3.5 Azure AI Search object-management surfaces + +Microsoft documents RBAC-backed data-plane object-management access for Azure AI Search and explicitly states that Search Service Contributor can list and manage search objects, including: + +- indexes +- indexers +- data sources +- skillsets +- aliases +- synonym maps +- knowledge bases / knowledge sources + +Source: *Use role-based access control in Azure AI Search* +URL: https://learn.microsoft.com/en-us/azure/search/search-security-rbac + +Microsoft also documents list APIs for several object types, including: + +- `GET {endpoint}/indexes` +- `GET {endpoint}/indexers` +- `GET {endpoint}/datasources` +- `GET {endpoint}/skillsets` +- `GET {endpoint}/synonymmaps` +- preview list APIs for `aliases`, `knowledgesources`, and `agents` + +Sources: + +- *Indexes - List* +- *Indexers - List* +- *Data Sources - List* +- *Skillsets - List* +- *Synonym Maps - List* +- *Aliases - List* +- *Knowledge Sources - List* +- *Knowledge Agents - List* + +URLs: + +- https://learn.microsoft.com/en-us/rest/api/searchservice/indexes/list?view=rest-searchservice-2025-09-01 +- https://learn.microsoft.com/en-us/rest/api/searchservice/indexers/list?view=rest-searchservice-2025-09-01 +- https://learn.microsoft.com/en-us/rest/api/searchservice/data-sources/list?view=rest-searchservice-2025-09-01 +- https://learn.microsoft.com/en-us/rest/api/searchservice/skillsets/list?view=rest-searchservice-2025-09-01 +- https://learn.microsoft.com/en-us/rest/api/searchservice/synonym-maps/list?view=rest-searchservice-2025-09-01 +- https://learn.microsoft.com/en-us/rest/api/searchservice/aliases/list?view=rest-searchservice-2025-09-01 +- https://learn.microsoft.com/en-us/rest/api/searchservice/knowledge-sources/list?view=rest-searchservice-2025-09-01 +- https://learn.microsoft.com/en-us/rest/api/searchservice/knowledge-agents/list?view=rest-searchservice-2025-09-01 + +Rule consequence: + +1. 
A service with configured search objects is not safely classifiable as idle. +2. A conservative rule should emit only when **all required object surfaces are empty**. +3. If object enumeration fails or is unavailable for a required surface, the service must be skipped. + +### 3.6 Sensitive object definitions must not leak into findings + +Microsoft's documented object-management responses can include sensitive configuration material such as: + +- datasource credentials / connection strings +- encryption key references +- model connection details + +Rule consequence: + +Object enumeration may be used only to establish **presence / count / safe names**, never to persist, emit, or log sensitive configuration fields. + +--- + +## 4. Detection Goal + +Emit a finding only when **all** of the following are true: + +1. `service.id` is present and non-empty +2. `service.name` is present and non-empty +3. the optional region filter is unset, or it matches the normalized location +4. `provisioning_state` resolves to exactly `"succeeded"` +5. `status` resolves to exactly `"running"` +6. `sku.name` resolves to a supported dedicated billable tier +7. `systemData.createdAt` is known and the service age is at least `90 days` +8. `replica_count` and `partition_count` are known positive integers +9. all required object-list surfaces resolve reliably +10. all required object-list surfaces are empty +11. all required activity metrics resolve reliably (see section 9.5) for the same `90-day` window +12. all required activity metrics are zero for that window + +If any required signal cannot be established reliably, skip rather than emit. + +--- + +## 5.
Non-Goals + +This rule does **not** attempt to prove: + +- that deleting the service is safe +- that no deployment, migration, or DR plan depends on the service +- that unused premium features are absent in every conceivable configuration surface +- that a specific monthly saving exists +- that future data-plane or preview feature surfaces will never expand + +--- + +## 6. Canonical Inputs + +### 6.1 Required surfaces + +| Surface | Purpose | +|---|---| +| Search Management `services.list_by_subscription()` / `services.get()` | stable service identity, region, SKU, capacity, status, provisioning, creation time | +| Azure Monitor metrics on the service ARM id | documented query / indexing / skill activity | +| Azure AI Search data-plane object list APIs | determine whether the service is structurally empty | + +### 6.2 Authentication / permissions + +Minimum permissions: + +- `Microsoft.Search/searchServices/read` +- `Microsoft.Insights/metrics/read` + +And **data-plane object-management read capability** using Azure AI Search RBAC / keyless auth for object enumeration, typically via **Search Service Contributor** or an equivalent custom role. + +The implementation must **not** retrieve admin keys merely to evaluate this rule. + +### 6.3 Fixed idle window + +- Configurable parameter: none +- Fixed evaluation window: `90 days` + +Reason: + +- Azure AI Search services can exist ahead of go-live +- indexing and retrieval workloads can be periodic +- a longer window materially reduces false findings for expensive dedicated services + +--- + +## 7. Normalization Contract + +| Field | Normalization | +|---|---| +| `location` | Lowercase ARM location string; compare by exact lowercase string equality only. Do not remove spaces, hyphens, or digits. | +| `provisioning_state` | Resolve from documented SDK/raw surfaces and compare case-sensitively to exact `"succeeded"`. | +| `status` | Resolve from documented SDK/raw surfaces and compare case-sensitively to exact `"running"`. 
| +| `sku_name` | Lowercase only. Supported dedicated tiers for this rule are exact: `basic`, `standard`, `standard2`, `standard3`, `storage_optimized_l1`, `storage_optimized_l2`. | +| `created_at` | Parse as UTC instant from `systemData.createdAt` or equivalent SDK projection. | +| `replica_count`, `partition_count` | Positive integers only. `<= 0`, invalid, or unresolvable values are not eligible. | +| `object_list_empty` | `True` only when the list call succeeds, all pages are exhausted, and the returned `value` collection is confirmed empty across the full paginated result set. | +| `tags` | `service.tags or {}` — never `None` in output. | + +--- + +## 8. Unified Decision Rule + +| # | Condition | Action | +|---|---|---| +| 8.1 | `id` absent, `None`, or empty | Skip | +| 8.2 | `name` absent, `None`, or empty | Skip | +| 8.3 | Region filter set and normalized location does not match | Skip | +| 8.4 | `provisioning_state` does not resolve to `"succeeded"` | Skip | +| 8.5 | `status` does not resolve to `"running"` | Skip | +| 8.6 | `sku_name` is not one of the supported dedicated billable tiers | Skip | +| 8.7 | `created_at` is absent, invalid, in the future, or less than `90 days` old | Skip | +| 8.8 | `replica_count <= 0` or `partition_count <= 0`, or either value is unresolvable | Skip | +| 8.9 | Any required object-list surface fails, is unavailable, or is unresolvable | Skip | +| 8.10 | Any required object-list surface is non-empty | Skip | +| 8.11 | Any required metric cannot be resolved reliably | Skip | +| 8.12 | Any required metric is non-zero over the `90-day` window | Skip | +| 8.13 | All required signals resolve, required object surfaces are empty, and required metrics are zero over `90 days` | **EMIT** | + +--- + +## 9. Canonical Evaluation Contracts + +### 9.1 Stable-state contract + +Resolve `provisioning_state` in this order: + +1. SDK projection such as `service.provisioning_state` +2. 
nested/raw management payload such as `properties.provisioningState` +3. otherwise unknown + +Resolve `status` in this order: + +1. SDK projection such as `service.status` +2. nested/raw management payload such as `properties.status` +3. otherwise unknown + +Required behavior: + +1. Only exact `"succeeded"` is eligible for `provisioning_state`. +2. Only exact `"running"` is eligible for `status`. +3. Unknown, conflicting, or any other values must skip. + +### 9.2 Supported billing-tier contract + +This rule is limited to documented dedicated billable tiers: + +- `basic` +- `standard` +- `standard2` +- `standard3` +- `storage_optimized_l1` +- `storage_optimized_l2` + +Required behavior: + +1. `free` must skip. +2. `serverless`, unknown, preview, or future SKU labels must skip. +3. The rule must not hardcode monthly price tables. + +### 9.3 Structural-emptiness contract + +Required zero-count object surfaces: + +1. indexes +2. indexers +3. data sources +4. skillsets +5. synonym maps + +Optional reinforcing zero-count object surfaces: + +1. aliases +2. knowledge sources +3. agents + +Required behavior: + +1. Each required surface must be enumerated explicitly. +2. A required surface is empty only when enumeration succeeds and pagination is fully exhausted across the complete result set. +3. Any non-empty required surface must skip. +4. Any failed, unauthorized, unsupported, or unresolvable required-surface enumeration must skip. +5. Optional reinforcing surfaces may be enumerated when supported, but must not be required for eligibility. +6. Optional reinforcing surfaces are ignored unless enumeration succeeds and pagination is fully exhausted across the complete result set. +7. If a fully enumerated optional reinforcing surface is non-empty, the service must skip. + +Rationale: + +This rule intentionally treats **configured search objects** as sufficient evidence that the service is not safely classifiable as idle. 
+ +### 9.4 Sensitive-response handling contract + +When enumerating data-plane objects: + +1. implementations may use only safe presence/count information +2. they must not persist, emit, or log connection strings, keys, credentials, model secrets, or equivalent sensitive payload fields +3. finding evidence may include object counts and safe object names only when they are non-sensitive + +### 9.5 Activity-metric contract + +Required metrics: + +1. `SearchQueriesPerSecond` with `Average` +2. `DocumentsProcessedCount` with `Total` +3. `SkillExecutionCount` with `Total` + +Definitions: + +- **usable datapoint**: a datapoint with a parseable UTC timestamp inside the requested window and a numeric aggregation value +- **source bucket**: the metric bucket returned by Azure Monitor for the requested query interval before any spec-level normalization +- **UTC day bucket**: the UTC day boundary derived from a datapoint timestamp by normalizing it to `00:00:00Z` for that day +- **expected buckets**: count of UTC-aligned daily buckets overlapping `[window_start, window_end)` +- **observed buckets**: count of unique UTC day buckets with at least one usable datapoint after consolidating duplicate timestamps across all returned series and dimension slices +- **coverage ratio**: `observed_buckets / expected_buckets` +- **acceptable coverage**: `coverage_ratio >= 0.95` +- **resolve reliably**: the metric query returns valid data for the requested window, meets the coverage threshold, and does not trigger any `UNKNOWN` condition +- **unusable response shape**: a metric response with missing `value`, malformed time series collections, unparsable timestamps, or non-numeric aggregation values + +Required behavior: + +1. Query all three required metrics for the same `90-day` window. +2. Evaluate activity on the returned source buckets before any UTC-day normalization. 
Implementations must not rely on coarse day-level pre-aggregated buckets as the sole activity test because short-lived activity can be diluted away. +3. Normalize datapoint timestamps to UTC day buckets only for coverage calculation and day-level consolidation. +4. For `SearchQueriesPerSecond`, use the returned `Average` value for each source bucket. Any positive source-bucket value makes its containing UTC day bucket positive. +5. Consolidate duplicate series for the same UTC day bucket before final evaluation. For `SearchQueriesPerSecond`, any positive contributing value keeps that UTC day bucket positive. For `DocumentsProcessedCount` and `SkillExecutionCount`, aggregate values across all returned dimension slices for the same UTC day bucket. +6. Treat any missing metric, failed query, unusable response shape, empty series, no datapoints, no valid series, or coverage below threshold as `UNKNOWN`. +7. Treat any metric with any positive consolidated bucket value as `ACTIVE`. +8. Treat a metric as `ZERO` only when it resolves reliably and all usable bucket values are exactly zero. +9. Emit only when **all three** required metrics evaluate to `ZERO`. + +Rationale: + +1. Query silence alone is not enough because indexing workloads can be valid. +2. Indexer / skill silence alone is not enough because search workloads can be valid. +3. This rule still does not claim to observe every possible undocumented activity surface, which is why it also requires structural emptiness. + +--- + +## 10. Cost Model + +`estimated_monthly_cost_usd = None` + +Mandatory rules: + +1. Do **not** use flat hardcoded price tables +2. Do **not** infer cost from SU count alone +3. Do **not** infer cost from metric silence alone +4. State only that dedicated Azure AI Search services incur ongoing service cost while they exist + +--- + +## 11. 
Finding Shape + +### 11.1 Required fields + +| Field | Value | +|---|---| +| `provider` | `"azure"` | +| `rule_id` | `"azure.ai_search.idle"` | +| `resource_type` | `"azure.ai.search_service"` | +| `resource_id` | original ARM id from `service.id` | +| `region` | normalized location | +| `confidence` | `HIGH` | +| `risk` | `MEDIUM` | +| `estimated_monthly_cost_usd` | `None` | + +### 11.2 Required evidence + +`signals_used` must clearly disclose: + +1. provisioning state is `"succeeded"` +2. service status is `"running"` +3. supported dedicated SKU was confirmed +4. service age is at least `90 days` +5. all required object surfaces were confirmed empty with full pagination exhaustion +6. all required activity metrics resolved to **no observed query/indexing/skill activity** with sufficient coverage + +`signals_not_checked` should include remaining blind spots such as: + +1. future go-live or migration intent +2. business-owner intent not visible in Azure control plane +3. premium-feature billing details not inferable from baseline management and metric surfaces + +### 11.3 Required details + +Details should include at least: + +- `service_name` +- `resource_group` +- `subscription_id` +- `sku_name` +- `replica_count` +- `partition_count` +- `hosting_mode` +- `status` +- `provisioning_state` +- `created_at` +- `idle_window_days` +- `object_counts` +- `metrics_used` +- `tags` + +`object_counts` contract: + +1. include counts for all required surfaces only after full pagination exhaustion +2. include counts for optional reinforcing surfaces only when they were successfully enumerated with full pagination exhaustion +3. omit unevaluated or partially evaluated surfaces rather than defaulting them to `0` + +--- + +## 12. 
Failure Behavior + +- If subscription-wide service inventory fails, let the exception propagate +- If per-service `get(...)`, data-plane object enumeration, or metric retrieval fails, skip that service +- If a service record is malformed or missing required fields, skip that service +- Do not emit on partial or unresolved object-surface state or metric state diff --git a/docs/specs/azure/ai/aml_compute_idle.md b/docs/specs/azure/ai/aml_compute_idle.md new file mode 100644 index 0000000..3fe085b --- /dev/null +++ b/docs/specs/azure/ai/aml_compute_idle.md @@ -0,0 +1,413 @@ +# Azure Rule Spec — `azure.aml.compute.idle` + +## 1. Rule Identity + +- **Rule ID:** `azure.aml.compute.idle` +- **Provider:** Azure +- **ARM resource type:** `Microsoft.MachineLearningServices/workspaces/computes` +- **Finding resource_type:** `azure.aml.compute` + +--- + +## 2. Intent + +Detect **managed Azure Machine Learning compute clusters (`AmlCompute`) that retain billable baseline capacity while showing no observed per-cluster job activity** over a fixed observation window. + +This rule is deliberately **precision-first**. It is **not** a generic “quiet workspace” rule, **not** a generic “unused training resource” rule, and **not** proof that deleting the cluster is safe. It is a conservative review-candidate rule for clusters that appear to be kept warm by configuration rather than by observed workload. + +--- + +## 3. Azure Documentation Grounding + +### 3.1 AML compute clusters keep baseline nodes running when `minNodeCount > 0` + +Microsoft documents that Azure Machine Learning compute clusters: + +1. autoscale based on submitted jobs +2. scale down to the configured minimum node count +3. avoid charges when idle only when the minimum node count is set to `0` +4. 
keep the configured minimum number of nodes running when `minNodeCount > 0`, even if no jobs are running + +Sources: + +- *Manage and optimize costs for Azure Machine Learning* +- *Create an Azure Machine Learning compute cluster* +- *Compute target* + +URLs: + +- https://learn.microsoft.com/en-us/azure/machine-learning/how-to-manage-optimize-cost?view=azureml-api-2 +- https://learn.microsoft.com/en-us/azure/machine-learning/how-to-create-attach-compute-cluster?view=azureml-api-2 +- https://learn.microsoft.com/en-us/azure/machine-learning/concept-compute-target?view=azureml-api-2 + +Rule consequence: + +1. `minNodeCount == 0` is out of scope for this rule. +2. `minNodeCount > 0` is billing-relevant even when no jobs are active. +3. This rule should target **baseline-capacity waste**, not all possible residual AML workspace costs. + +### 3.2 Azure Machine Learning costs vary by VM size, region, priority, and surrounding infrastructure + +Microsoft documents that Azure Machine Learning costs can include: + +- VM runtime costs +- Azure Monitor costs +- load balancer costs for compute resources +- network and other dependent infrastructure costs +- pricing that varies by Azure region and resource choice + +Sources: + +- *Plan to manage costs for Azure Machine Learning* +- *Manage and optimize costs for Azure Machine Learning* + +URLs: + +- https://learn.microsoft.com/en-us/azure/machine-learning/concept-plan-manage-cost?view=azureml-api-2 +- https://learn.microsoft.com/en-us/azure/machine-learning/how-to-manage-optimize-cost?view=azureml-api-2 + +Rule consequence: + +1. The rule may state that ongoing cost exists when baseline nodes are intentionally retained. +2. The rule must **not** hardcode static VM price tables. +3. `estimated_monthly_cost_usd` should remain `None`. 
+ +### 3.3 Azure Monitor exposes per-cluster AML workspace metrics + +Microsoft documents that `Microsoft.MachineLearningServices/workspaces` exposes quota and resource metrics including: + +- `Active Nodes` +- `Idle Nodes` +- `Total Nodes` + +Microsoft further documents: + +- `Active Nodes` has the `ClusterName` dimension +- `Idle Nodes` has the `ClusterName` dimension +- `Total Nodes` has the `ClusterName` dimension +- these metrics support `PT1M` time grain + +Source: *Supported metrics for Microsoft.MachineLearningServices/workspaces* +URL: https://learn.microsoft.com/en-us/azure/azure-monitor/reference/supported-metrics/microsoft-machinelearningservices-workspaces-metrics + +Rule consequence: + +1. `Active Nodes` is the documented per-cluster workload-activity signal for this rule. +2. The rule should evaluate the metric **for the specific cluster** using the documented `ClusterName` dimension. +3. Workspace-level unfiltered fallback must **not** be used to prove a cluster is idle. +4. If the per-cluster activity metric cannot be resolved reliably, the cluster must be skipped. 
+ +### 3.4 AML compute control-plane fields expose cluster type, baseline scale settings, and allocation state + +Microsoft documents AML compute control-plane fields including: + +- `properties.computeType` +- `properties.provisioningState` +- `properties.createdOn` +- `properties.properties.vmSize` +- `properties.properties.vmPriority` +- `properties.properties.scaleSettings.minNodeCount` +- `properties.properties.scaleSettings.maxNodeCount` +- `properties.properties.scaleSettings.nodeIdleTimeBeforeScaleDown` +- `properties.properties.allocationState` +- `properties.properties.currentNodeCount` +- `properties.properties.targetNodeCount` +- `properties.properties.nodeStateCounts` + +Sources: + +- *Compute - Get (Azure ML REST API)* +- *Compute - List (Azure ML REST API)* +- *azure.mgmt.machinelearningservices.models.AmlCompute* +- *azure.mgmt.machinelearningservices.models.AmlComputeProperties* +- *azure.mgmt.machinelearningservices.models.ScaleSettings* + +URLs: + +- https://learn.microsoft.com/en-us/rest/api/azureml/compute/get?view=rest-azureml-2025-06-01 +- https://learn.microsoft.com/en-us/rest/api/azureml/compute/list?view=rest-azureml-2025-06-01 +- https://learn.microsoft.com/en-us/python/api/azure-mgmt-machinelearningservices/azure.mgmt.machinelearningservices.models.amlcompute?view=azure-python +- https://learn.microsoft.com/en-us/python/api/azure-mgmt-machinelearningservices/azure.mgmt.machinelearningservices.models.amlcomputeproperties?view=azure-python +- https://learn.microsoft.com/en-us/python/api/azure-mgmt-machinelearningservices/azure.mgmt.machinelearningservices.models.scalesettings?view=azure-python + +Rule consequence: + +1. This rule must be limited to exact `computeType == "AmlCompute"`. +2. The rule should evaluate only **stable** compute clusters: exact `provisioningState == "Succeeded"` and exact `allocationState == "Steady"`. +3. The rule should require positive baseline scale settings and positive current node allocation before emitting. 
+ +### 3.5 AML compute clusters can live in a different region than the workspace + +Microsoft documents that compute clusters can be created in a different region than the workspace. + +Source: *Create an Azure Machine Learning compute cluster* +URL: https://learn.microsoft.com/en-us/azure/machine-learning/how-to-create-attach-compute-cluster?view=azureml-api-2 + +Rule consequence: + +If a region filter is used, it must be applied to the **compute resource location**, not the workspace location. + +--- + +## 4. Detection Goal + +Emit a finding only when **all** of the following are true: + +1. `compute.id` is present and non-empty +2. `compute.name` is present and non-empty +3. `workspace.name` is present and non-empty +4. the optional region filter matches the normalized **compute** location +5. `compute_type` resolves to exactly `"AmlCompute"` +6. `provisioning_state` resolves to exactly `"Succeeded"` +7. `allocation_state` resolves to exactly `"Steady"` +8. `created_at` is known and the cluster age is at least `14 days` +9. `min_node_count` resolves to a known positive integer +10. `current_node_count` resolves to a known integer and is at least `min_node_count` +11. the required per-cluster activity metric resolves reliably for the same `14-day` window +12. the required per-cluster activity metric is zero for that window + +If any required signal cannot be established reliably, skip rather than emit. + +--- + +## 5. Non-Goals + +This rule does **not** attempt to prove: + +- that deleting the cluster is safe +- that no future training run, pipeline, or batch inference job depends on it +- that the cluster is the cheapest possible configuration +- that no residual Azure ML infrastructure cost exists elsewhere in the workspace +- that a specific monthly saving exists + +--- + +## 6. 
Canonical Inputs + +### 6.1 Required surfaces + +| Surface | Purpose | +|---|---| +| AML workspace inventory | enumerate candidate workspaces | +| AML compute list/get for each workspace | determine compute type, region, provisioning state, age, baseline scale settings, allocation state, and node counts | +| Azure Monitor metrics on the workspace ARM id | determine observed per-cluster activity using the documented `ClusterName` dimension | + +### 6.2 Authentication / permissions + +Minimum permissions: + +- `Microsoft.MachineLearningServices/workspaces/read` +- `Microsoft.MachineLearningServices/workspaces/computes/read` +- `Microsoft.Insights/metrics/read` + +No secret or key retrieval is required for this rule. + +### 6.3 Fixed idle window + +- Configurable parameter: none +- Fixed evaluation window: `14 days` + +Reason: + +- AML compute clusters are autoscaling training infrastructure rather than long-lived serving infrastructure +- short warm baselines can be intentional for active experimentation +- a two-week fixed window is conservative enough to avoid flagging brief pauses while still surfacing clusters that appear intentionally kept warm without observed workload + +--- + +## 7. Normalization Contract + +| Field | Normalization | +|---|---| +| `location` | Lowercase ARM location string from the compute resource; compare by exact lowercase string equality only. Do not remove spaces, hyphens, or digits. | +| `compute_type` | Resolve from documented SDK/raw surfaces and compare case-sensitively to exact `"AmlCompute"`. | +| `provisioning_state` | Resolve from documented SDK/raw surfaces and compare case-sensitively to exact `"Succeeded"`. | +| `allocation_state` | Resolve from documented SDK/raw surfaces and compare case-sensitively to exact `"Steady"`. | +| `created_at` | Parse as UTC instant from `createdOn` or equivalent SDK projection. | +| `min_node_count` | Positive integer from documented scale-settings surfaces only. 
`<= 0`, invalid, or unresolvable values are not eligible. | +| `current_node_count` | Integer from documented AML compute properties only. Unknown, invalid, negative, or smaller-than-minimum values are not eligible. | +| `vm_priority` | Preserve raw documented value such as `Dedicated` or `LowPriority`; do not use it to infer exact savings. | +| `active_nodes_zero` | `True` only when the documented `Active Nodes` metric resolves reliably for the requested cluster and all usable source-bucket `Maximum` values are exactly zero. | +| `tags` | `compute.tags or {}` — never `None` in output. | + +--- + +## 8. Unified Decision Rule + +| # | Condition | Action | +|---|---|---| +| 8.1 | `compute.id` absent, `None`, or empty | Skip | +| 8.2 | `compute.name` absent, `None`, or empty | Skip | +| 8.3 | `workspace.name` absent, `None`, or empty | Skip | +| 8.4 | Region filter set and normalized compute location does not match | Skip | +| 8.5 | `compute_type` does not resolve to `"AmlCompute"` | Skip | +| 8.6 | `provisioning_state` does not resolve to `"Succeeded"` | Skip | +| 8.7 | `allocation_state` does not resolve to `"Steady"` | Skip | +| 8.8 | `created_at` is absent, invalid, in the future, or less than `14 days` old | Skip | +| 8.9 | `min_node_count <= 0` or is unresolvable | Skip | +| 8.10 | `current_node_count` is negative, unresolvable, or smaller than `min_node_count` | Skip | +| 8.11 | Required per-cluster activity metric cannot be resolved reliably | Skip | +| 8.12 | Required per-cluster activity metric is non-zero over the `14-day` window | Skip | +| 8.13 | All required signals resolve, baseline capacity is clearly retained, and per-cluster activity is zero over `14 days` | **EMIT** | + +--- + +## 9. Canonical Evaluation Contracts + +### 9.1 Scope and stable-state contract + +Resolve `compute_type` in this order: + +1. SDK projection such as `compute.properties.compute_type` +2. nested/raw management payload such as `properties.computeType` +3. 
otherwise unknown + +Resolve `provisioning_state` in this order: + +1. SDK projection such as `compute.properties.provisioning_state` +2. nested/raw management payload such as `properties.provisioningState` +3. otherwise unknown + +Resolve `allocation_state` in this order: + +1. SDK projection such as `compute.properties.properties.allocation_state` +2. nested/raw management payload such as `properties.properties.allocationState` +3. otherwise unknown + +Required behavior: + +1. Only exact `"AmlCompute"` is eligible for `compute_type`. +2. Only exact `"Succeeded"` is eligible for `provisioning_state`. +3. Only exact `"Steady"` is eligible for `allocation_state`. +4. Unknown, conflicting, transitional, or any other values must skip. + +### 9.2 Baseline-capacity contract + +Required behavior: + +1. `min_node_count` must resolve to a known positive integer. +2. `current_node_count` must resolve to a known integer. +3. `current_node_count` must be at least `min_node_count`. +4. `min_node_count == 0` must skip. +5. Unknown, invalid, negative, or conflicting count values must skip. +6. `vm_size`, `vm_priority`, `max_node_count`, and `node_idle_time_before_scale_down` may enrich evidence, but they must not replace the baseline-node requirement. + +Rationale: + +This rule is specifically about clusters that are **clearly configured to keep billable baseline nodes allocated**. If the control plane does not clearly show that retained baseline, the rule must fail closed. + +### 9.3 Per-cluster activity-metric contract + +Required metric: + +1. 
`Active Nodes` with the `Maximum` aggregation + +Definitions: + +- **usable datapoint**: a datapoint with a parseable UTC timestamp inside the requested window and a numeric `Maximum` value +- **source bucket**: the metric bucket returned by Azure Monitor for the requested query interval before any spec-level normalization +- **UTC day bucket**: the UTC day boundary derived from a datapoint timestamp by normalizing it to `00:00:00Z` for that day +- **expected buckets**: count of UTC-aligned daily buckets overlapping `[window_start, window_end)` +- **observed buckets**: count of unique UTC day buckets with at least one usable datapoint after consolidating duplicate timestamps across all returned series for the target cluster +- **coverage ratio**: `observed_buckets / expected_buckets` +- **acceptable coverage**: `coverage_ratio >= 0.95` +- **resolve reliably**: the metric query returns valid per-cluster data for the requested window, meets the coverage threshold, and does not trigger any `UNKNOWN` condition +- **unusable response shape**: a metric response with missing `value`, malformed time series collections, unparsable timestamps, non-numeric aggregation values, or no reliable `ClusterName`-scoped series for the target cluster + +Required behavior: + +1. Query the documented `Active Nodes` metric for the same fixed `14-day` window. +2. Scope the query to the target cluster using the documented `ClusterName` dimension and the exact compute name. +3. Implementations must not use unfiltered workspace-level fallback to prove cluster idleness. +4. Implementations should request the finest practical documented granularity available for the query and evaluate activity on the returned source buckets before any UTC-day normalization. +5. Normalize datapoint timestamps to UTC day buckets only for coverage calculation and day-level consolidation. +6. If any source bucket has `Maximum > 0`, the metric is `ACTIVE`. +7.
Treat any failed query, missing metric, unsupported cluster dimension, unusable response shape, empty series, no datapoints, no valid series, or coverage below threshold as `UNKNOWN`. +8. Treat the metric as `ZERO` only when it resolves reliably and all usable source-bucket values are exactly zero. +9. Emit only when the required activity metric evaluates to `ZERO`. + +Rationale: + +1. This rule is about **no observed workload activity** on a cluster that is still configured to keep baseline nodes alive. +2. The per-cluster dimension is required because workspace-level activity can hide or blur cluster-specific idleness. +3. The metric is evidence of observed job activity, not proof that the cluster is safe to delete. + +--- + +## 10. Cost Model + +`estimated_monthly_cost_usd = None` + +Mandatory rules: + +1. Do **not** use flat hardcoded VM price tables. +2. Do **not** derive a dollar amount from VM family prefixes or from `min_node_count`. +3. Do **not** claim exact monthly savings from management and metric surfaces alone. +4. State only that clusters retaining positive baseline nodes incur ongoing cost while that baseline is kept alive. + +--- + +## 11. Finding Shape + +### 11.1 Required fields + +| Field | Value | +|---|---| +| `provider` | `"azure"` | +| `rule_id` | `"azure.aml.compute.idle"` | +| `resource_type` | `"azure.aml.compute"` | +| `resource_id` | original ARM id from `compute.id` | +| `region` | normalized compute location | +| `confidence` | `HIGH` | +| `risk` | `MEDIUM` | +| `estimated_monthly_cost_usd` | `None` | + +### 11.2 Required evidence + +`signals_used` must clearly disclose: + +1. the resource is exact `AmlCompute` +2. provisioning state is `"Succeeded"` +3. allocation state is `"Steady"` +4. cluster age is at least `14 days` +5. `min_node_count` is positive and `current_node_count >= min_node_count` +6. 
the documented `Active Nodes` metric for the target cluster resolved to **no observed active nodes** with sufficient coverage + +`signals_not_checked` should include remaining blind spots such as: + +1. future or scheduled training intent +2. business-owner intent not visible in Azure control plane +3. whether a warm baseline is intentionally retained for startup latency, quota reservation, or sporadic experimentation +4. exact VM and infrastructure pricing after discounts, reservations, or special commercial terms + +### 11.3 Required details + +Details should include at least: + +- `cluster_name` +- `workspace_name` +- `resource_group` +- `subscription_id` +- `vm_size` +- `vm_priority` +- `min_node_count` +- `max_node_count` +- `current_node_count` +- `target_node_count` +- `allocation_state` +- `provisioning_state` +- `created_at` +- `node_idle_time_before_scale_down` +- `idle_window_days` +- `metrics_used` +- `tags` + +--- + +## 12. Failure Behavior + +- If subscription-wide workspace inventory fails, let the exception propagate +- If per-workspace compute listing fails, skip that workspace +- If per-compute record resolution or metric retrieval fails, skip that compute +- If a compute record is malformed or missing required fields, skip that compute +- Do not emit on partial, aggregated-only, or unresolved per-cluster metric state diff --git a/docs/specs/azure/ai/aml_compute_instance_idle.md b/docs/specs/azure/ai/aml_compute_instance_idle.md new file mode 100644 index 0000000..ab880b9 --- /dev/null +++ b/docs/specs/azure/ai/aml_compute_instance_idle.md @@ -0,0 +1,440 @@ +# Azure Rule Spec - `azure.ml.compute_instance.idle` + +## 1. Rule Identity + +- **Rule ID:** `azure.ml.compute_instance.idle` +- **Provider:** Azure +- **ARM resource type:** `Microsoft.MachineLearningServices/workspaces/computes` +- **Finding resource_type:** `azure.ml.compute_instance` + +--- + +## 2. 
Intent + +Detect **Azure Machine Learning compute instances that remain billable in `Running` state while showing no recent documented control-plane lifecycle activity** over a conservative review window. + +This rule is deliberately **precision-first**. It is **not** a generic "inactive notebook" rule, **not** proof that a compute instance is safe to stop or delete, and **not** proof that no user is actively connected. It is a conservative review-candidate rule for compute instances that appear to have been left running without recent documented lifecycle actions. + +--- + +## 3. Azure Documentation Grounding + +### 3.1 Running compute instances continue to incur compute cost until stopped + +Microsoft documents that Azure Machine Learning compute instances: + +1. are managed cloud workstations for development and testing +2. can be started, stopped, restarted, and deleted +3. should be stopped to prevent ongoing compute-hour charges +4. stop compute-hour billing when deallocated, while disk, public IP, and standard load balancer charges can still remain + +Sources: + +- *What is an Azure Machine Learning compute instance?* +- *Manage an Azure Machine Learning compute instance* + +URLs: + +- https://learn.microsoft.com/en-us/azure/machine-learning/concept-compute-instance?view=azureml-api-2 +- https://learn.microsoft.com/en-us/azure/machine-learning/how-to-manage-compute-instance?view=azureml-api-2 + +Rule consequence: + +1. This rule must evaluate only compute instances that are currently `Running`. +2. A stopped compute instance is out of scope for this rule. +3. The rule may state that stopping the instance would stop compute-hour spend, but it must not imply total workspace cost becomes zero. 
+ +### 3.2 Azure's documented inactivity definition is stronger than this rule's observable surface + +Microsoft documents that compute-instance idle shutdown is based on runtime inactivity conditions such as: + +- no active Jupyter kernel sessions +- no active Jupyter terminal sessions +- no active Azure Machine Learning runs or experiments +- no VS Code connections +- no custom applications running on the compute + +Microsoft further documents that idle shutdown can be configured only within bounded inactivity periods, from a minimum of `15 minutes` to a maximum of `3 days`. + +Sources: + +- *Create an Azure Machine Learning compute instance* +- *Compute - Update Idle Shutdown Setting (Azure ML REST API)* + +URLs: + +- https://learn.microsoft.com/en-us/azure/machine-learning/how-to-create-compute-instance?view=azureml-api-2 +- https://learn.microsoft.com/en-us/rest/api/azureml/compute/update-idle-shutdown-setting?view=rest-azureml-2026-01-15-preview + +Rule consequence: + +1. This rule must **not** claim it observes actual notebook, terminal, run, VS Code, or custom-app inactivity. +2. This rule must be framed as a **control-plane review candidate**, not as definitive runtime idleness. +3. The rule must not use Azure's idle-shutdown bounds (`15 minutes` to `3 days`) as its own detection threshold; those bounds govern platform auto-shutdown settings, not this cost-hygiene review rule. 
+ +### 3.3 Compute-instance control-plane fields expose the documented lifecycle surfaces for this rule + +Microsoft documents compute-instance control-plane fields including: + +- top-level ARM `location` +- `properties.computeType` +- `properties.provisioningState` +- `properties.createdOn` +- `properties.modifiedOn` +- `properties.properties.state` +- `properties.properties.vmSize` +- `properties.properties.lastOperation.operationName` +- `properties.properties.lastOperation.operationTime` +- `properties.properties.lastOperation.operationStatus` + +Sources: + +- *Compute - Get (Azure ML REST API)* +- *Compute - List (Azure ML REST API)* +- *azure.mgmt.machinelearningservices.models.ComputeResource* +- *azure.mgmt.machinelearningservices.models.ComputeInstance* +- *azure.mgmt.machinelearningservices.models.ComputeInstanceProperties* +- *azure.mgmt.machinelearningservices.models.ComputeInstanceLastOperation* + +URLs: + +- https://learn.microsoft.com/en-us/rest/api/azureml/compute/get?view=rest-azureml-2025-06-01 +- https://learn.microsoft.com/en-us/rest/api/azureml/compute/list?view=rest-azureml-2025-06-01 +- https://learn.microsoft.com/en-us/python/api/azure-mgmt-machinelearningservices/azure.mgmt.machinelearningservices.models.computeresource?view=azure-python +- https://learn.microsoft.com/en-us/python/api/azure-mgmt-machinelearningservices/azure.mgmt.machinelearningservices.models.computeinstance?view=azure-python +- https://learn.microsoft.com/en-us/python/api/azure-mgmt-machinelearningservices/azure.mgmt.machinelearningservices.models.computeinstanceproperties?view=azure-python +- https://learn.microsoft.com/en-us/python/api/azure-mgmt-machinelearningservices/azure.mgmt.machinelearningservices.models.computeinstancelastoperation?view=azure-python + +Rule consequence: + +1. This rule must be limited to exact `computeType == "ComputeInstance"`. +2. 
The rule should evaluate only stable resources: exact `provisioningState == "Succeeded"` and exact inner `state == "Running"`. +3. `lastOperation.operationTime` is the primary documented lifecycle-activity timestamp. +4. `modifiedOn` may be used only as a weaker documented fallback when `lastOperation.operationTime` is unavailable. +5. Undocumented fallbacks such as age-only inference must not be used to prove idleness. + +### 3.4 Idle-shutdown and schedule configuration are not reliable read-side exclusions for this rule + +Microsoft documents that compute instances can be configured with idle shutdown and scheduled start/stop behavior. + +Sources: + +- *Create an Azure Machine Learning compute instance* +- *Manage an Azure Machine Learning compute instance* + +URLs: + +- https://learn.microsoft.com/en-us/azure/machine-learning/how-to-create-compute-instance?view=azureml-api-2 +- https://learn.microsoft.com/en-us/azure/machine-learning/how-to-manage-compute-instance?view=azureml-api-2 + +Rule consequence: + +1. This rule must not assume that a compute instance lacks schedule or idle-shutdown protection merely because those settings are not available on the standard read path used by the rule. +2. Schedule or idle-shutdown configuration may be mentioned as a blind spot, but must not be required to emit or to skip. +3. The rule is read-only; it must not mutate idle-shutdown settings. + +--- + +## 4. Detection Goal + +Emit a finding only when **all** of the following are true: + +1. `compute.id` is present and non-empty +2. `compute.name` is present and non-empty +3. `workspace.name` is present and non-empty +4. the compute location resolves and, if the optional region filter is set, the filter matches the normalized compute location +5. `compute_type` resolves to exactly `"ComputeInstance"` +6. `provisioning_state` resolves to exactly `"Succeeded"` +7. `state` resolves to exactly `"Running"` +8. `created_at` is known and the instance age is at least the configured idle window +9.
a documented lifecycle-activity timestamp resolves reliably from `lastOperation.operationTime` or documented `modifiedOn` +10. the resolved lifecycle inactivity duration is at least the configured idle window + +If any required signal cannot be established reliably, skip rather than emit. + +--- + +## 5. Non-Goals + +This rule does **not** attempt to prove: + +- that no notebook kernel, terminal session, VS Code session, AML run, or custom application is active right now +- that stopping or deleting the instance is safe +- that the creator or assigned user no longer needs the instance +- that no automatic schedule or platform policy will stop the instance later +- that a specific monthly dollar saving exists + +--- + +## 6. Canonical Inputs + +### 6.1 Required surfaces + +| Surface | Purpose | +|---|---| +| AML workspace inventory | enumerate candidate workspaces | +| AML compute list/get for each workspace | determine resource identity, location, compute type, provisioning state, creation/modification timestamps, current state, VM size, and last lifecycle operation | + +### 6.2 Authentication / permissions + +Minimum permissions: + +- `Microsoft.MachineLearningServices/workspaces/read` +- `Microsoft.MachineLearningServices/workspaces/computes/read` + +No secret, key, session, or notebook-content retrieval is required for this rule. + +### 6.3 Idle window + +- Configurable parameter: `idle_days` +- Default: `14` +- Minimum effective value: `1` + +Reason: + +- Azure's documented idle-shutdown thresholds are operational auto-stop controls, not a direct contract for this review rule. +- A two-week default window is conservative enough to avoid flagging brief pauses while still surfacing compute instances that appear to have been left running. + +--- + +## 7. Normalization Contract + +| Field | Normalization | +|---|---| +| `location` | Resolve from documented compute resource location surfaces only. If unresolved, treat as unknown and skip. 
Lowercase before comparison, then compare by exact lowercase string equality only. Do not remove spaces, hyphens, or digits. | +| `compute_type` | Resolve from documented SDK/raw surfaces and compare case-sensitively to exact `"ComputeInstance"`. | +| `provisioning_state` | Resolve from documented SDK/raw surfaces and compare case-sensitively to exact `"Succeeded"`. | +| `state` | Resolve from documented inner compute-instance properties, normalize only by string extraction / surrounding-whitespace trimming, then compare case-sensitively to exact `"Running"`. Any other casing or value is not eligible. | +| `created_at` | Parse as a UTC instant from documented `createdOn` or equivalent SDK projection. If the chosen field is present but unparsable, skip. | +| `modified_at` | Parse as a UTC instant from documented `modifiedOn` or equivalent SDK projection. Use only as a fallback lifecycle timestamp when `last_operation_time` is absent, and only when `modified_at > created_at`. If used and unparsable, skip. | +| `last_operation_time` | Parse as a UTC instant from documented `lastOperation.operationTime`. If the field is present but unparsable, skip rather than silently falling back. | +| `last_operation_status` | Preserve the documented raw value such as `"Succeeded"` or `"InProgress"` for evidence only; missing status does not invalidate a parseable `last_operation_time`. | +| `lifecycle_activity_at` | Use `last_operation_time` when present and parseable; otherwise use `modified_at` only under the documented fallback rules. No other fallback is allowed. | +| `idle_signal_source` | One of `last_operation` or `modified_on`. No other fallback is allowed. | +| `vm_size` | Preserve raw documented value. GPU classification is limited to exact case-sensitive prefix matching on `Standard_NC`, `Standard_ND`, and `Standard_NV`. `null` or absent `vm_size` is non-GPU for risk purposes. | +| `tags` | `compute.tags or {}` - never `None` in output. | + +--- + +## 8. 
Unified Decision Rule + +| # | Condition | Action | +|---|---|---| +| 8.1 | `compute.id` absent, `None`, or empty | Skip | +| 8.2 | `compute.name` absent, `None`, or empty | Skip | +| 8.3 | `workspace.name` absent, `None`, or empty | Skip | +| 8.4 | Region filter set and normalized compute location does not match | Skip | +| 8.5 | `compute_type` does not resolve to `"ComputeInstance"` | Skip | +| 8.6 | `provisioning_state` does not resolve to `"Succeeded"` | Skip | +| 8.7 | `state` does not resolve to `"Running"` | Skip | +| 8.8 | `location` is unresolved | Skip | +| 8.9 | `created_at` is absent, invalid, in the future, or younger than the effective `idle_days` window | Skip | +| 8.10 | Lifecycle-activity evaluation fails any deterministic rule in section `9.4` | Skip | +| 8.11 | Resolved lifecycle-activity timestamp is in the future | Skip | +| 8.12 | Floored `idle_since_days` is less than the effective `idle_days` window | Skip | +| 8.13 | All required signals resolve and documented lifecycle activity is stale for at least `idle_days` while the instance remains `Running` | **EMIT** | + +--- + +## 9. Canonical Evaluation Contracts + +### 9.1 Scope and stable-state contract + +Resolve `compute_type` in this order: + +1. SDK projection such as `compute.properties.compute_type` +2. nested/raw management payload such as `properties.computeType` +3. otherwise unknown + +Resolve `provisioning_state` in this order: + +1. SDK projection such as `compute.properties.provisioning_state` +2. nested/raw management payload such as `properties.provisioningState` +3. otherwise unknown + +Resolve `state` in this order: + +1. SDK projection such as `compute.properties.properties.state` +2. nested/raw management payload such as `properties.properties.state` +3. otherwise unknown + +Required behavior: + +1. Only exact `"ComputeInstance"` is eligible for `compute_type`. +2. Only exact `"Succeeded"` is eligible for `provisioning_state`. +3. 
Only exact `"Running"` is eligible for `state`. +4. Unknown, conflicting, transitional, failed, or any other values must skip. + +### 9.2 Location contract + +Resolve `location` in this order: + +1. top-level ARM resource location such as `compute.location` +2. subtype location such as `compute.properties.compute_location` +3. nested/raw subtype location such as `properties.computeLocation` +4. otherwise unknown + +Required behavior: + +1. Use the compute resource's documented location, not the workspace location, for filtering and reporting. +2. Compare by exact lowercase equality only. +3. If `location` cannot be resolved, skip. +4. If multiple documented location surfaces are present and conflict materially, skip. + +### 9.3 Age contract + +Required behavior: + +1. `created_at` must resolve to a known UTC timestamp. +2. `created_at` in the future must skip. +3. Instance age must be at least the effective `idle_days` window. +4. Age may gate eligibility, but age alone must never prove idleness. +5. Timestamp parse failure on `created_at` must skip. + +### 9.4 Lifecycle-activity contract + +Definitions: + +- **lifecycle activity**: a documented control-plane operation or modification timestamp on the compute instance resource; it is not proof of actual notebook or user-session activity +- **effective idle window**: `max(idle_days, 1)` +- **now_utc**: the evaluation time captured as a UTC timestamp +- **UTC-normalized timestamp**: a parsed timestamp converted to UTC before any comparison +- **idle duration**: `floor((now_utc - lifecycle_activity_at_utc).total_seconds() / 86400)` +- **idle_since_days**: exactly the computed `idle duration` +- **absent last operation**: `lastOperation` missing entirely, or present without an `operationTime` field +- **unusable last operation**: `lastOperation.operationTime` field is present but invalid or otherwise unusable for deterministic evaluation + +Required behavior: + +1. 
Resolve `last_operation` from documented compute-instance properties only. +2. All timestamp parsing, ordering, and age / inactivity comparisons must be performed on UTC-normalized timestamps, using `now_utc` as the comparison reference time. +3. If `lastOperation.operationTime` exists and parses successfully, use it as `lifecycle_activity_at`. +4. If `lastOperation.operationTime` exists but does not parse, skip rather than silently falling back. +5. Missing `lastOperation.operationStatus` does not invalidate a parseable `lastOperation.operationTime`. +6. `operationName` and `operationStatus` are evidence fields only; they must not independently drive skip or emit decisions unless the timestamp itself is invalid. +7. If `lastOperation.operationTime == created_at`, treat that as no proven post-create inactivity signal and skip. +8. Use documented `modifiedOn` only when `lastOperation` is absent or has no `operationTime`, and only when `modifiedOn` parses successfully and `modifiedOn > created_at`. +9. If `modifiedOn == created_at`, treat that as no proven post-create inactivity signal and skip. +10. If `modifiedOn` is selected and fails parsing, skip. +11. Future timestamps are handled strictly: any selected lifecycle timestamp greater than `now_utc` must skip; no clock-skew tolerance is allowed. +12. Do **not** use undocumented fallbacks such as `systemData.lastModifiedAt`. +13. Do **not** use age-only fallback. +14. Compute inactivity strictly from `lifecycle_activity_at`, never from age. +15. `idle_signal_source` must exactly match the timestamp actually selected for `lifecycle_activity_at`. +16. `idle_since_days` must equal the floored `idle duration`. +17. Emit only when floored `idle_since_days` is at least the effective idle window. + +Rationale: + +1. Microsoft documents `lastOperation` and `modifiedOn` as control-plane surfaces for compute instances. +2. 
Microsoft separately documents runtime inactivity for idle shutdown using signals that are not available from this rule's read path. +3. Therefore this rule must fail closed whenever documented lifecycle signals are absent, weak, or unparsable. + +### 9.5 Risk and confidence contract + +Risk: + +1. `HIGH` when non-null `vm_size` begins with one of the exact case-sensitive matching prefixes `Standard_NC`, `Standard_ND`, or `Standard_NV` +2. `MEDIUM` otherwise, including `null` / absent `vm_size` + +Confidence: + +1. `MEDIUM` when `last_operation` is the idle signal source and all required conditions are met +2. `LOW` when documented `modifiedOn` fallback is the idle signal source and all required conditions are met + +Rationale: + +1. Even the strongest version of this rule observes only documented control-plane lifecycle staleness, not runtime notebook/session inactivity. +2. `modifiedOn` is weaker than `lastOperation.operationTime` for review purposes and must not receive the same confidence level. + +--- + +## 10. Cost Model + +`estimated_monthly_cost_usd = None` + +Mandatory rules: + +1. Do **not** use flat hardcoded VM price tables. +2. Do **not** claim exact monthly savings from management metadata alone. +3. State only that a compute instance left in `Running` state continues to incur compute-hour cost until stopped. +4. If relevant, note that disk, public IP, and standard load balancer charges can still remain after stop/deallocation. + +--- + +## 11. 
Finding Shape + +### 11.1 Required fields + +| Field | Value | +|---|---| +| `provider` | `"azure"` | +| `rule_id` | `"azure.ml.compute_instance.idle"` | +| `resource_type` | `"azure.ml.compute_instance"` | +| `resource_id` | original ARM id from `compute.id` | +| `region` | normalized compute location | +| `estimated_monthly_cost_usd` | `None` | + +### 11.2 Required confidence and risk + +| Condition | Confidence | Risk | +|---|---|---| +| `idle_signal_source == "last_operation"` and GPU VM family | `MEDIUM` | `HIGH` | +| `idle_signal_source == "last_operation"` and non-GPU VM family | `MEDIUM` | `MEDIUM` | +| `idle_signal_source == "modified_on"` and GPU VM family | `LOW` | `HIGH` | +| `idle_signal_source == "modified_on"` and non-GPU VM family | `LOW` | `MEDIUM` | + +### 11.3 Required evidence + +`signals_used` must clearly disclose: + +1. the resource is exact `ComputeInstance` +2. provisioning state is `"Succeeded"` +3. runtime state is `"Running"` +4. instance age is at least the configured idle window +5. the last documented control-plane lifecycle activity is older than the configured idle window +6. whether the stale timestamp came from `lastOperation.operationTime` or `modifiedOn` + +`signals_not_checked` should include remaining blind spots such as: + +1. active Jupyter kernels +2. active Jupyter terminals +3. active AML runs or experiments +4. active VS Code connections +5. custom applications currently running on the compute +6. creator or business-owner intent +7. automatic schedules or shutdown behavior not visible from the rule's read path +8. 
exact pricing after discounts, reservations, or special commercial terms + +### 11.4 Required details + +Details should include at least: + +- `instance_name` +- `workspace_name` +- `resource_group` +- `subscription_id` +- `location` +- `vm_size` +- `compute_type` +- `provisioning_state` +- `state` +- `created_at` +- `modified_at` (may be present even when it was not used as the selected lifecycle signal) +- `last_operation_name` (`null` allowed) +- `last_operation_time` +- `last_operation_status` (`null` allowed) +- `idle_since_days` +- `idle_days_threshold` +- `idle_signal_source` +- `tags` + +--- + +## 12. Failure Behavior + +- If subscription-wide workspace inventory fails, let the exception propagate +- If per-workspace compute listing fails, skip that workspace +- If an individual compute record is malformed or missing required documented fields, skip that compute +- Do not emit when lifecycle activity can be inferred only from age, undocumented system-data timestamps, or guessed user behavior +- Do not mutate schedules or idle-shutdown settings as part of detection diff --git a/docs/specs/azure/ai/ml_online_endpoint_idle.md b/docs/specs/azure/ai/ml_online_endpoint_idle.md new file mode 100644 index 0000000..95ccaed --- /dev/null +++ b/docs/specs/azure/ai/ml_online_endpoint_idle.md @@ -0,0 +1,453 @@ +# Azure Rule Spec - `azure.ml.online_endpoint.idle` + +## 1. Rule Identity + +- **Rule ID:** `azure.ml.online_endpoint.idle` +- **Provider:** Azure +- **ARM resource type:** `Microsoft.MachineLearningServices/workspaces/onlineEndpoints` +- **Finding resource_type:** `azure.ml.online_endpoint` + +--- + +## 2. Intent + +Detect **Azure Machine Learning managed online endpoints that retain billable deployment baseline instances while `RequestsPerMinute` stays at zero** over a documented observation window. + +This rule is deliberately **precision-first**. 
It is **not** a generic "quiet workspace" rule, **not** proof that deleting an endpoint is safe, and **not** proof of a specific monthly saving. It is a conservative review-candidate rule for managed online endpoints that appear to be continuously provisioned but unused. + +--- + +## 3. Azure Documentation Grounding + +### 3.1 Managed online endpoints incur compute and networking cost while deployments retain instances + +Microsoft documents that managed online endpoints: + +1. are the recommended Azure Machine Learning online endpoint type +2. use managed VM compute for deployments +3. charge for the VMs assigned to deployments, with no added managed-endpoint surcharge +4. can also incur networking-related charges + +Sources: + +- *Online endpoints for real-time inference* +- *View costs for managed online endpoints* +- *Managed online endpoint YAML reference* + +URLs: + +- https://learn.microsoft.com/en-us/azure/machine-learning/concept-endpoints-online?view=azureml-api-2 +- https://learn.microsoft.com/en-us/azure/machine-learning/how-to-view-online-endpoints-costs?view=azureml-api-2 +- https://learn.microsoft.com/en-us/azure/machine-learning/reference-yaml-deployment-managed-online?view=azureml-api-2 + +Rule consequence: + +1. This rule should target **managed online endpoints** only. +2. The rule should require evidence that one or more managed deployments retain billable baseline instances. +3. The rule must **not** claim exact endpoint cost from management metadata alone. + +### 3.2 Managed and Kubernetes online endpoints must not be conflated + +Microsoft documents two online endpoint kinds: + +- managed online endpoints +- Kubernetes online endpoints + +Microsoft further documents that Kubernetes online endpoints use customer-managed compute, while managed online endpoints use Azure-managed compute. 
+ +Sources: + +- *Online endpoints for real-time inference* +- *Online endpoints YAML reference* +- *ManagedOnlineEndpoint class* +- *KubernetesOnlineEndpoint class* + +URLs: + +- https://learn.microsoft.com/en-us/azure/machine-learning/concept-endpoints-online?view=azureml-api-2 +- https://learn.microsoft.com/en-us/azure/machine-learning/reference-yaml-endpoint-online?view=azureml-api-2 +- https://learn.microsoft.com/en-us/python/api/azure-ai-ml/azure.ai.ml.entities.managedonlineendpoint?view=azure-python +- https://learn.microsoft.com/en-us/python/api/azure-ai-ml/azure.ai.ml.entities.kubernetesonlineendpoint?view=azure-python + +Rule consequence: + +1. Kubernetes online endpoints are out of scope. +2. Managed scope should be established from documented endpoint/deployment surfaces only. +3. If managed-vs-kubernetes scope cannot be established reliably, skip rather than infer. + +### 3.3 Endpoint and deployment control-plane fields expose the stable and billing-relevant surfaces + +Microsoft documents online endpoint and deployment fields including: + +- endpoint `location` +- endpoint `kind` +- endpoint `properties.provisioningState` +- endpoint `systemData.createdAt` +- deployment `properties.provisioningState` +- deployment type/class surfaces in SDK/REST when present +- deployment `properties.instanceType` +- deployment `properties.instanceCount` +- deployment scale settings + +Sources: + +- *Online Endpoints - List* +- *Online Endpoints - Get* +- *Online Deployments - List* +- *Online Deployments - Get* +- *ManagedOnlineDeployment class* +- *TargetUtilizationScaleSettings class* + +URLs: + +- https://learn.microsoft.com/en-us/rest/api/azureml/online-endpoints/list?view=rest-azureml-2025-06-01 +- https://learn.microsoft.com/en-us/rest/api/azureml/online-endpoints/get?view=rest-azureml-2025-06-01 +- https://learn.microsoft.com/en-us/rest/api/azureml/online-deployments/list?view=rest-azureml-2025-06-01 +- 
https://learn.microsoft.com/en-us/rest/api/azureml/online-deployments/get?view=rest-azureml-2025-06-01 +- https://learn.microsoft.com/en-us/python/api/azure-ai-ml/azure.ai.ml.entities.managedonlinedeployment?view=azure-python +- https://learn.microsoft.com/en-us/python/api/azure-ai-ml/azure.ai.ml.entities.targetutilizationscalesettings?view=azure-python + +Rule consequence: + +1. The rule should evaluate only stable endpoints and stable deployments: exact `provisioningState == "Succeeded"`. +2. Managed scope should be established primarily from documented endpoint kind/class surfaces; deployment type/class hints may establish managed scope only when endpoint-level scope is absent, and otherwise only reinforce or contradict it, exactly as defined in section `9.1`. +3. The rule should require a known positive deployment baseline instance count from documented deployment surfaces before emitting. + +### 3.4 Azure Monitor documents endpoint-scope request traffic via `RequestsPerMinute` + +Microsoft documents that `Microsoft.MachineLearningServices/workspaces/onlineEndpoints` exposes endpoint-scope traffic metrics including: + +- `RequestsPerMinute` +- `RequestLatency` +- `ConnectionsActive` +- `NetworkBytes` + +Microsoft further documents: + +- `RequestsPerMinute` is the endpoint request-count signal +- `RequestsPerMinute` uses `Average` +- `RequestsPerMinute` supports `PT1M` +- endpoint metrics are scoped to the online endpoint resource + +Sources: + +- *Supported metrics for Microsoft.MachineLearningServices/workspaces/onlineEndpoints* +- *Monitor online endpoints* +- *Autoscale online endpoints* + +URLs: + +- https://learn.microsoft.com/en-us/azure/azure-monitor/reference/supported-metrics/microsoft-machinelearningservices-workspaces-onlineendpoints-metrics +- https://learn.microsoft.com/en-us/azure/machine-learning/how-to-monitor-online-endpoints?view=azureml-api-2 +- https://learn.microsoft.com/en-us/azure/machine-learning/how-to-autoscale-endpoints?view=azureml-api-2 + +Rule consequence: + +1. `RequestsPerMinute` is the canonical idle-traffic signal for this rule. +2. 
The metric query should target the **endpoint ARM resource id**, not the workspace. +3. Workspace-level request metrics and undocumented fallback metrics such as `RequestCount` or `ModelEndpointRequests` must not be used to prove endpoint idleness. +4. If the documented endpoint metric cannot be resolved reliably, the endpoint must be skipped. + +### 3.5 Managed online endpoint traffic routing can include multiple deployments and mirror traffic + +Microsoft documents endpoint traffic routing and mirrored traffic across deployments for online endpoints. + +Sources: + +- *Safely roll out online endpoints* +- *Online endpoints YAML reference* + +URLs: + +- https://learn.microsoft.com/en-us/azure/machine-learning/how-to-safely-rollout-online-endpoints?view=azureml-api-2 +- https://learn.microsoft.com/en-us/azure/machine-learning/reference-yaml-endpoint-online?view=azureml-api-2 + +Rule consequence: + +1. A zero-traffic finding must be based on the endpoint-level request metric across the whole endpoint. +2. The rule must not infer idleness from deployment routing percentages alone. +3. Multiple deployments under one endpoint do not weaken the endpoint-scope request metric when that metric resolves reliably. + +--- + +## 4. Detection Goal + +Emit only when the endpoint passes every rule in section **8**. Section **8** is the single source of truth for decisioning; sections **7** and **9** define how inputs are normalized and evaluated. + +--- + +## 5. Non-Goals + +This rule does **not** attempt to prove: + +- that deleting the endpoint is safe +- that no future rollout, failover, or standby use is planned +- that autoscale policies outside the inspected deployment surfaces will never change live instance count +- that no deployment-specific operational need exists +- that a specific monthly saving exists + +--- + +## 6. 
Canonical Inputs + +### 6.1 Required surfaces + +| Surface | Purpose | +|---|---| +| AML workspace inventory | enumerate candidate workspaces | +| online endpoint list/get for each workspace | determine endpoint identity, region, kind, provisioning state, and age | +| online deployment list/get for each endpoint | determine managed-vs-kubernetes scope, deployment stability, instance type, and baseline instance counts | +| Azure Monitor metrics on the endpoint ARM id | determine endpoint request traffic using documented endpoint-scope metrics | + +### 6.2 Authentication / permissions + +Minimum permissions: + +- `Microsoft.MachineLearningServices/workspaces/read` +- `Microsoft.MachineLearningServices/workspaces/onlineEndpoints/read` +- `Microsoft.MachineLearningServices/workspaces/onlineEndpoints/deployments/read` +- `Microsoft.Insights/metrics/read` + +No secret, key, request-payload, or model retrieval is required for this rule. + +### 6.3 Idle window + +- Configurable parameter: `idle_days` +- Default: `7` +- Minimum effective value: `1` + +Reason: + +- Managed online endpoints are low-latency serving infrastructure and can legitimately be quiet for short periods. +- A one-week default window is conservative enough to avoid flagging brief pauses while still surfacing continuously provisioned endpoints with no observed request traffic. + +--- + +## 7. Normalization Contract + +| Field | Normalization | +|---|---| +| `location` | Resolve from documented endpoint resource location surfaces only. If unresolved, skip. Lowercase before comparison, then compare by exact lowercase equality only. | +| `managed_scope` | Treat the endpoint as managed only by the exact rules in section `9.1`. | +| `managed_scope_source` | Observability field derived from section `9.1`. Allowed values: `endpoint`, `deployment`, or `none`. It records which surface established managed scope; it does not add new decision logic beyond section `9.1`. 
| +| `provisioning_state` | Resolve from documented SDK/raw surfaces and compare case-sensitively to exact `"Succeeded"`. | +| `created_at` | Parse as a UTC instant from documented `systemData.createdAt` or equivalent SDK projection. If the chosen field is present but unparsable, skip. | +| `deployment_provisioning_state` | Resolve from documented deployment surfaces and compare case-sensitively to exact `"Succeeded"`. | +| `baseline_instance_count` | Resolve in this exact order from documented deployment configuration surfaces: `scale_settings.min_instances`, then `instance_count`, otherwise unknown. Only a known integer `> 0` is billing-relevant for this rule. `0`, invalid, or unresolvable values are not enough to emit. | +| `instance_type` | Preserve raw documented value. Use only for descriptive details and GPU risk classification; it must **not** determine managed scope or billing relevance. GPU classification should use uppercase normalization and exact prefix matching on `STANDARD_NC`, `STANDARD_ND`, and `STANDARD_NV`. `null` / absent `instance_type` is non-GPU for risk purposes. | +| `requests_per_minute_zero` | `True` only when section `9.5` evaluates the endpoint metric result to `ZERO`. | +| `tags` | `endpoint.tags or {}` - never `None` in output. | + +--- + +## 8. 
Unified Decision Rule + +| # | Condition | Action | +|---|---|---| +| 8.1 | `endpoint.id` absent, `None`, or empty | Skip | +| 8.2 | `endpoint.name` absent, `None`, or empty | Skip | +| 8.3 | `workspace.name` absent, `None`, or empty | Skip | +| 8.4 | Region filter set and normalized endpoint location does not match | Skip | +| 8.5 | `managed_scope` is not established per section `9.1` | Skip | +| 8.6 | Endpoint `provisioning_state` does not resolve to `"Succeeded"` | Skip | +| 8.7 | Endpoint `created_at` is absent, invalid, in the future, or younger than the effective `idle_days` window | Skip | +| 8.8 | Deployment inventory cannot be resolved reliably | Skip | +| 8.9 | No stable deployment under the managed endpoint resolves to a known positive baseline instance count | Skip | +| 8.10 | Endpoint traffic metric result is not `ZERO` per section `9.5` | Skip | +| 8.11 | All required signals resolve and the managed endpoint has `RequestsPerMinute == 0` across the effective window while retaining positive baseline deployment instances | **EMIT** | + +--- + +## 9. Canonical Evaluation Contracts + +### 9.1 Scope and stable-endpoint contract + +Required behavior: + +1. Resolve endpoint-level scope only from documented endpoint class/type or kind surfaces. +2. Resolve deployment-level scope hints only from explicit documented deployment class/type surfaces on stable deployments; never infer them from unrelated fields. +3. Section `9.1` is the single source of truth for managed-scope resolution. +4. Endpoint-level explicit signals take precedence over deployment-level signals for managed-scope determination, with one exception: an explicit Kubernetes signal on any stable deployment under an explicitly managed endpoint is a conflict and forces a skip (see items 5 and 6). +5. Scope priority is strict: + 1. endpoint-level explicit Kubernetes -> out of scope + 2. endpoint-level explicit managed -> in scope unless any stable deployment explicitly identifies Kubernetes + 3. if endpoint-level scope is absent, stable-deployment explicit managed -> in scope only when no stable deployment explicitly identifies Kubernetes + 4. 
otherwise out of scope +6. If explicit managed and explicit Kubernetes signals both appear across the allowed endpoint/deployment scope surfaces, skip. +7. If no explicit managed signal can be established from the allowed surfaces, skip. +8. Set `managed_scope_source = "endpoint"` when endpoint-level explicit managed evidence establishes scope. +9. Set `managed_scope_source = "deployment"` when stable-deployment explicit managed evidence establishes scope. +10. Set `managed_scope_source = "none"` when managed scope is not established. +11. Endpoint `provisioning_state` must resolve to exact `"Succeeded"`. + +### 9.2 Location and age contract + +Required behavior: + +1. Use the endpoint resource's documented location, not the workspace location, for filtering and reporting. +2. Endpoint `created_at` must resolve to a known UTC timestamp. +3. `created_at` in the future must skip. +4. Endpoint age must be at least the effective `idle_days` window. + +### 9.3 Billing relevance gate (configured capacity proxy only) + +Required behavior: + +1. Deployment inventory must resolve successfully for the endpoint. +2. Only deployments with exact `deployment_provisioning_state == "Succeeded"` may contribute to baseline-instance evidence. +3. A deployment may contribute to baseline-instance evidence only when it is under an endpoint already established as managed per section `9.1`. +4. For each remaining candidate deployment, resolve `baseline_instance_count` in this exact order: `scale_settings.min_instances`, then `instance_count`, otherwise unknown. +5. A deployment is billing-relevant only when its resolved `baseline_instance_count` is a known integer greater than zero. +6. An endpoint is billing-relevant only when at least one deployment is billing-relevant. +7. Endpoints whose deployments all clearly resolve to zero baseline instances must skip. +8. If deployment configuration is too incomplete to establish any billing-relevant deployment reliably, skip. 
+ +Rationale: + +This rule is about endpoints that appear to retain billable baseline serving instances while unused. `baseline_instance_count` is treated as configured retained-capacity intent, not proof of live runtime capacity at every instant, and not as a proxy for deployment existence by itself. If the deployment surfaces do not clearly show positive retained baseline instances, the rule must fail closed. This intentionally skips scale-to-zero, autoscale-min-zero, or otherwise unproven retained-capacity cases, accepting false negatives to avoid over-claiming idle cost. This rule does not attempt to model transient autoscale cost when a positive retained baseline cannot be proven from the required control-plane surfaces. + +### 9.4 Instance-type and GPU contract + +Required behavior: + +1. Preserve the first available documented `instance_type` for details, but consider all billing-relevant deployments when classifying GPU presence. +2. GPU classification uses uppercase-normalized `instance_type` and exact prefix matching on `STANDARD_NC`, `STANDARD_ND`, and `STANDARD_NV`. +3. Unknown or absent `instance_type` must not be treated as GPU by default. +4. `instance_type` must not be used to determine managed scope or billing relevance. + +### 9.5 Endpoint traffic-metric contract + +Required metric: + +1. 
`RequestsPerMinute` with `Average` + +Definitions: + +- **effective idle window**: `max(idle_days, 1)`, expressed as a whole number of days +- **now_utc**: the evaluation time captured as a UTC timestamp +- **metric_end_utc**: `floor_to_minute(now_utc - 5 minutes)` +- **window_start_utc**: `metric_end_utc - effective idle window`, where the effective idle window is subtracted as that many 24-hour days +- **UTC-normalized timestamp**: a parsed timestamp converted to UTC before any comparison +- **usable datapoint**: a datapoint with a parseable UTC timestamp inside the requested window and a numeric `Average` value +- **acceptable coverage**: at least 80% of complete minute buckets in `[window_start_utc, metric_end_utc)` have usable datapoints +- **idle_duration**: `floor((metric_end_utc - window_start_utc).total_seconds() / 86400)`, the window length in whole days +- **idle_since_days**: derived constant equal to the configured effective idle window when the metric result is `ZERO`; it is not an observational duration estimate beyond that accepted window +- **metric_result**: one of `ACTIVE`, `ZERO`, or `UNKNOWN` + +Required behavior: + +1. Query the documented `RequestsPerMinute` metric on the **endpoint ARM resource id**. +2. Use the documented `PT1M` granularity and `Average` aggregation. +3. Evaluate only complete minute buckets in `[window_start_utc, metric_end_utc)`. +4. This is a rolling UTC-aligned window, not a calendar-day window. +5. Do **not** use workspace-scope metric fallback. +6. Do **not** use undocumented or legacy request metrics such as `RequestCount` or `ModelEndpointRequests` to prove idleness. +7. If any usable bucket has `Average > 0`, then `metric_result = ACTIVE`. +8. If the query fails, the metric is missing, the response shape is unusable, no usable datapoints exist, or coverage is below threshold, then `metric_result = UNKNOWN`. +9. Treat the metric as `ZERO` only when coverage is acceptable and every usable bucket has `Average == 0`. +10. When `metric_result = ZERO`, set `idle_since_days` to the effective idle window represented by `[window_start_utc, metric_end_utc)`. +11. 
No secondary metric may substitute for `RequestsPerMinute`; related metrics such as latency or connections are observability-only and must not override the canonical traffic result. +12. Missing buckets, late-arriving datapoints, and sparse telemetry affect only coverage calculation; if resulting coverage is below threshold, `metric_result = UNKNOWN`. + +Rationale: + +This metric contract is intentionally conservative. Azure Monitor ingestion delay or sparse minute coverage may increase skips, but the rule still prefers a documented zero-request result over unsupported metric fallbacks. The single-metric dependency is intentional because the Azure documentation grounds request absence most directly in `RequestsPerMinute`. + +### 9.6 Risk and confidence contract + +Risk: + +1. `HIGH` when any billing-relevant deployment is GPU-classified +2. `MEDIUM` otherwise + +Confidence: + +1. `HIGH` when all required endpoint and deployment signals resolve and metric coverage is at least 95% for a `ZERO` result +2. `MEDIUM` when all required endpoint and deployment signals resolve and metric coverage is at least 80% but below 95% for a `ZERO` result + +Rationale: + +This rule should fail closed rather than emit from unknown traffic evidence, but confidence still reflects metric quality within the acceptable coverage band. Confidence does not override the `ZERO` requirement; it refines finding strength only after emit conditions are met. + +--- + +## 10. Cost Model + +`estimated_monthly_cost_usd = None` + +Mandatory rules: + +1. Do **not** use flat hardcoded VM price tables. +2. Do **not** claim exact monthly savings from management metadata alone. +3. State only that managed deployments retaining positive instance baselines continue to incur compute cost while provisioned. +4. If relevant, note that networking-related charges can also apply. + +--- + +## 11. 
Finding Shape + +### 11.1 Required fields + +| Field | Value | +|---|---| +| `provider` | `"azure"` | +| `rule_id` | `"azure.ml.online_endpoint.idle"` | +| `resource_type` | `"azure.ml.online_endpoint"` | +| `resource_id` | original ARM id from `endpoint.id` | +| `region` | normalized endpoint location | +| `confidence` | derived from section `9.6` | +| `estimated_monthly_cost_usd` | `None` | + +### 11.2 Required evidence + +`signals_used` must clearly disclose: + +1. managed scope was established from the allowed endpoint/deployment scope surfaces +2. endpoint provisioning state is `"Succeeded"` +3. endpoint age is at least the configured idle window +4. one or more deployments under the managed endpoint retain positive configured baseline instances +5. the documented `RequestsPerMinute` metric result is `ZERO` across the rolling UTC window defined in section `9.5` + +`signals_not_checked` should include remaining blind spots such as: + +1. future traffic intent or standby usage +2. autoscale policies or live instance state not fully visible from the inspected deployment configuration surfaces +3. exact endpoint/deployment cost after discounts, reservations, or special commercial terms +4. business-owner intent or rollout plans + +### 11.3 Required details + +Details should include at least: + +- `endpoint_name` +- `workspace_name` +- `resource_group` +- `subscription_id` +- `location` +- `endpoint_kind` (if present) +- `managed_scope_source` +- `endpoint_provisioning_state` +- `created_at` +- `billing_relevant_deployment_count` +- `deployment_count` +- `instance_type` +- `is_gpu` +- `baseline_instance_count_total` +- `idle_days_threshold` +- `idle_since_days` +- `metric_name` +- `metric_aggregation` +- `metric_coverage_ratio` +- `tags` + +`managed_scope_source`, `deployment_count`, `instance_type`, `is_gpu`, and `metric_coverage_ratio` are observability fields for reviewer context. They do not add gating logic beyond sections `8` and `9`. + +--- + +## 12. 
Failure Behavior + +- If subscription-wide workspace inventory fails, let the exception propagate +- If per-workspace endpoint listing fails, skip that workspace +- If per-endpoint deployment listing or metric retrieval fails, skip that endpoint +- If endpoint or deployment records are malformed or missing required fields, skip that endpoint +- Do not emit on partial, workspace-level, legacy-metric, or age-only traffic evidence diff --git a/docs/specs/azure/ai/openai_provisioned_idle.md b/docs/specs/azure/ai/openai_provisioned_idle.md new file mode 100644 index 0000000..6e09246 --- /dev/null +++ b/docs/specs/azure/ai/openai_provisioned_idle.md @@ -0,0 +1,437 @@ +# Azure Rule Spec - `azure.openai.provisioned_deployment.idle` + +## 1. Rule Identity + +- **Rule ID:** `azure.openai.provisioned_deployment.idle` +- **Provider:** Azure +- **ARM resource type:** `Microsoft.CognitiveServices/accounts/deployments` +- **Finding resource_type:** `azure.openai.provisioned_deployment` + +--- + +## 2. Intent + +Detect **Azure OpenAI provisioned deployments that retain billable PTU capacity while showing no observed Azure OpenAI request traffic** over a conservative documented observation window. + +This rule is deliberately **precision-first**. It is **not** proof that deleting a deployment is safe, **not** proof that a reservation can be canceled without consequence, and **not** proof of an exact monthly saving. It is a conservative review-candidate rule for provisioned Azure OpenAI deployments that appear to be continuously billed but unused. + +--- + +## 3. Azure Documentation Grounding + +### 3.1 Provisioned throughput bills on deployed PTUs whether used or not + +Microsoft documents that provisioned throughput: + +1. allocates model processing capacity once deployed +2. is sized in Provisioned Throughput Units (PTUs) +3. is billed hourly based on the number of deployed PTUs +4. can receive substantial discount through Azure reservations +5. 
continues to hold capacity while the deployment exists + +Microsoft also documents that: + +1. deleting a deployment avoids unwanted deployment charges +2. deleting the parent resource before deleting or purging deployments can allow charges to continue +3. reservations and deployments are loosely coupled, so deleting a deployment does not cancel or change a PTU reservation + +Sources: + +- *What is provisioned throughput for Foundry Models?* +- *Provisioned throughput unit (PTU) costs and billing* +- *Get started with provisioned deployments in Microsoft Foundry* + +URLs: + +- https://learn.microsoft.com/en-us/azure/foundry/openai/concepts/provisioned-throughput +- https://learn.microsoft.com/en-us/azure/foundry/openai/how-to/provisioned-throughput-onboarding +- https://learn.microsoft.com/en-us/azure/foundry/openai/how-to/provisioned-get-started + +Rule consequence: + +1. The rule should evaluate only deployments that clearly retain provisioned PTU capacity. +2. The rule may state that deployed PTUs continue to incur hourly cost while the deployment exists. +3. The rule must **not** hardcode a fixed monthly PTU price or claim exact savings from management metadata alone. +4. The rule must not assume idle findings are immediately avoidable cost because reservation coverage and deleted-resource purge state are separate concerns. 
+ +### 3.2 Provisioned deployment scope is established from documented deployment SKU and model surfaces + +Microsoft documents that provisioned deployment types map to these `sku-name` values: + +- `ProvisionedManaged` +- `GlobalProvisionedManaged` +- `DataZoneProvisionedManaged` + +Microsoft also documents deployment-management surfaces under `Microsoft.CognitiveServices/accounts/deployments`, including: + +- deployment `id` +- deployment `name` +- deployment `sku.name` +- deployment `sku.capacity` +- deployment `properties.model.format` +- deployment `properties.model.name` +- deployment `properties.model.version` +- deployment `properties.provisioningState` +- deployment `systemData.createdAt` + +Account-management surfaces include: + +- account `id` +- account `name` +- account `location` +- account `properties.provisioningState` + +Sources: + +- *What is provisioned throughput for Foundry Models?* +- *Get started with provisioned deployments in Microsoft Foundry* +- *Deployments - List (Azure AI Services REST API)* +- *Accounts - List (Azure AI Services REST API)* + +URLs: + +- https://learn.microsoft.com/en-us/azure/foundry/openai/concepts/provisioned-throughput +- https://learn.microsoft.com/en-us/azure/foundry/openai/how-to/provisioned-get-started +- https://learn.microsoft.com/en-us/rest/api/aiservices/accountmanagement/deployments/list?view=rest-aiservices-accountmanagement-2024-10-01 +- https://learn.microsoft.com/en-us/rest/api/aiservices/accountmanagement/accounts/list?view=rest-aiservices-accountmanagement-2024-10-01 + +Rule consequence: + +1. Provisioned scope must be established from the documented deployment SKU only. +2. OpenAI scope must be established from the documented deployment model format, not inferred from names or tags. +3. The rule should evaluate only stable deployments: exact `properties.provisioningState == "Succeeded"`. +4. A deployment is billing-relevant only when `sku.capacity` resolves to a known integer greater than zero. +5. 
Region filtering and reporting should use the documented **parent account location**, because the deployment list surface does not document a deployment-level location in the sample contract. + +### 3.3 Azure Monitor documents the canonical request metric and warns against legacy Cognitive Services metrics + +Microsoft documents that `Microsoft.CognitiveServices/accounts` exposes Azure OpenAI metrics including: + +- `AzureOpenAIRequests` +- `ActiveTokens` +- `ProcessedPromptTokens` +- `GeneratedTokens` +- `AzureOpenAIProvisionedManagedUtilizationV2` + +Microsoft further documents that: + +1. `AzureOpenAIRequests` is the metric for number of Azure OpenAI API calls over time +2. `AzureOpenAIRequests` uses `Total (Sum)` +3. `AzureOpenAIRequests` supports `PT1M` +4. `AzureOpenAIRequests` supports dimensions including `ModelDeploymentName`, `StatusCode`, `IsSpillover`, `ServiceTierRequest`, and `ServiceTierResponse` +5. legacy **Cognitive Services - HTTP Requests** metrics such as `TotalCalls`, `SuccessfulCalls`, and `ServerErrors` should **not** be used for Azure OpenAI +6. `Provisioned-managed Utilization V2` is a utilization metric, not a request-count metric + +Sources: + +- *Monitoring data reference for Azure OpenAI* +- *Supported metrics - Microsoft.CognitiveServices/accounts - Azure Monitor* + +URLs: + +- https://learn.microsoft.com/en-us/azure/foundry/openai/monitor-openai-reference +- https://learn.microsoft.com/en-us/azure/azure-monitor/reference/supported-metrics/microsoft-cognitiveservices-accounts-metrics + +Rule consequence: + +1. `AzureOpenAIRequests` is the canonical idle-traffic signal for this rule. +2. The metric query must target the **parent account ARM resource id**, not the deployment ARM id. +3. The rule must scope the metric to the deployment using the documented `ModelDeploymentName` dimension with exact deployment-name equality. +4. 
After that deployment filter is applied, the rule must evaluate activity across **all** remaining metric dimensions together; it must not require separate per-status, per-spillover, or per-service-tier zero proofs. +5. The rule must treat **any** positive request count as activity, regardless of `StatusCode`, spillover status, or service-tier response. +6. Token metrics, utilization metrics, and legacy Cognitive Services request metrics are observability-only for this rule and must not substitute for `AzureOpenAIRequests`. + +### 3.4 Provisioned deployments can spill over or return 429 under saturation, but those still represent request activity + +Microsoft documents that: + +1. provisioned deployments return HTTP `429` when utilization is at or above capacity +2. `AzureOpenAIRequests` can be broken down by `StatusCode` +3. `AzureOpenAIRequests` can be broken down by `IsSpillover` + +Sources: + +- *What is provisioned throughput for Foundry Models?* +- *Monitoring data reference for Azure OpenAI* + +URLs: + +- https://learn.microsoft.com/en-us/azure/foundry/openai/concepts/provisioned-throughput +- https://learn.microsoft.com/en-us/azure/foundry/openai/monitor-openai-reference + +Rule consequence: + +1. The rule must treat request attempts that result in 429 or other statuses as activity when `AzureOpenAIRequests` is positive. +2. The rule must not ignore spillover-tagged request traffic when evaluating idleness. +3. The rule should aggregate over all request statuses and spillover states rather than filtering them away. + +--- + +## 4. Detection Goal + +Emit only when the deployment passes every rule in section **8**. Section **8** is the single source of truth for decisioning; sections **7** and **9** define normalization and evaluation contracts. + +--- + +## 5. 
Non-Goals + +This rule does **not** attempt to prove: + +- that the deployment can be deleted safely +- that no failover, spillover, or standby purpose exists +- that no reservation or commitment discount is attached to the deployment +- that no future workload will return to this deployment +- that utilization, latency, or token metrics are healthy or unhealthy +- that a specific monthly dollar saving exists + +--- + +## 6. Canonical Inputs + +### 6.1 Required surfaces + +| Surface | Purpose | +|---|---| +| Cognitive Services account inventory | enumerate candidate parent accounts and obtain account identity, location, and stable account state | +| account deployment list/get | determine deployment identity, model format, model name/version, SKU, PTU capacity, provisioning state, and creation time | +| Azure Monitor metrics on the parent account ARM id | determine deployment-scoped Azure OpenAI request activity via `ModelDeploymentName` | + +### 6.2 Authentication / permissions + +Minimum permissions: + +- `Microsoft.CognitiveServices/accounts/read` +- `Microsoft.CognitiveServices/accounts/deployments/read` +- `Microsoft.Insights/metrics/read` + +No key, prompt, completion payload, token log, or data-plane inference call is required for this rule. + +### 6.3 Idle window + +- Configurable parameter: `idle_days` +- Default: `7` +- Minimum effective value: `1` + +Reason: + +- Provisioned deployments are designed for stable production throughput and are billed whether used or not once deployed. +- A one-week default is conservative enough to avoid flagging brief pauses while still surfacing deployments that appear continuously billed with no observed request traffic. + +--- + +## 7. Normalization Contract + +| Field | Normalization | +|---|---| +| `account_location` | Resolve from documented parent account `location`. If unresolved, skip. Lowercase before comparison, then compare by exact lowercase equality only. Do not remove spaces, hyphens, or digits. 
| +| `account_provisioning_state` | Resolve from documented account `properties.provisioningState` and compare case-sensitively to exact `"Succeeded"`. | +| `deployment_provisioning_state` | Resolve from documented deployment `properties.provisioningState` and compare case-sensitively to exact `"Succeeded"`. | +| `model_format` | Resolve from documented deployment `properties.model.format` and compare case-sensitively to exact `"OpenAI"`. | +| `sku_name` | Resolve from documented deployment `sku.name`. The only in-scope values are exact `ProvisionedManaged`, `GlobalProvisionedManaged`, and `DataZoneProvisionedManaged`. | +| `ptu_capacity` | Resolve from documented deployment `sku.capacity` as an integer. Only a known integer `> 0` is billing-relevant for this rule. | +| `created_at` | Parse as a UTC instant from documented deployment `systemData.createdAt` or equivalent SDK projection. If the chosen field is present but unparsable, skip. | +| `age_days` | `floor((now_utc - created_at_utc) / 86400 seconds)` using the normalized `created_at`. | +| `account_kind` | Preserve the raw documented account kind if present for reviewer context only; it must not establish OpenAI scope by itself. | +| `model_name` | Preserve raw documented deployment model name. | +| `model_version` | Preserve raw documented deployment model version. | +| `requests_metric_zero` | `True` only when section `9.3` evaluates the canonical metric result to `ZERO`. | +| `tags` | Prefer deployment tags when present; otherwise `{}`. Do not emit `None`. | + +--- + +## 8. 
Unified Decision Rule + +| # | Condition | Action | +|---|---|---| +| 8.1 | parent account `id` absent, `None`, or empty | Skip | +| 8.2 | parent account `name` absent, `None`, or empty | Skip | +| 8.3 | deployment `id` absent, `None`, or empty | Skip | +| 8.4 | deployment `name` absent, `None`, or empty | Skip | +| 8.5 | parent account location is unresolved | Skip | +| 8.6 | region filter set and normalized account location does not match | Skip | +| 8.7 | parent account provisioning state does not resolve to exact `"Succeeded"` | Skip | +| 8.8 | deployment provisioning state does not resolve to exact `"Succeeded"` | Skip | +| 8.9 | deployment model format does not resolve to exact `"OpenAI"` | Skip | +| 8.10 | deployment SKU name is not one of the documented provisioned-managed SKU names | Skip | +| 8.11 | PTU capacity is absent, invalid, zero, or negative | Skip | +| 8.12 | `created_at` is absent, invalid, in the future, or younger than the effective idle window | Skip | +| 8.13 | deployment-scoped `AzureOpenAIRequests` metric result is not `ZERO` per section `9.3` | Skip | +| 8.14 | all required signals resolve and the provisioned OpenAI deployment shows zero observed request traffic across the effective window while retaining positive PTU capacity | **EMIT** | + +--- + +## 9. Canonical Evaluation Contracts + +### 9.1 Scope and stable-state contract + +Required behavior: + +1. Account inventory must come from documented Cognitive Services account inventory surfaces. +2. Evaluate only parent accounts whose documented `account_provisioning_state` resolves to exact `"Succeeded"`. +3. Deployment inventory must come from documented account deployment list/get surfaces. +4. A deployment is in scope for this rule only when its documented `model_format` resolves to exact `"OpenAI"`. +5. A deployment is provisioned for this rule only when `sku_name` resolves to one of: + 1. `ProvisionedManaged` + 2. `GlobalProvisionedManaged` + 3. `DataZoneProvisionedManaged` +6. 
A deployment is stable only when `deployment_provisioning_state == "Succeeded"`. +7. A deployment is billing-relevant only when `ptu_capacity` is a known integer greater than zero. +8. `account_kind` may be preserved in details, but it must not override or replace the deployment-level `model_format` gate. + +### 9.2 Location and age contract + +Required behavior: + +1. Use the parent account location for region filtering and finding region. +2. Deployment `created_at` must resolve to a known UTC timestamp. +3. `created_at` in the future must skip. +4. Deployment age must be at least the effective `idle_days` window. +5. No weaker age-only fallback may substitute for missing or unknown traffic evidence. + +### 9.3 Deployment traffic-metric contract + +Required metric: + +1. `AzureOpenAIRequests` with `Total` + +Definitions: + +- **effective idle window**: `max(idle_days, 1)` +- **now_utc**: the evaluation time captured as a UTC timestamp +- **metric_end_utc**: `floor_to_minute(now_utc - 5 minutes)` +- **window_start_utc**: `metric_end_utc - effective idle window`, where the effective idle window is a whole number of days +- **usable datapoint**: a datapoint with a parseable UTC timestamp inside the requested window and a numeric `Total` value after the deployment filter from this section is applied +- **bucketed datapoint**: a usable datapoint assigned to its complete UTC minute bucket by flooring its timestamp to the minute after deployment scoping +- **usable minute bucket**: a unique complete UTC minute in `[window_start_utc, metric_end_utc)` for which at least one bucketed datapoint exists after deployment scoping +- **bucket_total**: the sum of `Total` across all bucketed datapoints that land in the same usable minute bucket after deployment scoping and bucket assignment +- **acceptable coverage**: at least 80% of complete minute buckets in `[window_start_utc, metric_end_utc)` are represented by usable minute buckets +- **metric_result**: one of `ACTIVE`, `ZERO`, or `UNKNOWN` +- **idle_since_days**: derived constant 
equal to the configured effective idle window when `metric_result = ZERO`; it is not an observational duration claim beyond that accepted window + +Required behavior: + +1. Query the documented `AzureOpenAIRequests` metric on the **parent account ARM resource id**. +2. Scope the query to the deployment using the documented `ModelDeploymentName` dimension with exact deployment-name equality. +3. Use the documented `PT1M` granularity and `Total` aggregation. +4. Evaluate only complete minute buckets in `[window_start_utc, metric_end_utc)`. +5. This is a rolling UTC-aligned window, not a calendar-day window. +6. The deterministic evaluation pipeline is: + 1. apply exact `ModelDeploymentName` deployment scoping + 2. enumerate every datapoint from every remaining metric series after that deployment scoping, without any additional pre-aggregation by status, spillover state, service tier, or other remaining dimensions + 3. discard non-usable datapoints + 4. assign each remaining datapoint to its complete UTC minute bucket + 5. sum `Total` within each minute bucket across all remaining dimensions to produce the final `bucket_total` for that minute + 6. identify unique usable minute buckets from those finalized per-minute buckets + 7. compute coverage from the set of unique usable minute buckets identified in step 6, that is, after the per-minute sums of step 5 are final +7. After deployment scoping, aggregate activity across all remaining dimensions only by summing `Total` into final per-minute `bucket_total` values; do **not** require separate per-status, per-spillover, or per-service-tier zero proofs. +8. Coverage calculation must be based on the unique usable minute buckets that remain after final per-minute `bucket_total` values are computed, not on raw datapoint count, so duplicate points or multiple remaining dimension series cannot overstate completeness. +9. Do **not** filter away request statuses, spillover states, or service-tier values when determining activity; any positive `AzureOpenAIRequests` count is activity. 
+10. If any usable minute bucket has `bucket_total > 0`, then `metric_result = ACTIVE`. +11. If the query fails, the metric is missing, the response shape is unusable, no usable datapoints exist, or coverage is below threshold, then `metric_result = UNKNOWN`. +12. Treat the metric as `ZERO` only when coverage is acceptable and every usable minute bucket has `bucket_total == 0`. +13. When `metric_result = ZERO`, set `idle_since_days` to the effective idle window represented by `[window_start_utc, metric_end_utc)`. +14. Do **not** fall back to `ProcessedPromptTokens`, `GeneratedTokens`, `TokenTransaction`, `ActiveTokens`, `AzureOpenAIProvisionedManagedUtilizationV2`, deprecated `AzureOpenAIProvisionedManagedUtilization`, or legacy Cognitive Services HTTP-request metrics. +15. Do **not** emit from age-only, utilization-only, or token-only evidence. +16. Missing buckets, late-arriving datapoints, and sparse telemetry affect only coverage calculation; if resulting coverage is below threshold, `metric_result = UNKNOWN`. + +Rationale: + +This metric contract is intentionally conservative. Microsoft documents `AzureOpenAIRequests` as the canonical Azure OpenAI request-count metric and explicitly warns against using legacy Cognitive Services request metrics for Azure OpenAI. Azure Monitor documentation does not define missing datapoints as zero traffic, so the rule must fail closed on weak or sparse telemetry rather than substitute token or utilization metrics. + +### 9.4 Risk and confidence contract + +Risk: + +1. `HIGH` for every emitted finding + +Confidence: + +1. `HIGH` when all required signals resolve and metric coverage is at least 95% for a `ZERO` result +2. `MEDIUM` when all required signals resolve and metric coverage is at least 80% but below 95% for a `ZERO` result + +Rationale: + +Provisioned deployments with known positive PTU capacity are inherently meaningful cost candidates because they are billed on deployed PTUs while active capacity exists. 
Confidence reflects telemetry quality only after all emit conditions are satisfied. This rule should never emit with `LOW` confidence. + +--- + +## 10. Cost Model + +`estimated_monthly_cost_usd = None` + +Mandatory rules: + +1. Do **not** hardcode a fixed PTU monthly estimate such as `$1,460/PTU/month`. +2. Do **not** infer exact savings from `sku.capacity` alone. +3. State only that deployed PTUs incur hourly billing while the deployment exists. +4. If relevant, note that reservation discounts, reservation coverage, and deleted-resource purge state can change effective avoidable cost. + +--- + +## 11. Finding Shape + +### 11.1 Required fields + +| Field | Value | +|---|---| +| `provider` | `"azure"` | +| `rule_id` | `"azure.openai.provisioned_deployment.idle"` | +| `resource_type` | `"azure.openai.provisioned_deployment"` | +| `resource_id` | original ARM id from deployment `id` | +| `region` | normalized parent account location | +| `confidence` | derived from section `9.4` | +| `estimated_monthly_cost_usd` | `None` | + +### 11.2 Required evidence + +`signals_used` must clearly disclose: + +1. the deployment is an OpenAI deployment with a documented provisioned-managed SKU +2. parent account and deployment provisioning states are `"Succeeded"` +3. the deployment age is at least the configured idle window +4. the deployment retains a known positive PTU capacity +5. the documented `AzureOpenAIRequests` metric result is `ZERO` across the rolling UTC window defined in section `9.3` + +`signals_not_checked` should include remaining blind spots such as: + +1. business-owner intent or planned future traffic +2. spillover/failover policy intent beyond observed request activity +3. reservation coverage, reservation cancellation implications, or other commercial commitments +4. 
client-side retries or other application semantics not visible from the inspected management and Azure Monitor surfaces + +### 11.3 Required details + +Details should include at least: + +- `account_name` +- `resource_group` +- `subscription_id` +- `account_location` +- `account_kind` (if present) +- `deployment_name` +- `deployment_provisioning_state` +- `sku_name` +- `ptu_capacity` +- `model_format` +- `model_name` +- `model_version` +- `created_at` +- `age_days` +- `idle_days_threshold` +- `idle_since_days` +- `metric_name` +- `metric_aggregation` +- `metric_coverage_ratio` +- `tags` + +`account_kind`, `model_name`, `model_version`, and `metric_coverage_ratio` are reviewer-context fields only. They do not add gating logic beyond sections `8` and `9`. + +--- + +## 12. Failure Behavior + +- If subscription-wide account inventory fails, let the exception propagate +- If per-account deployment listing fails, skip that account +- If per-deployment metric retrieval fails or produces unusable telemetry, skip that deployment +- If account or deployment records are malformed or missing required fields, skip that deployment +- Do not emit on token-only, utilization-only, legacy-metric, account-total-only, or age-only evidence diff --git a/pyproject.toml b/pyproject.toml index 98c0db2..4b05acc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "cleancloud" -version = "1.25.0" +version = "1.26.0" description = "Read-only cloud hygiene for AWS, Azure, and GCP. Multi-account org scanning, CI/CD enforcement, and deterministic cost modeling. No agents, no telemetry." 
readme = "README.md" requires-python = ">=3.10" diff --git a/tests/cleancloud/providers/azure/test_azure_ai_search_idle.py b/tests/cleancloud/providers/azure/test_azure_ai_search_idle.py index 887c769..dacbe13 100644 --- a/tests/cleancloud/providers/azure/test_azure_ai_search_idle.py +++ b/tests/cleancloud/providers/azure/test_azure_ai_search_idle.py @@ -1,664 +1,1776 @@ +"""Tests for azure.ai_search.idle rule. + +Spec: docs/specs/azure/ai/ai_search_idle.md +""" + +import math from datetime import datetime, timedelta, timezone from types import SimpleNamespace +from typing import Optional import pytest +from azure.core.exceptions import HttpResponseError, ServiceRequestError, ServiceResponseError from cleancloud.providers.azure.rules.ai.ai_search_idle import ( + _SUPPORTED_SKUS, RULE_METADATA, + _check_object_surfaces, + _evaluate_metric, + _extract_resource_group, + _MetricResult, + _norm_location, + _normalize_sku, + _resolve_capacity, + _resolve_created_at, + _resolve_provisioning_state, + _resolve_status, find_idle_ai_search_services, ) # --------------------------------------------------------------------------- -# Helpers +# Constants # --------------------------------------------------------------------------- +_SUB = "sub-123" +_RG = "rg-search" +_SVC_NAME = "my-search-svc" +_SVC_ID = ( + f"/subscriptions/{_SUB}/resourceGroups/{_RG}" + f"/providers/Microsoft.Search/searchServices/{_SVC_NAME}" +) +_WINDOW_DAYS = 90 -def _make_service( - name="test-search", - sku_name="standard", - location="eastus", - age_days=30, - replica_count=1, - partition_count=1, - rg="rg-search", -): - svc_id = ( - f"/subscriptions/sub-123/resourceGroups/{rg}" - f"/providers/Microsoft.Search/searchServices/{name}" - ) - now = datetime.now(timezone.utc) - created_at = now - timedelta(days=age_days) if age_days is not None else None - system_data = SimpleNamespace(created_at=created_at) if created_at is not None else None - return SimpleNamespace( - id=svc_id, - name=name, - 
sku=SimpleNamespace(name=sku_name), - location=location, - replica_count=replica_count, - partition_count=partition_count, - system_data=system_data, - ) +# Minimum observed day-buckets needed for 95% coverage over 90-day window. +_MIN_BUCKETS = math.ceil(_WINDOW_DAYS * 0.95) # 86 + +# Fixed clean window for unit tests: avoids fractional-day rounding sensitivity. +_UNIT_WINDOW_END = datetime(2024, 4, 1, 0, 0, 0, tzinfo=timezone.utc) +_UNIT_WINDOW_START = _UNIT_WINDOW_END - timedelta(days=_WINDOW_DAYS) -def _make_average_metric_response(average: float) -> SimpleNamespace: - """Azure Monitor response for SearchQueriesPerSecond (Average).""" - dp = SimpleNamespace(average=average) - ts = SimpleNamespace(data=[dp]) - metric = SimpleNamespace(timeseries=[ts]) +# --------------------------------------------------------------------------- +# Metric response builders +# --------------------------------------------------------------------------- + + +def _make_datapoints( + agg_attr: str = "average", + value: float = 0.0, + n: int = _WINDOW_DAYS, + window_start: Optional[datetime] = None, +) -> list: + """n daily datapoints, timestamped one hour into each day of the window.""" + if window_start is None: + window_start = datetime.now(timezone.utc) - timedelta(days=_WINDOW_DAYS) + dps = [] + for i in range(n): + ts = window_start + timedelta(days=i, hours=1) + dp = SimpleNamespace(timestamp=ts, total=None, average=None, maximum=None) + setattr(dp, agg_attr, value) + dps.append(dp) + return dps + + +def _make_metric_response(datapoints: list) -> SimpleNamespace: + ts_obj = SimpleNamespace(data=datapoints) + metric = SimpleNamespace(timeseries=[ts_obj]) return SimpleNamespace(value=[metric]) -def _make_total_metric_response(total: float) -> SimpleNamespace: - """Azure Monitor response for TotalSearchRequestCount (Total).""" - dp = SimpleNamespace(total=total) - ts = SimpleNamespace(data=[dp]) - metric = SimpleNamespace(timeseries=[ts]) +def _make_no_timeseries_response() -> 
SimpleNamespace: + metric = SimpleNamespace(timeseries=[]) return SimpleNamespace(value=[metric]) -def _make_empty_metric_response() -> SimpleNamespace: +def _make_empty_value_response() -> SimpleNamespace: return SimpleNamespace(value=[]) -def _make_no_timeseries_response() -> SimpleNamespace: - """Metric returned but no timeseries data.""" - metric = SimpleNamespace(timeseries=[]) - return SimpleNamespace(value=[metric]) +# --------------------------------------------------------------------------- +# Monitor client mock +# --------------------------------------------------------------------------- + + +def _make_zero_monitor() -> SimpleNamespace: + """Returns zero for all three required metrics (95%-covered).""" + + def _list(*args, **kwargs): + name = kwargs.get("metricnames", "") + if name == "SearchQueriesPerSecond": + return _make_metric_response(_make_datapoints("average", 0.0)) + elif name in ("DocumentsProcessedCount", "SkillExecutionCount"): + return _make_metric_response(_make_datapoints("total", 0.0)) + return _make_empty_value_response() + return SimpleNamespace(metrics=SimpleNamespace(list=_list)) -def _make_clients(service, *, avg_response=None, total_response=None): - """ - avg_response — returned for SearchQueriesPerSecond calls (default: zero-average) - total_response — returned for TotalSearchRequestCount calls (default: empty) - """ - if avg_response is None: - avg_response = _make_average_metric_response(0.0) - if total_response is None: - total_response = _make_empty_metric_response() - call_log: list[str] = [] +def _make_active_monitor(active_metric: str, agg_attr: str) -> SimpleNamespace: + """Returns non-zero for active_metric, zero for the others.""" - def _metrics_list(*args, **kwargs): + def _list(*args, **kwargs): name = kwargs.get("metricnames", "") - call_log.append(name) + if name == active_metric: + return _make_metric_response(_make_datapoints(agg_attr, 1.0)) if name == "SearchQueriesPerSecond": - return avg_response - return 
total_response + return _make_metric_response(_make_datapoints("average", 0.0)) + return _make_metric_response(_make_datapoints("total", 0.0)) + + return SimpleNamespace(metrics=SimpleNamespace(list=_list)) + + +def _make_unknown_monitor() -> SimpleNamespace: + """Returns no data (UNKNOWN) for all metrics.""" + + def _list(*args, **kwargs): + return _make_empty_value_response() + + return SimpleNamespace(metrics=SimpleNamespace(list=_list)) + + +def _make_raising_monitor(exc) -> SimpleNamespace: + """Raises exc on every metrics.list call.""" + + def _list(*args, **kwargs): + raise exc + + return SimpleNamespace(metrics=SimpleNamespace(list=_list)) + + +# --------------------------------------------------------------------------- +# Data-plane client mock +# --------------------------------------------------------------------------- + + +class _MockDpClient: + """Mock data-plane client; all required and optional surfaces empty by default.""" + + def __init__( + self, + *, + fail_required: Optional[str] = None, + non_empty_required: Optional[str] = None, + fail_optional: Optional[str] = None, + non_empty_optional: Optional[str] = None, + ): + self._fail_required = fail_required + self._non_empty_required = non_empty_required + self._fail_optional = fail_optional + self._non_empty_optional = non_empty_optional + + def _req(self, key: str): + if key == self._fail_required: + raise RuntimeError(f"simulated failure on required surface '{key}'") + if key == self._non_empty_required: + return [SimpleNamespace(name="item1")] + return [] + + def _opt(self, key: str): + if key == self._fail_optional: + raise RuntimeError(f"simulated failure on optional surface '{key}'") + if key == self._non_empty_optional: + return [SimpleNamespace(name="item1")] + return [] + + def list_indexes(self): + return self._req("indexes") + + def list_indexers(self): + return self._req("indexers") + + def list_data_source_connections(self): + return self._req("data_sources") + + def 
list_skillsets(self): + return self._req("skillsets") + + def list_synonym_maps(self): + return self._req("synonym_maps") + + def list_aliases(self): + return self._opt("aliases") + + def list_knowledge_sources(self): + return self._opt("knowledge_sources") + + def list_agents(self): + return self._opt("agents") + + +def _make_dp_factory(**kwargs) -> callable: + return lambda endpoint: _MockDpClient(**kwargs) + + +def _make_none_dp_factory() -> callable: + """Factory that always returns None (package unavailable).""" + return lambda endpoint: None + + +# --------------------------------------------------------------------------- +# Service builder +# --------------------------------------------------------------------------- - search_client = SimpleNamespace( - services=SimpleNamespace(list_by_subscription=lambda: [service]) + +def _make_service( + *, + name: str = _SVC_NAME, + svc_id: Optional[str] = None, + location: str = "eastus", + sku_name: str = "standard", + provisioning_state: str = "succeeded", + status: str = "running", + age_days: int = 90, + replica_count: int = 1, + partition_count: int = 1, + tags: Optional[dict] = None, + hosting_mode: Optional[str] = None, + rg: str = _RG, +) -> SimpleNamespace: + if svc_id is None: + svc_id = ( + f"/subscriptions/{_SUB}/resourceGroups/{rg}" + f"/providers/Microsoft.Search/searchServices/{name}" + ) + now = datetime.now(timezone.utc) + created_at = now - timedelta(days=age_days) if age_days is not None else None + system_data = SimpleNamespace(created_at=created_at) if created_at is not None else None + return SimpleNamespace( + id=svc_id, + name=name, + location=location, + sku=SimpleNamespace(name=sku_name), + provisioning_state=provisioning_state, + status=status, + replica_count=replica_count, + partition_count=partition_count, + system_data=system_data, + tags=tags or {}, + hosting_mode=hosting_mode, ) - monitor_client = SimpleNamespace(metrics=SimpleNamespace(list=_metrics_list)) - monitor_client._call_log = 
call_log - return search_client, monitor_client -def _call(search_client, monitor_client, *, idle_days=30, region_filter=None): +def _make_search_client(services: list) -> SimpleNamespace: + return SimpleNamespace(services=SimpleNamespace(list_by_subscription=lambda: services)) + + +# --------------------------------------------------------------------------- +# Call helper +# --------------------------------------------------------------------------- + + +def _call( + search_client, + monitor_client, + *, + dp_factory: Optional[callable] = None, + region_filter: Optional[str] = None, +) -> list: + if dp_factory is None: + dp_factory = _make_dp_factory() return find_idle_ai_search_services( - subscription_id="sub-123", + subscription_id=_SUB, credential=None, client=search_client, monitor_client=monitor_client, - idle_days=idle_days, + data_plane_factory=dp_factory, region_filter=region_filter, ) +# =========================================================================== +# Integration tests +# =========================================================================== + + +class TestMustEmit: + """Spec 4, 8.13: all required signals resolved and zero -> emit.""" + + def test_happy_path_emits_one_finding(self): + svc = _make_service() + sc = _make_search_client([svc]) + findings = _call(sc, _make_zero_monitor()) + assert len(findings) == 1 + assert findings[0].rule_id == "azure.ai_search.idle" + + def test_no_services_returns_empty(self): + sc = _make_search_client([]) + findings = _call(sc, _make_zero_monitor()) + assert findings == [] + + def test_multiple_idle_services_all_emitted(self): + svc1 = _make_service(name="s1") + svc2 = _make_service(name="s2") + sc = _make_search_client([svc1, svc2]) + findings = _call(sc, _make_zero_monitor()) + assert len(findings) == 2 + + # --------------------------------------------------------------------------- -# Core detection + + +class TestIdNameGuards: + """Spec 8.1-8.2: id and name guards.""" + + def 
test_id_none_skips(self): + svc = _make_service() + svc.id = None + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] + + def test_id_empty_skips(self): + svc = _make_service() + svc.id = "" + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] + + def test_name_none_skips(self): + svc = _make_service() + svc.name = None + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] + + def test_name_empty_skips(self): + svc = _make_service() + svc.name = "" + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] + + def test_valid_id_and_name_proceeds(self): + svc = _make_service() + sc = _make_search_client([svc]) + assert len(_call(sc, _make_zero_monitor())) == 1 + + # --------------------------------------------------------------------------- -def test_idle_service_detected(): - """Standard service with zero queries → finding produced.""" - svc = _make_service(age_days=30) - sc, mon = _make_clients(svc) - findings = _call(sc, mon) - assert len(findings) == 1 - assert findings[0].rule_id == "azure.ai_search.idle" +class TestRegionFilter: + """Spec 8.3, 7: region filter uses exact lowercase match; spaces preserved.""" + def test_filter_excludes_non_matching(self): + svc = _make_service(location="westeurope") + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor(), region_filter="eastus") == [] -def test_active_service_skipped(): - """Service with non-zero average queries → no finding.""" - svc = _make_service(age_days=30) - sc, mon = _make_clients(svc, avg_response=_make_average_metric_response(5.0)) - findings = _call(sc, mon) - assert findings == [] + def test_filter_matches_same_lowercase(self): + svc = _make_service(location="eastus") + sc = _make_search_client([svc]) + assert len(_call(sc, _make_zero_monitor(), region_filter="eastus")) == 1 + def test_filter_matches_after_lowercasing(self): + svc = _make_service(location="EastUS") + sc = 
_make_search_client([svc]) + # "EastUS" normalized to "eastus" == filter "eastus" + assert len(_call(sc, _make_zero_monitor(), region_filter="eastus")) == 1 -def test_no_services_returns_empty(): - sc = SimpleNamespace(services=SimpleNamespace(list_by_subscription=lambda: [])) - mon = SimpleNamespace( - metrics=SimpleNamespace(list=lambda *a, **kw: _make_average_metric_response(0.0)) - ) - findings = _call(sc, mon) - assert findings == [] + def test_filter_spaces_preserved_not_stripped(self): + # spec 7: do NOT remove spaces + svc = _make_service(location="east us") + sc = _make_search_client([svc]) + # "east us" != "eastus": spaces are NOT stripped + assert _call(sc, _make_zero_monitor(), region_filter="eastus") == [] + def test_filter_spaces_preserved_matches_with_spaces(self): + svc = _make_service(location="east us") + sc = _make_search_client([svc]) + assert len(_call(sc, _make_zero_monitor(), region_filter="east us")) == 1 -def test_service_with_no_id_skipped(): - """svc.id = None must be skipped before the monitor call to avoid SDK errors.""" - svc = _make_service(age_days=30) - svc.id = None - sc, mon = _make_clients(svc) - assert _call(sc, mon) == [] + def test_no_filter_includes_all(self): + svc = _make_service(location="australiaeast") + sc = _make_search_client([svc]) + assert len(_call(sc, _make_zero_monitor(), region_filter=None)) == 1 # --------------------------------------------------------------------------- -# SKU normalization -# --------------------------------------------------------------------------- -def test_normalize_sku_camel_case_storage_optimized(): - """SDK may return 'StorageOptimizedL1' — should normalize to 'storage_optimized_l1'.""" - from cleancloud.providers.azure.rules.ai.ai_search_idle import _normalize_sku +class TestProvisioningStateContract: + """Spec 8.4, 9.1: provisioning_state must resolve exactly to 'succeeded'.""" - assert _normalize_sku("StorageOptimizedL1") == "storage_optimized_l1" - assert 
_normalize_sku("StorageOptimizedL2") == "storage_optimized_l2" - assert _normalize_sku("storage_optimized_l1") == "storage_optimized_l1" - assert _normalize_sku("Standard") == "standard" - assert _normalize_sku("Standard2") == "standard2" - assert _normalize_sku("") == "" + def test_succeeded_emits(self): + svc = _make_service(provisioning_state="succeeded") + sc = _make_search_client([svc]) + assert len(_call(sc, _make_zero_monitor())) == 1 + def test_updating_skips(self): + svc = _make_service(provisioning_state="Updating") + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] -def test_storage_optimized_l1_camelcase_detected(): - """SDK returns 'StorageOptimizedL1' → normalizes → finding produced.""" - svc = _make_service(sku_name="StorageOptimizedL1", age_days=30) - sc, mon = _make_clients(svc) - findings = _call(sc, mon) - assert len(findings) == 1 - assert findings[0].details["sku"] == "storage_optimized_l1" + def test_failed_skips(self): + svc = _make_service(provisioning_state="Failed") + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] + + def test_none_skips(self): + svc = _make_service(provisioning_state=None) + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] + + def test_conflict_sdk_nested_skips(self): + svc = _make_service(provisioning_state="succeeded") + svc.properties = SimpleNamespace(provisioning_state="Failed", provisioningState=None) + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] + + def test_nested_only_succeeded_emits(self): + svc = _make_service() + del svc.provisioning_state # SDK attribute absent + svc.properties = SimpleNamespace(provisioning_state="succeeded", provisioningState=None) + sc = _make_search_client([svc]) + assert len(_call(sc, _make_zero_monitor())) == 1 + + def test_provisioning_state_capitalized_skips(self): + # Only exact "succeeded" is eligible; "Succeeded" is not + svc = 
_make_service(provisioning_state="Succeeded") + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] -# --------------------------------------------------------------------------- -# SKU filtering # --------------------------------------------------------------------------- -def test_basic_sku_skipped(): - """Basic SKU is not in _WATCHED_SKUS → skipped regardless of traffic.""" - svc = _make_service(sku_name="basic", age_days=30) - sc, mon = _make_clients(svc) - findings = _call(sc, mon) - assert findings == [] +class TestStatusContract: + """Spec 8.5, 9.1: status must resolve exactly to 'running'.""" + def test_running_emits(self): + svc = _make_service(status="running") + sc = _make_search_client([svc]) + assert len(_call(sc, _make_zero_monitor())) == 1 -def test_standard_sku_included(): - svc = _make_service(sku_name="standard", age_days=30) - sc, mon = _make_clients(svc) - assert len(_call(sc, mon)) == 1 + def test_degraded_skips(self): + svc = _make_service(status="degraded") + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] + def test_disabled_skips(self): + svc = _make_service(status="disabled") + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] -def test_standard2_sku_included(): - svc = _make_service(sku_name="standard2", age_days=30) - sc, mon = _make_clients(svc) - assert len(_call(sc, mon)) == 1 + def test_none_skips(self): + svc = _make_service(status=None) + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] + def test_conflict_skips(self): + svc = _make_service(status="running") + svc.properties = SimpleNamespace(status="degraded") + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] -def test_storage_optimized_l1_included(): - svc = _make_service(sku_name="storage_optimized_l1", age_days=30) - sc, mon = _make_clients(svc) - assert len(_call(sc, mon)) == 1 + def test_nested_only_running_emits(self): + svc = 
_make_service() + del svc.status + svc.properties = SimpleNamespace(status="running") + sc = _make_search_client([svc]) + assert len(_call(sc, _make_zero_monitor())) == 1 + def test_status_capitalized_skips(self): + svc = _make_service(status="Running") + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] -def test_storage_optimized_l2_included(): - svc = _make_service(sku_name="storage_optimized_l2", age_days=30) - sc, mon = _make_clients(svc) - assert len(_call(sc, mon)) == 1 +# --------------------------------------------------------------------------- -def test_free_sku_skipped(): - svc = _make_service(sku_name="free", age_days=30) - sc, mon = _make_clients(svc) - assert _call(sc, mon) == [] + +class TestSkuContract: + """Spec 8.6, 9.2: supported dedicated billable tiers.""" + + @pytest.mark.parametrize( + "sku", + [ + "basic", + "standard", + "standard2", + "standard3", + "storage_optimized_l1", + "storage_optimized_l2", + ], + ) + def test_supported_sku_emits(self, sku): + svc = _make_service(sku_name=sku) + sc = _make_search_client([svc]) + assert len(_call(sc, _make_zero_monitor())) == 1 + + def test_free_sku_skips(self): + svc = _make_service(sku_name="free") + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] + + def test_unknown_sku_skips(self): + svc = _make_service(sku_name="enterprise") + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] + + def test_camelcase_storage_optimized_l1_normalized(self): + svc = _make_service(sku_name="StorageOptimizedL1") + sc = _make_search_client([svc]) + assert len(_call(sc, _make_zero_monitor())) == 1 + + def test_camelcase_storage_optimized_l2_normalized(self): + svc = _make_service(sku_name="StorageOptimizedL2") + sc = _make_search_client([svc]) + assert len(_call(sc, _make_zero_monitor())) == 1 + + def test_mixed_case_standard_normalized(self): + svc = _make_service(sku_name="Standard") + sc = _make_search_client([svc]) + assert 
len(_call(sc, _make_zero_monitor())) == 1 + + def test_sku_none_skips(self): + svc = _make_service() + svc.sku = None + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] + + def test_sku_name_none_skips(self): + svc = _make_service() + svc.sku = SimpleNamespace(name=None) + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] + + def test_hyphenated_storage_optimized_skips(self): + # "storage-optimized-l1" must not match "storage_optimized_l1" (spec 7: lowercase only) + svc = _make_service(sku_name="storage-optimized-l1") + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] + + def test_punctuated_standard_skips(self): + svc = _make_service(sku_name="stan-dard") + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] # --------------------------------------------------------------------------- -# Age filtering + + +class TestCreatedAtContract: + """Spec 8.7: created_at must be present, valid, and service age >= 90 days.""" + + def test_age_exactly_90_days_emits(self): + svc = _make_service(age_days=90) + sc = _make_search_client([svc]) + assert len(_call(sc, _make_zero_monitor())) == 1 + + def test_age_89_days_skips(self): + svc = _make_service(age_days=89) + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] + + def test_age_200_days_emits(self): + svc = _make_service(age_days=200) + sc = _make_search_client([svc]) + assert len(_call(sc, _make_zero_monitor())) == 1 + + def test_system_data_absent_skips(self): + svc = _make_service(age_days=90) + svc.system_data = None + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] + + def test_created_at_none_skips(self): + svc = _make_service(age_days=90) + svc.system_data = SimpleNamespace(created_at=None) + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] + + def test_created_at_future_skips(self): + future = 
datetime.now(timezone.utc) + timedelta(days=1) + svc = _make_service() + svc.system_data = SimpleNamespace(created_at=future) + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] + + def test_created_at_string_valid_iso_parsed(self): + ts = datetime.now(timezone.utc) - timedelta(days=91) + svc = _make_service() + svc.system_data = SimpleNamespace(created_at=ts.isoformat()) + sc = _make_search_client([svc]) + assert len(_call(sc, _make_zero_monitor())) == 1 + + def test_created_at_string_invalid_skips(self): + svc = _make_service() + svc.system_data = SimpleNamespace(created_at="not-a-date") + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] + + def test_created_at_non_string_non_datetime_skips(self): + svc = _make_service() + svc.system_data = SimpleNamespace(created_at=12345) + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] + + def test_no_age_only_fallback(self): + # Spec 8.7: absent created_at is skip, not fallback to age-only + svc = _make_service(age_days=200) + svc.system_data = None + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] + + # --------------------------------------------------------------------------- -def test_young_service_skipped(): - """Service younger than idle_days // 2 → skipped.""" - svc = _make_service(age_days=10) # 10 < 30//2=15 - sc, mon = _make_clients(svc) - assert _call(sc, mon) == [] +class TestCapacityContract: + """Spec 8.8: replica_count and partition_count must be known positive integers.""" + + def test_positive_replica_and_partition_emits(self): + svc = _make_service(replica_count=2, partition_count=3) + sc = _make_search_client([svc]) + assert len(_call(sc, _make_zero_monitor())) == 1 + + def test_replica_count_zero_skips(self): + svc = _make_service(replica_count=0) + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] + + def test_replica_count_none_skips(self): + svc = 
_make_service(replica_count=None) + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] + + def test_partition_count_zero_skips(self): + svc = _make_service(partition_count=0) + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] + + def test_partition_count_none_skips(self): + svc = _make_service(partition_count=None) + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] + + def test_replica_count_conflict_sdk_nested_skips(self): + svc = _make_service(replica_count=1) + svc.properties = SimpleNamespace( + replica_count=2, replicaCount=None, partition_count=None, partitionCount=None + ) + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor()) == [] + + def test_replica_count_nested_only_emits(self): + svc = _make_service() + del svc.replica_count + svc.properties = SimpleNamespace( + replica_count=1, replicaCount=None, partition_count=None, partitionCount=None + ) + sc = _make_search_client([svc]) + assert len(_call(sc, _make_zero_monitor())) == 1 -def test_service_at_half_threshold_skipped(): - """age_days == idle_days // 2 - 1 → skipped.""" - svc = _make_service(age_days=14) # 14 < 15 - sc, mon = _make_clients(svc) - assert _call(sc, mon) == [] +# --------------------------------------------------------------------------- -def test_service_at_half_threshold_still_below_confidence_gate(): - """age_days == idle_days // 2 passes the age gate but is < 75% of idle_days → no finding.""" - svc = _make_service(age_days=15) # passes age gate (15 >= 15) but 15 < ceil(30*0.75)=23 - sc, mon = _make_clients(svc) - assert _call(sc, mon) == [] +class TestObjectSurfacesContract: + """Spec 8.9-8.10, 9.3: data-plane structural emptiness.""" + def test_dp_factory_returns_none_skips(self): + svc = _make_service() + sc = _make_search_client([svc]) + assert _call(sc, _make_zero_monitor(), dp_factory=_make_none_dp_factory()) == [] -def test_effective_window_capped_to_age(): - 
"""effective_window = min(idle_days, age_days) when age < idle_days.""" - svc = _make_service(age_days=25) # 25 >= ceil(30*0.75)=23, but 25 < 30 → effective_window=25 - sc, mon = _make_clients(svc) - findings = _call(sc, mon) - assert len(findings) == 1 - assert findings[0].details["age_days"] == 25 + @pytest.mark.parametrize( + "surface", + [ + "indexes", + "indexers", + "data_sources", + "skillsets", + "synonym_maps", + ], + ) + def test_non_empty_required_surface_skips(self, surface): + svc = _make_service() + sc = _make_search_client([svc]) + dp_factory = _make_dp_factory(non_empty_required=surface) + assert _call(sc, _make_zero_monitor(), dp_factory=dp_factory) == [] + + @pytest.mark.parametrize( + "surface", + [ + "indexes", + "indexers", + "data_sources", + "skillsets", + "synonym_maps", + ], + ) + def test_failing_required_surface_skips(self, surface): + svc = _make_service() + sc = _make_search_client([svc]) + dp_factory = _make_dp_factory(fail_required=surface) + assert _call(sc, _make_zero_monitor(), dp_factory=dp_factory) == [] + + @pytest.mark.parametrize( + "surface", + [ + "aliases", + "knowledge_sources", + "agents", + ], + ) + def test_non_empty_optional_surface_skips(self, surface): + # spec 9.3.7: non-empty optional -> skip + svc = _make_service() + sc = _make_search_client([svc]) + dp_factory = _make_dp_factory(non_empty_optional=surface) + assert _call(sc, _make_zero_monitor(), dp_factory=dp_factory) == [] + + @pytest.mark.parametrize( + "surface", + [ + "aliases", + "knowledge_sources", + "agents", + ], + ) + def test_failing_optional_surface_still_emits(self, surface): + # spec 9.3.6: optional surface failure -> omit from counts, do NOT skip service + svc = _make_service() + sc = _make_search_client([svc]) + dp_factory = _make_dp_factory(fail_optional=surface) + assert len(_call(sc, _make_zero_monitor(), dp_factory=dp_factory)) == 1 + def test_optional_surface_method_absent_still_emits(self): + # spec 9.3.5: optional surfaces not required for 
eligibility -def test_no_creation_time_uses_full_window(): - """No system_data → age_days=None, effective_window=idle_days, still detects.""" - svc = _make_service(age_days=None) - svc.system_data = None - sc, mon = _make_clients(svc) - findings = _call(sc, mon) - assert len(findings) == 1 - assert findings[0].details["age_days"] is None + class _NoOptionalClient: + def list_indexes(self): + return [] + def list_indexers(self): + return [] -# --------------------------------------------------------------------------- -# Confidence levels -# --------------------------------------------------------------------------- + def list_data_source_connections(self): + return [] + def list_skillsets(self): + return [] -def test_high_confidence_age_ge_idle_days(): - """age >= idle_days with zero metric → HIGH confidence.""" - svc = _make_service(age_days=30) - sc, mon = _make_clients(svc) - findings = _call(sc, mon, idle_days=30) - assert findings[0].confidence.value == "high" + def list_synonym_maps(self): + return [] + # No list_aliases, list_knowledge_sources, list_agents -def test_medium_confidence_at_75_percent_age(): - """age >= 75% of idle_days but < idle_days → MEDIUM confidence.""" - svc = _make_service(age_days=23) # ceil(30*0.75)=23, age<30 - sc, mon = _make_clients(svc) - findings = _call(sc, mon, idle_days=30) - assert findings[0].confidence.value == "medium" + svc = _make_service() + sc = _make_search_client([svc]) + def dp_factory(endpoint): + return _NoOptionalClient() -def test_below_75_percent_age_skipped(): - """age < 75% of idle_days → no finding (insufficient evidence).""" - svc = _make_service(age_days=22) # 22 < ceil(30*0.75)=23 - sc, mon = _make_clients(svc) - assert _call(sc, mon, idle_days=30) == [] + assert len(_call(sc, _make_zero_monitor(), dp_factory=dp_factory)) == 1 + def test_all_required_surfaces_empty_emits(self): + svc = _make_service() + sc = _make_search_client([svc]) + assert len(_call(sc, _make_zero_monitor(), 
dp_factory=_make_dp_factory())) == 1 -def test_medium_confidence_unknown_age(): - """No creation time but metric shows zero → MEDIUM confidence.""" - svc = _make_service(age_days=None) - svc.system_data = None - sc, mon = _make_clients(svc) - findings = _call(sc, mon) - assert findings[0].confidence.value == "medium" + def test_object_counts_in_details(self): + svc = _make_service() + sc = _make_search_client([svc]) + findings = _call(sc, _make_zero_monitor()) + assert len(findings) == 1 + counts = findings[0].details["object_counts"] + for key in ("indexes", "indexers", "data_sources", "skillsets", "synonym_maps"): + assert counts[key] == 0 # --------------------------------------------------------------------------- -# Age-only fallback (no metric data) + + +class TestMetricContract: + """Spec 8.11-8.12, 9.5: all three required metrics must be ZERO.""" + + def test_all_zero_emits(self): + svc = _make_service() + sc = _make_search_client([svc]) + assert len(_call(sc, _make_zero_monitor())) == 1 + + def test_search_queries_per_second_active_skips(self): + svc = _make_service() + sc = _make_search_client([svc]) + mon = _make_active_monitor("SearchQueriesPerSecond", "average") + assert _call(sc, mon) == [] + + def test_documents_processed_count_active_skips(self): + svc = _make_service() + sc = _make_search_client([svc]) + mon = _make_active_monitor("DocumentsProcessedCount", "total") + assert _call(sc, mon) == [] + + def test_skill_execution_count_active_skips(self): + svc = _make_service() + sc = _make_search_client([svc]) + mon = _make_active_monitor("SkillExecutionCount", "total") + assert _call(sc, mon) == [] + + def test_metric_unknown_insufficient_coverage_skips(self): + # Fewer buckets than 95% threshold -> UNKNOWN -> skip + def _list(*args, **kwargs): + name = kwargs.get("metricnames", "") + if name == "SearchQueriesPerSecond": + # Only 50 buckets -> 50/90 = 55% < 95% -> UNKNOWN + return _make_metric_response(_make_datapoints("average", 0.0, n=50)) + return 
_make_metric_response(_make_datapoints("total", 0.0)) + + svc = _make_service() + sc = _make_search_client([svc]) + mon = SimpleNamespace(metrics=SimpleNamespace(list=_list)) + assert _call(sc, mon) == [] + + def test_all_metrics_unknown_skips(self): + svc = _make_service() + sc = _make_search_client([svc]) + assert _call(sc, _make_unknown_monitor()) == [] + + def test_metric_query_raises_skips(self): + svc = _make_service() + sc = _make_search_client([svc]) + mon = _make_raising_monitor(RuntimeError("timeout")) + assert _call(sc, mon) == [] + + def test_metric_timestamp_none_datapoint_skips(self): + # unparseable timestamp -> UNKNOWN -> skip + def _list(*args, **kwargs): + name = kwargs.get("metricnames", "") + if name == "SearchQueriesPerSecond": + dps = _make_datapoints("average", 0.0) + dps[5] = SimpleNamespace(timestamp=None, average=0.0, total=None, maximum=None) + return _make_metric_response(dps) + return _make_metric_response(_make_datapoints("total", 0.0)) + + svc = _make_service() + sc = _make_search_client([svc]) + mon = SimpleNamespace(metrics=SimpleNamespace(list=_list)) + assert _call(sc, mon) == [] + + def test_metric_non_datetime_timestamp_skips(self): + # non-datetime timestamp -> UNKNOWN -> skip + def _list(*args, **kwargs): + name = kwargs.get("metricnames", "") + if name == "SearchQueriesPerSecond": + dps = _make_datapoints("average", 0.0) + dps[3] = SimpleNamespace( + timestamp="2024-01-01T00:00:00Z", average=0.0, total=None, maximum=None + ) + return _make_metric_response(dps) + return _make_metric_response(_make_datapoints("total", 0.0)) + + svc = _make_service() + sc = _make_search_client([svc]) + mon = SimpleNamespace(metrics=SimpleNamespace(list=_list)) + assert _call(sc, mon) == [] + + def test_metrics_not_queried_when_earlier_check_fails(self): + """Metrics should not be queried when a pre-condition (e.g. 
status) fails.""" + called = [] + + def _list(*args, **kwargs): + called.append(kwargs.get("metricnames")) + return _make_metric_response(_make_datapoints("average", 0.0)) + + svc = _make_service(status="degraded") + sc = _make_search_client([svc]) + mon = SimpleNamespace(metrics=SimpleNamespace(list=_list)) + _call(sc, mon) + assert called == [] + + def test_monitor_not_called_with_interval_p1d(self): + """No interval= parameter sent to Azure Monitor; source-bucket granularity per spec 9.5.2.""" + captured_kwargs: dict = {} + + def _list(*args, **kwargs): + captured_kwargs.update(kwargs) + name = kwargs.get("metricnames", "") + if name == "SearchQueriesPerSecond": + return _make_metric_response(_make_datapoints("average", 0.0)) + return _make_metric_response(_make_datapoints("total", 0.0)) + + svc = _make_service() + sc = _make_search_client([svc]) + mon = SimpleNamespace(metrics=SimpleNamespace(list=_list)) + findings = _call(sc, mon) + assert len(findings) == 1 + assert "interval" not in captured_kwargs + + def test_non_numeric_aggregation_value_skips(self): + """Non-numeric aggregation value in metric response -> UNKNOWN -> service skipped.""" + + def _list(*args, **kwargs): + name = kwargs.get("metricnames", "") + if name == "SearchQueriesPerSecond": + dps = _make_datapoints("average", 0.0) + bad_ts = dps[5].timestamp + dps[5] = SimpleNamespace(timestamp=bad_ts, average="N/A", total=None, maximum=None) + return _make_metric_response(dps) + return _make_metric_response(_make_datapoints("total", 0.0)) + + svc = _make_service() + sc = _make_search_client([svc]) + mon = SimpleNamespace(metrics=SimpleNamespace(list=_list)) + assert _call(sc, mon) == [] + + def test_malformed_timeseries_not_iterable_skips(self): + """timeseries that is not iterable raises TypeError -> UNKNOWN -> service skipped.""" + + def _list(*args, **kwargs): + name = kwargs.get("metricnames", "") + if name == "SearchQueriesPerSecond": + metric = SimpleNamespace(timeseries=42) # not iterable + 
return SimpleNamespace(value=[metric]) + return _make_metric_response(_make_datapoints("total", 0.0)) + + svc = _make_service() + sc = _make_search_client([svc]) + mon = SimpleNamespace(metrics=SimpleNamespace(list=_list)) + assert _call(sc, mon) == [] + + # --------------------------------------------------------------------------- -def test_age_only_fallback_when_no_timeseries(): - """Both metrics return no data; age >= idle_days*2 → LOW confidence, age_only signal.""" - svc = _make_service(age_days=62) # >= 30*2=60 - empty = _make_empty_metric_response() - sc, mon = _make_clients(svc, avg_response=empty, total_response=empty) - findings = _call(sc, mon, idle_days=30) - assert len(findings) == 1 - assert findings[0].confidence.value == "low" - assert findings[0].details["idle_signal"] == "age_only" - assert findings[0].details["idle_metric"] == "none" +class TestFailureBehavior: + """Spec 12: subscription list propagates; per-service errors skip.""" + + def test_service_list_propagates_runtime_error(self): + def _raise(): + raise RuntimeError("disk full") + + sc = SimpleNamespace(services=SimpleNamespace(list_by_subscription=_raise)) + with pytest.raises(RuntimeError, match="disk full"): + _call(sc, _make_zero_monitor()) + + def test_per_service_http_error_skips_service(self): + svc = _make_service() + sc = _make_search_client([svc]) + # HttpResponseError raised during metric evaluation -> skip + mon = _make_raising_monitor(HttpResponseError("403 Forbidden")) + assert _call(sc, mon) == [] + + def test_per_service_service_request_error_skips(self): + svc = _make_service() + sc = _make_search_client([svc]) + mon = _make_raising_monitor(ServiceRequestError("transport error")) + assert _call(sc, mon) == [] + + def test_per_service_service_response_error_skips(self): + svc = _make_service() + sc = _make_search_client([svc]) + mon = _make_raising_monitor(ServiceResponseError("response error")) + assert _call(sc, mon) == [] + + def 
test_one_service_http_error_other_still_emits(self): + svc_bad = _make_service(name="bad-svc") + svc_good = _make_service(name="good-svc") + call_count = [] + + def _list(*args, **kwargs): + call_count.append(1) + # Fail on first call (bad-svc), succeed on subsequent (good-svc) + if len(call_count) == 1: + raise HttpResponseError("403") + name = kwargs.get("metricnames", "") + if name == "SearchQueriesPerSecond": + return _make_metric_response(_make_datapoints("average", 0.0)) + return _make_metric_response(_make_datapoints("total", 0.0)) + + sc = _make_search_client([svc_bad, svc_good]) + mon = SimpleNamespace(metrics=SimpleNamespace(list=_list)) + findings = _call(sc, mon) + assert len(findings) == 1 + assert findings[0].details["service_name"] == "good-svc" -def test_age_only_fallback_requires_2x_idle_days(): - """age < idle_days*2 with no data → no finding.""" - svc = _make_service(age_days=59) - empty = _make_empty_metric_response() - sc, mon = _make_clients(svc, avg_response=empty, total_response=empty) - assert _call(sc, mon, idle_days=30) == [] +# --------------------------------------------------------------------------- -def test_all_metric_calls_fail_returns_none(): - """Both metrics raise non-permission exceptions → no finding (returns None from helper).""" - svc = _make_service(age_days=30) +class TestFindingShape: + """Spec 11: required finding fields and details.""" - def _raise(*a, **kw): - raise RuntimeError("timeout") + def _get_finding(self): + svc = _make_service( + name="shape-svc", + location="eastus", + sku_name="standard", + age_days=120, + replica_count=2, + partition_count=3, + tags={"env": "test"}, + hosting_mode="default", + rg="rg-shape", + ) + sc = _make_search_client([svc]) + findings = _call(sc, _make_zero_monitor()) + assert len(findings) == 1 + return findings[0] - sc = SimpleNamespace(services=SimpleNamespace(list_by_subscription=lambda: [svc])) - mon = SimpleNamespace(metrics=SimpleNamespace(list=_raise)) - assert _call(sc, mon) 
== [] + def test_provider(self): + assert self._get_finding().provider == "azure" + def test_rule_id(self): + assert self._get_finding().rule_id == "azure.ai_search.idle" -# --------------------------------------------------------------------------- -# Risk levels -# --------------------------------------------------------------------------- + def test_resource_type(self): + assert self._get_finding().resource_type == "azure.ai.search_service" + def test_resource_id_contains_service_name(self): + f = self._get_finding() + assert "shape-svc" in f.resource_id -def test_medium_risk_for_low_cost(): - """standard × 1 replica × 1 partition = $261 < $1000 → MEDIUM risk.""" - svc = _make_service(sku_name="standard", replica_count=1, partition_count=1, age_days=30) - sc, mon = _make_clients(svc) - findings = _call(sc, mon) - assert findings[0].risk.value == "medium" + def test_region_normalized(self): + assert self._get_finding().region == "eastus" + def test_estimated_cost_always_none(self): + # spec 10: always None + assert self._get_finding().estimated_monthly_cost_usd is None -def test_high_risk_for_high_cost(): - """standard3 × 1 replica × 1 partition = $1047 >= $1000 → HIGH risk.""" - svc = _make_service(sku_name="standard3", replica_count=1, partition_count=1, age_days=30) - sc, mon = _make_clients(svc) - findings = _call(sc, mon) - assert findings[0].risk.value == "high" + def test_confidence_always_high(self): + # spec 11.1 + assert self._get_finding().confidence.value == "high" + def test_risk_always_medium(self): + # spec 11.1 + assert self._get_finding().risk.value == "medium" -def test_high_risk_from_replicas(): - """standard × 4 replicas × 1 partition = $1044 >= $1000 → HIGH risk.""" - svc = _make_service(sku_name="standard", replica_count=4, partition_count=1, age_days=30) - sc, mon = _make_clients(svc) - findings = _call(sc, mon) - assert findings[0].risk.value == "high" + def test_title_contains_service_name(self): + assert "shape-svc" in 
self._get_finding().title + def test_summary_contains_service_name(self): + assert "shape-svc" in self._get_finding().summary -def test_high_risk_from_partitions(): - """standard × 1 replica × 4 partitions = $1044 >= $1000 → HIGH risk.""" - svc = _make_service(sku_name="standard", replica_count=1, partition_count=4, age_days=30) - sc, mon = _make_clients(svc) - findings = _call(sc, mon) - assert findings[0].risk.value == "high" + def test_reason_non_empty(self): + assert self._get_finding().reason + def test_detected_at_is_datetime(self): + assert isinstance(self._get_finding().detected_at, datetime) -def test_critical_risk_for_very_high_cost(): - """storage_optimized_l2 × 1 × 1 = $4028 >= $3000 → CRITICAL risk.""" - svc = _make_service( - sku_name="storage_optimized_l2", replica_count=1, partition_count=1, age_days=30 - ) - sc, mon = _make_clients(svc) - findings = _call(sc, mon) - assert findings[0].risk.value == "critical" - assert findings[0].estimated_monthly_cost_usd == pytest.approx(4028.0) + def test_details_service_name(self): + assert self._get_finding().details["service_name"] == "shape-svc" + def test_details_resource_group(self): + assert self._get_finding().details["resource_group"] == "rg-shape" -def test_critical_risk_threshold_boundary(): - """storage_optimized_l1 × 2 × 1 = $4028 >= $3000 → CRITICAL.""" - svc = _make_service( - sku_name="storage_optimized_l1", replica_count=2, partition_count=1, age_days=30 - ) - sc, mon = _make_clients(svc) - findings = _call(sc, mon) - assert findings[0].risk.value == "critical" + def test_details_subscription_id(self): + assert self._get_finding().details["subscription_id"] == _SUB + def test_details_sku_name(self): + assert self._get_finding().details["sku_name"] == "standard" -# --------------------------------------------------------------------------- -# Cost estimation -# --------------------------------------------------------------------------- + def test_details_replica_count(self): + assert 
self._get_finding().details["replica_count"] == 2 + def test_details_partition_count(self): + assert self._get_finding().details["partition_count"] == 3 -def test_known_sku_cost_standard(): - """standard × 2 replicas × 2 partitions = 261 * 4 = $1044/month.""" - svc = _make_service(sku_name="standard", replica_count=2, partition_count=2, age_days=30) - sc, mon = _make_clients(svc) - findings = _call(sc, mon) - assert findings[0].estimated_monthly_cost_usd == pytest.approx(1044.0) + def test_details_status(self): + assert self._get_finding().details["status"] == "running" + def test_details_provisioning_state(self): + assert self._get_finding().details["provisioning_state"] == "succeeded" -def test_known_sku_cost_storage_optimized_l2(): - """storage_optimized_l2 × 1 × 1 = $4028/month.""" - svc = _make_service(sku_name="storage_optimized_l2", age_days=30) - sc, mon = _make_clients(svc) - findings = _call(sc, mon) - assert findings[0].estimated_monthly_cost_usd == pytest.approx(4028.0) + def test_details_created_at(self): + assert self._get_finding().details["created_at"] + def test_details_idle_window_days(self): + assert self._get_finding().details["idle_window_days"] == 90 -def test_cost_source_heuristic_sku_table_for_known_sku(): - svc = _make_service(sku_name="standard", age_days=30) - sc, mon = _make_clients(svc) - findings = _call(sc, mon) - assert findings[0].details["cost_source"] == "heuristic_sku_table" + def test_details_object_counts_dict(self): + assert isinstance(self._get_finding().details["object_counts"], dict) + def test_details_metrics_used(self): + metrics = self._get_finding().details["metrics_used"] + assert "SearchQueriesPerSecond" in metrics + assert "DocumentsProcessedCount" in metrics + assert "SkillExecutionCount" in metrics -# --------------------------------------------------------------------------- -# Metric fallback (SearchQueriesPerSecond → TotalSearchRequestCount) -# 
--------------------------------------------------------------------------- + def test_details_tags(self): + assert self._get_finding().details["tags"] == {"env": "test"} + def test_details_tags_never_none(self): + # spec 7: tags must never be None in output + svc = _make_service() + svc.tags = None + sc = _make_search_client([svc]) + findings = _call(sc, _make_zero_monitor()) + assert findings[0].details["tags"] == {} -def test_falls_back_to_total_when_avg_has_no_data(): - """First metric has no timeseries; second (Total) shows zero → idle detected.""" - svc = _make_service(age_days=30) - sc, mon = _make_clients( - svc, - avg_response=_make_no_timeseries_response(), - total_response=_make_total_metric_response(0.0), - ) - findings = _call(sc, mon) - assert len(findings) == 1 - assert findings[0].details["idle_signal"] == "metric_zero" - - -def test_active_on_second_metric_skips(): - """First metric has no data; second (Total) shows non-zero → skip.""" - svc = _make_service(age_days=30) - sc, mon = _make_clients( - svc, - avg_response=_make_no_timeseries_response(), - total_response=_make_total_metric_response(500.0), - ) - assert _call(sc, mon) == [] + def test_evidence_signals_used_non_empty(self): + assert self._get_finding().evidence.signals_used + def test_evidence_signals_not_checked_non_empty(self): + assert self._get_finding().evidence.signals_not_checked -def test_primary_metric_used_when_it_has_data(): - """SearchQueriesPerSecond has data → TotalSearchRequestCount not called.""" - svc = _make_service(age_days=30) - sc, mon = _make_clients(svc, avg_response=_make_average_metric_response(0.0)) - _call(sc, mon) - assert "SearchQueriesPerSecond" in mon._call_log - # TotalSearchRequestCount should NOT be called when primary succeeded - assert "TotalSearchRequestCount" not in mon._call_log + def test_evidence_time_window(self): + assert "90" in self._get_finding().evidence.time_window -def test_fallback_metric_called_when_primary_has_no_timeseries(): - svc = 
_make_service(age_days=30) - sc, mon = _make_clients( - svc, - avg_response=_make_no_timeseries_response(), - total_response=_make_total_metric_response(0.0), - ) - _call(sc, mon) - assert "TotalSearchRequestCount" in mon._call_log +# =========================================================================== +# Unit tests +# =========================================================================== -# --------------------------------------------------------------------------- -# Region filtering -# --------------------------------------------------------------------------- +class TestNormalizeSku: + """_normalize_sku: strip non-alnum, lowercase, then alias-resolve.""" + def test_standard(self): + assert _normalize_sku("standard") == "standard" -def test_region_filter_excludes_other_regions(): - svc = _make_service(location="westeurope", age_days=30) - sc, mon = _make_clients(svc) - assert _call(sc, mon, region_filter="eastus") == [] + def test_standard_mixed_case(self): + assert _normalize_sku("Standard") == "standard" + def test_standard2(self): + assert _normalize_sku("Standard2") == "standard2" -def test_region_filter_matches_normalised(): - """Spaces/dashes in location or filter should not prevent match.""" - svc = _make_service(location="East US", age_days=30) - sc, mon = _make_clients(svc) - findings = _call(sc, mon, region_filter="eastus") - assert len(findings) == 1 + def test_storage_optimized_l1_camel(self): + assert _normalize_sku("StorageOptimizedL1") == "storage_optimized_l1" + def test_storage_optimized_l2_camel(self): + assert _normalize_sku("StorageOptimizedL2") == "storage_optimized_l2" -def test_region_filter_normalises_underscores(): - """Underscores in location or filter are stripped during normalisation.""" - svc = _make_service(location="east_us", age_days=30) - sc, mon = _make_clients(svc) - findings = _call(sc, mon, region_filter="eastus") - assert len(findings) == 1 + def test_storage_optimized_l1_underscore_already_canonical(self): + 
assert _normalize_sku("storage_optimized_l1") == "storage_optimized_l1" + def test_empty_string(self): + assert _normalize_sku("") == "" -def test_no_region_filter_includes_all(): - svc = _make_service(location="australiaeast", age_days=30) - sc, mon = _make_clients(svc) - assert len(_call(sc, mon, region_filter=None)) == 1 + def test_none_like(self): + assert _normalize_sku(None) == "" + def test_unknown_not_in_supported(self): + assert _normalize_sku("enterprise") not in _SUPPORTED_SKUS -# --------------------------------------------------------------------------- -# Permission errors -# --------------------------------------------------------------------------- + def test_hyphenated_storage_optimized_not_normalized(self): + # Old strip-non-alnum would map "storage-optimized-l1" -> "storageoptimizedl1" -> alias + # New lowercase-only correctly rejects it (spec 7: lowercase only) + assert _normalize_sku("storage-optimized-l1") not in _SUPPORTED_SKUS + def test_punctuated_standard_not_normalized(self): + # "stan-dard" must NOT map to "standard" + assert _normalize_sku("stan-dard") not in _SUPPORTED_SKUS -def test_monitor_403_raises_permission_error(): - svc = _make_service(age_days=30) + def test_underscored_canonical_still_matches_directly(self): + # "storage_optimized_l1" lowercases to itself, direct SUPPORTED_SKUS member + assert _normalize_sku("storage_optimized_l1") in _SUPPORTED_SKUS - def _raise(*a, **kw): - raise Exception("403 Forbidden") - sc = SimpleNamespace(services=SimpleNamespace(list_by_subscription=lambda: [svc])) - mon = SimpleNamespace(metrics=SimpleNamespace(list=_raise)) - with pytest.raises(PermissionError): - _call(sc, mon) +class TestNormLocation: + """_norm_location: lowercase only; spaces, hyphens, digits preserved (spec 7).""" + def test_lowercase_unchanged(self): + assert _norm_location("eastus") == "eastus" -def test_monitor_authorization_failed_raises_permission_error(): - svc = _make_service(age_days=30) + def 
test_uppercase_lowercased(self): + assert _norm_location("EastUS") == "eastus" - def _raise(*a, **kw): - raise Exception("AuthorizationFailed: caller does not have permission") + def test_spaces_preserved(self): + # spec 7: do NOT remove spaces + assert _norm_location("East US") == "east us" - sc = SimpleNamespace(services=SimpleNamespace(list_by_subscription=lambda: [svc])) - mon = SimpleNamespace(metrics=SimpleNamespace(list=_raise)) - with pytest.raises(PermissionError): - _call(sc, mon) + def test_hyphens_preserved(self): + assert _norm_location("east-us") == "east-us" + def test_empty_string(self): + assert _norm_location("") == "" -def test_search_list_403_raises_permission_error(): - def _raise(): - raise Exception("403 Forbidden for search services") + def test_none_returns_empty(self): + assert _norm_location(None) == "" - sc = SimpleNamespace(services=SimpleNamespace(list_by_subscription=_raise)) - mon = SimpleNamespace( - metrics=SimpleNamespace(list=lambda *a, **kw: _make_average_metric_response(0.0)) - ) - with pytest.raises(PermissionError): - _call(sc, mon) +class TestExtractResourceGroup: + """_extract_resource_group: ARM id parsing.""" -def test_search_list_authorization_failed_raises_permission_error(): - def _raise(): - raise Exception("AuthorizationFailed on searchServices") + def test_valid_arm_id(self): + rg = _extract_resource_group( + "/subscriptions/sub-1/resourceGroups/my-rg/providers/Microsoft.Search/searchServices/svc1" + ) + assert rg == "my-rg" - sc = SimpleNamespace(services=SimpleNamespace(list_by_subscription=_raise)) - mon = SimpleNamespace( - metrics=SimpleNamespace(list=lambda *a, **kw: _make_average_metric_response(0.0)) - ) - with pytest.raises(PermissionError): - _call(sc, mon) + def test_case_insensitive_resourcegroups(self): + rg = _extract_resource_group( + "/subscriptions/sub-1/ResourceGroups/my-rg/providers/foo/bar/baz" + ) + assert rg == "my-rg" + def test_no_resourcegroups_segment_returns_none(self): + assert 
_extract_resource_group("/subscriptions/sub-1/providers/foo") is None -def test_unexpected_exception_propagates(): - def _raise(): - raise RuntimeError("disk full") + def test_empty_id_returns_none(self): + assert _extract_resource_group("") is None - sc = SimpleNamespace(services=SimpleNamespace(list_by_subscription=_raise)) - mon = SimpleNamespace( - metrics=SimpleNamespace(list=lambda *a, **kw: _make_average_metric_response(0.0)) - ) - with pytest.raises(RuntimeError): - _call(sc, mon) + def test_none_returns_none(self): + assert _extract_resource_group(None) is None -# --------------------------------------------------------------------------- -# idle_days clamping -# --------------------------------------------------------------------------- +class TestResolveProvisioningState: + def test_sdk_value_returned(self): + svc = SimpleNamespace(provisioning_state="succeeded") + assert _resolve_provisioning_state(svc) == "succeeded" -def test_idle_days_clamped_to_3(): - """idle_days=1 → effective_window clamped to 3 (min effective window).""" - svc = _make_service(age_days=None) - svc.system_data = None - sc, mon = _make_clients(svc) - findings = _call(sc, mon, idle_days=1) - # effective_window = min(1, None) → 1 < 3, should be skipped - # age_days is None, effective_window = idle_days=1 < 3 → skip - assert findings == [] + def test_nested_fallback(self): + svc = SimpleNamespace( + properties=SimpleNamespace(provisioning_state="succeeded", provisioningState=None) + ) + assert _resolve_provisioning_state(svc) == "succeeded" + def test_nested_camel_fallback(self): + svc = SimpleNamespace( + properties=SimpleNamespace(provisioning_state=None, provisioningState="succeeded") + ) + assert _resolve_provisioning_state(svc) == "succeeded" -def test_idle_days_30_with_no_age_uses_window_30(): - svc = _make_service(age_days=None) - svc.system_data = None - sc, mon = _make_clients(svc) - findings = _call(sc, mon, idle_days=30) - assert len(findings) == 1 - assert 
findings[0].details["idle_days_threshold"] == 30 + def test_conflict_returns_none(self): + svc = SimpleNamespace( + provisioning_state="succeeded", + properties=SimpleNamespace(provisioning_state="Failed", provisioningState=None), + ) + assert _resolve_provisioning_state(svc) is None + def test_both_absent_returns_none(self): + svc = SimpleNamespace() + assert _resolve_provisioning_state(svc) is None -# --------------------------------------------------------------------------- -# Finding shape -# --------------------------------------------------------------------------- + def test_both_same_value_no_conflict(self): + svc = SimpleNamespace( + provisioning_state="succeeded", + properties=SimpleNamespace(provisioning_state="succeeded", provisioningState=None), + ) + assert _resolve_provisioning_state(svc) == "succeeded" -def test_finding_shape_complete(): - svc = _make_service( - name="my-search", - sku_name="standard", - location="eastus", - age_days=30, - replica_count=2, - partition_count=1, - rg="rg-ai", - ) - sc, mon = _make_clients(svc) - findings = _call(sc, mon) - - assert len(findings) == 1 - f = findings[0] - assert f.provider == "azure" - assert f.rule_id == "azure.ai_search.idle" - assert f.resource_type == "azure.ai.search_service" - assert "my-search" in f.resource_id - assert f.region == "eastus" - assert f.title == "Idle Azure AI Search Service: my-search" - assert "my-search" in f.summary - assert f.reason - assert f.risk is not None - assert f.confidence is not None - assert isinstance(f.detected_at, datetime) - assert f.evidence is not None - - d = f.details - assert d["service_name"] == "my-search" - assert d["resource_group"] == "rg-ai" - assert d["sku"] == "standard" - assert d["location"] == "eastus" - assert d["replica_count"] == 2 - assert d["partition_count"] == 1 - assert d["age_days"] == 30 - assert d["idle_days_threshold"] == 30 - assert d["idle_signal"] in ("metric_zero", "age_only") - assert d["idle_metric"] # always "none" or a metric 
name - assert d["estimated_monthly_cost"] == pytest.approx(522.0) # 261 * 2 replicas - assert d["cost_source"] == "heuristic_sku_table" - - -def test_resource_group_parsed_from_id(): - svc = _make_service(rg="my-rg", age_days=30) - sc, mon = _make_clients(svc) - findings = _call(sc, mon) - assert findings[0].details["resource_group"] == "my-rg" - - -def test_estimated_cost_in_finding(): - svc = _make_service(sku_name="standard", replica_count=1, partition_count=1, age_days=30) - sc, mon = _make_clients(svc) - findings = _call(sc, mon) - assert findings[0].estimated_monthly_cost_usd == pytest.approx(261.0) +class TestResolveStatus: + def test_sdk_value_returned(self): + svc = SimpleNamespace(status="running") + assert _resolve_status(svc) == "running" -# --------------------------------------------------------------------------- -# RULE_METADATA -# --------------------------------------------------------------------------- + def test_nested_fallback(self): + svc = SimpleNamespace(properties=SimpleNamespace(status="running")) + assert _resolve_status(svc) == "running" + + def test_conflict_returns_none(self): + svc = SimpleNamespace( + status="running", + properties=SimpleNamespace(status="degraded"), + ) + assert _resolve_status(svc) is None + + def test_both_absent_returns_none(self): + assert _resolve_status(SimpleNamespace()) is None + + +class TestResolveCapacity: + + def test_positive_integer_returned(self): + svc = SimpleNamespace(replica_count=3) + assert _resolve_capacity(svc, "replica_count", "replica_count", "replicaCount") == 3 + + def test_zero_returns_none(self): + svc = SimpleNamespace(replica_count=0) + assert _resolve_capacity(svc, "replica_count", "replica_count", "replicaCount") is None + + def test_negative_returns_none(self): + svc = SimpleNamespace(replica_count=-1) + assert _resolve_capacity(svc, "replica_count", "replica_count", "replicaCount") is None + + def test_string_integer_coerced(self): + svc = SimpleNamespace(replica_count="2") + 
assert _resolve_capacity(svc, "replica_count", "replica_count", "replicaCount") == 2 + + def test_invalid_string_returns_none(self): + svc = SimpleNamespace(replica_count="n/a") + assert _resolve_capacity(svc, "replica_count", "replica_count", "replicaCount") is None + + def test_conflict_returns_none(self): + svc = SimpleNamespace( + replica_count=1, + properties=SimpleNamespace(replica_count=2, replicaCount=None), + ) + assert _resolve_capacity(svc, "replica_count", "replica_count", "replicaCount") is None + + def test_both_absent_returns_none(self): + assert ( + _resolve_capacity(SimpleNamespace(), "replica_count", "replica_count", "replicaCount") + is None + ) + + def test_nested_camel_fallback(self): + svc = SimpleNamespace(properties=SimpleNamespace(replica_count=None, replicaCount=4)) + assert _resolve_capacity(svc, "replica_count", "replica_count", "replicaCount") == 4 + + +class TestResolveCreatedAt: + + def test_tz_aware_datetime_returned(self): + ts = datetime(2023, 1, 1, tzinfo=timezone.utc) + svc = SimpleNamespace(system_data=SimpleNamespace(created_at=ts)) + result = _resolve_created_at(svc) + assert result == ts + + def test_tz_naive_datetime_converted_to_utc(self): + ts = datetime(2023, 1, 1) # naive + svc = SimpleNamespace(system_data=SimpleNamespace(created_at=ts)) + result = _resolve_created_at(svc) + assert result.tzinfo is not None + + def test_string_iso_parsed(self): + ts_str = "2023-01-01T00:00:00" + svc = SimpleNamespace(system_data=SimpleNamespace(created_at=ts_str)) + result = _resolve_created_at(svc) + assert result is not None + assert result.year == 2023 + + def test_string_with_z_suffix_parsed(self): + ts_str = "2023-01-01T00:00:00Z" + svc = SimpleNamespace(system_data=SimpleNamespace(created_at=ts_str)) + result = _resolve_created_at(svc) + assert result is not None + + def test_invalid_string_returns_none(self): + svc = SimpleNamespace(system_data=SimpleNamespace(created_at="not-a-date")) + assert _resolve_created_at(svc) is None + 
+ def test_future_timestamp_returns_none(self): + future = datetime.now(timezone.utc) + timedelta(days=10) + svc = SimpleNamespace(system_data=SimpleNamespace(created_at=future)) + assert _resolve_created_at(svc) is None + + def test_none_value_returns_none(self): + svc = SimpleNamespace(system_data=SimpleNamespace(created_at=None)) + assert _resolve_created_at(svc) is None + def test_system_data_absent_returns_none(self): + assert _resolve_created_at(SimpleNamespace()) is None -def test_rule_metadata_present(): - assert RULE_METADATA["id"] == "azure.ai_search.idle" - assert RULE_METADATA["category"] == "ai" - assert RULE_METADATA["service"] == "search" - assert RULE_METADATA["cost_impact"] == "high" + def test_system_data_none_returns_none(self): + svc = SimpleNamespace(system_data=None) + assert _resolve_created_at(svc) is None + + def test_non_datetime_non_string_returns_none(self): + svc = SimpleNamespace(system_data=SimpleNamespace(created_at=99999)) + assert _resolve_created_at(svc) is None + + +class TestCheckObjectSurfaces: + + def test_all_empty_returns_dict_with_counts(self): + client = _MockDpClient() + result = _check_object_surfaces(client) + assert result is not None + for key in ("indexes", "indexers", "data_sources", "skillsets", "synonym_maps"): + assert result[key] == 0 + + def test_non_empty_required_returns_none(self): + for surface in ("indexes", "indexers", "data_sources", "skillsets", "synonym_maps"): + client = _MockDpClient(non_empty_required=surface) + assert _check_object_surfaces(client) is None, f"expected None for non-empty {surface}" + + def test_required_method_missing_returns_none(self): + class _MissingIndexers: + def list_indexes(self): + return [] + + def list_data_source_connections(self): + return [] + + def list_skillsets(self): + return [] + + def list_synonym_maps(self): + return [] + + # no list_indexers + + assert _check_object_surfaces(_MissingIndexers()) is None + + def test_required_raises_returns_none(self): + client = 
_MockDpClient(fail_required="skillsets") + assert _check_object_surfaces(client) is None + + def test_non_empty_optional_returns_none(self): + for surface in ("aliases", "knowledge_sources", "agents"): + client = _MockDpClient(non_empty_optional=surface) + assert _check_object_surfaces(client) is None, f"expected None for non-empty {surface}" + + def test_optional_method_missing_omitted_from_counts(self): + class _NoOptional: + def list_indexes(self): + return [] + + def list_indexers(self): + return [] + + def list_data_source_connections(self): + return [] + + def list_skillsets(self): + return [] + + def list_synonym_maps(self): + return [] + + result = _check_object_surfaces(_NoOptional()) + assert result is not None + assert "aliases" not in result + assert "knowledge_sources" not in result + assert "agents" not in result + + def test_optional_raises_omitted_from_counts(self): + client = _MockDpClient(fail_optional="aliases") + result = _check_object_surfaces(client) + assert result is not None + assert "aliases" not in result + + def test_optional_empty_included_in_counts(self): + client = _MockDpClient() + result = _check_object_surfaces(client) + assert result is not None + for key in ("aliases", "knowledge_sources", "agents"): + assert result[key] == 0 + + +class TestEvaluateMetric: + """Unit tests for _evaluate_metric with a fixed window for determinism.""" + + def _zero_mon(self, agg_attr: str, n: int = _WINDOW_DAYS): + dps = _make_datapoints(agg_attr, 0.0, n, window_start=_UNIT_WINDOW_START) + response = _make_metric_response(dps) + return SimpleNamespace(metrics=SimpleNamespace(list=lambda *a, **kw: response)) + + def _active_mon(self, agg_attr: str): + dps = _make_datapoints(agg_attr, 5.0, _WINDOW_DAYS, window_start=_UNIT_WINDOW_START) + response = _make_metric_response(dps) + return SimpleNamespace(metrics=SimpleNamespace(list=lambda *a, **kw: response)) + + def test_zero_average_returns_zero(self): + mon = self._zero_mon("average") + result = 
_evaluate_metric( + mon, + _SVC_ID, + "SearchQueriesPerSecond", + "Average", + _UNIT_WINDOW_START, + _UNIT_WINDOW_END, + ) + assert result == _MetricResult.ZERO + + def test_active_average_returns_active(self): + mon = self._active_mon("average") + result = _evaluate_metric( + mon, + _SVC_ID, + "SearchQueriesPerSecond", + "Average", + _UNIT_WINDOW_START, + _UNIT_WINDOW_END, + ) + assert result == _MetricResult.ACTIVE + + def test_zero_total_returns_zero(self): + mon = self._zero_mon("total") + result = _evaluate_metric( + mon, + _SVC_ID, + "DocumentsProcessedCount", + "Total", + _UNIT_WINDOW_START, + _UNIT_WINDOW_END, + ) + assert result == _MetricResult.ZERO + + def test_active_total_returns_active(self): + mon = self._active_mon("total") + result = _evaluate_metric( + mon, + _SVC_ID, + "DocumentsProcessedCount", + "Total", + _UNIT_WINDOW_START, + _UNIT_WINDOW_END, + ) + assert result == _MetricResult.ACTIVE + + def test_insufficient_coverage_returns_unknown(self): + # Only 50 of 90 expected buckets -> 55% < 95% + mon = self._zero_mon("average", n=50) + result = _evaluate_metric( + mon, + _SVC_ID, + "SearchQueriesPerSecond", + "Average", + _UNIT_WINDOW_START, + _UNIT_WINDOW_END, + ) + assert result == _MetricResult.UNKNOWN + + def test_no_timeseries_returns_unknown(self): + response = _make_no_timeseries_response() + mon = SimpleNamespace(metrics=SimpleNamespace(list=lambda *a, **kw: response)) + result = _evaluate_metric( + mon, + _SVC_ID, + "SearchQueriesPerSecond", + "Average", + _UNIT_WINDOW_START, + _UNIT_WINDOW_END, + ) + assert result == _MetricResult.UNKNOWN + + def test_empty_value_returns_unknown(self): + response = _make_empty_value_response() + mon = SimpleNamespace(metrics=SimpleNamespace(list=lambda *a, **kw: response)) + result = _evaluate_metric( + mon, + _SVC_ID, + "SearchQueriesPerSecond", + "Average", + _UNIT_WINDOW_START, + _UNIT_WINDOW_END, + ) + assert result == _MetricResult.UNKNOWN + + def test_query_exception_returns_unknown(self): + def 
_raise(*a, **kw): + raise RuntimeError("timeout") + + mon = SimpleNamespace(metrics=SimpleNamespace(list=_raise)) + result = _evaluate_metric( + mon, + _SVC_ID, + "SearchQueriesPerSecond", + "Average", + _UNIT_WINDOW_START, + _UNIT_WINDOW_END, + ) + assert result == _MetricResult.UNKNOWN + + def test_timestamp_none_returns_unknown(self): + # Any None timestamp -> fail-closed -> UNKNOWN + dps = _make_datapoints("average", 0.0, _WINDOW_DAYS, window_start=_UNIT_WINDOW_START) + dps[10] = SimpleNamespace(timestamp=None, average=0.0, total=None, maximum=None) + response = _make_metric_response(dps) + mon = SimpleNamespace(metrics=SimpleNamespace(list=lambda *a, **kw: response)) + result = _evaluate_metric( + mon, + _SVC_ID, + "SearchQueriesPerSecond", + "Average", + _UNIT_WINDOW_START, + _UNIT_WINDOW_END, + ) + assert result == _MetricResult.UNKNOWN + + def test_timestamp_not_datetime_returns_unknown(self): + dps = _make_datapoints("average", 0.0, _WINDOW_DAYS, window_start=_UNIT_WINDOW_START) + dps[0] = SimpleNamespace( + timestamp="2024-01-01T00:00:00Z", average=0.0, total=None, maximum=None + ) + response = _make_metric_response(dps) + mon = SimpleNamespace(metrics=SimpleNamespace(list=lambda *a, **kw: response)) + result = _evaluate_metric( + mon, + _SVC_ID, + "SearchQueriesPerSecond", + "Average", + _UNIT_WINDOW_START, + _UNIT_WINDOW_END, + ) + assert result == _MetricResult.UNKNOWN + + def test_datapoints_outside_window_filtered(self): + # Generate 90 datapoints fully outside the window -> 0 observed buckets -> UNKNOWN + future_start = _UNIT_WINDOW_END + timedelta(days=10) + dps = _make_datapoints("average", 0.0, _WINDOW_DAYS, window_start=future_start) + response = _make_metric_response(dps) + mon = SimpleNamespace(metrics=SimpleNamespace(list=lambda *a, **kw: response)) + result = _evaluate_metric( + mon, + _SVC_ID, + "SearchQueriesPerSecond", + "Average", + _UNIT_WINDOW_START, + _UNIT_WINDOW_END, + ) + assert result == _MetricResult.UNKNOWN + + def 
test_exact_coverage_threshold_passes(self): + # Exactly 86 buckets (ceil(90 * 0.95)) -> exactly at threshold -> ZERO + dps = _make_datapoints("average", 0.0, _MIN_BUCKETS, window_start=_UNIT_WINDOW_START) + response = _make_metric_response(dps) + mon = SimpleNamespace(metrics=SimpleNamespace(list=lambda *a, **kw: response)) + result = _evaluate_metric( + mon, + _SVC_ID, + "SearchQueriesPerSecond", + "Average", + _UNIT_WINDOW_START, + _UNIT_WINDOW_END, + ) + assert result == _MetricResult.ZERO + + def test_one_below_coverage_threshold_unknown(self): + # 85 buckets -> 85/90 = 94.4% < 95% + dps = _make_datapoints("average", 0.0, _MIN_BUCKETS - 1, window_start=_UNIT_WINDOW_START) + response = _make_metric_response(dps) + mon = SimpleNamespace(metrics=SimpleNamespace(list=lambda *a, **kw: response)) + result = _evaluate_metric( + mon, + _SVC_ID, + "SearchQueriesPerSecond", + "Average", + _UNIT_WINDOW_START, + _UNIT_WINDOW_END, + ) + assert result == _MetricResult.UNKNOWN + + def test_no_aggregation_value_reduces_coverage(self): + # Datapoints with all None agg values don't contribute to bucket coverage + dps = [] + for i in range(_WINDOW_DAYS): + ts = _UNIT_WINDOW_START + timedelta(days=i, hours=1) + # Only first 50 have real values; rest have None -> reduces observed to 50 + val = 0.0 if i < 50 else None + dps.append(SimpleNamespace(timestamp=ts, average=val, total=None, maximum=None)) + response = _make_metric_response(dps) + mon = SimpleNamespace(metrics=SimpleNamespace(list=lambda *a, **kw: response)) + result = _evaluate_metric( + mon, + _SVC_ID, + "SearchQueriesPerSecond", + "Average", + _UNIT_WINDOW_START, + _UNIT_WINDOW_END, + ) + assert result == _MetricResult.UNKNOWN + + def test_no_interval_parameter_sent(self): + """interval= must NOT be passed to Azure Monitor (spec 9.5.2 source-bucket granularity).""" + captured: dict = {} + + def _list(*args, **kwargs): + captured.update(kwargs) + return _make_metric_response( + _make_datapoints("average", 0.0, 
_WINDOW_DAYS, window_start=_UNIT_WINDOW_START) + ) + + mon = SimpleNamespace(metrics=SimpleNamespace(list=_list)) + _evaluate_metric( + mon, + _SVC_ID, + "SearchQueriesPerSecond", + "Average", + _UNIT_WINDOW_START, + _UNIT_WINDOW_END, + ) + assert "interval" not in captured + + def test_non_numeric_aggregation_string_returns_unknown(self): + """Non-numeric aggregation value -> fail-closed -> UNKNOWN (spec 9.5.6 unusable shape).""" + dps = _make_datapoints("average", 0.0, _WINDOW_DAYS, window_start=_UNIT_WINDOW_START) + dps[5] = SimpleNamespace( + timestamp=_UNIT_WINDOW_START + timedelta(days=5, hours=1), + average="N/A", + total=None, + maximum=None, + ) + response = _make_metric_response(dps) + mon = SimpleNamespace(metrics=SimpleNamespace(list=lambda *a, **kw: response)) + result = _evaluate_metric( + mon, + _SVC_ID, + "SearchQueriesPerSecond", + "Average", + _UNIT_WINDOW_START, + _UNIT_WINDOW_END, + ) + assert result == _MetricResult.UNKNOWN + + def test_non_numeric_aggregation_dict_returns_unknown(self): + dps = _make_datapoints("total", 0.0, _WINDOW_DAYS, window_start=_UNIT_WINDOW_START) + dps[0] = SimpleNamespace( + timestamp=_UNIT_WINDOW_START + timedelta(hours=1), + total={"value": 5}, + average=None, + maximum=None, + ) + response = _make_metric_response(dps) + mon = SimpleNamespace(metrics=SimpleNamespace(list=lambda *a, **kw: response)) + result = _evaluate_metric( + mon, + _SVC_ID, + "DocumentsProcessedCount", + "Total", + _UNIT_WINDOW_START, + _UNIT_WINDOW_END, + ) + assert result == _MetricResult.UNKNOWN + + def test_non_iterable_timeseries_returns_unknown(self): + """timeseries attribute is not None but not iterable -> TypeError -> UNKNOWN.""" + metric = SimpleNamespace(timeseries=42) # int is not iterable + response = SimpleNamespace(value=[metric]) + mon = SimpleNamespace(metrics=SimpleNamespace(list=lambda *a, **kw: response)) + result = _evaluate_metric( + mon, + _SVC_ID, + "SearchQueriesPerSecond", + "Average", + _UNIT_WINDOW_START, + 
_UNIT_WINDOW_END, + ) + assert result == _MetricResult.UNKNOWN + + def test_non_iterable_data_returns_unknown(self): + """ts.data attribute is not None but not iterable -> TypeError -> UNKNOWN.""" + ts_obj = SimpleNamespace(data="malformed") # string iteration would give characters + metric = SimpleNamespace(timeseries=[ts_obj]) + response = SimpleNamespace(value=[metric]) + mon = SimpleNamespace(metrics=SimpleNamespace(list=lambda *a, **kw: response)) + result = _evaluate_metric( + mon, + _SVC_ID, + "SearchQueriesPerSecond", + "Average", + _UNIT_WINDOW_START, + _UNIT_WINDOW_END, + ) + # Each character in "malformed" has no .timestamp attribute -> AttributeError -> UNKNOWN + assert result == _MetricResult.UNKNOWN + + def test_subhourly_source_buckets_detect_activity(self): + """Multiple source buckets per UTC day: any positive bucket makes the day ACTIVE.""" + # Simulate hourly data: 24 buckets per day; one bucket has a spike + dps = [] + for day in range(_WINDOW_DAYS): + for hour in range(24): + ts = _UNIT_WINDOW_START + timedelta(days=day, hours=hour) + # Day 5, hour 3: tiny spike that daily averaging could dilute toward 0 + val = 0.001 if (day == 5 and hour == 3) else 0.0 + dps.append(SimpleNamespace(timestamp=ts, average=val, total=None, maximum=None)) + response = _make_metric_response(dps) + mon = SimpleNamespace(metrics=SimpleNamespace(list=lambda *a, **kw: response)) + result = _evaluate_metric( + mon, + _SVC_ID, + "SearchQueriesPerSecond", + "Average", + _UNIT_WINDOW_START, + _UNIT_WINDOW_END, + ) + assert result == _MetricResult.ACTIVE + + +class TestRuleMetadata: + + def test_rule_id(self): + assert RULE_METADATA["id"] == "azure.ai_search.idle" + + def test_category(self): + assert RULE_METADATA["category"] == "ai" + + def test_service(self): + assert RULE_METADATA["service"] == "search" + + def test_cost_impact(self): + assert RULE_METADATA["cost_impact"] == "high" diff --git a/tests/cleancloud/providers/azure/test_azure_aml_compute_idle.py 
b/tests/cleancloud/providers/azure/test_azure_aml_compute_idle.py index 7cb4092..f004414 100644 --- a/tests/cleancloud/providers/azure/test_azure_aml_compute_idle.py +++ b/tests/cleancloud/providers/azure/test_azure_aml_compute_idle.py @@ -1,804 +1,1825 @@ +"""Tests for azure.aml.compute.idle rule (hardened per spec).""" + +import math from datetime import datetime, timedelta, timezone from types import SimpleNamespace import pytest +from azure.core.exceptions import HttpResponseError, ServiceRequestError + +from cleancloud.providers.azure.rules.ai.aml_compute_idle import ( + RULE_METADATA, + _evaluate_metric, + _extract_resource_group, + _MetricResult, + _norm_location, + _resolve_allocation_state, + _resolve_compute_type, + _resolve_created_at, + _resolve_current_node_count, + _resolve_int_field, + _resolve_min_node_count, + _resolve_provisioning_state, + _resolve_str_field, + _series_is_cluster_scoped, + _to_detail_str, + find_idle_aml_compute, +) -from cleancloud.providers.azure.rules.ai.aml_compute_idle import find_idle_aml_compute +# --------------------------------------------------------------------------- +# Test helpers +# --------------------------------------------------------------------------- -def _make_workspace(name="test-workspace", location="eastus", rg="rg-ml"): +def _make_workspace(name="test-workspace", rg="rg-ml"): ws_id = ( f"/subscriptions/sub-123/resourceGroups/{rg}" f"/providers/Microsoft.MachineLearningServices/workspaces/{name}" ) - return SimpleNamespace(id=ws_id, name=name, location=location) + return SimpleNamespace(id=ws_id, name=name) def _make_compute( name="test-cluster", - vm_size="Standard_D4_v2", + location="eastus", + age_days=30, min_node_count=2, + max_node_count=10, + current_node_count=2, + target_node_count=2, + vm_size="Standard_D4_v2", + vm_priority="Dedicated", compute_type="AmlCompute", - age_days=30, - workspace="test-workspace", + provisioning_state="Succeeded", + allocation_state="Steady", + tags=None, rg="rg-ml", 
+ workspace_name="test-workspace", ): - compute_id = ( - f"/subscriptions/sub-123/resourceGroups/{rg}" - f"/providers/Microsoft.MachineLearningServices/workspaces/{workspace}/computes/{name}" + now = datetime.now(timezone.utc) + created_on = now - timedelta(days=age_days) if age_days is not None else None + + scale = SimpleNamespace( + min_node_count=min_node_count, + max_node_count=max_node_count, + node_idle_time_before_scale_down="PT120S", ) - scale_settings = SimpleNamespace(min_node_count=min_node_count, max_node_count=10) - aml_compute_props = SimpleNamespace(vm_size=vm_size, scale_settings=scale_settings) - compute_obj = SimpleNamespace( + inner = SimpleNamespace( + allocation_state=allocation_state, + scale_settings=scale, + current_node_count=current_node_count, + target_node_count=target_node_count, + vm_size=vm_size, + vm_priority=vm_priority, + ) + outer = SimpleNamespace( compute_type=compute_type, - properties=aml_compute_props, + provisioning_state=provisioning_state, + created_on=created_on, + properties=inner, + ) + compute_id = ( + f"/subscriptions/sub-123/resourceGroups/{rg}" + f"/providers/Microsoft.MachineLearningServices" + f"/workspaces/{workspace_name}/computes/{name}" ) - now = datetime.now(timezone.utc) - created_on = now - timedelta(days=age_days) if age_days is not None else None - # created_on lives on AmlCompute (compute.properties), not on ComputeResource.system_data - compute_obj.created_on = created_on return SimpleNamespace( id=compute_id, name=name, - properties=compute_obj, + location=location, + tags=tags or {}, + properties=outer, ) -def _make_metric_response(max_value: float = 0.0) -> SimpleNamespace: - """Azure Monitor metrics.list() response with a single datapoint.""" - data_point = SimpleNamespace(maximum=max_value) - timeseries = SimpleNamespace(data=[data_point]) - metric = SimpleNamespace(timeseries=[timeseries]) - return SimpleNamespace(value=[metric]) +def _make_cluster_metadata(compute_name: str): + """Build 
metadata_values confirming ClusterName = compute_name on a timeseries.""" + name_obj = SimpleNamespace(value="ClusterName") + return [SimpleNamespace(name=name_obj, value=compute_name)] + + +def _metric_response_zero(compute_name="test-cluster"): + """ + Metric response with ClusterName metadata, sufficient coverage, all max=0. + Evaluates to ZERO for the given compute_name. + """ + now = datetime.now(timezone.utc) + window_start = now - timedelta(days=14) + first_bucket = window_start.replace(hour=0, minute=0, second=0, microsecond=0) + expected = math.ceil((now - first_bucket).total_seconds() / 86400) + + datapoints = [] + for i in range(expected): + noon = first_bucket + timedelta(days=i, hours=12) + ts = max(noon, window_start + timedelta(seconds=1)) + if ts >= now: + ts = now - timedelta(seconds=1) + datapoints.append(SimpleNamespace(timestamp=ts, maximum=0.0)) + + ts_obj = SimpleNamespace(data=datapoints, metadata_values=_make_cluster_metadata(compute_name)) + return SimpleNamespace(value=[SimpleNamespace(timeseries=[ts_obj])]) + + +def _metric_response_active(compute_name="test-cluster"): + """ + Metric response with ClusterName metadata, sufficient coverage, max > 0 on day 0. + Evaluates to ACTIVE for the given compute_name. 
+ """ + now = datetime.now(timezone.utc) + window_start = now - timedelta(days=14) + first_bucket = window_start.replace(hour=0, minute=0, second=0, microsecond=0) + expected = math.ceil((now - first_bucket).total_seconds() / 86400) + + datapoints = [] + for i in range(expected): + noon = first_bucket + timedelta(days=i, hours=12) + ts = max(noon, window_start + timedelta(seconds=1)) + if ts >= now: + ts = now - timedelta(seconds=1) + datapoints.append(SimpleNamespace(timestamp=ts, maximum=(3.0 if i == 0 else 0.0))) + + ts_obj = SimpleNamespace(data=datapoints, metadata_values=_make_cluster_metadata(compute_name)) + return SimpleNamespace(value=[SimpleNamespace(timeseries=[ts_obj])]) -def _make_empty_metric_response() -> SimpleNamespace: - """Azure Monitor returns no timeseries — metric never published (cluster never ran jobs).""" +def _metric_response_unknown(): + """Metric response with no datapoints -> evaluates to UNKNOWN.""" return SimpleNamespace(value=[]) -def _make_clients(workspace, computes, metric_response): +def _make_clients(workspace, computes, metric_fn=None, compute_name="test-cluster"): + """ + Build mock ML and Monitor clients. + + When no explicit metric_fn is provided, the default produces a ZERO response + scoped to compute_name. Pass metric_fn explicitly when the test needs non-default + behaviour or when the compute name differs from the default. 
+ """ + if metric_fn is None: + + def metric_fn(*a, **kw): + return _metric_response_zero(compute_name=compute_name) + ml_client = SimpleNamespace( workspaces=SimpleNamespace(list_by_subscription=lambda: [workspace]), - machine_learning_compute=SimpleNamespace(list_by_workspace=lambda rg, ws_name: computes), + machine_learning_compute=SimpleNamespace(list_by_workspace=lambda rg, ws: computes), ) - monitor_client = SimpleNamespace(metrics=SimpleNamespace(list=lambda *a, **kw: metric_response)) + monitor_client = SimpleNamespace(metrics=SimpleNamespace(list=metric_fn)) return ml_client, monitor_client -# --------------------------------------------------------------------------- -# Core detection -# --------------------------------------------------------------------------- - - -def test_idle_cpu_cluster_detected(): - """Idle CPU cluster with min_node_count > 0 and zero active nodes should be flagged.""" - ws = _make_workspace() - compute = _make_compute(vm_size="Standard_D4_v2", min_node_count=2, age_days=30) - ml_client, mon_client = _make_clients(ws, [compute], _make_metric_response(0.0)) - - findings = find_idle_aml_compute( +def _run(workspace=None, computes=None, metric_fn=None, compute_name="test-cluster", **kwargs): + """Convenience runner for integration tests.""" + ws = workspace or _make_workspace() + c = computes if computes is not None else [_make_compute()] + ml, mon = _make_clients(ws, c, metric_fn, compute_name=compute_name) + return find_idle_aml_compute( subscription_id="sub-123", credential=None, - client=ml_client, - monitor_client=mon_client, + client=ml, + monitor_client=mon, + **kwargs, ) - assert len(findings) == 1 - f = findings[0] - assert f.rule_id == "azure.aml.compute.idle" - assert f.resource_type == "azure.aml.compute" - assert f.confidence.value == "high" - assert f.risk.value == "medium" - assert f.details["is_gpu"] is False - assert f.details["vm_size"] == "Standard_D4_v2" - assert f.details["min_node_count"] == 2 - assert 
f.details["age_days"] == 30 - - -def test_idle_gpu_cluster_detected_high_risk(): - """Idle GPU cluster with min_node_count >= 2 should be flagged as HIGH risk.""" - ws = _make_workspace() - compute = _make_compute(vm_size="Standard_NC6", min_node_count=2, age_days=30) - ml_client, mon_client = _make_clients(ws, [compute], _make_metric_response(0.0)) - - findings = find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - client=ml_client, - monitor_client=mon_client, - ) - assert len(findings) == 1 - f = findings[0] - assert f.risk.value == "high" - assert f.details["is_gpu"] is True - assert f.details["vm_size"] == "Standard_NC6" - assert f.estimated_monthly_cost_usd == 648.0 * 2 +# =========================================================================== +# Integration: TestMustEmit +# =========================================================================== -def test_idle_gpu_cluster_single_node_medium_risk(): - """Idle GPU cluster with min_node_count=1 should be MEDIUM risk (may be dev/test).""" - ws = _make_workspace() - compute = _make_compute(vm_size="Standard_NC6", min_node_count=1, age_days=30) - ml_client, mon_client = _make_clients(ws, [compute], _make_metric_response(0.0)) +class TestMustEmit: + def test_all_conditions_met_emits(self): + findings = _run() + assert len(findings) == 1 - findings = find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - client=ml_client, - monitor_client=mon_client, - ) + def test_finding_rule_id(self): + f = _run()[0] + assert f.rule_id == "azure.aml.compute.idle" - assert len(findings) == 1 - f = findings[0] - assert f.risk.value == "medium" - assert f.details["is_gpu"] is True + def test_finding_resource_type(self): + f = _run()[0] + assert f.resource_type == "azure.aml.compute" + def test_finding_provider(self): + f = _run()[0] + assert f.provider == "azure" -def test_active_cluster_skipped(): - """Cluster with active nodes should NOT be flagged.""" - ws = _make_workspace() - compute 
= _make_compute(age_days=30) - ml_client, mon_client = _make_clients(ws, [compute], _make_metric_response(3.0)) + def test_confidence_always_high(self): + f = _run()[0] + assert f.confidence.value == "high" - findings = find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - client=ml_client, - monitor_client=mon_client, - ) + def test_risk_always_medium(self): + f = _run()[0] + assert f.risk.value == "medium" - assert findings == [] + def test_estimated_cost_always_none(self): + f = _run()[0] + assert f.estimated_monthly_cost_usd is None + def test_resource_id_is_original_arm_id(self): + c = _make_compute(name="my-cluster") + findings = _run(computes=[c], compute_name="my-cluster") + assert findings[0].resource_id == c.id -def test_zero_min_node_count_skipped(): - """Cluster with min_node_count=0 should NOT be flagged — scales to zero, no idle cost.""" - ws = _make_workspace() - compute = _make_compute(min_node_count=0, age_days=30) - ml_client, mon_client = _make_clients(ws, [compute], _make_metric_response(0.0)) + def test_region_is_normalized_compute_location(self): + c = _make_compute(location="EastUS") + findings = _run(computes=[c]) + assert findings[0].region == "eastus" - findings = find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - client=ml_client, - monitor_client=mon_client, - ) + def test_no_computes_returns_empty(self): + assert _run(computes=[]) == [] - assert findings == [] + def test_multiple_eligible_computes_all_emitted(self): + c1 = _make_compute(name="cluster-a", workspace_name="test-workspace") + c2 = _make_compute(name="cluster-b", workspace_name="test-workspace") + def metric_fn(*a, **kw): + # Return response scoped to whichever cluster the filter asks for + f = kw.get("filter", "") + name = f.split("'")[1] if "'" in f else "test-cluster" + return _metric_response_zero(compute_name=name) -def test_non_aml_compute_skipped(): - """Compute instances that are not AmlCompute type (e.g. 
AKS, ComputeInstance) should be skipped.""" - ws = _make_workspace() - compute = _make_compute(compute_type="ComputeInstance", age_days=30) - ml_client, mon_client = _make_clients(ws, [compute], _make_metric_response(0.0)) + findings = _run(computes=[c1, c2], metric_fn=metric_fn) + assert len(findings) == 2 - findings = find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - client=ml_client, - monitor_client=mon_client, - ) + def test_active_metric_skips(self): + findings = _run(metric_fn=lambda *a, **kw: _metric_response_active()) + assert findings == [] - assert findings == [] + def test_unknown_metric_skips(self): + findings = _run(metric_fn=lambda *a, **kw: _metric_response_unknown()) + assert findings == [] -def test_young_cluster_skipped(): - """Cluster younger than minimum threshold should NOT be flagged.""" - ws = _make_workspace() - compute = _make_compute(age_days=3) - ml_client, mon_client = _make_clients(ws, [compute], _make_metric_response(0.0)) +# =========================================================================== +# Integration: TestIdGuard (spec 8.1) +# =========================================================================== - findings = find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - client=ml_client, - monitor_client=mon_client, - ) - assert findings == [] +class TestIdGuard: + def test_id_none_skips(self): + c = _make_compute() + c.id = None + assert _run(computes=[c]) == [] + def test_id_empty_string_skips(self): + c = _make_compute() + c.id = "" + assert _run(computes=[c]) == [] -def test_no_computes_returns_empty(): - """No compute targets should return empty findings.""" - ws = _make_workspace() - ml_client, mon_client = _make_clients(ws, [], _make_metric_response(0.0)) + def test_id_absent_skips(self): + c = _make_compute() + del c.id + assert _run(computes=[c]) == [] - findings = find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - client=ml_client, - 
monitor_client=mon_client, - ) - assert findings == [] +# =========================================================================== +# Integration: TestNameGuard (spec 8.2) +# =========================================================================== -# --------------------------------------------------------------------------- -# Empty metric series (cluster never ran any jobs) -# --------------------------------------------------------------------------- +class TestNameGuard: + def test_name_none_skips(self): + c = _make_compute() + c.name = None + assert _run(computes=[c]) == [] + def test_name_empty_string_skips(self): + c = _make_compute() + c.name = "" + assert _run(computes=[c]) == [] -def test_empty_metric_series_assumes_active(): - """Old cluster with no Azure Monitor data at all is assumed active (conservative). + def test_name_absent_skips(self): + c = _make_compute() + del c.name + assert _run(computes=[c]) == [] - Without a dimension-filtered timeseries confirming zero activity, we cannot - safely conclude the cluster is idle — metrics may simply not be published yet - or the metric name may have changed. Skip to avoid false positives. 
- """ - ws = _make_workspace() - compute = _make_compute(age_days=30) - ml_client, mon_client = _make_clients(ws, [compute], _make_empty_metric_response()) - findings = find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - client=ml_client, - monitor_client=mon_client, - ) +# =========================================================================== +# Integration: TestWorkspaceNameGuard (spec 8.3) +# =========================================================================== - assert findings == [] +class TestWorkspaceNameGuard: + def test_workspace_name_none_skips(self): + ws = _make_workspace() + ws.name = None + assert _run(workspace=ws) == [] -# --------------------------------------------------------------------------- -# Effective window (age vs idle period) -# --------------------------------------------------------------------------- + def test_workspace_name_empty_skips(self): + ws = _make_workspace() + ws.name = "" + assert _run(workspace=ws) == [] + def test_workspace_id_missing_resource_group_skips(self): + ws = _make_workspace() + ws.id = "/subscriptions/sub-123/no-rg-here" + assert _run(workspace=ws) == [] -def test_effective_window_capped_to_age(): - """For a cluster younger than days_idle, the effective window is capped to age.""" - ws = _make_workspace() - compute = _make_compute(age_days=12) # age < days_idle=14 - ml_client, mon_client = _make_clients(ws, [compute], _make_metric_response(0.0)) + def test_workspace_id_none_skips(self): + ws = _make_workspace() + ws.id = None + assert _run(workspace=ws) == [] - findings = find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - client=ml_client, - monitor_client=mon_client, - ) - assert len(findings) == 1 - assert findings[0].details["idle_window_days"] == 12 - assert findings[0].details["idle_days_threshold"] == 14 +# =========================================================================== +# Integration: TestRegionFilter (spec 8.4) +# 
=========================================================================== -def test_very_small_idle_days_clamped_to_3(): - """idle_days below 3 is clamped to 3 (matching the effective_window < 3 guard). +class TestRegionFilter: + def test_no_filter_emits(self): + assert len(_run(region_filter=None)) == 1 - Setup: idle_days=2 -> clamped to 3, age=8 - - Age guard: 8 >= max(3//2=1, 7) = 7 -> passes - - effective_window = min(3, 8) = 3 -> proceeds (not skipped) - """ - ws = _make_workspace() - compute = _make_compute(age_days=8) - ml_client, mon_client = _make_clients(ws, [compute], _make_metric_response(0.0)) + def test_matching_location_emits(self): + c = _make_compute(location="eastus") + assert len(_run(computes=[c], region_filter="eastus")) == 1 - findings = find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - client=ml_client, - monitor_client=mon_client, - idle_days=2, - ) + def test_non_matching_location_skips(self): + c = _make_compute(location="eastus") + assert _run(computes=[c], region_filter="westeurope") == [] - assert len(findings) == 1 + def test_filter_case_insensitive_match(self): + c = _make_compute(location="EastUS") + assert len(_run(computes=[c], region_filter="eastus")) == 1 + def test_filter_case_insensitive_filter_value(self): + c = _make_compute(location="eastus") + assert len(_run(computes=[c], region_filter="EASTUS")) == 1 -# --------------------------------------------------------------------------- -# Confidence levels -# --------------------------------------------------------------------------- + def test_spaces_preserved_in_location_no_match(self): + # "east us" != "eastus" — spaces are NOT stripped (spec 7) + c = _make_compute(location="east us") + assert _run(computes=[c], region_filter="eastus") == [] + def test_spaces_preserved_match_when_filter_also_has_spaces(self): + c = _make_compute(location="east us") + assert len(_run(computes=[c], region_filter="east us")) == 1 -def 
test_high_confidence_for_old_cluster(): - """Cluster older than days_idle should be HIGH confidence.""" - ws = _make_workspace() - compute = _make_compute(age_days=30) - ml_client, mon_client = _make_clients(ws, [compute], _make_metric_response(0.0)) + def test_hyphens_preserved_no_match(self): + c = _make_compute(location="east-us") + assert _run(computes=[c], region_filter="eastus") == [] - findings = find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - client=ml_client, - monitor_client=mon_client, - ) + def test_region_filter_on_compute_location_not_workspace_location(self): + ws = _make_workspace() + c = _make_compute(location="westus") + ml, mon = _make_clients(ws, [c]) + findings = find_idle_aml_compute( + subscription_id="sub-123", + credential=None, + region_filter="westus", + client=ml, + monitor_client=mon, + ) + assert len(findings) == 1 - assert findings[0].confidence.value == "high" + def test_region_stored_normalized_in_finding(self): + c = _make_compute(location="WestEurope") + findings = _run(computes=[c], region_filter="westeurope") + assert findings[0].region == "westeurope" -def test_medium_confidence_for_borderline_age(): - """Cluster at 75% of idle threshold should be MEDIUM confidence.""" - ws = _make_workspace() - # age=11, int(14 * 0.75)=10 -> 11 >= 10 -> MEDIUM - compute = _make_compute(age_days=11) - ml_client, mon_client = _make_clients(ws, [compute], _make_metric_response(0.0)) +# =========================================================================== +# Integration: TestComputeTypeContract (spec 8.5) +# =========================================================================== - findings = find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - client=ml_client, - monitor_client=mon_client, - ) - assert len(findings) == 1 - assert findings[0].confidence.value == "medium" +class TestComputeTypeContract: + def test_aml_compute_emits(self): + c = _make_compute(compute_type="AmlCompute") + assert 
len(_run(computes=[c])) == 1 + def test_compute_instance_skips(self): + c = _make_compute(compute_type="ComputeInstance") + assert _run(computes=[c]) == [] -def test_borderline_age_below_threshold_skipped(): - """Cluster below the 75% confidence threshold should be skipped.""" - ws = _make_workspace() - # age=8, int(14 * 0.75)=10 -> 8 < 10 -> skip - compute = _make_compute(age_days=8) - ml_client, mon_client = _make_clients(ws, [compute], _make_metric_response(0.0)) + def test_aks_skips(self): + c = _make_compute(compute_type="AKS") + assert _run(computes=[c]) == [] - findings = find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - client=ml_client, - monitor_client=mon_client, - ) + def test_wrong_case_skips(self): + c = _make_compute(compute_type="amlcompute") + assert _run(computes=[c]) == [] - assert findings == [] + def test_none_compute_type_skips(self): + c = _make_compute() + c.properties.compute_type = None + assert _run(computes=[c]) == [] + def test_conflict_sdk_raw_skips(self): + c = _make_compute() + c.properties.compute_type = "AmlCompute" + c.properties.computeType = "ComputeInstance" + assert _run(computes=[c]) == [] -def test_medium_confidence_when_no_creation_time(): - """Cluster with unknown age should be MEDIUM confidence — can't rule out recent creation.""" - ws = _make_workspace() - compute = _make_compute(age_days=None) # no creation time — created_on is None on properties - ml_client, mon_client = _make_clients(ws, [compute], _make_metric_response(0.0)) + def test_raw_camel_case_field_accepted(self): + c = _make_compute() + del c.properties.compute_type + c.properties.computeType = "AmlCompute" + assert len(_run(computes=[c])) == 1 - findings = find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - client=ml_client, - monitor_client=mon_client, - ) - assert len(findings) == 1 - assert findings[0].confidence.value == "medium" - assert findings[0].details["age_days"] == "unknown" +# 
=========================================================================== +# Integration: TestProvisioningStateContract (spec 8.6) +# =========================================================================== -# --------------------------------------------------------------------------- -# GPU family detection and cost estimation -# --------------------------------------------------------------------------- +class TestProvisioningStateContract: + def test_succeeded_emits(self): + c = _make_compute(provisioning_state="Succeeded") + assert len(_run(computes=[c])) == 1 + def test_failed_skips(self): + c = _make_compute(provisioning_state="Failed") + assert _run(computes=[c]) == [] -def test_nc_series_detected_as_gpu(): - ws = _make_workspace() - compute = _make_compute(vm_size="Standard_NC12", min_node_count=2, age_days=30) - ml_client, mon_client = _make_clients(ws, [compute], _make_metric_response(0.0)) + def test_creating_skips(self): + c = _make_compute(provisioning_state="Creating") + assert _run(computes=[c]) == [] - findings = find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - client=ml_client, - monitor_client=mon_client, - ) + def test_wrong_case_skips(self): + c = _make_compute(provisioning_state="succeeded") + assert _run(computes=[c]) == [] - assert findings[0].details["is_gpu"] is True - assert findings[0].risk.value == "high" - assert findings[0].estimated_monthly_cost_usd == 1_296.0 * 2 + def test_none_skips(self): + c = _make_compute() + c.properties.provisioning_state = None + assert _run(computes=[c]) == [] + def test_conflict_skips(self): + c = _make_compute() + c.properties.provisioning_state = "Succeeded" + c.properties.provisioningState = "Failed" + assert _run(computes=[c]) == [] -def test_nd_series_detected_as_gpu(): - """ND-series (deep learning) should be classified as GPU-class.""" - ws = _make_workspace() - compute = _make_compute(vm_size="Standard_ND40rs_v2", min_node_count=1, age_days=30) - ml_client, mon_client = 
_make_clients(ws, [compute], _make_metric_response(0.0)) + def test_raw_camel_case_accepted(self): + c = _make_compute() + del c.properties.provisioning_state + c.properties.provisioningState = "Succeeded" + assert len(_run(computes=[c])) == 1 - findings = find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - client=ml_client, - monitor_client=mon_client, - ) - assert findings[0].details["is_gpu"] is True - assert findings[0].estimated_monthly_cost_usd == 15_862.0 +# =========================================================================== +# Integration: TestAllocationStateContract (spec 8.7) +# =========================================================================== + + +class TestAllocationStateContract: + def test_steady_emits(self): + c = _make_compute(allocation_state="Steady") + assert len(_run(computes=[c])) == 1 + + def test_resizing_skips(self): + c = _make_compute(allocation_state="Resizing") + assert _run(computes=[c]) == [] + + def test_scaling_skips(self): + c = _make_compute(allocation_state="Scaling") + assert _run(computes=[c]) == [] + def test_wrong_case_skips(self): + c = _make_compute(allocation_state="steady") + assert _run(computes=[c]) == [] + + def test_none_skips(self): + c = _make_compute() + c.properties.properties.allocation_state = None + assert _run(computes=[c]) == [] + + def test_conflict_skips(self): + c = _make_compute() + c.properties.properties.allocation_state = "Steady" + c.properties.properties.allocationState = "Resizing" + assert _run(computes=[c]) == [] + + def test_raw_camel_case_accepted(self): + c = _make_compute() + del c.properties.properties.allocation_state + c.properties.properties.allocationState = "Steady" + assert len(_run(computes=[c])) == 1 + + def test_inner_props_absent_skips(self): + c = _make_compute() + c.properties.properties = None + assert _run(computes=[c]) == [] + + +# =========================================================================== +# Integration: 
TestCreatedAtContract (spec 8.8) +# =========================================================================== + + +class TestCreatedAtContract: + def test_age_exactly_14_days_emits(self): + c = _make_compute(age_days=14) + assert len(_run(computes=[c])) == 1 + + def test_age_greater_than_14_days_emits(self): + c = _make_compute(age_days=30) + assert len(_run(computes=[c])) == 1 + + def test_age_13_days_skips(self): + c = _make_compute(age_days=13) + assert _run(computes=[c]) == [] + + def test_age_zero_skips(self): + c = _make_compute(age_days=0) + assert _run(computes=[c]) == [] + + def test_created_on_none_skips(self): + c = _make_compute() + c.properties.created_on = None + assert _run(computes=[c]) == [] + + def test_created_on_absent_skips(self): + c = _make_compute() + del c.properties.created_on + assert _run(computes=[c]) == [] + + def test_created_on_future_skips(self): + c = _make_compute() + c.properties.created_on = datetime.now(timezone.utc) + timedelta(days=1) + assert _run(computes=[c]) == [] + + def test_created_on_invalid_string_skips(self): + c = _make_compute() + c.properties.created_on = "not-a-date" + assert _run(computes=[c]) == [] + + def test_created_on_iso_string_parsed(self): + now = datetime.now(timezone.utc) + c = _make_compute() + c.properties.created_on = (now - timedelta(days=30)).isoformat() + assert len(_run(computes=[c])) == 1 + + def test_absent_created_at_skips_no_fallback(self): + # spec: absent created_at -> skip (no MEDIUM confidence fallback) + c = _make_compute() + c.properties.created_on = None + assert _run(computes=[c]) == [] + + def test_camel_case_created_on_accepted(self): + now = datetime.now(timezone.utc) + c = _make_compute() + del c.properties.created_on + c.properties.createdOn = now - timedelta(days=30) + assert len(_run(computes=[c])) == 1 + + +# =========================================================================== +# Integration: TestMinNodeCountContract (spec 8.9) +# 
=========================================================================== + + +class TestMinNodeCountContract: + def test_positive_min_emits(self): + c = _make_compute(min_node_count=1) + assert len(_run(computes=[c])) == 1 + + def test_min_node_count_zero_skips(self): + c = _make_compute(min_node_count=0) + assert _run(computes=[c]) == [] + + def test_min_node_count_negative_skips(self): + c = _make_compute(min_node_count=-1) + assert _run(computes=[c]) == [] + + def test_scale_settings_none_skips(self): + c = _make_compute() + c.properties.properties.scale_settings = None + assert _run(computes=[c]) == [] + + def test_min_node_count_none_skips(self): + c = _make_compute() + c.properties.properties.scale_settings.min_node_count = None + assert _run(computes=[c]) == [] + + def test_raw_scale_settings_camel_case_accepted(self): + # scaleSettings (raw camelCase) used when scale_settings absent + c = _make_compute() + del c.properties.properties.scale_settings + c.properties.properties.scaleSettings = SimpleNamespace( + min_node_count=2, max_node_count=10, node_idle_time_before_scale_down="PT120S" + ) + assert len(_run(computes=[c])) == 1 -def test_cost_scales_with_min_node_count(): - """Cost estimate should be min_node_count × monthly cost per node.""" - ws = _make_workspace() - # Standard_D4_v2 = $259/month, min_node_count=3 -> $777/month - compute = _make_compute(vm_size="Standard_D4_v2", min_node_count=3, age_days=30) - ml_client, mon_client = _make_clients(ws, [compute], _make_metric_response(0.0)) + def test_raw_min_node_count_camel_case_accepted(self): + # minNodeCount (raw) used when min_node_count absent on scale_settings; + # use current_node_count=3 so current >= raw min_node_count=3 + c = _make_compute(current_node_count=3) + del c.properties.properties.scale_settings.min_node_count + c.properties.properties.scale_settings.minNodeCount = 3 + assert len(_run(computes=[c])) == 1 - findings = find_idle_aml_compute( - subscription_id="sub-123", - 
credential=None, - client=ml_client, - monitor_client=mon_client, - ) + def test_raw_min_node_count_zero_still_skips(self): + c = _make_compute() + del c.properties.properties.scale_settings.min_node_count + c.properties.properties.scale_settings.minNodeCount = 0 + assert _run(computes=[c]) == [] - assert findings[0].estimated_monthly_cost_usd == 259.0 * 3 +# =========================================================================== +# Integration: TestCurrentNodeCountContract (spec 8.10) +# =========================================================================== -def test_unknown_vm_size_uses_default_cost(): - """Unknown VM size should use the default cost estimate, not crash.""" - ws = _make_workspace() - compute = _make_compute(vm_size="Standard_FutureSeries_v99", min_node_count=2, age_days=30) - ml_client, mon_client = _make_clients(ws, [compute], _make_metric_response(0.0)) - findings = find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - client=ml_client, - monitor_client=mon_client, - ) +class TestCurrentNodeCountContract: + def test_current_equals_min_emits(self): + c = _make_compute(min_node_count=2, current_node_count=2) + assert len(_run(computes=[c])) == 1 - assert len(findings) == 1 - assert findings[0].estimated_monthly_cost_usd == 200.0 * 2 # default × min_nodes + def test_current_exceeds_min_emits(self): + c = _make_compute(min_node_count=2, current_node_count=5) + assert len(_run(computes=[c])) == 1 + def test_current_zero_min_two_skips(self): + c = _make_compute(min_node_count=2, current_node_count=0) + assert _run(computes=[c]) == [] -# --------------------------------------------------------------------------- -# Region filter -# --------------------------------------------------------------------------- + def test_current_less_than_min_skips(self): + c = _make_compute(min_node_count=3, current_node_count=2) + assert _run(computes=[c]) == [] + def test_current_none_skips(self): + c = _make_compute() + 
c.properties.properties.current_node_count = None + assert _run(computes=[c]) == [] -def test_region_filter_excludes_other_regions(): - """Clusters in a different location than region_filter should be skipped.""" - ws = _make_workspace(location="westeurope") - compute = _make_compute(age_days=30) - ml_client, mon_client = _make_clients(ws, [compute], _make_metric_response(0.0)) + def test_current_negative_skips(self): + c = _make_compute() + c.properties.properties.current_node_count = -1 + assert _run(computes=[c]) == [] - findings = find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - region_filter="eastus", - client=ml_client, - monitor_client=mon_client, - ) + def test_raw_current_node_count_camel_case_accepted(self): + # currentNodeCount (raw) used when current_node_count absent + c = _make_compute() + del c.properties.properties.current_node_count + c.properties.properties.currentNodeCount = 2 + assert len(_run(computes=[c])) == 1 - assert findings == [] + def test_raw_current_node_count_negative_skips(self): + c = _make_compute() + del c.properties.properties.current_node_count + c.properties.properties.currentNodeCount = -1 + assert _run(computes=[c]) == [] -def test_region_filter_matches_normalised(): - """Region filter should match after normalisation (spaces/dashes stripped).""" - ws = _make_workspace(location="East US") # raw Azure location name - compute = _make_compute(age_days=30) - ml_client, mon_client = _make_clients(ws, [compute], _make_metric_response(0.0)) +# =========================================================================== +# Integration: TestMetricContract (spec 8.11-8.12, 9.3) +# =========================================================================== - findings = find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - region_filter="eastus", - client=ml_client, - monitor_client=mon_client, - ) - assert len(findings) == 1 - # Original location string preserved in finding, not normalised - 
assert findings[0].region == "East US" +class TestMetricContract: + def test_metric_zero_emits(self): + assert len(_run(metric_fn=lambda *a, **kw: _metric_response_zero())) == 1 + def test_metric_active_skips(self): + assert _run(metric_fn=lambda *a, **kw: _metric_response_active()) == [] -# --------------------------------------------------------------------------- -# Resilience -# --------------------------------------------------------------------------- + def test_metric_unknown_skips(self): + assert _run(metric_fn=lambda *a, **kw: _metric_response_unknown()) == [] + def test_monitor_raises_exception_skips(self): + def _raise(*a, **kw): + raise RuntimeError("monitor unavailable") -def test_monitor_failure_treated_as_active(): - """If Azure Monitor metrics fail, cluster should NOT be flagged (avoid false positives).""" + assert _run(metric_fn=_raise) == [] - def _raise(*args, **kwargs): - raise RuntimeError("Monitor unavailable") + def test_metric_filter_uses_cluster_name_dimension(self): + captured = {} - ws = _make_workspace() - compute = _make_compute(age_days=30) - ml_client = SimpleNamespace( - workspaces=SimpleNamespace(list_by_subscription=lambda: [ws]), - machine_learning_compute=SimpleNamespace(list_by_workspace=lambda rg, ws_name: [compute]), - ) - mon_client = SimpleNamespace(metrics=SimpleNamespace(list=_raise)) + def _capture(*a, **kw): + captured.update(kw) + return _metric_response_zero() - findings = find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - client=ml_client, - monitor_client=mon_client, - ) + _run(metric_fn=_capture) + assert "filter" in captured + assert "ClusterName" in captured["filter"] - assert findings == [] + def test_metric_filter_uses_compute_name(self): + captured = {} + def _capture(*a, **kw): + captured.update(kw) + return _metric_response_zero(compute_name="my-special-cluster") -def test_permission_error_raised(): - """AuthorizationFailed should raise PermissionError with required permission names.""" 
+ c = _make_compute(name="my-special-cluster") + _run(computes=[c], metric_fn=_capture) + assert "my-special-cluster" in captured["filter"] - def _raise(*args, **kwargs): - raise Exception("AuthorizationFailed: The client does not have authorization") + def test_no_interval_parameter_passed(self): + captured = {} - ml_client = SimpleNamespace( - workspaces=SimpleNamespace(list_by_subscription=_raise), - ) - mon_client = SimpleNamespace() + def _capture(*a, **kw): + captured.update(kw) + return _metric_response_zero() - try: - find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - client=ml_client, - monitor_client=mon_client, + _run(metric_fn=_capture) + assert "interval" not in captured + + def test_metric_name_is_active_nodes(self): + captured = {} + + def _capture(*a, **kw): + captured.update(kw) + return _metric_response_zero() + + _run(metric_fn=_capture) + assert captured.get("metricnames") == "Active Nodes" + + def test_aggregation_is_maximum(self): + captured = {} + + def _capture(*a, **kw): + captured.update(kw) + return _metric_response_zero() + + _run(metric_fn=_capture) + assert captured.get("aggregation") == "Maximum" + + def test_series_without_metadata_causes_unknown(self): + # Timeseries with no metadata_values -> not cluster-scoped -> UNKNOWN -> skip + now = datetime.now(timezone.utc) + window_start = now - timedelta(days=14) + first_bucket = window_start.replace(hour=0, minute=0, second=0, microsecond=0) + expected = math.ceil((now - first_bucket).total_seconds() / 86400) + + datapoints = [] + for i in range(expected): + noon = first_bucket + timedelta(days=i, hours=12) + ts = max(noon, window_start + timedelta(seconds=1)) + if ts >= now: + ts = now - timedelta(seconds=1) + datapoints.append(SimpleNamespace(timestamp=ts, maximum=0.0)) + + ts_obj = SimpleNamespace(data=datapoints) # no metadata_values + response = SimpleNamespace(value=[SimpleNamespace(timeseries=[ts_obj])]) + + assert _run(metric_fn=lambda *a, **kw: response) == 
[] + + def test_series_with_wrong_cluster_name_causes_unknown(self): + # Timeseries with ClusterName="other-cluster" != "test-cluster" -> not cluster-scoped + now = datetime.now(timezone.utc) + window_start = now - timedelta(days=14) + first_bucket = window_start.replace(hour=0, minute=0, second=0, microsecond=0) + expected = math.ceil((now - first_bucket).total_seconds() / 86400) + + datapoints = [] + for i in range(expected): + noon = first_bucket + timedelta(days=i, hours=12) + ts = max(noon, window_start + timedelta(seconds=1)) + if ts >= now: + ts = now - timedelta(seconds=1) + datapoints.append(SimpleNamespace(timestamp=ts, maximum=0.0)) + + ts_obj = SimpleNamespace( + data=datapoints, metadata_values=_make_cluster_metadata("other-cluster") + ) + response = SimpleNamespace(value=[SimpleNamespace(timeseries=[ts_obj])]) + + assert _run(metric_fn=lambda *a, **kw: response) == [] + + def test_mixed_series_only_cluster_scoped_counted(self): + # Series for target cluster (max=0) + series for another cluster (max=99). + # Only the cluster-scoped series counts -> result is ZERO -> emit. 
+ now = datetime.now(timezone.utc) + window_start = now - timedelta(days=14) + first_bucket = window_start.replace(hour=0, minute=0, second=0, microsecond=0) + expected = math.ceil((now - first_bucket).total_seconds() / 86400) + + def _make_dps(max_val=0.0): + dps = [] + for i in range(expected): + noon = first_bucket + timedelta(days=i, hours=12) + ts = max(noon, window_start + timedelta(seconds=1)) + if ts >= now: + ts = now - timedelta(seconds=1) + dps.append(SimpleNamespace(timestamp=ts, maximum=max_val)) + return dps + + ts_target = SimpleNamespace( + data=_make_dps(0.0), metadata_values=_make_cluster_metadata("test-cluster") + ) + ts_other = SimpleNamespace( + data=_make_dps(99.0), metadata_values=_make_cluster_metadata("other-cluster") ) - assert False, "Expected PermissionError" - except PermissionError as e: - assert "Microsoft.MachineLearningServices/workspaces/read" in str(e) + response = SimpleNamespace(value=[SimpleNamespace(timeseries=[ts_target, ts_other])]) + assert len(_run(metric_fn=lambda *a, **kw: response)) == 1 -# --------------------------------------------------------------------------- -# Metric fallback strategy -# --------------------------------------------------------------------------- +# =========================================================================== +# Integration: TestFailureBehavior (spec 12) +# =========================================================================== -def test_idle_signal_includes_metric_name(): - """The winning metric name should appear in the evidence signal for debuggability.""" - ws = _make_workspace() - compute = _make_compute(age_days=30) - ml_client, mon_client = _make_clients(ws, [compute], _make_metric_response(0.0)) - findings = find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - client=ml_client, - monitor_client=mon_client, - ) +class TestFailureBehavior: + def test_per_workspace_http_error_skips_workspace(self): + ws = _make_workspace() - assert len(findings) == 1 - 
signal_text = " ".join(findings[0].evidence.signals_used) - assert "Active Nodes" in signal_text # metric name surfaced for debuggability + def _fail(rg, ws_name): + raise HttpResponseError("workspace compute list failed") + ml_client = SimpleNamespace( + workspaces=SimpleNamespace(list_by_subscription=lambda: [ws]), + machine_learning_compute=SimpleNamespace(list_by_workspace=_fail), + ) + mon_client = SimpleNamespace( + metrics=SimpleNamespace(list=lambda *a, **kw: _metric_response_zero()) + ) + findings = find_idle_aml_compute( + subscription_id="sub-123", credential=None, client=ml_client, monitor_client=mon_client + ) + assert findings == [] -def test_fallback_to_nodecount_when_active_nodes_unavailable(): - """If 'Active Nodes' returns no dimension-filtered timeseries, 'NodeCount' is tried. + def test_per_workspace_service_request_error_skips_workspace(self): + ws = _make_workspace() - NodeCount with ComputeName filter returning all-zero is a reliable idle signal. - """ - ws = _make_workspace() - compute = _make_compute(age_days=30) + def _fail(rg, ws_name): + raise ServiceRequestError("network error") - call_args = [] + ml_client = SimpleNamespace( + workspaces=SimpleNamespace(list_by_subscription=lambda: [ws]), + machine_learning_compute=SimpleNamespace(list_by_workspace=_fail), + ) + mon_client = SimpleNamespace( + metrics=SimpleNamespace(list=lambda *a, **kw: _metric_response_zero()) + ) + findings = find_idle_aml_compute( + subscription_id="sub-123", credential=None, client=ml_client, monitor_client=mon_client + ) + assert findings == [] - def _mock_metrics_list(*args, **kwargs): - metric_name = kwargs.get("metricnames", "") - has_filter = "filter" in kwargs - call_args.append((metric_name, has_filter)) - if metric_name == "Active Nodes": - return _make_empty_metric_response() # not available (filtered or unfiltered) - if metric_name == "NodeCount" and has_filter: - return _make_metric_response(0.0) # dimension-filtered zero -> confirmed idle - return 
_make_empty_metric_response() + def test_good_workspace_preserved_when_other_fails(self): + ws_good = _make_workspace(name="good-ws", rg="rg-good") + ws_bad = _make_workspace(name="bad-ws", rg="rg-bad") + good_compute = _make_compute(workspace_name="good-ws", rg="rg-good") - ml_client = SimpleNamespace( - workspaces=SimpleNamespace(list_by_subscription=lambda: [ws]), - machine_learning_compute=SimpleNamespace(list_by_workspace=lambda rg, ws_name: [compute]), - ) - mon_client = SimpleNamespace(metrics=SimpleNamespace(list=_mock_metrics_list)) + def _compute_list(rg, ws_name): + if ws_name == "bad-ws": + raise HttpResponseError("bad workspace") + return [good_compute] - findings = find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - client=ml_client, - monitor_client=mon_client, - ) + ml_client = SimpleNamespace( + workspaces=SimpleNamespace(list_by_subscription=lambda: [ws_good, ws_bad]), + machine_learning_compute=SimpleNamespace(list_by_workspace=_compute_list), + ) + mon_client = SimpleNamespace( + metrics=SimpleNamespace(list=lambda *a, **kw: _metric_response_zero()) + ) + findings = find_idle_aml_compute( + subscription_id="sub-123", credential=None, client=ml_client, monitor_client=mon_client + ) + assert len(findings) == 1 + assert findings[0].details["workspace_name"] == "good-ws" - assert len(findings) == 1 # flagged via dimension-filtered NodeCount - assert any("Active Nodes" in a[0] for a in call_args) - assert any("NodeCount" in a[0] for a in call_args) + def test_workspace_list_failure_propagates(self): + def _fail(): + raise HttpResponseError("subscription list failed") + ml_client = SimpleNamespace( + workspaces=SimpleNamespace(list_by_subscription=_fail), + machine_learning_compute=SimpleNamespace(), + ) + mon_client = SimpleNamespace() + with pytest.raises(HttpResponseError): + find_idle_aml_compute( + subscription_id="sub-123", + credential=None, + client=ml_client, + monitor_client=mon_client, + ) + + def 
test_per_compute_http_error_skips_compute_continues(self): + """HttpResponseError during compute property access skips that compute.""" + + class _FailingCompute: + id = "/subscriptions/sub-123/resourceGroups/rg/providers/ML/workspaces/ws/computes/x" + name = "x" + location = "eastus" + tags = {} + + @property + def properties(self): + raise HttpResponseError("compute SDK error") + + ws = _make_workspace() + good_compute = _make_compute(name="test-cluster") + ml_client = SimpleNamespace( + workspaces=SimpleNamespace(list_by_subscription=lambda: [ws]), + machine_learning_compute=SimpleNamespace( + list_by_workspace=lambda rg, ws_n: [_FailingCompute(), good_compute] + ), + ) + mon_client = SimpleNamespace( + metrics=SimpleNamespace(list=lambda *a, **kw: _metric_response_zero()) + ) + findings = find_idle_aml_compute( + subscription_id="sub-123", credential=None, client=ml_client, monitor_client=mon_client + ) + assert len(findings) == 1 + assert findings[0].details["cluster_name"] == "test-cluster" + + def test_malformed_compute_no_properties_skips(self): + c = _make_compute() + del c.properties + assert _run(computes=[c]) == [] + + +# =========================================================================== +# Integration: TestFindingShape (spec 11) +# =========================================================================== + + +class TestFindingShape: + def _finding(self): + ws = _make_workspace(name="ws1", rg="rg1") + c = _make_compute( + name="cluster1", + location="eastus", + min_node_count=3, + max_node_count=10, + current_node_count=3, + target_node_count=3, + vm_size="Standard_D8_v3", + vm_priority="Dedicated", + age_days=30, + workspace_name="ws1", + rg="rg1", + tags={"env": "prod"}, + ) + ml, mon = _make_clients(ws, [c], compute_name="cluster1") + findings = find_idle_aml_compute( + subscription_id="sub-123", credential=None, client=ml, monitor_client=mon + ) + return findings[0] -def test_dimension_filter_fallback_to_unfiltered(): - """If filtered 
query returns no timeseries, unfiltered retry should be attempted. + def test_details_cluster_name(self): + assert self._finding().details["cluster_name"] == "cluster1" - Workspace-level zero is UNKNOWN (not idle) — one active cluster can hide idle ones. - When all metrics return no reliable per-cluster signal, assume active (conservative). - """ - ws = _make_workspace() - compute = _make_compute(age_days=30) + def test_details_workspace_name(self): + assert self._finding().details["workspace_name"] == "ws1" - call_kwargs = [] + def test_details_resource_group(self): + assert self._finding().details["resource_group"] == "rg1" - def _mock_metrics_list(*args, **kwargs): - call_kwargs.append(dict(kwargs)) - if "filter" in kwargs: - return _make_empty_metric_response() # filter dimension not supported - return _make_metric_response(0.0) # unfiltered zero = unknown, not idle + def test_details_subscription_id(self): + assert self._finding().details["subscription_id"] == "sub-123" - ml_client = SimpleNamespace( - workspaces=SimpleNamespace(list_by_subscription=lambda: [ws]), - machine_learning_compute=SimpleNamespace(list_by_workspace=lambda rg, ws_name: [compute]), - ) - mon_client = SimpleNamespace(metrics=SimpleNamespace(list=_mock_metrics_list)) + def test_details_vm_size(self): + assert self._finding().details["vm_size"] == "Standard_D8_v3" - findings = find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - client=ml_client, - monitor_client=mon_client, - ) + def test_details_vm_priority(self): + assert self._finding().details["vm_priority"] == "Dedicated" - # Workspace-level zero is not enough — no reliable per-cluster signal -> skipped - assert findings == [] - # Verify both filtered and unfiltered calls were made - assert any("filter" in kw for kw in call_kwargs) - assert any("filter" not in kw for kw in call_kwargs) + def test_details_min_node_count(self): + assert self._finding().details["min_node_count"] == 3 + def 
test_details_max_node_count(self): + assert self._finding().details["max_node_count"] == 10 -def test_unfiltered_active_workspace_causes_skip(): - """If unfiltered fallback shows activity (other computes active), cluster should be skipped. + def test_details_current_node_count(self): + assert self._finding().details["current_node_count"] == 3 - When the dimension filter is unsupported and the unfiltered workspace query - shows active nodes, we cannot determine if our specific cluster is idle. - Conservative: skip (avoid false positive). - """ - ws = _make_workspace() - compute = _make_compute(age_days=30) + def test_details_target_node_count(self): + assert self._finding().details["target_node_count"] == 3 - def _mock_metrics_list(*args, **kwargs): - if "filter" in kwargs: - return _make_empty_metric_response() # filter not supported - return _make_metric_response(5.0) # workspace-level activity detected + def test_details_allocation_state(self): + assert self._finding().details["allocation_state"] == "Steady" - ml_client = SimpleNamespace( - workspaces=SimpleNamespace(list_by_subscription=lambda: [ws]), - machine_learning_compute=SimpleNamespace(list_by_workspace=lambda rg, ws_name: [compute]), - ) - mon_client = SimpleNamespace(metrics=SimpleNamespace(list=_mock_metrics_list)) + def test_details_provisioning_state(self): + assert self._finding().details["provisioning_state"] == "Succeeded" - findings = find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - client=ml_client, - monitor_client=mon_client, - ) + def test_details_created_at_present(self): + assert self._finding().details["created_at"] is not None - assert findings == [] # conservative: can't confirm this cluster is idle + def test_details_idle_window_days(self): + assert self._finding().details["idle_window_days"] == 14 + def test_details_metrics_used(self): + assert self._finding().details["metrics_used"] == ["Active Nodes"] -def test_all_metrics_unavailable_assumes_active(): - 
"""If no metric returns any timeseries at all, cluster is assumed active (conservative). + def test_details_tags(self): + assert self._finding().details["tags"] == {"env": "prod"} - No reliable per-cluster signal -> cannot confirm idle -> skip to avoid false positives. - """ - ws = _make_workspace() - compute = _make_compute(age_days=30) - ml_client, mon_client = _make_clients(ws, [compute], _make_empty_metric_response()) + def test_details_tags_never_none(self): + c = _make_compute() + c.tags = None + findings = _run(computes=[c]) + assert findings[0].details["tags"] == {} - findings = find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - client=ml_client, - monitor_client=mon_client, - ) + def test_details_node_idle_time_present(self): + d = self._finding().details + assert "node_idle_time_before_scale_down" in d - assert findings == [] + def test_details_node_idle_time_is_str_or_none(self): + v = self._finding().details["node_idle_time_before_scale_down"] + assert v is None or isinstance(v, str) + def test_details_vm_priority_is_str_or_none(self): + v = self._finding().details["vm_priority"] + assert v is None or isinstance(v, str) -def test_all_none_maximums_treated_as_unknown_not_idle(): - """Timeseries where every datapoint has maximum=None must not produce a finding. + def test_signals_used_includes_aml_compute_type(self): + signals = " ".join(self._finding().evidence.signals_used) + assert "AmlCompute" in signals - All-None maximums indicate a metric publishing gap or throttled ingestion — - the value was unavailable, not zero. Treating them as idle would be a false positive. 
- """ - ws = _make_workspace() - compute = _make_compute(age_days=30) + def test_signals_used_includes_provisioning_succeeded(self): + signals = " ".join(self._finding().evidence.signals_used) + assert "Succeeded" in signals - # Build a timeseries with data points but all maximum=None - none_data = [SimpleNamespace(maximum=None), SimpleNamespace(maximum=None)] - none_timeseries = SimpleNamespace(data=none_data) - none_metric = SimpleNamespace(timeseries=[none_timeseries]) - all_none_response = SimpleNamespace(value=[none_metric]) + def test_signals_used_includes_allocation_steady(self): + signals = " ".join(self._finding().evidence.signals_used) + assert "Steady" in signals - ml_client, mon_client = _make_clients(ws, [compute], all_none_response) + def test_signals_used_includes_age(self): + signals = " ".join(self._finding().evidence.signals_used) + assert "14" in signals - findings = find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - client=ml_client, - monitor_client=mon_client, - ) + def test_signals_used_includes_min_node_count(self): + signals = " ".join(self._finding().evidence.signals_used) + assert "min_node_count" in signals - assert findings == [] # unknown signal — must not be reported as idle + def test_signals_used_includes_active_nodes_metric(self): + signals = " ".join(self._finding().evidence.signals_used) + assert "Active Nodes" in signals + def test_signals_not_checked_has_blind_spots(self): + snc = self._finding().evidence.signals_not_checked + assert len(snc) >= 3 -# --------------------------------------------------------------------------- -# idle_days clamping -# --------------------------------------------------------------------------- + def test_no_gpu_risk_escalation(self): + # Risk is always MEDIUM regardless of GPU (spec 10/11.1) + c = _make_compute(vm_size="Standard_NC6") + findings = _run(computes=[c]) + assert findings[0].risk.value == "medium" + def test_no_age_confidence_degradation(self): + # Confidence is 
always HIGH when all conditions are met (spec 11.1) + c = _make_compute(age_days=14) + findings = _run(computes=[c]) + assert findings[0].confidence.value == "high" -def test_idle_days_zero_does_not_silently_skip_all(): - """idle_days=0 must be clamped to 1 — not silently suppress all findings.""" - ws = _make_workspace() - compute = _make_compute(age_days=30) - ml_client, mon_client = _make_clients(ws, [compute], _make_metric_response(0.0)) - # Without clamping, idle_days=0 -> effective_window=0 -> effective_window < 3 -> no findings. - # With clamping to 1, the normal detection path runs and the cluster is flagged. - findings = find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - client=ml_client, - monitor_client=mon_client, - idle_days=0, - ) +# =========================================================================== +# Unit: TestSeriesIsClusterScoped +# =========================================================================== - assert len(findings) == 1 +class TestSeriesIsClusterScoped: + def _ts(self, metadata_values): + return SimpleNamespace(metadata_values=metadata_values) -# --------------------------------------------------------------------------- -# Per-workspace error handling -# --------------------------------------------------------------------------- + def _mv(self, dim_name, dim_value): + return SimpleNamespace(name=SimpleNamespace(value=dim_name), value=dim_value) + def test_matching_cluster_name_returns_true(self): + ts = self._ts([self._mv("ClusterName", "my-cluster")]) + assert _series_is_cluster_scoped(ts, "my-cluster") is True -def test_compute_list_auth_error_raises_permission_error(): - """AuthorizationFailed on compute.list() must surface as PermissionError, not be swallowed.""" - ws = _make_workspace() + def test_case_insensitive_dimension_key_lower(self): + ts = self._ts([self._mv("clustername", "my-cluster")]) + assert _series_is_cluster_scoped(ts, "my-cluster") is True - def _compute_list(rg, ws_name): - 
raise Exception("AuthorizationFailed: missing computes/read permission") + def test_case_insensitive_dimension_key_mixed(self): + ts = self._ts([self._mv("clusterName", "my-cluster")]) + assert _series_is_cluster_scoped(ts, "my-cluster") is True - ml_client = SimpleNamespace( - workspaces=SimpleNamespace(list_by_subscription=lambda: [ws]), - machine_learning_compute=SimpleNamespace(list_by_workspace=_compute_list), - ) - mon_client = SimpleNamespace() + def test_exact_value_match_required(self): + # Dimension value matching is case-sensitive + ts = self._ts([self._mv("ClusterName", "My-Cluster")]) + assert _series_is_cluster_scoped(ts, "my-cluster") is False - with pytest.raises(PermissionError) as exc_info: - find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - client=ml_client, - monitor_client=mon_client, + def test_wrong_cluster_name_returns_false(self): + ts = self._ts([self._mv("ClusterName", "other-cluster")]) + assert _series_is_cluster_scoped(ts, "my-cluster") is False + + def test_no_metadata_values_attr_returns_false(self): + ts = SimpleNamespace() # no metadata_values attr + assert _series_is_cluster_scoped(ts, "my-cluster") is False + + def test_empty_metadata_values_returns_false(self): + ts = self._ts([]) + assert _series_is_cluster_scoped(ts, "my-cluster") is False + + def test_none_metadata_values_returns_false(self): + ts = self._ts(None) + assert _series_is_cluster_scoped(ts, "my-cluster") is False + + def test_non_clustername_dimension_ignored(self): + ts = self._ts([self._mv("NodePoolName", "my-cluster")]) + assert _series_is_cluster_scoped(ts, "my-cluster") is False + + def test_multiple_dims_one_matches(self): + ts = self._ts( + [ + self._mv("NodePoolName", "np1"), + self._mv("ClusterName", "my-cluster"), + ] ) + assert _series_is_cluster_scoped(ts, "my-cluster") is True - assert "Microsoft.MachineLearningServices/workspaces/computes/read" in str(exc_info.value) + def 
test_none_entry_in_metadata_list_handled_gracefully(self): + # None as a metadata entry should not crash; subsequent entries still checked + ts = self._ts([None, self._mv("ClusterName", "my-cluster")]) + assert _series_is_cluster_scoped(ts, "my-cluster") is True + def test_dim_name_not_string_skipped(self): + ts = self._ts([SimpleNamespace(name=SimpleNamespace(value=42), value="my-cluster")]) + assert _series_is_cluster_scoped(ts, "my-cluster") is False -def test_compute_list_transient_error_skips_workspace_preserves_findings(): - """Transient error in compute.list() for one workspace must not abort findings from others.""" - ws_good = _make_workspace(name="good-ws", rg="rg-good") - ws_bad = _make_workspace(name="bad-ws", rg="rg-bad") - good_compute = _make_compute(age_days=30, workspace="good-ws", rg="rg-good") + def test_dim_value_not_string_skipped(self): + ts = self._ts([SimpleNamespace(name=SimpleNamespace(value="ClusterName"), value=None)]) + assert _series_is_cluster_scoped(ts, "my-cluster") is False - call_count = 0 + def test_plain_string_dim_name_returns_true(self): + # mv.name is a plain str, not a LocalizableString object + ts = self._ts([SimpleNamespace(name="ClusterName", value="my-cluster")]) + assert _series_is_cluster_scoped(ts, "my-cluster") is True - def _compute_list(rg, ws_name): - nonlocal call_count - call_count += 1 - if ws_name == "bad-ws": - raise RuntimeError("transient SDK timeout") - return [good_compute] + def test_plain_string_dim_name_case_insensitive(self): + ts = self._ts([SimpleNamespace(name="clustername", value="my-cluster")]) + assert _series_is_cluster_scoped(ts, "my-cluster") is True - ml_client = SimpleNamespace( - workspaces=SimpleNamespace(list_by_subscription=lambda: [ws_good, ws_bad]), - machine_learning_compute=SimpleNamespace(list_by_workspace=_compute_list), - ) - mon_client = SimpleNamespace( - metrics=SimpleNamespace(list=lambda *a, **kw: _make_metric_response(0.0)) - ) + def 
test_plain_string_dim_name_wrong_cluster(self): + ts = self._ts([SimpleNamespace(name="ClusterName", value="other-cluster")]) + assert _series_is_cluster_scoped(ts, "my-cluster") is False - findings = find_idle_aml_compute( - subscription_id="sub-123", - credential=None, - client=ml_client, - monitor_client=mon_client, - ) - assert len(findings) == 1 - assert findings[0].details["workspace_name"] == "good-ws" - assert call_count == 2 # both workspaces were attempted +# =========================================================================== +# Unit: TestNormLocation +# =========================================================================== -# --------------------------------------------------------------------------- -# RULE_METADATA -# --------------------------------------------------------------------------- +class TestNormLocation: + def test_uppercase_lowercased(self): + assert _norm_location("EastUS") == "eastus" + + def test_spaces_preserved(self): + assert _norm_location("East US") == "east us" + + def test_hyphens_preserved(self): + assert _norm_location("east-us") == "east-us" + + def test_already_lowercase(self): + assert _norm_location("eastus") == "eastus" + + def test_empty_string(self): + assert _norm_location("") == "" + + def test_none_returns_empty(self): + assert _norm_location(None) == "" + + def test_mixed_case_with_digits(self): + assert _norm_location("UKSouth2") == "uksouth2" + + +# =========================================================================== +# Unit: TestExtractResourceGroup +# =========================================================================== + + +class TestExtractResourceGroup: + def test_valid_arm_id(self): + arm_id = "/subscriptions/sub/resourceGroups/my-rg/providers/ML/workspaces/ws" + assert _extract_resource_group(arm_id) == "my-rg" + + def test_lowercase_resource_groups_key(self): + arm_id = "/subscriptions/sub/resourcegroups/my-rg/providers/ML/workspaces/ws" + assert _extract_resource_group(arm_id) == 
"my-rg" + + def test_mixed_case_resource_groups_key(self): + arm_id = "/subscriptions/sub/ResourceGroups/my-rg/providers/ML" + assert _extract_resource_group(arm_id) == "my-rg" + + def test_no_resource_group_returns_none(self): + assert _extract_resource_group("/subscriptions/sub/providers/ML") is None + + def test_none_returns_none(self): + assert _extract_resource_group(None) is None + + def test_empty_string_returns_none(self): + assert _extract_resource_group("") is None + + +# =========================================================================== +# Unit: TestResolveStrField +# =========================================================================== + + +class TestResolveStrField: + def test_sdk_val_only(self): + obj = SimpleNamespace(snake_f="AmlCompute") + assert _resolve_str_field(obj, "snake_f", "camelF") == "AmlCompute" + + def test_raw_val_only(self): + obj = SimpleNamespace(camelF="AmlCompute") + assert _resolve_str_field(obj, "snake_f", "camelF") == "AmlCompute" + + def test_both_same_returns_value(self): + obj = SimpleNamespace(snake_f="AmlCompute", camelF="AmlCompute") + assert _resolve_str_field(obj, "snake_f", "camelF") == "AmlCompute" + + def test_conflict_returns_none(self): + obj = SimpleNamespace(snake_f="AmlCompute", camelF="ComputeInstance") + assert _resolve_str_field(obj, "snake_f", "camelF") is None + + def test_both_absent_returns_none(self): + obj = SimpleNamespace() + assert _resolve_str_field(obj, "snake_f", "camelF") is None + + def test_non_string_returns_none(self): + obj = SimpleNamespace(snake_f=42) + assert _resolve_str_field(obj, "snake_f", "camelF") is None + + def test_obj_none_returns_none(self): + assert _resolve_str_field(None, "snake_f", "camelF") is None + + +# =========================================================================== +# Unit: TestResolveIntField +# =========================================================================== + + +class TestResolveIntField: + def test_none_obj_returns_none(self): + 
assert _resolve_int_field(None, "foo", "fooBar") is None + + def test_snake_case_value(self): + assert _resolve_int_field(SimpleNamespace(foo=5), "foo", "fooBar") == 5 + + def test_camel_case_fallback(self): + assert _resolve_int_field(SimpleNamespace(fooBar=7), "foo", "fooBar") == 7 + + def test_snake_preferred_over_camel(self): + assert _resolve_int_field(SimpleNamespace(foo=3, fooBar=99), "foo", "fooBar") == 3 + + def test_none_snake_falls_through_to_camel(self): + assert _resolve_int_field(SimpleNamespace(foo=None, fooBar=8), "foo", "fooBar") == 8 + + def test_string_numeric_coerced(self): + assert _resolve_int_field(SimpleNamespace(foo="4"), "foo", "fooBar") == 4 + + def test_non_numeric_string_returns_none(self): + assert _resolve_int_field(SimpleNamespace(foo="bad"), "foo", "fooBar") is None + + def test_both_absent_returns_none(self): + assert _resolve_int_field(SimpleNamespace(), "foo", "fooBar") is None + + def test_zero_returned_as_zero(self): + # _resolve_int_field does not filter by range; caller decides + assert _resolve_int_field(SimpleNamespace(foo=0), "foo", "fooBar") == 0 + + def test_negative_returned_as_negative(self): + # caller enforces range; helper just parses + assert _resolve_int_field(SimpleNamespace(foo=-1), "foo", "fooBar") == -1 + + +# =========================================================================== +# Unit: TestToDetailStr +# =========================================================================== + + +class TestToDetailStr: + def test_none_returns_none(self): + assert _to_detail_str(None) is None + + def test_string_returned_unchanged(self): + assert _to_detail_str("Dedicated") == "Dedicated" + + def test_int_stringified(self): + assert _to_detail_str(3) == "3" + + def test_enum_like_object_uses_str(self): + class FakeEnum: + def __str__(self): + return "LowPriority" + + assert _to_detail_str(FakeEnum()) == "LowPriority" + + def test_result_is_always_str(self): + assert isinstance(_to_detail_str("PT120S"), str) + assert 
isinstance(_to_detail_str(0), str) + + +# =========================================================================== +# Unit: TestResolveComputeType +# =========================================================================== + + +class TestResolveComputeType: + def _make(self, **kwargs): + outer = SimpleNamespace(**kwargs) + return SimpleNamespace(properties=outer) + + def test_sdk_field(self): + c = self._make(compute_type="AmlCompute") + assert _resolve_compute_type(c) == "AmlCompute" + + def test_raw_camel_field(self): + c = self._make(computeType="AmlCompute") + assert _resolve_compute_type(c) == "AmlCompute" + + def test_conflict_returns_none(self): + c = self._make(compute_type="AmlCompute", computeType="ComputeInstance") + assert _resolve_compute_type(c) is None + + def test_properties_none_returns_none(self): + c = SimpleNamespace(properties=None) + assert _resolve_compute_type(c) is None + + def test_properties_absent_returns_none(self): + c = SimpleNamespace() + assert _resolve_compute_type(c) is None + + +# =========================================================================== +# Unit: TestResolveProvisioningState +# =========================================================================== + + +class TestResolveProvisioningState: + def _make(self, **kwargs): + return SimpleNamespace(properties=SimpleNamespace(**kwargs)) + + def test_succeeded(self): + assert ( + _resolve_provisioning_state(self._make(provisioning_state="Succeeded")) == "Succeeded" + ) + + def test_raw_field(self): + assert _resolve_provisioning_state(self._make(provisioningState="Succeeded")) == "Succeeded" + + def test_conflict_returns_none(self): + c = self._make(provisioning_state="Succeeded", provisioningState="Failed") + assert _resolve_provisioning_state(c) is None + + def test_absent_returns_none(self): + assert _resolve_provisioning_state(self._make()) is None + + +# =========================================================================== +# Unit: 
TestResolveAllocationState +# =========================================================================== + + +class TestResolveAllocationState: + def _make(self, **kwargs): + inner = SimpleNamespace(**kwargs) + outer = SimpleNamespace(properties=inner) + return SimpleNamespace(properties=outer) + + def test_steady(self): + assert _resolve_allocation_state(self._make(allocation_state="Steady")) == "Steady" + + def test_raw_field(self): + assert _resolve_allocation_state(self._make(allocationState="Steady")) == "Steady" + + def test_conflict_returns_none(self): + c = self._make(allocation_state="Steady", allocationState="Resizing") + assert _resolve_allocation_state(c) is None + + def test_inner_none_returns_none(self): + outer = SimpleNamespace(properties=None) + c = SimpleNamespace(properties=outer) + assert _resolve_allocation_state(c) is None + + def test_outer_none_returns_none(self): + c = SimpleNamespace(properties=None) + assert _resolve_allocation_state(c) is None + + +# =========================================================================== +# Unit: TestResolveCreatedAt +# =========================================================================== + + +class TestResolveCreatedAt: + def _make(self, created_on): + return SimpleNamespace(properties=SimpleNamespace(created_on=created_on)) + + def test_datetime_with_tz(self): + ts = datetime(2025, 1, 1, tzinfo=timezone.utc) + c = self._make(ts) + result = _resolve_created_at(c) + assert result == ts + + def test_datetime_without_tz_assumes_utc(self): + ts = datetime(2025, 1, 1) + c = self._make(ts) + result = _resolve_created_at(c) + assert result.tzinfo is not None + + def test_iso_string(self): + c = self._make("2025-06-01T12:00:00") + result = _resolve_created_at(c) + assert result is not None + assert result.year == 2025 + + def test_z_suffixed_string(self): + c = self._make("2025-06-01T12:00:00Z") + result = _resolve_created_at(c) + assert result is not None + + def 
test_invalid_string_returns_none(self): + c = self._make("not-a-date") + assert _resolve_created_at(c) is None + + def test_none_returns_none(self): + c = self._make(None) + assert _resolve_created_at(c) is None + + def test_future_returns_none(self): + future = datetime.now(timezone.utc) + timedelta(days=1) + c = self._make(future) + assert _resolve_created_at(c) is None + + def test_integer_returns_none(self): + c = self._make(12345) + assert _resolve_created_at(c) is None + + def test_properties_none_returns_none(self): + c = SimpleNamespace(properties=None) + assert _resolve_created_at(c) is None + + def test_camel_case_field_accepted(self): + now = datetime.now(timezone.utc) + ts = now - timedelta(days=30) + outer = SimpleNamespace(createdOn=ts) + c = SimpleNamespace(properties=outer) + assert _resolve_created_at(c) == ts + + +# =========================================================================== +# Unit: TestResolveMinNodeCount +# =========================================================================== + + +class TestResolveMinNodeCount: + def _make(self, min_node_count, scale_attr="min_node_count"): + scale = SimpleNamespace(**{scale_attr: min_node_count}) + inner = SimpleNamespace(scale_settings=scale) + outer = SimpleNamespace(properties=inner) + return SimpleNamespace(properties=outer) + + def test_positive_int(self): + assert _resolve_min_node_count(self._make(3)) == 3 + + def test_one(self): + assert _resolve_min_node_count(self._make(1)) == 1 + + def test_zero_returns_none(self): + assert _resolve_min_node_count(self._make(0)) is None + + def test_negative_returns_none(self): + assert _resolve_min_node_count(self._make(-1)) is None + + def test_none_returns_none(self): + assert _resolve_min_node_count(self._make(None)) is None + + def test_string_numeric_coerced(self): + assert _resolve_min_node_count(self._make("2")) == 2 + + def test_scale_settings_none_returns_none(self): + inner = SimpleNamespace(scale_settings=None) + outer = 
SimpleNamespace(properties=inner) + c = SimpleNamespace(properties=outer) + assert _resolve_min_node_count(c) is None + + def test_inner_none_returns_none(self): + outer = SimpleNamespace(properties=None) + c = SimpleNamespace(properties=outer) + assert _resolve_min_node_count(c) is None + + def test_raw_camel_case_scale_settings_accepted(self): + # scaleSettings (raw) accepted when scale_settings absent + scale = SimpleNamespace(min_node_count=4) + inner = SimpleNamespace(scaleSettings=scale) + outer = SimpleNamespace(properties=inner) + c = SimpleNamespace(properties=outer) + assert _resolve_min_node_count(c) == 4 + + def test_raw_min_node_count_camel_case_accepted(self): + # minNodeCount (raw) accepted when min_node_count absent on scale_settings + assert _resolve_min_node_count(self._make(5, scale_attr="minNodeCount")) == 5 + + def test_raw_min_node_count_zero_returns_none(self): + assert _resolve_min_node_count(self._make(0, scale_attr="minNodeCount")) is None + + def test_snake_case_preferred_over_camel(self): + # snake_case is tried first; camelCase only as fallback + scale = SimpleNamespace(min_node_count=2, minNodeCount=99) + inner = SimpleNamespace(scale_settings=scale) + outer = SimpleNamespace(properties=inner) + c = SimpleNamespace(properties=outer) + assert _resolve_min_node_count(c) == 2 + + +# =========================================================================== +# Unit: TestResolveCurrentNodeCount +# =========================================================================== + + +class TestResolveCurrentNodeCount: + def _make(self, current, attr="current_node_count"): + inner = SimpleNamespace(**{attr: current}) + outer = SimpleNamespace(properties=inner) + return SimpleNamespace(properties=outer) + + def test_positive(self): + assert _resolve_current_node_count(self._make(5)) == 5 + + def test_zero_allowed(self): + assert _resolve_current_node_count(self._make(0)) == 0 + + def test_negative_returns_none(self): + assert 
_resolve_current_node_count(self._make(-1)) is None + + def test_none_returns_none(self): + assert _resolve_current_node_count(self._make(None)) is None + + def test_string_numeric_coerced(self): + assert _resolve_current_node_count(self._make("3")) == 3 + + def test_inner_none_returns_none(self): + outer = SimpleNamespace(properties=None) + c = SimpleNamespace(properties=outer) + assert _resolve_current_node_count(c) is None + + def test_raw_camel_case_accepted(self): + assert _resolve_current_node_count(self._make(7, attr="currentNodeCount")) == 7 + + def test_raw_camel_case_negative_still_none(self): + assert _resolve_current_node_count(self._make(-2, attr="currentNodeCount")) is None + + def test_snake_case_preferred_over_camel(self): + inner = SimpleNamespace(current_node_count=3, currentNodeCount=99) + outer = SimpleNamespace(properties=inner) + c = SimpleNamespace(properties=outer) + assert _resolve_current_node_count(c) == 3 + + +# =========================================================================== +# Unit: TestEvaluateMetric +# =========================================================================== + + +def _monitor_client(response_fn): + return SimpleNamespace(metrics=SimpleNamespace(list=response_fn)) + + +def _window(): + now = datetime.now(timezone.utc) + return now - timedelta(days=14), now + + +class TestEvaluateMetric: + _NAME = "test-cluster" + + def _zero_response(self): + return _metric_response_zero(compute_name=self._NAME) + + def _active_response(self): + return _metric_response_active(compute_name=self._NAME) + + def _ts_with_metadata(self, datapoints): + """Build a timeseries with correct ClusterName metadata for self._NAME.""" + return SimpleNamespace(data=datapoints, metadata_values=_make_cluster_metadata(self._NAME)) + + def test_zero_returns_zero(self): + ws, we = _window() + mc = _monitor_client(lambda *a, **kw: self._zero_response()) + assert _evaluate_metric(mc, "/ws/id", self._NAME, ws, we) == _MetricResult.ZERO + + def 
test_active_returns_active(self): + ws, we = _window() + mc = _monitor_client(lambda *a, **kw: self._active_response()) + assert _evaluate_metric(mc, "/ws/id", self._NAME, ws, we) == _MetricResult.ACTIVE + + def test_empty_value_list_returns_unknown(self): + ws, we = _window() + mc = _monitor_client(lambda *a, **kw: SimpleNamespace(value=[])) + assert _evaluate_metric(mc, "/ws/id", self._NAME, ws, we) == _MetricResult.UNKNOWN + + def test_no_value_attr_returns_unknown(self): + ws, we = _window() + mc = _monitor_client(lambda *a, **kw: SimpleNamespace()) + assert _evaluate_metric(mc, "/ws/id", self._NAME, ws, we) == _MetricResult.UNKNOWN + + def test_value_none_returns_unknown(self): + ws, we = _window() + mc = _monitor_client(lambda *a, **kw: SimpleNamespace(value=None)) + assert _evaluate_metric(mc, "/ws/id", self._NAME, ws, we) == _MetricResult.UNKNOWN + + def test_monitor_raises_returns_unknown(self): + ws, we = _window() + + def _raise(*a, **kw): + raise RuntimeError("monitor error") + + mc = _monitor_client(_raise) + assert _evaluate_metric(mc, "/ws/id", self._NAME, ws, we) == _MetricResult.UNKNOWN + + def test_series_without_metadata_skipped_returns_unknown(self): + # Timeseries with no metadata_values -> not cluster-scoped -> UNKNOWN + ws, we = _window() + now = datetime.now(timezone.utc) + dp = SimpleNamespace(timestamp=now - timedelta(days=1), maximum=0.0) + ts_obj = SimpleNamespace(data=[dp]) # no metadata_values + mc = _monitor_client( + lambda *a, **kw: SimpleNamespace(value=[SimpleNamespace(timeseries=[ts_obj])]) + ) + assert _evaluate_metric(mc, "/ws/id", self._NAME, ws, we) == _MetricResult.UNKNOWN + + def test_series_with_wrong_cluster_name_returns_unknown(self): + # Timeseries scoped to a different cluster -> UNKNOWN + ws, we = _window() + now = datetime.now(timezone.utc) + dp = SimpleNamespace(timestamp=now - timedelta(days=1), maximum=0.0) + ts_obj = SimpleNamespace(data=[dp], metadata_values=_make_cluster_metadata("wrong-cluster")) + mc = 
_monitor_client( + lambda *a, **kw: SimpleNamespace(value=[SimpleNamespace(timeseries=[ts_obj])]) + ) + assert _evaluate_metric(mc, "/ws/id", self._NAME, ws, we) == _MetricResult.UNKNOWN + + def test_non_numeric_maximum_returns_unknown(self): + ws, we = _window() + now = datetime.now(timezone.utc) + dp = SimpleNamespace(timestamp=now - timedelta(days=1), maximum="not-a-number") + ts_obj = self._ts_with_metadata([dp]) + mc = _monitor_client( + lambda *a, **kw: SimpleNamespace(value=[SimpleNamespace(timeseries=[ts_obj])]) + ) + assert _evaluate_metric(mc, "/ws/id", self._NAME, ws, we) == _MetricResult.UNKNOWN + + def test_non_iterable_timeseries_returns_unknown(self): + # timeseries is a string: characters are not cluster-scoped -> UNKNOWN + ws, we = _window() + metric_obj = SimpleNamespace(timeseries="malformed") + mc = _monitor_client(lambda *a, **kw: SimpleNamespace(value=[metric_obj])) + assert _evaluate_metric(mc, "/ws/id", self._NAME, ws, we) == _MetricResult.UNKNOWN + + def test_none_maximum_reduces_coverage_toward_unknown(self): + # Series is cluster-scoped but only datapoint has max=None -> no usable buckets -> UNKNOWN + ws, we = _window() + now = datetime.now(timezone.utc) + dp = SimpleNamespace(timestamp=now - timedelta(days=1), maximum=None) + ts_obj = self._ts_with_metadata([dp]) + mc = _monitor_client( + lambda *a, **kw: SimpleNamespace(value=[SimpleNamespace(timeseries=[ts_obj])]) + ) + assert _evaluate_metric(mc, "/ws/id", self._NAME, ws, we) == _MetricResult.UNKNOWN + + def test_datapoints_outside_window_filtered(self): + # All datapoints outside the window -> no observed buckets -> UNKNOWN + ws, we = _window() + now = datetime.now(timezone.utc) + dp = SimpleNamespace(timestamp=now - timedelta(days=30), maximum=0.0) + ts_obj = self._ts_with_metadata([dp]) + mc = _monitor_client( + lambda *a, **kw: SimpleNamespace(value=[SimpleNamespace(timeseries=[ts_obj])]) + ) + assert _evaluate_metric(mc, "/ws/id", self._NAME, ws, we) == _MetricResult.UNKNOWN + + 
def test_none_timestamp_returns_unknown(self): + ws, we = _window() + dp = SimpleNamespace(timestamp=None, maximum=0.0) + ts_obj = self._ts_with_metadata([dp]) + mc = _monitor_client( + lambda *a, **kw: SimpleNamespace(value=[SimpleNamespace(timeseries=[ts_obj])]) + ) + assert _evaluate_metric(mc, "/ws/id", self._NAME, ws, we) == _MetricResult.UNKNOWN + + def test_non_datetime_timestamp_returns_unknown(self): + ws, we = _window() + dp = SimpleNamespace(timestamp="2025-01-01", maximum=0.0) + ts_obj = self._ts_with_metadata([dp]) + mc = _monitor_client( + lambda *a, **kw: SimpleNamespace(value=[SimpleNamespace(timeseries=[ts_obj])]) + ) + assert _evaluate_metric(mc, "/ws/id", self._NAME, ws, we) == _MetricResult.UNKNOWN + + def test_coverage_below_threshold_returns_unknown(self): + # 1 bucket out of ~15 expected -> < 0.95 -> UNKNOWN + ws, we = _window() + now = datetime.now(timezone.utc) + dp = SimpleNamespace(timestamp=now - timedelta(days=1), maximum=0.0) + ts_obj = self._ts_with_metadata([dp]) + mc = _monitor_client( + lambda *a, **kw: SimpleNamespace(value=[SimpleNamespace(timeseries=[ts_obj])]) + ) + assert _evaluate_metric(mc, "/ws/id", self._NAME, ws, we) == _MetricResult.UNKNOWN + + def test_no_interval_kwarg_sent(self): + ws, we = _window() + captured = {} + + def _capture(*a, **kw): + captured.update(kw) + return self._zero_response() + + mc = _monitor_client(_capture) + _evaluate_metric(mc, "/ws/id", self._NAME, ws, we) + assert "interval" not in captured + + def test_filter_kwarg_uses_cluster_name(self): + ws, we = _window() + captured = {} + + def _capture(*a, **kw): + captured.update(kw) + return self._zero_response() + + mc = _monitor_client(_capture) + _evaluate_metric(mc, "/ws/id", self._NAME, ws, we) + assert "ClusterName" in captured.get("filter", "") + assert self._NAME in captured.get("filter", "") + + def test_workspace_id_passed_as_positional(self): + ws, we = _window() + captured_args = [] + + def _capture(*a, **kw): + captured_args.extend(a) 
+ return self._zero_response() + + mc = _monitor_client(_capture) + _evaluate_metric(mc, "/workspaces/my-ws", self._NAME, ws, we) + assert "/workspaces/my-ws" in captured_args + + def test_today_bucket_gap_does_not_cause_unknown(self): + # Azure Monitor may not emit today's datapoint yet. The expected-bucket + # formula excludes the current incomplete UTC day, so 14 complete + # past days of zero data (none from today) must evaluate to ZERO, not UNKNOWN. + now = datetime.now(timezone.utc) + window_start = now - timedelta(days=14) + window_end = now + first_bucket = window_start.replace(hour=0, minute=0, second=0, microsecond=0) + midnight_today = now.replace(hour=0, minute=0, second=0, microsecond=0) + + # One zero datapoint per complete past day; deliberately none from today. + dps = [] + for i in range(14): + noon = first_bucket + timedelta(days=i, hours=12) + ts_dp = max(noon, window_start + timedelta(seconds=1)) + if ts_dp < midnight_today: + dps.append(SimpleNamespace(timestamp=ts_dp, maximum=0.0)) + + ts_obj = self._ts_with_metadata(dps) + mc = _monitor_client( + lambda *a, **kw: SimpleNamespace(value=[SimpleNamespace(timeseries=[ts_obj])]) + ) + result = _evaluate_metric(mc, "/ws/id", self._NAME, window_start, window_end) + assert result == _MetricResult.ZERO + + def test_today_partial_data_does_not_mask_missing_past_day(self): + # Today's partial-day bucket must NOT be counted toward observed coverage. + # If it were, a today bucket + 13 past days could satisfy 14/14 coverage + # even though one full past day is missing, producing a false ZERO emit. + # With the fix both sides cap at last_complete_midnight: observed = 13, + # expected = 14, coverage = 0.928 < 0.95 -> UNKNOWN. 
+ now = datetime.now(timezone.utc) + window_start = now - timedelta(days=14) + window_end = now + first_bucket = window_start.replace(hour=0, minute=0, second=0, microsecond=0) + midnight_today = now.replace(hour=0, minute=0, second=0, microsecond=0) + + # 13 complete past days (skip day 7 to create a gap) + one today datapoint + dps = [] + for i in range(14): + if i == 7: + continue # intentional gap in a complete past day + noon = first_bucket + timedelta(days=i, hours=12) + ts_dp = max(noon, window_start + timedelta(seconds=1)) + if ts_dp < midnight_today: + dps.append(SimpleNamespace(timestamp=ts_dp, maximum=0.0)) + # Today's datapoint — should be filtered from observed, not mask the gap + today_dp = midnight_today + timedelta(hours=1) + if today_dp < window_end: + dps.append(SimpleNamespace(timestamp=today_dp, maximum=0.0)) + + ts_obj = self._ts_with_metadata(dps) + mc = _monitor_client( + lambda *a, **kw: SimpleNamespace(value=[SimpleNamespace(timeseries=[ts_obj])]) + ) + result = _evaluate_metric(mc, "/ws/id", self._NAME, window_start, window_end) + assert result == _MetricResult.UNKNOWN + + +# =========================================================================== +# Unit: TestRuleMetadata +# =========================================================================== + + +class TestRuleMetadata: + def test_id(self): + assert RULE_METADATA["id"] == "azure.aml.compute.idle" + def test_category(self): + assert RULE_METADATA["category"] == "ai" -def test_rule_metadata_present(): - """Rule must expose RULE_METADATA with correct fields.""" - from cleancloud.providers.azure.rules.ai.aml_compute_idle import RULE_METADATA + def test_service(self): + assert RULE_METADATA["service"] == "machinelearning" - assert RULE_METADATA["id"] == "azure.aml.compute.idle" - assert RULE_METADATA["category"] == "ai" - assert RULE_METADATA["service"] == "machinelearning" - assert RULE_METADATA["cost_impact"] == "high" + def test_cost_impact(self): + assert 
RULE_METADATA["cost_impact"] == "high" diff --git a/tests/cleancloud/providers/azure/test_azure_aml_compute_instance_idle.py b/tests/cleancloud/providers/azure/test_azure_aml_compute_instance_idle.py index a15d313..7f70dc0 100644 --- a/tests/cleancloud/providers/azure/test_azure_aml_compute_instance_idle.py +++ b/tests/cleancloud/providers/azure/test_azure_aml_compute_instance_idle.py @@ -1,3 +1,5 @@ +"""Tests for azure.ml.compute_instance.idle rule (hardened per spec).""" + from datetime import datetime, timedelta, timezone from types import SimpleNamespace @@ -5,37 +7,59 @@ from cleancloud.providers.azure.rules.ai.aml_compute_instance_idle import ( RULE_METADATA, + _extract_resource_group, + _extract_subscription_id, + _is_gpu, + _norm_location, + _parse_utc_timestamp, + _resolve_compute_type, + _resolve_created_at, + _resolve_location, + _resolve_modified_at, + _resolve_provisioning_state, + _resolve_state, + _resolve_str_field, find_idle_aml_compute_instances, ) # --------------------------------------------------------------------------- -# Helpers +# Test helpers # --------------------------------------------------------------------------- -def _make_workspace(name="test-workspace", location="eastus", rg="rg-ml"): +def _make_workspace(name="test-workspace", rg="rg-ml"): ws_id = ( f"/subscriptions/sub-123/resourceGroups/{rg}" f"/providers/Microsoft.MachineLearningServices/workspaces/{name}" ) - return SimpleNamespace(id=ws_id, name=name, location=location) + return SimpleNamespace(id=ws_id, name=name) def _make_instance( name="dev-instance", vm_size="Standard_DS3_v2", state="Running", - age_days=30, - idle_since_days=None, - op_name="Start", + compute_type="ComputeInstance", + provisioning_state="Succeeded", + location="eastus", + age_days=31, + last_op_time_days=20, # days ago for lastOperation.operationTime; None = absent + last_op_name="Start", + last_op_status="Succeeded", + modified_days=None, # days ago for modifiedOn on compute.properties; None = absent 
workspace="test-workspace", rg="rg-ml", - system_data_modified_days=None, + tags=None, ): """Build a mock ComputeResource for a ComputeInstance. - idle_since_days controls last_operation.operation_time (defaults to age_days). - system_data_modified_days, if set, overrides system_data.last_modified_at. + last_op_time_days: days ago for lastOperation.operationTime (None = field absent). + modified_days: days ago for compute.properties.modified_on (None = field absent). + + IMPORTANT: last_op_time_days must be strictly less than age_days so that + op_time > created_on. When they are equal, spec 9.4.7 skips the instance + (operationTime == created_at → no proven post-create signal). The defaults + (age_days=31, last_op_time_days=20) satisfy this invariant. """ compute_id = ( f"/subscriptions/sub-123/resourceGroups/{rg}" @@ -43,101 +67,242 @@ def _make_instance( ) now = datetime.now(timezone.utc) - if idle_since_days is None: - idle_since_days = age_days - - op_time = now - timedelta(days=idle_since_days) if idle_since_days is not None else None - last_op = SimpleNamespace(operation_time=op_time, operation_name=op_name) + # Build lastOperation + if last_op_time_days is not None: + op_time = now - timedelta(days=last_op_time_days) + last_op = SimpleNamespace( + operation_time=op_time, + operation_name=last_op_name, + operation_status=last_op_status, + ) + else: + last_op = None + # ComputeInstanceProperties (inner) ci_props = SimpleNamespace( vm_size=vm_size, state=state, last_operation=last_op, ) + + # Compute.properties (outer) — created_on and modifiedOn live here + created_on = (now - timedelta(days=age_days)) if age_days is not None else None + modified_on = (now - timedelta(days=modified_days)) if modified_days is not None else None + compute_obj = SimpleNamespace( - compute_type="ComputeInstance", + compute_type=compute_type, + provisioning_state=provisioning_state, + created_on=created_on, + modified_on=modified_on, properties=ci_props, - created_on=(now - 
timedelta(days=age_days)) if age_days is not None else None, ) - # system_data fallback - if system_data_modified_days is not None: - system_data = SimpleNamespace( - last_modified_at=now - timedelta(days=system_data_modified_days) - ) - else: - system_data = None - return SimpleNamespace( id=compute_id, name=name, + location=location, + tags=tags or {}, properties=compute_obj, - system_data=system_data, ) def _make_client(workspace, instances): return SimpleNamespace( workspaces=SimpleNamespace(list_by_subscription=lambda: [workspace]), - machine_learning_compute=SimpleNamespace(list_by_workspace=lambda rg, ws_name: instances), + machine_learning_compute=SimpleNamespace(list_by_workspace=lambda rg, ws: instances), ) # --------------------------------------------------------------------------- -# Core detection +# Unit tests for normalization helpers # --------------------------------------------------------------------------- -def test_idle_cpu_instance_detected(): - """Running CPU instance with no recent activity should be flagged as MEDIUM risk.""" - ws = _make_workspace() - instance = _make_instance(vm_size="Standard_DS3_v2", age_days=30) - ml_client = _make_client(ws, [instance]) +def test_norm_location_lowercase_preserves_spaces_and_hyphens(): + """spec 7: lowercase only — spaces and hyphens are preserved.""" + assert _norm_location("East US") == "east us" + assert _norm_location("west-europe") == "west-europe" + assert _norm_location("EastUS") == "eastus" + assert _norm_location("") == "" - findings = find_idle_aml_compute_instances( - subscription_id="sub-123", - credential=None, - client=ml_client, + +def test_resolve_str_field_sdk_wins(): + obj = SimpleNamespace(compute_type="ComputeInstance", computeType=None) + assert _resolve_str_field(obj, "compute_type", "computeType") == "ComputeInstance" + + +def test_resolve_str_field_raw_fallback(): + obj = SimpleNamespace(compute_type=None, computeType="ComputeInstance") + assert _resolve_str_field(obj, 
"compute_type", "computeType") == "ComputeInstance" + + +def test_resolve_str_field_conflict_returns_none(): + obj = SimpleNamespace(compute_type="ComputeInstance", computeType="AmlCompute") + assert _resolve_str_field(obj, "compute_type", "computeType") is None + + +def test_resolve_str_field_both_absent_returns_none(): + obj = SimpleNamespace(compute_type=None, computeType=None) + assert _resolve_str_field(obj, "compute_type", "computeType") is None + + +def test_extract_resource_group_happy_path(): + rid = "/subscriptions/sub/resourceGroups/my-rg/providers/foo/bar" + assert _extract_resource_group(rid) == "my-rg" + + +def test_extract_resource_group_none(): + assert _extract_resource_group(None) is None + assert _extract_resource_group("") is None + + +def test_extract_subscription_id_happy_path(): + rid = "/subscriptions/abc-123/resourceGroups/rg/providers/foo" + assert _extract_subscription_id(rid) == "abc-123" + + +def test_parse_utc_timestamp_datetime_with_tz(): + dt = datetime(2024, 1, 1, 12, 0, tzinfo=timezone.utc) + assert _parse_utc_timestamp(dt) == dt + + +def test_parse_utc_timestamp_naive_datetime_becomes_utc(): + dt = datetime(2024, 1, 1, 12, 0) + result = _parse_utc_timestamp(dt) + assert result.tzinfo is not None + + +def test_parse_utc_timestamp_aware_non_utc_converted_to_utc(): + """spec 9.4: aware non-UTC datetimes must be converted to UTC, not returned unchanged.""" + eastern = timezone(timedelta(hours=-5)) + aware_eastern = datetime(2024, 6, 1, 12, 0, 0, tzinfo=eastern) + result = _parse_utc_timestamp(aware_eastern) + assert result.tzinfo == timezone.utc + assert result.hour == 17 # 12:00 -05:00 -> 17:00 UTC + + +def test_parse_utc_timestamp_utc_unchanged(): + """UTC-aware datetimes stay UTC.""" + dt = datetime(2024, 6, 1, 12, 0, 0, tzinfo=timezone.utc) + result = _parse_utc_timestamp(dt) + assert result == dt + assert result.tzinfo == timezone.utc + + +def test_parse_utc_timestamp_invalid_string_returns_none(): + assert 
_parse_utc_timestamp("not-a-date") is None + + +def test_parse_utc_timestamp_none_returns_none(): + assert _parse_utc_timestamp(None) is None + + +def test_is_gpu_exact_case_sensitive_prefixes(): + """spec 7: exact case-sensitive prefix matching.""" + assert _is_gpu("Standard_NC6s_v3") is True + assert _is_gpu("Standard_ND6s") is True + assert _is_gpu("Standard_NV6") is True + # Case sensitivity: lowercase prefix is NOT GPU + assert _is_gpu("standard_nc6s_v3") is False + assert _is_gpu("STANDARD_NC6s_v3") is False + # Non-GPU families + assert _is_gpu("Standard_DS3_v2") is False + assert _is_gpu("Standard_D4s_v3") is False + assert _is_gpu(None) is False + assert _is_gpu("") is False + + +def test_resolve_location_top_level_wins(): + compute = SimpleNamespace( + location="eastus", + properties=SimpleNamespace(compute_location=None, computeLocation=None), ) + assert _resolve_location(compute) == "eastus" + + +def test_resolve_location_normalised_to_lowercase(): + compute = SimpleNamespace(location="East US", properties=None) + assert _resolve_location(compute) == "east us" + + +def test_resolve_location_conflict_returns_none(): + compute = SimpleNamespace( + location="eastus", + properties=SimpleNamespace(compute_location="westus", computeLocation=None), + ) + assert _resolve_location(compute) is None - assert len(findings) == 1 - f = findings[0] - assert f.rule_id == "azure.ml.compute_instance.idle" - assert f.resource_type == "azure.ml.compute_instance" - assert f.provider == "azure" - assert f.risk.value == "medium" - assert f.confidence.value == "high" - assert f.details["is_gpu"] is False - assert f.details["vm_size"] == "Standard_DS3_v2" - assert f.details["state"] == "Running" - assert f.estimated_monthly_cost_usd == 260.0 +def test_resolve_location_all_absent_returns_none(): + compute = SimpleNamespace(location=None, properties=None) + assert _resolve_location(compute) is None -def test_idle_gpu_instance_detected_high_risk(): - """GPU instance at exactly 
idle_days threshold (idle_ratio=1.0) -> HIGH risk.""" + +def test_resolve_compute_type_from_sdk(): + compute = _make_instance() + assert _resolve_compute_type(compute) == "ComputeInstance" + + +def test_resolve_provisioning_state_from_sdk(): + compute = _make_instance() + assert _resolve_provisioning_state(compute) == "Succeeded" + + +def test_resolve_state_from_sdk(): + compute = _make_instance() + assert _resolve_state(compute) == "Running" + + +def test_resolve_state_strips_surrounding_whitespace(): + """spec 7: state is normalized by surrounding-whitespace trimming.""" + compute = _make_instance() + compute.properties.properties.state = " Running " + assert _resolve_state(compute) == "Running" + + +def test_padded_running_state_emits(): + """spec 7: whitespace-padded 'Running' normalizes to 'Running' -> emits.""" ws = _make_workspace() - # age_days=14, idle_since_days=14 -> idle_ratio=1.0 -> HIGH (not CRITICAL) - instance = _make_instance(vm_size="Standard_NC6s_v3", age_days=14, idle_since_days=14) + instance = _make_instance(age_days=31, last_op_time_days=20) + instance.properties.properties.state = " Running " ml_client = _make_client(ws, [instance]) findings = find_idle_aml_compute_instances( - subscription_id="sub-123", - credential=None, - client=ml_client, + subscription_id="sub-123", credential=None, client=ml_client ) assert len(findings) == 1 - f = findings[0] - assert f.risk.value == "high" - assert f.details["is_gpu"] is True - assert f.estimated_monthly_cost_usd == 2203.0 + assert findings[0].details["state"] == "Running" -def test_idle_gpu_instance_critical_when_very_stale(): - """GPU instance idle ≥ 2× threshold -> CRITICAL risk.""" +def test_resolve_created_at_returns_utc(): + compute = _make_instance(age_days=31) + ts = _resolve_created_at(compute) + assert ts is not None + assert ts.tzinfo is not None + + +def test_resolve_modified_at_present(): + compute = _make_instance(modified_days=20) + ts = _resolve_modified_at(compute) + assert ts is not 
None + assert ts.tzinfo is not None + + +def test_resolve_modified_at_absent(): + compute = _make_instance(modified_days=None) + assert _resolve_modified_at(compute) is None + + +# --------------------------------------------------------------------------- +# Core detection +# --------------------------------------------------------------------------- + + +def test_idle_cpu_instance_detected(): + """Running CPU instance with stale last_operation -> MEDIUM risk, MEDIUM confidence.""" ws = _make_workspace() - # age_days=30, idle_since_days=30, idle_days=14 -> idle_ratio≈2.14 -> CRITICAL - instance = _make_instance(vm_size="Standard_NC12s_v3", age_days=30, idle_since_days=30) + instance = _make_instance(vm_size="Standard_DS3_v2", age_days=31, last_op_time_days=20) ml_client = _make_client(ws, [instance]) findings = find_idle_aml_compute_instances( @@ -147,15 +312,23 @@ def test_idle_gpu_instance_critical_when_very_stale(): ) assert len(findings) == 1 - assert findings[0].risk.value == "critical" - assert findings[0].details["idle_ratio"] >= 2.0 + f = findings[0] + assert f.rule_id == "azure.ml.compute_instance.idle" + assert f.resource_type == "azure.ml.compute_instance" + assert f.provider == "azure" + assert f.risk.value == "medium" + assert f.confidence.value == "medium" + assert f.estimated_monthly_cost_usd is None # spec 10: always None + assert f.details["vm_size"] == "Standard_DS3_v2" + assert f.details["state"] == "Running" + assert f.details["idle_signal_source"] == "last_operation" -def test_critical_boundary_exactly_at_2x(): - """idle_ratio == 2.0 exactly -> CRITICAL.""" +def test_idle_gpu_instance_high_risk(): + """GPU instance -> HIGH risk, MEDIUM confidence (last_operation signal).""" ws = _make_workspace() - # idle_days=14, idle_since_days=28 -> idle_ratio=2.0 - instance = _make_instance(vm_size="Standard_NC6s_v3", age_days=28, idle_since_days=28) + # age_days=15, last_op_time_days=14: op_time is 1 day after created_on + instance = 
_make_instance(vm_size="Standard_NC6s_v3", age_days=15, last_op_time_days=14) ml_client = _make_client(ws, [instance]) findings = find_idle_aml_compute_instances( @@ -164,14 +337,17 @@ def test_critical_boundary_exactly_at_2x(): client=ml_client, ) - assert findings[0].risk.value == "critical" - assert findings[0].details["idle_ratio"] == 2.0 + assert len(findings) == 1 + f = findings[0] + assert f.risk.value == "high" + assert f.confidence.value == "medium" + assert f.estimated_monthly_cost_usd is None -def test_just_below_critical_is_high(): - """GPU instance with idle_ratio < 2.0 -> HIGH, not CRITICAL.""" +def test_gpu_risk_never_exceeds_high(): + """spec 9.5: no CRITICAL level — GPU is always HIGH regardless of idle duration.""" ws = _make_workspace() - instance = _make_instance(vm_size="Standard_NC6s_v3", age_days=14, idle_since_days=14) + instance = _make_instance(vm_size="Standard_NC12s_v3", age_days=61, last_op_time_days=60) ml_client = _make_client(ws, [instance]) findings = find_idle_aml_compute_instances( @@ -180,14 +356,14 @@ def test_just_below_critical_is_high(): client=ml_client, ) + assert len(findings) == 1 assert findings[0].risk.value == "high" - assert findings[0].details["idle_ratio"] == 1.0 -def test_cpu_instance_never_reaches_critical(): - """CPU instances are capped at MEDIUM regardless of idle_ratio.""" +def test_cpu_instance_always_medium_risk(): + """CPU instances are always MEDIUM risk.""" ws = _make_workspace() - instance = _make_instance(vm_size="Standard_D8s_v3", age_days=60, idle_since_days=60) + instance = _make_instance(vm_size="Standard_D8s_v3", age_days=61, last_op_time_days=60) ml_client = _make_client(ws, [instance]) findings = find_idle_aml_compute_instances( @@ -212,42 +388,71 @@ def test_no_instances_returns_empty(): # --------------------------------------------------------------------------- -# State filtering — only Running incurs charges +# spec 10: estimated_monthly_cost_usd is always None # 
--------------------------------------------------------------------------- -def test_stopped_instance_skipped(): - """Stopped instances do not incur charges — must not be flagged.""" +@pytest.mark.parametrize( + "vm_size", + [ + "Standard_DS3_v2", + "Standard_NC6s_v3", + "Standard_ND40rs_v2", + "Standard_NV12", + "Standard_FUTURE_99xlarge", + ], +) +def test_estimated_monthly_cost_is_always_none(vm_size): + """spec 10: cost must always be None — no hardcoded price tables.""" ws = _make_workspace() - instance = _make_instance(vm_size="Standard_DS3_v2", state="Stopped", age_days=30) + instance = _make_instance(vm_size=vm_size, age_days=31, last_op_time_days=20) ml_client = _make_client(ws, [instance]) findings = find_idle_aml_compute_instances( subscription_id="sub-123", credential=None, client=ml_client ) - assert len(findings) == 0 + assert len(findings) == 1 + assert findings[0].estimated_monthly_cost_usd is None + + +# --------------------------------------------------------------------------- +# State and type filtering (spec 8.5, 8.6, 8.7) +# --------------------------------------------------------------------------- + + +def test_stopped_instance_skipped(): + """Stopped instances are out of scope for this rule.""" + ws = _make_workspace() + instance = _make_instance(state="Stopped", age_days=31, last_op_time_days=20) + ml_client = _make_client(ws, [instance]) + + assert ( + find_idle_aml_compute_instances( + subscription_id="sub-123", credential=None, client=ml_client + ) + == [] + ) @pytest.mark.parametrize("state", ["Creating", "Deleting", "Starting", "Stopping", "Unknown"]) def test_non_running_states_skipped(state): - """Only Running state should be flagged.""" ws = _make_workspace() - instance = _make_instance(state=state, age_days=30) + instance = _make_instance(state=state, age_days=31, last_op_time_days=20) ml_client = _make_client(ws, [instance]) - findings = find_idle_aml_compute_instances( - subscription_id="sub-123", credential=None, 
client=ml_client + assert ( + find_idle_aml_compute_instances( + subscription_id="sub-123", credential=None, client=ml_client + ) + == [] ) - assert len(findings) == 0 - def test_non_compute_instance_type_skipped(): """AmlCompute clusters must not be picked up by this rule.""" ws = _make_workspace() - instance = _make_instance(age_days=30) - instance.properties.compute_type = "AmlCompute" + instance = _make_instance(compute_type="AmlCompute", age_days=31, last_op_time_days=20) ml_client = _make_client(ws, [instance]) assert ( @@ -258,15 +463,11 @@ def test_non_compute_instance_type_skipped(): ) -# --------------------------------------------------------------------------- -# Age guard -# --------------------------------------------------------------------------- - - -def test_young_instance_skipped(): - """Instance younger than minimum age guard -> skipped.""" +@pytest.mark.parametrize("pstate", ["Failed", "Creating", "Deleting", "Canceled", "Unknown"]) +def test_non_succeeded_provisioning_state_skipped(pstate): + """spec 8.6: provisioning_state must be exactly 'Succeeded'.""" ws = _make_workspace() - instance = _make_instance(age_days=3) + instance = _make_instance(provisioning_state=pstate, age_days=31, last_op_time_days=20) ml_client = _make_client(ws, [instance]) assert ( @@ -277,10 +478,12 @@ def test_young_instance_skipped(): ) -def test_instance_at_boundary_age_skipped(): - """Instance at exactly 6 days (below max(idle_days//2=7, 7)) -> skipped.""" +def test_conflicting_compute_type_skipped(): + """spec 9.1: conflicting SDK+raw compute_type -> skip.""" ws = _make_workspace() - instance = _make_instance(age_days=6) + instance = _make_instance(age_days=31, last_op_time_days=20) + instance.properties.compute_type = "ComputeInstance" + instance.properties.computeType = "AmlCompute" ml_client = _make_client(ws, [instance]) assert ( @@ -292,141 +495,214 @@ def test_instance_at_boundary_age_skipped(): # 
--------------------------------------------------------------------------- -# Confidence levels +# Location contract (spec 8.4, 8.8, 9.2) # --------------------------------------------------------------------------- -def test_high_confidence_at_full_threshold(): - """idle_since >= idle_days AND age >= idle_days -> HIGH confidence.""" +def test_unresolvable_location_skipped(): + """spec 8.8: unresolvable location -> skip.""" + ws = _make_workspace() + instance = _make_instance(location=None, age_days=31, last_op_time_days=20) + instance.location = None + ml_client = _make_client(ws, [instance]) + + assert ( + find_idle_aml_compute_instances( + subscription_id="sub-123", credential=None, client=ml_client + ) + == [] + ) + + +def test_region_filter_exact_lowercase_match(): + """spec 8.4: exact lowercase equality for region filter.""" ws = _make_workspace() - instance = _make_instance(age_days=14, idle_since_days=14) + instance = _make_instance(location="eastus", age_days=31, last_op_time_days=20) ml_client = _make_client(ws, [instance]) findings = find_idle_aml_compute_instances( - subscription_id="sub-123", credential=None, client=ml_client + subscription_id="sub-123", + credential=None, + client=ml_client, + region_filter="eastus", ) - assert findings[0].confidence.value == "high" + assert len(findings) == 1 + +def test_region_filter_excludes(): + ws = _make_workspace() + instance = _make_instance(location="westeurope", age_days=31, last_op_time_days=20) + ml_client = _make_client(ws, [instance]) -def test_medium_confidence_at_75_percent(): - """idle_since and age at 75% of threshold -> MEDIUM confidence.""" + assert ( + find_idle_aml_compute_instances( + subscription_id="sub-123", + credential=None, + client=ml_client, + region_filter="eastus", + ) + == [] + ) + + +def test_region_filter_normalises_to_lowercase(): + """spec 7: filter is lowercased; compute location is lowercased too.""" ws = _make_workspace() - # idle_days=14, threshold_medium=10, age=11, idle=11 
-> MEDIUM - instance = _make_instance(age_days=11, idle_since_days=11) + instance = _make_instance(location="East US", age_days=31, last_op_time_days=20) ml_client = _make_client(ws, [instance]) findings = find_idle_aml_compute_instances( - subscription_id="sub-123", credential=None, client=ml_client + subscription_id="sub-123", + credential=None, + client=ml_client, + region_filter="East US", ) - assert findings[0].confidence.value == "medium" + assert len(findings) == 1 + assert findings[0].region == "east us" -def test_below_medium_threshold_skipped(): - """Below 75% threshold -> skipped.""" +def test_compute_location_used_not_workspace_location(): + """spec 9.2: region comes from compute resource, not workspace.""" ws = _make_workspace() - # age=8, idle=8 -> 8 < 10 -> skip - instance = _make_instance(age_days=8, idle_since_days=8) + # compute location = westeurope, filter for eastus -> exclude + instance = _make_instance(location="westeurope", age_days=31, last_op_time_days=20) ml_client = _make_client(ws, [instance]) assert ( find_idle_aml_compute_instances( - subscription_id="sub-123", credential=None, client=ml_client + subscription_id="sub-123", + credential=None, + client=ml_client, + region_filter="eastus", ) == [] ) -def test_recently_active_instance_skipped(): - """Instance active 3 days ago should not be flagged even if old.""" +# --------------------------------------------------------------------------- +# Age contract (spec 8.9, 9.3) +# --------------------------------------------------------------------------- + + +def test_instance_younger_than_idle_days_skipped(): + """spec 9.3.3: age < idle_days -> skip.""" ws = _make_workspace() - instance = _make_instance(age_days=60, idle_since_days=3) + # age=13, threshold=14 -> skip + instance = _make_instance(age_days=14, last_op_time_days=13) ml_client = _make_client(ws, [instance]) assert ( find_idle_aml_compute_instances( - subscription_id="sub-123", credential=None, client=ml_client + 
subscription_id="sub-123", credential=None, client=ml_client, idle_days=14 ) == [] ) -def test_custom_idle_days_respected(): - """Custom idle_days=7 — instance idle 7 days should be HIGH confidence.""" +def test_instance_exactly_idle_days_age_eligible(): + """spec 9.3.3: age == idle_days -> eligible (boundary case).""" ws = _make_workspace() - instance = _make_instance(age_days=7, idle_since_days=7) + # age=15, threshold=14: age gate passes; last_op 14 days ago -> idle_since_days=14 + instance = _make_instance(age_days=15, last_op_time_days=14) ml_client = _make_client(ws, [instance]) findings = find_idle_aml_compute_instances( - subscription_id="sub-123", credential=None, client=ml_client, idle_days=7 + subscription_id="sub-123", credential=None, client=ml_client, idle_days=14 ) assert len(findings) == 1 - assert findings[0].confidence.value == "high" + + +def test_missing_created_at_skips(): + """spec 8.9: absent created_at -> skip.""" + ws = _make_workspace() + instance = _make_instance(age_days=31, last_op_time_days=20) + instance.properties.created_on = None + ml_client = _make_client(ws, [instance]) + + assert ( + find_idle_aml_compute_instances( + subscription_id="sub-123", credential=None, client=ml_client + ) + == [] + ) + + +def test_future_created_at_skips(): + """spec 9.3.2: future created_at -> skip.""" + ws = _make_workspace() + instance = _make_instance(age_days=31, last_op_time_days=20) + instance.properties.created_on = datetime.now(timezone.utc) + timedelta(days=1) + ml_client = _make_client(ws, [instance]) + + assert ( + find_idle_aml_compute_instances( + subscription_id="sub-123", credential=None, client=ml_client + ) + == [] + ) # --------------------------------------------------------------------------- -# Idle signal fallbacks +# Lifecycle-activity contract (spec 9.4) # --------------------------------------------------------------------------- -def test_idle_signal_source_last_operation(): - """idle_signal_source should be 'last_operation' 
when last_operation.operation_time is present.""" +def test_last_op_time_camelcase_operationtime_used(): + """Gap 1: camelCase operationTime is resolved when snake_case operation_time is absent.""" ws = _make_workspace() - instance = _make_instance(age_days=30, idle_since_days=20) + instance = _make_instance(age_days=31, last_op_time_days=20) + op_time = instance.properties.properties.last_operation.operation_time + instance.properties.properties.last_operation.operation_time = None + instance.properties.properties.last_operation.operationTime = op_time ml_client = _make_client(ws, [instance]) findings = find_idle_aml_compute_instances( subscription_id="sub-123", credential=None, client=ml_client ) + assert len(findings) == 1 assert findings[0].details["idle_signal_source"] == "last_operation" -def test_idle_signal_source_last_modified_at(): - """idle_signal_source should be 'last_modified_at' when falling back to system_data.""" +def test_last_op_name_camelcase_operationname_used(): + """Gap 1: camelCase operationName is resolved when snake_case operation_name is absent.""" ws = _make_workspace() - instance = _make_instance(age_days=30, idle_since_days=None) - instance.properties.properties.last_operation = None - instance.system_data = SimpleNamespace( - last_modified_at=datetime.now(timezone.utc) - timedelta(days=20) - ) + instance = _make_instance(age_days=31, last_op_time_days=20, last_op_name=None) + instance.properties.properties.last_operation.operation_name = None + instance.properties.properties.last_operation.operationName = "Restart" ml_client = _make_client(ws, [instance]) findings = find_idle_aml_compute_instances( subscription_id="sub-123", credential=None, client=ml_client ) - assert findings[0].details["idle_signal_source"] == "last_modified_at" + assert len(findings) == 1 + assert findings[0].details["last_operation_name"] == "Restart" -def test_idle_signal_source_age_fallback(): - """idle_signal_source 'age_fallback' caps confidence at MEDIUM — 
age alone is not an idle signal.""" +def test_last_op_status_camelcase_operationstatus_used(): + """Gap 1: camelCase operationStatus is resolved when snake_case operation_status is absent.""" ws = _make_workspace() - instance = _make_instance(age_days=30, idle_since_days=None) - instance.properties.properties.last_operation = None - instance.system_data = None + instance = _make_instance(age_days=31, last_op_time_days=20, last_op_status=None) + instance.properties.properties.last_operation.operation_status = None + instance.properties.properties.last_operation.operationStatus = "Succeeded" ml_client = _make_client(ws, [instance]) findings = find_idle_aml_compute_instances( subscription_id="sub-123", credential=None, client=ml_client ) - assert findings[0].details["idle_signal_source"] == "age_fallback" - assert findings[0].confidence.value == "medium" + assert len(findings) == 1 + assert findings[0].details["last_operation_status"] == "Succeeded" -def test_fallback_to_system_data_when_no_last_operation(): - """When last_operation is absent, system_data.last_modified_at should be used.""" +def test_last_op_time_present_and_stale_emits(): + """spec 9.4.3: lastOperation.operationTime present and stale -> emit.""" ws = _make_workspace() - instance = _make_instance(age_days=30, idle_since_days=None) - # Remove last_operation - instance.properties.properties.last_operation = None - # Supply system_data with 20-day-old modification time - instance.system_data = SimpleNamespace( - last_modified_at=datetime.now(timezone.utc) - timedelta(days=20) - ) + instance = _make_instance(age_days=31, last_op_time_days=20) ml_client = _make_client(ws, [instance]) findings = find_idle_aml_compute_instances( @@ -434,15 +710,52 @@ def test_fallback_to_system_data_when_no_last_operation(): ) assert len(findings) == 1 + assert findings[0].details["idle_signal_source"] == "last_operation" assert findings[0].details["idle_since_days"] >= 19 -def 
test_fallback_to_age_when_no_operation_or_system_data(): - """When both last_operation and system_data are absent, age is used as proxy.""" +def test_last_op_time_present_but_unparsable_skips(): + """spec 9.4.4: lastOperation.operationTime present but unparsable -> skip.""" ws = _make_workspace() - instance = _make_instance(age_days=30, idle_since_days=None) - instance.properties.properties.last_operation = None - instance.system_data = None + instance = _make_instance(age_days=31, last_op_time_days=20) + instance.properties.properties.last_operation.operation_time = "not-a-valid-timestamp" + ml_client = _make_client(ws, [instance]) + + assert ( + find_idle_aml_compute_instances( + subscription_id="sub-123", credential=None, client=ml_client + ) + == [] + ) + + +def test_last_op_time_equals_created_at_skips(): + """spec 9.4.7: operationTime == created_at -> no proven post-create signal -> skip.""" + ws = _make_workspace() + now = datetime.now(timezone.utc) + created_on = now - timedelta(days=30) + instance = _make_instance(age_days=31, last_op_time_days=20) + # Force both timestamps to be identical + instance.properties.created_on = created_on + instance.properties.properties.last_operation.operation_time = created_on + ml_client = _make_client(ws, [instance]) + + assert ( + find_idle_aml_compute_instances( + subscription_id="sub-123", credential=None, client=ml_client + ) + == [] + ) + + +def test_last_op_absent_falls_back_to_modified_on(): + """spec 9.4.8: lastOperation absent + modifiedOn > created_at -> modified_on signal.""" + ws = _make_workspace() + instance = _make_instance( + age_days=31, + last_op_time_days=None, # no last operation + modified_days=20, # modifiedOn 20 days ago (< age_days=31) + ) ml_client = _make_client(ws, [instance]) findings = find_idle_aml_compute_instances( @@ -450,14 +763,16 @@ def test_fallback_to_age_when_no_operation_or_system_data(): ) assert len(findings) == 1 - assert findings[0].details["idle_since_days"] == 30 + assert 
findings[0].details["idle_signal_source"] == "modified_on" + assert findings[0].details["idle_since_days"] >= 19 -def test_missing_creation_time_uses_neutral_default(): - """Missing created_on should not silently skip the instance.""" +def test_last_op_no_operation_time_falls_back_to_modified_on(): + """spec 9.4.8: lastOperation present but operationTime absent -> try modifiedOn.""" ws = _make_workspace() - instance = _make_instance(age_days=30, idle_since_days=20) - instance.properties.created_on = None # no creation time + instance = _make_instance(age_days=31, last_op_time_days=20, modified_days=20) + # Clear operation_time to simulate field absent + instance.properties.properties.last_operation.operation_time = None ml_client = _make_client(ws, [instance]) findings = find_idle_aml_compute_instances( @@ -465,12 +780,141 @@ def test_missing_creation_time_uses_neutral_default(): ) assert len(findings) == 1 + assert findings[0].details["idle_signal_source"] == "modified_on" + + +def test_modified_on_equals_created_at_skips(): + """spec 9.4.9: modifiedOn == created_at -> no proven post-create signal -> skip.""" + ws = _make_workspace() + now = datetime.now(timezone.utc) + created_on = now - timedelta(days=30) + instance = _make_instance(age_days=31, last_op_time_days=None, modified_days=None) + instance.properties.created_on = created_on + instance.properties.modified_on = created_on # equal -> skip + ml_client = _make_client(ws, [instance]) + + assert ( + find_idle_aml_compute_instances( + subscription_id="sub-123", credential=None, client=ml_client + ) + == [] + ) + + +def test_modified_on_before_created_at_skips(): + """spec 9.4.8: modifiedOn <= created_at -> must not use -> skip.""" + ws = _make_workspace() + now = datetime.now(timezone.utc) + created_on = now - timedelta(days=30) + instance = _make_instance(age_days=31, last_op_time_days=None, modified_days=None) + instance.properties.created_on = created_on + instance.properties.modified_on = created_on - 
timedelta(days=1) # before created_on + ml_client = _make_client(ws, [instance]) + + assert ( + find_idle_aml_compute_instances( + subscription_id="sub-123", credential=None, client=ml_client + ) + == [] + ) + + +def test_no_lifecycle_signal_skips(): + """spec 9.4.12-13: no lastOperation and no modifiedOn -> fail closed -> skip.""" + ws = _make_workspace() + instance = _make_instance(age_days=31, last_op_time_days=None, modified_days=None) + ml_client = _make_client(ws, [instance]) + + assert ( + find_idle_aml_compute_instances( + subscription_id="sub-123", credential=None, client=ml_client + ) + == [] + ) + + +def test_system_data_last_modified_not_used(): + """spec 9.4.12: systemData.lastModifiedAt is undocumented -> not used.""" + ws = _make_workspace() + instance = _make_instance(age_days=31, last_op_time_days=None, modified_days=None) + # Attach system_data — must NOT trigger a finding + instance.system_data = SimpleNamespace( + last_modified_at=datetime.now(timezone.utc) - timedelta(days=20) + ) + ml_client = _make_client(ws, [instance]) + + assert ( + find_idle_aml_compute_instances( + subscription_id="sub-123", credential=None, client=ml_client + ) + == [] + ) + + +def test_age_only_fallback_not_used(): + """spec 9.4.13: age-only fallback must not be used to prove idleness.""" + ws = _make_workspace() + # Very old instance, no lifecycle signal at all + instance = _make_instance(age_days=365, last_op_time_days=None, modified_days=None) + ml_client = _make_client(ws, [instance]) + + assert ( + find_idle_aml_compute_instances( + subscription_id="sub-123", credential=None, client=ml_client + ) + == [] + ) + + +def test_future_last_op_time_skips(): + """spec 9.4.11: lifecycle timestamp in the future -> skip (no clock-skew tolerance).""" + ws = _make_workspace() + instance = _make_instance(age_days=31, last_op_time_days=20) + instance.properties.properties.last_operation.operation_time = datetime.now( + timezone.utc + ) + timedelta(days=1) + ml_client = 
_make_client(ws, [instance]) + + assert ( + find_idle_aml_compute_instances( + subscription_id="sub-123", credential=None, client=ml_client + ) + == [] + ) + + +def test_idle_since_days_below_threshold_skips(): + """spec 8.12: floored idle_since_days < effective_idle_days -> skip.""" + ws = _make_workspace() + instance = _make_instance(age_days=31, last_op_time_days=5) # only 5 days idle + ml_client = _make_client(ws, [instance]) + + assert ( + find_idle_aml_compute_instances( + subscription_id="sub-123", credential=None, client=ml_client, idle_days=14 + ) + == [] + ) + + +def test_recently_active_instance_skipped(): + """Instance active 3 days ago should not be flagged even if old.""" + ws = _make_workspace() + instance = _make_instance(age_days=61, last_op_time_days=3) + ml_client = _make_client(ws, [instance]) + + assert ( + find_idle_aml_compute_instances( + subscription_id="sub-123", credential=None, client=ml_client + ) + == [] + ) def test_timezone_naive_op_time_handled(): - """Timezone-naive last_operation.operation_time should be normalised.""" + """Timezone-naive lastOperation.operationTime is normalized to UTC.""" ws = _make_workspace() - instance = _make_instance(age_days=30) + instance = _make_instance(age_days=31, last_op_time_days=20) instance.properties.properties.last_operation.operation_time = datetime.now() - timedelta( days=20 ) # naive @@ -485,211 +929,307 @@ def test_timezone_naive_op_time_handled(): # --------------------------------------------------------------------------- -# GPU family detection +# Confidence contract (spec 9.5) # --------------------------------------------------------------------------- -@pytest.mark.parametrize( - "vm_size,expected_gpu", - [ - ("Standard_NC6s_v3", True), - ("Standard_NC12s_v3", True), - ("Standard_ND6s", True), - ("Standard_ND40rs_v2", True), - ("Standard_NV6", True), - ("Standard_DS3_v2", False), - ("Standard_D4s_v3", False), - ("Standard_DS11_v2", False), - ], -) -def 
test_gpu_family_classification(vm_size, expected_gpu): +def test_confidence_medium_for_last_operation_source(): + """spec 9.5: MEDIUM confidence when idle_signal_source == last_operation.""" ws = _make_workspace() - # age_days=14 -> idle_ratio=1.0 -> GPU=HIGH, CPU=MEDIUM - instance = _make_instance(vm_size=vm_size, age_days=14, idle_since_days=14) + instance = _make_instance(age_days=31, last_op_time_days=20) ml_client = _make_client(ws, [instance]) findings = find_idle_aml_compute_instances( subscription_id="sub-123", credential=None, client=ml_client ) - assert len(findings) == 1 - assert findings[0].details["is_gpu"] is expected_gpu - if expected_gpu: - assert findings[0].risk.value == "high" - else: - assert findings[0].risk.value == "medium" + assert findings[0].confidence.value == "medium" + assert findings[0].details["idle_signal_source"] == "last_operation" + + +def test_confidence_low_for_modified_on_source(): + """spec 9.5: LOW confidence when idle_signal_source == modified_on.""" + ws = _make_workspace() + instance = _make_instance( + age_days=31, + last_op_time_days=None, + modified_days=20, + ) + ml_client = _make_client(ws, [instance]) + + findings = find_idle_aml_compute_instances( + subscription_id="sub-123", credential=None, client=ml_client + ) + + assert findings[0].confidence.value == "low" + assert findings[0].details["idle_signal_source"] == "modified_on" # --------------------------------------------------------------------------- -# Cost lookup +# GPU classification (spec 7, 9.5) # --------------------------------------------------------------------------- @pytest.mark.parametrize( - "vm_size,expected_cost", + "vm_size,expected_risk", [ - ("Standard_DS3_v2", 260.0), - ("Standard_D4s_v3", 192.0), - ("Standard_NC6s_v3", 2203.0), - ("Standard_NC24s_v3", 8812.0), - ("Standard_ND40rs_v2", 15862.0), - ("Standard_NV12", 2189.0), + ("Standard_NC6s_v3", "high"), + ("Standard_NC12s_v3", "high"), + ("Standard_ND6s", "high"), + ("Standard_ND40rs_v2", 
"high"), + ("Standard_NV6", "high"), + ("Standard_DS3_v2", "medium"), + ("Standard_D4s_v3", "medium"), + ("Standard_DS11_v2", "medium"), + # lowercase prefix — NOT GPU (case-sensitive matching) + ("standard_nc6s_v3", "medium"), ], ) -def test_cost_lookup_by_vm_size(vm_size, expected_cost): +def test_gpu_family_classification(vm_size, expected_risk): + """spec 7, 9.5: exact case-sensitive GPU prefix classification.""" ws = _make_workspace() - instance = _make_instance(vm_size=vm_size, age_days=30) + # age_days=15, last_op_time_days=14: op_time is 1 day after created_on + instance = _make_instance(vm_size=vm_size, age_days=15, last_op_time_days=14) ml_client = _make_client(ws, [instance]) findings = find_idle_aml_compute_instances( subscription_id="sub-123", credential=None, client=ml_client ) - assert findings[0].estimated_monthly_cost_usd == expected_cost + assert len(findings) == 1 + assert findings[0].risk.value == expected_risk + +# --------------------------------------------------------------------------- +# Custom idle_days threshold +# --------------------------------------------------------------------------- -def test_unknown_vm_size_uses_default_cost(): + +def test_custom_idle_days_respected(): + """Custom idle_days=7 — instance idle 7 days should emit.""" ws = _make_workspace() - instance = _make_instance(vm_size="Standard_FUTURE_99xlarge", age_days=30) + # age=8 >= idle_days=7; last_op 7 days ago -> idle_since_days=7 >= 7 + instance = _make_instance(age_days=8, last_op_time_days=7) ml_client = _make_client(ws, [instance]) findings = find_idle_aml_compute_instances( - subscription_id="sub-123", credential=None, client=ml_client + subscription_id="sub-123", credential=None, client=ml_client, idle_days=7 ) - assert findings[0].estimated_monthly_cost_usd == 200.0 + assert len(findings) == 1 + assert findings[0].details["idle_days_threshold"] == 7 -def test_vm_size_case_insensitive_lookup(): - """Azure ML may return VM sizes in uppercase or mixed case — cost 
lookup must handle it.""" +def test_idle_days_zero_clamped_to_one(): + """idle_days=0 must be clamped to 1 (spec 6.3).""" ws = _make_workspace() - instance = _make_instance(vm_size="STANDARD_NC6S_V3", age_days=30) + instance = _make_instance(age_days=31, last_op_time_days=20) ml_client = _make_client(ws, [instance]) findings = find_idle_aml_compute_instances( - subscription_id="sub-123", credential=None, client=ml_client + subscription_id="sub-123", credential=None, client=ml_client, idle_days=0 ) - assert findings[0].estimated_monthly_cost_usd == 2203.0 + assert len(findings) == 1 + assert findings[0].details["idle_days_threshold"] == 1 # --------------------------------------------------------------------------- -# Region filtering +# Finding shape (spec 11.1, 11.3, 11.4) # --------------------------------------------------------------------------- -def test_region_filter_matches(): - ws = _make_workspace(location="westeurope") - instance = _make_instance(age_days=30) +def test_finding_required_fields(): + """spec 11.1: required top-level finding fields.""" + ws = _make_workspace(name="ml-prod", rg="rg-prod") + instance = _make_instance( + name="gpu-dev", + vm_size="Standard_NC6s_v3", + age_days=31, + last_op_time_days=20, + last_op_status="Succeeded", + workspace="ml-prod", + rg="rg-prod", + ) ml_client = _make_client(ws, [instance]) findings = find_idle_aml_compute_instances( - subscription_id="sub-123", - credential=None, - client=ml_client, - region_filter="westeurope", + subscription_id="sub-123", credential=None, client=ml_client ) - assert len(findings) == 1 + f = findings[0] + assert f.provider == "azure" + assert f.rule_id == "azure.ml.compute_instance.idle" + assert f.resource_type == "azure.ml.compute_instance" + assert f.region == "eastus" + assert f.estimated_monthly_cost_usd is None + assert f.detected_at is not None + assert f.evidence is not None -def test_region_filter_excludes(): - ws = _make_workspace(location="westeurope") - instance = 
_make_instance(age_days=30) +def test_finding_detail_fields_complete(): + """spec 11.4: all required detail fields are present.""" + ws = _make_workspace(name="ml-prod", rg="rg-prod") + instance = _make_instance( + name="gpu-dev", + vm_size="Standard_NC6s_v3", + age_days=31, + last_op_time_days=20, + last_op_name="Start", + last_op_status="Succeeded", + modified_days=25, + workspace="ml-prod", + rg="rg-prod", + tags={"team": "research"}, + ) ml_client = _make_client(ws, [instance]) findings = find_idle_aml_compute_instances( - subscription_id="sub-123", - credential=None, - client=ml_client, - region_filter="eastus", + subscription_id="sub-123", credential=None, client=ml_client ) - assert len(findings) == 0 - - -def test_region_filter_normalises_case_and_spaces(): - """Region filter comparison should be case/space/hyphen insensitive.""" - ws = _make_workspace(location="East US") - instance = _make_instance(age_days=30) + d = findings[0].details + assert d["instance_name"] == "gpu-dev" + assert d["workspace_name"] == "ml-prod" + assert d["resource_group"] == "rg-prod" + assert d["subscription_id"] == "sub-123" + assert d["location"] == "eastus" + assert d["vm_size"] == "Standard_NC6s_v3" + assert d["compute_type"] == "ComputeInstance" + assert d["provisioning_state"] == "Succeeded" + assert d["state"] == "Running" + assert d["created_at"] is not None + assert d["modified_at"] is not None + assert d["last_operation_name"] == "Start" + assert d["last_operation_time"] is not None + assert d["last_operation_status"] == "Succeeded" + assert isinstance(d["idle_since_days"], int) + assert d["idle_days_threshold"] == 14 + assert d["idle_signal_source"] == "last_operation" + assert d["tags"] == {"team": "research"} + + +def test_last_operation_time_none_when_modified_on_signal(): + """spec 11.4: last_operation_time is null when idle_signal_source == modified_on.""" + ws = _make_workspace() + instance = _make_instance( + age_days=31, + last_op_time_days=None, + 
modified_days=20, + ) ml_client = _make_client(ws, [instance]) findings = find_idle_aml_compute_instances( - subscription_id="sub-123", - credential=None, - client=ml_client, - region_filter="eastus", + subscription_id="sub-123", credential=None, client=ml_client ) - assert len(findings) == 1 - + d = findings[0].details + assert d["idle_signal_source"] == "modified_on" + assert d["last_operation_time"] is None + assert d["modified_at"] is not None -# --------------------------------------------------------------------------- -# Finding structure -# --------------------------------------------------------------------------- - -def test_finding_fields_complete(): - ws = _make_workspace(name="ml-prod", rg="rg-prod") - instance = _make_instance(name="gpu-dev", vm_size="Standard_NC6s_v3", age_days=30) +def test_modified_at_present_even_when_not_selected_signal(): + """spec 11.4: modified_at included even when last_operation is the idle signal.""" + ws = _make_workspace() + instance = _make_instance(age_days=31, last_op_time_days=20, modified_days=25) ml_client = _make_client(ws, [instance]) findings = find_idle_aml_compute_instances( subscription_id="sub-123", credential=None, client=ml_client ) - f = findings[0] - assert f.provider == "azure" - assert f.region == "eastus" - assert f.detected_at is not None - assert f.evidence is not None - assert f.details["instance_name"] == "gpu-dev" - assert f.details["workspace_name"] == "ml-prod" - assert f.details["resource_group"] == "rg-prod" - assert f.details["idle_days_threshold"] == 14 - assert "~$" in f.details["estimated_monthly_cost"] - assert f.details["cost_source"] == "approximate_eastus" + d = findings[0].details + assert d["idle_signal_source"] == "last_operation" + assert d["modified_at"] is not None # present even though not the selected signal -def test_summary_contains_instance_and_workspace(): - ws = _make_workspace(name="research-ws") - instance = _make_instance(name="cv-model-dev", age_days=30) +def 
test_tags_never_none(): + """spec 7: tags must never be None in output.""" + ws = _make_workspace() + instance = _make_instance(age_days=31, last_op_time_days=20, tags=None) ml_client = _make_client(ws, [instance]) findings = find_idle_aml_compute_instances( subscription_id="sub-123", credential=None, client=ml_client ) - assert "cv-model-dev" in findings[0].summary - assert "research-ws" in findings[0].summary - assert "control-plane" in findings[0].summary + assert findings[0].details["tags"] == {} -def test_title_format(): +def test_idle_since_days_is_floored_integer(): + """spec 9.4.16: idle_since_days must be the floored integer idle duration.""" ws = _make_workspace() - instance = _make_instance(age_days=30, idle_since_days=30) + instance = _make_instance(age_days=31, last_op_time_days=20) ml_client = _make_client(ws, [instance]) findings = find_idle_aml_compute_instances( subscription_id="sub-123", credential=None, client=ml_client ) - assert findings[0].title == "Idle Azure ML Compute Instance (No Activity for 30 Days)" + assert isinstance(findings[0].details["idle_since_days"], int) -def test_idle_days_zero_clamped_to_one(): - """idle_days=0 must be clamped to 1 to prevent division-by-zero.""" +def test_signals_used_disclose_required_items(): + """spec 11.3: signals_used discloses compute type, states, age, and actual field name.""" ws = _make_workspace() - instance = _make_instance(age_days=30, idle_since_days=30) + instance = _make_instance(age_days=31, last_op_time_days=20) + ml_client = _make_client(ws, [instance]) + + f = find_idle_aml_compute_instances( + subscription_id="sub-123", credential=None, client=ml_client + )[0] + + signals = " ".join(f.evidence.signals_used) + assert "ComputeInstance" in signals + assert "Succeeded" in signals + assert "Running" in signals + # Gap 2: actual field name, not just source label + assert "lastOperation.operationTime" in signals + + +def test_signals_used_names_modified_on_field(): + """Gap 2: signals_used names 
'modifiedOn' when that fallback is the signal source.""" + ws = _make_workspace() + instance = _make_instance(age_days=31, last_op_time_days=None, modified_days=20) + ml_client = _make_client(ws, [instance]) + + f = find_idle_aml_compute_instances( + subscription_id="sub-123", credential=None, client=ml_client + )[0] + + signals = " ".join(f.evidence.signals_used) + assert "modifiedOn" in signals + + +def test_signals_not_checked_includes_blind_spots(): + """spec 11.3: signals_not_checked lists runtime blind spots.""" + ws = _make_workspace() + instance = _make_instance(age_days=31, last_op_time_days=20) + ml_client = _make_client(ws, [instance]) + + f = find_idle_aml_compute_instances( + subscription_id="sub-123", credential=None, client=ml_client + )[0] + + combined = " ".join(f.evidence.signals_not_checked).lower() + assert "jupyter" in combined + assert "vs code" in combined + assert "aml" in combined or "experiment" in combined + + +def test_summary_contains_instance_and_workspace(): + ws = _make_workspace(name="research-ws") + instance = _make_instance(name="cv-model-dev", age_days=31, last_op_time_days=20) ml_client = _make_client(ws, [instance]) findings = find_idle_aml_compute_instances( - subscription_id="sub-123", credential=None, client=ml_client, idle_days=0 + subscription_id="sub-123", credential=None, client=ml_client ) - assert isinstance(findings, list) - assert len(findings) == 1 - assert findings[0].details["idle_days_threshold"] == 1 + assert "cv-model-dev" in findings[0].summary + assert "research-ws" in findings[0].summary + assert "Running" in findings[0].summary # --------------------------------------------------------------------------- @@ -698,13 +1238,22 @@ def test_idle_days_zero_clamped_to_one(): def test_multiple_instances_mixed(): - """Only idle Running instances should be flagged.""" + """Only idle Running Succeeded ComputeInstance instances should be flagged.""" ws = _make_workspace() instances = [ - _make_instance("idle-gpu", 
"Standard_NC6s_v3", age_days=30, idle_since_days=30), - _make_instance("active-cpu", "Standard_DS3_v2", age_days=30, idle_since_days=2), - _make_instance("stopped-gpu", "Standard_NC12s_v3", state="Stopped", age_days=30), - _make_instance("idle-cpu", "Standard_D4s_v3", age_days=14, idle_since_days=14), + _make_instance("idle-gpu", "Standard_NC6s_v3", age_days=31, last_op_time_days=20), + _make_instance("active-cpu", "Standard_DS3_v2", age_days=31, last_op_time_days=3), + _make_instance( + "stopped-gpu", "Standard_NC12s_v3", state="Stopped", age_days=31, last_op_time_days=20 + ), + _make_instance("idle-cpu", "Standard_D4s_v3", age_days=15, last_op_time_days=14), + _make_instance( + "failed-prov", + "Standard_DS3_v2", + provisioning_state="Failed", + age_days=31, + last_op_time_days=20, + ), ] ml_client = _make_client(ws, instances) @@ -718,15 +1267,16 @@ def test_multiple_instances_mixed(): assert "idle-cpu" in names assert "active-cpu" not in names assert "stopped-gpu" not in names + assert "failed-prov" not in names # --------------------------------------------------------------------------- -# Permission error handling +# Exception handling (spec 12) # --------------------------------------------------------------------------- -def test_permission_error_on_authorization_failure(): - """AuthorizationFailed in workspaces.list should raise PermissionError.""" +def test_workspace_list_auth_error_propagates_as_is(): + """spec 12: subscription-wide workspace inventory failures propagate unchanged.""" class _ForbiddenClient: class workspaces: # noqa: N801 @@ -734,33 +1284,33 @@ class workspaces: # noqa: N801 def list_by_subscription(): raise Exception("AuthorizationFailed: insufficient permissions") - compute = None + machine_learning_compute = None - with pytest.raises(PermissionError) as exc_info: + with pytest.raises(Exception, match="AuthorizationFailed"): find_idle_aml_compute_instances( subscription_id="sub-123", credential=None, client=_ForbiddenClient() ) - 
assert "Microsoft.MachineLearningServices/workspaces/read" in str(exc_info.value) +def test_workspace_list_403_error_propagates_as_is(): + """spec 12: 403 from workspace listing propagates unchanged (not converted).""" -def test_403_error_raises_permission_error(): class _ForbiddenClient: class workspaces: # noqa: N801 @staticmethod def list_by_subscription(): raise Exception("Forbidden (403) — access denied") - compute = None + machine_learning_compute = None - with pytest.raises(PermissionError): + with pytest.raises(Exception, match="403"): find_idle_aml_compute_instances( subscription_id="sub-123", credential=None, client=_ForbiddenClient() ) -def test_unexpected_error_propagates(): - """Non-permission errors should propagate, not be swallowed.""" +def test_unexpected_workspace_list_error_propagates(): + """spec 12: any workspace listing failure propagates.""" class _BrokenClient: class workspaces: # noqa: N801 @@ -768,7 +1318,7 @@ class workspaces: # noqa: N801 def list_by_subscription(): raise RuntimeError("Unexpected SDK error") - compute = None + machine_learning_compute = None with pytest.raises(RuntimeError): find_idle_aml_compute_instances( @@ -776,8 +1326,8 @@ def list_by_subscription(): ) -def test_compute_list_auth_error_raises_permission_error(): - """AuthorizationFailed on compute.list() must surface as PermissionError, not be swallowed.""" +def test_compute_list_auth_error_skips_workspace(): + """spec 12: per-workspace compute listing failure (including auth) skips that workspace.""" ws = _make_workspace() def _compute_list(rg, ws_name): @@ -788,19 +1338,20 @@ def _compute_list(rg, ws_name): machine_learning_compute=SimpleNamespace(list_by_workspace=_compute_list), ) - with pytest.raises(PermissionError) as exc_info: - find_idle_aml_compute_instances( - subscription_id="sub-123", credential=None, client=ml_client - ) - - assert "Microsoft.MachineLearningServices/workspaces/computes/read" in str(exc_info.value) + # Must not raise — workspace is 
skipped, returning empty findings + findings = find_idle_aml_compute_instances( + subscription_id="sub-123", credential=None, client=ml_client + ) + assert findings == [] -def test_compute_list_error_skips_workspace_preserves_findings(): - """Transient error in compute.list() for one workspace must not abort findings from others.""" - ws_good = _make_workspace(name="good-ws", location="eastus", rg="rg-good") - ws_bad = _make_workspace(name="bad-ws", location="eastus", rg="rg-bad") - good_instance = _make_instance(age_days=30, workspace="good-ws", rg="rg-good") +def test_compute_list_transient_error_skips_workspace_preserves_findings(): + """spec 12: transient error on compute listing skips that workspace, preserves others.""" + ws_good = _make_workspace(name="good-ws", rg="rg-good") + ws_bad = _make_workspace(name="bad-ws", rg="rg-bad") + good_instance = _make_instance( + age_days=31, last_op_time_days=20, workspace="good-ws", rg="rg-good" + ) call_count = 0 @@ -820,10 +1371,54 @@ def _compute_list(rg, ws_name): subscription_id="sub-123", credential=None, client=ml_client ) - # The good workspace finding is preserved; bad workspace is skipped assert len(findings) == 1 assert findings[0].details["workspace_name"] == "good-ws" - assert call_count == 2 # both workspaces were attempted + assert call_count == 2 + + +def test_missing_compute_id_skips(): + """spec 8.1: absent compute id -> skip.""" + ws = _make_workspace() + instance = _make_instance(age_days=31, last_op_time_days=20) + instance.id = None + ml_client = _make_client(ws, [instance]) + + assert ( + find_idle_aml_compute_instances( + subscription_id="sub-123", credential=None, client=ml_client + ) + == [] + ) + + +def test_missing_compute_name_skips(): + """spec 8.2: absent compute name -> skip.""" + ws = _make_workspace() + instance = _make_instance(age_days=31, last_op_time_days=20) + instance.name = None + ml_client = _make_client(ws, [instance]) + + assert ( + find_idle_aml_compute_instances( + 
subscription_id="sub-123", credential=None, client=ml_client + ) + == [] + ) + + +def test_missing_workspace_name_skips(): + """spec 8.3: absent workspace name -> skip.""" + ws = _make_workspace() + ws.name = None + instance = _make_instance(age_days=31, last_op_time_days=20) + ml_client = _make_client(ws, [instance]) + + assert ( + find_idle_aml_compute_instances( + subscription_id="sub-123", credential=None, client=ml_client + ) + == [] + ) # --------------------------------------------------------------------------- diff --git a/tests/cleancloud/providers/azure/test_azure_ml_online_endpoint_idle.py b/tests/cleancloud/providers/azure/test_azure_ml_online_endpoint_idle.py index 90ba5af..939fbb3 100644 --- a/tests/cleancloud/providers/azure/test_azure_ml_online_endpoint_idle.py +++ b/tests/cleancloud/providers/azure/test_azure_ml_online_endpoint_idle.py @@ -1,3 +1,19 @@ +""" +Tests for azure.ml.online_endpoint.idle rule (spec-compliant). + +Key spec contracts tested here: +- Managed scope via endpoint kind or deployment class (spec 9.1) +- Exact case-sensitive provisioning_state == "Succeeded" (spec 8.6) +- created_at required from systemData; age >= effective idle_days (spec 8.7, 9.2) +- Billing-relevant deployments: stable, min_instances > 0 (spec 8.9, 9.3) +- RequestsPerMinute / Average / PT1M on endpoint ARM resource id (spec 9.5) +- Coverage 80% threshold (MEDIUM), 95% threshold (HIGH) (spec 9.6) +- No workspace-level metric fallback; no age-only fallback (spec 9.5) +- estimated_monthly_cost_usd = None always (spec 10) +- Risk: HIGH (GPU) / MEDIUM only — no CRITICAL (spec 9.6) +- Exception handling: subscription-level propagates, per-workspace/per-endpoint skips (spec 12) +""" + from datetime import datetime, timedelta, timezone from types import SimpleNamespace from unittest.mock import Mock @@ -6,9 +22,14 @@ from azure.core.exceptions import HttpResponseError from cleancloud.providers.azure.rules.ai.ml_online_endpoint_idle import ( + RULE_METADATA, 
find_idle_ml_online_endpoints, ) +# --------------------------------------------------------------------------- +# Test helpers for HttpResponseError +# --------------------------------------------------------------------------- + def _http_error(status_code: int) -> HttpResponseError: resp = Mock() @@ -22,7 +43,7 @@ def _http_error(status_code: int) -> HttpResponseError: # --------------------------------------------------------------------------- -# Helpers +# Fixture helpers # --------------------------------------------------------------------------- @@ -40,10 +61,14 @@ def _make_endpoint( provisioning_state="Succeeded", rg=_WS_RG, ws_name=_WS_NAME, + location="eastus", + kind="Managed", + tags=None, ): ep_id = ( f"/subscriptions/{_SUB}/resourceGroups/{rg}" - f"/providers/Microsoft.MachineLearningServices/workspaces/{ws_name}/onlineEndpoints/{name}" + f"/providers/Microsoft.MachineLearningServices/workspaces/{ws_name}" + f"/onlineEndpoints/{name}" ) now = datetime.now(timezone.utc) created_at = now - timedelta(days=age_days) if age_days is not None else None @@ -53,62 +78,101 @@ def _make_endpoint( name=name, provisioning_state=provisioning_state, system_data=system_data, + location=location, + kind=kind, + tags=tags or {}, ) -def _make_deployment(instance_type=None, min_instances=None): - """Matches azure-ai-ml ManagedOnlineDeployment: instance_type + scale_settings.""" +def _make_deployment( + instance_type="Standard_DS3_v2", + min_instances=1, + provisioning_state="Succeeded", +): + """Stable CPU deployment with one billing-relevant instance by default.""" scale = SimpleNamespace(min_instances=min_instances) if min_instances is not None else None return SimpleNamespace( instance_type=instance_type, instance_count=min_instances, scale_settings=scale, + provisioning_state=provisioning_state, ) -def _make_total_metric_response(total: float = 0.0, has_timeseries: bool = True, count: int = 31): - """Return a monitor metrics response. 
+def _make_avg_metric_response(average=0.0, coverage_fraction=1.0, idle_days=7): + """ + Return a monitor metrics response using Average aggregation. - `count` controls how many datapoints are in the timeseries so the coverage - check (seen_datapoints >= days * 0.5) is satisfied by default for any - reasonable idle_days value. Set has_timeseries=False for 'no data' cases. + Each datapoint carries a time_stamp that is a valid UTC minute bucket inside + [window_start_utc, metric_end_utc), matching the window the implementation computes. + coverage_fraction controls the fraction of expected PT1M buckets that are returned. """ - if not has_timeseries: - return SimpleNamespace(value=[]) - data_points = [SimpleNamespace(total=total) for _ in range(count)] + expected = idle_days * 24 * 60 # PT1M buckets for the window + usable = int(expected * coverage_fraction) + + # Approximate metric_end_utc using the same formula as _query_requests_per_minute so + # that generated timestamps fall inside the implementation's acceptance window. + now_utc = datetime.now(timezone.utc) + metric_end_utc = (now_utc - timedelta(minutes=5)).replace(second=0, microsecond=0) + + # Generate minute buckets going backwards from just before metric_end_utc. + data_points = [ + SimpleNamespace(average=average, time_stamp=metric_end_utc - timedelta(minutes=i + 1)) + for i in range(usable) + ] timeseries = SimpleNamespace(data=data_points) metric = SimpleNamespace(timeseries=[timeseries]) return SimpleNamespace(value=[metric]) +def _make_empty_metric_response(): + """No timeseries — metric result will be UNKNOWN.""" + return SimpleNamespace(value=[]) + + def _make_clients( workspace, endpoints, metric_response=None, metric_fn=None, deployments_by_ep=None, + idle_days=7, ): - """Build mock (ml_client, mon_client). + """ + Build mock (ml_client, mon_client). 
The injected ml_client serves as both subscription-level (workspaces.list_by_subscription) - and workspace-scoped (online_endpoints / online_deployments) client — matching - the dual-mode pattern in the rule when a client is injected. + and workspace-scoped (online_endpoints / online_deployments) client. + + Default deployments_by_ep=None → one billing-relevant CPU deployment for any endpoint. + Default metric_response → full-coverage zero-traffic response (ZERO result → emit). """ - if deployments_by_ep is None: - deployments_by_ep = {} + # Default: one billing-relevant deployment for all endpoints + _default_dep = _make_deployment() + + def _list_deps(ep_name): + if deployments_by_ep is not None: + return deployments_by_ep.get(ep_name, []) + return [_default_dep] ml_client = SimpleNamespace( workspaces=SimpleNamespace(list_by_subscription=lambda: [workspace]), online_endpoints=SimpleNamespace(list=lambda: endpoints), - online_deployments=SimpleNamespace(list=lambda ep_name: deployments_by_ep.get(ep_name, [])), + online_deployments=SimpleNamespace(list=_list_deps), ) + if metric_fn is not None: mon_client = SimpleNamespace( metrics=SimpleNamespace(list=lambda *a, **kw: metric_fn(*a, **kw)) ) else: - resp = metric_response if metric_response is not None else _make_total_metric_response(0.0) + resp = ( + metric_response + if metric_response is not None + else _make_avg_metric_response(0.0, idle_days=idle_days) + ) mon_client = SimpleNamespace(metrics=SimpleNamespace(list=lambda *a, **kw: resp)) + return ml_client, mon_client @@ -122,13 +186,34 @@ def _call(client, monitor_client, **kwargs): ) +# --------------------------------------------------------------------------- +# Managed scope classes for class-name-based tests (spec 9.1) +# --------------------------------------------------------------------------- + + +class ManagedOnlineEndpoint(SimpleNamespace): # noqa: N801 + pass + + +class KubernetesOnlineEndpoint(SimpleNamespace): # noqa: N801 + pass + + +class 
ManagedOnlineDeployment(SimpleNamespace): # noqa: N801 + pass + + +class KubernetesOnlineDeployment(SimpleNamespace): # noqa: N801 + pass + + # --------------------------------------------------------------------------- # Core detection # --------------------------------------------------------------------------- def test_idle_endpoint_detected(): - """Endpoint with zero requests over the idle window should produce a finding.""" + """Managed endpoint with zero RequestsPerMinute should produce a finding.""" ws = _make_workspace() ep = _make_endpoint(age_days=30) ml, mon = _make_clients(ws, [ep]) @@ -150,10 +235,10 @@ def test_idle_endpoint_detected(): def test_active_endpoint_skipped(): - """Endpoint with non-zero requests must NOT be flagged.""" + """Endpoint with non-zero RequestsPerMinute must NOT produce a finding.""" ws = _make_workspace() ep = _make_endpoint(age_days=30) - ml, mon = _make_clients(ws, [ep], metric_response=_make_total_metric_response(42.0)) + ml, mon = _make_clients(ws, [ep], metric_response=_make_avg_metric_response(5.0)) assert _call(ml, mon) == [] @@ -171,18 +256,149 @@ def test_no_workspaces_returns_empty(): online_deployments=SimpleNamespace(list=lambda ep_name: []), ) mon_client = SimpleNamespace( - metrics=SimpleNamespace(list=lambda *a, **kw: _make_total_metric_response(0.0)) + metrics=SimpleNamespace(list=lambda *a, **kw: _make_avg_metric_response(0.0)) ) assert _call(ml_client, mon_client) == [] # --------------------------------------------------------------------------- -# Provisioning state +# Managed scope (spec 8.5, 9.1) +# --------------------------------------------------------------------------- + + +def test_kubernetes_kind_endpoint_skipped(): + """Endpoint with kind='Kubernetes' must be out of scope and skipped.""" + ws = _make_workspace() + ep = _make_endpoint(age_days=30, kind="Kubernetes") + ml, mon = _make_clients(ws, [ep]) + assert _call(ml, mon) == [] + + +def test_unknown_kind_no_deployment_signal_skipped(): + 
"""Endpoint with unknown kind and no deployment class signals must be skipped.""" + ws = _make_workspace() + ep = _make_endpoint(age_days=30, kind=None) # no kind attribute signal + # Use SimpleNamespace deployments — class name is neither Managed nor Kubernetes + deps = [_make_deployment()] # SimpleNamespace + ml, mon = _make_clients(ws, [ep], deployments_by_ep={"ep1": deps}) + assert _call(ml, mon) == [] + + +def test_managed_scope_from_endpoint_kind(): + """kind='Managed' establishes endpoint-level managed scope.""" + ws = _make_workspace() + ep = _make_endpoint(age_days=30, kind="Managed") + ml, mon = _make_clients(ws, [ep]) + + findings = _call(ml, mon) + + assert len(findings) == 1 + assert findings[0].details["managed_scope_source"] == "endpoint" + + +def test_managed_scope_from_managedonlineendpoint_class(): + """ManagedOnlineEndpoint class name establishes endpoint-level managed scope.""" + ws = _make_workspace() + now = datetime.now(timezone.utc) + created_at = now - timedelta(days=30) + ep_id = ( + f"/subscriptions/{_SUB}/resourceGroups/{_WS_RG}" + f"/providers/Microsoft.MachineLearningServices/workspaces/{_WS_NAME}/onlineEndpoints/ep1" + ) + # Create instance of named class — no 'kind' attribute needed + ep = ManagedOnlineEndpoint( + id=ep_id, + name="ep1", + provisioning_state="Succeeded", + system_data=SimpleNamespace(created_at=created_at), + location="eastus", + tags={}, + ) + ml, mon = _make_clients(ws, [ep]) + + findings = _call(ml, mon) + + assert len(findings) == 1 + assert findings[0].details["managed_scope_source"] == "endpoint" + + +def test_managed_scope_from_deployment_class(): + """ManagedOnlineDeployment class on stable deployments establishes scope when endpoint has no signal.""" + ws = _make_workspace() + now = datetime.now(timezone.utc) + created_at = now - timedelta(days=30) + ep_id = ( + f"/subscriptions/{_SUB}/resourceGroups/{_WS_RG}" + f"/providers/Microsoft.MachineLearningServices/workspaces/{_WS_NAME}/onlineEndpoints/ep-no-kind" 
+ ) + ep = SimpleNamespace( + id=ep_id, + name="ep-no-kind", + provisioning_state="Succeeded", + system_data=SimpleNamespace(created_at=created_at), + location="eastus", + kind=None, + tags={}, + ) + # Use ManagedOnlineDeployment class + dep = ManagedOnlineDeployment( + instance_type="Standard_DS3_v2", + min_instances=1, + instance_count=1, + scale_settings=SimpleNamespace(min_instances=1), + provisioning_state="Succeeded", + ) + ml, mon = _make_clients(ws, [ep], deployments_by_ep={"ep-no-kind": [dep]}) + + findings = _call(ml, mon) + + assert len(findings) == 1 + assert findings[0].details["managed_scope_source"] == "deployment" + + +def test_kubernetes_deployment_under_managed_endpoint_skipped(): + """KubernetesOnlineDeployment under a managed endpoint causes conflict → skip (spec 9.1.6).""" + ws = _make_workspace() + ep = _make_endpoint(age_days=30, kind="Managed") + dep = KubernetesOnlineDeployment( + instance_type="Standard_DS3_v2", + min_instances=1, + instance_count=1, + scale_settings=SimpleNamespace(min_instances=1), + provisioning_state="Succeeded", + ) + ml, mon = _make_clients(ws, [ep], deployments_by_ep={"ep1": [dep]}) + assert _call(ml, mon) == [] + + +def test_kubernetes_endpoint_class_skipped(): + """KubernetesOnlineEndpoint class → out of scope.""" + ws = _make_workspace() + now = datetime.now(timezone.utc) + created_at = now - timedelta(days=30) + ep_id = ( + f"/subscriptions/{_SUB}/resourceGroups/{_WS_RG}" + f"/providers/Microsoft.MachineLearningServices/workspaces/{_WS_NAME}/onlineEndpoints/ep-k8s" + ) + ep = KubernetesOnlineEndpoint( + id=ep_id, + name="ep-k8s", + provisioning_state="Succeeded", + system_data=SimpleNamespace(created_at=created_at), + location="eastus", + tags={}, + ) + ml, mon = _make_clients(ws, [ep]) + assert _call(ml, mon) == [] + + +# --------------------------------------------------------------------------- +# Provisioning state (spec 8.6) # --------------------------------------------------------------------------- def 
test_non_succeeded_state_skipped(): - """Endpoints not in Succeeded state (e.g. Creating, Failed) must be skipped.""" + """Endpoints not in exact 'Succeeded' state must be skipped.""" ws = _make_workspace() for state in ("Creating", "Deleting", "Failed", "Updating"): ep = _make_endpoint(age_days=30, provisioning_state=state) @@ -190,12 +406,12 @@ def test_non_succeeded_state_skipped(): assert _call(ml, mon) == [], f"Expected no findings for state={state}" -def test_provisioning_state_case_insensitive(): - """Provisioning state comparison must be case-insensitive (e.g. 'succeeded').""" +def test_provisioning_state_case_sensitive_lowercase_skipped(): + """'succeeded' (lowercase) must NOT match — comparison is exact case-sensitive (spec 8.6).""" ws = _make_workspace() ep = _make_endpoint(age_days=30, provisioning_state="succeeded") ml, mon = _make_clients(ws, [ep]) - assert len(_call(ml, mon)) == 1 + assert _call(ml, mon) == [] def test_none_provisioning_state_skipped(): @@ -206,718 +422,762 @@ def test_none_provisioning_state_skipped(): # --------------------------------------------------------------------------- -# Age filtering and effective window +# Age and created_at (spec 8.7, 9.2) # --------------------------------------------------------------------------- def test_young_endpoint_skipped(): - """Endpoint younger than max(idle_days // 2, 3) days should be skipped.""" + """Endpoint younger than effective idle_days must be skipped.""" ws = _make_workspace() - ep = _make_endpoint(age_days=2) + ep = _make_endpoint(age_days=6) # 6 < 7 = default idle_days ml, mon = _make_clients(ws, [ep]) assert _call(ml, mon) == [] -def test_endpoint_at_half_threshold_skipped(): - """age=3, idle_days=7: max(7//2=3, 3)=3 → age NOT < 3 → proceeds (borderline).""" +def test_age_exactly_idle_days_proceeds(): + """Endpoint age exactly equal to idle_days must proceed.""" ws = _make_workspace() - ep = _make_endpoint(age_days=3) + ep = _make_endpoint(age_days=7) ml, mon = _make_clients(ws, 
[ep]) - # age 3 < ceil(0.75*7)=6, so confidence ladder falls through → no finding + assert len(_call(ml, mon)) == 1 + + +def test_created_at_required_skips_when_absent(): + """Endpoint with no system_data / created_at must be skipped (spec 8.7).""" + ws = _make_workspace() + ep = _make_endpoint(age_days=None) # age_days=None → created_at=None in system_data + ml, mon = _make_clients(ws, [ep]) + # None created_at → cannot establish age → skip assert _call(ml, mon) == [] -def test_effective_window_capped_to_age(): - """For an endpoint younger than idle_days, effective_window = age_days (not idle_days).""" +def test_future_created_at_skipped(): + """Future created_at timestamp must be skipped (spec 9.2.3).""" ws = _make_workspace() - ep = _make_endpoint(age_days=6) # 6 == ceil(0.75 * 7) + now = datetime.now(timezone.utc) + ep_id = ( + f"/subscriptions/{_SUB}/resourceGroups/{_WS_RG}" + f"/providers/Microsoft.MachineLearningServices/workspaces/{_WS_NAME}/onlineEndpoints/future" + ) + future = now + timedelta(days=1) + ep = SimpleNamespace( + id=ep_id, + name="future", + provisioning_state="Succeeded", + system_data=SimpleNamespace(created_at=future), + location="eastus", + kind="Managed", + tags={}, + ) ml, mon = _make_clients(ws, [ep]) + assert _call(ml, mon) == [] - findings = _call(ml, mon) + +def test_idle_days_minimum_is_1(): + """idle_days minimum effective value is 1 (spec 6.3), not higher.""" + ws = _make_workspace() + ep = _make_endpoint(age_days=2) # 2 >= 1 + ml, mon = _make_clients(ws, [ep], idle_days=1) + + findings = _call(ml, mon, idle_days=1) assert len(findings) == 1 - assert findings[0].details["age_days"] == 6 - assert findings[0].details["idle_days_threshold"] == 7 + assert findings[0].details["idle_days_threshold"] == 1 -def test_no_creation_time_uses_full_window(): - """Endpoint with unknown age should use full idle_days window and get MEDIUM confidence.""" +def test_idle_days_zero_clamped_to_1(): ws = _make_workspace() - ep = 
_make_endpoint(age_days=None) - ml, mon = _make_clients(ws, [ep]) + ep = _make_endpoint(age_days=2) + ml, mon = _make_clients(ws, [ep], idle_days=1) - findings = _call(ml, mon) + findings = _call(ml, mon, idle_days=0) assert len(findings) == 1 - assert findings[0].confidence.value == "medium" - assert findings[0].details["age_days"] is None + assert findings[0].details["idle_days_threshold"] == 1 # --------------------------------------------------------------------------- -# Confidence levels +# Deployment billing relevance (spec 8.8, 8.9, 9.3) # --------------------------------------------------------------------------- -def test_high_confidence_age_ge_idle_days(): - """Endpoint with age >= idle_days and zero requests → HIGH confidence.""" +def test_no_stable_deployments_skipped(): + """All deployments not in Succeeded state → no billing-relevant deployment → skip.""" ws = _make_workspace() ep = _make_endpoint(age_days=30) - ml, mon = _make_clients(ws, [ep]) - - findings = _call(ml, mon) - - assert len(findings) == 1 - assert findings[0].confidence.value == "high" + dep = _make_deployment(provisioning_state="Failed") + ml, mon = _make_clients(ws, [ep], deployments_by_ep={"ep1": [dep]}) + assert _call(ml, mon) == [] -def test_medium_confidence_at_75_percent_age(): - """age = ceil(0.75 × idle_days), age < idle_days → MEDIUM confidence.""" +def test_deployment_with_unknown_instance_count_skipped(): + """Deployment with min_instances=None and instance_count=None is not billing-relevant.""" ws = _make_workspace() - ep = _make_endpoint(age_days=6) # ceil(0.75 * 7) = 6 - ml, mon = _make_clients(ws, [ep]) - - findings = _call(ml, mon) - - assert len(findings) == 1 - assert findings[0].confidence.value == "medium" + ep = _make_endpoint(age_days=30) + dep = SimpleNamespace( + instance_type="Standard_DS3_v2", + instance_count=None, + scale_settings=None, + provisioning_state="Succeeded", + ) + ml, mon = _make_clients(ws, [ep], deployments_by_ep={"ep1": [dep]}) + assert 
_call(ml, mon) == [] -def test_below_75_percent_age_skipped(): - """age = 5 < ceil(0.75 * 7) = 6 → confidence ladder falls through → skipped.""" +def test_scale_to_zero_endpoint_skipped(): + """Deployment with min_instances=0 is not billing-relevant → skip (spec 9.3.5, 9.3.7).""" ws = _make_workspace() - ep = _make_endpoint(age_days=5) - ml, mon = _make_clients(ws, [ep]) + ep = _make_endpoint(age_days=30) + dep = _make_deployment(min_instances=0) + ml, mon = _make_clients(ws, [ep], deployments_by_ep={"ep1": [dep]}) assert _call(ml, mon) == [] -def test_medium_confidence_unknown_age(): - """Age unknown → MEDIUM confidence (can't rule out recent creation).""" +def test_min_instances_takes_priority_over_instance_count(): + """scale_settings.min_instances is resolved before instance_count (spec 9.3.4).""" ws = _make_workspace() - ep = _make_endpoint(age_days=None) - ml, mon = _make_clients(ws, [ep]) + ep = _make_endpoint(age_days=30) + # scale_settings.min_instances=2, instance_count=5 → min_instances wins → 2 + dep = SimpleNamespace( + instance_type="Standard_DS3_v2", + instance_count=5, + scale_settings=SimpleNamespace(min_instances=2), + provisioning_state="Succeeded", + ) + ml, mon = _make_clients(ws, [ep], deployments_by_ep={"ep1": [dep]}) findings = _call(ml, mon) assert len(findings) == 1 - assert findings[0].confidence.value == "medium" + assert findings[0].details["baseline_instance_count_total"] == 2 -def test_workspace_level_signal_low_confidence(): - """Pass-2 (no EndpointName filter) zero traffic + age >= idle_days → LOW confidence.""" +def test_baseline_instances_summed_across_deployments(): + """Baseline instance counts are summed across all billing-relevant deployments.""" ws = _make_workspace() - ep = _make_endpoint(age_days=30) # 30 >= 7 - - def _mock_metrics(*args, **kwargs): - if "filter" in kwargs: - return _make_total_metric_response(0.0, has_timeseries=False) - return _make_total_metric_response(0.0, has_timeseries=True, count=31) - - ml, mon = 
_make_clients(ws, [ep], metric_fn=_mock_metrics) + ep = _make_endpoint(age_days=30) + deps = [ + _make_deployment(instance_type="Standard_DS3_v2", min_instances=2), + _make_deployment(instance_type="Standard_DS3_v2", min_instances=3), + ] + ml, mon = _make_clients(ws, [ep], deployments_by_ep={"ep1": deps}) findings = _call(ml, mon) assert len(findings) == 1 - assert findings[0].confidence.value == "low" - assert findings[0].details["idle_signal_scope"] == "workspace_level" + assert findings[0].details["baseline_instance_count_total"] == 5 + assert findings[0].details["billing_relevant_deployment_count"] == 2 -def test_workspace_level_young_endpoint_skipped(): - """Pass-2 zero traffic but age < idle_days → not enough signal → skipped.""" +def test_deployment_list_failure_skips_endpoint(): + """Exception while listing deployments must skip that endpoint (spec 8.8, 12).""" ws = _make_workspace() - ep = _make_endpoint(age_days=6) # 6 < 7 + ep = _make_endpoint(age_days=30) - def _mock_metrics(*args, **kwargs): - if "filter" in kwargs: - return _make_total_metric_response(0.0, has_timeseries=False) - return _make_total_metric_response(0.0, has_timeseries=True, count=31) + ml_client = SimpleNamespace( + workspaces=SimpleNamespace(list_by_subscription=lambda: [ws]), + online_endpoints=SimpleNamespace(list=lambda: [ep]), + online_deployments=SimpleNamespace( + list=lambda ep_name: (_ for _ in ()).throw(RuntimeError("SDK error")) + ), + ) + mon_client = SimpleNamespace( + metrics=SimpleNamespace(list=lambda *a, **kw: _make_avg_metric_response(0.0)) + ) - ml, mon = _make_clients(ws, [ep], metric_fn=_mock_metrics) - assert _call(ml, mon) == [] + assert ( + find_idle_ml_online_endpoints( + subscription_id=_SUB, credential=None, client=ml_client, monitor_client=mon_client + ) + == [] + ) -def test_gpu_detected_on_any_deployment(): - """GPU classification must fire if any deployment has a GPU instance type.""" +def test_only_succeeded_deployments_count_toward_billing(): + 
"""Non-Succeeded deployments must not contribute to baseline instances.""" ws = _make_workspace() ep = _make_endpoint(age_days=30) deps = [ - _make_deployment(instance_type="Standard_DS3_v2", min_instances=2), # CPU first - _make_deployment(instance_type="Standard_NC6", min_instances=1), # GPU second + _make_deployment(min_instances=5, provisioning_state="Updating"), # excluded + _make_deployment(min_instances=2, provisioning_state="Succeeded"), # included ] ml, mon = _make_clients(ws, [ep], deployments_by_ep={"ep1": deps}) findings = _call(ml, mon) assert len(findings) == 1 - assert findings[0].details["is_gpu"] is True - assert findings[0].details["instance_type"] == "Standard_DS3_v2" # first kept - assert findings[0].details["min_instance_count"] == 3 # 2 + 1 + assert findings[0].details["baseline_instance_count_total"] == 2 + assert findings[0].details["billing_relevant_deployment_count"] == 1 + + +def test_deployment_count_includes_all_not_just_stable(): + """deployment_count in details reflects ALL deployments, not just stable ones.""" + ws = _make_workspace() + ep = _make_endpoint(age_days=30) + deps = [ + _make_deployment(min_instances=1, provisioning_state="Succeeded"), + _make_deployment(min_instances=1, provisioning_state="Failed"), + ] + ml, mon = _make_clients(ws, [ep], deployments_by_ep={"ep1": deps}) + + findings = _call(ml, mon) + + assert len(findings) == 1 + assert findings[0].details["deployment_count"] == 2 + assert findings[0].details["stable_deployment_count"] == 1 + assert findings[0].details["billing_relevant_deployment_count"] == 1 # --------------------------------------------------------------------------- -# Age-only fallback (no metric data) +# Metric contract (spec 8.10, 9.5) # --------------------------------------------------------------------------- -def test_age_only_fallback_when_no_timeseries(): - """No timeseries returned from monitor + age >= 2× idle_days → age_only LOW finding.""" +def test_metric_queried_on_endpoint_arm_id(): 
+ """Metrics must be queried against the endpoint ARM resource id, not workspace id.""" ws = _make_workspace() - ep = _make_endpoint(age_days=20) # 20 >= 2 * 7 = 14 - ml, mon = _make_clients( - ws, [ep], metric_response=_make_total_metric_response(0.0, has_timeseries=False) - ) + ep = _make_endpoint(age_days=30) + captured_resource_ids = [] - findings = _call(ml, mon) + def _capture(*args, **kwargs): + if args: + captured_resource_ids.append(args[0]) + return _make_avg_metric_response(0.0) - assert len(findings) == 1 - assert findings[0].confidence.value == "low" - assert findings[0].details["idle_signal_scope"] == "age_only" + ml, mon = _make_clients(ws, [ep], metric_fn=_capture) + _call(ml, mon) + + assert captured_resource_ids, "Expected at least one metrics.list call" + # Endpoint id must be used, not workspace id + assert ep.id in captured_resource_ids + ws_id = ws.id + assert ws_id not in captured_resource_ids + + +def test_metric_uses_correct_parameters(): + """RequestsPerMinute / PT1M / Average must be used (spec 9.5).""" + ws = _make_workspace() + ep = _make_endpoint(age_days=30) + captured = {} + + def _capture(*args, **kwargs): + captured.update(kwargs) + return _make_avg_metric_response(0.0) + ml, mon = _make_clients(ws, [ep], metric_fn=_capture) + _call(ml, mon) + + assert captured.get("metricnames") == "RequestsPerMinute" + assert captured.get("interval") == "PT1M" + assert captured.get("aggregation") == "Average" -def test_no_timeseries_young_endpoint_skipped(): - """No timeseries + age < 2× idle_days → not enough signal → skipped.""" + +def test_coverage_below_80pct_skips_endpoint(): + """Coverage < 80% → UNKNOWN metric result → endpoint skipped (spec 9.5.8).""" ws = _make_workspace() - ep = _make_endpoint(age_days=10) # 10 < 2 * 7 = 14 + ep = _make_endpoint(age_days=30) + # 79% coverage ml, mon = _make_clients( - ws, [ep], metric_response=_make_total_metric_response(0.0, has_timeseries=False) + ws, [ep], 
metric_response=_make_avg_metric_response(0.0, coverage_fraction=0.79) ) assert _call(ml, mon) == [] +def test_coverage_exactly_80pct_emits(): + """Coverage >= 80% → acceptable → endpoint emits (spec 9.5 acceptable coverage).""" + ws = _make_workspace() + ep = _make_endpoint(age_days=30) + ml, mon = _make_clients( + ws, [ep], metric_response=_make_avg_metric_response(0.0, coverage_fraction=0.80) + ) + assert len(_call(ml, mon)) == 1 + + def test_all_metric_calls_fail_skips_endpoint(): - """All monitor calls raising exceptions → None from _check_requests → endpoint skipped.""" + """Exception from monitor client → UNKNOWN metric result → endpoint skipped.""" ws = _make_workspace() ep = _make_endpoint(age_days=30) - def raise_transient(*a, **kw): + def _raise(*a, **kw): raise RuntimeError("SDK timeout") - ml, mon = _make_clients(ws, [ep], metric_fn=raise_transient) + ml, mon = _make_clients(ws, [ep], metric_fn=_raise) assert _call(ml, mon) == [] -# --------------------------------------------------------------------------- -# Risk levels -# --------------------------------------------------------------------------- - - -def test_cpu_endpoint_medium_risk(): +def test_empty_metric_response_skips_endpoint(): + """No timeseries in response → 0 usable datapoints → coverage 0% → skip (spec 9.5.8).""" ws = _make_workspace() ep = _make_endpoint(age_days=30) - ml, mon = _make_clients(ws, [ep]) - - findings = _call(ml, mon) - - assert findings[0].risk.value == "medium" - assert findings[0].details["is_gpu"] is False + ml, mon = _make_clients(ws, [ep], metric_response=_make_empty_metric_response()) + assert _call(ml, mon) == [] -def test_gpu_nc_series_high_risk(): - """GPU endpoint (NC series) with idle_ratio < 2.0 → HIGH risk.""" +def test_datapoints_without_timestamps_not_counted(): + """Datapoints with no time_stamp are not usable and do not count toward coverage (spec 9.5).""" ws = _make_workspace() - ep = _make_endpoint(age_days=7) # age/idle_days = 7/7 = 1.0 < 2.0 - dep = 
_make_deployment(instance_type="Standard_NC6", min_instances=1) - ml, mon = _make_clients(ws, [ep], deployments_by_ep={"ep1": [dep]}) - - findings = _call(ml, mon, idle_days=7) - - assert len(findings) == 1 - assert findings[0].risk.value == "high" - assert findings[0].details["is_gpu"] is True + ep = _make_endpoint(age_days=30) + # Points with average but no time_stamp attribute + data_points = [SimpleNamespace(average=0.0) for _ in range(10080)] + resp = SimpleNamespace(value=[SimpleNamespace(timeseries=[SimpleNamespace(data=data_points)])]) + ml, mon = _make_clients(ws, [ep], metric_response=resp) + # 0 usable buckets → coverage 0% → UNKNOWN → skip + assert _call(ml, mon) == [] -def test_gpu_nd_series_detected(): - """ND-series (deep learning) must be classified as GPU.""" +def test_out_of_window_datapoints_not_counted(): + """Datapoints with timestamps outside [window_start_utc, metric_end_utc) are discarded (spec 9.5.3).""" ws = _make_workspace() ep = _make_endpoint(age_days=30) - dep = _make_deployment(instance_type="Standard_ND40rs_v2", min_instances=1) - ml, mon = _make_clients(ws, [ep], deployments_by_ep={"ep1": [dep]}) + now_utc = datetime.now(timezone.utc) + # Timestamps in the far future — all outside the window + future_ts = now_utc + timedelta(days=100) + data_points = [SimpleNamespace(average=0.0, time_stamp=future_ts) for _ in range(10080)] + resp = SimpleNamespace(value=[SimpleNamespace(timeseries=[SimpleNamespace(data=data_points)])]) + ml, mon = _make_clients(ws, [ep], metric_response=resp) + # All out-of-window → 0 usable buckets → coverage 0% → UNKNOWN → skip + assert _call(ml, mon) == [] - findings = _call(ml, mon) - assert findings[0].details["is_gpu"] is True +def test_active_point_outside_window_not_flagged_as_active(): + """Average > 0 on an out-of-window timestamp must NOT trigger ACTIVE (spec 9.5.3).""" + ws = _make_workspace() + ep = _make_endpoint(age_days=30) + now_utc = datetime.now(timezone.utc) + # One future (out-of-window) point 
with average=99, one valid zero point + metric_end_utc = (now_utc - timedelta(minutes=5)).replace(second=0, microsecond=0) + valid_bucket = metric_end_utc - timedelta(minutes=1) + data_points = [ + SimpleNamespace(average=99.0, time_stamp=now_utc + timedelta(days=1)), # future, skip + SimpleNamespace(average=0.0, time_stamp=valid_bucket), # valid zero + ] + # One valid zero bucket → coverage 1/10080 ≈ 0% → UNKNOWN → skip (not ACTIVE) + resp = SimpleNamespace(value=[SimpleNamespace(timeseries=[SimpleNamespace(data=data_points)])]) + ml, mon = _make_clients(ws, [ep], metric_response=resp) + assert _call(ml, mon) == [] # skipped due to low coverage, not flagged as ACTIVE -def test_gpu_critical_when_idle_ratio_ge_2(): - """GPU endpoint with age/idle_days >= 2.0 → CRITICAL risk.""" +def test_duplicate_minute_buckets_counted_once(): + """Same minute bucket appearing in multiple timeseries is counted only once (spec 9.5 dedup).""" ws = _make_workspace() - ep = _make_endpoint(age_days=30) # 30/7 > 2.0 - dep = _make_deployment(instance_type="Standard_NC6", min_instances=1) - ml, mon = _make_clients(ws, [ep], deployments_by_ep={"ep1": [dep]}) + ep = _make_endpoint(age_days=2) + now_utc = datetime.now(timezone.utc) + metric_end_utc = (now_utc - timedelta(minutes=5)).replace(second=0, microsecond=0) + # One unique in-window bucket, duplicated across two timeseries + bucket = metric_end_utc - timedelta(minutes=1) + point = SimpleNamespace(average=0.0, time_stamp=bucket) + ts1 = SimpleNamespace(data=[point]) + ts2 = SimpleNamespace(data=[point]) + resp = SimpleNamespace(value=[SimpleNamespace(timeseries=[ts1, ts2])]) + ml, mon = _make_clients(ws, [ep], metric_response=resp) + # idle_days=7: expected=10080 buckets, unique usable=1 → coverage 1/10080 < 80% → UNKNOWN → skip + assert _call(ml, mon) == [] - findings = _call(ml, mon, idle_days=7) - assert len(findings) == 1 - assert findings[0].risk.value == "critical" +# 
--------------------------------------------------------------------------- +# Confidence levels (spec 9.6) +# --------------------------------------------------------------------------- -def test_gpu_critical_requires_per_endpoint_signal(): - """GPU + age_only (LOW confidence) must NOT escalate to CRITICAL — only HIGH.""" +def test_high_confidence_when_coverage_ge_95pct(): + """Coverage >= 95% → HIGH confidence (spec 9.6).""" ws = _make_workspace() - ep = _make_endpoint(age_days=20) # 20 >= 2*7=14 → age_only; idle_ratio=20/7>2 - dep = _make_deployment(instance_type="Standard_NC6", min_instances=1) + ep = _make_endpoint(age_days=30) ml, mon = _make_clients( - ws, - [ep], - deployments_by_ep={"ep1": [dep]}, - metric_response=_make_total_metric_response(0.0, has_timeseries=False), + ws, [ep], metric_response=_make_avg_metric_response(0.0, coverage_fraction=0.95) ) - findings = _call(ml, mon, idle_days=7) + findings = _call(ml, mon) assert len(findings) == 1 - assert findings[0].details["idle_signal_scope"] == "age_only" - assert findings[0].risk.value == "high" # not critical + assert findings[0].confidence.value == "high" -def test_gpu_detection_case_insensitive(): - """GPU family check must be case-insensitive (Azure SDK returns mixed case).""" +def test_high_confidence_at_full_coverage(): + """100% coverage → HIGH confidence.""" ws = _make_workspace() ep = _make_endpoint(age_days=30) - dep = _make_deployment(instance_type="standard_nc6", min_instances=1) # lowercase - ml, mon = _make_clients(ws, [ep], deployments_by_ep={"ep1": [dep]}) + ml, mon = _make_clients( + ws, [ep], metric_response=_make_avg_metric_response(0.0, coverage_fraction=1.0) + ) + assert _call(ml, mon)[0].confidence.value == "high" - findings = _call(ml, mon) - assert findings[0].details["is_gpu"] is True +def test_medium_confidence_when_coverage_between_80_and_95_pct(): + """Coverage 80–95% → MEDIUM confidence (spec 9.6).""" + ws = _make_workspace() + ep = _make_endpoint(age_days=30) + ml, mon = 
_make_clients( + ws, [ep], metric_response=_make_avg_metric_response(0.0, coverage_fraction=0.87) + ) + findings = _call(ml, mon) -# --------------------------------------------------------------------------- -# Scale-to-zero filtering -# --------------------------------------------------------------------------- + assert len(findings) == 1 + assert findings[0].confidence.value == "medium" -def test_scale_to_zero_endpoint_skipped(): - """Endpoint with min_instance_count=0 has no running instances → no cost → skip.""" +def test_medium_confidence_at_80pct_coverage(): + """Coverage exactly 80% → MEDIUM confidence (< 95% threshold).""" ws = _make_workspace() ep = _make_endpoint(age_days=30) - dep = _make_deployment(instance_type="Standard_NC6", min_instances=0) - ml, mon = _make_clients(ws, [ep], deployments_by_ep={"ep1": [dep]}) - - assert _call(ml, mon) == [] + ml, mon = _make_clients( + ws, [ep], metric_response=_make_avg_metric_response(0.0, coverage_fraction=0.80) + ) + assert _call(ml, mon)[0].confidence.value == "medium" # --------------------------------------------------------------------------- -# Instance type and cost estimation +# Risk levels (spec 9.6) # --------------------------------------------------------------------------- -def test_known_sku_cost_applied(): - """Standard_NC6 → $657/month; 2 instances → $1,314/month.""" +def test_cpu_endpoint_medium_risk(): ws = _make_workspace() ep = _make_endpoint(age_days=30) - dep = _make_deployment(instance_type="Standard_NC6", min_instances=2) - ml, mon = _make_clients(ws, [ep], deployments_by_ep={"ep1": [dep]}) + ml, mon = _make_clients(ws, [ep]) findings = _call(ml, mon) - assert findings[0].estimated_monthly_cost_usd == 657.0 * 2 - assert findings[0].details["cost_source"] == "heuristic_sku_table" - assert findings[0].details["instance_type"] == "Standard_NC6" - assert findings[0].details["min_instance_count"] == 2 + assert findings[0].risk.value == "medium" + assert findings[0].details["is_gpu"] is False 
-def test_known_sku_lowercase_still_matches(): - """SKU names returned in lowercase by the SDK must still resolve to a cost.""" +def test_gpu_nc_series_high_risk(): + """Any billing-relevant GPU deployment → HIGH risk (spec 9.6).""" ws = _make_workspace() ep = _make_endpoint(age_days=30) - dep = _make_deployment(instance_type="standard_nc6", min_instances=1) # lowercase + dep = _make_deployment(instance_type="Standard_NC6", min_instances=1) ml, mon = _make_clients(ws, [ep], deployments_by_ep={"ep1": [dep]}) findings = _call(ml, mon) - assert findings[0].estimated_monthly_cost_usd == 657.0 - assert findings[0].details["cost_source"] == "heuristic_sku_table" + assert len(findings) == 1 + assert findings[0].risk.value == "high" + assert findings[0].details["is_gpu"] is True -def test_unknown_sku_no_cost(): - """Unknown VM size → no cost estimate, cost_source='unknown'.""" +def test_gpu_nd_series_high_risk(): + """ND-series must be classified as GPU → HIGH risk.""" ws = _make_workspace() ep = _make_endpoint(age_days=30) - dep = _make_deployment(instance_type="Standard_FutureSeries_v99", min_instances=3) + dep = _make_deployment(instance_type="Standard_ND40rs_v2", min_instances=1) ml, mon = _make_clients(ws, [ep], deployments_by_ep={"ep1": [dep]}) - findings = _call(ml, mon) - - assert findings[0].estimated_monthly_cost_usd is None - assert findings[0].details["cost_source"] == "unknown" + assert _call(ml, mon)[0].risk.value == "high" -def test_no_deployments_no_cost(): - """Endpoint with no deployments → None cost, is_gpu=False.""" +def test_gpu_nv_series_high_risk(): + """NV-series must be classified as GPU → HIGH risk.""" ws = _make_workspace() ep = _make_endpoint(age_days=30) - ml, mon = _make_clients(ws, [ep]) - - findings = _call(ml, mon) + dep = _make_deployment(instance_type="Standard_NV12s_v3", min_instances=1) + ml, mon = _make_clients(ws, [ep], deployments_by_ep={"ep1": [dep]}) - assert findings[0].estimated_monthly_cost_usd is None - assert 
findings[0].details["instance_type"] is None - assert findings[0].details["is_gpu"] is False + assert _call(ml, mon)[0].risk.value == "high" -def test_multiple_deployments_instances_summed(): - """With multiple deployments, instance counts are summed for billing accuracy.""" +def test_gpu_detected_on_any_billing_relevant_deployment(): + """GPU classification fires if any billing-relevant deployment is GPU.""" ws = _make_workspace() ep = _make_endpoint(age_days=30) deps = [ - _make_deployment(instance_type="Standard_NC6", min_instances=1), - _make_deployment(instance_type="Standard_NC6", min_instances=3), + _make_deployment(instance_type="Standard_DS3_v2", min_instances=2), # CPU first + _make_deployment(instance_type="Standard_NC6", min_instances=1), # GPU second ] ml, mon = _make_clients(ws, [ep], deployments_by_ep={"ep1": deps}) findings = _call(ml, mon) - assert findings[0].details["min_instance_count"] == 4 # 1 + 3 - assert findings[0].estimated_monthly_cost_usd == 657.0 * 4 + assert len(findings) == 1 + assert findings[0].details["is_gpu"] is True + assert findings[0].details["instance_type"] == "Standard_DS3_v2" # first kept + assert findings[0].details["baseline_instance_count_total"] == 3 -def test_deployment_list_failure_still_produces_finding(): - """If deployment listing raises, finding is still produced (best-effort cost).""" +def test_gpu_detection_uppercase_normalization(): + """GPU prefix matching is uppercase-normalized; lowercase instance_type detected as GPU.""" ws = _make_workspace() ep = _make_endpoint(age_days=30) + dep = _make_deployment(instance_type="standard_nc6", min_instances=1) # lowercase + ml, mon = _make_clients(ws, [ep], deployments_by_ep={"ep1": [dep]}) - ml_client = SimpleNamespace( - workspaces=SimpleNamespace(list_by_subscription=lambda: [ws]), - online_endpoints=SimpleNamespace(list=lambda: [ep]), - online_deployments=SimpleNamespace( - list=lambda ep_name: (_ for _ in ()).throw(RuntimeError("SDK error")) - ), - ) - mon_client = 
SimpleNamespace( - metrics=SimpleNamespace(list=lambda *a, **kw: _make_total_metric_response(0.0)) - ) + assert _call(ml, mon)[0].details["is_gpu"] is True - findings = find_idle_ml_online_endpoints( - subscription_id=_SUB, - credential=None, - client=ml_client, - monitor_client=mon_client, - ) - # Finding still produced, just without cost details +def test_no_critical_risk_ever(): + """Risk must never exceed HIGH regardless of GPU or duration (spec 9.6).""" + ws = _make_workspace() + ep = _make_endpoint(age_days=365) # very old + dep = _make_deployment(instance_type="Standard_NC24", min_instances=4) + ml, mon = _make_clients(ws, [ep], deployments_by_ep={"ep1": [dep]}) + + findings = _call(ml, mon) + assert len(findings) == 1 - assert findings[0].details["instance_type"] is None + assert findings[0].risk.value == "high" + assert findings[0].risk.value != "critical" # --------------------------------------------------------------------------- -# Resource group extraction +# Cost (spec 10) # --------------------------------------------------------------------------- -def test_rg_parsed_from_id_when_attribute_missing(): - """resource_group attribute missing → RG parsed from workspace ARM id.""" - ws = _make_workspace(rg="rg-from-id") - del ws.resource_group # force fall-through to id parsing - ep = _make_endpoint(age_days=30, rg="rg-from-id") - ml, mon = _make_clients(ws, [ep]) +def test_estimated_cost_always_none(): + """estimated_monthly_cost_usd must always be None (spec 10).""" + ws = _make_workspace() + ep = _make_endpoint(age_days=30) + dep = _make_deployment(instance_type="Standard_NC6", min_instances=2) + ml, mon = _make_clients(ws, [ep], deployments_by_ep={"ep1": [dep]}) findings = _call(ml, mon) - assert len(findings) == 1 - assert findings[0].details["resource_group"] == "rg-from-id" - - -def test_workspace_missing_resource_group_and_id_skipped(): - """Workspace with no resource_group attribute AND no parseable id is skipped.""" - ws = _make_workspace() - 
ws.resource_group = None - ws.id = None - ep = _make_endpoint(age_days=30) - ml, mon = _make_clients(ws, [ep]) - assert _call(ml, mon) == [] + assert findings[0].estimated_monthly_cost_usd is None # --------------------------------------------------------------------------- -# Region filter +# Region filter (spec 8.4) # --------------------------------------------------------------------------- def test_region_filter_excludes_other_regions(): - ws = _make_workspace(location="westeurope") - ep = _make_endpoint(age_days=30) + ws = _make_workspace() + ep = _make_endpoint(age_days=30, location="westeurope") ml, mon = _make_clients(ws, [ep]) assert _call(ml, mon, region_filter="eastus") == [] -def test_region_filter_matches_workspace_normalised(): - """'East US' (with space) should match region_filter='eastus'.""" - ws = _make_workspace(location="East US") - ep = _make_endpoint(age_days=30) +def test_region_filter_exact_lowercase_match(): + """Region filter uses exact lowercase comparison (spec 7).""" + ws = _make_workspace() + ep = _make_endpoint(age_days=30, location="eastus") ml, mon = _make_clients(ws, [ep]) findings = _call(ml, mon, region_filter="eastus") assert len(findings) == 1 - assert findings[0].region == "East US" + assert findings[0].region == "eastus" -def test_no_region_filter_includes_all_regions(): - ws = _make_workspace(location="westeurope") - ep = _make_endpoint(age_days=30) +def test_region_filter_space_in_region_name(): + """'East US' (with space) matches region_filter='east us' after lowercase (spec 7). 
+ The emitted region is the normalized (lowercase) location (spec 11.1).""" + ws = _make_workspace() + ep = _make_endpoint(age_days=30, location="East US") ml, mon = _make_clients(ws, [ep]) - assert len(_call(ml, mon)) == 1 - + findings = _call(ml, mon, region_filter="east us") -# --------------------------------------------------------------------------- -# Metric fallback (RequestCount → ModelEndpointRequests) -# --------------------------------------------------------------------------- + assert len(findings) == 1 + assert findings[0].region == "east us" -def test_falls_back_to_second_metric_when_first_has_no_data(): - """If RequestCount returns no timeseries, ModelEndpointRequests is tried.""" +def test_no_region_filter_includes_all_regions(): ws = _make_workspace() - ep = _make_endpoint(age_days=30) + ep = _make_endpoint(age_days=30, location="westeurope") + ml, mon = _make_clients(ws, [ep]) - call_args = [] + assert len(_call(ml, mon)) == 1 - def _mock_metrics(*args, **kwargs): - metric_name = kwargs.get("metricnames", "") - call_args.append(metric_name) - if metric_name == "RequestCount": - return _make_total_metric_response(0.0, has_timeseries=False) - return _make_total_metric_response(0.0) # ModelEndpointRequests → zero → idle - ml, mon = _make_clients(ws, [ep], metric_fn=_mock_metrics) +def test_region_normalized_in_finding(): + """Finding.region is the normalized (lowercase) endpoint location (spec 7, 11.1).""" + ws = _make_workspace() + ep = _make_endpoint(age_days=30, location="West Europe") + ml, mon = _make_clients(ws, [ep]) findings = _call(ml, mon) assert len(findings) == 1 - assert "RequestCount" in call_args - assert "ModelEndpointRequests" in call_args - assert findings[0].details["idle_signal_scope"] == "per_endpoint" - - -def test_active_on_second_metric_skips_endpoint(): - """If first metric has no data but second shows traffic, endpoint is skipped.""" - ws = _make_workspace() - ep = _make_endpoint(age_days=30) - - def _mock_metrics(*args, 
**kwargs): - metric_name = kwargs.get("metricnames", "") - if metric_name == "RequestCount": - return _make_total_metric_response(0.0, has_timeseries=False) - return _make_total_metric_response(100.0) # active - - ml, mon = _make_clients(ws, [ep], metric_fn=_mock_metrics) - assert _call(ml, mon) == [] + assert findings[0].region == "west europe" + assert findings[0].details["location"] == "west europe" -def test_endpoint_name_with_single_quote_escaped_in_filter(): - """Endpoint names containing ' must be escaped as '' in the OData filter.""" - ws = _make_workspace() - ep = _make_endpoint(name="test's-endpoint", age_days=30) +# --------------------------------------------------------------------------- +# Resource group and workspace identity +# --------------------------------------------------------------------------- - captured_filters = [] - def _mock_metrics(*args, **kwargs): - f = kwargs.get("filter", "") - if f: - captured_filters.append(f) - return _make_total_metric_response(0.0) +def test_rg_parsed_from_id_when_attribute_missing(): + """resource_group attribute missing → RG parsed from workspace ARM id.""" + ws = _make_workspace(rg="rg-from-id") + del ws.resource_group + ep = _make_endpoint(age_days=30, rg="rg-from-id") + ml, mon = _make_clients(ws, [ep]) - ml, mon = _make_clients(ws, [ep], metric_fn=_mock_metrics) - _call(ml, mon) + findings = _call(ml, mon) - assert captured_filters, "Expected at least one filtered metric call" - assert "test''s-endpoint" in captured_filters[0] - assert "test's-endpoint" not in captured_filters[0] + assert len(findings) == 1 + assert findings[0].details["resource_group"] == "rg-from-id" -def test_partial_datapoints_below_coverage_threshold_falls_to_pass2(): - """Pass-1 with fewer datapoints than coverage_threshold falls through to pass 2.""" +def test_workspace_no_rg_attribute_no_id_skipped(): + """Workspace with resource_group=None and no parseable id is skipped.""" ws = _make_workspace() + ws.resource_group = None + 
ws.id = None ep = _make_endpoint(age_days=30) - - # idle_days=7 → threshold=max(int(7*0.7),3)=4; give pass-1 only 2 datapoints - pass1_calls = [] - - def _mock_metrics(*args, **kwargs): - has_filter = "filter" in kwargs - if has_filter: - pass1_calls.append(True) - # Only 2 datapoints — below threshold of 4 - return _make_total_metric_response(0.0, has_timeseries=True, count=2) - # Pass 2: enough datapoints → workspace_level - return _make_total_metric_response(0.0, has_timeseries=True, count=31) - - ml, mon = _make_clients(ws, [ep], metric_fn=_mock_metrics) - findings = _call(ml, mon, idle_days=7) - - assert pass1_calls, "Pass 1 must have been called" - assert len(findings) == 1 - # Falls through pass-1 (insufficient coverage) → workspace_level signal - assert findings[0].details["idle_signal_scope"] == "workspace_level" - assert findings[0].confidence.value == "low" + ml, mon = _make_clients(ws, [ep]) + assert _call(ml, mon) == [] -def test_pass1_sufficient_coverage_returns_per_endpoint(): - """Pass-1 with enough datapoints returns per_endpoint signal (no pass-2 needed).""" +def test_workspace_name_required(): + """Workspace with name=None is skipped (spec 8.3).""" ws = _make_workspace() + ws.name = None ep = _make_endpoint(age_days=30) - - pass2_calls = [] - - def _mock_metrics(*args, **kwargs): - if "filter" not in kwargs: - pass2_calls.append(True) - # 31 datapoints — well above coverage threshold for any idle_days - return _make_total_metric_response(0.0, has_timeseries=True, count=31) - - ml, mon = _make_clients(ws, [ep], metric_fn=_mock_metrics) - findings = _call(ml, mon, idle_days=7) - - assert len(findings) == 1 - assert findings[0].details["idle_signal_scope"] == "per_endpoint" - assert not pass2_calls, "Pass 2 must NOT be called when pass 1 has sufficient coverage" - - -# --------------------------------------------------------------------------- -# Monitor auth and transient errors -# 
--------------------------------------------------------------------------- + ml, mon = _make_clients(ws, [ep]) + assert _call(ml, mon) == [] -def test_monitor_403_raises_permission_error(): - """HTTP 403 from monitor must surface as PermissionError.""" +def test_endpoint_id_required(): + """Endpoint with id=None is skipped (spec 8.1).""" ws = _make_workspace() ep = _make_endpoint(age_days=30) + ep.id = None + ml, mon = _make_clients(ws, [ep]) + assert _call(ml, mon) == [] - def raise_403(*a, **kw): - raise _http_error(403) - ml, mon = _make_clients(ws, [ep], metric_fn=raise_403) +def test_endpoint_name_required(): + """Endpoint with name=None is skipped (spec 8.2).""" + ws = _make_workspace() + ep = _make_endpoint(age_days=30) + ep.name = None + ml, mon = _make_clients(ws, [ep]) + assert _call(ml, mon) == [] - with pytest.raises(PermissionError) as exc_info: - _call(ml, mon) - assert "Microsoft.Insights/metrics/read" in str(exc_info.value) +def test_endpoint_missing_location_skipped(): + """Endpoint with absent/empty location must be skipped regardless of region_filter (spec 7).""" + ws = _make_workspace() + ep = _make_endpoint(age_days=30, location=None) + ml, mon = _make_clients(ws, [ep]) + assert _call(ml, mon) == [] -def test_monitor_401_raises_permission_error(): - """HTTP 401 from monitor must surface as PermissionError.""" +def test_endpoint_empty_string_location_skipped(): + """Endpoint with empty-string location must be skipped (spec 7).""" ws = _make_workspace() - ep = _make_endpoint(age_days=30) + ep = _make_endpoint(age_days=30, location="") + ml, mon = _make_clients(ws, [ep]) + assert _call(ml, mon) == [] - def raise_401(*a, **kw): - raise _http_error(401) - ml, mon = _make_clients(ws, [ep], metric_fn=raise_401) +# --------------------------------------------------------------------------- +# Exception handling (spec 12) +# --------------------------------------------------------------------------- - with pytest.raises(PermissionError): - _call(ml, 
mon) +def test_subscription_inventory_failure_propagates(): + """Subscription-wide workspace inventory failure must propagate (spec 12).""" -# --------------------------------------------------------------------------- -# Per-workspace and workspace-level error handling -# --------------------------------------------------------------------------- + def _raise(): + raise RuntimeError("inventory failed") + ml_client = SimpleNamespace(workspaces=SimpleNamespace(list_by_subscription=_raise)) + mon_client = SimpleNamespace() -def test_workspace_missing_resource_group_skipped(): - """Workspace with resource_group=None but parseable id still produces a finding.""" - ws = _make_workspace(rg=_WS_RG) - ws.resource_group = None # cleared — rule falls back to id parsing - ep = _make_endpoint(age_days=30) - ml, mon = _make_clients(ws, [ep]) - # id still contains rg-ml → finding is produced - findings = _call(ml, mon) - assert len(findings) == 1 - assert findings[0].details["resource_group"] == _WS_RG + with pytest.raises(RuntimeError, match="inventory failed"): + find_idle_ml_online_endpoints( + subscription_id=_SUB, credential=None, client=ml_client, monitor_client=mon_client + ) -def test_endpoint_list_transient_error_skips_workspace_preserves_others(): - """Transient error listing endpoints in one workspace must not abort others.""" +def test_per_workspace_endpoint_list_failure_skips_workspace(): + """Transient error listing endpoints skips that workspace and preserves other findings.""" ws_good = _make_workspace(name="good-ws", rg="rg-good") ws_bad = _make_workspace(name="bad-ws", rg="rg-bad") ep_good = _make_endpoint(name="ep-good", ws_name="good-ws", rg="rg-good", age_days=30) - ws_order = [] + call_count = [0] - def _endpoints_by_call(): - ws_order.append(len(ws_order)) - if len(ws_order) == 2: + def _list_endpoints(): + call_count[0] += 1 + if call_count[0] == 2: raise RuntimeError("transient SDK timeout") return [ep_good] ml_client = SimpleNamespace( 
workspaces=SimpleNamespace(list_by_subscription=lambda: [ws_good, ws_bad]), - online_endpoints=SimpleNamespace(list=_endpoints_by_call), - online_deployments=SimpleNamespace(list=lambda ep_name: []), + online_endpoints=SimpleNamespace(list=_list_endpoints), + online_deployments=SimpleNamespace(list=lambda ep_name: [_make_deployment()]), ) mon_client = SimpleNamespace( - metrics=SimpleNamespace(list=lambda *a, **kw: _make_total_metric_response(0.0)) + metrics=SimpleNamespace(list=lambda *a, **kw: _make_avg_metric_response(0.0)) ) findings = find_idle_ml_online_endpoints( - subscription_id=_SUB, - credential=None, - client=ml_client, - monitor_client=mon_client, + subscription_id=_SUB, credential=None, client=ml_client, monitor_client=mon_client ) assert len(findings) == 1 assert findings[0].details["endpoint_name"] == "ep-good" -def test_workspace_auth_error_raises_permission_error(): - """HTTP 403 on workspace listing must raise PermissionError.""" +def test_per_endpoint_failure_skips_endpoint(): + """Exception processing one endpoint must not abort others.""" + ws = _make_workspace() + ep_bad = _make_endpoint(name="ep-bad", age_days=30) + ep_good = _make_endpoint(name="ep-good", age_days=30) - def _raise_auth(): - raise _http_error(403) + # ep-bad will have a broken system_data that raises on attribute access + class _BrokenSystemData: + @property + def created_at(self): + raise RuntimeError("broken") - ml_client = SimpleNamespace( - workspaces=SimpleNamespace(list_by_subscription=_raise_auth), - ) - mon_client = SimpleNamespace() + ep_bad.system_data = _BrokenSystemData() - with pytest.raises(PermissionError) as exc_info: - find_idle_ml_online_endpoints( - subscription_id=_SUB, - credential=None, - client=ml_client, - monitor_client=mon_client, - ) + ml, mon = _make_clients(ws, [ep_bad, ep_good]) + findings = _call(ml, mon) - assert "Microsoft.MachineLearningServices/workspaces/read" in str(exc_info.value) + assert len(findings) == 1 + assert 
findings[0].details["endpoint_name"] == "ep-good" -# --------------------------------------------------------------------------- -# idle_days clamping -# --------------------------------------------------------------------------- +def test_monitor_403_raises_permission_error(): + """HTTP 403 from monitor must surface as PermissionError.""" + ws = _make_workspace() + ep = _make_endpoint(age_days=30) + def _raise_403(*a, **kw): + raise _http_error(403) -def test_idle_days_clamped_to_3(): - """idle_days < 3 must be clamped to 3 — not silently suppress all findings.""" - ws = _make_workspace() - ep = _make_endpoint(age_days=20) - ml, mon = _make_clients(ws, [ep]) + ml, mon = _make_clients(ws, [ep], metric_fn=_raise_403) - findings = _call(ml, mon, idle_days=1) + with pytest.raises(PermissionError) as exc_info: + _call(ml, mon) - # idle_days clamped to 3; age=20 >= 3 → proceeds; HIGH confidence - assert len(findings) == 1 - assert findings[0].details["idle_days_threshold"] == 3 + assert "Microsoft.Insights/metrics/read" in str(exc_info.value) -def test_idle_days_zero_clamped_same_as_3(): +def test_monitor_401_raises_permission_error(): + """HTTP 401 from monitor must surface as PermissionError.""" ws = _make_workspace() - ep = _make_endpoint(age_days=20) - ml, mon = _make_clients(ws, [ep]) + ep = _make_endpoint(age_days=30) - findings = _call(ml, mon, idle_days=0) - assert len(findings) == 1 + def _raise_401(*a, **kw): + raise _http_error(401) + + ml, mon = _make_clients(ws, [ep], metric_fn=_raise_401) + + with pytest.raises(PermissionError): + _call(ml, mon) # --------------------------------------------------------------------------- -# Finding shape +# Finding shape (spec 11) # --------------------------------------------------------------------------- def test_finding_shape_complete(): - """All required finding fields must be populated.""" + """All required finding fields and detail keys must be present (spec 11.3).""" ws = _make_workspace() - ep = 
_make_endpoint(age_days=30) - dep = _make_deployment(instance_type="Standard_NC6", min_instances=1) + ep = _make_endpoint(age_days=30, kind="Managed") + dep = _make_deployment(instance_type="Standard_NC6", min_instances=2) ml, mon = _make_clients(ws, [ep], deployments_by_ep={"ep1": [dep]}) f = _call(ml, mon)[0] @@ -927,6 +1187,7 @@ def test_finding_shape_complete(): assert f.resource_type == "azure.ml.online_endpoint" assert f.resource_id == ep.id assert f.region == "eastus" + assert f.estimated_monthly_cost_usd is None # spec 10: always None assert f.title assert f.summary assert f.reason @@ -938,17 +1199,67 @@ def test_finding_shape_complete(): assert f.evidence.time_window d = f.details + # All spec 11.3 fields must be present assert d["endpoint_name"] == "ep1" assert d["workspace_name"] == _WS_NAME assert d["resource_group"] == _WS_RG + assert d["subscription_id"] == _SUB + assert d["location"] == "eastus" + assert "endpoint_kind" in d + assert d["managed_scope_source"] == "endpoint" + assert d["endpoint_provisioning_state"] == "Succeeded" + assert d["created_at"] is not None + assert d["billing_relevant_deployment_count"] == 1 + assert d["deployment_count"] == 1 + assert d["stable_deployment_count"] == 1 assert d["instance_type"] == "Standard_NC6" - assert d["min_instance_count"] == 1 assert d["is_gpu"] is True - assert d["age_days"] == 30 + assert d["baseline_instance_count_total"] == 2 assert d["idle_days_threshold"] == 7 - assert d["idle_signal_scope"] in ("per_endpoint", "age_only") - assert d["cost_source"] in ("heuristic_sku_table", "unknown") - assert "deployment_count" in d + assert d["idle_since_days"] == 7 + assert d["metric_name"] == "RequestsPerMinute" + assert d["metric_aggregation"] == "Average" + assert "metric_coverage_ratio" in d + assert isinstance(d["metric_coverage_ratio"], float) + assert isinstance(d["tags"], dict) + + +def test_tags_never_none(): + """tags must always be a dict, never None (spec 7).""" + ws = _make_workspace() + ep = 
_make_endpoint(age_days=30, tags=None) # explicitly None + ml, mon = _make_clients(ws, [ep]) + + findings = _call(ml, mon) + + assert len(findings) == 1 + assert findings[0].details["tags"] == {} + + +def test_idle_since_days_equals_effective_window(): + """idle_since_days = effective idle window (not an observational estimate) (spec 9.5).""" + ws = _make_workspace() + ep = _make_endpoint(age_days=30) + ml, mon = _make_clients(ws, [ep], idle_days=14) + + findings = _call(ml, mon, idle_days=14) + + assert findings[0].details["idle_since_days"] == 14 + + +def test_evidence_signals_cover_required_disclosures(): + """signals_used must disclose managed scope, provisioning state, age, billing, and metric.""" + ws = _make_workspace() + ep = _make_endpoint(age_days=30) + ml, mon = _make_clients(ws, [ep]) + + f = _call(ml, mon)[0] + signals_text = " ".join(f.evidence.signals_used) + + assert "managed" in signals_text.lower() + assert "Succeeded" in signals_text + assert "RequestsPerMinute" in signals_text + assert "ZERO" in signals_text # --------------------------------------------------------------------------- @@ -957,8 +1268,6 @@ def test_finding_shape_complete(): def test_rule_metadata_present(): - from cleancloud.providers.azure.rules.ai.ml_online_endpoint_idle import RULE_METADATA - assert RULE_METADATA["id"] == "azure.ml.online_endpoint.idle" assert RULE_METADATA["category"] == "ai" assert RULE_METADATA["service"] == "machinelearningservices" diff --git a/tests/cleancloud/providers/azure/test_azure_openai_provisioned_idle.py b/tests/cleancloud/providers/azure/test_azure_openai_provisioned_idle.py index 6755b30..38d7ec3 100644 --- a/tests/cleancloud/providers/azure/test_azure_openai_provisioned_idle.py +++ b/tests/cleancloud/providers/azure/test_azure_openai_provisioned_idle.py @@ -8,43 +8,63 @@ ) # --------------------------------------------------------------------------- -# Helpers +# Constants # 
--------------------------------------------------------------------------- _SUB = "sub-123" _RG = "rg-openai" +_ACCT_NAME = "oai-account" +_DEP_NAME = "gpt4o-prod" + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- -def _make_account(name="oai-account", location="eastus", kind="OpenAI", rg=_RG, tags=None): +def _make_account( + name=_ACCT_NAME, + location="eastus", + kind="OpenAI", + rg=_RG, + tags=None, + provisioning_state="Succeeded", +): acct_id = ( f"/subscriptions/{_SUB}/resourceGroups/{rg}" f"/providers/Microsoft.CognitiveServices/accounts/{name}" ) + properties = SimpleNamespace(provisioning_state=provisioning_state) return SimpleNamespace( id=acct_id, name=name, location=location, kind=kind, + properties=properties, tags=tags or {}, ) def _make_deployment( - name="gpt4o-prod", + name=_DEP_NAME, sku_name="ProvisionedManaged", capacity=10, + model_format="OpenAI", model_name="gpt-4o", + model_version="2024-05-13", age_days=30, + provisioning_state="Succeeded", rg=_RG, - account="oai-account", + account=_ACCT_NAME, + tags=None, ): dep_id = ( f"/subscriptions/{_SUB}/resourceGroups/{rg}" f"/providers/Microsoft.CognitiveServices/accounts/{account}/deployments/{name}" ) sku = SimpleNamespace(name=sku_name, capacity=capacity) - model = SimpleNamespace(name=model_name) - properties = SimpleNamespace(model=model) + model = SimpleNamespace(format=model_format, name=model_name, version=model_version) + properties = SimpleNamespace(model=model, provisioning_state=provisioning_state) now = datetime.now(timezone.utc) created_at = now - timedelta(days=age_days) if age_days is not None else None system_data = SimpleNamespace(created_at=created_at) @@ -54,25 +74,44 @@ def _make_deployment( sku=sku, properties=properties, system_data=system_data, + tags=tags, # None by default to test fallback ) -def _make_total_metric_response(total: float = 0.0, 
has_timeseries: bool = True): - """Azure Monitor metrics.list() response with Total aggregation.""" - if not has_timeseries: - return SimpleNamespace(value=[]) - data_point = SimpleNamespace(total=total) - timeseries = SimpleNamespace(data=[data_point]) - metric = SimpleNamespace(timeseries=[timeseries]) - return SimpleNamespace(value=[metric]) - +def _make_total_metric_response(total=0.0, coverage_fraction=1.0, idle_days=7, num_series=1): + """ + Build a mock AzureOpenAIRequests metric response with PT1M granularity. -def _make_clients(account, deployments, metric_fn=None, metric_response=None): - """Build mock CS and Monitor clients. + Each series gets (total / num_series) per datapoint so that bucket_total == + total when summed across all series in the same minute bucket. - metric_fn: callable(resource_uri, **kwargs) -> response (overrides metric_response) - metric_response: static response for all metric calls + coverage_fraction: fraction of expected minute buckets to populate (1.0 = full coverage). + num_series: number of separate dimension series (exercises cross-series aggregation). 
""" + expected = idle_days * 24 * 60 + usable = int(expected * coverage_fraction) + now_utc = datetime.now(timezone.utc) + metric_end_utc = (now_utc - timedelta(minutes=5)).replace(second=0, microsecond=0) + + total_per_series = (total / num_series) if num_series > 0 else 0.0 + + timeseries_list = [] + for _ in range(num_series): + data_points = [ + SimpleNamespace( + total=total_per_series, + time_stamp=metric_end_utc - timedelta(minutes=i + 1), + ) + for i in range(usable) + ] + timeseries_list.append(SimpleNamespace(data=data_points)) + + metric = SimpleNamespace(timeseries=timeseries_list) + return SimpleNamespace(value=[metric]) + + +def _make_clients(account, deployments, metric_fn=None, metric_response=None, idle_days=7): + """Build mock CS and Monitor clients.""" cs_client = SimpleNamespace( accounts=SimpleNamespace(list=lambda: [account]), deployments=SimpleNamespace(list=lambda rg, acct_name: deployments), @@ -82,7 +121,11 @@ def _make_clients(account, deployments, metric_fn=None, metric_response=None): metrics=SimpleNamespace(list=lambda resource_uri, **kw: metric_fn(resource_uri, **kw)) ) else: - resp = metric_response if metric_response is not None else _make_total_metric_response(0.0) + resp = ( + metric_response + if metric_response is not None + else _make_total_metric_response(0.0, idle_days=idle_days) + ) mon_client = SimpleNamespace(metrics=SimpleNamespace(list=lambda *a, **kw: resp)) return cs_client, mon_client @@ -95,7 +138,7 @@ def _make_clients(account, deployments, metric_fn=None, metric_response=None): def test_idle_provisioned_deployment_detected(): """Provisioned deployment with zero requests should be flagged.""" account = _make_account() - dep = _make_deployment(sku_name="ProvisionedManaged", capacity=10, age_days=30) + dep = _make_deployment() cs_client, mon_client = _make_clients(account, [dep]) findings = find_idle_openai_provisioned_deployments( @@ -110,18 +153,12 @@ def test_idle_provisioned_deployment_detected(): assert f.rule_id 
== "azure.openai.provisioned_deployment.idle" assert f.resource_type == "azure.openai.provisioned_deployment" assert f.provider == "azure" - assert f.confidence.value == "high" - assert f.details["sku_name"] == "ProvisionedManaged" - assert f.details["ptu_capacity"] == 10 - assert f.details["model"] == "gpt-4o" - assert f.details["age_days"] == 30 - assert f.estimated_monthly_cost_usd == 10 * 1_460.0 def test_global_provisioned_sku_detected(): - """GlobalProvisionedManaged SKU should also be flagged.""" + """GlobalProvisionedManaged SKU should be flagged.""" account = _make_account() - dep = _make_deployment(sku_name="GlobalProvisionedManaged", capacity=50, age_days=30) + dep = _make_deployment(sku_name="GlobalProvisionedManaged") cs_client, mon_client = _make_clients(account, [dep]) findings = find_idle_openai_provisioned_deployments( @@ -133,13 +170,12 @@ def test_global_provisioned_sku_detected(): assert len(findings) == 1 assert findings[0].details["sku_name"] == "GlobalProvisionedManaged" - assert findings[0].estimated_monthly_cost_usd == 50 * 1_460.0 def test_datazone_provisioned_sku_detected(): """DataZoneProvisionedManaged SKU should be flagged.""" account = _make_account() - dep = _make_deployment(sku_name="DataZoneProvisionedManaged", capacity=25, age_days=30) + dep = _make_deployment(sku_name="DataZoneProvisionedManaged") cs_client, mon_client = _make_clients(account, [dep]) findings = find_idle_openai_provisioned_deployments( @@ -156,7 +192,7 @@ def test_datazone_provisioned_sku_detected(): def test_active_deployment_skipped(): """Deployment with non-zero requests should NOT be flagged.""" account = _make_account() - dep = _make_deployment(age_days=30) + dep = _make_deployment() cs_client, mon_client = _make_clients( account, [dep], metric_response=_make_total_metric_response(500.0) ) @@ -174,7 +210,7 @@ def test_active_deployment_skipped(): def test_standard_sku_skipped(): """Non-provisioned SKU (Standard) should NOT be flagged.""" account = 
_make_account() - dep = _make_deployment(sku_name="Standard", capacity=100, age_days=30) + dep = _make_deployment(sku_name="Standard") cs_client, mon_client = _make_clients(account, [dep]) findings = find_idle_openai_provisioned_deployments( @@ -187,10 +223,26 @@ def test_standard_sku_skipped(): assert findings == [] -def test_non_openai_account_kind_skipped(): - """Cognitive Services accounts that are not OpenAI or AIServices should be skipped.""" - account = _make_account(kind="TextAnalytics") - dep = _make_deployment(age_days=30) +def test_non_openai_model_format_skipped(): + """Deployment with model_format != 'OpenAI' must be skipped (spec 8.9).""" + account = _make_account() + dep = _make_deployment(model_format="AzureML") + cs_client, mon_client = _make_clients(account, [dep]) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + assert findings == [] + + +def test_model_format_case_sensitive_skipped(): + """model_format comparison is case-sensitive: 'openai' must not match.""" + account = _make_account() + dep = _make_deployment(model_format="openai") cs_client, mon_client = _make_clients(account, [dep]) findings = find_idle_openai_provisioned_deployments( @@ -203,10 +255,26 @@ def test_non_openai_account_kind_skipped(): assert findings == [] -def test_aiservices_kind_detected(): - """AIServices kind (multi-service account that includes OpenAI) should be scanned.""" +def test_account_kind_does_not_gate_openai_scope(): + """Any account kind is scanned; only model_format establishes OpenAI scope (spec 9.1.4).""" account = _make_account(kind="AIServices") - dep = _make_deployment(age_days=30) + dep = _make_deployment(model_format="OpenAI") + cs_client, mon_client = _make_clients(account, [dep]) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + assert 
len(findings) == 1 + + +def test_account_kind_cognitive_services_scanned(): + """CognitiveServices kind is also scanned when model_format is 'OpenAI'.""" + account = _make_account(kind="CognitiveServices") + dep = _make_deployment(model_format="OpenAI") cs_client, mon_client = _make_clients(account, [dep]) findings = find_idle_openai_provisioned_deployments( @@ -219,10 +287,74 @@ def test_aiservices_kind_detected(): assert len(findings) == 1 +def test_account_provisioning_state_not_succeeded_skipped(): + """Account with provisioning_state != 'Succeeded' must be skipped (spec 8.7).""" + account = _make_account(provisioning_state="Creating") + dep = _make_deployment() + cs_client, mon_client = _make_clients(account, [dep]) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + assert findings == [] + + +def test_deployment_provisioning_state_not_succeeded_skipped(): + """Deployment with provisioning_state != 'Succeeded' must be skipped (spec 8.8).""" + account = _make_account() + dep = _make_deployment(provisioning_state="Failed") + cs_client, mon_client = _make_clients(account, [dep]) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + assert findings == [] + + +def test_zero_ptu_capacity_skipped(): + """Deployment with capacity=0 is not billing-relevant and must be skipped (spec 8.11).""" + account = _make_account() + dep = _make_deployment(capacity=0) + cs_client, mon_client = _make_clients(account, [dep]) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + assert findings == [] + + +def test_none_ptu_capacity_skipped(): + """Deployment with capacity=None must be skipped (spec 8.11).""" + account = _make_account() + dep = _make_deployment(capacity=None) + 
cs_client, mon_client = _make_clients(account, [dep]) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + assert findings == [] + + def test_young_deployment_skipped(): - """Deployment younger than min age guard should NOT be flagged.""" + """Deployment younger than effective idle_days must be skipped (spec 8.12).""" account = _make_account() - dep = _make_deployment(age_days=2) # below max(idle_days//2, 3) = 3 + dep = _make_deployment(age_days=3) # 3 < 7 (default idle_days) cs_client, mon_client = _make_clients(account, [dep]) findings = find_idle_openai_provisioned_deployments( @@ -251,14 +383,15 @@ def test_no_deployments_returns_empty(): # --------------------------------------------------------------------------- -# Confidence levels +# Account / deployment ID and name guards # --------------------------------------------------------------------------- -def test_high_confidence_per_deployment_old_enough(): - """Per-deployment zero confirmed AND age >= idle_days → HIGH confidence.""" +def test_missing_account_id_skipped(): + """Account with id=None must be skipped (spec 8.1).""" account = _make_account() - dep = _make_deployment(age_days=30) + account.id = None + dep = _make_deployment() cs_client, mon_client = _make_clients(account, [dep]) findings = find_idle_openai_provisioned_deployments( @@ -268,15 +401,14 @@ def test_high_confidence_per_deployment_old_enough(): monitor_client=mon_client, ) - assert findings[0].confidence.value == "high" - assert findings[0].details["idle_signal_scope"] == "per_deployment" + assert findings == [] -def test_medium_confidence_per_deployment_borderline_age(): - """Per-deployment zero confirmed, age exactly at ceil(75%) of idle_days → MEDIUM.""" +def test_missing_account_name_skipped(): + """Account with name=None must be skipped (spec 8.2).""" account = _make_account() - # idle_days=7, ceil(7*0.75)=ceil(5.25)=6 → age=6 is 
the minimum for MEDIUM - dep = _make_deployment(age_days=6) + account.name = None + dep = _make_deployment() cs_client, mon_client = _make_clients(account, [dep]) findings = find_idle_openai_provisioned_deployments( @@ -286,15 +418,14 @@ def test_medium_confidence_per_deployment_borderline_age(): monitor_client=mon_client, ) - assert len(findings) == 1 - assert findings[0].confidence.value == "medium" + assert findings == [] -def test_below_75pct_age_skipped(): - """Deployment below the ceil(75%) threshold should be skipped entirely.""" +def test_missing_deployment_id_skipped(): + """Deployment with id=None must be skipped (spec 8.3).""" account = _make_account() - # idle_days=7, ceil(7*0.75)=6 → age=4 < 6 → skip - dep = _make_deployment(age_days=4) + dep = _make_deployment() + dep.id = None cs_client, mon_client = _make_clients(account, [dep]) findings = find_idle_openai_provisioned_deployments( @@ -307,10 +438,11 @@ def test_below_75pct_age_skipped(): assert findings == [] -def test_age_5_with_idle_days_7_skipped(): - """age=5 with idle_days=7: ceil(7*0.75)=6, so 5 < 6 must be skipped (not MEDIUM).""" +def test_missing_deployment_name_skipped(): + """Deployment with name=None must be skipped (spec 8.4).""" account = _make_account() - dep = _make_deployment(age_days=5) + dep = _make_deployment() + dep.name = None cs_client, mon_client = _make_clients(account, [dep]) findings = find_idle_openai_provisioned_deployments( @@ -323,10 +455,11 @@ def test_age_5_with_idle_days_7_skipped(): assert findings == [] -def test_medium_confidence_when_age_unknown(): - """Unknown creation time → MEDIUM confidence (can't rule out recent creation).""" +def test_unresolved_account_location_skipped(): + """Account with location=None must be skipped — unresolved location is a hard gate (spec 8.5).""" account = _make_account() - dep = _make_deployment(age_days=None) + account.location = None + dep = _make_deployment() cs_client, mon_client = _make_clients(account, [dep]) findings = 
find_idle_openai_provisioned_deployments( @@ -336,20 +469,37 @@ def test_medium_confidence_when_age_unknown(): monitor_client=mon_client, ) - assert len(findings) == 1 - assert findings[0].confidence.value == "medium" + assert findings == [] + + +def test_empty_account_location_skipped(): + """Account with location='' must be skipped (spec 8.5).""" + account = _make_account(location="") + dep = _make_deployment() + cs_client, mon_client = _make_clients(account, [dep]) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + assert findings == [] # --------------------------------------------------------------------------- -# Risk levels +# Confidence levels # --------------------------------------------------------------------------- -def test_high_risk_for_large_ptu_allocation(): - """≥ 7 PTUs (≥ $10K/month) should be HIGH risk.""" +def test_high_confidence_full_coverage(): + """Coverage >= 95% must produce HIGH confidence (spec 9.4).""" account = _make_account() - dep = _make_deployment(capacity=10, age_days=30) # 10 × $1,460 = $14,600 - cs_client, mon_client = _make_clients(account, [dep]) + dep = _make_deployment() + cs_client, mon_client = _make_clients( + account, [dep], metric_response=_make_total_metric_response(0.0, coverage_fraction=1.0) + ) findings = find_idle_openai_provisioned_deployments( subscription_id=_SUB, @@ -358,14 +508,17 @@ def test_high_risk_for_large_ptu_allocation(): monitor_client=mon_client, ) - assert findings[0].risk.value == "high" + assert len(findings) == 1 + assert findings[0].confidence.value == "high" -def test_medium_risk_for_small_ptu_allocation(): - """< 7 PTUs (< $10K/month) should be MEDIUM risk.""" +def test_medium_confidence_partial_coverage(): + """Coverage >= 80% but < 95% must produce MEDIUM confidence (spec 9.4).""" account = _make_account() - dep = _make_deployment(capacity=4, age_days=30) # 4 × $1,460 = $5,840 - 
cs_client, mon_client = _make_clients(account, [dep]) + dep = _make_deployment() + cs_client, mon_client = _make_clients( + account, [dep], metric_response=_make_total_metric_response(0.0, coverage_fraction=0.85) + ) findings = find_idle_openai_provisioned_deployments( subscription_id=_SUB, @@ -374,18 +527,37 @@ def test_medium_risk_for_small_ptu_allocation(): monitor_client=mon_client, ) - assert findings[0].risk.value == "medium" + assert len(findings) == 1 + assert findings[0].confidence.value == "medium" + + +def test_low_coverage_skipped(): + """Coverage < 80% must produce UNKNOWN -> no finding (spec 9.3.11).""" + account = _make_account() + dep = _make_deployment() + cs_client, mon_client = _make_clients( + account, [dep], metric_response=_make_total_metric_response(0.0, coverage_fraction=0.79) + ) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + assert findings == [] # --------------------------------------------------------------------------- -# Cost estimation +# Risk # --------------------------------------------------------------------------- -def test_cost_scales_with_ptu_capacity(): - """Estimated cost should be PTU count × $1,460/month.""" +def test_risk_always_high(): + """Risk must always be HIGH regardless of PTU capacity (spec 9.4).""" account = _make_account() - dep = _make_deployment(capacity=100, age_days=30) + dep = _make_deployment(capacity=1) cs_client, mon_client = _make_clients(account, [dep]) findings = find_idle_openai_provisioned_deployments( @@ -395,13 +567,34 @@ def test_cost_scales_with_ptu_capacity(): monitor_client=mon_client, ) - assert findings[0].estimated_monthly_cost_usd == 100 * 1_460.0 + assert findings[0].risk.value == "high" -def test_zero_ptu_capacity_no_cost_estimate(): - """Deployment with capacity=0 should have no cost estimate (None).""" +def test_risk_always_high_large_allocation(): + """Risk is HIGH even for 
large PTU allocations (spec 9.4).""" account = _make_account() - dep = _make_deployment(capacity=0, age_days=30) + dep = _make_deployment(capacity=100) + cs_client, mon_client = _make_clients(account, [dep]) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + assert findings[0].risk.value == "high" + + +# --------------------------------------------------------------------------- +# Cost model +# --------------------------------------------------------------------------- + + +def test_estimated_monthly_cost_always_none(): + """estimated_monthly_cost_usd must always be None (spec 10).""" + account = _make_account() + dep = _make_deployment(capacity=100) cs_client, mon_client = _make_clients(account, [dep]) findings = find_idle_openai_provisioned_deployments( @@ -411,105 +604,124 @@ def test_zero_ptu_capacity_no_cost_estimate(): monitor_client=mon_client, ) - assert len(findings) == 1 assert findings[0].estimated_monthly_cost_usd is None +def test_no_ptu_cost_constant(): + """No PTU price constant should be exported (spec 10).""" + import cleancloud.providers.azure.rules.ai.openai_provisioned_idle as m + + assert not hasattr( + m, "_PTU_MONTHLY_COST_USD" + ), "Spec 10 forbids hardcoding a fixed PTU monthly estimate" + + # --------------------------------------------------------------------------- -# Metric fallback strategy +# Metric contract # --------------------------------------------------------------------------- -def test_per_deployment_dimension_filter_used(): - """ModelDeploymentName dimension filter must be used for per-deployment scoping.""" +def test_metric_queried_on_account_id(): + """AzureOpenAIRequests must be queried on the parent account ARM id (spec 9.3.1).""" account = _make_account() - dep = _make_deployment(name="gpt4-prod", age_days=30) - call_kwargs = [] + dep = _make_deployment() + queried_ids = [] def _mock_metrics(resource_uri, 
**kwargs): - call_kwargs.append(dict(kwargs)) + queried_ids.append(resource_uri) return _make_total_metric_response(0.0) cs_client, mon_client = _make_clients(account, [dep], metric_fn=_mock_metrics) - findings = find_idle_openai_provisioned_deployments( + find_idle_openai_provisioned_deployments( subscription_id=_SUB, credential=None, client=cs_client, monitor_client=mon_client, ) - assert len(findings) == 1 - # First call must include the deployment name in the dimension filter - assert any("gpt4-prod" in str(kw.get("filter", "")) for kw in call_kwargs) + assert len(queried_ids) == 1 + assert queried_ids[0] == account.id + assert "deployments" not in queried_ids[0].lower().split("/providers/")[1] -def test_no_per_deployment_timeseries_falls_back_to_no_data(): - """If per-deployment dimension query returns no timeseries, account-level is NOT trusted. - The deployment is treated as no_data — account-level zero is unsafe because it only - covers deployments that emit the metric; those that don't are invisible to it.""" +def test_metric_deployment_name_filter(): + """ModelDeploymentName dimension filter must scope the query to the deployment (spec 9.3.2).""" account = _make_account() - dep = _make_deployment(age_days=30) # 30 >= 2×7=14 → age-only fallback applies + dep = _make_deployment(name="my-gpt4o-deploy") + call_kwargs = [] def _mock_metrics(resource_uri, **kwargs): - if "filter" in kwargs: - return _make_total_metric_response(has_timeseries=False) # dimension not supported - return _make_total_metric_response(0.0) # account-level zero — NOT used + call_kwargs.append(dict(kwargs)) + return _make_total_metric_response(0.0) cs_client, mon_client = _make_clients(account, [dep], metric_fn=_mock_metrics) - findings = find_idle_openai_provisioned_deployments( + find_idle_openai_provisioned_deployments( subscription_id=_SUB, credential=None, client=cs_client, monitor_client=mon_client, ) - # Finding produced via age-only fallback, not account-level - assert 
len(findings) == 1 - assert findings[0].confidence.value == "medium" - assert findings[0].details["idle_signal_scope"] == "age_only" + assert len(call_kwargs) == 1 + assert "my-gpt4o-deploy" in call_kwargs[0].get("filter", "") + assert "ModelDeploymentName" in call_kwargs[0].get("filter", "") -def test_no_per_deployment_timeseries_young_deployment_skipped(): - """If per-deployment dimension unsupported AND deployment too young for age fallback → no finding.""" +def test_metric_pt1m_granularity(): + """Metric query must use PT1M granularity (spec 9.3.3).""" account = _make_account() - dep = _make_deployment(age_days=10) # 10 < 2×7=14 — age fallback does not apply + dep = _make_deployment() + call_kwargs = [] def _mock_metrics(resource_uri, **kwargs): - if "filter" in kwargs: - return _make_total_metric_response(has_timeseries=False) - return _make_total_metric_response(0.0) # account-level zero — NOT used + call_kwargs.append(dict(kwargs)) + return _make_total_metric_response(0.0) cs_client, mon_client = _make_clients(account, [dep], metric_fn=_mock_metrics) - findings = find_idle_openai_provisioned_deployments( + find_idle_openai_provisioned_deployments( subscription_id=_SUB, credential=None, client=cs_client, monitor_client=mon_client, ) - assert findings == [] + assert call_kwargs[0].get("interval") == "PT1M" -def test_fallback_to_processed_prompt_tokens(): - """If AzureOpenAIRequests returns no timeseries, ProcessedPromptTokens should be tried.""" +def test_metric_total_aggregation(): + """Metric query must use Total aggregation (spec 9.3.3).""" account = _make_account() - dep = _make_deployment(age_days=30) - metrics_called = [] + dep = _make_deployment() + call_kwargs = [] def _mock_metrics(resource_uri, **kwargs): - metric_name = kwargs.get("metricnames", "") - metrics_called.append(metric_name) - if metric_name == "AzureOpenAIRequests": - return _make_total_metric_response(has_timeseries=False) - if metric_name == "ProcessedPromptTokens": - return 
_make_total_metric_response(0.0) # confirmed zero - return _make_total_metric_response(has_timeseries=False) + call_kwargs.append(dict(kwargs)) + return _make_total_metric_response(0.0) cs_client, mon_client = _make_clients(account, [dep], metric_fn=_mock_metrics) + find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + assert call_kwargs[0].get("aggregation") == "Total" + + +def test_bucket_total_aggregates_across_series(): + """bucket_total must sum Total across all dimension series per minute bucket (spec 9.3.6.v).""" + account = _make_account() + dep = _make_deployment() + # 2 series, each contributing 50/2=25 total per bucket; bucket_total=50 > 0 -> ACTIVE + cs_client, mon_client = _make_clients( + account, [dep], metric_response=_make_total_metric_response(50.0, num_series=2) + ) + findings = find_idle_openai_provisioned_deployments( subscription_id=_SUB, credential=None, @@ -517,20 +729,15 @@ def _mock_metrics(resource_uri, **kwargs): monitor_client=mon_client, ) - assert len(findings) == 1 - assert "ProcessedPromptTokens" in metrics_called - signal_text = " ".join(findings[0].evidence.signals_used) - assert "ProcessedPromptTokens" in signal_text + assert findings == [] # ACTIVE -> no finding -def test_no_timeseries_young_deployment_skipped(): - """No timeseries + age < 2× idle_days → no finding (not enough signal).""" +def test_multi_series_zero_is_idle(): + """Multiple zero-traffic series in same bucket must still produce ZERO (spec 9.3.12).""" account = _make_account() - dep = _make_deployment(age_days=10) # 10 < 2×7=14 — age-only fallback does not apply + dep = _make_deployment() cs_client, mon_client = _make_clients( - account, - [dep], - metric_response=_make_total_metric_response(has_timeseries=False), + account, [dep], metric_response=_make_total_metric_response(0.0, num_series=3) ) findings = find_idle_openai_provisioned_deployments( @@ -540,17 +747,19 @@ def 
test_no_timeseries_young_deployment_skipped(): monitor_client=mon_client, ) - assert findings == [] + assert len(findings) == 1 -def test_no_timeseries_old_deployment_age_only_medium(): - """No timeseries + age >= 2× idle_days → MEDIUM age-only finding.""" +def test_duplicate_timestamps_do_not_overstate_coverage(): + """Duplicate timestamps from multiple series must not inflate coverage count (spec 9.3.8).""" account = _make_account() - dep = _make_deployment(age_days=30) # 30 >= 2×7=14 — age-only fallback applies + dep = _make_deployment() + # coverage_fraction=0.79 with num_series=2; correct coverage is still 0.79 (buckets deduped) + # -> UNKNOWN -> no finding. Without deduplication the fake coverage would be 1.58 -> finding. cs_client, mon_client = _make_clients( account, [dep], - metric_response=_make_total_metric_response(has_timeseries=False), + metric_response=_make_total_metric_response(0.0, coverage_fraction=0.79, num_series=2), ) findings = find_idle_openai_provisioned_deployments( @@ -560,41 +769,53 @@ def test_no_timeseries_old_deployment_age_only_medium(): monitor_client=mon_client, ) - assert len(findings) == 1 - assert findings[0].confidence.value == "medium" - assert findings[0].details["idle_signal_scope"] == "age_only" - assert "age" in findings[0].evidence.signals_used[0].lower() + assert findings == [] + + +def test_no_timeseries_skipped(): + """Metric response with no timeseries produces UNKNOWN -> no finding.""" + account = _make_account() + dep = _make_deployment() + cs_client, mon_client = _make_clients(account, [dep], metric_response=SimpleNamespace(value=[])) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + assert findings == [] # --------------------------------------------------------------------------- -# Effective window / idle_days clamping +# idle_days handling # 
--------------------------------------------------------------------------- -def test_effective_window_capped_to_age(): - """For a deployment younger than idle_days, effective_window is capped to age.""" +def test_idle_days_minimum_is_1(): + """idle_days below 1 is clamped to 1 (spec 6.3).""" account = _make_account() - # age=6 < idle_days=7, and ceil(7*0.75)=6 so age=6 qualifies for MEDIUM - dep = _make_deployment(age_days=6) - cs_client, mon_client = _make_clients(account, [dep]) + dep = _make_deployment(age_days=1) + cs_client, mon_client = _make_clients(account, [dep], idle_days=1) findings = find_idle_openai_provisioned_deployments( subscription_id=_SUB, credential=None, client=cs_client, monitor_client=mon_client, + idle_days=0, ) - # age=6 >= ceil(7*0.75)=6 → MEDIUM; effective_window=min(7,6)=6 assert len(findings) == 1 - assert findings[0].evidence.time_window == "6 days" + assert findings[0].details["idle_days_threshold"] == 1 -def test_idle_days_clamped_to_minimum(): - """idle_days below 3 is clamped to 3.""" +def test_idle_days_age_gate_is_not_3(): + """idle_days minimum must be 1, not 3 — deployment aged 1 day with idle_days=1 must be flagged.""" account = _make_account() - dep = _make_deployment(age_days=30) - cs_client, mon_client = _make_clients(account, [dep]) + dep = _make_deployment(age_days=1) + cs_client, mon_client = _make_clients(account, [dep], idle_days=1) findings = find_idle_openai_provisioned_deployments( subscription_id=_SUB, @@ -607,110 +828,572 @@ def test_idle_days_clamped_to_minimum(): assert len(findings) == 1 -# --------------------------------------------------------------------------- -# Region filter -# --------------------------------------------------------------------------- - - -def test_region_filter_excludes_other_regions(): - """Deployments in accounts outside region_filter should be skipped.""" - account = _make_account(location="westeurope") - dep = _make_deployment(age_days=30) +def 
test_deployment_exactly_at_idle_days_threshold(): + """Deployment aged exactly idle_days days must be flagged (age >= effective_idle_days).""" + account = _make_account() + dep = _make_deployment(age_days=7) cs_client, mon_client = _make_clients(account, [dep]) findings = find_idle_openai_provisioned_deployments( subscription_id=_SUB, credential=None, - region_filter="eastus", client=cs_client, monitor_client=mon_client, ) - assert findings == [] + assert len(findings) == 1 -def test_region_filter_matches_normalised(): - """Region filter matches after normalisation (spaces/dashes stripped).""" - account = _make_account(location="East US") - dep = _make_deployment(age_days=30) - cs_client, mon_client = _make_clients(account, [dep]) +def test_idle_days_clamped_to_max(): + """idle_days above _MAX_IDLE_DAYS (30) must be clamped (large-window guard).""" + account = _make_account() + dep = _make_deployment(age_days=30) # age == clamp ceiling + cs_client, mon_client = _make_clients(account, [dep], idle_days=30) findings = find_idle_openai_provisioned_deployments( subscription_id=_SUB, credential=None, - region_filter="eastus", client=cs_client, monitor_client=mon_client, + idle_days=60, # exceeds _MAX_IDLE_DAYS; clamped to 30 ) assert len(findings) == 1 - assert findings[0].region == "East US" # original location preserved - + assert findings[0].details["idle_days_threshold"] == 30 -# --------------------------------------------------------------------------- -# Resilience -# --------------------------------------------------------------------------- - - -def test_monitor_transient_failure_skipped(): - """Transient Azure Monitor errors (non-auth) should NOT produce a finding, even for old deployments.""" - - def _raise(*args, **kwargs): - raise RuntimeError("Monitor API unavailable") +def test_idle_days_above_max_still_emits_when_age_matches_clamp(): + """Deployment aged exactly _MAX_IDLE_DAYS must emit when idle_days is clamped to that value.""" account = _make_account() 
dep = _make_deployment(age_days=30) - cs_client = SimpleNamespace( - accounts=SimpleNamespace(list=lambda: [account]), - deployments=SimpleNamespace(list=lambda rg, acct: [dep]), - ) - mon_client = SimpleNamespace(metrics=SimpleNamespace(list=_raise)) + cs_client, mon_client = _make_clients(account, [dep], idle_days=30) findings = find_idle_openai_provisioned_deployments( subscription_id=_SUB, credential=None, client=cs_client, monitor_client=mon_client, + idle_days=90, # clamped to 30; age_days=30 >= 30 ) - assert findings == [] + assert len(findings) == 1 -def test_monitor_auth_failure_raises_permission_error(): - """AuthorizationFailed on metrics.list() should raise PermissionError, not silently return no findings.""" +def test_clamped_idle_days_visible_in_details(): + """When idle_days is clamped, details must expose both the original and applied values.""" + account = _make_account() + dep = _make_deployment(age_days=30) + cs_client, mon_client = _make_clients(account, [dep], idle_days=30) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + idle_days=60, # exceeds _MAX_IDLE_DAYS=30; clamped + ) + + assert len(findings) == 1 + d = findings[0].details + assert d["idle_days_requested"] == 60 # user's original input preserved + assert d["idle_days_threshold"] == 30 # effective value after clamping + + +def test_unclamped_idle_days_matches_requested(): + """When no clamping occurs, idle_days_requested and idle_days_threshold must be equal.""" + account = _make_account() + dep = _make_deployment(age_days=14) + cs_client, mon_client = _make_clients(account, [dep], idle_days=14) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + idle_days=14, + ) + + assert len(findings) == 1 + d = findings[0].details + assert d["idle_days_requested"] == d["idle_days_threshold"] == 14 + + 
+def test_deployment_one_day_under_threshold_skipped(): + """Deployment aged idle_days - 1 must be skipped.""" + account = _make_account() + dep = _make_deployment(age_days=6) # 6 < 7 (default idle_days) + cs_client, mon_client = _make_clients(account, [dep]) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + assert findings == [] + + +# --------------------------------------------------------------------------- +# Region filter +# --------------------------------------------------------------------------- + + +def test_region_filter_excludes_other_regions(): + """Accounts in other regions must be skipped when region_filter is set.""" + account = _make_account(location="westeurope") + dep = _make_deployment() + cs_client, mon_client = _make_clients(account, [dep]) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + region_filter="eastus", + client=cs_client, + monitor_client=mon_client, + ) + + assert findings == [] + + +def test_region_filter_case_insensitive_match(): + """region_filter matches after lowercase normalization (spec 7).""" + account = _make_account(location="East US") + dep = _make_deployment() + cs_client, mon_client = _make_clients(account, [dep]) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + region_filter="East US", + client=cs_client, + monitor_client=mon_client, + ) + + assert len(findings) == 1 + assert findings[0].region == "east us" + + +def test_region_filter_spaces_preserved_in_normalization(): + """Spaces are preserved in normalized location (spec 7: do not remove spaces).""" + account = _make_account(location="East US") + dep = _make_deployment() + cs_client, mon_client = _make_clients(account, [dep]) + + # "eastus" (no space) must NOT match "East US" -> "east us" (with space) + findings = find_idle_openai_provisioned_deployments( 
+ subscription_id=_SUB, + credential=None, + region_filter="eastus", + client=cs_client, + monitor_client=mon_client, + ) + + assert findings == [] + + +def test_region_is_normalized_location(): + """Finding region must be the normalized (lowercase) account location (spec 11.1).""" + account = _make_account(location="East US") + dep = _make_deployment() + cs_client, mon_client = _make_clients(account, [dep]) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + assert findings[0].region == "east us" + + +# --------------------------------------------------------------------------- +# Required details fields +# --------------------------------------------------------------------------- + + +def test_required_details_fields_present(): + """All required detail fields from spec 11.3 must be present.""" + account = _make_account() + dep = _make_deployment() + cs_client, mon_client = _make_clients(account, [dep]) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + d = findings[0].details + required = [ + "account_name", + "resource_group", + "subscription_id", + "account_location", + "account_kind", + "deployment_name", + "deployment_provisioning_state", + "sku_name", + "ptu_capacity", + "model_format", + "model_name", + "model_version", + "created_at", + "age_days", + "idle_days_requested", + "idle_days_threshold", + "idle_since_days", + "metric_name", + "metric_aggregation", + "metric_result_reason", + "metric_coverage_ratio", + "metric_expected_bucket_count", + "metric_observed_bucket_count", + "metric_window_start_utc", + "metric_end_utc", + "tags", + ] + for field in required: + assert field in d, f"Missing required detail field: {field}" + + +def test_details_values(): + """Detail fields must carry the correct values.""" + account = _make_account() + dep = 
_make_deployment(capacity=20, model_name="gpt-4o", model_version="2024-05-13") + cs_client, mon_client = _make_clients(account, [dep]) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + d = findings[0].details + assert d["account_name"] == _ACCT_NAME + assert d["resource_group"] == _RG + assert d["subscription_id"] == _SUB + assert d["account_location"] == "eastus" + assert d["account_kind"] == "OpenAI" + assert d["deployment_name"] == _DEP_NAME + assert d["deployment_provisioning_state"] == "Succeeded" + assert d["sku_name"] == "ProvisionedManaged" + assert d["ptu_capacity"] == 20 + assert d["model_format"] == "OpenAI" + assert d["model_name"] == "gpt-4o" + assert d["model_version"] == "2024-05-13" + assert d["idle_days_threshold"] == 7 + assert d["idle_since_days"] == 7 + assert d["metric_name"] == "AzureOpenAIRequests" + assert d["metric_aggregation"] == "Total" + assert d["metric_coverage_ratio"] is not None + + +def test_metric_result_reason_is_zero(): + """metric_result_reason must be 'ZERO' for emitted findings.""" + account = _make_account() + dep = _make_deployment() + cs_client, mon_client = _make_clients(account, [dep]) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + assert findings[0].details["metric_result_reason"] == "ZERO" + + +def test_metric_window_timestamps_are_iso_strings(): + """metric_window_start_utc and metric_end_utc must be ISO-format UTC strings.""" + from datetime import datetime + + account = _make_account() + dep = _make_deployment() + cs_client, mon_client = _make_clients(account, [dep]) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + d = findings[0].details + for key in ("metric_window_start_utc", "metric_end_utc"): 
+ assert isinstance(d[key], str), f"{key} must be a string" + parsed = datetime.fromisoformat(d[key]) + assert parsed.tzinfo is not None, f"{key} must be timezone-aware" + + +def test_metric_window_start_is_before_end(): + """metric_window_start_utc must be strictly before metric_end_utc.""" + from datetime import datetime + + account = _make_account() + dep = _make_deployment() + cs_client, mon_client = _make_clients(account, [dep]) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + d = findings[0].details + start = datetime.fromisoformat(d["metric_window_start_utc"]) + end = datetime.fromisoformat(d["metric_end_utc"]) + assert start < end + + +def test_tags_never_none(): + """tags detail must never be None — defaults to {} (spec 7).""" + account = _make_account() + dep = _make_deployment(tags=None) + cs_client, mon_client = _make_clients(account, [dep]) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + assert findings[0].details["tags"] is not None + assert isinstance(findings[0].details["tags"], dict) + + +def test_deployment_tags_used_when_present(): + """Deployment tags must be preferred when present (spec 7).""" + account = _make_account(tags={"env": "prod"}) + dep = _make_deployment(tags={"team": "ml"}) + cs_client, mon_client = _make_clients(account, [dep]) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + assert findings[0].details["tags"] == {"team": "ml"} + + +def test_deployment_tags_empty_dict_when_none(): + """When deployment.tags is None, tags must be {} (spec 7).""" + account = _make_account() + dep = _make_deployment(tags=None) + cs_client, mon_client = _make_clients(account, [dep]) + + findings = 
find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + assert findings[0].details["tags"] == {} + + +# --------------------------------------------------------------------------- +# Signals and evidence +# --------------------------------------------------------------------------- + + +def test_metric_name_in_signals(): + """AzureOpenAIRequests must appear in signals_used.""" + account = _make_account() + dep = _make_deployment() + cs_client, mon_client = _make_clients(account, [dep]) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + signal_text = " ".join(findings[0].evidence.signals_used) + assert "AzureOpenAIRequests" in signal_text + + +def test_ptu_capacity_in_signals(): + """PTU capacity must appear in signals_used.""" + account = _make_account() + dep = _make_deployment(capacity=15) + cs_client, mon_client = _make_clients(account, [dep]) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + signal_text = " ".join(findings[0].evidence.signals_used) + assert "15" in signal_text + + +def test_no_cost_estimate_in_signals(): + """No dollar cost estimate should appear in signals (spec 10).""" + account = _make_account() + dep = _make_deployment(capacity=10) + cs_client, mon_client = _make_clients(account, [dep]) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + signal_text = " ".join(findings[0].evidence.signals_used) + # No fixed per-PTU price should be stated + assert "1,460" not in signal_text + assert "14,600" not in signal_text + + +def test_no_processedprompt_fallback(): + """ProcessedPromptTokens must not be used as a fallback (spec 9.3.14).""" + account = 
_make_account() + dep = _make_deployment() + metrics_called = [] + + def _mock_metrics(resource_uri, **kwargs): + metrics_called.append(kwargs.get("metricnames", "")) + return SimpleNamespace(value=[]) # no timeseries + + cs_client, mon_client = _make_clients(account, [dep], metric_fn=_mock_metrics) + + find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + assert "ProcessedPromptTokens" not in metrics_called + + +def test_no_age_only_finding(): + """No finding must be emitted from age-only evidence (spec 9.2.5, 9.3.15).""" + account = _make_account() + dep = _make_deployment(age_days=365) # very old deployment + # Return empty metric response (UNKNOWN coverage) + cs_client, mon_client = _make_clients(account, [dep], metric_response=SimpleNamespace(value=[])) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + # Even a very old deployment must not produce a finding when metric is UNKNOWN + assert findings == [] + + +# --------------------------------------------------------------------------- +# Exception handling +# --------------------------------------------------------------------------- + + +def test_account_listing_failure_propagates(): + """Subscription-wide account inventory failure must propagate (spec 12).""" + + def _raise(): + raise RuntimeError("accounts.list() API unavailable") + + cs_client = SimpleNamespace(accounts=SimpleNamespace(list=_raise)) + mon_client = SimpleNamespace() + + with pytest.raises(RuntimeError): + find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + +def test_per_account_deployment_listing_failure_skipped(): + """Transient deployment listing failure must skip that account, not abort (spec 12).""" + account_good = _make_account(name="good-account") + 
account_bad = _make_account(name="bad-account") + dep_good = _make_deployment(account="good-account") + + call_count = [0] + + def _dep_list(rg, acct_name): + call_count[0] += 1 + if acct_name == "bad-account": + raise RuntimeError("transient SDK timeout") + return [dep_good] + + cs_client = SimpleNamespace( + accounts=SimpleNamespace(list=lambda: [account_good, account_bad]), + deployments=SimpleNamespace(list=_dep_list), + ) + mon_client = SimpleNamespace( + metrics=SimpleNamespace(list=lambda *a, **kw: _make_total_metric_response(0.0)) + ) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + assert len(findings) == 1 + assert findings[0].details["account_name"] == "good-account" + assert call_count[0] == 2 # both accounts attempted + + +def test_per_deployment_metric_failure_skipped(): + """Transient metric failure must skip that deployment, not produce a finding (spec 12).""" def _raise(*args, **kwargs): - raise Exception("AuthorizationFailed: missing Microsoft.Insights/metrics/read") + raise RuntimeError("Monitor API unavailable") account = _make_account() - dep = _make_deployment(age_days=30) + dep = _make_deployment() cs_client = SimpleNamespace( accounts=SimpleNamespace(list=lambda: [account]), deployments=SimpleNamespace(list=lambda rg, acct: [dep]), ) mon_client = SimpleNamespace(metrics=SimpleNamespace(list=_raise)) - with pytest.raises(PermissionError) as exc_info: - find_idle_openai_provisioned_deployments( - subscription_id=_SUB, - credential=None, - client=cs_client, - monitor_client=mon_client, - ) + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) - assert "Microsoft.Insights/metrics/read" in str(exc_info.value) + assert findings == [] -def test_permission_error_raised_on_auth_failure(): - """AuthorizationFailed at accounts.list() should raise 
PermissionError.""" +def test_permission_error_propagates_from_metric(): + """PermissionError from metric query must propagate (not be swallowed).""" - def _raise(): - raise Exception("AuthorizationFailed: client lacks CognitiveServices/accounts/read") + def _raise(*args, **kwargs): + raise PermissionError("Missing Microsoft.Insights/metrics/read") - cs_client = SimpleNamespace(accounts=SimpleNamespace(list=_raise)) - mon_client = SimpleNamespace() + account = _make_account() + dep = _make_deployment() + cs_client = SimpleNamespace( + accounts=SimpleNamespace(list=lambda: [account]), + deployments=SimpleNamespace(list=lambda rg, acct: [dep]), + ) + mon_client = SimpleNamespace(metrics=SimpleNamespace(list=_raise)) - with pytest.raises(PermissionError) as exc_info: + with pytest.raises(PermissionError): find_idle_openai_provisioned_deployments( subscription_id=_SUB, credential=None, @@ -718,54 +1401,97 @@ def _raise(): monitor_client=mon_client, ) - assert "Microsoft.CognitiveServices/accounts/read" in str(exc_info.value) - -def test_account_auth_error_raises_permission_error(): - """AuthorizationFailed at deployments.list() should raise PermissionError.""" +def test_multiple_deployments_partial_failure(): + """One failing deployment must not prevent other deployments from being evaluated.""" account = _make_account() + dep_good = _make_deployment(name="good-dep") + dep_bad = _make_deployment(name="bad-dep") + + call_count = [0] - def _dep_list_raise(rg, acct): - raise Exception("AuthorizationFailed: missing deployments/read") + def _mock_metrics(resource_uri, **kwargs): + call_count[0] += 1 + if "bad-dep" in kwargs.get("filter", ""): + raise RuntimeError("transient error for bad-dep") + return _make_total_metric_response(0.0) cs_client = SimpleNamespace( accounts=SimpleNamespace(list=lambda: [account]), - deployments=SimpleNamespace(list=_dep_list_raise), + deployments=SimpleNamespace(list=lambda rg, acct: [dep_good, dep_bad]), + ) + mon_client = SimpleNamespace( + 
metrics=SimpleNamespace(list=lambda resource_uri, **kw: _mock_metrics(resource_uri, **kw)) ) - mon_client = SimpleNamespace() - with pytest.raises(PermissionError) as exc_info: - find_idle_openai_provisioned_deployments( - subscription_id=_SUB, - credential=None, - client=cs_client, - monitor_client=mon_client, - ) + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) - assert "Microsoft.CognitiveServices/accounts/deployments/read" in str(exc_info.value) + assert len(findings) == 1 + assert findings[0].details["deployment_name"] == "good-dep" -def test_transient_account_error_skipped_preserves_other_findings(): - """Transient error on one account should not abort findings from others.""" - account_good = _make_account(name="good-account", rg="rg-good") - account_bad = _make_account(name="bad-account", rg="rg-bad") - dep_good = _make_deployment(age_days=30, account="good-account", rg="rg-good") +# --------------------------------------------------------------------------- +# Deployment provisioning_state camelCase fallback +# --------------------------------------------------------------------------- - call_count = [0] - def _dep_list(rg, acct_name): - call_count[0] += 1 - if acct_name == "bad-account": - raise RuntimeError("transient SDK timeout") - return [dep_good] +def test_deployment_provisioning_state_camelcase_fallback(): + """provisioningState (camelCase) on deployment properties must be accepted as 'Succeeded'.""" + account = _make_account() + dep = _make_deployment() + # Replace properties with a shape that has only the camelCase field + dep.properties = SimpleNamespace( + model=SimpleNamespace(format="OpenAI", name="gpt-4o", version="2024-05-13"), + provisioningState="Succeeded", + ) + cs_client, mon_client = _make_clients(account, [dep]) - cs_client = SimpleNamespace( - accounts=SimpleNamespace(list=lambda: [account_good, account_bad]), - 
deployments=SimpleNamespace(list=_dep_list), + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, ) - mon_client = SimpleNamespace( - metrics=SimpleNamespace(list=lambda *a, **kw: _make_total_metric_response(0.0)) + + assert len(findings) == 1 + + +def test_deployment_provisioning_state_camelcase_failed_skipped(): + """provisioningState (camelCase) not 'Succeeded' on deployment must still skip.""" + account = _make_account() + dep = _make_deployment() + dep.properties = SimpleNamespace( + model=SimpleNamespace(format="OpenAI", name="gpt-4o", version="2024-05-13"), + provisioningState="Failed", + ) + cs_client, mon_client = _make_clients(account, [dep]) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + assert findings == [] + + +def test_deployment_provisioning_state_snake_case_preferred(): + """deployment provisioning_state (snake_case) takes precedence when both fields present.""" + account = _make_account() + dep = _make_deployment() + # snake_case says Succeeded, camelCase says Creating — snake_case wins + dep.properties = SimpleNamespace( + model=SimpleNamespace(format="OpenAI", name="gpt-4o", version="2024-05-13"), + provisioning_state="Succeeded", + provisioningState="Creating", ) + cs_client, mon_client = _make_clients(account, [dep]) findings = find_idle_openai_provisioned_deployments( subscription_id=_SUB, @@ -775,19 +1501,82 @@ def _dep_list(rg, acct_name): ) assert len(findings) == 1 - assert findings[0].details["account_name"] == "good-account" - assert call_count[0] == 2 # both accounts attempted # --------------------------------------------------------------------------- -# Evidence / signal surfacing +# Tags dict enforcement # --------------------------------------------------------------------------- -def test_idle_signal_metric_name_in_evidence(): - 
"""The metric name used to confirm idle should appear in evidence signals.""" +def test_tags_non_dict_normalized_to_empty_dict(): + """Non-dict tags value on deployment must be normalized to {} (not passed through).""" account = _make_account() - dep = _make_deployment(age_days=30) + dep = _make_deployment() + dep.tags = "not-a-dict" # e.g. malformed SDK shape + cs_client, mon_client = _make_clients(account, [dep]) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + assert findings[0].details["tags"] == {} + + +# --------------------------------------------------------------------------- +# Bucket count observability +# --------------------------------------------------------------------------- + + +def test_bucket_counts_present_in_details(): + """metric_expected_bucket_count and metric_observed_bucket_count must be in details.""" + account = _make_account() + dep = _make_deployment() + cs_client, mon_client = _make_clients(account, [dep]) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + d = findings[0].details + assert "metric_expected_bucket_count" in d + assert "metric_observed_bucket_count" in d + assert isinstance(d["metric_expected_bucket_count"], int) + assert isinstance(d["metric_observed_bucket_count"], int) + + +def test_bucket_counts_consistent_with_coverage_ratio(): + """observed / expected must equal metric_coverage_ratio exactly (within float precision).""" + account = _make_account() + dep = _make_deployment() + cs_client, mon_client = _make_clients( + account, [dep], metric_response=_make_total_metric_response(0.0, coverage_fraction=0.90) + ) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + d = findings[0].details + expected = 
d["metric_expected_bucket_count"] + observed = d["metric_observed_bucket_count"] + assert expected > 0 + assert observed <= expected + assert abs(d["metric_coverage_ratio"] - observed / expected) < 1e-9 + + +def test_bucket_counts_in_signal_string(): + """Signal string must include 'N/M minute buckets' for reviewer context.""" + account = _make_account() + dep = _make_deployment() cs_client, mon_client = _make_clients(account, [dep]) findings = find_idle_openai_provisioned_deployments( @@ -798,13 +1587,55 @@ def test_idle_signal_metric_name_in_evidence(): ) signal_text = " ".join(findings[0].evidence.signals_used) - assert "AzureOpenAIRequests" in signal_text + assert "minute buckets" in signal_text + # Format: "/ minute buckets" + import re + + assert re.search(r"\d+/\d+ minute buckets", signal_text) + + +# --------------------------------------------------------------------------- +# _escape_odata_string unit tests +# --------------------------------------------------------------------------- + +def test_escape_odata_string_single_quotes(): + """Single quotes must be doubled.""" + from cleancloud.providers.azure.rules.ai.openai_provisioned_idle import _escape_odata_string + + assert _escape_odata_string("it's") == "it''s" + assert _escape_odata_string("a'b'c") == "a''b''c" + assert _escape_odata_string("no quotes") == "no quotes" + + +def test_escape_odata_string_strips_control_chars(): + """ASCII control chars (< 0x20, except tab) must be removed.""" + from cleancloud.providers.azure.rules.ai.openai_provisioned_idle import _escape_odata_string + + assert _escape_odata_string("a\x00b") == "ab" + assert _escape_odata_string("a\x1fb") == "ab" + assert _escape_odata_string("a\nb") == "ab" # LF stripped + assert _escape_odata_string("a\tb") == "a\tb" # tab preserved + + +def test_escape_odata_string_combined(): + """Quote-escaping and control-char stripping work together.""" + from cleancloud.providers.azure.rules.ai.openai_provisioned_idle import 
_escape_odata_string + + assert _escape_odata_string("it'\x00s") == "it''s" + + +# --------------------------------------------------------------------------- +# Coverage display precision +# --------------------------------------------------------------------------- + + +def test_coverage_signal_shows_two_decimal_places(): + """Both threshold and observed coverage in signals must use 2 decimal places.""" + import re -def test_cost_warning_in_evidence_signals(): - """PTU cost estimate should appear in evidence signals.""" account = _make_account() - dep = _make_deployment(capacity=10, age_days=30) + dep = _make_deployment() cs_client, mon_client = _make_clients(account, [dep]) findings = find_idle_openai_provisioned_deployments( @@ -815,7 +1646,153 @@ def test_cost_warning_in_evidence_signals(): ) signal_text = " ".join(findings[0].evidence.signals_used) - assert "14,600" in signal_text # 10 PTU × $1,460 + # Both >=80.00% (threshold) and coverage: N.NN% must appear + matches = re.findall(r"\d+\.\d{2}%", signal_text) + assert ( + len(matches) >= 2 + ), f"Expected at least 2 two-decimal-place percentages in signal, got: {matches}" + + +# --------------------------------------------------------------------------- +# Filter injection / OData escaping +# --------------------------------------------------------------------------- + + +def test_deployment_name_single_quote_escaped_in_filter(): + """Single quotes in deployment name must be escaped as '' in the OData filter.""" + account = _make_account() + dep = _make_deployment(name="team's-gpt4") + call_kwargs = [] + + def _mock_metrics(resource_uri, **kwargs): + call_kwargs.append(dict(kwargs)) + return _make_total_metric_response(0.0) + + cs_client, mon_client = _make_clients(account, [dep], metric_fn=_mock_metrics) + + find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + assert len(call_kwargs) == 1 + odata_filter = 
call_kwargs[0].get("filter", "") + # Escaped form: ModelDeploymentName eq 'team''s-gpt4' + assert "team''s-gpt4" in odata_filter + # The unescaped form must not appear (it would break OData parsing) + assert "team's-gpt4" not in odata_filter.replace("team''s-gpt4", "") + + +def test_deployment_name_control_chars_stripped_from_filter(): + """ASCII control characters in deployment name must be stripped before building the filter.""" + account = _make_account() + dep = _make_deployment(name="gpt4\x00-prod\x1f") # NUL and US control chars + call_kwargs = [] + + def _mock_metrics(resource_uri, **kwargs): + call_kwargs.append(dict(kwargs)) + return _make_total_metric_response(0.0) + + cs_client, mon_client = _make_clients(account, [dep], metric_fn=_mock_metrics) + + find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + odata_filter = call_kwargs[0].get("filter", "") + assert "\x00" not in odata_filter + assert "\x1f" not in odata_filter + # The printable portion of the name must still be present + assert "gpt4-prod" in odata_filter + + +def test_deployment_name_no_quotes_filter_unchanged(): + """Deployment names without single quotes must pass through unchanged.""" + account = _make_account() + dep = _make_deployment(name="gpt4o-prod") + call_kwargs = [] + + def _mock_metrics(resource_uri, **kwargs): + call_kwargs.append(dict(kwargs)) + return _make_total_metric_response(0.0) + + cs_client, mon_client = _make_clients(account, [dep], metric_fn=_mock_metrics) + + find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + odata_filter = call_kwargs[0].get("filter", "") + assert "gpt4o-prod" in odata_filter + + +# --------------------------------------------------------------------------- +# Account provisioning state field name fallback +# 
--------------------------------------------------------------------------- + + +def test_account_provisioning_state_camelcase_fallback(): + """provisioningState (camelCase) on the properties object must be accepted as 'Succeeded'.""" + account = _make_account() + # Replace properties with a shape that has only the camelCase field + account.properties = SimpleNamespace(provisioningState="Succeeded") + dep = _make_deployment() + cs_client, mon_client = _make_clients(account, [dep]) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + assert len(findings) == 1 + + +def test_account_provisioning_state_camelcase_failed_skipped(): + """provisioningState (camelCase) not 'Succeeded' must still be skipped.""" + account = _make_account() + account.properties = SimpleNamespace(provisioningState="Failed") + dep = _make_deployment() + cs_client, mon_client = _make_clients(account, [dep]) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + assert findings == [] + + +def test_account_provisioning_state_snake_case_preferred(): + """provisioning_state (snake_case) takes precedence when both fields are present.""" + account = _make_account() + # snake_case says Succeeded but camelCase says Creating — snake_case wins + account.properties = SimpleNamespace( + provisioning_state="Succeeded", + provisioningState="Creating", + ) + dep = _make_deployment() + cs_client, mon_client = _make_clients(account, [dep]) + + findings = find_idle_openai_provisioned_deployments( + subscription_id=_SUB, + credential=None, + client=cs_client, + monitor_client=mon_client, + ) + + assert len(findings) == 1 # ---------------------------------------------------------------------------