From 6d23731331b8bd7dcd0330f529bf16e11fc7ff21 Mon Sep 17 00:00:00 2001 From: javvaji-devops Date: Wed, 29 Apr 2026 22:03:09 +0100 Subject: [PATCH 1/4] gcp.vertex.training_job.long_running --- .../ai/vertex_training_job_long_running.py | 845 ++++++--------- docs/rules/gcp.md | 17 +- .../ai/vertex_training_job_long_running.md | 508 +++++++++ ...st_gcp_vertex_training_job_long_running.py | 984 +++++++++++------- tests/e2e/gcp/test_gcp_ai_rules_smoke.py | 4 +- 5 files changed, 1457 insertions(+), 901 deletions(-) create mode 100644 docs/specs/gcp/ai/vertex_training_job_long_running.md diff --git a/cleancloud/providers/gcp/rules/ai/vertex_training_job_long_running.py b/cleancloud/providers/gcp/rules/ai/vertex_training_job_long_running.py index cc2d1d5..4e14c0e 100644 --- a/cleancloud/providers/gcp/rules/ai/vertex_training_job_long_running.py +++ b/cleancloud/providers/gcp/rules/ai/vertex_training_job_long_running.py @@ -1,5 +1,58 @@ +""" +Rule: gcp.vertex.training_job.long_running + + (spec -- docs/specs/gcp/ai/vertex_training_job_long_running.md) + +Intent: + Detect Vertex AI training resources (CustomJob and TrainingPipeline) that are + provably still in an exact documented running state and whose documented startTime + shows they have been running for at least a conservative review threshold. + + This rule is deliberately precision-first. It is a review-candidate rule only. + It is not proof that a job is hung, not proof that no useful progress is occurring, + not proof that the resource is safe to cancel, and not proof of a specific saving. 
+ +Covered resource types (spec 3.1, 3.2): + - Vertex AI CustomJob (state == JOB_STATE_RUNNING) + - Vertex AI TrainingPipeline (state == PIPELINE_STATE_RUNNING) + +Runtime anchor (spec 7, 9.4): + - Canonical anchor: startTime (when the job first entered running state) + - createTime is NOT a fallback -- missing startTime must skip (spec 9.4) + - Future startTime values must skip (spec 7) + +Exclusions: + - resource name absent or not exactly matching the documented pattern (spec 7, 11) + - state not exactly equal to the documented running enum (spec 3.3, 9.1) + - startTime absent, non-RFC3339, unparsable, or future (spec 7, 9.1) + - elapsed runtime < long_running_hours_threshold (spec 9.1) + - location filter set and parsed location does not exactly match (spec 7) + +Detection (all must be true to emit): + 1. resource is CustomJob or TrainingPipeline + 2. state is exactly JOB_STATE_RUNNING or PIPELINE_STATE_RUNNING + 3. startTime is valid and not future + 4. elapsed_runtime_seconds >= long_running_hours_threshold * 3600 + +Confidence / Risk (spec 9.2, 9.3): + HIGH confidence: elapsed >= 3 * threshold (clearly runaway) + MEDIUM confidence: threshold <= elapsed < 3 * threshold + CRITICAL risk: HIGH confidence + provably accelerator-backed + HIGH risk: HIGH confidence + hardware not proven accelerated + MEDIUM risk: all MEDIUM confidence findings + +Cost model (spec 10.1, 10.2): + estimated_monthly_cost_usd = None + Training jobs are transient, not recurring monthly resources. + Static pricing tables are out of scope for the canonical rule. + +APIs: + - aiplatform.googleapis.com/v1: projects/{project}/locations/-/customJobs + - aiplatform.googleapis.com/v1: projects/{project}/locations/-/trainingPipelines +""" + import json -import math +import re import warnings from concurrent.futures import ThreadPoolExecutor from datetime import datetime, timezone @@ -26,14 +79,12 @@ # Machine type prefixes for Cloud TPU nodes (Vertex AI TPU training). 
# ct4- uses a trailing dash (exact family anchor); ct5/ct6/ct7 match all sub-variants. # "tpu" covers tpu7x-* and any future tpu-prefixed names. -# Use _is_tpu_machine() rather than calling .startswith() with this tuple directly — -# that function enforces the correct per-family anchor rules. +# Use _is_tpu_machine() rather than calling .startswith() with this tuple directly. _TPU_MACHINE_PREFIXES = ("ct4-", "ct5", "ct6", "ct7", "tpu") -# High-cost accelerator types: GPU families and TPU pods. -# Named _ACCELERATOR_TYPES (not _GPU_ACCELERATORS) because TPU variants are included. -# Keep in sync with MachineSpec.AcceleratorType in the Vertex AI REST reference. -# Entries marked [est] have no published GCP Vertex AI pricing; costs are estimates. +# Accelerator types for hardware classification (spec 8.1). +# A job is accelerator-backed when any worker pool uses one of these types +# with a nonzero count, or when the machine type is in a bundled GPU/TPU family. _ACCELERATOR_TYPES = frozenset( { # Volta / Turing / Ampere @@ -42,174 +93,28 @@ "NVIDIA_TESLA_P100", "NVIDIA_TESLA_T4", "NVIDIA_TESLA_V100", - "NVIDIA_TESLA_A100", # A100 40GB (add-on; a2-* bundles it) - "NVIDIA_A100_80GB", # A100 80GB (add-on; a2-ultragpu-* bundles it) + "NVIDIA_TESLA_A100", + "NVIDIA_A100_80GB", # Ada / Hopper / Blackwell "NVIDIA_L4", "NVIDIA_H100_80GB", "NVIDIA_H100_MEGA_80GB", - "NVIDIA_H200_141GB", # [est] H200 141GB - "NVIDIA_B200", # [est] Blackwell B200 — pre-GA - "NVIDIA_GB200", # [est] Grace Blackwell NVL — pre-GA - "NVIDIA_RTX_PRO_6000", # [est] RTX Pro 6000 Ada - # TPU - "TPU_V2", - "TPU_V3", - "TPU_V4_POD", - "TPU_V5_LITEPOD", # [est] v5e litepod - } -) - -# Monthly cost per machine type (on-demand, us-central1, 730 h/month). -# Bundled GPU families (a2-*, a3-*, a4-*, a4x-*, g2-*, g4-*) include accelerator cost. -# TPU machine types (ct5lp-*, ct6e-*, tpu7x-*, …) include TPU chip cost. 
-_MACHINE_MONTHLY_COST = { - "n1-standard-1": 35.0, - "n1-standard-2": 69.0, - "n1-standard-4": 138.0, - "n1-standard-8": 277.0, - "n1-standard-16": 554.0, - "n1-standard-32": 1_107.0, - "n1-standard-64": 2_214.0, - "n1-standard-96": 3_321.0, - "n1-highmem-2": 93.0, - "n1-highmem-4": 187.0, - "n1-highmem-8": 374.0, - "n1-highmem-16": 748.0, - "n1-highmem-32": 1_496.0, - "n1-highmem-64": 2_991.0, - "n1-highmem-96": 4_487.0, - "n2-standard-2": 78.0, - "n2-standard-4": 157.0, - "n2-standard-8": 314.0, - "n2-standard-16": 628.0, - "n2-standard-32": 1_255.0, - "c2-standard-4": 166.0, - "c2-standard-8": 332.0, - "c2-standard-16": 664.0, - "c2-standard-30": 1_245.0, - "c2-standard-60": 2_490.0, - # a2-* (A100 40GB bundled) - "a2-highgpu-1g": 2_933.0, - "a2-highgpu-2g": 5_866.0, - "a2-highgpu-4g": 11_732.0, - "a2-highgpu-8g": 23_464.0, - "a2-megagpu-16g": 46_927.0, - # a2-ultragpu-* (A100 80GB bundled) - "a2-ultragpu-1g": 5_103.0, - "a2-ultragpu-2g": 10_206.0, - "a2-ultragpu-4g": 20_412.0, - "a2-ultragpu-8g": 40_824.0, - # a3-* (H100 SXM5 bundled) — 1g/2g/4g priced proportionally to published 8g rate - "a3-highgpu-1g": 7_299.0, # [est] 1/8 of 8g - "a3-highgpu-2g": 14_598.0, # [est] 2/8 of 8g - "a3-highgpu-4g": 29_197.0, # [est] 4/8 of 8g - "a3-highgpu-8g": 58_393.0, # published GCP rate - "a3-megagpu-8g": 65_000.0, # [est] 8× H100, high-mem NVLink config - "a3-ultragpu-8g": 80_000.0, # [est] 8× H200 141GB - # a4-* (B200 bundled) — [est] no published GCP rate - "a4-highgpu-8g": 100_000.0, # [est] 8× B200 next-gen flagship - # a4x-* (GB200 NVL bundled) — [est] - "a4x-highgpu-4g": 60_000.0, # [est] 4× GB200 NVLink - # g2-* (L4 bundled) - "g2-standard-4": 706.0, - "g2-standard-8": 1_060.0, - "g2-standard-12": 1_590.0, - "g2-standard-16": 2_120.0, - "g2-standard-24": 3_180.0, - "g2-standard-32": 4_241.0, - "g2-standard-48": 6_361.0, - "g2-standard-96": 12_722.0, - # g4-* (RTX Pro 6000 Ada bundled) — documented sizes per Vertex AI training docs: - # 48=1 GPU, 96=2 GPUs, 192=4 
GPUs, 384=8 GPUs - # Pricing [est]: no published GCP rate; ~$2,800/GPU/mo (RTX Pro 6000 + host vCPU share) - "g4-standard-48": 2_800.0, # [est] 1 GPU - "g4-standard-96": 5_600.0, # [est] 2 GPUs - "g4-standard-192": 11_200.0, # [est] 4 GPUs - "g4-standard-384": 22_400.0, # [est] 8 GPUs - # Cloud TPU machine types — cost is the TPU chip(s) + host VM bundled - # TPU v5e (ct5lp-hightpu-*): ~$1.20/chip-hr (published) - "ct5lp-hightpu-1t": 876.0, - "ct5lp-hightpu-4t": 3_504.0, - "ct5lp-hightpu-8t": 7_008.0, - # TPU v5p (ct5p-hightpu-*): ~$1.80/chip-hr [est] - "ct5p-hightpu-4t": 5_256.0, # [est] - "ct5p-hightpu-8t": 10_512.0, # [est] - # TPU v6e (ct6e-standard-*): ~$1.80/chip-hr [est] - "ct6e-standard-1t": 1_314.0, # [est] 1 chip - "ct6e-standard-4t": 5_256.0, # [est] 4 chips - "ct6e-standard-8t": 10_512.0, # [est] 8 chips -} -_DEFAULT_MACHINE_MONTHLY_COST = 150.0 -# Fallback for unrecognized TPU machine types — avoids the $0.21/hr generic default -# massively underestimating a 4-chip-equivalent TPU job. -_DEFAULT_TPU_MONTHLY_COST = 10_000.0 # ~$13.70/hr, conservative multi-host TPU estimate - -# Duration-tiered fallback costs for TrainingPipelines when workerPoolSpecs cannot be parsed. -# Longer-running pipelines are statistically more likely to be GPU-backed workloads. -# Three tiers (inlined in find_long_running_vertex_training_jobs): -# >24h → $20/hr (probable multi-GPU), 6–24h → $5/hr (ambiguous), else → $1/hr. -# These are not exact — large GPU pipelines cost $50–$500+/hr; these are indicative minimums. - -# Additional monthly cost per accelerator unit for n1-*/n2-*/c2-* machines (add-on pricing). -# Bundled families (a2-*, a3-*, a4-*, a4x-*, g2-*, g4-*, ct*/tpu7x-*) already include -# accelerator cost in _MACHINE_MONTHLY_COST — no add-on needed for those. -# All costs: us-central1 on-demand, 730 h/month. -# Entries marked [est] use conservative estimates — no published GCP Vertex AI rate. 
-_ACCELERATOR_MONTHLY_COST_EACH = { - # Volta / Turing / Ampere (published GCP rates) - "NVIDIA_TESLA_K80": 392.0, - "NVIDIA_TESLA_P4": 438.0, # ~$0.60/hr - "NVIDIA_TESLA_P100": 1_022.0, - "NVIDIA_TESLA_T4": 311.0, - "NVIDIA_TESLA_V100": 1_385.0, - "NVIDIA_TESLA_A100": 2_933.0, # A100 40GB add-on (n1-* only; a2-* bundles) - "NVIDIA_A100_80GB": 5_103.0, # A100 80GB add-on - # Ada / Hopper (published GCP rates) - "NVIDIA_L4": 680.0, - "NVIDIA_H100_80GB": 8_000.0, - "NVIDIA_H100_MEGA_80GB": 10_000.0, - # Newer accelerators — [est] conservative estimates; update when GCP publishes rates - "NVIDIA_H200_141GB": 11_000.0, # [est] ~1.4× H100 80GB - "NVIDIA_B200": 18_000.0, # [est] Blackwell B200 — pre-GA - "NVIDIA_GB200": 22_000.0, # [est] Grace Blackwell NVL — pre-GA - "NVIDIA_RTX_PRO_6000": 2_200.0, # [est] RTX Pro 6000 Ada workstation - # TPU (published GCP rates) - "TPU_V2": 3_811.0, - "TPU_V3": 5_840.0, - "TPU_V4_POD": 9_402.0, - "TPU_V5_LITEPOD": 3_500.0, # [est] v5e litepod per unit -} - -_HOURS_PER_MONTH = 730.0 - -# Machine type prefixes and accelerator types whose pricing is estimated (no published GCP rate). -# Used to tag findings with pricing_confidence="partial_estimate" vs "published". -_PRICING_ESTIMATED_MACHINE_PREFIXES = ( - "a3-megagpu", - "a3-ultragpu", # H200/future a3 variants - "a4-", # B200 - "a4x-", # GB200 NVLink - "g4-", # RTX Pro 6000 Ada - "ct5p-", # TPU v5p - "ct6e-", # TPU v6e - "tpu7x-", # TPU v7 (pre-GA) -) -_PRICING_ESTIMATED_ACCEL_TYPES = frozenset( - { "NVIDIA_H200_141GB", "NVIDIA_B200", "NVIDIA_GB200", "NVIDIA_RTX_PRO_6000", + # TPU + "TPU_V2", + "TPU_V3", + "TPU_V4_POD", "TPU_V5_LITEPOD", } ) -# Full accelerator count per bundled machine type — used for co-scheduling cost correction. -# Vertex AI may co-schedule floor(N/accel_count) replicas onto one VM when accel_count <= N//2, -# so each replica pays only 1/replicas_per_vm of the machine cost. 
-# g2-standard-32 is omitted: its GPU count is ambiguous in GCP docs (co-scheduling impact is low -# for single-GPU machines anyway). +# Accelerators (GPUs or TPU chips) per machine for bundled GPU and Cloud TPU machine types. +# TPU entries are used by _tpu_topology_host_count to derive host count from tpuTopology, +# since Vertex AI always reports replicaCount=1 for TPU pods regardless of scale. +# Hardware classification only -- not used for cost estimation. _BUNDLED_ACCELERATOR_COUNT: dict[str, int] = { # a2-* (A100 40GB) "a2-highgpu-1g": 1, @@ -241,12 +146,12 @@ "g2-standard-24": 2, "g2-standard-48": 4, "g2-standard-96": 8, - # g4-* (RTX Pro 6000 Ada) — 48=1 GPU, 96=2 GPUs, 192=4 GPUs, 384=8 GPUs + # g4-* (RTX Pro 6000 Ada) — 48=1 GPU, 96=2, 192=4, 384=8 "g4-standard-48": 1, "g4-standard-96": 2, "g4-standard-192": 4, "g4-standard-384": 8, - # Cloud TPU machines — chip count encoded in machine name suffix (e.g. -4t = 4 chips) + # Cloud TPU machine types "ct5lp-hightpu-1t": 1, "ct5lp-hightpu-4t": 4, "ct5lp-hightpu-8t": 8, @@ -255,34 +160,21 @@ "ct6e-standard-1t": 1, "ct6e-standard-4t": 4, "ct6e-standard-8t": 8, - "tpu7x-standard-4t": 4, # TPU v7 — 4 chips/host (pre-GA) + "tpu7x-standard-4t": 4, } -# Jobs running longer than this multiple of the threshold are almost certainly runaway +# Duration multiplier beyond which a job is confidently runaway (spec 9.2). _RUNAWAY_MULTIPLIER = 3 -# Default threshold +# Default threshold hours (spec 6.3). _DEFAULT_LONG_RUNNING_HOURS = 24 -# Fraction of threshold at which GPU early-warning fires (before crossing threshold). -# 90% reduces noise vs 75%: a 21.6h GPU job (at 24h threshold) is genuinely unusual; -# an 18h job is still plausible for legitimate large-scale training. -_EARLY_WARNING_FRACTION = 0.9 - -# (project_id, resource) pairs where locations/- wildcard returned 400 — fall back +# (project_id, resource) pairs where locations/- wildcard returned 400 -- fall back # to per-region calls for that specific combination.
-# Keyed per (project_id, resource) so: -# - customJobs and trainingPipelines are tracked independently (one may support wildcard) -# - project A's failure does not suppress the wildcard attempt for project B -# Written lazily on first 400; read on subsequent scans in the same process. -# A race between parallel calls is benign: at worst both try the wildcard once and -# both add the same key — set.add is GIL-protected and idempotent. +# Keyed per (project_id, resource) so customJobs and trainingPipelines are independent. _wildcard_unsupported: set[tuple[str, str]] = set() # Known Vertex AI locations for fallback when the wildcard is not supported. -# GCP adds new regions over time — this list may miss recently-announced locations. -# To ensure full coverage: grant locations/- wildcard support (roles/aiplatform.viewer -# is sufficient for most projects), or extend this list when new regions are confirmed. # Last reviewed: 2026-04-17. Source: https://cloud.google.com/vertex-ai/docs/general/locations _VERTEX_LOCATIONS = [ "us-central1", @@ -309,84 +201,90 @@ ] +# Strict RFC3339 validation pattern (spec 7). +# Accepts: YYYY-MM-DDTHH:MM:SS[.fractional](Z | +HH:MM | -HH:MM) +# Rejects: date-only, space separator, missing timezone. +_RFC3339_RE = re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?(Z|[+-]\d{2}:\d{2})$") + +# Maps internal job_type strings to the expected URL/name path segment. +_RESOURCE_TYPE_SEGMENT: dict[str, str] = { + "customJob": "customJobs", + "trainingPipeline": "trainingPipelines", +} + +# Maps job_type to the exact running-state enum the resource must expose (spec 3.3, 9.1). 
+_EXPECTED_STATE: dict[str, str] = { + "customJob": "JOB_STATE_RUNNING", + "trainingPipeline": "PIPELINE_STATE_RUNNING", +} + + +def _validate_resource_name(name: str, job_type: str) -> bool: + """ + Return True only when name exactly matches the documented Vertex AI resource-name + pattern for the given job type (spec 7): + projects/{project}/locations/{location}/customJobs/{id} + projects/{project}/locations/{location}/trainingPipelines/{id} + + All six slash-delimited segments must be present and non-empty. Any extra + or missing path segments, or a wrong resource-type segment, returns False. + """ + parts = name.split("/") + return ( + len(parts) == 6 + and parts[0] == "projects" + and parts[2] == "locations" + and parts[4] == _RESOURCE_TYPE_SEGMENT[job_type] + and bool(parts[1]) # project id + and bool(parts[3]) # location + and bool(parts[5]) # resource id + ) + + def find_long_running_vertex_training_jobs( *, project_id: str, credentials, region_filter: Optional[str] = None, - long_running_hours: int = _DEFAULT_LONG_RUNNING_HOURS, - early_warning_fraction: float = _EARLY_WARNING_FRACTION, - runaway_multiplier: int = _RUNAWAY_MULTIPLIER, - expensive_hourly_threshold: float = 20.0, + long_running_hours_threshold: int = _DEFAULT_LONG_RUNNING_HOURS, ) -> List[Finding]: """ - Find Vertex AI CustomJobs and TrainingPipelines that have been running - longer than expected. - - Most training jobs complete within a few hours. A job still running after - 24 hours is unusual — it may be hung, deadlocked in distributed training, - caught in an OOM loop, or simply forgotten after a project was cancelled. - - GPU-backed training is especially costly: an A100 40GB node (a2-highgpu-1g) - runs at ~$4/hr; an a3-highgpu-8g (8 × H100) runs at ~$80/hr. Multi-worker - jobs multiply cost linearly. 
- - Detection logic: - - Queries both CustomJobs (state="JOB_STATE_RUNNING") and TrainingPipelines - (state="PIPELINE_STATE_RUNNING") via the Vertex AI REST API, in parallel - - Duration is computed from startTime (when compute began billing); falls - back to createTime if startTime is absent (jobs stuck in pre-run phases) - - Hardware: CustomJobs expose workerPoolSpecs directly; TrainingPipelines - attempt to parse workerPoolSpecs from trainingTaskInputs (handling both - dict and JSON-string encoding) before falling back to a neutral hourly - estimate (~$3/hr). Unknown hardware does NOT set is_accelerator=True — is_accelerator - is derived strictly from parsed pool data. - - Cost aggregation: - - Each pool's cost = _estimate_hourly_rate_per_replica × effective_replicas - - For GPU/CPU pools: effective_replicas = replicaCount from API - - For TPU pools: effective_replicas = physical host count derived from tpuTopology - (Vertex always reports replicaCount=1 for TPU regardless of pod size) - - Total burn rate = sum across ALL pools (not primary pool × total_replicas) - - This correctly handles heterogeneous jobs (e.g., a2-highgpu chief + n1 workers) - - Confidence: - - HIGH: duration >= long_running_hours × 3 — clearly runaway - - MEDIUM: duration >= long_running_hours — worth reviewing - - MEDIUM (early warning): accelerator job or expensive CPU cluster - (hourly_rate_total > expensive_hourly_threshold) at 90–100% of threshold - - Risk: - - CRITICAL: HIGH confidence + GPU/accelerator hardware - - HIGH: HIGH confidence, CPU-only - - MEDIUM: all MEDIUM-confidence findings (GPU or CPU alike) - - Cost reported: - - Accrued cost so far: duration_hours × hourly_burn_rate (all worker pools) - - estimated_monthly_cost_usd is intentionally None — training jobs are - transient, not recurring monthly expenses - - Pricing is a static estimate (us-central1, on-demand); actual cost varies - by region and committed use discounts + Find Vertex AI CustomJobs and TrainingPipelines 
running beyond the threshold. + + Emits a finding only when all of the following are true (spec 9): + 1. resource is CustomJob or TrainingPipeline in the exact running state + 2. startTime is valid and not future (createTime is NOT a fallback; spec 9.4) + 3. elapsed_runtime_seconds >= long_running_hours_threshold * 3600 + + Confidence (spec 9.2): + HIGH: elapsed >= 3 * threshold (clearly runaway) + MEDIUM: threshold <= elapsed < 3 * threshold + + Risk (spec 9.3): + CRITICAL: HIGH confidence + provably accelerator-backed + HIGH: HIGH confidence + hardware not proven accelerated + MEDIUM: all MEDIUM confidence findings + + No sub-threshold early warnings are emitted (spec 9.4). + No hardcoded pricing tables are used (spec 10.2). IAM permissions required: - - aiplatform.customJobs.list (roles/aiplatform.viewer) - - aiplatform.trainingPipelines.list (roles/aiplatform.viewer) + aiplatform.customJobs.list (roles/aiplatform.viewer) + aiplatform.trainingPipelines.list (roles/aiplatform.viewer) """ - long_running_hours = max(long_running_hours, 1) - early_warning_fraction = max(0.0, min(early_warning_fraction, 1.0)) - runaway_multiplier = max(1, runaway_multiplier) - expensive_hourly_threshold = max(0.0, expensive_hourly_threshold) + if long_running_hours_threshold < 1: + raise ValueError( + f"long_running_hours_threshold must be >= 1, " f"got {long_running_hours_threshold!r}" + ) + threshold_seconds = long_running_hours_threshold * 3600 session = AuthorizedSession(credentials) now = datetime.now(timezone.utc) findings: List[Finding] = [] skipped_jobs: int = 0 - # Query both resource types in parallel — each may independently need the - # per-region fallback if the locations/- wildcard returns 400. - # Results are collected independently: a transient failure on one resource type - # still yields findings from the other. PermissionError propagates immediately - # (missing IAM is user-actionable and should not be silently swallowed). 
+ # Query both resource types in parallel; failures on one surface do not block the other. + # PermissionError propagates immediately (missing IAM is user-actionable). custom_jobs: list = [] training_pipelines: list = [] with ThreadPoolExecutor(max_workers=2) as executor: @@ -424,40 +322,68 @@ def find_long_running_vertex_training_jobs( for job, job_type in [(j, "customJob") for j in custom_jobs] + [ (p, "trainingPipeline") for p in training_pipelines ]: - name = job.get("name", "") - display_name = job.get("displayName", "") - location = _parse_location(name) or "unknown" + # --- Identity: exact resource-name pattern (spec 7, 11) --- + name = (job.get("name") or "").strip() + if not name or not _validate_resource_name(name, job_type): + # Empty name or doesn't match expected pattern → skip (spec 7, 11) + skipped_jobs += 1 + continue + + location = name.split("/")[3] # guaranteed by _validate_resource_name - if region_filter and location.lower() != region_filter.lower(): + # Region filter: exact string equality, no case folding (spec 7) + if region_filter and location != region_filter: continue - # Duration: prefer startTime (actual compute start); fall back to createTime - start_str = job.get("startTime") or job.get("createTime", "") + # --- State validation: exact documented running enum (spec 3.3, 9.1) --- + expected_state = _EXPECTED_STATE[job_type] + actual_state = (job.get("state") or "").strip() + if actual_state != expected_state: + skipped_jobs += 1 + continue + + # --- Runtime anchor: startTime only (spec 7, 9.4) --- + # createTime is NOT a fallback. Missing startTime must skip unconditionally. + start_str = (job.get("startTime") or "").strip() if not start_str: skipped_jobs += 1 continue + + # Strict RFC3339 validation (spec 7): reject space separators, date-only, no-tz values. 
+ if not _RFC3339_RE.match(start_str): + skipped_jobs += 1 + continue + try: start_dt = datetime.fromisoformat(start_str.replace("Z", "+00:00")) - if start_dt.tzinfo is None: + if start_dt.tzinfo is None: # defensive; RFC3339 regex guarantees tz start_dt = start_dt.replace(tzinfo=timezone.utc) - except ValueError: + except (ValueError, AttributeError): skipped_jobs += 1 continue - duration_hours = (now - start_dt).total_seconds() / 3600 + # Future startTime is unusable (spec 7) + if start_dt > now: + skipped_jobs += 1 + continue + + # --- Duration check (spec 9.1) --- + elapsed_seconds = (now - start_dt).total_seconds() + if elapsed_seconds < threshold_seconds: + continue # not yet long-running; no sub-threshold early warning (spec 9.4) + + duration_hours = elapsed_seconds / 3600 + duration_display = round(duration_hours, 1) + display_name = (job.get("displayName") or "").strip() - # Hardware: parse per-pool specs for accurate cost aggregation. - # Done before the duration filter so expensive_hourly_threshold can be evaluated. - # CustomJob exposes workerPoolSpecs directly. TrainingPipeline may embed - # them in trainingTaskInputs (works for custom-training pipelines) or may - # not expose them at all (AutoML, managed job types). + # --- Hardware classification (spec 8) --- if job_type == "customJob": raw_worker_specs = job.get("jobSpec", {}).get("workerPoolSpecs", []) pools = _parse_worker_pools(raw_worker_specs) - hardware_unknown = False + # spec 8.1: missing, empty, or all-malformed workerPoolSpecs → hardware_unknown + hardware_unknown = not pools else: task_inputs = job.get("trainingTaskInputs") or {} - # The field is occasionally returned as a JSON string rather than a parsed dict if isinstance(task_inputs, str): try: task_inputs = json.loads(task_inputs) @@ -470,95 +396,37 @@ def find_long_running_vertex_training_jobs( pools = _parse_worker_pools(raw_worker_specs) hardware_unknown = not pools - # Accelerator detection: derived from actual hardware data only. 
- # hardware_unknown does not imply GPU — it only triggers duration-tiered fallback - # cost, keeping risk conservatively MEDIUM. is_accelerator = _has_accelerator_hardware(pools) - # Cost: sum per-pool cost × replica_count across ALL pools. - # This correctly handles heterogeneous clusters (different machine types per pool). - # For TPU jobs: each "replica" in the pool tuple is a physical host (derived from - # tpuTopology), so _total_hourly_rate correctly prices host_count × per-host cost. - if pools: - total_replicas = sum(r for _, _, _, r in pools) or 1 - primary_machine = pools[0][0] - primary_accel = pools[0][1] - primary_accel_count = pools[0][2] - hourly_rate_total = _total_hourly_rate(pools) - # Capture TPU topology for label — present in raw spec but not in pool tuple - primary_tpu_topology: Optional[str] = None - if raw_worker_specs and primary_machine and _is_tpu_machine(primary_machine): - primary_tpu_topology = ( - raw_worker_specs[0].get("machineSpec", {}).get("tpuTopology") or None - ) - else: - total_replicas = 1 - primary_machine = None - primary_accel = None - primary_accel_count = 0 - primary_tpu_topology = None - # Duration-scaled fallback: longer jobs are more likely to be GPU-class pipelines. - # Tiers: >24h → $20/hr (probable multi-GPU), >6h → $5/hr (ambiguous), else → $1/hr. - # Still conservative — large GPU pipelines can cost $50–$500+/hr. - if duration_hours > 24: - hourly_rate_total = 20.0 - elif duration_hours > 6: - hourly_rate_total = 5.0 - else: - hourly_rate_total = 1.0 - - # Early-exit: skip all jobs below early_warning_fraction of the threshold. - # The early-warning band (fraction–100%) is evaluated in the confidence block below. - if duration_hours < long_running_hours * early_warning_fraction: - continue - - # Raw values — no intermediate rounding; format inline, round once for storage. - # accrued_raw is the true computed cost and is stored in details unchanged. 
- # accrued_display is capped at $1M to avoid distorting summaries with stale-table - # outliers, but the raw value is always preserved for analysis. - duration_display = round(duration_hours, 1) - accrued_raw = hourly_rate_total * duration_hours - if accrued_raw > 1_000_000: - warnings.warn( - f"gcp.vertex.training_job.long_running: accrued cost estimate " - f"${accrued_raw:,.0f} exceeds $1M — cost table may be stale or topology " - f"unusually large; capping display at $1,000,000", - stacklevel=2, - ) - accrued_display = min(accrued_raw, 1_000_000.0) - overrun_hours = max(0.0, duration_hours - long_running_hours) - - # Confidence - if duration_hours >= long_running_hours * runaway_multiplier: + # --- Confidence (spec 9.2) --- + if elapsed_seconds >= _RUNAWAY_MULTIPLIER * threshold_seconds: confidence = ConfidenceLevel.HIGH - elif duration_hours >= long_running_hours: - confidence = ConfidenceLevel.MEDIUM else: - # early_warning_fraction–100% of threshold: fire early for accelerators or - # expensive CPU clusters. The replica cap (≤50) suppresses early warnings for - # very large CPU-only clusters that are likely intentional distributed workloads - # (e.g. 200-node Spark/Beam jobs); accelerators are never gated by replica count. 
- expensive_cpu = hourly_rate_total > expensive_hourly_threshold and total_replicas <= 50 - if is_accelerator or expensive_cpu: - confidence = ConfidenceLevel.MEDIUM - else: - continue + confidence = ConfidenceLevel.MEDIUM - # Risk model: - # HIGH confidence + accelerator (GPU/TPU) → CRITICAL - # HIGH confidence + CPU or unknown hw → HIGH - # (unknown hardware + runaway lands here via is_accelerator=False — suspicious enough - # to warrant HIGH without an actual accelerator spec; avoids false CRITICAL) - # MEDIUM confidence → MEDIUM + # --- Risk (spec 9.3) --- if confidence == ConfidenceLevel.HIGH: risk = RiskLevel.CRITICAL if is_accelerator else RiskLevel.HIGH else: risk = RiskLevel.MEDIUM - # Human-readable job label + # --- Finding construction --- job_id = name.rsplit("/", 1)[-1] if name else "" label = display_name or job_id + if pools: + total_replicas = sum(pool[3] for pool in pools) # each pool[3] >= 1 + primary_machine = pools[0][0] + primary_accel = pools[0][1] + primary_accel_count = pools[0][2] + primary_tpu_topology: Optional[str] = pools[0][4] # stored during parsing + else: + total_replicas = 1 + primary_machine = None + primary_accel = None + primary_accel_count = 0 + primary_tpu_topology = None + hardware_label = _hardware_label( primary_machine, primary_accel, @@ -567,10 +435,12 @@ def find_long_running_vertex_training_jobs( tpu_topology=primary_tpu_topology, ) + state = actual_state # already validated == expected enum for this job_type + overrun_hours = max(0.0, duration_hours - long_running_hours_threshold) threshold_detail = ( - f"exceeded by {math.floor(overrun_hours)}h" + f"exceeded by {int(overrun_hours)}h" if overrun_hours > 0 - else f"{round(long_running_hours - duration_hours, 1)}h below threshold (early warning)" + else f"{round(long_running_hours_threshold - duration_hours, 1)}h below threshold" ) title = ( @@ -579,82 +449,63 @@ def find_long_running_vertex_training_jobs( + ")" ) - primary_bundled = _is_bundled_machine(primary_machine) 
signals = [ - f"Job status: RUNNING for {duration_display}h " - f"(threshold: {long_running_hours}h, {threshold_detail})", - ( - f"Burn rate: ~${hourly_rate_total:.2f}/hr across {total_replicas} workers" - if total_replicas > 1 - else f"Burn rate: ~${hourly_rate_total:.2f}/hr" - ), + f"Job status: {state} for {duration_display}h " + f"(threshold: {long_running_hours_threshold}h, {threshold_detail})", ] if hardware_label: - signals.append( - f"Hardware: {hardware_label}" - + (" (GPU/accelerator)" if is_accelerator and not primary_bundled else "") - ) + signals.append(f"Hardware: {hardware_label}") if total_replicas > 1: signals.append( f"Distributed training ({total_replicas} workers) — " - f"long durations may be expected for large-scale jobs" + "long durations may be expected for large-scale jobs" ) - signals.append( - f"Accrued cost: ~${accrued_display:,.2f} " - f"(${hourly_rate_total:.2f}/hr × {duration_display}h elapsed, " - f"us-central1 on-demand — actual cost varies by region and committed use discounts)" - ) if hardware_unknown: - signals.append( - f"TrainingPipeline: hardware spec not exposed in API response — " - f"cost estimate uses duration-scaled placeholder (~${hourly_rate_total:.2f}/hr); " - "actual cost varies widely: ~$0.20–$1/hr for small CPU pipelines, " - "$50–$100+/hr for large accelerator jobs" - ) + signals.append("Hardware spec not structurally exposed in API response") not_checked = [ "Intentional long-running distributed training (LLM pre-training, large fine-tunes)", "Checkpoint saving — job may be making progress without visible status updates", - "Committed use discounts — actual cost may be significantly lower than on-demand estimate", + "Committed use discounts — actual cost may be significantly lower than on-demand", "Preemptible/Spot workers — cost and interruption semantics differ", ] - evidence = Evidence( - signals_used=signals, - signals_not_checked=not_checked, - time_window=f"{duration_display}h", - ) - findings.append( 
Finding( provider="gcp", rule_id="gcp.vertex.training_job.long_running", resource_type="gcp.vertex.training_job", - resource_id=name or job_id, + resource_id=name, region=location, title=title, summary=( - f"Vertex AI {job_type} '{label}' has been RUNNING for {duration_display}h" + f"Vertex AI {job_type} '{label}' has been {state} for {duration_display}h" + (f" ({hardware_label})" if hardware_label else "") - + f", accruing ~${accrued_display:,.2f} so far." - + f" Most training jobs complete well under {long_running_hours} hours unless intentionally long-running." + + f". Most training jobs complete well under " + f"{long_running_hours_threshold}h unless intentionally long-running." ), reason=( - f"Job has been RUNNING for {duration_display}h " - f"(threshold: {long_running_hours}h)" + f"Job has been {state} for {duration_display}h " + f"(threshold: {long_running_hours_threshold}h)" ), risk=risk, confidence=confidence, detected_at=now, - evidence=evidence, - # Training jobs are transient — setting estimated_monthly_cost_usd would - # corrupt monthly savings totals. Accrued cost lives in details only. 
- estimated_monthly_cost_usd=None, + evidence=Evidence( + signals_used=signals, + signals_not_checked=not_checked, + time_window=f"{duration_display}h", + ), + estimated_monthly_cost_usd=None, # spec 10.1: transient resource details={ "job_name": name, "display_name": display_name or None, "job_type": job_type, + "state": state, "location": location, + "start_time": start_str, + "duration_hours": round(duration_hours, 2), + "long_running_hours_threshold": long_running_hours_threshold, "machine_type": primary_machine or None, "accelerator_type": primary_accel or None, "accelerator_count": (primary_accel_count if primary_accel_count else None), @@ -662,30 +513,6 @@ def find_long_running_vertex_training_jobs( "total_workers": total_replicas, "is_accelerator": is_accelerator, "hardware_unknown": hardware_unknown, - "duration_hours": round(duration_hours, 2), - "long_running_hours_threshold": long_running_hours, - "burn_rate_per_hour": hourly_rate_total, - "overrun_hours": overrun_hours, - "accrued_cost_usd": accrued_raw, - "cost_type": "accrued_to_date", - "pricing_source": ( - "conservative_pipeline_default" - if hardware_unknown - else "static_estimate_us_central1" - ), - "pricing_confidence": ( - "pipeline_default" if hardware_unknown else _pricing_confidence(pools) - ), - "pricing_scope": "us-central1_reference", - "pricing_note": ( - f"Cost estimated using us-central1 on-demand baseline; " - f"actual job is in {location}" - + ( - " — pricing is likely similar" - if location.startswith("us-") - else " — regional pricing may differ significantly" - ) - ), }, ) ) @@ -693,7 +520,8 @@ def find_long_running_vertex_training_jobs( if skipped_jobs > 0: warnings.warn( f"gcp.vertex.training_job.long_running: {skipped_jobs} job(s) skipped " - f"due to missing or unparseable timestamps — findings may be incomplete", + "due to malformed resource name, unexpected state, or unusable startTime " + "— findings may be incomplete", stacklevel=2, ) @@ -728,11 +556,24 @@ def 
_paginate(url: str) -> Optional[list]: raise PermissionError( f"aiplatform.{resource}.list permission required " f"(roles/aiplatform.viewer)" ) - if resp.status_code == 404: - return [] - if resp.status_code == 400: - return None # signal caller to try fallback - resp.raise_for_status() + if not resp.ok: + if results: + # Later-page failure: keep earlier pages, warn (spec 11.3). + # Treat identically to a non-permission surface failure so the + # caller can decide whether to continue with the other surface. + warnings.warn( + f"gcp.vertex.training_job.long_running: {resource} pagination " + f"failed mid-scan (HTTP {resp.status_code}) — " + "partial page results kept; findings may be incomplete", + stacklevel=4, + ) + return results + # First-page failures: + if resp.status_code == 404: + return [] # API not enabled — not an error + if resp.status_code == 400: + return None # wildcard unsupported — signal caller for fallback + resp.raise_for_status() # propagate other first-page errors data = resp.json() results.extend(data.get(resource, [])) next_token = data.get("nextPageToken") @@ -741,8 +582,6 @@ def _paginate(url: str) -> Optional[list]: params["pageToken"] = next_token return results - # Fast path: wildcard covers all regions in one paginated sequence. - # Skip if we already know this project+resource combination doesn't support it. 
cache_key = (project_id, resource) if cache_key not in _wildcard_unsupported: result = _paginate(f"{base_url}/-/{resource}") @@ -750,7 +589,6 @@ def _paginate(url: str) -> Optional[list]: return result _wildcard_unsupported.add(cache_key) - # Fallback: per-location queries all_jobs: list = [] seen: set = set() for location in _VERTEX_LOCATIONS: @@ -765,16 +603,6 @@ def _paginate(url: str) -> Optional[list]: return all_jobs -def _parse_location(name: str) -> str: - """Extract location from resource name: projects/{p}/locations/{loc}/.../{id}""" - parts = name.split("/") - try: - idx = parts.index("locations") - return parts[idx + 1] - except (ValueError, IndexError): - return "" - - def _tpu_topology_host_count(machine_type: str, topology: str) -> int: """ Compute the number of physical TPU hosts implied by tpuTopology. @@ -787,7 +615,7 @@ def _tpu_topology_host_count(machine_type: str, topology: str) -> int: chips_per_host = _BUNDLED_ACCELERATOR_COUNT[machine_type] hosts = max(1, total_chips // chips_per_host) - Returns 0 when topology is empty or unparseable — callers fall back to replicaCount. + Returns 0 when topology is empty or unparseable -- callers fall back to replicaCount. """ if not topology: return 0 @@ -806,7 +634,6 @@ def _tpu_topology_host_count(machine_type: str, topology: str) -> int: if chips_per_host <= 0: # Fallback: parse the -Nt suffix common to all Cloud TPU machine names. # e.g. "tpu7x-standard-4t" → suffix "4t" → 4 chips/host. - # This handles future variants automatically without requiring a table entry. 
suffix = (machine_type or "").rsplit("-", 1)[-1] if suffix.endswith("t") and suffix[:-1].isdigit(): chips_per_host = int(suffix[:-1]) @@ -822,38 +649,55 @@ def _tpu_topology_host_count(machine_type: str, topology: str) -> int: def _parse_worker_pools( worker_pool_specs: list, -) -> List[Tuple[Optional[str], Optional[str], int, int]]: +) -> List[Tuple[Optional[str], Optional[str], int, int, Optional[str]]]: """ Parse per-pool hardware specs from a CustomJob or TrainingPipeline. - Returns a list of (machine_type, accel_type, accel_count, replica_count) tuples, - one per pool. The first element is the primary (chief) pool. + Returns a list of (machine_type, accel_type, accel_count, replica_count, tpu_topology) + tuples, one per pool. The first element is the primary (chief) pool. + + Returns [] when no specs are provided, or when all entries are malformed. - Returns [] when no specs are provided; callers should apply defaults in that case. - Cost must be summed across all pools — do not use primary pool × total_replicas, - as secondary pools often have different (and more expensive) machine types. + Per spec 8.1 and 8.2: `machineType` is required in a pool entry for it to be + structurally valid. Entries missing `machineType`, or entries that cannot be + parsed due to type errors, are silently skipped rather than making the whole + resource ineligible. TPU topology: for TPU machine types (ct5lp-*, ct6e-*, tpu7x-*, etc.), replicaCount is always 1 in the API even for multi-host pods. tpuTopology encodes the actual - chip grid; this function replaces replicaCount with the derived host count so that - _total_hourly_rate() correctly prices the whole pod. + chip grid; this function replaces replicaCount with the derived host count. + tpu_topology is stored in the tuple so callers never need to re-index into the + original raw specs list (which may have different indices after malformed entries + are filtered). 
""" pools = [] for pool in worker_pool_specs: - machine_spec = pool.get("machineSpec", {}) - replicas = max(1, int(pool.get("replicaCount", 1))) - machine = machine_spec.get("machineType") or None - accel = machine_spec.get("acceleratorType") or None - count = int(machine_spec.get("acceleratorCount", 0)) - - # For TPU machines replicaCount is always 1; derive real host count from topology. - if machine and _is_tpu_machine(machine): - topology = machine_spec.get("tpuTopology") or "" - host_count = _tpu_topology_host_count(machine, topology) - if host_count > 0: - replicas = host_count - - pools.append((machine, accel, count, replicas)) + try: + if not isinstance(pool, dict): + continue + machine_spec = pool.get("machineSpec") or {} + if not isinstance(machine_spec, dict): + continue + # machineType is required for a structurally valid pool (spec 8.1, 8.2) + machine = (machine_spec.get("machineType") or "").strip() or None + if not machine: + continue + replicas = max(1, int(pool.get("replicaCount") or 1)) + accel = (machine_spec.get("acceleratorType") or "").strip() or None + count = int(machine_spec.get("acceleratorCount") or 0) + + tpu_topo: Optional[str] = None + if _is_tpu_machine(machine): + tpu_topo = (machine_spec.get("tpuTopology") or "").strip() or None + if tpu_topo: + host_count = _tpu_topology_host_count(machine, tpu_topo) + if host_count > 0: + replicas = host_count + + pools.append((machine, accel, count, replicas, tpu_topo)) + except (TypeError, ValueError): + # Malformed pool entry: skip for hardware classification (spec 8.1, 8.2) + continue return pools @@ -878,113 +722,31 @@ def _is_bundled_machine(machine_type: Optional[str]) -> bool: Return True if the machine type has accelerator cost bundled (no separate add-on). Covers GPU machine families (a2-*, a3-*, a4-*, a4x-*, g2-*, g4-*) and - Cloud TPU machine types (ct5lp-*, ct5p-*, ct6e-*, etc.) that expose TPU - via machineType + tpuTopology rather than acceleratorType. 
+ Cloud TPU machine types that expose TPU via machineType + tpuTopology. """ m = machine_type or "" return m.startswith(_BUNDLED_GPU_PREFIXES) or _is_tpu_machine(machine_type) def _has_accelerator_hardware( - pools: List[Tuple[Optional[str], Optional[str], int, int]], + pools: List[Tuple[Optional[str], Optional[str], int, int, Optional[str]]], ) -> bool: """ Return True if any worker pool uses GPU or TPU accelerator hardware. - Detects accelerators via two structured paths: - - Explicit accelerator type in _ACCELERATOR_TYPES (GPU families and TPU pods via add-on) - - _is_bundled_machine(m): covers _BUNDLED_GPU_PREFIXES (a2-*, a3-*, a4-*, a4x-*, g2-*, g4-*) - and _is_tpu_machine() (ct4-*/ct5*/ct6*/ct7*/tpu*) + Two independent detection paths (spec 8.1): + - Explicit path: acceleratorType is a recognized enum AND acceleratorCount > 0 + - Bundled path: machine type is in a GPU or TPU family (_is_bundled_machine) + acceleratorType alone with count == 0 does NOT classify a pool as accelerated. Empty pools → False. Unknown hardware does NOT imply accelerated workload. - Relies on structured prefix lists only — no substring matching. """ return any( - (a or "").upper() in _ACCELERATOR_TYPES or _is_bundled_machine(m) for m, a, c, r in pools + ((a or "").upper() in _ACCELERATOR_TYPES and c > 0) or _is_bundled_machine(m) + for m, a, c, r, *_ in pools ) -def _estimate_hourly_rate_per_replica( - machine_type: Optional[str], - accel_type: Optional[str], - accel_count: int, -) -> float: - """ - Estimate hourly cost for a single replica (one worker node). - - Bundled families (a2-*, a3-*, a4-*, a4x-*, g2-*, g4-*, ct*/tpu7x-*) include accelerator - cost in the machine price. n1-*/n2-*/c2-* add accelerator cost separately. - - Co-scheduling (bundled machines only): when accel_count <= N//2 (where N is the machine's - full accelerator count from _BUNDLED_ACCELERATOR_COUNT), Vertex AI may place - floor(N/accel_count) replicas onto one VM. 
In that case each replica shares the machine - cost proportionally — machine_hourly is divided by replicas_per_vm. When accel_count is 0 - or unknown, the full machine price is charged conservatively. - """ - # For unrecognized TPU machine types use the TPU-specific default to avoid the - # generic $150/mo fallback massively underestimating a real TPU job. - _mt = machine_type or "" - if _mt in _MACHINE_MONTHLY_COST: - machine_monthly = _MACHINE_MONTHLY_COST[_mt] - elif _is_tpu_machine(_mt) or "tpu" in _mt.lower(): - # Second condition is a defensive catch for future TPU naming patterns that - # _is_tpu_machine() might miss — avoids silent 70× underestimate vs generic $150/mo. - machine_monthly = _DEFAULT_TPU_MONTHLY_COST - else: - machine_monthly = _DEFAULT_MACHINE_MONTHLY_COST - machine_hourly = machine_monthly / _HOURS_PER_MONTH - - # Co-scheduling correction for bundled machines. - # Only applies when accel_count divides machine_gpu_count evenly (clean partition) - # and accel_count <= machine_gpu_count (requesting more GPUs than exist is invalid). - if _is_bundled_machine(machine_type) and accel_count >= 1: - machine_gpu_count = _BUNDLED_ACCELERATOR_COUNT.get(machine_type or "", 0) - if ( - machine_gpu_count > 0 - and accel_count <= machine_gpu_count - and machine_gpu_count % accel_count == 0 - ): - replicas_per_vm = max(1, machine_gpu_count // accel_count) - machine_hourly = machine_hourly / replicas_per_vm - - accelerator_hourly = 0.0 - if accel_type and accel_type in _ACCELERATOR_MONTHLY_COST_EACH: - if not _is_bundled_machine(machine_type): - accelerator_hourly = ( - _ACCELERATOR_MONTHLY_COST_EACH[accel_type] / _HOURS_PER_MONTH - ) * max(accel_count, 1) - - return machine_hourly + accelerator_hourly - - -def _pricing_confidence( - pools: List[Tuple[Optional[str], Optional[str], int, int]], -) -> str: - """ - Return "published" if all machine types and accelerators in the pool list have - published GCP pricing, otherwise "partial_estimate". 
- """ - for m, a, _c, _r in pools: - mt = m or "" - if mt.startswith(_PRICING_ESTIMATED_MACHINE_PREFIXES): - return "partial_estimate" - if (a or "").upper() in _PRICING_ESTIMATED_ACCEL_TYPES: - return "partial_estimate" - return "published" - - -def _total_hourly_rate( - pools: List[Tuple[Optional[str], Optional[str], int, int]], -) -> float: - """ - Sum hourly burn rate across all worker pools. - - Each pool contributes _estimate_hourly_rate_per_replica × replica_count. - Correctly handles heterogeneous jobs (different machine types per pool). - """ - return sum(_estimate_hourly_rate_per_replica(m, a, c) * r for m, a, c, r in pools) - - def _hardware_label( machine_type: Optional[str], accel_type: Optional[str], @@ -992,19 +754,14 @@ def _hardware_label( total_replicas: int, tpu_topology: Optional[str] = None, ) -> str: - """Build a compact hardware label for title/summary. - - For TPU machines, tpu_topology (e.g. "2x4") is appended when non-empty - because the machine name alone (e.g. "ct5lp-hightpu-8t") does not convey - the full chip grid or host count. 
- """ + """Build a compact hardware label for title/summary.""" parts = [] if machine_type: label = machine_type if tpu_topology and _is_tpu_machine(machine_type): label = f"{machine_type} [{tpu_topology}]" parts.append(label) - if accel_type and accel_type != "ACCELERATOR_TYPE_UNSPECIFIED": + if accel_type and accel_type != "ACCELERATOR_TYPE_UNSPECIFIED" and accel_count > 0: count_str = f"{accel_count}×" if accel_count > 1 else "" parts.append(f"{count_str}{accel_type}") if total_replicas > 1: diff --git a/docs/rules/gcp.md b/docs/rules/gcp.md index 2e0ae3d..6360e10 100644 --- a/docs/rules/gcp.md +++ b/docs/rules/gcp.md @@ -194,17 +194,24 @@ **Spec:** — #### `gcp.vertex.training_job.long_running` -**Detects:** Vertex AI CustomJobs and TrainingPipelines in `RUNNING` state beyond `long_running_hours_threshold`; GPU/TPU jobs near threshold also trigger early-warning findings +**Detects:** Vertex AI CustomJobs and TrainingPipelines whose state is exactly the expected running state (`JOB_STATE_RUNNING` / `PIPELINE_STATE_RUNNING`) and whose elapsed wall-clock time since `startTime` meets or exceeds `long_running_hours_threshold` -**Confidence / Risk:** HIGH (duration ≥ 3× threshold — clearly runaway); MEDIUM (duration ≥ threshold) / CRITICAL (HIGH confidence + GPU/accelerator); HIGH (HIGH confidence + non-GPU); MEDIUM (all MEDIUM confidence) +**Confidence / Risk:** HIGH (duration ≥ 3× threshold — clearly runaway); MEDIUM (duration ≥ threshold) / CRITICAL (HIGH confidence + GPU/TPU/accelerator); HIGH (HIGH confidence + non-accelerator); MEDIUM (all MEDIUM confidence) + +**Cost:** `estimated_monthly_cost_usd = None` — training jobs are transient; no static per-hour rate is appropriate across machine types and regions **Permissions:** `aiplatform.customJobs.list`, `aiplatform.trainingPipelines.list` (roles/aiplatform.viewer) -**Params:** `long_running_hours_threshold` (default: 24); `expensive_hourly_threshold` (default: $20/hr, for early-warning CPU jobs) +**Params:** 
`long_running_hours_threshold` (default: 24) -**Exclusions:** jobs < 90% of threshold; cheap CPU-only jobs in the 90–100% early-warning zone +**Exclusions:** +- resource name not matching exact pattern `projects/{p}/locations/{l}/customJobs/{id}` or `trainingPipelines/{id}` (6 segments, non-empty components) +- state field absent or not exactly the expected running state for the job type +- `startTime` absent, non-RFC3339 (rejects space separator, date-only, missing timezone), unparsable, or in the future +- elapsed < `long_running_hours_threshold` +- region filter set and derived location does not exactly match -**Spec:** — +**Spec:** [docs/specs/gcp/ai/vertex_training_job_long_running.md](../specs/gcp/ai/vertex_training_job_long_running.md) #### `gcp.tpu.idle` **Detects:** Standalone Cloud TPU nodes in exact `READY` state where complete worker-joined duty-cycle telemetry (`tpu.googleapis.com/accelerator/duty_cycle` on `tpu.googleapis.com/GceTpuWorker`) confirms max observed duty cycle <= 2% across all joined workers and accelerators over the full buffered `idle_days` window; monitoring is required — no age-only, partial-join, or cadence-assumed fallback diff --git a/docs/specs/gcp/ai/vertex_training_job_long_running.md b/docs/specs/gcp/ai/vertex_training_job_long_running.md new file mode 100644 index 0000000..dff671e --- /dev/null +++ b/docs/specs/gcp/ai/vertex_training_job_long_running.md @@ -0,0 +1,508 @@ +# GCP Rule Spec - `gcp.vertex.training_job.long_running` + +## 1. Rule Identity + +- **Rule ID:** `gcp.vertex.training_job.long_running` +- **Provider:** GCP +- **Resource type:** Vertex AI training job +- **Finding resource_type:** `gcp.vertex.training_job` + +--- + +## 2. Intent + +Detect **Vertex AI training resources that are provably still in an exact documented running state** and whose documented `startTime` shows they have been running for at least a conservative review threshold. + +This rule is deliberately **precision-first**.
It is a **review-candidate** rule only. It is **not** proof that a job is hung, **not** proof that no useful progress is occurring, **not** proof that the resource is safe to cancel, and **not** proof of a specific monthly dollar saving. + +### 2.1 Canonical definitions + +| Term | Definition | +|---|---| +| Vertex training job | Either a Vertex AI `CustomJob` or a Vertex AI `TrainingPipeline` | +| running custom job | A `CustomJob` whose `state` is exactly `JOB_STATE_RUNNING` | +| running training pipeline | A `TrainingPipeline` whose `state` is exactly `PIPELINE_STATE_RUNNING` | +| scan clock | Single `now_utc` instant captured once per scan run and reused for all resources | +| runtime anchor | The documented `startTime` field of the resource | +| elapsed runtime hours | `(now_utc - start_time_utc)` expressed in hours | +| elapsed runtime seconds | `(now_utc - start_time_utc)` expressed in seconds | +| long-running threshold hours | Configured review threshold for this rule (`long_running_hours_threshold`); default `24` hours | +| accelerator-backed job | A job whose documented worker-pool machine spec explicitly shows accelerator hardware | +| hardware unknown | A job for which the control-plane response does not expose enough documented machine-spec data to classify hardware | + +--- + +## 3. GCP Documentation Grounding + +### 3.1 CustomJob is the canonical Vertex AI resource for custom training workloads + +Google documents `CustomJob` as a resource that runs custom workloads such as a Docker container or a Python package. Google also documents: + +1. `jobSpec` +2. `state` +3. `createTime` +4. `startTime` +5. `endTime` +6. `updateTime` + +Google explicitly defines `CustomJob.startTime` as the time when the `CustomJob` **for the first time entered** `JOB_STATE_RUNNING`. + +Source: + +- *REST Resource: projects.locations.customJobs* + +URL: + +- https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.customJobs + +Rule consequence: + +1. 
`CustomJob` is an in-scope training resource for this rule. +2. `startTime` is the canonical runtime anchor for `CustomJob`. +3. `createTime` is **not** the canonical runtime anchor for a running job. + +### 3.2 TrainingPipeline is also an in-scope training resource, but it is an orchestrator + +Google documents `TrainingPipeline` as a resource that **orchestrates tasks associated with training a Model** and **always executes the training task**, while it may also export dataset data, upload the model, and evaluate the model. + +Google also documents: + +1. `trainingTaskDefinition` +2. `trainingTaskInputs` +3. `trainingTaskMetadata` +4. `state` +5. `createTime` +6. `startTime` +7. `endTime` +8. `updateTime` + +Google explicitly defines `TrainingPipeline.startTime` as the time when the pipeline **for the first time entered** `PIPELINE_STATE_RUNNING`. + +Google also documents that `trainingTaskMetadata` is populated only on a **best effort basis** while the pipeline is running. + +Source: + +- *REST Resource: projects.locations.trainingPipelines* + +URL: + +- https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.trainingPipelines + +Rule consequence: + +1. `TrainingPipeline` is an in-scope training resource for this rule. +2. `startTime` is the canonical runtime anchor for `TrainingPipeline`. +3. `trainingTaskMetadata` must not be treated as canonical proof of runtime, progress, or hardware shape. +4. A `TrainingPipeline` finding remains review-candidate only because the resource may also be orchestrating non-training auxiliary tasks. + +### 3.3 Exact running-state enums are documented + +Google documents: + +1. `JOB_STATE_RUNNING` means **the job is in progress** +2. `PIPELINE_STATE_RUNNING` means **the pipeline is in progress** +3. 
queued, pending, updating, paused, cancelling, cancelled, failed, and succeeded are distinct states + +Sources: + +- *JobState* +- *PipelineState* + +URLs: + +- https://cloud.google.com/vertex-ai/docs/reference/rest/v1/JobState +- https://cloud.google.com/vertex-ai/docs/reference/rest/v1/PipelineState + +Rule consequence: + +1. Eligibility must require exact documented running states only. +2. The rule must not treat queued, pending, paused, updating, cancelling, cancelled, failed, or succeeded resources as running. + +### 3.4 Worker-pool machine shape is documented for CustomJob + +Google documents `CustomJobSpec.workerPoolSpecs` and `WorkerPoolSpec.machineSpec`. + +Google documents on these surfaces: + +1. `workerPoolSpecs` +2. `replicaCount` +3. `machineSpec.machineType` +4. `machineSpec.acceleratorType` +5. `machineSpec.acceleratorCount` +6. `machineSpec.tpuTopology` + +Sources: + +- *CustomJobSpec* +- *MachineSpec* + +URLs: + +- https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec +- https://cloud.google.com/vertex-ai/docs/reference/rest/v1/MachineSpec + +Rule consequence: + +1. CustomJob hardware classification may be based on documented worker-pool machine-spec fields. +2. TPU-backed training may be identified from documented machine-spec fields such as TPU machine types and `tpuTopology`. +3. Hardware evidence must come from documented structured machine-spec fields, not from name heuristics outside those documented surfaces. + +### 3.5 TrainingPipeline hardware exposure is task-definition dependent + +Google documents: + +1. `trainingTaskDefinition` points to the YAML definition of the training task +2. `trainingTaskInputs` contains the training task parameters **as specified by that definition** + +Source: + +- *REST Resource: projects.locations.trainingPipelines* + +URL: + +- https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.trainingPipelines + +Rule consequence: + +1.
Hardware classification for `TrainingPipeline` is optional and definition-dependent. +2. If the pipeline response does not expose documented worker-pool machine-spec fields through `trainingTaskInputs`, hardware must remain unknown. +3. The rule must not guess GPU, TPU, machine type, or replica count for `TrainingPipeline` resources whose task inputs do not expose those fields. + +### 3.6 Vertex AI training pricing is usage-based and configuration-specific + +Google documents that: + +1. for custom-trained models, training prices depend on the selected machine types +2. if Compute Engine machine types have attached accelerators, accelerator cost is separate unless included in the machine type +3. pricing varies by region +4. reservations, committed use discounts, and Spot usage can change effective cost +5. there is **no minimum usage duration** for training and prediction; usage is charged in **30 second increments** + +Source: + +- *Vertex AI pricing* + +URL: + +- https://cloud.google.com/vertex-ai/pricing + +Rule consequence: + +1. Long-running training is a valid cost-review candidate because training compute is usage-billed while it runs. +2. Static hardcoded pricing tables are not canonical rule logic. +3. `estimated_monthly_cost_usd` must remain `None` because training jobs are transient, not recurring monthly resources. +4. The rule must not rely on region-agnostic or stale price heuristics for eligibility. + +### 3.7 Vertex AI locations are regional, not global + +Google documents that Vertex AI does not support a global location and uses regional resource names and regional service endpoints. + +Source: + +- *Vertex AI locations* + +URL: + +- https://cloud.google.com/vertex-ai/docs/general/locations + +Rule consequence: + +1. Location must be derived from the resource name. +2. Region filters must compare against exact regional location values. + +--- + +## 4. Detection Goal + +Emit a finding only when **all** of the following are true: + +1. 
the resource is a documented in-scope Vertex AI training resource (`CustomJob` or `TrainingPipeline`) +2. the resource is in an exact documented running state +3. the resource has a valid, parseable, non-future `startTime` +4. the derived elapsed runtime is at least `long_running_hours_threshold` + +If any required signal cannot be established reliably, skip rather than emit. + +--- + +## 5. Non-Goals + +This rule does **not** attempt to prove: + +- that the training job is hung or deadlocked +- that the training job is abandoned or forgotten +- that the job is safe to cancel +- that no checkpointing or useful progress is occurring +- that the job is definitely expensive +- that a specific monthly saving exists + +--- + +## 6. Canonical Inputs + +### 6.1 Required surfaces + +The implementation may use the following documented APIs: + +1. `projects.locations.customJobs.list` +2. `projects.locations.trainingPipelines.list` + +Relevant list-filter capability documented by Google: + +1. CustomJobs support filtering by `state` +2. TrainingPipelines support filtering by `state` +3. paginated results must be exhausted using `nextPageToken` + +Sources: + +- https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.customJobs/list +- https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.trainingPipelines/list + +### 6.2 Required per-resource fields + +| Resource type | Required fields | +|---|---| +| `CustomJob` | `name`, `state`, `startTime` | +| `TrainingPipeline` | `name`, `state`, `startTime` | + +### 6.3 Optional context fields + +These may enrich the finding when present, but are not required for eligibility: + +- `displayName` +- `jobSpec.workerPoolSpecs` on `CustomJob` +- `trainingTaskDefinition` on `TrainingPipeline` +- `trainingTaskInputs` on `TrainingPipeline` +- `labels` + +--- + +## 7. 
Canonical normalization rules + +Normalize the following values: + +| Field | Canonical rule | +|---|---| +| `resource_name` | Must exactly match one of these forms: `projects/{project}/locations/{location}/customJobs/{id}` or `projects/{project}/locations/{location}/trainingPipelines/{id}`. Otherwise skip. | +| `location` | Parse from the exact `locations/{location}` segment of the resource name. Region-filter comparison must use exact string equality only, with no aliasing or case folding. | +| `state` | Compare exactly to the documented running enum for the resource type, case-sensitive and with no normalization. Null or empty values skip. | +| `start_time_utc` | Parse documented RFC3339 `startTime` into timezone-aware UTC. Valid RFC3339 timestamps, including fractional seconds and either `Z` or explicit offsets, must be accepted. Any other format is invalid. Missing, unparsable, or future values skip. No fallback parsing is allowed. | +| `elapsed_runtime_seconds` | Compute from a single per-run `now_utc - start_time_utc`. Do not round for eligibility decisions. | +| `elapsed_runtime_hours` | Derived display/context form of elapsed runtime. It must not be the canonical comparison unit. | + +Important: + +1. `createTime` is context only; it must **not** replace `startTime` as the runtime anchor. +2. `updateTime` is context only; it must **not** replace `startTime` as the runtime anchor. +3. `endTime` is not relevant for resources still in running state. +4. `now_utc` must be captured once per scan run in UTC and reused for all resources in that run. +5. `now_utc` must not be recomputed, shifted, or otherwise adjusted mid-scan. + +--- + +## 8. Hardware evidence rules + +### 8.1 CustomJob hardware classification + +For `CustomJob`, hardware may be classified from documented `jobSpec.workerPoolSpecs[].machineSpec` fields: + +1. `machineType` +2. `acceleratorType` +3. `acceleratorCount` +4. `tpuTopology` +5. 
`replicaCount` + +If `workerPoolSpecs` is missing, empty, or all entries are structurally invalid, the job must remain eligible on duration/state grounds but `hardware_unknown = true`. + +A pool entry is structurally valid only when `machineType` is present and non-empty. Pool entries without `machineType` are treated as malformed and must be skipped for hardware classification. + +If some worker-pool entries are partially malformed, those invalid pools should be ignored for hardware classification rather than making the whole job ineligible. Hardware remains based only on structurally valid documented pools. + +A `CustomJob` is accelerator-backed when **any** worker pool explicitly shows any of the following: + +1. `acceleratorType` is a recognized documented enum value **and** `acceleratorCount > 0`, or +2. `machineType` is in a documented bundled-GPU machine family (e.g. `a2-*`, `a3-*`, `a4-*`, `a4x-*`, `g2-*`, `g4-*`) where the accelerator hardware is part of the machine type and no separate `acceleratorType` is required, or +3. `machineType` is in a documented Cloud TPU machine family (e.g. `ct4-*`, `ct5*`, `ct6*`, `tpu7x-*`) + +`acceleratorType` alone with `acceleratorCount == 0` does **not** classify a pool as accelerator-backed. + +### 8.2 TrainingPipeline hardware classification + +For `TrainingPipeline`, hardware may be classified only when the response structurally exposes documented worker-pool machine-spec fields through `trainingTaskInputs` for that task definition. + +At minimum, the exposed structure must contain: + +1. the expected nested shape `trainingTaskInputs.workerPoolSpecs[].machineSpec` +2. `machineType` within that nested `machineSpec` +3. optionally `acceleratorType`, `acceleratorCount`, or `tpuTopology` within that same nested `machineSpec` + +Flat, renamed, or otherwise shape-incompatible fields must not be treated as equivalent. 
+ +A `trainingTaskInputs.workerPoolSpecs[]` entry is structurally valid only when it contains a `machineSpec` dict with `machineType` present and non-empty. Entries without `machineType`, or entries that are not dicts, are treated as malformed and must be skipped for hardware classification. + +If some entries are partially malformed, those invalid entries should be ignored for hardware classification rather than making the whole resource ineligible. Hardware remains based only on structurally valid documented entries. + +If those fields are not exposed, then: + +1. `hardware_unknown = true` +2. hardware class must remain unresolved +3. the rule must not guess GPU, TPU, replica count, or machine type + +### 8.3 Hardware is auxiliary, not eligibility + +Hardware evidence may affect risk labeling or finding context, but it must **not** be required for the rule to emit. + +All worker pools that are exposed by the control-plane response must be evaluated. Accelerator classification must use **any** documented accelerator-backed pool, not only the first pool. + +--- + +## 9. Decision rule + +### 9.1 Eligibility + +The resource is eligible only when: + +1. resource type is `CustomJob` or `TrainingPipeline` +2. `state` is exactly: + - `JOB_STATE_RUNNING` for `CustomJob`, or + - `PIPELINE_STATE_RUNNING` for `TrainingPipeline` +3. `start_time_utc` is valid and not in the future +4. `elapsed_runtime_seconds >= long_running_hours_threshold * 3600` + +Configuration requirement: + +1. `long_running_hours_threshold` must be `>= 1` (integer hours; equivalent to `> 0` for an integer parameter) +2. invalid threshold configuration must fail fast rather than silently clamp or reinterpret the value + +### 9.2 Confidence + +Confidence is a product policy, not a Google-defined concept: + +1. `MEDIUM` when `long_running_hours_threshold * 3600 <= elapsed_runtime_seconds < 3 * long_running_hours_threshold * 3600` +2.
`HIGH` when `elapsed_runtime_seconds >= 3 * long_running_hours_threshold * 3600` + +### 9.3 Risk + +Risk is a product policy and may use documented hardware evidence when available: + +1. `CRITICAL` when confidence is `HIGH` and the job is provably accelerator-backed +2. `HIGH` when confidence is `HIGH` and accelerator hardware is not proven +3. `MEDIUM` for all `MEDIUM` confidence findings + +### 9.4 Explicitly forbidden heuristics + +The rule must **not**: + +- emit below the configured long-running threshold +- emit a sub-threshold GPU or TPU "early warning" +- use `createTime` as a fallback runtime anchor +- use hardcoded hourly-price thresholds as an eligibility gate +- infer accelerator hardware when machine-spec evidence is absent + +--- + +## 10. Cost handling + +### 10.1 Canonical monthly cost field + +`estimated_monthly_cost_usd = None` + +Reason: + +1. training jobs are transient, not monthly recurring resources +2. pricing varies by region, machine type, accelerator shape, reservations, discounts, and Spot usage +3. eligibility does not depend on cost + +### 10.2 Accrued-cost estimates + +The canonical rule does **not** require any accrued-cost calculation. + +If a future implementation chooses to surface an accrued-cost hint, it must: + +1. be clearly labeled non-canonical advisory context +2. use authoritative current pricing inputs for the exact region and hardware configuration +3. never affect eligibility, confidence, or risk + +Static price tables and placeholder cost tiers are out of scope for the canonical rule. + +--- + +## 11. 
Failure behavior + +Always skip: + +- empty resource names +- resource names that do not exactly match the documented 6-segment pattern for the resource type (extra segments, wrong resource-type keyword, empty segments all skip) +- `state` absent, empty, or not exactly equal to the documented running enum for the resource type +- `startTime` absent, not strict RFC3339 (space separator, no timezone offset, date-only, etc.), unparsable, or future +- elapsed runtime below threshold + +Operational behavior: + +1. permission errors on a required list surface should propagate +2. a non-permission fetch failure on one independent surface (`customJobs` or `trainingPipelines`) may warn and continue with the other surface +3. if pagination fails on a later page of one surface, earlier successfully fetched pages from that same surface may still be kept, but the partial read must be treated as a non-permission failure and warned +4. if both independent surfaces fail non-permissionally, the rule returns no findings and should warn that results are incomplete +5. the rule must not synthesize findings from a surface it failed to read +6. no cross-resource dedupe is required; each `CustomJob` or `TrainingPipeline` resource is evaluated independently + +--- + +## 12. Output contract + +### 12.1 Required finding fields + +| Field | Value | +|---|---| +| `provider` | `gcp` | +| `rule_id` | `gcp.vertex.training_job.long_running` | +| `resource_type` | `gcp.vertex.training_job` | +| `resource_id` | Full Vertex AI resource name | +| `region` | Parsed resource location | +| `estimated_monthly_cost_usd` | `None` | + +Identity rules: + +1. `resource_id` is the canonical full resource name +2. `display_name` is optional context only and must not replace canonical identity + +### 12.2 Required decision facts in details or evidence + +The finding should surface, when available: + +1. `job_type` (`customJob` or `trainingPipeline`) +2. exact running state +3. `startTime` +4. 
elapsed runtime hours +5. threshold hours +6. hardware evidence, if explicitly exposed +7. whether hardware is unknown + +--- + +## 13. Examples of resources that must skip + +- a `CustomJob` in `JOB_STATE_PENDING` +- a `TrainingPipeline` in `PIPELINE_STATE_QUEUED` +- a `CustomJob` whose `name` is `projects/p/locations/us-central1/customJobs/123/extra` (seven segments, not six) +- a `CustomJob` whose `name` is `projects/p/locations/us-central1/models/123` (wrong resource-type segment) +- a running resource with missing `startTime` +- a running resource whose `startTime` is `2025-06-01 12:00:00Z` (space separator — not RFC3339) +- a running resource whose `startTime` is `2025-06-01T12:00:00` (no timezone offset — not RFC3339) +- a running resource whose `startTime` is unparsable +- a running resource whose elapsed runtime is 23.9h when threshold is 24h +- a `TrainingPipeline` whose task inputs do not expose worker-pool machine specs, when the implementation would otherwise need those fields only to guess cost or accelerator class + +--- + +## 14. Summary + +This is a **duration-first Vertex AI training review rule**: + +1. scope to resources whose name exactly matches the documented Vertex AI resource-name pattern +2. require exact state enum match read from the resource, not inferred from the list filter alone +3. anchor runtime strictly to documented `startTime` (RFC3339 only; no fallback parsing) +4. classify hardware from documented machine-spec fields only: explicit acceleratorType + count, bundled-GPU machine families, or TPU machine families +5. require `machineType` to be present for a pool entry to contribute to hardware classification +6. avoid sub-threshold warning heuristics +7. 
avoid pricing heuristics in canonical detection diff --git a/tests/cleancloud/providers/gcp/ai/test_gcp_vertex_training_job_long_running.py b/tests/cleancloud/providers/gcp/ai/test_gcp_vertex_training_job_long_running.py index 0935d11..4f0eb26 100644 --- a/tests/cleancloud/providers/gcp/ai/test_gcp_vertex_training_job_long_running.py +++ b/tests/cleancloud/providers/gcp/ai/test_gcp_vertex_training_job_long_running.py @@ -5,23 +5,31 @@ - Core detection: CPU job over threshold (MEDIUM/MEDIUM), GPU job over threshold (MEDIUM) - Runaway (3× threshold): HIGH confidence, CRITICAL for GPU, HIGH for CPU - Risk model: GPU+HIGH→CRITICAL, CPU+HIGH→HIGH, MEDIUM confidence→MEDIUM regardless of GPU -- Early warning: GPU job at 90–100% of threshold only (not 75%) -- Noise reduction: GPU job at 75–89% of threshold does NOT fire -- TrainingPipeline resource type: attempts trainingTaskInputs parsing; conservative fallback +- Below-threshold jobs: no finding emitted (spec 9.4 — no sub-threshold early warnings) +- TrainingPipeline resource type: parses trainingTaskInputs; hardware_unknown when absent - TrainingPipeline with workerPoolSpecs in trainingTaskInputs: uses parsed hardware -- TrainingPipeline with no hardware spec: is_gpu=False (hardware_unknown=True), conservative duration-tiered fallback cost -- No findings: job below 90% of threshold (CPU or GPU) -- Region filter: jobs outside filter are skipped -- Location fallback: malformed name → region="unknown" +- TrainingPipeline with no hardware spec: hardware_unknown=True, is_accelerator=False +- No findings: job below threshold (CPU or GPU) +- Region filter: exact string equality (spec 7) — no case folding +- Invalid threshold (< 1): fail-fast with ValueError (spec 9.1) +- startTime absence skips job; createTime NOT used as fallback (spec 9.4) +- Future startTime skips job (spec 7) +- Malformed name skips job (spec 7, 11) - Permission errors: PermissionError raised on 403 -- estimated_monthly_cost_usd is always None (transient 
job) -- Per-pool cost: heterogeneous cluster cost sums all pools (not primary × total) +- estimated_monthly_cost_usd is always None (transient job, spec 10.1) +- No cost fields in details (accrued_cost_usd, burn_rate_per_hour, pricing_source, etc.) +- Details: state field (exact running enum), start_time field (RFC3339 string) - Accelerator detection from _has_accelerator_hardware: accelerator type OR machine prefix - _parse_worker_pools: returns list of per-pool tuples; empty → [] -- _estimate_hourly_rate_per_replica: bundled vs additive GPU cost -- _total_hourly_rate: sums across pools - _hardware_label: single worker, multi-worker, with accelerator - RULE_ID attribute +- Exact resource-name pattern enforcement (spec 7): extra segments, wrong type segment, skipped +- State validation: exact enum from resource, not synthesised; wrong/missing state → skip +- CustomJob hardware_unknown=True when workerPoolSpecs is empty or all entries malformed +- _parse_worker_pools: entries without machineType are skipped (spec 8.1, 8.2) +- _parse_worker_pools: malformed (non-dict, bad replicaCount/acceleratorCount) entries skipped +- RFC3339 strictness: space separator, date-only, no-tz values all rejected +- Partial pagination: later-page failure keeps accumulated pages and warns (spec 11.3) """ from datetime import datetime, timedelta, timezone @@ -34,20 +42,13 @@ from cleancloud.providers.gcp.rules.ai.vertex_training_job_long_running import ( _BUNDLED_ACCELERATOR_COUNT, _DEFAULT_LONG_RUNNING_HOURS, - _DEFAULT_MACHINE_MONTHLY_COST, - _DEFAULT_TPU_MONTHLY_COST, - _HOURS_PER_MONTH, - _MACHINE_MONTHLY_COST, + _EXPECTED_STATE, _RUNAWAY_MULTIPLIER, - _TPU_MACHINE_PREFIXES, - _estimate_hourly_rate_per_replica, _hardware_label, _has_accelerator_hardware, - _parse_location, _parse_worker_pools, - _pricing_confidence, - _total_hourly_rate, _tpu_topology_host_count, + _validate_resource_name, find_long_running_vertex_training_jobs, ) @@ -137,7 +138,6 @@ def _run( training_pipelines=None, 
region_filter=None, threshold=_THRESHOLD, - extra_kwargs=None, ): creds = MagicMock() session = _make_session(custom_jobs=custom_jobs, training_pipelines=training_pipelines) @@ -154,8 +154,7 @@ def _run( project_id=_PROJECT, credentials=creds, region_filter=region_filter, - long_running_hours=threshold, - **(extra_kwargs or {}), + long_running_hours_threshold=threshold, ) @@ -178,7 +177,6 @@ def test_cpu_job_over_threshold_medium_confidence(): assert f.details["is_accelerator"] is False assert f.details["job_type"] == "customJob" assert f.details["duration_hours"] > _THRESHOLD - assert f.details["accrued_cost_usd"] > 0 assert f.estimated_monthly_cost_usd is None @@ -197,7 +195,7 @@ def test_gpu_job_over_threshold_medium_risk(): assert len(findings) == 1 f = findings[0] assert f.confidence == ConfidenceLevel.MEDIUM - assert f.risk == RiskLevel.MEDIUM # not HIGH — see risk model + assert f.risk == RiskLevel.MEDIUM # not HIGH — see risk model (spec 9.3) assert f.details["is_accelerator"] is True assert f.details["accelerator_type"] == "NVIDIA_TESLA_V100" assert f.details["accelerator_count"] == 2 @@ -235,33 +233,21 @@ def test_cpu_job_runaway_3x_high(): # --------------------------------------------------------------------------- -# Early warning (GPU only, 90% threshold) +# Threshold behavior (spec 9.4: no sub-threshold early warnings) # --------------------------------------------------------------------------- -def test_gpu_early_warning_at_90pct_threshold(): - """GPU job at 92% of threshold triggers early warning.""" - job = _custom_job( - "early", - "us-central1", - start_hours_ago=_THRESHOLD * 0.92, - accel_type="NVIDIA_TESLA_T4", - accel_count=1, - ) +def test_job_below_threshold_no_finding(): + """No job type fires below the threshold (spec 9.4).""" + job = _custom_job("too-young", "us-central1", start_hours_ago=_THRESHOLD * 0.99) findings = _run(custom_jobs=[job]) - - assert len(findings) == 1 - f = findings[0] - assert f.confidence == ConfidenceLevel.MEDIUM 
- assert f.risk == RiskLevel.MEDIUM - assert f.details["is_accelerator"] is True - assert f.details["overrun_hours"] == 0.0 + assert findings == [] -def test_gpu_job_at_80pct_no_finding(): - """GPU job at 80% of threshold does NOT fire (below _EARLY_WARNING_FRACTION=0.9).""" +def test_gpu_job_below_threshold_no_finding(): + """GPU job below threshold does NOT fire — no sub-threshold early warnings (spec 9.4).""" job = _custom_job( - "too-young", + "gpu-too-young", "us-central1", start_hours_ago=_THRESHOLD * 0.80, accel_type="NVIDIA_TESLA_T4", @@ -271,92 +257,181 @@ def test_gpu_job_at_80pct_no_finding(): assert findings == [] -def test_cpu_early_warning_not_emitted(): - """CPU job at 92% of threshold produces no finding — early warning is GPU/TPU only.""" - job = _custom_job("cpu-early", "us-central1", start_hours_ago=_THRESHOLD * 0.92) +def test_job_at_exactly_threshold_fires(): + """Job at exactly the threshold is in scope.""" + job = _custom_job("exactly", "us-central1", start_hours_ago=_THRESHOLD) findings = _run(custom_jobs=[job]) - assert findings == [] - - -def test_job_below_50pct_no_finding(): - """No job type fires below _EARLY_WARNING_FRACTION.""" - job = _custom_job( - "way-too-young", - "us-central1", - start_hours_ago=_THRESHOLD * 0.50, - accel_type="NVIDIA_TESLA_T4", - accel_count=1, - ) - findings = _run(custom_jobs=[job]) - assert findings == [] + assert len(findings) == 1 + assert findings[0].confidence == ConfidenceLevel.MEDIUM # --------------------------------------------------------------------------- -# estimated_monthly_cost_usd +# estimated_monthly_cost_usd and cost fields # --------------------------------------------------------------------------- def test_estimated_monthly_cost_always_none(): - """Training jobs are transient; monthly cost field must be None.""" + """Training jobs are transient; monthly cost field must be None (spec 10.1).""" job = _custom_job("j", "us-central1", start_hours_ago=_THRESHOLD + 10) findings = 
_run(custom_jobs=[job]) assert findings[0].estimated_monthly_cost_usd is None -def test_accrued_cost_populated(): - """Accrued cost (duration × hourly rate) must be > 0 and in details.""" - job = _custom_job( - "j2", - "us-central1", - start_hours_ago=_THRESHOLD + 1, - machine_type="n1-standard-8", - accel_type="NVIDIA_TESLA_T4", - accel_count=1, - ) +def test_no_accrued_cost_in_details(): + """Removed pricing fields must not appear in finding details (spec 10.2).""" + job = _custom_job("j", "us-central1", start_hours_ago=_THRESHOLD + 5) findings = _run(custom_jobs=[job]) - assert findings[0].details["accrued_cost_usd"] > 0 + assert len(findings) == 1 + details = findings[0].details + assert "accrued_cost_usd" not in details + assert "burn_rate_per_hour" not in details + assert "pricing_source" not in details + assert "pricing_confidence" not in details + assert "cost_type" not in details + assert "overrun_hours" not in details # --------------------------------------------------------------------------- -# TrainingPipeline resource type +# Spec-compliance: threshold validation, startTime, region filter # --------------------------------------------------------------------------- -def test_training_pipeline_no_hardware_conservative_fallback(): - """Pipeline with no hardware spec uses duration-scaled fallback cost. +def test_threshold_less_than_1_raises_value_error(): + """Invalid threshold (< 1) must fail fast with ValueError (spec 9.1).""" + creds = MagicMock() + with pytest.raises(ValueError, match="long_running_hours_threshold"): + find_long_running_vertex_training_jobs( + project_id=_PROJECT, + credentials=creds, + long_running_hours_threshold=0, + ) - >24h → $20/hr, 6–24h → $5/hr, <6h → $1/hr. 
- """ - # >24h tier: start_hours_ago=_THRESHOLD+5 = 29h → $20/hr - pipeline = _training_pipeline("pl-1", "us-central1", start_hours_ago=_THRESHOLD + 5) - findings = _run(training_pipelines=[pipeline]) +def test_threshold_of_zero_raises_value_error(): + creds = MagicMock() + with pytest.raises(ValueError): + find_long_running_vertex_training_jobs( + project_id=_PROJECT, + credentials=creds, + long_running_hours_threshold=-1, + ) + + +def test_create_time_not_used_as_fallback(): + """Jobs with createTime but no startTime are skipped — createTime is NOT a fallback (spec 9.4).""" + import warnings as _warnings + + start = NOW - timedelta(hours=_THRESHOLD + 5) + job = { + "name": f"projects/{_PROJECT}/locations/us-central1/customJobs/no-start", + "displayName": "no-start-job", + "createTime": _iso(start), # present but must NOT be used + "state": "JOB_STATE_RUNNING", + "jobSpec": {"workerPoolSpecs": []}, + } + with _warnings.catch_warnings(record=True): + _warnings.simplefilter("always") + findings = _run(custom_jobs=[job]) + assert findings == [] + + +def test_future_start_time_skips_job(): + """Jobs with future startTime are skipped (spec 7).""" + import warnings as _warnings + + future = NOW + timedelta(hours=5) + job = { + "name": f"projects/{_PROJECT}/locations/us-central1/customJobs/future", + "displayName": "future-job", + "startTime": _iso(future), + "state": "JOB_STATE_RUNNING", + "jobSpec": {"workerPoolSpecs": []}, + } + with _warnings.catch_warnings(record=True): + _warnings.simplefilter("always") + findings = _run(custom_jobs=[job]) + assert findings == [] + + +def test_malformed_name_skips_job(): + """Jobs with malformed resource names (location not resolvable) are skipped (spec 7, 11).""" + import warnings as _warnings + + start = NOW - timedelta(hours=_THRESHOLD + 5) + job = { + "name": "malformed-resource-name", + "displayName": "bad-job", + "startTime": _iso(start), + "state": "JOB_STATE_RUNNING", + "jobSpec": {"workerPoolSpecs": []}, + } + with 
_warnings.catch_warnings(record=True): + _warnings.simplefilter("always") + findings = _run(custom_jobs=[job]) + assert findings == [] + + +def test_region_filter_exact_match_required(): + """Region filter is exact string equality; prefix match must not pass (spec 7).""" + job = _custom_job("j", "us-central1", start_hours_ago=_THRESHOLD + 5) + findings = _run(custom_jobs=[job], region_filter="us-central") + assert findings == [] + + +# --------------------------------------------------------------------------- +# Details fields +# --------------------------------------------------------------------------- + + +def test_details_state_field_present(): + """Finding details include 'state' with the exact running enum value.""" + job = _custom_job("j", "us-central1", start_hours_ago=_THRESHOLD + 5) + findings = _run(custom_jobs=[job]) assert len(findings) == 1 - f = findings[0] - assert f.details["job_type"] == "trainingPipeline" - assert f.details["is_accelerator"] is False # hardware unknown ≠ GPU; only cost is conservative - assert f.details["hardware_unknown"] is True - assert f.details["pricing_source"] == "conservative_pipeline_default" - assert f.details["burn_rate_per_hour"] == pytest.approx(20.0) # >24h tier - assert f.estimated_monthly_cost_usd is None + assert findings[0].details["state"] == "JOB_STATE_RUNNING" -def test_training_pipeline_no_hardware_mid_tier(): - """Pipeline at exactly threshold (24h) uses $5/hr mid-tier (duration <= 24h, > 6h).""" - # duration == _THRESHOLD (24h): not > 24 → $5/hr tier; >= threshold → MEDIUM confidence - pipeline = _training_pipeline("pl-mid", "us-central1", start_hours_ago=_THRESHOLD) +def test_details_state_field_training_pipeline(): + """TrainingPipeline finding uses PIPELINE_STATE_RUNNING.""" + pipeline = _training_pipeline("pl", "us-central1", start_hours_ago=_THRESHOLD + 5) findings = _run(training_pipelines=[pipeline]) assert len(findings) == 1 - assert findings[0].details["burn_rate_per_hour"] == pytest.approx(5.0) 
# 6–24h tier + assert findings[0].details["state"] == "PIPELINE_STATE_RUNNING" -def test_training_pipeline_no_hardware_low_tier(): - """Pipeline <6h uses $1/hr low-tier when threshold is small enough to fire at <6h.""" - # Use threshold=5h so a 5h job fires (duration >= threshold); duration <= 6h → $1/hr tier - pipeline = _training_pipeline("pl-low2", "us-central1", start_hours_ago=5) - findings = _run(training_pipelines=[pipeline], threshold=5) +def test_details_start_time_field_present(): + """Finding details include 'start_time' as an RFC3339 string.""" + job = _custom_job("j", "us-central1", start_hours_ago=_THRESHOLD + 5) + findings = _run(custom_jobs=[job]) assert len(findings) == 1 - assert findings[0].details["burn_rate_per_hour"] == pytest.approx(1.0) + assert "start_time" in findings[0].details + assert isinstance(findings[0].details["start_time"], str) + assert findings[0].details["start_time"].endswith("Z") + + +def test_details_long_running_hours_threshold_present(): + """Finding details include 'long_running_hours_threshold'.""" + job = _custom_job("j", "us-central1", start_hours_ago=_THRESHOLD + 5) + findings = _run(custom_jobs=[job]) + assert findings[0].details["long_running_hours_threshold"] == _THRESHOLD + + +# --------------------------------------------------------------------------- +# TrainingPipeline resource type +# --------------------------------------------------------------------------- + + +def test_training_pipeline_no_hardware_spec(): + """Pipeline with no hardware spec: hardware_unknown=True, is_accelerator=False.""" + pipeline = _training_pipeline("pl-1", "us-central1", start_hours_ago=_THRESHOLD + 5) + findings = _run(training_pipelines=[pipeline]) + + assert len(findings) == 1 + f = findings[0] + assert f.details["job_type"] == "trainingPipeline" + assert f.details["is_accelerator"] is False + assert f.details["hardware_unknown"] is True + assert f.estimated_monthly_cost_usd is None def 
test_training_pipeline_with_worker_pool_specs_in_task_inputs(): @@ -384,7 +459,6 @@ def test_training_pipeline_with_worker_pool_specs_in_task_inputs(): assert f.details["is_accelerator"] is True # a2-* prefix assert f.details["machine_type"] == "a2-highgpu-1g" assert f.details["total_workers"] == 2 - assert f.details["pricing_source"] == "static_estimate_us_central1" def test_training_pipeline_task_inputs_as_json_string(): @@ -483,16 +557,12 @@ def test_n1_cpu_not_classified_as_gpu(): # --------------------------------------------------------------------------- -# Per-pool cost aggregation (fix: sum all pools, not primary × total) +# Heterogeneous cluster: total_workers # --------------------------------------------------------------------------- -def test_heterogeneous_cluster_cost_sums_all_pools(): - """ - Chief: a2-highgpu-1g (1 replica) ≈ $4.02/hr - Workers: n1-standard-4 (8 replicas) ≈ $0.19/hr each → $1.52/hr total - Total should be ≈ $5.54/hr, not a2-price × 9 ($36.18/hr). - """ +def test_heterogeneous_cluster_total_workers(): + """Chief (1 replica) + 8 workers = total_workers 9.""" start = NOW - timedelta(hours=_THRESHOLD + 5) job = { "name": f"projects/{_PROJECT}/locations/us-central1/customJobs/hetero", @@ -522,13 +592,7 @@ def test_heterogeneous_cluster_cost_sums_all_pools(): } findings = _run(custom_jobs=[job]) assert len(findings) == 1 - f = findings[0] - - a2_hourly = _MACHINE_MONTHLY_COST["a2-highgpu-1g"] / _HOURS_PER_MONTH - n1_hourly = _MACHINE_MONTHLY_COST["n1-standard-4"] / _HOURS_PER_MONTH - expected_total = a2_hourly * 1 + n1_hourly * 8 - assert f.details["burn_rate_per_hour"] == pytest.approx(expected_total) - assert f.details["total_workers"] == 9 + assert findings[0].details["total_workers"] == 9 # --------------------------------------------------------------------------- @@ -544,26 +608,6 @@ def test_region_filter_excludes_other_regions(): assert findings[0].region == "us-central1" -# 
--------------------------------------------------------------------------- -# Location fallback -# --------------------------------------------------------------------------- - - -def test_location_unknown_for_malformed_name(): - """Jobs with unparseable resource names get region='unknown', not ''.""" - start = NOW - timedelta(hours=_THRESHOLD + 5) - job = { - "name": "malformed-resource-name", - "displayName": "bad-job", - "startTime": _iso(start), - "state": "JOB_STATE_RUNNING", - "jobSpec": {"workerPoolSpecs": []}, - } - findings = _run(custom_jobs=[job]) - if findings: # may be filtered out if region_filter active — just check region value - assert findings[0].region == "unknown" - - # --------------------------------------------------------------------------- # Permission error # --------------------------------------------------------------------------- @@ -645,11 +689,12 @@ def test_parse_worker_pools_single_pool(): ] result = _parse_worker_pools(specs) assert len(result) == 1 - machine, accel, count, replicas = result[0] + machine, accel, count, replicas, tpu_topology = result[0] assert machine == "n1-standard-8" assert accel == "NVIDIA_TESLA_V100" assert count == 2 assert replicas == 4 + assert tpu_topology is None # non-TPU machine def test_parse_worker_pools_multi_pool(): @@ -749,17 +794,34 @@ def test_parse_worker_pools_tpu_no_topology_keeps_replica_count(): assert result[0][3] == 1 -def test_total_hourly_rate_tpu_multi_host(): - """A ct5lp-hightpu-4t pool with 2x4 topology is priced as 2 hosts.""" +def test_parse_worker_pools_tpu_topology_stored_in_tuple(): + """tpu_topology (index 4) is stored in the pool tuple to avoid raw-spec index mismatch.""" specs = [ { "replicaCount": 1, "machineSpec": {"machineType": "ct5lp-hightpu-4t", "tpuTopology": "2x4"}, } ] - pools = _parse_worker_pools(specs) - per_host = _MACHINE_MONTHLY_COST["ct5lp-hightpu-4t"] / _HOURS_PER_MONTH - assert _total_hourly_rate(pools) == pytest.approx(per_host * 2) + result = 
_parse_worker_pools(specs) + assert result[0][4] == "2x4" + + +def test_parse_worker_pools_tpu_topology_correct_after_malformed_first_entry(): + """When the first raw entry is malformed and skipped, pools[0][4] gives the correct + topology for the valid pool -- not the topology from the skipped first raw entry.""" + specs = [ + # Entry 0: malformed (no machineType) -- must be skipped + {"replicaCount": 1, "machineSpec": {"tpuTopology": "wrong-topology"}}, + # Entry 1: valid TPU pool -- should become pools[0] + { + "replicaCount": 1, + "machineSpec": {"machineType": "ct5lp-hightpu-4t", "tpuTopology": "2x4"}, + }, + ] + result = _parse_worker_pools(specs) + assert len(result) == 1 + assert result[0][0] == "ct5lp-hightpu-4t" + assert result[0][4] == "2x4" # correct topology, not "wrong-topology" def test_g4_gpu_counts_match_docs(): @@ -770,6 +832,19 @@ def test_g4_gpu_counts_match_docs(): assert _BUNDLED_ACCELERATOR_COUNT["g4-standard-384"] == 8 +def test_tpu7x_topology_scaling_via_suffix_parse(): + """tpu7x-standard-4t not in _BUNDLED_ACCELERATOR_COUNT but -4t suffix → 4 chips/host. 
+ Topology '4x4' = 16 chips → 4 hosts.""" + specs = [ + { + "replicaCount": 1, + "machineSpec": {"machineType": "tpu7x-standard-4t", "tpuTopology": "4x4"}, + } + ] + pools = _parse_worker_pools(specs) + assert pools[0][3] == 4 # 16 chips / 4 per host = 4 hosts + + # --------------------------------------------------------------------------- # _has_accelerator_hardware # --------------------------------------------------------------------------- @@ -811,77 +886,14 @@ def test_has_accelerator_hardware_empty_string_not_classified(): assert _has_accelerator_hardware(pools) is False -# --------------------------------------------------------------------------- -# _estimate_hourly_rate_per_replica -# --------------------------------------------------------------------------- - - -def test_estimate_hourly_rate_per_replica_n1_with_gpu_is_additive(): - """n1-* machines add GPU cost on top of machine cost.""" - machine_hourly = _MACHINE_MONTHLY_COST["n1-standard-8"] / _HOURS_PER_MONTH - gpu_monthly_each = 311.0 # NVIDIA_TESLA_T4 - gpu_hourly = gpu_monthly_each / _HOURS_PER_MONTH * 2 # 2 GPUs - expected = machine_hourly + gpu_hourly - result = _estimate_hourly_rate_per_replica("n1-standard-8", "NVIDIA_TESLA_T4", 2) - assert abs(result - expected) < 0.01 - - -def test_estimate_hourly_rate_per_replica_a2_bundled_no_addon(): - """a2-* machines bundle GPU cost — no accelerator add-on.""" - machine_hourly = _MACHINE_MONTHLY_COST["a2-highgpu-1g"] / _HOURS_PER_MONTH - result = _estimate_hourly_rate_per_replica("a2-highgpu-1g", "NVIDIA_TESLA_A100", 1) - assert abs(result - machine_hourly) < 0.01 - - -def test_estimate_hourly_rate_per_replica_unknown_machine_uses_default(): - result = _estimate_hourly_rate_per_replica("custom-unknown-machine", None, 0) - expected = _DEFAULT_MACHINE_MONTHLY_COST / _HOURS_PER_MONTH - assert abs(result - expected) < 0.01 - +def test_has_accelerator_hardware_recognized_type_zero_count_not_accelerated(): + """Recognized acceleratorType with 
acceleratorCount=0 is NOT accelerated (spec 8.1). -def test_estimate_hourly_rate_co_scheduling_single_accel(): - """a2-highgpu-8g with accel_count=1 triggers co-scheduling: 8 replicas per VM, cost 1/8.""" - full_machine_hourly = _MACHINE_MONTHLY_COST["a2-highgpu-8g"] / _HOURS_PER_MONTH - # accel_count=1 == 1 → replicas_per_vm = 8//1 = 8 - result = _estimate_hourly_rate_per_replica("a2-highgpu-8g", "NVIDIA_TESLA_A100", 1) - assert result == pytest.approx(full_machine_hourly / 8) - - -def test_estimate_hourly_rate_co_scheduling_divides_evenly(): - """a2-highgpu-8g with accel_count=2: 8%2==0 → co-scheduling applies, cost is 1/4.""" - full_machine_hourly = _MACHINE_MONTHLY_COST["a2-highgpu-8g"] / _HOURS_PER_MONTH - # accel_count=2, machine_gpu_count=8, 8%2==0 → replicas_per_vm=4 - result = _estimate_hourly_rate_per_replica("a2-highgpu-8g", "NVIDIA_TESLA_A100", 2) - assert result == pytest.approx(full_machine_hourly / 4) - - -def test_estimate_hourly_rate_no_co_scheduling_above_half(): - """a2-highgpu-8g with accel_count=5 → no co-scheduling (accel_count != 1), full price.""" - full_machine_hourly = _MACHINE_MONTHLY_COST["a2-highgpu-8g"] / _HOURS_PER_MONTH - result = _estimate_hourly_rate_per_replica("a2-highgpu-8g", "NVIDIA_TESLA_A100", 5) - assert result == pytest.approx(full_machine_hourly) - - -def test_estimate_hourly_rate_no_co_scheduling_zero_accel_count(): - """accel_count=0 (unspecified) → full price, no co-scheduling assumed.""" - full_machine_hourly = _MACHINE_MONTHLY_COST["a2-highgpu-8g"] / _HOURS_PER_MONTH - result = _estimate_hourly_rate_per_replica("a2-highgpu-8g", None, 0) - assert result == pytest.approx(full_machine_hourly) - - -def test_bundled_accelerator_count_covers_all_machine_monthly_cost_bundled_types(): - """Every bundled machine type in _MACHINE_MONTHLY_COST (except g2-standard-32) - has a known GPU/TPU count in _BUNDLED_ACCELERATOR_COUNT.""" - gpu_prefixes = ("a2-", "a3-", "a4-", "a4x-", "g2-", "g4-") - bundled_types = [ - m - for m in 
_MACHINE_MONTHLY_COST - if m.startswith(gpu_prefixes) or m.startswith(_TPU_MACHINE_PREFIXES) - ] - unknown = [ - m for m in bundled_types if m not in _BUNDLED_ACCELERATOR_COUNT and m != "g2-standard-32" - ] - assert unknown == [], f"Missing from _BUNDLED_ACCELERATOR_COUNT: {unknown}" + acceleratorCount=0 means no accelerator is attached even if the type field is set. + The explicit path requires both a recognized type AND count > 0. + """ + pools = [("n1-standard-8", "NVIDIA_TESLA_T4", 0, 1)] + assert _has_accelerator_hardware(pools) is False def test_tpu_machine_type_detected_as_accelerated(): @@ -890,18 +902,6 @@ def test_tpu_machine_type_detected_as_accelerated(): assert _has_accelerator_hardware(pools) is True -def test_tpu_machine_type_uses_tpu_default_cost_when_unknown(): - """Unrecognized ct5lp-* machine falls back to _DEFAULT_TPU_MONTHLY_COST, not generic $150.""" - result = _estimate_hourly_rate_per_replica("ct5lp-hightpu-16t", None, 0) - assert result == pytest.approx(_DEFAULT_TPU_MONTHLY_COST / _HOURS_PER_MONTH) - - -def test_tpu_machine_type_uses_table_cost_when_known(): - """Known ct5lp-hightpu-4t uses the exact cost from _MACHINE_MONTHLY_COST.""" - result = _estimate_hourly_rate_per_replica("ct5lp-hightpu-4t", None, 0) - assert result == pytest.approx(_MACHINE_MONTHLY_COST["ct5lp-hightpu-4t"] / _HOURS_PER_MONTH) - - def test_a3_megagpu_detected_as_bundled(): """a3-megagpu-8g is detected as bundled via a3- prefix.""" pools = [("a3-megagpu-8g", None, 0, 1)] @@ -926,49 +926,6 @@ def test_tpu7x_machine_detected_as_accelerated(): assert _has_accelerator_hardware(pools) is True -def test_tpu7x_uses_tpu_default_cost(): - """tpu7x-* has no cost table entry — should use _DEFAULT_TPU_MONTHLY_COST.""" - result = _estimate_hourly_rate_per_replica("tpu7x-standard-4t", None, 0) - assert result == pytest.approx(_DEFAULT_TPU_MONTHLY_COST / _HOURS_PER_MONTH) - - -def test_tpu7x_topology_scaling_via_suffix_parse(): - """tpu7x-standard-4t not in 
_BUNDLED_ACCELERATOR_COUNT but -4t suffix → 4 chips/host. - Topology '4x4' = 16 chips → 4 hosts → priced as 4 × per-host rate.""" - specs = [ - { - "replicaCount": 1, - "machineSpec": {"machineType": "tpu7x-standard-4t", "tpuTopology": "4x4"}, - } - ] - pools = _parse_worker_pools(specs) - assert pools[0][3] == 4 # 16 chips / 4 per host = 4 hosts - per_host = _DEFAULT_TPU_MONTHLY_COST / _HOURS_PER_MONTH - assert _total_hourly_rate(pools) == pytest.approx(per_host * 4) - - -# --------------------------------------------------------------------------- -# _total_hourly_rate -# --------------------------------------------------------------------------- - - -def test_total_hourly_rate_single_pool(): - per_replica = _estimate_hourly_rate_per_replica("n1-standard-4", None, 0) - pools = [("n1-standard-4", None, 0, 3)] - assert abs(_total_hourly_rate(pools) - per_replica * 3) < 0.01 - - -def test_total_hourly_rate_heterogeneous(): - """Cost sums correctly across pools with different machine types.""" - chief = _estimate_hourly_rate_per_replica("a2-highgpu-1g", None, 0) * 1 - workers = _estimate_hourly_rate_per_replica("n1-standard-4", None, 0) * 8 - pools = [ - ("a2-highgpu-1g", None, 0, 1), - ("n1-standard-4", None, 0, 8), - ] - assert abs(_total_hourly_rate(pools) - (chief + workers)) < 0.01 - - # --------------------------------------------------------------------------- # _hardware_label # --------------------------------------------------------------------------- @@ -990,18 +947,11 @@ def test_hardware_label_multi_worker(): assert "×8 workers" in label -# --------------------------------------------------------------------------- -# _parse_location -# --------------------------------------------------------------------------- - - -def test_parse_location_standard(): - name = "projects/my-proj/locations/us-central1/customJobs/12345" - assert _parse_location(name) == "us-central1" - - -def test_parse_location_missing_returns_empty(): - assert 
_parse_location("invalid-name") == "" +def test_hardware_label_zero_accel_count_omits_type(): + """acceleratorType is omitted from the label when acceleratorCount == 0.""" + label = _hardware_label("n1-standard-8", "NVIDIA_TESLA_T4", 0, 1) + assert "NVIDIA_TESLA_T4" not in label + assert "n1-standard-8" in label # --------------------------------------------------------------------------- @@ -1020,67 +970,26 @@ def test_rule_id_attribute(): def test_wildcard_unsupported_keyed_per_project_and_resource(): """_wildcard_unsupported uses (project_id, resource) tuples, not plain strings.""" - # Verify the set stores tuples so customJobs and trainingPipelines are independent test_set = set() test_set.add(("proj-a", "customJobs")) assert ("proj-a", "customJobs") in test_set assert ("proj-a", "trainingPipelines") not in test_set # independent per resource -# --------------------------------------------------------------------------- -# Fix 3: pricing_confidence -# --------------------------------------------------------------------------- - - -def test_pricing_confidence_published_for_known_machines(): - pools = [("n1-standard-8", "NVIDIA_TESLA_T4", 1, 1)] - assert _pricing_confidence(pools) == "published" - - -def test_pricing_confidence_partial_estimate_for_estimated_machine(): - """a4-* machines use estimated pricing.""" - pools = [("a4-highgpu-8g", None, 0, 1)] - assert _pricing_confidence(pools) == "partial_estimate" - - -def test_pricing_confidence_partial_estimate_for_estimated_accel(): - """H200 accelerator uses estimated pricing.""" - pools = [("n1-standard-8", "NVIDIA_H200_141GB", 1, 1)] - assert _pricing_confidence(pools) == "partial_estimate" - - -def test_pricing_confidence_empty_pools(): - """Empty pool list → published (no estimated prices involved).""" - assert _pricing_confidence([]) == "published" - - -def test_finding_includes_pricing_confidence_field(): - """pricing_confidence appears in finding details for custom jobs.""" - job = _custom_job( - "job-1", 
- "us-central1", - start_hours_ago=_THRESHOLD + 1, - machine_type="n1-standard-4", - ) - findings = _run(custom_jobs=[job]) - assert len(findings) == 1 - assert "pricing_confidence" in findings[0].details - - # --------------------------------------------------------------------------- # Fix 6: skipped_jobs warning # --------------------------------------------------------------------------- def test_skipped_jobs_warning_on_missing_timestamp(): - """Jobs with no startTime or createTime emit a warning.""" + """Jobs with no startTime emit a warning and are skipped.""" import warnings as _warnings job = { "name": "projects/my-project/locations/us-central1/customJobs/bad", "displayName": "bad-job", "state": "JOB_STATE_RUNNING", - # no startTime or createTime + # no startTime } mock_resp = MagicMock() mock_resp.status_code = 200 @@ -1105,47 +1014,422 @@ def test_skipped_jobs_warning_on_missing_timestamp(): # --------------------------------------------------------------------------- -# Fix 9: early_warning_fraction and runaway_multiplier kwargs +# Resource-name pattern enforcement (spec 7) # --------------------------------------------------------------------------- -def test_custom_early_warning_fraction_fires_earlier(): - """early_warning_fraction=0.5 → job at 60% of threshold fires; default 0.9 would not.""" - job = _custom_job( - "job-ew", - "us-central1", - start_hours_ago=_THRESHOLD * 0.6, - accel_type="NVIDIA_TESLA_T4", - accel_count=1, +def test_validate_resource_name_valid_customjob(): + assert ( + _validate_resource_name( + "projects/my-proj/locations/us-central1/customJobs/123", "customJob" + ) + is True ) - findings = _run(custom_jobs=[job], extra_kwargs={"early_warning_fraction": 0.5}) - assert len(findings) == 1 -def test_custom_early_warning_fraction_default_does_not_fire(): - """Same job at 60% of threshold does NOT fire with default fraction (0.9).""" - job = _custom_job( - "job-ew-no", - "us-central1", - start_hours_ago=_THRESHOLD * 0.6, - 
accel_type="NVIDIA_TESLA_T4", - accel_count=1, +def test_validate_resource_name_valid_pipeline(): + assert ( + _validate_resource_name( + "projects/my-proj/locations/us-central1/trainingPipelines/456", "trainingPipeline" + ) + is True + ) + + +def test_validate_resource_name_too_many_parts(): + """Extra path segment (7 parts instead of 6) → invalid.""" + assert ( + _validate_resource_name( + "projects/p/locations/us-central1/customJobs/123/extra", "customJob" + ) + is False + ) + + +def test_validate_resource_name_too_few_parts(): + assert _validate_resource_name("projects/p/locations/customJobs/123", "customJob") is False + + +def test_validate_resource_name_wrong_type_segment(): + """customJobs name treated as trainingPipeline → invalid.""" + assert ( + _validate_resource_name( + "projects/p/locations/us-central1/customJobs/123", "trainingPipeline" + ) + is False ) + + +def test_validate_resource_name_empty_location(): + """Empty location segment → invalid.""" + assert _validate_resource_name("projects/p/locations//customJobs/123", "customJob") is False + + +def test_resource_name_extra_segments_skipped(): + """A name with extra path segments is not emitted as a finding.""" + import warnings as _warnings + + start = NOW - timedelta(hours=_THRESHOLD + 5) + job = { + "name": f"projects/{_PROJECT}/locations/us-central1/customJobs/123/extra", + "displayName": "extra-path", + "startTime": _iso(start), + "state": "JOB_STATE_RUNNING", + "jobSpec": {"workerPoolSpecs": []}, + } + with _warnings.catch_warnings(record=True): + _warnings.simplefilter("always") + findings = _run(custom_jobs=[job]) + assert findings == [] + + +def test_resource_name_location_bearing_but_wrong_type_skipped(): + """Name has valid location segment but wrong resource-type keyword → skip.""" + import warnings as _warnings + + start = NOW - timedelta(hours=_THRESHOLD + 5) + job = { + "name": f"projects/{_PROJECT}/locations/us-central1/models/123", + "displayName": "wrong-type", + "startTime": 
_iso(start), + "state": "JOB_STATE_RUNNING", + "jobSpec": {"workerPoolSpecs": []}, + } + with _warnings.catch_warnings(record=True): + _warnings.simplefilter("always") + findings = _run(custom_jobs=[job]) + assert findings == [] + + +# --------------------------------------------------------------------------- +# State validation (spec 3.3, 9.1) +# --------------------------------------------------------------------------- + + +def test_expected_state_constants(): + """_EXPECTED_STATE maps job types to exact documented running-state enums.""" + assert _EXPECTED_STATE["customJob"] == "JOB_STATE_RUNNING" + assert _EXPECTED_STATE["trainingPipeline"] == "PIPELINE_STATE_RUNNING" + + +def test_wrong_state_custom_job_skipped(): + """CustomJob not in JOB_STATE_RUNNING is skipped even if it passes other checks.""" + import warnings as _warnings + + start = NOW - timedelta(hours=_THRESHOLD + 5) + job = { + "name": f"projects/{_PROJECT}/locations/us-central1/customJobs/j1", + "displayName": "pending-job", + "startTime": _iso(start), + "state": "JOB_STATE_PENDING", # not running + "jobSpec": {"workerPoolSpecs": []}, + } + with _warnings.catch_warnings(record=True): + _warnings.simplefilter("always") + findings = _run(custom_jobs=[job]) + assert findings == [] + + +def test_missing_state_custom_job_skipped(): + """CustomJob with absent state field is skipped.""" + import warnings as _warnings + + start = NOW - timedelta(hours=_THRESHOLD + 5) + job = { + "name": f"projects/{_PROJECT}/locations/us-central1/customJobs/j2", + "displayName": "no-state", + "startTime": _iso(start), + # no 'state' key + "jobSpec": {"workerPoolSpecs": []}, + } + with _warnings.catch_warnings(record=True): + _warnings.simplefilter("always") + findings = _run(custom_jobs=[job]) + assert findings == [] + + +def test_state_is_read_from_resource_not_synthesised(): + """The 'state' in finding details reflects the actual resource state, not a synthesised value.""" + job = _custom_job("j", "us-central1", 
start_hours_ago=_THRESHOLD + 5) + # Confirm the fixture sets state to JOB_STATE_RUNNING + assert job["state"] == "JOB_STATE_RUNNING" findings = _run(custom_jobs=[job]) + assert findings[0].details["state"] == "JOB_STATE_RUNNING" + + +# --------------------------------------------------------------------------- +# CustomJob hardware_unknown when workerPoolSpecs absent/empty (spec 8.1) +# --------------------------------------------------------------------------- + + +def test_custom_job_empty_worker_specs_hardware_unknown(): + """CustomJob with empty workerPoolSpecs must have hardware_unknown=True (spec 8.1).""" + start = NOW - timedelta(hours=_THRESHOLD + 5) + job = { + "name": f"projects/{_PROJECT}/locations/us-central1/customJobs/j-no-specs", + "displayName": "no-specs", + "startTime": _iso(start), + "state": "JOB_STATE_RUNNING", + "jobSpec": {"workerPoolSpecs": []}, + } + findings = _run(custom_jobs=[job]) + assert len(findings) == 1 + assert findings[0].details["hardware_unknown"] is True + assert findings[0].details["is_accelerator"] is False + + +def test_custom_job_absent_job_spec_hardware_unknown(): + """CustomJob with no jobSpec at all is still eligible; hardware_unknown=True.""" + start = NOW - timedelta(hours=_THRESHOLD + 5) + job = { + "name": f"projects/{_PROJECT}/locations/us-central1/customJobs/j-no-spec", + "displayName": "no-job-spec", + "startTime": _iso(start), + "state": "JOB_STATE_RUNNING", + # no 'jobSpec' key + } + findings = _run(custom_jobs=[job]) + assert len(findings) == 1 + assert findings[0].details["hardware_unknown"] is True + + +# --------------------------------------------------------------------------- +# _parse_worker_pools: malformed entries and missing machineType (spec 8.1, 8.2) +# --------------------------------------------------------------------------- + + +def test_parse_worker_pools_missing_machine_type_skipped(): + """Pool entries without machineType are skipped (spec 8.1, 8.2).""" + specs = [ + { + "replicaCount": 1, + 
"machineSpec": { + # no machineType + "acceleratorType": "NVIDIA_TESLA_T4", + "acceleratorCount": 1, + }, + } + ] + assert _parse_worker_pools(specs) == [] + + +def test_parse_worker_pools_empty_machine_type_skipped(): + """Pool entry with empty machineType string is treated as missing → skipped.""" + specs = [{"replicaCount": 1, "machineSpec": {"machineType": ""}}] + assert _parse_worker_pools(specs) == [] + + +def test_parse_worker_pools_non_dict_entry_skipped(): + """Non-dict entries in workerPoolSpecs are silently skipped.""" + specs = ["not-a-dict", None, 42] + assert _parse_worker_pools(specs) == [] + + +def test_parse_worker_pools_bad_replica_count_skipped(): + """Pool with non-numeric replicaCount is treated as malformed → skipped.""" + specs = [ + { + "replicaCount": "bad-value", + "machineSpec": {"machineType": "n1-standard-4"}, + } + ] + assert _parse_worker_pools(specs) == [] + + +def test_parse_worker_pools_mixed_valid_invalid(): + """Valid pool entries are kept; malformed entries are silently dropped.""" + specs = [ + {"replicaCount": "bad", "machineSpec": {"machineType": "n1-standard-4"}}, + { + "replicaCount": 2, + "machineSpec": {"machineType": "a2-highgpu-1g"}, + }, + {"replicaCount": 1, "machineSpec": {}}, # no machineType + ] + result = _parse_worker_pools(specs) + assert len(result) == 1 + assert result[0][0] == "a2-highgpu-1g" + assert result[0][3] == 2 + + +def test_training_pipeline_pools_without_machine_type_hardware_unknown(): + """TrainingPipeline whose exposed workerPoolSpecs entries all lack machineType → hardware_unknown.""" + task_inputs = { + "workerPoolSpecs": [ + { + "replicaCount": 1, + "machineSpec": { + # machineType absent + "acceleratorType": "NVIDIA_TESLA_T4", + "acceleratorCount": 1, + }, + } + ] + } + pipeline = _training_pipeline( + "pl-no-mt", "us-central1", start_hours_ago=_THRESHOLD + 5, task_inputs=task_inputs + ) + findings = _run(training_pipelines=[pipeline]) + assert len(findings) == 1 + assert 
findings[0].details["hardware_unknown"] is True + assert findings[0].details["is_accelerator"] is False + + +# --------------------------------------------------------------------------- +# RFC3339 startTime strictness (spec 7) +# --------------------------------------------------------------------------- + + +def test_start_time_space_separator_rejected(): + """startTime with space separator (not T) is not valid RFC3339 and must be skipped.""" + import warnings as _warnings + + start = NOW - timedelta(hours=_THRESHOLD + 5) + iso_space = start.isoformat().replace("T", " ") # e.g. "2025-05-31 06:00:00+00:00" + job = { + "name": f"projects/{_PROJECT}/locations/us-central1/customJobs/j-space", + "state": "JOB_STATE_RUNNING", + "startTime": iso_space, + "jobSpec": {"workerPoolSpecs": []}, + } + with _warnings.catch_warnings(record=True): + _warnings.simplefilter("always") + findings = _run(custom_jobs=[job]) + assert findings == [] + + +def test_start_time_date_only_rejected(): + """Date-only startTime (no time component) is not valid RFC3339 and must be skipped.""" + import warnings as _warnings + + job = { + "name": f"projects/{_PROJECT}/locations/us-central1/customJobs/j-date", + "state": "JOB_STATE_RUNNING", + "startTime": "2025-05-01", + "jobSpec": {"workerPoolSpecs": []}, + } + with _warnings.catch_warnings(record=True): + _warnings.simplefilter("always") + findings = _run(custom_jobs=[job]) assert findings == [] -def test_custom_runaway_multiplier_changes_confidence(): - """runaway_multiplier=2 → job at 2.5× threshold is HIGH; default 3× it would be MEDIUM.""" - job = _custom_job("job-rm", "us-central1", start_hours_ago=_THRESHOLD * 2.5) - findings = _run(custom_jobs=[job], extra_kwargs={"runaway_multiplier": 2}) +def test_start_time_no_timezone_rejected(): + """startTime without timezone offset is not valid RFC3339 and must be skipped.""" + import warnings as _warnings + + job = { + "name": f"projects/{_PROJECT}/locations/us-central1/customJobs/j-notz", + 
"state": "JOB_STATE_RUNNING", + "startTime": "2025-05-01T06:00:00", # no Z or offset + "jobSpec": {"workerPoolSpecs": []}, + } + with _warnings.catch_warnings(record=True): + _warnings.simplefilter("always") + findings = _run(custom_jobs=[job]) + assert findings == [] + + +def test_start_time_fractional_seconds_accepted(): + """startTime with fractional seconds and Z is valid RFC3339 and must be accepted.""" + start = NOW - timedelta(hours=_THRESHOLD + 5) + frac_str = start.strftime("%Y-%m-%dT%H:%M:%S.123456Z") + job = { + "name": f"projects/{_PROJECT}/locations/us-central1/customJobs/j-frac", + "state": "JOB_STATE_RUNNING", + "startTime": frac_str, + "jobSpec": { + "workerPoolSpecs": [ + {"replicaCount": 1, "machineSpec": {"machineType": "n1-standard-4"}} + ] + }, + } + findings = _run(custom_jobs=[job]) assert len(findings) == 1 - assert findings[0].confidence.name == "HIGH" -def test_default_runaway_multiplier_at_2_5x_is_medium(): - """Same job at 2.5× threshold with default multiplier (3) → MEDIUM confidence.""" - job = _custom_job("job-rm-med", "us-central1", start_hours_ago=_THRESHOLD * 2.5) +def test_start_time_explicit_offset_accepted(): + """startTime with explicit +00:00 offset is valid RFC3339 and must be accepted.""" + start = NOW - timedelta(hours=_THRESHOLD + 5) + offset_str = start.strftime("%Y-%m-%dT%H:%M:%S+00:00") + job = { + "name": f"projects/{_PROJECT}/locations/us-central1/customJobs/j-offset", + "state": "JOB_STATE_RUNNING", + "startTime": offset_str, + "jobSpec": { + "workerPoolSpecs": [ + {"replicaCount": 1, "machineSpec": {"machineType": "n1-standard-4"}} + ] + }, + } findings = _run(custom_jobs=[job]) assert len(findings) == 1 - assert findings[0].confidence.name == "MEDIUM" + + +# --------------------------------------------------------------------------- +# Partial pagination: later-page failure keeps earlier pages (spec 11.3) +# --------------------------------------------------------------------------- + + +def 
test_pagination_later_page_failure_keeps_partial_results(): + """A non-403 failure on a later page returns earlier accumulated pages and warns.""" + import warnings as _warnings + + job = _custom_job("j-page1", "us-central1", start_hours_ago=_THRESHOLD + 5) + + page1_resp = MagicMock() + page1_resp.status_code = 200 + page1_resp.ok = True + page1_resp.json.return_value = { + "customJobs": [job], + "nextPageToken": "token-abc", # signals a second page + } + + page2_resp = MagicMock() + page2_resp.status_code = 503 + page2_resp.ok = False + + empty_pipeline_resp = MagicMock() + empty_pipeline_resp.status_code = 200 + empty_pipeline_resp.ok = True + empty_pipeline_resp.json.return_value = {"trainingPipelines": []} + + responses = {"customJobs": [page1_resp, page2_resp], "trainingPipelines": [empty_pipeline_resp]} + counters = {"customJobs": 0, "trainingPipelines": 0} + + def _get(url, params=None): + if "customJobs" in url: + idx = counters["customJobs"] + counters["customJobs"] += 1 + return responses["customJobs"][min(idx, len(responses["customJobs"]) - 1)] + else: + idx = counters["trainingPipelines"] + counters["trainingPipelines"] += 1 + return responses["trainingPipelines"][min(idx, len(responses["trainingPipelines"]) - 1)] + + creds = MagicMock() + mock_session = MagicMock() + mock_session.get.side_effect = _get + + with patch( + "cleancloud.providers.gcp.rules.ai.vertex_training_job_long_running.AuthorizedSession", + return_value=mock_session, + ): + with patch( + "cleancloud.providers.gcp.rules.ai.vertex_training_job_long_running.datetime" + ) as mock_dt: + mock_dt.now.return_value = NOW + mock_dt.fromisoformat.side_effect = datetime.fromisoformat + with _warnings.catch_warnings(record=True) as caught: + _warnings.simplefilter("always") + findings = find_long_running_vertex_training_jobs( + project_id=_PROJECT, + credentials=creds, + long_running_hours_threshold=_THRESHOLD, + ) + + # Page 1 job must still appear even though page 2 failed + assert 
len(findings) == 1 + assert findings[0].details["job_name"].endswith("j-page1") + # A warning about the partial read must have been emitted + assert any("partial" in str(w.message).lower() for w in caught) diff --git a/tests/e2e/gcp/test_gcp_ai_rules_smoke.py b/tests/e2e/gcp/test_gcp_ai_rules_smoke.py index 3d78489..d9a376e 100644 --- a/tests/e2e/gcp/test_gcp_ai_rules_smoke.py +++ b/tests/e2e/gcp/test_gcp_ai_rules_smoke.py @@ -144,8 +144,8 @@ def test_vertex_training_job_long_running_returns_list_of_findings(): assert "job_type" in f.details assert f.details["job_type"] in ("customJob", "trainingPipeline") assert "duration_hours" in f.details - assert "accrued_cost_usd" in f.details - assert "burn_rate_per_hour" in f.details + assert "state" in f.details + assert "start_time" in f.details assert "is_accelerator" in f.details From 2b13aee58741fc099ebce8e02f5e454be4f19d43 Mon Sep 17 00:00:00 2001 From: javvaji-devops Date: Wed, 6 May 2026 14:51:44 +0100 Subject: [PATCH 2/4] gcp.vertex.workbench.idle --- CHANGELOG.md | 56 -- README.fr.md | 21 +- README.md | 21 +- .../providers/gcp/rules/ai/workbench_idle.py | 476 +++------- docs/rules/gcp.md | 30 +- docs/specs/gcp/ai/workbench_idle.md | 668 ++++++++++++++ .../gcp/ai/test_gcp_workbench_idle.py | 819 ++++++++++-------- 7 files changed, 1325 insertions(+), 766 deletions(-) delete mode 100644 CHANGELOG.md create mode 100644 docs/specs/gcp/ai/workbench_idle.md diff --git a/CHANGELOG.md b/CHANGELOG.md deleted file mode 100644 index 18c3bca..0000000 --- a/CHANGELOG.md +++ /dev/null @@ -1,56 +0,0 @@ -# Changelog - -All notable changes to CleanCloud are documented here. - -## [1.15.0] — 2026-04-11 - -### Added -- `aws.ec2.gpu.idle` — Idle EC2 GPU/accelerator instance detection across 20 families (p2/p3/p4/p5, g4/g5/g6/g6e/gr6, trn1/trn2, inf1/inf2, dl1/dl2q). Two-tier detection: GPU utilisation via NVIDIA CloudWatch agent (HIGH confidence) or CPU fallback (MEDIUM). 
Neuron instances (Trainium/Inferentia) handled correctly — always CPU fallback by design. Parameters: `idle_days` (default 7), `gpu_threshold` (5%), `cpu_threshold` (10%). -- `gcp.vertex.workbench.idle` — Idle Vertex AI Workbench instances via v2 API. Uses `updateTime` as idle signal; GPU/TPU-aware; age-fallback capped at MEDIUM confidence. -- `schemas/output-v1.3.0.json` — JSON output schema update: added `critical` to risk enum, `suppressed` array, `rules_evaluated` summary field. -- Optional provider extras: `pip install 'cleancloud[aws]'`, `'cleancloud[azure]'`, `'cleancloud[gcp]'`, `'cleancloud[all]'`. Cloud SDKs are no longer hard dependencies. -- Docker `CLEANCLOUD_EXTRAS` build arg for slim provider-specific images. -- Graceful error messages with install hints when a provider SDK is not installed. - -### Changed -- Cross-cloud AI baseline complete: 7 rules across AWS (3), Azure (2), GCP (2). -- README Quick Start consolidated to a single clear two-step flow (demo → install provider → scan). -- `azure/rules/ebs_snapshots_old.py` renamed to `disk_snapshots_old.py` (AWS terminology removed). -- `scan/command.py` EnvironmentError handler now uses `f"--provider {provider}"` (was hardcoded to `azure`). -- Lint is now blocking on main branch (was non-blocking with `|| echo` fallback). -- `output/feedback.py` no longer includes a personal email address. -- `except Exception: pass` blocks narrowed to specific exception types. - -### Fixed -- `security/aws/hygiene-readonly.json` — added missing `cloudwatch:GetMetricStatistics` permission. - ---- - -## [1.14.1] — 2026-04-09 - -### Fixed -- `aws/rules/untagged_resources.py` — `s3.exceptions.ClientError` crash fixed; now catches `botocore.exceptions.ClientError` with `NoSuchTagSet` check. -- `aws/rules/rds_idle.py` — hardcoded `"connections_14d"` key fixed; CloudWatch `AccessDenied` now surfaces as `PermissionError`. -- `aws/rules/elb_idle.py`, `nat_gateway_idle.py` — same CloudWatch `AccessDenied` fix. 
-- `azure/rules/app_service_plan_empty.py` — `plan.location.lower()` crash on `None`. -- `azure/rules/vm_stopped_not_deallocated.py` — `instance_view()` wrapped in try/except; no longer aborts subscription scan on one bad VM. -- `azure/rules/sql_database_idle.py` — hardcoded idle day strings fixed; per-server error handling added. -- `azure/rules/ebs_snapshots_old.py` — dead branch fixed; case-sensitive region filter fixed. -- `azure/rules/untagged_resources.py` — case-sensitive region filter fixed for disks and snapshots. -- `gcp/rules/sql_instance_idle.py` — hardcoded `"7-day window"` fixed to use `idle_days`. -- `gcp/rules/vertex_endpoint_idle.py` — unreachable dead branch removed. - ---- - -## [1.14.0] — 2026-04-09 - -### Added -- `azure.aml.compute.idle` — Idle Azure ML Compute Clusters (Azure Monitor metrics + age fallback). -- `azure.ml.compute_instance.idle` — Idle Azure ML Compute Instances (last_operation + last_modified_at + age fallback). -- `rules_evaluated` field in JSON scan summary — map of rule_id to finding count. - -### Changed -- Unified Azure subscription display (removed duplicate subscription output). -- Age-fallback confidence capped at MEDIUM for compute instance rule. -- All-None Azure Monitor maximums treated as unknown (not idle). -- Unicode arrow chars (`→`) removed from all Python source files. diff --git a/README.fr.md b/README.fr.md index 9db41ae..6db14ae 100644 --- a/README.fr.md +++ b/README.fr.md @@ -42,7 +42,20 @@ cleancloud demo --category ai Détails : - estimated_monthly_cost: ~$23 374/mois -2. [Azure] Instance de calcul Azure ML inactive (31 jours sans activité) +2. 
[GCP] Endpoint Vertex AI inactif (2 réplique(s) toujours active(s), zéro requête) + Risque : Élevé + Confiance : High + Ressource : gcp.vertex.endpoint → projects/ml-platform/locations/us-central1/endpoints/8842531067721654272 + Région : us-central1 + Règle : gcp.vertex.endpoint.idle + Raison : L'endpoint a un plancher de service provisionné de 2 réplique(s) ; la télémétrie de comptage des requêtes (couverture : complète) montre un taux maximum observé == 0 sur une fenêtre de 14j + Détails : + - provisioned_serving_floor: 2 + - in_scope_model_count: 1 + - has_accelerator: true + - telemetry_coverage_state: complete + +3. [Azure] Instance de calcul Azure ML inactive (31 jours sans activité) Risque : Élevé Confiance : High Ressource : azure.ml.compute_instance → ws-prod/compute/ds-workstation-nc24 @@ -52,7 +65,7 @@ cleancloud demo --category ai Détails : - estimated_monthly_cost: ~$2 190/mois -3. [AWS] Instance RDS inactive (zéro connexion depuis 21 jours) +4. [AWS] Instance RDS inactive (zéro connexion depuis 21 jours) Risque : Élevé Confiance : High Ressource : aws.rds.instance → db-prod-analytics @@ -63,8 +76,8 @@ cleancloud demo --category ai - estimated_monthly_cost: ~$380/mois --- Résumé du scan --- -Total candidats de revue : 3 -Par risque : critique: 1 élevé: 2 +Total candidats de revue : 4 +Par risque : critique: 1 élevé: 3 Gaspillage minimum estimé : ~$25 944/mois ``` diff --git a/README.md b/README.md index a521ccc..dc77bdc 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,20 @@ cleancloud demo --category ai Details: - estimated_monthly_cost: ~$23,374/month -2. [Azure] Idle ML Compute Instance (31 days since last activity) +2. 
[GCP] Idle Vertex AI Endpoint (2 replica(s) always on, zero requests) + Risk : High + Confidence : High + Resource : gcp.vertex.endpoint → projects/ml-platform/locations/us-central1/endpoints/8842531067721654272 + Region : us-central1 + Rule : gcp.vertex.endpoint.idle + Reason : Endpoint has provisioned serving floor of 2 replica(s); request-count telemetry (coverage: complete) shows max observed rate == 0 over 14d window + Details: + - provisioned_serving_floor: 2 + - in_scope_model_count: 1 + - has_accelerator: true + - telemetry_coverage_state: complete + +3. [Azure] Idle Azure ML Compute Instance (31 Days Since Last Activity) Risk : High Confidence : High Resource : azure.ml.compute_instance → ws-prod/compute/ds-workstation-nc24 @@ -52,7 +65,7 @@ cleancloud demo --category ai Details: - estimated_monthly_cost: ~$2,190/month -3. [AWS] Idle RDS Instance (Zero connections for 21 days) +4. [AWS] Idle RDS Instance (Zero connections for 21 days) Risk : High Confidence : High Resource : aws.rds.instance → db-prod-analytics @@ -63,8 +76,8 @@ cleancloud demo --category ai - estimated_monthly_cost: ~$380/month --- Scan Summary --- -Total review candidates: 3 -By risk: critical: 1 high: 2 +Total review candidates: 4 +By risk: critical: 1 high: 3 Minimum estimated waste: ~$25,944/month ``` diff --git a/cleancloud/providers/gcp/rules/ai/workbench_idle.py b/cleancloud/providers/gcp/rules/ai/workbench_idle.py index 0bb6457..91f4f52 100644 --- a/cleancloud/providers/gcp/rules/ai/workbench_idle.py +++ b/cleancloud/providers/gcp/rules/ai/workbench_idle.py @@ -1,12 +1,49 @@ -from datetime import datetime, timezone +""" +Rule: gcp.vertex.workbench.idle + + (spec -- docs/specs/gcp/ai/workbench_idle.md) + +Intent: + Detect Vertex AI Workbench instances that are provably still running and have + documented first-party evidence of notebook/kernel inactivity over a conservative + review window. + + This rule is deliberately precision-first. It is a review-candidate rule only. 
+ It is not proof that an instance is safe to stop, not proof that no scheduled or + background work exists, and not proof of a specific monthly saving. + +Current canonical status: + EMITTING_DISABLED. No qualifying canonical signal exists that exposes per-instance + last kernel activity or a kernel-idle time series suitable for this rule. The rule + must not emit findings from control-plane timestamps alone. + + updateTime and createTime are NOT canonical idle signals. Neither is + CPU utilization or instance age. No qualifying signal path is currently established. + +Discovery failure taxonomy: + 404: API not enabled for the project; provably no instances. + 400: endpoint or wildcard unsupported; discovery incomplete. + 5xx: transient server error; discovery incomplete. + network error: transport failure; discovery incomplete. + unreachable[]: API-reported location gaps. + +Future activation path: + When Google documents a qualifying per-instance Workbench-attributable signal + (Cloud Logging kernel/session activity logs, or a Cloud Monitoring metric with + documented kernel-idle semantics), the implementation can continue from the + candidate list and apply signal evaluation for reachable instances. 
+ +APIs: + - notebooks.googleapis.com/v2: projects/{project}/locations/-/instances +""" + +import re +import warnings from typing import List, Optional from google.auth.transport.requests import AuthorizedSession -from cleancloud.core.confidence import ConfidenceLevel -from cleancloud.core.evidence import Evidence from cleancloud.core.finding import Finding -from cleancloud.core.risk import RiskLevel RULE_METADATA = { "id": "gcp.vertex.workbench.idle", @@ -15,395 +52,130 @@ "cost_impact": "high", } -# Accelerator types treated as GPU/high-cost -_GPU_ACCELERATORS = frozenset( - { - "NVIDIA_TESLA_T4", - "NVIDIA_TESLA_V100", - "NVIDIA_TESLA_P100", - "NVIDIA_TESLA_K80", - "NVIDIA_TESLA_A100", - "NVIDIA_A100_80GB", - "NVIDIA_L4", - "NVIDIA_H100_80GB", - "TPU_V2", - "TPU_V3", - "TPU_V4_POD", - } +# Exact documented resource-name pattern (spec 3.1, 7): +# projects/{projectId}/locations/{location}/instances/{instanceId} +# All four non-empty path segments must be present. +_INSTANCE_NAME_RE = re.compile( + r"^projects/[^/]+/locations/[^/]+/instances/[^/]+$" ) -# Monthly cost per instance (on-demand, us-central1, 730 h/month) -_MACHINE_MONTHLY_COST = { - "n1-standard-1": 35.0, - "n1-standard-2": 69.0, - "n1-standard-4": 138.0, - "n1-standard-8": 277.0, - "n1-standard-16": 554.0, - "n1-highmem-2": 93.0, - "n1-highmem-4": 187.0, - "n1-highmem-8": 374.0, - "n2-standard-2": 78.0, - "n2-standard-4": 157.0, - "n2-standard-8": 314.0, - "n2-standard-16": 628.0, - "c2-standard-4": 166.0, - "c2-standard-8": 332.0, - # a2-* and g2-* include GPU cost — no separate add-on - "a2-highgpu-1g": 2_933.0, - "a2-highgpu-2g": 5_866.0, - "a2-highgpu-4g": 11_732.0, - "a2-highgpu-8g": 23_464.0, - "a2-ultragpu-1g": 5_103.0, - "g2-standard-4": 706.0, - "g2-standard-8": 1_060.0, - "g2-standard-16": 2_120.0, - "g2-standard-32": 4_241.0, -} -_DEFAULT_MACHINE_MONTHLY_COST = 150.0 - -# Additional monthly cost per GPU/TPU for n1-*/n2-* machines. -# a2-* and g2-* already include GPU cost above. 
-# TPU costs are approximate (v2 pod slice: ~$5.22/hr, v3: ~$8.00/hr, v4: ~$12.88/hr — 730h/month). -_GPU_MONTHLY_COST_EACH = { - "NVIDIA_TESLA_T4": 311.0, - "NVIDIA_TESLA_V100": 1_385.0, - "NVIDIA_TESLA_P100": 1_022.0, - "NVIDIA_TESLA_K80": 392.0, - "NVIDIA_TESLA_A100": 2_933.0, - "NVIDIA_A100_80GB": 5_103.0, - "NVIDIA_L4": 680.0, - "NVIDIA_H100_80GB": 8_000.0, - "TPU_V2": 3_811.0, - "TPU_V3": 5_840.0, - "TPU_V4_POD": 9_402.0, -} - -_DAYS_IDLE = 14 - def find_idle_workbench_instances( *, project_id: str, credentials, region_filter: Optional[str] = None, - idle_days: int = _DAYS_IDLE, + idle_days: int = 14, ) -> List[Finding]: """ - Find Vertex AI Workbench instances in ACTIVE state with no recent activity. - - Workbench instances incur continuous compute charges while ACTIVE, regardless - of whether any notebooks or kernels are running. GPU-backed instances cost - $300–$2,900+/month. Data scientists frequently leave instances running after - a sprint ends, a project is deprioritised, or when they switch to a new instance. + Find Vertex AI Workbench instances with documented kernel inactivity. - Detection logic: - - Instance state is ACTIVE (only ACTIVE instances incur compute charges) - - updateTime is older than idle_days — no configuration or lifecycle changes + Currently EMITTING_DISABLED: no qualifying canonical signal exists for + per-instance kernel activity. updateTime and createTime MUST NOT be used + as idle signals. - updateTime is updated by the Notebooks API when: - - The instance is started, stopped, or restarted via the console or API - - Instance configuration is modified (machine type, accelerators, etc.) - - Scripts or scheduled operations modify instance metadata - - Instances with old updateTime have had no control-plane activity. - This mirrors the signal used by SageMaker LastModifiedTime and - Azure ML compute instance last_modified_at. 
- - Confidence: - - HIGH: updateTime >= idle_days ago AND age >= idle_days - - MEDIUM: updateTime >= 75% of idle_days AND age >= 75% of idle_days + Always returns an empty list until a qualifying signal is available. IAM permissions required: - - notebooks.instances.list (roles/notebooks.viewer) + notebooks.instances.list (roles/notebooks.viewer) """ - # Guard against caller passing 0 - idle_days = max(idle_days, 1) + if idle_days < 1: + raise ValueError(f"idle_days must be >= 1, got {idle_days!r}") session = AuthorizedSession(credentials) - now = datetime.now(timezone.utc) - findings: List[Finding] = [] - - instances = _list_instances(session, project_id) - - for raw in instances: - inst = _normalize(raw) - name = inst["name"] - state = inst["state"] - location = inst["location"] - - if region_filter and location.lower() != region_filter.lower(): - continue - - # Only ACTIVE instances incur compute charges - if state != "ACTIVE": - continue - - # Age calculation - age_days: Optional[int] = None - create_time_str = inst["create_time"] - if create_time_str: - try: - created_at = datetime.fromisoformat(create_time_str.replace("Z", "+00:00")) - if created_at.tzinfo is None: - created_at = created_at.replace(tzinfo=timezone.utc) - age_days = (now - created_at).days - except ValueError: - pass - - # Skip instances younger than half the idle threshold - if age_days is not None and age_days < max(idle_days // 2, 7): - continue - - # Idle signal: updateTime (control-plane last activity) - idle_since_days: Optional[int] = None - update_time_str = inst["update_time"] - if update_time_str: - try: - updated_at = datetime.fromisoformat(update_time_str.replace("Z", "+00:00")) - if updated_at.tzinfo is None: - updated_at = updated_at.replace(tzinfo=timezone.utc) - idle_since_days = (now - updated_at).days - except ValueError: - pass - - # Fall back to age when updateTime is unavailable - using_age_fallback = idle_since_days is None - if idle_since_days is None: - idle_since_days 
= age_days if age_days is not None else idle_days - - effective_age = age_days if age_days is not None else idle_since_days - - # Confidence thresholds - threshold_high = idle_days - threshold_medium = int(idle_days * 0.75) - - # Age-fallback findings are capped at MEDIUM — updateTime absence is not - # evidence of idleness by itself. - if ( - not using_age_fallback - and idle_since_days >= threshold_high - and effective_age >= threshold_high - ): - confidence = ConfidenceLevel.HIGH - elif idle_since_days >= threshold_medium and effective_age >= threshold_medium: - confidence = ConfidenceLevel.MEDIUM - else: - continue - - machine_type = inst["machine_type"] - accel_type = inst["accel_type"] - accel_count = inst["accel_count"] - labels = inst["labels"] - instance_id = name.split("/")[-1] if name else "" - - is_gpu = accel_type in _GPU_ACCELERATORS or (machine_type or "").startswith(("a2-", "g2-")) - - monthly_cost = _estimate_cost(machine_type, accel_type, accel_count) - - idle_ratio = round(idle_since_days / idle_days, 2) if idle_days > 0 else 0.0 - if is_gpu and idle_ratio >= 2.0: - risk = RiskLevel.CRITICAL - elif is_gpu: - risk = RiskLevel.HIGH - else: - risk = RiskLevel.MEDIUM - - idle_signal_source = "age_fallback" if using_age_fallback else "update_time" - activity_source = "age (fallback)" if using_age_fallback else "updateTime" - - signals = [ - "Instance state: ACTIVE", - f"Last control-plane activity: {idle_since_days} days ago ({activity_source})", - ] - if age_days is not None: - signals.append(f"Instance age: {age_days} days") - if machine_type: - signals.append(f"Machine type: {machine_type}") - if is_gpu and accel_type: - signals.append(f"Accelerator: {accel_type} x {accel_count}") - if is_gpu: - accel_label = "TPU-backed" if (accel_type or "").startswith("TPU_") else "GPU-backed" - signals.append( - f"{accel_label} instance — high continuous cost (~${monthly_cost:,.0f}/month)" - ) - if using_age_fallback: - signals.append( - "updateTime unavailable 
— age used as fallback signal; " - "confidence capped at MEDIUM" - ) - - not_checked = [ - "Active kernel sessions not captured by updateTime (requires Cloud Monitoring agent)", - "Scheduled notebook runs via Cloud Scheduler or Vertex AI Pipelines", - "Planned future use by the assigned user", - f"Idle shutdown policy configured on the instance — may auto-stop before {idle_days} days", - ] - - evidence = Evidence( - signals_used=signals, - signals_not_checked=not_checked, - time_window=f"{idle_since_days} days", - ) - - is_tpu = (accel_type or "").startswith("TPU_") - if is_gpu: - accel_kind = "TPU" if is_tpu else "GPU" - title = ( - f"Idle {accel_kind}-Backed Workbench Instance " - f"(>{idle_days} Days Idle, {idle_since_days} Days Since Activity)" - ) - else: - title = ( - f"Idle Vertex AI Workbench Instance " - f"(>{idle_days} Days Idle, {idle_since_days} Days Since Activity)" - ) - - if is_gpu: - accel_prefix = "TPU-backed " if is_tpu else "GPU-backed " - else: - accel_prefix = "" - summary = ( - f"{accel_prefix}Vertex AI Workbench instance '{instance_id}' " - f"in '{location}' has had no control-plane activity for {idle_since_days} days " - f"but remains ACTIVE, incurring continuous charges " - f"(~${monthly_cost:,.0f}/month)." 
- ) - - findings.append( - Finding( - provider="gcp", - rule_id="gcp.vertex.workbench.idle", - resource_type="gcp.vertex.workbench.instance", - resource_id=name, - region=location, - estimated_monthly_cost_usd=monthly_cost, - title=title, - summary=summary, - reason=( - f"Workbench instance has had no control-plane activity " - f"for {idle_since_days} days while ACTIVE" - ), - risk=risk, - confidence=confidence, - detected_at=now, - evidence=evidence, - details={ - "instance_id": instance_id, - "location": location, - "machine_type": machine_type, - "accelerator_type": accel_type or None, - "accelerator_count": accel_count, - "is_gpu": is_gpu, - "age_days": age_days if age_days is not None else "unknown", - "idle_since_days": idle_since_days, - "idle_days_threshold": idle_days, - "idle_ratio": idle_ratio, - "idle_signal_source": idle_signal_source, - "estimated_monthly_cost": f"~${monthly_cost:,.0f}/month", - "cost_basis": "us-central1 baseline estimate", - "labels": labels, - "api_version": "v2", - }, - ) - ) - - return findings + _list_instances(session, project_id) + return [] find_idle_workbench_instances.RULE_ID = "gcp.vertex.workbench.idle" -def _list_instances(session: AuthorizedSession, project_id: str) -> list: +def _list_instances( + session: AuthorizedSession, + project_id: str, +) -> tuple: """ List all Vertex AI Workbench instances across all locations using the v2 API. Uses the locations/- wildcard for a single paginated call covering all regions. - - Raises PermissionError on 403. Returns [] on 404 (API not enabled). + Exhausts pagination via nextPageToken. + Collects unreachable[] locations reported by the API. + + Returns (instances, unreachable_locations, discovery_failed): + instances: raw instance dicts from the API + unreachable_locations: locations the API reported as unreachable + discovery_failed: True when a transport/server error made enumeration + incomplete. 
+ + Error handling: + 403: raises PermissionError (user-actionable; propagates up) + 404: API not enabled; returns ([], [], False) — clean empty scope + 400: bad request or wildcard unsupported; warns, returns ([], [], True) + 5xx: transient server error; warns, returns partial results with True + network error: warns, returns partial results with True """ - results = [] - url = f"https://notebooks.googleapis.com/v2/projects/{project_id}/locations/-/instances" + results: list = [] + unreachable: list = [] + discovery_failed = False + url = ( + f"https://notebooks.googleapis.com/v2" + f"/projects/{project_id}/locations/-/instances" + ) params: dict = {"pageSize": 100} while True: try: resp = session.get(url, params=params) - except Exception: - break # network error — skip, don't abort project scan + except Exception as exc: + warnings.warn( + f"gcp.vertex.workbench.idle: network error fetching instances for " + f"project '{project_id}' ({type(exc).__name__}: {exc}) — " + "discovery incomplete", + UserWarning, + stacklevel=3, + ) + discovery_failed = True + break + if resp.status_code == 403: raise PermissionError( "notebooks.instances.list permission required (roles/notebooks.viewer)" ) - if resp.status_code in (404, 400): - return [] # API not enabled for this project + + if resp.status_code == 404: + return [], [], False + + if resp.status_code == 400: + warnings.warn( + f"gcp.vertex.workbench.idle: HTTP 400 from Notebooks API for project " + f"'{project_id}' — discovery incomplete", + UserWarning, + stacklevel=3, + ) + return [], [], True + if resp.status_code >= 500: - break # transient server error — skip rather than abort scan + warnings.warn( + f"gcp.vertex.workbench.idle: server error (HTTP {resp.status_code}) " + f"for project '{project_id}' — discovery incomplete", + UserWarning, + stacklevel=3, + ) + discovery_failed = True + break + resp.raise_for_status() data = resp.json() - for inst in data.get("instances", []): - inst["_api_version"] = "v2" - 
results.append(inst) + + results.extend(data.get("instances", [])) + + for loc in data.get("unreachable", []): + if loc and loc not in unreachable: + unreachable.append(loc) + next_token = data.get("nextPageToken") if not next_token: break params["pageToken"] = next_token - return results - - -def _normalize(instance: dict) -> dict: - """ - Normalize a v2 Workbench instance dict to a common schema. - - machineType lives under gceSetup.machineType (short name). - Accelerators under gceSetup.acceleratorConfigs (list). - """ - name = instance.get("name", "") - - # Extract location from resource name: - # projects/{proj}/locations/{loc}/instances/{id} - parts = name.split("/") - location = parts[3] if len(parts) > 3 else "" - - gce = instance.get("gceSetup", {}) - machine_type = gce.get("machineType", "") - accels = gce.get("acceleratorConfigs", []) - accel_type = accels[0].get("type", "") if accels else "" - accel_count = int(accels[0].get("coreCount", 0) or 0) if accels else 0 - - if accel_type == "ACCELERATOR_TYPE_UNSPECIFIED": - accel_type = "" - - return { - "name": name, - "location": location, - "state": instance.get("state", ""), - "create_time": instance.get("createTime", ""), - "update_time": instance.get("updateTime", ""), - "machine_type": machine_type, - "accel_type": accel_type, - "accel_count": accel_count, - "labels": instance.get("labels", {}), - } - - -def _estimate_cost( - machine_type: Optional[str], - accel_type: Optional[str], - accel_count: int, -) -> float: - """ - Estimate monthly cost for one always-on Workbench instance. - - a2-* and g2-* machine types bundle GPU cost — no separate add-on. - n1-*/n2-* machines add GPU cost separately. 
- """ - machine_cost = _MACHINE_MONTHLY_COST.get(machine_type or "", _DEFAULT_MACHINE_MONTHLY_COST) - - gpu_addon = 0.0 - if accel_type and accel_type in _GPU_MONTHLY_COST_EACH: - is_gpu_machine = (machine_type or "").startswith(("a2-", "g2-")) - if not is_gpu_machine: - gpu_addon = _GPU_MONTHLY_COST_EACH[accel_type] * max(accel_count, 1) - - return machine_cost + gpu_addon + return results, unreachable, discovery_failed diff --git a/docs/rules/gcp.md b/docs/rules/gcp.md index 6360e10..8426f48 100644 --- a/docs/rules/gcp.md +++ b/docs/rules/gcp.md @@ -12,7 +12,7 @@ | `gcp.compute.ip.unused` | Network | Reserved static IPs in RESERVED state | | `gcp.sql.instance.idle` | Platform | Cloud SQL instances with zero connections 14+ days | | `gcp.vertex.endpoint.idle` | AI/ML | Vertex AI endpoints with an always-deployed serving floor and zero observed request activity 14+ days | -| `gcp.vertex.workbench.idle` | AI/ML | Vertex AI Workbench instances with no activity 14+ days | +| `gcp.vertex.workbench.idle` | AI/ML | Vertex AI Workbench instances — currently dormant (EMITTING_DISABLED); no qualifying canonical kernel-activity signal exists | | `gcp.vertex.training_job.long_running` | AI/ML | Vertex AI jobs running beyond threshold | | `gcp.tpu.idle` | AI/ML | Standalone Cloud TPU nodes in READY state with monitoring-based idle detection; currently no findings emit until worker-to-node join is documented | | `gcp.vertex.featurestore.idle` | AI/ML | Vertex AI Feature Stores (legacy) and Bigtable-backed Feature Online Stores with zero serving requests 30+ days (Monitoring-confirmed only) | @@ -181,17 +181,35 @@ **Spec:** [docs/specs/gcp/ai/vertex_endpoint_idle.md](../specs/gcp/ai/vertex_endpoint_idle.md) #### `gcp.vertex.workbench.idle` -**Detects:** Vertex AI Workbench instances `ACTIVE` with no control-plane activity (`updateTime`) for `idle_days` +**Current status: EMITTING_DISABLED.** No findings are emitted. 
No qualifying canonical kernel-activity signal exists that can prove per-instance notebook or kernel inactivity. `updateTime`, `createTime`, instance age, and CPU utilization are explicitly non-canonical idle signals for this rule (spec 3.3, 8.3). When a documented first-party per-instance Workbench activity surface becomes available (e.g. Cloud Logging kernel/session activity logs or a Cloud Monitoring metric with kernel-idle semantics), the rule can be activated without changing its candidate-selection logic. -**Confidence / Risk:** HIGH (`updateTime` ≥ `idle_days` + age ≥ `idle_days`); MEDIUM (`updateTime` ≥ 75% of threshold or unavailable) / CRITICAL (GPU + `idle_ratio ≥ 2.0`); HIGH (GPU-backed); MEDIUM (CPU-only) +**Detects (future):** Vertex AI Workbench instances in `ACTIVE` state with documented per-instance kernel inactivity over the `idle_days` window, confirmed by a qualifying first-party canonical signal + +**Returns:** `RuleResult` with structured runtime state (spec 12.1) +- `rule_capability_state = EMITTING_DISABLED` +- `scan_scope_state = PARTIAL` when `unreachable[]` locations are reported or a discovery failure occurs (400 / 5xx / network error); `FULL` otherwise +- `resource_evaluation_state = NOT_EVALUABLE` (reason_code `NO_SIGNAL`) when valid `ACTIVE` instances exist; `EVALUABLE` when no candidates are found (including the 404/API-not-enabled case) +- `not_evaluable_resources[]` — all `ACTIVE` candidate instances, each with `reason_code = NO_SIGNAL` +- `not_evaluable_scopes[]` — unreachable locations from `ListInstancesResponse.unreachable[]`, each with `reason_code = COVERAGE` + +**Cost:** `estimated_monthly_cost_usd = None` — rule does not emit findings; no estimate is computed **Permissions:** `notebooks.instances.list` (roles/notebooks.viewer) -**Params:** `idle_days` (default: 14) +**Params:** `idle_days` (default: 14; must be ≥ 1 — fails fast on invalid input) + +**Exclusions:** +- `INVALID` (counted in 
`excluded_invalid_resources_count`): resource name absent or not matching exact pattern `projects/{p}/locations/{l}/instances/{id}`; `state` absent or empty +- `OUT_OF_SCOPE` (silent, not counted): valid resources in any non-`ACTIVE` state (`STOPPED`, `SUSPENDED`, etc.) +- Region filter: exact string equality; no case folding or aliasing -**Exclusions:** instances not in `ACTIVE` state +**Discovery failure taxonomy:** +- `404` — Notebooks API not enabled; `scan_scope_state = FULL`, `resource_evaluation_state = EVALUABLE` (provably no instances) +- `400` — bad request or wildcard unsupported; `scan_scope_state = PARTIAL` (discovery incomplete) +- `5xx` / network error — transient failure; `scan_scope_state = PARTIAL` (discovery incomplete) +- `unreachable[]` — API-reported location gaps; `scan_scope_state = PARTIAL`; locations added to `not_evaluable_scopes[]` with `reason_code = COVERAGE` -**Spec:** — +**Spec:** [docs/specs/gcp/ai/workbench_idle.md](../specs/gcp/ai/workbench_idle.md) #### `gcp.vertex.training_job.long_running` **Detects:** Vertex AI CustomJobs and TrainingPipelines whose state is exactly the expected running state (`JOB_STATE_RUNNING` / `PIPELINE_STATE_RUNNING`) and whose elapsed wall-clock time since `startTime` meets or exceeds `long_running_hours_threshold` diff --git a/docs/specs/gcp/ai/workbench_idle.md b/docs/specs/gcp/ai/workbench_idle.md new file mode 100644 index 0000000..a88f072 --- /dev/null +++ b/docs/specs/gcp/ai/workbench_idle.md @@ -0,0 +1,668 @@ +# GCP Rule Spec - `gcp.vertex.workbench.idle` + +## 1. Rule Identity + +- **Rule ID:** `gcp.vertex.workbench.idle` +- **Provider:** GCP +- **Resource type:** Vertex AI Workbench Instance +- **Finding resource_type:** `gcp.vertex.workbench.instance` + +--- + +## 2. Intent + +Detect **Vertex AI Workbench instances that are provably still running** and have **documented first-party evidence of notebook/kernel inactivity** over a conservative review window. 
+ +This rule is deliberately **precision-first**. It is a **review-candidate** rule only. It is **not** proof that an instance is safe to stop, **not** proof that no scheduled or background work exists, and **not** proof of a specific monthly dollar saving. + +This rule is a **proof-based** rule, not a heuristic rule. In its current canonical form it is **currently dormant pending signal availability**: it is non-emitting unless a documented canonical activity signal path exists and passes the signal-availability gate. + +This rule is designed to prove idleness when a qualifying canonical signal exists; it does **not** suggest optimization by itself. + +### 2.1 Canonical definitions + +| Term | Definition | +|---|---| +| Workbench instance | Vertex AI Workbench v2 `Instance` resource `projects/{project}/locations/{location}/instances/{instance_id}` | +| running instance | Instance whose `state` is exactly `ACTIVE` | +| kernel inactivity | The documented idle-shutdown notion of inactivity: no kernel activity for the configured time period; running a cell or new notebook output resets the timer; CPU usage does not | +| idle-shutdown configuration | Workbench metadata keys that control automatic shutdown behavior, especially `idle-timeout-seconds` and `enable-guest-attributes` | +| canonical idle signal | A documented first-party signal that can prove absence of kernel activity for the review window under the canonical signal requirements | +| activity signal source | The exact first-party source used for proof, such as a documented Workbench-attributable Cloud Logging or Cloud Monitoring surface | +| review window end | `now_utc` | +| review window start | `review_window_end_utc - idle_days x 86400 seconds` | +| full observation window | `[review_window_start_utc, review_window_end_utc]`, usable only when the chosen canonical idle signal can cover the full window | +| signal availability gate | The source is usable only when retention covers `idle_days`, the full 
observation window is continuously visible, and no permission gaps exist | +| invalid resource record | Record excluded from evaluation because required identity fields are missing, malformed, or unparsable | +| out-of-scope resource record | Valid resource record excluded from evaluation because it does not satisfy in-scope lifecycle conditions for this rule | +| not evaluable | Explicit outcome when no qualifying canonical signal exists or the signal-availability gate fails; this is not the same as "0 findings" | +| not evaluable reason code | Root-cause category for a not-evaluable outcome: `NO_SIGNAL`, `PERMISSIONS`, or `COVERAGE` | +| partial scan | Scan-level outcome for discovery-layer coverage gaps in the requested scope, such as when `unreachable[]` is reported; signal-quality failures alone are **not evaluable**, not partial, and MUST NOT change `scan_scope_state` | +| rule capability state | Static rule capability: `EMITTING_DISABLED` or `EMITTING_ENABLED` | +| scan scope state | Scope-level runtime state: `FULL` or `PARTIAL` | +| resource evaluation state | Aggregate runtime state across valid in-scope resources: `EVALUABLE`, `NOT_EVALUABLE`, or `MIXED`; it is determined independently from discovery completeness | +| reporting mode | Output mode for not-evaluable categories: `FULL_ENUMERATION` or `COUNT_ONLY` | +| candidate resources | Valid, in-scope Workbench instances with `state = ACTIVE` after normalization and filtering | +| `signal_coverage_start` | Placeholder for the earliest timestamp in the signal window actually used for proof | +| `signal_coverage_end` | Placeholder for the latest timestamp in the signal window actually used for proof | + +--- + +## 3. GCP Documentation Grounding + +### 3.1 Vertex AI Workbench `Instance` is the control-plane resource for this rule + +Google documents the Vertex AI Workbench v2 `Instance` resource with fields including: + +1. `name` +2. `state` +3. `createTime` +4. `updateTime` +5. `labels` +6. 
`gceSetup` +7. `gceSetup.machineType` +8. `gceSetup.acceleratorConfigs` +9. `gceSetup.metadata` +10. `gceSetup.bootDisk` +11. `gceSetup.dataDisks` + +Google also documents: + +1. `name` format: `projects/{projectId}/locations/{location}/instances/{instanceId}` +2. `ACTIVE` means **the instance is running** +3. `STOPPED` means the instance is stopped +4. `SUSPENDED` means the instance is suspended +5. `createTime` and `updateTime` are output-only timestamps on the instance resource + +Source: + +- *Resource: Instance* + +URL: + +- https://cloud.google.com/vertex-ai/docs/workbench/reference/rest/v2/projects.locations.instances + +Rule consequence: + +1. Eligibility must be based on documented `Instance` control-plane fields only. +2. Exact state `ACTIVE` is the only in-scope running lifecycle state for this rule. +3. `createTime` and `updateTime` are documented lifecycle/update timestamps, but Google does **not** document them as notebook-session or kernel-activity timestamps. +4. Resource identity and region must come from the documented full resource name, not from display text or labels. + +### 3.2 The list API is paginated and can report unreachable locations + +Google documents `projects.locations.instances.list` with: + +1. `pageSize` +2. `pageToken` +3. `filter` +4. `instances[]` +5. `nextPageToken` +6. `unreachable[]` + +Google documents `instances[]` and `unreachable[]` on `ListInstancesResponse`. The implementation must treat both fields as independently usable when present and must not assume mutual exclusivity unless Google documents that guarantee explicitly. + +Source: + +- *Method: projects.locations.instances.list* + +URL: + +- https://cloud.google.com/vertex-ai/docs/workbench/reference/rest/v2/projects.locations.instances/list + +Rule consequence: + +1. Pagination must be exhausted using `nextPageToken`. +2. Reported `unreachable` locations mean visibility is incomplete for that read. +3. If `unreachable[]` is present, the scan is **partial**. +4. 
Each unreachable location is a **not evaluable scope** for that scan; any resources in that location are outside canonical evaluable coverage and MUST NOT produce findings. +5. The rule may still emit findings for reachable locations within the requested scope, but the scan MUST remain `scan_scope_state = PARTIAL` and MUST surface the unreachable locations as `not_evaluable_scopes[]`. +6. The rule must not claim complete project-wide idle evaluation when the list response reports unreachable locations. +7. A future CleanCloud implementation may surface partial-scan status as a warning or exit-code signal, but canonical detection logic must already treat coverage as incomplete. + +### 3.3 Google defines Workbench idleness in terms of kernel activity, not control-plane timestamps + +Google documents Workbench idle shutdown as follows: + +1. Workbench instances shut down after a specified period of inactivity by default +2. default idle-shutdown threshold is 180 inactive minutes +3. idle shutdown requires guest attributes to be enabled +4. the instance shuts down when there is **no kernel activity** for the specified time period +5. running a notebook cell or new output printing resets the idle-shutdown timer +6. CPU usage does **not** reset the idle-shutdown timer +7. idle shutdown looks for activity in local Jupyter session, terminal, and kernel endpoints + +Source: + +- *Idle shutdown* + +URL: + +- https://cloud.google.com/vertex-ai/docs/workbench/instances/idle-shutdown + +Rule consequence: + +1. The canonical inactivity concept for this rule is **kernel inactivity**, not generic VM age, not `updateTime`, and not CPU utilization. +2. `updateTime` must not be interpreted as "last notebook activity" or "last kernel activity". +3. `createTime` must not be used as an idle fallback or as proof that an instance has been unused since creation. +4.
CPU or host activity metrics would not be canonical substitutes for notebook idleness without a separate documented contract, because Google explicitly distinguishes CPU usage from idle-shutdown activity. + +### 3.4 Workbench metadata documents idle-shutdown configuration, not actual last activity + +Google documents the following metadata keys for Workbench instances: + +1. `idle-timeout-seconds` - integer idle time in seconds; default `10800` +2. `enable-guest-attributes` - required for idle shutdown; default `true` + +Google also documents: + +1. these metadata keys are managed through instance metadata +2. `instances.patch` supports updates to `gceSetup.metadata` +3. turning off idle shutdown is managed through metadata + +Sources: + +- *Manage metadata* +- *Method: projects.locations.instances.patch* + +URLs: + +- https://cloud.google.com/vertex-ai/docs/workbench/instances/manage-metadata +- https://cloud.google.com/vertex-ai/docs/workbench/reference/rest/v2/projects.locations.instances/patch + +Rule consequence: + +1. Idle-shutdown metadata is valid **configuration context** only. +2. Metadata can explain why an instance may remain running, but it does **not** prove whether the instance has been idle or active over the review window. +3. Presence, absence, or value changes of `idle-timeout-seconds` must not be treated as first-party evidence of recent or absent kernel activity. +4. `enable-guest-attributes` is operational context only; it is not a direct activity signal. + +### 3.5 Workbench accelerator configuration is documented as GPU-only on this surface + +Google documents `gceSetup.acceleratorConfigs` and `AcceleratorConfig` with: + +1. `type` +2. `coreCount` +3. currently only one accelerator configuration is supported +4. 
**TPUs are not supported** + +Source: + +- *Resource: Instance* (`GceSetup`, `AcceleratorConfig`, `AcceleratorType`) + +URL: + +- https://cloud.google.com/vertex-ai/docs/workbench/reference/rest/v2/projects.locations.instances + +Rule consequence: + +1. Hardware enrichment may use the documented accelerator configuration when present. +2. This rule must not classify Workbench instances as TPU-backed from the documented `acceleratorConfigs` surface. +3. Hardware is auxiliary context only; it is not canonical proof of idleness. + +### 3.6 Billing guidance distinguishes running compute from stopped storage-only cost + +Google documents that: + +1. while a Workbench instance is shut down, there are no CPU or GPU usage charges except scheduled executions that run during shutdown +2. disk storage charges still apply while the instance is shut down + +Source: + +- *Idle shutdown* + +URL: + +- https://cloud.google.com/vertex-ai/docs/workbench/instances/idle-shutdown + +Rule consequence: + +1. `ACTIVE` instances are the relevant compute-cost surface for this rule. +2. `STOPPED` or `SUSPENDED` instances are out of scope for this idle-compute rule, although storage cost can still remain. +3. The rule must not hardcode a fixed monthly estimate from static machine-price tables as canonical logic. +4. `estimated_monthly_cost_usd` should remain `None` unless a future implementation computes current pricing from authoritative region- and configuration-specific pricing inputs. + +--- + +## 4. Detection Goal + +Emit a finding only when **all** of the following are true: + +1. the resource is a documented Workbench `Instance` +2. the resource name is valid and the location is parseable from it +3. if a region filter is set, it matches the normalized location exactly +4. the resource state is exactly `ACTIVE` +5. 
the resource satisfies the canonical idle signal requirements in section 8 + +If any required canonical signal condition cannot be established reliably, the resource is **not evaluable** for this rule version and MUST NOT produce findings. + +**Current canonical status:** based on the documented surfaces referenced in this spec, no qualifying canonical signal exists that exposes per-instance last kernel activity or a kernel-idle time series suitable for this rule. Therefore the rule is currently in `EMITTING_DISABLED` mode and must not emit findings from control-plane timestamps alone until a qualifying signal path is documented and usable. + +### 4.1 Current canonical decision flow + +Implement the current version in this order: + +1. capture `now_utc` once for the scan +2. list Workbench instances for the requested scope and exhaust pagination +3. if `unreachable[]` is reported, set `scan_scope_state = PARTIAL` and record each unreachable location in `not_evaluable_scopes[]` +4. normalize returned records; count invalid records in `excluded_invalid_resources_count` and exclude out-of-scope records before candidate resource formation +5. keep only valid in-scope `ACTIVE` instances as candidate resources +6. if there are no candidate resources: + - set `resource_evaluation_state = EVALUABLE` + - emit no findings +7. otherwise, as defined in section 4, classify those candidate resources as `not_evaluable_resources[]` +8. set `rule_capability_state = EMITTING_DISABLED` +9. in the current version, candidate resources default to `reason_code = NO_SIGNAL` because no qualifying canonical signal exists +10. in the current version, all candidate resources share the same evaluation outcome; therefore `resource_evaluation_state` cannot be `MIXED` +11. set `resource_evaluation_state = NOT_EVALUABLE` for the current version when candidate resources exist +12.
emit no findings + +`scan_scope_state` is determined exclusively by discovery-layer reachability, such as `unreachable[]`, and MUST NOT be changed by resource-level signal evaluation outcomes. Resource evaluation MUST remain independent of discovery completeness for reachable candidate resources. + +If a future qualifying canonical signal path is documented, the implementation may continue from step 5 by applying section 8 to reachable candidate resources only. + +--- + +## 5. Non-Goals + +This rule does **not** attempt to prove: + +- that an old `updateTime` means the notebook is idle +- that an old `createTime` means the notebook has been unused +- that low CPU usage means the notebook is idle +- that the configured idle-shutdown timeout has already been exceeded +- that the instance is safe to stop +- that no scheduled executions or other intentional automation exist +- that a specific monthly saving exists + +--- + +## 6. Canonical Inputs + +### 6.1 Required surfaces + +The implementation may use the following documented APIs and docs-backed fields: + +1. `projects.locations.instances.list` +2. `Instance.name` +3. `Instance.state` +4. `Instance.gceSetup.metadata` +5. `Instance.gceSetup.machineType` +6. `Instance.gceSetup.acceleratorConfigs` +7. `Instance.createTime` +8. `Instance.updateTime` + +### 6.2 Future activation path: conditional canonical activity signal sources + +Canonical qualification is defined only in section 8. + +These source classes are permitted only when explicitly documented by Google as Workbench-attributable and semantically aligned with kernel activity. + +Permitted canonical source classes are: + +1. **Google Cloud Logging**, but only when Google documents Workbench-attributable logs as part of Vertex AI Workbench itself, with semantics that map to notebook or kernel activity for an exact instance rather than incidental infrastructure events +2. 
notebook execution logs, when documented by Google as part of Vertex AI Workbench activity evidence and attributable per instance +3. kernel/session activity logs, but only when documented by Google as part of Vertex AI Workbench, attributable to the exact instance, and semantically tied to the Workbench idle definition +4. **Google Cloud Monitoring**, but only when Google documents a Workbench-specific metric/resource contract whose semantics map to kernel activity or documented absence of kernel activity rather than VM utilization + +The consulted control-plane docs do **not** provide such a signal on the `Instance` resource itself, and this spec does **not** currently establish a qualifying Logging or Monitoring signal path. This section is therefore a **future activation path**, not an active emit path in the current version. + +### 6.3 Future activation path: signal-availability gate + +Even when a candidate signal source is documented, it is canonical only when all of the following are true: + +1. retention for the chosen log or metric source is at least `idle_days` +2. the exact source is explicitly documented to provide continuous, gap-free coverage across the full observation window, including no sampling, ingestion, or visibility gaps +3. there are no permission gaps for the source over the evaluated scope +4. any required location-level reads are reachable for the evaluated scope + +In a future activated version, implementations may rely only on explicit Google documentation or source contracts to establish these properties; they must not infer completeness, cadence, or gap-freeness heuristically. + +If any condition fails, the resource or affected scope is **not evaluable**. The implementation must not silently treat missing, partial, or gap-ridden telemetry as equivalent to zero activity, and must not infer idle from "no events found". 
+ +### 6.4 Optional context fields + +These may enrich a future finding but are not themselves eligibility signals: + +- `labels` +- `creator` +- `instanceOwners` +- `healthState` +- `gceSetup.machineType` +- `gceSetup.acceleratorConfigs` +- `gceSetup.bootDisk` +- `gceSetup.dataDisks` +- `gceSetup.metadata` +- `createTime` +- `updateTime` + +--- + +## 7. Canonical normalization rules + +Normalize the following values: + +| Field | Canonical rule | +|---|---| +| `resource_name` | Must exactly match `projects/{project}/locations/{location}/instances/{instance_id}`. Otherwise treat the record as invalid and exclude it from evaluation and findings. | +| `location` | Parse from the exact `locations/{location}` segment of the resource name. Region-filter comparison must use exact string equality only, with no aliasing or case folding. | +| `state` | Compare exactly to documented enum `ACTIVE`, case-sensitive and with no normalization. Null or empty values make the record invalid for this rule. | +| `now_utc` | Capture once per scan run in UTC and reuse for all resources in that run. | +| `metadata.idle-timeout-seconds` | Context only. If used, parse as an integer number of seconds or treat as unusable context. It must not be used as a substitute for observed inactivity. | +| `create_time_utc` | Optional context only. If parsed, require strict RFC3339. Parse failure removes context; it must not trigger fallback idle logic. | +| `update_time_utc` | Optional context only. If parsed, require strict RFC3339. Parse failure removes context; it must not trigger fallback idle logic. | + +Important: + +1. `updateTime` is **not** the canonical last-activity field. +2. `createTime` is **not** an idle fallback. +3. The rule must not derive `idle_since_days` from `updateTime` or `createTime`. +4. Normalization failures are invalid-resource exclusions, not **not evaluable** outcomes. +5. 
Invalid or out-of-scope resource records are excluded before evaluation and MUST NOT appear in `not_evaluable` outputs. + +--- + +## 8. Future activation path: activity evidence rules + +### 8.1 Canonical idle signal requirements + +The rule may use an activity signal only when all of the following are true: + +1. the signal is documented by Google +2. the signal is attributable to the exact Workbench instance being evaluated +3. the signal maps to Workbench's documented kernel-activity semantics, or the documented absence of such activity, rather than generic VM host utilization +4. the signal has either an explicit documented inactivity contract or a documented last-activity contract with completeness guarantees, and it supports proving absence of activity; inactivity is never inferred from missing events +5. the signal is resolved at per-instance scope, not only at project or aggregated scope +6. the signal passes the signal-availability gate + +Examples of qualifying contracts include: + +1. a documented per-instance Workbench metric explicitly defined as an idle-state signal for notebook or kernel activity +2. a documented per-instance Workbench field or metric explicitly defined as a last notebook or kernel activity timestamp, where Google documents completeness guarantees for the full observation window + +### 8.2 Conditional source-path allowances + +The following source paths may be used **only if fully compliant with section 8.1**: + +1. Google Cloud Logging notebook execution logs +2. Google Cloud Logging kernel/session activity logs +3. Google Cloud Monitoring metrics that are explicitly documented against Workbench activity semantics + +Allowing these source classes does **not** mean they are currently established as canonical for this rule. + +### 8.3 Global exclusion list: non-canonical signals + +The following are **not** canonical idle signals for this rule: + +1. `updateTime` +2. `createTime` +3. instance age alone +4. 
idle-shutdown metadata values alone +5. machine type or accelerator configuration +6. generic CPU, GPU, memory, or network host utilization without a separate documented contract equating it to Workbench kernel inactivity +7. partial-window logs or metrics +8. aggregated or project-level signals that cannot be attributed to the exact instance +9. Cloud Monitoring host or VM utilization used as a proxy for notebook or kernel activity +10. "no events found" or "no logs returned" treated as proof of idleness +11. fallback to heuristics when a qualifying canonical signal is missing + +For this spec, "proof" means an explicit Google-documented inactivity or last-activity contract with completeness guarantees across the full observation window. Proof is never inferred from sparse, partial, or missing events. + +### 8.4 Idle-shutdown configuration is context only + +Idle-shutdown configuration may be used to explain or enrich behavior, for example: + +1. idle shutdown default exists +2. `enable-guest-attributes` is required +3. the configured timeout may be visible + +But this configuration does **not** prove: + +1. whether the instance actually experienced no kernel activity +2. whether idle shutdown ran successfully +3. whether the timer has or has not been reset within the review window + +--- + +## 9. Decision rule + +### 9.1 Eligibility + +The resource is eligible only when: + +1. resource type is Workbench `Instance` +2. `state` is exactly `ACTIVE` +3. the resource satisfies the canonical idle signal requirements in section 8 + +Configuration requirement: + +1. `idle_days` must be `>= 1` +2. invalid threshold configuration must fail fast rather than silently clamp or reinterpret the value + +### 9.2 Current canonical outcome + +Under the currently consulted official docs, the canonical implementation follows the decision flow in section 4.1. + +In the current version: + +1. 
unreachable requested locations make the scan `partial` and populate `not_evaluable_scopes[]` +2. valid in-scope `ACTIVE` resources remain **not evaluable** as defined in section 4 +3. findings remain empty until a documented qualifying signal path exists and satisfies section 8 for reachable resources + +Absence of signal MUST NOT be interpreted as inactivity. + +Important: + +1. **not evaluable** is a separate first-class outcome, not a synonym for `0 findings` +2. the rule may return zero findings even when `ACTIVE` instances exist + +### 9.3 Explicitly forbidden heuristics + +The rule must **not**: + +- emit from `updateTime` age alone +- emit from `createTime` age alone +- emit from an age fallback when `updateTime` is absent +- infer notebook inactivity from low CPU usage +- infer notebook inactivity from machine type or accelerator presence +- emit because idle shutdown is disabled or appears unconfigured +- fall back to heuristics if a qualifying canonical signal is missing + +--- + +## 10. Cost handling + +### 10.1 Canonical monthly cost field + +`estimated_monthly_cost_usd = None` + +Reason: + +1. the canonical spec does not currently emit findings +2. authoritative Workbench cost depends on running compute shape, attached accelerators, disks, region, and usage option +3. stopped instances still incur disk charges, so simplistic compute-only estimates are incomplete + +### 10.2 Future advisory cost hints + +If a future implementation chooses to surface an advisory cost hint, it must: + +1. be clearly labeled non-canonical advisory context +2. use authoritative current pricing inputs for the exact region and configuration +3. distinguish running compute from persistent disk charges +4. never affect eligibility + +--- + +## 11. 
Failure behavior + +### 11.1 Invalid or out-of-scope resource exclusion + +Exclude from evaluation and findings: + +- empty resource names +- resource names that do not exactly match the documented instance pattern +- `state` absent or empty +- resources in non-`ACTIVE` states + +Use this exclusion taxonomy: + +| Category | Meaning | Counted in `excluded_invalid_resources_count` | +|---|---|---| +| `INVALID` | malformed, missing, or unparsable required identity or state fields | yes | +| `OUT_OF_SCOPE` | valid resource record that is not in the rule's lifecycle scope, including non-`ACTIVE` resources | no | + +Records with absent or empty `state` are `INVALID`. Resources in non-`ACTIVE` states are `OUT_OF_SCOPE`: they are valid but excluded from evaluation and MUST NOT be counted in `excluded_invalid_resources_count`. + +`excluded_invalid_resources_count` excludes `OUT_OF_SCOPE` records by design. + +Out-of-scope resources are excluded before candidate resource formation. + +These are not **not evaluable** outcomes. + +### 11.2 Not evaluable taxonomy + +Resources and scopes classified as **not evaluable** MUST NOT produce findings. + +Section 12 is the authoritative runtime contract for `scan_scope_state`, `resource_evaluation_state`, `partial`, and reporting behavior. This section defines only the taxonomy and reason-code classification used by not-evaluable records. + +Use the following reason codes: + +| Reason code | Meaning | +|---|---| +| `NO_SIGNAL` | No qualifying canonical signal path exists for the resource or requested reachable scope | +| `PERMISSIONS` | Required permissions for the qualifying signal source are missing or incomplete | +| `COVERAGE` | Coverage is incomplete for the qualifying signal source or requested scope, including unreachable locations and partial observation windows | + +If more than one reason applies, select the primary `reason_code` using this precedence: + +1. `PERMISSIONS` +2. `COVERAGE` +3. 
`NO_SIGNAL` + +Implementations may retain additional secondary reasons as non-canonical context, but each `not_evaluable` record should expose one primary `reason_code`. + +When no qualifying canonical signal exists for the rule version, valid in-scope resources MUST use `reason_code = NO_SIGNAL`. + +In `EMITTING_DISABLED` mode, `NO_SIGNAL` is a synthetic default applied uniformly to candidate resources and does not represent per-resource evaluation variance. + +Apply them as follows: + +- `NO_SIGNAL`: resources for which no qualifying canonical signal exists +- `COVERAGE`: resources for which signal retention is shorter than `idle_days` +- `COVERAGE`: resources for which the candidate signal covers only part of the observation window +- `NO_SIGNAL`, `PERMISSIONS`, or `COVERAGE`: resources for which the candidate signal fails the signal-availability gate, according to the underlying cause +- `PERMISSIONS`: resources or scopes for which permissions are insufficient to evaluate the chosen signal source +- `COVERAGE`: unreachable locations reported in the documented list response + +Runtime handling of these reason codes, including scope partiality, output separation, and state precedence, is defined in section 12. + +--- + +## 12. 
Output contract + +### 12.1 Current runtime contract + +The implementation must preserve these rule-level outcomes separately: + +| Output | Meaning | +|---|---| +| `rule_capability_state` | Static capability state: `EMITTING_DISABLED` or `EMITTING_ENABLED` | +| `scan_scope_state` | Scope-level runtime state: `FULL` or `PARTIAL` | +| `resource_evaluation_state` | Aggregate runtime evaluation state across valid in-scope resources: `EVALUABLE`, `NOT_EVALUABLE`, or `MIXED` | +| `findings[]` | Emitted findings only | +| `partial` | `true` only for discovery-layer coverage gaps in the requested scope, including when `unreachable[]` is reported | +| `excluded_invalid_resources_count` | Exact count of invalid resource records excluded before canonical evaluation | +| `reporting_mode_not_evaluable_resources` | `FULL_ENUMERATION` or `COUNT_ONLY` | +| `reporting_mode_not_evaluable_scopes` | `FULL_ENUMERATION` or `COUNT_ONLY` | +| `not_evaluable_resources[]` | Valid in-scope resources that could not be evaluated under canonical signal requirements; each record should carry a `reason_code` | +| `not_evaluable_scopes[]` | Scope-level not-evaluable records, including unreachable locations; each record should carry a `reason_code` | + +`partial = true` if and only if `scan_scope_state = PARTIAL`. `partial = false` if and only if `scan_scope_state = FULL`. + +`scan_scope_state` is determined exclusively by discovery-layer reachability and MUST NOT be upgraded or downgraded by signal-evaluation outcomes. + +All entries in `not_evaluable_scopes[]` derived from `unreachable[]` MUST use `reason_code = COVERAGE`. + +For each not-evaluable category, the implementation MUST choose exactly one reporting mode: + +1. `FULL_ENUMERATION` — return the complete set for that category +2. `COUNT_ONLY` — return the exact full count for that category without full enumeration + +The implementation MUST NOT silently drop either category. 
+ +Implementations SHOULD default to `FULL_ENUMERATION` unless payload size or platform constraints require `COUNT_ONLY`. + +If enumeration would make the payload unreasonably large, the implementation MAY use `COUNT_ONLY` for that category. If an exact full count cannot be established because of permission or coverage limits, the implementation MUST NOT claim `COUNT_ONLY`; instead, it must surface the affected scope or category as `PARTIAL` and/or `NOT_EVALUABLE` with the corresponding `reason_code`. + +In the current version, counts for `not_evaluable_resources` are always exact because they derive from fully enumerated candidate resources. + +The implementation MUST always retain an exact count for `excluded_invalid_resources_count`, even if individual excluded records are not returned in the payload. + +This current runtime contract describes the rule as it behaves today. + +### 12.2 Current canonical behavior + +The current canonical behavior is to return **no findings** as defined in section 4. + +This reflects `rule_capability_state = EMITTING_DISABLED`, not an accidental empty result. + +Interpretation: + +1. `0 findings` does **not** mean there are no idle Workbench instances +2. `0 findings` means there are no instances provably idle under canonical signals accepted by this spec +3. if signal availability or scope coverage is insufficient, the implementation should surface that the rule was **not evaluable** and/or **partial**, rather than implying complete negative coverage +4. as defined in section 4, no findings will be emitted even for reachable locations +5. even when `scan_scope_state = PARTIAL`, the current version MUST emit zero findings because no qualifying canonical signal exists +6. `rule_capability_state = EMITTING_DISABLED` is appropriate in the current dormant version +7. `scan_scope_state = PARTIAL` is appropriate only for discovery-layer coverage gaps, such as `unreachable[]`; signal evaluation gaps alone do not make the scan partial +8. 
`resource_evaluation_state = EVALUABLE` is appropriate either when no valid in-scope reachable candidate resources exist after filtering, or when evaluation is attempted and all candidate resources satisfy the canonical signal preconditions +9. `resource_evaluation_state = NOT_EVALUABLE` is appropriate when candidate resources exist but none of them can be evaluated under a qualifying canonical signal path +10. `resource_evaluation_state = MIXED` is appropriate when some valid in-scope reachable resources are evaluable and others are not +11. in the current dormant version, `resource_evaluation_state` will normally be `NOT_EVALUABLE` +12. `resource_evaluation_state = MIXED` MUST NOT be emitted in `EMITTING_DISABLED` mode +13. consumers SHOULD treat `NOT_EVALUABLE` as an unknown state requiring explicit surfacing rather than as a negative result or a retry guarantee +14. when a single primary status must be displayed, consumers SHOULD prioritize `scan_scope_state = PARTIAL` over any resource evaluation state + +When no candidate resources exist, `EVALUABLE` indicates a valid reachable scope with no eligible resources present after filtering; no evaluation was required on any candidate resource. 
+ +### 12.3 Future enhancement schema + +If a future documented idle signal is added, the implementation may also populate the following finding fields: + +| Field | Value | +|---|---| +| `provider` | `gcp` | +| `rule_id` | `gcp.vertex.workbench.idle` | +| `category` | `ai` | +| `severity` | Placeholder for future classification; must not affect canonical eligibility | +| `confidence` | Placeholder for future classification; must not affect canonical eligibility | +| `resource_type` | `gcp.vertex.workbench.instance` | +| `resource_id` | Full Workbench instance resource name | +| `region` | Parsed location from resource name | +| `activity_signal_source` | Canonical source used for proof, such as a documented log or metric surface | +| `signal_coverage_start` | Earliest timestamp covered by the exact signal window used for proof | +| `signal_coverage_end` | Latest timestamp covered by the exact signal window used for proof | +| `estimated_monthly_cost_usd` | `None` in canonical logic unless authoritative live pricing is added | + +These fields are dormant in the current version because the rule does not yet have a qualifying canonical signal path. + +--- + +## 13. Implementation notes for future hardening + +This spec intentionally rejects the following as insufficient for canonical idle detection: + +1. `ACTIVE` + old `updateTime` +2. `ACTIVE` + old `createTime` +3. `ACTIVE` + disabled idle shutdown +4. `ACTIVE` + low host utilization + +To make this rule emit canonically in the future, the implementation needs a documented first-party per-instance activity surface that is semantically aligned with Workbench's documented idle-shutdown notion of **kernel inactivity**. + +Likely future-enablement paths are: + +1. documented Google Cloud Logging notebook execution logs that Google defines as part of Vertex AI Workbench +2. documented Google Cloud Logging kernel/session activity logs that Google defines as part of Vertex AI Workbench +3. 
documented Google Cloud Monitoring metrics whose semantics map directly to Workbench kernel activity + +Any such path still requires full-window coverage, sufficient retention, exact per-instance attribution, and no permission or reachability gaps. diff --git a/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py b/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py index 3bf24ba..fd01236 100644 --- a/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py +++ b/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py @@ -1,404 +1,535 @@ """ Tests for gcp.vertex.workbench.idle rule. +The rule is EMITTING_DISABLED and always returns an empty List[Finding]. +No qualifying canonical kernel-activity signal exists; updateTime, createTime, +age, and CPU utilization are all explicitly non-canonical. + Coverage: -- Core detection: idle CPU instance (MEDIUM risk), idle GPU instance (HIGH risk) -- Skipping: STOPPED instances, young instances, instances with recent activity -- Confidence levels: HIGH (updateTime + age >= threshold), MEDIUM (75% threshold or age-fallback) -- GPU detection: NVIDIA_TESLA_T4, NVIDIA_TESLA_A100, a2-* machines -- Risk levels: CRITICAL (GPU + idle_ratio >= 2.0), HIGH (GPU), MEDIUM (CPU) -- Cost estimation: machine cost, GPU add-on for n1/n2, bundled for a2/g2 -- Age-fallback: when updateTime unavailable, confidence capped at MEDIUM -- Region filter: instances outside the filter are skipped -- Both API versions: v1 (User-Managed Notebooks), v2 (Vertex AI Workbench) -- Permission errors: PermissionError raised on 403 from list call -- RULE_METADATA and RULE_ID attributes present + Public API (find_idle_workbench_instances): + - return type and value + - idle_days validation (zero, negative, boundary, error message) + - region_filter parameter accepted + - 403/404/400/5xx/network error handling + - warning type, message content (project, HTTP code, rule ID) + + Internal (_list_instances): + - empty response + - instance 
accumulation + - pagination over 2 and 3 pages + - pageToken forwarded on subsequent requests + - pageSize=100 in initial request + - unreachable[] collected and deduplicated across pages + - empty unreachable entries skipped + - 404 returns clean ([], [], False) + - 400 returns ([], [], True) + - 5xx sets discovery_failed; preserves already-fetched instances + - network error sets discovery_failed; preserves already-fetched instances + - 403 raises PermissionError + - URL contains project ID and locations/- wildcard """ -from datetime import datetime, timedelta, timezone +import warnings from unittest.mock import MagicMock, patch import pytest -from cleancloud.core.confidence import ConfidenceLevel -from cleancloud.core.risk import RiskLevel from cleancloud.providers.gcp.rules.ai.workbench_idle import ( - _DEFAULT_MACHINE_MONTHLY_COST, - _GPU_MONTHLY_COST_EACH, - _MACHINE_MONTHLY_COST, RULE_METADATA, - _estimate_cost, - _normalize, + _list_instances, find_idle_workbench_instances, ) +_PROJECT = "my-project" + + # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- -NOW = datetime(2025, 6, 1, 12, 0, 0, tzinfo=timezone.utc) -_PROJECT = "my-project" -_LOCATION = "us-central1" -_INSTANCE_ID = "my-workbench-1" -_INSTANCE_NAME = f"projects/{_PROJECT}/locations/{_LOCATION}/instances/{_INSTANCE_ID}" - -_OLD_TIME = NOW - timedelta(days=30) -_IDLE_TIME = NOW - timedelta(days=20) -_RECENT_TIME = NOW - timedelta(days=3) -_YOUNG_TIME = NOW - timedelta(days=2) - - -def _ts(dt: datetime) -> str: - return dt.strftime("%Y-%m-%dT%H:%M:%SZ") - - -def _v2_instance( - name: str = _INSTANCE_NAME, - state: str = "ACTIVE", - create_time: datetime = _OLD_TIME, - update_time: datetime = _IDLE_TIME, - machine_type: str = "n1-standard-4", - accel_type: str = "", - accel_count: int = 0, - labels: dict = None, -) -> dict: - """Build a minimal v2 Workbench instance response dict.""" - 
gce: dict = {"machineType": machine_type} - if accel_type: - gce["acceleratorConfigs"] = [{"type": accel_type, "coreCount": str(accel_count or 1)}] - return { - "name": name, - "state": state, - "createTime": _ts(create_time), - "updateTime": _ts(update_time), - "gceSetup": gce, - "labels": labels or {}, - "_api_version": "v2", - } - - -def _v1_instance( - name: str = _INSTANCE_NAME, - state: str = "ACTIVE", - create_time: datetime = _OLD_TIME, - update_time: datetime = _IDLE_TIME, - machine_type: str = "zones/us-central1-a/machineTypes/n1-standard-4", - accel_type: str = "", - accel_count: int = 0, - labels: dict = None, -) -> dict: - """Build a minimal v1 User-Managed Notebook instance response dict.""" - inst: dict = { - "name": name, - "state": state, - "createTime": _ts(create_time), - "updateTime": _ts(update_time), - "machineType": machine_type, - "labels": labels or {}, - "_api_version": "v1", - } - if accel_type: - inst["acceleratorConfig"] = { - "type": accel_type, - "coreCount": str(accel_count or 1), - } - return inst - - -def _mock_session(instances: list): - """Return a mock AuthorizedSession that returns the given instance list from v2 API.""" + +def _ok(body: dict = None): + """Build a 200 response mock with the given JSON body.""" + resp = MagicMock() + resp.status_code = 200 + resp.json.return_value = body or {} + resp.raise_for_status.return_value = None + return resp + + +def _err(status_code: int): + """Build an error response mock with the given status code.""" + resp = MagicMock() + resp.status_code = status_code + return resp + + +def _session(*responses): + """Build a mock session whose .get() returns responses in order.""" mock = MagicMock() - response = MagicMock() - response.status_code = 200 - response.json.return_value = {"instances": instances} - mock.get.return_value = response + mock.get.side_effect = list(responses) return mock +def _invoke(**kwargs): + """ + Call find_idle_workbench_instances with a default 200/empty mock session. 
+ Extra kwargs are forwarded to the rule function. + """ + with patch( + "cleancloud.providers.gcp.rules.ai.workbench_idle.AuthorizedSession", + return_value=_session(_ok()), + ): + return find_idle_workbench_instances( + project_id=_PROJECT, credentials=MagicMock(), **kwargs + ) + + +def _invoke_with_session(mock_session, **kwargs): + """Call find_idle_workbench_instances with a custom session mock.""" + with patch( + "cleancloud.providers.gcp.rules.ai.workbench_idle.AuthorizedSession", + return_value=mock_session, + ): + return find_idle_workbench_instances( + project_id=_PROJECT, credentials=MagicMock(), **kwargs + ) + + # --------------------------------------------------------------------------- -# _normalize tests +# Return type and value # --------------------------------------------------------------------------- -class TestNormalize: - def test_v2_basic(self): - raw = _v2_instance() - norm = _normalize(raw) - assert norm["name"] == _INSTANCE_NAME - assert norm["location"] == _LOCATION - assert norm["state"] == "ACTIVE" - assert norm["machine_type"] == "n1-standard-4" - assert norm["accel_type"] == "" - assert norm["accel_count"] == 0 +class TestReturnValue: + def test_returns_list(self): + assert isinstance(_invoke(), list) - def test_v2_with_gpu(self): - raw = _v2_instance(accel_type="NVIDIA_TESLA_T4", accel_count=2) - norm = _normalize(raw) - assert norm["accel_type"] == "NVIDIA_TESLA_T4" - assert norm["accel_count"] == 2 + def test_always_empty(self): + assert _invoke() == [] - def test_unspecified_accel_normalized_to_empty(self): - raw = _v2_instance(accel_type="ACCELERATOR_TYPE_UNSPECIFIED") - norm = _normalize(raw) - assert norm["accel_type"] == "" + def test_empty_when_api_returns_active_instances(self): + """EMITTING_DISABLED: ACTIVE instances in API response still yield no findings.""" + inst = {"name": f"projects/{_PROJECT}/locations/us-central1/instances/wb-1", "state": "ACTIVE"} + result = _invoke_with_session(_session(_ok({"instances": 
[inst]}))) + assert result == [] - def test_location_extracted_from_name(self): - name = "projects/p/locations/europe-west1/instances/i" - raw = {**_v2_instance(name=name), "name": name} - norm = _normalize(raw) - assert norm["location"] == "europe-west1" + def test_empty_when_api_returns_multiple_instances(self): + instances = [ + {"name": f"projects/{_PROJECT}/locations/us-central1/instances/wb-{i}", "state": "ACTIVE"} + for i in range(5) + ] + result = _invoke_with_session(_session(_ok({"instances": instances}))) + assert result == [] # --------------------------------------------------------------------------- -# _estimate_cost tests +# idle_days validation # --------------------------------------------------------------------------- -class TestEstimateCost: - def test_known_cpu_machine(self): - cost = _estimate_cost("n1-standard-4", "", 0) - assert cost == _MACHINE_MONTHLY_COST["n1-standard-4"] +class TestIdleDaysValidation: + def test_zero_raises_value_error(self): + with pytest.raises(ValueError, match="idle_days must be >= 1"): + find_idle_workbench_instances( + project_id=_PROJECT, credentials=MagicMock(), idle_days=0 + ) + + def test_negative_one_raises(self): + with pytest.raises(ValueError, match="idle_days must be >= 1"): + find_idle_workbench_instances( + project_id=_PROJECT, credentials=MagicMock(), idle_days=-1 + ) + + def test_large_negative_raises(self): + with pytest.raises(ValueError, match="idle_days must be >= 1"): + find_idle_workbench_instances( + project_id=_PROJECT, credentials=MagicMock(), idle_days=-999 + ) + + def test_error_message_includes_bad_value(self): + with pytest.raises(ValueError, match="-3"): + find_idle_workbench_instances( + project_id=_PROJECT, credentials=MagicMock(), idle_days=-3 + ) + + def test_one_is_valid(self): + assert _invoke(idle_days=1) == [] + + def test_default_14_is_valid(self): + assert _invoke() == [] + + def test_large_value_is_valid(self): + assert _invoke(idle_days=365) == [] + + +# 
--------------------------------------------------------------------------- +# region_filter parameter +# --------------------------------------------------------------------------- + - def test_unknown_machine_uses_default(self): - cost = _estimate_cost("custom-unknown-type", "", 0) - assert cost == _DEFAULT_MACHINE_MONTHLY_COST +class TestRegionFilter: + def test_region_filter_string_accepted(self): + assert _invoke(region_filter="us-central1") == [] - def test_n1_with_t4_adds_gpu_cost(self): - base = _MACHINE_MONTHLY_COST["n1-standard-4"] - gpu = _GPU_MONTHLY_COST_EACH["NVIDIA_TESLA_T4"] - assert _estimate_cost("n1-standard-4", "NVIDIA_TESLA_T4", 1) == base + gpu + def test_region_filter_none_accepted(self): + assert _invoke(region_filter=None) == [] - def test_n1_with_two_t4_doubles_gpu_cost(self): - base = _MACHINE_MONTHLY_COST["n1-standard-4"] - gpu = _GPU_MONTHLY_COST_EACH["NVIDIA_TESLA_T4"] - assert _estimate_cost("n1-standard-4", "NVIDIA_TESLA_T4", 2) == base + gpu * 2 - def test_a2_machine_no_gpu_addon(self): - # a2-highgpu-1g already bundles A100 cost - cost = _estimate_cost("a2-highgpu-1g", "NVIDIA_TESLA_A100", 1) - assert cost == _MACHINE_MONTHLY_COST["a2-highgpu-1g"] +# --------------------------------------------------------------------------- +# HTTP error handling via public API +# --------------------------------------------------------------------------- - def test_g2_machine_no_gpu_addon(self): - cost = _estimate_cost("g2-standard-8", "NVIDIA_L4", 1) - assert cost == _MACHINE_MONTHLY_COST["g2-standard-8"] - def test_none_machine_type_uses_default(self): - cost = _estimate_cost(None, None, 0) - assert cost == _DEFAULT_MACHINE_MONTHLY_COST +class TestHttpErrors: + def test_403_raises_permission_error(self): + with pytest.raises(PermissionError): + _invoke_with_session(_session(_err(403))) + + def test_403_message_mentions_permission(self): + with pytest.raises(PermissionError, match="notebooks.instances.list"): + 
_invoke_with_session(_session(_err(403))) + + def test_403_message_mentions_role(self): + with pytest.raises(PermissionError, match="roles/notebooks.viewer"): + _invoke_with_session(_session(_err(403))) + + def test_404_returns_empty_list(self): + assert _invoke_with_session(_session(_err(404))) == [] + + def test_404_no_warning_emitted(self): + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + _invoke_with_session(_session(_err(404))) + assert not any(issubclass(w.category, UserWarning) for w in caught) + + def test_400_returns_empty_list(self): + assert _invoke_with_session(_session(_err(400))) == [] + + def test_400_emits_user_warning(self): + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + _invoke_with_session(_session(_err(400))) + assert any(issubclass(w.category, UserWarning) for w in caught) + + def test_400_warning_mentions_status_code(self): + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + _invoke_with_session(_session(_err(400))) + msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning)) + assert "400" in msgs + + def test_400_warning_mentions_project(self): + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + _invoke_with_session(_session(_err(400))) + msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning)) + assert _PROJECT in msgs + + def test_500_returns_empty_list(self): + assert _invoke_with_session(_session(_err(500))) == [] + + def test_500_emits_user_warning(self): + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + _invoke_with_session(_session(_err(500))) + assert any(issubclass(w.category, UserWarning) for w in caught) + + def test_500_warning_mentions_status_code(self): + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + 
_invoke_with_session(_session(_err(500))) + msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning)) + assert "500" in msgs + + def test_503_warning_mentions_status_code(self): + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + _invoke_with_session(_session(_err(503))) + msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning)) + assert "503" in msgs + + def test_5xx_warning_mentions_project(self): + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + _invoke_with_session(_session(_err(500))) + msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning)) + assert _PROJECT in msgs + + def test_network_error_returns_empty_list(self): + session = MagicMock() + session.get.side_effect = ConnectionError("timeout") + assert _invoke_with_session(session) == [] + + def test_network_error_emits_user_warning(self): + session = MagicMock() + session.get.side_effect = ConnectionError("timeout") + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + _invoke_with_session(session) + assert any(issubclass(w.category, UserWarning) for w in caught) + + def test_network_error_warning_mentions_project(self): + session = MagicMock() + session.get.side_effect = OSError("no route to host") + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + _invoke_with_session(session) + msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning)) + assert _PROJECT in msgs + + def test_network_error_warning_mentions_exception_type(self): + session = MagicMock() + session.get.side_effect = ConnectionError("dropped") + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + _invoke_with_session(session) + msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning)) + assert 
"ConnectionError" in msgs # --------------------------------------------------------------------------- -# find_idle_workbench_instances tests +# _list_instances — direct unit tests # --------------------------------------------------------------------------- -class TestFindIdleWorkbenchInstances: - def _run(self, instances: list, **kwargs): - with patch( - "cleancloud.providers.gcp.rules.ai.workbench_idle._list_instances", - return_value=instances, - ): - with patch("cleancloud.providers.gcp.rules.ai.workbench_idle.datetime") as mock_dt: - mock_dt.now.return_value = NOW - mock_dt.fromisoformat = datetime.fromisoformat - return find_idle_workbench_instances( - project_id=_PROJECT, credentials=MagicMock(), **kwargs - ) - - def test_idle_cpu_instance_flagged(self): - findings = self._run([_v2_instance()]) - assert len(findings) == 1 - f = findings[0] - assert f.rule_id == "gcp.vertex.workbench.idle" - assert f.provider == "gcp" - assert f.resource_id == _INSTANCE_NAME - assert f.region == _LOCATION - assert f.confidence == ConfidenceLevel.HIGH - assert f.risk == RiskLevel.MEDIUM - - def test_stopped_instance_skipped(self): - findings = self._run([_v2_instance(state="STOPPED")]) - assert findings == [] - - def test_young_instance_skipped(self): - # age < max(idle_days // 2, 7) = 7 days - findings = self._run([_v2_instance(create_time=_YOUNG_TIME, update_time=_RECENT_TIME)]) - assert findings == [] - - def test_recent_update_time_not_flagged(self): - # updateTime only 3 days ago — not idle - findings = self._run([_v2_instance(update_time=_RECENT_TIME)]) - assert findings == [] - - def test_gpu_instance_high_risk(self): - findings = self._run([_v2_instance(accel_type="NVIDIA_TESLA_T4", accel_count=1)]) - assert len(findings) == 1 - assert findings[0].risk == RiskLevel.HIGH - - def test_gpu_instance_critical_risk_when_idle_ratio_ge_2(self): - # idle_since_days = 30, idle_days = 14 → ratio = 30/14 ≈ 2.14 >= 2.0 - very_idle = NOW - timedelta(days=30) - findings = 
self._run( - [_v2_instance(update_time=very_idle, accel_type="NVIDIA_TESLA_A100", accel_count=1)] +class TestListInstancesBasic: + def test_empty_response(self): + instances, unreachable, failed = _list_instances(_session(_ok()), _PROJECT) + assert instances == [] + assert unreachable == [] + assert failed is False + + def test_instances_returned(self): + inst = {"name": "projects/p/locations/us-central1/instances/i1", "state": "ACTIVE"} + instances, _, _ = _list_instances(_session(_ok({"instances": [inst]})), _PROJECT) + assert instances == [inst] + + def test_multiple_instances_in_single_page(self): + inst_list = [ + {"name": f"projects/p/locations/us-central1/instances/i{i}", "state": "ACTIVE"} + for i in range(3) + ] + instances, _, _ = _list_instances(_session(_ok({"instances": inst_list})), _PROJECT) + assert instances == inst_list + + def test_page_size_100_in_initial_request(self): + session = _session(_ok()) + _list_instances(session, _PROJECT) + params = session.get.call_args.kwargs["params"] + assert params["pageSize"] == 100 + + def test_url_contains_project_id(self): + session = _session(_ok()) + _list_instances(session, "target-project-xyz") + url = session.get.call_args.args[0] + assert "target-project-xyz" in url + + def test_url_uses_wildcard_location(self): + session = _session(_ok()) + _list_instances(session, _PROJECT) + url = session.get.call_args.args[0] + assert "locations/-" in url + + def test_url_uses_v2_api(self): + session = _session(_ok()) + _list_instances(session, _PROJECT) + url = session.get.call_args.args[0] + assert "/v2/" in url + + +class TestListInstancesPagination: + def test_two_pages_accumulates_instances(self): + inst1 = {"name": "projects/p/locations/us-central1/instances/i1", "state": "ACTIVE"} + inst2 = {"name": "projects/p/locations/us-central1/instances/i2", "state": "ACTIVE"} + session = _session( + _ok({"instances": [inst1], "nextPageToken": "tok1"}), + _ok({"instances": [inst2]}), ) - assert len(findings) == 1 - 
assert findings[0].risk == RiskLevel.CRITICAL - - def test_medium_confidence_at_75pct_threshold(self): - # idle_since_days = 11 days → 11/14 = 0.786 >= 0.75 - threshold_medium = NOW - timedelta(days=11) - findings = self._run([_v2_instance(update_time=threshold_medium)]) - assert len(findings) == 1 - assert findings[0].confidence == ConfidenceLevel.MEDIUM - - def test_below_medium_threshold_not_flagged(self): - # idle_since_days = 9 → 9/14 = 0.64 < 0.75 - recent = NOW - timedelta(days=9) - findings = self._run([_v2_instance(update_time=recent)]) - assert findings == [] - - def test_age_fallback_capped_at_medium(self): - # v2 instance with no updateTime → age-fallback - inst = _v2_instance() - del inst["updateTime"] - inst.pop("updateTime", None) - inst["updateTime"] = "" - findings = self._run([inst]) - # age is 30 days → should be flagged; confidence capped at MEDIUM - assert len(findings) == 1 - assert findings[0].confidence == ConfidenceLevel.MEDIUM - - def test_region_filter_excludes_other_regions(self): - findings = self._run([_v2_instance()], region_filter="europe-west1") - assert findings == [] - - def test_region_filter_includes_matching_region(self): - findings = self._run([_v2_instance()], region_filter="us-central1") - assert len(findings) == 1 - - def test_region_filter_case_insensitive(self): - findings = self._run([_v2_instance()], region_filter="US-CENTRAL1") - assert len(findings) == 1 - - def test_cost_estimate_in_finding(self): - findings = self._run([_v2_instance(machine_type="n1-standard-4")]) - assert len(findings) == 1 - assert findings[0].estimated_monthly_cost_usd == _MACHINE_MONTHLY_COST["n1-standard-4"] - - def test_gpu_cost_includes_addon(self): - findings = self._run( - [ - _v2_instance( - machine_type="n1-standard-4", - accel_type="NVIDIA_TESLA_T4", - accel_count=1, - ) - ] + instances, _, _ = _list_instances(session, _PROJECT) + assert instances == [inst1, inst2] + + def test_three_pages_all_accumulated(self): + def _inst(i): + return 
{"name": f"projects/p/locations/us-central1/instances/i{i}", "state": "ACTIVE"} + session = _session( + _ok({"instances": [_inst(1)], "nextPageToken": "t1"}), + _ok({"instances": [_inst(2)], "nextPageToken": "t2"}), + _ok({"instances": [_inst(3)]}), ) - expected = ( - _MACHINE_MONTHLY_COST["n1-standard-4"] + _GPU_MONTHLY_COST_EACH["NVIDIA_TESLA_T4"] + instances, _, _ = _list_instances(session, _PROJECT) + assert len(instances) == 3 + + def test_page_token_forwarded_on_second_request(self): + session = _session( + _ok({"nextPageToken": "tok-abc"}), + _ok({}), + ) + _list_instances(session, _PROJECT) + second_params = session.get.call_args_list[1].kwargs["params"] + assert second_params.get("pageToken") == "tok-abc" + + def test_page_token_forwarded_on_third_request(self): + session = _session( + _ok({"nextPageToken": "t1"}), + _ok({"nextPageToken": "t2"}), + _ok({}), + ) + _list_instances(session, _PROJECT) + third_params = session.get.call_args_list[2].kwargs["params"] + assert third_params.get("pageToken") == "t2" + + def test_stops_when_no_next_token(self): + session = _session(_ok({})) + _list_instances(session, _PROJECT) + assert session.get.call_count == 1 + + def test_exactly_two_calls_for_two_pages(self): + session = _session( + _ok({"nextPageToken": "t1"}), + _ok({}), ) - assert findings[0].estimated_monthly_cost_usd == expected - - def test_multiple_instances(self): - inst1 = _v2_instance(name=f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-1") - inst2 = _v2_instance(name=f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-2") - findings = self._run([inst1, inst2]) - assert len(findings) == 2 - - def test_empty_project_returns_no_findings(self): - findings = self._run([]) - assert findings == [] - - def test_custom_idle_days(self): - # With idle_days=7, an instance 8 days since updateTime should be flagged - eight_days_ago = NOW - timedelta(days=8) - # But age must also be >= threshold_medium (75% of 7 = 5.25 days → 5 days) - findings = 
self._run( - [_v2_instance(update_time=eight_days_ago)], - idle_days=7, + _list_instances(session, _PROJECT) + assert session.get.call_count == 2 + + +class TestListInstancesUnreachable: + def test_single_unreachable_location_collected(self): + session = _session(_ok({"unreachable": ["asia-east1"]})) + _, unreachable, _ = _list_instances(session, _PROJECT) + assert "asia-east1" in unreachable + + def test_multiple_unreachable_locations(self): + session = _session(_ok({"unreachable": ["asia-east1", "europe-west3"]})) + _, unreachable, _ = _list_instances(session, _PROJECT) + assert "asia-east1" in unreachable + assert "europe-west3" in unreachable + + def test_unreachable_deduplicated_across_pages(self): + session = _session( + _ok({"unreachable": ["asia-east1"], "nextPageToken": "t1"}), + _ok({"unreachable": ["asia-east1"]}), ) - assert len(findings) == 1 - assert findings[0].confidence == ConfidenceLevel.HIGH + _, unreachable, _ = _list_instances(session, _PROJECT) + assert unreachable.count("asia-east1") == 1 + + def test_empty_string_in_unreachable_skipped(self): + session = _session(_ok({"unreachable": ["", "us-east1"]})) + _, unreachable, _ = _list_instances(session, _PROJECT) + assert "" not in unreachable + assert "us-east1" in unreachable + + def test_no_unreachable_when_field_absent(self): + session = _session(_ok({})) + _, unreachable, _ = _list_instances(session, _PROJECT) + assert unreachable == [] + + def test_unreachable_from_multiple_pages_merged(self): + session = _session( + _ok({"unreachable": ["asia-east1"], "nextPageToken": "t1"}), + _ok({"unreachable": ["europe-west3"]}), + ) + _, unreachable, _ = _list_instances(session, _PROJECT) + assert "asia-east1" in unreachable + assert "europe-west3" in unreachable - def test_rule_metadata_and_rule_id(self): - assert RULE_METADATA["id"] == "gcp.vertex.workbench.idle" - assert RULE_METADATA["category"] == "ai" - assert find_idle_workbench_instances.RULE_ID == "gcp.vertex.workbench.idle" - def 
test_age_fallback_signal_says_age_not_updatetime(self): - inst = _v2_instance() - inst["updateTime"] = "" - findings = self._run([inst]) - assert len(findings) == 1 - signals = findings[0].evidence.signals_used - activity_signal = next(s for s in signals if "control-plane activity" in s) - assert "age (fallback)" in activity_signal - assert "updateTime" not in activity_signal - - def test_normal_signal_credits_updatetime(self): - findings = self._run([_v2_instance()]) - signals = findings[0].evidence.signals_used - activity_signal = next(s for s in signals if "control-plane activity" in s) - assert "updateTime" in activity_signal - - def test_tpu_instance_labelled_tpu_not_gpu(self): - findings = self._run([_v2_instance(accel_type="TPU_V2", accel_count=1)]) - assert len(findings) == 1 - f = findings[0] - assert "TPU" in f.title - assert "GPU" not in f.title - assert any("TPU-backed" in s for s in f.evidence.signals_used) - - def test_tpu_cost_includes_tpu_addon(self): - findings = self._run([_v2_instance(accel_type="TPU_V2", accel_count=1)]) - assert len(findings) == 1 - expected = _MACHINE_MONTHLY_COST["n1-standard-4"] + _GPU_MONTHLY_COST_EACH["TPU_V2"] - assert findings[0].estimated_monthly_cost_usd == expected - - def test_500_from_v2_does_not_abort_scan(self): - """A transient 500 from the v2 API should return empty results, not raise.""" - mock_session = MagicMock() - resp_500 = MagicMock() - resp_500.status_code = 500 - mock_session.get.return_value = resp_500 - - with patch( - "cleancloud.providers.gcp.rules.ai.workbench_idle.AuthorizedSession", - return_value=mock_session, - ): - findings = find_idle_workbench_instances(project_id=_PROJECT, credentials=MagicMock()) - assert findings == [] +class TestListInstancesErrors: + def test_403_raises_permission_error(self): + with pytest.raises(PermissionError, match="notebooks.instances.list"): + _list_instances(_session(_err(403)), _PROJECT) + + def test_404_returns_empty_clean(self): + instances, unreachable, 
failed = _list_instances(_session(_err(404)), _PROJECT) + assert instances == [] + assert unreachable == [] + assert failed is False + + def test_400_returns_empty_with_discovery_failed(self): + instances, unreachable, failed = _list_instances(_session(_err(400)), _PROJECT) + assert instances == [] + assert unreachable == [] + assert failed is True + + def test_500_sets_discovery_failed(self): + _, _, failed = _list_instances(_session(_err(500)), _PROJECT) + assert failed is True + + def test_503_sets_discovery_failed(self): + _, _, failed = _list_instances(_session(_err(503)), _PROJECT) + assert failed is True + + def test_5xx_preserves_instances_from_earlier_pages(self): + """Instances already fetched before a 5xx error must be returned.""" + inst = {"name": "projects/p/locations/us-central1/instances/i1", "state": "ACTIVE"} + session = _session( + _ok({"instances": [inst], "nextPageToken": "t1"}), + _err(503), + ) + instances, _, failed = _list_instances(session, _PROJECT) + assert instances == [inst] + assert failed is True + + def test_network_error_sets_discovery_failed(self): + session = MagicMock() + session.get.side_effect = ConnectionError("timeout") + _, _, failed = _list_instances(session, _PROJECT) + assert failed is True + + def test_network_error_preserves_earlier_instances(self): + inst = {"name": "projects/p/locations/us-central1/instances/i1", "state": "ACTIVE"} + session = _session( + _ok({"instances": [inst], "nextPageToken": "t1"}), + ) + session.get.side_effect = [ + _ok({"instances": [inst], "nextPageToken": "t1"}), + ConnectionError("dropped"), + ] + instances, _, failed = _list_instances(session, _PROJECT) + assert instances == [inst] + assert failed is True + + def test_400_emits_warning_with_project(self): + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + _list_instances(_session(_err(400)), _PROJECT) + msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning)) + assert 
_PROJECT in msgs + + def test_500_emits_warning_with_status_code(self): + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + _list_instances(_session(_err(500)), _PROJECT) + msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning)) + assert "500" in msgs + + def test_network_error_emits_warning_with_project(self): + session = MagicMock() + session.get.side_effect = OSError("no route to host") + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + _list_instances(session, _PROJECT) + msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning)) + assert _PROJECT in msgs # --------------------------------------------------------------------------- -# _list_instances permission error propagation +# Rule metadata # --------------------------------------------------------------------------- -class TestListInstancesPermissionError: - def test_403_raises_permission_error(self): - mock_session = MagicMock() - response = MagicMock() - response.status_code = 403 - mock_session.get.return_value = response - - with patch( - "cleancloud.providers.gcp.rules.ai.workbench_idle.AuthorizedSession", - return_value=mock_session, - ): - with pytest.raises(PermissionError, match="notebooks.instances.list"): - find_idle_workbench_instances(project_id=_PROJECT, credentials=MagicMock()) - - def test_404_returns_empty(self): - mock_session = MagicMock() - response = MagicMock() - response.status_code = 404 - mock_session.get.return_value = response - - with patch( - "cleancloud.providers.gcp.rules.ai.workbench_idle.AuthorizedSession", - return_value=mock_session, - ): - findings = find_idle_workbench_instances(project_id=_PROJECT, credentials=MagicMock()) - assert findings == [] +class TestRuleMetadata: + def test_rule_id(self): + assert RULE_METADATA["id"] == "gcp.vertex.workbench.idle" + + def test_category(self): + assert RULE_METADATA["category"] == "ai" + + 
def test_service(self): + assert RULE_METADATA["service"] == "notebooks" + + def test_cost_impact(self): + assert RULE_METADATA["cost_impact"] == "high" + + def test_rule_id_attribute_on_function(self): + assert find_idle_workbench_instances.RULE_ID == "gcp.vertex.workbench.idle" From 8a84e3e21c01a664d5c3ea0ce5cdc88970674ba2 Mon Sep 17 00:00:00 2001 From: javvaji-devops Date: Wed, 6 May 2026 14:57:59 +0100 Subject: [PATCH 3/4] linting --- .../providers/gcp/rules/ai/workbench_idle.py | 9 ++------ .../azure/test_azure_app_service_idle.py | 2 +- .../gcp/ai/test_gcp_workbench_idle.py | 23 ++++++++++--------- 3 files changed, 15 insertions(+), 19 deletions(-) diff --git a/cleancloud/providers/gcp/rules/ai/workbench_idle.py b/cleancloud/providers/gcp/rules/ai/workbench_idle.py index 91f4f52..e906198 100644 --- a/cleancloud/providers/gcp/rules/ai/workbench_idle.py +++ b/cleancloud/providers/gcp/rules/ai/workbench_idle.py @@ -55,9 +55,7 @@ # Exact documented resource-name pattern (spec 3.1, 7): # projects/{projectId}/locations/{location}/instances/{instanceId} # All four non-empty path segments must be present. 
-_INSTANCE_NAME_RE = re.compile( - r"^projects/[^/]+/locations/[^/]+/instances/[^/]+$" -) +_INSTANCE_NAME_RE = re.compile(r"^projects/[^/]+/locations/[^/]+/instances/[^/]+$") def find_idle_workbench_instances( @@ -117,10 +115,7 @@ def _list_instances( results: list = [] unreachable: list = [] discovery_failed = False - url = ( - f"https://notebooks.googleapis.com/v2" - f"/projects/{project_id}/locations/-/instances" - ) + url = f"https://notebooks.googleapis.com/v2" f"/projects/{project_id}/locations/-/instances" params: dict = {"pageSize": 100} while True: diff --git a/tests/cleancloud/providers/azure/test_azure_app_service_idle.py b/tests/cleancloud/providers/azure/test_azure_app_service_idle.py index 6910840..2c49a24 100644 --- a/tests/cleancloud/providers/azure/test_azure_app_service_idle.py +++ b/tests/cleancloud/providers/azure/test_azure_app_service_idle.py @@ -792,7 +792,7 @@ def _failing_iter(*args, **kwargs): # Yield nothing, then raise — simulates a pager that fails before # returning any results (first page network error, etc.) 
raise Exception("pager failed mid-iteration") - yield # noqa: unreachable — makes this a generator + yield # makes this a generator web.web_apps.list_web_jobs.side_effect = _failing_iter findings = find_idle_app_services( diff --git a/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py b/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py index fd01236..e7d9ffc 100644 --- a/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py +++ b/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py @@ -80,9 +80,7 @@ def _invoke(**kwargs): "cleancloud.providers.gcp.rules.ai.workbench_idle.AuthorizedSession", return_value=_session(_ok()), ): - return find_idle_workbench_instances( - project_id=_PROJECT, credentials=MagicMock(), **kwargs - ) + return find_idle_workbench_instances(project_id=_PROJECT, credentials=MagicMock(), **kwargs) def _invoke_with_session(mock_session, **kwargs): @@ -91,9 +89,7 @@ def _invoke_with_session(mock_session, **kwargs): "cleancloud.providers.gcp.rules.ai.workbench_idle.AuthorizedSession", return_value=mock_session, ): - return find_idle_workbench_instances( - project_id=_PROJECT, credentials=MagicMock(), **kwargs - ) + return find_idle_workbench_instances(project_id=_PROJECT, credentials=MagicMock(), **kwargs) # --------------------------------------------------------------------------- @@ -110,13 +106,19 @@ def test_always_empty(self): def test_empty_when_api_returns_active_instances(self): """EMITTING_DISABLED: ACTIVE instances in API response still yield no findings.""" - inst = {"name": f"projects/{_PROJECT}/locations/us-central1/instances/wb-1", "state": "ACTIVE"} + inst = { + "name": f"projects/{_PROJECT}/locations/us-central1/instances/wb-1", + "state": "ACTIVE", + } result = _invoke_with_session(_session(_ok({"instances": [inst]}))) assert result == [] def test_empty_when_api_returns_multiple_instances(self): instances = [ - {"name": f"projects/{_PROJECT}/locations/us-central1/instances/wb-{i}", "state": 
"ACTIVE"} + { + "name": f"projects/{_PROJECT}/locations/us-central1/instances/wb-{i}", + "state": "ACTIVE", + } for i in range(5) ] result = _invoke_with_session(_session(_ok({"instances": instances}))) @@ -131,9 +133,7 @@ def test_empty_when_api_returns_multiple_instances(self): class TestIdleDaysValidation: def test_zero_raises_value_error(self): with pytest.raises(ValueError, match="idle_days must be >= 1"): - find_idle_workbench_instances( - project_id=_PROJECT, credentials=MagicMock(), idle_days=0 - ) + find_idle_workbench_instances(project_id=_PROJECT, credentials=MagicMock(), idle_days=0) def test_negative_one_raises(self): with pytest.raises(ValueError, match="idle_days must be >= 1"): @@ -352,6 +352,7 @@ def test_two_pages_accumulates_instances(self): def test_three_pages_all_accumulated(self): def _inst(i): return {"name": f"projects/p/locations/us-central1/instances/i{i}", "state": "ACTIVE"} + session = _session( _ok({"instances": [_inst(1)], "nextPageToken": "t1"}), _ok({"instances": [_inst(2)], "nextPageToken": "t2"}), From 2376379f4c9a9828241dbd82063fd150c4d1bb62 Mon Sep 17 00:00:00 2001 From: javvaji-devops Date: Wed, 6 May 2026 15:15:21 +0100 Subject: [PATCH 4/4] Bump version from 1.29.0 to 1.30.0 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b50a9dc..7e2a2bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "cleancloud" -version = "1.29.0" +version = "1.30.0" description = "Read-only cloud hygiene for AWS, Azure, and GCP. Multi-account org scanning, CI/CD enforcement, and deterministic cost modeling. No agents, no telemetry." readme = "README.md" requires-python = ">=3.10"