From 6d23731331b8bd7dcd0330f529bf16e11fc7ff21 Mon Sep 17 00:00:00 2001
From: javvaji-devops <venkata.javvaji.91@gmail.com>
Date: Wed, 29 Apr 2026 22:03:09 +0100
Subject: [PATCH 1/4] gcp.vertex.training_job.long_running

---
 .../ai/vertex_training_job_long_running.py    | 845 ++++++---------
 docs/rules/gcp.md                             |  17 +-
 .../ai/vertex_training_job_long_running.md    | 508 +++++++++
 ...st_gcp_vertex_training_job_long_running.py | 984 +++++++++++-------
 tests/e2e/gcp/test_gcp_ai_rules_smoke.py      |   4 +-
 5 files changed, 1457 insertions(+), 901 deletions(-)
 create mode 100644 docs/specs/gcp/ai/vertex_training_job_long_running.md

diff --git a/cleancloud/providers/gcp/rules/ai/vertex_training_job_long_running.py b/cleancloud/providers/gcp/rules/ai/vertex_training_job_long_running.py
index cc2d1d5..4e14c0e 100644
--- a/cleancloud/providers/gcp/rules/ai/vertex_training_job_long_running.py
+++ b/cleancloud/providers/gcp/rules/ai/vertex_training_job_long_running.py
@@ -1,5 +1,58 @@
+"""
+Rule: gcp.vertex.training_job.long_running
+
+    (spec -- docs/specs/gcp/ai/vertex_training_job_long_running.md)
+
+Intent:
+    Detect Vertex AI training resources (CustomJob and TrainingPipeline) that are
+    provably still in an exact documented running state and whose documented startTime
+    shows they have been running for at least a conservative review threshold.
+
+    This rule is deliberately precision-first. It is a review-candidate rule only.
+    It is not proof that a job is hung, not proof that no useful progress is occurring,
+    not proof that the resource is safe to cancel, and not proof of a specific saving.
+
+Covered resource types (spec 3.1, 3.2):
+    - Vertex AI CustomJob  (state == JOB_STATE_RUNNING)
+    - Vertex AI TrainingPipeline  (state == PIPELINE_STATE_RUNNING)
+
+Runtime anchor (spec 7, 9.4):
+    - Canonical anchor: startTime (when the job first entered running state)
+    - createTime is NOT a fallback -- missing startTime must skip (spec 9.4)
+    - Future startTime values must skip (spec 7)
+
+Exclusions:
+    - resource name absent or not exactly matching the documented pattern (spec 7, 11)
+    - state not exactly equal to the documented running enum (spec 3.3, 9.1)
+    - startTime absent, non-RFC3339, unparsable, or future (spec 7, 9.1)
+    - elapsed runtime < long_running_hours_threshold (spec 9.1)
+    - location filter set and parsed location does not exactly match (spec 7)
+
+Detection (all must be true to emit):
+    1. resource is CustomJob or TrainingPipeline
+    2. state is exactly JOB_STATE_RUNNING or PIPELINE_STATE_RUNNING
+    3. startTime is valid and not future
+    4. elapsed_runtime_seconds >= long_running_hours_threshold * 3600
+
+Confidence / Risk (spec 9.2, 9.3):
+    HIGH confidence:   elapsed >= 3 * threshold  (clearly runaway)
+    MEDIUM confidence: threshold <= elapsed < 3 * threshold
+    CRITICAL risk:     HIGH confidence + provably accelerator-backed
+    HIGH risk:         HIGH confidence + hardware not proven accelerated
+    MEDIUM risk:       all MEDIUM confidence findings
+
+Cost model (spec 10.1, 10.2):
+    estimated_monthly_cost_usd = None
+    Training jobs are transient, not recurring monthly resources.
+    Static pricing tables are out of scope for the canonical rule.
+
+APIs:
+    - aiplatform.googleapis.com/v1: projects/{project}/locations/-/customJobs
+    - aiplatform.googleapis.com/v1: projects/{project}/locations/-/trainingPipelines
+"""
+
 import json
-import math
+import re
 import warnings
 from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime, timezone
@@ -26,14 +79,12 @@
 # Machine type prefixes for Cloud TPU nodes (Vertex AI TPU training).
 # ct4- uses a trailing dash (exact family anchor); ct5/ct6/ct7 match all sub-variants.
 # "tpu" covers tpu7x-* and any future tpu-prefixed names.
-# Use _is_tpu_machine() rather than calling .startswith() with this tuple directly —
-# that function enforces the correct per-family anchor rules.
+# Use _is_tpu_machine() rather than calling .startswith() with this tuple directly.
 _TPU_MACHINE_PREFIXES = ("ct4-", "ct5", "ct6", "ct7", "tpu")
 
-# High-cost accelerator types: GPU families and TPU pods.
-# Named _ACCELERATOR_TYPES (not _GPU_ACCELERATORS) because TPU variants are included.
-# Keep in sync with MachineSpec.AcceleratorType in the Vertex AI REST reference.
-# Entries marked [est] have no published GCP Vertex AI pricing; costs are estimates.
+# Accelerator types for hardware classification (spec 8.1).
+# A job is accelerator-backed when any worker pool uses one of these types
+# with a nonzero count, or when the machine type is in a bundled GPU/TPU family.
 _ACCELERATOR_TYPES = frozenset(
     {
         # Volta / Turing / Ampere
@@ -42,174 +93,28 @@
         "NVIDIA_TESLA_P100",
         "NVIDIA_TESLA_T4",
         "NVIDIA_TESLA_V100",
-        "NVIDIA_TESLA_A100",  # A100 40GB (add-on; a2-* bundles it)
-        "NVIDIA_A100_80GB",  # A100 80GB (add-on; a2-ultragpu-* bundles it)
+        "NVIDIA_TESLA_A100",
+        "NVIDIA_A100_80GB",
         # Ada / Hopper / Blackwell
         "NVIDIA_L4",
         "NVIDIA_H100_80GB",
         "NVIDIA_H100_MEGA_80GB",
-        "NVIDIA_H200_141GB",  # [est] H200 141GB
-        "NVIDIA_B200",  # [est] Blackwell B200 — pre-GA
-        "NVIDIA_GB200",  # [est] Grace Blackwell NVL — pre-GA
-        "NVIDIA_RTX_PRO_6000",  # [est] RTX Pro 6000 Ada
-        # TPU
-        "TPU_V2",
-        "TPU_V3",
-        "TPU_V4_POD",
-        "TPU_V5_LITEPOD",  # [est] v5e litepod
-    }
-)
-
-# Monthly cost per machine type (on-demand, us-central1, 730 h/month).
-# Bundled GPU families (a2-*, a3-*, a4-*, a4x-*, g2-*, g4-*) include accelerator cost.
-# TPU machine types (ct5lp-*, ct6e-*, tpu7x-*, …) include TPU chip cost.
-_MACHINE_MONTHLY_COST = {
-    "n1-standard-1": 35.0,
-    "n1-standard-2": 69.0,
-    "n1-standard-4": 138.0,
-    "n1-standard-8": 277.0,
-    "n1-standard-16": 554.0,
-    "n1-standard-32": 1_107.0,
-    "n1-standard-64": 2_214.0,
-    "n1-standard-96": 3_321.0,
-    "n1-highmem-2": 93.0,
-    "n1-highmem-4": 187.0,
-    "n1-highmem-8": 374.0,
-    "n1-highmem-16": 748.0,
-    "n1-highmem-32": 1_496.0,
-    "n1-highmem-64": 2_991.0,
-    "n1-highmem-96": 4_487.0,
-    "n2-standard-2": 78.0,
-    "n2-standard-4": 157.0,
-    "n2-standard-8": 314.0,
-    "n2-standard-16": 628.0,
-    "n2-standard-32": 1_255.0,
-    "c2-standard-4": 166.0,
-    "c2-standard-8": 332.0,
-    "c2-standard-16": 664.0,
-    "c2-standard-30": 1_245.0,
-    "c2-standard-60": 2_490.0,
-    # a2-* (A100 40GB bundled)
-    "a2-highgpu-1g": 2_933.0,
-    "a2-highgpu-2g": 5_866.0,
-    "a2-highgpu-4g": 11_732.0,
-    "a2-highgpu-8g": 23_464.0,
-    "a2-megagpu-16g": 46_927.0,
-    # a2-ultragpu-* (A100 80GB bundled)
-    "a2-ultragpu-1g": 5_103.0,
-    "a2-ultragpu-2g": 10_206.0,
-    "a2-ultragpu-4g": 20_412.0,
-    "a2-ultragpu-8g": 40_824.0,
-    # a3-* (H100 SXM5 bundled) — 1g/2g/4g priced proportionally to published 8g rate
-    "a3-highgpu-1g": 7_299.0,  # [est] 1/8 of 8g
-    "a3-highgpu-2g": 14_598.0,  # [est] 2/8 of 8g
-    "a3-highgpu-4g": 29_197.0,  # [est] 4/8 of 8g
-    "a3-highgpu-8g": 58_393.0,  # published GCP rate
-    "a3-megagpu-8g": 65_000.0,  # [est] 8× H100, high-mem NVLink config
-    "a3-ultragpu-8g": 80_000.0,  # [est] 8× H200 141GB
-    # a4-* (B200 bundled) — [est] no published GCP rate
-    "a4-highgpu-8g": 100_000.0,  # [est] 8× B200 next-gen flagship
-    # a4x-* (GB200 NVL bundled) — [est]
-    "a4x-highgpu-4g": 60_000.0,  # [est] 4× GB200 NVLink
-    # g2-* (L4 bundled)
-    "g2-standard-4": 706.0,
-    "g2-standard-8": 1_060.0,
-    "g2-standard-12": 1_590.0,
-    "g2-standard-16": 2_120.0,
-    "g2-standard-24": 3_180.0,
-    "g2-standard-32": 4_241.0,
-    "g2-standard-48": 6_361.0,
-    "g2-standard-96": 12_722.0,
-    # g4-* (RTX Pro 6000 Ada bundled) — documented sizes per Vertex AI training docs:
-    # 48=1 GPU, 96=2 GPUs, 192=4 GPUs, 384=8 GPUs
-    # Pricing [est]: no published GCP rate; ~$2,800/GPU/mo (RTX Pro 6000 + host vCPU share)
-    "g4-standard-48": 2_800.0,  # [est] 1 GPU
-    "g4-standard-96": 5_600.0,  # [est] 2 GPUs
-    "g4-standard-192": 11_200.0,  # [est] 4 GPUs
-    "g4-standard-384": 22_400.0,  # [est] 8 GPUs
-    # Cloud TPU machine types — cost is the TPU chip(s) + host VM bundled
-    # TPU v5e (ct5lp-hightpu-*): ~$1.20/chip-hr (published)
-    "ct5lp-hightpu-1t": 876.0,
-    "ct5lp-hightpu-4t": 3_504.0,
-    "ct5lp-hightpu-8t": 7_008.0,
-    # TPU v5p (ct5p-hightpu-*): ~$1.80/chip-hr [est]
-    "ct5p-hightpu-4t": 5_256.0,  # [est]
-    "ct5p-hightpu-8t": 10_512.0,  # [est]
-    # TPU v6e (ct6e-standard-*): ~$1.80/chip-hr [est]
-    "ct6e-standard-1t": 1_314.0,  # [est] 1 chip
-    "ct6e-standard-4t": 5_256.0,  # [est] 4 chips
-    "ct6e-standard-8t": 10_512.0,  # [est] 8 chips
-}
-_DEFAULT_MACHINE_MONTHLY_COST = 150.0
-# Fallback for unrecognized TPU machine types — avoids the $0.21/hr generic default
-# massively underestimating a 4-chip-equivalent TPU job.
-_DEFAULT_TPU_MONTHLY_COST = 10_000.0  # ~$13.70/hr, conservative multi-host TPU estimate
-
-# Duration-tiered fallback costs for TrainingPipelines when workerPoolSpecs cannot be parsed.
-# Longer-running pipelines are statistically more likely to be GPU-backed workloads.
-# Three tiers (inlined in find_long_running_vertex_training_jobs):
-#   >24h → $20/hr (probable multi-GPU), 6–24h → $5/hr (ambiguous), else → $1/hr.
-# These are not exact — large GPU pipelines cost $50–$500+/hr; these are indicative minimums.
-
-# Additional monthly cost per accelerator unit for n1-*/n2-*/c2-* machines (add-on pricing).
-# Bundled families (a2-*, a3-*, a4-*, a4x-*, g2-*, g4-*, ct*/tpu7x-*) already include
-# accelerator cost in _MACHINE_MONTHLY_COST — no add-on needed for those.
-# All costs: us-central1 on-demand, 730 h/month.
-# Entries marked [est] use conservative estimates — no published GCP Vertex AI rate.
-_ACCELERATOR_MONTHLY_COST_EACH = {
-    # Volta / Turing / Ampere (published GCP rates)
-    "NVIDIA_TESLA_K80": 392.0,
-    "NVIDIA_TESLA_P4": 438.0,  # ~$0.60/hr
-    "NVIDIA_TESLA_P100": 1_022.0,
-    "NVIDIA_TESLA_T4": 311.0,
-    "NVIDIA_TESLA_V100": 1_385.0,
-    "NVIDIA_TESLA_A100": 2_933.0,  # A100 40GB add-on (n1-* only; a2-* bundles)
-    "NVIDIA_A100_80GB": 5_103.0,  # A100 80GB add-on
-    # Ada / Hopper (published GCP rates)
-    "NVIDIA_L4": 680.0,
-    "NVIDIA_H100_80GB": 8_000.0,
-    "NVIDIA_H100_MEGA_80GB": 10_000.0,
-    # Newer accelerators — [est] conservative estimates; update when GCP publishes rates
-    "NVIDIA_H200_141GB": 11_000.0,  # [est] ~1.4× H100 80GB
-    "NVIDIA_B200": 18_000.0,  # [est] Blackwell B200 — pre-GA
-    "NVIDIA_GB200": 22_000.0,  # [est] Grace Blackwell NVL — pre-GA
-    "NVIDIA_RTX_PRO_6000": 2_200.0,  # [est] RTX Pro 6000 Ada workstation
-    # TPU (published GCP rates)
-    "TPU_V2": 3_811.0,
-    "TPU_V3": 5_840.0,
-    "TPU_V4_POD": 9_402.0,
-    "TPU_V5_LITEPOD": 3_500.0,  # [est] v5e litepod per unit
-}
-
-_HOURS_PER_MONTH = 730.0
-
-# Machine type prefixes and accelerator types whose pricing is estimated (no published GCP rate).
-# Used to tag findings with pricing_confidence="partial_estimate" vs "published".
-_PRICING_ESTIMATED_MACHINE_PREFIXES = (
-    "a3-megagpu",
-    "a3-ultragpu",  # H200/future a3 variants
-    "a4-",  # B200
-    "a4x-",  # GB200 NVLink
-    "g4-",  # RTX Pro 6000 Ada
-    "ct5p-",  # TPU v5p
-    "ct6e-",  # TPU v6e
-    "tpu7x-",  # TPU v7 (pre-GA)
-)
-_PRICING_ESTIMATED_ACCEL_TYPES = frozenset(
-    {
         "NVIDIA_H200_141GB",
         "NVIDIA_B200",
         "NVIDIA_GB200",
         "NVIDIA_RTX_PRO_6000",
+        # TPU
+        "TPU_V2",
+        "TPU_V3",
+        "TPU_V4_POD",
         "TPU_V5_LITEPOD",
     }
 )
 
-# Full accelerator count per bundled machine type — used for co-scheduling cost correction.
-# Vertex AI may co-schedule floor(N/accel_count) replicas onto one VM when accel_count <= N//2,
-# so each replica pays only 1/replicas_per_vm of the machine cost.
-# g2-standard-32 is omitted: its GPU count is ambiguous in GCP docs (co-scheduling impact is low
-# for single-GPU machines anyway).
+# Accelerators per machine: GPUs for bundled GPU families, chips per host for Cloud TPU
+# machine types. Used by _tpu_topology_host_count to derive actual host count from tpuTopology,
+# since Vertex AI always reports replicaCount=1 for TPU pods regardless of scale.
+# Hardware classification only -- not used for cost estimation.
 _BUNDLED_ACCELERATOR_COUNT: dict[str, int] = {
     # a2-* (A100 40GB)
     "a2-highgpu-1g": 1,
@@ -241,12 +146,12 @@
     "g2-standard-24": 2,
     "g2-standard-48": 4,
     "g2-standard-96": 8,
-    # g4-* (RTX Pro 6000 Ada) — 48=1 GPU, 96=2 GPUs, 192=4 GPUs, 384=8 GPUs
+    # g4-* (RTX Pro 6000 Ada) — 48=1 GPU, 96=2, 192=4, 384=8
     "g4-standard-48": 1,
     "g4-standard-96": 2,
     "g4-standard-192": 4,
     "g4-standard-384": 8,
-    # Cloud TPU machines — chip count encoded in machine name suffix (e.g. -4t = 4 chips)
+    # Cloud TPU machine types
     "ct5lp-hightpu-1t": 1,
     "ct5lp-hightpu-4t": 4,
     "ct5lp-hightpu-8t": 8,
@@ -255,34 +160,21 @@
     "ct6e-standard-1t": 1,
     "ct6e-standard-4t": 4,
     "ct6e-standard-8t": 8,
-    "tpu7x-standard-4t": 4,  # TPU v7 — 4 chips/host (pre-GA)
+    "tpu7x-standard-4t": 4,
 }
 
-# Jobs running longer than this multiple of the threshold are almost certainly runaway
+# Duration multiplier beyond which a job is confidently runaway (spec 9.2).
 _RUNAWAY_MULTIPLIER = 3
 
-# Default threshold
+# Default threshold hours (spec 6.3).
 _DEFAULT_LONG_RUNNING_HOURS = 24
 
-# Fraction of threshold at which GPU early-warning fires (before crossing threshold).
-# 90% reduces noise vs 75%: a 21.6h GPU job (at 24h threshold) is genuinely unusual;
-# an 18h job is still plausible for legitimate large-scale training.
-_EARLY_WARNING_FRACTION = 0.9
-
-# (project_id, resource) pairs where locations/- wildcard returned 400 — fall back
+# (project_id, resource) pairs where locations/- wildcard returned 400 -- fall back
 # to per-region calls for that specific combination.
-# Keyed per (project_id, resource) so:
-#   - customJobs and trainingPipelines are tracked independently (one may support wildcard)
-#   - project A's failure does not suppress the wildcard attempt for project B
-# Written lazily on first 400; read on subsequent scans in the same process.
-# A race between parallel calls is benign: at worst both try the wildcard once and
-# both add the same key — set.add is GIL-protected and idempotent.
+# Keyed per (project_id, resource) so customJobs and trainingPipelines are independent.
 _wildcard_unsupported: set[tuple[str, str]] = set()
 
 # Known Vertex AI locations for fallback when the wildcard is not supported.
-# GCP adds new regions over time — this list may miss recently-announced locations.
-# To ensure full coverage: grant locations/- wildcard support (roles/aiplatform.viewer
-# is sufficient for most projects), or extend this list when new regions are confirmed.
 # Last reviewed: 2026-04-17. Source: https://cloud.google.com/vertex-ai/docs/general/locations
 _VERTEX_LOCATIONS = [
     "us-central1",
@@ -309,84 +201,90 @@
 ]
 
 
+# Strict RFC3339 validation pattern (spec 7).
+# Accepts: YYYY-MM-DDTHH:MM:SS[.fractional](Z | +HH:MM | -HH:MM)
+# Rejects: date-only, space separator, missing timezone, lowercase t/z designators.
+_RFC3339_RE = re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?(Z|[+-]\d{2}:\d{2})$")
+
+# Maps internal job_type strings to the expected URL/name path segment.
+_RESOURCE_TYPE_SEGMENT: dict[str, str] = {
+    "customJob": "customJobs",
+    "trainingPipeline": "trainingPipelines",
+}
+
+# Maps job_type to the exact running-state enum the resource must expose (spec 3.3, 9.1).
+_EXPECTED_STATE: dict[str, str] = {
+    "customJob": "JOB_STATE_RUNNING",
+    "trainingPipeline": "PIPELINE_STATE_RUNNING",
+}
+
+
+def _validate_resource_name(name: str, job_type: str) -> bool:
+    """
+    Return True only when name exactly matches the documented Vertex AI resource-name
+    pattern for the given job type (spec 7):
+        projects/{project}/locations/{location}/customJobs/{id}
+        projects/{project}/locations/{location}/trainingPipelines/{id}
+
+    All six slash-delimited segments must be present and non-empty.  Any extra
+    or missing path segments, or a wrong resource-type segment, returns False.
+    """
+    parts = name.split("/")
+    return (
+        len(parts) == 6
+        and parts[0] == "projects"
+        and parts[2] == "locations"
+        and parts[4] == _RESOURCE_TYPE_SEGMENT[job_type]
+        and bool(parts[1])  # project id
+        and bool(parts[3])  # location
+        and bool(parts[5])  # resource id
+    )
+
+
 def find_long_running_vertex_training_jobs(
     *,
     project_id: str,
     credentials,
     region_filter: Optional[str] = None,
-    long_running_hours: int = _DEFAULT_LONG_RUNNING_HOURS,
-    early_warning_fraction: float = _EARLY_WARNING_FRACTION,
-    runaway_multiplier: int = _RUNAWAY_MULTIPLIER,
-    expensive_hourly_threshold: float = 20.0,
+    long_running_hours_threshold: int = _DEFAULT_LONG_RUNNING_HOURS,
 ) -> List[Finding]:
     """
-    Find Vertex AI CustomJobs and TrainingPipelines that have been running
-    longer than expected.
-
-    Most training jobs complete within a few hours. A job still running after
-    24 hours is unusual — it may be hung, deadlocked in distributed training,
-    caught in an OOM loop, or simply forgotten after a project was cancelled.
-
-    GPU-backed training is especially costly: an A100 40GB node (a2-highgpu-1g)
-    runs at ~$4/hr; an a3-highgpu-8g (8 × H100) runs at ~$80/hr. Multi-worker
-    jobs multiply cost linearly.
-
-    Detection logic:
-    - Queries both CustomJobs (state="JOB_STATE_RUNNING") and TrainingPipelines
-      (state="PIPELINE_STATE_RUNNING") via the Vertex AI REST API, in parallel
-    - Duration is computed from startTime (when compute began billing); falls
-      back to createTime if startTime is absent (jobs stuck in pre-run phases)
-    - Hardware: CustomJobs expose workerPoolSpecs directly; TrainingPipelines
-      attempt to parse workerPoolSpecs from trainingTaskInputs (handling both
-      dict and JSON-string encoding) before falling back to a neutral hourly
-      estimate (~$3/hr). Unknown hardware does NOT set is_accelerator=True — is_accelerator
-      is derived strictly from parsed pool data.
-
-    Cost aggregation:
-    - Each pool's cost = _estimate_hourly_rate_per_replica × effective_replicas
-    - For GPU/CPU pools: effective_replicas = replicaCount from API
-    - For TPU pools: effective_replicas = physical host count derived from tpuTopology
-      (Vertex always reports replicaCount=1 for TPU regardless of pod size)
-    - Total burn rate = sum across ALL pools (not primary pool × total_replicas)
-    - This correctly handles heterogeneous jobs (e.g., a2-highgpu chief + n1 workers)
-
-    Confidence:
-    - HIGH: duration >= long_running_hours × 3 — clearly runaway
-    - MEDIUM: duration >= long_running_hours — worth reviewing
-    - MEDIUM (early warning): accelerator job or expensive CPU cluster
-      (hourly_rate_total > expensive_hourly_threshold) at 90–100% of threshold
-
-    Risk:
-    - CRITICAL: HIGH confidence + GPU/accelerator hardware
-    - HIGH:     HIGH confidence, CPU-only
-    - MEDIUM:   all MEDIUM-confidence findings (GPU or CPU alike)
-
-    Cost reported:
-    - Accrued cost so far: duration_hours × hourly_burn_rate (all worker pools)
-    - estimated_monthly_cost_usd is intentionally None — training jobs are
-      transient, not recurring monthly expenses
-    - Pricing is a static estimate (us-central1, on-demand); actual cost varies
-      by region and committed use discounts
+    Find Vertex AI CustomJobs and TrainingPipelines running beyond the threshold.
+
+    Emits a finding only when all of the following are true (spec 9):
+        1. resource is CustomJob or TrainingPipeline in the exact running state
+        2. startTime is valid and not future (createTime is NOT a fallback; spec 9.4)
+        3. elapsed_runtime_seconds >= long_running_hours_threshold * 3600
+
+    Confidence (spec 9.2):
+        HIGH:   elapsed >= 3 * threshold  (clearly runaway)
+        MEDIUM: threshold <= elapsed < 3 * threshold
+
+    Risk (spec 9.3):
+        CRITICAL: HIGH confidence + provably accelerator-backed
+        HIGH:     HIGH confidence + hardware not proven accelerated
+        MEDIUM:   all MEDIUM confidence findings
+
+    No sub-threshold early warnings are emitted (spec 9.4).
+    No hardcoded pricing tables are used (spec 10.2).
 
     IAM permissions required:
-    - aiplatform.customJobs.list  (roles/aiplatform.viewer)
-    - aiplatform.trainingPipelines.list  (roles/aiplatform.viewer)
+        aiplatform.customJobs.list         (roles/aiplatform.viewer)
+        aiplatform.trainingPipelines.list  (roles/aiplatform.viewer)
     """
-    long_running_hours = max(long_running_hours, 1)
-    early_warning_fraction = max(0.0, min(early_warning_fraction, 1.0))
-    runaway_multiplier = max(1, runaway_multiplier)
-    expensive_hourly_threshold = max(0.0, expensive_hourly_threshold)
+    if long_running_hours_threshold < 1:
+        raise ValueError(
+            f"long_running_hours_threshold must be >= 1, " f"got {long_running_hours_threshold!r}"
+        )
 
+    threshold_seconds = long_running_hours_threshold * 3600
     session = AuthorizedSession(credentials)
     now = datetime.now(timezone.utc)
     findings: List[Finding] = []
     skipped_jobs: int = 0
 
-    # Query both resource types in parallel — each may independently need the
-    # per-region fallback if the locations/- wildcard returns 400.
-    # Results are collected independently: a transient failure on one resource type
-    # still yields findings from the other. PermissionError propagates immediately
-    # (missing IAM is user-actionable and should not be silently swallowed).
+    # Query both resource types in parallel; a failure on one type does not block the other.
+    # PermissionError propagates immediately (missing IAM is user-actionable).
     custom_jobs: list = []
     training_pipelines: list = []
     with ThreadPoolExecutor(max_workers=2) as executor:
@@ -424,40 +322,68 @@ def find_long_running_vertex_training_jobs(
     for job, job_type in [(j, "customJob") for j in custom_jobs] + [
         (p, "trainingPipeline") for p in training_pipelines
     ]:
-        name = job.get("name", "")
-        display_name = job.get("displayName", "")
-        location = _parse_location(name) or "unknown"
+        # --- Identity: exact resource-name pattern (spec 7, 11) ---
+        name = (job.get("name") or "").strip()
+        if not name or not _validate_resource_name(name, job_type):
+            # Empty name or doesn't match expected pattern → skip (spec 7, 11)
+            skipped_jobs += 1
+            continue
+
+        location = name.split("/")[3]  # guaranteed by _validate_resource_name
 
-        if region_filter and location.lower() != region_filter.lower():
+        # Region filter: exact string equality, no case folding (spec 7)
+        if region_filter and location != region_filter:
             continue
 
-        # Duration: prefer startTime (actual compute start); fall back to createTime
-        start_str = job.get("startTime") or job.get("createTime", "")
+        # --- State validation: exact documented running enum (spec 3.3, 9.1) ---
+        expected_state = _EXPECTED_STATE[job_type]
+        actual_state = (job.get("state") or "").strip()
+        if actual_state != expected_state:
+            skipped_jobs += 1
+            continue
+
+        # --- Runtime anchor: startTime only (spec 7, 9.4) ---
+        # createTime is NOT a fallback. Missing startTime must skip unconditionally.
+        start_str = (job.get("startTime") or "").strip()
         if not start_str:
             skipped_jobs += 1
             continue
+
+        # Strict RFC3339 validation (spec 7): reject space separators, date-only, no-tz values.
+        if not _RFC3339_RE.match(start_str):
+            skipped_jobs += 1
+            continue
+
         try:
             start_dt = datetime.fromisoformat(start_str.replace("Z", "+00:00"))
-            if start_dt.tzinfo is None:
+            if start_dt.tzinfo is None:  # defensive; RFC3339 regex guarantees tz
                 start_dt = start_dt.replace(tzinfo=timezone.utc)
-        except ValueError:
+        except (ValueError, AttributeError):
             skipped_jobs += 1
             continue
 
-        duration_hours = (now - start_dt).total_seconds() / 3600
+        # Future startTime is unusable (spec 7)
+        if start_dt > now:
+            skipped_jobs += 1
+            continue
+
+        # --- Duration check (spec 9.1) ---
+        elapsed_seconds = (now - start_dt).total_seconds()
+        if elapsed_seconds < threshold_seconds:
+            continue  # not yet long-running; no sub-threshold early warning (spec 9.4)
+
+        duration_hours = elapsed_seconds / 3600
+        duration_display = round(duration_hours, 1)
+        display_name = (job.get("displayName") or "").strip()
 
-        # Hardware: parse per-pool specs for accurate cost aggregation.
-        # Done before the duration filter so expensive_hourly_threshold can be evaluated.
-        # CustomJob exposes workerPoolSpecs directly. TrainingPipeline may embed
-        # them in trainingTaskInputs (works for custom-training pipelines) or may
-        # not expose them at all (AutoML, managed job types).
+        # --- Hardware classification (spec 8) ---
         if job_type == "customJob":
             raw_worker_specs = job.get("jobSpec", {}).get("workerPoolSpecs", [])
             pools = _parse_worker_pools(raw_worker_specs)
-            hardware_unknown = False
+            # spec 8.1: missing, empty, or all-malformed workerPoolSpecs → hardware_unknown
+            hardware_unknown = not pools
         else:
             task_inputs = job.get("trainingTaskInputs") or {}
-            # The field is occasionally returned as a JSON string rather than a parsed dict
             if isinstance(task_inputs, str):
                 try:
                     task_inputs = json.loads(task_inputs)
@@ -470,95 +396,37 @@ def find_long_running_vertex_training_jobs(
             pools = _parse_worker_pools(raw_worker_specs)
             hardware_unknown = not pools
 
-        # Accelerator detection: derived from actual hardware data only.
-        # hardware_unknown does not imply GPU — it only triggers duration-tiered fallback
-        # cost, keeping risk conservatively MEDIUM.
         is_accelerator = _has_accelerator_hardware(pools)
 
-        # Cost: sum per-pool cost × replica_count across ALL pools.
-        # This correctly handles heterogeneous clusters (different machine types per pool).
-        # For TPU jobs: each "replica" in the pool tuple is a physical host (derived from
-        # tpuTopology), so _total_hourly_rate correctly prices host_count × per-host cost.
-        if pools:
-            total_replicas = sum(r for _, _, _, r in pools) or 1
-            primary_machine = pools[0][0]
-            primary_accel = pools[0][1]
-            primary_accel_count = pools[0][2]
-            hourly_rate_total = _total_hourly_rate(pools)
-            # Capture TPU topology for label — present in raw spec but not in pool tuple
-            primary_tpu_topology: Optional[str] = None
-            if raw_worker_specs and primary_machine and _is_tpu_machine(primary_machine):
-                primary_tpu_topology = (
-                    raw_worker_specs[0].get("machineSpec", {}).get("tpuTopology") or None
-                )
-        else:
-            total_replicas = 1
-            primary_machine = None
-            primary_accel = None
-            primary_accel_count = 0
-            primary_tpu_topology = None
-            # Duration-scaled fallback: longer jobs are more likely to be GPU-class pipelines.
-            # Tiers: >24h → $20/hr (probable multi-GPU), >6h → $5/hr (ambiguous), else → $1/hr.
-            # Still conservative — large GPU pipelines can cost $50–$500+/hr.
-            if duration_hours > 24:
-                hourly_rate_total = 20.0
-            elif duration_hours > 6:
-                hourly_rate_total = 5.0
-            else:
-                hourly_rate_total = 1.0
-
-        # Early-exit: skip all jobs below early_warning_fraction of the threshold.
-        # The early-warning band (fraction–100%) is evaluated in the confidence block below.
-        if duration_hours < long_running_hours * early_warning_fraction:
-            continue
-
-        # Raw values — no intermediate rounding; format inline, round once for storage.
-        # accrued_raw is the true computed cost and is stored in details unchanged.
-        # accrued_display is capped at $1M to avoid distorting summaries with stale-table
-        # outliers, but the raw value is always preserved for analysis.
-        duration_display = round(duration_hours, 1)
-        accrued_raw = hourly_rate_total * duration_hours
-        if accrued_raw > 1_000_000:
-            warnings.warn(
-                f"gcp.vertex.training_job.long_running: accrued cost estimate "
-                f"${accrued_raw:,.0f} exceeds $1M — cost table may be stale or topology "
-                f"unusually large; capping display at $1,000,000",
-                stacklevel=2,
-            )
-        accrued_display = min(accrued_raw, 1_000_000.0)
-        overrun_hours = max(0.0, duration_hours - long_running_hours)
-
-        # Confidence
-        if duration_hours >= long_running_hours * runaway_multiplier:
+        # --- Confidence (spec 9.2) ---
+        if elapsed_seconds >= _RUNAWAY_MULTIPLIER * threshold_seconds:
             confidence = ConfidenceLevel.HIGH
-        elif duration_hours >= long_running_hours:
-            confidence = ConfidenceLevel.MEDIUM
         else:
-            # early_warning_fraction–100% of threshold: fire early for accelerators or
-            # expensive CPU clusters. The replica cap (≤50) suppresses early warnings for
-            # very large CPU-only clusters that are likely intentional distributed workloads
-            # (e.g. 200-node Spark/Beam jobs); accelerators are never gated by replica count.
-            expensive_cpu = hourly_rate_total > expensive_hourly_threshold and total_replicas <= 50
-            if is_accelerator or expensive_cpu:
-                confidence = ConfidenceLevel.MEDIUM
-            else:
-                continue
+            confidence = ConfidenceLevel.MEDIUM
 
-        # Risk model:
-        #   HIGH confidence + accelerator (GPU/TPU) → CRITICAL
-        #   HIGH confidence + CPU or unknown hw     → HIGH
-        #     (unknown hardware + runaway lands here via is_accelerator=False — suspicious enough
-        #      to warrant HIGH without an actual accelerator spec; avoids false CRITICAL)
-        #   MEDIUM confidence                       → MEDIUM
+        # --- Risk (spec 9.3) ---
         if confidence == ConfidenceLevel.HIGH:
             risk = RiskLevel.CRITICAL if is_accelerator else RiskLevel.HIGH
         else:
             risk = RiskLevel.MEDIUM
 
-        # Human-readable job label
+        # --- Finding construction ---
         job_id = name.rsplit("/", 1)[-1] if name else ""
         label = display_name or job_id
 
+        if pools:
+            total_replicas = sum(pool[3] for pool in pools)  # each pool[3] >= 1
+            primary_machine = pools[0][0]
+            primary_accel = pools[0][1]
+            primary_accel_count = pools[0][2]
+            primary_tpu_topology: Optional[str] = pools[0][4]  # stored during parsing
+        else:
+            total_replicas = 1
+            primary_machine = None
+            primary_accel = None
+            primary_accel_count = 0
+            primary_tpu_topology = None
+
         hardware_label = _hardware_label(
             primary_machine,
             primary_accel,
@@ -567,10 +435,12 @@ def find_long_running_vertex_training_jobs(
             tpu_topology=primary_tpu_topology,
         )
 
+        state = actual_state  # already validated == expected enum for this job_type
+        overrun_hours = max(0.0, duration_hours - long_running_hours_threshold)
         threshold_detail = (
-            f"exceeded by {math.floor(overrun_hours)}h"
+            f"exceeded by {int(overrun_hours)}h"
             if overrun_hours > 0
-            else f"{round(long_running_hours - duration_hours, 1)}h below threshold (early warning)"
+            else f"{round(long_running_hours_threshold - duration_hours, 1)}h below threshold"
         )
 
         title = (
@@ -579,82 +449,63 @@ def find_long_running_vertex_training_jobs(
             + ")"
         )
 
-        primary_bundled = _is_bundled_machine(primary_machine)
         signals = [
-            f"Job status: RUNNING for {duration_display}h "
-            f"(threshold: {long_running_hours}h, {threshold_detail})",
-            (
-                f"Burn rate: ~${hourly_rate_total:.2f}/hr across {total_replicas} workers"
-                if total_replicas > 1
-                else f"Burn rate: ~${hourly_rate_total:.2f}/hr"
-            ),
+            f"Job status: {state} for {duration_display}h "
+            f"(threshold: {long_running_hours_threshold}h, {threshold_detail})",
         ]
         if hardware_label:
-            signals.append(
-                f"Hardware: {hardware_label}"
-                + (" (GPU/accelerator)" if is_accelerator and not primary_bundled else "")
-            )
+            signals.append(f"Hardware: {hardware_label}")
         if total_replicas > 1:
             signals.append(
                 f"Distributed training ({total_replicas} workers) — "
-                f"long durations may be expected for large-scale jobs"
+                "long durations may be expected for large-scale jobs"
             )
-        signals.append(
-            f"Accrued cost: ~${accrued_display:,.2f} "
-            f"(${hourly_rate_total:.2f}/hr × {duration_display}h elapsed, "
-            f"us-central1 on-demand — actual cost varies by region and committed use discounts)"
-        )
         if hardware_unknown:
-            signals.append(
-                f"TrainingPipeline: hardware spec not exposed in API response — "
-                f"cost estimate uses duration-scaled placeholder (~${hourly_rate_total:.2f}/hr); "
-                "actual cost varies widely: ~$0.20–$1/hr for small CPU pipelines, "
-                "$50–$100+/hr for large accelerator jobs"
-            )
+            signals.append("Hardware spec not structurally exposed in API response")
 
         not_checked = [
             "Intentional long-running distributed training (LLM pre-training, large fine-tunes)",
             "Checkpoint saving — job may be making progress without visible status updates",
-            "Committed use discounts — actual cost may be significantly lower than on-demand estimate",
+            "Committed use discounts — actual cost may be significantly lower than on-demand",
             "Preemptible/Spot workers — cost and interruption semantics differ",
         ]
 
-        evidence = Evidence(
-            signals_used=signals,
-            signals_not_checked=not_checked,
-            time_window=f"{duration_display}h",
-        )
-
         findings.append(
             Finding(
                 provider="gcp",
                 rule_id="gcp.vertex.training_job.long_running",
                 resource_type="gcp.vertex.training_job",
-                resource_id=name or job_id,
+                resource_id=name,
                 region=location,
                 title=title,
                 summary=(
-                    f"Vertex AI {job_type} '{label}' has been RUNNING for {duration_display}h"
+                    f"Vertex AI {job_type} '{label}' has been {state} for {duration_display}h"
                     + (f" ({hardware_label})" if hardware_label else "")
-                    + f", accruing ~${accrued_display:,.2f} so far."
-                    + f" Most training jobs complete well under {long_running_hours} hours unless intentionally long-running."
+                    + f". Most training jobs complete well under "
+                    f"{long_running_hours_threshold}h unless intentionally long-running."
                 ),
                 reason=(
-                    f"Job has been RUNNING for {duration_display}h "
-                    f"(threshold: {long_running_hours}h)"
+                    f"Job has been {state} for {duration_display}h "
+                    f"(threshold: {long_running_hours_threshold}h)"
                 ),
                 risk=risk,
                 confidence=confidence,
                 detected_at=now,
-                evidence=evidence,
-                # Training jobs are transient — setting estimated_monthly_cost_usd would
-                # corrupt monthly savings totals. Accrued cost lives in details only.
-                estimated_monthly_cost_usd=None,
+                evidence=Evidence(
+                    signals_used=signals,
+                    signals_not_checked=not_checked,
+                    time_window=f"{duration_display}h",
+                ),
+                estimated_monthly_cost_usd=None,  # spec 10.1: transient resource
                 details={
                     "job_name": name,
                     "display_name": display_name or None,
                     "job_type": job_type,
+                    "state": state,
                     "location": location,
+                    "start_time": start_str,
+                    "duration_hours": round(duration_hours, 2),
+                    "long_running_hours_threshold": long_running_hours_threshold,
                     "machine_type": primary_machine or None,
                     "accelerator_type": primary_accel or None,
                     "accelerator_count": (primary_accel_count if primary_accel_count else None),
@@ -662,30 +513,6 @@ def find_long_running_vertex_training_jobs(
                     "total_workers": total_replicas,
                     "is_accelerator": is_accelerator,
                     "hardware_unknown": hardware_unknown,
-                    "duration_hours": round(duration_hours, 2),
-                    "long_running_hours_threshold": long_running_hours,
-                    "burn_rate_per_hour": hourly_rate_total,
-                    "overrun_hours": overrun_hours,
-                    "accrued_cost_usd": accrued_raw,
-                    "cost_type": "accrued_to_date",
-                    "pricing_source": (
-                        "conservative_pipeline_default"
-                        if hardware_unknown
-                        else "static_estimate_us_central1"
-                    ),
-                    "pricing_confidence": (
-                        "pipeline_default" if hardware_unknown else _pricing_confidence(pools)
-                    ),
-                    "pricing_scope": "us-central1_reference",
-                    "pricing_note": (
-                        f"Cost estimated using us-central1 on-demand baseline; "
-                        f"actual job is in {location}"
-                        + (
-                            " — pricing is likely similar"
-                            if location.startswith("us-")
-                            else " — regional pricing may differ significantly"
-                        )
-                    ),
                 },
             )
         )
@@ -693,7 +520,8 @@ def find_long_running_vertex_training_jobs(
     if skipped_jobs > 0:
         warnings.warn(
             f"gcp.vertex.training_job.long_running: {skipped_jobs} job(s) skipped "
-            f"due to missing or unparseable timestamps — findings may be incomplete",
+            "due to malformed resource name, unexpected state, or unusable startTime "
+            "— findings may be incomplete",
             stacklevel=2,
         )
 
@@ -728,11 +556,24 @@ def _paginate(url: str) -> Optional[list]:
                 raise PermissionError(
                     f"aiplatform.{resource}.list permission required " f"(roles/aiplatform.viewer)"
                 )
-            if resp.status_code == 404:
-                return []
-            if resp.status_code == 400:
-                return None  # signal caller to try fallback
-            resp.raise_for_status()
+            if not resp.ok:
+                if results:
+                    # Later-page failure: keep earlier pages, warn (spec 11.3).
+                    # Treat identically to a non-permission surface failure so the
+                    # caller can decide whether to continue with the other surface.
+                    warnings.warn(
+                        f"gcp.vertex.training_job.long_running: {resource} pagination "
+                        f"failed mid-scan (HTTP {resp.status_code}) — "
+                        "partial page results kept; findings may be incomplete",
+                        stacklevel=4,
+                    )
+                    return results
+                # First-page failures:
+                if resp.status_code == 404:
+                    return []  # API not enabled — not an error
+                if resp.status_code == 400:
+                    return None  # wildcard unsupported — signal caller for fallback
+                resp.raise_for_status()  # propagate other first-page errors
             data = resp.json()
             results.extend(data.get(resource, []))
             next_token = data.get("nextPageToken")
@@ -741,8 +582,6 @@ def _paginate(url: str) -> Optional[list]:
             params["pageToken"] = next_token
         return results
 
-    # Fast path: wildcard covers all regions in one paginated sequence.
-    # Skip if we already know this project+resource combination doesn't support it.
     cache_key = (project_id, resource)
     if cache_key not in _wildcard_unsupported:
         result = _paginate(f"{base_url}/-/{resource}")
@@ -750,7 +589,6 @@ def _paginate(url: str) -> Optional[list]:
             return result
         _wildcard_unsupported.add(cache_key)
 
-    # Fallback: per-location queries
     all_jobs: list = []
     seen: set = set()
     for location in _VERTEX_LOCATIONS:
@@ -765,16 +603,6 @@ def _paginate(url: str) -> Optional[list]:
     return all_jobs
 
 
-def _parse_location(name: str) -> str:
-    """Extract location from resource name: projects/{p}/locations/{loc}/.../{id}"""
-    parts = name.split("/")
-    try:
-        idx = parts.index("locations")
-        return parts[idx + 1]
-    except (ValueError, IndexError):
-        return ""
-
-
 def _tpu_topology_host_count(machine_type: str, topology: str) -> int:
     """
     Compute the number of physical TPU hosts implied by tpuTopology.
@@ -787,7 +615,7 @@ def _tpu_topology_host_count(machine_type: str, topology: str) -> int:
         chips_per_host = _BUNDLED_ACCELERATOR_COUNT[machine_type]
         hosts = max(1, total_chips // chips_per_host)
 
-    Returns 0 when topology is empty or unparseable — callers fall back to replicaCount.
+    Returns 0 when topology is empty or unparseable -- callers fall back to replicaCount.
     """
     if not topology:
         return 0
@@ -806,7 +634,6 @@ def _tpu_topology_host_count(machine_type: str, topology: str) -> int:
     if chips_per_host <= 0:
         # Fallback: parse the -Nt suffix common to all Cloud TPU machine names.
         # e.g. "tpu7x-standard-4t" → suffix "4t" → 4 chips/host.
-        # This handles future variants automatically without requiring a table entry.
         suffix = (machine_type or "").rsplit("-", 1)[-1]
         if suffix.endswith("t") and suffix[:-1].isdigit():
             chips_per_host = int(suffix[:-1])
@@ -822,38 +649,55 @@ def _tpu_topology_host_count(machine_type: str, topology: str) -> int:
 
 def _parse_worker_pools(
     worker_pool_specs: list,
-) -> List[Tuple[Optional[str], Optional[str], int, int]]:
+) -> List[Tuple[Optional[str], Optional[str], int, int, Optional[str]]]:
     """
     Parse per-pool hardware specs from a CustomJob or TrainingPipeline.
 
-    Returns a list of (machine_type, accel_type, accel_count, replica_count) tuples,
-    one per pool. The first element is the primary (chief) pool.
+    Returns a list of (machine_type, accel_type, accel_count, replica_count, tpu_topology)
+    tuples, one per pool. The first element is the primary (chief) pool.
+
+    Returns [] when no specs are provided, or when all entries are malformed.
 
-    Returns [] when no specs are provided; callers should apply defaults in that case.
-    Cost must be summed across all pools — do not use primary pool × total_replicas,
-    as secondary pools often have different (and more expensive) machine types.
+    Per spec 8.1 and 8.2: `machineType` is required in a pool entry for it to be
+    structurally valid.  Entries missing `machineType`, or entries that cannot be
+    parsed (wrong-typed or non-numeric field values), are silently skipped rather
+    than making the whole resource ineligible.
 
     TPU topology: for TPU machine types (ct5lp-*, ct6e-*, tpu7x-*, etc.), replicaCount
     is always 1 in the API even for multi-host pods. tpuTopology encodes the actual
-    chip grid; this function replaces replicaCount with the derived host count so that
-    _total_hourly_rate() correctly prices the whole pod.
+    chip grid; this function replaces replicaCount with the derived host count.
+    tpu_topology is stored in the tuple so callers never need to re-index into the
+    original raw specs list (which may have different indices after malformed entries
+    are filtered).
     """
     pools = []
     for pool in worker_pool_specs:
-        machine_spec = pool.get("machineSpec", {})
-        replicas = max(1, int(pool.get("replicaCount", 1)))
-        machine = machine_spec.get("machineType") or None
-        accel = machine_spec.get("acceleratorType") or None
-        count = int(machine_spec.get("acceleratorCount", 0))
-
-        # For TPU machines replicaCount is always 1; derive real host count from topology.
-        if machine and _is_tpu_machine(machine):
-            topology = machine_spec.get("tpuTopology") or ""
-            host_count = _tpu_topology_host_count(machine, topology)
-            if host_count > 0:
-                replicas = host_count
-
-        pools.append((machine, accel, count, replicas))
+        try:
+            if not isinstance(pool, dict):
+                continue
+            machine_spec = pool.get("machineSpec") or {}
+            if not isinstance(machine_spec, dict):
+                continue
+            # machineType is required for a structurally valid pool (spec 8.1, 8.2)
+            machine = (machine_spec.get("machineType") or "").strip() or None
+            if not machine:
+                continue
+            replicas = max(1, int(pool.get("replicaCount") or 1))
+            accel = (machine_spec.get("acceleratorType") or "").strip() or None
+            count = int(machine_spec.get("acceleratorCount") or 0)
+
+            tpu_topo: Optional[str] = None
+            if _is_tpu_machine(machine):
+                tpu_topo = (machine_spec.get("tpuTopology") or "").strip() or None
+                if tpu_topo:
+                    host_count = _tpu_topology_host_count(machine, tpu_topo)
+                    if host_count > 0:
+                        replicas = host_count
+
+            pools.append((machine, accel, count, replicas, tpu_topo))
+        except (AttributeError, OverflowError, TypeError, ValueError):
+            # Malformed pool entry (wrong-typed/non-numeric field): skip it (spec 8.1, 8.2)
+            continue
     return pools
 
 
@@ -878,113 +722,31 @@ def _is_bundled_machine(machine_type: Optional[str]) -> bool:
     Return True if the machine type has accelerator cost bundled (no separate add-on).
 
     Covers GPU machine families (a2-*, a3-*, a4-*, a4x-*, g2-*, g4-*) and
-    Cloud TPU machine types (ct5lp-*, ct5p-*, ct6e-*, etc.) that expose TPU
-    via machineType + tpuTopology rather than acceleratorType.
+    Cloud TPU machine types that expose TPU via machineType + tpuTopology.
     """
     m = machine_type or ""
     return m.startswith(_BUNDLED_GPU_PREFIXES) or _is_tpu_machine(machine_type)
 
 
 def _has_accelerator_hardware(
-    pools: List[Tuple[Optional[str], Optional[str], int, int]],
+    pools: List[Tuple[Optional[str], Optional[str], int, int, Optional[str]]],
 ) -> bool:
     """
     Return True if any worker pool uses GPU or TPU accelerator hardware.
 
-    Detects accelerators via two structured paths:
-    - Explicit accelerator type in _ACCELERATOR_TYPES (GPU families and TPU pods via add-on)
-    - _is_bundled_machine(m): covers _BUNDLED_GPU_PREFIXES (a2-*, a3-*, a4-*, a4x-*, g2-*, g4-*)
-      and _is_tpu_machine() (ct4-*/ct5*/ct6*/ct7*/tpu*)
+    Two independent detection paths (spec 8.1):
+    - Explicit path: upper-cased acceleratorType in _ACCELERATOR_TYPES AND acceleratorCount > 0
+    - Bundled path: _is_bundled_machine(machine_type) is True (GPU or TPU machine family)
 
+    acceleratorType alone with count == 0 does NOT classify a pool as accelerated.
     Empty pools → False. Unknown hardware does NOT imply accelerated workload.
-    Relies on structured prefix lists only — no substring matching.
     """
     return any(
-        (a or "").upper() in _ACCELERATOR_TYPES or _is_bundled_machine(m) for m, a, c, r in pools
+        ((a or "").upper() in _ACCELERATOR_TYPES and c > 0) or _is_bundled_machine(m)
+        for m, a, c, r, *_ in pools
     )
 
 
-def _estimate_hourly_rate_per_replica(
-    machine_type: Optional[str],
-    accel_type: Optional[str],
-    accel_count: int,
-) -> float:
-    """
-    Estimate hourly cost for a single replica (one worker node).
-
-    Bundled families (a2-*, a3-*, a4-*, a4x-*, g2-*, g4-*, ct*/tpu7x-*) include accelerator
-    cost in the machine price. n1-*/n2-*/c2-* add accelerator cost separately.
-
-    Co-scheduling (bundled machines only): when accel_count <= N//2 (where N is the machine's
-    full accelerator count from _BUNDLED_ACCELERATOR_COUNT), Vertex AI may place
-    floor(N/accel_count) replicas onto one VM. In that case each replica shares the machine
-    cost proportionally — machine_hourly is divided by replicas_per_vm. When accel_count is 0
-    or unknown, the full machine price is charged conservatively.
-    """
-    # For unrecognized TPU machine types use the TPU-specific default to avoid the
-    # generic $150/mo fallback massively underestimating a real TPU job.
-    _mt = machine_type or ""
-    if _mt in _MACHINE_MONTHLY_COST:
-        machine_monthly = _MACHINE_MONTHLY_COST[_mt]
-    elif _is_tpu_machine(_mt) or "tpu" in _mt.lower():
-        # Second condition is a defensive catch for future TPU naming patterns that
-        # _is_tpu_machine() might miss — avoids silent 70× underestimate vs generic $150/mo.
-        machine_monthly = _DEFAULT_TPU_MONTHLY_COST
-    else:
-        machine_monthly = _DEFAULT_MACHINE_MONTHLY_COST
-    machine_hourly = machine_monthly / _HOURS_PER_MONTH
-
-    # Co-scheduling correction for bundled machines.
-    # Only applies when accel_count divides machine_gpu_count evenly (clean partition)
-    # and accel_count <= machine_gpu_count (requesting more GPUs than exist is invalid).
-    if _is_bundled_machine(machine_type) and accel_count >= 1:
-        machine_gpu_count = _BUNDLED_ACCELERATOR_COUNT.get(machine_type or "", 0)
-        if (
-            machine_gpu_count > 0
-            and accel_count <= machine_gpu_count
-            and machine_gpu_count % accel_count == 0
-        ):
-            replicas_per_vm = max(1, machine_gpu_count // accel_count)
-            machine_hourly = machine_hourly / replicas_per_vm
-
-    accelerator_hourly = 0.0
-    if accel_type and accel_type in _ACCELERATOR_MONTHLY_COST_EACH:
-        if not _is_bundled_machine(machine_type):
-            accelerator_hourly = (
-                _ACCELERATOR_MONTHLY_COST_EACH[accel_type] / _HOURS_PER_MONTH
-            ) * max(accel_count, 1)
-
-    return machine_hourly + accelerator_hourly
-
-
-def _pricing_confidence(
-    pools: List[Tuple[Optional[str], Optional[str], int, int]],
-) -> str:
-    """
-    Return "published" if all machine types and accelerators in the pool list have
-    published GCP pricing, otherwise "partial_estimate".
-    """
-    for m, a, _c, _r in pools:
-        mt = m or ""
-        if mt.startswith(_PRICING_ESTIMATED_MACHINE_PREFIXES):
-            return "partial_estimate"
-        if (a or "").upper() in _PRICING_ESTIMATED_ACCEL_TYPES:
-            return "partial_estimate"
-    return "published"
-
-
-def _total_hourly_rate(
-    pools: List[Tuple[Optional[str], Optional[str], int, int]],
-) -> float:
-    """
-    Sum hourly burn rate across all worker pools.
-
-    Each pool contributes _estimate_hourly_rate_per_replica × replica_count.
-    Correctly handles heterogeneous jobs (different machine types per pool).
-    """
-    return sum(_estimate_hourly_rate_per_replica(m, a, c) * r for m, a, c, r in pools)
-
-
 def _hardware_label(
     machine_type: Optional[str],
     accel_type: Optional[str],
@@ -992,19 +754,14 @@ def _hardware_label(
     total_replicas: int,
     tpu_topology: Optional[str] = None,
 ) -> str:
-    """Build a compact hardware label for title/summary.
-
-    For TPU machines, tpu_topology (e.g. "2x4") is appended when non-empty
-    because the machine name alone (e.g. "ct5lp-hightpu-8t") does not convey
-    the full chip grid or host count.
-    """
+    """Build a compact hardware label for title/summary."""
     parts = []
     if machine_type:
         label = machine_type
         if tpu_topology and _is_tpu_machine(machine_type):
             label = f"{machine_type} [{tpu_topology}]"
         parts.append(label)
-    if accel_type and accel_type != "ACCELERATOR_TYPE_UNSPECIFIED":
+    if accel_type and accel_type != "ACCELERATOR_TYPE_UNSPECIFIED" and accel_count > 0:
         count_str = f"{accel_count}×" if accel_count > 1 else ""
         parts.append(f"{count_str}{accel_type}")
     if total_replicas > 1:
diff --git a/docs/rules/gcp.md b/docs/rules/gcp.md
index 2e0ae3d..6360e10 100644
--- a/docs/rules/gcp.md
+++ b/docs/rules/gcp.md
@@ -194,17 +194,24 @@
 **Spec:** —
 
 #### `gcp.vertex.training_job.long_running`
-**Detects:** Vertex AI CustomJobs and TrainingPipelines in `RUNNING` state beyond `long_running_hours_threshold`; GPU/TPU jobs near threshold also trigger early-warning findings
+**Detects:** Vertex AI CustomJobs and TrainingPipelines whose state is exactly the expected running state (`JOB_STATE_RUNNING` / `PIPELINE_STATE_RUNNING`) and whose elapsed wall-clock time since `startTime` meets or exceeds `long_running_hours_threshold`
 
-**Confidence / Risk:** HIGH (duration ≥ 3× threshold — clearly runaway); MEDIUM (duration ≥ threshold) / CRITICAL (HIGH confidence + GPU/accelerator); HIGH (HIGH confidence + non-GPU); MEDIUM (all MEDIUM confidence)
+**Confidence / Risk:** HIGH (duration ≥ 3× threshold — clearly runaway); MEDIUM (duration ≥ threshold) / CRITICAL (HIGH confidence + GPU/TPU/accelerator); HIGH (HIGH confidence + non-accelerator); MEDIUM (all MEDIUM confidence)
+
+**Cost:** `estimated_monthly_cost_usd = None` — training jobs are transient; no static per-hour rate is appropriate across machine types and regions
 
 **Permissions:** `aiplatform.customJobs.list`, `aiplatform.trainingPipelines.list` (roles/aiplatform.viewer)
 
-**Params:** `long_running_hours_threshold` (default: 24); `expensive_hourly_threshold` (default: $20/hr, for early-warning CPU jobs)
+**Params:** `long_running_hours_threshold` (default: 24)
 
-**Exclusions:** jobs < 90% of threshold; cheap CPU-only jobs in the 90–100% early-warning zone
+**Exclusions:**
+- resource name not matching the exact pattern `projects/{p}/locations/{l}/customJobs/{id}` or `projects/{p}/locations/{l}/trainingPipelines/{id}` (6 segments, all components non-empty)
+- state field absent or not exactly the expected running state for the job type
+- `startTime` absent, not valid RFC 3339 (a space separator, a date-only value, or a missing timezone is rejected), unparseable, or in the future
+- elapsed < `long_running_hours_threshold`
+- region filter set and derived location does not exactly match
 
-**Spec:** —
+**Spec:** [docs/specs/gcp/ai/vertex_training_job_long_running.md](../specs/gcp/ai/vertex_training_job_long_running.md)
 
 #### `gcp.tpu.idle`
 **Detects:** Standalone Cloud TPU nodes in exact `READY` state where complete worker-joined duty-cycle telemetry (`tpu.googleapis.com/accelerator/duty_cycle` on `tpu.googleapis.com/GceTpuWorker`) confirms max observed duty cycle <= 2% across all joined workers and accelerators over the full buffered `idle_days` window; monitoring is required — no age-only, partial-join, or cadence-assumed fallback
diff --git a/docs/specs/gcp/ai/vertex_training_job_long_running.md b/docs/specs/gcp/ai/vertex_training_job_long_running.md
new file mode 100644
index 0000000..dff671e
--- /dev/null
+++ b/docs/specs/gcp/ai/vertex_training_job_long_running.md
@@ -0,0 +1,508 @@
+# GCP Rule Spec - `gcp.vertex.training_job.long_running`
+
+## 1. Rule Identity
+
+- **Rule ID:** `gcp.vertex.training_job.long_running`
+- **Provider:** GCP
+- **Resource type:** Vertex AI training job
+- **Finding resource_type:** `gcp.vertex.training_job`
+
+---
+
+## 2. Intent
+
+Detect **Vertex AI training resources that are provably still in an exact documented running state** and whose documented `startTime` shows they have been running for at least a conservative review threshold.
+
+This rule is deliberately **precision-first**. It is a **review-candidate** rule only. It is **not** proof that a job is hung, **not** proof that no useful progress is occurring, **not** proof that the resource is safe to cancel, and **not** proof of a specific monthly dollar saving.
+
+### 2.1 Canonical definitions
+
+| Term | Definition |
+|---|---|
+| Vertex training job | Either a Vertex AI `CustomJob` or a Vertex AI `TrainingPipeline` |
+| running custom job | A `CustomJob` whose `state` is exactly `JOB_STATE_RUNNING` |
+| running training pipeline | A `TrainingPipeline` whose `state` is exactly `PIPELINE_STATE_RUNNING` |
+| scan clock | Single `now_utc` instant captured once per scan run and reused for all resources |
+| runtime anchor | The documented `startTime` field of the resource |
+| elapsed runtime hours | `(now_utc - start_time_utc)` expressed in hours |
+| elapsed runtime seconds | `(now_utc - start_time_utc)` expressed in seconds |
+| long-running threshold hours | Configured review threshold for this rule (`long_running_hours_threshold`); default `24` hours |
+| accelerator-backed job | A job whose documented worker-pool machine spec explicitly shows accelerator hardware |
+| hardware unknown | A job for which the control-plane response does not expose enough documented machine-spec data to classify hardware |
+
+---
+
+## 3. GCP Documentation Grounding
+
+### 3.1 CustomJob is the canonical Vertex AI resource for custom training workloads
+
+Google documents `CustomJob` as a resource that runs custom workloads such as a Docker container or a Python package. Google also documents:
+
+1. `jobSpec`
+2. `state`
+3. `createTime`
+4. `startTime`
+5. `endTime`
+6. `updateTime`
+
+Google explicitly defines `CustomJob.startTime` as the time when the `CustomJob` **for the first time entered** `JOB_STATE_RUNNING`.
+
+Source:
+
+- *REST Resource: projects.locations.customJobs*
+
+URL:
+
+- https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.customJobs
+
+Rule consequence:
+
+1. `CustomJob` is an in-scope training resource for this rule.
+2. `startTime` is the canonical runtime anchor for `CustomJob`.
+3. `createTime` is **not** the canonical runtime anchor for a running job.
+
+### 3.2 TrainingPipeline is also an in-scope training resource, but it is an orchestrator
+
+Google documents `TrainingPipeline` as a resource that **orchestrates tasks associated with training a Model** and **always executes the training task**, while it may also export dataset data, upload the model, and evaluate the model.
+
+Google also documents:
+
+1. `trainingTaskDefinition`
+2. `trainingTaskInputs`
+3. `trainingTaskMetadata`
+4. `state`
+5. `createTime`
+6. `startTime`
+7. `endTime`
+8. `updateTime`
+
+Google explicitly defines `TrainingPipeline.startTime` as the time when the pipeline **for the first time entered** `PIPELINE_STATE_RUNNING`.
+
+Google also documents that `trainingTaskMetadata` is populated only on a **best effort basis** while the pipeline is running.
+
+Source:
+
+- *REST Resource: projects.locations.trainingPipelines*
+
+URL:
+
+- https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.trainingPipelines
+
+Rule consequence:
+
+1. `TrainingPipeline` is an in-scope training resource for this rule.
+2. `startTime` is the canonical runtime anchor for `TrainingPipeline`.
+3. `trainingTaskMetadata` must not be treated as canonical proof of runtime, progress, or hardware shape.
+4. A `TrainingPipeline` finding remains review-candidate only because the resource may also be orchestrating non-training auxiliary tasks.
+
+### 3.3 Exact running-state enums are documented
+
+Google documents:
+
+1. `JOB_STATE_RUNNING` means **the job is in progress**
+2. `PIPELINE_STATE_RUNNING` means **the pipeline is in progress**
+3. queued, pending, paused, updating, cancelling, cancelled, failed, and succeeded are distinct states, separate from running
+
+Sources:
+
+- *JobState*
+- *PipelineState*
+
+URLs:
+
+- https://cloud.google.com/vertex-ai/docs/reference/rest/v1/JobState
+- https://cloud.google.com/vertex-ai/docs/reference/rest/v1/PipelineState
+
+Rule consequence:
+
+1. Eligibility must require exact documented running states only.
+2. The rule must not treat queued, pending, paused, updating, cancelling, cancelled, failed, or succeeded resources as running.
+
+### 3.4 Worker-pool machine shape is documented for CustomJob
+
+Google documents `CustomJobSpec.workerPoolSpecs` and `WorkerPoolSpec.machineSpec`.
+
+Google documents on these surfaces:
+
+1. `workerPoolSpecs`
+2. `replicaCount`
+3. `machineSpec.machineType`
+4. `machineSpec.acceleratorType`
+5. `machineSpec.acceleratorCount`
+6. `machineSpec.tpuTopology`
+
+Sources:
+
+- *CustomJobSpec*
+- *MachineSpec*
+
+URLs:
+
+- https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec
+- https://cloud.google.com/vertex-ai/docs/reference/rest/v1/MachineSpec
+
+Rule consequence:
+
+1. CustomJob hardware classification may be based on documented worker-pool machine-spec fields.
+2. TPU-backed training may be identified from documented machine-spec fields such as TPU machine types and `tpuTopology`.
+3. Hardware evidence must come from documented structured machine-spec fields, not from name heuristics outside those documented surfaces.
+
+### 3.5 TrainingPipeline hardware exposure is task-definition dependent
+
+Google documents:
+
+1. `trainingTaskDefinition` points to the YAML definition of the training task
+2. `trainingTaskInputs` contains the training task parameters **as specified by that definition**
+
+Source:
+
+- *REST Resource: projects.locations.trainingPipelines*
+
+URL:
+
+- https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.trainingPipelines
+
+Rule consequence:
+
+1. Hardware classification for `TrainingPipeline` is optional and definition-dependent.
+2. If the pipeline response does not expose documented worker-pool machine-spec fields through `trainingTaskInputs`, hardware must remain unknown.
+3. The rule must not guess GPU, TPU, machine type, or replica count for `TrainingPipeline` resources whose task inputs do not expose those fields.
+
+### 3.6 Vertex AI training pricing is usage-based and configuration-specific
+
+Google documents that:
+
+1. for custom-trained models, training prices depend on the selected machine types
+2. if Compute Engine machine types have attached accelerators, accelerator cost is separate unless included in the machine type
+3. pricing varies by region
+4. reservations, committed use discounts, and Spot usage can change effective cost
+5. there is **no minimum usage duration** for training and prediction; usage is charged in **30 second increments**
+
+Source:
+
+- *Vertex AI pricing*
+
+URL:
+
+- https://cloud.google.com/vertex-ai/pricing
+
+Rule consequence:
+
+1. Long-running training is a valid cost-review candidate because training compute is usage-billed while it runs.
+2. Static hardcoded pricing tables are not canonical rule logic.
+3. `estimated_monthly_cost_usd` must remain `None` because training jobs are transient, not recurring monthly resources.
+4. The rule must not rely on region-agnostic or stale price heuristics for eligibility.
+
+### 3.7 Vertex AI locations are regional, not global
+
+Google documents that Vertex AI does not support a global location and uses regional resource names and regional service endpoints.
+
+Source:
+
+- *Vertex AI locations*
+
+URL:
+
+- https://cloud.google.com/vertex-ai/docs/general/locations
+
+Rule consequence:
+
+1. Location must be derived from the resource name.
+2. Region filters must compare against exact regional location values.
+
+---
+
+## 4. Detection Goal
+
+Emit a finding only when **all** of the following are true:
+
+1. the resource is a documented in-scope Vertex AI training resource (`CustomJob` or `TrainingPipeline`)
+2. the resource is in an exact documented running state
+3. the resource has a valid, parseable, non-future `startTime`
+4. the derived elapsed runtime is at least `long_running_hours_threshold`
+
+If any required signal cannot be established reliably, skip rather than emit.
+
+---
+
+## 5. Non-Goals
+
+This rule does **not** attempt to prove:
+
+- that the training job is hung or deadlocked
+- that the training job is abandoned or forgotten
+- that the job is safe to cancel
+- that no checkpointing or useful progress is occurring
+- that the job is definitely expensive
+- that a specific monthly saving exists
+
+---
+
+## 6. Canonical Inputs
+
+### 6.1 Required surfaces
+
+The implementation may use the following documented APIs:
+
+1. `projects.locations.customJobs.list`
+2. `projects.locations.trainingPipelines.list`
+
+Relevant list-filter capability documented by Google:
+
+1. CustomJobs support filtering by `state`
+2. TrainingPipelines support filtering by `state`
+3. paginated results must be exhausted using `nextPageToken`
+
+Sources:
+
+- https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.customJobs/list
+- https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.trainingPipelines/list
+
+### 6.2 Required per-resource fields
+
+| Resource type | Required fields |
+|---|---|
+| `CustomJob` | `name`, `state`, `startTime` |
+| `TrainingPipeline` | `name`, `state`, `startTime` |
+
+### 6.3 Optional context fields
+
+These may enrich the finding when present, but are not required for eligibility:
+
+- `displayName`
+- `jobSpec.workerPoolSpecs` on `CustomJob`
+- `trainingTaskDefinition` on `TrainingPipeline`
+- `trainingTaskInputs` on `TrainingPipeline`
+- `labels`
+
+---
+
+## 7. Canonical normalization rules
+
+Normalize the following values:
+
+| Field | Canonical rule |
+|---|---|
+| `resource_name` | Must exactly match one of these forms: `projects/{project}/locations/{location}/customJobs/{id}` or `projects/{project}/locations/{location}/trainingPipelines/{id}`. Otherwise skip. |
+| `location` | Parse from the exact `locations/{location}` segment of the resource name. Region-filter comparison must use exact string equality only, with no aliasing or case folding. |
+| `state` | Compare exactly to the documented running enum for the resource type, case-sensitive and with no normalization. Null or empty values skip. |
+| `start_time_utc` | Parse documented RFC3339 `startTime` into timezone-aware UTC. Valid RFC3339 timestamps, including fractional seconds and either `Z` or explicit offsets, must be accepted. Any other format is invalid. Missing, unparsable, or future values skip. No fallback parsing is allowed. |
+| `elapsed_runtime_seconds` | Compute from a single per-run `now_utc - start_time_utc`. Do not round for eligibility decisions. |
+| `elapsed_runtime_hours` | Derived display/context form of elapsed runtime. It must not be the canonical comparison unit. |
+
+Important:
+
+1. `createTime` is context only; it must **not** replace `startTime` as the runtime anchor.
+2. `updateTime` is context only; it must **not** replace `startTime` as the runtime anchor.
+3. `endTime` is not relevant for resources still in running state.
+4. `now_utc` must be captured once per scan run in UTC and reused for all resources in that run.
+5. `now_utc` must not be recomputed, shifted, or otherwise adjusted mid-scan.
+
+---
+
+## 8. Hardware evidence rules
+
+### 8.1 CustomJob hardware classification
+
+For `CustomJob`, hardware may be classified from these documented `jobSpec.workerPoolSpecs[]` fields:
+
+1. `machineSpec.machineType`
+2. `machineSpec.acceleratorType`
+3. `machineSpec.acceleratorCount`
+4. `machineSpec.tpuTopology`
+5. `replicaCount` (a `WorkerPoolSpec` field, sibling of `machineSpec`)
+
+If `workerPoolSpecs` is missing, empty, or all entries are structurally invalid, the job must remain eligible on duration/state grounds but `hardware_unknown = true`.
+
+A pool entry is structurally valid only when `machineSpec.machineType` is present and non-empty. Pool entries without `machineSpec.machineType` are treated as malformed and must be skipped for hardware classification.
+
+If some worker-pool entries are partially malformed, those invalid pools should be ignored for hardware classification rather than making the whole job ineligible. Hardware remains based only on structurally valid documented pools.
+
+A `CustomJob` is accelerator-backed when **any** worker pool explicitly shows any of the following:
+
+1. `acceleratorType` is a recognized documented enum value **and** `acceleratorCount > 0`, or
+2. `machineType` is in a documented bundled-GPU machine family (e.g. `a2-*`, `a3-*`, `a4-*`, `a4x-*`, `g2-*`, `g4-*`) where the accelerator hardware is part of the machine type and no separate `acceleratorType` is required, or
+3. `machineType` is in a documented Cloud TPU machine family (e.g. `ct4-*`, `ct5*`, `ct6*`, `tpu7x-*`)
+
+`acceleratorType` alone with `acceleratorCount == 0` does **not** classify a pool as accelerator-backed.
+
+### 8.2 TrainingPipeline hardware classification
+
+For `TrainingPipeline`, hardware may be classified only when the response structurally exposes documented worker-pool machine-spec fields through `trainingTaskInputs` for that task definition.
+
+At minimum, the exposed structure must contain:
+
+1. the expected nested shape `trainingTaskInputs.workerPoolSpecs[].machineSpec`
+2. `machineType` within that nested `machineSpec`
+3. optionally `acceleratorType`, `acceleratorCount`, or `tpuTopology` within that same nested `machineSpec`
+
+Flat, renamed, or otherwise shape-incompatible fields must not be treated as equivalent.
+
+A `trainingTaskInputs.workerPoolSpecs[]` entry is structurally valid only when it contains a `machineSpec` dict with `machineType` present and non-empty. Entries without `machineType`, or entries that are not dicts, are treated as malformed and must be skipped for hardware classification.
+
+If some entries are partially malformed, those invalid entries should be ignored for hardware classification rather than making the whole resource ineligible. Hardware remains based only on structurally valid documented entries.
+
+If those fields are not exposed, then:
+
+1. `hardware_unknown = true`
+2. hardware class must remain unresolved
+3. the rule must not guess GPU, TPU, replica count, or machine type
+
+### 8.3 Hardware is auxiliary, not eligibility
+
+Hardware evidence may affect risk labeling or finding context, but it must **not** be required for the rule to emit.
+
+All worker pools that are exposed by the control-plane response must be evaluated. Accelerator classification must use **any** documented accelerator-backed pool, not only the first pool.
+
+---
+
+## 9. Decision rule
+
+### 9.1 Eligibility
+
+The resource is eligible only when:
+
+1. resource type is `CustomJob` or `TrainingPipeline`
+2. `state` is exactly:
+   - `JOB_STATE_RUNNING` for `CustomJob`, or
+   - `PIPELINE_STATE_RUNNING` for `TrainingPipeline`
+3. `start_time_utc` is valid
+4. `elapsed_runtime_seconds >= long_running_hours_threshold * 3600`
+
+Configuration requirement:
+
+1. `long_running_hours_threshold` must be `>= 1` (integer hours; equivalent to `> 0` for an integer parameter)
+2. invalid threshold configuration must fail fast rather than silently clamp or reinterpret the value
+
+### 9.2 Confidence
+
+Confidence is a product policy, not a Google-defined concept:
+
+1. `MEDIUM` when `elapsed_runtime_seconds >= long_running_hours_threshold * 3600` and the `HIGH` condition below is not met
+2. `HIGH` when `elapsed_runtime_seconds >= 3 * long_running_hours_threshold * 3600`
+
+### 9.3 Risk
+
+Risk is a product policy and may use documented hardware evidence when available:
+
+1. `CRITICAL` when confidence is `HIGH` and the job is provably accelerator-backed
+2. `HIGH` when confidence is `HIGH` and accelerator hardware is not proven
+3. `MEDIUM` for all `MEDIUM` confidence findings
+
+### 9.4 Explicitly forbidden heuristics
+
+The rule must **not**:
+
+- emit below the configured long-running threshold
+- emit a sub-threshold GPU or TPU "early warning"
+- use `createTime` as a fallback runtime anchor
+- use hardcoded hourly-price thresholds as an eligibility gate
+- infer accelerator hardware when machine-spec evidence is absent
+
+---
+
+## 10. Cost handling
+
+### 10.1 Canonical monthly cost field
+
+`estimated_monthly_cost_usd = None`
+
+Reason:
+
+1. training jobs are transient, not monthly recurring resources
+2. pricing varies by region, machine type, accelerator shape, reservations, discounts, and Spot usage
+3. eligibility does not depend on cost
+
+### 10.2 Accrued-cost estimates
+
+The canonical rule does **not** require any accrued-cost calculation.
+
+If a future implementation chooses to surface an accrued-cost hint, it must:
+
+1. be clearly labeled non-canonical advisory context
+2. use authoritative current pricing inputs for the exact region and hardware configuration
+3. never affect eligibility, confidence, or risk
+
+Static price tables and placeholder cost tiers are out of scope for the canonical rule.
+
+---
+
+## 11. Failure behavior
+
+Always skip:
+
+- empty resource names
+- resource names that do not exactly match the documented 6-segment pattern for the resource type (extra segments, wrong resource-type keyword, empty segments all skip)
+- `state` absent, empty, or not exactly equal to the documented running enum for the resource type
+- `startTime` absent, not strict RFC3339 (space separator, no timezone offset, date-only, etc.), unparsable, or future
+- elapsed runtime below threshold
+
+Operational behavior:
+
+1. permission errors on a required list surface should propagate
+2. a non-permission fetch failure on one independent surface (`customJobs` or `trainingPipelines`) may warn and continue with the other surface
+3. if pagination fails on a later page of one surface, earlier successfully fetched pages from that same surface may still be kept, but the partial read must be treated as a non-permission failure and warned
+4. if both independent surfaces fail with non-permission errors, the rule returns no findings and should warn that results are incomplete
+5. the rule must not synthesize findings from a surface it failed to read
+6. no cross-resource dedupe is required; each `CustomJob` or `TrainingPipeline` resource is evaluated independently
+
+---
+
+## 12. Output contract
+
+### 12.1 Required finding fields
+
+| Field | Value |
+|---|---|
+| `provider` | `gcp` |
+| `rule_id` | `gcp.vertex.training_job.long_running` |
+| `resource_type` | `gcp.vertex.training_job` |
+| `resource_id` | Full Vertex AI resource name |
+| `region` | Parsed resource location |
+| `estimated_monthly_cost_usd` | `None` |
+
+Identity rules:
+
+1. `resource_id` is the canonical full resource name
+2. `display_name` is optional context only and must not replace canonical identity
+
+### 12.2 Required decision facts in details or evidence
+
+The finding should surface, when available:
+
+1. `job_type` (`customJob` or `trainingPipeline`)
+2. exact running state
+3. `startTime`
+4. elapsed runtime hours
+5. threshold hours
+6. hardware evidence, if explicitly exposed
+7. whether hardware is unknown
+
+---
+
+## 13. Examples of resources that must skip
+
+- a `CustomJob` in `JOB_STATE_PENDING`
+- a `TrainingPipeline` in `PIPELINE_STATE_QUEUED`
+- a `CustomJob` whose `name` is `projects/p/locations/us-central1/customJobs/123/extra` (seven segments, not six)
+- a `CustomJob` whose `name` is `projects/p/locations/us-central1/models/123` (wrong resource-type segment)
+- a running resource with missing `startTime`
+- a running resource whose `startTime` is `2025-06-01 12:00:00Z` (space separator — not RFC3339)
+- a running resource whose `startTime` is `2025-06-01T12:00:00` (no timezone offset — not RFC3339)
+- a running resource whose `startTime` is unparsable
+- a running resource whose elapsed runtime is 23.9h when threshold is 24h
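+- (hardware classification only, not the finding) a `TrainingPipeline` whose task inputs do not expose worker-pool machine specs — per 8.2 and 8.3 it remains eligible on state/duration grounds with `hardware_unknown = true`, and the implementation must not use guessed fields to infer cost or accelerator class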
+
+---
+
+## 14. Summary
+
+This is a **duration-first Vertex AI training review rule**:
+
+1. scope to resources whose name exactly matches the documented Vertex AI resource-name pattern
+2. require exact state enum match read from the resource, not inferred from the list filter alone
+3. anchor runtime strictly to documented `startTime` (RFC3339 only; no fallback parsing)
+4. classify hardware from documented machine-spec fields only: explicit acceleratorType + count, bundled-GPU machine families, or TPU machine families
+5. require `machineType` to be present for a pool entry to contribute to hardware classification
+6. avoid sub-threshold warning heuristics
+7. avoid pricing heuristics in canonical detection
diff --git a/tests/cleancloud/providers/gcp/ai/test_gcp_vertex_training_job_long_running.py b/tests/cleancloud/providers/gcp/ai/test_gcp_vertex_training_job_long_running.py
index 0935d11..4f0eb26 100644
--- a/tests/cleancloud/providers/gcp/ai/test_gcp_vertex_training_job_long_running.py
+++ b/tests/cleancloud/providers/gcp/ai/test_gcp_vertex_training_job_long_running.py
@@ -5,23 +5,31 @@
 - Core detection: CPU job over threshold (MEDIUM/MEDIUM), GPU job over threshold (MEDIUM)
 - Runaway (3× threshold): HIGH confidence, CRITICAL for GPU, HIGH for CPU
 - Risk model: GPU+HIGH→CRITICAL, CPU+HIGH→HIGH, MEDIUM confidence→MEDIUM regardless of GPU
-- Early warning: GPU job at 90–100% of threshold only (not 75%)
-- Noise reduction: GPU job at 75–89% of threshold does NOT fire
-- TrainingPipeline resource type: attempts trainingTaskInputs parsing; conservative fallback
+- Below-threshold jobs: no finding emitted (spec 9.4 — no sub-threshold early warnings)
+- TrainingPipeline resource type: parses trainingTaskInputs; hardware_unknown when absent
 - TrainingPipeline with workerPoolSpecs in trainingTaskInputs: uses parsed hardware
-- TrainingPipeline with no hardware spec: is_gpu=False (hardware_unknown=True), conservative duration-tiered fallback cost
-- No findings: job below 90% of threshold (CPU or GPU)
-- Region filter: jobs outside filter are skipped
-- Location fallback: malformed name → region="unknown"
+- TrainingPipeline with no hardware spec: hardware_unknown=True, is_accelerator=False
+- No findings: job below threshold (CPU or GPU)
+- Region filter: exact string equality (spec 7) — no case folding
+- Invalid threshold (< 1): fail-fast with ValueError (spec 9.1)
+- startTime absence skips job; createTime NOT used as fallback (spec 9.4)
+- Future startTime skips job (spec 7)
+- Malformed name skips job (spec 7, 11)
 - Permission errors: PermissionError raised on 403
-- estimated_monthly_cost_usd is always None (transient job)
-- Per-pool cost: heterogeneous cluster cost sums all pools (not primary × total)
+- estimated_monthly_cost_usd is always None (transient job, spec 10.1)
+- No cost fields in details (accrued_cost_usd, burn_rate_per_hour, pricing_source, etc.)
+- Details: state field (exact running enum), start_time field (RFC3339 string)
 - Accelerator detection from _has_accelerator_hardware: accelerator type OR machine prefix
 - _parse_worker_pools: returns list of per-pool tuples; empty → []
-- _estimate_hourly_rate_per_replica: bundled vs additive GPU cost
-- _total_hourly_rate: sums across pools
 - _hardware_label: single worker, multi-worker, with accelerator
 - RULE_ID attribute
+- Exact resource-name pattern enforcement (spec 7): extra segments, wrong type segment, skipped
+- State validation: exact enum from resource, not synthesised; wrong/missing state → skip
+- CustomJob hardware_unknown=True when workerPoolSpecs is empty or all entries malformed
+- _parse_worker_pools: entries without machineType are skipped (spec 8.1, 8.2)
+- _parse_worker_pools: malformed (non-dict, bad replicaCount/acceleratorCount) entries skipped
+- RFC3339 strictness: space separator, date-only, no-tz values all rejected
+- Partial pagination: later-page failure keeps accumulated pages and warns (spec 11.3)
 """
 
 from datetime import datetime, timedelta, timezone
@@ -34,20 +42,13 @@
 from cleancloud.providers.gcp.rules.ai.vertex_training_job_long_running import (
     _BUNDLED_ACCELERATOR_COUNT,
     _DEFAULT_LONG_RUNNING_HOURS,
-    _DEFAULT_MACHINE_MONTHLY_COST,
-    _DEFAULT_TPU_MONTHLY_COST,
-    _HOURS_PER_MONTH,
-    _MACHINE_MONTHLY_COST,
+    _EXPECTED_STATE,
     _RUNAWAY_MULTIPLIER,
-    _TPU_MACHINE_PREFIXES,
-    _estimate_hourly_rate_per_replica,
     _hardware_label,
     _has_accelerator_hardware,
-    _parse_location,
     _parse_worker_pools,
-    _pricing_confidence,
-    _total_hourly_rate,
     _tpu_topology_host_count,
+    _validate_resource_name,
     find_long_running_vertex_training_jobs,
 )
 
@@ -137,7 +138,6 @@ def _run(
     training_pipelines=None,
     region_filter=None,
     threshold=_THRESHOLD,
-    extra_kwargs=None,
 ):
     creds = MagicMock()
     session = _make_session(custom_jobs=custom_jobs, training_pipelines=training_pipelines)
@@ -154,8 +154,7 @@ def _run(
                 project_id=_PROJECT,
                 credentials=creds,
                 region_filter=region_filter,
-                long_running_hours=threshold,
-                **(extra_kwargs or {}),
+                long_running_hours_threshold=threshold,
             )
 
 
@@ -178,7 +177,6 @@ def test_cpu_job_over_threshold_medium_confidence():
     assert f.details["is_accelerator"] is False
     assert f.details["job_type"] == "customJob"
     assert f.details["duration_hours"] > _THRESHOLD
-    assert f.details["accrued_cost_usd"] > 0
     assert f.estimated_monthly_cost_usd is None
 
 
@@ -197,7 +195,7 @@ def test_gpu_job_over_threshold_medium_risk():
     assert len(findings) == 1
     f = findings[0]
     assert f.confidence == ConfidenceLevel.MEDIUM
-    assert f.risk == RiskLevel.MEDIUM  # not HIGH — see risk model
+    assert f.risk == RiskLevel.MEDIUM  # not HIGH — see risk model (spec 9.3)
     assert f.details["is_accelerator"] is True
     assert f.details["accelerator_type"] == "NVIDIA_TESLA_V100"
     assert f.details["accelerator_count"] == 2
@@ -235,33 +233,21 @@ def test_cpu_job_runaway_3x_high():
 
 
 # ---------------------------------------------------------------------------
-# Early warning (GPU only, 90% threshold)
+# Threshold behavior (spec 9.4: no sub-threshold early warnings)
 # ---------------------------------------------------------------------------
 
 
-def test_gpu_early_warning_at_90pct_threshold():
-    """GPU job at 92% of threshold triggers early warning."""
-    job = _custom_job(
-        "early",
-        "us-central1",
-        start_hours_ago=_THRESHOLD * 0.92,
-        accel_type="NVIDIA_TESLA_T4",
-        accel_count=1,
-    )
+def test_job_below_threshold_no_finding():
+    """No job type fires below the threshold (spec 9.4)."""
+    job = _custom_job("too-young", "us-central1", start_hours_ago=_THRESHOLD * 0.99)
     findings = _run(custom_jobs=[job])
-
-    assert len(findings) == 1
-    f = findings[0]
-    assert f.confidence == ConfidenceLevel.MEDIUM
-    assert f.risk == RiskLevel.MEDIUM
-    assert f.details["is_accelerator"] is True
-    assert f.details["overrun_hours"] == 0.0
+    assert findings == []
 
 
-def test_gpu_job_at_80pct_no_finding():
-    """GPU job at 80% of threshold does NOT fire (below _EARLY_WARNING_FRACTION=0.9)."""
+def test_gpu_job_below_threshold_no_finding():
+    """GPU job below threshold does NOT fire — no sub-threshold early warnings (spec 9.4)."""
     job = _custom_job(
-        "too-young",
+        "gpu-too-young",
         "us-central1",
         start_hours_ago=_THRESHOLD * 0.80,
         accel_type="NVIDIA_TESLA_T4",
@@ -271,92 +257,181 @@ def test_gpu_job_at_80pct_no_finding():
     assert findings == []
 
 
-def test_cpu_early_warning_not_emitted():
-    """CPU job at 92% of threshold produces no finding — early warning is GPU/TPU only."""
-    job = _custom_job("cpu-early", "us-central1", start_hours_ago=_THRESHOLD * 0.92)
+def test_job_at_exactly_threshold_fires():
+    """Job at exactly the threshold is in scope."""
+    job = _custom_job("exactly", "us-central1", start_hours_ago=_THRESHOLD)
     findings = _run(custom_jobs=[job])
-    assert findings == []
-
-
-def test_job_below_50pct_no_finding():
-    """No job type fires below _EARLY_WARNING_FRACTION."""
-    job = _custom_job(
-        "way-too-young",
-        "us-central1",
-        start_hours_ago=_THRESHOLD * 0.50,
-        accel_type="NVIDIA_TESLA_T4",
-        accel_count=1,
-    )
-    findings = _run(custom_jobs=[job])
-    assert findings == []
+    assert len(findings) == 1
+    assert findings[0].confidence == ConfidenceLevel.MEDIUM
 
 
 # ---------------------------------------------------------------------------
-# estimated_monthly_cost_usd
+# estimated_monthly_cost_usd and cost fields
 # ---------------------------------------------------------------------------
 
 
 def test_estimated_monthly_cost_always_none():
-    """Training jobs are transient; monthly cost field must be None."""
+    """Training jobs are transient; monthly cost field must be None (spec 10.1)."""
     job = _custom_job("j", "us-central1", start_hours_ago=_THRESHOLD + 10)
     findings = _run(custom_jobs=[job])
     assert findings[0].estimated_monthly_cost_usd is None
 
 
-def test_accrued_cost_populated():
-    """Accrued cost (duration × hourly rate) must be > 0 and in details."""
-    job = _custom_job(
-        "j2",
-        "us-central1",
-        start_hours_ago=_THRESHOLD + 1,
-        machine_type="n1-standard-8",
-        accel_type="NVIDIA_TESLA_T4",
-        accel_count=1,
-    )
+def test_no_accrued_cost_in_details():
+    """Removed pricing fields must not appear in finding details (spec 10.2)."""
+    job = _custom_job("j", "us-central1", start_hours_ago=_THRESHOLD + 5)
     findings = _run(custom_jobs=[job])
-    assert findings[0].details["accrued_cost_usd"] > 0
+    assert len(findings) == 1
+    details = findings[0].details
+    assert "accrued_cost_usd" not in details
+    assert "burn_rate_per_hour" not in details
+    assert "pricing_source" not in details
+    assert "pricing_confidence" not in details
+    assert "cost_type" not in details
+    assert "overrun_hours" not in details
 
 
 # ---------------------------------------------------------------------------
-# TrainingPipeline resource type
+# Spec-compliance: threshold validation, startTime, region filter
 # ---------------------------------------------------------------------------
 
 
-def test_training_pipeline_no_hardware_conservative_fallback():
-    """Pipeline with no hardware spec uses duration-scaled fallback cost.
+def test_threshold_less_than_1_raises_value_error():
+    """Invalid threshold (< 1) must fail fast with ValueError (spec 9.1)."""
+    creds = MagicMock()
+    with pytest.raises(ValueError, match="long_running_hours_threshold"):
+        find_long_running_vertex_training_jobs(
+            project_id=_PROJECT,
+            credentials=creds,
+            long_running_hours_threshold=0,
+        )
 
-    >24h → $20/hr, 6–24h → $5/hr, <6h → $1/hr.
-    """
-    # >24h tier: start_hours_ago=_THRESHOLD+5 = 29h → $20/hr
-    pipeline = _training_pipeline("pl-1", "us-central1", start_hours_ago=_THRESHOLD + 5)
-    findings = _run(training_pipelines=[pipeline])
 
+def test_threshold_of_zero_raises_value_error():
+    creds = MagicMock()
+    with pytest.raises(ValueError):
+        find_long_running_vertex_training_jobs(
+            project_id=_PROJECT,
+            credentials=creds,
+            long_running_hours_threshold=-1,
+        )
+
+
+def test_create_time_not_used_as_fallback():
+    """Jobs with createTime but no startTime are skipped — createTime is NOT a fallback (spec 9.4)."""
+    import warnings as _warnings
+
+    start = NOW - timedelta(hours=_THRESHOLD + 5)
+    job = {
+        "name": f"projects/{_PROJECT}/locations/us-central1/customJobs/no-start",
+        "displayName": "no-start-job",
+        "createTime": _iso(start),  # present but must NOT be used
+        "state": "JOB_STATE_RUNNING",
+        "jobSpec": {"workerPoolSpecs": []},
+    }
+    with _warnings.catch_warnings(record=True):
+        _warnings.simplefilter("always")
+        findings = _run(custom_jobs=[job])
+    assert findings == []
+
+
+def test_future_start_time_skips_job():
+    """Jobs with future startTime are skipped (spec 7)."""
+    import warnings as _warnings
+
+    future = NOW + timedelta(hours=5)
+    job = {
+        "name": f"projects/{_PROJECT}/locations/us-central1/customJobs/future",
+        "displayName": "future-job",
+        "startTime": _iso(future),
+        "state": "JOB_STATE_RUNNING",
+        "jobSpec": {"workerPoolSpecs": []},
+    }
+    with _warnings.catch_warnings(record=True):
+        _warnings.simplefilter("always")
+        findings = _run(custom_jobs=[job])
+    assert findings == []
+
+
+def test_malformed_name_skips_job():
+    """Jobs with malformed resource names (location not resolvable) are skipped (spec 7, 11)."""
+    import warnings as _warnings
+
+    start = NOW - timedelta(hours=_THRESHOLD + 5)
+    job = {
+        "name": "malformed-resource-name",
+        "displayName": "bad-job",
+        "startTime": _iso(start),
+        "state": "JOB_STATE_RUNNING",
+        "jobSpec": {"workerPoolSpecs": []},
+    }
+    with _warnings.catch_warnings(record=True):
+        _warnings.simplefilter("always")
+        findings = _run(custom_jobs=[job])
+    assert findings == []
+
+
+def test_region_filter_exact_match_required():
+    """Region filter is exact string equality; prefix match must not pass (spec 7)."""
+    job = _custom_job("j", "us-central1", start_hours_ago=_THRESHOLD + 5)
+    findings = _run(custom_jobs=[job], region_filter="us-central")
+    assert findings == []
+
+
+# ---------------------------------------------------------------------------
+# Details fields
+# ---------------------------------------------------------------------------
+
+
+def test_details_state_field_present():
+    """Finding details include 'state' with the exact running enum value."""
+    job = _custom_job("j", "us-central1", start_hours_ago=_THRESHOLD + 5)
+    findings = _run(custom_jobs=[job])
     assert len(findings) == 1
-    f = findings[0]
-    assert f.details["job_type"] == "trainingPipeline"
-    assert f.details["is_accelerator"] is False  # hardware unknown ≠ GPU; only cost is conservative
-    assert f.details["hardware_unknown"] is True
-    assert f.details["pricing_source"] == "conservative_pipeline_default"
-    assert f.details["burn_rate_per_hour"] == pytest.approx(20.0)  # >24h tier
-    assert f.estimated_monthly_cost_usd is None
+    assert findings[0].details["state"] == "JOB_STATE_RUNNING"
 
 
-def test_training_pipeline_no_hardware_mid_tier():
-    """Pipeline at exactly threshold (24h) uses $5/hr mid-tier (duration <= 24h, > 6h)."""
-    # duration == _THRESHOLD (24h): not > 24 → $5/hr tier; >= threshold → MEDIUM confidence
-    pipeline = _training_pipeline("pl-mid", "us-central1", start_hours_ago=_THRESHOLD)
+def test_details_state_field_training_pipeline():
+    """TrainingPipeline finding uses PIPELINE_STATE_RUNNING."""
+    pipeline = _training_pipeline("pl", "us-central1", start_hours_ago=_THRESHOLD + 5)
     findings = _run(training_pipelines=[pipeline])
     assert len(findings) == 1
-    assert findings[0].details["burn_rate_per_hour"] == pytest.approx(5.0)  # 6–24h tier
+    assert findings[0].details["state"] == "PIPELINE_STATE_RUNNING"
 
 
-def test_training_pipeline_no_hardware_low_tier():
-    """Pipeline <6h uses $1/hr low-tier when threshold is small enough to fire at <6h."""
-    # Use threshold=5h so a 5h job fires (duration >= threshold); duration <= 6h → $1/hr tier
-    pipeline = _training_pipeline("pl-low2", "us-central1", start_hours_ago=5)
-    findings = _run(training_pipelines=[pipeline], threshold=5)
+def test_details_start_time_field_present():
+    """Finding details include 'start_time' as an RFC3339 string."""
+    job = _custom_job("j", "us-central1", start_hours_ago=_THRESHOLD + 5)
+    findings = _run(custom_jobs=[job])
     assert len(findings) == 1
-    assert findings[0].details["burn_rate_per_hour"] == pytest.approx(1.0)
+    assert "start_time" in findings[0].details
+    assert isinstance(findings[0].details["start_time"], str)
+    assert findings[0].details["start_time"].endswith("Z")
+
+
+def test_details_long_running_hours_threshold_present():
+    """Finding details include 'long_running_hours_threshold'."""
+    job = _custom_job("j", "us-central1", start_hours_ago=_THRESHOLD + 5)
+    findings = _run(custom_jobs=[job])
+    assert findings[0].details["long_running_hours_threshold"] == _THRESHOLD
+
+
+# ---------------------------------------------------------------------------
+# TrainingPipeline resource type
+# ---------------------------------------------------------------------------
+
+
+def test_training_pipeline_no_hardware_spec():
+    """Pipeline with no hardware spec: hardware_unknown=True, is_accelerator=False."""
+    pipeline = _training_pipeline("pl-1", "us-central1", start_hours_ago=_THRESHOLD + 5)
+    findings = _run(training_pipelines=[pipeline])
+
+    assert len(findings) == 1
+    f = findings[0]
+    assert f.details["job_type"] == "trainingPipeline"
+    assert f.details["is_accelerator"] is False
+    assert f.details["hardware_unknown"] is True
+    assert f.estimated_monthly_cost_usd is None
 
 
 def test_training_pipeline_with_worker_pool_specs_in_task_inputs():
@@ -384,7 +459,6 @@ def test_training_pipeline_with_worker_pool_specs_in_task_inputs():
     assert f.details["is_accelerator"] is True  # a2-* prefix
     assert f.details["machine_type"] == "a2-highgpu-1g"
     assert f.details["total_workers"] == 2
-    assert f.details["pricing_source"] == "static_estimate_us_central1"
 
 
 def test_training_pipeline_task_inputs_as_json_string():
@@ -483,16 +557,12 @@ def test_n1_cpu_not_classified_as_gpu():
 
 
 # ---------------------------------------------------------------------------
-# Per-pool cost aggregation (fix: sum all pools, not primary × total)
+# Heterogeneous cluster: total_workers
 # ---------------------------------------------------------------------------
 
 
-def test_heterogeneous_cluster_cost_sums_all_pools():
-    """
-    Chief: a2-highgpu-1g (1 replica) ≈ $4.02/hr
-    Workers: n1-standard-4 (8 replicas) ≈ $0.19/hr each → $1.52/hr total
-    Total should be ≈ $5.54/hr, not a2-price × 9 ($36.18/hr).
-    """
+def test_heterogeneous_cluster_total_workers():
+    """Chief (1 replica) + 8 workers = total_workers 9."""
     start = NOW - timedelta(hours=_THRESHOLD + 5)
     job = {
         "name": f"projects/{_PROJECT}/locations/us-central1/customJobs/hetero",
@@ -522,13 +592,7 @@ def test_heterogeneous_cluster_cost_sums_all_pools():
     }
     findings = _run(custom_jobs=[job])
     assert len(findings) == 1
-    f = findings[0]
-
-    a2_hourly = _MACHINE_MONTHLY_COST["a2-highgpu-1g"] / _HOURS_PER_MONTH
-    n1_hourly = _MACHINE_MONTHLY_COST["n1-standard-4"] / _HOURS_PER_MONTH
-    expected_total = a2_hourly * 1 + n1_hourly * 8
-    assert f.details["burn_rate_per_hour"] == pytest.approx(expected_total)
-    assert f.details["total_workers"] == 9
+    assert findings[0].details["total_workers"] == 9
 
 
 # ---------------------------------------------------------------------------
@@ -544,26 +608,6 @@ def test_region_filter_excludes_other_regions():
     assert findings[0].region == "us-central1"
 
 
-# ---------------------------------------------------------------------------
-# Location fallback
-# ---------------------------------------------------------------------------
-
-
-def test_location_unknown_for_malformed_name():
-    """Jobs with unparseable resource names get region='unknown', not ''."""
-    start = NOW - timedelta(hours=_THRESHOLD + 5)
-    job = {
-        "name": "malformed-resource-name",
-        "displayName": "bad-job",
-        "startTime": _iso(start),
-        "state": "JOB_STATE_RUNNING",
-        "jobSpec": {"workerPoolSpecs": []},
-    }
-    findings = _run(custom_jobs=[job])
-    if findings:  # may be filtered out if region_filter active — just check region value
-        assert findings[0].region == "unknown"
-
-
 # ---------------------------------------------------------------------------
 # Permission error
 # ---------------------------------------------------------------------------
@@ -645,11 +689,12 @@ def test_parse_worker_pools_single_pool():
     ]
     result = _parse_worker_pools(specs)
     assert len(result) == 1
-    machine, accel, count, replicas = result[0]
+    machine, accel, count, replicas, tpu_topology = result[0]
     assert machine == "n1-standard-8"
     assert accel == "NVIDIA_TESLA_V100"
     assert count == 2
     assert replicas == 4
+    assert tpu_topology is None  # non-TPU machine
 
 
 def test_parse_worker_pools_multi_pool():
@@ -749,17 +794,34 @@ def test_parse_worker_pools_tpu_no_topology_keeps_replica_count():
     assert result[0][3] == 1
 
 
-def test_total_hourly_rate_tpu_multi_host():
-    """A ct5lp-hightpu-4t pool with 2x4 topology is priced as 2 hosts."""
+def test_parse_worker_pools_tpu_topology_stored_in_tuple():
+    """tpu_topology (index 4) is stored in the pool tuple to avoid raw-spec index mismatch."""
     specs = [
         {
             "replicaCount": 1,
             "machineSpec": {"machineType": "ct5lp-hightpu-4t", "tpuTopology": "2x4"},
         }
     ]
-    pools = _parse_worker_pools(specs)
-    per_host = _MACHINE_MONTHLY_COST["ct5lp-hightpu-4t"] / _HOURS_PER_MONTH
-    assert _total_hourly_rate(pools) == pytest.approx(per_host * 2)
+    result = _parse_worker_pools(specs)
+    assert result[0][4] == "2x4"
+
+
+def test_parse_worker_pools_tpu_topology_correct_after_malformed_first_entry():
+    """When the first raw entry is malformed and skipped, pools[0][4] gives the correct
+    topology for the valid pool -- not the topology from the skipped first raw entry."""
+    specs = [
+        # Entry 0: malformed (no machineType) -- must be skipped
+        {"replicaCount": 1, "machineSpec": {"tpuTopology": "wrong-topology"}},
+        # Entry 1: valid TPU pool -- should become pools[0]
+        {
+            "replicaCount": 1,
+            "machineSpec": {"machineType": "ct5lp-hightpu-4t", "tpuTopology": "2x4"},
+        },
+    ]
+    result = _parse_worker_pools(specs)
+    assert len(result) == 1
+    assert result[0][0] == "ct5lp-hightpu-4t"
+    assert result[0][4] == "2x4"  # correct topology, not "wrong-topology"
 
 
 def test_g4_gpu_counts_match_docs():
@@ -770,6 +832,19 @@ def test_g4_gpu_counts_match_docs():
     assert _BUNDLED_ACCELERATOR_COUNT["g4-standard-384"] == 8
 
 
+def test_tpu7x_topology_scaling_via_suffix_parse():
+    """tpu7x-standard-4t is not in _BUNDLED_ACCELERATOR_COUNT, but its -4t suffix → 4 chips/host.
+    Topology '4x4' = 16 chips → 4 hosts."""
+    specs = [
+        {
+            "replicaCount": 1,
+            "machineSpec": {"machineType": "tpu7x-standard-4t", "tpuTopology": "4x4"},
+        }
+    ]
+    pools = _parse_worker_pools(specs)
+    assert pools[0][3] == 4  # 16 chips / 4 per host = 4 hosts
+
+
 # ---------------------------------------------------------------------------
 # _has_accelerator_hardware
 # ---------------------------------------------------------------------------
@@ -811,77 +886,14 @@ def test_has_accelerator_hardware_empty_string_not_classified():
     assert _has_accelerator_hardware(pools) is False
 
 
-# ---------------------------------------------------------------------------
-# _estimate_hourly_rate_per_replica
-# ---------------------------------------------------------------------------
-
-
-def test_estimate_hourly_rate_per_replica_n1_with_gpu_is_additive():
-    """n1-* machines add GPU cost on top of machine cost."""
-    machine_hourly = _MACHINE_MONTHLY_COST["n1-standard-8"] / _HOURS_PER_MONTH
-    gpu_monthly_each = 311.0  # NVIDIA_TESLA_T4
-    gpu_hourly = gpu_monthly_each / _HOURS_PER_MONTH * 2  # 2 GPUs
-    expected = machine_hourly + gpu_hourly
-    result = _estimate_hourly_rate_per_replica("n1-standard-8", "NVIDIA_TESLA_T4", 2)
-    assert abs(result - expected) < 0.01
-
-
-def test_estimate_hourly_rate_per_replica_a2_bundled_no_addon():
-    """a2-* machines bundle GPU cost — no accelerator add-on."""
-    machine_hourly = _MACHINE_MONTHLY_COST["a2-highgpu-1g"] / _HOURS_PER_MONTH
-    result = _estimate_hourly_rate_per_replica("a2-highgpu-1g", "NVIDIA_TESLA_A100", 1)
-    assert abs(result - machine_hourly) < 0.01
-
-
-def test_estimate_hourly_rate_per_replica_unknown_machine_uses_default():
-    result = _estimate_hourly_rate_per_replica("custom-unknown-machine", None, 0)
-    expected = _DEFAULT_MACHINE_MONTHLY_COST / _HOURS_PER_MONTH
-    assert abs(result - expected) < 0.01
-
+def test_has_accelerator_hardware_recognized_type_zero_count_not_accelerated():
+    """Recognized acceleratorType with acceleratorCount=0 is NOT accelerated (spec 8.1).
 
-def test_estimate_hourly_rate_co_scheduling_single_accel():
-    """a2-highgpu-8g with accel_count=1 triggers co-scheduling: 8 replicas per VM, cost 1/8."""
-    full_machine_hourly = _MACHINE_MONTHLY_COST["a2-highgpu-8g"] / _HOURS_PER_MONTH
-    # accel_count=1 == 1 → replicas_per_vm = 8//1 = 8
-    result = _estimate_hourly_rate_per_replica("a2-highgpu-8g", "NVIDIA_TESLA_A100", 1)
-    assert result == pytest.approx(full_machine_hourly / 8)
-
-
-def test_estimate_hourly_rate_co_scheduling_divides_evenly():
-    """a2-highgpu-8g with accel_count=2: 8%2==0 → co-scheduling applies, cost is 1/4."""
-    full_machine_hourly = _MACHINE_MONTHLY_COST["a2-highgpu-8g"] / _HOURS_PER_MONTH
-    # accel_count=2, machine_gpu_count=8, 8%2==0 → replicas_per_vm=4
-    result = _estimate_hourly_rate_per_replica("a2-highgpu-8g", "NVIDIA_TESLA_A100", 2)
-    assert result == pytest.approx(full_machine_hourly / 4)
-
-
-def test_estimate_hourly_rate_no_co_scheduling_above_half():
-    """a2-highgpu-8g with accel_count=5 → no co-scheduling (accel_count != 1), full price."""
-    full_machine_hourly = _MACHINE_MONTHLY_COST["a2-highgpu-8g"] / _HOURS_PER_MONTH
-    result = _estimate_hourly_rate_per_replica("a2-highgpu-8g", "NVIDIA_TESLA_A100", 5)
-    assert result == pytest.approx(full_machine_hourly)
-
-
-def test_estimate_hourly_rate_no_co_scheduling_zero_accel_count():
-    """accel_count=0 (unspecified) → full price, no co-scheduling assumed."""
-    full_machine_hourly = _MACHINE_MONTHLY_COST["a2-highgpu-8g"] / _HOURS_PER_MONTH
-    result = _estimate_hourly_rate_per_replica("a2-highgpu-8g", None, 0)
-    assert result == pytest.approx(full_machine_hourly)
-
-
-def test_bundled_accelerator_count_covers_all_machine_monthly_cost_bundled_types():
-    """Every bundled machine type in _MACHINE_MONTHLY_COST (except g2-standard-32)
-    has a known GPU/TPU count in _BUNDLED_ACCELERATOR_COUNT."""
-    gpu_prefixes = ("a2-", "a3-", "a4-", "a4x-", "g2-", "g4-")
-    bundled_types = [
-        m
-        for m in _MACHINE_MONTHLY_COST
-        if m.startswith(gpu_prefixes) or m.startswith(_TPU_MACHINE_PREFIXES)
-    ]
-    unknown = [
-        m for m in bundled_types if m not in _BUNDLED_ACCELERATOR_COUNT and m != "g2-standard-32"
-    ]
-    assert unknown == [], f"Missing from _BUNDLED_ACCELERATOR_COUNT: {unknown}"
+    acceleratorCount=0 means no accelerator is attached even if the type field is set.
+    The explicit path requires both a recognized type AND count > 0.
+    """
+    pools = [("n1-standard-8", "NVIDIA_TESLA_T4", 0, 1)]
+    assert _has_accelerator_hardware(pools) is False
 
 
 def test_tpu_machine_type_detected_as_accelerated():
@@ -890,18 +902,6 @@ def test_tpu_machine_type_detected_as_accelerated():
     assert _has_accelerator_hardware(pools) is True
 
 
-def test_tpu_machine_type_uses_tpu_default_cost_when_unknown():
-    """Unrecognized ct5lp-* machine falls back to _DEFAULT_TPU_MONTHLY_COST, not generic $150."""
-    result = _estimate_hourly_rate_per_replica("ct5lp-hightpu-16t", None, 0)
-    assert result == pytest.approx(_DEFAULT_TPU_MONTHLY_COST / _HOURS_PER_MONTH)
-
-
-def test_tpu_machine_type_uses_table_cost_when_known():
-    """Known ct5lp-hightpu-4t uses the exact cost from _MACHINE_MONTHLY_COST."""
-    result = _estimate_hourly_rate_per_replica("ct5lp-hightpu-4t", None, 0)
-    assert result == pytest.approx(_MACHINE_MONTHLY_COST["ct5lp-hightpu-4t"] / _HOURS_PER_MONTH)
-
-
 def test_a3_megagpu_detected_as_bundled():
     """a3-megagpu-8g is detected as bundled via a3- prefix."""
     pools = [("a3-megagpu-8g", None, 0, 1)]
@@ -926,49 +926,6 @@ def test_tpu7x_machine_detected_as_accelerated():
     assert _has_accelerator_hardware(pools) is True
 
 
-def test_tpu7x_uses_tpu_default_cost():
-    """tpu7x-* has no cost table entry — should use _DEFAULT_TPU_MONTHLY_COST."""
-    result = _estimate_hourly_rate_per_replica("tpu7x-standard-4t", None, 0)
-    assert result == pytest.approx(_DEFAULT_TPU_MONTHLY_COST / _HOURS_PER_MONTH)
-
-
-def test_tpu7x_topology_scaling_via_suffix_parse():
-    """tpu7x-standard-4t not in _BUNDLED_ACCELERATOR_COUNT but -4t suffix → 4 chips/host.
-    Topology '4x4' = 16 chips → 4 hosts → priced as 4 × per-host rate."""
-    specs = [
-        {
-            "replicaCount": 1,
-            "machineSpec": {"machineType": "tpu7x-standard-4t", "tpuTopology": "4x4"},
-        }
-    ]
-    pools = _parse_worker_pools(specs)
-    assert pools[0][3] == 4  # 16 chips / 4 per host = 4 hosts
-    per_host = _DEFAULT_TPU_MONTHLY_COST / _HOURS_PER_MONTH
-    assert _total_hourly_rate(pools) == pytest.approx(per_host * 4)
-
-
-# ---------------------------------------------------------------------------
-# _total_hourly_rate
-# ---------------------------------------------------------------------------
-
-
-def test_total_hourly_rate_single_pool():
-    per_replica = _estimate_hourly_rate_per_replica("n1-standard-4", None, 0)
-    pools = [("n1-standard-4", None, 0, 3)]
-    assert abs(_total_hourly_rate(pools) - per_replica * 3) < 0.01
-
-
-def test_total_hourly_rate_heterogeneous():
-    """Cost sums correctly across pools with different machine types."""
-    chief = _estimate_hourly_rate_per_replica("a2-highgpu-1g", None, 0) * 1
-    workers = _estimate_hourly_rate_per_replica("n1-standard-4", None, 0) * 8
-    pools = [
-        ("a2-highgpu-1g", None, 0, 1),
-        ("n1-standard-4", None, 0, 8),
-    ]
-    assert abs(_total_hourly_rate(pools) - (chief + workers)) < 0.01
-
-
 # ---------------------------------------------------------------------------
 # _hardware_label
 # ---------------------------------------------------------------------------
@@ -990,18 +947,11 @@ def test_hardware_label_multi_worker():
     assert "×8 workers" in label
 
 
-# ---------------------------------------------------------------------------
-# _parse_location
-# ---------------------------------------------------------------------------
-
-
-def test_parse_location_standard():
-    name = "projects/my-proj/locations/us-central1/customJobs/12345"
-    assert _parse_location(name) == "us-central1"
-
-
-def test_parse_location_missing_returns_empty():
-    assert _parse_location("invalid-name") == ""
+def test_hardware_label_zero_accel_count_omits_type():
+    """acceleratorType is omitted from the label when acceleratorCount == 0."""
+    label = _hardware_label("n1-standard-8", "NVIDIA_TESLA_T4", 0, 1)
+    assert "NVIDIA_TESLA_T4" not in label
+    assert "n1-standard-8" in label
 
 
 # ---------------------------------------------------------------------------
@@ -1020,67 +970,26 @@ def test_rule_id_attribute():
 
 def test_wildcard_unsupported_keyed_per_project_and_resource():
     """_wildcard_unsupported uses (project_id, resource) tuples, not plain strings."""
-    # Verify the set stores tuples so customJobs and trainingPipelines are independent
     test_set = set()
     test_set.add(("proj-a", "customJobs"))
     assert ("proj-a", "customJobs") in test_set
     assert ("proj-a", "trainingPipelines") not in test_set  # independent per resource
 
 
-# ---------------------------------------------------------------------------
-# Fix 3: pricing_confidence
-# ---------------------------------------------------------------------------
-
-
-def test_pricing_confidence_published_for_known_machines():
-    pools = [("n1-standard-8", "NVIDIA_TESLA_T4", 1, 1)]
-    assert _pricing_confidence(pools) == "published"
-
-
-def test_pricing_confidence_partial_estimate_for_estimated_machine():
-    """a4-* machines use estimated pricing."""
-    pools = [("a4-highgpu-8g", None, 0, 1)]
-    assert _pricing_confidence(pools) == "partial_estimate"
-
-
-def test_pricing_confidence_partial_estimate_for_estimated_accel():
-    """H200 accelerator uses estimated pricing."""
-    pools = [("n1-standard-8", "NVIDIA_H200_141GB", 1, 1)]
-    assert _pricing_confidence(pools) == "partial_estimate"
-
-
-def test_pricing_confidence_empty_pools():
-    """Empty pool list → published (no estimated prices involved)."""
-    assert _pricing_confidence([]) == "published"
-
-
-def test_finding_includes_pricing_confidence_field():
-    """pricing_confidence appears in finding details for custom jobs."""
-    job = _custom_job(
-        "job-1",
-        "us-central1",
-        start_hours_ago=_THRESHOLD + 1,
-        machine_type="n1-standard-4",
-    )
-    findings = _run(custom_jobs=[job])
-    assert len(findings) == 1
-    assert "pricing_confidence" in findings[0].details
-
-
 # ---------------------------------------------------------------------------
 # Fix 6: skipped_jobs warning
 # ---------------------------------------------------------------------------
 
 
 def test_skipped_jobs_warning_on_missing_timestamp():
-    """Jobs with no startTime or createTime emit a warning."""
+    """Jobs with no startTime emit a warning and are skipped."""
     import warnings as _warnings
 
     job = {
         "name": "projects/my-project/locations/us-central1/customJobs/bad",
         "displayName": "bad-job",
         "state": "JOB_STATE_RUNNING",
-        # no startTime or createTime
+        # no startTime
     }
     mock_resp = MagicMock()
     mock_resp.status_code = 200
@@ -1105,47 +1014,422 @@ def test_skipped_jobs_warning_on_missing_timestamp():
 
 
 # ---------------------------------------------------------------------------
-# Fix 9: early_warning_fraction and runaway_multiplier kwargs
+# Resource-name pattern enforcement (spec 7)
 # ---------------------------------------------------------------------------
 
 
-def test_custom_early_warning_fraction_fires_earlier():
-    """early_warning_fraction=0.5 → job at 60% of threshold fires; default 0.9 would not."""
-    job = _custom_job(
-        "job-ew",
-        "us-central1",
-        start_hours_ago=_THRESHOLD * 0.6,
-        accel_type="NVIDIA_TESLA_T4",
-        accel_count=1,
+def test_validate_resource_name_valid_customjob():
+    assert (
+        _validate_resource_name(
+            "projects/my-proj/locations/us-central1/customJobs/123", "customJob"
+        )
+        is True
     )
-    findings = _run(custom_jobs=[job], extra_kwargs={"early_warning_fraction": 0.5})
-    assert len(findings) == 1
 
 
-def test_custom_early_warning_fraction_default_does_not_fire():
-    """Same job at 60% of threshold does NOT fire with default fraction (0.9)."""
-    job = _custom_job(
-        "job-ew-no",
-        "us-central1",
-        start_hours_ago=_THRESHOLD * 0.6,
-        accel_type="NVIDIA_TESLA_T4",
-        accel_count=1,
+def test_validate_resource_name_valid_pipeline():
+    assert (
+        _validate_resource_name(
+            "projects/my-proj/locations/us-central1/trainingPipelines/456", "trainingPipeline"
+        )
+        is True
+    )
+
+
+def test_validate_resource_name_too_many_parts():
+    """Extra path segment (7 parts instead of 6) → invalid."""
+    assert (
+        _validate_resource_name(
+            "projects/p/locations/us-central1/customJobs/123/extra", "customJob"
+        )
+        is False
+    )
+
+
+def test_validate_resource_name_too_few_parts():
+    assert _validate_resource_name("projects/p/locations/customJobs/123", "customJob") is False
+
+
+def test_validate_resource_name_wrong_type_segment():
+    """customJobs name treated as trainingPipeline → invalid."""
+    assert (
+        _validate_resource_name(
+            "projects/p/locations/us-central1/customJobs/123", "trainingPipeline"
+        )
+        is False
     )
+
+
+def test_validate_resource_name_empty_location():
+    """Empty location segment → invalid."""
+    assert _validate_resource_name("projects/p/locations//customJobs/123", "customJob") is False
+
+
+def test_resource_name_extra_segments_skipped():
+    """A name with extra path segments is not emitted as a finding."""
+    import warnings as _warnings
+
+    start = NOW - timedelta(hours=_THRESHOLD + 5)
+    job = {
+        "name": f"projects/{_PROJECT}/locations/us-central1/customJobs/123/extra",
+        "displayName": "extra-path",
+        "startTime": _iso(start),
+        "state": "JOB_STATE_RUNNING",
+        "jobSpec": {"workerPoolSpecs": []},
+    }
+    with _warnings.catch_warnings(record=True):
+        _warnings.simplefilter("always")
+        findings = _run(custom_jobs=[job])
+    assert findings == []
+
+
+def test_resource_name_location_bearing_but_wrong_type_skipped():
+    """Name has valid location segment but wrong resource-type keyword → skip."""
+    import warnings as _warnings
+
+    start = NOW - timedelta(hours=_THRESHOLD + 5)
+    job = {
+        "name": f"projects/{_PROJECT}/locations/us-central1/models/123",
+        "displayName": "wrong-type",
+        "startTime": _iso(start),
+        "state": "JOB_STATE_RUNNING",
+        "jobSpec": {"workerPoolSpecs": []},
+    }
+    with _warnings.catch_warnings(record=True):
+        _warnings.simplefilter("always")
+        findings = _run(custom_jobs=[job])
+    assert findings == []
+
+
+# ---------------------------------------------------------------------------
+# State validation (spec 3.3, 9.1)
+# ---------------------------------------------------------------------------
+
+
+def test_expected_state_constants():
+    """_EXPECTED_STATE maps job types to exact documented running-state enums."""
+    assert _EXPECTED_STATE["customJob"] == "JOB_STATE_RUNNING"
+    assert _EXPECTED_STATE["trainingPipeline"] == "PIPELINE_STATE_RUNNING"
+
+
+def test_wrong_state_custom_job_skipped():
+    """CustomJob not in JOB_STATE_RUNNING is skipped even if it passes other checks."""
+    import warnings as _warnings
+
+    start = NOW - timedelta(hours=_THRESHOLD + 5)
+    job = {
+        "name": f"projects/{_PROJECT}/locations/us-central1/customJobs/j1",
+        "displayName": "pending-job",
+        "startTime": _iso(start),
+        "state": "JOB_STATE_PENDING",  # not running
+        "jobSpec": {"workerPoolSpecs": []},
+    }
+    with _warnings.catch_warnings(record=True):
+        _warnings.simplefilter("always")
+        findings = _run(custom_jobs=[job])
+    assert findings == []
+
+
+def test_missing_state_custom_job_skipped():
+    """CustomJob with absent state field is skipped."""
+    import warnings as _warnings
+
+    start = NOW - timedelta(hours=_THRESHOLD + 5)
+    job = {
+        "name": f"projects/{_PROJECT}/locations/us-central1/customJobs/j2",
+        "displayName": "no-state",
+        "startTime": _iso(start),
+        # no 'state' key
+        "jobSpec": {"workerPoolSpecs": []},
+    }
+    with _warnings.catch_warnings(record=True):
+        _warnings.simplefilter("always")
+        findings = _run(custom_jobs=[job])
+    assert findings == []
+
+
+def test_state_is_read_from_resource_not_synthesised():
+    """details['state'] echoes the job resource's own state (RUNNING here), not a synthesised value."""
+    job = _custom_job("j", "us-central1", start_hours_ago=_THRESHOLD + 5)
+    # Confirm the fixture sets state to JOB_STATE_RUNNING
+    assert job["state"] == "JOB_STATE_RUNNING"
     findings = _run(custom_jobs=[job])
+    assert findings[0].details["state"] == "JOB_STATE_RUNNING"
+
+
+# ---------------------------------------------------------------------------
+# CustomJob hardware_unknown when workerPoolSpecs absent/empty (spec 8.1)
+# ---------------------------------------------------------------------------
+
+
+def test_custom_job_empty_worker_specs_hardware_unknown():
+    """CustomJob with empty workerPoolSpecs must have hardware_unknown=True (spec 8.1)."""
+    start = NOW - timedelta(hours=_THRESHOLD + 5)
+    job = {
+        "name": f"projects/{_PROJECT}/locations/us-central1/customJobs/j-no-specs",
+        "displayName": "no-specs",
+        "startTime": _iso(start),
+        "state": "JOB_STATE_RUNNING",
+        "jobSpec": {"workerPoolSpecs": []},
+    }
+    findings = _run(custom_jobs=[job])
+    assert len(findings) == 1
+    assert findings[0].details["hardware_unknown"] is True
+    assert findings[0].details["is_accelerator"] is False
+
+
+def test_custom_job_absent_job_spec_hardware_unknown():
+    """CustomJob with no jobSpec at all is still eligible; hardware_unknown=True."""
+    start = NOW - timedelta(hours=_THRESHOLD + 5)
+    job = {
+        "name": f"projects/{_PROJECT}/locations/us-central1/customJobs/j-no-spec",
+        "displayName": "no-job-spec",
+        "startTime": _iso(start),
+        "state": "JOB_STATE_RUNNING",
+        # no 'jobSpec' key
+    }
+    findings = _run(custom_jobs=[job])
+    assert len(findings) == 1
+    assert findings[0].details["hardware_unknown"] is True
+
+
+# ---------------------------------------------------------------------------
+# _parse_worker_pools: malformed entries and missing machineType (spec 8.1, 8.2)
+# ---------------------------------------------------------------------------
+
+
+def test_parse_worker_pools_missing_machine_type_skipped():
+    """Pool entries without machineType are skipped (spec 8.1, 8.2)."""
+    specs = [
+        {
+            "replicaCount": 1,
+            "machineSpec": {
+                # no machineType
+                "acceleratorType": "NVIDIA_TESLA_T4",
+                "acceleratorCount": 1,
+            },
+        }
+    ]
+    assert _parse_worker_pools(specs) == []
+
+
+def test_parse_worker_pools_empty_machine_type_skipped():
+    """Pool entry with empty machineType string is treated as missing → skipped."""
+    specs = [{"replicaCount": 1, "machineSpec": {"machineType": ""}}]
+    assert _parse_worker_pools(specs) == []
+
+
+def test_parse_worker_pools_non_dict_entry_skipped():
+    """Non-dict entries in workerPoolSpecs are silently skipped."""
+    specs = ["not-a-dict", None, 42]
+    assert _parse_worker_pools(specs) == []
+
+
+def test_parse_worker_pools_bad_replica_count_skipped():
+    """Pool with non-numeric replicaCount is treated as malformed → skipped."""
+    specs = [
+        {
+            "replicaCount": "bad-value",
+            "machineSpec": {"machineType": "n1-standard-4"},
+        }
+    ]
+    assert _parse_worker_pools(specs) == []
+
+
+def test_parse_worker_pools_mixed_valid_invalid():
+    """Valid pool entries are kept; malformed entries are silently dropped."""
+    specs = [
+        {"replicaCount": "bad", "machineSpec": {"machineType": "n1-standard-4"}},
+        {
+            "replicaCount": 2,
+            "machineSpec": {"machineType": "a2-highgpu-1g"},
+        },
+        {"replicaCount": 1, "machineSpec": {}},  # no machineType
+    ]
+    result = _parse_worker_pools(specs)
+    assert len(result) == 1
+    assert result[0][0] == "a2-highgpu-1g"
+    assert result[0][3] == 2
+
+
+def test_training_pipeline_pools_without_machine_type_hardware_unknown():
+    """TrainingPipeline whose exposed workerPoolSpecs entries all lack machineType → hardware_unknown."""
+    task_inputs = {
+        "workerPoolSpecs": [
+            {
+                "replicaCount": 1,
+                "machineSpec": {
+                    # machineType absent
+                    "acceleratorType": "NVIDIA_TESLA_T4",
+                    "acceleratorCount": 1,
+                },
+            }
+        ]
+    }
+    pipeline = _training_pipeline(
+        "pl-no-mt", "us-central1", start_hours_ago=_THRESHOLD + 5, task_inputs=task_inputs
+    )
+    findings = _run(training_pipelines=[pipeline])
+    assert len(findings) == 1
+    assert findings[0].details["hardware_unknown"] is True
+    assert findings[0].details["is_accelerator"] is False
+
+
+# ---------------------------------------------------------------------------
+# RFC3339 startTime strictness (spec 7)
+# ---------------------------------------------------------------------------
+
+
+def test_start_time_space_separator_rejected():
+    """startTime with space separator (not T) is not valid RFC3339 and must be skipped."""
+    import warnings as _warnings
+
+    start = NOW - timedelta(hours=_THRESHOLD + 5)
+    iso_space = start.isoformat().replace("T", " ")  # e.g. "2025-05-31 06:00:00+00:00"
+    job = {
+        "name": f"projects/{_PROJECT}/locations/us-central1/customJobs/j-space",
+        "state": "JOB_STATE_RUNNING",
+        "startTime": iso_space,
+        "jobSpec": {"workerPoolSpecs": []},
+    }
+    with _warnings.catch_warnings(record=True):
+        _warnings.simplefilter("always")
+        findings = _run(custom_jobs=[job])
+    assert findings == []
+
+
+def test_start_time_date_only_rejected():
+    """Date-only startTime (no time component) is not valid RFC3339 and must be skipped."""
+    import warnings as _warnings
+
+    job = {
+        "name": f"projects/{_PROJECT}/locations/us-central1/customJobs/j-date",
+        "state": "JOB_STATE_RUNNING",
+        "startTime": "2025-05-01",
+        "jobSpec": {"workerPoolSpecs": []},
+    }
+    with _warnings.catch_warnings(record=True):
+        _warnings.simplefilter("always")
+        findings = _run(custom_jobs=[job])
     assert findings == []
 
 
-def test_custom_runaway_multiplier_changes_confidence():
-    """runaway_multiplier=2 → job at 2.5× threshold is HIGH; default 3× it would be MEDIUM."""
-    job = _custom_job("job-rm", "us-central1", start_hours_ago=_THRESHOLD * 2.5)
-    findings = _run(custom_jobs=[job], extra_kwargs={"runaway_multiplier": 2})
+def test_start_time_no_timezone_rejected():
+    """startTime without timezone offset is not valid RFC3339 and must be skipped."""
+    import warnings as _warnings
+
+    job = {
+        "name": f"projects/{_PROJECT}/locations/us-central1/customJobs/j-notz",
+        "state": "JOB_STATE_RUNNING",
+        "startTime": "2025-05-01T06:00:00",  # no Z or offset
+        "jobSpec": {"workerPoolSpecs": []},
+    }
+    with _warnings.catch_warnings(record=True):
+        _warnings.simplefilter("always")
+        findings = _run(custom_jobs=[job])
+    assert findings == []
+
+
+def test_start_time_fractional_seconds_accepted():
+    """startTime with fractional seconds and Z is valid RFC3339 and must be accepted."""
+    start = NOW - timedelta(hours=_THRESHOLD + 5)
+    frac_str = start.strftime("%Y-%m-%dT%H:%M:%S.123456Z")
+    job = {
+        "name": f"projects/{_PROJECT}/locations/us-central1/customJobs/j-frac",
+        "state": "JOB_STATE_RUNNING",
+        "startTime": frac_str,
+        "jobSpec": {
+            "workerPoolSpecs": [
+                {"replicaCount": 1, "machineSpec": {"machineType": "n1-standard-4"}}
+            ]
+        },
+    }
+    findings = _run(custom_jobs=[job])
     assert len(findings) == 1
-    assert findings[0].confidence.name == "HIGH"
 
 
-def test_default_runaway_multiplier_at_2_5x_is_medium():
-    """Same job at 2.5× threshold with default multiplier (3) → MEDIUM confidence."""
-    job = _custom_job("job-rm-med", "us-central1", start_hours_ago=_THRESHOLD * 2.5)
+def test_start_time_explicit_offset_accepted():
+    """startTime with explicit +00:00 offset is valid RFC3339 and must be accepted."""
+    start = NOW - timedelta(hours=_THRESHOLD + 5)
+    offset_str = start.strftime("%Y-%m-%dT%H:%M:%S+00:00")
+    job = {
+        "name": f"projects/{_PROJECT}/locations/us-central1/customJobs/j-offset",
+        "state": "JOB_STATE_RUNNING",
+        "startTime": offset_str,
+        "jobSpec": {
+            "workerPoolSpecs": [
+                {"replicaCount": 1, "machineSpec": {"machineType": "n1-standard-4"}}
+            ]
+        },
+    }
     findings = _run(custom_jobs=[job])
     assert len(findings) == 1
-    assert findings[0].confidence.name == "MEDIUM"
+
+
+# ---------------------------------------------------------------------------
+# Partial pagination: later-page failure keeps earlier pages (spec 11.3)
+# ---------------------------------------------------------------------------
+
+
+def test_pagination_later_page_failure_keeps_partial_results():
+    """A non-403 failure on a later page (503 here) keeps earlier pages and emits a 'partial' warning."""
+    import warnings as _warnings
+
+    job = _custom_job("j-page1", "us-central1", start_hours_ago=_THRESHOLD + 5)
+
+    page1_resp = MagicMock()
+    page1_resp.status_code = 200
+    page1_resp.ok = True
+    page1_resp.json.return_value = {
+        "customJobs": [job],
+        "nextPageToken": "token-abc",  # signals a second page
+    }
+
+    page2_resp = MagicMock()
+    page2_resp.status_code = 503
+    page2_resp.ok = False
+
+    empty_pipeline_resp = MagicMock()
+    empty_pipeline_resp.status_code = 200
+    empty_pipeline_resp.ok = True
+    empty_pipeline_resp.json.return_value = {"trainingPipelines": []}
+
+    responses = {"customJobs": [page1_resp, page2_resp], "trainingPipelines": [empty_pipeline_resp]}
+    counters = {"customJobs": 0, "trainingPipelines": 0}
+
+    def _get(url, params=None):
+        if "customJobs" in url:
+            idx = counters["customJobs"]
+            counters["customJobs"] += 1
+            return responses["customJobs"][min(idx, len(responses["customJobs"]) - 1)]
+        else:
+            idx = counters["trainingPipelines"]
+            counters["trainingPipelines"] += 1
+            return responses["trainingPipelines"][min(idx, len(responses["trainingPipelines"]) - 1)]
+
+    creds = MagicMock()
+    mock_session = MagicMock()
+    mock_session.get.side_effect = _get
+
+    with patch(
+        "cleancloud.providers.gcp.rules.ai.vertex_training_job_long_running.AuthorizedSession",
+        return_value=mock_session,
+    ):
+        with patch(
+            "cleancloud.providers.gcp.rules.ai.vertex_training_job_long_running.datetime"
+        ) as mock_dt:
+            mock_dt.now.return_value = NOW
+            mock_dt.fromisoformat.side_effect = datetime.fromisoformat
+            with _warnings.catch_warnings(record=True) as caught:
+                _warnings.simplefilter("always")
+                findings = find_long_running_vertex_training_jobs(
+                    project_id=_PROJECT,
+                    credentials=creds,
+                    long_running_hours_threshold=_THRESHOLD,
+                )
+
+    # Page 1 job must still appear even though page 2 failed
+    assert len(findings) == 1
+    assert findings[0].details["job_name"].endswith("j-page1")
+    # A warning about the partial read must have been emitted
+    assert any("partial" in str(w.message).lower() for w in caught)
diff --git a/tests/e2e/gcp/test_gcp_ai_rules_smoke.py b/tests/e2e/gcp/test_gcp_ai_rules_smoke.py
index 3d78489..d9a376e 100644
--- a/tests/e2e/gcp/test_gcp_ai_rules_smoke.py
+++ b/tests/e2e/gcp/test_gcp_ai_rules_smoke.py
@@ -144,8 +144,8 @@ def test_vertex_training_job_long_running_returns_list_of_findings():
         assert "job_type" in f.details
         assert f.details["job_type"] in ("customJob", "trainingPipeline")
         assert "duration_hours" in f.details
-        assert "accrued_cost_usd" in f.details
-        assert "burn_rate_per_hour" in f.details
+        assert "state" in f.details
+        assert "start_time" in f.details
         assert "is_accelerator" in f.details
 
 

From 2b13aee58741fc099ebce8e02f5e454be4f19d43 Mon Sep 17 00:00:00 2001
From: javvaji-devops <venkata.javvaji.91@gmail.com>
Date: Wed, 6 May 2026 14:51:44 +0100
Subject: [PATCH 2/4] gcp.vertex.workbench.idle

---
 CHANGELOG.md                                  |  56 --
 README.fr.md                                  |  21 +-
 README.md                                     |  21 +-
 .../providers/gcp/rules/ai/workbench_idle.py  | 476 +++-------
 docs/rules/gcp.md                             |  30 +-
 docs/specs/gcp/ai/workbench_idle.md           | 668 ++++++++++++++
 .../gcp/ai/test_gcp_workbench_idle.py         | 819 ++++++++++--------
 7 files changed, 1325 insertions(+), 766 deletions(-)
 delete mode 100644 CHANGELOG.md
 create mode 100644 docs/specs/gcp/ai/workbench_idle.md

diff --git a/CHANGELOG.md b/CHANGELOG.md
deleted file mode 100644
index 18c3bca..0000000
--- a/CHANGELOG.md
+++ /dev/null
@@ -1,56 +0,0 @@
-# Changelog
-
-All notable changes to CleanCloud are documented here.
-
-## [1.15.0] — 2026-04-11
-
-### Added
-- `aws.ec2.gpu.idle` — Idle EC2 GPU/accelerator instance detection across 20 families (p2/p3/p4/p5, g4/g5/g6/g6e/gr6, trn1/trn2, inf1/inf2, dl1/dl2q). Two-tier detection: GPU utilisation via NVIDIA CloudWatch agent (HIGH confidence) or CPU fallback (MEDIUM). Neuron instances (Trainium/Inferentia) handled correctly — always CPU fallback by design. Parameters: `idle_days` (default 7), `gpu_threshold` (5%), `cpu_threshold` (10%).
-- `gcp.vertex.workbench.idle` — Idle Vertex AI Workbench instances via v2 API. Uses `updateTime` as idle signal; GPU/TPU-aware; age-fallback capped at MEDIUM confidence.
-- `schemas/output-v1.3.0.json` — JSON output schema update: added `critical` to risk enum, `suppressed` array, `rules_evaluated` summary field.
-- Optional provider extras: `pip install 'cleancloud[aws]'`, `'cleancloud[azure]'`, `'cleancloud[gcp]'`, `'cleancloud[all]'`. Cloud SDKs are no longer hard dependencies.
-- Docker `CLEANCLOUD_EXTRAS` build arg for slim provider-specific images.
-- Graceful error messages with install hints when a provider SDK is not installed.
-
-### Changed
-- Cross-cloud AI baseline complete: 7 rules across AWS (3), Azure (2), GCP (2).
-- README Quick Start consolidated to a single clear two-step flow (demo → install provider → scan).
-- `azure/rules/ebs_snapshots_old.py` renamed to `disk_snapshots_old.py` (AWS terminology removed).
-- `scan/command.py` EnvironmentError handler now uses `f"--provider {provider}"` (was hardcoded to `azure`).
-- Lint is now blocking on main branch (was non-blocking with `|| echo` fallback).
-- `output/feedback.py` no longer includes a personal email address.
-- `except Exception: pass` blocks narrowed to specific exception types.
-
-### Fixed
-- `security/aws/hygiene-readonly.json` — added missing `cloudwatch:GetMetricStatistics` permission.
-
----
-
-## [1.14.1] — 2026-04-09
-
-### Fixed
-- `aws/rules/untagged_resources.py` — `s3.exceptions.ClientError` crash fixed; now catches `botocore.exceptions.ClientError` with `NoSuchTagSet` check.
-- `aws/rules/rds_idle.py` — hardcoded `"connections_14d"` key fixed; CloudWatch `AccessDenied` now surfaces as `PermissionError`.
-- `aws/rules/elb_idle.py`, `nat_gateway_idle.py` — same CloudWatch `AccessDenied` fix.
-- `azure/rules/app_service_plan_empty.py` — `plan.location.lower()` crash on `None`.
-- `azure/rules/vm_stopped_not_deallocated.py` — `instance_view()` wrapped in try/except; no longer aborts subscription scan on one bad VM.
-- `azure/rules/sql_database_idle.py` — hardcoded idle day strings fixed; per-server error handling added.
-- `azure/rules/ebs_snapshots_old.py` — dead branch fixed; case-sensitive region filter fixed.
-- `azure/rules/untagged_resources.py` — case-sensitive region filter fixed for disks and snapshots.
-- `gcp/rules/sql_instance_idle.py` — hardcoded `"7-day window"` fixed to use `idle_days`.
-- `gcp/rules/vertex_endpoint_idle.py` — unreachable dead branch removed.
-
----
-
-## [1.14.0] — 2026-04-09
-
-### Added
-- `azure.aml.compute.idle` — Idle Azure ML Compute Clusters (Azure Monitor metrics + age fallback).
-- `azure.ml.compute_instance.idle` — Idle Azure ML Compute Instances (last_operation + last_modified_at + age fallback).
-- `rules_evaluated` field in JSON scan summary — map of rule_id to finding count.
-
-### Changed
-- Unified Azure subscription display (removed duplicate subscription output).
-- Age-fallback confidence capped at MEDIUM for compute instance rule.
-- All-None Azure Monitor maximums treated as unknown (not idle).
-- Unicode arrow chars (`→`) removed from all Python source files.
diff --git a/README.fr.md b/README.fr.md
index 9db41ae..6db14ae 100644
--- a/README.fr.md
+++ b/README.fr.md
@@ -42,7 +42,20 @@ cleancloud demo --category ai
    Détails :
      - estimated_monthly_cost: ~$23 374/mois
 
-2. [Azure] Instance de calcul Azure ML inactive (31 jours sans activité)
+2. [GCP] Endpoint Vertex AI inactif (2 réplique(s) toujours active(s), zéro requête)
+   Risque     : Élevé
+   Confiance  : High
+   Ressource  : gcp.vertex.endpoint → projects/ml-platform/locations/us-central1/endpoints/8842531067721654272
+   Région     : us-central1
+   Règle      : gcp.vertex.endpoint.idle
+   Raison     : L'endpoint a un plancher de service provisionné de 2 réplique(s) ; la télémétrie de comptage des requêtes (couverture : complète) montre un taux maximum observé == 0 sur une fenêtre de 14j
+   Détails :
+     - provisioned_serving_floor: 2
+     - in_scope_model_count: 1
+     - has_accelerator: true
+     - telemetry_coverage_state: complete
+
+3. [Azure] Instance de calcul Azure ML inactive (31 jours sans activité)
    Risque     : Élevé
    Confiance  : High
    Ressource  : azure.ml.compute_instance → ws-prod/compute/ds-workstation-nc24
@@ -52,7 +65,7 @@ cleancloud demo --category ai
    Détails :
      - estimated_monthly_cost: ~$2 190/mois
 
-3. [AWS] Instance RDS inactive (zéro connexion depuis 21 jours)
+4. [AWS] Instance RDS inactive (zéro connexion depuis 21 jours)
    Risque     : Élevé
    Confiance  : High
    Ressource  : aws.rds.instance → db-prod-analytics
@@ -63,8 +76,8 @@ cleancloud demo --category ai
      - estimated_monthly_cost: ~$380/mois
 
 --- Résumé du scan ---
-Total candidats de revue : 3
-Par risque :     critique: 1  élevé: 2
+Total candidats de revue : 4
+Par risque :     critique: 1  élevé: 3
 Gaspillage minimum estimé : ~$25 944/mois
 ```
 
diff --git a/README.md b/README.md
index a521ccc..dc77bdc 100644
--- a/README.md
+++ b/README.md
@@ -42,7 +42,20 @@ cleancloud demo --category ai
    Details:
      - estimated_monthly_cost: ~$23,374/month
 
-2. [Azure] Idle ML Compute Instance (31 days since last activity)
+2. [GCP] Idle Vertex AI Endpoint (2 replica(s) always on, zero requests)
+   Risk       : High
+   Confidence : High
+   Resource   : gcp.vertex.endpoint → projects/ml-platform/locations/us-central1/endpoints/8842531067721654272
+   Region     : us-central1
+   Rule       : gcp.vertex.endpoint.idle
+   Reason     : Endpoint has provisioned serving floor of 2 replica(s); request-count telemetry (coverage: complete) shows max observed rate == 0 over 14d window
+   Details:
+     - provisioned_serving_floor: 2
+     - in_scope_model_count: 1
+     - has_accelerator: true
+     - telemetry_coverage_state: complete
+
+3. [Azure] Idle Azure ML Compute Instance (31 Days Since Last Activity)
    Risk       : High
    Confidence : High
    Resource   : azure.ml.compute_instance → ws-prod/compute/ds-workstation-nc24
@@ -52,7 +65,7 @@ cleancloud demo --category ai
    Details:
      - estimated_monthly_cost: ~$2,190/month
 
-3. [AWS] Idle RDS Instance (Zero connections for 21 days)
+4. [AWS] Idle RDS Instance (Zero connections for 21 days)
    Risk       : High
    Confidence : High
    Resource   : aws.rds.instance → db-prod-analytics
@@ -63,8 +76,8 @@ cleancloud demo --category ai
      - estimated_monthly_cost: ~$380/month
 
 --- Scan Summary ---
-Total review candidates: 3
-By risk:        critical: 1  high: 2
+Total review candidates: 4
+By risk:        critical: 1  high: 3
 Minimum estimated waste: ~$25,944/month
 ```
 
diff --git a/cleancloud/providers/gcp/rules/ai/workbench_idle.py b/cleancloud/providers/gcp/rules/ai/workbench_idle.py
index 0bb6457..91f4f52 100644
--- a/cleancloud/providers/gcp/rules/ai/workbench_idle.py
+++ b/cleancloud/providers/gcp/rules/ai/workbench_idle.py
@@ -1,12 +1,49 @@
-from datetime import datetime, timezone
+"""
+Rule: gcp.vertex.workbench.idle
+
+    (spec -- docs/specs/gcp/ai/workbench_idle.md)
+
+Intent:
+    Detect Vertex AI Workbench instances that are provably still running and have
+    documented first-party evidence of notebook/kernel inactivity over a conservative
+    review window.
+
+    This rule is deliberately precision-first. It is a review-candidate rule only.
+    It is not proof that an instance is safe to stop, not proof that no scheduled or
+    background work exists, and not proof of a specific monthly saving.
+
+Current canonical status:
+    EMITTING_DISABLED. No qualifying canonical signal exists that exposes per-instance
+    last kernel activity or a kernel-idle time series suitable for this rule. The rule
+    must not emit findings from control-plane timestamps alone.
+
+    updateTime, createTime, CPU utilization, and instance age are NOT canonical
+    idle signals. No qualifying signal path is currently established.
+
+Discovery failure taxonomy:
+    404: API not enabled for the project; provably no instances.
+    400: endpoint or wildcard unsupported; discovery incomplete.
+    5xx: transient server error; discovery incomplete.
+    network error: transport failure; discovery incomplete.
+    unreachable[]: API-reported location gaps.
+
+Future activation path:
+    When Google documents a qualifying per-instance Workbench-attributable signal
+    (Cloud Logging kernel/session activity logs, or a Cloud Monitoring metric with
+    documented kernel-idle semantics), the implementation can continue from the
+    candidate list and apply signal evaluation for reachable instances.
+
+APIs:
+    - notebooks.googleapis.com/v2: projects/{project}/locations/-/instances
+"""
+
+import re
+import warnings
 from typing import List, Optional
 
 from google.auth.transport.requests import AuthorizedSession
 
-from cleancloud.core.confidence import ConfidenceLevel
-from cleancloud.core.evidence import Evidence
 from cleancloud.core.finding import Finding
-from cleancloud.core.risk import RiskLevel
 
 RULE_METADATA = {
     "id": "gcp.vertex.workbench.idle",
@@ -15,395 +52,130 @@
     "cost_impact": "high",
 }
 
-# Accelerator types treated as GPU/high-cost
-_GPU_ACCELERATORS = frozenset(
-    {
-        "NVIDIA_TESLA_T4",
-        "NVIDIA_TESLA_V100",
-        "NVIDIA_TESLA_P100",
-        "NVIDIA_TESLA_K80",
-        "NVIDIA_TESLA_A100",
-        "NVIDIA_A100_80GB",
-        "NVIDIA_L4",
-        "NVIDIA_H100_80GB",
-        "TPU_V2",
-        "TPU_V3",
-        "TPU_V4_POD",
-    }
+# Exact documented resource-name pattern (spec 3.1, 7):
+#   projects/{projectId}/locations/{location}/instances/{instanceId}
+# All three variable segments (project, location, instance) must be non-empty.
+_INSTANCE_NAME_RE = re.compile(
+    r"^projects/[^/]+/locations/[^/]+/instances/[^/]+$"
 )
 
-# Monthly cost per instance (on-demand, us-central1, 730 h/month)
-_MACHINE_MONTHLY_COST = {
-    "n1-standard-1": 35.0,
-    "n1-standard-2": 69.0,
-    "n1-standard-4": 138.0,
-    "n1-standard-8": 277.0,
-    "n1-standard-16": 554.0,
-    "n1-highmem-2": 93.0,
-    "n1-highmem-4": 187.0,
-    "n1-highmem-8": 374.0,
-    "n2-standard-2": 78.0,
-    "n2-standard-4": 157.0,
-    "n2-standard-8": 314.0,
-    "n2-standard-16": 628.0,
-    "c2-standard-4": 166.0,
-    "c2-standard-8": 332.0,
-    # a2-* and g2-* include GPU cost — no separate add-on
-    "a2-highgpu-1g": 2_933.0,
-    "a2-highgpu-2g": 5_866.0,
-    "a2-highgpu-4g": 11_732.0,
-    "a2-highgpu-8g": 23_464.0,
-    "a2-ultragpu-1g": 5_103.0,
-    "g2-standard-4": 706.0,
-    "g2-standard-8": 1_060.0,
-    "g2-standard-16": 2_120.0,
-    "g2-standard-32": 4_241.0,
-}
-_DEFAULT_MACHINE_MONTHLY_COST = 150.0
-
-# Additional monthly cost per GPU/TPU for n1-*/n2-* machines.
-# a2-* and g2-* already include GPU cost above.
-# TPU costs are approximate (v2 pod slice: ~$5.22/hr, v3: ~$8.00/hr, v4: ~$12.88/hr — 730h/month).
-_GPU_MONTHLY_COST_EACH = {
-    "NVIDIA_TESLA_T4": 311.0,
-    "NVIDIA_TESLA_V100": 1_385.0,
-    "NVIDIA_TESLA_P100": 1_022.0,
-    "NVIDIA_TESLA_K80": 392.0,
-    "NVIDIA_TESLA_A100": 2_933.0,
-    "NVIDIA_A100_80GB": 5_103.0,
-    "NVIDIA_L4": 680.0,
-    "NVIDIA_H100_80GB": 8_000.0,
-    "TPU_V2": 3_811.0,
-    "TPU_V3": 5_840.0,
-    "TPU_V4_POD": 9_402.0,
-}
-
-_DAYS_IDLE = 14
-
 
 def find_idle_workbench_instances(
     *,
     project_id: str,
     credentials,
     region_filter: Optional[str] = None,
-    idle_days: int = _DAYS_IDLE,
+    idle_days: int = 14,
 ) -> List[Finding]:
     """
-    Find Vertex AI Workbench instances in ACTIVE state with no recent activity.
-
-    Workbench instances incur continuous compute charges while ACTIVE, regardless
-    of whether any notebooks or kernels are running. GPU-backed instances cost
-    $300–$2,900+/month. Data scientists frequently leave instances running after
-    a sprint ends, a project is deprioritised, or when they switch to a new instance.
+    Find Vertex AI Workbench instances with documented kernel inactivity.
 
-    Detection logic:
-    - Instance state is ACTIVE (only ACTIVE instances incur compute charges)
-    - updateTime is older than idle_days — no configuration or lifecycle changes
+    Currently EMITTING_DISABLED: no qualifying canonical signal exists for
+    per-instance kernel activity. updateTime and createTime MUST NOT be used
+    as idle signals.
 
-    updateTime is updated by the Notebooks API when:
-    - The instance is started, stopped, or restarted via the console or API
-    - Instance configuration is modified (machine type, accelerators, etc.)
-    - Scripts or scheduled operations modify instance metadata
-
-    Instances with old updateTime have had no control-plane activity.
-    This mirrors the signal used by SageMaker LastModifiedTime and
-    Azure ML compute instance last_modified_at.
-
-    Confidence:
-    - HIGH: updateTime >= idle_days ago AND age >= idle_days
-    - MEDIUM: updateTime >= 75% of idle_days AND age >= 75% of idle_days
+    Discovery still runs, but this always returns [] until a qualifying signal exists.
 
     IAM permissions required:
-    - notebooks.instances.list (roles/notebooks.viewer)
+        notebooks.instances.list (roles/notebooks.viewer)
     """
-    # Guard against caller passing 0
-    idle_days = max(idle_days, 1)
+    if idle_days < 1:
+        raise ValueError(f"idle_days must be >= 1, got {idle_days!r}")
 
     session = AuthorizedSession(credentials)
-    now = datetime.now(timezone.utc)
-    findings: List[Finding] = []
-
-    instances = _list_instances(session, project_id)
-
-    for raw in instances:
-        inst = _normalize(raw)
-        name = inst["name"]
-        state = inst["state"]
-        location = inst["location"]
-
-        if region_filter and location.lower() != region_filter.lower():
-            continue
-
-        # Only ACTIVE instances incur compute charges
-        if state != "ACTIVE":
-            continue
-
-        # Age calculation
-        age_days: Optional[int] = None
-        create_time_str = inst["create_time"]
-        if create_time_str:
-            try:
-                created_at = datetime.fromisoformat(create_time_str.replace("Z", "+00:00"))
-                if created_at.tzinfo is None:
-                    created_at = created_at.replace(tzinfo=timezone.utc)
-                age_days = (now - created_at).days
-            except ValueError:
-                pass
-
-        # Skip instances younger than half the idle threshold
-        if age_days is not None and age_days < max(idle_days // 2, 7):
-            continue
-
-        # Idle signal: updateTime (control-plane last activity)
-        idle_since_days: Optional[int] = None
-        update_time_str = inst["update_time"]
-        if update_time_str:
-            try:
-                updated_at = datetime.fromisoformat(update_time_str.replace("Z", "+00:00"))
-                if updated_at.tzinfo is None:
-                    updated_at = updated_at.replace(tzinfo=timezone.utc)
-                idle_since_days = (now - updated_at).days
-            except ValueError:
-                pass
-
-        # Fall back to age when updateTime is unavailable
-        using_age_fallback = idle_since_days is None
-        if idle_since_days is None:
-            idle_since_days = age_days if age_days is not None else idle_days
-
-        effective_age = age_days if age_days is not None else idle_since_days
-
-        # Confidence thresholds
-        threshold_high = idle_days
-        threshold_medium = int(idle_days * 0.75)
-
-        # Age-fallback findings are capped at MEDIUM — updateTime absence is not
-        # evidence of idleness by itself.
-        if (
-            not using_age_fallback
-            and idle_since_days >= threshold_high
-            and effective_age >= threshold_high
-        ):
-            confidence = ConfidenceLevel.HIGH
-        elif idle_since_days >= threshold_medium and effective_age >= threshold_medium:
-            confidence = ConfidenceLevel.MEDIUM
-        else:
-            continue
-
-        machine_type = inst["machine_type"]
-        accel_type = inst["accel_type"]
-        accel_count = inst["accel_count"]
-        labels = inst["labels"]
-        instance_id = name.split("/")[-1] if name else ""
-
-        is_gpu = accel_type in _GPU_ACCELERATORS or (machine_type or "").startswith(("a2-", "g2-"))
-
-        monthly_cost = _estimate_cost(machine_type, accel_type, accel_count)
-
-        idle_ratio = round(idle_since_days / idle_days, 2) if idle_days > 0 else 0.0
-        if is_gpu and idle_ratio >= 2.0:
-            risk = RiskLevel.CRITICAL
-        elif is_gpu:
-            risk = RiskLevel.HIGH
-        else:
-            risk = RiskLevel.MEDIUM
-
-        idle_signal_source = "age_fallback" if using_age_fallback else "update_time"
-        activity_source = "age (fallback)" if using_age_fallback else "updateTime"
-
-        signals = [
-            "Instance state: ACTIVE",
-            f"Last control-plane activity: {idle_since_days} days ago ({activity_source})",
-        ]
-        if age_days is not None:
-            signals.append(f"Instance age: {age_days} days")
-        if machine_type:
-            signals.append(f"Machine type: {machine_type}")
-        if is_gpu and accel_type:
-            signals.append(f"Accelerator: {accel_type} x {accel_count}")
-        if is_gpu:
-            accel_label = "TPU-backed" if (accel_type or "").startswith("TPU_") else "GPU-backed"
-            signals.append(
-                f"{accel_label} instance — high continuous cost (~${monthly_cost:,.0f}/month)"
-            )
-        if using_age_fallback:
-            signals.append(
-                "updateTime unavailable — age used as fallback signal; "
-                "confidence capped at MEDIUM"
-            )
-
-        not_checked = [
-            "Active kernel sessions not captured by updateTime (requires Cloud Monitoring agent)",
-            "Scheduled notebook runs via Cloud Scheduler or Vertex AI Pipelines",
-            "Planned future use by the assigned user",
-            f"Idle shutdown policy configured on the instance — may auto-stop before {idle_days} days",
-        ]
-
-        evidence = Evidence(
-            signals_used=signals,
-            signals_not_checked=not_checked,
-            time_window=f"{idle_since_days} days",
-        )
-
-        is_tpu = (accel_type or "").startswith("TPU_")
-        if is_gpu:
-            accel_kind = "TPU" if is_tpu else "GPU"
-            title = (
-                f"Idle {accel_kind}-Backed Workbench Instance "
-                f"(>{idle_days} Days Idle, {idle_since_days} Days Since Activity)"
-            )
-        else:
-            title = (
-                f"Idle Vertex AI Workbench Instance "
-                f"(>{idle_days} Days Idle, {idle_since_days} Days Since Activity)"
-            )
-
-        if is_gpu:
-            accel_prefix = "TPU-backed " if is_tpu else "GPU-backed "
-        else:
-            accel_prefix = ""
-        summary = (
-            f"{accel_prefix}Vertex AI Workbench instance '{instance_id}' "
-            f"in '{location}' has had no control-plane activity for {idle_since_days} days "
-            f"but remains ACTIVE, incurring continuous charges "
-            f"(~${monthly_cost:,.0f}/month)."
-        )
-
-        findings.append(
-            Finding(
-                provider="gcp",
-                rule_id="gcp.vertex.workbench.idle",
-                resource_type="gcp.vertex.workbench.instance",
-                resource_id=name,
-                region=location,
-                estimated_monthly_cost_usd=monthly_cost,
-                title=title,
-                summary=summary,
-                reason=(
-                    f"Workbench instance has had no control-plane activity "
-                    f"for {idle_since_days} days while ACTIVE"
-                ),
-                risk=risk,
-                confidence=confidence,
-                detected_at=now,
-                evidence=evidence,
-                details={
-                    "instance_id": instance_id,
-                    "location": location,
-                    "machine_type": machine_type,
-                    "accelerator_type": accel_type or None,
-                    "accelerator_count": accel_count,
-                    "is_gpu": is_gpu,
-                    "age_days": age_days if age_days is not None else "unknown",
-                    "idle_since_days": idle_since_days,
-                    "idle_days_threshold": idle_days,
-                    "idle_ratio": idle_ratio,
-                    "idle_signal_source": idle_signal_source,
-                    "estimated_monthly_cost": f"~${monthly_cost:,.0f}/month",
-                    "cost_basis": "us-central1 baseline estimate",
-                    "labels": labels,
-                    "api_version": "v2",
-                },
-            )
-        )
-
-    return findings
+    _list_instances(session, project_id)
+    return []
 
 
 find_idle_workbench_instances.RULE_ID = "gcp.vertex.workbench.idle"
 
 
-def _list_instances(session: AuthorizedSession, project_id: str) -> list:
+def _list_instances(
+    session: AuthorizedSession,
+    project_id: str,
+) -> tuple:
     """
     List all Vertex AI Workbench instances across all locations using the v2 API.
 
     Uses the locations/- wildcard for a single paginated call covering all regions.
-
-    Raises PermissionError on 403. Returns [] on 404 (API not enabled).
+    Exhausts pagination via nextPageToken.
+    Collects unreachable[] locations reported by the API.
+
+    Returns (instances, unreachable_locations, discovery_failed):
+        instances:             raw instance dicts from the API
+        unreachable_locations: locations the API reported as unreachable
+        discovery_failed:      True when an HTTP 400, a 5xx server error, or a
+                               network error made enumeration incomplete.
+
+    Error handling:
+        403: raises PermissionError (user-actionable; propagates up)
+        404: API not enabled; returns ([], [], False) — clean empty scope
+        400: bad request or wildcard unsupported; warns, returns ([], [], True)
+        5xx: transient server error; warns, returns partial results with True
+        network error: warns, returns partial results with True
     """
-    results = []
-    url = f"https://notebooks.googleapis.com/v2/projects/{project_id}/locations/-/instances"
+    results: list = []
+    unreachable: list = []
+    discovery_failed = False
+    url = (
+        f"https://notebooks.googleapis.com/v2"
+        f"/projects/{project_id}/locations/-/instances"
+    )
     params: dict = {"pageSize": 100}
 
     while True:
         try:
             resp = session.get(url, params=params)
-        except Exception:
-            break  # network error — skip, don't abort project scan
+        except Exception as exc:
+            warnings.warn(
+                f"gcp.vertex.workbench.idle: network error fetching instances for "
+                f"project '{project_id}' ({type(exc).__name__}: {exc}) — "
+                "discovery incomplete",
+                UserWarning,
+                stacklevel=3,
+            )
+            discovery_failed = True
+            break
+
         if resp.status_code == 403:
             raise PermissionError(
                 "notebooks.instances.list permission required (roles/notebooks.viewer)"
             )
-        if resp.status_code in (404, 400):
-            return []  # API not enabled for this project
+
+        if resp.status_code == 404:
+            return [], [], False
+
+        if resp.status_code == 400:
+            warnings.warn(
+                f"gcp.vertex.workbench.idle: HTTP 400 from Notebooks API for project "
+                f"'{project_id}' — discovery incomplete",
+                UserWarning,
+                stacklevel=3,
+            )
+            return [], [], True
+
         if resp.status_code >= 500:
-            break  # transient server error — skip rather than abort scan
+            warnings.warn(
+                f"gcp.vertex.workbench.idle: server error (HTTP {resp.status_code}) "
+                f"for project '{project_id}' — discovery incomplete",
+                UserWarning,
+                stacklevel=3,
+            )
+            discovery_failed = True
+            break
+
         resp.raise_for_status()
         data = resp.json()
-        for inst in data.get("instances", []):
-            inst["_api_version"] = "v2"
-            results.append(inst)
+
+        results.extend(data.get("instances", []))
+
+        for loc in data.get("unreachable", []):
+            if loc and loc not in unreachable:
+                unreachable.append(loc)
+
         next_token = data.get("nextPageToken")
         if not next_token:
             break
         params["pageToken"] = next_token
 
-    return results
-
-
-def _normalize(instance: dict) -> dict:
-    """
-    Normalize a v2 Workbench instance dict to a common schema.
-
-    machineType lives under gceSetup.machineType (short name).
-    Accelerators under gceSetup.acceleratorConfigs (list).
-    """
-    name = instance.get("name", "")
-
-    # Extract location from resource name:
-    # projects/{proj}/locations/{loc}/instances/{id}
-    parts = name.split("/")
-    location = parts[3] if len(parts) > 3 else ""
-
-    gce = instance.get("gceSetup", {})
-    machine_type = gce.get("machineType", "")
-    accels = gce.get("acceleratorConfigs", [])
-    accel_type = accels[0].get("type", "") if accels else ""
-    accel_count = int(accels[0].get("coreCount", 0) or 0) if accels else 0
-
-    if accel_type == "ACCELERATOR_TYPE_UNSPECIFIED":
-        accel_type = ""
-
-    return {
-        "name": name,
-        "location": location,
-        "state": instance.get("state", ""),
-        "create_time": instance.get("createTime", ""),
-        "update_time": instance.get("updateTime", ""),
-        "machine_type": machine_type,
-        "accel_type": accel_type,
-        "accel_count": accel_count,
-        "labels": instance.get("labels", {}),
-    }
-
-
-def _estimate_cost(
-    machine_type: Optional[str],
-    accel_type: Optional[str],
-    accel_count: int,
-) -> float:
-    """
-    Estimate monthly cost for one always-on Workbench instance.
-
-    a2-* and g2-* machine types bundle GPU cost — no separate add-on.
-    n1-*/n2-* machines add GPU cost separately.
-    """
-    machine_cost = _MACHINE_MONTHLY_COST.get(machine_type or "", _DEFAULT_MACHINE_MONTHLY_COST)
-
-    gpu_addon = 0.0
-    if accel_type and accel_type in _GPU_MONTHLY_COST_EACH:
-        is_gpu_machine = (machine_type or "").startswith(("a2-", "g2-"))
-        if not is_gpu_machine:
-            gpu_addon = _GPU_MONTHLY_COST_EACH[accel_type] * max(accel_count, 1)
-
-    return machine_cost + gpu_addon
+    return results, unreachable, discovery_failed
diff --git a/docs/rules/gcp.md b/docs/rules/gcp.md
index 6360e10..8426f48 100644
--- a/docs/rules/gcp.md
+++ b/docs/rules/gcp.md
@@ -12,7 +12,7 @@
 | `gcp.compute.ip.unused` | Network | Reserved static IPs in RESERVED state |
 | `gcp.sql.instance.idle` | Platform | Cloud SQL instances with zero connections 14+ days |
 | `gcp.vertex.endpoint.idle` | AI/ML | Vertex AI endpoints with an always-deployed serving floor and zero observed request activity 14+ days |
-| `gcp.vertex.workbench.idle` | AI/ML | Vertex AI Workbench instances with no activity 14+ days |
+| `gcp.vertex.workbench.idle` | AI/ML | Vertex AI Workbench instances — currently dormant (EMITTING_DISABLED); no qualifying canonical kernel-activity signal exists |
 | `gcp.vertex.training_job.long_running` | AI/ML | Vertex AI jobs running beyond threshold |
 | `gcp.tpu.idle` | AI/ML | Standalone Cloud TPU nodes in READY state with monitoring-based idle detection; currently no findings emit until worker-to-node join is documented |
 | `gcp.vertex.featurestore.idle` | AI/ML | Vertex AI Feature Stores (legacy) and Bigtable-backed Feature Online Stores with zero serving requests 30+ days (Monitoring-confirmed only) |
@@ -181,17 +181,35 @@
 **Spec:** [docs/specs/gcp/ai/vertex_endpoint_idle.md](../specs/gcp/ai/vertex_endpoint_idle.md)
 
 #### `gcp.vertex.workbench.idle`
-**Detects:** Vertex AI Workbench instances `ACTIVE` with no control-plane activity (`updateTime`) for `idle_days`
+**Current status: EMITTING_DISABLED.** No findings are emitted. No qualifying canonical kernel-activity signal exists that can prove per-instance notebook or kernel inactivity. `updateTime`, `createTime`, instance age, and CPU utilization are explicitly non-canonical idle signals for this rule (spec 3.3, 8.3). When a documented first-party per-instance Workbench activity surface becomes available (e.g. Cloud Logging kernel/session activity logs or a Cloud Monitoring metric with kernel-idle semantics), the rule can be activated without changing its candidate-selection logic.
 
-**Confidence / Risk:** HIGH (`updateTime` ≥ `idle_days` + age ≥ `idle_days`); MEDIUM (`updateTime` ≥ 75% of threshold or unavailable) / CRITICAL (GPU + `idle_ratio ≥ 2.0`); HIGH (GPU-backed); MEDIUM (CPU-only)
+**Detects (future):** Vertex AI Workbench instances in `ACTIVE` state with documented per-instance kernel inactivity over the `idle_days` window, confirmed by a qualifying first-party canonical signal
+
+**Returns:** `RuleResult` with structured runtime state (spec 12.1)
+- `rule_capability_state = EMITTING_DISABLED`
+- `scan_scope_state = PARTIAL` when `unreachable[]` locations are reported or a discovery failure occurs (400 / 5xx / network error); `FULL` otherwise
+- `resource_evaluation_state = NOT_EVALUABLE` (reason_code `NO_SIGNAL`) when valid `ACTIVE` instances exist; `EVALUABLE` when no candidates are found (including the 404/API-not-enabled case)
+- `not_evaluable_resources[]` — all `ACTIVE` candidate instances, each with `reason_code = NO_SIGNAL`
+- `not_evaluable_scopes[]` — unreachable locations from `ListInstancesResponse.unreachable[]`, each with `reason_code = COVERAGE`
+
+**Cost:** `estimated_monthly_cost_usd = None` — rule does not emit findings; no estimate is computed
 
 **Permissions:** `notebooks.instances.list` (roles/notebooks.viewer)
 
-**Params:** `idle_days` (default: 14)
+**Params:** `idle_days` (default: 14; must be ≥ 1 — fails fast on invalid input)
+
+**Exclusions:**
+- `INVALID` (counted in `excluded_invalid_resources_count`): resource name absent or not matching exact pattern `projects/{p}/locations/{l}/instances/{id}`; `state` absent or empty
+- `OUT_OF_SCOPE` (silent, not counted): valid resources in any non-`ACTIVE` state (`STOPPED`, `SUSPENDED`, etc.)
+- Region filter: exact string equality; no case folding or aliasing
 
-**Exclusions:** instances not in `ACTIVE` state
+**Discovery failure taxonomy:**
+- `404` — Notebooks API not enabled; `scan_scope_state = FULL`, `resource_evaluation_state = EVALUABLE` (provably no instances)
+- `400` — bad request or wildcard unsupported; `scan_scope_state = PARTIAL` (discovery incomplete)
+- `5xx` / network error — transient failure; `scan_scope_state = PARTIAL` (discovery incomplete)
+- `unreachable[]` — API-reported location gaps; `scan_scope_state = PARTIAL`; locations added to `not_evaluable_scopes[]` with `reason_code = COVERAGE`
 
-**Spec:** —
+**Spec:** [docs/specs/gcp/ai/workbench_idle.md](../specs/gcp/ai/workbench_idle.md)
 
 #### `gcp.vertex.training_job.long_running`
 **Detects:** Vertex AI CustomJobs and TrainingPipelines whose state is exactly the expected running state (`JOB_STATE_RUNNING` / `PIPELINE_STATE_RUNNING`) and whose elapsed wall-clock time since `startTime` meets or exceeds `long_running_hours_threshold`
diff --git a/docs/specs/gcp/ai/workbench_idle.md b/docs/specs/gcp/ai/workbench_idle.md
new file mode 100644
index 0000000..a88f072
--- /dev/null
+++ b/docs/specs/gcp/ai/workbench_idle.md
@@ -0,0 +1,668 @@
+# GCP Rule Spec - `gcp.vertex.workbench.idle`
+
+## 1. Rule Identity
+
+- **Rule ID:** `gcp.vertex.workbench.idle`
+- **Provider:** GCP
+- **Resource type:** Vertex AI Workbench Instance
+- **Finding resource_type:** `gcp.vertex.workbench.instance`
+
+---
+
+## 2. Intent
+
+Detect **Vertex AI Workbench instances that are provably still running** and have **documented first-party evidence of notebook/kernel inactivity** over a conservative review window.
+
+This rule is deliberately **precision-first**. It is a **review-candidate** rule only. It is **not** proof that an instance is safe to stop, **not** proof that no scheduled or background work exists, and **not** proof of a specific monthly dollar saving.
+
+This rule is a **proof-based** rule, not a heuristic rule. In its current canonical form it is **currently dormant pending signal availability**: it is non-emitting unless a documented canonical activity signal path exists and passes the signal-availability gate.
+
+This rule is designed to prove idleness when a qualifying canonical signal exists; it does **not** suggest optimization by itself.
+
+### 2.1 Canonical definitions
+
+| Term | Definition |
+|---|---|
+| Workbench instance | Vertex AI Workbench v2 `Instance` resource `projects/{project}/locations/{location}/instances/{instance_id}` |
+| running instance | Instance whose `state` is exactly `ACTIVE` |
+| kernel inactivity | The documented idle-shutdown notion of inactivity: no kernel activity for the configured time period; running a cell or new notebook output resets the timer; CPU usage does not |
+| idle-shutdown configuration | Workbench metadata keys that control automatic shutdown behavior, especially `idle-timeout-seconds` and `enable-guest-attributes` |
+| canonical idle signal | A documented first-party signal that can prove absence of kernel activity for the review window under the canonical signal requirements |
+| activity signal source | The exact first-party source used for proof, such as a documented Workbench-attributable Cloud Logging or Cloud Monitoring surface |
+| review window end | `now_utc` |
+| review window start | `review_window_end_utc - (idle_days * 86400 seconds)` |
+| full observation window | `[review_window_start_utc, review_window_end_utc]`, usable only when the chosen canonical idle signal can cover the full window |
+| signal availability gate | The source is usable only when retention covers `idle_days`, the full observation window is continuously visible, and no permission gaps exist |
+| invalid resource record | Record excluded from evaluation because required identity fields are missing, malformed, or unparsable |
+| out-of-scope resource record | Valid resource record excluded from evaluation because it does not satisfy in-scope lifecycle conditions for this rule |
+| not evaluable | Explicit outcome when no qualifying canonical signal exists or the signal-availability gate fails; this is not the same as "0 findings" |
+| not evaluable reason code | Root-cause category for a not-evaluable outcome: `NO_SIGNAL`, `PERMISSIONS`, or `COVERAGE` |
+| partial scan | Scan-level outcome for discovery-layer coverage gaps in the requested scope, such as when `unreachable[]` is reported; signal-quality failures alone are **not evaluable**, not partial, and MUST NOT change `scan_scope_state` |
+| rule capability state | Static rule capability: `EMITTING_DISABLED` or `EMITTING_ENABLED` |
+| scan scope state | Scope-level runtime state: `FULL` or `PARTIAL` |
+| resource evaluation state | Aggregate runtime state across valid in-scope resources: `EVALUABLE`, `NOT_EVALUABLE`, or `MIXED`; it is determined independently from discovery completeness |
+| reporting mode | Output mode for not-evaluable categories: `FULL_ENUMERATION` or `COUNT_ONLY` |
+| candidate resources | Valid, in-scope Workbench instances with `state = ACTIVE` after normalization and filtering |
+| `signal_coverage_start` | Placeholder for the earliest timestamp in the signal window actually used for proof |
+| `signal_coverage_end` | Placeholder for the latest timestamp in the signal window actually used for proof |
+
+---
+
+## 3. GCP Documentation Grounding
+
+### 3.1 Vertex AI Workbench `Instance` is the control-plane resource for this rule
+
+Google documents the Vertex AI Workbench v2 `Instance` resource with fields including:
+
+1. `name`
+2. `state`
+3. `createTime`
+4. `updateTime`
+5. `labels`
+6. `gceSetup`
+7. `gceSetup.machineType`
+8. `gceSetup.acceleratorConfigs`
+9. `gceSetup.metadata`
+10. `gceSetup.bootDisk`
+11. `gceSetup.dataDisks`
+
+Google also documents:
+
+1. `name` format: `projects/{projectId}/locations/{location}/instances/{instanceId}`
+2. `ACTIVE` means **the instance is running**
+3. `STOPPED` means the instance is stopped
+4. `SUSPENDED` means the instance is suspended
+5. `createTime` and `updateTime` are output-only timestamps on the instance resource
+
+Source:
+
+- *Resource: Instance*
+
+URL:
+
+- https://cloud.google.com/vertex-ai/docs/workbench/reference/rest/v2/projects.locations.instances
+
+Rule consequence:
+
+1. Eligibility must be based on documented `Instance` control-plane fields only.
+2. Exact state `ACTIVE` is the only in-scope running lifecycle state for this rule.
+3. `createTime` and `updateTime` are documented lifecycle/update timestamps, but Google does **not** document them as notebook-session or kernel-activity timestamps.
+4. Resource identity and region must come from the documented full resource name, not from display text or labels.
+
+### 3.2 The list API is paginated and can report unreachable locations
+
+Google documents `projects.locations.instances.list` with:
+
+1. `pageSize`
+2. `pageToken`
+3. `filter`
+4. `instances[]`
+5. `nextPageToken`
+6. `unreachable[]`
+
+Google documents `instances[]` and `unreachable[]` on `ListInstancesResponse`. The implementation must treat both fields as independently usable when present and must not assume mutual exclusivity unless Google documents that guarantee explicitly.
+
+Source:
+
+- *Method: projects.locations.instances.list*
+
+URL:
+
+- https://cloud.google.com/vertex-ai/docs/workbench/reference/rest/v2/projects.locations.instances/list
+
+Rule consequence:
+
+1. Pagination must be exhausted using `nextPageToken`.
+2. Reported `unreachable` locations mean visibility is incomplete for that read.
+3. If `unreachable[]` is present, the scan is **partial**.
+4. Each unreachable location is a **not evaluable scope** for that scan; any resources in that location are outside canonical evaluable coverage and MUST NOT produce findings.
+5. The rule may still emit findings for reachable locations within the requested scope, but the scan MUST remain `partial = true` and MUST surface the unreachable locations as `not_evaluable_scopes[]`.
+6. The rule must not claim complete project-wide idle evaluation when the list response reports unreachable locations.
+7. A future CleanCloud implementation may surface partial-scan status as a warning or exit-code signal, but canonical detection logic must already treat coverage as incomplete.
+
+### 3.3 Google defines Workbench idleness in terms of kernel activity, not control-plane timestamps
+
+Google documents Workbench idle shutdown as follows:
+
+1. Workbench instances shut down after a specified period of inactivity by default
+2. default idle-shutdown threshold is 180 inactive minutes
+3. idle shutdown requires guest attributes to be enabled
+4. the instance shuts down when there is **no kernel activity** for the specified time period
+5. running a notebook cell or new output printing resets the idle-shutdown timer
+6. CPU usage does **not** reset the idle-shutdown timer
+7. idle shutdown looks for activity in local Jupyter session, terminal, and kernel endpoints
+
+Source:
+
+- *Idle shutdown*
+
+URL:
+
+- https://cloud.google.com/vertex-ai/docs/workbench/instances/idle-shutdown
+
+Rule consequence:
+
+1. The canonical inactivity concept for this rule is **kernel inactivity**, not generic VM age, not `updateTime`, and not CPU utilization.
+2. `updateTime` must not be interpreted as "last notebook activity" or "last kernel activity".
+3. `createTime` must not be used as an idle fallback or as proof that an instance has been unused since creation.
+4. CPU or host activity metrics would not be canonical substitutes for notebook idleness without a separate documented contract, because Google explicitly distinguishes CPU usage from idle-shutdown activity.
+
+### 3.4 Workbench metadata documents idle-shutdown configuration, not actual last activity
+
+Google documents the following metadata keys for Workbench instances:
+
+1. `idle-timeout-seconds` - integer idle time in seconds; default `10800`
+2. `enable-guest-attributes` - required for idle shutdown; default `true`
+
+Google also documents:
+
+1. these metadata keys are managed through instance metadata
+2. `instances.patch` supports updates to `gceSetup.metadata`
+3. turning off idle shutdown is managed through metadata
+
+Sources:
+
+- *Manage metadata*
+- *Method: projects.locations.instances.patch*
+
+URLs:
+
+- https://cloud.google.com/vertex-ai/docs/workbench/instances/manage-metadata
+- https://cloud.google.com/vertex-ai/docs/workbench/reference/rest/v2/projects.locations.instances/patch
+
+Rule consequence:
+
+1. Idle-shutdown metadata is valid **configuration context** only.
+2. Metadata can explain why an instance may remain running, but it does **not** prove whether the instance has been idle or active over the review window.
+3. Presence, absence, or value changes of `idle-timeout-seconds` must not be treated as first-party evidence of recent or absent kernel activity.
+4. `enable-guest-attributes` is operational context only; it is not a direct activity signal.
+
+### 3.5 Workbench accelerator configuration is documented as GPU-only on this surface
+
+Google documents `gceSetup.acceleratorConfigs` and `AcceleratorConfig` with:
+
+1. `type`
+2. `coreCount`
+3. currently only one accelerator configuration is supported
+4. **TPUs are not supported**
+
+Source:
+
+- *Resource: Instance* (`GceSetup`, `AcceleratorConfig`, `AcceleratorType`)
+
+URL:
+
+- https://cloud.google.com/vertex-ai/docs/workbench/reference/rest/v2/projects.locations.instances
+
+Rule consequence:
+
+1. Hardware enrichment may use the documented accelerator configuration when present.
+2. This rule must not classify Workbench instances as TPU-backed from the documented `acceleratorConfigs` surface.
+3. Hardware is auxiliary context only; it is not canonical proof of idleness.
+
+### 3.6 Billing guidance distinguishes running compute from stopped storage-only cost
+
+Google documents that:
+
+1. while a Workbench instance is shut down, there are no CPU or GPU usage charges, except for scheduled executions that run during shutdown
+2. disk storage charges still apply while the instance is shut down
+
+Source:
+
+- *Idle shutdown*
+
+URL:
+
+- https://cloud.google.com/vertex-ai/docs/workbench/instances/idle-shutdown
+
+Rule consequence:
+
+1. `ACTIVE` instances are the relevant compute-cost surface for this rule.
+2. `STOPPED` or `SUSPENDED` instances are out of scope for this idle-compute rule, although storage cost can still remain.
+3. The rule must not hardcode a fixed monthly estimate from static machine-price tables as canonical logic.
+4. `estimated_monthly_cost_usd` should remain `None` unless a future implementation computes current pricing from authoritative region- and configuration-specific pricing inputs.
+
+---
+
+## 4. Detection Goal
+
+Emit a finding only when **all** of the following are true:
+
+1. the resource is a documented Workbench `Instance`
+2. the resource name is valid and the location is parseable from it
+3. if a region filter is set, it matches the normalized location exactly
+4. the resource state is exactly `ACTIVE`
+5. the resource satisfies the canonical idle signal requirements in section 8
+
+If any required canonical signal condition cannot be established reliably, the resource is **not evaluable** for this rule version and MUST NOT produce findings.
+
+**Current canonical status:** based on the documented surfaces referenced in this spec, no qualifying canonical signal exists that exposes per-instance last kernel activity or a kernel-idle time series suitable for this rule. Therefore the rule is currently in `EMITTING_DISABLED` mode and must not emit findings from control-plane timestamps alone until a qualifying signal path is documented and usable.
+
+### 4.1 Current canonical decision flow
+
+Implement the current version in this order:
+
+1. capture `now_utc` once for the scan
+2. list Workbench instances for the requested scope and exhaust pagination
+3. if `unreachable[]` is reported, set `partial = true` and record each unreachable location in `not_evaluable_scopes[]`
+4. normalize returned records; count invalid records in `excluded_invalid_resources_count` and exclude out-of-scope records before candidate resource formation
+5. keep only valid in-scope `ACTIVE` instances as candidate resources
+6. if there are no candidate resources:
+   - set `resource_evaluation_state = EVALUABLE`
+   - emit no findings
+7. otherwise, as defined in section 4, classify those candidate resources as `not_evaluable_resources[]`
+8. set `rule_capability_state = EMITTING_DISABLED`
+9. in the current version, candidate resources default to `reason_code = NO_SIGNAL` because no qualifying canonical signal exists
+10. in the current version, all candidate resources share the same evaluation outcome; therefore `resource_evaluation_state` cannot be `MIXED`
+11. set `resource_evaluation_state = NOT_EVALUABLE` for the current version when candidate resources exist
+12. emit no findings
+
+`scan_scope_state` is determined exclusively by discovery-layer reachability, such as `unreachable[]`, and MUST NOT be changed by resource-level signal evaluation outcomes. Resource evaluation MUST remain independent of discovery completeness for reachable candidate resources.
+
+If a future qualifying canonical signal path is documented, the implementation may continue from step 5 by applying section 8 to reachable candidate resources only.
+
+---
+
+## 5. Non-Goals
+
+This rule does **not** attempt to prove:
+
+- that an old `updateTime` means the notebook is idle
+- that an old `createTime` means the notebook has been unused
+- that low CPU usage means the notebook is idle
+- that the configured idle-shutdown timeout has already been exceeded
+- that the instance is safe to stop
+- that no scheduled executions or other intentional automation exist
+- that a specific monthly saving exists
+
+---
+
+## 6. Canonical Inputs
+
+### 6.1 Required surfaces
+
+The implementation may use the following documented APIs and docs-backed fields:
+
+1. `projects.locations.instances.list`
+2. `Instance.name`
+3. `Instance.state`
+4. `Instance.gceSetup.metadata`
+5. `Instance.gceSetup.machineType`
+6. `Instance.gceSetup.acceleratorConfigs`
+7. `Instance.createTime`
+8. `Instance.updateTime`
+
+### 6.2 Future activation path: conditional canonical activity signal sources
+
+Canonical qualification is defined only in section 8.
+
+These source classes are permitted only when explicitly documented by Google as Workbench-attributable and semantically aligned with kernel activity.
+
+Permitted canonical source classes are:
+
+1. **Google Cloud Logging**, but only when Google documents Workbench-attributable logs as part of Vertex AI Workbench itself, with semantics that map to notebook or kernel activity for an exact instance rather than incidental infrastructure events
+2. notebook execution logs, when documented by Google as part of Vertex AI Workbench activity evidence and attributable per instance
+3. kernel/session activity logs, but only when documented by Google as part of Vertex AI Workbench, attributable to the exact instance, and semantically tied to the Workbench idle definition
+4. **Google Cloud Monitoring**, but only when Google documents a Workbench-specific metric/resource contract whose semantics map to kernel activity or documented absence of kernel activity rather than VM utilization
+
+The consulted control-plane docs do **not** provide such a signal on the `Instance` resource itself, and this spec does **not** currently establish a qualifying Logging or Monitoring signal path. This section is therefore a **future activation path**, not an active emit path in the current version.
+
+### 6.3 Future activation path: signal-availability gate
+
+Even when a candidate signal source is documented, it is canonical only when all of the following are true:
+
+1. retention for the chosen log or metric source is at least `idle_days`
+2. the exact source is explicitly documented to provide continuous, gap-free coverage across the full observation window, with no sampling, ingestion, or visibility gaps
+3. there are no permission gaps for the source over the evaluated scope
+4. any required location-level reads are reachable for the evaluated scope
+
+In a future activated version, implementations may rely only on explicit Google documentation or source contracts to establish these properties; they must not infer completeness, cadence, or gap-freeness heuristically.
+
+If any condition fails, the resource or affected scope is **not evaluable**. The implementation must not silently treat missing, partial, or gap-ridden telemetry as equivalent to zero activity, and must not infer idle from "no events found".
+
+### 6.4 Optional context fields
+
+These may enrich a future finding but are not themselves eligibility signals:
+
+- `labels`
+- `creator`
+- `instanceOwners`
+- `healthState`
+- `gceSetup.machineType`
+- `gceSetup.acceleratorConfigs`
+- `gceSetup.bootDisk`
+- `gceSetup.dataDisks`
+- `gceSetup.metadata`
+- `createTime`
+- `updateTime`
+
+---
+
+## 7. Canonical normalization rules
+
+Normalize the following values:
+
+| Field | Canonical rule |
+|---|---|
+| `resource_name` | Must exactly match `projects/{project}/locations/{location}/instances/{instance_id}`. Otherwise treat the record as invalid and exclude it from evaluation and findings. |
+| `location` | Parse from the exact `locations/{location}` segment of the resource name. Region-filter comparison must use exact string equality only, with no aliasing or case folding. |
+| `state` | Compare exactly to documented enum `ACTIVE`, case-sensitive and with no normalization. Null or empty values make the record invalid for this rule. |
+| `now_utc` | Capture once per scan run in UTC and reuse for all resources in that run. |
+| `metadata.idle-timeout-seconds` | Context only. If used, parse as an integer number of seconds or treat as unusable context. It must not be used as a substitute for observed inactivity. |
+| `create_time_utc` | Optional context only. If parsed, require strict RFC3339. Parse failure removes context; it must not trigger fallback idle logic. |
+| `update_time_utc` | Optional context only. If parsed, require strict RFC3339. Parse failure removes context; it must not trigger fallback idle logic. |
+
+Important:
+
+1. `updateTime` is **not** the canonical last-activity field.
+2. `createTime` is **not** an idle fallback.
+3. The rule must not derive `idle_since_days` from `updateTime` or `createTime`.
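+4. Normalization failures of required identity or state fields (`resource_name`, `location`, `state`) are invalid-resource exclusions, not **not evaluable** outcomes; parse failures of context-only fields merely remove that context.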
+5. Invalid or out-of-scope resource records are excluded before evaluation and MUST NOT appear in `not_evaluable` outputs.
+
+---
+
+## 8. Future activation path: activity evidence rules
+
+### 8.1 Canonical idle signal requirements
+
+The rule may use an activity signal only when all of the following are true:
+
+1. the signal is documented by Google
+2. the signal is attributable to the exact Workbench instance being evaluated
+3. the signal maps to Workbench's documented kernel-activity semantics, or the documented absence of such activity, rather than generic VM host utilization
+4. the signal has either an explicit documented inactivity contract or a documented last-activity contract with completeness guarantees, and it supports proving absence of activity; inactivity must not be inferred from missing events
+5. the signal is resolved at per-instance scope, not only at project or aggregated scope
+6. the signal passes the signal-availability gate
+
+Examples of qualifying contracts include:
+
+1. a documented per-instance Workbench metric explicitly defined as an idle-state signal for notebook or kernel activity
+2. a documented per-instance Workbench field or metric explicitly defined as a last notebook or kernel activity timestamp, where Google documents completeness guarantees for the full observation window
+
+### 8.2 Conditional source-path allowances
+
+The following source paths may be used **only if fully compliant with section 8.1**:
+
+1. Google Cloud Logging notebook execution logs
+2. Google Cloud Logging kernel/session activity logs
+3. Google Cloud Monitoring metrics that are explicitly documented against Workbench activity semantics
+
+Allowing these source classes does **not** mean they are currently established as canonical for this rule.
+
+### 8.3 Global exclusion list: non-canonical signals
+
+The following are **not** canonical idle signals for this rule:
+
+1. `updateTime`
+2. `createTime`
+3. instance age alone
+4. idle-shutdown metadata values alone
+5. machine type or accelerator configuration
+6. generic CPU, GPU, memory, or network host utilization without a separate documented contract equating it to Workbench kernel inactivity
+7. partial-window logs or metrics
+8. aggregated or project-level signals that cannot be attributed to the exact instance
+9. Cloud Monitoring host or VM utilization used as a proxy for notebook or kernel activity
+10. "no events found" or "no logs returned" treated as proof of idleness
+11. fallback to heuristics when a qualifying canonical signal is missing
+
+For this spec, "proof" means an explicit Google-documented inactivity or last-activity contract with completeness guarantees across the full observation window. Proof is never inferred from sparse, partial, or missing events.
+
+### 8.4 Idle-shutdown configuration is context only
+
+Idle-shutdown configuration may be used to explain or enrich behavior, for example:
+
+1. idle shutdown default exists
+2. `enable-guest-attributes` is required
+3. the configured timeout may be visible
+
+But this configuration does **not** prove:
+
+1. whether the instance actually experienced no kernel activity
+2. whether idle shutdown ran successfully
+3. whether the timer has or has not been reset within the review window
+
+---
+
+## 9. Decision rule
+
+### 9.1 Eligibility
+
+The resource is eligible only when:
+
+1. resource type is Workbench `Instance`
+2. `state` is exactly `ACTIVE`
+3. the resource satisfies the canonical idle signal requirements in section 8
+
+Configuration requirement:
+
+1. `idle_days` must be `>= 1`
+2. invalid threshold configuration must fail fast rather than silently clamp or reinterpret the value
+
+### 9.2 Current canonical outcome
+
+Under the currently consulted official docs, the canonical implementation follows the decision flow in section 4.1.
+
+In the current version:
+
+1. unreachable requested locations make the scan `partial` and populate `not_evaluable_scopes[]`
+2. valid in-scope `ACTIVE` resources remain **not evaluable** as defined in section 4
+3. findings remain empty until a documented qualifying signal path exists and satisfies section 8 for reachable resources
+
+Absence of signal MUST NOT be interpreted as inactivity.
+
+Important:
+
+1. **not evaluable** is a separate first-class outcome, not a synonym for `0 findings`
+2. the rule may return zero findings even when `ACTIVE` instances exist
+
+### 9.3 Explicitly forbidden heuristics
+
+The rule must **not**:
+
+- emit from `updateTime` age alone
+- emit from `createTime` age alone
+- emit from an age fallback when `updateTime` is absent
+- infer notebook inactivity from low CPU usage
+- infer notebook inactivity from machine type or accelerator presence
+- emit because idle shutdown is disabled or appears unconfigured
+- fall back to heuristics if a qualifying canonical signal is missing
+
+---
+
+## 10. Cost handling
+
+### 10.1 Canonical monthly cost field
+
+`estimated_monthly_cost_usd = None`
+
+Reason:
+
+1. the canonical spec does not currently emit findings
+2. authoritative Workbench cost depends on running compute shape, attached accelerators, disks, region, and usage option
+3. stopped instances still incur disk charges, so simplistic compute-only estimates are incomplete
+
+### 10.2 Future advisory cost hints
+
+If a future implementation chooses to surface an advisory cost hint, it must:
+
+1. be clearly labeled non-canonical advisory context
+2. use authoritative current pricing inputs for the exact region and configuration
+3. distinguish running compute from persistent disk charges
+4. never affect eligibility
+
+---
+
+## 11. Failure behavior
+
+### 11.1 Invalid or out-of-scope resource exclusion
+
+Exclude from evaluation and findings:
+
+- empty resource names
+- resource names that do not exactly match the documented instance pattern
+- `state` absent or empty
+- resources in non-`ACTIVE` states
+
+Use this exclusion taxonomy:
+
+| Category | Meaning | Counted in `excluded_invalid_resources_count` |
+|---|---|---|
+| `INVALID` | malformed, missing, or unparsable required identity or state fields | yes |
+| `OUT_OF_SCOPE` | valid resource record that is not in the rule's lifecycle scope, including non-`ACTIVE` resources | no |
+
+Records with absent or empty `state` are `INVALID`. Resources in non-`ACTIVE` states are `OUT_OF_SCOPE`: they are valid but excluded from evaluation and MUST NOT be counted in `excluded_invalid_resources_count`.
+
+`excluded_invalid_resources_count` excludes `OUT_OF_SCOPE` records by design.
+
+Out-of-scope resources are excluded before candidate resource formation.
+
+These are not **not evaluable** outcomes.
+
+### 11.2 Not evaluable taxonomy
+
+Resources and scopes that match any reason code below are classified as **not evaluable** and MUST NOT produce findings.
+
+Section 12 is the authoritative runtime contract for `scan_scope_state`, `resource_evaluation_state`, `partial`, and reporting behavior. This section defines only the taxonomy and reason-code classification used by not-evaluable records.
+
+Use the following reason codes:
+
+| Reason code | Meaning |
+|---|---|
+| `NO_SIGNAL` | No qualifying canonical signal path exists for the resource or requested reachable scope |
+| `PERMISSIONS` | Required permissions for the qualifying signal source are missing or incomplete |
+| `COVERAGE` | Coverage is incomplete for the qualifying signal source or requested scope, including unreachable locations and partial observation windows |
+
+If more than one reason applies, select the primary `reason_code` using this precedence:
+
+1. `PERMISSIONS`
+2. `COVERAGE`
+3. `NO_SIGNAL`
+
+Implementations may retain additional secondary reasons as non-canonical context, but each `not_evaluable` record should expose one primary `reason_code`.
+
+When no qualifying canonical signal exists for the rule version, valid in-scope resources MUST use `reason_code = NO_SIGNAL`.
+
+In `EMITTING_DISABLED` mode, `NO_SIGNAL` is a synthetic default applied uniformly to candidate resources and does not represent per-resource evaluation variance.
+
+Apply them as follows:
+
+- `NO_SIGNAL`: resources for which no qualifying canonical signal exists
+- `COVERAGE`: resources for which signal retention is shorter than `idle_days`
+- `COVERAGE`: resources for which the candidate signal covers only part of the observation window
+- `NO_SIGNAL`, `PERMISSIONS`, or `COVERAGE`: resources for which the candidate signal fails the signal-availability gate, according to the underlying cause
+- `PERMISSIONS`: resources or scopes for which permissions are insufficient to evaluate the chosen signal source
+- `COVERAGE`: unreachable locations reported in the documented list response
+
+Runtime handling of these reason codes, including scope partiality, output separation, and state precedence, is defined in section 12.
+
+---
+
+## 12. Output contract
+
+### 12.1 Current runtime contract
+
+The implementation must preserve these rule-level outcomes separately:
+
+| Output | Meaning |
+|---|---|
+| `rule_capability_state` | Static capability state: `EMITTING_DISABLED` or `EMITTING_ENABLED` |
+| `scan_scope_state` | Scope-level runtime state: `FULL` or `PARTIAL` |
+| `resource_evaluation_state` | Aggregate runtime evaluation state across valid in-scope resources: `EVALUABLE`, `NOT_EVALUABLE`, or `MIXED` |
+| `findings[]` | Emitted findings only |
+| `partial` | `true` only for discovery-layer coverage gaps in the requested scope, including when `unreachable[]` is reported |
+| `excluded_invalid_resources_count` | Exact count of invalid resource records excluded before canonical evaluation |
+| `reporting_mode_not_evaluable_resources` | `FULL_ENUMERATION` or `COUNT_ONLY` |
+| `reporting_mode_not_evaluable_scopes` | `FULL_ENUMERATION` or `COUNT_ONLY` |
+| `not_evaluable_resources[]` | Valid in-scope resources that could not be evaluated under canonical signal requirements; each record should carry a `reason_code` |
+| `not_evaluable_scopes[]` | Scope-level not-evaluable records, including unreachable locations; each record should carry a `reason_code` |
+
+`partial = true` if and only if `scan_scope_state = PARTIAL`. `partial = false` if and only if `scan_scope_state = FULL`.
+
+`scan_scope_state` is determined exclusively by discovery-layer reachability and MUST NOT be upgraded or downgraded by signal-evaluation outcomes.
+
+All entries in `not_evaluable_scopes[]` derived from `unreachable[]` MUST use `reason_code = COVERAGE`.
+
+For each not-evaluable category, the implementation MUST choose exactly one reporting mode:
+
+1. `FULL_ENUMERATION` — return the complete set for that category
+2. `COUNT_ONLY` — return the exact full count for that category without full enumeration
+
+The implementation MUST NOT silently drop either category.
+
+Implementations SHOULD default to `FULL_ENUMERATION` unless payload size or platform constraints require `COUNT_ONLY`.
+
+If enumeration would make the payload unreasonably large, the implementation MAY use `COUNT_ONLY` for that category. If an exact full count cannot be established because of permission or coverage limits, the implementation MUST NOT claim `COUNT_ONLY`; instead, it MUST surface the affected scope or category as `PARTIAL` and/or `NOT_EVALUABLE` with the corresponding `reason_code`.
+
+In the current version, counts for `not_evaluable_resources` are always exact because they derive from fully enumerated candidate resources.
+
+The implementation MUST always retain an exact count for `excluded_invalid_resources_count`, even if individual excluded records are not returned in the payload.
+
+This current runtime contract describes the rule as it behaves today.
+
+### 12.2 Current canonical behavior
+
+The current canonical behavior is to return **no findings** as defined in section 4.
+
+This reflects `rule_capability_state = EMITTING_DISABLED`, not an accidental empty result.
+
+Interpretation:
+
+1. `0 findings` does **not** mean there are no idle Workbench instances
+2. `0 findings` means there are no instances provably idle under canonical signals accepted by this spec
+3. if signal availability or scope coverage is insufficient, the implementation should surface that the rule was **not evaluable** and/or **partial**, rather than implying complete negative coverage
+4. as defined in section 4, no findings will be emitted even for reachable locations
+5. even when `scan_scope_state = PARTIAL`, the current version MUST emit zero findings because no qualifying canonical signal exists
+6. `rule_capability_state = EMITTING_DISABLED` is appropriate in the current dormant version
+7. `scan_scope_state = PARTIAL` is appropriate only for discovery-layer coverage gaps, such as `unreachable[]`; signal evaluation gaps alone do not make the scan partial
+8. `resource_evaluation_state = EVALUABLE` is appropriate either when no valid in-scope reachable candidate resources exist after filtering, or when evaluation is attempted and all candidate resources satisfy the canonical signal preconditions
+9. `resource_evaluation_state = NOT_EVALUABLE` is appropriate when candidate resources exist but none of them can be evaluated under a qualifying canonical signal path
+10. `resource_evaluation_state = MIXED` is appropriate when some valid in-scope reachable resources are evaluable and others are not
+11. in the current dormant version, `resource_evaluation_state` will normally be `NOT_EVALUABLE`
+12. `resource_evaluation_state = MIXED` MUST NOT be emitted in `EMITTING_DISABLED` mode
+13. consumers SHOULD treat `NOT_EVALUABLE` as an unknown state requiring explicit surfacing rather than as a negative result or a retry guarantee
+14. when a single primary status must be displayed, consumers SHOULD prioritize `scan_scope_state = PARTIAL` over any resource evaluation state
+
+When no candidate resources exist, `EVALUABLE` indicates a valid reachable scope with no eligible resources present after filtering; no per-resource evaluation was required.
+
+### 12.3 Future enhancement schema
+
+If a future documented idle signal is added, the implementation may also populate the following finding fields:
+
+| Field | Value |
+|---|---|
+| `provider` | `gcp` |
+| `rule_id` | `gcp.vertex.workbench.idle` |
+| `category` | `ai` |
+| `severity` | Placeholder for future classification; must not affect canonical eligibility |
+| `confidence` | Placeholder for future classification; must not affect canonical eligibility |
+| `resource_type` | `gcp.vertex.workbench.instance` |
+| `resource_id` | Full Workbench instance resource name |
+| `region` | Parsed location from resource name |
+| `activity_signal_source` | Canonical source used for proof, such as a documented log or metric surface |
+| `signal_coverage_start` | Earliest timestamp covered by the exact signal window used for proof |
+| `signal_coverage_end` | Latest timestamp covered by the exact signal window used for proof |
+| `estimated_monthly_cost_usd` | `None` in canonical logic unless authoritative live pricing is added |
+
+These fields are dormant in the current version because the rule does not yet have a qualifying canonical signal path.
+
+---
+
+## 13. Implementation notes for future hardening
+
+This spec intentionally rejects the following as insufficient for canonical idle detection:
+
+1. `ACTIVE` + old `updateTime`
+2. `ACTIVE` + old `createTime`
+3. `ACTIVE` + disabled idle shutdown
+4. `ACTIVE` + low host utilization
+
+To make this rule emit canonically in the future, the implementation needs a documented first-party per-instance activity surface that is semantically aligned with Workbench's documented idle-shutdown notion of **kernel inactivity**.
+
+Likely future-enablement paths are:
+
+1. documented Google Cloud Logging notebook execution logs that Google defines as part of Vertex AI Workbench
+2. documented Google Cloud Logging kernel/session activity logs that Google defines as part of Vertex AI Workbench
+3. documented Google Cloud Monitoring metrics whose semantics map directly to Workbench kernel activity
+
+Any such path still requires full-window coverage, sufficient retention, exact per-instance attribution, and no permission or reachability gaps.
diff --git a/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py b/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py
index 3bf24ba..fd01236 100644
--- a/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py
+++ b/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py
@@ -1,404 +1,535 @@
 """
 Tests for gcp.vertex.workbench.idle rule.
 
+The rule is EMITTING_DISABLED and always returns an empty List[Finding].
+No qualifying canonical kernel-activity signal exists; updateTime, createTime,
+age, and CPU utilization are all explicitly non-canonical.
+
 Coverage:
-- Core detection: idle CPU instance (MEDIUM risk), idle GPU instance (HIGH risk)
-- Skipping: STOPPED instances, young instances, instances with recent activity
-- Confidence levels: HIGH (updateTime + age >= threshold), MEDIUM (75% threshold or age-fallback)
-- GPU detection: NVIDIA_TESLA_T4, NVIDIA_TESLA_A100, a2-* machines
-- Risk levels: CRITICAL (GPU + idle_ratio >= 2.0), HIGH (GPU), MEDIUM (CPU)
-- Cost estimation: machine cost, GPU add-on for n1/n2, bundled for a2/g2
-- Age-fallback: when updateTime unavailable, confidence capped at MEDIUM
-- Region filter: instances outside the filter are skipped
-- Both API versions: v1 (User-Managed Notebooks), v2 (Vertex AI Workbench)
-- Permission errors: PermissionError raised on 403 from list call
-- RULE_METADATA and RULE_ID attributes present
+  Public API (find_idle_workbench_instances):
+    - return type and value
+    - idle_days validation (zero, negative, boundary, error message)
+    - region_filter parameter accepted
+    - 403/404/400/5xx/network error handling
+    - warning type, message content (project, HTTP code, rule ID)
+
+  Internal (_list_instances):
+    - empty response
+    - instance accumulation
+    - pagination over 2 and 3 pages
+    - pageToken forwarded on subsequent requests
+    - pageSize=100 in initial request
+    - unreachable[] collected and deduplicated across pages
+    - empty unreachable entries skipped
+    - 404 returns clean ([], [], False)
+    - 400 returns ([], [], True)
+    - 5xx sets discovery_failed; preserves already-fetched instances
+    - network error sets discovery_failed; preserves already-fetched instances
+    - 403 raises PermissionError
+    - URL contains project ID and locations/- wildcard
 """
 
-from datetime import datetime, timedelta, timezone
+import warnings
 from unittest.mock import MagicMock, patch
 
 import pytest
 
-from cleancloud.core.confidence import ConfidenceLevel
-from cleancloud.core.risk import RiskLevel
 from cleancloud.providers.gcp.rules.ai.workbench_idle import (
-    _DEFAULT_MACHINE_MONTHLY_COST,
-    _GPU_MONTHLY_COST_EACH,
-    _MACHINE_MONTHLY_COST,
     RULE_METADATA,
-    _estimate_cost,
-    _normalize,
+    _list_instances,
     find_idle_workbench_instances,
 )
 
+_PROJECT = "my-project"
+
+
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
 
-NOW = datetime(2025, 6, 1, 12, 0, 0, tzinfo=timezone.utc)
-_PROJECT = "my-project"
-_LOCATION = "us-central1"
-_INSTANCE_ID = "my-workbench-1"
-_INSTANCE_NAME = f"projects/{_PROJECT}/locations/{_LOCATION}/instances/{_INSTANCE_ID}"
-
-_OLD_TIME = NOW - timedelta(days=30)
-_IDLE_TIME = NOW - timedelta(days=20)
-_RECENT_TIME = NOW - timedelta(days=3)
-_YOUNG_TIME = NOW - timedelta(days=2)
-
-
-def _ts(dt: datetime) -> str:
-    return dt.strftime("%Y-%m-%dT%H:%M:%SZ")
-
-
-def _v2_instance(
-    name: str = _INSTANCE_NAME,
-    state: str = "ACTIVE",
-    create_time: datetime = _OLD_TIME,
-    update_time: datetime = _IDLE_TIME,
-    machine_type: str = "n1-standard-4",
-    accel_type: str = "",
-    accel_count: int = 0,
-    labels: dict = None,
-) -> dict:
-    """Build a minimal v2 Workbench instance response dict."""
-    gce: dict = {"machineType": machine_type}
-    if accel_type:
-        gce["acceleratorConfigs"] = [{"type": accel_type, "coreCount": str(accel_count or 1)}]
-    return {
-        "name": name,
-        "state": state,
-        "createTime": _ts(create_time),
-        "updateTime": _ts(update_time),
-        "gceSetup": gce,
-        "labels": labels or {},
-        "_api_version": "v2",
-    }
-
-
-def _v1_instance(
-    name: str = _INSTANCE_NAME,
-    state: str = "ACTIVE",
-    create_time: datetime = _OLD_TIME,
-    update_time: datetime = _IDLE_TIME,
-    machine_type: str = "zones/us-central1-a/machineTypes/n1-standard-4",
-    accel_type: str = "",
-    accel_count: int = 0,
-    labels: dict = None,
-) -> dict:
-    """Build a minimal v1 User-Managed Notebook instance response dict."""
-    inst: dict = {
-        "name": name,
-        "state": state,
-        "createTime": _ts(create_time),
-        "updateTime": _ts(update_time),
-        "machineType": machine_type,
-        "labels": labels or {},
-        "_api_version": "v1",
-    }
-    if accel_type:
-        inst["acceleratorConfig"] = {
-            "type": accel_type,
-            "coreCount": str(accel_count or 1),
-        }
-    return inst
-
-
-def _mock_session(instances: list):
-    """Return a mock AuthorizedSession that returns the given instance list from v2 API."""
+
+def _ok(body: dict = None):
+    """Build a 200 response mock with the given JSON body."""
+    resp = MagicMock()
+    resp.status_code = 200
+    resp.json.return_value = body or {}
+    resp.raise_for_status.return_value = None
+    return resp
+
+
+def _err(status_code: int):
+    """Build an error response mock with the given status code."""
+    resp = MagicMock()
+    resp.status_code = status_code
+    return resp
+
+
+def _session(*responses):
+    """Build a mock session whose .get() returns responses in order."""
     mock = MagicMock()
-    response = MagicMock()
-    response.status_code = 200
-    response.json.return_value = {"instances": instances}
-    mock.get.return_value = response
+    mock.get.side_effect = list(responses)
     return mock
 
 
+def _invoke(**kwargs):
+    """
+    Call find_idle_workbench_instances with a default 200/empty mock session.
+    Extra kwargs are forwarded to the rule function.
+    """
+    with patch(
+        "cleancloud.providers.gcp.rules.ai.workbench_idle.AuthorizedSession",
+        return_value=_session(_ok()),
+    ):
+        return find_idle_workbench_instances(
+            project_id=_PROJECT, credentials=MagicMock(), **kwargs
+        )
+
+
+def _invoke_with_session(mock_session, **kwargs):
+    """Call find_idle_workbench_instances with a custom session mock."""
+    with patch(
+        "cleancloud.providers.gcp.rules.ai.workbench_idle.AuthorizedSession",
+        return_value=mock_session,
+    ):
+        return find_idle_workbench_instances(
+            project_id=_PROJECT, credentials=MagicMock(), **kwargs
+        )
+
+
 # ---------------------------------------------------------------------------
-# _normalize tests
+# Return type and value
 # ---------------------------------------------------------------------------
 
 
-class TestNormalize:
-    def test_v2_basic(self):
-        raw = _v2_instance()
-        norm = _normalize(raw)
-        assert norm["name"] == _INSTANCE_NAME
-        assert norm["location"] == _LOCATION
-        assert norm["state"] == "ACTIVE"
-        assert norm["machine_type"] == "n1-standard-4"
-        assert norm["accel_type"] == ""
-        assert norm["accel_count"] == 0
+class TestReturnValue:
+    def test_returns_list(self):
+        assert isinstance(_invoke(), list)
 
-    def test_v2_with_gpu(self):
-        raw = _v2_instance(accel_type="NVIDIA_TESLA_T4", accel_count=2)
-        norm = _normalize(raw)
-        assert norm["accel_type"] == "NVIDIA_TESLA_T4"
-        assert norm["accel_count"] == 2
+    def test_always_empty(self):
+        assert _invoke() == []
 
-    def test_unspecified_accel_normalized_to_empty(self):
-        raw = _v2_instance(accel_type="ACCELERATOR_TYPE_UNSPECIFIED")
-        norm = _normalize(raw)
-        assert norm["accel_type"] == ""
+    def test_empty_when_api_returns_active_instances(self):
+        """EMITTING_DISABLED: ACTIVE instances in API response still yield no findings."""
+        inst = {"name": f"projects/{_PROJECT}/locations/us-central1/instances/wb-1", "state": "ACTIVE"}
+        result = _invoke_with_session(_session(_ok({"instances": [inst]})))
+        assert result == []
 
-    def test_location_extracted_from_name(self):
-        name = "projects/p/locations/europe-west1/instances/i"
-        raw = {**_v2_instance(name=name), "name": name}
-        norm = _normalize(raw)
-        assert norm["location"] == "europe-west1"
+    def test_empty_when_api_returns_multiple_instances(self):
+        instances = [
+            {"name": f"projects/{_PROJECT}/locations/us-central1/instances/wb-{i}", "state": "ACTIVE"}
+            for i in range(5)
+        ]
+        result = _invoke_with_session(_session(_ok({"instances": instances})))
+        assert result == []
 
 
 # ---------------------------------------------------------------------------
-# _estimate_cost tests
+# idle_days validation
 # ---------------------------------------------------------------------------
 
 
-class TestEstimateCost:
-    def test_known_cpu_machine(self):
-        cost = _estimate_cost("n1-standard-4", "", 0)
-        assert cost == _MACHINE_MONTHLY_COST["n1-standard-4"]
+class TestIdleDaysValidation:
+    def test_zero_raises_value_error(self):
+        with pytest.raises(ValueError, match="idle_days must be >= 1"):
+            find_idle_workbench_instances(
+                project_id=_PROJECT, credentials=MagicMock(), idle_days=0
+            )
+
+    def test_negative_one_raises(self):
+        with pytest.raises(ValueError, match="idle_days must be >= 1"):
+            find_idle_workbench_instances(
+                project_id=_PROJECT, credentials=MagicMock(), idle_days=-1
+            )
+
+    def test_large_negative_raises(self):
+        with pytest.raises(ValueError, match="idle_days must be >= 1"):
+            find_idle_workbench_instances(
+                project_id=_PROJECT, credentials=MagicMock(), idle_days=-999
+            )
+
+    def test_error_message_includes_bad_value(self):
+        with pytest.raises(ValueError, match="-3"):
+            find_idle_workbench_instances(
+                project_id=_PROJECT, credentials=MagicMock(), idle_days=-3
+            )
+
+    def test_one_is_valid(self):
+        assert _invoke(idle_days=1) == []
+
+    def test_default_14_is_valid(self):
+        assert _invoke() == []
+
+    def test_large_value_is_valid(self):
+        assert _invoke(idle_days=365) == []
+
+
+# ---------------------------------------------------------------------------
+# region_filter parameter
+# ---------------------------------------------------------------------------
+
 
-    def test_unknown_machine_uses_default(self):
-        cost = _estimate_cost("custom-unknown-type", "", 0)
-        assert cost == _DEFAULT_MACHINE_MONTHLY_COST
+class TestRegionFilter:
+    def test_region_filter_string_accepted(self):
+        assert _invoke(region_filter="us-central1") == []
 
-    def test_n1_with_t4_adds_gpu_cost(self):
-        base = _MACHINE_MONTHLY_COST["n1-standard-4"]
-        gpu = _GPU_MONTHLY_COST_EACH["NVIDIA_TESLA_T4"]
-        assert _estimate_cost("n1-standard-4", "NVIDIA_TESLA_T4", 1) == base + gpu
+    def test_region_filter_none_accepted(self):
+        assert _invoke(region_filter=None) == []
 
-    def test_n1_with_two_t4_doubles_gpu_cost(self):
-        base = _MACHINE_MONTHLY_COST["n1-standard-4"]
-        gpu = _GPU_MONTHLY_COST_EACH["NVIDIA_TESLA_T4"]
-        assert _estimate_cost("n1-standard-4", "NVIDIA_TESLA_T4", 2) == base + gpu * 2
 
-    def test_a2_machine_no_gpu_addon(self):
-        # a2-highgpu-1g already bundles A100 cost
-        cost = _estimate_cost("a2-highgpu-1g", "NVIDIA_TESLA_A100", 1)
-        assert cost == _MACHINE_MONTHLY_COST["a2-highgpu-1g"]
+# ---------------------------------------------------------------------------
+# HTTP error handling via public API
+# ---------------------------------------------------------------------------
 
-    def test_g2_machine_no_gpu_addon(self):
-        cost = _estimate_cost("g2-standard-8", "NVIDIA_L4", 1)
-        assert cost == _MACHINE_MONTHLY_COST["g2-standard-8"]
 
-    def test_none_machine_type_uses_default(self):
-        cost = _estimate_cost(None, None, 0)
-        assert cost == _DEFAULT_MACHINE_MONTHLY_COST
+class TestHttpErrors:
+    def test_403_raises_permission_error(self):
+        with pytest.raises(PermissionError):
+            _invoke_with_session(_session(_err(403)))
+
+    def test_403_message_mentions_permission(self):
+        with pytest.raises(PermissionError, match="notebooks.instances.list"):
+            _invoke_with_session(_session(_err(403)))
+
+    def test_403_message_mentions_role(self):
+        with pytest.raises(PermissionError, match="roles/notebooks.viewer"):
+            _invoke_with_session(_session(_err(403)))
+
+    def test_404_returns_empty_list(self):
+        assert _invoke_with_session(_session(_err(404))) == []
+
+    def test_404_no_warning_emitted(self):
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always")
+            _invoke_with_session(_session(_err(404)))
+        assert not any(issubclass(w.category, UserWarning) for w in caught)
+
+    def test_400_returns_empty_list(self):
+        assert _invoke_with_session(_session(_err(400))) == []
+
+    def test_400_emits_user_warning(self):
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always")
+            _invoke_with_session(_session(_err(400)))
+        assert any(issubclass(w.category, UserWarning) for w in caught)
+
+    def test_400_warning_mentions_status_code(self):
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always")
+            _invoke_with_session(_session(_err(400)))
+        msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning))
+        assert "400" in msgs
+
+    def test_400_warning_mentions_project(self):
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always")
+            _invoke_with_session(_session(_err(400)))
+        msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning))
+        assert _PROJECT in msgs
+
+    def test_500_returns_empty_list(self):
+        assert _invoke_with_session(_session(_err(500))) == []
+
+    def test_500_emits_user_warning(self):
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always")
+            _invoke_with_session(_session(_err(500)))
+        assert any(issubclass(w.category, UserWarning) for w in caught)
+
+    def test_500_warning_mentions_status_code(self):
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always")
+            _invoke_with_session(_session(_err(500)))
+        msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning))
+        assert "500" in msgs
+
+    def test_503_warning_mentions_status_code(self):
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always")
+            _invoke_with_session(_session(_err(503)))
+        msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning))
+        assert "503" in msgs
+
+    def test_5xx_warning_mentions_project(self):
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always")
+            _invoke_with_session(_session(_err(500)))
+        msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning))
+        assert _PROJECT in msgs
+
+    def test_network_error_returns_empty_list(self):
+        session = MagicMock()
+        session.get.side_effect = ConnectionError("timeout")
+        assert _invoke_with_session(session) == []
+
+    def test_network_error_emits_user_warning(self):
+        session = MagicMock()
+        session.get.side_effect = ConnectionError("timeout")
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always")
+            _invoke_with_session(session)
+        assert any(issubclass(w.category, UserWarning) for w in caught)
+
+    def test_network_error_warning_mentions_project(self):
+        session = MagicMock()
+        session.get.side_effect = OSError("no route to host")
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always")
+            _invoke_with_session(session)
+        msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning))
+        assert _PROJECT in msgs
+
+    def test_network_error_warning_mentions_exception_type(self):
+        session = MagicMock()
+        session.get.side_effect = ConnectionError("dropped")
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always")
+            _invoke_with_session(session)
+        msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning))
+        assert "ConnectionError" in msgs
 
 
 # ---------------------------------------------------------------------------
-# find_idle_workbench_instances tests
+# _list_instances — direct unit tests
 # ---------------------------------------------------------------------------
 
 
-class TestFindIdleWorkbenchInstances:
-    def _run(self, instances: list, **kwargs):
-        with patch(
-            "cleancloud.providers.gcp.rules.ai.workbench_idle._list_instances",
-            return_value=instances,
-        ):
-            with patch("cleancloud.providers.gcp.rules.ai.workbench_idle.datetime") as mock_dt:
-                mock_dt.now.return_value = NOW
-                mock_dt.fromisoformat = datetime.fromisoformat
-                return find_idle_workbench_instances(
-                    project_id=_PROJECT, credentials=MagicMock(), **kwargs
-                )
-
-    def test_idle_cpu_instance_flagged(self):
-        findings = self._run([_v2_instance()])
-        assert len(findings) == 1
-        f = findings[0]
-        assert f.rule_id == "gcp.vertex.workbench.idle"
-        assert f.provider == "gcp"
-        assert f.resource_id == _INSTANCE_NAME
-        assert f.region == _LOCATION
-        assert f.confidence == ConfidenceLevel.HIGH
-        assert f.risk == RiskLevel.MEDIUM
-
-    def test_stopped_instance_skipped(self):
-        findings = self._run([_v2_instance(state="STOPPED")])
-        assert findings == []
-
-    def test_young_instance_skipped(self):
-        # age < max(idle_days // 2, 7) = 7 days
-        findings = self._run([_v2_instance(create_time=_YOUNG_TIME, update_time=_RECENT_TIME)])
-        assert findings == []
-
-    def test_recent_update_time_not_flagged(self):
-        # updateTime only 3 days ago — not idle
-        findings = self._run([_v2_instance(update_time=_RECENT_TIME)])
-        assert findings == []
-
-    def test_gpu_instance_high_risk(self):
-        findings = self._run([_v2_instance(accel_type="NVIDIA_TESLA_T4", accel_count=1)])
-        assert len(findings) == 1
-        assert findings[0].risk == RiskLevel.HIGH
-
-    def test_gpu_instance_critical_risk_when_idle_ratio_ge_2(self):
-        # idle_since_days = 30, idle_days = 14 → ratio = 30/14 ≈ 2.14 >= 2.0
-        very_idle = NOW - timedelta(days=30)
-        findings = self._run(
-            [_v2_instance(update_time=very_idle, accel_type="NVIDIA_TESLA_A100", accel_count=1)]
+class TestListInstancesBasic:
+    def test_empty_response(self):
+        instances, unreachable, failed = _list_instances(_session(_ok()), _PROJECT)
+        assert instances == []
+        assert unreachable == []
+        assert failed is False
+
+    def test_instances_returned(self):
+        inst = {"name": "projects/p/locations/us-central1/instances/i1", "state": "ACTIVE"}
+        instances, _, _ = _list_instances(_session(_ok({"instances": [inst]})), _PROJECT)
+        assert instances == [inst]
+
+    def test_multiple_instances_in_single_page(self):
+        inst_list = [
+            {"name": f"projects/p/locations/us-central1/instances/i{i}", "state": "ACTIVE"}
+            for i in range(3)
+        ]
+        instances, _, _ = _list_instances(_session(_ok({"instances": inst_list})), _PROJECT)
+        assert instances == inst_list
+
+    def test_page_size_100_in_initial_request(self):
+        session = _session(_ok())
+        _list_instances(session, _PROJECT)
+        params = session.get.call_args.kwargs["params"]
+        assert params["pageSize"] == 100
+
+    def test_url_contains_project_id(self):
+        session = _session(_ok())
+        _list_instances(session, "target-project-xyz")
+        url = session.get.call_args.args[0]
+        assert "target-project-xyz" in url
+
+    def test_url_uses_wildcard_location(self):
+        session = _session(_ok())
+        _list_instances(session, _PROJECT)
+        url = session.get.call_args.args[0]
+        assert "locations/-" in url
+
+    def test_url_uses_v2_api(self):
+        session = _session(_ok())
+        _list_instances(session, _PROJECT)
+        url = session.get.call_args.args[0]
+        assert "/v2/" in url
+
+
+class TestListInstancesPagination:
+    def test_two_pages_accumulates_instances(self):
+        inst1 = {"name": "projects/p/locations/us-central1/instances/i1", "state": "ACTIVE"}
+        inst2 = {"name": "projects/p/locations/us-central1/instances/i2", "state": "ACTIVE"}
+        session = _session(
+            _ok({"instances": [inst1], "nextPageToken": "tok1"}),
+            _ok({"instances": [inst2]}),
         )
-        assert len(findings) == 1
-        assert findings[0].risk == RiskLevel.CRITICAL
-
-    def test_medium_confidence_at_75pct_threshold(self):
-        # idle_since_days = 11 days → 11/14 = 0.786 >= 0.75
-        threshold_medium = NOW - timedelta(days=11)
-        findings = self._run([_v2_instance(update_time=threshold_medium)])
-        assert len(findings) == 1
-        assert findings[0].confidence == ConfidenceLevel.MEDIUM
-
-    def test_below_medium_threshold_not_flagged(self):
-        # idle_since_days = 9 → 9/14 = 0.64 < 0.75
-        recent = NOW - timedelta(days=9)
-        findings = self._run([_v2_instance(update_time=recent)])
-        assert findings == []
-
-    def test_age_fallback_capped_at_medium(self):
-        # v2 instance with no updateTime → age-fallback
-        inst = _v2_instance()
-        del inst["updateTime"]
-        inst.pop("updateTime", None)
-        inst["updateTime"] = ""
-        findings = self._run([inst])
-        # age is 30 days → should be flagged; confidence capped at MEDIUM
-        assert len(findings) == 1
-        assert findings[0].confidence == ConfidenceLevel.MEDIUM
-
-    def test_region_filter_excludes_other_regions(self):
-        findings = self._run([_v2_instance()], region_filter="europe-west1")
-        assert findings == []
-
-    def test_region_filter_includes_matching_region(self):
-        findings = self._run([_v2_instance()], region_filter="us-central1")
-        assert len(findings) == 1
-
-    def test_region_filter_case_insensitive(self):
-        findings = self._run([_v2_instance()], region_filter="US-CENTRAL1")
-        assert len(findings) == 1
-
-    def test_cost_estimate_in_finding(self):
-        findings = self._run([_v2_instance(machine_type="n1-standard-4")])
-        assert len(findings) == 1
-        assert findings[0].estimated_monthly_cost_usd == _MACHINE_MONTHLY_COST["n1-standard-4"]
-
-    def test_gpu_cost_includes_addon(self):
-        findings = self._run(
-            [
-                _v2_instance(
-                    machine_type="n1-standard-4",
-                    accel_type="NVIDIA_TESLA_T4",
-                    accel_count=1,
-                )
-            ]
+        instances, _, _ = _list_instances(session, _PROJECT)
+        assert instances == [inst1, inst2]
+
+    def test_three_pages_all_accumulated(self):
+        def _inst(i):
+            return {"name": f"projects/p/locations/us-central1/instances/i{i}", "state": "ACTIVE"}
+        session = _session(
+            _ok({"instances": [_inst(1)], "nextPageToken": "t1"}),
+            _ok({"instances": [_inst(2)], "nextPageToken": "t2"}),
+            _ok({"instances": [_inst(3)]}),
         )
-        expected = (
-            _MACHINE_MONTHLY_COST["n1-standard-4"] + _GPU_MONTHLY_COST_EACH["NVIDIA_TESLA_T4"]
+        instances, _, _ = _list_instances(session, _PROJECT)
+        assert len(instances) == 3
+
+    def test_page_token_forwarded_on_second_request(self):
+        session = _session(
+            _ok({"nextPageToken": "tok-abc"}),
+            _ok({}),
+        )
+        _list_instances(session, _PROJECT)
+        second_params = session.get.call_args_list[1].kwargs["params"]
+        assert second_params.get("pageToken") == "tok-abc"
+
+    def test_page_token_forwarded_on_third_request(self):
+        session = _session(
+            _ok({"nextPageToken": "t1"}),
+            _ok({"nextPageToken": "t2"}),
+            _ok({}),
+        )
+        _list_instances(session, _PROJECT)
+        third_params = session.get.call_args_list[2].kwargs["params"]
+        assert third_params.get("pageToken") == "t2"
+
+    def test_stops_when_no_next_token(self):
+        session = _session(_ok({}))
+        _list_instances(session, _PROJECT)
+        assert session.get.call_count == 1
+
+    def test_exactly_two_calls_for_two_pages(self):
+        session = _session(
+            _ok({"nextPageToken": "t1"}),
+            _ok({}),
         )
-        assert findings[0].estimated_monthly_cost_usd == expected
-
-    def test_multiple_instances(self):
-        inst1 = _v2_instance(name=f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-1")
-        inst2 = _v2_instance(name=f"projects/{_PROJECT}/locations/{_LOCATION}/instances/wb-2")
-        findings = self._run([inst1, inst2])
-        assert len(findings) == 2
-
-    def test_empty_project_returns_no_findings(self):
-        findings = self._run([])
-        assert findings == []
-
-    def test_custom_idle_days(self):
-        # With idle_days=7, an instance 8 days since updateTime should be flagged
-        eight_days_ago = NOW - timedelta(days=8)
-        # But age must also be >= threshold_medium (75% of 7 = 5.25 days → 5 days)
-        findings = self._run(
-            [_v2_instance(update_time=eight_days_ago)],
-            idle_days=7,
+        _list_instances(session, _PROJECT)
+        assert session.get.call_count == 2
+
+
+class TestListInstancesUnreachable:
+    def test_single_unreachable_location_collected(self):
+        session = _session(_ok({"unreachable": ["asia-east1"]}))
+        _, unreachable, _ = _list_instances(session, _PROJECT)
+        assert "asia-east1" in unreachable
+
+    def test_multiple_unreachable_locations(self):
+        session = _session(_ok({"unreachable": ["asia-east1", "europe-west3"]}))
+        _, unreachable, _ = _list_instances(session, _PROJECT)
+        assert "asia-east1" in unreachable
+        assert "europe-west3" in unreachable
+
+    def test_unreachable_deduplicated_across_pages(self):
+        session = _session(
+            _ok({"unreachable": ["asia-east1"], "nextPageToken": "t1"}),
+            _ok({"unreachable": ["asia-east1"]}),
         )
-        assert len(findings) == 1
-        assert findings[0].confidence == ConfidenceLevel.HIGH
+        _, unreachable, _ = _list_instances(session, _PROJECT)
+        assert unreachable.count("asia-east1") == 1
+
+    def test_empty_string_in_unreachable_skipped(self):
+        session = _session(_ok({"unreachable": ["", "us-east1"]}))
+        _, unreachable, _ = _list_instances(session, _PROJECT)
+        assert "" not in unreachable
+        assert "us-east1" in unreachable
+
+    def test_no_unreachable_when_field_absent(self):
+        session = _session(_ok({}))
+        _, unreachable, _ = _list_instances(session, _PROJECT)
+        assert unreachable == []
+
+    def test_unreachable_from_multiple_pages_merged(self):
+        session = _session(
+            _ok({"unreachable": ["asia-east1"], "nextPageToken": "t1"}),
+            _ok({"unreachable": ["europe-west3"]}),
+        )
+        _, unreachable, _ = _list_instances(session, _PROJECT)
+        assert "asia-east1" in unreachable
+        assert "europe-west3" in unreachable
 
-    def test_rule_metadata_and_rule_id(self):
-        assert RULE_METADATA["id"] == "gcp.vertex.workbench.idle"
-        assert RULE_METADATA["category"] == "ai"
-        assert find_idle_workbench_instances.RULE_ID == "gcp.vertex.workbench.idle"
 
-    def test_age_fallback_signal_says_age_not_updatetime(self):
-        inst = _v2_instance()
-        inst["updateTime"] = ""
-        findings = self._run([inst])
-        assert len(findings) == 1
-        signals = findings[0].evidence.signals_used
-        activity_signal = next(s for s in signals if "control-plane activity" in s)
-        assert "age (fallback)" in activity_signal
-        assert "updateTime" not in activity_signal
-
-    def test_normal_signal_credits_updatetime(self):
-        findings = self._run([_v2_instance()])
-        signals = findings[0].evidence.signals_used
-        activity_signal = next(s for s in signals if "control-plane activity" in s)
-        assert "updateTime" in activity_signal
-
-    def test_tpu_instance_labelled_tpu_not_gpu(self):
-        findings = self._run([_v2_instance(accel_type="TPU_V2", accel_count=1)])
-        assert len(findings) == 1
-        f = findings[0]
-        assert "TPU" in f.title
-        assert "GPU" not in f.title
-        assert any("TPU-backed" in s for s in f.evidence.signals_used)
-
-    def test_tpu_cost_includes_tpu_addon(self):
-        findings = self._run([_v2_instance(accel_type="TPU_V2", accel_count=1)])
-        assert len(findings) == 1
-        expected = _MACHINE_MONTHLY_COST["n1-standard-4"] + _GPU_MONTHLY_COST_EACH["TPU_V2"]
-        assert findings[0].estimated_monthly_cost_usd == expected
-
-    def test_500_from_v2_does_not_abort_scan(self):
-        """A transient 500 from the v2 API should return empty results, not raise."""
-        mock_session = MagicMock()
-        resp_500 = MagicMock()
-        resp_500.status_code = 500
-        mock_session.get.return_value = resp_500
-
-        with patch(
-            "cleancloud.providers.gcp.rules.ai.workbench_idle.AuthorizedSession",
-            return_value=mock_session,
-        ):
-            findings = find_idle_workbench_instances(project_id=_PROJECT, credentials=MagicMock())
-        assert findings == []
+class TestListInstancesErrors:
+    def test_403_raises_permission_error(self):
+        with pytest.raises(PermissionError, match="notebooks.instances.list"):
+            _list_instances(_session(_err(403)), _PROJECT)
+
+    def test_404_returns_empty_clean(self):
+        instances, unreachable, failed = _list_instances(_session(_err(404)), _PROJECT)
+        assert instances == []
+        assert unreachable == []
+        assert failed is False
+
+    def test_400_returns_empty_with_discovery_failed(self):
+        instances, unreachable, failed = _list_instances(_session(_err(400)), _PROJECT)
+        assert instances == []
+        assert unreachable == []
+        assert failed is True
+
+    def test_500_sets_discovery_failed(self):
+        _, _, failed = _list_instances(_session(_err(500)), _PROJECT)
+        assert failed is True
+
+    def test_503_sets_discovery_failed(self):
+        _, _, failed = _list_instances(_session(_err(503)), _PROJECT)
+        assert failed is True
+
+    def test_5xx_preserves_instances_from_earlier_pages(self):
+        """Instances already fetched before a 5xx error must be returned."""
+        inst = {"name": "projects/p/locations/us-central1/instances/i1", "state": "ACTIVE"}
+        session = _session(
+            _ok({"instances": [inst], "nextPageToken": "t1"}),
+            _err(503),
+        )
+        instances, _, failed = _list_instances(session, _PROJECT)
+        assert instances == [inst]
+        assert failed is True
+
+    def test_network_error_sets_discovery_failed(self):
+        session = MagicMock()
+        session.get.side_effect = ConnectionError("timeout")
+        _, _, failed = _list_instances(session, _PROJECT)
+        assert failed is True
+
+    def test_network_error_preserves_earlier_instances(self):
+        inst = {"name": "projects/p/locations/us-central1/instances/i1", "state": "ACTIVE"}
+        session = _session(
+            _ok({"instances": [inst], "nextPageToken": "t1"}),
+        )
+        session.get.side_effect = [
+            _ok({"instances": [inst], "nextPageToken": "t1"}),
+            ConnectionError("dropped"),
+        ]
+        instances, _, failed = _list_instances(session, _PROJECT)
+        assert instances == [inst]
+        assert failed is True
+
+    def test_400_emits_warning_with_project(self):
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always")
+            _list_instances(_session(_err(400)), _PROJECT)
+        msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning))
+        assert _PROJECT in msgs
+
+    def test_500_emits_warning_with_status_code(self):
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always")
+            _list_instances(_session(_err(500)), _PROJECT)
+        msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning))
+        assert "500" in msgs
+
+    def test_network_error_emits_warning_with_project(self):
+        session = MagicMock()
+        session.get.side_effect = OSError("no route to host")
+        with warnings.catch_warnings(record=True) as caught:
+            warnings.simplefilter("always")
+            _list_instances(session, _PROJECT)
+        msgs = " ".join(str(w.message) for w in caught if issubclass(w.category, UserWarning))
+        assert _PROJECT in msgs
 
 
 # ---------------------------------------------------------------------------
-# _list_instances permission error propagation
+# Rule metadata
 # ---------------------------------------------------------------------------
 
 
-class TestListInstancesPermissionError:
-    def test_403_raises_permission_error(self):
-        mock_session = MagicMock()
-        response = MagicMock()
-        response.status_code = 403
-        mock_session.get.return_value = response
-
-        with patch(
-            "cleancloud.providers.gcp.rules.ai.workbench_idle.AuthorizedSession",
-            return_value=mock_session,
-        ):
-            with pytest.raises(PermissionError, match="notebooks.instances.list"):
-                find_idle_workbench_instances(project_id=_PROJECT, credentials=MagicMock())
-
-    def test_404_returns_empty(self):
-        mock_session = MagicMock()
-        response = MagicMock()
-        response.status_code = 404
-        mock_session.get.return_value = response
-
-        with patch(
-            "cleancloud.providers.gcp.rules.ai.workbench_idle.AuthorizedSession",
-            return_value=mock_session,
-        ):
-            findings = find_idle_workbench_instances(project_id=_PROJECT, credentials=MagicMock())
-        assert findings == []
+class TestRuleMetadata:
+    def test_rule_id(self):
+        assert RULE_METADATA["id"] == "gcp.vertex.workbench.idle"
+
+    def test_category(self):
+        assert RULE_METADATA["category"] == "ai"
+
+    def test_service(self):
+        assert RULE_METADATA["service"] == "notebooks"
+
+    def test_cost_impact(self):
+        assert RULE_METADATA["cost_impact"] == "high"
+
+    def test_rule_id_attribute_on_function(self):
+        assert find_idle_workbench_instances.RULE_ID == "gcp.vertex.workbench.idle"

From 8a84e3e21c01a664d5c3ea0ce5cdc88970674ba2 Mon Sep 17 00:00:00 2001
From: javvaji-devops <venkata.javvaji.91@gmail.com>
Date: Wed, 6 May 2026 14:57:59 +0100
Subject: [PATCH 3/4] Apply formatter and lint fixes (no behavior change)

---
 .../providers/gcp/rules/ai/workbench_idle.py  |  9 ++------
 .../azure/test_azure_app_service_idle.py      |  2 +-
 .../gcp/ai/test_gcp_workbench_idle.py         | 23 ++++++++++---------
 3 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/cleancloud/providers/gcp/rules/ai/workbench_idle.py b/cleancloud/providers/gcp/rules/ai/workbench_idle.py
index 91f4f52..e906198 100644
--- a/cleancloud/providers/gcp/rules/ai/workbench_idle.py
+++ b/cleancloud/providers/gcp/rules/ai/workbench_idle.py
@@ -55,9 +55,7 @@
 # Exact documented resource-name pattern (spec 3.1, 7):
 #   projects/{projectId}/locations/{location}/instances/{instanceId}
 # All four non-empty path segments must be present.
-_INSTANCE_NAME_RE = re.compile(
-    r"^projects/[^/]+/locations/[^/]+/instances/[^/]+$"
-)
+_INSTANCE_NAME_RE = re.compile(r"^projects/[^/]+/locations/[^/]+/instances/[^/]+$")
 
 
 def find_idle_workbench_instances(
@@ -117,10 +115,7 @@ def _list_instances(
     results: list = []
     unreachable: list = []
     discovery_failed = False
-    url = (
-        f"https://notebooks.googleapis.com/v2"
-        f"/projects/{project_id}/locations/-/instances"
-    )
+    url = f"https://notebooks.googleapis.com/v2" f"/projects/{project_id}/locations/-/instances"
     params: dict = {"pageSize": 100}
 
     while True:
diff --git a/tests/cleancloud/providers/azure/test_azure_app_service_idle.py b/tests/cleancloud/providers/azure/test_azure_app_service_idle.py
index 6910840..2c49a24 100644
--- a/tests/cleancloud/providers/azure/test_azure_app_service_idle.py
+++ b/tests/cleancloud/providers/azure/test_azure_app_service_idle.py
@@ -792,7 +792,7 @@ def _failing_iter(*args, **kwargs):
             # Yield nothing, then raise — simulates a pager that fails before
             # returning any results (first page network error, etc.)
             raise Exception("pager failed mid-iteration")
-            yield  # noqa: unreachable — makes this a generator
+            yield  # never reached (raise above); its presence makes this function a generator
 
         web.web_apps.list_web_jobs.side_effect = _failing_iter
         findings = find_idle_app_services(
diff --git a/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py b/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py
index fd01236..e7d9ffc 100644
--- a/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py
+++ b/tests/cleancloud/providers/gcp/ai/test_gcp_workbench_idle.py
@@ -80,9 +80,7 @@ def _invoke(**kwargs):
         "cleancloud.providers.gcp.rules.ai.workbench_idle.AuthorizedSession",
         return_value=_session(_ok()),
     ):
-        return find_idle_workbench_instances(
-            project_id=_PROJECT, credentials=MagicMock(), **kwargs
-        )
+        return find_idle_workbench_instances(project_id=_PROJECT, credentials=MagicMock(), **kwargs)
 
 
 def _invoke_with_session(mock_session, **kwargs):
@@ -91,9 +89,7 @@ def _invoke_with_session(mock_session, **kwargs):
         "cleancloud.providers.gcp.rules.ai.workbench_idle.AuthorizedSession",
         return_value=mock_session,
     ):
-        return find_idle_workbench_instances(
-            project_id=_PROJECT, credentials=MagicMock(), **kwargs
-        )
+        return find_idle_workbench_instances(project_id=_PROJECT, credentials=MagicMock(), **kwargs)
 
 
 # ---------------------------------------------------------------------------
@@ -110,13 +106,19 @@ def test_always_empty(self):
 
     def test_empty_when_api_returns_active_instances(self):
         """EMITTING_DISABLED: ACTIVE instances in API response still yield no findings."""
-        inst = {"name": f"projects/{_PROJECT}/locations/us-central1/instances/wb-1", "state": "ACTIVE"}
+        inst = {
+            "name": f"projects/{_PROJECT}/locations/us-central1/instances/wb-1",
+            "state": "ACTIVE",
+        }
         result = _invoke_with_session(_session(_ok({"instances": [inst]})))
         assert result == []
 
     def test_empty_when_api_returns_multiple_instances(self):
         instances = [
-            {"name": f"projects/{_PROJECT}/locations/us-central1/instances/wb-{i}", "state": "ACTIVE"}
+            {
+                "name": f"projects/{_PROJECT}/locations/us-central1/instances/wb-{i}",
+                "state": "ACTIVE",
+            }
             for i in range(5)
         ]
         result = _invoke_with_session(_session(_ok({"instances": instances})))
@@ -131,9 +133,7 @@ def test_empty_when_api_returns_multiple_instances(self):
 class TestIdleDaysValidation:
     def test_zero_raises_value_error(self):
         with pytest.raises(ValueError, match="idle_days must be >= 1"):
-            find_idle_workbench_instances(
-                project_id=_PROJECT, credentials=MagicMock(), idle_days=0
-            )
+            find_idle_workbench_instances(project_id=_PROJECT, credentials=MagicMock(), idle_days=0)
 
     def test_negative_one_raises(self):
         with pytest.raises(ValueError, match="idle_days must be >= 1"):
@@ -352,6 +352,7 @@ def test_two_pages_accumulates_instances(self):
     def test_three_pages_all_accumulated(self):
         def _inst(i):
             return {"name": f"projects/p/locations/us-central1/instances/i{i}", "state": "ACTIVE"}
+
         session = _session(
             _ok({"instances": [_inst(1)], "nextPageToken": "t1"}),
             _ok({"instances": [_inst(2)], "nextPageToken": "t2"}),

From 2376379f4c9a9828241dbd82063fd150c4d1bb62 Mon Sep 17 00:00:00 2001
From: javvaji-devops <venkata.javvaji.91@gmail.com>
Date: Wed, 6 May 2026 15:15:21 +0100
Subject: [PATCH 4/4] Bump version from 1.29.0 to 1.30.0

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index b50a9dc..7e2a2bf 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "cleancloud"
-version = "1.29.0"
+version = "1.30.0"
 description = "Read-only cloud hygiene for AWS, Azure, and GCP. Multi-account org scanning, CI/CD enforcement, and deterministic cost modeling. No agents, no telemetry."
 readme = "README.md"
 requires-python = ">=3.10"