From d30d98bbef23df15d9c64093f5a79c61ef814848 Mon Sep 17 00:00:00 2001
From: Tofik Hasanov <annexcies@gmail.com>
Date: Thu, 4 Jun 2026 17:54:44 -0400
Subject: [PATCH 1/3] fix(cloud-security): support AWS Config new recording
 model in checks + auto-remediation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Customer-reported: AWS auto-remediation for "AWS Config recorder not fully
active" generated empty fix plans, the Retry button did nothing, and applying a
generated fix failed and fell back to outdated manual steps. Root cause: the
entire Config check + auto-fix assumed the legacy `recordingGroup.allSupported`
model, but the customer's recorder uses AWS's current model ("Record all
resource types with customizable overrides" = recordingStrategy /
exclusionByResourceTypes).

- config.adapter: the recorder check now treats
  recordingStrategy.useOnly === 'ALL_SUPPORTED_RESOURCE_TYPES' (and legacy
  allSupported) as "records all", eliminating false positives on the new
  model. Genuine EXCLUSION/INCLUSION recorders stay flagged.
- config.adapter: remediation guidance now produces an AWS-valid call —
  read the existing recorder, then PutConfigurationRecorder with a clean
  recordingGroup { allSupported: true, includeGlobalResourceTypes: true } and
  NO recordingStrategy/exclusionByResourceTypes/resourceTypes (those are
  mutually exclusive with allSupported and trigger a ValidationException). This
  also records the global IAM resource types the customer was missing.
- aws-command-executor: deterministic guardrail (normalizeConfigRecordingGroup)
  collapses any all-supported-intent PutConfigurationRecorder to the single
  valid shape right before the SDK call, regardless of what the AI emits.
- remediation.service: never cache an empty / non-auto-fixable plan and drop
  the stale entry on execute — this is what made "Retry" a guaranteed no-op
  (it reloaded the same dead plan). Retry now regenerates.
- ai-remediation.service: generateFixPlan retries once at a higher temperature
  when the first pass reports canAutoFix=true but yields zero fix steps (temp 0
  would reproduce it).
- prompts: discourage S3 ACL steps (cause of empty plans), reinforce the valid
  Config recorder call, and base manual steps on the current AWS Console.
- RemediationDialog: disable "Apply Fix" on an empty plan and explain why.

Tests: new config.adapter.spec, recordingGroup-normalizer and retry-on-empty
cases; full cloud-security jest suite green (288 passing).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../cloud-security/ai-remediation.prompt.ts   |   9 +
 .../ai-remediation.service.spec.ts            |  97 +++++++++-
 .../cloud-security/ai-remediation.service.ts  |  52 ++++--
 .../aws-command-executor.spec.ts              |  99 +++++++++++
 .../cloud-security/aws-command-executor.ts    |  40 +++++
 .../providers/aws/config.adapter.spec.ts      | 166 ++++++++++++++++++
 .../providers/aws/config.adapter.ts           |  32 +++-
 .../remediation.service.spec.ts               |  31 ++++
 .../src/cloud-security/remediation.service.ts |  68 ++++---
 .../components/RemediationDialog.tsx          |  15 +-
 10 files changed, 569 insertions(+), 40 deletions(-)
 create mode 100644 apps/api/src/cloud-security/providers/aws/config.adapter.spec.ts

diff --git a/apps/api/src/cloud-security/ai-remediation.prompt.ts b/apps/api/src/cloud-security/ai-remediation.prompt.ts
index f179d32ef1..810a8425db 100644
--- a/apps/api/src/cloud-security/ai-remediation.prompt.ts
+++ b/apps/api/src/cloud-security/ai-remediation.prompt.ts
@@ -162,6 +162,15 @@ A human will ALWAYS review your plan before execution. Be precise and correct.
 - ALWAYS make changes reversible when possible
 - For service-linked roles: create them as a setup step using IAM CreateServiceLinkedRoleCommand
 
+## S3 PUBLIC ACCESS AND ACLs (IMPORTANT)
+- NEVER use PutBucketAclCommand or bucket/object ACLs. Modern buckets use Object Ownership = BucketOwnerEnforced, which disables ACLs — the call fails, and the executor strips ACL steps, which can leave an EMPTY plan.
+- To block public access on a bucket: use s3:PutPublicAccessBlockCommand with PublicAccessBlockConfiguration set to { BlockPublicAcls: true, IgnorePublicAcls: true, BlockPublicPolicy: true, RestrictPublicBuckets: true }.
+- To remediate a public bucket POLICY: read it first with GetBucketPolicy, then use s3:PutBucketPolicyCommand with a corrected least-privilege policy. Never rely on ACLs to fix public access.
+
+## AWS CONFIG RECORDER (IMPORTANT)
+- To make a recorder record ALL supported resource types, first read the existing recorder with config-service:DescribeConfigurationRecordersCommand (readSteps) to get its exact name and roleARN, then call config-service:PutConfigurationRecorderCommand with ConfigurationRecorder = { name, roleARN, recordingGroup: { allSupported: true, includeGlobalResourceTypes: true } }.
+- NEVER set allSupported:true together with recordingStrategy, exclusionByResourceTypes, or resourceTypes — they are mutually exclusive and AWS rejects the request with a ValidationException. Omit those fields entirely (this also overwrites an existing exclusion-based strategy so global IAM resources are recorded).
+
 ## IDEMPOTENCY (CRITICAL)
 - All fix steps MUST be safe to run even if the resource already exists
 - For Create operations: our executor automatically handles "already exists" errors — they are treated as success, not failure
diff --git a/apps/api/src/cloud-security/ai-remediation.service.spec.ts b/apps/api/src/cloud-security/ai-remediation.service.spec.ts
index 22bb90eb62..d40c92eb85 100644
--- a/apps/api/src/cloud-security/ai-remediation.service.spec.ts
+++ b/apps/api/src/cloud-security/ai-remediation.service.spec.ts
@@ -142,9 +142,12 @@ describe('AiRemediationService.generateFixPlan empty-state backstop', () => {
 
   it('leaves the plan untouched when AI returns {}/{} but the plan has no actionable steps', async () => {
     // Verify-only plans (only readSteps) should still be left alone —
-    // we never fabricate state when there's nothing to act on.
+    // we never fabricate state when there's nothing to act on. A plan with no
+    // fix steps is not auto-fixable, so canAutoFix is false (which also means
+    // the empty-plan retry does not apply to it).
     generateObjectMock.mockResolvedValueOnce({
       object: basePlan({
+        canAutoFix: false,
         readSteps: [
           { service: 's3', command: 'GetBucketVersioningCommand', params: {}, purpose: 'check' },
         ],
@@ -172,6 +175,17 @@ describe('AiRemediationService.generateFixPlan empty-state backstop', () => {
       object: basePlan({
         currentState: { versioning: 'Disabled' },
         proposedState: { versioning: 'Enabled' },
+        fixSteps: [
+          {
+            service: 's3',
+            command: 'PutBucketVersioningCommand',
+            params: {
+              Bucket: 'logs-archive',
+              VersioningConfiguration: { Status: 'Enabled' },
+            },
+            purpose: 'enable versioning',
+          },
+        ],
       }),
     });
 
@@ -226,8 +240,11 @@ describe('AiRemediationService.generateFixPlan empty-state backstop', () => {
   });
 
   it('leaves a plan alone when only one side is empty (legitimate verify-only case)', async () => {
+    // Verify-only: no fix steps, so the plan is not auto-fixable (canAutoFix
+    // false) and the empty-plan retry does not apply.
     generateObjectMock.mockResolvedValueOnce({
       object: basePlan({
+        canAutoFix: false,
         currentState: { someField: 'X' },
         proposedState: {},
       }),
@@ -508,3 +525,81 @@ describe('AiRemediationService.generateManualSteps', () => {
     expect(callArgs.prompt).toContain('account-level');
   });
 });
+
+describe('AiRemediationService.generateFixPlan empty-plan retry', () => {
+  const generateObjectMock = generateObject as unknown as jest.Mock;
+
+  beforeEach(() => {
+    generateObjectMock.mockReset();
+  });
+
+  it('retries once when the first plan has canAutoFix=true but zero fixSteps, and uses the non-empty retry', async () => {
+    // First pass: empty fix plan (the "AI generated an empty fix plan" case).
+    generateObjectMock.mockResolvedValueOnce({
+      object: basePlan({ canAutoFix: true, fixSteps: [] }),
+    });
+    // Second pass (higher temperature): a real plan.
+    generateObjectMock.mockResolvedValueOnce({
+      object: basePlan({
+        canAutoFix: true,
+        fixSteps: [
+          {
+            service: 'config-service',
+            command: 'PutConfigurationRecorderCommand',
+            params: { ConfigurationRecorder: { name: 'default' } },
+            purpose: 'Record all resources',
+          },
+        ],
+      }),
+    });
+
+    const service = new AiRemediationService();
+    const plan = await service.generateFixPlan({
+      title: 'AWS Config recorder not fully active',
+      description: null,
+      severity: 'high',
+      resourceType: 'AwsConfigRecorder',
+      resourceId: 'default',
+      remediation: null,
+      findingKey: 'config-recorder-incomplete',
+      evidence: {},
+    });
+
+    expect(generateObjectMock).toHaveBeenCalledTimes(2);
+    // The retry runs at a non-zero temperature so it is a genuinely different sample.
+    expect(generateObjectMock.mock.calls[0][0].temperature).toBe(0);
+    expect(generateObjectMock.mock.calls[1][0].temperature).toBeGreaterThan(0);
+    expect(plan.fixSteps).toHaveLength(1);
+    expect(plan.fixSteps[0].command).toBe('PutConfigurationRecorderCommand');
+  });
+
+  it('does not retry when the first plan already has fix steps', async () => {
+    generateObjectMock.mockResolvedValueOnce({
+      object: basePlan({
+        canAutoFix: true,
+        fixSteps: [
+          {
+            service: 'iam',
+            command: 'UpdateAccountPasswordPolicyCommand',
+            params: {},
+            purpose: 'fix',
+          },
+        ],
+      }),
+    });
+
+    const service = new AiRemediationService();
+    await service.generateFixPlan({
+      title: 'Weak password policy',
+      description: null,
+      severity: null,
+      resourceType: 'AwsIamPolicy',
+      resourceId: 'account-level',
+      remediation: null,
+      findingKey: 'iam-weak-password',
+      evidence: {},
+    });
+
+    expect(generateObjectMock).toHaveBeenCalledTimes(1);
+  });
+});
diff --git a/apps/api/src/cloud-security/ai-remediation.service.ts b/apps/api/src/cloud-security/ai-remediation.service.ts
index 0bafbbdb7d..ba09a5b95f 100644
--- a/apps/api/src/cloud-security/ai-remediation.service.ts
+++ b/apps/api/src/cloud-security/ai-remediation.service.ts
@@ -53,20 +53,23 @@ export class AiRemediationService {
   /** Phase 1: Generate initial plan (read steps + preliminary fix plan). */
   async generateFixPlan(finding: FindingContext): Promise<FixPlan> {
     try {
-      const { object } = await generateObject({
-        model: MODEL,
-        schema: fixPlanSchema,
-        system: SYSTEM_PROMPT,
-        prompt: buildFixPlanPrompt(finding),
-        temperature: 0,
-      });
+      let plan = await this.requestFixPlan(finding, 0);
+
+      // The model occasionally returns canAutoFix=true with zero fixSteps, or
+      // the normalizer strips every step (e.g. unsupported S3 ACL calls). That
+      // surfaces to the user as "AI generated an empty fix plan. Cannot
+      // proceed." and — combined with plan caching — a Retry that does
+      // nothing. Generation is non-deterministic, so retry ONCE at a higher
+      // temperature to force a genuinely different sample before giving up.
+      if (plan.canAutoFix && plan.fixSteps.length === 0) {
+        this.logger.warn(
+          `Empty fix plan for ${finding.findingKey}; regenerating once at higher temperature`,
+        );
+        const retry = await this.requestFixPlan(finding, 0.5);
+        if (retry.fixSteps.length > 0) plan = retry;
+      }
 
-      this.logger.log(
-        `AI plan for ${finding.findingKey}: canAutoFix=${object.canAutoFix}, risk=${object.risk}`,
-      );
-      return normalizeFixPlan(enrichEmptyState(object), {
-        resourceId: finding.resourceId,
-      });
+      return plan;
     } catch (err) {
       this.logger.error(
         `AI plan failed: ${err instanceof Error ? err.message : String(err)}`,
@@ -75,6 +78,27 @@ export class AiRemediationService {
     }
   }
 
+  /** One model pass at `temperature`; output is enriched, then normalized. */
+  private async requestFixPlan(
+    finding: FindingContext,
+    temperature: number,
+  ): Promise<FixPlan> {
+    const prompt = buildFixPlanPrompt(finding);
+    const response = await generateObject({
+      model: MODEL,
+      schema: fixPlanSchema,
+      system: SYSTEM_PROMPT,
+      prompt,
+      temperature,
+    });
+    const draft = response.object;
+    this.logger.log(
+      `AI plan for ${finding.findingKey}: canAutoFix=${draft.canAutoFix}, risk=${draft.risk}`,
+    );
+    const enriched = enrichEmptyState(draft);
+    return normalizeFixPlan(enriched, { resourceId: finding.resourceId });
+  }
+
   /**
    * Phase 2: Refine fix steps using REAL data from AWS.
    * Called after read steps executed successfully.
@@ -394,7 +418,7 @@ INSTRUCTIONS:
             ),
         }),
         system:
-          'You are an AWS security expert writing manual remediation steps for a customer whose automatic fix failed. Be concrete: name exact services, exact resources, and exact actions. Prefer AWS Console clicks over CLI when the path is short, but include CLI commands when they are clearer. Never reference SDK class names. Never apologize. Never speculate about "if the issue persists" — just give the steps.',
+          'You are an AWS security expert writing manual remediation steps for a customer whose automatic fix failed. Be concrete: name exact services, exact resources, and exact actions. Prefer AWS Console clicks over CLI when the path is short, but include CLI commands when they are clearer. Never reference SDK class names. Never apologize. Never speculate about "if the issue persists" — just give the steps. Base every instruction on the CURRENT AWS Console; do NOT describe deprecated layouts or removed menu options. For AWS Config recorder settings specifically: the current console is Config → Settings, which shows a "Recording method"/"Recording strategy" under a customer managed recorder — to record everything, click Edit, choose to record all resource types, enable "Include global resource types (IAM resources)", and remove any per-resource-type overrides or exclusions. Do not instruct the user to select an old "Record all resource types supported in this region" radio option if it is not present in the current console.',
         prompt: `A finding could not be auto-remediated. Generate clear manual steps the customer can follow.
 
 FINDING:
diff --git a/apps/api/src/cloud-security/aws-command-executor.spec.ts b/apps/api/src/cloud-security/aws-command-executor.spec.ts
index 4a88777172..94748773a5 100644
--- a/apps/api/src/cloud-security/aws-command-executor.spec.ts
+++ b/apps/api/src/cloud-security/aws-command-executor.spec.ts
@@ -2,6 +2,7 @@ import type { AwsCommandStep } from './ai-remediation.prompt';
 import {
   REQUIRED_PARAMS,
   looksLikeValidationError,
+  normalizeConfigRecordingGroup,
   validatePlanSteps,
 } from './aws-command-executor';
 
@@ -302,3 +303,101 @@ describe('validatePlanSteps — pre-existing behavior preserved', () => {
     ).toBeDefined();
   });
 });
+
+/**
+ * AWS Config recorder: `allSupported:true` is mutually exclusive with
+ * `recordingStrategy` / `exclusionByResourceTypes` / `resourceTypes`. The AI
+ * (and a customer's existing exclusion-based recorder) frequently echoes those
+ * fields back alongside allSupported:true, which AWS rejects with a
+ * ValidationException. normalizeConfigRecordingGroup collapses the group to the
+ * single valid "record everything (incl. global IAM)" shape.
+ */
+describe('normalizeConfigRecordingGroup', () => {
+  it('strips conflicting fields when an exclusion-based group is converted to all-supported', () => {
+    const input: Record<string, unknown> = {
+      ConfigurationRecorder: {
+        name: 'default',
+        roleARN: 'arn:aws:iam::123:role/aws-service-role/config',
+        recordingGroup: {
+          allSupported: true,
+          recordingStrategy: { useOnly: 'EXCLUSION_BY_RESOURCE_TYPES' },
+          exclusionByResourceTypes: {
+            resourceTypes: ['AWS::IAM::User', 'AWS::IAM::Role'],
+          },
+        },
+      },
+    };
+    normalizeConfigRecordingGroup(input);
+    const recorder = input.ConfigurationRecorder as Record<string, unknown>;
+    expect(recorder.recordingGroup).toEqual({
+      allSupported: true,
+      includeGlobalResourceTypes: true,
+    });
+    // name + roleARN are preserved untouched.
+    expect(recorder.name).toBe('default');
+    expect(recorder.roleARN).toBe(
+      'arn:aws:iam::123:role/aws-service-role/config',
+    );
+  });
+
+  it('converts a pure exclusion strategy (allSupported absent) to all-supported', () => {
+    const input: Record<string, unknown> = {
+      ConfigurationRecorder: {
+        name: 'default',
+        recordingGroup: {
+          recordingStrategy: { useOnly: 'EXCLUSION_BY_RESOURCE_TYPES' },
+          exclusionByResourceTypes: { resourceTypes: ['AWS::IAM::Role'] },
+        },
+      },
+    };
+    normalizeConfigRecordingGroup(input);
+    const recorder = input.ConfigurationRecorder as Record<string, unknown>;
+    expect(recorder.recordingGroup).toEqual({
+      allSupported: true,
+      includeGlobalResourceTypes: true,
+    });
+  });
+
+  it('cleans an ALL_SUPPORTED_RESOURCE_TYPES strategy to the minimal valid shape', () => {
+    const input: Record<string, unknown> = {
+      ConfigurationRecorder: {
+        name: 'default',
+        recordingGroup: {
+          allSupported: true,
+          recordingStrategy: { useOnly: 'ALL_SUPPORTED_RESOURCE_TYPES' },
+        },
+      },
+    };
+    normalizeConfigRecordingGroup(input);
+    expect(
+      (input.ConfigurationRecorder as Record<string, unknown>).recordingGroup,
+    ).toEqual({ allSupported: true, includeGlobalResourceTypes: true });
+  });
+
+  it('leaves an INCLUSION_BY_RESOURCE_TYPES recorder untouched (records only specific types)', () => {
+    const recordingGroup = {
+      allSupported: false,
+      recordingStrategy: { useOnly: 'INCLUSION_BY_RESOURCE_TYPES' },
+      resourceTypes: ['AWS::S3::Bucket'],
+    };
+    const input: Record<string, unknown> = {
+      ConfigurationRecorder: { name: 'default', recordingGroup },
+    };
+    normalizeConfigRecordingGroup(input);
+    expect(
+      (input.ConfigurationRecorder as Record<string, unknown>).recordingGroup,
+    ).toEqual(recordingGroup);
+  });
+
+  it('is a no-op when there is no ConfigurationRecorder/recordingGroup', () => {
+    const input: Record<string, unknown> = {};
+    expect(() => normalizeConfigRecordingGroup(input)).not.toThrow();
+    expect(input).toEqual({});
+
+    const input2: Record<string, unknown> = {
+      ConfigurationRecorder: { name: 'default' },
+    };
+    normalizeConfigRecordingGroup(input2);
+    expect(input2).toEqual({ ConfigurationRecorder: { name: 'default' } });
+  });
+});
diff --git a/apps/api/src/cloud-security/aws-command-executor.ts b/apps/api/src/cloud-security/aws-command-executor.ts
index 26a913a4c5..5893beee6a 100644
--- a/apps/api/src/cloud-security/aws-command-executor.ts
+++ b/apps/api/src/cloud-security/aws-command-executor.ts
@@ -264,6 +264,46 @@ function normaliseInputParams(
     if (!input.IsMultiRegionTrail) input.IsMultiRegionTrail = true;
     if (!input.EnableLogFileValidation) input.EnableLogFileValidation = true;
   }
+
+  // Rule 4: AWS Config recorder — `allSupported: true` is mutually exclusive
+  // with `recordingStrategy`, `exclusionByResourceTypes`, and `resourceTypes`.
+  // AWS rejects the combination with a ValidationException. The AI (and a
+  // customer's existing exclusion-based recorder, e.g. one that excludes the
+  // IAM resource types) frequently echoes those fields back alongside
+  // allSupported:true. When the intent is "record all supported types",
+  // collapse recordingGroup to the single valid shape that records everything,
+  // including the global IAM resource types.
+  if (command === 'PutConfigurationRecorderCommand') {
+    normalizeConfigRecordingGroup(input);
+  }
+}
+
+/**
+ * Rewrites an all-supported-intent recordingGroup in place to exactly
+ * { allSupported: true, includeGlobalResourceTypes: true }. A bare
+ * exclusionByResourceTypes key is ignored under an INCLUSION strategy.
+ */
+export function normalizeConfigRecordingGroup(
+  input: Record<string, unknown>,
+): void {
+  const isPlainObject = (v: unknown): v is Record<string, unknown> =>
+    typeof v === 'object' && v !== null && !Array.isArray(v);
+  const recorder = input.ConfigurationRecorder;
+  if (!isPlainObject(recorder)) return;
+  const group = recorder.recordingGroup;
+  if (!isPlainObject(group)) return;
+  const strategy = group.recordingStrategy;
+  const useOnly = isPlainObject(strategy) ? strategy.useOnly : undefined;
+  const wantsAllSupported =
+    group.allSupported === true ||
+    useOnly === 'ALL_SUPPORTED_RESOURCE_TYPES' ||
+    (group.exclusionByResourceTypes != null &&
+      useOnly !== 'INCLUSION_BY_RESOURCE_TYPES');
+  if (!wantsAllSupported) return;
+  recorder.recordingGroup = {
+    allSupported: true,
+    includeGlobalResourceTypes: true,
+  };
+}
 
 /**
diff --git a/apps/api/src/cloud-security/providers/aws/config.adapter.spec.ts b/apps/api/src/cloud-security/providers/aws/config.adapter.spec.ts
new file mode 100644
index 0000000000..c50f8bf46d
--- /dev/null
+++ b/apps/api/src/cloud-security/providers/aws/config.adapter.spec.ts
@@ -0,0 +1,166 @@
+import {
+  DescribeConfigurationRecordersCommand,
+  DescribeConfigurationRecorderStatusCommand,
+  type ConfigurationRecorder,
+} from '@aws-sdk/client-config-service';
+import { ConfigAdapter } from './config.adapter';
+import type { SecurityFinding } from '../../cloud-security.service';
+
+type SendHandler = (command: unknown) => unknown;
+
+/**
+ * Builds a stub Config client whose `send` resolves to `handler(command)`.
+ * Only `send` exists on the stub, hence the cast to the SDK client type.
+ */
+function buildClient(handler: SendHandler) {
+  const send = jest.fn((cmd: unknown) => Promise.resolve(handler(cmd)));
+  type Client = import('@aws-sdk/client-config-service').ConfigServiceClient;
+  return { send } as unknown as Client;
+}
+
+/** Invoke the private checkRecorders() with a mocked Config client. */
+function runCheckRecorders(args: {
+  recorders: ConfigurationRecorder[];
+  recording: boolean;
+}): Promise<SecurityFinding[]> {
+  const adapter = new ConfigAdapter();
+  const handler: SendHandler = (command) => {
+    if (command instanceof DescribeConfigurationRecordersCommand) {
+      return { ConfigurationRecorders: args.recorders };
+    }
+    if (command instanceof DescribeConfigurationRecorderStatusCommand) {
+      return {
+        ConfigurationRecordersStatus: args.recorders.length
+          ? [{ recording: args.recording }]
+          : [],
+      };
+    }
+    return {};
+  };
+  const client = buildClient(handler);
+  const fn = (
+    adapter as unknown as {
+      checkRecorders: (
+        c: unknown,
+        region: string,
+      ) => Promise<SecurityFinding[]>;
+    }
+  ).checkRecorders;
+  return fn.call(adapter, client, 'us-east-1');
+}
+
+describe('ConfigAdapter — checkRecorders recording-model awareness', () => {
+  it('passes a legacy recorder with recordingGroup.allSupported=true', async () => {
+    const findings = await runCheckRecorders({
+      recorders: [{ name: 'default', recordingGroup: { allSupported: true } }],
+      recording: true,
+    });
+    expect(findings).toHaveLength(1);
+    expect(findings[0].passed).toBe(true);
+    expect(findings[0].title).toBe('AWS Config recorder is active');
+  });
+
+  it('passes a new-model recorder using recordingStrategy ALL_SUPPORTED_RESOURCE_TYPES (regression: no longer false-flagged)', async () => {
+    const findings = await runCheckRecorders({
+      recorders: [
+        {
+          name: 'default',
+          // New model: allSupported is false/absent; the strategy is the source of truth.
+          recordingGroup: {
+            allSupported: false,
+            recordingStrategy: { useOnly: 'ALL_SUPPORTED_RESOURCE_TYPES' },
+          },
+        },
+      ],
+      recording: true,
+    });
+    expect(findings).toHaveLength(1);
+    expect(findings[0].passed).toBe(true);
+    expect(findings[0].title).toBe('AWS Config recorder is active');
+  });
+
+  it('flags a recorder using the EXCLUSION_BY_RESOURCE_TYPES strategy (e.g. IAM excluded)', async () => {
+    const findings = await runCheckRecorders({
+      recorders: [
+        {
+          name: 'default',
+          recordingGroup: {
+            allSupported: false,
+            recordingStrategy: { useOnly: 'EXCLUSION_BY_RESOURCE_TYPES' },
+            exclusionByResourceTypes: {
+              resourceTypes: [
+                'AWS::IAM::User',
+                'AWS::IAM::Role',
+                'AWS::IAM::Group',
+                'AWS::IAM::Policy',
+              ],
+            },
+          },
+        },
+      ],
+      recording: true,
+    });
+    expect(findings).toHaveLength(1);
+    expect(findings[0].passed).toBeFalsy();
+    expect(findings[0].title).toBe('AWS Config recorder not fully active');
+    expect(findings[0].severity).toBe('high');
+  });
+
+  it('flags an INCLUSION_BY_RESOURCE_TYPES recorder (records only specific types)', async () => {
+    const findings = await runCheckRecorders({
+      recorders: [
+        {
+          name: 'default',
+          recordingGroup: {
+            allSupported: false,
+            recordingStrategy: { useOnly: 'INCLUSION_BY_RESOURCE_TYPES' },
+            resourceTypes: ['AWS::S3::Bucket'],
+          },
+        },
+      ],
+      recording: true,
+    });
+    expect(findings[0].passed).toBeFalsy();
+    expect(findings[0].title).toBe('AWS Config recorder not fully active');
+  });
+
+  it('flags an all-supported recorder that is stopped', async () => {
+    const findings = await runCheckRecorders({
+      recorders: [{ name: 'default', recordingGroup: { allSupported: true } }],
+      recording: false,
+    });
+    expect(findings[0].passed).toBeFalsy();
+    expect(findings[0].description).toContain('not recording');
+  });
+
+  it('produces AWS-valid remediation guidance (clean recordingGroup, no conflicting fields)', async () => {
+    const findings = await runCheckRecorders({
+      recorders: [
+        {
+          name: 'default',
+          recordingGroup: {
+            allSupported: false,
+            recordingStrategy: { useOnly: 'EXCLUSION_BY_RESOURCE_TYPES' },
+            exclusionByResourceTypes: { resourceTypes: ['AWS::IAM::Role'] },
+          },
+        },
+      ],
+      recording: true,
+    });
+    const remediation = findings[0].remediation ?? '';
+    expect(remediation).toContain('allSupported: true');
+    expect(remediation).toContain('includeGlobalResourceTypes: true');
+    expect(remediation).toContain('ValidationException');
+    // Must instruct AGAINST mixing the conflicting fields.
+    expect(remediation).toMatch(/Do NOT include recordingStrategy/);
+  });
+
+  it('flags a missing recorder with includeGlobalResourceTypes in the create guidance', async () => {
+    const findings = await runCheckRecorders({
+      recorders: [],
+      recording: false,
+    });
+    expect(findings[0].title).toBe('AWS Config recorder not configured');
+    expect(findings[0].remediation).toContain('includeGlobalResourceTypes');
+  });
+});
diff --git a/apps/api/src/cloud-security/providers/aws/config.adapter.ts b/apps/api/src/cloud-security/providers/aws/config.adapter.ts
index 2994e1181c..0130317e01 100644
--- a/apps/api/src/cloud-security/providers/aws/config.adapter.ts
+++ b/apps/api/src/cloud-security/providers/aws/config.adapter.ts
@@ -1,5 +1,6 @@
 import {
   ConfigServiceClient,
+  type ConfigurationRecorder,
   DescribeConfigurationRecordersCommand,
   DescribeConfigurationRecorderStatusCommand,
   DescribeDeliveryChannelsCommand,
@@ -63,7 +64,7 @@ export class ConfigAdapter implements AwsServiceAdapter {
           severity: 'high',
           resourceId: `arn:aws:config:${region}`,
           remediation:
-            'Step 1: Create a service-linked role using iam:CreateServiceLinkedRoleCommand with AWSServiceName set to "config.amazonaws.com" (skip if the role already exists). Step 2: Create a configuration recorder using config-service:PutConfigurationRecorderCommand with ConfigurationRecorder containing name "compai-config-recorder", roleARN set to the Config service role ARN, and recordingGroup with allSupported set to true. Step 3: Create a delivery channel using config-service:PutDeliveryChannelCommand with DeliveryChannel containing name "compai-delivery-channel" and s3BucketName set to the target bucket. Step 4: Start the recorder using config-service:StartConfigurationRecorderCommand with ConfigurationRecorderName "compai-config-recorder". Rollback by calling config-service:StopConfigurationRecorderCommand with ConfigurationRecorderName "compai-config-recorder".',
+            'Step 1: Create a service-linked role using iam:CreateServiceLinkedRoleCommand with AWSServiceName set to "config.amazonaws.com" (skip if the role already exists). Step 2: Create a configuration recorder using config-service:PutConfigurationRecorderCommand with ConfigurationRecorder containing name "compai-config-recorder", roleARN set to the Config service role ARN, and recordingGroup set to exactly { allSupported: true, includeGlobalResourceTypes: true }. Do NOT include recordingStrategy, exclusionByResourceTypes, or resourceTypes in recordingGroup — allSupported:true is mutually exclusive with those and AWS rejects the request with a ValidationException. Step 3: Create a delivery channel using config-service:PutDeliveryChannelCommand with DeliveryChannel containing name "compai-delivery-channel" and s3BucketName set to the target bucket. Step 4: Start the recorder using config-service:StartConfigurationRecorderCommand with ConfigurationRecorderName "compai-config-recorder". Rollback by calling config-service:StopConfigurationRecorderCommand with ConfigurationRecorderName "compai-config-recorder".',
         }),
       );
       return findings;
@@ -76,9 +77,9 @@ export class ConfigAdapter implements AwsServiceAdapter {
     const status = ConfigurationRecordersStatus?.[0];
     const recorder = ConfigurationRecorders[0];
     const isRecording = status?.recording === true;
-    const allSupported = recorder?.recordingGroup?.allSupported === true;
+    const recordsAllSupported = this.recordsAllSupportedResources(recorder);
 
-    if (isRecording && allSupported) {
+    if (isRecording && recordsAllSupported) {
       findings.push(
         this.makeFinding({
           id: `config-recorder-enabled-${region}`,
@@ -98,7 +99,7 @@ export class ConfigAdapter implements AwsServiceAdapter {
           severity: 'high',
           resourceId: recorder.name ?? `config-recorder-${region}`,
           remediation:
-            'Use config-service:PutConfigurationRecorderCommand with ConfigurationRecorder containing the existing recorder name, roleARN, and recordingGroup with allSupported set to true. Then call config-service:StartConfigurationRecorderCommand with ConfigurationRecorderName set to the recorder name. Rollback by calling config-service:StopConfigurationRecorderCommand with ConfigurationRecorderName set to the recorder name.',
+            'Step 1: Read the existing recorder with config-service:DescribeConfigurationRecordersCommand to get its exact name and roleARN. Step 2: Call config-service:PutConfigurationRecorderCommand with ConfigurationRecorder set to { name: <existing recorder name>, roleARN: <existing roleARN>, recordingGroup: { allSupported: true, includeGlobalResourceTypes: true } }. Do NOT include recordingStrategy, exclusionByResourceTypes, or resourceTypes in recordingGroup — allSupported:true is mutually exclusive with those and AWS rejects the request with a ValidationException. This overwrites any exclusion-based recording strategy so all supported resource types (including the global IAM resource types) are recorded. Step 3: Call config-service:StartConfigurationRecorderCommand with ConfigurationRecorderName set to the recorder name. Rollback by calling config-service:StopConfigurationRecorderCommand with ConfigurationRecorderName set to the recorder name.',
         }),
       );
     }
@@ -106,6 +107,29 @@ export class ConfigAdapter implements AwsServiceAdapter {
     return findings;
   }
 
+  /**
+   * Whether the recorder captures ALL supported resource types.
+   *
+   * AWS Config has two recording models:
+   *  - Legacy: `recordingGroup.allSupported === true`.
+   *  - Current ("Record all resource types with customizable overrides"):
+   *    `recordingGroup.recordingStrategy.useOnly === 'ALL_SUPPORTED_RESOURCE_TYPES'`.
+   *    On this model `allSupported` is typically false, so the legacy-only
+   *    check produced false positives for recorders that DO record everything.
+   *
+   * A recorder using `EXCLUSION_BY_RESOURCE_TYPES` (records all-except) or
+   * `INCLUSION_BY_RESOURCE_TYPES` (records only-listed) does NOT capture
+   * every supported type, so it stays flagged for remediation.
+   */
+  private recordsAllSupportedResources(
+    recorder: ConfigurationRecorder | undefined,
+  ): boolean {
+    const group = recorder?.recordingGroup;
+    if (!group) return false;
+    if (group.allSupported === true) return true;
+    return group.recordingStrategy?.useOnly === 'ALL_SUPPORTED_RESOURCE_TYPES';
+  }
+
   private async checkDeliveryChannels(
     client: ConfigServiceClient,
     region: string,
diff --git a/apps/api/src/cloud-security/remediation.service.spec.ts b/apps/api/src/cloud-security/remediation.service.spec.ts
index 7436e8dcc4..8c23c01baf 100644
--- a/apps/api/src/cloud-security/remediation.service.spec.ts
+++ b/apps/api/src/cloud-security/remediation.service.spec.ts
@@ -75,3 +75,34 @@ describe('RemediationService.previewRemediation', () => {
     expect(getDecryptedCredentials).not.toHaveBeenCalled();
   });
 });
+
+describe('RemediationService.isUsablePlan (plan-cache guard)', () => {
+  const service = makeService();
+  const callIsUsable = (plan: unknown): boolean =>
+    (
+      service as unknown as { isUsablePlan: (p: unknown) => boolean }
+    ).isUsablePlan(plan);
+
+  it('treats an empty fix plan as unusable so it is never cached/reused (Retry can regenerate)', () => {
+    expect(callIsUsable({ canAutoFix: true, fixSteps: [] })).toBe(false);
+  });
+
+  it('treats a non-auto-fixable plan as unusable', () => {
+    expect(
+      callIsUsable({ canAutoFix: false, fixSteps: [{ command: 'X' }] }),
+    ).toBe(false);
+  });
+
+  it('treats undefined as unusable', () => {
+    expect(callIsUsable(undefined)).toBe(false);
+  });
+
+  it('treats an auto-fixable plan with at least one fix step as usable', () => {
+    expect(
+      callIsUsable({
+        canAutoFix: true,
+        fixSteps: [{ command: 'PutConfigurationRecorderCommand' }],
+      }),
+    ).toBe(true);
+  });
+});
diff --git a/apps/api/src/cloud-security/remediation.service.ts b/apps/api/src/cloud-security/remediation.service.ts
index 7b7275b605..9a2e760726 100644
--- a/apps/api/src/cloud-security/remediation.service.ts
+++ b/apps/api/src/cloud-security/remediation.service.ts
@@ -37,6 +37,18 @@ export class RemediationService {
   private readonly PLAN_CACHE_MAX = 100;
   private readonly PLAN_CACHE_TTL = 5 * 60 * 1000;
 
+  /**
+   * A plan is only worth caching/reusing if it can actually be auto-applied.
+   * Caching an empty or non-auto-fixable plan makes "Retry" a guaranteed
+   * no-op: execute would reload the same dead plan and fail identically,
+   * never re-running the (non-deterministic) AI generation that might succeed.
+   */
+  private isUsablePlan(plan: FixPlan | undefined): boolean {
+    return Boolean(
+      plan?.canAutoFix && plan.fixSteps && plan.fixSteps.length > 0,
+    );
+  }
+
   private evictStalePlans() {
     if (this.planCache.size <= this.PLAN_CACHE_MAX) return;
     const now = Date.now();
@@ -350,12 +362,16 @@ export class RemediationService {
               this.buildStaticPermissionScript(permissionsList);
           }
 
-          // Cache the refined plan + permissions for execute and Recheck
-          this.evictStalePlans();
-          this.planCache.set(
-            `${params.connectionId}:${params.checkResultId}:${params.remediationKey}`,
-            { plan: refined, timestamp: Date.now(), permissionsList },
-          );
+          // Cache the refined plan + permissions for execute and Recheck.
+          // Never cache an unusable (empty / non-auto-fixable) plan — caching
+          // one turns "Retry" into a no-op that reloads the same dead plan.
+          if (this.isUsablePlan(refined)) {
+            this.evictStalePlans();
+            this.planCache.set(
+              `${params.connectionId}:${params.checkResultId}:${params.remediationKey}`,
+              { plan: refined, timestamp: Date.now(), permissionsList },
+            );
+          }
 
           return {
             currentState: refined.currentState,
@@ -381,16 +397,19 @@ export class RemediationService {
       }
     }
 
-    // Fallback: show initial AI plan without real data
-    this.evictStalePlans();
-    this.planCache.set(
-      `${params.connectionId}:${params.checkResultId}:${params.remediationKey}`,
-      {
-        plan,
-        timestamp: Date.now(),
-        permissionsList: plan.requiredPermissions,
-      },
-    );
+    // Fallback: show initial AI plan without real data. Only cache it when
+    // usable — caching an empty/non-auto-fixable plan makes Retry a no-op.
+    if (this.isUsablePlan(plan)) {
+      this.evictStalePlans();
+      this.planCache.set(
+        `${params.connectionId}:${params.checkResultId}:${params.remediationKey}`,
+        {
+          plan,
+          timestamp: Date.now(),
+          permissionsList: plan.requiredPermissions,
+        },
+      );
+    }
 
     return {
       currentState: plan.currentState,
@@ -440,12 +459,21 @@ export class RemediationService {
 
     // Get plan from cache or regenerate
     let plan: FixPlan;
-    const cached = this.planCache.get(
-      `${params.connectionId}:${params.checkResultId}:${params.remediationKey}`,
-    );
-    if (cached && Date.now() - cached.timestamp < 5 * 60 * 1000) {
+    const cacheKey = `${params.connectionId}:${params.checkResultId}:${params.remediationKey}`;
+    const cached = this.planCache.get(cacheKey);
+    // Only reuse a cached plan if it is still fresh AND usable. Reusing a
+    // stale empty / non-auto-fixable plan is exactly what made "Retry" a
+    // no-op — execute reloaded the same dead plan and failed identically.
+    // Falling through (and dropping the dead entry) regenerates a fresh plan,
+    // which is what gives Retry a chance to succeed.
+    if (
+      cached &&
+      Date.now() - cached.timestamp < this.PLAN_CACHE_TTL &&
+      this.isUsablePlan(cached.plan)
+    ) {
       plan = cached.plan;
     } else {
+      this.planCache.delete(cacheKey);
       const evidence = (finding.evidence ?? {}) as Record<string, unknown>;
       plan = await this.aiRemediationService.generateFixPlan({
         title: finding.title ?? 'Unknown',
diff --git a/apps/app/src/app/(app)/[orgId]/cloud-tests/components/RemediationDialog.tsx b/apps/app/src/app/(app)/[orgId]/cloud-tests/components/RemediationDialog.tsx
index f847545787..03d2dcced8 100644
--- a/apps/app/src/app/(app)/[orgId]/cloud-tests/components/RemediationDialog.tsx
+++ b/apps/app/src/app/(app)/[orgId]/cloud-tests/components/RemediationDialog.tsx
@@ -512,6 +512,11 @@ export function RemediationDialog({
   };
 
   const isGuided = preview?.guidedOnly;
+  // An auto-fix plan with no API calls has nothing to apply (the AI returned
+  // an empty/unusable plan). Block Apply so the user can't submit a no-op that
+  // fails server-side — they can Cancel and re-open to regenerate.
+  const hasNothingToApply =
+    !isGuided && (preview?.apiCalls?.length ?? 0) === 0;
 
   return (
     <Dialog open={open} onOpenChange={onOpenChange}>
@@ -766,6 +771,14 @@ export function RemediationDialog({
                     </label>
                   )}
 
+                  {hasNothingToApply && (
+                    <p className="text-xs text-muted-foreground">
+                      We couldn&apos;t build an automatic fix for this finding.
+                      Close this dialog and try again, or follow the manual
+                      remediation guidance for this finding.
+                    </p>
+                  )}
+
                   <div className="flex justify-end gap-2 pt-2">
                     <Button
                       variant="outline"
@@ -778,7 +791,7 @@ export function RemediationDialog({
                     <Button
                       size="sm"
                       onClick={handleExecute}
-                      disabled={isExecuting || acknowledgment !== 'acknowledged' || (preview.missingPermissions?.length ?? 0) > 0}
+                      disabled={isExecuting || acknowledgment !== 'acknowledged' || (preview.missingPermissions?.length ?? 0) > 0 || hasNothingToApply}
                     >
                       {isExecuting ? (
                         <Loader2 className="mr-1.5 h-3.5 w-3.5 animate-spin" />

From 10611bcd527fe6bda914ae9a367b8a7fbb85c40e Mon Sep 17 00:00:00 2001
From: Tofik Hasanov <annexcies@gmail.com>
Date: Thu, 4 Jun 2026 18:04:49 -0400
Subject: [PATCH 2/3] fix(cloud-security): port remediation robustness fixes to
 GCP and Azure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Retry-no-op and empty-plan bugs fixed for AWS also existed in the separate
GCP and Azure remediation paths (own services + own planCache + own
generators). Bring them to parity.

GCP (gcp-remediation.service.ts):
- isUsablePlan guard: never cache an empty / non-auto-fixable plan; on execute,
  only reuse a cached plan that is fresh AND usable, and drop the cached entry
  otherwise. Reusing it was the cause of the Retry no-op: execute reloaded the
  same dead plan.
Azure (azure-remediation.service.ts):
- Same isUsablePlan cache guard on preview + execute.
- Delete the cached plan in the execute catch block so Retry regenerates.
- If the refined plan flips canAutoFix to false, return guided steps instead of
  a misleading auto-fix preview.
GCP + Azure generators (ai-remediation.service.ts):
- generateGcpFixPlan / generateAzureFixPlan now retry once at a higher
  temperature (0.5) when the first pass returns canAutoFix=true with zero fix
  steps; retrying at temperature 0 would tend to reproduce the same empty plan.

Tests: GCP/Azure empty-plan retry cases added; full cloud-security jest suite
green (292 passing). Typecheck clean for all changed files.

Note (flagged, NOT changed here — separate scope / higher risk):
- GCP/Azure scanners swallow per-adapter/per-scope errors (return [] on
  failure), so a real API/permission failure can look like "0 findings".
- GCP Cloud SQL databaseFlags is a REPLACE op with no guard that all existing
  flags are preserved; disabling public IP has no private-IP precondition check.
These are real but need their own design + tests before touching.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../ai-remediation.service.spec.ts            | 75 +++++++++++++++++++
 .../cloud-security/ai-remediation.service.ts  | 70 +++++++++++++----
 .../azure-remediation.service.ts              | 40 +++++++++-
 .../cloud-security/gcp-remediation.service.ts | 68 +++++++++++------
 4 files changed, 213 insertions(+), 40 deletions(-)

diff --git a/apps/api/src/cloud-security/ai-remediation.service.spec.ts b/apps/api/src/cloud-security/ai-remediation.service.spec.ts
index d40c92eb85..5c223e9139 100644
--- a/apps/api/src/cloud-security/ai-remediation.service.spec.ts
+++ b/apps/api/src/cloud-security/ai-remediation.service.spec.ts
@@ -603,3 +603,78 @@ describe('AiRemediationService.generateFixPlan empty-plan retry', () => {
     expect(generateObjectMock).toHaveBeenCalledTimes(1);
   });
 });
+
+describe('AiRemediationService GCP/Azure empty-plan retry', () => {
+  const generateObjectMock = generateObject as unknown as jest.Mock;
+
+  beforeEach(() => {
+    generateObjectMock.mockReset();
+  });
+
+  const finding = {
+    title: 'finding',
+    description: null,
+    severity: 'high',
+    resourceType: 'CloudResource',
+    resourceId: 'r',
+    remediation: null,
+    findingKey: 'fk',
+    evidence: {},
+  };
+
+  it('GCP: retries once at higher temperature when the first plan is empty', async () => {
+    generateObjectMock.mockResolvedValueOnce({
+      object: { canAutoFix: true, fixSteps: [] },
+    });
+    generateObjectMock.mockResolvedValueOnce({
+      object: { canAutoFix: true, fixSteps: [{ method: 'PATCH' }] },
+    });
+
+    const service = new AiRemediationService();
+    const plan = await service.generateGcpFixPlan(finding);
+
+    expect(generateObjectMock).toHaveBeenCalledTimes(2);
+    expect(generateObjectMock.mock.calls[0][0].temperature).toBe(0);
+    expect(generateObjectMock.mock.calls[1][0].temperature).toBeGreaterThan(0);
+    expect(plan.fixSteps).toHaveLength(1);
+  });
+
+  it('GCP: does not retry when the first plan already has steps', async () => {
+    generateObjectMock.mockResolvedValueOnce({
+      object: { canAutoFix: true, fixSteps: [{ method: 'PATCH' }] },
+    });
+
+    const service = new AiRemediationService();
+    await service.generateGcpFixPlan(finding);
+
+    expect(generateObjectMock).toHaveBeenCalledTimes(1);
+  });
+
+  it('Azure: retries once at higher temperature when the first plan is empty', async () => {
+    generateObjectMock.mockResolvedValueOnce({
+      object: { canAutoFix: true, fixSteps: [] },
+    });
+    generateObjectMock.mockResolvedValueOnce({
+      object: { canAutoFix: true, fixSteps: [{ method: 'PATCH' }] },
+    });
+
+    const service = new AiRemediationService();
+    const plan = await service.generateAzureFixPlan(finding);
+
+    expect(generateObjectMock).toHaveBeenCalledTimes(2);
+    expect(generateObjectMock.mock.calls[0][0].temperature).toBe(0);
+    expect(generateObjectMock.mock.calls[1][0].temperature).toBeGreaterThan(0);
+    expect(plan.fixSteps).toHaveLength(1);
+  });
+
+  it('Azure: does not retry when the first plan already has steps', async () => {
+    generateObjectMock.mockResolvedValueOnce({
+      object: { canAutoFix: true, fixSteps: [{ method: 'PATCH' }] },
+    });
+
+    const service = new AiRemediationService();
+    await service.generateAzureFixPlan(finding);
+
+    expect(generateObjectMock).toHaveBeenCalledTimes(1);
+  });
+});
diff --git a/apps/api/src/cloud-security/ai-remediation.service.ts b/apps/api/src/cloud-security/ai-remediation.service.ts
index ba09a5b95f..1c52423d11 100644
--- a/apps/api/src/cloud-security/ai-remediation.service.ts
+++ b/apps/api/src/cloud-security/ai-remediation.service.ts
@@ -460,13 +460,19 @@ Produce 3-8 ordered steps. Each step is a single concrete action the customer ca
   async generateGcpFixPlan(finding: FindingContext): Promise<GcpFixPlan> {
     for (let attempt = 0; attempt < 2; attempt++) {
       try {
-        const { object } = await generateObject({
-          model: MODEL,
-          schema: gcpFixPlanSchema,
-          system: GCP_SYSTEM_PROMPT,
-          prompt: buildGcpFixPlanPrompt(finding),
-          temperature: 0,
-        });
+        let object = await this.requestGcpFixPlan(finding, 0);
+
+        // canAutoFix=true with zero fixSteps surfaces as "AI generated an
+        // empty fix plan" and (with caching) a Retry that does nothing.
+        // Generation is non-deterministic — retry once at a higher
+        // temperature to force a genuinely different sample.
+        if (object.canAutoFix && object.fixSteps.length === 0) {
+          this.logger.warn(
+            `Empty GCP fix plan for ${finding.findingKey}; regenerating once at higher temperature`,
+          );
+          const retry = await this.requestGcpFixPlan(finding, 0.5);
+          if (retry.fixSteps.length > 0) object = retry;
+        }
 
         this.logger.log(
           `GCP AI plan for ${finding.findingKey}: canAutoFix=${object.canAutoFix}, risk=${object.risk}`,
@@ -483,6 +489,21 @@ Produce 3-8 ordered steps. Each step is a single concrete action the customer ca
     return this.fallbackGcpPlan(finding);
   }
 
+  /** Single GCP fix-plan generation pass. */
+  private async requestGcpFixPlan(
+    finding: FindingContext,
+    temperature: number,
+  ): Promise<GcpFixPlan> {
+    const { object } = await generateObject({
+      model: MODEL,
+      schema: gcpFixPlanSchema,
+      system: GCP_SYSTEM_PROMPT,
+      prompt: buildGcpFixPlanPrompt(finding),
+      temperature,
+    });
+    return object;
+  }
+
   async refineGcpFixPlan(params: {
     finding: FindingContext;
     originalPlan: GcpFixPlan;
@@ -527,13 +548,19 @@ Generate the complete fix plan with EXACT JSON values from the real GCP state.`,
 
   async generateAzureFixPlan(finding: FindingContext): Promise<AzureFixPlan> {
     try {
-      const { object } = await generateObject({
-        model: MODEL,
-        schema: azureFixPlanSchema,
-        system: AZURE_SYSTEM_PROMPT,
-        prompt: buildAzureFixPlanPrompt(finding),
-        temperature: 0,
-      });
+      let object = await this.requestAzureFixPlan(finding, 0);
+
+      // canAutoFix=true with zero fixSteps surfaces as "AI generated an empty
+      // fix plan" and (with caching) a Retry that does nothing. Generation is
+      // non-deterministic — retry once at a higher temperature to force a
+      // genuinely different sample.
+      if (object.canAutoFix && object.fixSteps.length === 0) {
+        this.logger.warn(
+          `Empty Azure fix plan for ${finding.findingKey}; regenerating once at higher temperature`,
+        );
+        const retry = await this.requestAzureFixPlan(finding, 0.5);
+        if (retry.fixSteps.length > 0) object = retry;
+      }
 
       this.logger.log(
         `Azure AI plan for ${finding.findingKey}: canAutoFix=${object.canAutoFix}, risk=${object.risk}`,
@@ -547,6 +574,21 @@ Generate the complete fix plan with EXACT JSON values from the real GCP state.`,
     }
   }
 
+  /** Single Azure fix-plan generation pass. */
+  private async requestAzureFixPlan(
+    finding: FindingContext,
+    temperature: number,
+  ): Promise<AzureFixPlan> {
+    const { object } = await generateObject({
+      model: MODEL,
+      schema: azureFixPlanSchema,
+      system: AZURE_SYSTEM_PROMPT,
+      prompt: buildAzureFixPlanPrompt(finding),
+      temperature,
+    });
+    return object;
+  }
+
   async refineAzureFixPlan(params: {
     finding: FindingContext;
     originalPlan: AzureFixPlan;
diff --git a/apps/api/src/cloud-security/azure-remediation.service.ts b/apps/api/src/cloud-security/azure-remediation.service.ts
index 07ae2e2815..215ee85c87 100644
--- a/apps/api/src/cloud-security/azure-remediation.service.ts
+++ b/apps/api/src/cloud-security/azure-remediation.service.ts
@@ -23,6 +23,17 @@ export class AzureRemediationService {
   >();
   private readonly PLAN_CACHE_MAX = 100;
 
+  /**
+   * A plan is only worth caching/reusing if it can actually be auto-applied.
+   * Caching an empty or non-auto-fixable plan makes "Retry" a guaranteed
+   * no-op: execute would reload the same dead plan and fail identically.
+   */
+  private isUsablePlan(plan: AzureFixPlan | undefined): boolean {
+    return Boolean(
+      plan?.canAutoFix && plan.fixSteps && plan.fixSteps.length > 0,
+    );
+  }
+
   private evictStalePlans() {
     if (this.planCache.size <= this.PLAN_CACHE_MAX) return;
     const now = Date.now();
@@ -101,6 +112,12 @@ export class AzureRemediationService {
       }
     }
 
+    // The refined plan can flip canAutoFix to false after seeing real state —
+    // surface guided steps instead of a misleading auto-fix preview.
+    if (!plan.canAutoFix) {
+      return this.buildGuidedResponse(plan);
+    }
+
     // Validate fix steps
     const validationErrors = validateAzurePlanSteps(plan.fixSteps);
     if (validationErrors.length > 0) {
@@ -110,10 +127,14 @@ export class AzureRemediationService {
       return this.buildGuidedResponse(plan);
     }
 
-    // Cache plan for execute
+    // Cache plan for execute. Never cache an unusable (empty / non-auto-
+    // fixable) plan — caching one turns "Retry" into a no-op that reloads the
+    // same dead plan.
     const cacheKey = `${params.connectionId}:${params.checkResultId}:${params.remediationKey}`;
-    this.evictStalePlans();
-    this.planCache.set(cacheKey, { plan, timestamp: Date.now() });
+    if (this.isUsablePlan(plan)) {
+      this.evictStalePlans();
+      this.planCache.set(cacheKey, { plan, timestamp: Date.now() });
+    }
 
     return this.buildPreviewResponse(plan);
   }
@@ -141,9 +162,17 @@ export class AzureRemediationService {
     const cached = this.planCache.get(cacheKey);
     let plan: AzureFixPlan;
 
-    if (cached && Date.now() - cached.timestamp < PLAN_CACHE_TTL) {
+    // Only reuse a fresh AND usable plan — reusing a stale empty / non-auto-
+    // fixable plan is what makes "Retry" a no-op (execute reloads the same
+    // dead plan and fails identically).
+    if (
+      cached &&
+      Date.now() - cached.timestamp < PLAN_CACHE_TTL &&
+      this.isUsablePlan(cached.plan)
+    ) {
       plan = cached.plan;
     } else {
+      this.planCache.delete(cacheKey);
       plan = await this.aiRemediationService.generateAzureFixPlan(finding);
       if (!plan.canAutoFix) {
         throw new Error(
@@ -418,6 +447,9 @@ export class AzureRemediationService {
       };
     } catch (error) {
       const msg = error instanceof Error ? error.message : String(error);
+      // Drop the cached plan so a subsequent "Retry" regenerates instead of
+      // reloading the plan that just failed.
+      this.planCache.delete(cacheKey);
       await db.remediationAction.update({
         where: { id: action.id },
         data: {
diff --git a/apps/api/src/cloud-security/gcp-remediation.service.ts b/apps/api/src/cloud-security/gcp-remediation.service.ts
index fd722bdc27..bdd8fda1df 100644
--- a/apps/api/src/cloud-security/gcp-remediation.service.ts
+++ b/apps/api/src/cloud-security/gcp-remediation.service.ts
@@ -21,6 +21,17 @@ export class GcpRemediationService {
   private readonly PLAN_CACHE_MAX = 100;
   private readonly PLAN_CACHE_TTL = 5 * 60 * 1000;
 
+  /**
+   * A plan is only worth caching/reusing if it can actually be auto-applied.
+   * Caching an empty or non-auto-fixable plan makes "Retry" a guaranteed
+   * no-op: execute would reload the same dead plan and fail identically.
+   */
+  private isUsablePlan(plan: GcpFixPlan | undefined): boolean {
+    return Boolean(
+      plan?.canAutoFix && plan.fixSteps && plan.fixSteps.length > 0,
+    );
+  }
+
   private evictStalePlans() {
     if (this.planCache.size <= this.PLAN_CACHE_MAX) return;
     const now = Date.now();
@@ -137,14 +148,18 @@ export class GcpRemediationService {
             };
           }
 
-          this.evictStalePlans();
-          this.planCache.set(
-            `${params.connectionId}:${params.checkResultId}:${params.remediationKey}`,
-            {
-              plan: refined,
-              timestamp: Date.now(),
-            },
-          );
+          // Never cache an unusable (empty / non-auto-fixable) plan — caching
+          // one turns "Retry" into a no-op that reloads the same dead plan.
+          if (this.isUsablePlan(refined)) {
+            this.evictStalePlans();
+            this.planCache.set(
+              `${params.connectionId}:${params.checkResultId}:${params.remediationKey}`,
+              {
+                plan: refined,
+                timestamp: Date.now(),
+              },
+            );
+          }
 
           return this.buildPreviewResponse(refined);
         } catch {
@@ -153,15 +168,18 @@ export class GcpRemediationService {
       }
     }
 
-    // Fallback: show initial AI plan without real data
-    this.evictStalePlans();
-    this.planCache.set(
-      `${params.connectionId}:${params.checkResultId}:${params.remediationKey}`,
-      {
-        plan,
-        timestamp: Date.now(),
-      },
-    );
+    // Fallback: show initial AI plan without real data. Only cache it when
+    // usable — caching an empty/non-auto-fixable plan makes Retry a no-op.
+    if (this.isUsablePlan(plan)) {
+      this.evictStalePlans();
+      this.planCache.set(
+        `${params.connectionId}:${params.checkResultId}:${params.remediationKey}`,
+        {
+          plan,
+          timestamp: Date.now(),
+        },
+      );
+    }
     return this.buildPreviewResponse(plan);
   }
 
@@ -175,14 +193,20 @@ export class GcpRemediationService {
   }) {
     const { finding, accessToken } = await this.resolveContext(params);
 
-    // Get plan from cache or regenerate
+    // Get plan from cache or regenerate. Only reuse a fresh AND usable plan —
+    // reusing a stale empty / non-auto-fixable plan is what makes "Retry" a
+    // no-op (execute reloads the same dead plan and fails identically).
     let plan: GcpFixPlan;
-    const cached = this.planCache.get(
-      `${params.connectionId}:${params.checkResultId}:${params.remediationKey}`,
-    );
-    if (cached && Date.now() - cached.timestamp < 5 * 60 * 1000) {
+    const cacheKey = `${params.connectionId}:${params.checkResultId}:${params.remediationKey}`;
+    const cached = this.planCache.get(cacheKey);
+    if (
+      cached &&
+      Date.now() - cached.timestamp < this.PLAN_CACHE_TTL &&
+      this.isUsablePlan(cached.plan)
+    ) {
       plan = cached.plan;
     } else {
+      this.planCache.delete(cacheKey);
       const evidence = (finding.evidence ?? {}) as Record<string, unknown>;
       plan = await this.aiRemediationService.generateGcpFixPlan({
         title: finding.title ?? 'Unknown',

From 432fd391c3123787f6f788f8043ffbb9fc32aca5 Mon Sep 17 00:00:00 2001
From: Tofik Hasanov <annexcies@gmail.com>
Date: Thu, 4 Jun 2026 18:36:50 -0400
Subject: [PATCH 3/3] =?UTF-8?q?fix(cloud-security):=20address=20cubic=20re?=
 =?UTF-8?q?view=20=E2=80=94=20prompt=20command=20consistency=20+=20retry?=
 =?UTF-8?q?=20selection?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- ai-remediation.prompt.ts: use bare AWS SDK command names (with the service
  named separately) in the new S3 + Config guidance, matching the OUTPUT RULES
  schema — the prior "s3:PutPublicAccessBlockCommand" / "config-service:..."
  shorthand could nudge the model to emit a service-prefixed (invalid) command
  value. Add the missing "Command" suffix to GetBucketPolicy. Remove PutBucketAcl
  from the "permissions you need" chain so it no longer contradicts the new
  "never use ACLs" rule.
- ai-remediation.service.ts: after an empty first plan, the retry result is
  now adopted when it either has fix steps OR reports canAutoFix=false (which
  routes to guided steps). Previously a canAutoFix=false retry was discarded
  and the original empty canAutoFix=true plan was returned. Applied to the
  AWS, GCP, and Azure generators.
- Test: retry prefers a non-auto-fixable result.

cloud-security jest suite green (293 passing); changed files typecheck clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../cloud-security/ai-remediation.prompt.ts   |  8 ++--
 .../ai-remediation.service.spec.ts            | 41 +++++++++++++++++++
 .../cloud-security/ai-remediation.service.ts  | 15 +++++--
 3 files changed, 57 insertions(+), 7 deletions(-)

diff --git a/apps/api/src/cloud-security/ai-remediation.prompt.ts b/apps/api/src/cloud-security/ai-remediation.prompt.ts
index 810a8425db..03dbd03660 100644
--- a/apps/api/src/cloud-security/ai-remediation.prompt.ts
+++ b/apps/api/src/cloud-security/ai-remediation.prompt.ts
@@ -164,11 +164,11 @@ A human will ALWAYS review your plan before execution. Be precise and correct.
 
 ## S3 PUBLIC ACCESS AND ACLs (IMPORTANT)
 - NEVER use PutBucketAclCommand or bucket/object ACLs. Modern buckets use Object Ownership = BucketOwnerEnforced, which disables ACLs — the call fails, and the executor strips ACL steps, which can leave an EMPTY plan.
-- To block public access on a bucket: use s3:PutPublicAccessBlockCommand with PublicAccessBlockConfiguration set to { BlockPublicAcls: true, IgnorePublicAcls: true, BlockPublicPolicy: true, RestrictPublicBuckets: true }.
-- To remediate a public bucket POLICY: read it first with GetBucketPolicy, then use s3:PutBucketPolicyCommand with a corrected least-privilege policy. Never rely on ACLs to fix public access.
+- To block public access on a bucket: use PutPublicAccessBlockCommand (service "s3") with PublicAccessBlockConfiguration set to { BlockPublicAcls: true, IgnorePublicAcls: true, BlockPublicPolicy: true, RestrictPublicBuckets: true }.
+- To remediate a public bucket POLICY: read it first with GetBucketPolicyCommand, then use PutBucketPolicyCommand with a corrected least-privilege policy (service "s3"). Never rely on ACLs to fix public access.
 
 ## AWS CONFIG RECORDER (IMPORTANT)
-- To make a recorder record ALL supported resource types, first read the existing recorder with config-service:DescribeConfigurationRecordersCommand (readSteps) to get its exact name and roleARN, then call config-service:PutConfigurationRecorderCommand with ConfigurationRecorder = { name, roleARN, recordingGroup: { allSupported: true, includeGlobalResourceTypes: true } }.
+- To make a recorder record ALL supported resource types, first read the existing recorder with DescribeConfigurationRecordersCommand (service "config-service", in readSteps) to get its exact name and roleARN, then call PutConfigurationRecorderCommand with ConfigurationRecorder = { name, roleARN, recordingGroup: { allSupported: true, includeGlobalResourceTypes: true } }.
 - NEVER set allSupported:true together with recordingStrategy, exclusionByResourceTypes, or resourceTypes — they are mutually exclusive and AWS rejects the request with a ValidationException. Omit those fields entirely (this also overwrites an existing exclusion-based strategy so global IAM resources are recorded).
 
 ## IDEMPOTENCY (CRITICAL)
@@ -268,7 +268,7 @@ NEVER omit AWSServiceName, leave it as null, or use a placeholder string.
 
 ## REQUIRED PERMISSIONS (VERY IMPORTANT — GET THIS RIGHT FIRST TIME)
 - List EVERY IAM action needed for the COMPLETE operation, not just the direct API calls
-- Think through the FULL chain: if you CreateBucket, you also need PutBucketPolicy, GetBucketPolicy, PutBucketAcl
+- Think through the FULL chain: if you CreateBucket, you also need PutBucketPolicy, GetBucketPolicy, PutPublicAccessBlock (do NOT use PutBucketAcl — ACLs are disabled on modern buckets)
 - Include iam:CreateRole and iam:PutRolePolicy when creating AWS service delivery roles
 - Include iam:PassRole when attaching a role to an AWS service (CloudTrail, Config, etc.)
 - NEVER include iam:AttachRolePolicy — use iam:PutRolePolicy (inline policies) instead
diff --git a/apps/api/src/cloud-security/ai-remediation.service.spec.ts b/apps/api/src/cloud-security/ai-remediation.service.spec.ts
index 5c223e9139..1a77965974 100644
--- a/apps/api/src/cloud-security/ai-remediation.service.spec.ts
+++ b/apps/api/src/cloud-security/ai-remediation.service.spec.ts
@@ -678,3 +678,44 @@ describe('AiRemediationService GCP/Azure empty-plan retry', () => {
     expect(generateObjectMock).toHaveBeenCalledTimes(1);
   });
 });
+
+describe('AiRemediationService.generateFixPlan retry selection', () => {
+  const generateObjectMock = generateObject as unknown as jest.Mock;
+
+  beforeEach(() => {
+    generateObjectMock.mockReset();
+  });
+
+  it('prefers a canAutoFix=false retry over the original empty canAutoFix=true plan', async () => {
+    // First pass: the degenerate empty plan (canAutoFix true, no steps).
+    generateObjectMock.mockResolvedValueOnce({
+      object: basePlan({ canAutoFix: true, fixSteps: [] }),
+    });
+    // Retry: the model correctly concludes the finding is not auto-fixable.
+    generateObjectMock.mockResolvedValueOnce({
+      object: basePlan({
+        canAutoFix: false,
+        fixSteps: [],
+        reason: 'Requires manual setup',
+        guidedSteps: ['Do the thing in the console'],
+      }),
+    });
+
+    const service = new AiRemediationService();
+    const plan = await service.generateFixPlan({
+      title: 't',
+      description: null,
+      severity: null,
+      resourceType: 'X',
+      resourceId: 'y',
+      remediation: null,
+      findingKey: 'fk',
+      evidence: {},
+    });
+
+    expect(generateObjectMock).toHaveBeenCalledTimes(2);
+    // The non-auto-fixable retry is used → routes to guided steps instead of
+    // the "AI generated an empty fix plan" dead end.
+    expect(plan.canAutoFix).toBe(false);
+  });
+});
diff --git a/apps/api/src/cloud-security/ai-remediation.service.ts b/apps/api/src/cloud-security/ai-remediation.service.ts
index 1c52423d11..03c76378cb 100644
--- a/apps/api/src/cloud-security/ai-remediation.service.ts
+++ b/apps/api/src/cloud-security/ai-remediation.service.ts
@@ -66,7 +66,12 @@ export class AiRemediationService {
           `Empty fix plan for ${finding.findingKey}; regenerating once at higher temperature`,
         );
         const retry = await this.requestFixPlan(finding, 0.5);
-        if (retry.fixSteps.length > 0) plan = retry;
+        // Prefer the retry if it is usable (has steps) OR if it correctly
+        // concludes the finding is not auto-fixable — either is better than
+        // returning the original empty canAutoFix=true plan (which only yields
+        // the "empty fix plan" dead end). Keep the original only when the
+        // retry is no improvement (still empty + still canAutoFix).
+        if (retry.fixSteps.length > 0 || !retry.canAutoFix) plan = retry;
       }
 
       return plan;
@@ -471,7 +476,9 @@ Produce 3-8 ordered steps. Each step is a single concrete action the customer ca
             `Empty GCP fix plan for ${finding.findingKey}; regenerating once at higher temperature`,
           );
           const retry = await this.requestGcpFixPlan(finding, 0.5);
-          if (retry.fixSteps.length > 0) object = retry;
+          // Prefer a retry that is usable OR correctly non-auto-fixable —
+          // either beats returning the original empty canAutoFix=true plan.
+          if (retry.fixSteps.length > 0 || !retry.canAutoFix) object = retry;
         }
 
         this.logger.log(
@@ -559,7 +566,9 @@ Generate the complete fix plan with EXACT JSON values from the real GCP state.`,
           `Empty Azure fix plan for ${finding.findingKey}; regenerating once at higher temperature`,
         );
         const retry = await this.requestAzureFixPlan(finding, 0.5);
-        if (retry.fixSteps.length > 0) object = retry;
+        // Prefer a retry that is usable OR correctly non-auto-fixable —
+        // either beats returning the original empty canAutoFix=true plan.
+        if (retry.fixSteps.length > 0 || !retry.canAutoFix) object = retry;
       }
 
       this.logger.log(