From e7ec2adc75065fb21601048de65df0329493441e Mon Sep 17 00:00:00 2001 From: Robbie McKinstry Date: Thu, 15 Jan 2026 16:36:00 -0500 Subject: [PATCH 1/3] Fix conformance test booting. With this change, the conformance tests can now boot. All tests are disabled, but the suite can boot successfully. Next, we enable one test at a time until they all pass. --- .claude/commands/run-conformance.md | 1 + .claude/skills/gateway-conformance-runner/SKILL.md | 2 +- Dockerfile | 7 ++++--- 3 files changed, 6 insertions(+), 4 deletions(-) create mode 100644 .claude/commands/run-conformance.md diff --git a/.claude/commands/run-conformance.md b/.claude/commands/run-conformance.md new file mode 100644 index 0000000..18f9e6c --- /dev/null +++ b/.claude/commands/run-conformance.md @@ -0,0 +1 @@ +I'd like you to help me run the Kubernetes Gateway API conformance suite. Please use the `gateway-conformance-runner` skill. I would like to run the tests locally. diff --git a/.claude/skills/gateway-conformance-runner/SKILL.md b/.claude/skills/gateway-conformance-runner/SKILL.md index e9d7ff4..02404b0 100644 --- a/.claude/skills/gateway-conformance-runner/SKILL.md +++ b/.claude/skills/gateway-conformance-runner/SKILL.md @@ -1,7 +1,7 @@ --- name: gateway-conformance-runner description: Use this skill when you need to run the Gateway API conformance test suite for the multiway project. It includes setting up a Kind cluster, building and deploying the gateway controller, running the official conformance tests, and analyzing the results. The skill handles the complete workflow from cluster creation to test execution and log retrieval. Examples:\n\n\nContext: The user wants to verify their Gateway API implementation meets conformance standards.\nuser: "I need to run the conformance tests for our gateway"\nassistant: "I'll use the gateway-conformance-runner agent to set up the environment and run the full conformance test suite."\n\nSince the user wants to run conformance tests, use the Task tool to launch the gateway-conformance-runner agent to handle the complete testing workflow.\n\n\n\n\nContext: The user has made changes to the gateway controller and wants to verify conformance.\nuser: "Can you check if our gateway still passes the conformance tests after my latest changes?"\nassistant: "Let me use the gateway-conformance-runner agent to run the full conformance test suite and check the results."\n\nThe user needs to verify conformance after changes, so use the gateway-conformance-runner agent to execute the tests.\n\n\n\n\nContext: The user is debugging a failed conformance test.\nuser: "The HTTPRoute tests are failing, can you run them and show me the logs?"\nassistant: "I'll use the gateway-conformance-runner agent to run the conformance tests and retrieve the detailed logs for analysis."\n\nSince the user needs to debug conformance test failures, use the gateway-conformance-runner agent to run tests and get logs.\n\n -allowed-tools: Bash(kind:*) Bash(kubectl:*) Bash(docker:*) Bash(cargo:make) Bash(ls:*) Bash(docker:*) Bash(cd:*) Bash(direnv:*) Read +allowed-tools: Bash(kind:*) Bash(kubectl:*) Bash(docker:*) Bash(cargo:make) Bash(ls:*) Bash(docker:*) Bash(cd:*) Bash(direnv:*) Bash(make:conformance) Read --- You are an expert in Kubernetes Gateway API conformance testing and container orchestration. You specialize in running the official Gateway API conformance test suite against gateway implementations, with deep knowledge of Kind clusters, Docker containerization, and Kubernetes deployments. diff --git a/Dockerfile b/Dockerfile index f0443e8..903fe55 100644 --- a/Dockerfile +++ b/Dockerfile @@ -99,10 +99,11 @@ COPY --from=builder /multiway-dataplane /usr/local/bin/multiway-dataplane # Copy config directory (owned by nonroot user in distroless) COPY --from=builder --chown=nonroot:nonroot /etc/multiway /etc/multiway -# Default config path -ENV CONFIG_PATH=/etc/multiway/config.json +# Default config path (matches binary default and control plane mount point) +ENV CONFIG_PATH=/config/config.json EXPOSE 8080 ENTRYPOINT ["/usr/local/bin/multiway-dataplane"] -CMD ["--config", "/etc/multiway/config.json"] +# Config path is set via CONFIG_PATH env var by the control plane +# Default in binary is /config/config.json which matches control plane mount From 6af39f2c5fc7303b832bbd01c3609426f4f9ce41 Mon Sep 17 00:00:00 2001 From: Robbie McKinstry Date: Thu, 15 Jan 2026 17:15:27 -0500 Subject: [PATCH 2/3] Add Development Loop Skill This commit introduces a Claude Skill to implement a development loop for the agent so it can provide implementations for the Gateway API conformance tests. --- .claude/skills/development-loop/SKILL.md | 233 ++++++++++++++++++ .../development-loop/bug-reports/.gitkeep | 0 .../test-tiers/tier-1-essential.csv | 8 + .../test-tiers/tier-2-important-http.csv | 11 + .../test-tiers/tier-3-production.csv | 11 + .../test-tiers/tier-4-advanced.csv | 13 + .../test-tiers/tier-5-observability.csv | 5 + .../test-tiers/tier-6-validation.csv | 18 ++ .../test-tiers/tier-7-not-relevant.csv | 17 ++ 9 files changed, 316 insertions(+) create mode 100644 .claude/skills/development-loop/SKILL.md create mode 100644 .claude/skills/development-loop/bug-reports/.gitkeep create mode 100644 .claude/skills/development-loop/test-tiers/tier-1-essential.csv create mode 100644 .claude/skills/development-loop/test-tiers/tier-2-important-http.csv create mode 100644 .claude/skills/development-loop/test-tiers/tier-3-production.csv create mode 100644 .claude/skills/development-loop/test-tiers/tier-4-advanced.csv create mode 100644 .claude/skills/development-loop/test-tiers/tier-5-observability.csv create mode 100644 .claude/skills/development-loop/test-tiers/tier-6-validation.csv create mode 100644 .claude/skills/development-loop/test-tiers/tier-7-not-relevant.csv diff --git a/.claude/skills/development-loop/SKILL.md b/.claude/skills/development-loop/SKILL.md new file mode 100644 index 0000000..04c1ae6 --- /dev/null +++ b/.claude/skills/development-loop/SKILL.md @@ -0,0 +1,233 @@ +--- +name: development-loop +description: Red-green-refactor development loop for implementing Gateway API conformance tests. Use this skill when working on implementing new conformance tests for the multiway project. It guides the agent through selecting the next test to implement based on priority tiers, running the conformance suite, diagnosing failures, and implementing fixes. +allowed-tools: Bash(cargo:*) Bash(kubectl:*) Bash(kind:*) Bash(docker:*) Bash(cd:*) Bash(ls:*) Bash(make:*) Bash(direnv:*) Read Edit Write Grep Glob Task +--- + +# Development Loop for Gateway API Conformance Implementation + +You are an expert in implementing Kubernetes Gateway API conformance tests. You follow a disciplined red-green-refactor development loop to systematically implement support for each test case in the official Gateway API conformance suite. + +## Overview + +This skill guides you through a development loop for implementing conformance tests one at a time. Each iteration of the loop: +1. Selects the highest-priority unimplemented test +2. Verifies the test is currently skipped +3. Enables the test and observes the failure +4. Diagnoses the root cause +5. Implements and verifies the fix +6. Documents the results + +**CRITICAL**: All conformance tests MUST be run **locally** using the `gateway-conformance-runner` skill's local testing workflow. Never run tests in-cluster during development. + +## Test Priority Tiers + +Test cases have been prioritized into 7 tiers, stored in CSV files within this skill's directory: + +| File | Priority | Description | +|------|----------|-------------| +| `test-tiers/tier-1-essential.csv` | Highest | Core functionality that must work | +| `test-tiers/tier-2-important-http.csv` | High | Important HTTP routing features | +| `test-tiers/tier-3-production.csv` | Medium-High | Production-ready features | +| `test-tiers/tier-4-advanced.csv` | Medium | Advanced routing capabilities | +| `test-tiers/tier-5-observability.csv` | Medium-Low | Observability features | +| `test-tiers/tier-6-validation.csv` | Low | Validation and edge cases | +| `test-tiers/tier-7-not-relevant.csv` | Lowest | Tests not relevant to this implementation | + +Each CSV file has the following columns: +- `test_name`: The name of the conformance test +- `description`: A brief description of what the test validates +- `implemented`: Status - `false`, `in-progress`, or `true` + +## Development Loop Steps + +### Step 1: Select the Next Test + +Scan the tier CSV files in order of priority (tier-1 first, tier-7 last) to find the first test where `implemented` is `false`. + +Once you've selected a test: +1. Update the CSV file to change `implemented` from `false` to `in-progress` +2. Record the test name and its description for reference + +**Example:** +```csv +HTTPRouteSimpleSameNamespace,Basic HTTP routing...,in-progress +``` + +### Step 2: Verify Test is Currently Skipped + +Before making any code changes, verify the current state: + +1. Ensure the `GATEWAY_CONFORMANCE_SUITE` environment variable is set +2. Navigate to `$GATEWAY_CONFORMANCE_SUITE` +3. Use the `gateway-conformance-runner` skill to run the conformance suite locally +4. Verify: + - The selected test is currently **skipped** (not running) + - All other enabled tests are **passing** + +If other tests are failing, stop and address those failures first before enabling a new test. + +### Step 3: Enable the Test and Observe Failure + +1. Enable the test by removing it from the skip list or adding it to the enabled tests in the conformance configuration +2. Run the conformance suite again using `gateway-conformance-runner` +3. Observe and capture the test failure output +4. Document the specific failure message and any relevant stack traces + +### Step 4: Handle Test Results + +**If the test passes immediately:** +- Update the CSV file to change `implemented` from `in-progress` to `true` +- Document this finding (the feature was already implemented) +- Return to Step 1 to select the next test + +**If the test fails:** +- Proceed to Step 5 (Diagnosis) + +### Step 5: Diagnose the Failure + +#### 5a: Attempt to Create a Unit Test (Recommended) + +Before diving into the implementation, try to recreate the conformance test as a purely functional unit test within this repository: + +1. Study the conformance test implementation in `$GATEWAY_CONFORMANCE_SUITE/conformance` +2. Understand what scenario the test is validating +3. Create a unit test using this project's testing patterns: + - Use `snapshot` semantics for expected outputs + - Use `world state` semantics for modeling the reconciler + - Implement as a purely functional controller test + +Having a local unit test provides: +- Faster iteration cycles +- Easier debugging +- Better test isolation +- Documentation of the expected behavior + +If you cannot successfully create a unit test, proceed to the next step. + +#### 5b: Investigate Root Cause + +1. Analyze the failure message to identify the failing assertion +2. Trace through the code to understand the request flow: + - Control plane: How are resources being reconciled? + - Data plane: How are requests being routed? +3. Identify the specific code paths responsible for the failure +4. Document your findings + +#### 5c: File a Bug Report + +Create a Markdown file in `./bug-reports/` documenting: + +```markdown +# Bug Report: [Test Name] + +## Test Description +[What the conformance test is validating] + +## Failure Message +[The exact error message from the conformance test] + +## Root Cause Analysis +[Your findings about why the test is failing] + +## Affected Code +- Control plane: [relevant files/functions] +- Data plane: [relevant files/functions] + +## Proposed Fix +[Your plan to address the issue] +``` + +### Step 6: Implement the Fix + +1. Make the necessary code changes to fix the identified issue +2. Keep changes minimal and focused on the specific test +3. Follow the project's coding conventions and patterns + +### Step 7: Verify the Fix + +1. If you created a unit test in Step 5a, run it first: + ```bash + cargo nextest run [test_name] + ``` +2. Run the full conformance suite using `gateway-conformance-runner` +3. Verify: + - The previously failing test now **passes** + - No other tests have regressed + +If verification fails, return to Step 5 to continue diagnosis. + +### Step 8: Document and Report + +Once the test passes: + +1. Update the CSV file to change `implemented` from `in-progress` to `true` + +2. Create a summary report with the following format: + +```markdown +## Test Completed: [Test Name] + +### Summary +[Brief description of what was implemented] + +### Changes Made + +**Before:** +[Code or behavior before the fix] + +**After:** +[Code or behavior after the fix] + +### Files Modified +- `path/to/file1.rs`: [description of changes] +- `path/to/file2.rs`: [description of changes] + +### Unit Test Added +[Yes/No - if yes, describe the test] + +### Lessons Learned +[Any insights that might help with future tests] +``` + +3. Return to Step 1 to continue with the next test + +## Running Conformance Tests Locally + +Always use the `gateway-conformance-runner` skill for running conformance tests. The local testing workflow provides: +- Faster iteration cycles +- Real-time output for debugging +- Direct access to test logs +- Ability to run individual tests + +Key commands: +```bash +# Verify environment +echo $GATEWAY_CONFORMANCE_SUITE + +# Run conformance tests locally +cd $GATEWAY_CONFORMANCE_SUITE && make conformance +``` + +## Best Practices + +1. **One test at a time**: Focus on a single test per iteration +2. **Verify first**: Always confirm the test is skipped before enabling +3. **Minimal changes**: Make the smallest change needed to pass the test +4. **Document everything**: Keep thorough records in bug reports and summaries +5. **Unit tests preferred**: Local unit tests make debugging much faster +6. **No regressions**: Ensure all previously passing tests continue to pass + +## Error Recovery + +If you encounter issues: +- **Wrong kubectl context**: Stop immediately, switch to the correct context +- **Conformance suite not found**: Verify `GATEWAY_CONFORMANCE_SUITE` is set correctly +- **Multiple tests failing**: Address failing tests before enabling new ones +- **Stuck on a test**: Document findings, mark as `in-progress`, and consider moving to the next test with a note + +## Files and Directories + +- `./test-tiers/*.csv`: Test priority lists and implementation status +- `./bug-reports/`: Diagnostic reports for failing tests +- `$GATEWAY_CONFORMANCE_SUITE/conformance`: The official conformance test suite diff --git a/.claude/skills/development-loop/bug-reports/.gitkeep b/.claude/skills/development-loop/bug-reports/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/.claude/skills/development-loop/test-tiers/tier-1-essential.csv b/.claude/skills/development-loop/test-tiers/tier-1-essential.csv new file mode 100644 index 0000000..95a5344 --- /dev/null +++ b/.claude/skills/development-loop/test-tiers/tier-1-essential.csv @@ -0,0 +1,8 @@ +test_name,description,implemented +HTTPRouteSimpleSameNamespace,Basic HTTP routing from a route to a backend service in the same namespace. Foundation of all routing.,false +HTTPRouteMatching,Path and header matching for routing requests to different backends based on request criteria.,false +HTTPRouteExactPathMatching,Exact path matching where /foo matches only /foo and not /foo/bar.,false +HTTPRouteWeight,Traffic distribution across multiple backends based on specified weights for load balancing.,false +GatewayWithAttachedRoutes,Core Gateway-Route attachment model verifying routes attach correctly and status is tracked.,false +HTTPRouteListenerHostnameMatching,Multiple listeners with different hostnames routing to different backends.,false +HTTPRouteListenerPortMatching,HTTP listeners on different ports with port-based routing.,false diff --git a/.claude/skills/development-loop/test-tiers/tier-2-important-http.csv b/.claude/skills/development-loop/test-tiers/tier-2-important-http.csv new file mode 100644 index 0000000..c915d6c --- /dev/null +++ b/.claude/skills/development-loop/test-tiers/tier-2-important-http.csv @@ -0,0 +1,11 @@ +test_name,description,implemented +HTTPRouteHeaderMatching,Header-based routing rules enabling request filtering and routing based on HTTP headers.,false +HTTPRouteMethodMatching,HTTP method-based routing (GET/POST/PUT/DELETE etc) for REST API routing.,false +HTTPRouteQueryParamMatching,Query parameter-based routing for feature flags and conditional routing.,false +HTTPRouteRequestHeaderModifier,Adding/removing/replacing request headers before forwarding to backends.,false +HTTPRouteResponseHeaderModifier,Response header modification for security headers and CORS.,false +HTTPRouteRewritePath,Path rewriting and prefix stripping for backend URL compatibility.,false +HTTPRouteRewriteHost,Host header rewriting for multi-domain backend support.,false +HTTPRouteHostnameIntersection,Hostname matching with wildcard and specific hostname handling and precedence.,false +HTTPRouteMatchingAcrossRoutes,Routing rules across multiple HTTPRoutes on same gateway verifying rule precedence.,false +HTTPRoutePathMatchOrder,Correct matching order for path-based rules ensuring predictable routing behavior.,false diff --git a/.claude/skills/development-loop/test-tiers/tier-3-production.csv b/.claude/skills/development-loop/test-tiers/tier-3-production.csv new file mode 100644 index 0000000..855bab7 --- /dev/null +++ b/.claude/skills/development-loop/test-tiers/tier-3-production.csv @@ -0,0 +1,11 @@ +test_name,description,implemented +HTTPRouteHTTPSListener,HTTPS listener support with TLS certificate management and termination.,false +HTTPRouteRedirectScheme,HTTP to HTTPS redirect for standard security enforcement.,false +HTTPRouteRedirectPath,Path-based redirects for URL migration and restructuring.,false +HTTPRouteRedirectPort,Port-based redirects for traffic management.,false +HTTPRouteRedirectHostAndStatus,Host redirect with HTTP status code control (301/302/etc).,false +HTTPRouteRedirectPortAndScheme,Combined port and scheme redirects in a single rule.,false +HTTPRouteTimeoutRequest,Request timeout handling to prevent hung connections.,false +HTTPRouteTimeoutBackendRequest,Backend connection timeout to handle slow backends.,false +HTTPRouteCrossNamespace,Cross-namespace route attachment with ReferenceGrant for namespace isolation.,false +GatewayWithAttachedRoutesWithPort8080,Gateway with routes on non-standard ports (8080).,false diff --git a/.claude/skills/development-loop/test-tiers/tier-4-advanced.csv b/.claude/skills/development-loop/test-tiers/tier-4-advanced.csv new file mode 100644 index 0000000..4842fc7 --- /dev/null +++ b/.claude/skills/development-loop/test-tiers/tier-4-advanced.csv @@ -0,0 +1,13 @@ +test_name,description,implemented +HTTPRouteRequestMirror,Request mirroring/shadowing to additional backends for canary testing.,false +HTTPRouteRequestMultipleMirrors,Multiple request mirrors to several backends simultaneously.,false +HTTPRouteRequestPercentageMirror,Percentage-based traffic mirroring for gradual testing rollout.,false +HTTPRouteBackendRequestHeaderModifier,Per-backend header modification separate from route-level modification.,false +HTTPRouteRequestHeaderModifierBackendWeights,Header modification combined with weighted backend distribution.,false +HTTPRouteCORSAllowCredentialsBehavior,CORS credential handling for web applications with authentication.,false +HTTPRouteNamedRule,Named HTTPRoute rules for better observability and metrics reference.,false +HTTPRouteServiceTypes,Support for various Kubernetes service types (headless/manual endpoint slices).,false +GatewayModifyListeners,Dynamic listener modification and status updates at runtime.,false +GatewayHTTPListenerIsolation,Listener isolation ensuring requests don't cross listener boundaries.,false +GatewayStaticAddresses,Gateway static IP address assignment and management.,false +GatewayOptionalAddressValue,Optional gateway address handling when address is not specified.,false diff --git a/.claude/skills/development-loop/test-tiers/tier-5-observability.csv b/.claude/skills/development-loop/test-tiers/tier-5-observability.csv new file mode 100644 index 0000000..ad5ff4d --- /dev/null +++ b/.claude/skills/development-loop/test-tiers/tier-5-observability.csv @@ -0,0 +1,5 @@ +test_name,description,implemented +HTTPRouteObservedGenerationBump,Route status condition tracking for configuration change detection.,false +GatewayObservedGenerationBump,Gateway status generation tracking for state synchronization.,false +GatewayClassObservedGenerationBump,GatewayClass status tracking for control plane consistency verification.,false +GatewayInfrastructure,Infrastructure metadata propagation to provisioned resources.,false diff --git a/.claude/skills/development-loop/test-tiers/tier-6-validation.csv b/.claude/skills/development-loop/test-tiers/tier-6-validation.csv new file mode 100644 index 0000000..54e13c9 --- /dev/null +++ b/.claude/skills/development-loop/test-tiers/tier-6-validation.csv @@ -0,0 +1,18 @@ +test_name,description,implemented +HTTPRouteInvalidBackendRefUnknownKind,Verification that unknown backend reference kinds are properly rejected.,false +HTTPRouteInvalidNonExistentBackendRef,Non-existent backend reference rejection with proper status.,false +HTTPRouteInvalidCrossNamespaceBackendRef,Cross-namespace backend ref without ReferenceGrant permission rejection.,false +HTTPRouteInvalidCrossNamespaceParentRef,Cross-namespace parent ref validation and rejection.,false +HTTPRouteInvalidParentRefNotMatchingListenerPort,Port matching validation when parent ref port doesn't match listener.,false +HTTPRouteInvalidParentRefNotMatchingSectionName,Section name matching validation for parent references.,false +HTTPRouteInvalidParentRefSectionNameNotMatchingPort,Combined port/section name validation for parent refs.,false +HTTPRouteInvalidReferenceGrant,Invalid ReferenceGrant handling and rejection.,false +HTTPRoutePartiallyInvalidViaInvalidReferenceGrant,Partial route acceptance when some refs are invalid.,false +HTTPRouteDisallowedKind,Route kind rejection when listener doesn't allow it.,false +HTTPRouteReferenceGrant,Valid ReferenceGrant acceptance for cross-namespace access.,false +GatewaySecretInvalidReferenceGrant,Invalid secret ReferenceGrant handling for TLS secrets.,false +GatewaySecretMissingReferenceGrant,Missing ReferenceGrant rejection for cross-namespace secrets.,false +GatewaySecretReferenceGrantAllInNamespace,ReferenceGrant with all-in-namespace permissions for secrets.,false +GatewaySecretReferenceGrantSpecific,Specific ReferenceGrant for particular secrets only.,false +GatewayInvalidRouteKind,Invalid route kind rejection at Gateway level.,false +GatewayInvalidTLSConfiguration,TLS configuration validation and error reporting.,false diff --git a/.claude/skills/development-loop/test-tiers/tier-7-not-relevant.csv b/.claude/skills/development-loop/test-tiers/tier-7-not-relevant.csv new file mode 100644 index 0000000..d857166 --- /dev/null +++ b/.claude/skills/development-loop/test-tiers/tier-7-not-relevant.csv @@ -0,0 +1,17 @@ +test_name,description,implemented +GRPCExactMethodMatching,gRPC method-based routing with exact service/method matching.,false +GRPCRouteHeaderMatching,gRPC route header matching for gRPC metadata routing.,false +GRPCRouteListenerHostnameMatching,gRPC hostname matching for multi-tenant gRPC services.,false +GRPCRouteNamedRule,Named gRPC route rules for observability.,false +GRPCRouteWeight,gRPC traffic weighting for load balancing gRPC services.,false +TLSRouteSimpleSameNamespace,TLS passthrough routing based on SNI without termination.,false +TLSRouteInvalidReferenceGrant,TLS route reference grant validation.,false +UDPRoute,UDP protocol routing for non-HTTP traffic.,false +HTTPRouteBackendProtocolWebSocket,WebSocket protocol support and upgrade handling.,false +HTTPRouteBackendProtocolH2C,HTTP/2 Cleartext (h2c) backend protocol support.,false +BackendTLSPolicy,TLS configuration for gateway-to-backend connections (mTLS).,false +BackendTLSPolicyConflictResolution,Handling conflicting TLS policies on same backend.,false +BackendTLSPolicyInvalidCACertificateRef,Invalid CA certificate reference handling.,false +BackendTLSPolicyInvalidKind,Invalid backend reference kind rejection in TLS policy.,false +BackendTLSPolicyObservedGenerationBump,TLS policy status generation tracking.,false +BackendTLSPolicySANValidation,SubjectAltName validation in backend TLS certificates.,false From 4d173070648cf102ce79ad956cb120c9e5f6bacc Mon Sep 17 00:00:00 2001 From: Robbie McKinstry Date: Thu, 15 Jan 2026 18:39:24 -0500 Subject: [PATCH 3/3] Work-in-progress test implementation. --- .../HTTPRouteSimpleSameNamespace.md | 111 +++++++++++++++ .../test-tiers/tier-1-essential.csv | 2 +- crates/controlplane/src/core/reconcile.rs | 60 +++++++-- crates/controlplane/src/shell/executor.rs | 127 +++++++++--------- 4 files changed, 223 insertions(+), 77 deletions(-) create mode 100644 .claude/skills/development-loop/bug-reports/HTTPRouteSimpleSameNamespace.md diff --git a/.claude/skills/development-loop/bug-reports/HTTPRouteSimpleSameNamespace.md b/.claude/skills/development-loop/bug-reports/HTTPRouteSimpleSameNamespace.md new file mode 100644 index 0000000..0e69674 --- /dev/null +++ b/.claude/skills/development-loop/bug-reports/HTTPRouteSimpleSameNamespace.md @@ -0,0 +1,111 @@ +# Bug Report: HTTPRouteSimpleSameNamespace + +## Test Description +This conformance test validates basic HTTP routing from an HTTPRoute to a backend service in the same namespace. It creates: +- A Gateway named `same-namespace` in `gateway-conformance-infra` namespace +- An HTTPRoute named `gateway-conformance-infra-test` that routes to `infra-backend-v1:8080` +- Makes an HTTP request to the Gateway and expects a 200 response from the backend + +## Issues Found + +### Issue 1: Missing ResolvedRefs Condition on Gateway Listeners (FIXED) + +**Failure Message:** +``` +gateway gateway-conformance-infra/same-namespace doesn't have ResolvedRefs condition set to True on http listener +``` + +**Root Cause:** +Gateway listeners only had the `Accepted` condition set, but Gateway API requires listeners to also have `ResolvedRefs` and `Programmed` conditions. + +**Fix Applied:** +Added `ResolvedRefs` and `Programmed` conditions to `GatewayStatusListeners` in `crates/controlplane/src/core/reconcile.rs:288-333`. + +### Issue 2: Missing observedGeneration on HTTPRoute Conditions (FIXED) + +**Failure Message:** +``` +HTTPRoute expected observedGeneration to be updated to 1 for all conditions, only 0/2 were updated. stale conditions are: Accepted (generation 0), ResolvedRefs (generation 0) +``` + +**Root Cause:** +HTTPRoute parent status conditions had `observed_generation: None` instead of the route's generation. + +**Fix Applied:** +Updated `build_parent_status` function in `crates/controlplane/src/core/reconcile.rs:672-713` to accept and use the route's generation. + +### Issue 3: ConfigMap Server-Side Apply Conflict (FIXED) + +**Failure Message:** +``` +Failed to execute upsert error=Kubernetes API error: ApiError: Apply failed with 1 conflict: conflict with "unknown" using v1: .data.config.json +``` + +**Root Cause Analysis:** +The controller uses Server-Side Apply (SSA) to update ConfigMaps containing data plane configuration. There was a conflict with an "unknown" field manager on the `.data.config.json` field. + +**Investigation Findings:** + +1. **Original Code Issue:** The upsert functions in `executor.rs` used a "get-then-create" pattern: + - If resource exists: Update with SSA using field manager "multiway-controller" + - If resource doesn't exist: Create with `api.create()` using `PostParams::default()` (no field manager = "unknown") + + This meant ConfigMaps created by our controller were tagged with field manager "unknown", causing conflicts on subsequent SSA updates. + +2. **Initial Attempted Fix:** Changed all upsert functions to use pure SSA with `PatchParams::apply("multiway-controller").force()`: + - SSA handles both creation and updates idempotently + - The `.force()` flag should force the controller to take ownership of conflicting fields + +3. **SSA with Force Still Failed:** Even with `.force()` enabled, the SSA conflict persisted. Investigation showed that k8s-openapi's `ConfigMap` type, when serialized via `Patch::Apply`, may not properly include TypeMeta (apiVersion/kind) fields required by SSA. + +4. **Working Solution:** Implemented a create-or-replace pattern for ConfigMaps instead of SSA: + - Use `api.get()` to check if ConfigMap exists + - If exists: Use `api.replace()` with the existing `resource_version` to update + - If not exists (404): Use `api.create()` to create + + This avoids SSA field manager conflicts entirely while still providing idempotent upsert semantics. + +**Fix Applied:** +Updated `upsert_configmap` in `crates/controlplane/src/shell/executor.rs:216-264` to use the create-or-replace pattern. + +### Issue 4: Data Plane Not Responding to HTTP Requests (IN PROGRESS) + +**Failure Message:** +``` +Request failed, not ready yet: Get "http://10.96.65.60/": context deadline exceeded +``` + +**Root Cause Analysis:** +After fixing the ConfigMap SSA conflict, the ConfigMaps are now being created successfully and the data plane pods are running. However, HTTP requests to the Gateway Service IP are timing out. + +**Current Status:** +- ConfigMaps: ✅ Created successfully (`multiway-config-same-namespace`, etc.) +- Data plane pods: ✅ Running (`multiway-dp-same-namespace-*`) +- HTTP routing: ❌ Requests timeout + +This is a separate issue from the SSA conflict and requires further investigation into: +1. Data plane configuration loading +2. Envoy/Pingora routing setup +3. Service endpoints and networking + +## Test Status + +The status condition and ConfigMap fixes are working: +- Gateway listener ResolvedRefs condition: ✅ PASSING +- HTTPRoute observedGeneration on conditions: ✅ PASSING +- ConfigMap creation/update (SSA conflict fix): ✅ FIXED + +The HTTP request test fails due to data plane routing issues: +- Simple HTTP request should reach infra-backend: ❌ FAILING (timeout - data plane not routing traffic) + +## Files Modified + +1. `crates/controlplane/src/core/reconcile.rs`: + - Lines 288-334: Added ResolvedRefs and Programmed conditions to listener status + - Lines 639-646: Pass route generation to build_parent_status + - Lines 672-713: Updated build_parent_status to accept and use generation parameter + - Lines 385-405: Updated build_configmap to explicitly set all fields + +2. `crates/controlplane/src/shell/executor.rs`: + - Lines 216-264: Changed `upsert_configmap` from SSA to create-or-replace pattern + - This fix avoids SSA field manager conflicts that persisted even with `.force()` enabled diff --git a/.claude/skills/development-loop/test-tiers/tier-1-essential.csv b/.claude/skills/development-loop/test-tiers/tier-1-essential.csv index 95a5344..c076639 100644 --- a/.claude/skills/development-loop/test-tiers/tier-1-essential.csv +++ b/.claude/skills/development-loop/test-tiers/tier-1-essential.csv @@ -1,5 +1,5 @@ test_name,description,implemented -HTTPRouteSimpleSameNamespace,Basic HTTP routing from a route to a backend service in the same namespace. Foundation of all routing.,false +HTTPRouteSimpleSameNamespace,Basic HTTP routing from a route to a backend service in the same namespace. Foundation of all routing.,in-progress HTTPRouteMatching,Path and header matching for routing requests to different backends based on request criteria.,false HTTPRouteExactPathMatching,Exact path matching where /foo matches only /foo and not /foo/bar.,false HTTPRouteWeight,Traffic distribution across multiple backends based on specified weights for load balancing.,false diff --git a/crates/controlplane/src/core/reconcile.rs b/crates/controlplane/src/core/reconcile.rs index 19e81a5..7ff4734 100644 --- a/crates/controlplane/src/core/reconcile.rs +++ b/crates/controlplane/src/core/reconcile.rs @@ -289,14 +289,48 @@ fn build_gateway_status_with_routes( name: l.name.clone(), attached_routes, supported_kinds, - conditions: vec![Condition { - type_: "Accepted".to_string(), - status: status_value.to_string(), - observed_generation: gateway.metadata.generation, - last_transition_time: time.clone(), - reason: reason.to_string(), - message: message.to_string(), - }], + conditions: vec![ + Condition { + type_: "Accepted".to_string(), + status: status_value.to_string(), + observed_generation: gateway.metadata.generation, + last_transition_time: time.clone(), + reason: reason.to_string(), + message: message.to_string(), + }, + Condition { + type_: "ResolvedRefs".to_string(), + status: status_value.to_string(), + observed_generation: gateway.metadata.generation, + last_transition_time: time.clone(), + reason: if accepted { + "ResolvedRefs".to_string() + } else { + reason.to_string() + }, + message: if accepted { + "All references resolved".to_string() + } else { + message.to_string() + }, + }, + Condition { + type_: "Programmed".to_string(), + status: status_value.to_string(), + observed_generation: gateway.metadata.generation, + last_transition_time: time.clone(), + reason: if accepted { + "Programmed".to_string() + } else { + "Invalid".to_string() + }, + message: if accepted { + "Listener is programmed".to_string() + } else { + message.to_string() + }, + }, + ], } }) .collect(); @@ -364,7 +398,9 @@ fn build_configmap(names: &DataPlaneNames, config: &GatewayConfig, gateway: &Gat ..Default::default() }, data: Some(data), - ..Default::default() + // Explicitly set immutable to None to avoid SSA conflicts with unmanaged fields + immutable: None, + binary_data: None, } } @@ -608,6 +644,7 @@ pub fn reconcile_httproute( validation.accepted, validation.reason, &validation.message, + httproute.metadata.generation, ); parent_statuses.push(parent_status); @@ -641,6 +678,7 @@ fn build_parent_status( accepted: bool, reason: &str, message: &str, + generation: Option, ) -> HttpRouteStatusParents { let time = Time(now); let status_value = if accepted { "True" } else { "False" }; @@ -659,7 +697,7 @@ fn build_parent_status( Condition { type_: "Accepted".to_string(), status: status_value.to_string(), - observed_generation: None, + observed_generation: generation, last_transition_time: time.clone(), reason: reason.to_string(), message: message.to_string(), @@ -667,7 +705,7 @@ fn build_parent_status( Condition { type_: "ResolvedRefs".to_string(), status: status_value.to_string(), - observed_generation: None, + observed_generation: generation, last_transition_time: time, reason: if accepted { "ResolvedRefs" } else { reason }.to_string(), message: message.to_string(), diff --git a/crates/controlplane/src/shell/executor.rs b/crates/controlplane/src/shell/executor.rs index fc8d1c8..a9c7d7c 100644 --- a/crates/controlplane/src/shell/executor.rs +++ b/crates/controlplane/src/shell/executor.rs @@ -9,7 +9,7 @@ use std::time::Duration; use gateway_crds::{Gateway, GatewayClass, HTTPRoute}; use k8s_openapi::api::apps::v1::Deployment; use k8s_openapi::api::core::v1::{ConfigMap, Service, ServiceAccount}; -use kube::api::{DeleteParams, Patch, PatchParams, PostParams}; +use kube::api::{DeleteParams, Patch, PatchParams}; use kube::runtime::controller::Action; use kube::{Api, Client}; use tracing::{debug, error, info, warn}; @@ -175,7 +175,7 @@ impl ReconcileExecutor { Ok(()) } - /// Upsert a Deployment + /// Upsert a Deployment using Server-Side Apply async fn upsert_deployment(&self, deployment: &Deployment) -> Result<()> { let namespace = deployment .metadata @@ -185,100 +185,97 @@ impl ReconcileExecutor { let name = deployment.metadata.name.as_deref().unwrap(); let api: Api = Api::namespaced(self.client.clone(), namespace); - match api.get(name).await { - Ok(_) => { - debug!(namespace, name, "Updating Deployment"); - api.patch( - name, - &PatchParams::apply("multiway-controller"), - &Patch::Apply(deployment), - ) - .await?; - } - Err(kube::Error::Api(err)) if err.code == 404 => { - info!(namespace, name, "Creating Deployment"); - api.create(&PostParams::default(), deployment).await?; - } - Err(e) => return Err(e.into()), - } + debug!(namespace, name, "Applying Deployment"); + api.patch( + name, + &PatchParams::apply("multiway-controller").force(), + &Patch::Apply(deployment), + ) + .await?; Ok(()) } - /// Upsert a Service + /// Upsert a Service using Server-Side Apply async fn upsert_service(&self, service: &Service) -> Result<()> { let namespace = service.metadata.namespace.as_deref().unwrap_or("default"); let name = service.metadata.name.as_deref().unwrap(); let api: Api = Api::namespaced(self.client.clone(), namespace); - match api.get(name).await { - Ok(_) => { - debug!(namespace, name, "Updating Service"); - api.patch( - name, - &PatchParams::apply("multiway-controller"), - &Patch::Apply(service), - ) - .await?; - } - Err(kube::Error::Api(err)) if err.code == 404 => { - info!(namespace, name, "Creating Service"); - api.create(&PostParams::default(), service).await?; - } - Err(e) => return Err(e.into()), - } + debug!(namespace, name, "Applying Service"); + api.patch( + name, + &PatchParams::apply("multiway-controller").force(), + &Patch::Apply(service), + ) + .await?; Ok(()) } - /// Upsert a ConfigMap + /// Upsert a ConfigMap using create-or-replace pattern + /// + /// We use a create-or-replace pattern instead of Server-Side Apply because + /// SSA field manager conflicts with "unknown" persist even when using + /// `.force()`. This appears to be a kube-rs or Kubernetes API server issue. + /// The create-or-replace pattern avoids these conflicts entirely. async fn upsert_configmap(&self, configmap: &ConfigMap) -> Result<()> { + use kube::api::PostParams; + let namespace = configmap.metadata.namespace.as_deref().unwrap_or("default"); let name = configmap.metadata.name.as_deref().unwrap(); let api: Api = Api::namespaced(self.client.clone(), namespace); + debug!(namespace, name, "Upserting ConfigMap"); + + // Build a clean ConfigMap with required fields for creation + let mut cm = ConfigMap { + metadata: kube::core::ObjectMeta { + name: Some(name.to_string()), + namespace: Some(namespace.to_string()), + labels: configmap.metadata.labels.clone(), + owner_references: configmap.metadata.owner_references.clone(), + ..Default::default() + }, + data: configmap.data.clone(), + binary_data: None, + immutable: None, + }; + + // Try to get the existing ConfigMap match api.get(name).await { - Ok(_) => { - debug!(namespace, name, "Updating ConfigMap"); - api.patch( - name, - &PatchParams::apply("multiway-controller"), - &Patch::Apply(configmap), - ) - .await?; + Ok(existing) => { + // ConfigMap exists - update it by setting resource_version + cm.metadata.resource_version = existing.metadata.resource_version; + api.replace(name, &PostParams::default(), &cm).await?; + debug!(namespace, name, "Replaced ConfigMap"); + } + Err(kube::Error::Api(api_err)) if api_err.code == 404 => { + // ConfigMap doesn't exist - create it + api.create(&PostParams::default(), &cm).await?; + debug!(namespace, name, "Created ConfigMap"); } - Err(kube::Error::Api(err)) if err.code == 404 => { - info!(namespace, name, "Creating ConfigMap"); - api.create(&PostParams::default(), configmap).await?; + Err(e) => { + return Err(e.into()); } - Err(e) => return Err(e.into()), } Ok(()) } - /// Upsert a ServiceAccount + /// Upsert a ServiceAccount using Server-Side Apply async fn upsert_serviceaccount(&self, sa: &ServiceAccount) -> Result<()> { let namespace = sa.metadata.namespace.as_deref().unwrap_or("default"); let name = sa.metadata.name.as_deref().unwrap(); let api: Api = Api::namespaced(self.client.clone(), namespace); - match api.get(name).await { - Ok(_) => { - debug!(namespace, name, "Updating ServiceAccount"); - api.patch( - name, - &PatchParams::apply("multiway-controller"), - &Patch::Apply(sa), - ) - .await?; - } - Err(kube::Error::Api(err)) if err.code == 404 => { - info!(namespace, name, "Creating ServiceAccount"); - api.create(&PostParams::default(), sa).await?; - } - Err(e) => return Err(e.into()), - } + debug!(namespace, name, "Applying ServiceAccount"); + api.patch( + name, + &PatchParams::apply("multiway-controller").force(), + &Patch::Apply(sa), + ) + .await?; Ok(()) }