diff --git a/CLAUDE.md b/CLAUDE.md index bedcf49..0678d5a 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -147,8 +147,9 @@ boundary without touching the service layer. |---|---|---| | Auth | Static API key (`Authorization: Bearer ` and `Authorization: sso-key :`) and OIDC | `port.Authenticator` | | Identity certificate issuance | File-backed self-signed CA | `port.IdentityCertificateAuthority` | -| Server certificate issuance | File-backed self-signed CA (`serverCsrPEM` path) and BYOC (`serverCertificatePEM` + chain). Exactly one per registration/renewal. | `port.ServerCertificateAuthority` | +| Server certificate issuance | File-backed self-signed CA (`ca.server.type: self`) or external RFC 8555 ACME CA such as Let's Encrypt (`ca.server.type: acme`) for the `serverCsrPEM` path, plus BYOC (`serverCertificatePEM` + chain). Exactly one of CSR/BYOC per registration/renewal. Issuance runs through a certificate-order lifecycle: `CreateOrder` (at registration/renewal submission) returns the provider's domain-control challenges, which are relayed verbatim to the domain owner — ANS never writes DNS or serves challenge files on their behalf; `FinalizeOrder` (at verify-acme, gated on a verified challenge artifact) returns the cert, or `ErrOrderPending` for asynchronous providers (ACME CAs such as Let's Encrypt), in which case re-POSTing verify-acme re-drives the order. | `port.ServerCertificateIssuer` | | DNS verification | `noop` (quickstart; accepts any state) and `lookup` (real miekg/dns queries with TXT / TLSA / HTTPS support; TLSA responses carry the resolver's DNSSEC AuthenticatedData bit through to the TL attestation as `dnsRecordsProvisioned[].dnssecVerified`) | `port.DNSVerifier` | +| HTTP-01 challenge verification | Plain-HTTP fetch of the owner-published challenge artifact (`/.well-known/acme-challenge/` by default). The verify-acme gate passes when either the DNS-01 TXT record or the HTTP-01 resource verifies. 
| `port.HTTPChallengeVerifier` | | Signing keys | File-based ECDSA P-256 PEM | `port.KeyManager` | | Storage (RA) | SQLite | `port.AgentStore`, `port.CertificateStore`, `port.RenewalStore`, `port.OutboxStore`, `port.UnitOfWork` | | Storage (TL) | SQLite + Tessera POSIX tile storage | `tl/event` codec interfaces | @@ -197,7 +198,7 @@ verify-dns flow without touching real DNS infrastructure. - `internal/domain/` + `internal/crypto/` — pure logic; 100% coverage expected. - `internal/port/` — adapter interfaces (KeyManager, AgentStore, - DNSVerifier, ServerCertificateAuthority, …). + DNSVerifier, ServerCertificateIssuer, …). - `internal/adapter/` — concrete adapters (SQLite, file-KMS, OIDC, static-key auth, miekg/dns, self-signed CA, docsui, …). - `internal/ra/` + `internal/tl/` — service layer and HTTP diff --git a/Makefile b/Makefile index e16bdb9..16af8fd 100644 --- a/Makefile +++ b/Makefile @@ -63,14 +63,22 @@ test: test-cover: @echo "Running tests with coverage..." - @# Exclude cmd/* from the instrumented set. The three command - @# binaries (ans-ra, ans-tl, ans-verify) are thin glue: flag - @# parsing, config loading, dependency wiring, then hand off to - @# library code under internal/. We don't write unit tests for + @# Exclude cmd/* from the instrumented set. The four command + @# binaries (ans-ra, ans-tl, ans-verify, ans-dns) are thin glue: + @# flag parsing, config loading, dependency wiring, then hand off + @# to library code under internal/. We don't write unit tests for @# main() — counting those ~30 unexercised statements toward the @# 90% gate would only penalize real logic coverage. The library @# packages under internal/ are where the gate has teeth. - @pkgs=$$(go list ./... | grep -v '/cmd/' | tr '\n' ',' | sed 's/,$$//'); \ + @# + @# Exclude acmetest for the same reason: it is a test double (an + @# in-process fake RFC 8555 server) imported only by _test.go + @# files and never compiled into a production binary. 
Its fault- + @# injection knobs are exercised selectively per test, so counting + @# its unused branches as "production" statements would penalize + @# real coverage exactly the way main() would. Test scaffolding is + @# not the system under test. + @pkgs=$$(go list ./... | grep -v -e '/cmd/' -e '/acmetest' | tr '\n' ',' | sed 's/,$$//'); \ go test ./... -count=1 -coverpkg=$$pkgs -coverprofile=coverage.out -covermode=atomic @go tool cover -func=coverage.out @echo "" diff --git a/cmd/ans-ra/main.go b/cmd/ans-ra/main.go index b61a5b7..a155f8c 100644 --- a/cmd/ans-ra/main.go +++ b/cmd/ans-ra/main.go @@ -27,6 +27,7 @@ import ( "github.com/godaddy/ans/internal/adapter/auth" "github.com/godaddy/ans/internal/adapter/cert" + "github.com/godaddy/ans/internal/adapter/challenge" "github.com/godaddy/ans/internal/adapter/dns" "github.com/godaddy/ans/internal/adapter/docsui" "github.com/godaddy/ans/internal/adapter/eventbus" @@ -123,24 +124,21 @@ func run(cfgPath string) error { if err != nil { return fmt.Errorf("init identity ca: %w", err) } - // Optional server CA — enables the serverCsrPEM path at - // registration and renewal. When the config block is absent the - // RA accepts only BYOC (serverCertificatePEM). - var serverCA port.ServerCertificateAuthority - if cfg.CA.Server != nil && cfg.CA.Server.DataDir != "" { - sca, caErr := cert.NewServerSelfCA( - cfg.CA.Server.DataDir, cfg.CA.Server.Org, cfg.CA.Server.ValidityDays) - if caErr != nil { - return fmt.Errorf("init server ca: %w", caErr) + // Optional server certificate issuer — enables the serverCsrPEM + // path at registration and renewal. When the config block is + // absent the RA accepts only BYOC (serverCertificatePEM). + // `ca.server.type` selects the adapter: "self" (default) is the + // in-process self-signed CA; "acme" is an external RFC 8555 + // provider such as Let's Encrypt. Both implement the same + // port.ServerCertificateIssuer order lifecycle. 
+ var serverCA port.ServerCertificateIssuer + if cfg.CA.Server != nil { + serverCA, err = buildServerIssuer(cfg.CA.Server, logger) + if err != nil { + return err } - serverCA = sca - logger.Info(). - Str("dataDir", cfg.CA.Server.DataDir). - Str("org", cfg.CA.Server.Org). - Int("validityDays", cfg.CA.Server.ValidityDays). - Msg("server CA ready — serverCsrPEM path enabled") } else { - logger.Info().Msg("no server CA configured — serverCsrPEM path disabled (BYOC-only)") + logger.Info().Msg("no server issuer configured — serverCsrPEM path disabled (BYOC-only)") } // In local-dev, accept self-signed BYOC certs. Production must // remove WithSkipChainVerify in its config factory. @@ -171,7 +169,8 @@ func run(cfgPath string) error { KeyID: signerKeyID, RaID: cfg.Signer.RaID, }).WithDNSVerifier(dnsVerifier). - WithServerCertificateAuthority(serverCA). + WithHTTPChallengeVerifier(challenge.NewHTTPVerifier()). + WithServerCertificateIssuer(serverCA). WithTLPublicBaseURL(cfg.TLClient.PublicBaseURL) // HTTP. @@ -295,6 +294,12 @@ func run(cfgPath string) error { go service.RunExpiryChecker(expctx, renewals, certsStore, logger, service.ExpiryCheckerOptions{ Interval: 5 * time.Minute, }) + // Registration-side twin: PENDING_VALIDATION registrations whose + // challenge window lapsed flip to EXPIRED, per the spec's + // "not cancellable and will auto-expire" contract. + go service.RunAgentExpiryChecker(expctx, agents, logger, service.ExpiryCheckerOptions{ + Interval: 5 * time.Minute, + }) defer expCancel() addr := fmt.Sprintf("%s:%d", cfg.Server.Host, cfg.Server.Port) @@ -399,6 +404,32 @@ type providerWithAnonymous interface { Middleware() func(http.Handler) http.Handler } +// buildServerIssuer builds the configured server certificate issuer. +// Config validation has already checked the per-type required fields. 
+func buildServerIssuer(s *config.CAServer, logger zerolog.Logger) (port.ServerCertificateIssuer, error) { + if s.IsACME() { + issuer, err := cert.NewACMEIssuer(s.ACME.DirectoryURL, s.ACME.Email, s.ACME.DataDir) + if err != nil { + return nil, fmt.Errorf("init acme issuer: %w", err) + } + logger.Info(). + Str("directoryURL", s.ACME.DirectoryURL). + Str("dataDir", s.ACME.DataDir). + Msg("ACME issuer ready — serverCsrPEM path enabled (provider-issued certs)") + return issuer, nil + } + sca, err := cert.NewServerSelfCA(s.DataDir, s.Org, s.ValidityDays) + if err != nil { + return nil, fmt.Errorf("init server ca: %w", err) + } + logger.Info(). + Str("dataDir", s.DataDir). + Str("org", s.Org). + Int("validityDays", s.ValidityDays). + Msg("server CA ready — serverCsrPEM path enabled") + return sca, nil +} + // selectDNSVerifier returns the configured DNS adapter. Returns a // port.DNSVerifier so the service layer can wire it directly. // diff --git a/config/ra-local.yaml b/config/ra-local.yaml index f4e67b2..f5043f5 100644 --- a/config/ra-local.yaml +++ b/config/ra-local.yaml @@ -13,6 +13,35 @@ ca: org: "ANS Local Dev CA" validity-days: 365 data-dir: "./data/ra/ca" + # server: optional server certificate issuer. When omitted the RA + # accepts only BYOC server certs (serverCertificatePEM). Two + # adapter types implement the same order lifecycle: + # + # In-process self-signed CA (type self, the default): + # server: + # type: self + # org: "ANS Local Dev Server CA" + # validity-days: 365 + # data-dir: "./data/ra/server-ca" + # + # External RFC 8555 CA — Let's Encrypt et al. (type acme). Use the + # staging directory for testing; production rate limits are + # unforgiving. Selecting this type auto-accepts the provider's + # terms of service on account registration. The relayed challenges + # in pending responses become the provider's own (token + key + # authorization + computed DNS digest) — the domain owner publishes + # them, exactly as with the self-signed issuer. 
+ # + # REQUIRES dns.type: lookup (below). With a real public CA the + # verify-acme gate must check the owner's published artifact before + # answering the provider; the noop verifier would answer blindly and + # invalidate every order, so config validation rejects acme + noop. + # server: + # type: acme + # acme: + # directory-url: "https://acme-staging-v02.api.letsencrypt.org/directory" + # email: "ops@example.com" + # data-dir: "./data/ra/acme" dns: # "noop" (default) accepts any DNS state — the quickstart demo uses diff --git a/go.mod b/go.mod index 195ebc6..0829e37 100644 --- a/go.mod +++ b/go.mod @@ -18,6 +18,7 @@ require ( github.com/rs/zerolog v1.35.1 github.com/stretchr/testify v1.11.1 github.com/transparency-dev/tessera v1.0.2 + golang.org/x/crypto v0.50.0 golang.org/x/mod v0.36.0 modernc.org/sqlite v1.51.0 ) @@ -49,7 +50,6 @@ require ( go.opentelemetry.io/otel/metric v1.43.0 // indirect go.opentelemetry.io/otel/trace v1.43.0 // indirect go.yaml.in/yaml/v3 v3.0.4 // indirect - golang.org/x/crypto v0.50.0 // indirect golang.org/x/exp v0.0.0-20260410095643-746e56fc9e2f // indirect golang.org/x/net v0.53.0 // indirect golang.org/x/oauth2 v0.36.0 // indirect diff --git a/internal/adapter/cert/acme.go b/internal/adapter/cert/acme.go new file mode 100644 index 0000000..4e18842 --- /dev/null +++ b/internal/adapter/cert/acme.go @@ -0,0 +1,460 @@ +package cert + +import ( + "context" + "crypto/ecdsa" + "crypto/elliptic" + "crypto/rand" + "crypto/x509" + "encoding/pem" + "errors" + "fmt" + "os" + "path/filepath" + "sync" + "time" + + "golang.org/x/crypto/acme" + + anscrypto "github.com/godaddy/ans/internal/crypto" + "github.com/godaddy/ans/internal/domain" + "github.com/godaddy/ans/internal/port" +) + +// pemTypePrivateKey is the PEM block type for PKCS#8 private keys, +// shared by the self-signed CAs' root keys and the ACME account key. 
+const pemTypePrivateKey = "PRIVATE KEY" + +// acmeAccountKeyFile is the PKCS#8 PEM file holding the ACME account +// key, persisted under the issuer's data dir so the account survives +// restarts (re-registering the same key resolves to the same account +// per RFC 8555 §7.3.1). +const acmeAccountKeyFile = "acme-account.key" + +// defaultFinalizeBudget bounds how long a single FinalizeOrder call +// blocks on the provider before reporting port.ErrOrderPending. The +// verify-acme request that triggers the finalize runs under the +// router's 30s timeout; 10s leaves headroom for the rest of the +// request while still completing most Let's Encrypt validations +// in-call. +const defaultFinalizeBudget = 10 * time.Second + +// ACME challenge type identifiers (RFC 8555 §9.7.8) mapped to the +// domain's wire-cased enum. +const ( + acmeChallengeDNS01 = "dns-01" + acmeChallengeHTTP01 = "http-01" +) + +// ACMEIssuer implements port.ServerCertificateIssuer against an +// RFC 8555 CA — Let's Encrypt being the canonical target (use the +// staging directory for testing, production for live issuance). +// +// Order-lifecycle mapping: +// +// - CreateOrder → new-order. The provider's challenges are relayed +// with their account-bound key authorizations and the computed +// DNS-01 TXT digest, so the pending-registration response hands +// the domain owner exactly the artifacts to publish. ANS never +// publishes them — the owner does, exactly as with the +// self-signed issuer. +// - FinalizeOrder → answer the RA-verified challenge, wait briefly +// for validation + issuance, then finalize with the CSR and +// download the chain. Validation that outlives the in-call +// budget reports port.ErrOrderPending; the re-driven verify-acme +// picks the order back up by its URL. Orders the provider moves +// to `invalid` report port.ErrOrderFailed. 
+// +// Account handling: the account key is generated once and persisted +// under dataDir; registration happens lazily on first use and +// auto-accepts the provider's terms of service — choosing +// `ca.server.type: acme` in config is the operator's ToS consent, +// per standard ACME automation practice. +type ACMEIssuer struct { + client *acme.Client + contact []string + finalizeBudget time.Duration + + mu sync.Mutex + registered bool + // chainRootPEM caches the top of the most recently downloaded + // chain for GetCACertificate. Informational for ACME providers — + // relying parties already hold the public root in system stores. + chainRootPEM string +} + +// ACMEIssuerOption configures the issuer at construction time. +type ACMEIssuerOption func(*ACMEIssuer) + +// WithFinalizeBudget overrides how long FinalizeOrder blocks on the +// provider before reporting port.ErrOrderPending. +func WithFinalizeBudget(d time.Duration) ACMEIssuerOption { + return func(a *ACMEIssuer) { a.finalizeBudget = d } +} + +// NewACMEIssuer opens (or creates) the ACME account key under +// dataDir and returns an issuer speaking to the given directory URL +// (e.g. Let's Encrypt staging: +// https://acme-staging-v02.api.letsencrypt.org/directory). The +// optional email becomes the account contact for expiry and incident +// notices. No network I/O happens here — account registration is +// deferred to first use so the RA can boot while the provider is +// unreachable. 
+func NewACMEIssuer(directoryURL, email, dataDir string, opts ...ACMEIssuerOption) (*ACMEIssuer, error) { + if directoryURL == "" { + return nil, errors.New("cert: acme directory-url is required") + } + if dataDir == "" { + return nil, errors.New("cert: acme data-dir is required") + } + if err := os.MkdirAll(dataDir, 0o700); err != nil { + return nil, fmt.Errorf("cert: create acme dir: %w", err) + } + key, err := loadOrCreateAccountKey(filepath.Join(dataDir, acmeAccountKeyFile)) + if err != nil { + return nil, err + } + a := &ACMEIssuer{ + client: &acme.Client{Key: key, DirectoryURL: directoryURL}, + finalizeBudget: defaultFinalizeBudget, + } + if email != "" { + a.contact = []string{"mailto:" + email} + } + for _, opt := range opts { + opt(a) + } + return a, nil +} + +// CreateOrder opens an RFC 8555 order for the FQDN and relays the +// provider's pending challenges. +// +// A new order is not always 'pending': per RFC 8555 §7.1.3 a CA that +// still holds a valid authorization for this account+identifier +// returns the order already 'ready' (Let's Encrypt reuses +// authorizations for ~30 days). The RA uses a single ACME account, so +// a renewal or re-registration of a recently-validated FQDN hits this +// routinely. In that case there is nothing for the owner to publish: +// the order is relayed as ISSUING with no challenges, the RA's gate +// skips ISSUING orders, and the next verify-acme finalizes directly. 
+func (a *ACMEIssuer) CreateOrder(ctx context.Context, fqdn string) (*domain.CertificateOrder, error) { + if fqdn == "" { + return nil, errors.New("cert: create order: fqdn is required") + } + if err := a.ensureRegistered(ctx); err != nil { + return nil, err + } + order, err := a.client.AuthorizeOrder(ctx, acme.DomainIDs(fqdn)) + if err != nil { + return nil, fmt.Errorf("cert: acme new-order: %w", err) + } + + switch order.Status { + case acme.StatusPending: + challenges, cerr := a.collectChallenges(ctx, order) + if cerr != nil { + return nil, cerr + } + return &domain.CertificateOrder{ + OrderRef: order.URI, + State: domain.OrderStatePending, + Challenges: challenges, + ExpiresAt: order.Expires, + }, nil + case acme.StatusReady, acme.StatusProcessing, acme.StatusValid: + // Authorizations already satisfied (reuse) or issuance already + // underway — no domain-control artifact for the owner to + // publish. Relay as ISSUING so verify-acme drives FinalizeOrder. + return &domain.CertificateOrder{ + OrderRef: order.URI, + State: domain.OrderStateIssuing, + ExpiresAt: order.Expires, + }, nil + default: // acme.StatusInvalid or unknown + return nil, fmt.Errorf("cert: acme new-order returned unusable status %q", order.Status) + } +} + +// collectChallenges walks the order's pending authorizations and +// maps their dns-01 / http-01 challenges to the domain shape. The +// DNS TXT value is the provider-mandated digest of the key +// authorization, NOT the raw token — relayed precomputed so the +// domain owner publishes an opaque name/value pair without knowing +// ACME exists. 
+func (a *ACMEIssuer) collectChallenges(ctx context.Context, order *acme.Order) ([]domain.Challenge, error) { + var out []domain.Challenge + for _, zurl := range order.AuthzURLs { + authz, err := a.client.GetAuthorization(ctx, zurl) + if err != nil { + return nil, fmt.Errorf("cert: acme get authorization: %w", err) + } + if authz.Status != acme.StatusPending { + continue + } + for _, ch := range authz.Challenges { + keyAuth, err := a.client.HTTP01ChallengeResponse(ch.Token) + if err != nil { + return nil, fmt.Errorf("cert: acme key authorization: %w", err) + } + switch ch.Type { + case acmeChallengeDNS01: + value, err := a.client.DNS01ChallengeRecord(ch.Token) + if err != nil { + return nil, fmt.Errorf("cert: acme dns-01 record: %w", err) + } + out = append(out, domain.Challenge{ + Type: domain.ChallengeTypeDNS01, + Token: ch.Token, + KeyAuthorization: keyAuth, + DNSRecordValue: value, + }) + case acmeChallengeHTTP01: + out = append(out, domain.Challenge{ + Type: domain.ChallengeTypeHTTP01, + Token: ch.Token, + KeyAuthorization: keyAuth, + HTTPPath: a.client.HTTP01ChallengePath(ch.Token), + }) + } + } + } + if len(out) == 0 { + return nil, errors.New("cert: acme order offered no supported challenges (dns-01 / http-01)") + } + return out, nil +} + +// FinalizeOrder drives the order to completion: answer the challenges +// the RA verified, wait (bounded) for the provider's validation, then +// finalize with the CSR and download the chain. 
+func (a *ACMEIssuer) FinalizeOrder(ctx context.Context, req port.FinalizeOrderRequest) (*port.IssuedCert, error) { + csr, err := anscrypto.ValidateServerCSR(req.CSRPEM, req.FQDN) + if err != nil { + return nil, err + } + if req.OrderRef == "" { + return nil, errors.New("cert: acme finalize: order ref is required") + } + if err := a.ensureRegistered(ctx); err != nil { + return nil, err + } + + order, err := a.client.GetOrder(ctx, req.OrderRef) + if err != nil { + return nil, fmt.Errorf("cert: acme get order: %w", err) + } + + if order.Status == acme.StatusPending { + if err := a.answerVerifiedChallenges(ctx, order, req.Verified); err != nil { + return nil, err + } + if order, err = a.waitOrder(ctx, req.OrderRef); err != nil { + return nil, err + } + } + + switch order.Status { + case acme.StatusReady: + return a.finalizeWithCSR(ctx, order.FinalizeURL, csr.Raw) + case acme.StatusProcessing, acme.StatusPending: + // Validation or issuance is still running provider-side; the + // re-driven verify-acme picks the order back up. + return nil, fmt.Errorf("cert: acme order %s: %w", order.Status, port.ErrOrderPending) + case acme.StatusValid: + der, err := a.client.FetchCert(ctx, order.CertURL, true) + if err != nil { + return nil, fmt.Errorf("cert: acme fetch cert: %w", err) + } + return a.issuedFromChain(der, order.CertURL) + case acme.StatusInvalid: + return nil, fmt.Errorf("cert: acme order invalid: %w", port.ErrOrderFailed) + default: + return nil, fmt.Errorf("cert: acme order in unexpected status %q", order.Status) + } +} + +// answerVerifiedChallenges tells the provider to validate exactly the +// challenges the RA's pre-flight gate found published. Answering an +// unsatisfied challenge would move its authorization to invalid and +// kill the order — which is why the port contract threads Verified +// through. Already-answered challenges (re-driven calls) are skipped +// by their non-pending status. 
+func (a *ACMEIssuer) answerVerifiedChallenges(ctx context.Context, order *acme.Order, verified []domain.ChallengeType) error { + wanted := map[string]bool{} + for _, t := range verified { + switch t { + case domain.ChallengeTypeDNS01: + wanted[acmeChallengeDNS01] = true + case domain.ChallengeTypeHTTP01: + wanted[acmeChallengeHTTP01] = true + } + } + if len(wanted) == 0 { + return nil + } + for _, zurl := range order.AuthzURLs { + authz, err := a.client.GetAuthorization(ctx, zurl) + if err != nil { + return fmt.Errorf("cert: acme get authorization: %w", err) + } + if authz.Status != acme.StatusPending { + continue + } + for _, ch := range authz.Challenges { + if !wanted[ch.Type] || ch.Status != acme.StatusPending { + continue + } + if _, err := a.client.Accept(ctx, ch); err != nil { + return fmt.Errorf("cert: acme accept %s challenge: %w", ch.Type, err) + } + // One accepted challenge satisfies the authorization; + // answering more buys nothing and risks a race with the + // provider marking the authz valid mid-loop. + break + } + } + return nil +} + +// waitOrder polls the order within the finalize budget. A budget +// overrun is not an error — the order is simply still pending and a +// later verify-acme re-drives it. +func (a *ACMEIssuer) waitOrder(ctx context.Context, orderRef string) (*acme.Order, error) { + waitCtx, cancel := context.WithTimeout(ctx, a.finalizeBudget) + defer cancel() + order, err := a.client.WaitOrder(waitCtx, orderRef) + switch { + case err == nil: + return order, nil + case errors.Is(err, context.DeadlineExceeded) && ctx.Err() == nil: + return nil, fmt.Errorf("cert: acme validation still running: %w", port.ErrOrderPending) + default: + var oe *acme.OrderError + if errors.As(err, &oe) { + return nil, fmt.Errorf("cert: acme order %s: %w", oe.Status, port.ErrOrderFailed) + } + return nil, fmt.Errorf("cert: acme wait order: %w", err) + } +} + +// finalizeWithCSR submits the CSR and downloads the issued chain. 
A +// budget overrun after submission is reported pending — the next +// re-drive finds the order processing/valid and fetches the cert. +func (a *ACMEIssuer) finalizeWithCSR(ctx context.Context, finalizeURL string, csrDER []byte) (*port.IssuedCert, error) { + waitCtx, cancel := context.WithTimeout(ctx, a.finalizeBudget) + defer cancel() + der, certURL, err := a.client.CreateOrderCert(waitCtx, finalizeURL, csrDER, true) + switch { + case err == nil: + return a.issuedFromChain(der, certURL) + case errors.Is(err, context.DeadlineExceeded) && ctx.Err() == nil: + return nil, fmt.Errorf("cert: acme issuance still running: %w", port.ErrOrderPending) + default: + var oe *acme.OrderError + if errors.As(err, &oe) { + return nil, fmt.Errorf("cert: acme order %s: %w", oe.Status, port.ErrOrderFailed) + } + return nil, fmt.Errorf("cert: acme finalize: %w", err) + } +} + +// issuedFromChain converts the downloaded DER chain (leaf first) into +// the port shape and caches the chain top for GetCACertificate. The +// certificate URL becomes the provider handle (CertificateRef) — the +// stable reference for audit and for RFC 8555 §7.6 revocation. +func (a *ACMEIssuer) issuedFromChain(der [][]byte, certURL string) (*port.IssuedCert, error) { + if len(der) == 0 { + return nil, errors.New("cert: acme returned an empty certificate chain") + } + leaf, err := x509.ParseCertificate(der[0]) + if err != nil { + return nil, fmt.Errorf("cert: parse acme leaf: %w", err) + } + certPEM := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: der[0]}) + var chainPEM []byte + for _, d := range der[1:] { + chainPEM = append(chainPEM, pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: d})...) 
+ } + + a.mu.Lock() + if len(der) > 1 { + a.chainRootPEM = string(pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: der[len(der)-1]})) + } + a.mu.Unlock() + + return &port.IssuedCert{ + CertPEM: string(certPEM), + ChainPEM: string(chainPEM), + SerialNumber: fmt.Sprintf("%x", leaf.SerialNumber), + CertificateRef: certURL, + ExpiresAt: leaf.NotAfter, + IssuedAt: leaf.NotBefore, + }, nil +} + +// GetCACertificate returns the top of the most recently downloaded +// chain. Informational for ACME providers — relying parties already +// trust the public root via system stores, so an error before first +// issuance is expected and harmless. +func (a *ACMEIssuer) GetCACertificate(_ context.Context) (string, error) { + a.mu.Lock() + defer a.mu.Unlock() + if a.chainRootPEM == "" { + return "", errors.New("cert: acme issuer has not downloaded a chain yet — the provider's public root is already in system trust stores") + } + return a.chainRootPEM, nil +} + +// ensureRegistered lazily registers the account on first use. An +// already-registered key (same key, prior run) is success per +// RFC 8555 §7.3.1. +func (a *ACMEIssuer) ensureRegistered(ctx context.Context) error { + a.mu.Lock() + defer a.mu.Unlock() + if a.registered { + return nil + } + _, err := a.client.Register(ctx, &acme.Account{Contact: a.contact}, acme.AcceptTOS) + if err != nil && !errors.Is(err, acme.ErrAccountAlreadyExists) { + return fmt.Errorf("cert: acme account registration: %w", err) + } + a.registered = true + return nil +} + +// loadOrCreateAccountKey reads the persisted ACME account key, or +// generates an ECDSA P-256 key on first run. 
+func loadOrCreateAccountKey(path string) (*ecdsa.PrivateKey, error) { + if raw, err := os.ReadFile(path); err == nil { + block, _ := pem.Decode(raw) + if block == nil || block.Type != pemTypePrivateKey { + return nil, errors.New("cert: acme account key is not a PKCS#8 PRIVATE KEY PEM") + } + key, err := x509.ParsePKCS8PrivateKey(block.Bytes) + if err != nil { + return nil, fmt.Errorf("cert: parse acme account key: %w", err) + } + ec, ok := key.(*ecdsa.PrivateKey) + if !ok { + return nil, errors.New("cert: acme account key is not an ECDSA key") + } + return ec, nil + } else if !os.IsNotExist(err) { + return nil, fmt.Errorf("cert: read acme account key: %w", err) + } + + key, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + if err != nil { + return nil, fmt.Errorf("cert: generate acme account key: %w", err) + } + der, err := x509.MarshalPKCS8PrivateKey(key) + if err != nil { + return nil, fmt.Errorf("cert: marshal acme account key: %w", err) + } + if err := os.WriteFile(path, + pem.EncodeToMemory(&pem.Block{Type: pemTypePrivateKey, Bytes: der}), 0o600); err != nil { + return nil, fmt.Errorf("cert: write acme account key: %w", err) + } + return key, nil +} diff --git a/internal/adapter/cert/acme_test.go b/internal/adapter/cert/acme_test.go new file mode 100644 index 0000000..621d8bb --- /dev/null +++ b/internal/adapter/cert/acme_test.go @@ -0,0 +1,488 @@ +package cert + +import ( + "crypto/ecdsa" + "crypto/ed25519" + "crypto/rand" + "crypto/x509" + "encoding/pem" + "errors" + "os" + "path/filepath" + "strings" + "testing" + "time" + + "golang.org/x/crypto/acme" + + "github.com/godaddy/ans/internal/adapter/cert/acmetest" + "github.com/godaddy/ans/internal/domain" + "github.com/godaddy/ans/internal/port" +) + +func newFakeACME(t *testing.T) *acmetest.Server { + t.Helper() + f, err := acmetest.New() + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { + if perr := f.Err(); perr != nil { + t.Errorf("fake acme observed a protocol violation: %v", perr) + } + 
f.Close() + }) + return f +} + +func newTestACMEIssuer(t *testing.T, f *acmetest.Server, opts ...ACMEIssuerOption) *ACMEIssuer { + t.Helper() + issuer, err := NewACMEIssuer(f.DirectoryURL(), "ops@example.com", t.TempDir(), opts...) + if err != nil { + t.Fatal(err) + } + return issuer +} + +func TestACMEIssuer_CreateOrder_RelaysProviderChallenges(t *testing.T) { + f := newFakeACME(t) + issuer := newTestACMEIssuer(t, f) + + order, err := issuer.CreateOrder(t.Context(), "agent.example.com") + if err != nil { + t.Fatalf("create order: %v", err) + } + if order.OrderRef != f.OrderURL() { + t.Errorf("order ref must be the provider order URL, got %q", order.OrderRef) + } + if order.State != domain.OrderStatePending || order.ExpiresAt.IsZero() { + t.Errorf("order shape: %+v", order) + } + + dns01, ok := order.ChallengeOfType(domain.ChallengeTypeDNS01) + if !ok { + t.Fatal("missing dns-01") + } + if dns01.Token != f.DNSToken() { + t.Errorf("dns token: %q", dns01.Token) + } + // The TXT value is the digest of the key authorization — never + // the raw token for an ACME provider. 
+ if dns01.DNSRecordValue == "" || dns01.DNSRecordValue == dns01.Token { + t.Errorf("dns record value must be the provider digest, got %q", dns01.DNSRecordValue) + } + if !strings.HasPrefix(dns01.KeyAuthorization, f.DNSToken()+".") { + t.Errorf("key authorization shape: %q", dns01.KeyAuthorization) + } + + http01, ok := order.ChallengeOfType(domain.ChallengeTypeHTTP01) + if !ok { + t.Fatal("missing http-01") + } + if http01.EffectiveHTTPPath() != "/.well-known/acme-challenge/"+f.HTTPToken() { + t.Errorf("http path: %q", http01.EffectiveHTTPPath()) + } + if !strings.HasPrefix(http01.KeyAuthorization, f.HTTPToken()+".") { + t.Errorf("http key authorization: %q", http01.KeyAuthorization) + } +} + +func TestACMEIssuer_FinalizeOrder_AnswersOnlyVerifiedChallenge(t *testing.T) { + f := newFakeACME(t) + issuer := newTestACMEIssuer(t, f) + + order, err := issuer.CreateOrder(t.Context(), "agent.example.com") + if err != nil { + t.Fatal(err) + } + csrPEM := buildCSR(t, "agent.example.com", nil, []string{"agent.example.com"}) + + issued, err := issuer.FinalizeOrder(t.Context(), port.FinalizeOrderRequest{ + OrderRef: order.OrderRef, + CSRPEM: csrPEM, + FQDN: "agent.example.com", + Verified: []domain.ChallengeType{domain.ChallengeTypeDNS01}, + }) + if err != nil { + t.Fatalf("finalize: %v", err) + } + if issued.CertPEM == "" || issued.ChainPEM == "" || issued.SerialNumber == "" { + t.Errorf("issued cert incomplete: %+v", issued) + } + + // The Verified contract: only the RA-verified challenge was + // answered. Answering the unsatisfied http-01 would have + // invalidated the authorization at a real provider. + if accepted := f.Accepted(); len(accepted) != 1 || accepted[0] != "dns" { + t.Errorf("accepted challenges: %v, want exactly [dns]", accepted) + } + + // Chain root is cached for GetCACertificate after issuance. 
+ rootPEM, err := issuer.GetCACertificate(t.Context()) + if err != nil || !strings.Contains(rootPEM, "BEGIN CERTIFICATE") { + t.Errorf("GetCACertificate after issuance: err=%v", err) + } +} + +func TestACMEIssuer_FinalizeOrder_PendingThenRedriven(t *testing.T) { + f := newFakeACME(t) + // Tight budget ONLY for the held-pending call: the fake replies + // Retry-After: 1s, so a sub-second budget forces WaitOrder to + // time out into ErrOrderPending deterministically. + pendingIssuer := newTestACMEIssuer(t, f, WithFinalizeBudget(300*time.Millisecond)) + + order, err := pendingIssuer.CreateOrder(t.Context(), "agent.example.com") + if err != nil { + t.Fatal(err) + } + csrPEM := buildCSR(t, "agent.example.com", nil, []string{"agent.example.com"}) + + // Provider-side validation outlives the in-call budget. + f.SetHoldPending(true) + _, err = pendingIssuer.FinalizeOrder(t.Context(), port.FinalizeOrderRequest{ + OrderRef: order.OrderRef, + CSRPEM: csrPEM, + FQDN: "agent.example.com", + Verified: []domain.ChallengeType{domain.ChallengeTypeDNS01}, + }) + if !errors.Is(err, port.ErrOrderPending) { + t.Fatalf("want ErrOrderPending, got %v", err) + } + + // Validation completes provider-side; a re-driven call (modeled as + // a separate verify-acme request, hence its own default-budget + // issuer — the fake binds orders to no account, so the order URL + // re-drives cleanly) finishes the order. The default budget means + // the expected-success finalize is never racing a tight deadline. 
+ f.SetHoldPending(false) + f.SetOrderStatus(acme.StatusReady) + redriveIssuer := newTestACMEIssuer(t, f) + issued, err := redriveIssuer.FinalizeOrder(t.Context(), port.FinalizeOrderRequest{ + OrderRef: order.OrderRef, + CSRPEM: csrPEM, + FQDN: "agent.example.com", + Verified: []domain.ChallengeType{domain.ChallengeTypeDNS01}, + }) + if err != nil { + t.Fatalf("re-driven finalize: %v", err) + } + if issued.CertPEM == "" { + t.Error("re-driven finalize returned no cert") + } +} + +func TestACMEIssuer_FinalizeOrder_ProcessingReportsPending(t *testing.T) { + f := newFakeACME(t) + issuer := newTestACMEIssuer(t, f) + + order, err := issuer.CreateOrder(t.Context(), "agent.example.com") + if err != nil { + t.Fatal(err) + } + // Provider already validating/issuing when we arrive. + f.SetOrderStatus(acme.StatusProcessing) + _, err = issuer.FinalizeOrder(t.Context(), port.FinalizeOrderRequest{ + OrderRef: order.OrderRef, + CSRPEM: buildCSR(t, "agent.example.com", nil, []string{"agent.example.com"}), + FQDN: "agent.example.com", + }) + if !errors.Is(err, port.ErrOrderPending) { + t.Fatalf("want ErrOrderPending for processing order, got %v", err) + } +} + +func TestACMEIssuer_FinalizeOrder_UnknownStatus(t *testing.T) { + f := newFakeACME(t) + issuer := newTestACMEIssuer(t, f) + + order, err := issuer.CreateOrder(t.Context(), "agent.example.com") + if err != nil { + t.Fatal(err) + } + f.SetOrderStatus("deactivated") + _, err = issuer.FinalizeOrder(t.Context(), port.FinalizeOrderRequest{ + OrderRef: order.OrderRef, + CSRPEM: buildCSR(t, "agent.example.com", nil, []string{"agent.example.com"}), + FQDN: "agent.example.com", + }) + if err == nil || errors.Is(err, port.ErrOrderPending) || errors.Is(err, port.ErrOrderFailed) { + t.Fatalf("unknown status must be a plain error, got %v", err) + } +} + +func TestACMEIssuer_FinalizeOrder_ProviderOutageMidIssuance(t *testing.T) { + f := newFakeACME(t) + // Short budget: the client retries 5xx finalize responses until + // the budget 
expires. + issuer := newTestACMEIssuer(t, f, WithFinalizeBudget(300*time.Millisecond)) + + order, err := issuer.CreateOrder(t.Context(), "agent.example.com") + if err != nil { + t.Fatal(err) + } + f.SetFailFinalize(true) + _, err = issuer.FinalizeOrder(t.Context(), port.FinalizeOrderRequest{ + OrderRef: order.OrderRef, + CSRPEM: buildCSR(t, "agent.example.com", nil, []string{"agent.example.com"}), + FQDN: "agent.example.com", + Verified: []domain.ChallengeType{domain.ChallengeTypeDNS01}, + }) + // A 500 on finalize is neither pending nor a terminal order + // failure — it surfaces as a plain retryable error. + if err == nil || errors.Is(err, port.ErrOrderPending) || errors.Is(err, port.ErrOrderFailed) { + t.Fatalf("provider outage must be a plain error, got %v", err) + } +} + +func TestACMEIssuer_FinalizeOrder_ValidOrderFetchesCert(t *testing.T) { + f := newFakeACME(t) + issuer := newTestACMEIssuer(t, f) + + order, err := issuer.CreateOrder(t.Context(), "agent.example.com") + if err != nil { + t.Fatal(err) + } + // Order already valid (issued while we were away): FetchCert path. 
+ f.SetOrderStatus(acme.StatusValid) + issued, err := issuer.FinalizeOrder(t.Context(), port.FinalizeOrderRequest{ + OrderRef: order.OrderRef, + CSRPEM: buildCSR(t, "agent.example.com", nil, []string{"agent.example.com"}), + FQDN: "agent.example.com", + }) + if err != nil || issued.CertPEM == "" { + t.Fatalf("valid-order fetch: err=%v", err) + } +} + +func TestACMEIssuer_FinalizeOrder_InvalidOrderFails(t *testing.T) { + f := newFakeACME(t) + issuer := newTestACMEIssuer(t, f) + + order, err := issuer.CreateOrder(t.Context(), "agent.example.com") + if err != nil { + t.Fatal(err) + } + f.SetFailValidation(true) + _, err = issuer.FinalizeOrder(t.Context(), port.FinalizeOrderRequest{ + OrderRef: order.OrderRef, + CSRPEM: buildCSR(t, "agent.example.com", nil, []string{"agent.example.com"}), + FQDN: "agent.example.com", + Verified: []domain.ChallengeType{domain.ChallengeTypeDNS01}, + }) + if !errors.Is(err, port.ErrOrderFailed) { + t.Fatalf("want ErrOrderFailed, got %v", err) + } +} + +func TestACMEIssuer_FinalizeOrder_InputValidation(t *testing.T) { + f := newFakeACME(t) + issuer := newTestACMEIssuer(t, f) + + // Bad CSR shape rejected before any provider call. + if _, err := issuer.FinalizeOrder(t.Context(), port.FinalizeOrderRequest{ + OrderRef: "x", CSRPEM: "junk", FQDN: "agent.example.com", + }); err == nil { + t.Error("want CSR validation error") + } + // Missing order ref rejected. 
+ if _, err := issuer.FinalizeOrder(t.Context(), port.FinalizeOrderRequest{ + CSRPEM: buildCSR(t, "agent.example.com", nil, []string{"agent.example.com"}), + FQDN: "agent.example.com", + }); err == nil { + t.Error("want order-ref error") + } +} + +func TestACMEIssuer_GetCACertificate_BeforeIssuance(t *testing.T) { + f := newFakeACME(t) + issuer := newTestACMEIssuer(t, f) + if _, err := issuer.GetCACertificate(t.Context()); err == nil { + t.Error("want error before first issuance") + } +} + +func TestACMEIssuer_AccountKeyPersists(t *testing.T) { + f := newFakeACME(t) + dir := t.TempDir() + i1, err := NewACMEIssuer(f.DirectoryURL(), "", dir) + if err != nil { + t.Fatal(err) + } + if _, err := i1.CreateOrder(t.Context(), "agent.example.com"); err != nil { + t.Fatal(err) + } + // Second instance reuses the persisted key — same account per + // RFC 8555 §7.3.1. + i2, err := NewACMEIssuer(f.DirectoryURL(), "", dir) + if err != nil { + t.Fatal(err) + } + if _, err := i2.CreateOrder(t.Context(), "agent.example.com"); err != nil { + t.Fatalf("second instance with persisted key: %v", err) + } + k1, ok1 := i1.client.Key.(*ecdsa.PrivateKey) + k2, ok2 := i2.client.Key.(*ecdsa.PrivateKey) + if !ok1 || !ok2 || !k1.Equal(k2) { + t.Error("account key must persist across restarts") + } +} + +func TestNewACMEIssuer_InputValidation(t *testing.T) { + if _, err := NewACMEIssuer("", "", t.TempDir()); err == nil { + t.Error("want directory-url error") + } + if _, err := NewACMEIssuer("https://example.com/dir", "", ""); err == nil { + t.Error("want data-dir error") + } +} + +func TestACMEIssuer_CreateOrder_RequiresFQDN(t *testing.T) { + f := newFakeACME(t) + issuer := newTestACMEIssuer(t, f) + if _, err := issuer.CreateOrder(t.Context(), ""); err == nil { + t.Error("want fqdn error") + } +} + +func TestACMEIssuer_HTTP01VerifiedChallengeAnswered(t *testing.T) { + f := newFakeACME(t) + issuer := newTestACMEIssuer(t, f) + + order, err := issuer.CreateOrder(t.Context(), "agent.example.com") + 
if err != nil { + t.Fatal(err) + } + if _, err := issuer.FinalizeOrder(t.Context(), port.FinalizeOrderRequest{ + OrderRef: order.OrderRef, + CSRPEM: buildCSR(t, "agent.example.com", nil, []string{"agent.example.com"}), + FQDN: "agent.example.com", + Verified: []domain.ChallengeType{domain.ChallengeTypeHTTP01}, + }); err != nil { + t.Fatalf("finalize via http-01: %v", err) + } + if accepted := f.Accepted(); len(accepted) != 1 || accepted[0] != "http" { + t.Errorf("accepted: %v, want exactly [http]", accepted) + } +} + +func TestACMEIssuer_CreateOrder_BornReady(t *testing.T) { + f := newFakeACME(t) + issuer := newTestACMEIssuer(t, f) + // Authorization reuse (RFC 8555 §7.1.3): a new order comes back + // already 'ready', so there is no challenge for the owner to + // publish. CreateOrder must relay it as ISSUING with no challenges + // — NOT error — so the gate skips it and verify-acme finalizes + // directly. This is routine on real Let's Encrypt within its + // authorization-reuse window. + f.SetOrderStatus(acme.StatusReady) + order, err := issuer.CreateOrder(t.Context(), "agent.example.com") + if err != nil { + t.Fatalf("born-ready order must not error: %v", err) + } + if order.State != domain.OrderStateIssuing { + t.Errorf("state: got %s want ISSUING", order.State) + } + if len(order.Challenges) != 0 { + t.Errorf("born-ready order must carry no challenges, got %d", len(order.Challenges)) + } + if order.OrderRef != f.OrderURL() { + t.Errorf("order ref: got %q", order.OrderRef) + } +} + +func TestACMEIssuer_CreateOrder_NoSupportedChallenges(t *testing.T) { + f := newFakeACME(t) + issuer := newTestACMEIssuer(t, f) + // A pending order whose only challenge is tls-alpn-01 (which this + // adapter doesn't implement) must surface a clear error, not an + // empty challenge set the gate could never satisfy. 
+ f.SetUnsupportedChallengesOnly(true) + if _, err := issuer.CreateOrder(t.Context(), "agent.example.com"); err == nil { + t.Error("want no-supported-challenges error") + } +} + +func TestACMEIssuer_UnreachableProvider(t *testing.T) { + issuer, err := NewACMEIssuer("http://127.0.0.1:1/dir", "", t.TempDir()) + if err != nil { + t.Fatal(err) + } + if _, err := issuer.CreateOrder(t.Context(), "agent.example.com"); err == nil { + t.Error("want registration failure against unreachable provider") + } + if _, err := issuer.FinalizeOrder(t.Context(), port.FinalizeOrderRequest{ + OrderRef: "http://127.0.0.1:1/order/1", + CSRPEM: buildCSR(t, "agent.example.com", nil, []string{"agent.example.com"}), + FQDN: "agent.example.com", + }); err == nil { + t.Error("want finalize failure against unreachable provider") + } +} + +func TestLoadOrCreateAccountKey_Errors(t *testing.T) { + dir := t.TempDir() + + // Garbage PEM. + junk := filepath.Join(dir, "junk.key") + if err := os.WriteFile(junk, []byte("junk"), 0o600); err != nil { + t.Fatal(err) + } + if _, err := loadOrCreateAccountKey(junk); err == nil { + t.Error("want PEM error for junk key file") + } + + // Wrong PEM block type. + wrongType := filepath.Join(dir, "wrong-type.key") + if err := os.WriteFile(wrongType, + pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: []byte{1}}), 0o600); err != nil { + t.Fatal(err) + } + if _, err := loadOrCreateAccountKey(wrongType); err == nil { + t.Error("want type error for non-private-key PEM") + } + + // Valid PKCS#8 but not ECDSA. 
+ edPub, edPriv, err := ed25519.GenerateKey(rand.Reader) + _ = edPub + if err != nil { + t.Fatal(err) + } + edDER, err := x509.MarshalPKCS8PrivateKey(edPriv) + if err != nil { + t.Fatal(err) + } + notEC := filepath.Join(dir, "ed25519.key") + if err := os.WriteFile(notEC, + pem.EncodeToMemory(&pem.Block{Type: pemTypePrivateKey, Bytes: edDER}), 0o600); err != nil { + t.Fatal(err) + } + if _, err := loadOrCreateAccountKey(notEC); err == nil { + t.Error("want ECDSA error for ed25519 key") + } + + // PEM type right, DER garbage. + badDER := filepath.Join(dir, "bad-der.key") + if err := os.WriteFile(badDER, + pem.EncodeToMemory(&pem.Block{Type: pemTypePrivateKey, Bytes: []byte{0xff}}), 0o600); err != nil { + t.Fatal(err) + } + if _, err := loadOrCreateAccountKey(badDER); err == nil { + t.Error("want parse error for bad DER") + } + + // Read failure that isn't not-exist: path is a directory. + if _, err := loadOrCreateAccountKey(dir); err == nil { + t.Error("want read error for directory path") + } +} + +func TestNewACMEIssuer_DataDirCreationFails(t *testing.T) { + // dataDir nested under a regular file cannot be created. + blocker := filepath.Join(t.TempDir(), "file") + if err := os.WriteFile(blocker, []byte("x"), 0o600); err != nil { + t.Fatal(err) + } + if _, err := NewACMEIssuer("https://acme.example/dir", "", filepath.Join(blocker, "sub")); err == nil { + t.Error("want mkdir error") + } +} diff --git a/internal/adapter/cert/acmetest/acmetest.go b/internal/adapter/cert/acmetest/acmetest.go new file mode 100644 index 0000000..c482da7 --- /dev/null +++ b/internal/adapter/cert/acmetest/acmetest.go @@ -0,0 +1,369 @@ +// Package acmetest provides an in-process fake RFC 8555 server for +// exercising the ACME issuer adapter without network access. 
It +// covers exactly the endpoints the issuer drives — directory, nonce, +// new-account, new-order, authorization, challenge accept, order +// poll, finalize, and certificate download — and exposes knobs to +// simulate slow validation, terminal failure, and pre-completed +// orders. JWS request bodies are decoded without signature +// verification: the code under test is the ACME client, never the +// server. +// +// Test-support only; production deployments point the issuer at a +// real directory URL (Let's Encrypt staging or production). +package acmetest + +import ( + "crypto/ecdsa" + "crypto/elliptic" + "crypto/rand" + "crypto/x509" + "crypto/x509/pkix" + "encoding/base64" + "encoding/json" + "encoding/pem" + "fmt" + "math/big" + "net/http" + "net/http/httptest" + "strings" + "sync" + "time" +) + +// RFC 8555 status strings, mirrored locally so this package doesn't +// depend on x/crypto/acme. +const ( + statusPending = "pending" + statusReady = "ready" + statusValid = "valid" + statusInvalid = "invalid" +) + +// Server is the fake ACME provider. Construct with New, point the +// issuer at DirectoryURL(), and drive scenarios via the setters. +type Server struct { + srv *httptest.Server + + rootKey *ecdsa.PrivateKey + rootCert *x509.Certificate + + mu sync.Mutex + orderStatus string + // holdPending freezes the order in pending after a challenge is + // accepted — simulates slow provider-side validation. + holdPending bool + // failValidation moves the order to invalid after accept. + failValidation bool + // failFinalize makes the finalize endpoint answer 500. + failFinalize bool + // unsupportedChallengesOnly makes authorizations offer only a + // challenge type the adapter can't satisfy (tls-alpn-01), so + // CreateOrder surfaces the no-supported-challenges error. + unsupportedChallengesOnly bool + accepted []string + errs []error + + dnsToken string + httpToken string +} + +// New starts the fake server with a fresh in-memory root CA. 
Callers +// must Close it (or register it with t.Cleanup). +func New() (*Server, error) { + rootKey, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + if err != nil { + return nil, err + } + tmpl := &x509.Certificate{ + SerialNumber: big.NewInt(1), + Subject: pkix.Name{CommonName: "acmetest Root"}, + NotBefore: time.Now().Add(-time.Hour), + NotAfter: time.Now().Add(24 * time.Hour), + KeyUsage: x509.KeyUsageCertSign, + BasicConstraintsValid: true, + IsCA: true, + } + rootDER, err := x509.CreateCertificate(rand.Reader, tmpl, tmpl, rootKey.Public(), rootKey) + if err != nil { + return nil, err + } + rootCert, err := x509.ParseCertificate(rootDER) + if err != nil { + return nil, err + } + s := &Server{ + rootKey: rootKey, + rootCert: rootCert, + orderStatus: statusPending, + dnsToken: "dns-token-1", + httpToken: "http-token-1", + } + s.srv = httptest.NewServer(http.HandlerFunc(s.handle)) + return s, nil +} + +// Close shuts the server down. +func (s *Server) Close() { s.srv.Close() } + +// DirectoryURL is what the issuer's directory-url config points at. +func (s *Server) DirectoryURL() string { return s.url("/dir") } + +// OrderURL returns the provider order URL the fake hands out — what +// CreateOrder relays as the order ref. +func (s *Server) OrderURL() string { return s.url("/order/1") } + +// DNSToken returns the dns-01 challenge token the fake mints. +func (s *Server) DNSToken() string { return s.dnsToken } + +// HTTPToken returns the http-01 challenge token the fake mints. +func (s *Server) HTTPToken() string { return s.httpToken } + +// SetHoldPending freezes (or unfreezes) the order in pending after a +// challenge accept — provider-side validation that outlives the +// caller's finalize budget. +func (s *Server) SetHoldPending(v bool) { + s.mu.Lock() + defer s.mu.Unlock() + s.holdPending = v +} + +// SetFailValidation makes the next accepted challenge move the order +// to invalid. 
+func (s *Server) SetFailValidation(v bool) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	// Not one-shot: the flag stays armed until a caller clears it, so
+	// every challenge accepted while it is set fails the order.
+	s.failValidation = v
+}
+
+// SetFailFinalize makes the finalize endpoint answer 500 — a
+// provider-side outage mid-issuance.
+func (s *Server) SetFailFinalize(v bool) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.failFinalize = v
+}
+
+// SetOrderStatus force-sets the order state (e.g. "ready" to resume
+// a held validation, "valid" to simulate an already-issued order).
+func (s *Server) SetOrderStatus(status string) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.orderStatus = status
+}
+
+// Accepted returns the challenge kinds ("dns", "http") the client
+// answered — the assertion surface for the Verified contract.
+func (s *Server) Accepted() []string {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	// Hand back a copy so callers never alias the mutex-guarded slice.
+	return append([]string(nil), s.accepted...)
+}
+
+// Err returns the first protocol violation the fake observed (bad
+// JWS, malformed CSR, unexpected path), or nil.
+func (s *Server) Err() error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	if len(s.errs) == 0 {
+		return nil
+	}
+	return s.errs[0]
+}
+
+// url returns path appended to the base URL of the underlying
+// httptest server.
+func (s *Server) url(path string) string { return s.srv.URL + path }
+
+// recordErr appends err to the list of violations surfaced by Err.
+// It takes s.mu itself, so it must not be called with the lock held.
+func (s *Server) recordErr(err error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.errs = append(s.errs, err)
+}
+
+// jwsPayload extracts the base64url payload of a JWS request body
+// without verifying the signature. Empty payload = POST-as-GET.
+func (s *Server) jwsPayload(r *http.Request) []byte {
+	// Only the payload member is decoded; the protected header and
+	// signature members of the JWS are ignored by the fake.
+	var body struct {
+		Payload string `json:"payload"`
+	}
+	if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
+		s.recordErr(fmt.Errorf("acmetest: decode jws: %w", err))
+		return nil
+	}
+	if body.Payload == "" {
+		return nil
+	}
+	raw, err := base64.RawURLEncoding.DecodeString(body.Payload)
+	if err != nil {
+		s.recordErr(fmt.Errorf("acmetest: decode payload: %w", err))
+		return nil
+	}
+	return raw
+}
+
+// writeJSON answers with the given status and the JSON encoding of v,
+// attaching a fresh Replay-Nonce header to the response.
+func (s *Server) writeJSON(w http.ResponseWriter, status int, v any) {
+	// RFC 8555 §6.5.1: a Replay-Nonce is a base64url string and must
+	// not repeat. A formatted timestamp contains '.' (outside the
+	// base64url alphabet) and can collide, so mint random bytes.
+	nonce := make([]byte, 16)
+	if _, err := rand.Read(nonce); err != nil {
+		s.recordErr(fmt.Errorf("acmetest: mint nonce: %w", err))
+	}
+	w.Header().Set("Replay-Nonce", base64.RawURLEncoding.EncodeToString(nonce))
+	w.Header().Set("Content-Type", "application/json")
+	w.WriteHeader(status)
+	// Best effort: the status line is already committed, so an
+	// encode/write failure can no longer be reported to the client.
+	_ = json.NewEncoder(w).Encode(v)
+}
+
+// orderJSON renders the fake's single order object from the current
+// order status. The certificate URL is included only once the order
+// is valid.
+func (s *Server) orderJSON() map[string]any {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	o := map[string]any{
+		"status":         s.orderStatus,
+		"expires":        time.Now().Add(time.Hour).Format(time.RFC3339),
+		"identifiers":    []map[string]string{{"type": "dns", "value": "agent.example.com"}},
+		"authorizations": []string{s.url("/authz/1")},
+		"finalize":       s.url("/finalize/1"),
+	}
+	if s.orderStatus == statusValid {
+		o["certificate"] = s.url("/cert/1")
+	}
+	return o
+}
+
+// SetUnsupportedChallengesOnly makes authorizations offer only a
+// challenge type the adapter cannot satisfy (tls-alpn-01).
+func (s *Server) SetUnsupportedChallengesOnly(v bool) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.unsupportedChallengesOnly = v
+}
+
+// authzJSON renders the fake's single authorization. It reports
+// valid once any challenge has been accepted or the order has left
+// pending, and pending otherwise. The offered challenges are dns-01
+// and http-01, or tls-alpn-01 alone when
+// SetUnsupportedChallengesOnly is armed.
+func (s *Server) authzJSON() map[string]any {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	authzStatus := statusPending
+	if len(s.accepted) > 0 || s.orderStatus != statusPending {
+		authzStatus = statusValid
+	}
+	challenges := []map[string]string{
+		{"type": "dns-01", "url": s.url("/chal/dns"), "token": s.dnsToken, "status": statusPending},
+		{"type": "http-01", "url": s.url("/chal/http"), "token": s.httpToken, "status": statusPending},
+	}
+	if s.unsupportedChallengesOnly {
+		challenges = []map[string]string{
+			{"type": "tls-alpn-01", "url": s.url("/chal/alpn"), "token": s.dnsToken, "status": statusPending},
+		}
+	}
+	return map[string]any{
+		"status":     authzStatus,
+		"expires":    time.Now().Add(time.Hour).Format(time.RFC3339),
+		"identifier": map[string]string{"type": "dns", "value": "agent.example.com"},
+		"challenges": challenges,
+	}
+}
+
+// handle routes every request the fake serves. A path outside the
+// endpoints below is recorded as a protocol violation and answered
+// with 404.
+func (s *Server) handle(w http.ResponseWriter, r *http.Request) {
+	switch {
+	case r.URL.Path == "/dir":
+		s.writeJSON(w, http.StatusOK, map[string]string{
+			"newNonce":   s.url("/nonce"),
+			"newAccount": s.url("/acct"),
+			"newOrder":   s.url("/order"),
+			"revokeCert": s.url("/revoke"),
+			"keyChange":  s.url("/keychange"),
+		})
+	case r.URL.Path == "/nonce":
+		w.Header().Set("Replay-Nonce", "nonce-head")
+		w.WriteHeader(http.StatusOK)
+	case r.URL.Path == "/acct":
+		w.Header().Set("Location", s.url("/acct/1"))
+		s.writeJSON(w, http.StatusCreated, map[string]any{"status": "valid"})
+	case r.URL.Path == "/order":
+		_ = s.jwsPayload(r)
+		w.Header().Set("Location", s.OrderURL())
+		s.writeJSON(w, http.StatusCreated, s.orderJSON())
+	case r.URL.Path == "/order/1":
+		w.Header().Set("Retry-After", "1")
+		s.writeJSON(w, http.StatusOK, s.orderJSON())
+	case r.URL.Path == "/authz/1":
+		s.writeJSON(w, http.StatusOK, s.authzJSON())
+	case strings.HasPrefix(r.URL.Path, "/chal/"):
+		kind := strings.TrimPrefix(r.URL.Path, "/chal/")
+		s.mu.Lock()
+		s.accepted = append(s.accepted, kind)
+		switch {
+		case s.failValidation:
+			s.orderStatus = statusInvalid
+		case s.holdPending:
+			// stay pending — validation "running"
+		default:
+			s.orderStatus = statusReady
+		}
+		s.mu.Unlock()
+		// Echo the challenge that was actually answered rather than
+		// always claiming dns-01 with the DNS token. The tokens mirror
+		// the ones authzJSON hands out for each challenge URL.
+		chalType, token := "dns-01", s.dnsToken
+		switch kind {
+		case "http":
+			chalType, token = "http-01", s.httpToken
+		case "alpn":
+			chalType = "tls-alpn-01"
+		}
+		s.writeJSON(w, http.StatusOK, map[string]string{
+			"type":   chalType,
+			"url":    s.url(r.URL.Path),
+			"status": "processing",
+			"token":  token,
+		})
+	case r.URL.Path == "/finalize/1":
+		s.handleFinalize(w, r)
+	case r.URL.Path == "/cert/1":
+		_ = s.jwsPayload(r)
+		// Mint the chain before committing the status line, so a
+		// signing failure surfaces as a 500 instead of an empty 200
+		// the client would try to parse as a certificate chain.
+		chain, err := s.issueChain()
+		if err != nil {
+			s.recordErr(err)
+			w.WriteHeader(http.StatusInternalServerError)
+			return
+		}
+		w.Header().Set("Replay-Nonce", "nonce-cert")
+		w.Header().Set("Content-Type", "application/pem-certificate-chain")
+		w.WriteHeader(http.StatusOK)
+		_, _ = w.Write(chain)
+	default:
+		s.recordErr(fmt.Errorf("acmetest: unexpected path %s", r.URL.Path))
+		w.WriteHeader(http.StatusNotFound)
+	}
+}
+
+// handleFinalize validates the finalize JWS payload + CSR and flips
+// the order to valid (or answers 500 when SetFailFinalize is armed).
+// A malformed payload or CSR is recorded as a protocol violation and
+// answered with 400; the order status is left untouched in that case
+// so the fake never "issues" against a broken conversation.
+func (s *Server) handleFinalize(w http.ResponseWriter, r *http.Request) {
+	s.mu.Lock()
+	failFinalize := s.failFinalize
+	s.mu.Unlock()
+	if failFinalize {
+		s.writeJSON(w, http.StatusInternalServerError, map[string]string{
+			"type": "urn:ietf:params:acme:error:serverInternal", "detail": "boom",
+		})
+		return
+	}
+	// reject records the violation and answers with an ACME problem
+	// document of the given error type.
+	reject := func(errType string, err error) {
+		s.recordErr(err)
+		s.writeJSON(w, http.StatusBadRequest, map[string]string{
+			"type": "urn:ietf:params:acme:error:" + errType, "detail": err.Error(),
+		})
+	}
+	payload := s.jwsPayload(r)
+	var req struct {
+		CSR string `json:"csr"`
+	}
+	if err := json.Unmarshal(payload, &req); err != nil {
+		reject("malformed", fmt.Errorf("acmetest: finalize payload: %w", err))
+		return
+	}
+	csrDER, err := base64.RawURLEncoding.DecodeString(req.CSR)
+	if err != nil {
+		reject("badCSR", fmt.Errorf("acmetest: csr decode: %w", err))
+		return
+	}
+	if _, err := x509.ParseCertificateRequest(csrDER); err != nil {
+		reject("badCSR", fmt.Errorf("acmetest: csr parse: %w", err))
+		return
+	}
+	s.mu.Lock()
+	s.orderStatus = statusValid
+	s.mu.Unlock()
+	s.writeJSON(w, http.StatusOK, s.orderJSON())
+}
+
+// issueChain signs a leaf for the test FQDN with the fake root and
+// returns leaf+root PEM.
+func (s *Server) issueChain() ([]byte, error) { + leafKey, err := ecdsa.GenerateKey(elliptic.P256(), rand.Reader) + if err != nil { + return nil, err + } + tmpl := &x509.Certificate{ + SerialNumber: big.NewInt(4242), + Subject: pkix.Name{CommonName: "agent.example.com"}, + DNSNames: []string{"agent.example.com"}, + NotBefore: time.Now().Add(-time.Minute), + NotAfter: time.Now().Add(90 * 24 * time.Hour), + KeyUsage: x509.KeyUsageDigitalSignature, + ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, + } + leafDER, err := x509.CreateCertificate(rand.Reader, tmpl, s.rootCert, leafKey.Public(), s.rootKey) + if err != nil { + return nil, err + } + out := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: leafDER}) + out = append(out, pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: s.rootCert.Raw})...) + return out, nil +} diff --git a/internal/adapter/cert/acmetest/acmetest_test.go b/internal/adapter/cert/acmetest/acmetest_test.go new file mode 100644 index 0000000..8fe5948 --- /dev/null +++ b/internal/adapter/cert/acmetest/acmetest_test.go @@ -0,0 +1,69 @@ +package acmetest + +import ( + "net/http" + "strings" + "testing" +) + +// The fake's own contract: protocol violations it observes are +// surfaced via Err so adapter tests fail loudly instead of silently +// passing against a broken conversation. +func TestServer_RecordsProtocolViolations(t *testing.T) { + s, err := New() + if err != nil { + t.Fatal(err) + } + t.Cleanup(s.Close) + + if s.Err() != nil { + t.Fatal("fresh server must have no errors") + } + + // Unexpected path → 404 + recorded violation. 
+ resp, err := http.Get(s.url("/nope")) + if err != nil { + t.Fatal(err) + } + _ = resp.Body.Close() + if resp.StatusCode != http.StatusNotFound { + t.Errorf("status: %d", resp.StatusCode) + } + if s.Err() == nil || !strings.Contains(s.Err().Error(), "unexpected path") { + t.Errorf("violation not recorded: %v", s.Err()) + } +} + +func TestServer_RejectsMalformedJWS(t *testing.T) { + s, err := New() + if err != nil { + t.Fatal(err) + } + t.Cleanup(s.Close) + + // Not JSON at all. + resp, err := http.Post(s.url("/order"), "application/jose+json", strings.NewReader("not-json")) + if err != nil { + t.Fatal(err) + } + _ = resp.Body.Close() + if s.Err() == nil { + t.Error("malformed JWS must be recorded") + } + + // Valid JSON, payload not base64url. + s2, err := New() + if err != nil { + t.Fatal(err) + } + t.Cleanup(s2.Close) + resp2, err := http.Post(s2.url("/order"), "application/jose+json", + strings.NewReader(`{"protected":"x","payload":"!!!not-b64!!!","signature":"y"}`)) + if err != nil { + t.Fatal(err) + } + _ = resp2.Body.Close() + if s2.Err() == nil { + t.Error("bad payload encoding must be recorded") + } +} diff --git a/internal/adapter/cert/cert_test.go b/internal/adapter/cert/cert_test.go index 00ea152..55c7f6e 100644 --- a/internal/adapter/cert/cert_test.go +++ b/internal/adapter/cert/cert_test.go @@ -19,6 +19,7 @@ import ( anscrypto "github.com/godaddy/ans/internal/crypto" "github.com/godaddy/ans/internal/domain" + "github.com/godaddy/ans/internal/port" ) // ----- Test helpers ----- @@ -151,12 +152,28 @@ func TestSelfCA_RevokeAndIsRevoked(t *testing.T) { if ca.IsRevoked(serial) { t.Error("fresh CA should have no revocations") } - if err := ca.RevokeCertificate(context.Background(), serial, domain.RevocationKeyCompromise); err != nil { + if err := ca.RevokeCertificate(context.Background(), port.RevokeCertificateRequest{ + SerialNumber: serial, + Reason: domain.RevocationKeyCompromise, + }); err != nil { t.Fatalf("revoke: %v", err) } if !ca.IsRevoked(serial) 
{ t.Error("IsRevoked should be true after Revoke") } + // Idempotent per the port contract. + if err := ca.RevokeCertificate(context.Background(), port.RevokeCertificateRequest{ + SerialNumber: serial, + Reason: domain.RevocationKeyCompromise, + }); err != nil { + t.Fatalf("re-revoke must be idempotent: %v", err) + } + // Serial is mandatory. + if err := ca.RevokeCertificate(context.Background(), port.RevokeCertificateRequest{ + Reason: domain.RevocationKeyCompromise, + }); err == nil { + t.Error("want error for missing serial") + } } func TestSelfCA_PersistsRootAcrossRestarts(t *testing.T) { @@ -212,16 +229,45 @@ func TestNewServerSelfCA_RespectsTTLOption(t *testing.T) { } } -func TestServerSelfCA_IssueServerCertificate_And_GetCA(t *testing.T) { +func TestServerSelfCA_OrderLifecycle_And_GetCA(t *testing.T) { ca, err := NewServerSelfCA(t.TempDir(), "ServerOrg", 365) if err != nil { t.Fatalf("new: %v", err) } + // CreateOrder self-issues both challenge types with distinct + // tokens and a non-empty order ref. + order, err := ca.CreateOrder(context.Background(), "agent.example.com") + if err != nil { + t.Fatalf("create order: %v", err) + } + if order.OrderRef == "" || order.State != domain.OrderStatePending { + t.Errorf("order shape: ref=%q state=%q", order.OrderRef, order.State) + } + dns01, ok := order.ChallengeOfType(domain.ChallengeTypeDNS01) + if !ok || dns01.Token == "" { + t.Error("missing DNS-01 challenge") + } + http01, ok := order.ChallengeOfType(domain.ChallengeTypeHTTP01) + if !ok || http01.Token == "" { + t.Error("missing HTTP-01 challenge") + } + if dns01.Token == http01.Token { + t.Error("challenge tokens must be independent") + } + if order.ExpiresAt.Before(time.Now()) { + t.Error("order must expire in the future") + } + // CSR with DNS SAN matching expected FQDN. 
csrPEM := buildCSR(t, "agent.example.com", nil, []string{"agent.example.com"}) - issued, err := ca.IssueServerCertificate(context.Background(), csrPEM, "agent.example.com") + issued, err := ca.FinalizeOrder(context.Background(), port.FinalizeOrderRequest{ + OrderRef: order.OrderRef, + CSRPEM: csrPEM, + FQDN: "agent.example.com", + Verified: []domain.ChallengeType{domain.ChallengeTypeDNS01}, + }) if err != nil { - t.Fatalf("issue: %v", err) + t.Fatalf("finalize: %v", err) } if issued.CertPEM == "" || issued.ChainPEM == "" { t.Error("expected non-empty PEM output") @@ -277,16 +323,43 @@ func TestServerSelfCA_LoadRoot_MalformedKey(t *testing.T) { } } -func TestServerSelfCA_IssueServerCertificate_RejectsBadCSR(t *testing.T) { +func TestServerSelfCA_FinalizeOrder_RejectsBadCSR(t *testing.T) { ca, _ := NewServerSelfCA(t.TempDir(), "o", 365) // CN doesn't match, no DNS SAN → ValidateServerCSR rejects. csrPEM := buildCSR(t, "other.example.com", nil, nil) - _, err := ca.IssueServerCertificate(context.Background(), csrPEM, "agent.example.com") + _, err := ca.FinalizeOrder(context.Background(), port.FinalizeOrderRequest{ + CSRPEM: csrPEM, + FQDN: "agent.example.com", + }) if err == nil { t.Error("expected FQDN-mismatch rejection") } } +func TestServerSelfCA_CreateOrder_RequiresFQDN(t *testing.T) { + ca, _ := NewServerSelfCA(t.TempDir(), "o", 365) + if _, err := ca.CreateOrder(context.Background(), ""); err == nil { + t.Error("expected error for empty fqdn") + } +} + +func TestNewServerSelfCA_RespectsOrderTTLOption(t *testing.T) { + ca, err := NewServerSelfCA(t.TempDir(), "org", 365, WithOrderTTL(3*time.Hour)) + if err != nil { + t.Fatalf("new: %v", err) + } + if ca.orderTTL != 3*time.Hour { + t.Errorf("WithOrderTTL ignored: got %v", ca.orderTTL) + } + order, err := ca.CreateOrder(context.Background(), "agent.example.com") + if err != nil { + t.Fatal(err) + } + if remaining := time.Until(order.ExpiresAt); remaining > 3*time.Hour || remaining < 2*time.Hour { + t.Errorf("order 
expiry should honor the 3h TTL, got %v remaining", remaining) + } +} + // ----- X509Validator ----- func TestX509Validator_SkipChainVerify_HappyPath(t *testing.T) { diff --git a/internal/adapter/cert/le_live_smoke_test.go b/internal/adapter/cert/le_live_smoke_test.go new file mode 100644 index 0000000..24c160f --- /dev/null +++ b/internal/adapter/cert/le_live_smoke_test.go @@ -0,0 +1,63 @@ +package cert + +import ( + "os" + "strings" + "testing" + + "github.com/godaddy/ans/internal/domain" +) + +// TestLive_LetsEncryptStaging_CreateOrder talks to REAL Let's Encrypt +// staging: it registers a throwaway staging account, opens an order, +// and asserts the relayed challenges are LE's own (provider order +// URL, LE-minted token, computed DNS digest, key authorization). +// +// Opt-in via ANS_LE_LIVE_TEST=1 — it needs outbound network and +// consumes (generous) staging rate limits, so it is not part of the +// hermetic suite; the in-process acmetest fake covers the protocol +// there. Order *creation* requires no domain ownership — only +// satisfying a challenge does — which is what makes this smoke test +// runnable from anywhere: +// +// ANS_LE_LIVE_TEST=1 go test ./internal/adapter/cert/ -run TestLive -v +// +// Completing a full issuance against staging additionally requires a +// public domain you control: run the RA with `ca.server.type: acme` +// pointed at the staging directory and publish the relayed challenge +// for your real FQDN. 
+func TestLive_LetsEncryptStaging_CreateOrder(t *testing.T) { + if os.Getenv("ANS_LE_LIVE_TEST") == "" { + t.Skip("live Let's Encrypt staging test; set ANS_LE_LIVE_TEST=1 to run") + } + issuer, err := NewACMEIssuer( + "https://acme-staging-v02.api.letsencrypt.org/directory", + "", t.TempDir()) + if err != nil { + t.Fatal(err) + } + order, err := issuer.CreateOrder(t.Context(), "agent.ans-issuer-smoke-2026.com") + if err != nil { + t.Fatalf("create order against real LE staging: %v", err) + } + t.Logf("LE staging order ref: %s", order.OrderRef) + t.Logf("order expires: %s", order.ExpiresAt) + if !strings.Contains(order.OrderRef, "acme-staging-v02.api.letsencrypt.org") { + t.Errorf("order ref is not a real LE staging URL: %q", order.OrderRef) + } + dns01, ok := order.ChallengeOfType(domain.ChallengeTypeDNS01) + if !ok { + t.Fatal("LE did not offer dns-01") + } + t.Logf("dns-01 token (LE-minted): %s", dns01.Token) + t.Logf("TXT value to publish (digest): %s", dns01.EffectiveDNSRecordValue()) + t.Logf("key authorization: %s", dns01.KeyAuthorization) + if dns01.EffectiveDNSRecordValue() == dns01.Token || dns01.KeyAuthorization == "" { + t.Error("LE challenges must carry the computed digest + key authorization") + } + http01, ok := order.ChallengeOfType(domain.ChallengeTypeHTTP01) + if !ok { + t.Fatal("LE did not offer http-01") + } + t.Logf("http-01 path: %s", http01.EffectiveHTTPPath()) +} diff --git a/internal/adapter/cert/selfca.go b/internal/adapter/cert/selfca.go index d3ea287..e45ae0d 100644 --- a/internal/adapter/cert/selfca.go +++ b/internal/adapter/cert/selfca.go @@ -21,7 +21,6 @@ import ( "time" anscrypto "github.com/godaddy/ans/internal/crypto" - "github.com/godaddy/ans/internal/domain" "github.com/godaddy/ans/internal/port" ) @@ -131,17 +130,22 @@ func (c *SelfCA) IssueIdentityCertificate( }, nil } -// RevokeCertificate marks a certificate as revoked. 
The in-process CRL -// is not yet published; revocations are tracked in memory and surfaced -// through transparency-log events. +// RevokeCertificate marks a certificate as revoked by serial. +// Idempotent per the port contract. The in-process CRL is not +// published; revocations are tracked in memory (and authoritatively +// through transparency-log events) — production deployments that need +// CRL/OCSP distribution swap in a cloud private-CA adapter at this +// port. func (c *SelfCA) RevokeCertificate( ctx context.Context, - serialNumber string, - reason domain.RevocationReason, + req port.RevokeCertificateRequest, ) error { + if req.SerialNumber == "" { + return errors.New("cert: revoke: serial number is required") + } c.mu.Lock() defer c.mu.Unlock() - c.revoked[serialNumber] = struct{}{} + c.revoked[req.SerialNumber] = struct{}{} return nil } @@ -181,7 +185,7 @@ func (c *SelfCA) loadRoot(keyPath, certPath string) error { return fmt.Errorf("cert: read root key: %w", err) } keyBlock, _ := pem.Decode(keyBytes) - if keyBlock == nil || keyBlock.Type != "PRIVATE KEY" { + if keyBlock == nil || keyBlock.Type != pemTypePrivateKey { return errors.New("cert: root key is not a PKCS#8 PRIVATE KEY PEM") } key, err := x509.ParsePKCS8PrivateKey(keyBlock.Bytes) @@ -253,7 +257,7 @@ func (c *SelfCA) createRoot(keyPath, certPath string) error { return fmt.Errorf("cert: marshal root key: %w", err) } if err := os.WriteFile(keyPath, - pem.EncodeToMemory(&pem.Block{Type: "PRIVATE KEY", Bytes: privDER}), 0o600); err != nil { + pem.EncodeToMemory(&pem.Block{Type: pemTypePrivateKey, Bytes: privDER}), 0o600); err != nil { return fmt.Errorf("cert: write root key: %w", err) } // 0o644 (world-readable) is intentional: the root cert is a diff --git a/internal/adapter/cert/serverselfca.go b/internal/adapter/cert/serverselfca.go index 631ef85..642c735 100644 --- a/internal/adapter/cert/serverselfca.go +++ b/internal/adapter/cert/serverselfca.go @@ -8,6 +8,7 @@ import ( "crypto/rand" 
// randomToken draws 32 bytes from crypto/rand and returns them encoded
// as unpadded base64url. It backs the self-issued challenge tokens and
// order refs. The only failure mode is the random source returning an
// error, which is passed through unchanged.
func randomToken() (string, error) {
	var raw [32]byte
	if _, err := rand.Read(raw[:]); err != nil {
		return "", err
	}
	return base64.RawURLEncoding.EncodeToString(raw[:]), nil
}
// // Kept distinct from SelfCA (the identity CA) so operators can // rotate the two roots independently and publish the server-CA @@ -41,6 +62,7 @@ type ServerSelfCA struct { org string validity time.Duration serverTTL time.Duration + orderTTL time.Duration mu sync.RWMutex rootCert *x509.Certificate rootKey crypto.Signer @@ -56,6 +78,14 @@ func WithServerCertTTL(d time.Duration) ServerSelfCAOption { return func(c *ServerSelfCA) { c.serverTTL = d } } +// WithOrderTTL sets the lifetime of certificate orders (the window +// the domain owner has to publish a challenge artifact). Default is +// 7 days, matching ACME-provider order lifetimes; the registration +// and renewal flows clamp their own shorter windows on top. +func WithOrderTTL(d time.Duration) ServerSelfCAOption { + return func(c *ServerSelfCA) { c.orderTTL = d } +} + // NewServerSelfCA opens (or creates) a self-signed server CA in the // given directory. The root certificate has the organization name // set to org and a validity of validityDays days. @@ -72,6 +102,7 @@ func NewServerSelfCA(dataDir, org string, validityDays int, opts ...ServerSelfCA org: org, validity: time.Duration(validityDays) * 24 * time.Hour, serverTTL: 90 * 24 * time.Hour, + orderTTL: 7 * 24 * time.Hour, } for _, opt := range opts { opt(c) @@ -85,15 +116,54 @@ func NewServerSelfCA(dataDir, org string, validityDays int, opts ...ServerSelfCA return c, nil } -// IssueServerCertificate signs the given server CSR. The resulting -// certificate has the provided FQDN as a DNS SAN and the standard -// TLS server-auth EKU, and is valid for serverTTL. -func (c *ServerSelfCA) IssueServerCertificate( +// CreateOrder opens a certificate order for the FQDN, self-issuing +// the domain-control challenge tokens. Because this CA is its own +// trust root there is no remote order to track — the order ref is a +// random handle that exists so the lifecycle (and its persistence) +// is identical to external providers'. 
Both DNS-01 and HTTP-01 +// challenges are offered; the owner satisfies whichever is easier to +// publish, exactly as with an ACME provider. +func (c *ServerSelfCA) CreateOrder(ctx context.Context, fqdn string) (*domain.CertificateOrder, error) { + if fqdn == "" { + return nil, errors.New("cert: create order: fqdn is required") + } + dns01, err := randomToken() + if err != nil { + return nil, fmt.Errorf("cert: dns01 token: %w", err) + } + http01, err := randomToken() + if err != nil { + return nil, fmt.Errorf("cert: http01 token: %w", err) + } + ref, err := randomToken() + if err != nil { + return nil, fmt.Errorf("cert: order ref: %w", err) + } + return &domain.CertificateOrder{ + OrderRef: "selfca-" + ref, + State: domain.OrderStatePending, + Challenges: []domain.Challenge{ + {Type: domain.ChallengeTypeDNS01, Token: dns01}, + {Type: domain.ChallengeTypeHTTP01, Token: http01}, + }, + ExpiresAt: time.Now().Add(c.orderTTL), + }, nil +} + +// FinalizeOrder signs the order's server CSR. The resulting +// certificate has the request FQDN as a DNS SAN and the standard TLS +// server-auth EKU, and is valid for serverTTL. Signing is synchronous +// and never returns port.ErrOrderPending. +// +// Domain validation: the RA's challenge-presence gate runs before +// every FinalizeOrder call and is authoritative for this CA (it IS +// the CA's validation — there is no separate provider to convince), +// so req.Verified is not re-checked here. +func (c *ServerSelfCA) FinalizeOrder( ctx context.Context, - csrPEM string, - fqdn string, + req port.FinalizeOrderRequest, ) (*port.IssuedCert, error) { - csr, err := anscrypto.ValidateServerCSR(csrPEM, fqdn) + csr, err := anscrypto.ValidateServerCSR(req.CSRPEM, req.FQDN) if err != nil { return nil, err } @@ -118,7 +188,7 @@ func (c *ServerSelfCA) IssueServerCertificate( // (browsers, curl) demand before trusting a cert for HTTPS. // Differs from the identity CA's ClientAuth EKU. 
ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth}, - DNSNames: []string{fqdn}, + DNSNames: []string{req.FQDN}, // Subject is lifted from the CSR; some SDK-generated CSRs set // CN to a placeholder rather than the FQDN. We keep CN for // back-compat but the DNS SAN is the authoritative binding. @@ -175,7 +245,7 @@ func (c *ServerSelfCA) loadRoot(keyPath, certPath string) error { return fmt.Errorf("cert: read server-root key: %w", err) } keyBlock, _ := pem.Decode(keyBytes) - if keyBlock == nil || keyBlock.Type != "PRIVATE KEY" { + if keyBlock == nil || keyBlock.Type != pemTypePrivateKey { return errors.New("cert: server-root key is not a PKCS#8 PRIVATE KEY PEM") } key, err := x509.ParsePKCS8PrivateKey(keyBlock.Bytes) @@ -242,7 +312,7 @@ func (c *ServerSelfCA) createRoot(keyPath, certPath string) error { return fmt.Errorf("cert: marshal server-root key: %w", err) } if err := os.WriteFile(keyPath, - pem.EncodeToMemory(&pem.Block{Type: "PRIVATE KEY", Bytes: privDER}), 0o600); err != nil { + pem.EncodeToMemory(&pem.Block{Type: pemTypePrivateKey, Bytes: privDER}), 0o600); err != nil { return fmt.Errorf("cert: write server-root key: %w", err) } // 0o644 (world-readable) is intentional: the server-root cert diff --git a/internal/adapter/challenge/http.go b/internal/adapter/challenge/http.go new file mode 100644 index 0000000..2359bf5 --- /dev/null +++ b/internal/adapter/challenge/http.go @@ -0,0 +1,207 @@ +// Package challenge provides adapters for verifying domain-control +// challenge artifacts the domain owner has published. Verification +// only — ANS never publishes challenge artifacts on the owner's +// behalf. +package challenge + +import ( + "context" + "errors" + "fmt" + "io" + "net" + "net/http" + "strings" + "syscall" + "time" +) + +// maxChallengeBodyBytes bounds the response read. Key authorizations +// are ~130 bytes; anything past a few KiB is not a challenge artifact. 
// maxChallengeBodyBytes bounds the response read. Key authorizations
// are ~130 bytes; anything past a few KiB is not a challenge artifact.
const maxChallengeBodyBytes = 8 * 1024

// errBlockedDialTarget is returned by the SSRF guard when a dial
// resolves to a non-public address. It is internal — callers see only
// "challenge not satisfied" — but distinct so the guard is testable.
var errBlockedDialTarget = errors.New("challenge: refusing to dial a non-public address")

// HTTPVerifier implements port.HTTPChallengeVerifier by fetching the
// challenge URL over plain HTTP and comparing the body to the
// expected content. Plain HTTP (port 80) is the RFC 8555 §8.3 shape —
// the agent cannot present a valid TLS cert for the FQDN yet, since
// obtaining one is the very thing being validated.
//
// The fetch target is the agent's own registrant-supplied FQDN, so
// the default client is hardened against SSRF: it refuses HTTP
// redirects, never routes through an environment-configured proxy,
// and refuses to connect to any address isPublicIP rejects. The guard
// runs on every dial via the dialer Control hook, so it sees the
// post-DNS-resolution IP rather than the hostname in the URL.
type HTTPVerifier struct {
	client *http.Client
	// urlFor builds the fetch URL from (fqdn, path). The default is
	// "http://" + fqdn + path; tests substitute an httptest server.
	urlFor func(fqdn, path string) string
}

// HTTPVerifierOption configures the verifier at construction time.
type HTTPVerifierOption func(*HTTPVerifier)

// WithHTTPClient overrides the HTTP client (timeouts, transport).
// Intended for tests; production uses the SSRF-hardened default. A
// nil client is ignored so the hardened default stays in place
// instead of producing a nil-pointer panic on the first fetch.
func WithHTTPClient(c *http.Client) HTTPVerifierOption {
	return func(v *HTTPVerifier) {
		if c != nil {
			v.client = c
		}
	}
}

// WithURLBuilder overrides how the challenge URL is derived from the
// FQDN and path. Intended for tests pointing at an httptest server;
// production uses the RFC 8555 default. A nil builder is ignored.
func WithURLBuilder(f func(fqdn, path string) string) HTTPVerifierOption {
	return func(v *HTTPVerifier) {
		if f != nil {
			v.urlFor = f
		}
	}
}

// NewHTTPVerifier constructs an HTTPVerifier with a 10s overall
// client timeout, redirect refusal, and the guarded transport
// installed, then applies opts in order.
func NewHTTPVerifier(opts ...HTTPVerifierOption) *HTTPVerifier {
	v := &HTTPVerifier{
		client: &http.Client{
			Timeout:       10 * time.Second,
			CheckRedirect: refuseRedirects,
			Transport:     guardedTransport(),
		},
		urlFor: func(fqdn, path string) string { return "http://" + fqdn + path },
	}
	for _, opt := range opts {
		opt(v)
	}
	return v
}

// refuseRedirects is the http.Client CheckRedirect hook. Following a
// redirect would reopen the SSRF surface the dial guard closes (a
// public host 302-ing to an internal one), so it always returns an
// error; http.Client surfaces that from Do and VerifyHTTPChallenge
// reports it as not-published.
func refuseRedirects(_ *http.Request, _ []*http.Request) error {
	return errors.New("challenge: refusing to follow redirect")
}

// guardDial is the net.Dialer Control hook behind the SSRF guard. The
// address it receives is the concrete "ip:port" about to be connected
// to; anything that does not parse as an IP, or that isPublicIP
// rejects, fails with errBlockedDialTarget.
func guardDial(_, address string, _ syscall.RawConn) error {
	host, _, err := net.SplitHostPort(address)
	if err != nil {
		return fmt.Errorf("challenge: parse dial address %q: %w", address, err)
	}
	ip := net.ParseIP(host)
	if ip == nil || !isPublicIP(ip) {
		return fmt.Errorf("%w: %s", errBlockedDialTarget, host)
	}
	return nil
}

// guardedTransport returns an http.Transport whose dialer rejects any
// connection to a non-public address (see guardDial).
//
// The transport is cloned from http.DefaultTransport, whose Proxy
// field is http.ProxyFromEnvironment. That is cleared here: with a
// proxy configured the only address the dial hook would ever see is
// the proxy's, so the challenge target itself would go unchecked
// (or, with an internal proxy, every fetch would be refused).
func guardedTransport() *http.Transport {
	d := &net.Dialer{
		Timeout: 5 * time.Second,
		Control: guardDial,
	}
	base, ok := http.DefaultTransport.(*http.Transport)
	if !ok {
		// The stdlib default is always *http.Transport; fall back to a
		// fresh one if some test rewired it.
		base = &http.Transport{}
	}
	t := base.Clone()
	t.Proxy = nil
	t.DialContext = d.DialContext
	return t
}

// isPublicIP reports whether ip is a globally-routable unicast address
// safe to dial for challenge verification. It fails closed: a nil or
// malformed ip, and any IPv6 address outside the 2000::/3 global
// unicast block, is rejected.
//
// The stdlib predicates reject loopback, link-local (which covers the
// 169.254.169.254 cloud-metadata endpoint), RFC 1918 private, ULA,
// unspecified, and multicast. On top of those it rejects:
//   - 0.0.0.0/8       — "this network"; only 0.0.0.0 itself is caught
//     by IsUnspecified.
//   - 100.64.0.0/10   — RFC 6598 carrier-grade NAT.
//   - 192.0.0.0/24    — IETF protocol assignments.
//   - 192.0.2.0/24, 198.51.100.0/24, 203.0.113.0/24 — documentation.
//   - 198.18.0.0/15   — RFC 2544 benchmarking.
//   - 240.0.0.0/4     — reserved, including limited broadcast.
//   - 2002::/16 (6to4) and 2001::/32 (Teredo) — both embed an IPv4
//     address.
//   - 2001:db8::/32   — documentation.
//
// NAT64 (64:ff9b::/96), IPv4-compatible ::/96, and site-local
// fec0::/10 all lie outside 2000::/3 and are rejected by that rule.
// Ranges are matched by byte math rather than a parsed CIDR table to
// keep the guard free of package-level state.
func isPublicIP(ip net.IP) bool {
	if ip.IsLoopback() || ip.IsLinkLocalUnicast() || ip.IsLinkLocalMulticast() ||
		ip.IsMulticast() || ip.IsUnspecified() || ip.IsPrivate() {
		return false
	}
	// To4 also normalizes IPv4-mapped IPv6 (::ffff:a.b.c.d).
	if v4 := ip.To4(); v4 != nil {
		switch {
		case v4[0] == 0: // 0.0.0.0/8
			return false
		case v4[0] >= 240: // 240.0.0.0/4, incl. 255.255.255.255
			return false
		case v4[0] == 100 && v4[1]&0xc0 == 64: // 100.64.0.0/10 (CGNAT)
			return false
		case v4[0] == 192 && v4[1] == 0 && v4[2] == 0: // 192.0.0.0/24
			return false
		case v4[0] == 192 && v4[1] == 0 && v4[2] == 2: // 192.0.2.0/24 (TEST-NET-1)
			return false
		case v4[0] == 198 && v4[1]&0xfe == 18: // 198.18.0.0/15
			return false
		case v4[0] == 198 && v4[1] == 51 && v4[2] == 100: // 198.51.100.0/24 (TEST-NET-2)
			return false
		case v4[0] == 203 && v4[1] == 0 && v4[2] == 113: // 203.0.113.0/24 (TEST-NET-3)
			return false
		}
		return true
	}
	v6 := ip.To16()
	if v6 == nil {
		// Neither 4 nor 16 bytes: not an IP address at all.
		return false
	}
	if v6[0]&0xe0 != 0x20 {
		// Outside 2000::/3.
		return false
	}
	switch {
	case v6[0] == 0x20 && v6[1] == 0x02: // 2002::/16 (6to4)
		return false
	case v6[0] == 0x20 && v6[1] == 0x01 && v6[2] == 0x00 && v6[3] == 0x00: // 2001::/32 (Teredo)
		return false
	case v6[0] == 0x20 && v6[1] == 0x01 && v6[2] == 0x0d && v6[3] == 0xb8: // 2001:db8::/32
		return false
	}
	return true
}

// challengeBodyMatches reports whether body is the expected challenge
// content, ignoring trailing CR, LF, space, and tab. A body longer
// than maxChallengeBodyBytes never matches: the caller reads one byte
// past the cap precisely so an oversize response can be told apart
// from one that merely fills it.
func challengeBodyMatches(body []byte, expectedContent string) bool {
	if len(body) > maxChallengeBodyBytes {
		return false
	}
	return strings.TrimRight(string(body), "\r\n \t") == expectedContent
}

// VerifyHTTPChallenge fetches the challenge URL and reports whether
// the body matches the expected content. Trailing whitespace is
// tolerated; any other difference, including a body larger than
// maxChallengeBodyBytes, is a mismatch.
//
// It returns an error only for empty arguments or a URL that cannot
// be turned into a request. Transport errors (including refused
// redirects and blocked dial targets), non-200 statuses, and body
// read errors all report (false, nil): from the caller's side they
// are indistinguishable from "not published yet".
func (v *HTTPVerifier) VerifyHTTPChallenge(ctx context.Context, fqdn, path, expectedContent string) (bool, error) {
	if fqdn == "" || path == "" || expectedContent == "" {
		return false, errors.New("challenge: fqdn, path, and expectedContent are required")
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, v.urlFor(fqdn, path), http.NoBody)
	if err != nil {
		return false, fmt.Errorf("challenge: build request: %w", err)
	}
	resp, err := v.client.Do(req)
	if err != nil {
		// Connection refused / DNS failure / timeout / blocked
		// non-public target / refused redirect: the artifact is not
		// retrievable, so control is not proven. Deliberately not
		// surfaced as an error.
		return false, nil
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode != http.StatusOK {
		return false, nil
	}
	// Read one byte past the cap so truncation is detectable instead
	// of silently comparing a prefix of an oversize body.
	body, err := io.ReadAll(io.LimitReader(resp.Body, maxChallengeBodyBytes+1))
	if err != nil {
		return false, nil
	}
	return challengeBodyMatches(body, expectedContent), nil
}
// testVerifier returns a verifier whose URL builder ignores the FQDN
// and targets srv, and whose HTTP client is a plain 2s-timeout client
// (so the loopback httptest server is reachable).
func testVerifier(srv *httptest.Server) *HTTPVerifier {
	return NewHTTPVerifier(
		WithHTTPClient(&http.Client{Timeout: 2 * time.Second}),
		WithURLBuilder(func(_, path string) string { return srv.URL + path }),
	)
}

// TestVerifyHTTPChallenge_Match: a 200 response whose body equals the
// expected content verifies.
func TestVerifyHTTPChallenge_Match(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path != "/.well-known/acme-challenge/tok" {
			http.NotFound(w, r)
			return
		}
		_, _ = w.Write([]byte("tok.keyauth"))
	}))
	t.Cleanup(srv.Close)

	ok, err := testVerifier(srv).VerifyHTTPChallenge(context.Background(),
		"agent.example.com", "/.well-known/acme-challenge/tok", "tok.keyauth")
	if err != nil || !ok {
		t.Fatalf("want match, got ok=%v err=%v", ok, err)
	}
}

// TestVerifyHTTPChallenge_TrailingNewlineTolerated: a trailing CRLF on
// the served body does not break the match.
func TestVerifyHTTPChallenge_TrailingNewlineTolerated(t *testing.T) {
	// RFC 8555 §8.3 lets validators accept a trailing newline — shells
	// and editors love appending one to challenge files.
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		_, _ = w.Write([]byte("tok.keyauth\r\n"))
	}))
	t.Cleanup(srv.Close)

	ok, err := testVerifier(srv).VerifyHTTPChallenge(context.Background(),
		"agent.example.com", "/p", "tok.keyauth")
	if err != nil || !ok {
		t.Fatalf("want match with trailing newline, got ok=%v err=%v", ok, err)
	}
}

// TestVerifyHTTPChallenge_Mismatch: a 200 response with different
// content reports (false, nil).
func TestVerifyHTTPChallenge_Mismatch(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		_, _ = w.Write([]byte("something-else"))
	}))
	t.Cleanup(srv.Close)

	ok, err := testVerifier(srv).VerifyHTTPChallenge(context.Background(),
		"agent.example.com", "/p", "tok.keyauth")
	if err != nil || ok {
		t.Fatalf("want mismatch, got ok=%v err=%v", ok, err)
	}
}

// TestVerifyHTTPChallenge_NotFound: a non-200 status reports
// (false, nil) rather than an error.
func TestVerifyHTTPChallenge_NotFound(t *testing.T) {
	srv := httptest.NewServer(http.NotFoundHandler())
	t.Cleanup(srv.Close)

	ok, err := testVerifier(srv).VerifyHTTPChallenge(context.Background(),
		"agent.example.com", "/p", "tok")
	if err != nil || ok {
		t.Fatalf("404 must report not-published, got ok=%v err=%v", ok, err)
	}
}

// TestVerifyHTTPChallenge_UnreachableHost: a transport-level failure
// reports (false, nil) rather than an error.
func TestVerifyHTTPChallenge_UnreachableHost(t *testing.T) {
	srv := httptest.NewServer(http.NotFoundHandler())
	srv.Close() // immediately dead — connection refused

	ok, err := testVerifier(srv).VerifyHTTPChallenge(context.Background(),
		"agent.example.com", "/p", "tok")
	if err != nil || ok {
		t.Fatalf("unreachable host must report not-published, got ok=%v err=%v", ok, err)
	}
}

// TestVerifyHTTPChallenge_InputValidation: each of the three string
// arguments is required; an empty one yields an error.
func TestVerifyHTTPChallenge_InputValidation(t *testing.T) {
	v := NewHTTPVerifier()
	for _, tc := range []struct{ fqdn, path, expected string }{
		{"", "/p", "tok"},
		{"agent.example.com", "", "tok"},
		{"agent.example.com", "/p", ""},
	} {
		if _, err := v.VerifyHTTPChallenge(context.Background(), tc.fqdn, tc.path, tc.expected); err == nil {
			t.Errorf("want error for %+v", tc)
		}
	}
}

// TestVerifyHTTPChallenge_BadURL: a URL builder that yields an
// unparseable URL surfaces as an error (the one non-validation error
// path), not as not-published.
func TestVerifyHTTPChallenge_BadURL(t *testing.T) {
	v := NewHTTPVerifier(WithURLBuilder(func(_, _ string) string { return "http://[::1]:namedport/x" }))
	if _, err := v.VerifyHTTPChallenge(context.Background(), "a", "/p", "tok"); err == nil {
		t.Error("want request-build error for malformed URL")
	}
}

// TestVerifyHTTPChallenge_DefaultURLBuilder exercises the default
// "http://" + fqdn + path builder with an unguarded client.
func TestVerifyHTTPChallenge_DefaultURLBuilder(t *testing.T) {
	// The default builder targets plain HTTP on the FQDN itself per
	// RFC 8555 §8.3. Point it at a guaranteed-closed local port via
	// the fqdn argument: connection refused == not published.
	v := NewHTTPVerifier(WithHTTPClient(&http.Client{Timeout: 500 * time.Millisecond}))
	ok, err := v.VerifyHTTPChallenge(context.Background(), "127.0.0.1:1", "/p", "tok")
	if err != nil || ok {
		t.Fatalf("want not-published from default builder, got ok=%v err=%v", ok, err)
	}
}

// TestVerifyHTTPChallenge_TruncatedBody: a body read error reports
// (false, nil).
func TestVerifyHTTPChallenge_TruncatedBody(t *testing.T) {
	// A response that dies mid-body (Content-Length promises more
	// than arrives) surfaces as a read error → not published.
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.Header().Set("Content-Length", "100")
		_, _ = w.Write([]byte("short"))
		// Flush the headers and partial body to the client, then
		// hijack and close the connection underneath it.
		if f, ok := w.(http.Flusher); ok {
			f.Flush()
		}
		if hj, ok := w.(http.Hijacker); ok {
			conn, _, herr := hj.Hijack()
			if herr == nil {
				_ = conn.Close()
			}
		}
	}))
	t.Cleanup(srv.Close)

	ok, err := testVerifier(srv).VerifyHTTPChallenge(context.Background(),
		"agent.example.com", "/p", "tok")
	if err != nil || ok {
		t.Fatalf("truncated body must report not-published, got ok=%v err=%v", ok, err)
	}
}

// TestVerifyHTTPChallenge_OversizeBodyMismatch: a body of twice the
// read cap reports (false, nil).
func TestVerifyHTTPChallenge_OversizeBodyMismatch(t *testing.T) {
	// Bodies past the read cap cannot match a ~130-byte key
	// authorization; the limited read keeps memory bounded and the
	// comparison fails.
	big := make([]byte, maxChallengeBodyBytes*2)
	for i := range big {
		big[i] = 'a'
	}
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		_, _ = w.Write(big)
	}))
	t.Cleanup(srv.Close)

	ok, err := testVerifier(srv).VerifyHTTPChallenge(context.Background(),
		"agent.example.com", "/p", "tok")
	if err != nil || ok {
		t.Fatalf("oversize body must mismatch, got ok=%v err=%v", ok, err)
	}
}

// --- SSRF guard ---

// TestSSRFGuard_BlocksNonPublicDialTargets verifies the DEFAULT
// (production) verifier refuses to connect to non-public addresses.
// The httptest server binds 127.0.0.1, and we point the URL builder
// at it via the host:port — the dialer Control hook must reject the
// loopback dial, so the fetch fails closed (not-published) rather
// than reaching an internal service.
func TestSSRFGuard_BlocksNonPublicDialTargets(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		_, _ = w.Write([]byte("tok")) // would "match" if the guard let us connect
	}))
	t.Cleanup(srv.Close)
	hostPort := strings.TrimPrefix(srv.URL, "http://")

	// Default verifier (guarded transport), but route the FQDN to the
	// loopback test server. A registrant pointing their FQDN at an
	// internal/loopback IP is exactly the SSRF case being closed.
	v := NewHTTPVerifier(WithURLBuilder(func(_, path string) string {
		return "http://" + hostPort + path
	}))
	ok, err := v.VerifyHTTPChallenge(context.Background(), "agent.example.com", "/p", "tok")
	if err != nil || ok {
		t.Fatalf("loopback dial must be blocked (fail closed), got ok=%v err=%v", ok, err)
	}
}

// TestIsPublicIP pins isPublicIP's verdict for a table of literal
// addresses. Cases live in a map, so they run in unspecified order.
func TestIsPublicIP(t *testing.T) {
	cases := map[string]bool{
		"8.8.8.8":         true,  // public
		"1.1.1.1":         true,  // public
		"2606:4700::1111": true,  // public v6
		"127.0.0.1":       false, // loopback
		"::1":             false, // loopback v6
		"169.254.169.254": false, // link-local (cloud metadata)
		"10.0.0.1":        false, // RFC 1918
		"192.168.1.1":     false, // RFC 1918
		"172.16.0.1":      false, // RFC 1918
		"fd00::1":         false, // unique-local
		"0.0.0.0":         false, // unspecified
		"224.0.0.1":       false, // multicast
		// Ranges Go's stdlib predicates miss (isPublicIP matches these
		// itself):
		"100.64.0.1":      false, // RFC 6598 CGNAT (cloud-internal)
		"100.127.255.255": false, // RFC 6598 CGNAT upper edge
		"192.0.0.1":       false, // RFC 6890 IETF protocol assignments
		"198.18.0.1":      false, // RFC 2544 benchmarking
		// 6to4 / NAT64 embedding internal IPv4 must fail closed:
		"2002:7f00:0001::": false, // 6to4 wrapping 127.0.0.1
		"2002:0a00:0001::": false, // 6to4 wrapping 10.0.0.1
		"2002:0808:0808::": false, // 6to4 even wrapping public 8.8.8.8 (whole prefix blocked)
		"64:ff9b::7f00:1":  false, // NAT64 wrapping 127.0.0.1
		"64:ff9b::a00:1":   false, // NAT64 wrapping 10.0.0.1
		// IPv4-mapped IPv6 normalizes to its inner v4 (must still reject
		// loopback/private):
		"::ffff:127.0.0.1": false, // mapped loopback
		"::ffff:10.0.0.1":  false, // mapped RFC 1918
	}
	for s, want := range cases {
		ip := net.ParseIP(s)
		if ip == nil {
			t.Fatalf("bad test IP %q", s)
		}
		if got := isPublicIP(ip); got != want {
			t.Errorf("isPublicIP(%s): got %v want %v", s, got, want)
		}
	}
}

// TestSSRFGuard_RefusesRedirects confirms the verifier does not follow
// a redirect (which could pivot a public host to an internal one).
func TestSSRFGuard_RefusesRedirects(t *testing.T) {
	srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		// Emit a 302 directly (not http.Redirect, which derefs the
		// request for relative-path resolution) so the client's
		// CheckRedirect hook is what stops the chain.
		w.Header().Set("Location", "http://example.com/elsewhere")
		w.WriteHeader(http.StatusFound)
	}))
	t.Cleanup(srv.Close)
	// Use a guarded client's redirect policy but a permissive dialer so
	// the FIRST hop (to the test server) connects; the redirect itself
	// must be refused.
	v := NewHTTPVerifier(
		WithHTTPClient(&http.Client{CheckRedirect: refuseRedirects, Timeout: 2 * time.Second}),
		WithURLBuilder(func(_, path string) string { return srv.URL + path }),
	)
	ok, err := v.VerifyHTTPChallenge(context.Background(), "agent.example.com", "/p", "tok")
	if err != nil || ok {
		t.Fatalf("redirect must not be followed (fail closed), got ok=%v err=%v", ok, err)
	}
}
status: type: string - enum: [PENDING_VALIDATION, PENDING_DNS] + enum: [PENDING_VALIDATION, PENDING_CERTS, PENDING_DNS] ansName: type: string example: ans://v1.0.0.external-domain.com @@ -1246,7 +1246,7 @@ components: properties: action: type: string - enum: [CONFIGURE_DNS, CONFIGURE_HTTP, VERIFY_DNS, VALIDATE_DOMAIN, WAIT] + enum: [CONFIGURE_DNS, CONFIGURE_HTTP, VERIFY_DNS, VALIDATE_DOMAIN, WAIT, CANCEL] description: type: string endpoint: diff --git a/internal/adapter/store/sqlite/agent.go b/internal/adapter/store/sqlite/agent.go index a06be3f..f76f6d4 100644 --- a/internal/adapter/store/sqlite/agent.go +++ b/internal/adapter/store/sqlite/agent.go @@ -24,6 +24,14 @@ type AgentStore struct { func NewAgentStore(db *DB) *AgentStore { return &AgentStore{db: db} } // agentRow maps a single agent_registrations row for scanning. +// +// The `acme_dns01_token` column is legacy: rows written before +// migration 006 carried a single RA-generated DNS-01 token instead of +// a certificate order. Readers synthesize a self-issued order from it +// when the order columns are NULL; writers only fill the order +// columns. `acme_challenge_expires_at_ms` is shared between both +// generations — its semantic (challenge-window expiry) is unchanged, +// so it stores the order expiry. 
type agentRow struct { ID int64 `db:"id"` AgentID string `db:"agent_id"` @@ -39,6 +47,9 @@ type agentRow struct { SupersedesRegistrationID sql.NullInt64 `db:"supersedes_registration_id"` ACMEDNS01Token sql.NullString `db:"acme_dns01_token"` ACMEChallengeExpiresAtMs sql.NullInt64 `db:"acme_challenge_expires_at_ms"` + CertOrderRef sql.NullString `db:"cert_order_ref"` + CertOrderState sql.NullString `db:"cert_order_state"` + CertOrderChallenges sql.NullString `db:"cert_order_challenges"` CreatedAtMs int64 `db:"created_at_ms"` UpdatedAtMs int64 `db:"updated_at_ms"` } @@ -67,15 +78,79 @@ func (r agentRow) toDomain() (*domain.AgentRegistration, error) { if r.SupersedesRegistrationID.Valid { reg.SupersedesRegistrationID = r.SupersedesRegistrationID.Int64 } - if r.ACMEDNS01Token.Valid { - reg.ACMEChallenge.DNS01Token = r.ACMEDNS01Token.String - } - if r.ACMEChallengeExpiresAtMs.Valid { - reg.ACMEChallenge.ExpiresAt = msToTime(r.ACMEChallengeExpiresAtMs.Int64) + order, err := certOrderFromRow( + r.CertOrderRef, r.CertOrderState, r.CertOrderChallenges, + r.ACMEDNS01Token, r.ACMEChallengeExpiresAtMs, + ) + if err != nil { + return nil, err } + reg.CertOrder = order return reg, nil } +// certOrderFromRow decodes the certificate order from its columns, +// falling back to synthesizing a self-issued single-DNS-01 order from +// the legacy token columns for rows written before migration 006. 
+func certOrderFromRow( + ref, state, challengesJSON sql.NullString, + legacyDNS01 sql.NullString, legacyExpiresMs sql.NullInt64, +) (domain.CertificateOrder, error) { + var order domain.CertificateOrder + if challengesJSON.Valid && challengesJSON.String != "" { + if err := json.Unmarshal([]byte(challengesJSON.String), &order.Challenges); err != nil { + return domain.CertificateOrder{}, fmt.Errorf("sqlite: decode cert_order_challenges: %w", err) + } + order.OrderRef = ref.String + order.State = domain.OrderState(state.String) + if legacyExpiresMs.Valid { + order.ExpiresAt = msToTime(legacyExpiresMs.Int64) + } + return order, nil + } + // Legacy row: single RA-generated DNS-01 token, implicitly PENDING + // while the agent still sits in a pre-validation state. + if legacyDNS01.Valid && legacyDNS01.String != "" { + order.State = domain.OrderStatePending + order.Challenges = []domain.Challenge{{ + Type: domain.ChallengeTypeDNS01, + Token: legacyDNS01.String, + }} + if legacyExpiresMs.Valid { + order.ExpiresAt = msToTime(legacyExpiresMs.Int64) + } + } + return order, nil +} + +// certOrderColumns is the SQL-bindable representation of a +// CertificateOrder. Zero orders bind as NULLs so legacy-row synthesis +// stays distinguishable from an empty order. +type certOrderColumns struct { + ref any + state any + challenges any + expiresMs any +} + +// certOrderToRow encodes the order for persistence. The challenge +// array is stored verbatim as JSON. +func certOrderToRow(order domain.CertificateOrder) (certOrderColumns, error) { + if order.IsZero() { + return certOrderColumns{}, nil + } + encoded, err := json.Marshal(order.Challenges) + if err != nil { + return certOrderColumns{}, fmt.Errorf("sqlite: encode cert_order_challenges: %w", err) + } + return certOrderColumns{ + ref: nullableString(order.OrderRef), + state: string(order.State), + challenges: string(encoded), + expiresMs: nullableMs(order.ExpiresAt), + }, nil +} + // Save inserts or updates an AgentRegistration. 
Endpoints, server cert, // and identity CSR are persisted via their dedicated tables — Save only // writes the root aggregate row. @@ -85,6 +160,11 @@ func (s *AgentStore) Save(ctx context.Context, agent *domain.AgentRegistration) } now := time.Now().UnixMilli() + order, err := certOrderToRow(agent.CertOrder) + if err != nil { + return err + } + if agent.ID == 0 { const q = ` INSERT INTO agent_registrations ( @@ -92,9 +172,10 @@ func (s *AgentStore) Save(ctx context.Context, agent *domain.AgentRegistration) display_name, description, registration_timestamp_ms, last_renewal_timestamp_ms, supersedes_registration_id, - acme_dns01_token, acme_challenge_expires_at_ms, + cert_order_ref, cert_order_state, cert_order_challenges, + acme_challenge_expires_at_ms, created_at_ms, updated_at_ms - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)` + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)` res, err := s.db.extx(ctx).ExecContext(ctx, q, agent.AgentID, agent.OwnerID, @@ -107,8 +188,8 @@ func (s *AgentStore) Save(ctx context.Context, agent *domain.AgentRegistration) agent.Details.RegistrationTimestamp.UnixMilli(), nullableMs(agent.Details.LastRenewalTimestamp), nullableInt64(agent.SupersedesRegistrationID), - nullableString(agent.ACMEChallenge.DNS01Token), - nullableMs(agent.ACMEChallenge.ExpiresAt), + order.ref, order.state, order.challenges, + order.expiresMs, now, now, ) if err != nil { @@ -129,18 +210,20 @@ func (s *AgentStore) Save(ctx context.Context, agent *domain.AgentRegistration) description = ?, last_renewal_timestamp_ms = ?, supersedes_registration_id = ?, - acme_dns01_token = ?, + cert_order_ref = ?, + cert_order_state = ?, + cert_order_challenges = ?, acme_challenge_expires_at_ms = ?, updated_at_ms = ? 
WHERE id = ?` - _, err := s.db.extx(ctx).ExecContext(ctx, q, + _, err = s.db.extx(ctx).ExecContext(ctx, q, string(agent.Status), agent.Details.DisplayName, agent.Details.Description, nullableMs(agent.Details.LastRenewalTimestamp), nullableInt64(agent.SupersedesRegistrationID), - nullableString(agent.ACMEChallenge.DNS01Token), - nullableMs(agent.ACMEChallenge.ExpiresAt), + order.ref, order.state, order.challenges, + order.expiresMs, now, agent.ID, ) @@ -283,6 +366,41 @@ func (s *AgentStore) ListByOwner( }, nil } +// ExpireLapsedPendingValidation flips lapsed PENDING_VALIDATION rows +// to EXPIRED in one guarded UPDATE and returns the count. +// +// The WHERE clause is the concurrency guard: a row is expired only +// while it is still PENDING_VALIDATION, its challenge window +// (acme_challenge_expires_at_ms — shared between the legacy +// single-token era and the order era) has lapsed, AND its order is +// still PENDING. A concurrent verify-acme that advances the row +// (status → PENDING_DNS, or order → ISSUING/COMPLETED/FAILED) removes +// it from the match set, so the sweep can never clobber a +// successfully-validated registration or roll its order columns back. +// Rows with a NULL expiry (pre-challenge-persistence registrations) +// never match — they carry no window to expire on and remain +// cancellable instead. Legacy rows have a NULL cert_order_state and +// are matched by the IS NULL arm. +func (s *AgentStore) ExpireLapsedPendingValidation(ctx context.Context, now time.Time) (int64, error) { + const q = ` + UPDATE agent_registrations + SET status = 'EXPIRED', updated_at_ms = ? + WHERE status = 'PENDING_VALIDATION' + AND acme_challenge_expires_at_ms IS NOT NULL + AND acme_challenge_expires_at_ms <= ? 
+ AND (cert_order_state IS NULL OR cert_order_state = 'PENDING')` + nowMs := now.UnixMilli() + res, err := s.db.extx(ctx).ExecContext(ctx, q, nowMs, nowMs) + if err != nil { + return 0, mapSQLErr(err) + } + n, err := res.RowsAffected() + if err != nil { + return 0, fmt.Errorf("sqlite: expire lapsed registrations: rows affected: %w", err) + } + return n, nil +} + // Delete removes a registration by ID. func (s *AgentStore) Delete(ctx context.Context, id int64) error { _, err := s.db.extx(ctx).ExecContext(ctx, `DELETE FROM agent_registrations WHERE id = ?`, id) @@ -352,6 +470,3 @@ func decodeCursor(c string) (int64, error) { } return id, nil } - -// _ ensures encoding/json is used when sqlite rows encode JSON in future. -var _ = json.Marshal diff --git a/internal/adapter/store/sqlite/certificate.go b/internal/adapter/store/sqlite/certificate.go index 7fb0996..1c8ac73 100644 --- a/internal/adapter/store/sqlite/certificate.go +++ b/internal/adapter/store/sqlite/certificate.go @@ -15,7 +15,10 @@ type CertificateStore struct{ db *DB } // NewCertificateStore returns a new SQLite-backed CertificateStore. func NewCertificateStore(db *DB) *CertificateStore { return &CertificateStore{db: db} } -// certRow maps an issued_certificates row. +// certRow maps an issued_certificates row. serial_number / +// certificate_ref are NULL on rows persisted before migration 007 — +// readers surface them as empty strings and the revocation flow falls +// back to parsing the PEM for the serial. 
type certRow struct { ID int64 `db:"id"` AgentID string `db:"agent_id"` @@ -23,6 +26,8 @@ type certRow struct { CertificateType string `db:"certificate_type"` CertificatePEM string `db:"certificate_pem"` ChainPEM sql.NullString `db:"chain_pem"` + SerialNumber sql.NullString `db:"serial_number"` + CertificateRef sql.NullString `db:"certificate_ref"` Status string `db:"status"` IssueTimestampMs int64 `db:"issue_timestamp_ms"` ExpirationTimestampMs int64 `db:"expiration_timestamp_ms"` @@ -35,6 +40,8 @@ func (r certRow) toDomain() *domain.StoredCertificate { CertificateType: domain.CertificateType(r.CertificateType), CertificatePEM: r.CertificatePEM, ChainPEM: r.ChainPEM.String, + SerialNumber: r.SerialNumber.String, + CertificateRef: r.CertificateRef.String, Status: domain.CertificateStatus(r.Status), IssueTimestamp: msToTime(r.IssueTimestampMs), ExpirationTimestamp: msToTime(r.ExpirationTimestampMs), @@ -50,14 +57,16 @@ func (s *CertificateStore) SaveIdentityCertificate( const q = ` INSERT INTO issued_certificates( agent_id, csr_id, certificate_type, certificate_pem, chain_pem, + serial_number, certificate_ref, status, issue_timestamp_ms, expiration_timestamp_ms - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?)` + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)` chain := sql.NullString{} if cert.ChainPEM != "" { chain = sql.NullString{String: cert.ChainPEM, Valid: true} } res, err := s.db.extx(ctx).ExecContext(ctx, q, agentID, cert.CSRID, string(cert.CertificateType), cert.CertificatePEM, chain, + nullableString(cert.SerialNumber), nullableString(cert.CertificateRef), string(cert.Status), cert.IssueTimestamp.UnixMilli(), cert.ExpirationTimestamp.UnixMilli(), ) if err != nil { diff --git a/internal/adapter/store/sqlite/migrations/006_certificate_orders.sql b/internal/adapter/store/sqlite/migrations/006_certificate_orders.sql new file mode 100644 index 0000000..08eaef2 --- /dev/null +++ b/internal/adapter/store/sqlite/migrations/006_certificate_orders.sql @@ -0,0 +1,36 @@ +-- 
006_certificate_orders.sql +-- Persist the certificate order — provider order ref, order state, +-- and the full challenge set — on both agent registrations and +-- server-cert renewals. +-- +-- Challenges now originate from the certificate issuer port +-- (`ServerCertificateIssuer.CreateOrder`) instead of being invented +-- by the service layer, so the row must carry whatever the provider +-- minted: token, key authorization, and any provider-computed DNS +-- record value (an ACME provider's DNS-01 TXT value is a digest of +-- the key authorization, not the raw token). A JSON column holds the +-- challenge array verbatim; order ref and state get their own +-- columns so the state machine is queryable. +-- +-- The pre-existing token columns (`acme_dns01_token` on agents, +-- `dns01_token` / `http01_token` on renewals) are frozen as legacy: +-- readers synthesize a self-issued challenge set from them when the +-- JSON column is NULL; writers no longer touch them. The +-- `acme_challenge_expires_at_ms` column is NOT legacy — its semantic +-- (challenge-window expiry) is unchanged, so it carries the order's +-- expiry for both old and new rows. 
+ +ALTER TABLE agent_registrations + ADD COLUMN cert_order_ref TEXT; + +ALTER TABLE agent_registrations + ADD COLUMN cert_order_state TEXT; + +ALTER TABLE agent_registrations + ADD COLUMN cert_order_challenges TEXT CHECK (cert_order_challenges IS NULL OR json_valid(cert_order_challenges)); + +ALTER TABLE server_cert_renewals + ADD COLUMN order_ref TEXT; + +ALTER TABLE server_cert_renewals + ADD COLUMN challenges TEXT CHECK (challenges IS NULL OR json_valid(challenges)); diff --git a/internal/adapter/store/sqlite/migrations/007_certificate_identifiers.sql b/internal/adapter/store/sqlite/migrations/007_certificate_identifiers.sql new file mode 100644 index 0000000..1e8d222 --- /dev/null +++ b/internal/adapter/store/sqlite/migrations/007_certificate_identifiers.sql @@ -0,0 +1,18 @@ +-- 007_certificate_identifiers.sql +-- Persist the issued certificate's serial number and the issuing +-- provider's opaque handle on issued_certificates. +-- +-- Both come back from the issuer port at signing time and were +-- previously discarded. They are required for CA-side revocation: +-- the in-process self-signed CA, AWS Private CA, and Vault revoke by +-- serial; GCP Private CA Service revokes by certificate resource +-- name, which only exists if captured at issuance. NULL on rows +-- written before this migration — readers fall back to parsing the +-- certificate PEM for the serial, and legacy rows never have a +-- provider handle (they were all self-CA issued). 
+ +ALTER TABLE issued_certificates + ADD COLUMN serial_number TEXT; + +ALTER TABLE issued_certificates + ADD COLUMN certificate_ref TEXT; diff --git a/internal/adapter/store/sqlite/order_persistence_test.go b/internal/adapter/store/sqlite/order_persistence_test.go new file mode 100644 index 0000000..4788450 --- /dev/null +++ b/internal/adapter/store/sqlite/order_persistence_test.go @@ -0,0 +1,262 @@ +package sqlite + +import ( + "context" + "testing" + "time" + + "github.com/godaddy/ans/internal/domain" +) + +// seedOrderAgent saves a minimal registration carrying the given +// order and returns it reloaded from disk. +func seedOrderAgent(t *testing.T, db *DB, agentID string, order domain.CertificateOrder) *domain.AgentRegistration { + t.Helper() + store := NewAgentStore(db) + sv, _ := domain.ParseSemVer("1.0.0") + ansName, _ := domain.NewAnsName(sv, agentID+".example.com") + reg := &domain.AgentRegistration{ + AgentID: agentID, + OwnerID: "owner", + AnsName: ansName, + Status: domain.StatusPendingValidation, + CertOrder: order, + Details: domain.RegistrationDetails{ + RegistrationTimestamp: time.Now(), + }, + } + if err := store.Save(context.Background(), reg); err != nil { + t.Fatal(err) + } + got, err := store.FindByAgentID(context.Background(), agentID) + if err != nil { + t.Fatal(err) + } + return got +} + +// TestAgentStore_CertOrderRoundTrip pins the full order persistence: +// provider ref, state, challenge set (including provider-computed +// overrides), and expiry survive a write/read cycle. 
+func TestAgentStore_CertOrderRoundTrip(t *testing.T) { + db, err := Open(context.Background(), ":memory:") + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = db.Close() }) + + exp := time.Now().Add(2 * time.Hour).Truncate(time.Millisecond) + order := domain.CertificateOrder{ + OrderRef: "https://acme.example/order/123", + State: domain.OrderStatePending, + Challenges: []domain.Challenge{ + { + Type: domain.ChallengeTypeDNS01, + Token: "tok-dns", + KeyAuthorization: "tok-dns.thumb", + DNSRecordValue: "digest-value", + }, + { + Type: domain.ChallengeTypeHTTP01, + Token: "tok-http", + KeyAuthorization: "tok-http.thumb", + HTTPPath: "/.well-known/acme-challenge/tok-http", + }, + }, + ExpiresAt: exp, + } + got := seedOrderAgent(t, db, "order-roundtrip", order) + + if got.CertOrder.OrderRef != order.OrderRef { + t.Errorf("orderRef: got %q", got.CertOrder.OrderRef) + } + if got.CertOrder.State != domain.OrderStatePending { + t.Errorf("state: got %q", got.CertOrder.State) + } + if !got.CertOrder.ExpiresAt.Equal(exp) { + t.Errorf("expiresAt: got %v want %v", got.CertOrder.ExpiresAt, exp) + } + dns01, ok := got.CertOrder.ChallengeOfType(domain.ChallengeTypeDNS01) + if !ok || dns01.DNSRecordValue != "digest-value" || dns01.KeyAuthorization != "tok-dns.thumb" { + t.Errorf("dns01 challenge lost provider fields: %+v", dns01) + } + http01, ok := got.CertOrder.ChallengeOfType(domain.ChallengeTypeHTTP01) + if !ok || http01.HTTPPath != "/.well-known/acme-challenge/tok-http" { + t.Errorf("http01 challenge lost fields: %+v", http01) + } + + // State updates persist through the UPDATE path too. 
+ if err := got.CertOrder.MarkIssuing(); err != nil { + t.Fatal(err) + } + if err := NewAgentStore(db).Save(context.Background(), got); err != nil { + t.Fatal(err) + } + again, err := NewAgentStore(db).FindByAgentID(context.Background(), "order-roundtrip") + if err != nil { + t.Fatal(err) + } + if again.CertOrder.State != domain.OrderStateIssuing { + t.Errorf("updated state: got %q want ISSUING", again.CertOrder.State) + } +} + +// TestAgentStore_ZeroOrderStaysZero: registrations without an order +// (zero value) read back as zero — NULL columns, no synthesis. +func TestAgentStore_ZeroOrderStaysZero(t *testing.T) { + db, err := Open(context.Background(), ":memory:") + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = db.Close() }) + + got := seedOrderAgent(t, db, "zero-order", domain.CertificateOrder{}) + if !got.CertOrder.IsZero() { + t.Errorf("zero order should round-trip as zero, got %+v", got.CertOrder) + } +} + +// TestAgentStore_LegacyRowSynthesizesOrder: a row written before +// migration 006 (bare acme_dns01_token, NULL order columns) reads as +// a self-issued single-DNS-01 PENDING order, so in-flight +// registrations keep working across the upgrade. +func TestAgentStore_LegacyRowSynthesizesOrder(t *testing.T) { + db, err := Open(context.Background(), ":memory:") + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = db.Close() }) + + got := seedOrderAgent(t, db, "legacy-agent", domain.CertificateOrder{}) + + // Rewrite the row into its pre-006 shape: legacy token column set, + // order columns NULL. 
+ expMs := time.Now().Add(time.Hour).Truncate(time.Millisecond) + if _, err := db.DBX().Exec(` + UPDATE agent_registrations SET + acme_dns01_token = 'legacy-tok', + acme_challenge_expires_at_ms = ?, + cert_order_ref = NULL, cert_order_state = NULL, cert_order_challenges = NULL + WHERE agent_id = 'legacy-agent'`, expMs.UnixMilli()); err != nil { + t.Fatal(err) + } + + reloaded, err := NewAgentStore(db).FindByAgentID(context.Background(), "legacy-agent") + if err != nil { + t.Fatal(err) + } + if reloaded.CertOrder.State != domain.OrderStatePending { + t.Fatalf("legacy synthesis state: got %q want PENDING", reloaded.CertOrder.State) + } + dns01, ok := reloaded.CertOrder.ChallengeOfType(domain.ChallengeTypeDNS01) + if !ok || dns01.Token != "legacy-tok" { + t.Fatalf("legacy synthesis challenge: %+v ok=%v", dns01, ok) + } + if _, hasHTTP := reloaded.CertOrder.ChallengeOfType(domain.ChallengeTypeHTTP01); hasHTTP { + t.Error("legacy rows never carried an HTTP-01 token; none should be synthesized") + } + if !reloaded.CertOrder.ExpiresAt.Equal(expMs) { + t.Errorf("legacy expiry: got %v want %v", reloaded.CertOrder.ExpiresAt, expMs) + } + if !got.CertOrder.IsZero() { + t.Error("precondition: seeded order should have been zero") + } +} + +// TestAgentStore_MalformedChallengeJSON surfaces a decode error +// instead of silently dropping the order. +func TestAgentStore_MalformedChallengeJSON(t *testing.T) { + db, err := Open(context.Background(), ":memory:") + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = db.Close() }) + + seedOrderAgent(t, db, "bad-json", domain.CertificateOrder{}) + // `{"not":"array"}` passes the json_valid CHECK but cannot decode + // into []Challenge. 
+ if _, err := db.DBX().Exec(` + UPDATE agent_registrations SET cert_order_challenges = '{"not":"array"}' + WHERE agent_id = 'bad-json'`); err != nil { + t.Fatal(err) + } + if _, err := NewAgentStore(db).FindByAgentID(context.Background(), "bad-json"); err == nil { + t.Fatal("want decode error for malformed challenge JSON") + } +} + +// TestRenewalStore_LegacyRowSynthesizesChallenges: pre-006 renewal +// rows (bare token columns, NULL challenges JSON) read back as the +// self-issued challenge pair. +func TestRenewalStore_LegacyRowSynthesizesChallenges(t *testing.T) { + db, err := Open(context.Background(), ":memory:") + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = db.Close() }) + + reg := seedOrderAgent(t, db, "legacy-renewal-agent", domain.CertificateOrder{}) + renewals := NewRenewalStore(db) + r := domain.NewBYOCRenewal(reg.AgentID, reg.ID, "LEAF", "CHAIN", + domain.NewSelfIssuedOrder("dns-tok", "http-tok", time.Now().Add(time.Hour)), time.Now()) + if err := renewals.Save(context.Background(), r); err != nil { + t.Fatal(err) + } + + // Strip the JSON column to simulate a pre-006 row; the NOT NULL + // token columns are already populated by Save. + if _, err := db.DBX().Exec( + `UPDATE server_cert_renewals SET challenges = NULL, order_ref = NULL WHERE agent_id = ?`, + reg.AgentID); err != nil { + t.Fatal(err) + } + + got, err := renewals.FindByAgentID(context.Background(), reg.AgentID) + if err != nil { + t.Fatal(err) + } + dns01, ok := got.Validation.ChallengeOfType(domain.ChallengeTypeDNS01) + if !ok || dns01.Token != "dns-tok" { + t.Fatalf("legacy dns01 synthesis: %+v ok=%v", dns01, ok) + } + http01, ok := got.Validation.ChallengeOfType(domain.ChallengeTypeHTTP01) + if !ok || http01.Token != "http-tok" { + t.Fatalf("legacy http01 synthesis: %+v ok=%v", http01, ok) + } +} + +// TestRenewalStore_OrderRefRoundTrip pins provider order persistence +// on the renewal lane. 
+func TestRenewalStore_OrderRefRoundTrip(t *testing.T) { + db, err := Open(context.Background(), ":memory:") + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { _ = db.Close() }) + + reg := seedOrderAgent(t, db, "renewal-order-agent", domain.CertificateOrder{}) + renewals := NewRenewalStore(db) + order := domain.CertificateOrder{ + OrderRef: "https://acme.example/order/9", + State: domain.OrderStatePending, + Challenges: []domain.Challenge{ + {Type: domain.ChallengeTypeDNS01, Token: "t", KeyAuthorization: "t.kid", DNSRecordValue: "digest"}, + }, + ExpiresAt: time.Now().Add(time.Hour), + } + r := domain.NewCSRRenewal(reg.AgentID, reg.ID, "csr-1", order, time.Now()) + if err := renewals.Save(context.Background(), r); err != nil { + t.Fatal(err) + } + got, err := renewals.FindByAgentID(context.Background(), reg.AgentID) + if err != nil { + t.Fatal(err) + } + if got.Validation.OrderRef != order.OrderRef { + t.Errorf("orderRef: got %q", got.Validation.OrderRef) + } + dns01, ok := got.Validation.ChallengeOfType(domain.ChallengeTypeDNS01) + if !ok || dns01.DNSRecordValue != "digest" { + t.Errorf("provider DNS value lost: %+v", dns01) + } +} diff --git a/internal/adapter/store/sqlite/renewal.go b/internal/adapter/store/sqlite/renewal.go index ad81480..2a860c7 100644 --- a/internal/adapter/store/sqlite/renewal.go +++ b/internal/adapter/store/sqlite/renewal.go @@ -3,6 +3,8 @@ package sqlite import ( "context" "database/sql" + "encoding/json" + "fmt" "time" "github.com/godaddy/ans/internal/domain" @@ -14,6 +16,14 @@ type RenewalStore struct{ db *DB } // NewRenewalStore returns a new SQLite-backed RenewalStore. func NewRenewalStore(db *DB) *RenewalStore { return &RenewalStore{db: db} } +// renewalRow maps a single server_cert_renewals row for scanning. +// +// The `dns01_token` / `http01_token` columns are legacy: rows written +// before migration 006 carried bare RA-generated tokens instead of a +// challenge set from the certificate order. 
Readers synthesize +// self-issued challenges from them when the `challenges` JSON column +// is NULL. New rows still fill the token columns (they are NOT NULL +// in the original schema) but readers prefer the JSON. type renewalRow struct { ID int64 `db:"id"` AgentID string `db:"agent_id"` @@ -24,6 +34,8 @@ type renewalRow struct { ByocChainPEM sql.NullString `db:"byoc_chain_pem"` DNS01Token string `db:"dns01_token"` HTTP01Token string `db:"http01_token"` + OrderRef sql.NullString `db:"order_ref"` + Challenges sql.NullString `db:"challenges"` ValidationStatus string `db:"validation_status"` ValidationExpiresMs int64 `db:"validation_expires_ms"` FailureReason sql.NullString `db:"failure_reason"` @@ -32,14 +44,26 @@ type renewalRow struct { UpdatedAtMs int64 `db:"updated_at_ms"` } -func (r renewalRow) toDomain() *domain.ServerCertificateRenewal { +func (r renewalRow) toDomain() (*domain.ServerCertificateRenewal, error) { v := domain.RenewalValidation{ - DNS01ChallengeToken: r.DNS01Token, - HTTP01ChallengeToken: r.HTTP01Token, - Status: domain.ValidationStatus(r.ValidationStatus), - CreatedAt: msToTime(r.CreatedAtMs), - ExpiresAt: msToTime(r.ValidationExpiresMs), - UpdatedAt: msToTime(r.UpdatedAtMs), + OrderRef: r.OrderRef.String, + Status: domain.ValidationStatus(r.ValidationStatus), + CreatedAt: msToTime(r.CreatedAtMs), + ExpiresAt: msToTime(r.ValidationExpiresMs), + UpdatedAt: msToTime(r.UpdatedAtMs), + } + switch { + case r.Challenges.Valid && r.Challenges.String != "": + if err := json.Unmarshal([]byte(r.Challenges.String), &v.Challenges); err != nil { + return nil, fmt.Errorf("sqlite: decode renewal challenges: %w", err) + } + default: + // Legacy row: synthesize the self-issued pair from the bare + // token columns. 
+ v.Challenges = []domain.Challenge{ + {Type: domain.ChallengeTypeDNS01, Token: r.DNS01Token}, + {Type: domain.ChallengeTypeHTTP01, Token: r.HTTP01Token}, + } } ren := &domain.ServerCertificateRenewal{ ID: r.ID, @@ -56,25 +80,36 @@ func (r renewalRow) toDomain() *domain.ServerCertificateRenewal { if r.CompletedAtMs.Valid { ren.CompletedAt = msToTime(r.CompletedAtMs.Int64) } - return ren + return ren, nil } // Save inserts or updates a renewal aggregate. func (s *RenewalStore) Save(ctx context.Context, r *domain.ServerCertificateRenewal) error { now := time.Now().UnixMilli() if r.ID == 0 { + challengesJSON, err := json.Marshal(r.Validation.Challenges) + if err != nil { + return fmt.Errorf("sqlite: encode renewal challenges: %w", err) + } + // The bare token columns are NOT NULL in the original schema; + // fill them from the challenge set so pre-006 tooling reading + // the table sees sane values. + dns01, _ := r.Validation.ChallengeOfType(domain.ChallengeTypeDNS01) + http01, _ := r.Validation.ChallengeOfType(domain.ChallengeTypeHTTP01) const q = ` INSERT INTO server_cert_renewals( agent_id, registration_id, renewal_type, server_csr_id, byoc_cert_pem, byoc_chain_pem, - dns01_token, http01_token, validation_status, validation_expires_ms, + dns01_token, http01_token, order_ref, challenges, + validation_status, validation_expires_ms, failure_reason, completed_at_ms, created_at_ms, updated_at_ms - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)` + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)` res, err := s.db.extx(ctx).ExecContext(ctx, q, r.AgentID, r.RegistrationID, string(r.RenewalType), nullableString(r.ServerCsrID), nullableString(r.ByocCertPEM), nullableString(r.ByocChainPEM), - r.Validation.DNS01ChallengeToken, r.Validation.HTTP01ChallengeToken, + dns01.Token, http01.Token, + nullableString(r.Validation.OrderRef), string(challengesJSON), string(r.Validation.Status), r.Validation.ExpiresAt.UnixMilli(), nullableString(r.FailureReason), 
nullableMs(r.CompletedAt), @@ -114,7 +149,7 @@ func (s *RenewalStore) FindByAgentID(ctx context.Context, agentID string) (*doma if err := s.db.extx(ctx).GetContext(ctx, &r, q, agentID); err != nil { return nil, mapSQLErr(err) } - return r.toDomain(), nil + return r.toDomain() } // FindPendingByAgentID returns a pending (not-completed) renewal. @@ -127,7 +162,7 @@ func (s *RenewalStore) FindPendingByAgentID(ctx context.Context, agentID string) if err := s.db.extx(ctx).GetContext(ctx, &r, q, agentID); err != nil { return nil, mapSQLErr(err) } - return r.toDomain(), nil + return r.toDomain() } // Delete removes a renewal by ID. @@ -151,7 +186,11 @@ func (s *RenewalStore) ListPendingExpired(ctx context.Context) ([]*domain.Server } out := make([]*domain.ServerCertificateRenewal, len(rows)) for i, r := range rows { - out[i] = r.toDomain() + ren, err := r.toDomain() + if err != nil { + return nil, err + } + out[i] = ren } return out, nil } diff --git a/internal/config/config.go b/internal/config/config.go index 4a32a87..aa4feec 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -65,15 +65,15 @@ type AuthOIDC struct { // CA holds identity certificate authority settings. // -// `Server` is optional: when non-nil it configures an additional -// server-auth CA used to sign server CSRs submitted at registration +// `Server` is optional: when non-nil it configures the server +// certificate issuer used for server CSRs submitted at registration // or renewal. When nil, the RA only supports the BYOC path for // server certs. The identity CA (`Self`) is always required — every // agent gets an RA-issued identity cert. type CA struct { - Type string `koanf:"type"` - Self *CASelf `koanf:"self"` - Server *CAServerSelf `koanf:"server"` + Type string `koanf:"type"` + Self *CASelf `koanf:"self"` + Server *CAServer `koanf:"server"` } // CASelf configures the in-process self-signed identity CA. 
@@ -83,13 +83,45 @@ type CASelf struct { DataDir string `koanf:"data-dir"` } -// CAServerSelf configures the in-process self-signed server CA. The -// shape matches CASelf (org + validity + data-dir); kept distinct so -// operators can rotate the identity and server roots independently. -type CAServerSelf struct { +// CAServer selects and configures the server certificate issuer +// behind `port.ServerCertificateIssuer`. +// +// - Type "self" (default, including when omitted — backwards +// compatible with pre-issuer configs): the in-process +// self-signed server CA. Same shape as the identity CA but kept +// distinct so operators can rotate the two roots independently. +// - Type "acme": an external RFC 8555 CA (Let's Encrypt et al.) +// configured by the nested `acme` block. Selecting this type is +// the operator's consent to the provider's terms of service — +// account registration auto-accepts them, per standard ACME +// automation practice. +type CAServer struct { + Type string `koanf:"type"` + + // Self-signed issuer settings (type "self"). Org string `koanf:"org"` ValidityDays int `koanf:"validity-days"` DataDir string `koanf:"data-dir"` + + // ACME issuer settings (type "acme"). + ACME *CAServerACME `koanf:"acme"` +} + +// IsACME reports whether the server issuer is the ACME adapter. +func (s *CAServer) IsACME() bool { return s != nil && s.Type == "acme" } + +// CAServerACME configures the RFC 8555 issuer. +type CAServerACME struct { + // DirectoryURL is the provider's directory endpoint, e.g. + // https://acme-staging-v02.api.letsencrypt.org/directory for + // Let's Encrypt staging. Use staging for testing — production + // rate limits are unforgiving. + DirectoryURL string `koanf:"directory-url"` + // Email becomes the ACME account contact for expiry and incident + // notices. Optional. + Email string `koanf:"email"` + // DataDir persists the ACME account key across restarts. 
+ DataDir string `koanf:"data-dir"` } // DNS holds DNS verifier configuration. @@ -324,20 +356,24 @@ func (c *RAConfig) Validate() error { if c.CA.Self.ValidityDays <= 0 { return errors.New("ca.self.validity-days must be positive") } - // Server CA is optional but when configured must be valid. - if c.CA.Server != nil { - if c.CA.Server.DataDir == "" { - return errors.New("ca.server.data-dir is required when ca.server block is set") - } - if c.CA.Server.ValidityDays <= 0 { - return errors.New("ca.server.validity-days must be positive") - } + // Server issuer is optional but when configured must be valid. + if err := validateCAServer(c.CA.Server); err != nil { + return err } switch c.DNS.Type { case "noop", "lookup": default: return fmt.Errorf("dns.type %q not supported (expected 'noop' or 'lookup')", c.DNS.Type) } + // An ACME server issuer with the noop DNS verifier is a footgun: + // the noop gate "passes" every challenge, so the RA would answer + // the public provider's challenge before the owner published the + // artifact, and the provider would mark the authorization (and the + // order) invalid. A real provider needs the real lookup verifier. + if c.CA.Server.IsACME() && c.DNS.Type == "noop" { + return errors.New( + "ca.server.type 'acme' requires dns.type 'lookup': a noop challenge gate would answer the provider's challenge before the artifact exists and invalidate every order") + } if err := validateKeys(&c.Keys); err != nil { return err } @@ -385,6 +421,36 @@ func (c *TLConfig) Validate() error { return nil } +// validateCAServer checks the optional server-issuer block. Nil is +// valid (BYOC-only deployment). 
+func validateCAServer(s *CAServer) error { + if s == nil { + return nil + } + switch s.Type { + case "", "self": + if s.DataDir == "" { + return errors.New("ca.server.data-dir is required when ca.server block is set") + } + if s.ValidityDays <= 0 { + return errors.New("ca.server.validity-days must be positive") + } + case "acme": + if s.ACME == nil { + return errors.New("ca.server.acme block is required when ca.server.type is 'acme'") + } + if s.ACME.DirectoryURL == "" { + return errors.New("ca.server.acme.directory-url is required") + } + if s.ACME.DataDir == "" { + return errors.New("ca.server.acme.data-dir is required") + } + default: + return fmt.Errorf("ca.server.type %q not supported (expected 'self' or 'acme')", s.Type) + } + return nil +} + func validateAuth(a *Auth) error { switch a.Type { case "static": diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 5face48..90184e7 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -192,6 +192,32 @@ log: } } +// TestRAConfig_Validate_ACMEServerIssuer pins the valid-acme arm: +// a complete acme block passes validation. 
+func TestRAConfig_Validate_ACMEServerIssuer(t *testing.T) { + dir := t.TempDir() + c := defaultRAConfig() + c.Auth.Static = &AuthStatic{APIKey: "x"} + c.CA.Self.DataDir = dir + c.Keys.File.Path = dir + c.Store.SQLite.Path = filepath.Join(dir, "db") + c.CA.Server = &CAServer{Type: "acme", ACME: &CAServerACME{ + DirectoryURL: "https://acme-staging-v02.api.letsencrypt.org/directory", + Email: "ops@example.com", + DataDir: dir, + }} + c.DNS.Type = "lookup" // acme requires the real verifier + if err := c.Validate(); err != nil { + t.Fatalf("valid acme config rejected: %v", err) + } + if !c.CA.Server.IsACME() { + t.Error("IsACME must report true for type acme") + } + if (&CAServer{}).IsACME() { + t.Error("IsACME must report false for the self default") + } +} + // ----- RAConfig.Validate error branches ----- func TestRAConfig_Validate_Errors(t *testing.T) { @@ -216,11 +242,29 @@ func TestRAConfig_Validate_Errors(t *testing.T) { {"missing ca.self.data-dir", func(c *RAConfig) { c.CA.Self.DataDir = "" }, "ca.self.data-dir"}, {"invalid ca.self.validity-days", func(c *RAConfig) { c.CA.Self.ValidityDays = 0 }, "validity-days"}, {"server CA missing data-dir", func(c *RAConfig) { - c.CA.Server = &CAServerSelf{ValidityDays: 7} + c.CA.Server = &CAServer{ValidityDays: 7} }, "ca.server.data-dir"}, {"server CA bad validity", func(c *RAConfig) { - c.CA.Server = &CAServerSelf{DataDir: dir, ValidityDays: 0} + c.CA.Server = &CAServer{DataDir: dir, ValidityDays: 0} }, "ca.server.validity-days"}, + {"acme missing block", func(c *RAConfig) { + c.CA.Server = &CAServer{Type: "acme"} + }, "ca.server.acme block"}, + {"acme missing directory-url", func(c *RAConfig) { + c.CA.Server = &CAServer{Type: "acme", ACME: &CAServerACME{DataDir: dir}} + }, "directory-url"}, + {"acme missing data-dir", func(c *RAConfig) { + c.CA.Server = &CAServer{Type: "acme", ACME: &CAServerACME{DirectoryURL: "https://acme.example/dir"}} + }, "acme.data-dir"}, + {"unsupported server issuer type", func(c *RAConfig) { + 
c.CA.Server = &CAServer{Type: "vault"} + }, "ca.server.type"}, + {"acme issuer with noop dns", func(c *RAConfig) { + c.CA.Server = &CAServer{Type: "acme", ACME: &CAServerACME{ + DirectoryURL: "https://acme.example/dir", DataDir: dir, + }} + c.DNS.Type = "noop" + }, "requires dns.type 'lookup'"}, {"unsupported dns.type", func(c *RAConfig) { c.DNS.Type = "bind" }, "dns.type"}, {"unsupported keys.type", func(c *RAConfig) { c.Keys.Type = "vault" }, "keys.type"}, {"missing keys.file.path", func(c *RAConfig) { c.Keys.File.Path = "" }, "keys.file.path"}, diff --git a/internal/domain/agent.go b/internal/domain/agent.go index 7f20a60..c74931d 100644 --- a/internal/domain/agent.go +++ b/internal/domain/agent.go @@ -10,25 +10,6 @@ const ( maxDescriptionLength = 150 ) -// ACMEChallenge captures the DNS-01 challenge token issued to an -// operator at registration time. Zero-value when no challenge is -// active (agent is past PENDING_DNS, or the registration predates -// challenge-persistence). -// -// ans emits DNS-01 only — the reference api-spec ChallengeInfo -// declares both DNS_01 and HTTP_01 options, but the ans deviation -// (documented in CLAUDE.md) skips HTTP_01. Future support can land -// by extending this struct with HTTP01Token + KeyAuthorization. -type ACMEChallenge struct { - DNS01Token string `json:"dns01Token,omitempty"` - ExpiresAt time.Time `json:"expiresAt,omitzero"` -} - -// IsZero reports whether the challenge is unset. -func (c ACMEChallenge) IsZero() bool { - return c.DNS01Token == "" && c.ExpiresAt.IsZero() -} - // RegistrationDetails holds metadata about an agent registration. type RegistrationDetails struct { RegistrationTimestamp time.Time `json:"registrationTimestamp"` @@ -79,12 +60,13 @@ type AgentRegistration struct { // ServerCert is the BYOC server certificate (if submitted). ServerCert *ByocServerCertificate `json:"serverCert,omitempty"` - // IdentityCSR is the most recent pending identity CSR on this - // registration. 
Initially populated at registration time; can be - // replaced by a rotation via POST /certificates/identity (which - // flips the previous one to SIGNED once the CA issues the new - // cert). Historical CSRs persist in the csrs table — the one - // embedded here is a "fast path" cache for the status handler. + // IdentityCSR is the most recent identity CSR on this + // registration: PENDING between registration and verify-acme + // (which signs it once domain control is proven), and SIGNED + // thereafter. A rotation via POST /certificates/identity is signed + // at submission and replaces this slot with the new SIGNED CSR. + // Historical CSRs persist in the csrs table — the one embedded + // here is a "fast path" cache for the status handler. IdentityCSR *AgentCSR `json:"identityCsr,omitempty"` // ServerCSR is the most recent pending server CSR on this @@ -96,12 +78,14 @@ type AgentRegistration struct { // SupersedesRegistrationID is the ID of the previous version (for supersession). SupersedesRegistrationID int64 `json:"supersedesRegistrationId,omitempty"` - // ACMEChallenge holds the DNS-01 challenge token issued at - // registration time. Zero-value when the agent is past the - // PENDING_DNS stage (or predates the challenge-persistence - // migration). ans is DNS-01-only per the documented no-HTTP-01 - // deviation. - ACMEChallenge ACMEChallenge `json:"acmeChallenge,omitzero"` + // CertOrder tracks the certificate order and its domain-control + // challenges for this registration. CSR-path registrations carry + // the provider order returned by `ServerCertificateIssuer. + // CreateOrder`; BYOC registrations carry a self-issued validation + // order (OrderRef empty) because domain control must be proven + // even when no certificate is being issued. Zero-value for + // registrations predating order persistence. + CertOrder CertificateOrder `json:"certOrder,omitzero"` // PendingEvents holds domain events raised during this aggregate operation. 
// They are cleared after being published. @@ -226,15 +210,20 @@ func (r *AgentRegistration) Activate(now time.Time) error { // AdvanceToPendingDNS transitions from PENDING_VALIDATION to PENDING_DNS, // which is the state the registration enters once domain-control -// validation (ACME) succeeds and the DNS records the operator must -// configure have been computed. +// validation (ACME) succeeds AND the certificate order completed — +// the DNS records the operator must configure (notably TLSA) can only +// be computed once the server certificate exists. // -// The V2 spec (spec/api-spec-v2.yaml) and the reference RA both use a -// three-state pending chain of PENDING_VALIDATION → PENDING_DNS → -// ACTIVE. There is no intermediate PENDING_CERTS state: certificate -// issuance is internal work that happens within either -// PENDING_VALIDATION or PENDING_DNS and does not need its own exposed -// lifecycle state. +// The agent lifecycle enum stays exactly as the V2 spec's +// `AgentLifecycleStatus` defines it. Certificate issuance in flight is +// NOT a lifecycle state: it is tracked on `CertOrder.State` +// (PENDING → ISSUING → COMPLETED) and surfaced as derived views — +// `RegistrationPending.status = PENDING_CERTS` and `AgentStatus.phase +// = CERTIFICATE_ISSUANCE` — while the lifecycle status remains +// PENDING_VALIDATION. Synchronous issuers (the self-signed CA) pass +// through the ISSUING window in-process; asynchronous issuers (ACME +// providers) park the agent there until a re-driven verify-acme +// finalizes the order. func (r *AgentRegistration) AdvanceToPendingDNS() error { return r.transitionTo(StatusPendingDNS) } @@ -278,14 +267,32 @@ func (r *AgentRegistration) Revoke(reason RevocationReason, now time.Time) error return nil } -// Cancel transitions a PENDING registration to REVOKED (idempotent cancel). +// Cancel transitions a PENDING registration to REVOKED. 
+// +// Eligibility follows the spec's revoke-route contract: the +// PENDING_CERTS phase (order ISSUING or terminally FAILED) and +// PENDING_DNS are cancellable; a registration still awaiting its +// domain-control challenge (PENDING_VALIDATION with the order +// PENDING) is not — it auto-expires when the challenge window +// lapses. Legacy registrations without a persisted order are +// cancellable: they have no challenge window to expire on, so cancel +// is their only exit. func (r *AgentRegistration) Cancel(now time.Time) error { + // Validation-class (422), not invalid-state (409): Cancel is + // reached only through the revoke route, whose canonical spec + // documents 422 for an unprocessable request and carries no 409. if !r.Status.IsPending() { - return NewInvalidStateError( + return NewValidationError( "CANNOT_CANCEL", fmt.Sprintf("can only cancel pending registrations, current status: %s", r.Status), ) } + if r.Status == StatusPendingValidation && r.CertOrder.State == OrderStatePending { + return NewValidationError( + "CANNOT_CANCEL", + "registration is awaiting domain validation and will auto-expire when the challenge window lapses", + ) + } r.Status = StatusRevoked r.addEvent(NewAgentRevokedEvent(r.AgentID, r.AnsName, RevocationCessationOfOperation, now)) return nil diff --git a/internal/domain/agent_test.go b/internal/domain/agent_test.go index de83e60..71d1603 100644 --- a/internal/domain/agent_test.go +++ b/internal/domain/agent_test.go @@ -182,6 +182,8 @@ func TestAgentRegistration_Revoke_FromPending(t *testing.T) { } func TestAgentRegistration_Cancel(t *testing.T) { + // Legacy shape (no persisted order): cancellable — there is no + // challenge window to auto-expire on, so cancel is the only exit. reg := newValidRegistration(t) reg.ClearEvents() require.NoError(t, reg.Cancel(time.Now())) @@ -190,7 +192,29 @@ func TestAgentRegistration_Cancel(t *testing.T) { // Cannot cancel non-pending. 
reg.Status = StatusActive - assert.ErrorIs(t, reg.Cancel(time.Now()), ErrInvalidState) + assert.ErrorIs(t, reg.Cancel(time.Now()), ErrValidation) + + // Awaiting domain validation (order PENDING): not cancellable — + // per the spec it auto-expires when the challenge window lapses. + awaiting := newValidRegistration(t) + awaiting.CertOrder = NewSelfIssuedOrder("d", "h", time.Now().Add(time.Hour)) + assert.ErrorIs(t, awaiting.Cancel(time.Now()), ErrValidation) + + // Once validation is consumed (order ISSUING — the spec's + // PENDING_CERTS phase), cancel is allowed. + issuing := newValidRegistration(t) + issuing.CertOrder = NewSelfIssuedOrder("d", "h", time.Now().Add(time.Hour)) + require.NoError(t, issuing.CertOrder.MarkIssuing()) + require.NoError(t, issuing.Cancel(time.Now())) + assert.Equal(t, StatusRevoked, issuing.Status) + + // PENDING_DNS with a completed order: cancellable. + pendingDNS := newValidRegistration(t) + pendingDNS.CertOrder = NewSelfIssuedOrder("d", "h", time.Now().Add(time.Hour)) + require.NoError(t, pendingDNS.CertOrder.MarkCompleted()) + require.NoError(t, pendingDNS.AdvanceToPendingDNS()) + require.NoError(t, pendingDNS.Cancel(time.Now())) + assert.Equal(t, StatusRevoked, pendingDNS.Status) } func TestAgentRegistration_Fail(t *testing.T) { diff --git a/internal/domain/ansname.go b/internal/domain/ansname.go index 5a9ec89..9137e07 100644 --- a/internal/domain/ansname.go +++ b/internal/domain/ansname.go @@ -2,6 +2,7 @@ package domain import ( "fmt" + "net" "regexp" "strings" ) @@ -123,6 +124,19 @@ func validateAgentHost(host string) error { ) } + // Reject IP literals. An ANS name binds a DNS hostname; an IP + // (e.g. "169.254.169.254" or an IPv6 literal) is never a valid + // agent host, and accepting one would let a registrant point the + // HTTP-01 challenge gate at an internal address. The label checks + // below would otherwise pass a dotted-quad IPv4 as four numeric + // "labels". 
+ if net.ParseIP(host) != nil { + return NewValidationError( + "INVALID_AGENT_HOST", + fmt.Sprintf("agent host must be a DNS hostname, not an IP address: %q", host), + ) + } + labels := strings.Split(host, ".") if len(labels) < 2 { return NewValidationError( diff --git a/internal/domain/ansname_test.go b/internal/domain/ansname_test.go index 216dc51..b54836f 100644 --- a/internal/domain/ansname_test.go +++ b/internal/domain/ansname_test.go @@ -38,6 +38,18 @@ func TestNewAnsName(t *testing.T) { assert.ErrorIs(t, err, ErrValidation) }) + t.Run("should reject IP-literal hosts", func(t *testing.T) { + // An ANS name binds a DNS hostname; accepting an IP would let a + // registrant aim the HTTP-01 challenge gate at an internal + // address (e.g. the cloud metadata endpoint). A dotted-quad + // would otherwise pass the per-label DNS checks as four numeric + // labels. + for _, ip := range []string{"169.254.169.254", "127.0.0.1", "10.0.0.1", "8.8.8.8"} { + _, err := NewAnsName(mustSemVer(1, 0, 0), ip) + assert.ErrorIsf(t, err, ErrValidation, "IP literal %q must be rejected", ip) + } + }) + t.Run("should reject host over 253 chars", func(t *testing.T) { // Build a 254-char hostname from repeating valid 63-char labels // plus separators plus a 62-char tail: 63+1+63+1+63+1+62 = 254. diff --git a/internal/domain/certificate.go b/internal/domain/certificate.go index 2713947..333473f 100644 --- a/internal/domain/certificate.go +++ b/internal/domain/certificate.go @@ -88,11 +88,20 @@ func (c AgentCSR) MarkRejected(reason string, processedAt time.Time) (AgentCSR, // StoredCertificate represents a certificate stored in the system. 
type StoredCertificate struct { - InternalID int64 `json:"internalId"` - CSRID string `json:"csrId"` - CertificateType CertificateType `json:"certificateType"` - CertificatePEM string `json:"certificatePem"` - ChainPEM string `json:"chainPem,omitempty"` + InternalID int64 `json:"internalId"` + CSRID string `json:"csrId"` + CertificateType CertificateType `json:"certificateType"` + CertificatePEM string `json:"certificatePem"` + ChainPEM string `json:"chainPem,omitempty"` + // SerialNumber is the issued certificate's serial (lowercase hex), + // captured at issuance so CA-side revocation never re-parses the + // PEM. Empty on rows persisted before serial tracking landed. + SerialNumber string `json:"serialNumber,omitempty"` + // CertificateRef is the issuing provider's opaque handle for this + // certificate (cloud-CA resource name/ARN, ACME certificate URL). + // Empty for the in-process self-signed CAs, which revoke by + // serial alone. + CertificateRef string `json:"certificateRef,omitempty"` Status CertificateStatus `json:"status"` ExpirationTimestamp time.Time `json:"expirationTimestamp"` IssueTimestamp time.Time `json:"issueTimestamp"` diff --git a/internal/domain/dnsrecords.go b/internal/domain/dnsrecords.go index 4f0cb81..438658d 100644 --- a/internal/domain/dnsrecords.go +++ b/internal/domain/dnsrecords.go @@ -91,33 +91,42 @@ func ComputeRequiredDNSRecords(reg *AgentRegistration, tlPublicBaseURL string) [ // TLSA record for certificate binding. Every registration has a // server cert — either BYOC (operator-submitted) or CSR-signed - // (RA issues via its configured `ServerCertificateAuthority`). - // Both paths land through the same ByocServerCertificate struct, - // so `reg.ServerCert` is set for any registration that's reached + // (issued via the configured `ServerCertificateIssuer`). Both + // paths land through the same ByocServerCertificate struct, so + // `reg.ServerCert` is set for any registration that's reached // verify-dns. 
- // - // `3 1 1 <fingerprint>` = DANE-EE + SubjectPublicKeyInfo + SHA-256 - // (RFC 6698). Required=false: operators whose zones aren't - // DNSSEC-signed can't produce a trustworthy TLSA record, so the - // RA doesn't block verify-dns on its presence. The verify layer - // enforces a stricter rule at query time: when a TLSA response - // IS DNSSEC-validated, its value must match the expected - // fingerprint (otherwise an attacker rewrote the record in a - // signed zone — the worst failure mode). That post-verify - // check lives alongside the verifier, not in the record set. if reg.ServerCert == nil { return records } - records = append(records, ExpectedDNSRecord{ + records = append(records, TLSARecordForCert(fqdn, reg.ServerCert.Fingerprint)) + + return records +} + +// TLSARecordForCert builds the DANE-EE TLSA record binding a server +// certificate fingerprint to the FQDN. Shared between the +// registration record set and the renewal status responses (the +// operator updates this record after every renewal — it fingerprints +// the new leaf). +// +// `3 1 1 <fingerprint>` = DANE-EE + SubjectPublicKeyInfo + SHA-256 +// (RFC 6698). Required=false: operators whose zones aren't +// DNSSEC-signed can't produce a trustworthy TLSA record, so the +// RA doesn't block verify-dns on its presence. The verify layer +// enforces a stricter rule at query time: when a TLSA response +// IS DNSSEC-validated, its value must match the expected +// fingerprint (otherwise an attacker rewrote the record in a +// signed zone — the worst failure mode). That post-verify +// check lives alongside the verifier, not in the record set. 
+func TLSARecordForCert(fqdn, fingerprint string) ExpectedDNSRecord { + return ExpectedDNSRecord{ Name: fmt.Sprintf("_443._tcp.%s", fqdn), Type: DNSRecordTLSA, - Value: fmt.Sprintf("3 1 1 %s", reg.ServerCert.Fingerprint), + Value: fmt.Sprintf("3 1 1 %s", fingerprint), Purpose: PurposeCertificateBinding, Required: false, TTL: 3600, - }) - - return records + } } func protocolToANSValue(p Protocol) string { diff --git a/internal/domain/order.go b/internal/domain/order.go new file mode 100644 index 0000000..ac5ef10 --- /dev/null +++ b/internal/domain/order.go @@ -0,0 +1,239 @@ +package domain + +import ( + "fmt" + "time" +) + +// ChallengeType enumerates the domain-control challenge mechanisms a +// certificate provider can demand. Mirrors the V2 spec's +// `ChallengeInfo.type` enum (spec/api-spec-v2.yaml §ChallengeInfo: +// DNS_01 | HTTP_01). +type ChallengeType string + +// Challenge types. Values match RFC 8555 challenge identifiers in the +// wire casing the V2 spec uses. +const ( + ChallengeTypeDNS01 ChallengeType = "DNS_01" + ChallengeTypeHTTP01 ChallengeType = "HTTP_01" +) + +// IsValid reports whether the challenge type is recognized. +func (t ChallengeType) IsValid() bool { + switch t { + case ChallengeTypeDNS01, ChallengeTypeHTTP01: + return true + default: + return false + } +} + +// Challenge is a single domain-control challenge the domain owner must +// satisfy by publishing an artifact (a DNS TXT record or an HTTP +// resource) in the zone or site they control. Challenges are relayed +// to the owner verbatim — ANS never creates DNS records or serves +// challenge files on the owner's behalf; the pending-registration and +// renewal responses carry everything the owner needs to publish them +// themselves. +// +// Challenges originate from the certificate issuer port when a +// certificate order is created (`ServerCertificateIssuer.CreateOrder`). +// For the in-process self-signed CA the tokens are self-issued; for an +// external ACME CA (e.g. 
Let's Encrypt) the token and key +// authorization come from the provider's order and the DNS record +// value is the provider-computed digest. +type Challenge struct { + // Type selects the verification mechanism. + Type ChallengeType `json:"type"` + + // Token is the opaque challenge token issued by the certificate + // provider (or self-issued by the RA when it is its own CA). + Token string `json:"token"` + + // KeyAuthorization binds the token to the issuing account's key + // per RFC 8555 §8.1 (token || "." || base64url(JWK thumbprint)). + // Empty for self-issued challenges, which have no account binding. + KeyAuthorization string `json:"keyAuthorization,omitempty"` + + // DNSRecordName overrides the TXT record name for DNS challenges. + // Empty means the RFC 8555 default `_acme-challenge.<fqdn>`. + // Non-ACME providers with proprietary DV record names set this. + DNSRecordName string `json:"dnsRecordName,omitempty"` + + // DNSRecordValue overrides the TXT record value for DNS + // challenges. ACME providers require + // base64url(SHA-256(keyAuthorization)); self-issued challenges + // publish the raw token. Empty means the raw token. + DNSRecordValue string `json:"dnsRecordValue,omitempty"` + + // HTTPPath overrides the HTTP challenge path. Empty means the + // RFC 8555 default `/.well-known/acme-challenge/<token>`. + HTTPPath string `json:"httpPath,omitempty"` +} + +// EffectiveDNSRecordName returns the TXT record name the owner must +// publish for a DNS_01 challenge, applying the RFC 8555 default when +// the provider didn't override it. +func (c Challenge) EffectiveDNSRecordName(fqdn string) string { + if c.DNSRecordName != "" { + return c.DNSRecordName + } + return "_acme-challenge." + fqdn +} + +// EffectiveDNSRecordValue returns the TXT record value the owner must +// publish for a DNS_01 challenge. Defaults to the raw token for +// self-issued challenges. 
+func (c Challenge) EffectiveDNSRecordValue() string { + if c.DNSRecordValue != "" { + return c.DNSRecordValue + } + return c.Token +} + +// EffectiveHTTPPath returns the URL path (under the agent's FQDN) the +// owner must serve the challenge content from for an HTTP_01 +// challenge, applying the RFC 8555 default when the provider didn't +// override it. +func (c Challenge) EffectiveHTTPPath() string { + if c.HTTPPath != "" { + return c.HTTPPath + } + return "/.well-known/acme-challenge/" + c.Token +} + +// ExpectedHTTPContent returns the body the owner must serve at the +// HTTP challenge path: the key authorization when the provider binds +// the token to an account key (RFC 8555 §8.3), otherwise the raw +// token. +func (c Challenge) ExpectedHTTPContent() string { + if c.KeyAuthorization != "" { + return c.KeyAuthorization + } + return c.Token +} + +// OrderState is the lifecycle of a certificate order. It is tracked +// on the order itself, never on the agent lifecycle — `AgentLifecycle +// Status` stays exactly as the V2 spec defines it, and views like +// `RegistrationPending.status` (PENDING_CERTS) and `AgentStatus.phase` +// (CERTIFICATE_ISSUANCE) are derived from (agent status × order +// state). +type OrderState string + +// Order states. PENDING means challenges are issued and awaiting +// domain validation; ISSUING means validation passed and the provider +// is finalizing asynchronously; COMPLETED and FAILED are terminal. +const ( + OrderStatePending OrderState = "PENDING" + OrderStateIssuing OrderState = "ISSUING" + OrderStateCompleted OrderState = "COMPLETED" + OrderStateFailed OrderState = "FAILED" +) + +// CertificateOrder tracks a certificate issuance order and the +// domain-control challenges attached to it. +// +// Two shapes exist: +// +// - Provider order (CSR path): created by the configured +// `ServerCertificateIssuer` port. 
OrderRef is the provider-opaque +// handle (an ACME order URL for ACME providers, an internal id for +// the self-signed CA) used to finalize the order after the owner +// satisfies a challenge. +// - Self-issued validation order (BYOC path): no certificate is +// being issued, but domain control must still be proven before the +// registration advances. OrderRef is empty and the challenges are +// RA-issued. +type CertificateOrder struct { + OrderRef string `json:"orderRef,omitempty"` + State OrderState `json:"state,omitempty"` + Challenges []Challenge `json:"challenges,omitempty"` + ExpiresAt time.Time `json:"expiresAt,omitzero"` +} + +// NewSelfIssuedOrder builds the BYOC-path validation order from a +// pair of RA-generated tokens. The DNS record value and HTTP content +// default to the raw tokens (no account-key binding). +func NewSelfIssuedOrder(dns01Token, http01Token string, expiresAt time.Time) CertificateOrder { + return CertificateOrder{ + State: OrderStatePending, + Challenges: []Challenge{ + {Type: ChallengeTypeDNS01, Token: dns01Token}, + {Type: ChallengeTypeHTTP01, Token: http01Token}, + }, + ExpiresAt: expiresAt, + } +} + +// IsZero reports whether the order is unset — true for registrations +// that predate order persistence. +func (o *CertificateOrder) IsZero() bool { + return o.OrderRef == "" && o.State == "" && len(o.Challenges) == 0 && o.ExpiresAt.IsZero() +} + +// ChallengeOfType returns the first challenge of the given type. +func (o *CertificateOrder) ChallengeOfType(t ChallengeType) (Challenge, bool) { + for _, c := range o.Challenges { + if c.Type == t { + return c, true + } + } + return Challenge{}, false +} + +// IsExpired reports whether the order's challenge window has elapsed +// without the order reaching a terminal state. 
+func (o *CertificateOrder) IsExpired(now time.Time) bool { + if o.ExpiresAt.IsZero() { + return false + } + return now.After(o.ExpiresAt) && o.State != OrderStateCompleted && o.State != OrderStateFailed +} + +// MarkIssuing transitions PENDING → ISSUING: domain validation passed +// and the provider accepted the finalize request but has not produced +// a certificate yet. Idempotent for re-driven verify-acme calls (an +// ISSUING order stays ISSUING). +func (o *CertificateOrder) MarkIssuing() error { + if o.State == OrderStateIssuing { + return nil + } + if o.State != OrderStatePending { + return NewInvalidStateError( + "INVALID_ORDER_TRANSITION", + fmt.Sprintf("cannot mark order ISSUING from state %s", o.State), + ) + } + o.State = OrderStateIssuing + return nil +} + +// MarkCompleted transitions PENDING|ISSUING → COMPLETED: the +// certificate landed (or, for a BYOC validation order, domain control +// was proven). +func (o *CertificateOrder) MarkCompleted() error { + if o.State != OrderStatePending && o.State != OrderStateIssuing { + return NewInvalidStateError( + "INVALID_ORDER_TRANSITION", + fmt.Sprintf("cannot mark order COMPLETED from state %s", o.State), + ) + } + o.State = OrderStateCompleted + return nil +} + +// MarkFailed transitions PENDING|ISSUING → FAILED: the provider +// reported a terminal order failure (e.g. an ACME order moved to +// `invalid`). A failed order cannot be retried — the operator submits +// a new registration or renewal. 
+func (o *CertificateOrder) MarkFailed() error { + if o.State != OrderStatePending && o.State != OrderStateIssuing { + return NewInvalidStateError( + "INVALID_ORDER_TRANSITION", + fmt.Sprintf("cannot mark order FAILED from state %s", o.State), + ) + } + o.State = OrderStateFailed + return nil +} diff --git a/internal/domain/order_test.go b/internal/domain/order_test.go new file mode 100644 index 0000000..41c39ea --- /dev/null +++ b/internal/domain/order_test.go @@ -0,0 +1,208 @@ +package domain + +import ( + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestChallengeType_IsValid(t *testing.T) { + assert.True(t, ChallengeTypeDNS01.IsValid()) + assert.True(t, ChallengeTypeHTTP01.IsValid()) + assert.False(t, ChallengeType("TLS_ALPN_01").IsValid()) + assert.False(t, ChallengeType("").IsValid()) +} + +func TestChallenge_EffectiveDNSRecordName(t *testing.T) { + // RFC 8555 default when the provider didn't override. + c := Challenge{Type: ChallengeTypeDNS01, Token: "tok"} + assert.Equal(t, "_acme-challenge.agent.example.com", c.EffectiveDNSRecordName("agent.example.com")) + + // Provider override (proprietary DV record names) wins. + c.DNSRecordName = "_dnsauth.agent.example.com" + assert.Equal(t, "_dnsauth.agent.example.com", c.EffectiveDNSRecordName("agent.example.com")) +} + +func TestChallenge_EffectiveDNSRecordValue(t *testing.T) { + // Self-issued: raw token. + c := Challenge{Type: ChallengeTypeDNS01, Token: "tok"} + assert.Equal(t, "tok", c.EffectiveDNSRecordValue()) + + // ACME providers publish the key-authorization digest instead. 
+ c.DNSRecordValue = "digest-of-keyauth" + assert.Equal(t, "digest-of-keyauth", c.EffectiveDNSRecordValue()) +} + +func TestChallenge_EffectiveHTTPPath(t *testing.T) { + c := Challenge{Type: ChallengeTypeHTTP01, Token: "tok"} + assert.Equal(t, "/.well-known/acme-challenge/tok", c.EffectiveHTTPPath()) + + c.HTTPPath = "/.well-known/pki-validation/provider.html" + assert.Equal(t, "/.well-known/pki-validation/provider.html", c.EffectiveHTTPPath()) +} + +func TestChallenge_ExpectedHTTPContent(t *testing.T) { + // Self-issued: raw token (no account binding). + c := Challenge{Type: ChallengeTypeHTTP01, Token: "tok"} + assert.Equal(t, "tok", c.ExpectedHTTPContent()) + + // Account-bound (ACME): the key authorization. + c.KeyAuthorization = "tok.thumbprint" + assert.Equal(t, "tok.thumbprint", c.ExpectedHTTPContent()) +} + +func TestNewSelfIssuedOrder(t *testing.T) { + exp := time.Now().Add(time.Hour) + o := NewSelfIssuedOrder("dns-tok", "http-tok", exp) + assert.Equal(t, OrderStatePending, o.State) + assert.Empty(t, o.OrderRef) + assert.Equal(t, exp, o.ExpiresAt) + assert.False(t, o.IsZero()) + + dns01, ok := o.ChallengeOfType(ChallengeTypeDNS01) + require.True(t, ok) + assert.Equal(t, "dns-tok", dns01.Token) + assert.Empty(t, dns01.KeyAuthorization) + + http01, ok := o.ChallengeOfType(ChallengeTypeHTTP01) + require.True(t, ok) + assert.Equal(t, "http-tok", http01.Token) +} + +func TestCertificateOrder_IsZero(t *testing.T) { + for _, tc := range []struct { + order CertificateOrder + want bool + }{ + {CertificateOrder{}, true}, + {CertificateOrder{OrderRef: "ref"}, false}, + {CertificateOrder{State: OrderStatePending}, false}, + {CertificateOrder{Challenges: []Challenge{{}}}, false}, + {CertificateOrder{ExpiresAt: time.Now()}, false}, + } { + assert.Equal(t, tc.want, tc.order.IsZero(), "%+v", tc.order) + } +} + +func TestCertificateOrder_ChallengeOfType_Missing(t *testing.T) { + o := CertificateOrder{Challenges: []Challenge{{Type: ChallengeTypeDNS01, Token: "t"}}} + _, ok 
:= o.ChallengeOfType(ChallengeTypeHTTP01) + assert.False(t, ok) +} + +func TestCertificateOrder_IsExpired(t *testing.T) { + now := time.Now() + + // Zero expiry never expires (legacy rows). + zero := CertificateOrder{State: OrderStatePending} + assert.False(t, zero.IsExpired(now)) + + // Past expiry while non-terminal → expired. + past := CertificateOrder{State: OrderStatePending, ExpiresAt: now.Add(-time.Minute)} + assert.True(t, past.IsExpired(now)) + + // Future expiry → not expired. + future := CertificateOrder{State: OrderStatePending, ExpiresAt: now.Add(time.Minute)} + assert.False(t, future.IsExpired(now)) + + // Terminal states never report expired — the order resolved + // before the window closed. + done := CertificateOrder{State: OrderStateCompleted, ExpiresAt: now.Add(-time.Minute)} + assert.False(t, done.IsExpired(now)) + failed := CertificateOrder{State: OrderStateFailed, ExpiresAt: now.Add(-time.Minute)} + assert.False(t, failed.IsExpired(now)) +} + +func TestCertificateOrder_MarkIssuing(t *testing.T) { + o := NewSelfIssuedOrder("d", "h", time.Now().Add(time.Hour)) + require.NoError(t, o.MarkIssuing()) + assert.Equal(t, OrderStateIssuing, o.State) + + // Idempotent for re-driven verify-acme calls. + require.NoError(t, o.MarkIssuing()) + assert.Equal(t, OrderStateIssuing, o.State) + + // COMPLETED → ISSUING is invalid. + done := CertificateOrder{State: OrderStateCompleted} + assert.ErrorIs(t, done.MarkIssuing(), ErrInvalidState) +} + +func TestCertificateOrder_MarkCompleted(t *testing.T) { + // From PENDING (synchronous issuers / BYOC validation). + o := NewSelfIssuedOrder("d", "h", time.Now().Add(time.Hour)) + require.NoError(t, o.MarkCompleted()) + assert.Equal(t, OrderStateCompleted, o.State) + + // From ISSUING (re-driven async finalize). + o2 := CertificateOrder{State: OrderStateIssuing} + require.NoError(t, o2.MarkCompleted()) + assert.Equal(t, OrderStateCompleted, o2.State) + + // Terminal → COMPLETED is invalid. 
+ assert.ErrorIs(t, o.MarkCompleted(), ErrInvalidState) + failed := CertificateOrder{State: OrderStateFailed} + assert.ErrorIs(t, failed.MarkCompleted(), ErrInvalidState) +} + +func TestCertificateOrder_MarkFailed(t *testing.T) { + o := NewSelfIssuedOrder("d", "h", time.Now().Add(time.Hour)) + require.NoError(t, o.MarkFailed()) + assert.Equal(t, OrderStateFailed, o.State) + + o2 := CertificateOrder{State: OrderStateIssuing} + require.NoError(t, o2.MarkFailed()) + assert.Equal(t, OrderStateFailed, o2.State) + + // Terminal → FAILED is invalid. + assert.ErrorIs(t, o.MarkFailed(), ErrInvalidState) + done := CertificateOrder{State: OrderStateCompleted} + assert.ErrorIs(t, done.MarkFailed(), ErrInvalidState) +} + +func TestRenewalValidation_ChallengeOfType(t *testing.T) { + v := RenewalValidation{Challenges: []Challenge{ + {Type: ChallengeTypeDNS01, Token: "d"}, + {Type: ChallengeTypeHTTP01, Token: "h"}, + }} + dns01, ok := v.ChallengeOfType(ChallengeTypeDNS01) + require.True(t, ok) + assert.Equal(t, "d", dns01.Token) + var empty RenewalValidation + _, ok = empty.ChallengeOfType(ChallengeTypeDNS01) + assert.False(t, ok) +} + +func TestTLSARecordForCert(t *testing.T) { + rec := TLSARecordForCert("agent.example.com", "abc123") + assert.Equal(t, "_443._tcp.agent.example.com", rec.Name) + assert.Equal(t, DNSRecordTLSA, rec.Type) + assert.Equal(t, "3 1 1 abc123", rec.Value) + assert.Equal(t, PurposeCertificateBinding, rec.Purpose) + assert.False(t, rec.Required) +} + +// TestNewRenewalValidation_ClampsToOrderExpiry pins the window +// clamping: when the provider's order outlives the standard renewal +// window the renewal keeps its own deadline, and when the order ends +// first the validation honors the order's expiry. +func TestNewRenewalValidation_ClampsToOrderExpiry(t *testing.T) { + now := time.Now() + + // Order expires after the 7d renewal window → renewal window wins. 
+ longOrder := NewSelfIssuedOrder("d", "h", now.Add(30*24*time.Hour)) + r := NewCSRRenewal("a", 1, "csr", longOrder, now) + assert.Equal(t, now.Add(renewalExpiryDuration), r.Validation.ExpiresAt) + + // Zero order expiry → renewal window. + zeroOrder := CertificateOrder{State: OrderStatePending} + r2 := NewCSRRenewal("a", 1, "csr", zeroOrder, now) + assert.Equal(t, now.Add(renewalExpiryDuration), r2.Validation.ExpiresAt) + + // Order expires before the window → clamped to the order. + shortOrder := NewSelfIssuedOrder("d", "h", now.Add(time.Hour)) + r3 := NewCSRRenewal("a", 1, "csr", shortOrder, now) + assert.Equal(t, now.Add(time.Hour), r3.Validation.ExpiresAt) + assert.Empty(t, r3.Validation.OrderRef) +} diff --git a/internal/domain/renewal.go b/internal/domain/renewal.go index 20aa6c8..f262335 100644 --- a/internal/domain/renewal.go +++ b/internal/domain/renewal.go @@ -8,13 +8,29 @@ import ( const renewalExpiryDuration = 7 * 24 * time.Hour // 7 days. // RenewalValidation tracks the ACME challenge state for a renewal. +// +// OrderRef and Challenges mirror the registration aggregate's +// CertOrder: CSR renewals carry the provider order created by +// `ServerCertificateIssuer.CreateOrder`; BYOC renewals carry +// self-issued challenges (OrderRef empty) because domain control must +// still be proven before the operator's certificate goes live. 
type RenewalValidation struct { - DNS01ChallengeToken string `json:"dns01ChallengeToken"` - HTTP01ChallengeToken string `json:"http01ChallengeToken"` - Status ValidationStatus `json:"status"` - CreatedAt time.Time `json:"createdAt"` - ExpiresAt time.Time `json:"expiresAt"` - UpdatedAt time.Time `json:"updatedAt"` + OrderRef string `json:"orderRef,omitempty"` + Challenges []Challenge `json:"challenges,omitempty"` + Status ValidationStatus `json:"status"` + CreatedAt time.Time `json:"createdAt"` + ExpiresAt time.Time `json:"expiresAt"` + UpdatedAt time.Time `json:"updatedAt"` +} + +// ChallengeOfType returns the first challenge of the given type. +func (v RenewalValidation) ChallengeOfType(t ChallengeType) (Challenge, bool) { + for _, c := range v.Challenges { + if c.Type == t { + return c, true + } + } + return Challenge{}, false } // IsExpiredWithoutVerification returns true if the validation expired before being verified. @@ -68,15 +84,19 @@ type ServerCertificateRenewal struct { // NewBYOCRenewal creates a new BYOC server certificate renewal. // The operator supplies the already-issued certificate; the renewal -// only validates domain control via ACME then flips the registration's -// ServerCert to the new leaf. +// only validates domain control via the order's challenges then flips +// the registration's ServerCert to the new leaf. +// +// The order is self-issued (see NewSelfIssuedOrder) — no certificate +// provider participates in a BYOC renewal. The renewal's validation +// window is the shorter of the standard renewal expiry and the +// order's own expiry. 
func NewBYOCRenewal( agentID string, registrationID int64, byocCertPEM string, byocChainPEM string, - dns01Token string, - http01Token string, + order CertificateOrder, now time.Time, ) *ServerCertificateRenewal { return &ServerCertificateRenewal{ @@ -86,14 +106,7 @@ func NewBYOCRenewal( ByocCertPEM: byocCertPEM, ByocChainPEM: byocChainPEM, CreatedAt: now, - Validation: RenewalValidation{ - DNS01ChallengeToken: dns01Token, - HTTP01ChallengeToken: http01Token, - Status: ValidationPending, - CreatedAt: now, - ExpiresAt: now.Add(renewalExpiryDuration), - UpdatedAt: now, - }, + Validation: newRenewalValidation(order, now), } } @@ -102,12 +115,16 @@ func NewBYOCRenewal( // status=PENDING — this struct only references it by ID. // Matches the reference's `AgentServerCertificateRenewal` with // `renewalType = SERVER_CSR`. +// +// The order comes from the configured `ServerCertificateIssuer` port +// (`CreateOrder`), so the challenges relayed to the operator are the +// provider's own — for an ACME provider that means the provider's +// token + key authorization, not RA-invented values. func NewCSRRenewal( agentID string, registrationID int64, csrID string, - dns01Token string, - http01Token string, + order CertificateOrder, now time.Time, ) *ServerCertificateRenewal { return &ServerCertificateRenewal{ @@ -116,14 +133,27 @@ func NewCSRRenewal( RenewalType: RenewalTypeCSR, ServerCsrID: csrID, CreatedAt: now, - Validation: RenewalValidation{ - DNS01ChallengeToken: dns01Token, - HTTP01ChallengeToken: http01Token, - Status: ValidationPending, - CreatedAt: now, - ExpiresAt: now.Add(renewalExpiryDuration), - UpdatedAt: now, - }, + Validation: newRenewalValidation(order, now), + } +} + +// newRenewalValidation builds the embedded validation block from an +// order. 
The validation window is clamped to the order's expiry when +// the provider's order ends before the standard renewal window — +// relaying a challenge the provider will no longer accept would send +// the operator on a dead-end errand. +func newRenewalValidation(order CertificateOrder, now time.Time) RenewalValidation { + expires := now.Add(renewalExpiryDuration) + if !order.ExpiresAt.IsZero() && order.ExpiresAt.Before(expires) { + expires = order.ExpiresAt + } + return RenewalValidation{ + OrderRef: order.OrderRef, + Challenges: order.Challenges, + Status: ValidationPending, + CreatedAt: now, + ExpiresAt: expires, + UpdatedAt: now, } } @@ -132,9 +162,13 @@ func (r *ServerCertificateRenewal) IsExpired(now time.Time) bool { return r.Validation.IsExpiredWithoutVerification(now) } -// IsCompleted returns true if the renewal reached a terminal state. +// IsCompleted reports whether the renewal reached its terminal +// completed/failed state — i.e. CompletedAt is set (MarkCompleted or +// MarkFailed). It is NOT true merely because validation was verified: +// a CSR renewal whose order is still ISSUING is VERIFIED but not yet +// completed, and the operator must re-POST verify-acme to finish it. func (r *ServerCertificateRenewal) IsCompleted() bool { - return !r.CompletedAt.IsZero() || r.Validation.Status == ValidationVerified + return !r.CompletedAt.IsZero() } // MarkCompleted marks the renewal as successfully completed. diff --git a/internal/domain/renewal_test.go b/internal/domain/renewal_test.go index 9e08fd2..c8bab0a 100644 --- a/internal/domain/renewal_test.go +++ b/internal/domain/renewal_test.go @@ -14,7 +14,8 @@ func TestNewCSRRenewal(t *testing.T) { // rather than carrying the cert bytes inline. Reference: // AgentServerCertificateRenewal with renewalType=SERVER_CSR. 
now := time.Now() - r := NewCSRRenewal("agent-1", 42, "csr-9", "dns-tok", "http-tok", now) + r := NewCSRRenewal("agent-1", 42, "csr-9", + NewSelfIssuedOrder("dns-tok", "http-tok", now.Add(24*time.Hour)), now) require.NotNil(t, r) assert.Equal(t, "agent-1", r.AgentID) assert.Equal(t, int64(42), r.RegistrationID) @@ -24,23 +25,34 @@ func TestNewCSRRenewal(t *testing.T) { assert.Empty(t, r.ByocCertPEM) assert.Empty(t, r.ByocChainPEM) assert.Equal(t, ValidationPending, r.Validation.Status) - assert.Equal(t, "dns-tok", r.Validation.DNS01ChallengeToken) - assert.Equal(t, "http-tok", r.Validation.HTTP01ChallengeToken) - assert.True(t, r.Validation.ExpiresAt.After(now)) + dns01, ok := r.Validation.ChallengeOfType(ChallengeTypeDNS01) + require.True(t, ok) + assert.Equal(t, "dns-tok", dns01.Token) + http01, ok := r.Validation.ChallengeOfType(ChallengeTypeHTTP01) + require.True(t, ok) + assert.Equal(t, "http-tok", http01.Token) + // The validation window clamps to the order expiry when the + // order ends before the standard renewal window. 
+ assert.Equal(t, now.Add(24*time.Hour), r.Validation.ExpiresAt) assert.Equal(t, now, r.CreatedAt) } func TestNewBYOCRenewal(t *testing.T) { now := time.Now() - r := NewBYOCRenewal("agent-1", 42, "LEAF", "CHAIN", "dns-tok", "http-tok", now) + r := NewBYOCRenewal("agent-1", 42, "LEAF", "CHAIN", + NewSelfIssuedOrder("dns-tok", "http-tok", now.Add(24*time.Hour)), now) assert.Equal(t, "agent-1", r.AgentID) assert.Equal(t, int64(42), r.RegistrationID) assert.Equal(t, RenewalTypeBYOC, r.RenewalType) assert.Equal(t, "LEAF", r.ByocCertPEM) assert.Equal(t, "CHAIN", r.ByocChainPEM) assert.Equal(t, ValidationPending, r.Validation.Status) - assert.Equal(t, "dns-tok", r.Validation.DNS01ChallengeToken) - assert.Equal(t, "http-tok", r.Validation.HTTP01ChallengeToken) + dns01, ok := r.Validation.ChallengeOfType(ChallengeTypeDNS01) + require.True(t, ok) + assert.Equal(t, "dns-tok", dns01.Token) + http01, ok := r.Validation.ChallengeOfType(ChallengeTypeHTTP01) + require.True(t, ok) + assert.Equal(t, "http-tok", http01.Token) assert.True(t, r.Validation.ExpiresAt.After(now)) } @@ -79,14 +91,14 @@ func TestRenewalValidation_MarkFailed(t *testing.T) { func TestServerCertificateRenewal_IsExpired(t *testing.T) { now := time.Now() - r := NewBYOCRenewal("a", 1, "c", "", "d", "h", now) + r := NewBYOCRenewal("a", 1, "c", "", NewSelfIssuedOrder("d", "h", now.Add(30*24*time.Hour)), now) assert.False(t, r.IsExpired(now)) assert.True(t, r.IsExpired(now.Add(8*24*time.Hour))) } func TestServerCertificateRenewal_Completion(t *testing.T) { now := time.Now() - r := NewBYOCRenewal("a", 1, "c", "", "d", "h", now) + r := NewBYOCRenewal("a", 1, "c", "", NewSelfIssuedOrder("d", "h", now.Add(30*24*time.Hour)), now) assert.False(t, r.IsCompleted()) require.NoError(t, r.MarkCompleted(now.Add(time.Hour))) @@ -100,16 +112,21 @@ func TestServerCertificateRenewal_Completion(t *testing.T) { func TestServerCertificateRenewal_Fail(t *testing.T) { now := time.Now() - r := NewBYOCRenewal("a", 1, "c", "", "d", "h", 
now) + r := NewBYOCRenewal("a", 1, "c", "", NewSelfIssuedOrder("d", "h", now.Add(30*24*time.Hour)), now) require.NoError(t, r.MarkFailed("boom", now.Add(time.Second))) assert.Equal(t, "boom", r.FailureReason) assert.False(t, r.CompletedAt.IsZero()) } func TestServerCertificateRenewal_UpdateValidationStatus(t *testing.T) { - r := NewBYOCRenewal("a", 1, "c", "", "d", "h", time.Now()) + r := NewBYOCRenewal("a", 1, "c", "", NewSelfIssuedOrder("d", "h", time.Now().Add(30*24*time.Hour)), time.Now()) newV, _ := r.Validation.MarkVerified(time.Now()) r.UpdateValidationStatus(newV) assert.Equal(t, ValidationVerified, r.Validation.Status) - assert.True(t, r.IsCompleted()) // verified validation counts as completed + // Verified validation alone is NOT completion: a CSR renewal whose + // order is still ISSUING is verified-but-incomplete until + // MarkCompleted runs. + assert.False(t, r.IsCompleted()) + require.NoError(t, r.MarkCompleted(time.Now())) + assert.True(t, r.IsCompleted()) } diff --git a/internal/port/certauthority.go b/internal/port/certauthority.go index 00d6f58..2ad2007 100644 --- a/internal/port/certauthority.go +++ b/internal/port/certauthority.go @@ -2,6 +2,7 @@ package port import ( "context" + "errors" "time" "github.com/godaddy/ans/internal/domain" @@ -22,13 +23,36 @@ type ValidatedCert struct { SerialNumber string } -// IssuedCert is returned by the identity CA after signing a CSR. +// IssuedCert is returned by a certificate issuer after signing a CSR. type IssuedCert struct { CertPEM string ChainPEM string SerialNumber string - ExpiresAt time.Time - IssuedAt time.Time + // CertificateRef is the issuer's opaque handle for the issued + // certificate — an ACME certificate URL, a cloud CA resource name + // (GCP CAS) or ARN (AWS PCA), empty for the in-process self-signed + // CAs. Persisted alongside the certificate because some providers + // revoke by handle rather than by serial. 
+ CertificateRef string + ExpiresAt time.Time + IssuedAt time.Time +} + +// RevokeCertificateRequest identifies a certificate to revoke at its +// issuer. SerialNumber (lowercase hex) is always populated; +// CertificateRef carries the provider handle captured at issuance +// when one exists. Implementations use whichever their API keys on — +// serial for the self-signed CA, AWS PCA, and Vault; resource name +// for GCP CAS. +// +// Revocation MUST be idempotent: revoking an already-revoked +// certificate returns nil. The caller performs CA revocation before +// committing its own transaction, so a crash between the two is +// healed by a retried call. +type RevokeCertificateRequest struct { + SerialNumber string + CertificateRef string + Reason domain.RevocationReason } // CertificateValidator validates operator-provided server certificates @@ -64,9 +88,38 @@ type CertificateValidator interface { ) error } -// IdentityCertificateAuthority issues identity certificates from the -// system's private CA. The CA binds the versioned ANS name as a URI SAN, -// creating the verifiable link between an agent and its declared version. +// IdentityCertificateAuthority issues identity certificates from a +// PRIVATE trust root — identity certs are never publicly issued. The +// CA binds the versioned ANS name as a URI SAN, creating the +// verifiable link between an agent and its declared version. +// +// Because the trust root is private, no domain-control challenge +// lifecycle exists on this port: domain ownership is proven by the +// server-certificate flow (the verify-acme gate, plus the public +// provider's own validation on the ACME path) BEFORE the RA asks this +// port to sign anything. Issuance is therefore a plain CSR-in / +// cert-out call. +// +// The in-process default is the file-backed SelfCA. Cloud private CAs +// (AWS Private CA, GCP Private CA Service, Vault PKI) slot in at this +// boundary; adapter notes for them: +// +// - All three accept CSRs. 
AWS PCA issues asynchronously — the +// adapter does a short bounded poll (IssueCertificate → +// GetCertificate), the same in-call pattern the ACME server +// issuer uses. No pending/order state is needed: no third party +// or operator action intervenes, so a retryable error suffices +// and the caller's idempotent re-entry re-drives. +// - The URI SAN is the load-bearing field. The ansName parameter is +// passed alongside the CSR so adapters can either verify the +// CSR's URI SAN (the SelfCA approach) or stamp it API-side. The +// provider must be configured to permit URI SANs and a +// ClientAuth EKU (PCA: a CSR/API-passthrough template; CAS: pool +// issuance policy; Vault: role allowed_uri_sans). +// - Issuance should be idempotent under retry: derive the +// provider's idempotency token (PCA IdempotencyToken, CAS +// request_id) from a hash of the CSR PEM, which is stable per +// CSR row. type IdentityCertificateAuthority interface { // IssueIdentityCertificate signs the given identity CSR and returns // the resulting certificate plus chain. @@ -76,13 +129,14 @@ type IdentityCertificateAuthority interface { ansName string, ) (*IssuedCert, error) - // RevokeCertificate marks a previously issued certificate as revoked. - // Implementations should track revocations so issued certs can be - // cross-referenced. + // RevokeCertificate revokes a previously issued certificate at the + // CA, so private CRL/OCSP distribution reflects the revocation — + // the RA's own database flip and transparency-log emit are the + // caller's responsibility. Must be idempotent (see + // RevokeCertificateRequest). RevokeCertificate( ctx context.Context, - serialNumber string, - reason domain.RevocationReason, + req RevokeCertificateRequest, ) error // GetCACertificate returns the CA's root certificate PEM. 
@@ -90,31 +144,87 @@ type IdentityCertificateAuthority interface { GetCACertificate(ctx context.Context) (string, error) } -// ServerCertificateAuthority issues server-auth TLS certificates from -// a private CA that is distinct from the identity CA (so the two can -// be rotated and key-managed independently). +// ErrOrderPending is returned by ServerCertificateIssuer.FinalizeOrder +// when the provider accepted the finalize request but has not produced +// a certificate yet (e.g. an ACME order sitting in `processing`). The +// caller persists the order as ISSUING and re-drives the finalize on a +// subsequent verify-acme call. Wrap with %w so errors.Is matches. +var ErrOrderPending = errors.New("certificate order pending") + +// ErrOrderFailed is returned by ServerCertificateIssuer.FinalizeOrder +// when the provider reported a terminal order failure (e.g. an ACME +// order moved to `invalid`). The order cannot be retried; the operator +// must submit a new registration or renewal. Wrap with %w so errors.Is +// matches. +var ErrOrderFailed = errors.New("certificate order failed") + +// FinalizeOrderRequest carries everything an issuer needs to complete +// a previously created order. +type FinalizeOrderRequest struct { + // OrderRef is the provider-opaque handle returned by CreateOrder + // (an ACME order URL, an internal id, …). + OrderRef string + // CSRPEM is the operator-submitted server CSR. Per RFC 8555 the + // CSR is presented at finalize time, not at order creation. + CSRPEM string + // FQDN is the agent host the certificate must cover. + FQDN string + // Verified lists the challenge types whose artifacts the RA's own + // pre-flight check found published. ACME implementations MUST only + // answer a verified challenge: telling the provider to validate an + // unsatisfied challenge invalidates the authorization — and with it + // the whole order. Self-signed implementations may ignore it (the + // RA's gate is authoritative there). 
+ Verified []domain.ChallengeType +} + +// ServerCertificateIssuer issues server-auth TLS certificates through +// a certificate-order lifecycle: // -// The reference RA delegates to an internal ACME-style cert service; -// local and LF-submittable deployments ship a file-backed self-signed -// CA under `internal/adapter/cert`. The port is stable so cloud-adapter -// contributions (AWS Private CA, GCP CAS, a hosted ACME CA, etc.) can -// replace the implementation without touching the service layer. -type ServerCertificateAuthority interface { - // IssueServerCertificate signs the given server CSR for the - // agent's FQDN. The CSR must carry the FQDN as a DNS SAN (the - // standard TLS server-auth shape); implementations call - // CertificateValidator.ValidateServerCSR before touching the - // signing key. Returns the leaf certificate + chain PEM + validity - // metadata. - IssueServerCertificate( - ctx context.Context, - csrPEM string, - fqdn string, - ) (*IssuedCert, error) +// 1. CreateOrder — called at registration / renewal-submission time. +// The returned domain-control challenges are relayed verbatim to +// the domain owner in the pending response. ANS never publishes +// challenge artifacts on the owner's behalf. +// 2. FinalizeOrder — called from verify-acme after the RA confirmed +// at least one challenge artifact is published. Synchronous +// issuers return the certificate immediately; asynchronous ones +// return ErrOrderPending and the order is re-driven on the next +// verify-acme call. +// +// The local implementation is the file-backed self-signed CA under +// `internal/adapter/cert`, which self-issues its challenge tokens and +// finalizes in-process. External providers slot in at this boundary +// without touching the service layer — e.g. 
an ACME adapter (Let's +// Encrypt) maps CreateOrder to new-order (relaying the provider's +// token + key authorization and the computed DNS digest), and +// FinalizeOrder to challenge-answer → poll → finalize → download. +// Proprietary CAs (managed-CA APIs, AWS Private CA, GCP CAS) map the +// same way, returning no challenges from CreateOrder when they perform +// no domain validation of their own. +// +// The server-cert trust root is distinct from the identity CA so the +// two can be rotated and key-managed independently. +type ServerCertificateIssuer interface { + // CreateOrder opens an issuance order for the agent's FQDN and + // returns the provider's domain-control challenges. The CSR is NOT + // required yet (matching ACME, where the CSR is presented at + // finalize). The returned order is persisted on the registration + // or renewal aggregate. + CreateOrder(ctx context.Context, fqdn string) (*domain.CertificateOrder, error) + + // FinalizeOrder completes the order: the issuer validates domain + // control by its own rules (or trusts the RA's gate, for the + // self-signed CA), signs the CSR, and returns the leaf + chain. + // Returns ErrOrderPending while an asynchronous provider is still + // processing, and ErrOrderFailed when the order is terminally + // dead. + FinalizeOrder(ctx context.Context, req FinalizeOrderRequest) (*IssuedCert, error) // GetCACertificate returns the server CA's root certificate PEM. // Distinct from the identity CA's root — operators publish this // separately so relying parties building TLS trust stores can - // trust it without trusting the identity CA. + // trust it without trusting the identity CA. Publicly trusted + // providers (ACME) return their chain root; it is informational + // there since relying parties already hold it in system stores. 
GetCACertificate(ctx context.Context) (string, error) } diff --git a/internal/port/challenge.go b/internal/port/challenge.go new file mode 100644 index 0000000..db0f610 --- /dev/null +++ b/internal/port/challenge.go @@ -0,0 +1,20 @@ +package port + +import "context" + +// HTTPChallengeVerifier checks that the domain owner has published an +// HTTP-01 challenge artifact at the expected path under their FQDN. +// Like the DNSVerifier, it is verification-only: ANS never serves +// challenge files on the owner's behalf — the owner publishes the +// artifact themselves from the challenge info relayed in the pending +// response. +type HTTPChallengeVerifier interface { + // VerifyHTTPChallenge fetches `http://` and reports + // whether the response body matches the expected content (the key + // authorization for account-bound challenges, the raw token + // otherwise). Returns (false, nil) when the artifact is missing or + // mismatched; a non-nil error indicates a systemic failure that + // prevented checking at all (which callers treat the same as + // not-published, since an unreachable host cannot prove control). + VerifyHTTPChallenge(ctx context.Context, fqdn, path, expectedContent string) (bool, error) +} diff --git a/internal/port/store.go b/internal/port/store.go index 38c79d9..4b8c656 100644 --- a/internal/port/store.go +++ b/internal/port/store.go @@ -2,6 +2,7 @@ package port import ( "context" + "time" "github.com/godaddy/ans/internal/domain" ) @@ -60,6 +61,23 @@ type AgentStore interface { filter ListFilter, ) (*CursorPage[*domain.AgentRegistration], error) + // ExpireLapsedPendingValidation atomically transitions to EXPIRED + // every registration that is still PENDING_VALIDATION with a + // PENDING certificate order whose challenge window lapsed at or + // before now, returning the number transitioned. The agent-expiry + // sweeper uses it to honor the spec's "PENDING_VALIDATION + // registrations are not cancellable and will auto-expire". 
+ // + // The transition is a single guarded write — not a read-then-save + // — so a verify-acme that advances the same row (to PENDING_DNS, + // or to a non-PENDING order state) between scans cannot be + // clobbered: such a row simply no longer matches. In-flight + // (order ISSUING) and terminally-failed (order FAILED) + // registrations are excluded; they leave PENDING_VALIDATION + // through the cancel route instead, per domain.Cancel's + // eligibility rule. + ExpireLapsedPendingValidation(ctx context.Context, now time.Time) (int64, error) + // Delete removes the registration with the given ID. Used only for // administrative cleanup; normal lifecycle uses Revoke. Delete(ctx context.Context, id int64) error diff --git a/internal/ra/handler/dto.go b/internal/ra/handler/dto.go index d2d3588..e17a019 100644 --- a/internal/ra/handler/dto.go +++ b/internal/ra/handler/dto.go @@ -119,10 +119,57 @@ func mapAgentDetails(res *service.DetailResult, r *http.Request, tlPublicBaseURL // buildV1RegistrationPending. Agents still driving validation/DNS // expose the outstanding challenges + DNS records needed to // progress; terminal states omit the block. +// +// The block's status is a registration-flow status, NOT the agent +// lifecycle status: while an asynchronous issuer finalizes the +// certificate order the lifecycle stays PENDING_VALIDATION but the +// flow reports PENDING_CERTS (per the spec's RegistrationPending +// enum), with WAIT guidance pointing back at verify-acme. func buildRegistrationPendingBlock(reg *domain.AgentRegistration, r *http.Request, tlPublicBaseURL string) *registrationPendingResponse { switch reg.Status { case domain.StatusPendingValidation: base := schemeOf(r) + "://" + r.Host + "/v2/ans/agents/" + reg.AgentID + if reg.CertOrder.State == domain.OrderStateFailed { + // Terminal provider failure. 
The dead challenges are not + // worth relaying, and CONFIGURE_DNS/VALIDATE_DOMAIN would + // loop the operator into a verify-acme that only returns + // CERT_ORDER_FAILED. The actionable step is to cancel and + // register a new version (the ANS name is immutable once + // used). + return ®istrationPendingResponse{ + AgentID: reg.AgentID, + Status: registrationStatusPendingCerts, + AnsName: reg.AnsName.String(), + NextSteps: []nextStepDTO{ + {Action: "CANCEL", + Description: "Certificate issuance failed — cancel this registration (POST /revoke) and register a new version", + Endpoint: base + "/revoke"}, + }, + ExpiresAt: rfc3339Zero(reg.CertOrder.ExpiresAt), + Links: []linkDTO{ + {Rel: "self", Href: base}, + }, + } + } + if reg.CertOrder.State == domain.OrderStateIssuing { + // Domain control proven; the issuer is still finalizing. + // No challenges (already answered), no production DNS + // records (the TLSA value needs the cert). + return ®istrationPendingResponse{ + AgentID: reg.AgentID, + Status: registrationStatusPendingCerts, + AnsName: reg.AnsName.String(), + NextSteps: []nextStepDTO{ + {Action: "WAIT", + Description: "Certificate issuance in progress — POST verify-acme again to check for completion", + Endpoint: base + "/verify-acme"}, + }, + ExpiresAt: rfc3339Zero(reg.CertOrder.ExpiresAt), + Links: []linkDTO{ + {Rel: "self", Href: base}, + }, + } + } // PENDING_VALIDATION carries no production DNS records — those // only materialize after verify-acme issues the certs that the // TLSA record fingerprints. The ACME challenge itself rides in @@ -131,19 +178,20 @@ func buildRegistrationPendingBlock(reg *domain.AgentRegistration, r *http.Reques // should publish production records too, which they can't // without certs in hand. 
return ®istrationPendingResponse{ + AgentID: reg.AgentID, Status: string(reg.Status), AnsName: reg.AnsName.String(), Challenges: buildRegistrationChallenges(reg), DNSRecords: nil, NextSteps: []nextStepDTO{ {Action: "CONFIGURE_DNS", - Description: "Publish the ACME DNS-01 challenge TXT record listed in challenges[]", + Description: "Publish one challenge artifact from challenges[]: the DNS-01 TXT record, or the HTTP-01 resource at its httpPath", Endpoint: base + "/verify-acme"}, {Action: "VALIDATE_DOMAIN", - Description: "Call POST /v2/ans/agents/{agentId}/verify-acme once the challenge record is live", + Description: "Call POST /v2/ans/agents/{agentId}/verify-acme once the challenge artifact is live", Endpoint: base + "/verify-acme"}, }, - ExpiresAt: rfc3339Zero(reg.ACMEChallenge.ExpiresAt), + ExpiresAt: rfc3339Zero(reg.CertOrder.ExpiresAt), Links: []linkDTO{ {Rel: "self", Href: base}, }, @@ -163,6 +211,7 @@ func buildRegistrationPendingBlock(reg *domain.AgentRegistration, r *http.Reques }) } return ®istrationPendingResponse{ + AgentID: reg.AgentID, Status: string(reg.Status), AnsName: reg.AnsName.String(), DNSRecords: dnsRecords, @@ -171,7 +220,7 @@ func buildRegistrationPendingBlock(reg *domain.AgentRegistration, r *http.Reques Description: "Verify that all required DNS records are configured", Endpoint: base + "/verify-dns"}, }, - ExpiresAt: rfc3339Zero(reg.ACMEChallenge.ExpiresAt), + ExpiresAt: rfc3339Zero(reg.CertOrder.ExpiresAt), Links: []linkDTO{ {Rel: "self", Href: base}, }, @@ -181,6 +230,12 @@ func buildRegistrationPendingBlock(reg *domain.AgentRegistration, r *http.Reques } } +// registrationStatusPendingCerts is the registration-flow status +// reported while a certificate order is finalizing. It exists only in +// the RegistrationPending view (spec `RegistrationPending.status` +// enum) — the agent lifecycle enum deliberately does not contain it. 
+const registrationStatusPendingCerts = "PENDING_CERTS" + // ----- Certificate DTO (matches V2 spec CertificateResponse §1324) ----- type certificateResponse struct { @@ -278,18 +333,19 @@ type serverCertRenewalRequest struct { ServerCertificateChainPEM string `json:"serverCertificateChainPEM,omitempty"` } -// challengeInfo mirrors V2 `ChallengeInfo` — the parts the client -// needs to publish the DNS-01 or HTTP-01 challenge. Minimal shape; -// the reference has more knobs but the V2 spec only requires -// recordName/recordType/recordValue/url/expectedResponse/expiresAt. -// We omit-empty so DNS-01-only vs HTTP-01-only responses stay clean. +// challengeInfo mirrors the V2 `ChallengeInfo` schema — the same +// shape the registration lane's challenges[] carries: type, token, +// keyAuthorization, dnsRecord, httpPath, expiresAt. The renewal +// responses reference ChallengeInfo via $ref, so the field names +// must match it exactly. We omit-empty so DNS-01 entries don't carry +// an httpPath and vice versa. type challengeInfo struct { - RecordName string `json:"recordName,omitempty"` - RecordType string `json:"recordType,omitempty"` - RecordValue string `json:"recordValue,omitempty"` - URL string `json:"url,omitempty"` - ExpectedResponse string `json:"expectedResponse,omitempty"` - ExpiresAt string `json:"expiresAt"` + Type string `json:"type"` + Token string `json:"token"` + KeyAuthorization string `json:"keyAuthorization,omitempty"` + DNSRecord *challengeDNSRecordDTO `json:"dnsRecord,omitempty"` + HTTPPath string `json:"httpPath,omitempty"` + ExpiresAt string `json:"expiresAt"` } type renewalChallenges struct { @@ -381,12 +437,19 @@ type agentStatus struct { ExpiresAt string `json:"expiresAt,omitempty"` } -// phaseFromStatus maps a lifecycle status to the closest-matching -// V2 phase. Reference semantics: PENDING_VALIDATION → DOMAIN_VALIDATION, -// PENDING_DNS → DNS_PROVISIONING, ACTIVE → COMPLETED. 
-func phaseFromStatus(s domain.RegistrationStatus) string { - switch s { +// phaseFor derives the V2 AgentStatus phase from (lifecycle status × +// certificate-order state). Reference semantics: PENDING_VALIDATION → +// DOMAIN_VALIDATION, PENDING_DNS → DNS_PROVISIONING, ACTIVE → +// COMPLETED — plus CERTIFICATE_ISSUANCE, which is not a lifecycle +// state at all: it is the window where domain validation passed but +// an asynchronous issuer hasn't produced the certificate yet, tracked +// on the order while the lifecycle stays PENDING_VALIDATION. +func phaseFor(reg *domain.AgentRegistration) string { + switch reg.Status { case domain.StatusPendingValidation: + if reg.CertOrder.State == domain.OrderStateIssuing { + return "CERTIFICATE_ISSUANCE" + } return "DOMAIN_VALIDATION" case domain.StatusPendingDNS: return "DNS_PROVISIONING" @@ -397,22 +460,28 @@ func phaseFromStatus(s domain.RegistrationStatus) string { } } -func completedStepsFor(s domain.RegistrationStatus) []string { - switch s { - case domain.StatusPendingDNS: +func completedStepsFor(reg *domain.AgentRegistration) []string { + switch { + case reg.Status == domain.StatusPendingValidation && reg.CertOrder.State == domain.OrderStateIssuing: return []string{"DOMAIN_VALIDATION"} - case domain.StatusActive: + case reg.Status == domain.StatusPendingDNS: + // The cert exists by PENDING_DNS — issuance completes in the + // same transaction that advances the lifecycle. 
+ return []string{"DOMAIN_VALIDATION", "CERTIFICATE_ISSUANCE"} + case reg.Status == domain.StatusActive: return []string{"DOMAIN_VALIDATION", "CERTIFICATE_ISSUANCE", "DNS_PROVISIONING"} default: return nil } } -func pendingStepsFor(s domain.RegistrationStatus) []string { - switch s { - case domain.StatusPendingValidation: +func pendingStepsFor(reg *domain.AgentRegistration) []string { + switch { + case reg.Status == domain.StatusPendingValidation && reg.CertOrder.State == domain.OrderStateIssuing: + return []string{"CERTIFICATE_ISSUANCE"} + case reg.Status == domain.StatusPendingValidation: return []string{"DOMAIN_VALIDATION"} - case domain.StatusPendingDNS: + case reg.Status == domain.StatusPendingDNS: return []string{"DNS_PROVISIONING"} default: return nil diff --git a/internal/ra/handler/dto_helpers_test.go b/internal/ra/handler/dto_helpers_test.go index 355c765..02aea34 100644 --- a/internal/ra/handler/dto_helpers_test.go +++ b/internal/ra/handler/dto_helpers_test.go @@ -121,7 +121,7 @@ func TestMapV1RenewalStatus_ActiveAndFailed(t *testing.T) { UpdatedAt: now, }, } - resp := mapV1RenewalStatus("agent-1", pending) + resp := mapV1RenewalStatus("agent-1", &service.GetRenewalResult{Renewal: pending, FQDN: "a.example.com"}) if resp.CsrID != "csr-1" { t.Errorf("csr: %q", resp.CsrID) } @@ -136,7 +136,7 @@ func TestMapV1RenewalStatus_ActiveAndFailed(t *testing.T) { failed := *pending failed.FailureReason = "dns lookup failed" failed.Validation.Status = domain.ValidationFailed - resp2 := mapV1RenewalStatus("agent-1", &failed) + resp2 := mapV1RenewalStatus("agent-1", &service.GetRenewalResult{Renewal: &failed, FQDN: "a.example.com"}) if resp2.FailureReason != "dns lookup failed" { t.Errorf("failure reason lost: %q", resp2.FailureReason) } @@ -276,7 +276,7 @@ func TestMapRenewalStatus_FailureReasonField(t *testing.T) { ExpiresAt: now.Add(-time.Hour), }, } - resp := mapRenewalStatus("agent-1", failed) + resp := mapRenewalStatus("agent-1", 
&service.GetRenewalResult{Renewal: failed, FQDN: "a.example.com"}) if resp.FailureReason != "validation expired" { t.Errorf("FailureReason: got %q want validation expired", resp.FailureReason) } @@ -295,7 +295,7 @@ func TestMapRenewalStatus_NoFailureReason(t *testing.T) { ExpiresAt: now.Add(time.Hour), }, } - resp := mapRenewalStatus("agent-1", pending) + resp := mapRenewalStatus("agent-1", &service.GetRenewalResult{Renewal: pending, FQDN: "a.example.com"}) if resp.FailureReason != "" { t.Errorf("FailureReason should be empty for non-failed renewal; got %q", resp.FailureReason) } diff --git a/internal/ra/handler/dto_more_test.go b/internal/ra/handler/dto_more_test.go index 2e6f850..5b6c846 100644 --- a/internal/ra/handler/dto_more_test.go +++ b/internal/ra/handler/dto_more_test.go @@ -23,7 +23,7 @@ func mustReq(t *testing.T, method, target string) *http.Request { return httptest.NewRequest(method, target, nil) } -func TestPhaseFromStatus_AllArms(t *testing.T) { +func TestPhaseFor_AllArms(t *testing.T) { cases := map[domain.RegistrationStatus]string{ domain.StatusPendingValidation: "DOMAIN_VALIDATION", domain.StatusPendingDNS: "DNS_PROVISIONING", @@ -31,21 +31,31 @@ func TestPhaseFromStatus_AllArms(t *testing.T) { domain.RegistrationStatus("UNKNOWN"): "INITIALIZATION", } for status, want := range cases { - if got := phaseFromStatus(status); got != want { - t.Errorf("phaseFromStatus(%q): got %q want %q", status, got, want) + reg := &domain.AgentRegistration{Status: status} + if got := phaseFor(reg); got != want { + t.Errorf("phaseFor(%q): got %q want %q", status, got, want) } } + // CERTIFICATE_ISSUANCE is derived from the order, not the + // lifecycle: PENDING_VALIDATION + ISSUING order reports it. 
+ issuing := &domain.AgentRegistration{ + Status: domain.StatusPendingValidation, + CertOrder: domain.CertificateOrder{State: domain.OrderStateIssuing}, + } + if got := phaseFor(issuing); got != "CERTIFICATE_ISSUANCE" { + t.Errorf("phaseFor(issuing): got %q want CERTIFICATE_ISSUANCE", got) + } } func TestCompletedStepsFor_AllArms(t *testing.T) { cases := map[domain.RegistrationStatus][]string{ - domain.StatusPendingDNS: {"DOMAIN_VALIDATION"}, + domain.StatusPendingDNS: {"DOMAIN_VALIDATION", "CERTIFICATE_ISSUANCE"}, domain.StatusActive: {"DOMAIN_VALIDATION", "CERTIFICATE_ISSUANCE", "DNS_PROVISIONING"}, // Default arm (any other status) returns nil. domain.StatusPendingValidation: nil, } for status, want := range cases { - got := completedStepsFor(status) + got := completedStepsFor(&domain.AgentRegistration{Status: status}) if len(got) != len(want) { t.Errorf("completedStepsFor(%q): got %v want %v", status, got, want) continue @@ -65,7 +75,7 @@ func TestPendingStepsFor_AllArms(t *testing.T) { domain.StatusActive: nil, // default arm } for status, want := range cases { - got := pendingStepsFor(status) + got := pendingStepsFor(&domain.AgentRegistration{Status: status}) if len(got) != len(want) { t.Errorf("pendingStepsFor(%q): got %v want %v", status, got, want) continue diff --git a/internal/ra/handler/lifecycle.go b/internal/ra/handler/lifecycle.go index 26db754..2a2c6ed 100644 --- a/internal/ra/handler/lifecycle.go +++ b/internal/ra/handler/lifecycle.go @@ -285,7 +285,11 @@ func (h *LifecycleHandler) VerifyRenewalACME(w http.ResponseWriter, r *http.Requ // ----- POST /v2/ans/agents/{agentId}/verify-acme ----- -// VerifyACME handles POST .../verify-acme. No request body. +// VerifyACME handles POST .../verify-acme. No request body. 
Always +// 202: domain validation succeeded and certificates are either issued +// (status PENDING_DNS, phase DNS_PROVISIONING) or still being +// finalized by an asynchronous issuer (status PENDING_VALIDATION, +// phase CERTIFICATE_ISSUANCE — the caller re-POSTs to re-drive). func (h *LifecycleHandler) VerifyACME(w http.ResponseWriter, r *http.Request) { agentID := chi.URLParam(r, "agentId") res, err := h.svc.VerifyACME(r.Context(), agentID, service.VerifyInput{}) @@ -295,12 +299,12 @@ func (h *LifecycleHandler) VerifyACME(w http.ResponseWriter, r *http.Request) { } WriteJSON(w, http.StatusAccepted, agentStatus{ Status: string(res.Registration.Status), - Phase: phaseFromStatus(res.Registration.Status), - CompletedSteps: completedStepsFor(res.Registration.Status), - PendingSteps: pendingStepsFor(res.Registration.Status), + Phase: phaseFor(res.Registration), + CompletedSteps: completedStepsFor(res.Registration), + PendingSteps: pendingStepsFor(res.Registration), CreatedAt: res.Registration.Details.RegistrationTimestamp.Format("2006-01-02T15:04:05Z07:00"), UpdatedAt: res.Now.Format("2006-01-02T15:04:05Z07:00"), - ExpiresAt: rfc3339Zero(res.Registration.ACMEChallenge.ExpiresAt), + ExpiresAt: rfc3339Zero(res.Registration.CertOrder.ExpiresAt), }) } @@ -328,12 +332,12 @@ func (h *LifecycleHandler) VerifyDNS(w http.ResponseWriter, r *http.Request) { WriteJSON(w, http.StatusAccepted, agentStatus{ Status: string(res.Registration.Status), - Phase: phaseFromStatus(res.Registration.Status), - CompletedSteps: completedStepsFor(res.Registration.Status), - PendingSteps: pendingStepsFor(res.Registration.Status), + Phase: phaseFor(res.Registration), + CompletedSteps: completedStepsFor(res.Registration), + PendingSteps: pendingStepsFor(res.Registration), CreatedAt: res.Registration.Details.RegistrationTimestamp.Format("2006-01-02T15:04:05Z07:00"), UpdatedAt: res.Now.Format("2006-01-02T15:04:05Z07:00"), - ExpiresAt: rfc3339Zero(res.Registration.ACMEChallenge.ExpiresAt), + ExpiresAt: 
rfc3339Zero(res.Registration.CertOrder.ExpiresAt), }) } diff --git a/internal/ra/handler/lifecycle_test.go b/internal/ra/handler/lifecycle_test.go index 73908e3..2c2ab01 100644 --- a/internal/ra/handler/lifecycle_test.go +++ b/internal/ra/handler/lifecycle_test.go @@ -859,7 +859,7 @@ func newHandlerFixture(t *testing.T) *handlerFixture { KeyID: "ra-signer", RaID: "ra-test", }).WithDNSVerifier(dns.NewNoopVerifier()). - WithServerCertificateAuthority(serverCA) + WithServerCertificateIssuer(serverCA) r := chi.NewRouter() regH := handler.NewRegistrationHandler(svc) diff --git a/internal/ra/handler/order_dto_test.go b/internal/ra/handler/order_dto_test.go new file mode 100644 index 0000000..b38bc84 --- /dev/null +++ b/internal/ra/handler/order_dto_test.go @@ -0,0 +1,202 @@ +package handler + +import ( + "testing" + "time" + + "github.com/godaddy/ans/internal/domain" +) + +func orderedReg(t *testing.T, state domain.OrderState) *domain.AgentRegistration { + t.Helper() + sv, _ := domain.ParseSemVer("1.0.0") + ansName, _ := domain.NewAnsName(sv, "agent.example.com") + return &domain.AgentRegistration{ + AgentID: "agent-1", + AnsName: ansName, + Status: domain.StatusPendingValidation, + CertOrder: domain.CertificateOrder{ + OrderRef: "ref-1", + State: state, + Challenges: []domain.Challenge{ + {Type: domain.ChallengeTypeDNS01, Token: "dns-tok", KeyAuthorization: "dns-tok.kid", DNSRecordValue: "digest"}, + {Type: domain.ChallengeTypeHTTP01, Token: "http-tok", KeyAuthorization: "http-tok.kid"}, + }, + ExpiresAt: time.Now().Add(time.Hour), + }, + } +} + +// TestBuildRegistrationChallenges_RelaysProviderFields pins the +// challenge relay: provider-minted key authorizations, computed DNS +// digests, and HTTP paths reach the wire untouched. 
+func TestBuildRegistrationChallenges_RelaysProviderFields(t *testing.T) { + out := buildRegistrationChallenges(orderedReg(t, domain.OrderStatePending)) + if len(out) != 2 { + t.Fatalf("challenges: got %d want 2", len(out)) + } + dns01 := out[0] + if dns01.Type != "DNS_01" || dns01.Token != "dns-tok" || dns01.KeyAuthorization != "dns-tok.kid" { + t.Errorf("dns01 relay: %+v", dns01) + } + if dns01.DNSRecord == nil || + dns01.DNSRecord.Name != "_acme-challenge.agent.example.com" || + dns01.DNSRecord.Value != "digest" { + t.Errorf("dns01 record: %+v", dns01.DNSRecord) + } + http01 := out[1] + if http01.Type != "HTTP_01" || http01.HTTPPath != "/.well-known/acme-challenge/http-tok" { + t.Errorf("http01 relay: %+v", http01) + } + if http01.DNSRecord != nil { + t.Error("http01 must not carry a dnsRecord") + } + + // No order → no challenges block. + if got := buildRegistrationChallenges(&domain.AgentRegistration{}); got != nil { + t.Errorf("zero order should omit challenges, got %+v", got) + } +} + +// TestBuildRegistrationPendingBlock_PendingCerts pins the derived +// registration-flow status: PENDING_VALIDATION lifecycle + ISSUING +// order reports PENDING_CERTS with WAIT guidance, no challenges (the +// provider already accepted the answer). +func TestBuildRegistrationPendingBlock_PendingCerts(t *testing.T) { + reg := orderedReg(t, domain.OrderStateIssuing) + block := buildRegistrationPendingBlock(reg, mustReq(t, "GET", "/v2/ans/agents/agent-1"), "") + if block == nil { + t.Fatal("pending block missing") + } + if block.Status != "PENDING_CERTS" { + t.Fatalf("status: got %q want PENDING_CERTS", block.Status) + } + if len(block.Challenges) != 0 { + t.Error("ISSUING block must not relay challenges") + } + if len(block.NextSteps) != 1 || block.NextSteps[0].Action != "WAIT" { + t.Errorf("nextSteps: %+v", block.NextSteps) + } + + // Every block carries the spec-required agentId. 
+ if block.AgentID != "agent-1" { + t.Errorf("PENDING_CERTS block missing agentId: %q", block.AgentID) + } + + // PENDING order keeps the lifecycle status + challenges. + pendingBlock := buildRegistrationPendingBlock(orderedReg(t, domain.OrderStatePending), + mustReq(t, "GET", "/v2/ans/agents/agent-1"), "") + if pendingBlock.Status != string(domain.StatusPendingValidation) { + t.Fatalf("status: got %q", pendingBlock.Status) + } + if len(pendingBlock.Challenges) != 2 { + t.Errorf("PENDING block must relay challenges, got %d", len(pendingBlock.Challenges)) + } + if pendingBlock.AgentID != "agent-1" { + t.Errorf("PENDING block missing agentId: %q", pendingBlock.AgentID) + } +} + +// TestBuildRegistrationPendingBlock_FailedOrder pins the terminal- +// failure guidance (V2 + V1): no dead challenges, and a CANCEL step +// pointing at /revoke rather than a verify-acme loop that can only +// return CERT_ORDER_FAILED. +func TestBuildRegistrationPendingBlock_FailedOrder(t *testing.T) { + reg := orderedReg(t, domain.OrderStateFailed) + v2 := buildRegistrationPendingBlock(reg, mustReq(t, "GET", "/v2/ans/agents/agent-1"), "") + if v2 == nil || v2.Status != "PENDING_CERTS" { + t.Fatalf("v2 failed-order block: %+v", v2) + } + if len(v2.Challenges) != 0 { + t.Error("failed-order block must not relay dead challenges") + } + if len(v2.NextSteps) != 1 || v2.NextSteps[0].Action != "CANCEL" { + t.Errorf("v2 nextSteps: %+v", v2.NextSteps) + } + if v2.AgentID != "agent-1" { + t.Errorf("v2 failed-order block missing agentId: %q", v2.AgentID) + } + + v1 := buildV1RegistrationPending(reg, mustReq(t, "GET", "/v1/agents/agent-1"), "") + if v1 == nil || len(v1.Challenges) != 0 { + t.Fatalf("v1 failed-order block must omit challenges: %+v", v1) + } + if len(v1.NextSteps) != 1 || v1.NextSteps[0].Action != "CANCEL" { + t.Errorf("v1 nextSteps: %+v", v1.NextSteps) + } + + // V1 ISSUING block: WAIT, no re-relayed challenge. 
+ v1issuing := buildV1RegistrationPending(orderedReg(t, domain.OrderStateIssuing), + mustReq(t, "GET", "/v1/agents/agent-1"), "") + if v1issuing == nil || len(v1issuing.Challenges) != 0 || + len(v1issuing.NextSteps) != 1 || v1issuing.NextSteps[0].Action != "WAIT" { + t.Errorf("v1 issuing block: %+v", v1issuing) + } +} + +// TestPhaseTrio_IssuingOrder pins the order-derived step reporting. +func TestPhaseTrio_IssuingOrder(t *testing.T) { + reg := orderedReg(t, domain.OrderStateIssuing) + if got := phaseFor(reg); got != "CERTIFICATE_ISSUANCE" { + t.Errorf("phase: %q", got) + } + completed := completedStepsFor(reg) + if len(completed) != 1 || completed[0] != "DOMAIN_VALIDATION" { + t.Errorf("completedSteps: %v", completed) + } + pending := pendingStepsFor(reg) + if len(pending) != 1 || pending[0] != "CERTIFICATE_ISSUANCE" { + t.Errorf("pendingSteps: %v", pending) + } +} + +// TestBuildRenewalChallenges_Shapes pins the renewal challenges block +// the operator publishes from — including the HTTP-01 URL + expected +// response that were previously never surfaced. +func TestBuildRenewalChallenges_Shapes(t *testing.T) { + v := domain.RenewalValidation{ + Challenges: []domain.Challenge{ + {Type: domain.ChallengeTypeDNS01, Token: "d-tok"}, + {Type: domain.ChallengeTypeHTTP01, Token: "h-tok", KeyAuthorization: "h-tok.kid"}, + }, + ExpiresAt: time.Now().Add(time.Hour), + } + out := buildRenewalChallenges("agent.example.com", v) + if out == nil || out.DNS01 == nil || out.HTTP01 == nil { + t.Fatalf("challenges block incomplete: %+v", out) + } + // Shape must match the spec's ChallengeInfo — the renewal + // responses $ref the same schema the registration lane uses. 
+ if out.DNS01.Type != "DNS_01" || out.DNS01.Token != "d-tok" || + out.DNS01.DNSRecord == nil || + out.DNS01.DNSRecord.Name != "_acme-challenge.agent.example.com" || + out.DNS01.DNSRecord.Type != "TXT" || out.DNS01.DNSRecord.Value != "d-tok" { + t.Errorf("dns01: %+v", out.DNS01) + } + if out.HTTP01.Type != "HTTP_01" || out.HTTP01.Token != "h-tok" || + out.HTTP01.KeyAuthorization != "h-tok.kid" || + out.HTTP01.HTTPPath != "/.well-known/acme-challenge/h-tok" { + t.Errorf("http01: %+v", out.HTTP01) + } + if out.HTTP01.DNSRecord != nil { + t.Error("http01 must not carry a dnsRecord") + } + + // Empty challenge set → nil block (omitted on the wire). + if got := buildRenewalChallenges("x", domain.RenewalValidation{}); got != nil { + t.Errorf("empty set should yield nil, got %+v", got) + } +} + +// TestTlsaDTOFrom covers both arms of the nil-propagating mapper. +func TestTlsaDTOFrom(t *testing.T) { + if got := tlsaDTOFrom(nil); got != nil { + t.Errorf("nil in, nil out: got %+v", got) + } + rec := domain.TLSARecordForCert("agent.example.com", "ff00") + dto := tlsaDTOFrom(&rec) + if dto == nil || dto.Name != "_443._tcp.agent.example.com" || + dto.Type != "TLSA" || dto.Value != "3 1 1 ff00" { + t.Errorf("tlsa dto: %+v", dto) + } +} diff --git a/internal/ra/handler/registration.go b/internal/ra/handler/registration.go index 51656bc..e4cdc77 100644 --- a/internal/ra/handler/registration.go +++ b/internal/ra/handler/registration.go @@ -2,7 +2,6 @@ package handler import ( "encoding/json" - "errors" "net/http" "github.com/godaddy/ans/internal/adapter/auth" @@ -27,7 +26,7 @@ func NewRegistrationHandler(svc *service.RegistrationService) *RegistrationHandl // Server cert input follows the reference shape: exactly one of // `serverCsrPEM` or `serverCertificatePEM` must be set (both or // neither → 422). 
The CSR path routes through the RA's configured -// `ServerCertificateAuthority` port; the BYOC path routes through +// `ServerCertificateIssuer` port; the BYOC path routes through // the certificate validator. type registrationRequest struct { AgentDisplayName string `json:"agentDisplayName"` @@ -59,8 +58,10 @@ type functionDTO struct { // registrationPendingResponse mirrors the V2 spec's RegistrationPending // schema (spec/api-spec-v2.yaml §1167). Field names and optionality -// match the spec exactly — no extensions. `challenges` holds ACME -// challenges needed to drive verify-acme; ans emits DNS_01 only. +// match the spec exactly — no extensions. `challenges` relays the +// certificate order's domain-control challenges (DNS_01 and HTTP_01); +// the owner publishes whichever artifact is easier and ANS verifies +// it at verify-acme. type registrationPendingResponse struct { AgentID string `json:"agentId"` Status string `json:"status"` @@ -81,13 +82,17 @@ type dnsRecordDTO struct { TTL int `json:"ttl"` } -// challengeDTO mirrors the V2 ChallengeInfo schema. ans emits DNS_01 -// only per the documented no-HTTP-01 deviation. +// challengeDTO mirrors the V2 ChallengeInfo schema: type, token, +// keyAuthorization, dnsRecord, httpPath, expiresAt. keyAuthorization +// is populated when the issuing provider binds the token to its +// account key (ACME); self-issued challenges omit it. 
type challengeDTO struct { - Type string `json:"type"` - Token string `json:"token"` - DNSRecord *challengeDNSRecordDTO `json:"dnsRecord,omitempty"` - ExpiresAt string `json:"expiresAt,omitempty"` + Type string `json:"type"` + Token string `json:"token"` + KeyAuthorization string `json:"keyAuthorization,omitempty"` + DNSRecord *challengeDNSRecordDTO `json:"dnsRecord,omitempty"` + HTTPPath string `json:"httpPath,omitempty"` + ExpiresAt string `json:"expiresAt,omitempty"` } type challengeDNSRecordDTO struct { @@ -215,8 +220,8 @@ func mapRegistrationResponse(resp *service.RegisterResponse, r *http.Request) *r // Register-time next-steps reflect the deferred-cert flow: certs // only issue once verify-acme proves domain control, so the only - // step the operator can take right now is publish the ACME - // challenge TXT and call verify-acme. Production DNS records + // step the operator can take right now is publish a challenge + // artifact and call verify-acme. Production DNS records // (TRUST/BADGE/DISCOVERY/TLSA) only materialize on the // verify-acme 202, where they appear paired with VERIFY_DNS as // the next step. 
@@ -228,37 +233,55 @@ func mapRegistrationResponse(resp *service.RegisterResponse, r *http.Request) *r DNSRecords: dnsRecords, NextSteps: []nextStepDTO{ {Action: "CONFIGURE_DNS", - Description: "Publish the ACME DNS-01 challenge TXT record listed in challenges[]", + Description: "Publish one challenge artifact from challenges[]: the DNS-01 TXT record, or the HTTP-01 resource at its httpPath", Endpoint: base + "/verify-acme"}, {Action: "VALIDATE_DOMAIN", - Description: "Call POST /v2/ans/agents/{agentId}/verify-acme once the challenge record is live", + Description: "Call POST /v2/ans/agents/{agentId}/verify-acme once the challenge artifact is live", Endpoint: base + "/verify-acme"}, }, - ExpiresAt: rfc3339Zero(resp.Registration.ACMEChallenge.ExpiresAt), + ExpiresAt: rfc3339Zero(resp.Registration.CertOrder.ExpiresAt), Links: []linkDTO{ {Rel: "self", Href: base}, }, } } -// buildRegistrationChallenges builds the ChallengeInfo array for the -// V2 RegistrationPending response. ans emits DNS_01 only. Named -// distinctly from renewalmap.go's renewal-specific `buildChallenges` -// to avoid collision. +// buildRegistrationChallenges relays the certificate order's +// challenge set as the ChallengeInfo array for the V2 +// RegistrationPending response. The entries are the provider's own +// challenges, verbatim — for an external ACME provider that means its +// token, key authorization, and computed DNS digest; for the +// in-process CA the self-issued tokens. Named distinctly from +// renewalmap.go's renewal-specific builder to avoid collision. +// +// Returns nil (omitted on the wire) when no order is present — e.g. +// a registration pre-dating order persistence. func buildRegistrationChallenges(reg *domain.AgentRegistration) []challengeDTO { - if reg.ACMEChallenge.IsZero() { + if reg.CertOrder.IsZero() { return nil } - return []challengeDTO{{ - Type: "DNS_01", - Token: reg.ACMEChallenge.DNS01Token, - DNSRecord: &challengeDNSRecordDTO{ - Name: "_acme-challenge." 
+ reg.FQDN(), - Type: "TXT", - Value: reg.ACMEChallenge.DNS01Token, - }, - ExpiresAt: rfc3339Zero(reg.ACMEChallenge.ExpiresAt), - }} + expiresAt := rfc3339Zero(reg.CertOrder.ExpiresAt) + out := make([]challengeDTO, 0, len(reg.CertOrder.Challenges)) + for _, ch := range reg.CertOrder.Challenges { + dto := challengeDTO{ + Type: string(ch.Type), + Token: ch.Token, + KeyAuthorization: ch.KeyAuthorization, + ExpiresAt: expiresAt, + } + switch ch.Type { + case domain.ChallengeTypeDNS01: + dto.DNSRecord = &challengeDNSRecordDTO{ + Name: ch.EffectiveDNSRecordName(reg.FQDN()), + Type: "TXT", + Value: ch.EffectiveDNSRecordValue(), + } + case domain.ChallengeTypeHTTP01: + dto.HTTPPath = ch.EffectiveHTTPPath() + } + out = append(out, dto) + } + return out } // schemeOf returns "https" if the request was served over TLS or @@ -272,6 +295,3 @@ func schemeOf(r *http.Request) string { } return "http" } - -// silence "imported and not used" if handlers evolve. -var _ = errors.New diff --git a/internal/ra/handler/renewalmap.go b/internal/ra/handler/renewalmap.go index 8202387..4d9a629 100644 --- a/internal/ra/handler/renewalmap.go +++ b/internal/ra/handler/renewalmap.go @@ -60,10 +60,14 @@ func nextStepFor(agentID, status string) nextStep { Description: "Complete ACME challenges then POST to verify-acme endpoint", } case renewalStatusIssuingCertificate: + // Only re-POSTing verify-acme re-drives a pending order — GET + // /renewal never finalizes it, so pointing the operator there + // would livelock an async issuance. Mirror the registration + // lane's PENDING_CERTS guidance. 
return nextStep{ Action: "WAIT", - Endpoint: base + "/renewal", - Description: "Certificate issuance in progress, poll GET /renewal for TLSA record", + Endpoint: base + "/renewal/verify-acme", + Description: "Certificate issuance in progress — POST verify-acme again to drive the order to completion", } case renewalStatusCompleted: return nextStep{ @@ -82,13 +86,64 @@ func nextStepFor(agentID, status string) nextStep { } } +// buildRenewalChallenges renders the renewal's domain-control +// challenge set in the V2 challenges shape (dns01 / http01 keyed +// object). The entries come verbatim from the certificate order — +// provider-minted for external issuers, self-issued for the +// in-process CA and BYOC validation. The owner publishes one of the +// two artifacts; verify-acme accepts either. +func buildRenewalChallenges(fqdn string, v domain.RenewalValidation) *renewalChallenges { + expires := v.ExpiresAt.UTC().Format(time.RFC3339) + out := &renewalChallenges{} + if ch, ok := v.ChallengeOfType(domain.ChallengeTypeDNS01); ok { + out.DNS01 = &challengeInfo{ + Type: string(domain.ChallengeTypeDNS01), + Token: ch.Token, + KeyAuthorization: ch.KeyAuthorization, + DNSRecord: &challengeDNSRecordDTO{ + Name: ch.EffectiveDNSRecordName(fqdn), + Type: "TXT", + Value: ch.EffectiveDNSRecordValue(), + }, + ExpiresAt: expires, + } + } + if ch, ok := v.ChallengeOfType(domain.ChallengeTypeHTTP01); ok { + out.HTTP01 = &challengeInfo{ + Type: string(domain.ChallengeTypeHTTP01), + Token: ch.Token, + KeyAuthorization: ch.KeyAuthorization, + HTTPPath: ch.EffectiveHTTPPath(), + ExpiresAt: expires, + } + } + if out.DNS01 == nil && out.HTTP01 == nil { + return nil + } + return out +} + +// tlsaDTOFrom maps the domain TLSA record into the wire DTO. Nil in, +// nil out — completed renewals carry it, pending ones don't. 
+func tlsaDTOFrom(rec *domain.ExpectedDNSRecord) *dnsRecordDTO { + if rec == nil { + return nil + } + return &dnsRecordDTO{ + Name: rec.Name, + Type: string(rec.Type), + Value: rec.Value, + Purpose: string(rec.Purpose), + Required: rec.Required, + TTL: rec.TTL, + } +} + // mapRenewalSubmission builds the 202 RenewalSubmissionResponse body -// from a successful submission. Matches reference `mapToSubmissionResponse`. -// -// Note: per the V2 spec, the status-GET response carries the -// challenges[] block; the submission 202 omits it (the caller already -// gets the same set from the response of the matching GET). This -// mapper stays side-effect-free. +// from a successful submission. Matches reference +// `mapToSubmissionResponse`. The challenges block is the operator's +// only copy of the artifacts they must publish before verify-acme — +// it rides on both the submission 202 and the status GET. func mapRenewalSubmission(agentID string, res *service.SubmitRenewalResult) renewalSubmissionResponse { r := res.Renewal status := renewalStatusPendingValidation @@ -96,6 +151,7 @@ func mapRenewalSubmission(agentID string, res *service.SubmitRenewalResult) rene RenewalType: string(r.RenewalType), Status: status, CsrID: res.CsrID, + Challenges: buildRenewalChallenges(res.FQDN, r.Validation), ExpiresAt: r.Validation.ExpiresAt.Format(time.RFC3339), NextStep: nextStepFor(agentID, status), Links: []linkRef{{ @@ -105,21 +161,25 @@ func mapRenewalSubmission(agentID string, res *service.SubmitRenewalResult) rene } } -// mapRenewalStatus builds the 200 RenewalStatusResponse body. Needs -// the agent FQDN for challenge record naming, which we look up via -// the service layer — but the lifecycle handler already has the -// agent in context, so we wire through the service by not taking -// the FQDN here directly. The challenge-info population is best- -// effort: when the service-layer contract evolves to return the -// FQDN alongside the renewal, we can plumb it through. 
-func mapRenewalStatus(agentID string, r *domain.ServerCertificateRenewal) renewalStatusResponse { +// mapRenewalStatus builds the 200 RenewalStatusResponse body. +// Pending-validation renewals carry the challenges block (the +// operator may have lost the submission response); completed ones +// carry the TLSA record for the new leaf instead — that record is +// what the ISSUING_CERTIFICATE WAIT step tells the operator to poll +// for. +func mapRenewalStatus(agentID string, res *service.GetRenewalResult) renewalStatusResponse { + r := res.Renewal status := deriveRenewalStatus(r, time.Now()) resp := renewalStatusResponse{ - RenewalType: string(r.RenewalType), - Status: status, - CsrID: r.ServerCsrID, - ExpiresAt: r.Validation.ExpiresAt.Format(time.RFC3339), - NextStep: nextStepFor(agentID, status), + RenewalType: string(r.RenewalType), + Status: status, + CsrID: r.ServerCsrID, + TlsaDNSRecord: tlsaDTOFrom(res.TLSARecord), + ExpiresAt: r.Validation.ExpiresAt.Format(time.RFC3339), + NextStep: nextStepFor(agentID, status), + } + if status == renewalStatusPendingValidation { + resp.Challenges = buildRenewalChallenges(res.FQDN, r.Validation) } if r.FailureReason != "" { resp.FailureReason = r.FailureReason @@ -128,8 +188,9 @@ func mapRenewalStatus(agentID string, r *domain.ServerCertificateRenewal) renewa } // mapRenewalVerification builds the 200/202 RenewalVerificationResponse -// body. BYOC returns COMPLETED sync; CSR returns ISSUING_CERTIFICATE -// async. +// body. Completed renewals (200) include the new leaf's TLSA record so +// the operator can update DNS immediately; ISSUING_CERTIFICATE (202) +// points the operator back at verify-acme via the WAIT next step. 
func mapRenewalVerification(agentID string, res *service.VerifyRenewalACMEResult) renewalVerificationResponse { r := res.Renewal status := renewalStatusIssuingCertificate @@ -137,8 +198,9 @@ func mapRenewalVerification(agentID string, res *service.VerifyRenewalACMEResult status = renewalStatusCompleted } return renewalVerificationResponse{ - Status: status, - CsrID: r.ServerCsrID, - NextStep: nextStepFor(agentID, status), + Status: status, + CsrID: r.ServerCsrID, + TlsaDNSRecord: tlsaDTOFrom(res.TLSARecord), + NextStep: nextStepFor(agentID, status), } } diff --git a/internal/ra/handler/v1lifecycle.go b/internal/ra/handler/v1lifecycle.go index b026d59..b40e922 100644 --- a/internal/ra/handler/v1lifecycle.go +++ b/internal/ra/handler/v1lifecycle.go @@ -94,12 +94,12 @@ func (h *V1LifecycleHandler) VerifyACME(w http.ResponseWriter, r *http.Request) } WriteJSON(w, http.StatusAccepted, v1AgentStatusResponse{ Status: string(res.Registration.Status), - Phase: phaseFromStatus(res.Registration.Status), - CompletedSteps: completedStepsFor(res.Registration.Status), - PendingSteps: pendingStepsFor(res.Registration.Status), + Phase: phaseFor(res.Registration), + CompletedSteps: completedStepsFor(res.Registration), + PendingSteps: pendingStepsFor(res.Registration), CreatedAt: res.Registration.Details.RegistrationTimestamp.Format("2006-01-02T15:04:05Z07:00"), UpdatedAt: res.Now.Format("2006-01-02T15:04:05Z07:00"), - ExpiresAt: rfc3339Zero(res.Registration.ACMEChallenge.ExpiresAt), + ExpiresAt: rfc3339Zero(res.Registration.CertOrder.ExpiresAt), }) } @@ -125,12 +125,12 @@ func (h *V1LifecycleHandler) VerifyDNS(w http.ResponseWriter, r *http.Request) { } WriteJSON(w, http.StatusAccepted, v1AgentStatusResponse{ Status: string(res.Registration.Status), - Phase: phaseFromStatus(res.Registration.Status), - CompletedSteps: completedStepsFor(res.Registration.Status), - PendingSteps: pendingStepsFor(res.Registration.Status), + Phase: phaseFor(res.Registration), + CompletedSteps: 
completedStepsFor(res.Registration), + PendingSteps: pendingStepsFor(res.Registration), CreatedAt: res.Registration.Details.RegistrationTimestamp.Format("2006-01-02T15:04:05Z07:00"), UpdatedAt: res.Now.Format("2006-01-02T15:04:05Z07:00"), - ExpiresAt: rfc3339Zero(res.Registration.ACMEChallenge.ExpiresAt), + ExpiresAt: rfc3339Zero(res.Registration.CertOrder.ExpiresAt), }) } diff --git a/internal/ra/handler/v1registration.go b/internal/ra/handler/v1registration.go index ec4baa6..c8cabe2 100644 --- a/internal/ra/handler/v1registration.go +++ b/internal/ra/handler/v1registration.go @@ -35,14 +35,16 @@ import ( // // ans deviations from the reference that apply here: // -// - **No HTTP-01 challenge.** ans verifies domain control via DNS -// only; the `challenges` array in RegistrationPending therefore -// contains only a DNS-01 entry. +// - **DNS-01-only on the V1 wire.** The V1 `challenges` array +// carries a single DNS_01 entry. The verify-acme gate itself +// accepts either artifact (DNS-01 TXT or HTTP-01 resource) — +// HTTP-01 challenge info is simply not relayed on this lane; +// V2 surfaces both. // // Server-cert handling matches the reference byte-for-byte: either -// `serverCsrPEM` (→ the RA signs via its `ServerCertificateAuthority` -// port) or `serverCertificatePEM` + chain (BYOC path). Exactly one -// must be set. +// `serverCsrPEM` (→ the order is finalized via the configured +// `ServerCertificateIssuer` port) or `serverCertificatePEM` + chain +// (BYOC path). Exactly one must be set. // V1RegistrationHandler wires HTTP routes for the V1 `/v1/agents/*` // registration + detail surface. @@ -57,13 +59,9 @@ func NewV1RegistrationHandler(svc *service.RegistrationService) *V1RegistrationH // v1RegistrationRequest mirrors the `AgentRegistrationRequest` // schema in the reference V1 API spec. Field names and JSON tags -// match byte-for-byte. 
-// -// Difference from V2's `registrationRequest`: the reference permits -// `serverCsrPEM` OR `serverCertificatePEM`; ans accepts only the -// latter (BYOC-only per deviation table). A non-empty -// `serverCsrPEM` returns 422 UNSUPPORTED_FIELD so SDK clients get a -// clear error. +// match byte-for-byte. Like the reference, exactly one of +// `serverCsrPEM` / `serverCertificatePEM` must be set; both or +// neither is 422. type v1RegistrationRequest struct { AgentDisplayName string `json:"agentDisplayName"` AgentDescription string `json:"agentDescription,omitempty"` @@ -328,7 +326,7 @@ func mapV1RegistrationResponse(resp *service.RegisterResponse, r *http.Request) Description: "Call POST /v1/agents/{agentId}/verify-acme once the challenge record is live", Endpoint: base + "/verify-acme"}, }, - ExpiresAt: rfc3339Zero(resp.Registration.ACMEChallenge.ExpiresAt), + ExpiresAt: rfc3339Zero(resp.Registration.CertOrder.ExpiresAt), Links: []v1LinkDTO{ {Rel: "self", Href: base}, }, @@ -336,24 +334,28 @@ func mapV1RegistrationResponse(resp *service.RegisterResponse, r *http.Request) } // buildV1Challenges builds the ChallengeInfo array for the V1 -// RegistrationPending response. ans emits DNS_01 only. +// RegistrationPending response. The V1 wire contract carries a single +// DNS_01 entry (the documented V1 deviation — HTTP_01 is V2-only on +// the wire even though the gate accepts either artifact), so only the +// order's DNS-01 challenge is relayed here. // // Returns nil (omitted on the wire via the `omitempty` tag) when no // challenge has been issued — e.g., for an agent past PENDING_DNS or -// a registration pre-dating the challenge-persistence migration. +// a registration pre-dating order persistence. 
func buildV1Challenges(reg *domain.AgentRegistration) []v1ChallengeDTO { - if reg.ACMEChallenge.IsZero() { + ch, ok := reg.CertOrder.ChallengeOfType(domain.ChallengeTypeDNS01) + if !ok { return nil } c := v1ChallengeDTO{ Type: "DNS_01", - Token: reg.ACMEChallenge.DNS01Token, + Token: ch.Token, DNSRecord: &v1ChallengeDNSRecordDTO{ - Name: "_acme-challenge." + reg.FQDN(), + Name: ch.EffectiveDNSRecordName(reg.FQDN()), Type: "TXT", - Value: reg.ACMEChallenge.DNS01Token, + Value: ch.EffectiveDNSRecordValue(), }, - ExpiresAt: rfc3339Zero(reg.ACMEChallenge.ExpiresAt), + ExpiresAt: rfc3339Zero(reg.CertOrder.ExpiresAt), } return []v1ChallengeDTO{c} } @@ -448,6 +450,42 @@ func buildV1RegistrationPending(reg *domain.AgentRegistration, r *http.Request, switch reg.Status { case domain.StatusPendingValidation: base := schemeOf(r) + "://" + r.Host + "/v1/agents/" + reg.AgentID + if reg.CertOrder.State == domain.OrderStateFailed { + // Terminal provider failure — relaying the dead challenge + // with CONFIGURE_DNS would loop the operator into a + // verify-acme that only returns CERT_ORDER_FAILED. + return &v1RegistrationPendingResponse{ + Status: string(reg.Status), + AnsName: reg.AnsName.String(), + NextSteps: []v1NextStepDTO{ + {Action: "CANCEL", + Description: "Certificate issuance failed — cancel this registration (POST /revoke) and register a new version", + Endpoint: base + "/revoke"}, + }, + ExpiresAt: rfc3339Zero(reg.CertOrder.ExpiresAt), + Links: []v1LinkDTO{ + {Rel: "self", Href: base}, + }, + } + } + if reg.CertOrder.State == domain.OrderStateIssuing { + // Domain control proven; the issuer is still finalizing. + // The challenge is already answered — relaying it again + // with CONFIGURE_DNS guidance would mislead the operator. 
+ return &v1RegistrationPendingResponse{ + Status: string(reg.Status), + AnsName: reg.AnsName.String(), + NextSteps: []v1NextStepDTO{ + {Action: "WAIT", + Description: "Certificate issuance in progress — POST verify-acme again to check for completion", + Endpoint: base + "/verify-acme"}, + }, + ExpiresAt: rfc3339Zero(reg.CertOrder.ExpiresAt), + Links: []v1LinkDTO{ + {Rel: "self", Href: base}, + }, + } + } // PENDING_VALIDATION carries no production DNS records — those // don't materialize until verify-acme issues the certs that the // TLSA record fingerprints. The only record the operator needs @@ -467,7 +505,7 @@ func buildV1RegistrationPending(reg *domain.AgentRegistration, r *http.Request, Description: "Call POST /v1/agents/{agentId}/verify-acme once the challenge record is live", Endpoint: base + "/verify-acme"}, }, - ExpiresAt: rfc3339Zero(reg.ACMEChallenge.ExpiresAt), + ExpiresAt: rfc3339Zero(reg.CertOrder.ExpiresAt), Links: []v1LinkDTO{ {Rel: "self", Href: base}, }, @@ -495,7 +533,7 @@ func buildV1RegistrationPending(reg *domain.AgentRegistration, r *http.Request, Description: "Verify that all required DNS records are configured", Endpoint: base + "/verify-dns"}, }, - ExpiresAt: rfc3339Zero(reg.ACMEChallenge.ExpiresAt), + ExpiresAt: rfc3339Zero(reg.CertOrder.ExpiresAt), Links: []v1LinkDTO{ {Rel: "self", Href: base}, }, diff --git a/internal/ra/handler/v1renewal.go b/internal/ra/handler/v1renewal.go index dafd78c..568a959 100644 --- a/internal/ra/handler/v1renewal.go +++ b/internal/ra/handler/v1renewal.go @@ -33,8 +33,8 @@ import ( // CERTIFICATE_RENEWED, V1 as AGENT_RENEWED. // // Both server-cert paths supported (matching the reference): operators -// can submit `serverCsrPEM` to have the RA's -// `ServerCertificateAuthority` port sign the cert, or +// can submit `serverCsrPEM` to have the configured +// `ServerCertificateIssuer` port issue the cert, or // `serverCertificatePEM` + chain for BYOC. Exactly one required. 
type V1RenewalHandler struct { svc *service.RegistrationService @@ -122,10 +122,12 @@ func v1NextStepFor(agentID, status string) nextStep { Description: "Complete ACME challenges then POST to verify-acme endpoint", } case renewalStatusIssuingCertificate: + // See nextStepFor: GET /renewal never re-drives the order; only + // a re-POST of verify-acme does. return nextStep{ Action: "WAIT", - Endpoint: base + "/renewal", - Description: "Certificate issuance in progress, poll GET /renewal for TLSA record", + Endpoint: base + "/renewal/verify-acme", + Description: "Certificate issuance in progress — POST verify-acme again to drive the order to completion", } case renewalStatusCompleted: return nextStep{ @@ -154,6 +156,7 @@ func mapV1RenewalSubmission(agentID string, res *service.SubmitRenewalResult) re RenewalType: string(r.RenewalType), Status: status, CsrID: res.CsrID, + Challenges: buildRenewalChallenges(res.FQDN, r.Validation), ExpiresAt: r.Validation.ExpiresAt.Format(time.RFC3339), NextStep: v1NextStepFor(agentID, status), Links: []linkRef{{ @@ -164,14 +167,19 @@ func mapV1RenewalSubmission(agentID string, res *service.SubmitRenewalResult) re } // mapV1RenewalStatus mirrors mapRenewalStatus with V1 next-step URLs. 
-func mapV1RenewalStatus(agentID string, r *domain.ServerCertificateRenewal) renewalStatusResponse { +func mapV1RenewalStatus(agentID string, res *service.GetRenewalResult) renewalStatusResponse { + r := res.Renewal status := deriveRenewalStatus(r, time.Now()) resp := renewalStatusResponse{ - RenewalType: string(r.RenewalType), - Status: status, - CsrID: r.ServerCsrID, - ExpiresAt: r.Validation.ExpiresAt.Format(time.RFC3339), - NextStep: v1NextStepFor(agentID, status), + RenewalType: string(r.RenewalType), + Status: status, + CsrID: r.ServerCsrID, + TlsaDNSRecord: tlsaDTOFrom(res.TLSARecord), + ExpiresAt: r.Validation.ExpiresAt.Format(time.RFC3339), + NextStep: v1NextStepFor(agentID, status), + } + if status == renewalStatusPendingValidation { + resp.Challenges = buildRenewalChallenges(res.FQDN, r.Validation) } if r.FailureReason != "" { resp.FailureReason = r.FailureReason @@ -188,8 +196,9 @@ func mapV1RenewalVerification(agentID string, res *service.VerifyRenewalACMEResu status = renewalStatusCompleted } return renewalVerificationResponse{ - Status: status, - CsrID: r.ServerCsrID, - NextStep: v1NextStepFor(agentID, status), + Status: status, + CsrID: r.ServerCsrID, + TlsaDNSRecord: tlsaDTOFrom(res.TLSARecord), + NextStep: v1NextStepFor(agentID, status), } } diff --git a/internal/ra/middleware/ownership_test.go b/internal/ra/middleware/ownership_test.go index cf4b8e3..a67a64a 100644 --- a/internal/ra/middleware/ownership_test.go +++ b/internal/ra/middleware/ownership_test.go @@ -255,6 +255,10 @@ type fakeAgentStore struct { } func (f *fakeAgentStore) Save(_ context.Context, _ *domain.AgentRegistration) error { return nil } + +func (f *fakeAgentStore) ExpireLapsedPendingValidation(_ context.Context, _ time.Time) (int64, error) { + return 0, nil +} func (f *fakeAgentStore) FindByID(_ context.Context, _ int64) (*domain.AgentRegistration, error) { return nil, domain.NewNotFoundError("AGENT_NOT_FOUND", "not found") } diff --git a/internal/ra/service/agent_expiry.go 
b/internal/ra/service/agent_expiry.go new file mode 100644 index 0000000..48715f9 --- /dev/null +++ b/internal/ra/service/agent_expiry.go @@ -0,0 +1,80 @@ +package service + +import ( + "context" + "fmt" + "time" + + "github.com/rs/zerolog" + + "github.com/godaddy/ans/internal/port" +) + +// ExpireAgentsOnce transitions lapsed PENDING_VALIDATION registrations +// to EXPIRED via a single guarded store write, honoring the spec's +// revoke-route contract: "PENDING_VALIDATION registrations are not +// cancellable and will auto-expire". Returns the number transitioned. +// +// The guard (status still PENDING_VALIDATION, order still PENDING, +// window lapsed) lives in the store's UPDATE WHERE clause, so the +// sweep cannot clobber a row that a concurrent verify-acme advanced +// between scans — see port.AgentStore.ExpireLapsedPendingValidation. +// +// No certificate cleanup is needed by construction: identity +// certificates are signed only when the certificate order completes — +// the same transaction that advances the agent to PENDING_DNS — so a +// still-PENDING_VALIDATION row with a PENDING order has never had +// anything issued. Registrations whose order failed terminally or is +// mid-issuance are excluded by the guard and leave PENDING_VALIDATION +// through the cancel path instead (see RegistrationService.Revoke). +// +// No TL emit: under the terminal-only event model no leaf exists for +// an agent that never reached ACTIVE. Idempotent — an already-EXPIRED +// row no longer matches the guard. +func ExpireAgentsOnce( + ctx context.Context, + agents port.AgentStore, + now time.Time, +) (int, error) { + n, err := agents.ExpireLapsedPendingValidation(ctx, now) + if err != nil { + return 0, fmt.Errorf("expire lapsed registrations: %w", err) + } + return int(n), nil +} + +// RunAgentExpiryChecker blocks until ctx is cancelled, calling +// ExpireAgentsOnce on a fixed interval — the registration-side twin +// of RunExpiryChecker for renewals. 
Sweep errors are logged, not +// returned: a single bad sweep (usually transient DB trouble) +// shouldn't tear down the worker. +func RunAgentExpiryChecker( + ctx context.Context, + agents port.AgentStore, + logger zerolog.Logger, + opts ExpiryCheckerOptions, +) { + interval := opts.Interval + if interval <= 0 { + interval = 5 * time.Minute + } + logger.Info().Dur("interval", interval).Msg("agent expiry checker started") + ticker := time.NewTicker(interval) + defer ticker.Stop() + for { + select { + case <-ctx.Done(): + logger.Info().Msg("agent expiry checker stopped") + return + case now := <-ticker.C: + n, err := ExpireAgentsOnce(ctx, agents, now) + if err != nil { + logger.Warn().Err(err).Msg("agent expiry sweep failed") + continue + } + if n > 0 { + logger.Info().Int("expired", n).Msg("agent expiry sweep completed") + } + } + } +} diff --git a/internal/ra/service/helpers.go b/internal/ra/service/helpers.go index fda9cd6..de73619 100644 --- a/internal/ra/service/helpers.go +++ b/internal/ra/service/helpers.go @@ -1,7 +1,9 @@ package service import ( + "context" "crypto/sha256" + "crypto/x509" "encoding/hex" "encoding/pem" "errors" @@ -11,6 +13,48 @@ import ( "github.com/godaddy/ans/internal/domain" ) +// loadServerCert returns the agent's latest valid server certificate, +// or (nil, nil) when none is on file. A genuinely-absent cert +// (ErrNotFound) is normal — CSR-path registrations may not have one +// yet, and ComputeRequiredDNSRecords simply omits the TLSA record. +// +// Any OTHER store error is propagated. Callers fold this cert into +// terminal, immutable artifacts — the TLSA record an operator +// publishes, and the serverCerts[] of the signed AGENT_REGISTERED leaf +// in the append-only log. Swallowing a transient store failure (busy +// timeout, I/O) would silently drop the cert and emit a permanently +// wrong attestation from a recoverable fault, so absence and failure +// must never be conflated here. 
+func (s *RegistrationService) loadServerCert( + ctx context.Context, agentID string, +) (*domain.ByocServerCertificate, error) { + cert, err := s.byoc.FindLatestValidByAgentID(ctx, agentID) + if err != nil { + if errors.Is(err, domain.ErrNotFound) { + return nil, nil //nolint:nilnil // (nil, nil) signals "no server cert on file"; callers distinguish via the nil pointer and skip the TLSA record + } + return nil, err + } + return cert, nil +} + +// serialFromCertPEM parses the certificate and returns its serial as +// lowercase hex — the same encoding issuers report at signing time. +// Fallback for stored certificates persisted before serial tracking +// landed (migration 007); rows written since carry the serial +// directly. +func serialFromCertPEM(pemStr string) (string, error) { + block, _ := pem.Decode([]byte(pemStr)) + if block == nil || block.Type != "CERTIFICATE" { + return "", errors.New("service: cert PEM has no CERTIFICATE block") + } + cert, err := x509.ParseCertificate(block.Bytes) + if err != nil { + return "", fmt.Errorf("service: parse certificate: %w", err) + } + return fmt.Sprintf("%x", cert.SerialNumber), nil +} + // fingerprintOf returns the SHA-256 fingerprint of the DER certificate // inside the given PEM string, formatted as `SHA256:`. 
// The `SHA256:` prefix matches the algorithm-prefixed form the diff --git a/internal/ra/service/helpers_internal_test.go b/internal/ra/service/helpers_internal_test.go new file mode 100644 index 0000000..bb90539 --- /dev/null +++ b/internal/ra/service/helpers_internal_test.go @@ -0,0 +1,24 @@ +package service + +import ( + "encoding/pem" + "strings" + "testing" +) + +func TestSerialFromCertPEM_Errors(t *testing.T) { + if _, err := serialFromCertPEM(""); err == nil { + t.Error("want error for empty PEM") + } + if _, err := serialFromCertPEM("not pem at all"); err == nil { + t.Error("want error for non-PEM input") + } + wrongType := string(pem.EncodeToMemory(&pem.Block{Type: "PRIVATE KEY", Bytes: []byte{1}})) + if _, err := serialFromCertPEM(wrongType); err == nil { + t.Error("want error for non-CERTIFICATE block") + } + badDER := string(pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: []byte{0xff}})) + if _, err := serialFromCertPEM(badDER); err == nil || !strings.Contains(err.Error(), "parse certificate") { + t.Errorf("want parse error for garbage DER, got %v", err) + } +} diff --git a/internal/ra/service/identity_flow_test.go b/internal/ra/service/identity_flow_test.go new file mode 100644 index 0000000..acf1a4d --- /dev/null +++ b/internal/ra/service/identity_flow_test.go @@ -0,0 +1,461 @@ +package service_test + +import ( + "context" + "testing" + "time" + + "github.com/rs/zerolog" + + "github.com/godaddy/ans/internal/adapter/cert" + "github.com/godaddy/ans/internal/adapter/dns" + "github.com/godaddy/ans/internal/adapter/store/sqlite" + "github.com/godaddy/ans/internal/domain" + "github.com/godaddy/ans/internal/ra/service" +) + +// selfCAOf unwraps the fixture's identity CA so tests can assert +// CA-side revocation through IsRevoked. 
+func selfCAOf(t *testing.T, fx *regFixture) *cert.SelfCA { + t.Helper() + ca, ok := fx.identityCA.(*cert.SelfCA) + if !ok { + t.Fatalf("fixture identity CA is %T, want *cert.SelfCA", fx.identityCA) + } + return ca +} + +// TestSubmitIdentityCSR_RotationSignsImmediately pins the rotation +// flow: an ACTIVE agent's new identity CSR is signed at submission — +// the CSR row flips to SIGNED and a second identity certificate +// (carrying its serial) lands in the store. Pre-fix, rotation CSRs +// sat PENDING forever because nothing ever signed them. +func TestSubmitIdentityCSR_RotationSignsImmediately(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + agentID := registerAndActivate(t, fx, fx.svc) + + rotationCSR := testCSR(t, fx.req.AnsName.String()) + csrID, err := fx.svc.SubmitIdentityCSR(context.Background(), agentID, rotationCSR) + if err != nil { + t.Fatalf("rotation submit: %v", err) + } + + csr, err := fx.svc.GetCSRStatus(context.Background(), agentID, csrID) + if err != nil { + t.Fatal(err) + } + if csr.Status != domain.CSRStatusSigned { + t.Fatalf("rotation CSR status: got %s want SIGNED", csr.Status) + } + + certs, err := fx.certs.FindIdentityCertificatesByAgent(context.Background(), agentID) + if err != nil { + t.Fatal(err) + } + if len(certs) != 2 { + t.Fatalf("identity certs after rotation: got %d want 2 (rotation is additive)", len(certs)) + } + for _, c := range certs { + if c.Status != domain.CertStatusValid { + t.Errorf("cert %s: status %s, want VALID (old cert stays valid until expiry)", c.CSRID, c.Status) + } + if c.SerialNumber == "" { + t.Errorf("cert %s: missing serial number", c.CSRID) + } + } +} + +// TestRevoke_RevokesIdentityCertsAtCA pins CA-side revocation: the +// revoke flow must tell the issuing CA, not just flip database rows — +// with a cloud private CA that call is what lands the cert on the +// CRL/OCSP plane. Pre-fix the port method was never invoked. 
+func TestRevoke_RevokesIdentityCertsAtCA(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + agentID := registerAndActivate(t, fx, fx.svc) + + certs, err := fx.certs.FindIdentityCertificatesByAgent(context.Background(), agentID) + if err != nil || len(certs) != 1 { + t.Fatalf("precondition: certs=%d err=%v", len(certs), err) + } + serial := certs[0].SerialNumber + if serial == "" { + t.Fatal("precondition: stored cert must carry its serial") + } + + if _, err := fx.svc.Revoke(context.Background(), agentID, service.RevokeInput{ + Reason: domain.RevocationKeyCompromise, + }); err != nil { + t.Fatalf("revoke: %v", err) + } + + if !selfCAOf(t, fx).IsRevoked(serial) { + t.Error("identity cert was not revoked at the issuing CA") + } + after, err := fx.certs.FindIdentityCertificatesByAgent(context.Background(), agentID) + if err != nil { + t.Fatal(err) + } + if after[0].Status != domain.CertStatusRevoked { + t.Errorf("stored cert status: got %s want REVOKED", after[0].Status) + } +} + +// TestRevoke_LegacyRowsDeriveSerialFromPEM pins the fallback: rows +// persisted before serial tracking (migration 007) have no stored +// serial, so CA revocation parses it out of the certificate PEM. +func TestRevoke_LegacyRowsDeriveSerialFromPEM(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + agentID := registerAndActivate(t, fx, fx.svc) + + certs, _ := fx.certs.FindIdentityCertificatesByAgent(context.Background(), agentID) + serial := certs[0].SerialNumber + + // Age the row into its pre-007 shape. 
+ db, ok := fx.uow.(*sqlite.DB) + if !ok { + t.Fatalf("fixture uow is %T, want *sqlite.DB", fx.uow) + } + if _, err := db.DBX().Exec( + `UPDATE issued_certificates SET serial_number = NULL, certificate_ref = NULL WHERE agent_id = ?`, + agentID); err != nil { + t.Fatal(err) + } + + if _, err := fx.svc.Revoke(context.Background(), agentID, service.RevokeInput{ + Reason: domain.RevocationKeyCompromise, + }); err != nil { + t.Fatalf("revoke with legacy rows: %v", err) + } + if !selfCAOf(t, fx).IsRevoked(serial) { + t.Error("legacy-row revocation must derive the serial from the PEM") + } +} + +// TestRevoke_PendingDNS_CancelsWithoutTLEmit pins the cancel path: +// a PENDING_DNS registration terminates through the revoke route — +// lifecycle to REVOKED, identity cert revoked at the CA and in the +// store — and emits NOTHING to the TL, because no leaf exists for an +// agent that never reached ACTIVE. +func TestRevoke_PendingDNS_CancelsWithoutTLEmit(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + if _, err := fx.svc.RegisterAgent(context.Background(), fx.req); err != nil { + t.Fatal(err) + } + agentID := anyAgentID(t, fx, fx.req.AnsName) + if _, err := fx.svc.VerifyACME(context.Background(), agentID, service.VerifyInput{}); err != nil { + t.Fatalf("verify-acme: %v", err) + } + + certs, _ := fx.certs.FindIdentityCertificatesByAgent(context.Background(), agentID) + if len(certs) != 1 { + t.Fatalf("precondition: identity cert expected at PENDING_DNS, got %d", len(certs)) + } + + res, err := fx.svc.Revoke(context.Background(), agentID, service.RevokeInput{ + Reason: domain.RevocationCessationOfOperation, + }) + if err != nil { + t.Fatalf("cancel via revoke route: %v", err) + } + if res.Registration.Status != domain.StatusRevoked { + t.Fatalf("status: got %s want REVOKED", res.Registration.Status) + } + if !selfCAOf(t, fx).IsRevoked(certs[0].SerialNumber) { + t.Error("cancelled registration's identity cert must be revoked at the CA") + } + + rows, err := 
fx.outboxStore.Claim(context.Background(), 10) + if err != nil { + t.Fatal(err) + } + if len(rows) != 0 { + t.Fatalf("cancel must not emit to the TL (no leaf exists pre-ACTIVE), got %d rows", len(rows)) + } +} + +// TestRevoke_AwaitingValidation_NotCancellable pins the spec rule: +// a registration whose challenge is still outstanding is not +// cancellable — it auto-expires instead. +func TestRevoke_AwaitingValidation_NotCancellable(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + if _, err := fx.svc.RegisterAgent(context.Background(), fx.req); err != nil { + t.Fatal(err) + } + agentID := anyAgentID(t, fx, fx.req.AnsName) + + _, err := fx.svc.Revoke(context.Background(), agentID, service.RevokeInput{ + Reason: domain.RevocationCessationOfOperation, + }) + mustErrCode(t, err, "CANNOT_CANCEL") +} + +// TestRevoke_FailedOrder_Cancellable pins the recovery path for a +// terminally failed provider order: the registration is cancellable +// so the operator can clean up explicitly. +func TestRevoke_FailedOrder_Cancellable(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + issuer := &asyncIssuer{real: fx.serverCA, failOrder: true} + svc := rebuildWithIssuer(fx, issuer, dns.NewNoopVerifier(), nil) + + if _, err := svc.RegisterAgent(context.Background(), fx.req); err != nil { + t.Fatal(err) + } + agentID := anyAgentID(t, fx, fx.req.AnsName) + if _, err := svc.VerifyACME(context.Background(), agentID, service.VerifyInput{}); err == nil { + t.Fatal("precondition: verify-acme should fail terminally") + } + + res, err := svc.Revoke(context.Background(), agentID, service.RevokeInput{ + Reason: domain.RevocationCessationOfOperation, + }) + if err != nil { + t.Fatalf("failed-order registration must be cancellable: %v", err) + } + if res.Registration.Status != domain.StatusRevoked { + t.Fatalf("status: got %s want REVOKED", res.Registration.Status) + } +} + +// TestRevoke_Cancel_InvalidReason mirrors the active-path validation +// on the cancel branch. 
+func TestRevoke_Cancel_InvalidReason(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + if _, err := fx.svc.RegisterAgent(context.Background(), fx.req); err != nil { + t.Fatal(err) + } + agentID := anyAgentID(t, fx, fx.req.AnsName) + if _, err := fx.svc.VerifyACME(context.Background(), agentID, service.VerifyInput{}); err != nil { + t.Fatal(err) + } + _, err := fx.svc.Revoke(context.Background(), agentID, service.RevokeInput{ + Reason: domain.RevocationReason("NOT_A_REASON"), + }) + mustErrCode(t, err, "INVALID_REVOCATION_REASON") +} + +// TestExpireAgentsOnce_FlipsLapsedPendingValidation pins the +// auto-expiry promise: PENDING_VALIDATION registrations whose +// challenge window lapsed flip to EXPIRED; everything else is +// untouched. Idempotent — the second sweep finds nothing. +func TestExpireAgentsOnce_FlipsLapsedPendingValidation(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + if _, err := fx.svc.RegisterAgent(context.Background(), fx.req); err != nil { + t.Fatal(err) + } + agentID := anyAgentID(t, fx, fx.req.AnsName) + + // Not yet lapsed → untouched. + n, err := service.ExpireAgentsOnce(context.Background(), fx.agents, time.Now()) + if err != nil || n != 0 { + t.Fatalf("fresh registration must not expire: n=%d err=%v", n, err) + } + + // Age the challenge window past its deadline. + reg, err := fx.agents.FindByAgentID(context.Background(), agentID) + if err != nil { + t.Fatal(err) + } + reg.CertOrder.ExpiresAt = time.Now().Add(-time.Minute) + if err := fx.agents.Save(context.Background(), reg); err != nil { + t.Fatal(err) + } + + n, err = service.ExpireAgentsOnce(context.Background(), fx.agents, time.Now()) + if err != nil || n != 1 { + t.Fatalf("sweep: n=%d err=%v, want 1", n, err) + } + after, err := fx.agents.FindByAgentID(context.Background(), agentID) + if err != nil { + t.Fatal(err) + } + if after.Status != domain.StatusExpired { + t.Fatalf("status: got %s want EXPIRED", after.Status) + } + + // Idempotent. 
+ n, err = service.ExpireAgentsOnce(context.Background(), fx.agents, time.Now()) + if err != nil || n != 0 { + t.Fatalf("second sweep: n=%d err=%v, want 0", n, err) + } +} + +// TestExpireAgentsOnce_SkipsInFlightAndFailedOrders pins the guard +// that protects the async-issuer re-drive design: a lapsed +// PENDING_VALIDATION row whose order is ISSUING (provider validating) +// or FAILED (terminal, cancel-only) must NOT be auto-expired, even +// though its challenge window has passed. Only PENDING orders expire. +func TestExpireAgentsOnce_SkipsInFlightAndFailedOrders(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + + seed := func(host string, mutate func(*domain.CertificateOrder)) string { + req := fx.req + sv, _ := domain.ParseSemVer("1.0.0") + an, _ := domain.NewAnsName(sv, host) + req.AnsName = an + req.IdentityCSRPEM = testCSR(t, an.String()) + req.ServerCsrPEM = testServerCSR(t, an.FQDN()) + req.Endpoints = []domain.AgentEndpoint{{ + Protocol: domain.Protocol("MCP"), + AgentURL: "https://" + host + "/mcp", + Transports: []domain.Transport{domain.Transport("SSE")}, + }} + if _, err := fx.svc.RegisterAgent(context.Background(), req); err != nil { + t.Fatal(err) + } + reg, err := fx.agents.FindByAgentID(context.Background(), anyAgentID(t, fx, an)) + if err != nil { + t.Fatal(err) + } + reg.CertOrder.ExpiresAt = time.Now().Add(-time.Minute) // window lapsed + mutate(&reg.CertOrder) + if err := fx.agents.Save(context.Background(), reg); err != nil { + t.Fatal(err) + } + return reg.AgentID + } + + issuingID := seed("issuing.example.com", func(o *domain.CertificateOrder) { + _ = o.MarkIssuing() + }) + failedID := seed("failed.example.com", func(o *domain.CertificateOrder) { + _ = o.MarkFailed() + }) + pendingID := seed("pending.example.com", func(_ *domain.CertificateOrder) {}) + + n, err := service.ExpireAgentsOnce(context.Background(), fx.agents, time.Now()) + if err != nil { + t.Fatalf("sweep: %v", err) + } + if n != 1 { + t.Fatalf("only the PENDING-order row 
should expire: got n=%d want 1", n) + } + assertStatus := func(id string, want domain.RegistrationStatus) { + got, gerr := fx.agents.FindByAgentID(context.Background(), id) + if gerr != nil { + t.Fatal(gerr) + } + if got.Status != want { + t.Errorf("agent %s: status %s, want %s", id, got.Status, want) + } + } + assertStatus(issuingID, domain.StatusPendingValidation) + assertStatus(failedID, domain.StatusPendingValidation) + assertStatus(pendingID, domain.StatusExpired) +} + +// TestExpireAgentsOnce_StoreError surfaces the store error rather than +// swallowing it (the worker logs-and-continues on this; here we pin +// the error propagation deterministically by closing the DB first). +func TestExpireAgentsOnce_StoreError(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + db, ok := fx.uow.(*sqlite.DB) + if !ok { + t.Fatalf("fixture uow is %T, want *sqlite.DB", fx.uow) + } + _ = db.Close() + if _, err := service.ExpireAgentsOnce(context.Background(), fx.agents, time.Now()); err == nil { + t.Fatal("want error from a closed store") + } +} + +// TestRunAgentExpiryChecker_ExitsOnContextCancel proves the worker +// honors shutdown — the cmd/ans-ra SIGTERM path. +func TestRunAgentExpiryChecker_ExitsOnContextCancel(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + runCtx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + go func() { + service.RunAgentExpiryChecker(runCtx, fx.agents, zerolog.Nop(), service.ExpiryCheckerOptions{ + Interval: 50 * time.Millisecond, + }) + close(done) + }() + time.Sleep(80 * time.Millisecond) // let the empty-sweep tick fire + cancel() + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatal("RunAgentExpiryChecker did not exit on ctx cancel") + } +} + +// TestRunAgentExpiryChecker_DefaultInterval covers the zero-interval +// fallback; pre-cancelled ctx returns on the first select. 
+func TestRunAgentExpiryChecker_DefaultInterval(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + runCtx, cancel := context.WithCancel(context.Background()) + cancel() + service.RunAgentExpiryChecker(runCtx, fx.agents, zerolog.Nop(), service.ExpiryCheckerOptions{}) +} + +// TestRunAgentExpiryChecker_SweepsAndLogsErrors drives both ticker +// branches: a productive sweep (seeded lapsed registration) and a +// failing sweep (database closed mid-run) — neither may tear the +// worker down. +func TestRunAgentExpiryChecker_SweepsAndLogsErrors(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + if _, err := fx.svc.RegisterAgent(context.Background(), fx.req); err != nil { + t.Fatal(err) + } + agentID := anyAgentID(t, fx, fx.req.AnsName) + reg, err := fx.agents.FindByAgentID(context.Background(), agentID) + if err != nil { + t.Fatal(err) + } + reg.CertOrder.ExpiresAt = time.Now().Add(-time.Minute) + if err := fx.agents.Save(context.Background(), reg); err != nil { + t.Fatal(err) + } + + runCtx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + go func() { + service.RunAgentExpiryChecker(runCtx, fx.agents, zerolog.Nop(), service.ExpiryCheckerOptions{ + Interval: 30 * time.Millisecond, + }) + close(done) + }() + + // Wait for the productive sweep to land. + deadline := time.Now().Add(2 * time.Second) + for { + got, gerr := fx.agents.FindByAgentID(context.Background(), agentID) + if gerr == nil && got.Status == domain.StatusExpired { + break + } + if time.Now().After(deadline) { + cancel() + t.Fatal("sweep never expired the lapsed registration") + } + time.Sleep(10 * time.Millisecond) + } + + // Close the DB out from under the worker: the next sweep errors + // and is logged, not fatal. 
+ db, ok := fx.uow.(*sqlite.DB) + if !ok { + t.Fatalf("fixture uow is %T, want *sqlite.DB", fx.uow) + } + _ = db.Close() + time.Sleep(80 * time.Millisecond) + + cancel() + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatal("worker did not exit on ctx cancel") + } +} diff --git a/internal/ra/service/lifecycle.go b/internal/ra/service/lifecycle.go index cf15a65..b1652d6 100644 --- a/internal/ra/service/lifecycle.go +++ b/internal/ra/service/lifecycle.go @@ -96,10 +96,16 @@ func (s *RegistrationService) GetByAgentID(ctx context.Context, agentID string) epSlice = eps.Endpoints } // BYOC server cert is optional — absent on CSR-path registrations - // where the RA signs the server cert itself. A nil result from - // the store is fine; ComputeRequiredDNSRecords skips the TLSA - // record when reg.ServerCert is nil. - if byoc, berr := s.byoc.FindLatestValidByAgentID(ctx, agentID); berr == nil && byoc != nil { + // where the RA signs the server cert itself. A genuinely-absent + // cert is fine; ComputeRequiredDNSRecords skips the TLSA record + // when reg.ServerCert is nil. A transient store failure, however, + // must not masquerade as "no cert" — that would silently drop the + // TLSA record from the detail response. + byoc, berr := s.loadServerCert(ctx, agentID) + if berr != nil { + return nil, berr + } + if byoc != nil { reg.ServerCert = byoc } return &DetailResult{ @@ -139,10 +145,21 @@ func (s *RegistrationService) ServerCertificates(ctx context.Context, agentID st // ----- CSR submission + status ----- // SubmitIdentityCSR accepts a new identity CSR for an already-ACTIVE -// agent (identity-cert rotation). Validates the CSR against the -// agent's ANS name, updates the aggregate's embedded IdentityCSR -// slot, and persists the row in the csrs table so the status endpoint -// can find it. +// agent (identity-cert rotation), signs it via the private identity +// CA, and persists the SIGNED CSR row plus the new certificate +// atomically. 
The previous identity cert stays VALID until it expires +// or the agent is revoked — rotation is additive, matching the +// rotation-array model the TL envelopes carry. +// +// Trust basis: the identity CA is a private root with no validation +// of its own, and rotation deliberately performs no fresh +// domain-control challenge. Ownership of the (unchanged) ANS name was +// proven when the agent reached ACTIVE — the RA's challenge gate plus +// the public provider's own validation on the ACME path — and the +// ACTIVE + identity-bearing guards below scope rotation to exactly +// that population. A recency-bounded revalidation would require +// relaying a fresh challenge through CsrSubmissionResponse, which has +// no challenge surface in the spec. // // Per the reference RA's `CertificateManagementService.submitIdentityCsr`, // identity CSRs are gated on status == ACTIVE. The aggregate method @@ -187,10 +204,26 @@ func (s *RegistrationService) SubmitIdentityCSR(ctx context.Context, agentID, cs if err != nil { return "", err } - if err := s.agents.Save(ctx, reg); err != nil { + + // Issue before the tx (CA work doesn't need the SQLite write + // lock), persist atomically after: the SIGNED CSR row, the new + // certificate, and the aggregate's embedded slot commit together + // so a crash can never leave a SIGNED CSR without its cert. 
+ signedID, storedID, err := s.signIdentityCSR(ctx, reg, newCSR, now) + if err != nil { return "", err } - if err := s.certs.SaveCSR(ctx, agentID, newCSR); err != nil { + reg.IdentityCSR = signedID + + if err := s.uow.Run(ctx, func(txCtx context.Context) error { + if err := s.agents.Save(txCtx, reg); err != nil { + return err + } + if err := s.certs.SaveCSR(txCtx, agentID, signedID); err != nil { + return err + } + return s.certs.SaveIdentityCertificate(txCtx, agentID, storedID) + }); err != nil { return "", err } return csrID, nil @@ -251,10 +284,16 @@ func (s *RegistrationService) GetCSRStatus(ctx context.Context, agentID, csrID s // ----- Write: VerifyACME, VerifyDNS, Revoke ----- // VerifyACMEResult is returned by VerifyACME; the handler maps this -// into an AgentStatus response (status=PENDING_DNS, phase=DNS_PROVISIONING). +// into an AgentStatus response. type VerifyACMEResult struct { Registration *domain.AgentRegistration Now time.Time // propagated so handler timestamps match the outbox row + // Pending is true when domain validation passed but the + // certificate provider is finalizing the order asynchronously. + // The lifecycle status stays PENDING_VALIDATION with the order in + // ISSUING; the handler reports phase=CERTIFICATE_ISSUANCE and the + // client re-POSTs verify-acme to drive the order to completion. + Pending bool } // VerifyACME advances the registration from PENDING_VALIDATION to @@ -265,20 +304,31 @@ type VerifyACMEResult struct { // // Steps: // -// 1. Verify the ACME DNS-01 challenge TXT record resolves to the -// expected token. (Skipped when dnsVerifier is nil — local dev.) -// 2. Sign the identity CSR via identityCA. Persist the resulting +// 1. Challenge gate: confirm at least one of the order's +// domain-control challenge artifacts (DNS-01 TXT record or +// HTTP-01 resource) is published. 
Unconditional while the order +// awaits validation — the issuer is never invoked before the +// gate passes, regardless of which issuer adapter is wired. +// 2. If a server CSR was submitted at registration (CSR path), +// finalize the certificate order via the issuer port and persist +// the resulting cert through the BYOC store (same struct covers +// both paths downstream). Asynchronous providers may leave the +// order ISSUING — the call returns Pending (nothing signed) and +// a later verify-acme re-drives the finalize. BYOC registrations +// skip this step — the operator's cert was saved at register +// time. +// 3. Sign the identity CSR via the private identityCA — only after +// ownership is fully proven (the public provider's validation on +// the CSR path, the RA's gate for BYOC). Persist the resulting // cert + mark the CSR SIGNED. -// 3. If a server CSR was submitted at registration (CSR path), -// sign it via serverCA and persist the resulting cert through -// the BYOC store (same struct covers both paths downstream). -// BYOC registrations skipped this step — the operator's cert -// was saved at register time. -// 4. Transition the aggregate to PENDING_DNS. +// 4. Transition the aggregate to PENDING_DNS (the order reaches +// COMPLETED in the same transaction). // // Idempotent: if the registration is already past PENDING_VALIDATION, // return the current state without erroring — matches the reference's -// "if already progressed, succeed silently" semantics. +// "if already progressed, succeed silently" semantics. Re-driven +// calls on an ISSUING order skip the gate (the provider already +// accepted the challenge answer) and only re-attempt the finalize. 
func (s *RegistrationService) VerifyACME(ctx context.Context, agentID string, in VerifyInput) (*VerifyACMEResult, error) { now := s.clock() reg, err := s.agents.FindByAgentID(ctx, agentID) @@ -302,34 +352,64 @@ func (s *RegistrationService) VerifyACME(ctx context.Context, agentID string, in reg.Endpoints = eps.Endpoints } - // 1. Verify the ACME DNS-01 challenge record. The expected - // value is the token the RA generated at register time; the - // record name is `_acme-challenge.`. - if s.dnsVerifier != nil && !reg.ACMEChallenge.IsZero() { - acmeRec := domain.ExpectedDNSRecord{ - Name: "_acme-challenge." + reg.FQDN(), - Type: domain.DNSRecordTXT, - Value: reg.ACMEChallenge.DNS01Token, - Purpose: "DOMAIN_VALIDATION", - Required: true, - } - res, verr := s.dnsVerifier.VerifyRecords(ctx, reg.FQDN(), []domain.ExpectedDNSRecord{acmeRec}) - if verr != nil { - return nil, fmt.Errorf("acme verify: %w", verr) + // 1. Domain-control challenge gate. + verified, err := s.gateOrderChallenges(ctx, reg, now) + if err != nil { + return nil, err + } + + // 2. CSR-path server cert: finalize the certificate order via the + // issuer port — validate + mark up front, persist below inside + // the tx. Asynchronous providers may report the order still + // pending: persist only the ISSUING order state without + // advancing the lifecycle, and let a later verify-acme re-drive + // the finalize. Nothing is signed on the pending path — on the + // ACME path the provider's own validation is the authoritative + // ownership proof, and the identity cert below is provisioned + // only once it succeeds. 
+ serverCSR, err := s.certs.FindLatestPendingCSRByType(ctx, agentID, domain.CSRTypeServer) + if err != nil { + return nil, err + } + var byocCert *domain.ByocServerCertificate + var signedSrv domain.AgentCSR + // Finalize only the server CSR that belongs to this registration's + // certificate order — one created via the issuer's CreateOrder, + // which always stamps an OrderRef (the self-CA uses a "selfca-…" + // handle, ACME the order URL). A BYOC registration's order is + // self-issued with an empty ref and its cert already exists; a + // stray server CSR submitted out-of-band to such an agent (the + // POST /certificates/server route accepts CSRs in any state) must + // NOT be finalized here — doing so would issue a second cert over + // the operator's BYOC cert, and against an ACME issuer would 500 + // on the empty order ref. Leave it untouched; the agent advances. + if serverCSR != nil && reg.CertOrder.OrderRef != "" { + outcome, err := s.finalizeServerOrder(ctx, reg, serverCSR, verified, now) + if err != nil { + return nil, err } - if res != nil && !res.AllRequired { - return nil, domain.NewInvalidStateError( - "ACME_CHALLENGE_MISSING", - fmt.Sprintf("_acme-challenge.%s TXT record is not published or doesn't match the issued token", reg.FQDN()), - ) + if outcome.pending { + if err := s.agents.Save(ctx, reg); err != nil { + return nil, err + } + return &VerifyACMEResult{Registration: reg, Now: now, Pending: true}, nil } + byocCert = outcome.cert + signedSrv = outcome.signedCSR + reg.ServerCert = byocCert } - // 2. Sign the identity CSR (when the agent registered with one). - // Issuance + signing are CPU-bound and do not touch the DB; we - // run them outside the tx so the SQLite write lock isn't held - // during work that doesn't need it. Same pattern below for the - // server CSR path. + // 3. Sign the identity CSR (when the agent registered with one). 
+ // The identity CA is a private trust root with no challenge + // lifecycle of its own — it signs on the strength of the + // ownership proof already established: the RA's gate for BYOC + // (no public CA involved), plus the provider's own validation + // on the CSR path (the order completed above). That ordering + // is deliberate: a terminally failed public-CA order must never + // leave a signed identity cert behind. + // + // Issuance + signing run outside the tx so the SQLite write + // lock isn't held during work that doesn't need it. // // A nil pending identity CSR is NOT an error: an agent may // register without an identity CSR (identityCsrPEM optional), in @@ -343,45 +423,23 @@ func (s *RegistrationService) VerifyACME(ctx context.Context, agentID string, in var signedID *domain.AgentCSR var storedID *domain.StoredCertificate if identityCSR != nil { - issuedID, err := s.identityCA.IssueIdentityCertificate(ctx, identityCSR.CSRContent, reg.AnsName.String()) - if err != nil { - return nil, domain.NewInternalError("CERT_ISSUE_FAILED", "failed to issue identity cert", err) - } - signed, err := identityCSR.MarkSigned(now) + signedID, storedID, err = s.signIdentityCSR(ctx, reg, identityCSR, now) if err != nil { return nil, err } - reg.IdentityCSR = &signed - signedID = &signed - storedID = &domain.StoredCertificate{ - CSRID: identityCSR.CSRID, - CertificateType: domain.CertTypeIdentity, - CertificatePEM: issuedID.CertPEM, - ChainPEM: issuedID.ChainPEM, - Status: domain.CertStatusValid, - IssueTimestamp: issuedID.IssuedAt, - ExpirationTimestamp: issuedID.ExpiresAt, - } + reg.IdentityCSR = signedID } - // 3. CSR-path server cert: same shape — sign + validate up - // front, persist below inside the tx. 
- serverCSR, err := s.certs.FindLatestPendingCSRByType(ctx, agentID, domain.CSRTypeServer) - if err != nil { - return nil, err - } - var byocCert *domain.ByocServerCertificate - var signedSrv domain.AgentCSR - if serverCSR != nil { - var err error - byocCert, signedSrv, err = s.signServerCSRForVerifyACME(ctx, reg, serverCSR, now) - if err != nil { + // 4. Transition to PENDING_DNS in-memory; the tx below commits it. + // The order completes in the same step: for the CSR path the + // certificate landed, for BYOC domain control was proven — the + // only thing its self-issued order tracks. Legacy registrations + // without a persisted order have nothing to complete. + if !reg.CertOrder.IsZero() { + if err := reg.CertOrder.MarkCompleted(); err != nil { return nil, err } - reg.ServerCert = byocCert } - - // 4. Transition to PENDING_DNS in-memory; the tx below commits it. if err := reg.AdvanceToPendingDNS(); err != nil { return nil, err } @@ -422,30 +480,195 @@ func (s *RegistrationService) VerifyACME(ctx context.Context, agentID string, in return &VerifyACMEResult{Registration: reg, Now: now}, nil } -// signServerCSRForVerifyACME signs the pending server CSR via the -// configured server CA, validates the issued cert, and returns the -// BYOC-shape cert struct + the SIGNED CSR row so the caller can -// commit both inside its uow transaction. Extracted from VerifyACME -// to keep the orchestrator under the funlen bound; the issuance + -// validation are CPU-only and don't need to hold the SQLite write -// lock. -func (s *RegistrationService) signServerCSRForVerifyACME( +// gateOrderChallenges is the domain-control challenge gate. It runs +// before every issuer invocation, regardless of which issuer adapter +// is wired: +// +// - PENDING order → at least one challenge artifact must be +// verified as published (DNS-01 TXT or HTTP-01 resource); +// otherwise 422 ACME_CHALLENGE_MISSING. Expired challenge +// windows are 422 ACME_CHALLENGE_EXPIRED. 
+// - ISSUING order → gate skipped: the provider already accepted a +// challenge answer on an earlier call, and the operator may have +// legitimately removed the artifact since. The re-driven call +// only re-attempts the finalize. +// - FAILED order → 422 CERT_ORDER_FAILED; the operator cancels and +// re-registers. +// +// Returns the challenge types found published so ACME-style issuers +// can answer exactly the satisfied challenge (answering an +// unsatisfied one would invalidate the authorization). +// +// NOTE: zero-value orders (registrations predating order persistence) +// skip the gate — no challenge was ever issued to the operator, so +// there is nothing that could be verified. Every registration created +// since order persistence carries one. +func (s *RegistrationService) gateOrderChallenges( + ctx context.Context, reg *domain.AgentRegistration, now time.Time, +) ([]domain.ChallengeType, error) { + order := reg.CertOrder + switch { + case order.IsZero(): + return nil, nil + case order.State == domain.OrderStateIssuing: + return nil, nil + case order.State == domain.OrderStateFailed: + // 422 (validation), not 409: the spec documents only 422 on + // verify-acme. Recovery is to cancel (POST /revoke — cancel + // permits a failed order) then register a new version; the + // ANS name is immutable once used. + return nil, domain.NewValidationError("CERT_ORDER_FAILED", + "certificate order failed terminally; cancel this registration (POST /revoke) and register a new version") + case order.State != domain.OrderStatePending: + // COMPLETED while still PENDING_VALIDATION is unreachable — + // the order completes in the same transaction that advances + // the lifecycle. Tolerate rather than brick the row. + return nil, nil + } + if order.IsExpired(now) { + // A lapsed-window order stays PENDING (expiry doesn't change + // State), so cancel refuses it — the agent-expiry sweeper + // retires it instead. 
Guide the operator to the action that + // actually works. + return nil, domain.NewValidationError("ACME_CHALLENGE_EXPIRED", + "the domain-control challenge window has expired; this registration will auto-expire — register a new version to retry") + } + verified, verr := s.verifyChallengeArtifacts(ctx, reg.FQDN(), order.Challenges) + if len(verified) > 0 { + return verified, nil + } + if verr != nil { + return nil, fmt.Errorf("acme verify: %w", verr) + } + return nil, domain.NewValidationError( + "ACME_CHALLENGE_MISSING", + fmt.Sprintf("no domain-control challenge artifact found for %s — publish the DNS-01 TXT record or the HTTP-01 resource from challenges[]", reg.FQDN()), + ) +} + +// verifyChallengeArtifacts checks the challenge set and returns the +// type of the FIRST artifact found published. The gate is any-of: the +// owner satisfies whichever challenge is easiest, so the first +// success is sufficient and the loop short-circuits — no point making +// a second (network) probe, and for HTTP-01 that probe is an outbound +// fetch we'd rather not issue needlessly. A configuration with no +// verifier at all is an error — silently passing the gate would +// reopen the very hole this exists to close. 
+func (s *RegistrationService) verifyChallengeArtifacts( + ctx context.Context, fqdn string, challenges []domain.Challenge, +) ([]domain.ChallengeType, error) { + if len(challenges) == 0 { + return nil, nil + } + if s.dnsVerifier == nil && s.httpChallenge == nil { + return nil, domain.NewInternalError("CHALLENGE_VERIFIER_MISSING", + "no challenge verifier configured — wire a DNS verifier and/or an HTTP challenge verifier", nil) + } + var firstErr error + for _, ch := range challenges { + ok, err := s.verifyChallengeArtifact(ctx, fqdn, ch) + if err != nil && firstErr == nil { + firstErr = err + } + if ok { + return []domain.ChallengeType{ch.Type}, nil + } + } + return nil, firstErr +} + +// verifyChallengeArtifact dispatches a single challenge to the +// matching verifier. Challenges with no wired verifier (or of an +// unknown type) report unverified rather than erroring — the any-of +// gate lets a sibling challenge still pass. +func (s *RegistrationService) verifyChallengeArtifact( + ctx context.Context, fqdn string, ch domain.Challenge, +) (bool, error) { + switch ch.Type { + case domain.ChallengeTypeDNS01: + if s.dnsVerifier == nil { + return false, nil + } + rec := domain.ExpectedDNSRecord{ + Name: ch.EffectiveDNSRecordName(fqdn), + Type: domain.DNSRecordTXT, + Value: ch.EffectiveDNSRecordValue(), + Purpose: "DOMAIN_VALIDATION", + Required: true, + } + res, err := s.dnsVerifier.VerifyRecords(ctx, fqdn, []domain.ExpectedDNSRecord{rec}) + if err != nil { + return false, err + } + return res != nil && res.AllRequired, nil + case domain.ChallengeTypeHTTP01: + if s.httpChallenge == nil { + return false, nil + } + return s.httpChallenge.VerifyHTTPChallenge(ctx, fqdn, ch.EffectiveHTTPPath(), ch.ExpectedHTTPContent()) + default: + return false, nil + } +} + +// serverOrderOutcome is the result of finalizeServerOrder: either the +// issued cert + SIGNED CSR row, or pending=true when an asynchronous +// provider is still processing. 
+type serverOrderOutcome struct { + pending bool + cert *domain.ByocServerCertificate + signedCSR domain.AgentCSR +} + +// finalizeServerOrder asks the issuer to complete the certificate +// order for the pending server CSR, validates the issued cert, and +// returns the BYOC-shape cert struct + the SIGNED CSR row so the +// caller can commit both inside its uow transaction. Extracted from +// VerifyACME to keep the orchestrator under the funlen bound; the +// issuance + validation don't need to hold the SQLite write lock. +// +// Pending orders (port.ErrOrderPending) flip the order to ISSUING and +// report pending. Terminal provider failures (port.ErrOrderFailed) +// mark the order FAILED, persist it, and surface 422 — the lifecycle +// status stays PENDING_VALIDATION so the operator can cancel and +// re-register without the ANS name being burned by a FAILED agent. +func (s *RegistrationService) finalizeServerOrder( ctx context.Context, reg *domain.AgentRegistration, - serverCSR *domain.AgentCSR, now time.Time, -) (*domain.ByocServerCertificate, domain.AgentCSR, error) { + serverCSR *domain.AgentCSR, verified []domain.ChallengeType, now time.Time, +) (serverOrderOutcome, error) { if s.serverCA == nil { - return nil, domain.AgentCSR{}, domain.NewInternalError("SERVER_CA_DISABLED", - "server CSR pending but no server CA configured — inconsistent state", nil) - } - issued, err := s.serverCA.IssueServerCertificate(ctx, serverCSR.CSRContent, reg.FQDN()) - if err != nil { - return nil, domain.AgentCSR{}, domain.NewInternalError("SERVER_CERT_ISSUE_FAILED", + return serverOrderOutcome{}, domain.NewInternalError("SERVER_CA_DISABLED", + "server CSR pending but no certificate issuer configured — inconsistent state", nil) + } + issued, err := s.serverCA.FinalizeOrder(ctx, port.FinalizeOrderRequest{ + OrderRef: reg.CertOrder.OrderRef, + CSRPEM: serverCSR.CSRContent, + FQDN: reg.FQDN(), + Verified: verified, + }) + switch { + case errors.Is(err, port.ErrOrderPending): + if 
merr := reg.CertOrder.MarkIssuing(); merr != nil { + return serverOrderOutcome{}, merr + } + return serverOrderOutcome{pending: true}, nil + case errors.Is(err, port.ErrOrderFailed): + if merr := reg.CertOrder.MarkFailed(); merr != nil { + return serverOrderOutcome{}, merr + } + if perr := s.agents.Save(ctx, reg); perr != nil { + return serverOrderOutcome{}, perr + } + return serverOrderOutcome{}, domain.NewValidationError("CERT_ORDER_FAILED", + "certificate provider reported a terminal order failure; cancel this registration (POST /revoke) and register a new version") + case err != nil: + return serverOrderOutcome{}, domain.NewInternalError("SERVER_CERT_ISSUE_FAILED", "failed to issue server cert", err) } v, err := s.validator.ValidateServerCertificate(ctx, issued.CertPEM, issued.ChainPEM, reg.FQDN()) if err != nil { - return nil, domain.AgentCSR{}, domain.NewInternalError("SERVER_CERT_SELFVERIFY_FAILED", + return serverOrderOutcome{}, domain.NewInternalError("SERVER_CERT_SELFVERIFY_FAILED", "issued server cert failed self-validation", err) } byocCert := &domain.ByocServerCertificate{ @@ -460,9 +683,41 @@ func (s *RegistrationService) signServerCSRForVerifyACME( } signed, err := serverCSR.MarkSigned(now) if err != nil { - return nil, domain.AgentCSR{}, err + return serverOrderOutcome{}, err } - return byocCert, signed, nil + return serverOrderOutcome{cert: byocCert, signedCSR: signed}, nil +} + +// signIdentityCSR asks the private identity CA to sign the pending +// CSR and returns the SIGNED CSR row plus the stored-certificate row +// (carrying the issuer's serial and provider handle for later +// CA-side revocation) for the caller to persist inside its own +// transaction. Callers invoke this only after domain ownership is +// fully proven — the identity CA performs no validation of its own. 
+func (s *RegistrationService) signIdentityCSR( + ctx context.Context, reg *domain.AgentRegistration, + identityCSR *domain.AgentCSR, now time.Time, +) (*domain.AgentCSR, *domain.StoredCertificate, error) { + issued, err := s.identityCA.IssueIdentityCertificate(ctx, identityCSR.CSRContent, reg.AnsName.String()) + if err != nil { + return nil, nil, domain.NewInternalError("CERT_ISSUE_FAILED", "failed to issue identity cert", err) + } + signed, err := identityCSR.MarkSigned(now) + if err != nil { + return nil, nil, err + } + stored := &domain.StoredCertificate{ + CSRID: identityCSR.CSRID, + CertificateType: domain.CertTypeIdentity, + CertificatePEM: issued.CertPEM, + ChainPEM: issued.ChainPEM, + SerialNumber: issued.SerialNumber, + CertificateRef: issued.CertificateRef, + Status: domain.CertStatusValid, + IssueTimestamp: issued.IssuedAt, + ExpirationTimestamp: issued.ExpiresAt, + } + return &signed, stored, nil } // isV1Lane reports whether the caller asked for V1 TL emission. @@ -531,7 +786,16 @@ func (s *RegistrationService) VerifyDNS(ctx context.Context, agentID string, in // TLSA `_443._tcp.` record — without the cert in hand, the // record set would omit TLSA and an operator running the CSR path // would never be asked to publish the cert-binding record. - if byoc, berr := s.byoc.FindLatestValidByAgentID(ctx, agentID); berr == nil && byoc != nil { + // + // A transient store failure must abort: this transition activates + // the agent and signs the terminal AGENT_REGISTERED leaf, so a + // swallowed error here would emit an immutable attestation missing + // the TLSA record / serverCerts[] from a recoverable fault. + byoc, berr := s.loadServerCert(ctx, agentID) + if berr != nil { + return nil, berr + } + if byoc != nil { reg.ServerCert = byoc } @@ -725,15 +989,21 @@ func (s *RegistrationService) buildAgentRegisteredEvent( }) } - // BYOC server certs: if the operator provided one at registration. 
+ // Server cert (BYOC or CSR-signed): folded into the terminal + // attestation's serverCerts[]. A transient store error must abort + // the build — this leaf is signed and appended to an append-only + // log, so silently emitting empty serverCerts[] would be a + // permanently wrong artifact from a recoverable fault. var serverCertInfos []event.CertificateInfo - var byocCert *domain.ByocServerCertificate - if byoc, berr := s.byoc.FindLatestValidByAgentID(ctx, reg.AgentID); berr == nil && byoc != nil { - byocCert = byoc + byocCert, berr := s.loadServerCert(ctx, reg.AgentID) + if berr != nil { + return nil, berr + } + if byocCert != nil { serverCertInfos = []event.CertificateInfo{{ - Fingerprint: "SHA256:" + byoc.Fingerprint, + Fingerprint: "SHA256:" + byocCert.Fingerprint, CertType: "X509-DV-SERVER", - NotAfter: byoc.ValidToTimestamp.UTC().Format(time.RFC3339), + NotAfter: byocCert.ValidToTimestamp.UTC().Format(time.RFC3339), }} } @@ -783,16 +1053,24 @@ type RevokeResult struct { DNSRecordsToRemove []domain.ExpectedDNSRecord } -// Revoke transitions the registration to REVOKED, marks every active -// identity certificate REVOKED, and emits an AGENT_REVOCATION event. +// Revoke terminates a registration through the single revoke route, +// per the spec's contract: // -// Reference parity note: the reference RA refuses revocation from -// PENDING_VALIDATION with a dedicated error (application must -// complete ACME first or expire). We delegate to the domain -// aggregate's own Revoke/Cancel split — Revoke works on ACTIVE or -// DEPRECATED, Cancel works on pending states. Here we wire Revoke -// semantically; callers who want to cancel a pending registration -// should hit a separate endpoint (not in Stage 2). +// - ACTIVE / DEPRECATED agents are revoked: lifecycle → REVOKED, +// every valid identity certificate revoked at the issuing CA and +// flipped in the store, and an AGENT_REVOKED event emitted. 
+// - PENDING registrations in the PENDING_CERTS phase (order +// issuing/failed) or PENDING_DNS are cancelled: same certificate +// cleanup, but NO TL emit — under the terminal-only event model +// no leaf was ever written for an agent that never reached +// ACTIVE, so there is nothing in the log to terminate. +// - PENDING_VALIDATION registrations still awaiting their challenge +// are neither: they auto-expire when the challenge window lapses +// (the agent-expiry sweeper) — matching the spec's "not +// cancellable and will auto-expire". +// +// The domain aggregate's Revoke/Cancel split enforces the state +// rules; this method routes to whichever applies. func (s *RegistrationService) Revoke(ctx context.Context, agentID string, in RevokeInput) (*RevokeResult, error) { now := s.clock() @@ -813,6 +1091,10 @@ func (s *RegistrationService) Revoke(ctx context.Context, agentID string, in Rev reg.Endpoints = eps.Endpoints } } + // Hydrate the server cert so DNSRecordsToRemove includes the TLSA + // binding on every return path (idempotent early-return, cancel, + // and active revoke all compute it). + s.hydrateServerCert(ctx, reg) // Idempotent: already revoked → return current state without // re-emitting the event. @@ -835,6 +1117,13 @@ func (s *RegistrationService) Revoke(ctx context.Context, agentID string, in Rev }, nil } + // Pending registrations route to the cancel path: same + // certificate cleanup as a revoke, no TL emit. The domain + // aggregate enforces which pending states are cancellable. + if reg.Status.IsPending() { + return s.cancelPending(ctx, reg, in, now) + } + // Domain aggregate validates the reason + state transition. // Done in-memory before opening the tx so a precondition failure // (ErrInvalidState) doesn't open and immediately roll back a tx. 
@@ -847,15 +1136,8 @@ func (s *RegistrationService) Revoke(ctx context.Context, agentID string, in Rev return nil, err } - // Hydrate the server cert for the same reason — the - // AGENT_REVOKED event's `expiresAt` needs the server cert's - // notAfter alongside the identity certs', and `FindLatestValid` - // returns nothing once the cert has been marked revoked. - if reg.ServerCert == nil { - if all, berr := s.byoc.FindByAgentID(ctx, reg.AgentID); berr == nil && len(all) > 0 { - reg.ServerCert = all[0] - } - } + // (Server cert already hydrated above — the AGENT_REVOKED event's + // `expiresAt` needs its notAfter alongside the identity certs'.) // Read every identity cert before the tx. Pre-revoke status is // what the AGENT_REVOKED event captures; the in-tx update flips @@ -865,6 +1147,16 @@ func (s *RegistrationService) Revoke(ctx context.Context, agentID string, in Rev return nil, err } + // Revoke at the issuing CA before our own transaction so private + // CRL/OCSP distribution reflects the revocation — flipping only + // our database row would leave the certificate valid on the + // trust plane. External side effects can't roll back with the + // tx; the port's idempotency contract means a crash between CA + // revocation and the commit is healed by retrying the call. + if err := s.revokeIdentityCertsAtCA(ctx, certs, in.Reason); err != nil { + return nil, err + } + // Persist atomically: agent state, every cert revocation, and // the AGENT_REVOKED outbox row commit together. 
Pre-tx, agent // could be REVOKED while certs were still VALID and the outbox @@ -896,48 +1188,112 @@ func (s *RegistrationService) Revoke(ctx context.Context, agentID string, in Rev } return s.enqueueTLEventV1(txCtx, string(eventv1.TypeAgentRevoked), reg, v1Inner, now) } - inner := s.baseInnerEvent(reg, event.TypeAgentRevoked, now) - inner.RevokedAt = now.UTC().Format(time.RFC3339) - inner.RevocationReasonCode = string(in.Reason) - idCertInfos := make([]event.CertificateInfo, 0, len(certs)) - for _, c := range certs { - fp, ferr := fingerprintOf(c.CertificatePEM) - if ferr != nil { - return ferr - } - idCertInfos = append(idCertInfos, event.CertificateInfo{ - Fingerprint: fp, - CertType: "X509-OV-CLIENT", - NotAfter: c.ExpirationTimestamp.UTC().Format(time.RFC3339), - }) + inner, err := s.buildAgentRevokedV2Event(reg, certs, in.Reason, now) + if err != nil { + return err } - // `expiresAt` is required at event level per the reference TL - // spec, including on terminal events. Use the certs that WERE - // valid at the point of revocation — `IsValid(now)` returns - // false post-revocation but at this exact moment we still - // have the original notAfter values, so feed them through - // directly. - var minExpiry time.Time - for _, c := range certs { - if c.ExpirationTimestamp.IsZero() { - continue - } - if minExpiry.IsZero() || c.ExpirationTimestamp.Before(minExpiry) { - minExpiry = c.ExpirationTimestamp - } + return s.enqueueTLEvent(txCtx, string(event.TypeAgentRevoked), reg, inner, now) + }); err != nil { + return nil, err + } + + return &RevokeResult{ + Registration: reg, + RevokedAt: now, + DNSRecordsToRemove: domain.ComputeRequiredDNSRecords(reg, s.tlPublicBaseURL), + }, nil +} + +// buildAgentRevokedV2Event assembles the V2 AGENT_REVOKED inner +// event: the revoked identity-cert fingerprints as attestations plus +// the event-level `expiresAt`, which is required per the reference TL +// spec even on terminal events. 
Uses the certs that WERE valid at the +// point of revocation — at this exact moment the original notAfter +// values are still in hand, so they feed through directly. +func (s *RegistrationService) buildAgentRevokedV2Event( + reg *domain.AgentRegistration, certs []*domain.StoredCertificate, + reason domain.RevocationReason, now time.Time, +) (*event.Event, error) { + inner := s.baseInnerEvent(reg, event.TypeAgentRevoked, now) + inner.RevokedAt = now.UTC().Format(time.RFC3339) + inner.RevocationReasonCode = string(reason) + idCertInfos := make([]event.CertificateInfo, 0, len(certs)) + for _, c := range certs { + fp, ferr := fingerprintOf(c.CertificatePEM) + if ferr != nil { + return nil, ferr } - if reg.ServerCert != nil && !reg.ServerCert.ValidToTimestamp.IsZero() { - if minExpiry.IsZero() || reg.ServerCert.ValidToTimestamp.Before(minExpiry) { - minExpiry = reg.ServerCert.ValidToTimestamp - } + idCertInfos = append(idCertInfos, event.CertificateInfo{ + Fingerprint: fp, + CertType: "X509-OV-CLIENT", + NotAfter: c.ExpirationTimestamp.UTC().Format(time.RFC3339), + }) + } + var minExpiry time.Time + for _, c := range certs { + if c.ExpirationTimestamp.IsZero() { + continue } - if !minExpiry.IsZero() { - inner.ExpiresAt = minExpiry.UTC().Format(time.RFC3339) + if minExpiry.IsZero() || c.ExpirationTimestamp.Before(minExpiry) { + minExpiry = c.ExpirationTimestamp } - inner.Attestations = &event.Attestations{ - IdentityCerts: idCertInfos, + } + if reg.ServerCert != nil && !reg.ServerCert.ValidToTimestamp.IsZero() { + if minExpiry.IsZero() || reg.ServerCert.ValidToTimestamp.Before(minExpiry) { + minExpiry = reg.ServerCert.ValidToTimestamp } - return s.enqueueTLEvent(txCtx, string(event.TypeAgentRevoked), reg, inner, now) + } + if !minExpiry.IsZero() { + inner.ExpiresAt = minExpiry.UTC().Format(time.RFC3339) + } + inner.Attestations = &event.Attestations{ + IdentityCerts: idCertInfos, + } + return inner, nil +} + +// cancelPending terminates a pending registration through 
the revoke +// route: the aggregate's Cancel transition (which enforces the +// spec's eligibility rule), CA-side revocation of any +// already-issued identity certificates, and the store flips — +// committed atomically. Deliberately NO TL emit: under the +// terminal-only event model no leaf was ever written for an agent +// that never reached ACTIVE, so there is nothing in the log to +// terminate; emitting AGENT_REVOKED for an agent the log has never +// seen would strand verifiers on an unresolvable reference. +func (s *RegistrationService) cancelPending( + ctx context.Context, reg *domain.AgentRegistration, + in RevokeInput, now time.Time, +) (*RevokeResult, error) { + if !in.Reason.IsValid() { + return nil, domain.NewValidationError( + "INVALID_REVOCATION_REASON", fmt.Sprintf("invalid reason: %q", in.Reason)) + } + if err := reg.Cancel(now); err != nil { + return nil, err + } + + certs, err := s.certs.FindIdentityCertificatesByAgent(ctx, reg.AgentID) + if err != nil { + return nil, err + } + if err := s.revokeIdentityCertsAtCA(ctx, certs, in.Reason); err != nil { + return nil, err + } + + if err := s.uow.Run(ctx, func(txCtx context.Context) error { + if err := s.agents.Save(txCtx, reg); err != nil { + return err + } + for _, c := range certs { + if c.Status == domain.CertStatusValid { + revoked := c.Revoke() + if err := s.certs.UpdateCertificateStatus(txCtx, &revoked); err != nil { + return err + } + } + } + return nil }); err != nil { return nil, err } @@ -948,3 +1304,55 @@ func (s *RegistrationService) Revoke(ctx context.Context, agentID string, in Rev DNSRecordsToRemove: domain.ComputeRequiredDNSRecords(reg, s.tlPublicBaseURL), }, nil } + +// hydrateServerCert loads the agent's most-recent server certificate +// onto the aggregate when it isn't already populated, so the revoke +// flow can fingerprint it for the AGENT_REVOKED event and include its +// TLSA binding in DNSRecordsToRemove. 
Best-effort: a missing cert +// (BYOC never submitted, CSR order never finalized) leaves ServerCert +// nil and the TLSA record is simply omitted. FindByAgentID (not +// FindLatestValid) is used because a cert already flipped to REVOKED +// must still be fingerprinted here. +func (s *RegistrationService) hydrateServerCert(ctx context.Context, reg *domain.AgentRegistration) { + if reg.ServerCert != nil { + return + } + if all, err := s.byoc.FindByAgentID(ctx, reg.AgentID); err == nil && len(all) > 0 { + reg.ServerCert = all[0] + } +} + +// revokeIdentityCertsAtCA revokes every still-valid identity +// certificate at the issuing CA. Runs BEFORE the caller's +// transaction — CA revocation is an external side effect that cannot +// roll back, and the port contract makes it idempotent, so a crash +// between CA revocation and the commit heals on retry. The serial +// comes from the stored row when present; rows persisted before +// serial tracking fall back to parsing the certificate PEM. +func (s *RegistrationService) revokeIdentityCertsAtCA( + ctx context.Context, certs []*domain.StoredCertificate, reason domain.RevocationReason, +) error { + for _, c := range certs { + if c.Status != domain.CertStatusValid { + continue + } + serial := c.SerialNumber + if serial == "" { + parsed, err := serialFromCertPEM(c.CertificatePEM) + if err != nil { + return domain.NewInternalError("CERT_REVOKE_FAILED", + "derive certificate serial for CA revocation", err) + } + serial = parsed + } + if err := s.identityCA.RevokeCertificate(ctx, port.RevokeCertificateRequest{ + SerialNumber: serial, + CertificateRef: c.CertificateRef, + Reason: reason, + }); err != nil { + return domain.NewInternalError("CERT_REVOKE_FAILED", + "revoke identity certificate at issuing CA", err) + } + } + return nil +} diff --git a/internal/ra/service/order_flow_test.go b/internal/ra/service/order_flow_test.go new file mode 100644 index 0000000..fd3af60 --- /dev/null +++ b/internal/ra/service/order_flow_test.go 
@@ -0,0 +1,1167 @@ +package service_test + +import ( + "context" + "errors" + "strings" + "testing" + "time" + + "github.com/godaddy/ans/internal/adapter/cert" + "github.com/godaddy/ans/internal/adapter/cert/acmetest" + "github.com/godaddy/ans/internal/adapter/dns" + "github.com/godaddy/ans/internal/domain" + "github.com/godaddy/ans/internal/port" + "github.com/godaddy/ans/internal/ra/service" +) + +// errStrayFinalize is returned by refOnlyIssuer when handed an empty +// order ref — a stray-CSR finalize that should never happen. +var errStrayFinalize = errors.New("refOnlyIssuer: finalize with empty order ref") + +// errTransientByoc is a non-ErrNotFound store failure — a busy +// timeout / I/O blip — used to prove the service aborts rather than +// treating a recoverable read failure as "no cert". +var errTransientByoc = errors.New("byoc: database is locked") + +// toggleByocStore wraps a real BYOC store and, when fail is set, +// returns a transient error from the lookup. Save and everything else +// delegate to the embedded store so the happy-path setup is unaffected. +type toggleByocStore struct { + port.ByocCertificateStore + fail *bool +} + +func (s toggleByocStore) FindLatestValidByAgentID( + ctx context.Context, agentID string, +) (*domain.ByocServerCertificate, error) { + if *s.fail { + return nil, errTransientByoc + } + return s.ByocCertificateStore.FindLatestValidByAgentID(ctx, agentID) +} + +// TestVerifyDNS_TransientServerCertError_Aborts is the regression test +// for the swallow bug: a transient (non-ErrNotFound) failure loading +// the server cert during verify-dns must abort the transition, NOT +// silently activate the agent and sign a terminal AGENT_REGISTERED +// leaf with empty serverCerts[] / no TLSA. An append-only log can never +// take back a wrong leaf, so a recoverable fault must never produce one. 
+func TestVerifyDNS_TransientServerCertError_Aborts(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + fail := false + byoc := toggleByocStore{ByocCertificateStore: fx.byoc, fail: &fail} + svc := service.NewRegistrationService( + fx.agents, fx.endpoints, fx.certs, byoc, fx.renewals, + fx.validator, fx.identityCA, fx.bus, fx.outboxStore, fx.uow, + ).WithServerCertificateIssuer(fx.serverCA).WithDNSVerifier(dns.NewNoopVerifier()) + + // Register + verify-acme with the store healthy so the CSR-issued + // server cert is persisted and the agent reaches the pre-DNS state. + if _, err := svc.RegisterAgent(context.Background(), fx.req); err != nil { + t.Fatalf("register: %v", err) + } + agentID := anyAgentID(t, fx, fx.req.AnsName) + if _, err := svc.VerifyACME(context.Background(), agentID, service.VerifyInput{}); err != nil { + t.Fatalf("verify-acme: %v", err) + } + + // Now make the server-cert load fail transiently and drive verify-dns. + fail = true + if _, err := svc.VerifyDNS(context.Background(), agentID, service.VerifyInput{}); !errors.Is(err, errTransientByoc) { + t.Fatalf("verify-dns must propagate the transient store error, got %v", err) + } + // The read path (GetByAgentID) must likewise surface the transient + // failure rather than returning a detail block with a silently + // dropped TLSA record. + if _, err := svc.GetByAgentID(context.Background(), agentID); !errors.Is(err, errTransientByoc) { + t.Fatalf("GetByAgentID must propagate the transient store error, got %v", err) + } + + // The agent must NOT have advanced to ACTIVE on a swallowed error. + fail = false + reg, err := fx.agents.FindByAgentID(context.Background(), agentID) + if err != nil { + t.Fatalf("reload agent: %v", err) + } + if reg.Status == domain.StatusActive { + t.Fatal("agent activated despite a transient server-cert load failure — a corrupted leaf would have been signed") + } +} + +// mustErrCode fails unless err is a *domain.Error carrying exactly the +// given RFC 7807 code. 
The code is the programmatic contract (per +// CLAUDE.md), so the negative-path tests assert it directly rather +// than settling for "some error occurred" — which would let a +// regression that swaps one failure code for another pass silently. +func mustErrCode(t *testing.T, err error, wantCode string) { + t.Helper() + var de *domain.Error + if !asDomainErr(err, &de) { + t.Fatalf("want *domain.Error with code %q, got %T: %v", wantCode, err, err) + } + if de.Code != wantCode { + t.Fatalf("error code: got %q want %q (%v)", de.Code, wantCode, err) + } +} + +// asyncIssuer wraps the real self-signed CA but simulates an +// asynchronous provider (an ACME CA such as Let's Encrypt): the first +// `pendingCalls` FinalizeOrder invocations report the order still +// processing, and `failOrder` simulates a terminal provider failure. +// CreateOrder delegates to the real CA so challenge relay stays +// realistic. +type asyncIssuer struct { + real port.ServerCertificateIssuer + pendingCalls int + failOrder bool + // lastVerified records what the service claimed was verified, so + // tests can assert the RA only tells providers about challenges + // it actually checked. + lastVerified []domain.ChallengeType +} + +func (a *asyncIssuer) CreateOrder(ctx context.Context, fqdn string) (*domain.CertificateOrder, error) { + return a.real.CreateOrder(ctx, fqdn) +} + +func (a *asyncIssuer) FinalizeOrder(ctx context.Context, req port.FinalizeOrderRequest) (*port.IssuedCert, error) { + a.lastVerified = req.Verified + if a.failOrder { + return nil, port.ErrOrderFailed + } + if a.pendingCalls > 0 { + a.pendingCalls-- + return nil, port.ErrOrderPending + } + return a.real.FinalizeOrder(ctx, req) +} + +func (a *asyncIssuer) GetCACertificate(ctx context.Context) (string, error) { + return a.real.GetCACertificate(ctx) +} + +// failingDNSVerifier reports every record as unpublished. 
+type failingDNSVerifier struct{} + +func (failingDNSVerifier) VerifyRecords(_ context.Context, _ string, expected []domain.ExpectedDNSRecord) (*port.VerificationResult, error) { + return &port.VerificationResult{AllRequired: false}, nil +} + +// staticHTTPVerifier answers every HTTP-01 check with a fixed result. +type staticHTTPVerifier struct{ ok bool } + +func (s staticHTTPVerifier) VerifyHTTPChallenge(_ context.Context, _, _, _ string) (bool, error) { + return s.ok, nil +} + +// rebuildWithIssuer swaps the fixture service's issuer + verifiers. +func rebuildWithIssuer(fx *regFixture, issuer port.ServerCertificateIssuer, dnsV port.DNSVerifier, httpV port.HTTPChallengeVerifier) *service.RegistrationService { + svc := service.NewRegistrationService( + fx.agents, fx.endpoints, fx.certs, fx.byoc, fx.renewals, + fx.validator, fx.identityCA, fx.bus, fx.outboxStore, fx.uow, + ).WithServerCertificateIssuer(issuer) + if dnsV != nil { + svc = svc.WithDNSVerifier(dnsV) + } + if httpV != nil { + svc = svc.WithHTTPChallengeVerifier(httpV) + } + return svc +} + +// TestVerifyACME_AsyncIssuer_PendingThenCompletes drives the full +// asynchronous-provider flow: the first verify-acme passes the gate, +// signs the identity cert, parks the order in ISSUING without +// advancing the lifecycle, and a re-POSTed verify-acme skips the gate +// and finalizes to PENDING_DNS. This is the contract an ACME adapter +// (Let's Encrypt) plugs into. +func TestVerifyACME_AsyncIssuer_PendingThenCompletes(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + issuer := &asyncIssuer{real: fx.serverCA, pendingCalls: 1} + svc := rebuildWithIssuer(fx, issuer, dns.NewNoopVerifier(), nil) + + if _, err := svc.RegisterAgent(context.Background(), fx.req); err != nil { + t.Fatalf("register: %v", err) + } + agentID := anyAgentID(t, fx, fx.req.AnsName) + + // First call: gate passes (noop DNS), provider reports pending. 
+ res, err := svc.VerifyACME(context.Background(), agentID, service.VerifyInput{}) + if err != nil { + t.Fatalf("verify-acme #1: %v", err) + } + if !res.Pending { + t.Fatal("want Pending=true while the provider finalizes") + } + if res.Registration.Status != domain.StatusPendingValidation { + t.Fatalf("lifecycle must stay PENDING_VALIDATION, got %s", res.Registration.Status) + } + if res.Registration.CertOrder.State != domain.OrderStateIssuing { + t.Fatalf("order state: got %s want ISSUING", res.Registration.CertOrder.State) + } + if len(issuer.lastVerified) == 0 { + t.Error("FinalizeOrder must receive the verified challenge types") + } + + // The ISSUING order state must have been persisted. + stored, err := fx.agents.FindByAgentID(context.Background(), agentID) + if err != nil { + t.Fatal(err) + } + if stored.CertOrder.State != domain.OrderStateIssuing { + t.Fatalf("persisted order state: got %s want ISSUING", stored.CertOrder.State) + } + // The identity cert is provisioned only once the public provider's + // validation succeeds — the order completing is that proof. While + // the order is still ISSUING nothing may be signed: a terminally + // failed order must never leave an identity cert behind. + idCerts, err := fx.certs.FindIdentityCertificatesByAgent(context.Background(), agentID) + if err != nil { + t.Fatal(err) + } + if len(idCerts) != 0 { + t.Fatalf("identity cert must NOT be issued while the order is pending: certs=%d", len(idCerts)) + } + + // Re-driven call: gate skipped (order ISSUING), finalize succeeds. 
+ res2, err := svc.VerifyACME(context.Background(), agentID, service.VerifyInput{}) + if err != nil { + t.Fatalf("verify-acme #2: %v", err) + } + if res2.Pending { + t.Fatal("second call should complete the order") + } + if res2.Registration.Status != domain.StatusPendingDNS { + t.Fatalf("status after completion: got %s want PENDING_DNS", res2.Registration.Status) + } + if res2.Registration.CertOrder.State != domain.OrderStateCompleted { + t.Fatalf("order state after completion: got %s want COMPLETED", res2.Registration.CertOrder.State) + } + if res2.Registration.ServerCert == nil { + t.Fatal("server cert missing after async completion") + } + // Identity cert lands with the completion, carrying the serial + // captured for later CA-side revocation. + idCerts, err = fx.certs.FindIdentityCertificatesByAgent(context.Background(), agentID) + if err != nil || len(idCerts) != 1 { + t.Fatalf("identity cert must be issued at order completion: certs=%d err=%v", len(idCerts), err) + } + if idCerts[0].SerialNumber == "" { + t.Error("stored identity cert must carry its serial number") + } +} + +// TestVerifyACME_AsyncIssuer_TerminalFailure pins the ErrOrderFailed +// contract: the order flips FAILED (persisted), the lifecycle stays +// PENDING_VALIDATION so the ANS name isn't burned, and subsequent +// verify-acme calls surface the dead order. 
+func TestVerifyACME_AsyncIssuer_TerminalFailure(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + issuer := &asyncIssuer{real: fx.serverCA, failOrder: true} + svc := rebuildWithIssuer(fx, issuer, dns.NewNoopVerifier(), nil) + + if _, err := svc.RegisterAgent(context.Background(), fx.req); err != nil { + t.Fatalf("register: %v", err) + } + agentID := anyAgentID(t, fx, fx.req.AnsName) + + _, err := svc.VerifyACME(context.Background(), agentID, service.VerifyInput{}) + mustErrCode(t, err, "CERT_ORDER_FAILED") + stored, ferr := fx.agents.FindByAgentID(context.Background(), agentID) + if ferr != nil { + t.Fatal(ferr) + } + if stored.CertOrder.State != domain.OrderStateFailed { + t.Fatalf("order state: got %s want FAILED", stored.CertOrder.State) + } + if stored.Status != domain.StatusPendingValidation { + t.Fatalf("lifecycle must stay PENDING_VALIDATION, got %s", stored.Status) + } + + // Re-POST: the gate reports the dead order. + if _, err := svc.VerifyACME(context.Background(), agentID, service.VerifyInput{}); err == nil { + t.Fatal("want CERT_ORDER_FAILED from the gate on a FAILED order") + } +} + +// TestVerifyACME_Gate_MissingArtifact pins the unconditional gate: a +// DNS verifier that finds nothing and no HTTP verifier → 422, no +// issuance, no lifecycle movement. 
+func TestVerifyACME_Gate_MissingArtifact(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + svc := rebuildWithIssuer(fx, fx.serverCA, failingDNSVerifier{}, nil) + + if _, err := svc.RegisterAgent(context.Background(), fx.req); err != nil { + t.Fatalf("register: %v", err) + } + agentID := anyAgentID(t, fx, fx.req.AnsName) + + _, err := svc.VerifyACME(context.Background(), agentID, service.VerifyInput{}) + mustErrCode(t, err, "ACME_CHALLENGE_MISSING") + stored, _ := fx.agents.FindByAgentID(context.Background(), agentID) + if stored.Status != domain.StatusPendingValidation { + t.Fatalf("gate failure must not advance the lifecycle, got %s", stored.Status) + } +} + +// TestVerifyACME_Gate_HTTP01Satisfies pins the any-of semantics: DNS +// artifact absent but the HTTP-01 resource is live → gate passes and +// the registration advances. +func TestVerifyACME_Gate_HTTP01Satisfies(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + svc := rebuildWithIssuer(fx, fx.serverCA, failingDNSVerifier{}, staticHTTPVerifier{ok: true}) + + if _, err := svc.RegisterAgent(context.Background(), fx.req); err != nil { + t.Fatalf("register: %v", err) + } + agentID := anyAgentID(t, fx, fx.req.AnsName) + + res, err := svc.VerifyACME(context.Background(), agentID, service.VerifyInput{}) + if err != nil { + t.Fatalf("verify-acme with live HTTP-01: %v", err) + } + if res.Registration.Status != domain.StatusPendingDNS { + t.Fatalf("status: got %s want PENDING_DNS", res.Registration.Status) + } +} + +// TestVerifyACME_Gate_NoVerifierConfigured pins the misconfiguration +// guard: challenges exist but nothing can check them → error, never a +// silent pass. 
+func TestVerifyACME_Gate_NoVerifierConfigured(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + svc := rebuildWithIssuer(fx, fx.serverCA, nil, nil) + + if _, err := svc.RegisterAgent(context.Background(), fx.req); err != nil { + t.Fatalf("register: %v", err) + } + agentID := anyAgentID(t, fx, fx.req.AnsName) + + _, err := svc.VerifyACME(context.Background(), agentID, service.VerifyInput{}) + mustErrCode(t, err, "CHALLENGE_VERIFIER_MISSING") +} + +// TestVerifyACME_Gate_ExpiredChallengeWindow pins expiry enforcement — +// the relayed expiresAt is honored, not decorative. +func TestVerifyACME_Gate_ExpiredChallengeWindow(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + svc := rebuildWithIssuer(fx, fx.serverCA, dns.NewNoopVerifier(), nil) + + if _, err := svc.RegisterAgent(context.Background(), fx.req); err != nil { + t.Fatalf("register: %v", err) + } + agentID := anyAgentID(t, fx, fx.req.AnsName) + + // Age the order past its window directly in the store. + stored, err := fx.agents.FindByAgentID(context.Background(), agentID) + if err != nil { + t.Fatal(err) + } + stored.CertOrder.ExpiresAt = time.Now().Add(-time.Minute) + if err := fx.agents.Save(context.Background(), stored); err != nil { + t.Fatal(err) + } + + _, err = svc.VerifyACME(context.Background(), agentID, service.VerifyInput{}) + mustErrCode(t, err, "ACME_CHALLENGE_EXPIRED") +} + +// TestRenewal_AsyncIssuer_PendingThenCompletes drives the renewal +// lane's async path: verify-acme verifies the challenge, the provider +// reports pending (ISSUING_CERTIFICATE), and a re-POST completes the +// renewal with the new TLSA record surfaced. +func TestRenewal_AsyncIssuer_PendingThenCompletes(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + + // Activate against the synchronous issuer, then renew against an + // asynchronous one. 
+ activateSvc := rebuildWithIssuer(fx, fx.serverCA, dns.NewNoopVerifier(), nil) + agentID := registerAndActivate(t, fx, activateSvc) + + issuer := &asyncIssuer{real: fx.serverCA, pendingCalls: 1} + svc := rebuildWithIssuer(fx, issuer, dns.NewNoopVerifier(), nil) + + sub, err := svc.SubmitServerCertRenewal(context.Background(), agentID, service.SubmitRenewalInput{ + ServerCsrPEM: testServerCSR(t, fx.req.AnsName.FQDN()), + }) + if err != nil { + t.Fatalf("submit renewal: %v", err) + } + if len(sub.Renewal.Validation.Challenges) == 0 { + t.Fatal("renewal must relay the order's challenges") + } + if sub.FQDN != fx.req.AnsName.FQDN() { + t.Fatalf("submission FQDN: got %q", sub.FQDN) + } + + // First verify: gate passes, provider pending → Sync=false. + v1, err := svc.VerifyRenewalACME(context.Background(), agentID) + if err != nil { + t.Fatalf("verify renewal #1: %v", err) + } + if v1.Sync { + t.Fatal("want Sync=false while the provider finalizes") + } + if v1.Renewal.Validation.Status != domain.ValidationVerified { + t.Fatalf("validation: got %s want VERIFIED", v1.Renewal.Validation.Status) + } + + // Re-POST: gate skipped (already VERIFIED), finalize completes. + v2, err := svc.VerifyRenewalACME(context.Background(), agentID) + if err != nil { + t.Fatalf("verify renewal #2: %v", err) + } + if !v2.Sync { + t.Fatal("second call should complete the renewal") + } + if v2.Renewal.CompletedAt.IsZero() { + t.Fatal("renewal must be COMPLETED") + } + if v2.TLSARecord == nil || v2.TLSARecord.Type != domain.DNSRecordTLSA { + t.Fatalf("completed renewal must carry the new TLSA record, got %+v", v2.TLSARecord) + } + + // The status GET surfaces the TLSA record too — it's what the + // WAIT next-step tells the operator to poll for. 
+ got, err := svc.GetServerCertRenewal(context.Background(), agentID) + if err != nil { + t.Fatal(err) + } + if got.TLSARecord == nil { + t.Fatal("GetServerCertRenewal must carry the TLSA record once completed") + } +} + +// TestRenewal_Gate_MissingArtifact pins the renewal-lane gate — the +// pre-change noop is dead: no published artifact, no issuance. +func TestRenewal_Gate_MissingArtifact(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + + // Activate with a passing verifier first… + activateSvc := rebuildWithIssuer(fx, fx.serverCA, dns.NewNoopVerifier(), nil) + agentID := registerAndActivate(t, fx, activateSvc) + + // …then verify the renewal against a failing one. + svc := rebuildWithIssuer(fx, fx.serverCA, failingDNSVerifier{}, nil) + if _, err := svc.SubmitServerCertRenewal(context.Background(), agentID, service.SubmitRenewalInput{ + ServerCsrPEM: testServerCSR(t, fx.req.AnsName.FQDN()), + }); err != nil { + t.Fatalf("submit renewal: %v", err) + } + _, err := svc.VerifyRenewalACME(context.Background(), agentID) + mustErrCode(t, err, "ACME_CHALLENGE_MISSING") +} + +// TestRenewal_AsyncIssuer_TerminalFailure pins ErrOrderFailed on the +// renewal lane: the renewal is marked FAILED with a reason. 
+func TestRenewal_AsyncIssuer_TerminalFailure(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + + activateSvc := rebuildWithIssuer(fx, fx.serverCA, dns.NewNoopVerifier(), nil) + agentID := registerAndActivate(t, fx, activateSvc) + + issuer := &asyncIssuer{real: fx.serverCA, failOrder: true} + svc := rebuildWithIssuer(fx, issuer, dns.NewNoopVerifier(), nil) + if _, err := svc.SubmitServerCertRenewal(context.Background(), agentID, service.SubmitRenewalInput{ + ServerCsrPEM: testServerCSR(t, fx.req.AnsName.FQDN()), + }); err != nil { + t.Fatalf("submit renewal: %v", err) + } + _, verr := svc.VerifyRenewalACME(context.Background(), agentID) + mustErrCode(t, verr, "CERT_ORDER_FAILED") + got, err := svc.GetServerCertRenewal(context.Background(), agentID) + if err != nil { + t.Fatal(err) + } + if got.Renewal.FailureReason == "" { + t.Fatal("failed renewal must carry the failure reason") + } +} + +// erroringDNSVerifier simulates a systemic lookup failure (resolver +// unreachable) — distinct from "record not found". +type erroringDNSVerifier struct{} + +func (erroringDNSVerifier) VerifyRecords(_ context.Context, _ string, _ []domain.ExpectedDNSRecord) (*port.VerificationResult, error) { + return nil, context.DeadlineExceeded +} + +// brokenIssuer returns a non-sentinel error from FinalizeOrder. +type brokenIssuer struct{ port.ServerCertificateIssuer } + +func (b brokenIssuer) FinalizeOrder(_ context.Context, _ port.FinalizeOrderRequest) (*port.IssuedCert, error) { + return nil, context.Canceled +} + +// TestVerifyACME_LegacyZeroOrder_SkipsGate pins backwards +// compatibility: registrations persisted before order-tracking (zero +// order, no challenge ever issued) skip the gate and advance. 
+func TestVerifyACME_LegacyZeroOrder_SkipsGate(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + svc := rebuildWithIssuer(fx, fx.serverCA, failingDNSVerifier{}, nil) + + if _, err := svc.RegisterAgent(context.Background(), fx.req); err != nil { + t.Fatalf("register: %v", err) + } + agentID := anyAgentID(t, fx, fx.req.AnsName) + + // Rewrite the row into its legacy shape: no order at all. + stored, err := fx.agents.FindByAgentID(context.Background(), agentID) + if err != nil { + t.Fatal(err) + } + stored.CertOrder = domain.CertificateOrder{} + if err := fx.agents.Save(context.Background(), stored); err != nil { + t.Fatal(err) + } + + res, err := svc.VerifyACME(context.Background(), agentID, service.VerifyInput{}) + if err != nil { + t.Fatalf("legacy rows must not be gated on challenges they never received: %v", err) + } + if res.Registration.Status != domain.StatusPendingDNS { + t.Fatalf("status: got %s want PENDING_DNS", res.Registration.Status) + } +} + +// TestVerifyACME_Gate_SystemicDNSFailure: a resolver outage (lookup +// error, not a missing record) surfaces as an error, never a silent +// pass. +func TestVerifyACME_Gate_SystemicDNSFailure(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + svc := rebuildWithIssuer(fx, fx.serverCA, erroringDNSVerifier{}, nil) + + if _, err := svc.RegisterAgent(context.Background(), fx.req); err != nil { + t.Fatalf("register: %v", err) + } + agentID := anyAgentID(t, fx, fx.req.AnsName) + if _, err := svc.VerifyACME(context.Background(), agentID, service.VerifyInput{}); err == nil { + t.Fatal("want systemic verification error") + } +} + +// TestVerifyACME_Gate_DNSVerifierAbsent_HTTPStillChecks: a DNS_01 +// challenge with no DNS verifier wired reports unverified, while the +// wired HTTP verifier satisfies the any-of gate. 
+func TestVerifyACME_Gate_DNSVerifierAbsent_HTTPStillChecks(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + svc := rebuildWithIssuer(fx, fx.serverCA, nil, staticHTTPVerifier{ok: true}) + + if _, err := svc.RegisterAgent(context.Background(), fx.req); err != nil { + t.Fatalf("register: %v", err) + } + agentID := anyAgentID(t, fx, fx.req.AnsName) + res, err := svc.VerifyACME(context.Background(), agentID, service.VerifyInput{}) + if err != nil { + t.Fatalf("HTTP-01 alone must satisfy the gate: %v", err) + } + if res.Registration.Status != domain.StatusPendingDNS { + t.Fatalf("status: got %s", res.Registration.Status) + } +} + +// TestVerifyACME_Gate_UnknownChallengeType: challenges of a type the +// RA cannot verify report unverified; with nothing else satisfied the +// gate fails closed. +func TestVerifyACME_Gate_UnknownChallengeType(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + svc := rebuildWithIssuer(fx, fx.serverCA, dns.NewNoopVerifier(), nil) + + if _, err := svc.RegisterAgent(context.Background(), fx.req); err != nil { + t.Fatalf("register: %v", err) + } + agentID := anyAgentID(t, fx, fx.req.AnsName) + stored, err := fx.agents.FindByAgentID(context.Background(), agentID) + if err != nil { + t.Fatal(err) + } + stored.CertOrder.Challenges = []domain.Challenge{{Type: domain.ChallengeType("TLS_ALPN_01"), Token: "t"}} + if err := fx.agents.Save(context.Background(), stored); err != nil { + t.Fatal(err) + } + _, err = svc.VerifyACME(context.Background(), agentID, service.VerifyInput{}) + mustErrCode(t, err, "ACME_CHALLENGE_MISSING") +} + +// TestVerifyACME_IssuerGenericError maps non-sentinel issuer failures +// to SERVER_CERT_ISSUE_FAILED without touching the order state. 
+func TestVerifyACME_IssuerGenericError(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + svc := rebuildWithIssuer(fx, brokenIssuer{fx.serverCA}, dns.NewNoopVerifier(), nil) + + if _, err := svc.RegisterAgent(context.Background(), fx.req); err != nil { + t.Fatalf("register: %v", err) + } + agentID := anyAgentID(t, fx, fx.req.AnsName) + _, err := svc.VerifyACME(context.Background(), agentID, service.VerifyInput{}) + mustErrCode(t, err, "SERVER_CERT_ISSUE_FAILED") + stored, _ := fx.agents.FindByAgentID(context.Background(), agentID) + if stored.CertOrder.State != domain.OrderStatePending { + t.Fatalf("transient issuer errors must leave the order retryable, got %s", stored.CertOrder.State) + } +} + +// TestVerifyRenewalACME_VerifiedBYOC_Rejected guards the re-drive +// branch: a VERIFIED-but-incomplete BYOC renewal is an impossible +// state via the public API (BYOC completes in the verifying call), so +// a directly-persisted one is rejected rather than re-verified. +func TestVerifyRenewalACME_VerifiedBYOC_Rejected(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + svc := rebuildWithIssuer(fx, fx.serverCA, dns.NewNoopVerifier(), nil) + agentID := registerAndActivate(t, fx, svc) + + now := time.Now() + reg, err := fx.agents.FindByAgentID(context.Background(), agentID) + if err != nil { + t.Fatal(err) + } + r := domain.NewBYOCRenewal(agentID, reg.ID, "LEAF", "CHAIN", + domain.NewSelfIssuedOrder("d", "h", now.Add(time.Hour)), now) + verified, err := r.Validation.MarkVerified(now) + if err != nil { + t.Fatal(err) + } + r.UpdateValidationStatus(verified) + if err := fx.renewals.Save(context.Background(), r); err != nil { + t.Fatal(err) + } + + _, err = svc.VerifyRenewalACME(context.Background(), agentID) + mustErrCode(t, err, "RENEWAL_NOT_PENDING") +} + +// TestWithTLPublicBaseURL is a builder smoke test — the badge URL +// plumbing is asserted end-to-end elsewhere; this pins the accessor +// pair. 
+func TestWithTLPublicBaseURL(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + svc := rebuildWithIssuer(fx, fx.serverCA, nil, nil).WithTLPublicBaseURL("https://tl.example.org") + if svc.TLPublicBaseURL() != "https://tl.example.org" { + t.Fatalf("TLPublicBaseURL: %q", svc.TLPublicBaseURL()) + } +} + +// registerAndActivate drives a fresh registration through +// register → verify-acme → verify-dns so renewal tests start from an +// ACTIVE agent. +func registerAndActivate(t *testing.T, fx *regFixture, svc *service.RegistrationService) string { + t.Helper() + if _, err := svc.RegisterAgent(context.Background(), fx.req); err != nil { + t.Fatalf("register: %v", err) + } + agentID := anyAgentID(t, fx, fx.req.AnsName) + if _, err := svc.VerifyACME(context.Background(), agentID, service.VerifyInput{}); err != nil { + t.Fatalf("verify-acme: %v", err) + } + if _, err := svc.VerifyDNS(context.Background(), agentID, service.VerifyInput{}); err != nil { + t.Fatalf("verify-dns: %v", err) + } + return agentID +} + +// TestVerifyACME_ACMEIssuer_EndToEnd wires the real ACME adapter +// (the Let's Encrypt-shaped issuer) into the registration service +// against an in-process fake RFC 8555 provider, and drives the whole +// flow: register relays the provider's challenges, the first +// verify-acme parks the order in ISSUING while provider-side +// validation runs, and the re-driven verify-acme finalizes the order +// and lands the provider-issued chain. This is the wiring a real +// deployment gets with `ca.server.type: acme` pointed at Let's +// Encrypt staging. 
+func TestVerifyACME_ACMEIssuer_EndToEnd(t *testing.T) { + t.Parallel() + fake, err := acmetest.New() + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { + if perr := fake.Err(); perr != nil { + t.Errorf("fake acme observed a protocol violation: %v", perr) + } + fake.Close() + }) + + issuer, err := cert.NewACMEIssuer(fake.DirectoryURL(), "ops@example.com", t.TempDir(), + cert.WithFinalizeBudget(300*time.Millisecond)) + if err != nil { + t.Fatal(err) + } + + fx := newRegFixture(t) + svc := rebuildWithIssuer(fx, issuer, dns.NewNoopVerifier(), nil) + + // The fake provider authorizes agent.example.com — matching the + // fixture's default request. + if _, err := svc.RegisterAgent(context.Background(), fx.req); err != nil { + t.Fatalf("register: %v", err) + } + agentID := anyAgentID(t, fx, fx.req.AnsName) + + // The 202's challenges are the provider's: the order ref is the + // provider order URL and the DNS TXT value is the key-auth + // digest, not the raw token. + reg, err := fx.agents.FindByAgentID(context.Background(), agentID) + if err != nil { + t.Fatal(err) + } + if reg.CertOrder.OrderRef != fake.OrderURL() { + t.Fatalf("order ref: got %q want provider order URL", reg.CertOrder.OrderRef) + } + dns01, ok := reg.CertOrder.ChallengeOfType(domain.ChallengeTypeDNS01) + if !ok || dns01.EffectiveDNSRecordValue() == dns01.Token || dns01.KeyAuthorization == "" { + t.Fatalf("provider DNS challenge not relayed faithfully: %+v", dns01) + } + + // First verify-acme: gate passes (noop DNS plays the published + // artifact), provider validation outlives the finalize budget → + // order parks in ISSUING. 
+ fake.SetHoldPending(true) + res, err := svc.VerifyACME(context.Background(), agentID, service.VerifyInput{}) + if err != nil { + t.Fatalf("verify-acme #1: %v", err) + } + if !res.Pending || res.Registration.CertOrder.State != domain.OrderStateIssuing { + t.Fatalf("want pending/ISSUING while provider validates, got pending=%v state=%s", + res.Pending, res.Registration.CertOrder.State) + } + + // Provider finishes validation; the re-driven verify-acme + // finalizes and the provider-issued chain lands. + fake.SetHoldPending(false) + fake.SetOrderStatus("ready") + res2, err := svc.VerifyACME(context.Background(), agentID, service.VerifyInput{}) + if err != nil { + t.Fatalf("verify-acme #2: %v", err) + } + if res2.Pending || res2.Registration.Status != domain.StatusPendingDNS { + t.Fatalf("want completed PENDING_DNS, got pending=%v status=%s", res2.Pending, res2.Registration.Status) + } + if res2.Registration.ServerCert == nil || + !strings.Contains(res2.Registration.ServerCert.IssuerDN, "acmetest Root") { + t.Fatalf("server cert must be the provider-issued chain, got %+v", res2.Registration.ServerCert) + } +} + +// bornReadyIssuer models Let's Encrypt authorization reuse: CreateOrder +// returns an ISSUING order with NO challenges, and FinalizeOrder +// succeeds straight away (no challenge was ever published locally). 
+type bornReadyIssuer struct{ real port.ServerCertificateIssuer } + +func (b bornReadyIssuer) CreateOrder(ctx context.Context, fqdn string) (*domain.CertificateOrder, error) { + o, err := b.real.CreateOrder(ctx, fqdn) + if err != nil { + return nil, err + } + o.State = domain.OrderStateIssuing + o.Challenges = nil + return o, nil +} + +func (b bornReadyIssuer) FinalizeOrder(ctx context.Context, req port.FinalizeOrderRequest) (*port.IssuedCert, error) { + return b.real.FinalizeOrder(ctx, req) +} +func (b bornReadyIssuer) GetCACertificate(ctx context.Context) (string, error) { + return b.real.GetCACertificate(ctx) +} + +// TestVerifyACME_BornReadyOrder_SkipsGateAndFinalizes pins the +// authorization-reuse path: a registration whose order came back +// ISSUING with no challenges advances straight through verify-acme +// without the operator publishing anything — the gate skips ISSUING +// and the order finalizes. +func TestVerifyACME_BornReadyOrder_SkipsGateAndFinalizes(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + // failingDNS proves the gate is genuinely skipped (not passed): if + // the gate ran, this verifier would reject and verify-acme would + // 422. It must reach PENDING_DNS regardless. 
+ svc := rebuildWithIssuer(fx, bornReadyIssuer{real: fx.serverCA}, failingDNSVerifier{}, nil) + + if _, err := svc.RegisterAgent(context.Background(), fx.req); err != nil { + t.Fatalf("register: %v", err) + } + agentID := anyAgentID(t, fx, fx.req.AnsName) + stored, err := fx.agents.FindByAgentID(context.Background(), agentID) + if err != nil { + t.Fatal(err) + } + if stored.CertOrder.State != domain.OrderStateIssuing || len(stored.CertOrder.Challenges) != 0 { + t.Fatalf("born-ready registration order: state=%s challenges=%d", + stored.CertOrder.State, len(stored.CertOrder.Challenges)) + } + + res, err := svc.VerifyACME(context.Background(), agentID, service.VerifyInput{}) + if err != nil { + t.Fatalf("born-ready verify-acme must finalize without a gate: %v", err) + } + if res.Registration.Status != domain.StatusPendingDNS { + t.Fatalf("status: got %s want PENDING_DNS", res.Registration.Status) + } + if res.Registration.ServerCert == nil { + t.Fatal("server cert missing after born-ready finalize") + } +} + +// badCertIssuer's FinalizeOrder succeeds at the issuer but returns a +// certificate that fails the RA's own self-validation — exercising +// the SERVER_CERT_SELFVERIFY_FAILED guard on both lanes. +type badCertIssuer struct{ real port.ServerCertificateIssuer } + +func (b badCertIssuer) CreateOrder(ctx context.Context, fqdn string) (*domain.CertificateOrder, error) { + return b.real.CreateOrder(ctx, fqdn) +} +func (b badCertIssuer) FinalizeOrder(_ context.Context, _ port.FinalizeOrderRequest) (*port.IssuedCert, error) { + return &port.IssuedCert{CertPEM: "-----BEGIN CERTIFICATE-----\nbogus\n-----END CERTIFICATE-----\n"}, nil +} +func (b badCertIssuer) GetCACertificate(ctx context.Context) (string, error) { + return b.real.GetCACertificate(ctx) +} + +// TestVerifyACME_SelfVerifyFailure pins the registration-lane guard: +// an issuer that returns an unparseable cert is caught by the RA's +// post-issuance validation rather than being persisted. 
+func TestVerifyACME_SelfVerifyFailure(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + svc := rebuildWithIssuer(fx, badCertIssuer{fx.serverCA}, dns.NewNoopVerifier(), nil) + if _, err := svc.RegisterAgent(context.Background(), fx.req); err != nil { + t.Fatalf("register: %v", err) + } + agentID := anyAgentID(t, fx, fx.req.AnsName) + _, err := svc.VerifyACME(context.Background(), agentID, service.VerifyInput{}) + mustErrCode(t, err, "SERVER_CERT_SELFVERIFY_FAILED") +} + +// TestRenewal_SelfVerifyFailure pins the same guard on the renewal lane. +func TestRenewal_SelfVerifyFailure(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + activateSvc := rebuildWithIssuer(fx, fx.serverCA, dns.NewNoopVerifier(), nil) + agentID := registerAndActivate(t, fx, activateSvc) + + svc := rebuildWithIssuer(fx, badCertIssuer{fx.serverCA}, dns.NewNoopVerifier(), nil) + if _, err := svc.SubmitServerCertRenewal(context.Background(), agentID, service.SubmitRenewalInput{ + ServerCsrPEM: testServerCSR(t, fx.req.AnsName.FQDN()), + }); err != nil { + t.Fatalf("submit renewal: %v", err) + } + _, err := svc.VerifyRenewalACME(context.Background(), agentID) + mustErrCode(t, err, "SERVER_CERT_SELFVERIFY_FAILED") +} + +// TestRenewal_IssuerGenericError maps a non-sentinel issuer failure +// on the renewal lane to SERVER_CERT_ISSUE_FAILED (a retryable 500), +// distinct from the terminal ErrOrderFailed path. 
+func TestRenewal_IssuerGenericError(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + activateSvc := rebuildWithIssuer(fx, fx.serverCA, dns.NewNoopVerifier(), nil) + agentID := registerAndActivate(t, fx, activateSvc) + + svc := rebuildWithIssuer(fx, brokenIssuer{fx.serverCA}, dns.NewNoopVerifier(), nil) + if _, err := svc.SubmitServerCertRenewal(context.Background(), agentID, service.SubmitRenewalInput{ + ServerCsrPEM: testServerCSR(t, fx.req.AnsName.FQDN()), + }); err != nil { + t.Fatalf("submit renewal: %v", err) + } + _, err := svc.VerifyRenewalACME(context.Background(), agentID) + mustErrCode(t, err, "SERVER_CERT_ISSUE_FAILED") +} + +// TestRenewal_BornReadyOrder_SkipsGate pins the renewal-lane twin of +// authorization reuse: a CSR renewal whose order came back with no +// challenges finalizes without the operator publishing anything, even +// against a failing DNS verifier (proving the gate is skipped). +func TestRenewal_BornReadyOrder_SkipsGate(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + activateSvc := rebuildWithIssuer(fx, fx.serverCA, dns.NewNoopVerifier(), nil) + agentID := registerAndActivate(t, fx, activateSvc) + + svc := rebuildWithIssuer(fx, bornReadyIssuer{real: fx.serverCA}, failingDNSVerifier{}, nil) + sub, err := svc.SubmitServerCertRenewal(context.Background(), agentID, service.SubmitRenewalInput{ + ServerCsrPEM: testServerCSR(t, fx.req.AnsName.FQDN()), + }) + if err != nil { + t.Fatalf("submit renewal: %v", err) + } + if len(sub.Renewal.Validation.Challenges) != 0 { + t.Fatalf("born-ready renewal must carry no challenges, got %d", len(sub.Renewal.Validation.Challenges)) + } + + res, err := svc.VerifyRenewalACME(context.Background(), agentID) + if err != nil { + t.Fatalf("born-ready renewal verify-acme must finalize without a gate: %v", err) + } + if !res.Sync || res.Renewal.CompletedAt.IsZero() { + t.Fatalf("born-ready renewal must complete synchronously: sync=%v completed=%v", + res.Sync, 
!res.Renewal.CompletedAt.IsZero()) + } + if res.TLSARecord == nil { + t.Error("completed renewal must carry the new TLSA record") + } +} + +// TestVerifyACME_BYOCWithStrayServerCSR_IgnoresIt pins the guard: a +// BYOC registration (self-issued order, empty OrderRef) that also has +// a server CSR submitted out-of-band must NOT finalize that CSR — +// doing so would 500 against an ACME issuer and issue a duplicate +// cert against the self-CA. The agent advances on its BYOC cert. +func TestVerifyACME_BYOCWithStrayServerCSR_IgnoresIt(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + // An ACME-style issuer that 500s on an empty order ref — proving + // the stray CSR is never handed to it. + issuer := &refOnlyIssuer{real: fx.serverCA} + svc := rebuildWithIssuer(fx, issuer, dns.NewNoopVerifier(), nil) + + // BYOC registration: server cert supplied, no server CSR. + leaf, chain := buildSelfSignedServerCert(t, fx.req.AnsName.FQDN()) + req := fx.req + req.ServerCsrPEM = "" + req.ServerCertificatePEM = leaf + req.ServerCertificateChainPEM = chain + if _, err := svc.RegisterAgent(context.Background(), req); err != nil { + t.Fatalf("register BYOC: %v", err) + } + agentID := anyAgentID(t, fx, fx.req.AnsName) + + // Operator submits a stray server CSR out-of-band. + if _, err := svc.SubmitServerCSR(context.Background(), agentID, testServerCSR(t, fx.req.AnsName.FQDN())); err != nil { + t.Fatalf("submit stray server CSR: %v", err) + } + + // verify-acme must ignore the stray CSR (empty OrderRef) and + // advance on the BYOC cert — not hand it to the issuer. 
+ res, err := svc.VerifyACME(context.Background(), agentID, service.VerifyInput{}) + if err != nil { + t.Fatalf("BYOC verify-acme must not touch the stray CSR: %v", err) + } + if res.Registration.Status != domain.StatusPendingDNS { + t.Fatalf("status: got %s want PENDING_DNS", res.Registration.Status) + } + if issuer.finalizeCalls != 0 { + t.Errorf("issuer.FinalizeOrder must not be called for a BYOC registration, got %d calls", issuer.finalizeCalls) + } +} + +// TestGetByAgentID_NoServerCertYet pins the absence path: a freshly- +// registered CSR-path agent has no server cert on file yet, so the +// detail lookup must succeed with a nil ServerCert (and the BYOC +// store's ErrNotFound treated as "none", not an error). +func TestGetByAgentID_NoServerCertYet(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + svc := rebuildWithIssuer(fx, fx.serverCA, dns.NewNoopVerifier(), nil) + if _, err := svc.RegisterAgent(context.Background(), fx.req); err != nil { + t.Fatalf("register: %v", err) + } + agentID := anyAgentID(t, fx, fx.req.AnsName) + + res, err := svc.GetByAgentID(context.Background(), agentID) + if err != nil { + t.Fatalf("GetByAgentID must tolerate a missing server cert, got %v", err) + } + if res.Registration.ServerCert != nil { + t.Error("a pre-verify-acme CSR agent must have no server cert attached") + } +} + +// TestGetServerCertRenewal_TransientServerCertError_Propagates pins the +// renewal read path: once a renewal has completed, the detail lookup +// surfaces the new leaf's TLSA record — a transient failure loading +// that cert must propagate rather than silently omit the record the +// WAIT next-step tells the operator to publish. 
+func TestGetServerCertRenewal_TransientServerCertError_Propagates(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + fail := false + byoc := toggleByocStore{ByocCertificateStore: fx.byoc, fail: &fail} + svc := service.NewRegistrationService( + fx.agents, fx.endpoints, fx.certs, byoc, fx.renewals, + fx.validator, fx.identityCA, fx.bus, fx.outboxStore, fx.uow, + ).WithServerCertificateIssuer(fx.serverCA).WithDNSVerifier(dns.NewNoopVerifier()) + + agentID := registerAndActivate(t, fx, svc) + // BYOC renewal completes synchronously, leaving a completed renewal + // whose detail surfaces the TLSA record. + leaf, chain := buildSelfSignedServerCert(t, fx.req.AnsName.FQDN()) + if _, err := svc.SubmitServerCertRenewal(context.Background(), agentID, service.SubmitRenewalInput{ + ServerCertificatePEM: leaf, + ServerCertificateChainPEM: chain, + }); err != nil { + t.Fatalf("submit BYOC renewal: %v", err) + } + if _, err := svc.VerifyRenewalACME(context.Background(), agentID); err != nil { + t.Fatalf("verify renewal: %v", err) + } + + fail = true + if _, err := svc.GetServerCertRenewal(context.Background(), agentID); !errors.Is(err, errTransientByoc) { + t.Fatalf("GetServerCertRenewal must propagate the transient store error, got %v", err) + } +} + +// TestSubmitRenewal_BYOC_HappyPath pins the bring-your-own-cert +// renewal branch: no provider order is created (the operator supplies +// the cert), the validated leaf is persisted to the BYOC store, and +// the RA self-issues domain-control challenges the operator must prove +// before the new cert goes live. 
+func TestSubmitRenewal_BYOC_HappyPath(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + svc := rebuildWithIssuer(fx, fx.serverCA, dns.NewNoopVerifier(), nil) + agentID := registerAndActivate(t, fx, svc) + + leaf, chain := buildSelfSignedServerCert(t, fx.req.AnsName.FQDN()) + res, err := svc.SubmitServerCertRenewal(context.Background(), agentID, service.SubmitRenewalInput{ + ServerCertificatePEM: leaf, + ServerCertificateChainPEM: chain, + }) + if err != nil { + t.Fatalf("BYOC renewal submit: %v", err) + } + if res.CsrID != "" { + t.Errorf("BYOC renewal must not mint a CSR id, got %q", res.CsrID) + } + if res.Renewal == nil || len(res.Renewal.Validation.Challenges) == 0 { + t.Fatalf("BYOC renewal must self-issue challenges: %+v", res.Renewal) + } + // The validated cert must be persisted to the BYOC store before the + // renewal completes. + stored, err := fx.byoc.FindLatestValidByAgentID(context.Background(), agentID) + if err != nil || stored == nil { + t.Fatalf("BYOC cert not persisted at submit: cert=%v err=%v", stored, err) + } +} + +// TestSubmitRenewal_ValidationErrors pins the early reject branches. +func TestSubmitRenewal_ValidationErrors(t *testing.T) { + t.Parallel() + + // Non-active agent: a freshly-registered (still PENDING_VALIDATION) + // agent cannot renew. + t.Run("not active", func(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + svc := rebuildWithIssuer(fx, fx.serverCA, dns.NewNoopVerifier(), nil) + if _, err := svc.RegisterAgent(context.Background(), fx.req); err != nil { + t.Fatalf("register: %v", err) + } + agentID := anyAgentID(t, fx, fx.req.AnsName) + _, err := svc.SubmitServerCertRenewal(context.Background(), agentID, service.SubmitRenewalInput{ + ServerCsrPEM: testServerCSR(t, fx.req.AnsName.FQDN()), + }) + mustErrCode(t, err, "AGENT_NOT_ACTIVE") + }) + + // Exactly one of CSR / BYOC must be set — both and neither reject. 
+ for name, in := range map[string]service.SubmitRenewalInput{ + "neither": {}, + "both": {ServerCsrPEM: "x", ServerCertificatePEM: "y"}, + } { + t.Run(name, func(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + svc := rebuildWithIssuer(fx, fx.serverCA, dns.NewNoopVerifier(), nil) + agentID := registerAndActivate(t, fx, svc) + _, err := svc.SubmitServerCertRenewal(context.Background(), agentID, in) + mustErrCode(t, err, "INVALID_RENEWAL_REQUEST") + }) + } + + // Malformed server CSR rejects before any order is created. + t.Run("bad csr", func(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + svc := rebuildWithIssuer(fx, fx.serverCA, dns.NewNoopVerifier(), nil) + agentID := registerAndActivate(t, fx, svc) + _, err := svc.SubmitServerCertRenewal(context.Background(), agentID, service.SubmitRenewalInput{ + ServerCsrPEM: "-----BEGIN CERTIFICATE REQUEST-----\nnope\n-----END CERTIFICATE REQUEST-----\n", + }) + mustErrCode(t, err, "INVALID_SERVER_CSR") + }) +} + +// TestSubmitRenewal_PendingExists pins the 409: a second submit while a +// renewal is still in flight is rejected. +func TestSubmitRenewal_PendingExists(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + svc := rebuildWithIssuer(fx, fx.serverCA, dns.NewNoopVerifier(), nil) + agentID := registerAndActivate(t, fx, svc) + + csr := testServerCSR(t, fx.req.AnsName.FQDN()) + if _, err := svc.SubmitServerCertRenewal(context.Background(), agentID, service.SubmitRenewalInput{ + ServerCsrPEM: csr, + }); err != nil { + t.Fatalf("first renewal submit: %v", err) + } + _, err := svc.SubmitServerCertRenewal(context.Background(), agentID, service.SubmitRenewalInput{ + ServerCsrPEM: csr, + }) + mustErrCode(t, err, "PENDING_RENEWAL_EXISTS") +} + +// TestSubmitRenewal_NoIssuer pins the fail-fast: a CSR renewal with no +// server CA wired is rejected at submit rather than parked in a state +// that can never finalize. 
+func TestSubmitRenewal_NoIssuer(t *testing.T) { + t.Parallel() + fx := newRegFixture(t) + // Activate with a working issuer, then rebuild with none. + agentID := registerAndActivate(t, fx, rebuildWithIssuer(fx, fx.serverCA, dns.NewNoopVerifier(), nil)) + svc := rebuildWithIssuer(fx, nil, dns.NewNoopVerifier(), nil) + _, err := svc.SubmitServerCertRenewal(context.Background(), agentID, service.SubmitRenewalInput{ + ServerCsrPEM: testServerCSR(t, fx.req.AnsName.FQDN()), + }) + mustErrCode(t, err, "SERVER_CA_DISABLED") +} + +// refOnlyIssuer finalizes only when given a non-empty order ref (like +// the ACME adapter) and counts finalize calls. +type refOnlyIssuer struct { + real port.ServerCertificateIssuer + finalizeCalls int +} + +func (r *refOnlyIssuer) CreateOrder(ctx context.Context, fqdn string) (*domain.CertificateOrder, error) { + return r.real.CreateOrder(ctx, fqdn) +} +func (r *refOnlyIssuer) FinalizeOrder(ctx context.Context, req port.FinalizeOrderRequest) (*port.IssuedCert, error) { + r.finalizeCalls++ + if req.OrderRef == "" { + return nil, errStrayFinalize + } + return r.real.FinalizeOrder(ctx, req) +} +func (r *refOnlyIssuer) GetCACertificate(ctx context.Context) (string, error) { + return r.real.GetCACertificate(ctx) +} + +// Compile-time interface checks for the fakes and the ACME adapter. 
+var ( + _ port.ServerCertificateIssuer = (*asyncIssuer)(nil) + _ port.ServerCertificateIssuer = (*cert.ACMEIssuer)(nil) + _ port.DNSVerifier = failingDNSVerifier{} + _ port.HTTPChallengeVerifier = staticHTTPVerifier{} + _ = cert.ServerSelfCA{} +) diff --git a/internal/ra/service/registration.go b/internal/ra/service/registration.go index 652ef53..5ed3b8a 100644 --- a/internal/ra/service/registration.go +++ b/internal/ra/service/registration.go @@ -61,9 +61,10 @@ type RegisterRequest struct { // must be set (both set or neither set → 422): // // - ServerCsrPEM: caller submits a CSR; the service validates - // it against the agent FQDN and asks the configured - // `ServerCertificateAuthority` port to sign it. Leaf + chain - // are stored as an issued server cert. + // it against the agent FQDN, opens a certificate order via + // the configured `ServerCertificateIssuer` port, and the + // order is finalized at verify-acme. Leaf + chain are stored + // as an issued server cert. // // - ServerCertificatePEM + ServerCertificateChainPEM: BYOC. // Caller supplies a cert already signed by a public or @@ -111,6 +112,11 @@ type OutboxPayload struct { ProducerSignature string `json:"producerSignature"` } +// registrationChallengeWindow is how long the operator has to publish +// a domain-control challenge artifact and call verify-acme before the +// registration's challenge expires. +const registrationChallengeWindow = 24 * time.Hour + // RegistrationService is the aggregate-level service for the POST and // verify-* endpoints. 
// @@ -130,11 +136,15 @@ type RegistrationService struct { renewals port.RenewalStore validator port.CertificateValidator identityCA port.IdentityCertificateAuthority - serverCA port.ServerCertificateAuthority // optional; nil = CSR path rejected + serverCA port.ServerCertificateIssuer // optional; nil = CSR path rejected bus port.EventBus outbox OutboxEnqueuer uow port.UnitOfWork dnsVerifier port.DNSVerifier + // httpChallenge verifies HTTP-01 challenge artifacts. Optional — + // when nil, HTTP-01 challenges simply never verify and the gate + // relies on DNS-01. Production configs wire the default adapter. + httpChallenge port.HTTPChallengeVerifier // tlPublicBaseURL is the externally-reachable Transparency Log URL // used in _ans-badge DNS records (e.g. "https://tl.example.org"). tlPublicBaseURL string @@ -191,12 +201,23 @@ func (s *RegistrationService) WithSigner(sig EventSigner) *RegistrationService { return s } -// WithServerCertificateAuthority wires the server CA used to sign -// server CSRs submitted at registration or renewal time. When nil +// WithServerCertificateIssuer wires the certificate issuer used for +// server CSRs submitted at registration or renewal time. Orders are +// created at submission (relaying the issuer's domain-control +// challenges to the operator) and finalized at verify-acme. When nil // (or never called), the service rejects `serverCsrPEM` submissions // with SERVER_CA_DISABLED — operators deploy only the BYOC path. -func (s *RegistrationService) WithServerCertificateAuthority(ca port.ServerCertificateAuthority) *RegistrationService { - s.serverCA = ca +func (s *RegistrationService) WithServerCertificateIssuer(issuer port.ServerCertificateIssuer) *RegistrationService { + s.serverCA = issuer + return s +} + +// WithHTTPChallengeVerifier wires the verifier used to check HTTP-01 +// challenge artifacts during verify-acme. 
When nil (or never called), +// HTTP-01 challenges never verify and the challenge gate relies on +// DNS-01 alone. +func (s *RegistrationService) WithHTTPChallengeVerifier(v port.HTTPChallengeVerifier) *RegistrationService { + s.httpChallenge = v return s } @@ -248,70 +269,15 @@ func (s *RegistrationService) RegisterAgent(ctx context.Context, req RegisterReq ) } - // Server certificate: exactly one of CSR / BYOC. - // - // - CSR path: validate + call ServerCertificateAuthority to sign. - // The issued cert is stored as a BYOC cert downstream because - // the domain model doesn't distinguish "we signed it" from - // "the operator brought their own"; both end up in the - // ByocServerCertificate aggregate. The issuer DN differs (our - // self-signed root vs the operator's public CA), which is the - // audit trail. - // - BYOC path: validator checks the cert. - // - Neither: 422 (we don't allow identity-cert-only registration). - // - Both: 422 (ambiguous). - csrSet := req.ServerCsrPEM != "" - byocSet := req.ServerCertificatePEM != "" - if csrSet == byocSet { - return nil, domain.NewValidationError( - "INVALID_SERVER_CERT_INPUT", - "exactly one of serverCsrPEM or serverCertificatePEM must be provided", - ) - } - - // Validate BYOC cert / server CSR input shape. Actual cert - // issuance (identity cert sign + CSR-path server cert sign) - // is deferred to verify-acme — at registration time we haven't - // proven domain control yet, so a cert handed out at register - // time wouldn't mean anything, and producing the TLSA record - // before the server cert exists would leave the 202 response - // shape incoherent. 
- var byocCert *domain.ByocServerCertificate - var pendingServerCSR *domain.AgentCSR - switch { - case byocSet: - v, err := s.validator.ValidateServerCertificate(ctx, - req.ServerCertificatePEM, req.ServerCertificateChainPEM, req.AnsName.FQDN()) - if err != nil { - return nil, domain.NewCertificateError("INVALID_SERVER_CERT", err.Error()) - } - byocCert = &domain.ByocServerCertificate{ - LeafCertificatePEM: v.LeafPEM, - ChainCertificatesPEM: v.ChainPEM, - SubjectCommonName: v.CN, - SubjectAlternativeNames: v.SANs, - IssuerDN: v.IssuerDN, - ValidFromTimestamp: v.ValidFrom, - ValidToTimestamp: v.ValidTo, - Fingerprint: v.Fingerprint, - } - case csrSet: - if s.serverCA == nil { - return nil, domain.NewValidationError( - "SERVER_CA_DISABLED", - "serverCsrPEM submitted but no server CA is configured — either configure one or use serverCertificatePEM (BYOC)", - ) - } - if err := s.validator.ValidateServerCSR(ctx, req.ServerCsrPEM, req.AnsName.FQDN()); err != nil { - return nil, domain.NewValidationError("INVALID_SERVER_CSR", err.Error()) - } - srvCSR := domain.NewServerCSR(uuid.NewString(), req.ServerCsrPEM, now) - pendingServerCSR = &srvCSR - } - - // Validate identity CSR shape (optional). When supplied, signing is - // deferred to verify-acme and the CSR row stays PENDING until then. - // When omitted, the agent registers without an identity certificate. + // Validate identity CSR shape (optional) BEFORE the server-cert + // intake: resolveServerCertInput opens a certificate order with the + // configured issuer (a network round-trip for an ACME provider, + // counting against its order rate limits), so a request that will + // fail on a malformed identity CSR should be rejected with a cheap + // local 422 first, never after burning a provider order. When + // supplied, signing is deferred to verify-acme and the CSR row + // stays PENDING until then; when omitted, the agent registers + // without an identity certificate. 
var identityCSR *domain.AgentCSR if req.IdentityCSRPEM != "" { if err := s.validator.ValidateIdentityCSR(ctx, req.IdentityCSRPEM, req.AnsName.String()); err != nil { @@ -321,6 +287,16 @@ func (s *RegistrationService) RegisterAgent(ctx context.Context, req RegisterReq identityCSR = &csr } + // Server certificate intake: exactly one of CSR / BYOC, plus the + // certificate order whose challenges ride in the 202 response. For + // the CSR path this opens the provider order, so it runs only + // after the cheap local validations above have passed. + in, err := s.resolveServerCertInput(ctx, req, now) + if err != nil { + return nil, err + } + byocCert, pendingServerCSR, order := in.byocCert, in.serverCSR, in.order + // Build aggregates. agentID := uuid.NewString() @@ -332,19 +308,7 @@ func (s *RegistrationService) RegisterAgent(ctx context.Context, req RegisterReq return nil, err } reg.ServerCSR = pendingServerCSR - - // Generate the ACME DNS-01 challenge token + expiry. The only - // DNS action the operator should take before verify-acme. - dns01, _, err := generateChallengeTokens() - if err != nil { - return nil, domain.NewInternalError( - "CHALLENGE_GEN_FAILED", "generate ACME challenge", err, - ) - } - reg.ACMEChallenge = domain.ACMEChallenge{ - DNS01Token: dns01, - ExpiresAt: now.Add(24 * time.Hour), - } + reg.CertOrder = order // Persist the aggregate + CSR rows + BYOC cert (if any) atomically. // Each Save participates in the same transaction via the scoped @@ -407,6 +371,102 @@ func (s *RegistrationService) RegisterAgent(ctx context.Context, req RegisterReq }, nil } +// serverCertInput is the resolved server-certificate intake for a +// registration: exactly one of byocCert / serverCSR is set, and order +// carries the domain-control challenges relayed in the 202 response. 
+type serverCertInput struct { + byocCert *domain.ByocServerCertificate + serverCSR *domain.AgentCSR + order domain.CertificateOrder +} + +// resolveServerCertInput validates the server-certificate request +// shape and produces the certificate order. Exactly one of CSR / +// BYOC: +// +// - CSR path: validate the CSR, then open a certificate order via +// the configured issuer (`CreateOrder`) — the relayed challenge +// tokens are the provider's own (self-issued by the in-process +// CA, provider-minted for ACME CAs like Let's Encrypt). Actual +// issuance is deferred to verify-acme: at registration time +// domain control isn't proven yet, so a cert handed out here +// wouldn't mean anything. The issued cert is stored as a BYOC +// cert downstream because the domain model doesn't distinguish +// "we signed it" from "the operator brought their own"; the +// issuer DN is the audit trail. +// - BYOC path: validate the operator's cert. No certificate is +// being issued, but domain control must still be proven, so the +// RA self-issues a validation order. +// - Neither: 422 (identity-cert-only registration not allowed). +// - Both: 422 (ambiguous). +// +// Either way the domain owner publishes the challenge artifacts +// themselves — ANS never touches their DNS or web server. 
+func (s *RegistrationService) resolveServerCertInput( + ctx context.Context, req RegisterRequest, now time.Time, +) (serverCertInput, error) { + csrSet := req.ServerCsrPEM != "" + byocSet := req.ServerCertificatePEM != "" + if csrSet == byocSet { + return serverCertInput{}, domain.NewValidationError( + "INVALID_SERVER_CERT_INPUT", + "exactly one of serverCsrPEM or serverCertificatePEM must be provided", + ) + } + + if byocSet { + v, err := s.validator.ValidateServerCertificate(ctx, + req.ServerCertificatePEM, req.ServerCertificateChainPEM, req.AnsName.FQDN()) + if err != nil { + return serverCertInput{}, domain.NewCertificateError("INVALID_SERVER_CERT", err.Error()) + } + dns01, http01, err := generateChallengeTokens() + if err != nil { + return serverCertInput{}, domain.NewInternalError( + "CHALLENGE_GEN_FAILED", "generate ACME challenge", err, + ) + } + return serverCertInput{ + byocCert: &domain.ByocServerCertificate{ + LeafCertificatePEM: v.LeafPEM, + ChainCertificatesPEM: v.ChainPEM, + SubjectCommonName: v.CN, + SubjectAlternativeNames: v.SANs, + IssuerDN: v.IssuerDN, + ValidFromTimestamp: v.ValidFrom, + ValidToTimestamp: v.ValidTo, + Fingerprint: v.Fingerprint, + }, + order: domain.NewSelfIssuedOrder(dns01, http01, now.Add(registrationChallengeWindow)), + }, nil + } + + if s.serverCA == nil { + return serverCertInput{}, domain.NewValidationError( + "SERVER_CA_DISABLED", + "serverCsrPEM submitted but no server CA is configured — either configure one or use serverCertificatePEM (BYOC)", + ) + } + if err := s.validator.ValidateServerCSR(ctx, req.ServerCsrPEM, req.AnsName.FQDN()); err != nil { + return serverCertInput{}, domain.NewValidationError("INVALID_SERVER_CSR", err.Error()) + } + created, err := s.serverCA.CreateOrder(ctx, req.AnsName.FQDN()) + if err != nil { + return serverCertInput{}, domain.NewInternalError( + "CERT_ORDER_FAILED", "create certificate order", err, + ) + } + order := *created + // Clamp the challenge window to the registration's own 
deadline + // when the provider's order outlives it — relaying an expiry the + // registration flow won't honor would mislead the operator. + if deadline := now.Add(registrationChallengeWindow); order.ExpiresAt.IsZero() || deadline.Before(order.ExpiresAt) { + order.ExpiresAt = deadline + } + srvCSR := domain.NewServerCSR(uuid.NewString(), req.ServerCsrPEM, now) + return serverCertInput{serverCSR: &srvCSR, order: order}, nil +} + // baseInnerEvent populates the fields every event carries about its // agent: ansId, ansName, eventType, the agent host/name/version // block, raId (if the RA is configured with a signer), and the diff --git a/internal/ra/service/registration_test.go b/internal/ra/service/registration_test.go index 5eadb75..5e7f2dd 100644 --- a/internal/ra/service/registration_test.go +++ b/internal/ra/service/registration_test.go @@ -16,6 +16,7 @@ import ( "github.com/rs/zerolog" "github.com/godaddy/ans/internal/adapter/cert" + "github.com/godaddy/ans/internal/adapter/dns" "github.com/godaddy/ans/internal/adapter/eventbus" "github.com/godaddy/ans/internal/adapter/keymanager" "github.com/godaddy/ans/internal/adapter/store/sqlite" @@ -68,7 +69,7 @@ func TestRegistration_NoSigner(t *testing.T) { svcNoSig := service.NewRegistrationService( fx.agents, fx.endpoints, fx.certs, fx.byoc, fx.renewals, fx.validator, fx.identityCA, fx.bus, fx.outboxStore, fx.uow, - ).WithServerCertificateAuthority(fx.serverCA) + ).WithServerCertificateIssuer(fx.serverCA) // Use a fresh ANS name + matching CSR + matching endpoints so // every validation that checks FQDN/SAN passes. 
@@ -124,7 +125,7 @@ func TestRegistration_RollsBackOnPartialFailure(t *testing.T) { svc := service.NewRegistrationService( fx.agents, failingEndpoints, fx.certs, fx.byoc, fx.renewals, fx.validator, fx.identityCA, fx.bus, fx.outboxStore, fx.uow, - ).WithServerCertificateAuthority(fx.serverCA) + ).WithServerCertificateIssuer(fx.serverCA) if _, err := svc.RegisterAgent(context.Background(), fx.req); err == nil { t.Fatal("RegisterAgent should have surfaced the endpoint-store error") @@ -198,7 +199,7 @@ func TestRevoke_RollsBackOnOutboxFailure(t *testing.T) { svc := service.NewRegistrationService( fx.agents, fx.endpoints, fx.certs, fx.byoc, fx.renewals, fx.validator, fx.identityCA, fx.bus, &failingOutbox{}, fx.uow, - ).WithServerCertificateAuthority(fx.serverCA) + ).WithServerCertificateIssuer(fx.serverCA) if _, err := svc.Revoke(context.Background(), agentID, service.RevokeInput{ Reason: domain.RevocationKeyCompromise, @@ -264,7 +265,7 @@ type regFixture struct { renewals port.RenewalStore validator port.CertificateValidator identityCA port.IdentityCertificateAuthority - serverCA port.ServerCertificateAuthority + serverCA port.ServerCertificateIssuer bus port.EventBus signerPubPEM string } @@ -327,7 +328,10 @@ func newRegFixture(t *testing.T) *regFixture { KeyManager: km, KeyID: "ra-signer", RaID: "ra-test", - }).WithServerCertificateAuthority(serverCA) + }).WithServerCertificateIssuer(serverCA). + // The challenge gate is unconditional; the noop verifier plays + // the quickstart role (accepts any published state). + WithDNSVerifier(dns.NewNoopVerifier()) // Build a valid identity CSR whose URI SAN matches the ANS name // and a server CSR whose DNS SAN matches the FQDN. 
diff --git a/internal/ra/service/renewal.go b/internal/ra/service/renewal.go index ff4cd20..8ca8575 100644 --- a/internal/ra/service/renewal.go +++ b/internal/ra/service/renewal.go @@ -11,8 +11,15 @@ import ( "github.com/google/uuid" "github.com/godaddy/ans/internal/domain" + "github.com/godaddy/ans/internal/port" ) +// renewalChallengeWindow is how long the operator has to publish a +// domain-control challenge artifact for a renewal. Mirrors the +// domain's renewal expiry; the effective window is clamped to the +// provider order's own expiry when that is shorter. +const renewalChallengeWindow = 7 * 24 * time.Hour + // SubmitRenewalInput is what the POST /certificates/server/renewal // handler passes through. Matches V2 ServerCertificateRenewalRequest // (§1409): exactly one of ServerCsrPEM / ServerCertificatePEM must @@ -24,10 +31,13 @@ type SubmitRenewalInput struct { } // SubmitRenewalResult is returned from SubmitServerCertRenewal. The -// handler maps this into the RenewalSubmissionResponse DTO. +// handler maps this into the RenewalSubmissionResponse DTO. FQDN is +// carried so the handler can render challenge record names and URLs +// without re-fetching the agent. 
type SubmitRenewalResult struct { Renewal *domain.ServerCertificateRenewal CsrID string // non-empty for SERVER_CSR renewals + FQDN string } // SubmitServerCertRenewal initiates a server cert renewal for the @@ -85,11 +95,6 @@ func (s *RegistrationService) SubmitServerCertRenewal( "exactly one of serverCsrPEM or serverCertificatePEM must be provided") } - dns01, http01, err := generateChallengeTokens() - if err != nil { - return nil, domain.NewInternalError("CHALLENGE_GEN_FAILED", "generate challenge tokens", err) - } - var renewal *domain.ServerCertificateRenewal var csrID string @@ -102,15 +107,24 @@ func (s *RegistrationService) SubmitServerCertRenewal( return nil, domain.NewValidationError("INVALID_SERVER_CSR", "Server CSR validation failed: "+err.Error()) } - // Require a server CA for issuance at verify-acme time. We + // Require an issuer for finalization at verify-acme time. We // fail fast here rather than letting the renewal sit in - // PENDING_VALIDATION forever when the operator has no CA + // PENDING_VALIDATION forever when the operator has no issuer // wired. if s.serverCA == nil { return nil, domain.NewValidationError( "SERVER_CA_DISABLED", "serverCsrPEM renewal submitted but no server CA is configured") } + // The certificate order — and with it the domain-control + // challenges relayed to the operator — comes from the issuer + // port, so an ACME provider's own tokens flow through + // untouched. 
+ order, err := s.serverCA.CreateOrder(ctx, reg.AnsName.FQDN()) + if err != nil { + return nil, domain.NewInternalError( + "CERT_ORDER_FAILED", "create certificate order", err) + } csrID = uuid.NewString() newCSR, err := reg.SubmitServerCSR(csrID, in.ServerCsrPEM, now) if err != nil { @@ -122,7 +136,7 @@ func (s *RegistrationService) SubmitServerCertRenewal( if err := s.certs.SaveCSR(ctx, agentID, newCSR); err != nil { return nil, err } - renewal = domain.NewCSRRenewal(agentID, reg.ID, csrID, dns01, http01, now) + renewal = domain.NewCSRRenewal(agentID, reg.ID, csrID, *order, now) case byocSet: v, err := s.validator.ValidateServerCertificate(ctx, @@ -149,16 +163,35 @@ func (s *RegistrationService) SubmitServerCertRenewal( if err := s.byoc.Save(ctx, agentID, byocCert); err != nil { return nil, err } + // BYOC renewals issue no certificate, so no provider order + // exists — but domain control must still be proven before the + // operator's cert goes live. The RA self-issues the + // validation challenges. + dns01, http01, err := generateChallengeTokens() + if err != nil { + return nil, domain.NewInternalError("CHALLENGE_GEN_FAILED", "generate challenge tokens", err) + } renewal = domain.NewBYOCRenewal(agentID, reg.ID, in.ServerCertificatePEM, in.ServerCertificateChainPEM, - dns01, http01, now) + domain.NewSelfIssuedOrder(dns01, http01, now.Add(renewalChallengeWindow)), now) } if err := s.renewals.Save(ctx, renewal); err != nil { return nil, err } - return &SubmitRenewalResult{Renewal: renewal, CsrID: csrID}, nil + return &SubmitRenewalResult{Renewal: renewal, CsrID: csrID, FQDN: reg.AnsName.FQDN()}, nil +} + +// GetRenewalResult is returned from GetServerCertRenewal. FQDN lets +// the handler render challenge record names; TLSARecord is the +// DANE-EE record for the renewal's new certificate, set once the +// renewal completed — it is the artifact the WAIT next-step tells the +// operator to poll for. 
+type GetRenewalResult struct { + Renewal *domain.ServerCertificateRenewal + FQDN string + TLSARecord *domain.ExpectedDNSRecord } // GetServerCertRenewal returns the most-recent renewal for the agent @@ -166,8 +199,31 @@ func (s *RegistrationService) SubmitServerCertRenewal( // is produced by the underlying store returning ErrNotFound; callers // don't need to distinguish "no renewal" from "agent not found" // because the ownership middleware has already confirmed the agent. -func (s *RegistrationService) GetServerCertRenewal(ctx context.Context, agentID string) (*domain.ServerCertificateRenewal, error) { - return s.renewals.FindByAgentID(ctx, agentID) +func (s *RegistrationService) GetServerCertRenewal(ctx context.Context, agentID string) (*GetRenewalResult, error) { + r, err := s.renewals.FindByAgentID(ctx, agentID) + if err != nil { + return nil, err + } + reg, err := s.agents.FindByAgentID(ctx, agentID) + if err != nil { + return nil, err + } + res := &GetRenewalResult{Renewal: r, FQDN: reg.AnsName.FQDN()} + // Completed renewals surface the TLSA record for the new leaf — + // the operator updates DNS with it to finish the rollover. A + // transient store error must propagate rather than silently drop + // the record the WAIT next-step tells the operator to poll for. + if !r.CompletedAt.IsZero() && r.FailureReason == "" { + cert, cerr := s.loadServerCert(ctx, agentID) + if cerr != nil { + return nil, cerr + } + if cert != nil { + rec := domain.TLSARecordForCert(res.FQDN, cert.Fingerprint) + res.TLSARecord = &rec + } + } + return res, nil } // CancelServerCertRenewal cancels the most recent renewal for the @@ -205,19 +261,31 @@ func (s *RegistrationService) CancelServerCertRenewal(ctx context.Context, agent // the response (HTTP 200 vs 202, status string, tlsaDnsRecord). type VerifyRenewalACMEResult struct { Renewal *domain.ServerCertificateRenewal - // Sync is true when the renewal completed synchronously (BYOC). 
- // CSR renewals are async — issuance happens after verification. + // Sync is true when the renewal reached COMPLETED in this call — + // BYOC after validation, or CSR when the issuer finalized + // synchronously. False means the issuer is still processing + // (ISSUING_CERTIFICATE); the operator re-POSTs verify-acme to + // drive the order to completion. Sync bool + // TLSARecord is the DANE-EE record for the renewal's new leaf + // certificate; set when the renewal completed in this call so the + // operator can update DNS immediately. + TLSARecord *domain.ExpectedDNSRecord } -// VerifyRenewalACME marks the renewal's validation as VERIFIED and -// (for BYOC) completes the renewal immediately by flipping the -// registration's ServerCert over. Mirrors the reference RA's -// `verifyRenewalAcme` handler. +// VerifyRenewalACME verifies that the operator published one of the +// renewal's domain-control challenge artifacts, marks the validation +// VERIFIED, and completes the renewal: BYOC by flipping the +// registration's ServerCert to the already-validated cert, CSR by +// finalizing the certificate order via the issuer port. // -// This build's ACME verification is a noop (same as the existing -// agent-activation verify-acme handler). Production deployments plug -// in a real port.ACMEVerifier at a future extension point. +// The challenge gate is unconditional — the issuer is never invoked +// until the RA has confirmed a published artifact, regardless of +// which issuer adapter is wired. Asynchronous issuers may leave the +// order pending; the renewal then stays in ISSUING_CERTIFICATE +// (derived) and a re-POST of verify-acme re-attempts the finalize — +// the gate is skipped on re-driven calls because the provider already +// accepted the challenge answer. 
func (s *RegistrationService) VerifyRenewalACME(ctx context.Context, agentID string) (*VerifyRenewalACMEResult, error) { now := s.clock() @@ -230,6 +298,52 @@ func (s *RegistrationService) VerifyRenewalACME(ctx context.Context, agentID str "renewal validation window has expired") } + // Re-driven call: validation already passed on an earlier + // verify-acme; only the order finalize remains. + if r.Validation.Status == domain.ValidationVerified { + if r.RenewalType != domain.RenewalTypeCSR { + // BYOC renewals complete in the same call that verifies + // them, so a verified-but-pending BYOC renewal cannot + // exist; FindPendingByAgentID would not have returned it. + return nil, domain.NewValidationError("RENEWAL_NOT_PENDING", + "renewal validation has already been verified") + } + return s.finalizeCSRRenewal(ctx, agentID, r, nil, now) + } + + reg, err := s.agents.FindByAgentID(ctx, agentID) + if err != nil { + return nil, err + } + + // A CSR renewal whose provider order came back already-validated + // (Let's Encrypt authorization reuse — CreateOrder returned no + // challenges) has nothing for the owner to publish, so the gate is + // skipped and the order is finalized directly. This is unambiguous: + // BYOC renewals always carry the RA's two self-issued challenges, + // and legacy renewals synthesize a DNS-01/HTTP-01 pair from their + // token columns, so only a born-ready provider order has none. + // Every other renewal must pass the challenge gate before the + // issuer is invoked: at least one relayed artifact has to be + // published and verifiable. The check is any-of — either the + // DNS-01 TXT record or the HTTP-01 resource satisfies it; both + // are not required. 
+ var verified []domain.ChallengeType + bornReady := len(r.Validation.Challenges) == 0 && r.RenewalType == domain.RenewalTypeCSR + if !bornReady { + var verr error + verified, verr = s.verifyChallengeArtifacts(ctx, reg.AnsName.FQDN(), r.Validation.Challenges) + if len(verified) == 0 { + if verr != nil { + return nil, fmt.Errorf("renewal acme verify: %w", verr) + } + return nil, domain.NewValidationError( + "ACME_CHALLENGE_MISSING", + "no domain-control challenge artifact found — publish the DNS-01 TXT record or the HTTP-01 resource from challenges", + ) + } + } + verifiedValidation, err := r.Validation.MarkVerified(now) if err != nil { return nil, err @@ -243,62 +357,87 @@ func (s *RegistrationService) VerifyRenewalACME(ctx context.Context, agentID str if err := r.MarkCompleted(now); err != nil { return nil, err } - } - - // CSR path: with a server CA wired, we issue synchronously after - // verification rather than leaving the renewal in - // ISSUING_CERTIFICATE forever. Matches the reference - // CertIssuanceService.issueServerCertificate call from - // verifyRenewalAcme — the CA signs, we persist the leaf cert as - // the new live BYOC cert, and the renewal completes. - // - // Async issuance (for slow ACME-style CAs) would keep the - // renewal in ISSUING_CERTIFICATE and finalize via a background - // job; when that lands, it plugs in here without changing the - // caller contract. - if r.RenewalType == domain.RenewalTypeCSR && s.serverCA != nil { - if err := s.completeCSRRenewal(ctx, agentID, r, now); err != nil { + if err := s.renewals.Save(ctx, r); err != nil { return nil, err } + res := &VerifyRenewalACMEResult{Renewal: r, Sync: true} + // The new cert was persisted at submission; surface its TLSA + // record so the operator can update DNS immediately. A transient + // store error must propagate rather than silently drop it. 
+ cert, cerr := s.loadServerCert(ctx, agentID) + if cerr != nil { + return nil, cerr + } + if cert != nil { + rec := domain.TLSARecordForCert(reg.AnsName.FQDN(), cert.Fingerprint) + res.TLSARecord = &rec + } + return res, nil } - if err := s.renewals.Save(ctx, r); err != nil { - return nil, err - } - - // Sync is true whenever the renewal reached COMPLETED in this - // call — either because it was BYOC (validation suffices) or - // because the configured server CA signed the CSR synchronously. - // The handler uses this to choose 200 vs 202 per the reference. - return &VerifyRenewalACMEResult{ - Renewal: r, - Sync: !r.CompletedAt.IsZero(), - }, nil + return s.finalizeCSRRenewal(ctx, agentID, r, verified, now) } -// completeCSRRenewal extracts the synchronous CSR-path renewal flow: -// fetch the pending CSR, sign it via the server CA, validate the -// issued cert, save the new BYOC cert, mark the CSR signed, and -// flip the renewal to COMPLETED. Lives as its own method so the -// caller doesn't trip the cyclomatic-complexity gate. -func (s *RegistrationService) completeCSRRenewal(ctx context.Context, agentID string, r *domain.ServerCertificateRenewal, now time.Time) error { +// finalizeCSRRenewal completes the CSR-path renewal flow: fetch the +// pending CSR, finalize the certificate order via the issuer port, +// validate the issued cert, save the new BYOC cert, mark the CSR +// signed, and flip the renewal to COMPLETED. Lives as its own method +// so the caller doesn't trip the cyclomatic-complexity gate. +// +// Asynchronous issuers may return port.ErrOrderPending: the renewal +// is persisted with its validation VERIFIED but not completed — +// deriveRenewalStatus reports ISSUING_CERTIFICATE — and the operator +// re-POSTs verify-acme to re-drive. Terminal failures +// (port.ErrOrderFailed) mark the renewal FAILED with the provider's +// reason. 
+func (s *RegistrationService) finalizeCSRRenewal( + ctx context.Context, agentID string, + r *domain.ServerCertificateRenewal, verified []domain.ChallengeType, now time.Time, +) (*VerifyRenewalACMEResult, error) { + if s.serverCA == nil { + return nil, domain.NewInternalError("SERVER_CA_DISABLED", + "CSR renewal pending but no certificate issuer configured — inconsistent state", nil) + } csr, err := s.certs.FindCSRByID(ctx, agentID, r.ServerCsrID) if err != nil { - return err + return nil, err } reg, err := s.agents.FindByAgentID(ctx, agentID) if err != nil { - return err + return nil, err } - issued, err := s.serverCA.IssueServerCertificate(ctx, csr.CSRContent, reg.AnsName.FQDN()) - if err != nil { - return domain.NewInternalError("SERVER_CERT_ISSUE_FAILED", + issued, err := s.serverCA.FinalizeOrder(ctx, port.FinalizeOrderRequest{ + OrderRef: r.Validation.OrderRef, + CSRPEM: csr.CSRContent, + FQDN: reg.AnsName.FQDN(), + Verified: verified, + }) + switch { + case errors.Is(err, port.ErrOrderPending): + // Persist the VERIFIED validation so the re-driven call skips + // the gate; the missing CompletedAt keeps the renewal in + // ISSUING_CERTIFICATE (derived). 
+ if serr := s.renewals.Save(ctx, r); serr != nil { + return nil, serr + } + return &VerifyRenewalACMEResult{Renewal: r, Sync: false}, nil + case errors.Is(err, port.ErrOrderFailed): + if merr := r.MarkFailed("certificate provider reported a terminal order failure", now); merr != nil { + return nil, merr + } + if serr := s.renewals.Save(ctx, r); serr != nil { + return nil, serr + } + return nil, domain.NewValidationError("CERT_ORDER_FAILED", + "certificate provider reported a terminal order failure; submit a new renewal") + case err != nil: + return nil, domain.NewInternalError("SERVER_CERT_ISSUE_FAILED", "failed to issue server cert for renewal", err) } v, err := s.validator.ValidateServerCertificate(ctx, issued.CertPEM, issued.ChainPEM, reg.AnsName.FQDN()) if err != nil { - return domain.NewInternalError("SERVER_CERT_SELFVERIFY_FAILED", + return nil, domain.NewInternalError("SERVER_CERT_SELFVERIFY_FAILED", "issued renewal cert failed self-validation", err) } newCert := &domain.ByocServerCertificate{ @@ -311,24 +450,41 @@ func (s *RegistrationService) completeCSRRenewal(ctx context.Context, agentID st ValidToTimestamp: v.ValidTo, Fingerprint: v.Fingerprint, } - if err := s.byoc.Save(ctx, agentID, newCert); err != nil { - return err + signedCSR, err := csr.MarkSigned(now) + if err != nil { + return nil, err } - if signedCSR, serr := csr.MarkSigned(now); serr == nil { - _ = s.certs.SaveCSR(ctx, agentID, &signedCSR) + if err := r.MarkCompleted(now); err != nil { + return nil, err } - return r.MarkCompleted(now) + // Commit the new cert, the SIGNED CSR row, and the completed + // renewal atomically: a crash between them would otherwise leave + // the agent's live cert and its renewal record disagreeing about + // whether the rollover happened. 
+ if err := s.uow.Run(ctx, func(txCtx context.Context) error { + if err := s.byoc.Save(txCtx, agentID, newCert); err != nil { + return err + } + if err := s.certs.SaveCSR(txCtx, agentID, &signedCSR); err != nil { + return err + } + return s.renewals.Save(txCtx, r) + }); err != nil { + return nil, err + } + tlsa := domain.TLSARecordForCert(reg.AnsName.FQDN(), v.Fingerprint) + return &VerifyRenewalACMEResult{Renewal: r, Sync: true, TLSARecord: &tlsa}, nil } // generateChallengeTokens returns a pair of base64url-encoded random -// tokens the operator uses for DNS-01 and HTTP-01 challenges. Each -// token is 32 bytes of crypto/rand which maps to ~43 base64url chars -// — more than enough entropy to prevent guessing. -// -// Tokens are opaque to the verifier; they only need to be -// unpredictable per-renewal. We don't use JWK thumbprint because our -// verifier is stubbed (noop); a future real-ACME integration will -// replace this with full RFC 8555 token semantics. +// tokens for the RA's self-issued challenges — used only on BYOC +// paths, where no certificate provider participates and the RA itself +// plays the validator. CSR-path challenges come from the issuer +// port's CreateOrder instead. Each token is 32 bytes of crypto/rand +// (~43 base64url chars) — opaque to the verifier, it only needs to be +// unpredictable per-flow. No JWK thumbprint binding: self-issued +// challenges have no account key to bind to (Challenge. +// KeyAuthorization stays empty and verifiers expect the raw token). func generateChallengeTokens() (string, string, error) { dns01Bytes := make([]byte, 32) if _, err := rand.Read(dns01Bytes); err != nil { @@ -342,7 +498,3 @@ func generateChallengeTokens() (string, string, error) { base64.RawURLEncoding.EncodeToString(http01Bytes), nil } - -// Unused imports placeholder so time doesn't get auto-removed if -// future edits need it. 
-var _ = time.Now diff --git a/internal/ra/service/renewal_expiry_test.go b/internal/ra/service/renewal_expiry_test.go index 5bf1384..322d0f0 100644 --- a/internal/ra/service/renewal_expiry_test.go +++ b/internal/ra/service/renewal_expiry_test.go @@ -41,12 +41,14 @@ func TestExpireRenewalsOnce_MarksStaleAsFailed(t *testing.T) { ByocCertPEM: "-----BEGIN CERTIFICATE-----\nabcd\n-----END CERTIFICATE-----", CreatedAt: pastExpiry.Add(-24 * time.Hour), Validation: domain.RenewalValidation{ - DNS01ChallengeToken: "dns", - HTTP01ChallengeToken: "http", - Status: domain.ValidationPending, - CreatedAt: pastExpiry.Add(-24 * time.Hour), - ExpiresAt: pastExpiry, - UpdatedAt: pastExpiry.Add(-24 * time.Hour), + Challenges: []domain.Challenge{ + {Type: domain.ChallengeTypeDNS01, Token: "dns"}, + {Type: domain.ChallengeTypeHTTP01, Token: "http"}, + }, + Status: domain.ValidationPending, + CreatedAt: pastExpiry.Add(-24 * time.Hour), + ExpiresAt: pastExpiry, + UpdatedAt: pastExpiry.Add(-24 * time.Hour), }, } if err := renewals.Save(ctx, r); err != nil { @@ -113,12 +115,14 @@ func TestExpireRenewalsOnce_CSRPath_RejectsAttachedCSR(t *testing.T) { ServerCsrID: csr.CSRID, CreatedAt: pastExpiry.Add(-24 * time.Hour), Validation: domain.RenewalValidation{ - DNS01ChallengeToken: "dns", - HTTP01ChallengeToken: "http", - Status: domain.ValidationPending, - CreatedAt: pastExpiry.Add(-24 * time.Hour), - ExpiresAt: pastExpiry, - UpdatedAt: pastExpiry.Add(-24 * time.Hour), + Challenges: []domain.Challenge{ + {Type: domain.ChallengeTypeDNS01, Token: "dns"}, + {Type: domain.ChallengeTypeHTTP01, Token: "http"}, + }, + Status: domain.ValidationPending, + CreatedAt: pastExpiry.Add(-24 * time.Hour), + ExpiresAt: pastExpiry, + UpdatedAt: pastExpiry.Add(-24 * time.Hour), }, } if err := renewals.Save(ctx, r); err != nil { @@ -169,12 +173,14 @@ func TestExpireRenewalsOnce_IgnoresCompletedRenewals(t *testing.T) { CompletedAt: now, CreatedAt: now.Add(-24 * time.Hour), Validation: domain.RenewalValidation{ - 
DNS01ChallengeToken: "dns", - HTTP01ChallengeToken: "http", - Status: domain.ValidationVerified, - CreatedAt: now.Add(-24 * time.Hour), - ExpiresAt: now.Add(-1 * time.Hour), // past, but completed - UpdatedAt: now, + Challenges: []domain.Challenge{ + {Type: domain.ChallengeTypeDNS01, Token: "dns"}, + {Type: domain.ChallengeTypeHTTP01, Token: "http"}, + }, + Status: domain.ValidationVerified, + CreatedAt: now.Add(-24 * time.Hour), + ExpiresAt: now.Add(-1 * time.Hour), // past, but completed + UpdatedAt: now, }, } if err := renewals.Save(ctx, r); err != nil { diff --git a/internal/ra/service/v1event.go b/internal/ra/service/v1event.go index edcb955..3144077 100644 --- a/internal/ra/service/v1event.go +++ b/internal/ra/service/v1event.go @@ -205,13 +205,19 @@ func (s *RegistrationService) buildAgentRegisteredV1Event( }) } - // BYOC server cert — operator-supplied at registration. + // Server cert (BYOC or CSR-signed): folded into the terminal V1 + // attestation. A transient store error must abort — this leaf is + // signed and appended to the append-only log, so swallowing it + // would emit a permanently wrong attestation from a recoverable + // fault. 
var primaryServer *eventv1.CertificateInfo var validServer []eventv1.CertificateInfoExtended - var byocCert *domain.ByocServerCertificate - if byoc, berr := s.byoc.FindLatestValidByAgentID(ctx, reg.AgentID); berr == nil && byoc != nil { - byocCert = byoc - fp := "SHA256:" + byoc.Fingerprint + byocCert, berr := s.loadServerCert(ctx, reg.AgentID) + if berr != nil { + return nil, berr + } + if byocCert != nil { + fp := "SHA256:" + byocCert.Fingerprint primaryServer = &eventv1.CertificateInfo{ Fingerprint: fp, CertType: "X509-DV-SERVER", @@ -219,7 +225,7 @@ func (s *RegistrationService) buildAgentRegisteredV1Event( validServer = []eventv1.CertificateInfoExtended{{ Fingerprint: fp, CertType: "X509-DV-SERVER", - NotAfter: byoc.ValidToTimestamp.UTC().Format(time.RFC3339), + NotAfter: byocCert.ValidToTimestamp.UTC().Format(time.RFC3339), }} } diff --git a/scripts/demo/acme-verify.sh b/scripts/demo/acme-verify.sh new file mode 100755 index 0000000..e28ce45 --- /dev/null +++ b/scripts/demo/acme-verify.sh @@ -0,0 +1,116 @@ +#!/usr/bin/env bash +# Drive the ACME verify-acme loop for a registered V2 agent and fetch +# the provider-issued server certificate once the order completes. +# +# Pairs with `start.sh --with-acme`: after registering with +# scripts/demo/register.sh --v2 --register-only agent.yourdomain.com +# the pending response relays the provider's challenges. Publish ONE +# of them on the domain you control (usually the DNS-01 TXT record), +# then run this script. It: +# +# 1. shows the outstanding challenge artifacts (and dig-checks the +# TXT record locally when `dig` is available), +# 2. POSTs verify-acme — the RA re-checks the artifact, answers the +# provider, and finalizes the order, +# 3. re-POSTs while the provider reports the order still issuing +# (phase CERTIFICATE_ISSUANCE), and +# 4. prints the issued certificate's subject/issuer/validity once +# the agent reaches PENDING_DNS. 
+# +# Usage: +# scripts/demo/acme-verify.sh # agent from data/demo/last-agent-id +# scripts/demo/acme-verify.sh <agentId> # explicit agent +# ACME_VERIFY_ATTEMPTS=40 scripts/demo/acme-verify.sh # longer re-drive loop +# +# Exits 0 once the certificate is issued; non-zero if the challenge +# isn't live yet or the provider failed the order. + +set -euo pipefail +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# shellcheck source=common.sh +source "$SCRIPT_DIR/common.sh" + +AGENT="${1:-}" +if [ -z "$AGENT" ] && [ -f "$DATA/last-agent-id" ]; then + AGENT=$(cat "$DATA/last-agent-id") +fi +[ -n "$AGENT" ] || fail "no agentId given and $DATA/last-agent-id not found — register first" + +ATTEMPTS="${ACME_VERIFY_ATTEMPTS:-24}" +SLEEP_SECONDS=5 +AGENT_BASE="/v2/ans/agents" + +if ! curl -sSf "$RA_URL/v2/admin/ready" >/dev/null 2>&1; then + fail "ans-ra isn't reachable at $RA_URL — run scripts/demo/start.sh --with-acme first" +fi + +# ----- 1. Show outstanding challenges (pre-validation only) ----- +header "GET $AGENT_BASE/$AGENT" +DETAIL=$(curl_json GET "$AGENT_BASE/$AGENT") +STATUS=$(printf '%s' "$DETAIL" | jq -r '.agentStatus // empty') + +if [ "$STATUS" = "PENDING_VALIDATION" ]; then + CHALLENGES=$(printf '%s' "$DETAIL" | jq -c '.registrationPending.challenges // []') + TXT_NAME=$(printf '%s' "$CHALLENGES" | jq -r '.[] | select(.type=="DNS_01") | .dnsRecord.name // empty') + TXT_VALUE=$(printf '%s' "$CHALLENGES" | jq -r '.[] | select(.type=="DNS_01") | .dnsRecord.value // empty') + if [ -n "$TXT_NAME" ]; then + header "Challenge to publish (one of)" + printf " TXT %s = %s\n" "$TXT_NAME" "$TXT_VALUE" >&2 + HTTP_PATH=$(printf '%s' "$CHALLENGES" | jq -r '.[] | select(.type=="HTTP_01") | .httpPath // empty') + KEYAUTH=$(printf '%s' "$CHALLENGES" | jq -r '.[] | select(.type=="HTTP_01") | .keyAuthorization // .token // empty') + if [ -n "$HTTP_PATH" ]; then + printf " HTTP http://%s → %s\n" "$HTTP_PATH" "$KEYAUTH" >&2 + fi + # Local dig pre-check is advisory: your resolver may 
lag the + # provider's, and the RA does its own authoritative check anyway. + if command -v dig >/dev/null 2>&1; then + SEEN=$(dig +short TXT "$TXT_NAME" 2>/dev/null | tr -d '"' || true) + if printf '%s' "$SEEN" | grep -qF "$TXT_VALUE"; then + ok "TXT record is visible to the local resolver" + else + note "TXT record not visible to the local resolver yet (propagation can take a minute)" + fi + fi + fi +fi + +# ----- 2/3. verify-acme, re-driving while the order is issuing ----- +i=1 +while :; do + header "POST $AGENT_BASE/$AGENT/verify-acme (attempt $i)" + RESP=$(curl_json POST "$AGENT_BASE/$AGENT/verify-acme") + # Error responses are RFC 7807 problem documents carrying `code` — + # the success AgentStatus body never has one. (curl_json runs in a + # command substitution, so its LAST_HTTP_STATUS isn't visible here.) + ERR_CODE=$(printf '%s' "$RESP" | jq -r '.code // empty') + if [ -n "$ERR_CODE" ]; then + ERR_DETAIL=$(printf '%s' "$RESP" | jq -r '.detail // empty') + fail "verify-acme failed ($ERR_CODE): $ERR_DETAIL" + fi + PHASE=$(printf '%s' "$RESP" | jq -r '.phase // empty') + STATUS=$(printf '%s' "$RESP" | jq -r '.status // empty') + if [ "$PHASE" != "CERTIFICATE_ISSUANCE" ]; then + break + fi + if [ "$i" -ge "$ATTEMPTS" ]; then + fail "order still issuing after $ATTEMPTS attempts — re-run this script to keep driving it" + fi + note "provider is still validating/issuing — retrying in ${SLEEP_SECONDS}s" + sleep "$SLEEP_SECONDS" + i=$((i + 1)) +done + +if [ "$STATUS" != "PENDING_DNS" ] && [ "$STATUS" != "ACTIVE" ]; then + fail "unexpected post-verify state: status=$STATUS phase=$PHASE" +fi +ok "domain validated, order complete (status=$STATUS)" + +# ----- 4. 
Fetch the provider-issued server certificate ----- +header "GET $AGENT_BASE/$AGENT/certificates/server" +CERTS=$(curl_json GET "$AGENT_BASE/$AGENT/certificates/server") +LEAF=$(printf '%s' "$CERTS" | jq -r '.[0].certificatePEM // empty') +[ -n "$LEAF" ] || fail "no server certificate returned" + +header "Issued certificate" +printf '%s' "$LEAF" | openssl x509 -noout -subject -issuer -dates 2>/dev/null | sed 's/^/ /' >&2 +ok "server certificate issued — publish the production DNS records and POST verify-dns to reach ACTIVE" diff --git a/scripts/demo/register.sh b/scripts/demo/register.sh index 663759d..52408af 100755 --- a/scripts/demo/register.sh +++ b/scripts/demo/register.sh @@ -87,9 +87,10 @@ IDENTITY_CSR_PEM=$(cat "$CSR_DIR/identity.csr") # ----- Generate server CSR (DNS SAN = agent FQDN) ----- # -# Matches the reference's default registration shape: the RA signs -# the server TLS cert through its configured ServerCertificateAuthority -# port. The demo config wires a ServerSelfCA that handles this. +# The RA opens a certificate order via its configured +# ServerCertificateIssuer port and finalizes it at verify-acme. The +# demo wires the in-process self-signed CA by default, or Let's +# Encrypt with start.sh --with-acme. cat >"$CSR_DIR/server.cnf" </dev/ if [ "$PATH_MODE" = "csr" ]; then # CSR path: produce a PEM CSR with DNS SAN matching the agent FQDN. - # The RA's configured ServerCertificateAuthority signs it. + # The RA's configured ServerCertificateIssuer finalizes the order + # and returns the cert at renewal verify-acme. openssl req -new -key "$CERT_DIR/key.pem" \ -config "$CERT_DIR/openssl.cnf" \ -out "$CERT_DIR/csr.pem" 2>/dev/null diff --git a/scripts/demo/start.sh b/scripts/demo/start.sh index 415a09e..3187b2c 100755 --- a/scripts/demo/start.sh +++ b/scripts/demo/start.sh @@ -9,8 +9,26 @@ # inlined and starts the TL against it. 
# # Usage: -# scripts/demo/start.sh # wipe data/demo, fresh start -# scripts/demo/start.sh --keep # reuse existing data/demo +# scripts/demo/start.sh # wipe data/demo, fresh start +# scripts/demo/start.sh --keep # reuse existing data/demo +# scripts/demo/start.sh --with-dns # bundled ans-dns + lookup verifier +# scripts/demo/start.sh --with-acme # Let's Encrypt STAGING issues server certs +# +# --with-acme swaps the server certificate issuer from the demo's +# self-signed CA to a real RFC 8555 provider (Let's Encrypt staging +# by default). Registrations then relay the provider's real +# challenges, and the agent's server cert is provider-issued. This +# needs a public domain you control: register with your FQDN +# (register.sh --v2 --register-only agent.yourdomain.com), publish +# the relayed _acme-challenge TXT record, then drive verify-acme — +# scripts/demo/acme-verify.sh walks that loop. The DNS verifier +# defaults to "lookup" via the OS resolver in this mode so the RA +# fail-fasts locally before answering the provider (a wrongly +# answered challenge invalidates the provider order). Knobs: +# ANS_ACME_DIRECTORY_URL default Let's Encrypt staging; set the +# production directory only when you mean +# it — its rate limits are unforgiving +# ANS_ACME_EMAIL optional account contact for expiry mail # # Prerequisites: go, curl, jq, openssl (openssl only needed by # run-lifecycle.sh, but checked here for early-failure UX). @@ -23,6 +41,7 @@ source "$SCRIPT_DIR/common.sh" # ----- args ----- KEEP_DATA=0 WITH_DNS=0 +WITH_ACME=0 while [ $# -gt 0 ]; do case "$1" in --keep) KEEP_DATA=1; shift ;; @@ -36,14 +55,38 @@ while [ $# -gt 0 ]; do export ANS_DNS_SERVER="127.0.0.1:15353" shift ;; + --with-acme) + # Server certs come from a real RFC 8555 provider (Let's + # Encrypt staging unless ANS_ACME_DIRECTORY_URL overrides). 
+ WITH_ACME=1 + shift + ;; -h|--help) - sed -n '2,17p' "$0" + sed -n '2,35p' "$0" exit 0 ;; *) fail "unknown arg: $1" ;; esac done +if [ "$WITH_ACME" -eq 1 ] && [ "$WITH_DNS" -eq 1 ]; then + # --with-dns points the RA's lookup verifier at the local ans-dns + # server, which knows nothing about the public domain a real ACME + # provider validates — the two modes contradict each other. + fail "--with-acme and --with-dns are mutually exclusive (ACME needs real public DNS)" +fi + +ACME_DIRECTORY_URL="${ANS_ACME_DIRECTORY_URL:-https://acme-staging-v02.api.letsencrypt.org/directory}" +ACME_EMAIL="${ANS_ACME_EMAIL:-}" +if [ "$WITH_ACME" -eq 1 ]; then + # The RA's challenge gate must check the real records the owner + # publishes — a noop gate would answer the provider's challenge + # before the artifact exists and invalidate the order. The OS + # resolver (empty ANS_DNS_SERVER) is the right default; honor an + # explicit ANS_DNS_TYPE if the caller insists. + export ANS_DNS_TYPE="${ANS_DNS_TYPE:-lookup}" +fi + # ----- preflight ----- header "Preflight" for cmd in go curl jq openssl; do @@ -73,7 +116,19 @@ if [ "$KEEP_DATA" -eq 0 ]; then ok "cleared $DATA (kept log files)" else note "keeping existing data under $DATA" + # Guard against silently re-pointing in-flight certificate orders at + # a different issuer: a self-CA order ref or an ACME account/order + # under $DATA/ra means nothing to the other issuer, so re-using the + # data dir across an issuer-type switch leaves un-finalizable orders. 
+ PRIOR_ISSUER="" + [ -f "$DATA/issuer-mode" ] && PRIOR_ISSUER="$(cat "$DATA/issuer-mode")" + CURRENT_ISSUER=$([ "$WITH_ACME" -eq 1 ] && echo acme || echo self) + if [ -n "$PRIOR_ISSUER" ] && [ "$PRIOR_ISSUER" != "$CURRENT_ISSUER" ]; then + fail "this data dir was last run with the '$PRIOR_ISSUER' server issuer but you requested '$CURRENT_ISSUER'; re-run without --keep to start fresh (in-flight orders can't move between issuers)" + fi fi +# Record the issuer mode for the --keep guard above on the next run. +echo "$([ "$WITH_ACME" -eq 1 ] && echo acme || echo self)" >"$DATA/issuer-mode" # Refuse to start if the ports already have something on them. for url in "$RA_URL/v2/admin/health" "$TL_URL/v2/admin/health"; do @@ -84,6 +139,32 @@ done # ----- RA config ----- header "Compose RA config" + +# Server certificate issuer block: the in-process self-signed CA by +# default, or a real RFC 8555 provider with --with-acme. Either way +# the serverCsrPEM registration/renewal path is enabled; the issuer +# behind it is what changes. +if [ "$WITH_ACME" -eq 1 ]; then + SERVER_ISSUER_BLOCK=$(cat <"$DATA/demo-ra.yaml" <&2 printf " %s ans-tl %s (pid %s, log %s)\n" "${C_GREEN}✔${C_RESET}" "$TL_URL" "$TL_PID" "$DATA/tl.log" >&2 printf "\n" >&2 -printf " next: %s\n" "scripts/demo/run-lifecycle.sh" >&2 +if [ "$WITH_ACME" -eq 1 ]; then + printf " ACME issuer: %s\n" "$ACME_DIRECTORY_URL" >&2 + printf " next: %s\n" "scripts/demo/register.sh --v2 --register-only agent.yourdomain.com" >&2 + printf " %s\n" "publish the relayed _acme-challenge TXT, then scripts/demo/acme-verify.sh" >&2 +else + printf " next: %s\n" "scripts/demo/run-lifecycle.sh" >&2 +fi printf " stop: %s\n" "scripts/demo/stop.sh (or --clean to wipe data)" >&2 diff --git a/spec/api-spec-v2.yaml b/spec/api-spec-v2.yaml index 9ad2749..a145424 100644 --- a/spec/api-spec-v2.yaml +++ b/spec/api-spec-v2.yaml @@ -1212,7 +1212,7 @@ components: HATEOAS link hrefs. Recommended for backport to v1. 
status: type: string - enum: [PENDING_VALIDATION, PENDING_DNS] + enum: [PENDING_VALIDATION, PENDING_CERTS, PENDING_DNS] ansName: type: string example: ans://v1.0.0.external-domain.com @@ -1246,7 +1246,7 @@ components: properties: action: type: string - enum: [CONFIGURE_DNS, CONFIGURE_HTTP, VERIFY_DNS, VALIDATE_DOMAIN, WAIT] + enum: [CONFIGURE_DNS, CONFIGURE_HTTP, VERIFY_DNS, VALIDATE_DOMAIN, WAIT, CANCEL] description: type: string endpoint: