From cf1cbdd837800d368b03405671b3798df2f02785 Mon Sep 17 00:00:00 2001 From: arreyder Date: Thu, 14 May 2026 17:29:58 -0500 Subject: [PATCH 1/2] connectorerrors: ProvisionFailureDetail for terminal Grant/Revoke MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Connectors today return plain Go errors for upstream-permanent failures (target user SUSPENDED, group governed by a rule, license cap reached). Those resolve to gRPC code=Unknown by the time a remote caller sees them, and Unknown classifies as retryable — so callers loop on a failure that will never succeed without external state change. This change adds a baton-sdk primitive for that signal: - new proto ProvisionFailureDetail (reason + admin-facing detail) - pkg/connectorerrors.NewTerminalError builds a gRPC status with the right code (PermissionDenied / FailedPrecondition) and attaches the detail via status.WithDetails - pkg/connectorerrors.FailureDetail extracts the detail across fmt.Errorf %w and errors.Join wrap chains, via status.FromError The reason→code table picks codes the existing isClientErrorCode logic already treats as non-retryable, so callers that only look at status.Code classify these errors correctly without any changes; callers that want structured logging or admin-facing detail can read the proto. Refs OPS-1551. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../v2/annotation_provision_failure.pb.go | 247 ++++++++++++++++++ ...nnotation_provision_failure.pb.validate.go | 142 ++++++++++ ...tation_provision_failure_protoopaque.pb.go | 244 +++++++++++++++++ pkg/connectorbuilder/resource_provisioner.go | 22 ++ pkg/connectorerrors/terminal.go | 134 ++++++++++ pkg/connectorerrors/terminal_test.go | 105 ++++++++ .../v2/annotation_provision_failure.proto | 66 +++++ 7 files changed, 960 insertions(+) create mode 100644 pb/c1/connector/v2/annotation_provision_failure.pb.go create mode 100644 pb/c1/connector/v2/annotation_provision_failure.pb.validate.go create mode 100644 pb/c1/connector/v2/annotation_provision_failure_protoopaque.pb.go create mode 100644 pkg/connectorerrors/terminal.go create mode 100644 pkg/connectorerrors/terminal_test.go create mode 100644 proto/c1/connector/v2/annotation_provision_failure.proto diff --git a/pb/c1/connector/v2/annotation_provision_failure.pb.go b/pb/c1/connector/v2/annotation_provision_failure.pb.go new file mode 100644 index 000000000..9de46793c --- /dev/null +++ b/pb/c1/connector/v2/annotation_provision_failure.pb.go @@ -0,0 +1,247 @@ +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.36.10 +// protoc (unknown) +// source: c1/connector/v2/annotation_provision_failure.proto + +//go:build !protoopaque + +package v2 + +import ( + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + reflect "reflect" + unsafe "unsafe" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +type ProvisionFailureDetail_Reason int32 + +const ( + ProvisionFailureDetail_REASON_UNSPECIFIED ProvisionFailureDetail_Reason = 0 + // The target user is in a state that the upstream system does not + // permit modifications against. Examples: Okta SUSPENDED / DEPROVISIONED + // / pending password reset (E0000038); Workday INACTIVE; locked AD + // account. Resolves when the user's status changes upstream. + ProvisionFailureDetail_REASON_USER_NOT_PROVISIONABLE ProvisionFailureDetail_Reason = 1 + // The target entitlement (group / role / queue / etc.) is governed by + // an external rule that the connector cannot override. Examples: Okta + // group rule (E0000001 groupMembers); AD group policy; Workday + // delivered security group. Resolves when the rule is changed or the + // subject is moved out of the rule's source set. + ProvisionFailureDetail_REASON_TARGET_MANAGED_EXTERNALLY ProvisionFailureDetail_Reason = 2 + // The upstream system has no remaining capacity for this operation. + // Examples: Jira "exceeds number of allowed users licensed"; + // Salesforce license type cap; Okta seat limit. Resolves when seats + // free up or the tenant purchases more. + ProvisionFailureDetail_REASON_LICENSE_EXHAUSTED ProvisionFailureDetail_Reason = 3 + // Upstream API validation rejected the operation for a reason that + // won't change on retry: malformed reference, conflicting policy, + // schema-level invariant. Use this for non-categorized terminal + // validation failures. + ProvisionFailureDetail_REASON_VALIDATION_FAILURE ProvisionFailureDetail_Reason = 4 + // The connector or upstream system does not permit the requested + // operation regardless of state. Examples: cannot revoke a + // self-managed group; cannot grant via this endpoint; required + // capability disabled by tenant admin. + ProvisionFailureDetail_REASON_OPERATION_NOT_PERMITTED ProvisionFailureDetail_Reason = 5 +) + +// Enum value maps for ProvisionFailureDetail_Reason. +var ( + ProvisionFailureDetail_Reason_name = map[int32]string{ + 0: "REASON_UNSPECIFIED", + 1: "REASON_USER_NOT_PROVISIONABLE", + 2: "REASON_TARGET_MANAGED_EXTERNALLY", + 3: "REASON_LICENSE_EXHAUSTED", + 4: "REASON_VALIDATION_FAILURE", + 5: "REASON_OPERATION_NOT_PERMITTED", + } + ProvisionFailureDetail_Reason_value = map[string]int32{ + "REASON_UNSPECIFIED": 0, + "REASON_USER_NOT_PROVISIONABLE": 1, + "REASON_TARGET_MANAGED_EXTERNALLY": 2, + "REASON_LICENSE_EXHAUSTED": 3, + "REASON_VALIDATION_FAILURE": 4, + "REASON_OPERATION_NOT_PERMITTED": 5, + } +) + +func (x ProvisionFailureDetail_Reason) Enum() *ProvisionFailureDetail_Reason { + p := new(ProvisionFailureDetail_Reason) + *p = x + return p +} + +func (x ProvisionFailureDetail_Reason) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (ProvisionFailureDetail_Reason) Descriptor() protoreflect.EnumDescriptor { + return file_c1_connector_v2_annotation_provision_failure_proto_enumTypes[0].Descriptor() +} + +func (ProvisionFailureDetail_Reason) Type() protoreflect.EnumType { + return &file_c1_connector_v2_annotation_provision_failure_proto_enumTypes[0] +} + +func (x ProvisionFailureDetail_Reason) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// ProvisionFailureDetail describes WHY a Grant or Revoke failure is permanent. +// Connectors attach this to gRPC status details (via status.WithDetails) on +// errors returned from Grant/Revoke when the underlying upstream API has +// indicated the failure will not succeed on retry without external state +// change. +// +// The canonical signal callers act on is still the gRPC status code: when +// the upstream is permanently rejecting the operation, the connector should +// pick the matching code (FailedPrecondition for "state-blocked", +// PermissionDenied for "not permitted", etc.). This message *supplements* +// that with a structured reason and human-readable detail so callers can +// log, tag metrics, and route admin-facing remediation text consistently +// across connectors. +// +// For transient failures (5xx, network blip, rate limit), return errors as +// normal — those are retryable. Rate-limit-specific signaling has its own +// annotation (RateLimitDescription). +type ProvisionFailureDetail struct { + state protoimpl.MessageState `protogen:"hybrid.v1"` + Reason ProvisionFailureDetail_Reason `protobuf:"varint,1,opt,name=reason,proto3,enum=c1.connector.v2.ProvisionFailureDetail_Reason" json:"reason,omitempty"` + // Human-readable detail intended for logs and admin-visible operational + // context — not user-facing. Include the upstream error code or message + // when available (e.g. "Okta E0000038: user 'jdoe' is SUSPENDED"). + Detail string `protobuf:"bytes,2,opt,name=detail,proto3" json:"detail,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *ProvisionFailureDetail) Reset() { + *x = ProvisionFailureDetail{} + mi := &file_c1_connector_v2_annotation_provision_failure_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *ProvisionFailureDetail) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ProvisionFailureDetail) ProtoMessage() {} + +func (x *ProvisionFailureDetail) ProtoReflect() protoreflect.Message { + mi := &file_c1_connector_v2_annotation_provision_failure_proto_msgTypes[0] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +func (x *ProvisionFailureDetail) GetReason() ProvisionFailureDetail_Reason { + if x != nil { + return x.Reason + } + return ProvisionFailureDetail_REASON_UNSPECIFIED +} + +func (x *ProvisionFailureDetail) GetDetail() string { + if x != nil { + return x.Detail + } + return "" +} + +func (x *ProvisionFailureDetail) SetReason(v ProvisionFailureDetail_Reason) { + x.Reason = v +} + +func (x *ProvisionFailureDetail) SetDetail(v string) { + x.Detail = v +} + +type ProvisionFailureDetail_builder struct { + _ [0]func() // Prevents comparability and use of unkeyed literals for the builder. + + Reason ProvisionFailureDetail_Reason + // Human-readable detail intended for logs and admin-visible operational + // context — not user-facing. Include the upstream error code or message + // when available (e.g. "Okta E0000038: user 'jdoe' is SUSPENDED"). + Detail string +} + +func (b0 ProvisionFailureDetail_builder) Build() *ProvisionFailureDetail { + m0 := &ProvisionFailureDetail{} + b, x := &b0, m0 + _, _ = b, x + x.Reason = b.Reason + x.Detail = b.Detail + return m0 +} + +var File_c1_connector_v2_annotation_provision_failure_proto protoreflect.FileDescriptor + +const file_c1_connector_v2_annotation_provision_failure_proto_rawDesc = "" + + "\n" + + "2c1/connector/v2/annotation_provision_failure.proto\x12\x0fc1.connector.v2\"\xc5\x02\n" + + "\x16ProvisionFailureDetail\x12F\n" + + "\x06reason\x18\x01 \x01(\x0e2..c1.connector.v2.ProvisionFailureDetail.ReasonR\x06reason\x12\x16\n" + + "\x06detail\x18\x02 \x01(\tR\x06detail\"\xca\x01\n" + + "\x06Reason\x12\x16\n" + + "\x12REASON_UNSPECIFIED\x10\x00\x12!\n" + + "\x1dREASON_USER_NOT_PROVISIONABLE\x10\x01\x12$\n" + + " REASON_TARGET_MANAGED_EXTERNALLY\x10\x02\x12\x1c\n" + + "\x18REASON_LICENSE_EXHAUSTED\x10\x03\x12\x1d\n" + + "\x19REASON_VALIDATION_FAILURE\x10\x04\x12\"\n" + + "\x1eREASON_OPERATION_NOT_PERMITTED\x10\x05B6Z4github.com/conductorone/baton-sdk/pb/c1/connector/v2b\x06proto3" + +var file_c1_connector_v2_annotation_provision_failure_proto_enumTypes = make([]protoimpl.EnumInfo, 1) +var file_c1_connector_v2_annotation_provision_failure_proto_msgTypes = make([]protoimpl.MessageInfo, 1) +var file_c1_connector_v2_annotation_provision_failure_proto_goTypes = []any{ + (ProvisionFailureDetail_Reason)(0), // 0: c1.connector.v2.ProvisionFailureDetail.Reason + (*ProvisionFailureDetail)(nil), // 1: c1.connector.v2.ProvisionFailureDetail +} +var file_c1_connector_v2_annotation_provision_failure_proto_depIdxs = []int32{ + 0, // 0: c1.connector.v2.ProvisionFailureDetail.reason:type_name -> c1.connector.v2.ProvisionFailureDetail.Reason + 1, // [1:1] is the sub-list for method output_type + 1, // [1:1] is the sub-list for method input_type + 1, // [1:1] is the sub-list for extension type_name + 1, // [1:1] is the sub-list for extension extendee + 0, // [0:1] is the sub-list for field type_name +} + +func init() { file_c1_connector_v2_annotation_provision_failure_proto_init() } +func file_c1_connector_v2_annotation_provision_failure_proto_init() { + if File_c1_connector_v2_annotation_provision_failure_proto != nil { + return + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: unsafe.Slice(unsafe.StringData(file_c1_connector_v2_annotation_provision_failure_proto_rawDesc), len(file_c1_connector_v2_annotation_provision_failure_proto_rawDesc)), + NumEnums: 1, + NumMessages: 1, + NumExtensions: 0, + NumServices: 0, + }, + GoTypes: file_c1_connector_v2_annotation_provision_failure_proto_goTypes, + DependencyIndexes: file_c1_connector_v2_annotation_provision_failure_proto_depIdxs, + EnumInfos: file_c1_connector_v2_annotation_provision_failure_proto_enumTypes, + MessageInfos: file_c1_connector_v2_annotation_provision_failure_proto_msgTypes, + }.Build() + File_c1_connector_v2_annotation_provision_failure_proto = out.File + file_c1_connector_v2_annotation_provision_failure_proto_goTypes = nil + file_c1_connector_v2_annotation_provision_failure_proto_depIdxs = nil +} diff --git a/pb/c1/connector/v2/annotation_provision_failure.pb.validate.go b/pb/c1/connector/v2/annotation_provision_failure.pb.validate.go new file mode 100644 index 000000000..2c5564e33 --- /dev/null +++ b/pb/c1/connector/v2/annotation_provision_failure.pb.validate.go @@ -0,0 +1,142 @@ +// Code generated by protoc-gen-validate. DO NOT EDIT. +// source: c1/connector/v2/annotation_provision_failure.proto + +package v2 + +import ( + "bytes" + "errors" + "fmt" + "net" + "net/mail" + "net/url" + "regexp" + "sort" + "strings" + "time" + "unicode/utf8" + + "google.golang.org/protobuf/types/known/anypb" +) + +// ensure the imports are used +var ( + _ = bytes.MinRead + _ = errors.New("") + _ = fmt.Print + _ = utf8.UTFMax + _ = (*regexp.Regexp)(nil) + _ = (*strings.Reader)(nil) + _ = net.IPv4len + _ = time.Duration(0) + _ = (*url.URL)(nil) + _ = (*mail.Address)(nil) + _ = anypb.Any{} + _ = sort.Sort +) + +// Validate checks the field values on ProvisionFailureDetail with the rules +// defined in the proto definition for this message. If any rules are +// violated, the first error encountered is returned, or nil if there are no violations. +func (m *ProvisionFailureDetail) Validate() error { + return m.validate(false) +} + +// ValidateAll checks the field values on ProvisionFailureDetail with the rules +// defined in the proto definition for this message. If any rules are +// violated, the result is a list of violation errors wrapped in +// ProvisionFailureDetailMultiError, or nil if none found. +func (m *ProvisionFailureDetail) ValidateAll() error { + return m.validate(true) +} + +func (m *ProvisionFailureDetail) validate(all bool) error { + if m == nil { + return nil + } + + var errors []error + + // no validation rules for Reason + + // no validation rules for Detail + + if len(errors) > 0 { + return ProvisionFailureDetailMultiError(errors) + } + + return nil +} + +// ProvisionFailureDetailMultiError is an error wrapping multiple validation +// errors returned by ProvisionFailureDetail.ValidateAll() if the designated +// constraints aren't met. +type ProvisionFailureDetailMultiError []error + +// Error returns a concatenation of all the error messages it wraps. +func (m ProvisionFailureDetailMultiError) Error() string { + msgs := make([]string, 0, len(m)) + for _, err := range m { + msgs = append(msgs, err.Error()) + } + return strings.Join(msgs, "; ") +} + +// AllErrors returns a list of validation violation errors. +func (m ProvisionFailureDetailMultiError) AllErrors() []error { return m } + +// ProvisionFailureDetailValidationError is the validation error returned by +// ProvisionFailureDetail.Validate if the designated constraints aren't met. +type ProvisionFailureDetailValidationError struct { + field string + reason string + cause error + key bool +} + +// Field function returns field value. +func (e ProvisionFailureDetailValidationError) Field() string { return e.field } + +// Reason function returns reason value. +func (e ProvisionFailureDetailValidationError) Reason() string { return e.reason } + +// Cause function returns cause value. +func (e ProvisionFailureDetailValidationError) Cause() error { return e.cause } + +// Key function returns key value. +func (e ProvisionFailureDetailValidationError) Key() bool { return e.key } + +// ErrorName returns error name. +func (e ProvisionFailureDetailValidationError) ErrorName() string { + return "ProvisionFailureDetailValidationError" +} + +// Error satisfies the builtin error interface +func (e ProvisionFailureDetailValidationError) Error() string { + cause := "" + if e.cause != nil { + cause = fmt.Sprintf(" | caused by: %v", e.cause) + } + + key := "" + if e.key { + key = "key for " + } + + return fmt.Sprintf( + "invalid %sProvisionFailureDetail.%s: %s%s", + key, + e.field, + e.reason, + cause) +} + +var _ error = ProvisionFailureDetailValidationError{} + +var _ interface { + Field() string + Reason() string + Key() bool + Cause() error + ErrorName() string +} = ProvisionFailureDetailValidationError{} diff --git a/pb/c1/connector/v2/annotation_provision_failure_protoopaque.pb.go b/pb/c1/connector/v2/annotation_provision_failure_protoopaque.pb.go new file mode 100644 index 000000000..a945104d2 --- /dev/null +++ b/pb/c1/connector/v2/annotation_provision_failure_protoopaque.pb.go @@ -0,0 +1,244 @@ +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.36.10 +// protoc (unknown) +// source: c1/connector/v2/annotation_provision_failure.proto + +//go:build protoopaque + +package v2 + +import ( + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + reflect "reflect" + unsafe "unsafe" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +type ProvisionFailureDetail_Reason int32 + +const ( + ProvisionFailureDetail_REASON_UNSPECIFIED ProvisionFailureDetail_Reason = 0 + // The target user is in a state that the upstream system does not + // permit modifications against. Examples: Okta SUSPENDED / DEPROVISIONED + // / pending password reset (E0000038); Workday INACTIVE; locked AD + // account. Resolves when the user's status changes upstream. + ProvisionFailureDetail_REASON_USER_NOT_PROVISIONABLE ProvisionFailureDetail_Reason = 1 + // The target entitlement (group / role / queue / etc.) is governed by + // an external rule that the connector cannot override. Examples: Okta + // group rule (E0000001 groupMembers); AD group policy; Workday + // delivered security group. Resolves when the rule is changed or the + // subject is moved out of the rule's source set. + ProvisionFailureDetail_REASON_TARGET_MANAGED_EXTERNALLY ProvisionFailureDetail_Reason = 2 + // The upstream system has no remaining capacity for this operation. + // Examples: Jira "exceeds number of allowed users licensed"; + // Salesforce license type cap; Okta seat limit. Resolves when seats + // free up or the tenant purchases more. + ProvisionFailureDetail_REASON_LICENSE_EXHAUSTED ProvisionFailureDetail_Reason = 3 + // Upstream API validation rejected the operation for a reason that + // won't change on retry: malformed reference, conflicting policy, + // schema-level invariant. Use this for non-categorized terminal + // validation failures. + ProvisionFailureDetail_REASON_VALIDATION_FAILURE ProvisionFailureDetail_Reason = 4 + // The connector or upstream system does not permit the requested + // operation regardless of state. Examples: cannot revoke a + // self-managed group; cannot grant via this endpoint; required + // capability disabled by tenant admin. + ProvisionFailureDetail_REASON_OPERATION_NOT_PERMITTED ProvisionFailureDetail_Reason = 5 +) + +// Enum value maps for ProvisionFailureDetail_Reason. +var ( + ProvisionFailureDetail_Reason_name = map[int32]string{ + 0: "REASON_UNSPECIFIED", + 1: "REASON_USER_NOT_PROVISIONABLE", + 2: "REASON_TARGET_MANAGED_EXTERNALLY", + 3: "REASON_LICENSE_EXHAUSTED", + 4: "REASON_VALIDATION_FAILURE", + 5: "REASON_OPERATION_NOT_PERMITTED", + } + ProvisionFailureDetail_Reason_value = map[string]int32{ + "REASON_UNSPECIFIED": 0, + "REASON_USER_NOT_PROVISIONABLE": 1, + "REASON_TARGET_MANAGED_EXTERNALLY": 2, + "REASON_LICENSE_EXHAUSTED": 3, + "REASON_VALIDATION_FAILURE": 4, + "REASON_OPERATION_NOT_PERMITTED": 5, + } +) + +func (x ProvisionFailureDetail_Reason) Enum() *ProvisionFailureDetail_Reason { + p := new(ProvisionFailureDetail_Reason) + *p = x + return p +} + +func (x ProvisionFailureDetail_Reason) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (ProvisionFailureDetail_Reason) Descriptor() protoreflect.EnumDescriptor { + return file_c1_connector_v2_annotation_provision_failure_proto_enumTypes[0].Descriptor() +} + +func (ProvisionFailureDetail_Reason) Type() protoreflect.EnumType { + return &file_c1_connector_v2_annotation_provision_failure_proto_enumTypes[0] +} + +func (x ProvisionFailureDetail_Reason) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// ProvisionFailureDetail describes WHY a Grant or Revoke failure is permanent. +// Connectors attach this to gRPC status details (via status.WithDetails) on +// errors returned from Grant/Revoke when the underlying upstream API has +// indicated the failure will not succeed on retry without external state +// change. +// +// The canonical signal callers act on is still the gRPC status code: when +// the upstream is permanently rejecting the operation, the connector should +// pick the matching code (FailedPrecondition for "state-blocked", +// PermissionDenied for "not permitted", etc.). This message *supplements* +// that with a structured reason and human-readable detail so callers can +// log, tag metrics, and route admin-facing remediation text consistently +// across connectors. +// +// For transient failures (5xx, network blip, rate limit), return errors as +// normal — those are retryable. Rate-limit-specific signaling has its own +// annotation (RateLimitDescription). +type ProvisionFailureDetail struct { + state protoimpl.MessageState `protogen:"opaque.v1"` + xxx_hidden_Reason ProvisionFailureDetail_Reason `protobuf:"varint,1,opt,name=reason,proto3,enum=c1.connector.v2.ProvisionFailureDetail_Reason"` + xxx_hidden_Detail string `protobuf:"bytes,2,opt,name=detail,proto3"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *ProvisionFailureDetail) Reset() { + *x = ProvisionFailureDetail{} + mi := &file_c1_connector_v2_annotation_provision_failure_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *ProvisionFailureDetail) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ProvisionFailureDetail) ProtoMessage() {} + +func (x *ProvisionFailureDetail) ProtoReflect() protoreflect.Message { + mi := &file_c1_connector_v2_annotation_provision_failure_proto_msgTypes[0] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +func (x *ProvisionFailureDetail) GetReason() ProvisionFailureDetail_Reason { + if x != nil { + return x.xxx_hidden_Reason + } + return ProvisionFailureDetail_REASON_UNSPECIFIED +} + +func (x *ProvisionFailureDetail) GetDetail() string { + if x != nil { + return x.xxx_hidden_Detail + } + return "" +} + +func (x *ProvisionFailureDetail) SetReason(v ProvisionFailureDetail_Reason) { + x.xxx_hidden_Reason = v +} + +func (x *ProvisionFailureDetail) SetDetail(v string) { + x.xxx_hidden_Detail = v +} + +type ProvisionFailureDetail_builder struct { + _ [0]func() // Prevents comparability and use of unkeyed literals for the builder. + + Reason ProvisionFailureDetail_Reason + // Human-readable detail intended for logs and admin-visible operational + // context — not user-facing. Include the upstream error code or message + // when available (e.g. "Okta E0000038: user 'jdoe' is SUSPENDED"). + Detail string +} + +func (b0 ProvisionFailureDetail_builder) Build() *ProvisionFailureDetail { + m0 := &ProvisionFailureDetail{} + b, x := &b0, m0 + _, _ = b, x + x.xxx_hidden_Reason = b.Reason + x.xxx_hidden_Detail = b.Detail + return m0 +} + +var File_c1_connector_v2_annotation_provision_failure_proto protoreflect.FileDescriptor + +const file_c1_connector_v2_annotation_provision_failure_proto_rawDesc = "" + + "\n" + + "2c1/connector/v2/annotation_provision_failure.proto\x12\x0fc1.connector.v2\"\xc5\x02\n" + + "\x16ProvisionFailureDetail\x12F\n" + + "\x06reason\x18\x01 \x01(\x0e2..c1.connector.v2.ProvisionFailureDetail.ReasonR\x06reason\x12\x16\n" + + "\x06detail\x18\x02 \x01(\tR\x06detail\"\xca\x01\n" + + "\x06Reason\x12\x16\n" + + "\x12REASON_UNSPECIFIED\x10\x00\x12!\n" + + "\x1dREASON_USER_NOT_PROVISIONABLE\x10\x01\x12$\n" + + " REASON_TARGET_MANAGED_EXTERNALLY\x10\x02\x12\x1c\n" + + "\x18REASON_LICENSE_EXHAUSTED\x10\x03\x12\x1d\n" + + "\x19REASON_VALIDATION_FAILURE\x10\x04\x12\"\n" + + "\x1eREASON_OPERATION_NOT_PERMITTED\x10\x05B6Z4github.com/conductorone/baton-sdk/pb/c1/connector/v2b\x06proto3" + +var file_c1_connector_v2_annotation_provision_failure_proto_enumTypes = make([]protoimpl.EnumInfo, 1) +var file_c1_connector_v2_annotation_provision_failure_proto_msgTypes = make([]protoimpl.MessageInfo, 1) +var file_c1_connector_v2_annotation_provision_failure_proto_goTypes = []any{ + (ProvisionFailureDetail_Reason)(0), // 0: c1.connector.v2.ProvisionFailureDetail.Reason + (*ProvisionFailureDetail)(nil), // 1: c1.connector.v2.ProvisionFailureDetail +} +var file_c1_connector_v2_annotation_provision_failure_proto_depIdxs = []int32{ + 0, // 0: c1.connector.v2.ProvisionFailureDetail.reason:type_name -> c1.connector.v2.ProvisionFailureDetail.Reason + 1, // [1:1] is the sub-list for method output_type + 1, // [1:1] is the sub-list for method input_type + 1, // [1:1] is the sub-list for extension type_name + 1, // [1:1] is the sub-list for extension extendee + 0, // [0:1] is the sub-list for field type_name +} + +func init() { file_c1_connector_v2_annotation_provision_failure_proto_init() } +func file_c1_connector_v2_annotation_provision_failure_proto_init() { + if File_c1_connector_v2_annotation_provision_failure_proto != nil { + return + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: unsafe.Slice(unsafe.StringData(file_c1_connector_v2_annotation_provision_failure_proto_rawDesc), len(file_c1_connector_v2_annotation_provision_failure_proto_rawDesc)), + NumEnums: 1, + NumMessages: 1, + NumExtensions: 0, + NumServices: 0, + }, + GoTypes: file_c1_connector_v2_annotation_provision_failure_proto_goTypes, + DependencyIndexes: file_c1_connector_v2_annotation_provision_failure_proto_depIdxs, + EnumInfos: file_c1_connector_v2_annotation_provision_failure_proto_enumTypes, + MessageInfos: file_c1_connector_v2_annotation_provision_failure_proto_msgTypes, + }.Build() + File_c1_connector_v2_annotation_provision_failure_proto = out.File + file_c1_connector_v2_annotation_provision_failure_proto_goTypes = nil + file_c1_connector_v2_annotation_provision_failure_proto_depIdxs = nil +} diff --git a/pkg/connectorbuilder/resource_provisioner.go b/pkg/connectorbuilder/resource_provisioner.go index 4024eea27..318cba794 100644 --- a/pkg/connectorbuilder/resource_provisioner.go +++ b/pkg/connectorbuilder/resource_provisioner.go @@ -23,6 +23,28 @@ import ( // // Implementing this interface indicates the connector supports provisioning operations // for the associated resource type. +// +// # Signaling terminal failures +// +// Grant and Revoke errors flow back to a remote caller that has to decide +// whether to retry. By default that decision is based on the gRPC status +// code: codes.Unavailable / Internal / DeadlineExceeded / ResourceExhausted +// retry; codes.FailedPrecondition / PermissionDenied / InvalidArgument do +// not. A plain `errors.New(...)` resolves to codes.Unknown and is treated +// as retryable, which causes upstream-permanent failures (target user +// SUSPENDED, group governed by a rule, license cap reached) to loop until +// a higher-level timeout fires. +// +// When the upstream system has told you the operation will not succeed +// on retry without external state change, return an error built with +// pkg/connectorerrors.NewTerminalError. It picks a non-retryable gRPC +// code and attaches a ProvisionFailureDetail proto so callers can also +// read a structured reason and admin-facing detail string. +// +// Transient failures (5xx, network blips, rate limits) should continue +// to be returned as ordinary errors — those are retryable, and rate +// limits in particular have their own annotation (RateLimitDescription) +// on the response. type ResourceProvisioner interface { ResourceSyncer ResourceProvisionerLimited diff --git a/pkg/connectorerrors/terminal.go b/pkg/connectorerrors/terminal.go new file mode 100644 index 000000000..0fd7891b6 --- /dev/null +++ b/pkg/connectorerrors/terminal.go @@ -0,0 +1,134 @@ +// Package connectorerrors helps connectors signal terminal (non-retryable) +// failures from Grant and Revoke in a structured way that upstream callers +// can act on. +// +// Today, when a connector returns a plain Go error, the upstream framing +// (in baton-sdk: fmt.Errorf("grant failed: %w", err), then again in the +// caller: codes.Unknown) erases information about whether the failure is +// retryable. Callers fall back to message-substring matching, which is +// brittle and per-connector. +// +// This package gives connectors a single helper that: +// - picks the right gRPC status code for a known terminal reason, so +// callers that inspect status.Code(err) automatically classify the +// error as non-retryable; +// - attaches a ProvisionFailureDetail (proto) as a gRPC status detail, +// so callers that want a typed reason and an upstream-facing detail +// string can read it without parsing free text. +// +// Connectors should still return ordinary errors for transient failures +// (network blips, 5xx, rate limits — those have their own annotation, +// RateLimitDescription). Only call NewTerminalError when you have +// positively identified an upstream signal that means "this will not +// succeed on retry without external state change." +package connectorerrors + +import ( + "fmt" + + v2 "github.com/conductorone/baton-sdk/pb/c1/connector/v2" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" +) + +// NewTerminalError returns an error suitable for returning from a +// Grant or Revoke implementation when the upstream system has signaled +// a permanent failure. The returned error carries: +// +// - a gRPC status code chosen from `reason` (see codeForReason); +// - a free-form detail message (formatted with format/args); +// - a ProvisionFailureDetail proto attached as a status detail, so +// structured callers can read the reason and detail without +// parsing the message. +// +// REASON_UNSPECIFIED is rejected — pass a real reason. If you genuinely +// don't know which bucket fits, REASON_VALIDATION_FAILURE is the +// catch-all for upstream rejections that won't change on retry. +func NewTerminalError(reason v2.ProvisionFailureDetail_Reason, format string, args ...any) error { + if reason == v2.ProvisionFailureDetail_REASON_UNSPECIFIED { + // Programmer error — callers must pick a reason. Return an + // error rather than panic so this doesn't take down a worker. + return status.Error(codes.Internal, "connectorerrors.NewTerminalError: reason must not be REASON_UNSPECIFIED") + } + + msg := fmt.Sprintf(format, args...) + st := status.New(codeForReason(reason), msg) + + detail := v2.ProvisionFailureDetail_builder{ + Reason: reason, + Detail: msg, + }.Build() + + withDetails, err := st.WithDetails(detail) + if err != nil { + // status.WithDetails only fails if the detail can't be + // marshaled; for a fixed proto we control, that should not + // happen in practice. Fall back to the status without + // details so the code is still correct. + return st.Err() + } + return withDetails.Err() +} + +// FailureDetail extracts a ProvisionFailureDetail from an error chain, +// returning (detail, true) when one is present. It walks fmt.Errorf +// %w and errors.Join chains via status.FromError (which uses errors.As), +// so wrapped errors are handled transparently. +func FailureDetail(err error) (*v2.ProvisionFailureDetail, bool) { + if err == nil { + return nil, false + } + st, ok := status.FromError(err) + if !ok || st == nil { + return nil, false + } + for _, d := range st.Details() { + // Some details fail to unmarshal (unknown proto types in the + // caller's binary surface as *errdetails.ErrorInfo-shaped + // errors here); skip those rather than aborting the walk. + if pfd, ok := d.(*v2.ProvisionFailureDetail); ok { + return pfd, true + } + } + return nil, false +} + +// IsTerminal reports whether the error carries a ProvisionFailureDetail. +// Equivalent to "_, ok := FailureDetail(err)" with no allocation in the +// false path; convenient when callers only need a yes/no. +func IsTerminal(err error) bool { + _, ok := FailureDetail(err) + return ok +} + +// codeForReason maps a ProvisionFailureDetail reason to the gRPC status +// code callers should observe. +// +// The codes here are the same ones baton-sdk's existing isClientErrorCode +// logic already treats as non-retryable, so connectors adopting this +// helper get correct retry behavior on day one — even from callers that +// haven't been updated to read ProvisionFailureDetail. +// +// - PermissionDenied for "the operation itself is not permitted" +// (REASON_OPERATION_NOT_PERMITTED) +// - FailedPrecondition for everything else: target state, external +// management, license exhaustion, validation. These all mean the +// upstream world is in a state that prevents this operation; retry +// against the same state will fail the same way. +func codeForReason(reason v2.ProvisionFailureDetail_Reason) codes.Code { + switch reason { + case v2.ProvisionFailureDetail_REASON_OPERATION_NOT_PERMITTED: + return codes.PermissionDenied + case v2.ProvisionFailureDetail_REASON_USER_NOT_PROVISIONABLE, + v2.ProvisionFailureDetail_REASON_TARGET_MANAGED_EXTERNALLY, + v2.ProvisionFailureDetail_REASON_LICENSE_EXHAUSTED, + v2.ProvisionFailureDetail_REASON_VALIDATION_FAILURE: + return codes.FailedPrecondition + default: + // REASON_UNSPECIFIED is filtered upstream in NewTerminalError; + // any future reason added without a code mapping defaults to + // FailedPrecondition rather than Unknown so it still classifies + // as non-retryable in current callers. + return codes.FailedPrecondition + } +} diff --git a/pkg/connectorerrors/terminal_test.go b/pkg/connectorerrors/terminal_test.go new file mode 100644 index 000000000..94796e48c --- /dev/null +++ b/pkg/connectorerrors/terminal_test.go @@ -0,0 +1,105 @@ +package connectorerrors + +import ( + "errors" + "fmt" + "testing" + + v2 "github.com/conductorone/baton-sdk/pb/c1/connector/v2" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "google.golang.org/grpc/codes" + "google.golang.org/grpc/status" +) + +func TestNewTerminalError_AttachesDetailAndCode(t *testing.T) { + cases := []struct { + name string + reason v2.ProvisionFailureDetail_Reason + wantCode codes.Code + }{ + {"user_status", v2.ProvisionFailureDetail_REASON_USER_NOT_PROVISIONABLE, codes.FailedPrecondition}, + {"group_rule", v2.ProvisionFailureDetail_REASON_TARGET_MANAGED_EXTERNALLY, codes.FailedPrecondition}, + {"license", v2.ProvisionFailureDetail_REASON_LICENSE_EXHAUSTED, codes.FailedPrecondition}, + {"validation", v2.ProvisionFailureDetail_REASON_VALIDATION_FAILURE, codes.FailedPrecondition}, + {"not_permitted", v2.ProvisionFailureDetail_REASON_OPERATION_NOT_PERMITTED, codes.PermissionDenied}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + err := NewTerminalError(tc.reason, "upstream said: %s", "no soup") + require.Error(t, err) + + assert.Equal(t, tc.wantCode, status.Code(err), "gRPC code should be picked from reason") + + detail, ok := FailureDetail(err) + require.True(t, ok, "FailureDetail must extract the proto") + assert.Equal(t, tc.reason, detail.GetReason()) + assert.Equal(t, "upstream said: no soup", detail.GetDetail()) + + assert.True(t, IsTerminal(err), "IsTerminal should agree") + }) + } +} + +func TestNewTerminalError_RejectsUnspecified(t *testing.T) { + err := NewTerminalError(v2.ProvisionFailureDetail_REASON_UNSPECIFIED, "should fail") + require.Error(t, err) + // Returned as Internal so a caller can still log it; the point of + // the rejection is to fail the programmer error loudly, not to + // classify the underlying upstream error. + assert.Equal(t, codes.Internal, status.Code(err)) + _, ok := FailureDetail(err) + assert.False(t, ok, "rejection error must not carry a ProvisionFailureDetail") +} + +func TestFailureDetail_UnwrapsAcrossErrorf(t *testing.T) { + // This mirrors the real call chain: connector returns NewTerminalError, + // connectorbuilder wraps with fmt.Errorf("grant failed: %w", err), + // the caller may wrap again with errors.Join. status.FromError walks + // the chain via errors.As, so FailureDetail should still find the + // detail. + base := NewTerminalError(v2.ProvisionFailureDetail_REASON_USER_NOT_PROVISIONABLE, + "Okta E0000038: user is SUSPENDED") + + wrappedOnce := fmt.Errorf("grant failed: %w", base) + wrappedTwice := errors.Join(errors.New("ErrConnectorInvokeReturnedError"), wrappedOnce) + + t.Run("fmt.Errorf %w", func(t *testing.T) { + detail, ok := FailureDetail(wrappedOnce) + require.True(t, ok) + assert.Equal(t, v2.ProvisionFailureDetail_REASON_USER_NOT_PROVISIONABLE, detail.GetReason()) + }) + + t.Run("errors.Join over fmt.Errorf", func(t *testing.T) { + detail, ok := FailureDetail(wrappedTwice) + require.True(t, ok) + assert.Equal(t, v2.ProvisionFailureDetail_REASON_USER_NOT_PROVISIONABLE, detail.GetReason()) + }) + + t.Run("status.Code survives wrapping", func(t *testing.T) { + assert.Equal(t, codes.FailedPrecondition, status.Code(wrappedOnce)) + assert.Equal(t, codes.FailedPrecondition, status.Code(wrappedTwice)) + }) +} + +func TestFailureDetail_PlainErrors(t *testing.T) { + t.Run("nil", func(t *testing.T) { + d, ok := FailureDetail(nil) + assert.False(t, ok) + assert.Nil(t, d) + assert.False(t, IsTerminal(nil)) + }) + + t.Run("plain go error", func(t *testing.T) { + d, ok := FailureDetail(errors.New("network blip")) + assert.False(t, ok) + assert.Nil(t, d) + }) + + t.Run("status without detail", func(t *testing.T) { + d, ok := FailureDetail(status.Error(codes.FailedPrecondition, "no detail attached")) + assert.False(t, ok, "FailedPrecondition without ProvisionFailureDetail must not look terminal-typed") + assert.Nil(t, d) + }) +} diff --git a/proto/c1/connector/v2/annotation_provision_failure.proto b/proto/c1/connector/v2/annotation_provision_failure.proto new file mode 100644 index 000000000..18de566aa --- /dev/null +++ b/proto/c1/connector/v2/annotation_provision_failure.proto @@ -0,0 +1,66 @@ +syntax = "proto3"; + +package c1.connector.v2; + +option go_package = "github.com/conductorone/baton-sdk/pb/c1/connector/v2"; + +// ProvisionFailureDetail describes WHY a Grant or Revoke failure is permanent. +// Connectors attach this to gRPC status details (via status.WithDetails) on +// errors returned from Grant/Revoke when the underlying upstream API has +// indicated the failure will not succeed on retry without external state +// change. +// +// The canonical signal callers act on is still the gRPC status code: when +// the upstream is permanently rejecting the operation, the connector should +// pick the matching code (FailedPrecondition for "state-blocked", +// PermissionDenied for "not permitted", etc.). This message *supplements* +// that with a structured reason and human-readable detail so callers can +// log, tag metrics, and route admin-facing remediation text consistently +// across connectors. +// +// For transient failures (5xx, network blip, rate limit), return errors as +// normal — those are retryable. Rate-limit-specific signaling has its own +// annotation (RateLimitDescription). +message ProvisionFailureDetail { + enum Reason { + REASON_UNSPECIFIED = 0; + + // The target user is in a state that the upstream system does not + // permit modifications against. Examples: Okta SUSPENDED / DEPROVISIONED + // / pending password reset (E0000038); Workday INACTIVE; locked AD + // account. Resolves when the user's status changes upstream. + REASON_USER_NOT_PROVISIONABLE = 1; + + // The target entitlement (group / role / queue / etc.) is governed by + // an external rule that the connector cannot override. Examples: Okta + // group rule (E0000001 groupMembers); AD group policy; Workday + // delivered security group. Resolves when the rule is changed or the + // subject is moved out of the rule's source set. + REASON_TARGET_MANAGED_EXTERNALLY = 2; + + // The upstream system has no remaining capacity for this operation. + // Examples: Jira "exceeds number of allowed users licensed"; + // Salesforce license type cap; Okta seat limit. Resolves when seats + // free up or the tenant purchases more. + REASON_LICENSE_EXHAUSTED = 3; + + // Upstream API validation rejected the operation for a reason that + // won't change on retry: malformed reference, conflicting policy, + // schema-level invariant. Use this for non-categorized terminal + // validation failures. + REASON_VALIDATION_FAILURE = 4; + + // The connector or upstream system does not permit the requested + // operation regardless of state. Examples: cannot revoke a + // self-managed group; cannot grant via this endpoint; required + // capability disabled by tenant admin. + REASON_OPERATION_NOT_PERMITTED = 5; + } + + Reason reason = 1; + + // Human-readable detail intended for logs and admin-visible operational + // context — not user-facing. Include the upstream error code or message + // when available (e.g. "Okta E0000038: user 'jdoe' is SUSPENDED"). + string detail = 2; +} From b33ecaf403938315012ba5b53ab2e14c24b0d20c Mon Sep 17 00:00:00 2001 From: arreyder Date: Thu, 14 May 2026 17:54:54 -0500 Subject: [PATCH 2/2] connectorerrors: address review feedback - Trim package doc; drop historical narrative - Add cross-reference on ResourceProvisionerV2 so the recommended interface points at the helper too - Fix IsTerminal docstring (FailureDetail allocates via FromError) - Panic on the unreachable WithDetails error path so a future field change can't silently strip the detail - Skip fmt.Sprintf when len(args)==0 so a literal upstream message containing % isn't reinterpreted - Document the Reason-is-authoritative contract on NewTerminalError / codeForReason; note forward-compat on FailureDetail - Sharpen LICENSE_EXHAUSTED (capacity) vs OPERATION_NOT_PERMITTED (policy) boundary in proto comments Co-Authored-By: Claude Opus 4.7 (1M context) --- .../v2/annotation_provision_failure.pb.go | 28 +++++---- ...tation_provision_failure_protoopaque.pb.go | 28 +++++---- pkg/connectorbuilder/resource_provisioner.go | 5 ++ pkg/connectorerrors/terminal.go | 63 ++++++++++--------- .../v2/annotation_provision_failure.proto | 28 +++++---- 5 files changed, 89 insertions(+), 63 deletions(-) diff --git a/pb/c1/connector/v2/annotation_provision_failure.pb.go b/pb/c1/connector/v2/annotation_provision_failure.pb.go index 9de46793c..e3aa555fd 100644 --- a/pb/c1/connector/v2/annotation_provision_failure.pb.go +++ b/pb/c1/connector/v2/annotation_provision_failure.pb.go @@ -37,20 +37,26 @@ const ( // delivered security group. Resolves when the rule is changed or the // subject is moved out of the rule's source set. ProvisionFailureDetail_REASON_TARGET_MANAGED_EXTERNALLY ProvisionFailureDetail_Reason = 2 - // The upstream system has no remaining capacity for this operation. - // Examples: Jira "exceeds number of allowed users licensed"; - // Salesforce license type cap; Okta seat limit. Resolves when seats - // free up or the tenant purchases more. + // CAPACITY problem: the upstream system has run out of seats / + // licenses / quota for this operation. Examples: Jira "exceeds + // number of allowed users licensed"; Salesforce license type cap; + // Okta seat limit. Resolves when seats free up or the tenant + // purchases more. Use this only for numerical capacity; for + // capability or feature toggles, prefer REASON_OPERATION_NOT_PERMITTED. ProvisionFailureDetail_REASON_LICENSE_EXHAUSTED ProvisionFailureDetail_Reason = 3 // Upstream API validation rejected the operation for a reason that - // won't change on retry: malformed reference, conflicting policy, - // schema-level invariant. Use this for non-categorized terminal - // validation failures. + // won't change on retry: malformed reference, schema-level + // invariant. Use this for non-categorized terminal validation + // failures. (For policy-driven rejection where the upstream itself + // says the operation isn't allowed, prefer + // REASON_OPERATION_NOT_PERMITTED.) ProvisionFailureDetail_REASON_VALIDATION_FAILURE ProvisionFailureDetail_Reason = 4 - // The connector or upstream system does not permit the requested - // operation regardless of state. Examples: cannot revoke a - // self-managed group; cannot grant via this endpoint; required - // capability disabled by tenant admin. + // POLICY problem: the connector or upstream system does not permit + // the requested operation regardless of state. Examples: cannot + // revoke a self-managed group; cannot grant via this endpoint; + // required capability disabled by tenant admin; feature toggle off. + // Use this when the upstream is saying "you're not allowed to do + // this"; use REASON_LICENSE_EXHAUSTED only for numerical capacity. ProvisionFailureDetail_REASON_OPERATION_NOT_PERMITTED ProvisionFailureDetail_Reason = 5 ) diff --git a/pb/c1/connector/v2/annotation_provision_failure_protoopaque.pb.go b/pb/c1/connector/v2/annotation_provision_failure_protoopaque.pb.go index a945104d2..d0e8d9a34 100644 --- a/pb/c1/connector/v2/annotation_provision_failure_protoopaque.pb.go +++ b/pb/c1/connector/v2/annotation_provision_failure_protoopaque.pb.go @@ -37,20 +37,26 @@ const ( // delivered security group. Resolves when the rule is changed or the // subject is moved out of the rule's source set. ProvisionFailureDetail_REASON_TARGET_MANAGED_EXTERNALLY ProvisionFailureDetail_Reason = 2 - // The upstream system has no remaining capacity for this operation. - // Examples: Jira "exceeds number of allowed users licensed"; - // Salesforce license type cap; Okta seat limit. Resolves when seats - // free up or the tenant purchases more. + // CAPACITY problem: the upstream system has run out of seats / + // licenses / quota for this operation. Examples: Jira "exceeds + // number of allowed users licensed"; Salesforce license type cap; + // Okta seat limit. Resolves when seats free up or the tenant + // purchases more. Use this only for numerical capacity; for + // capability or feature toggles, prefer REASON_OPERATION_NOT_PERMITTED. ProvisionFailureDetail_REASON_LICENSE_EXHAUSTED ProvisionFailureDetail_Reason = 3 // Upstream API validation rejected the operation for a reason that - // won't change on retry: malformed reference, conflicting policy, - // schema-level invariant. Use this for non-categorized terminal - // validation failures. + // won't change on retry: malformed reference, schema-level + // invariant. Use this for non-categorized terminal validation + // failures. (For policy-driven rejection where the upstream itself + // says the operation isn't allowed, prefer + // REASON_OPERATION_NOT_PERMITTED.) ProvisionFailureDetail_REASON_VALIDATION_FAILURE ProvisionFailureDetail_Reason = 4 - // The connector or upstream system does not permit the requested - // operation regardless of state. Examples: cannot revoke a - // self-managed group; cannot grant via this endpoint; required - // capability disabled by tenant admin. + // POLICY problem: the connector or upstream system does not permit + // the requested operation regardless of state. Examples: cannot + // revoke a self-managed group; cannot grant via this endpoint; + // required capability disabled by tenant admin; feature toggle off. + // Use this when the upstream is saying "you're not allowed to do + // this"; use REASON_LICENSE_EXHAUSTED only for numerical capacity. ProvisionFailureDetail_REASON_OPERATION_NOT_PERMITTED ProvisionFailureDetail_Reason = 5 ) diff --git a/pkg/connectorbuilder/resource_provisioner.go b/pkg/connectorbuilder/resource_provisioner.go index 318cba794..61d37b8f9 100644 --- a/pkg/connectorbuilder/resource_provisioner.go +++ b/pkg/connectorbuilder/resource_provisioner.go @@ -68,6 +68,11 @@ type GrantProvisioner interface { // // This is the recommended interface for implementing provisioning operations in new connectors. // It differs from ResourceProvisioner by returning a list of grants from the Grant method. +// +// The same terminal-error contract as ResourceProvisioner applies — return +// errors built with pkg/connectorerrors.NewTerminalError when the upstream +// system has signaled a permanent failure, so callers don't retry. See +// the ResourceProvisioner docstring above for the full rationale. type ResourceProvisionerV2 interface { ResourceSyncerV2 ResourceProvisionerV2Limited diff --git a/pkg/connectorerrors/terminal.go b/pkg/connectorerrors/terminal.go index 0fd7891b6..52a4fb49c 100644 --- a/pkg/connectorerrors/terminal.go +++ b/pkg/connectorerrors/terminal.go @@ -1,26 +1,11 @@ // Package connectorerrors helps connectors signal terminal (non-retryable) -// failures from Grant and Revoke in a structured way that upstream callers -// can act on. +// failures from Grant and Revoke in a way upstream callers can act on. // -// Today, when a connector returns a plain Go error, the upstream framing -// (in baton-sdk: fmt.Errorf("grant failed: %w", err), then again in the -// caller: codes.Unknown) erases information about whether the failure is -// retryable. Callers fall back to message-substring matching, which is -// brittle and per-connector. -// -// This package gives connectors a single helper that: -// - picks the right gRPC status code for a known terminal reason, so -// callers that inspect status.Code(err) automatically classify the -// error as non-retryable; -// - attaches a ProvisionFailureDetail (proto) as a gRPC status detail, -// so callers that want a typed reason and an upstream-facing detail -// string can read it without parsing free text. -// -// Connectors should still return ordinary errors for transient failures -// (network blips, 5xx, rate limits — those have their own annotation, -// RateLimitDescription). Only call NewTerminalError when you have -// positively identified an upstream signal that means "this will not -// succeed on retry without external state change." +// Use NewTerminalError when an upstream API has signaled a permanent +// failure (target user state, externally-managed group, license cap, +// validation). For transient failures — 5xx, network blips, rate limits +// — return ordinary errors; rate limits have their own annotation +// (RateLimitDescription) on the response. package connectorerrors import ( @@ -44,6 +29,10 @@ import ( // REASON_UNSPECIFIED is rejected — pass a real reason. If you genuinely // don't know which bucket fits, REASON_VALIDATION_FAILURE is the // catch-all for upstream rejections that won't change on retry. +// +// Contract: Reason is the authoritative semantic signal; the gRPC code +// is a coarse retryability hint maintained for callers that classify +// purely on status.Code. New callers should branch on Reason. func NewTerminalError(reason v2.ProvisionFailureDetail_Reason, format string, args ...any) error { if reason == v2.ProvisionFailureDetail_REASON_UNSPECIFIED { // Programmer error — callers must pick a reason. Return an @@ -51,7 +40,12 @@ func NewTerminalError(reason v2.ProvisionFailureDetail_Reason, format string, ar return status.Error(codes.Internal, "connectorerrors.NewTerminalError: reason must not be REASON_UNSPECIFIED") } - msg := fmt.Sprintf(format, args...) + // Pass format through verbatim when no args, so a `%` in an + // upstream message ("E0000099: 50% capacity") isn't reinterpreted. + msg := format + if len(args) > 0 { + msg = fmt.Sprintf(format, args...) + } st := status.New(codeForReason(reason), msg) detail := v2.ProvisionFailureDetail_builder{ @@ -61,11 +55,12 @@ func NewTerminalError(reason v2.ProvisionFailureDetail_Reason, format string, ar withDetails, err := st.WithDetails(detail) if err != nil { - // status.WithDetails only fails if the detail can't be - // marshaled; for a fixed proto we control, that should not - // happen in practice. Fall back to the status without - // details so the code is still correct. - return st.Err() + // Unreachable for a proto we own — WithDetails only fails on + // marshal error, and ProvisionFailureDetail has no fields that + // can fail to marshal. Panic so we notice if a future field + // change breaks the invariant, rather than silently returning + // a status without the detail. + panic(fmt.Sprintf("connectorerrors: status.WithDetails failed on ProvisionFailureDetail: %v", err)) } return withDetails.Err() } @@ -74,6 +69,11 @@ func NewTerminalError(reason v2.ProvisionFailureDetail_Reason, format string, ar // returning (detail, true) when one is present. It walks fmt.Errorf // %w and errors.Join chains via status.FromError (which uses errors.As), // so wrapped errors are handled transparently. +// +// Forward compat: proto3 preserves unknown enum integers on the wire, +// so a future REASON_* value sent by a newer connector decodes here +// without error and detail.GetReason() returns the integer. Callers +// switching on Reason must include a default case for unknown values. func FailureDetail(err error) (*v2.ProvisionFailureDetail, bool) { if err == nil { return nil, false @@ -94,15 +94,18 @@ func FailureDetail(err error) (*v2.ProvisionFailureDetail, bool) { } // IsTerminal reports whether the error carries a ProvisionFailureDetail. -// Equivalent to "_, ok := FailureDetail(err)" with no allocation in the -// false path; convenient when callers only need a yes/no. +// Equivalent to `_, ok := FailureDetail(err); return ok` — convenient +// when callers only need a yes/no. func IsTerminal(err error) bool { _, ok := FailureDetail(err) return ok } // codeForReason maps a ProvisionFailureDetail reason to the gRPC status -// code callers should observe. +// code callers should observe. This mapping is part of the package +// contract — connector authors should not rely on a specific code beyond +// "non-retryable", and callers wanting fine-grained classification +// should read Reason from the attached ProvisionFailureDetail. // // The codes here are the same ones baton-sdk's existing isClientErrorCode // logic already treats as non-retryable, so connectors adopting this diff --git a/proto/c1/connector/v2/annotation_provision_failure.proto b/proto/c1/connector/v2/annotation_provision_failure.proto index 18de566aa..f411e1d26 100644 --- a/proto/c1/connector/v2/annotation_provision_failure.proto +++ b/proto/c1/connector/v2/annotation_provision_failure.proto @@ -38,22 +38,28 @@ message ProvisionFailureDetail { // subject is moved out of the rule's source set. REASON_TARGET_MANAGED_EXTERNALLY = 2; - // The upstream system has no remaining capacity for this operation. - // Examples: Jira "exceeds number of allowed users licensed"; - // Salesforce license type cap; Okta seat limit. Resolves when seats - // free up or the tenant purchases more. + // CAPACITY problem: the upstream system has run out of seats / + // licenses / quota for this operation. Examples: Jira "exceeds + // number of allowed users licensed"; Salesforce license type cap; + // Okta seat limit. Resolves when seats free up or the tenant + // purchases more. Use this only for numerical capacity; for + // capability or feature toggles, prefer REASON_OPERATION_NOT_PERMITTED. REASON_LICENSE_EXHAUSTED = 3; // Upstream API validation rejected the operation for a reason that - // won't change on retry: malformed reference, conflicting policy, - // schema-level invariant. Use this for non-categorized terminal - // validation failures. + // won't change on retry: malformed reference, schema-level + // invariant. Use this for non-categorized terminal validation + // failures. (For policy-driven rejection where the upstream itself + // says the operation isn't allowed, prefer + // REASON_OPERATION_NOT_PERMITTED.) REASON_VALIDATION_FAILURE = 4; - // The connector or upstream system does not permit the requested - // operation regardless of state. Examples: cannot revoke a - // self-managed group; cannot grant via this endpoint; required - // capability disabled by tenant admin. + // POLICY problem: the connector or upstream system does not permit + // the requested operation regardless of state. Examples: cannot + // revoke a self-managed group; cannot grant via this endpoint; + // required capability disabled by tenant admin; feature toggle off. + // Use this when the upstream is saying "you're not allowed to do + // this"; use REASON_LICENSE_EXHAUSTED only for numerical capacity. REASON_OPERATION_NOT_PERMITTED = 5; }