diff --git a/cmd/baton-c1z-sanitize/main.go b/cmd/baton-c1z-sanitize/main.go new file mode 100644 index 000000000..bbe4e493d --- /dev/null +++ b/cmd/baton-c1z-sanitize/main.go @@ -0,0 +1,150 @@ +// baton-c1z-sanitize transforms a .c1z snapshot into an identity- +// stripped copy whose graph topology and cardinalities are preserved. +// See pkg/c1zsanitize for the transform contract. +package main + +import ( + "context" + "crypto/rand" + "flag" + "fmt" + "os" + "time" + + "github.com/grpc-ecosystem/go-grpc-middleware/logging/zap/ctxzap" + "go.uber.org/zap" + + "github.com/conductorone/baton-sdk/pkg/c1zsanitize" + "github.com/conductorone/baton-sdk/pkg/dotc1z" + "github.com/conductorone/baton-sdk/pkg/logging" +) + +func main() { + if err := run(); err != nil { + fmt.Fprintln(os.Stderr, "baton-c1z-sanitize:", err) + os.Exit(1) + } +} + +func run() error { + fs := flag.NewFlagSet("baton-c1z-sanitize", flag.ContinueOnError) + inPath := fs.String("in", "", "Path to source .c1z file (required)") + outPath := fs.String("out", "", "Path to sanitized .c1z output file (required)") + secretFile := fs.String("secret-file", "", "Path to per-c1z HMAC secret (>=32 random bytes). If unset, a fresh secret is generated and written next to -out.") + anchorRaw := fs.String("anchor", "", "RFC3339 timestamp the newest source timestamp lands on. Defaults to now.") + allowUnknown := fs.Bool("allow-unknown-annotations", false, "Pass annotations of unknown type through unchanged instead of dropping. Dangerous on real customer data.") + logLevel := fs.String("log-level", "info", "Log level: debug, info, warn, error.") + logFormat := fs.String("log-format", "console", "Log format: console or json.") + + if err := fs.Parse(os.Args[1:]); err != nil { + return err + } + if *inPath == "" || *outPath == "" { + fs.Usage() + return fmt.Errorf("-in and -out are required") + } + + ctx, err := logging.Init(context.Background(), + logging.WithLogLevel(*logLevel), + logging.WithLogFormat(*logFormat), + ) + if err != nil { + return fmt.Errorf("init logging: %w", err) + } + log := ctxzap.Extract(ctx) + + secret, generated, err := loadOrGenerateSecret(*secretFile, *outPath) + if err != nil { + return err + } + if generated { + log.Warn("c1zsanitize: generated fresh secret; archive it if you want reversibility", + zap.String("path", secretPath(*secretFile, *outPath))) + } + + var anchor time.Time + if *anchorRaw != "" { + anchor, err = time.Parse(time.RFC3339, *anchorRaw) + if err != nil { + return fmt.Errorf("parse -anchor: %w", err) + } + } + + if _, err := os.Stat(*inPath); err != nil { + return fmt.Errorf("stat -in: %w", err) + } + if _, err := os.Stat(*outPath); err == nil { + return fmt.Errorf("-out path %q already exists; refusing to overwrite", *outPath) + } + + src, err := dotc1z.NewC1ZFile(ctx, *inPath, dotc1z.WithReadOnly(true)) + if err != nil { + return fmt.Errorf("open source c1z: %w", err) + } + defer src.Close(ctx) + + dst, err := dotc1z.NewC1ZFile(ctx, *outPath) + if err != nil { + return fmt.Errorf("open dst c1z: %w", err) + } + dstClosed := false + defer func() { + if !dstClosed { + _ = dst.Close(ctx) + } + }() + + opts := c1zsanitize.Options{ + Secret: secret, + TimestampAnchor: anchor, + DropUnknownAnnotations: !*allowUnknown, + } + + log.Info("c1zsanitize: starting", + zap.String("in", *inPath), + zap.String("out", *outPath), + zap.Bool("drop_unknown_annotations", opts.DropUnknownAnnotations), + ) + start := time.Now() + if err := c1zsanitize.Sanitize(ctx, src, dst, opts); err != nil { + return fmt.Errorf("sanitize: %w", err) + } + dstClosed = true + if err := dst.Close(ctx); err != nil { + return fmt.Errorf("close dst c1z: %w", err) + } + log.Info("c1zsanitize: done", zap.Duration("elapsed", time.Since(start))) + return nil +} + +func secretPath(flagPath, outPath string) string { + if flagPath != "" { + return flagPath + } + return outPath + ".secret" +} + +func loadOrGenerateSecret(flagPath, outPath string) ([]byte, bool, error) { + if flagPath != "" { + b, err := os.ReadFile(flagPath) + if err != nil { + return nil, false, fmt.Errorf("read -secret-file: %w", err) + } + if len(b) < c1zsanitize.MinSecretBytes { + return nil, false, fmt.Errorf("-secret-file %q is too short: got %d bytes, want at least %d", flagPath, len(b), c1zsanitize.MinSecretBytes) + } + return b, false, nil + } + path := secretPath(flagPath, outPath) + if _, err := os.Stat(path); err == nil { + return nil, false, fmt.Errorf("default secret path %q already exists; pass -secret-file to reuse it", path) + } + b := make([]byte, c1zsanitize.MinSecretBytes) + if _, err := rand.Read(b); err != nil { + return nil, false, fmt.Errorf("generate secret: %w", err) + } + if err := os.WriteFile(path, b, 0o600); err != nil { + return nil, false, fmt.Errorf("write generated secret: %w", err) + } + return b, true, nil +} diff --git a/pkg/c1zsanitize/annotations.go b/pkg/c1zsanitize/annotations.go new file mode 100644 index 000000000..66d12c62c --- /dev/null +++ b/pkg/c1zsanitize/annotations.go @@ -0,0 +1,85 @@ +package c1zsanitize + +import ( + "go.uber.org/zap" + "google.golang.org/protobuf/proto" + "google.golang.org/protobuf/types/known/anypb" + + v2 "github.com/conductorone/baton-sdk/pb/c1/connector/v2" +) + +// annotationHandler returns a sanitized replacement for the supplied +// annotation message. Implementations may register additional asset +// references via the supplied set so copyAssets can pick them up at +// end-of-sync. Implementations MUST NOT mutate the input. +type annotationHandler func(s *sanitizer, msg proto.Message, refs *assetRefSet) proto.Message + +func defaultAnnotationHandlers() map[string]annotationHandler { + return map[string]annotationHandler{ + typeURL(&v2.UserTrait{}): handleUserTrait, + typeURL(&v2.GroupTrait{}): handleGroupTrait, + typeURL(&v2.AppTrait{}): handleAppTrait, + typeURL(&v2.RoleTrait{}): handleRoleTrait, + typeURL(&v2.SecretTrait{}): handleSecretTrait, + typeURL(&v2.LicenseProfileTrait{}): handleLicenseProfileTrait, + typeURL(&v2.ScopeBindingTrait{}): handleScopeBindingTrait, + } +} + +// typeURL returns the canonical anypb type-URL for a proto message. +// It matches what anypb.New would set on Any.TypeUrl, so it works as +// a lookup key against the values returned by Any.GetTypeUrl(). +func typeURL(m proto.Message) string { + a, err := anypb.New(m) + if err != nil { + panic(err) + } + return a.GetTypeUrl() +} + +// transformAnnotations walks the slice once, dispatching each entry +// on its Any type URL. Unknown types are dropped by default (with a +// log line naming the URL) or passed through unchanged if the +// operator opted in via Options.DropUnknownAnnotations=false. +func (s *sanitizer) transformAnnotations(in []*anypb.Any, refs *assetRefSet) []*anypb.Any { + if len(in) == 0 { + return nil + } + out := make([]*anypb.Any, 0, len(in)) + for _, a := range in { + if a == nil { + continue + } + handler, ok := s.handlers[a.GetTypeUrl()] + if !ok { + if s.dropUnknownAnnotations { + s.log.Debug("c1zsanitize: dropping unknown annotation", zap.String("type_url", a.GetTypeUrl())) + continue + } + s.log.Warn("c1zsanitize: passing unknown annotation through unchanged", zap.String("type_url", a.GetTypeUrl())) + out = append(out, a) + continue + } + msg, err := a.UnmarshalNew() + if err != nil { + s.log.Warn("c1zsanitize: failed to unmarshal annotation; dropping", + zap.String("type_url", a.GetTypeUrl()), zap.Error(err)) + continue + } + sanitized := handler(s, msg, refs) + if sanitized == nil { + continue + } + repacked, err := anypb.New(sanitized) + if err != nil { + s.log.Warn("c1zsanitize: failed to repack annotation; dropping", + zap.String("type_url", a.GetTypeUrl()), zap.Error(err)) + continue + } + out = append(out, repacked) + } + if len(out) == 0 { + return nil + } + return out +} diff --git a/pkg/c1zsanitize/assets.go b/pkg/c1zsanitize/assets.go new file mode 100644 index 000000000..8a9f27216 --- /dev/null +++ b/pkg/c1zsanitize/assets.go @@ -0,0 +1,128 @@ +package c1zsanitize + +import ( + "context" + "errors" + "fmt" + "io" + "strings" + "sync" + + "go.uber.org/zap" + + v2 "github.com/conductorone/baton-sdk/pb/c1/connector/v2" + "github.com/conductorone/baton-sdk/pkg/connectorstore" +) + +// AssetRecord.data is replaced with a deterministic placeholder +// matching the original content type. The placeholder bytes are +// chosen so that consumers that inspect content_type without parsing +// the payload still see a sensible content_type/length pair. +// +// content_type values not in the explicit map fall through to a +// single zero byte; that's enough to keep the row non-empty (PutAsset +// silently drops empty data) while preserving the cross-reference. +var placeholderByContentType = map[string][]byte{ + // 1x1 transparent PNG. + "image/png": { + 0x89, 0x50, 0x4e, 0x47, 0x0d, 0x0a, 0x1a, 0x0a, + 0x00, 0x00, 0x00, 0x0d, 0x49, 0x48, 0x44, 0x52, + 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, + 0x08, 0x06, 0x00, 0x00, 0x00, 0x1f, 0x15, 0xc4, + 0x89, 0x00, 0x00, 0x00, 0x0d, 0x49, 0x44, 0x41, + 0x54, 0x78, 0x9c, 0x62, 0x00, 0x01, 0x00, 0x00, + 0x05, 0x00, 0x01, 0x0d, 0x0a, 0x2d, 0xb4, 0x00, + 0x00, 0x00, 0x00, 0x49, 0x45, 0x4e, 0x44, 0xae, + 0x42, 0x60, 0x82, + }, +} + +// placeholderForContentType returns the bytes the sanitizer writes +// in place of the original asset payload. The single-byte fallback +// is intentional: zero-length data is silently dropped by PutAsset, +// which would break the cross-reference invariant. +func placeholderForContentType(contentType string) []byte { + if b, ok := placeholderByContentType[strings.ToLower(contentType)]; ok { + return b + } + if strings.HasPrefix(strings.ToLower(contentType), "image/") { + return placeholderByContentType["image/png"] + } + return []byte{0x00} +} + +// assetRefSet collects every AssetRef.Id encountered while walking +// records during a sync. The set is drained after the record walk by +// copyAssets which fetches each original asset, replaces its payload +// with a placeholder, and writes the sanitized AssetRef into dst. +type assetRefSet struct { + mu sync.Mutex + m map[string]struct{} +} + +func newAssetRefSet() *assetRefSet { + return &assetRefSet{m: map[string]struct{}{}} +} + +func (a *assetRefSet) add(id string) { + if id == "" { + return + } + a.mu.Lock() + a.m[id] = struct{}{} + a.mu.Unlock() +} + +func (a *assetRefSet) drain() []string { + a.mu.Lock() + defer a.mu.Unlock() + out := make([]string, 0, len(a.m)) + for id := range a.m { + out = append(out, id) + } + a.m = map[string]struct{}{} + return out +} + +// drainAndClose consumes the reader and closes it if the underlying +// type also implements io.Closer. The connectorstore.Reader.GetAsset +// contract returns an io.Reader, but at least one implementation +// returns *os.File; not closing it leaks a fd per asset. +func drainAndClose(r io.Reader) error { + if c, ok := r.(io.Closer); ok { + defer c.Close() + } + _, err := io.Copy(io.Discard, r) + return err +} + +func (s *sanitizer) copyAssets( + ctx context.Context, + src connectorstore.Reader, + dst connectorstore.Writer, + refs *assetRefSet, +) error { + ids := refs.drain() + for _, srcID := range ids { + req := v2.AssetServiceGetAssetRequest_builder{ + Asset: v2.AssetRef_builder{Id: srcID}.Build(), + }.Build() + contentType, r, err := src.GetAsset(ctx, req) + if err != nil { + // Asset referenced from an annotation but missing from + // the asset table. Skip — we don't fabricate placeholder + // rows because the cross-reference invariant treats it + // as a known dangling pointer in the source. + s.log.Debug("c1zsanitize: asset ref not found in source", zap.String("asset_id", srcID), zap.Error(err)) + continue + } + if err := drainAndClose(r); err != nil && !errors.Is(err, io.EOF) { + return fmt.Errorf("drain source asset %s: %w", srcID, err) + } + dstID := s.id(srcID) + if err := dst.PutAsset(ctx, v2.AssetRef_builder{Id: dstID}.Build(), contentType, placeholderForContentType(contentType)); err != nil { + return fmt.Errorf("put dst asset %s: %w", dstID, err) + } + } + return nil +} diff --git a/pkg/c1zsanitize/email.go b/pkg/c1zsanitize/email.go new file mode 100644 index 000000000..15114351d --- /dev/null +++ b/pkg/c1zsanitize/email.go @@ -0,0 +1,49 @@ +package c1zsanitize + +import ( + "strings" + "sync" +) + +// domainMap holds the deterministic original→sanitized domain mapping +// for a single c1z sanitization run. Same source domain maps to the +// same destination domain across every email encountered in the c1z; +// distinct source domains never collide on the destination side. +// +// The map is populated lazily as emails are sanitized. The mapping +// itself is purely a function of the per-c1z secret, so the map's +// only purpose is to avoid recomputing the HMAC for repeated lookups. +type domainMap struct { + mu sync.Mutex + m map[string]string +} + +func newDomainMap() *domainMap { + return &domainMap{m: map[string]string{}} +} + +func (d *domainMap) lookup(secret []byte, domain string) string { + d.mu.Lock() + defer d.mu.Unlock() + if v, ok := d.m[domain]; ok { + return v + } + v := "dom-" + strings.ToLower(SanitizeID(secret, domain)) + ".example" + d.m[domain] = v + return v +} + +// sanitizeEmail HMACs the localpart and assigns a deterministic +// sentinel domain through the per-c1z domain map. Cross-tenant +// "all from @acme.com" shapes are preserved without leaking acme.com. +// An input lacking a single '@' is HMACed wholesale — the caller may +// have passed something that looks like an email but isn't. +func sanitizeEmail(secret []byte, dm *domainMap, addr string) string { + at := strings.LastIndexByte(addr, '@') + if at < 0 { + return SanitizeID(secret, addr) + } + local := addr[:at] + domain := addr[at+1:] + return SanitizeID(secret, local) + "@" + dm.lookup(secret, domain) +} diff --git a/pkg/c1zsanitize/email_test.go b/pkg/c1zsanitize/email_test.go new file mode 100644 index 000000000..80571998e --- /dev/null +++ b/pkg/c1zsanitize/email_test.go @@ -0,0 +1,48 @@ +package c1zsanitize + +import ( + "strings" + "testing" +) + +func TestSanitizeEmailPreservesShape(t *testing.T) { + secret := bytes32("s") + dm := newDomainMap() + got := sanitizeEmail(secret, dm, "john.doe@acme.com") + if !strings.ContainsRune(got, '@') { + t.Fatalf("expected '@' to survive, got %q", got) + } +} + +func TestSanitizeEmailSameDomainMapsConsistently(t *testing.T) { + secret := bytes32("s") + dm := newDomainMap() + a := sanitizeEmail(secret, dm, "alice@acme.com") + b := sanitizeEmail(secret, dm, "bob@acme.com") + aDom := a[strings.LastIndexByte(a, '@')+1:] + bDom := b[strings.LastIndexByte(b, '@')+1:] + if aDom != bDom { + t.Fatalf("expected same source domain to map consistently; got %q vs %q", aDom, bDom) + } +} + +func TestSanitizeEmailDistinctDomainsDistinctOutputs(t *testing.T) { + secret := bytes32("s") + dm := newDomainMap() + a := sanitizeEmail(secret, dm, "alice@acme.com") + b := sanitizeEmail(secret, dm, "alice@example.com") + aDom := a[strings.LastIndexByte(a, '@')+1:] + bDom := b[strings.LastIndexByte(b, '@')+1:] + if aDom == bDom { + t.Fatalf("expected distinct source domains to map distinctly; both %q", aDom) + } +} + +func TestSanitizeEmailNoAtTreatsAsID(t *testing.T) { + secret := bytes32("s") + dm := newDomainMap() + got := sanitizeEmail(secret, dm, "no-at-sign-here") + if strings.ContainsRune(got, '@') { + t.Fatalf("expected no '@' in non-email output; got %q", got) + } +} diff --git a/pkg/c1zsanitize/handlers.go b/pkg/c1zsanitize/handlers.go new file mode 100644 index 000000000..fdd53a7f7 --- /dev/null +++ b/pkg/c1zsanitize/handlers.go @@ -0,0 +1,208 @@ +package c1zsanitize + +import ( + "google.golang.org/protobuf/proto" + "google.golang.org/protobuf/types/known/structpb" + + v2 "github.com/conductorone/baton-sdk/pb/c1/connector/v2" +) + +// sanitizeAssetRef rewrites the asset id while registering the +// original id so copyAssets fetches and rewrites the payload. +func (s *sanitizer) sanitizeAssetRef(in *v2.AssetRef, refs *assetRefSet) *v2.AssetRef { + if in == nil || in.GetId() == "" { + return nil + } + refs.add(in.GetId()) + return v2.AssetRef_builder{Id: s.id(in.GetId())}.Build() +} + +// sanitizeStruct recursively walks a google.protobuf.Struct, +// HMAC-ing string leaves. Keys are preserved (they're connector- +// schema field names, not tenant data). Numbers and booleans pass +// through — see investigation §3.3 for the rationale. +func (s *sanitizer) sanitizeStruct(in *structpb.Struct) *structpb.Struct { + if in == nil { + return nil + } + out := &structpb.Struct{Fields: make(map[string]*structpb.Value, len(in.GetFields()))} + for k, v := range in.GetFields() { + out.Fields[k] = s.sanitizeValue(v) + } + return out +} + +func (s *sanitizer) sanitizeValue(v *structpb.Value) *structpb.Value { + if v == nil { + return nil + } + switch kind := v.GetKind().(type) { + case *structpb.Value_StringValue: + return structpb.NewStringValue(s.id(kind.StringValue)) + case *structpb.Value_StructValue: + return structpb.NewStructValue(s.sanitizeStruct(kind.StructValue)) + case *structpb.Value_ListValue: + lv := kind.ListValue + items := make([]*structpb.Value, 0, len(lv.GetValues())) + for _, item := range lv.GetValues() { + items = append(items, s.sanitizeValue(item)) + } + return structpb.NewListValue(&structpb.ListValue{Values: items}) + default: + return v + } +} + +func handleUserTrait(s *sanitizer, msg proto.Message, refs *assetRefSet) proto.Message { + in := msg.(*v2.UserTrait) + emails := make([]*v2.UserTrait_Email, 0, len(in.GetEmails())) + for _, e := range in.GetEmails() { + emails = append(emails, v2.UserTrait_Email_builder{ + Address: sanitizeEmail(s.secret, s.domains, e.GetAddress()), + IsPrimary: e.GetIsPrimary(), + }.Build()) + } + loginAliases := make([]string, 0, len(in.GetLoginAliases())) + for _, a := range in.GetLoginAliases() { + loginAliases = append(loginAliases, s.id(a)) + } + employeeIDs := make([]string, 0, len(in.GetEmployeeIds())) + for _, eid := range in.GetEmployeeIds() { + employeeIDs = append(employeeIDs, s.id(eid)) + } + + out := v2.UserTrait_builder{ + Emails: emails, + Profile: s.sanitizeStruct(in.GetProfile()), + Icon: s.sanitizeAssetRef(in.GetIcon(), refs), + AccountType: in.GetAccountType(), + Login: s.id(in.GetLogin()), + LoginAliases: loginAliases, + EmployeeIds: employeeIDs, + }.Build() + + if in.HasStatus() { + st := in.GetStatus() + out.SetStatus(v2.UserTrait_Status_builder{ + Status: st.GetStatus(), + Details: s.id(st.GetDetails()), + }.Build()) + } + if in.HasCreatedAt() { + out.SetCreatedAt(s.shifter.shift(in.GetCreatedAt())) + } + if in.HasLastLogin() { + out.SetLastLogin(s.shifter.shift(in.GetLastLogin())) + } + if in.HasMfaStatus() { + out.SetMfaStatus(v2.UserTrait_MFAStatus_builder{ + MfaEnabled: in.GetMfaStatus().GetMfaEnabled(), + }.Build()) + } + if in.HasSsoStatus() { + out.SetSsoStatus(v2.UserTrait_SSOStatus_builder{ + SsoEnabled: in.GetSsoStatus().GetSsoEnabled(), + }.Build()) + } + if in.HasStructuredName() { + sn := in.GetStructuredName() + middle := make([]string, 0, len(sn.GetMiddleNames())) + for _, m := range sn.GetMiddleNames() { + middle = append(middle, s.id(m)) + } + out.SetStructuredName(v2.UserTrait_StructuredName_builder{ + GivenName: s.id(sn.GetGivenName()), + FamilyName: s.id(sn.GetFamilyName()), + MiddleNames: middle, + Prefix: s.id(sn.GetPrefix()), + Suffix: s.id(sn.GetSuffix()), + }.Build()) + } + return out +} + +func handleGroupTrait(s *sanitizer, msg proto.Message, refs *assetRefSet) proto.Message { + in := msg.(*v2.GroupTrait) + return v2.GroupTrait_builder{ + Icon: s.sanitizeAssetRef(in.GetIcon(), refs), + Profile: s.sanitizeStruct(in.GetProfile()), + }.Build() +} + +func handleAppTrait(s *sanitizer, msg proto.Message, refs *assetRefSet) proto.Message { + in := msg.(*v2.AppTrait) + helpURL := "" + if in.GetHelpUrl() != "" { + helpURL = "https://sanitized.example/help" + } + return v2.AppTrait_builder{ + HelpUrl: helpURL, + Icon: s.sanitizeAssetRef(in.GetIcon(), refs), + Logo: s.sanitizeAssetRef(in.GetLogo(), refs), + Profile: s.sanitizeStruct(in.GetProfile()), + Flags: in.GetFlags(), + }.Build() +} + +func handleRoleTrait(s *sanitizer, msg proto.Message, _ *assetRefSet) proto.Message { + in := msg.(*v2.RoleTrait) + out := v2.RoleTrait_builder{ + Profile: s.sanitizeStruct(in.GetProfile()), + }.Build() + if rsc := in.GetRoleScopeConditions(); rsc != nil { + conds := make([]*v2.RoleScopeCondition, 0, len(rsc.GetConditions())) + for _, c := range rsc.GetConditions() { + conds = append(conds, v2.RoleScopeCondition_builder{ + Expression: s.id(c.GetExpression()), + }.Build()) + } + out.SetRoleScopeConditions(v2.RoleScopeConditions_builder{ + Type: rsc.GetType(), + Conditions: conds, + }.Build()) + } + return out +} + +func handleSecretTrait(s *sanitizer, msg proto.Message, _ *assetRefSet) proto.Message { + in := msg.(*v2.SecretTrait) + out := v2.SecretTrait_builder{ + Profile: s.sanitizeStruct(in.GetProfile()), + CreatedById: s.transformResourceID(in.GetCreatedById()), + IdentityId: s.transformResourceID(in.GetIdentityId()), + }.Build() + if in.HasCreatedAt() { + out.SetCreatedAt(s.shifter.shift(in.GetCreatedAt())) + } + if in.HasExpiresAt() { + out.SetExpiresAt(s.shifter.shift(in.GetExpiresAt())) + } + if in.HasLastUsedAt() { + out.SetLastUsedAt(s.shifter.shift(in.GetLastUsedAt())) + } + return out +} + +func handleLicenseProfileTrait(s *sanitizer, msg proto.Message, _ *assetRefSet) proto.Message { + in := msg.(*v2.LicenseProfileTrait) + entIDs := make([]string, 0, len(in.GetEntitlementIds())) + for _, eid := range in.GetEntitlementIds() { + entIDs = append(entIDs, s.id(eid)) + } + return v2.LicenseProfileTrait_builder{ + LicenseName: in.GetLicenseName(), + PurchasedSeats: in.GetPurchasedSeats(), + ConsumedSeats: in.GetConsumedSeats(), + CostPerUnitInCents: in.GetCostPerUnitInCents(), + Currency: in.GetCurrency(), + EntitlementIds: entIDs, + }.Build() +} + +func handleScopeBindingTrait(s *sanitizer, msg proto.Message, _ *assetRefSet) proto.Message { + in := msg.(*v2.ScopeBindingTrait) + return v2.ScopeBindingTrait_builder{ + RoleId: s.transformResourceID(in.GetRoleId()), + ScopeResourceId: s.transformResourceID(in.GetScopeResourceId()), + }.Build() +} diff --git a/pkg/c1zsanitize/id.go b/pkg/c1zsanitize/id.go new file mode 100644 index 000000000..85965b18a --- /dev/null +++ b/pkg/c1zsanitize/id.go @@ -0,0 +1,37 @@ +package c1zsanitize + +import ( + "crypto/hmac" + "crypto/sha256" + "encoding/base32" +) + +// MinSecretBytes is the minimum length of a per-c1z secret. Anything +// shorter is rejected by Sanitize; in practice operators should use 32 +// random bytes from a CSPRNG. +const MinSecretBytes = 32 + +// idEncoding is base32 without padding so emitted identifiers stay +// stable-length and free of '=' which some downstream tools quote. +var idEncoding = base32.StdEncoding.WithPadding(base32.NoPadding) + +// idTruncationBytes is the number of HMAC output bytes that survive +// truncation. 12 bytes = 96 bits — well above the birthday bound for +// any plausible tenant size (1M users → ~10⁻¹⁰ collision probability). +const idTruncationBytes = 12 + +// SanitizeID returns a deterministic, irreversible transform of input +// under the per-c1z secret. Same input → same output within a c1z; +// different across c1zs whose secrets differ. +// +// Empty input returns empty output so callers can transform optional +// fields without checking presence first. +func SanitizeID(secret []byte, input string) string { + if input == "" { + return "" + } + h := hmac.New(sha256.New, secret) + h.Write([]byte(input)) + sum := h.Sum(nil) + return idEncoding.EncodeToString(sum[:idTruncationBytes]) +} diff --git a/pkg/c1zsanitize/id_test.go b/pkg/c1zsanitize/id_test.go new file mode 100644 index 000000000..be53c0da8 --- /dev/null +++ b/pkg/c1zsanitize/id_test.go @@ -0,0 +1,51 @@ +package c1zsanitize + +import ( + "strings" + "testing" +) + +func TestSanitizeIDDeterministic(t *testing.T) { + secret := bytes32("test-secret") + a := SanitizeID(secret, "user-123") + b := SanitizeID(secret, "user-123") + if a != b { + t.Fatalf("expected same input to yield same output; got %q vs %q", a, b) + } +} + +func TestSanitizeIDDistinctInputsDistinctOutputs(t *testing.T) { + secret := bytes32("test-secret") + a := SanitizeID(secret, "user-123") + b := SanitizeID(secret, "user-456") + if a == b { + t.Fatalf("expected distinct inputs to yield distinct outputs; both %q", a) + } +} + +func TestSanitizeIDDifferentSecrets(t *testing.T) { + a := SanitizeID(bytes32("secret-a"), "user-123") + b := SanitizeID(bytes32("secret-b"), "user-123") + if a == b { + t.Fatalf("expected different secrets to yield different outputs; both %q", a) + } +} + +func TestSanitizeIDEmptyInputEmptyOutput(t *testing.T) { + if got := SanitizeID(bytes32("s"), ""); got != "" { + t.Fatalf("expected empty output for empty input, got %q", got) + } +} + +func TestSanitizeIDNoPadding(t *testing.T) { + got := SanitizeID(bytes32("s"), "anything") + if strings.ContainsRune(got, '=') { + t.Fatalf("expected unpadded base32, got %q", got) + } +} + +func bytes32(seed string) []byte { + b := make([]byte, 32) + copy(b, seed) + return b +} diff --git a/pkg/c1zsanitize/sanitize.go b/pkg/c1zsanitize/sanitize.go new file mode 100644 index 000000000..cfed0af5f --- /dev/null +++ b/pkg/c1zsanitize/sanitize.go @@ -0,0 +1,216 @@ +// Package c1zsanitize transforms a real .c1z snapshot into an +// identity-stripped copy whose graph topology, cardinalities, and +// annotation structure are preserved. The output is suitable for +// shipping to internal development environments where the original +// customer data must not appear. +// +// The whole transform is driven by a single per-c1z HMAC-SHA256 +// secret. Same input → same output within one c1z so cross-references +// stay coherent; different across c1zs whose secrets differ so an +// attacker holding multiple sanitized outputs cannot correlate them. +// +// v0.1 reads and writes the v1/v2 sqlite-zstd .c1z format via +// connectorstore.Reader / Writer. v3 c1z3 output will land in v0.2 +// once the storage-engine-v4 PR stack merges. +package c1zsanitize + +import ( + "context" + "crypto/hmac" + "crypto/sha256" + "errors" + "fmt" + "hash" + "time" + + "github.com/grpc-ecosystem/go-grpc-middleware/logging/zap/ctxzap" + "go.uber.org/zap" + "google.golang.org/protobuf/types/known/anypb" + + c1zpb "github.com/conductorone/baton-sdk/pb/c1/c1z/v1" + reader_v2 "github.com/conductorone/baton-sdk/pb/c1/reader/v2" + "github.com/conductorone/baton-sdk/pkg/annotations" + "github.com/conductorone/baton-sdk/pkg/connectorstore" +) + +// Options configures a sanitization run. +type Options struct { + // Secret is the per-c1z HMAC key. Must be at least MinSecretBytes. + // The operator chooses whether to archive or discard it; the + // sanitizer never persists it on its own. + Secret []byte + + // TimestampAnchor is the wall-clock value the newest timestamp in + // the source c1z lands on. All other timestamps shift by the same + // delta so relative deltas are preserved. Defaults to time.Now() + // when zero. + TimestampAnchor time.Time + + // DropUnknownAnnotations controls behavior when an annotation's + // Any type URL is not in the handler registry. When true (the + // default), unknown annotations are dropped and a log line names + // the type URL. When false, unknown annotations pass through + // unchanged — convenient for development against new annotation + // types, dangerous on real customer data. + DropUnknownAnnotations bool +} + +// Sanitize copies records from src to dst, transforming identifiers, +// names, free text, emails, and timestamps under the per-c1z secret. +// One destination sync is opened per source sync; parent_sync_id +// linkage is preserved via a srcSyncID → dstSyncID map maintained for +// the duration of the call. +func Sanitize(ctx context.Context, src connectorstore.Reader, dst connectorstore.Writer, opts Options) error { + if src == nil { + return errors.New("c1zsanitize: src reader is nil") + } + if dst == nil { + return errors.New("c1zsanitize: dst writer is nil") + } + if len(opts.Secret) < MinSecretBytes { + return fmt.Errorf("c1zsanitize: secret too short: got %d bytes, want at least %d", len(opts.Secret), MinSecretBytes) + } + + anchor := opts.TimestampAnchor + if anchor.IsZero() { + anchor = time.Now().UTC() + } + + srcSyncs, err := listAllSyncs(ctx, src) + if err != nil { + return fmt.Errorf("c1zsanitize: list source syncs: %w", err) + } + + s := &sanitizer{ + secret: opts.Secret, + idHmac: hmac.New(sha256.New, opts.Secret), + domains: newDomainMap(), + shifter: newTimestampShifter(anchor, findTMax(srcSyncs)), + dropUnknownAnnotations: opts.DropUnknownAnnotations, + log: ctxzap.Extract(ctx), + handlers: defaultAnnotationHandlers(), + syncIDMap: map[string]string{}, + } + + for _, sr := range srcSyncs { + if err := s.sanitizeSync(ctx, src, dst, sr); err != nil { + return fmt.Errorf("c1zsanitize: sanitize sync %s: %w", sr.GetId(), err) + } + } + return nil +} + +type sanitizer struct { + secret []byte + idHmac hash.Hash + domains *domainMap + shifter *timestampShifter + dropUnknownAnnotations bool + log *zap.Logger + handlers map[string]annotationHandler + syncIDMap map[string]string +} + +// id is the per-sanitizer hot path. SanitizeID stays as the +// allocation-y reference implementation; this one reuses a single +// hmac.Hash so the SHA-256 key schedule isn't redone every call. +// Sanitize runs single-threaded, so no locking is needed. +func (s *sanitizer) id(input string) string { + if input == "" { + return "" + } + s.idHmac.Reset() + s.idHmac.Write([]byte(input)) + sum := s.idHmac.Sum(nil) + return idEncoding.EncodeToString(sum[:idTruncationBytes]) +} + +func (s *sanitizer) sanitizeSync(ctx context.Context, src connectorstore.Reader, dst connectorstore.Writer, sr *reader_v2.SyncRun) error { + srcSyncID := sr.GetId() + syncType := connectorstore.SyncType(sr.GetSyncType()) + if syncType == connectorstore.SyncTypeAny || syncType == "" { + syncType = connectorstore.SyncTypeFull + } + + parentDst := "" + if parentSrc := sr.GetParentSyncId(); parentSrc != "" { + parentDst = s.syncIDMap[parentSrc] + } + + dstSyncID, err := dst.StartNewSync(ctx, syncType, parentDst) + if err != nil { + return fmt.Errorf("start dst sync: %w", err) + } + s.syncIDMap[srcSyncID] = dstSyncID + + assetRefs := newAssetRefSet() + + if err := s.copyResourceTypes(ctx, src, dst, srcSyncID, assetRefs); err != nil { + return err + } + if err := s.copyResources(ctx, src, dst, srcSyncID, assetRefs); err != nil { + return err + } + if err := s.copyEntitlements(ctx, src, dst, srcSyncID, assetRefs); err != nil { + return err + } + if err := s.copyGrants(ctx, src, dst, srcSyncID, assetRefs); err != nil { + return err + } + if err := s.copyAssets(ctx, src, dst, assetRefs); err != nil { + return err + } + + if err := dst.EndSync(ctx); err != nil { + return fmt.Errorf("end dst sync: %w", err) + } + return nil +} + +// listAllSyncs paginates the source SyncsReaderService and returns +// every sync run the source can see. The SQLite-backed C1File emits +// in id-asc order which matches insertion order, which is parent- +// before-child for any well-formed sync chain. +func listAllSyncs(ctx context.Context, src connectorstore.Reader) ([]*reader_v2.SyncRun, error) { + var out []*reader_v2.SyncRun + pageToken := "" + for { + req := reader_v2.SyncsReaderServiceListSyncsRequest_builder{ + PageToken: pageToken, + }.Build() + resp, err := src.ListSyncs(ctx, req) + if err != nil { + return nil, err + } + out = append(out, resp.GetSyncs()...) + if resp.GetNextPageToken() == "" { + return out, nil + } + pageToken = resp.GetNextPageToken() + } +} + +func findTMax(syncs []*reader_v2.SyncRun) time.Time { + var tMax time.Time + for _, sr := range syncs { + if sr.HasStartedAt() { + if t := sr.GetStartedAt().AsTime(); t.After(tMax) { + tMax = t + } + } + if sr.HasEndedAt() { + if t := sr.GetEndedAt().AsTime(); t.After(tMax) { + tMax = t + } + } + } + return tMax +} + +// syncIDAnnotations returns the annotation slice that scopes a list +// request to a specific source sync. The reader resolves the sync ID +// from a SyncDetails annotation; see pkg/dotc1z/sql_helpers.go. +func syncIDAnnotations(srcSyncID string) []*anypb.Any { + a := annotations.New(c1zpb.SyncDetails_builder{Id: srcSyncID}.Build()) + return a +} diff --git a/pkg/c1zsanitize/sanitize_test.go b/pkg/c1zsanitize/sanitize_test.go new file mode 100644 index 000000000..1930a48f9 --- /dev/null +++ b/pkg/c1zsanitize/sanitize_test.go @@ -0,0 +1,519 @@ +package c1zsanitize + +import ( + "context" + "path/filepath" + "testing" + "time" + + "github.com/stretchr/testify/require" + "google.golang.org/protobuf/proto" + "google.golang.org/protobuf/types/known/anypb" + "google.golang.org/protobuf/types/known/structpb" + "google.golang.org/protobuf/types/known/timestamppb" + + v2 "github.com/conductorone/baton-sdk/pb/c1/connector/v2" + "github.com/conductorone/baton-sdk/pkg/connectorstore" + "github.com/conductorone/baton-sdk/pkg/dotc1z" +) + +// Cross-reference integrity invariant: for every original identifier +// that appears N times in src, the sanitized identifier +// sanitize_id(id) appears exactly N times in dst, and no other +// sanitized identifier appears at any of those positions. +// +// This is the load-bearing test from §6.2 step 4 of the investigation. +func TestSanitizeCrossReferenceIntegrity(t *testing.T) { + ctx := context.Background() + tmp := t.TempDir() + srcPath := filepath.Join(tmp, "src.c1z") + dstPath := filepath.Join(tmp, "dst.c1z") + secret := bytes32("xref-test") + + buildFixture(t, ctx, srcPath) + + src := mustOpen(t, ctx, srcPath, true) + defer src.Close(ctx) + dst := mustOpen(t, ctx, dstPath, false) + + require.NoError(t, Sanitize(ctx, src, dst, Options{ + Secret: secret, + DropUnknownAnnotations: true, + })) + require.NoError(t, dst.Close(ctx)) + + dstRO := mustOpen(t, ctx, dstPath, true) + defer dstRO.Close(ctx) + + srcRecords := collectRecords(t, ctx, src) + dstRecords := collectRecords(t, ctx, dstRO) + + require.Equal(t, len(srcRecords.resourceTypes), len(dstRecords.resourceTypes), "resource-type counts must match") + require.Equal(t, len(srcRecords.resources), len(dstRecords.resources), "resource counts must match") + require.Equal(t, len(srcRecords.entitlements), len(dstRecords.entitlements), "entitlement counts must match") + require.Equal(t, len(srcRecords.grants), len(dstRecords.grants), "grant counts must match") + + for srcID, srcCount := range srcRecords.idOccurrences { + dstID := SanitizeID(secret, srcID) + require.Equal(t, srcCount, dstRecords.idOccurrences[dstID], + "id %q (sanitized %q) occurs %d times in src but %d in dst", + srcID, dstID, srcCount, dstRecords.idOccurrences[dstID]) + } + + srcResourceTypeIDs := map[string]struct{}{} + for _, rt := range srcRecords.resourceTypes { + srcResourceTypeIDs[rt.GetId()] = struct{}{} + } + for _, rt := range dstRecords.resourceTypes { + require.Contains(t, srcResourceTypeIDs, rt.GetId(), "resource_type id should pass through unchanged: %q", rt.GetId()) + } +} + +func TestSanitizeGraphIntegrity(t *testing.T) { + ctx := context.Background() + tmp := t.TempDir() + srcPath := filepath.Join(tmp, "src.c1z") + dstPath := filepath.Join(tmp, "dst.c1z") + secret := bytes32("graph-test") + + buildFixture(t, ctx, srcPath) + + src := mustOpen(t, ctx, srcPath, true) + defer src.Close(ctx) + dst := mustOpen(t, ctx, dstPath, false) + require.NoError(t, Sanitize(ctx, src, dst, Options{Secret: secret, DropUnknownAnnotations: true})) + require.NoError(t, dst.Close(ctx)) + + dstRO := mustOpen(t, ctx, dstPath, true) + defer dstRO.Close(ctx) + + rec := collectRecords(t, ctx, dstRO) + + resourceKeys := map[string]struct{}{} + for _, r := range rec.resources { + resourceKeys[resourceKey(r.GetId())] = struct{}{} + } + entitlementByID := map[string]*v2.Entitlement{} + for _, e := range rec.entitlements { + entitlementByID[e.GetId()] = e + } + + for _, r := range rec.resources { + if r.GetParentResourceId() != nil && r.GetParentResourceId().GetResource() != "" { + require.Contains(t, resourceKeys, resourceKey(r.GetParentResourceId()), + "resource %q's parent_resource_id must resolve", r.GetId().GetResource()) + } + } + + for _, e := range rec.entitlements { + require.NotNil(t, e.GetResource(), "entitlement %q must have a resource", e.GetId()) + require.Contains(t, resourceKeys, resourceKey(e.GetResource().GetId()), + "entitlement %q's resource must resolve in dst", e.GetId()) + } + + for _, g := range rec.grants { + require.NotNil(t, g.GetEntitlement(), "grant %q must have entitlement", g.GetId()) + require.NotNil(t, g.GetPrincipal(), "grant %q must have principal", g.GetId()) + require.Contains(t, entitlementByID, g.GetEntitlement().GetId(), + "grant %q's entitlement must resolve in dst", g.GetId()) + require.Contains(t, resourceKeys, resourceKey(g.GetPrincipal().GetId()), + "grant %q's principal must resolve in dst", g.GetId()) + } +} + +func TestSanitizeRejectsShortSecret(t *testing.T) { + ctx := context.Background() + err := Sanitize(ctx, nil, nil, Options{Secret: []byte("too short")}) + require.Error(t, err) +} + +func TestSanitizeAnnotationDropsUnknownByDefault(t *testing.T) { + ctx := context.Background() + tmp := t.TempDir() + srcPath := filepath.Join(tmp, "src.c1z") + dstPath := filepath.Join(tmp, "dst.c1z") + secret := bytes32("anno-drop") + + src := mustOpen(t, ctx, srcPath, false) + _, err := src.StartNewSync(ctx, connectorstore.SyncTypeFull, "") + require.NoError(t, err) + + require.NoError(t, src.PutResourceTypes(ctx, v2.ResourceType_builder{ + Id: "user", + DisplayName: "User", + Traits: []v2.ResourceType_Trait{v2.ResourceType_TRAIT_USER}, + }.Build())) + + unknownAnno := &anypb.Any{ + TypeUrl: "type.googleapis.com/c1.connector.v2.NotARealAnnotation", + Value: []byte{0x08, 0x01}, + } + + require.NoError(t, src.PutResources(ctx, v2.Resource_builder{ + Id: v2.ResourceId_builder{ResourceType: "user", Resource: "u1"}.Build(), + Annotations: []*anypb.Any{ + unknownAnno, + mustAny(t, v2.UserTrait_builder{Login: "alice"}.Build()), + }, + }.Build())) + + require.NoError(t, src.EndSync(ctx)) + require.NoError(t, src.Close(ctx)) + + srcRO := mustOpen(t, ctx, srcPath, true) + defer srcRO.Close(ctx) + dst := mustOpen(t, ctx, dstPath, false) + require.NoError(t, Sanitize(ctx, srcRO, dst, Options{Secret: secret, DropUnknownAnnotations: true})) + require.NoError(t, dst.Close(ctx)) + + dstRO := mustOpen(t, ctx, dstPath, true) + defer dstRO.Close(ctx) + + rec := collectRecords(t, ctx, dstRO) + require.Len(t, rec.resources, 1) + annos := rec.resources[0].GetAnnotations() + for _, a := range annos { + require.NotEqual(t, "type.googleapis.com/c1.connector.v2.NotARealAnnotation", a.GetTypeUrl(), + "unknown annotation type should have been dropped") + } + require.NotEmpty(t, annos, "known annotations should survive") +} + +func mustOpen(t *testing.T, ctx context.Context, path string, readOnly bool) *dotc1z.C1File { + t.Helper() + opts := []dotc1z.C1ZOption{} + if readOnly { + opts = append(opts, dotc1z.WithReadOnly(true)) + } + f, err := dotc1z.NewC1ZFile(ctx, path, opts...) + require.NoError(t, err) + return f +} + +func mustAny(t *testing.T, m proto.Message) *anypb.Any { + t.Helper() + a, err := anypb.New(m) + require.NoError(t, err) + return a +} + +func resourceKey(id *v2.ResourceId) string { + if id == nil { + return "" + } + return id.GetResourceType() + "/" + id.GetResource() +} + +type collected struct { + resourceTypes []*v2.ResourceType + resources []*v2.Resource + entitlements []*v2.Entitlement + grants []*v2.Grant + idOccurrences map[string]int +} + +func collectRecords(t *testing.T, ctx context.Context, r connectorstore.Reader) *collected { + t.Helper() + out := &collected{idOccurrences: map[string]int{}} + + syncs, err := listAllSyncs(ctx, r) + require.NoError(t, err) + + count := func(id string) { + if id == "" { + return + } + out.idOccurrences[id]++ + } + + for _, sr := range syncs { + annos := syncIDAnnotations(sr.GetId()) + + rtPage := "" + for { + req := v2.ResourceTypesServiceListResourceTypesRequest_builder{ + PageSize: 1000, + PageToken: rtPage, + Annotations: annos, + }.Build() + resp, err := r.ListResourceTypes(ctx, req) + require.NoError(t, err) + out.resourceTypes = append(out.resourceTypes, resp.GetList()...) + if resp.GetNextPageToken() == "" { + break + } + rtPage = resp.GetNextPageToken() + } + + rPage := "" + for { + req := v2.ResourcesServiceListResourcesRequest_builder{ + PageSize: 1000, + PageToken: rPage, + Annotations: annos, + }.Build() + resp, err := r.ListResources(ctx, req) + require.NoError(t, err) + for _, res := range resp.GetList() { + out.resources = append(out.resources, res) + count(res.GetId().GetResource()) + if res.GetParentResourceId() != nil { + count(res.GetParentResourceId().GetResource()) + } + } + if resp.GetNextPageToken() == "" { + break + } + rPage = resp.GetNextPageToken() + } + + ePage := "" + for { + req := v2.EntitlementsServiceListEntitlementsRequest_builder{ + PageSize: 1000, + PageToken: ePage, + Annotations: annos, + }.Build() + resp, err := r.ListEntitlements(ctx, req) + require.NoError(t, err) + for _, ent := range resp.GetList() { + out.entitlements = append(out.entitlements, ent) + count(ent.GetId()) + if ent.GetResource() != nil { + count(ent.GetResource().GetId().GetResource()) + } + } + if resp.GetNextPageToken() == "" { + break + } + ePage = resp.GetNextPageToken() + } + + gPage := "" + for { + req := v2.GrantsServiceListGrantsRequest_builder{ + PageSize: 1000, + PageToken: gPage, + Annotations: annos, + }.Build() + resp, err := r.ListGrants(ctx, req) + require.NoError(t, err) + for _, g := range resp.GetList() { + out.grants = append(out.grants, g) + count(g.GetId()) + if g.GetEntitlement() != nil { + count(g.GetEntitlement().GetId()) + } + if g.GetPrincipal() != nil { + count(g.GetPrincipal().GetId().GetResource()) + } + for k := range g.GetSources().GetSources() { + count(k) + } + } + if resp.GetNextPageToken() == "" { + break + } + gPage = resp.GetNextPageToken() + } + } + return out +} + +// buildFixture writes a small but representative c1z to path with +// one sync, multiple resource types, and one annotation per trait +// handler. Closes the C1File before returning so the bytes hit disk. +func buildFixture(t *testing.T, ctx context.Context, path string) { + t.Helper() + f, err := dotc1z.NewC1ZFile(ctx, path) + require.NoError(t, err) + syncID, err := f.StartNewSync(ctx, connectorstore.SyncTypeFull, "") + require.NoError(t, err) + _ = syncID + + resourceTypes := []*v2.ResourceType{ + v2.ResourceType_builder{Id: "user", DisplayName: "User", Traits: []v2.ResourceType_Trait{v2.ResourceType_TRAIT_USER}}.Build(), + v2.ResourceType_builder{Id: "group", DisplayName: "Group", Traits: []v2.ResourceType_Trait{v2.ResourceType_TRAIT_GROUP}}.Build(), + v2.ResourceType_builder{Id: "app", DisplayName: "App", Traits: []v2.ResourceType_Trait{v2.ResourceType_TRAIT_APP}}.Build(), + v2.ResourceType_builder{Id: "role", DisplayName: "Role", Traits: []v2.ResourceType_Trait{v2.ResourceType_TRAIT_ROLE}}.Build(), + v2.ResourceType_builder{Id: "secret", DisplayName: "Secret", Traits: []v2.ResourceType_Trait{v2.ResourceType_TRAIT_SECRET}}.Build(), + v2.ResourceType_builder{Id: "license", DisplayName: "License", Traits: []v2.ResourceType_Trait{v2.ResourceType_TRAIT_LICENSE_PROFILE}}.Build(), + v2.ResourceType_builder{Id: "scope", DisplayName: "Scope", Traits: []v2.ResourceType_Trait{v2.ResourceType_TRAIT_SCOPE_BINDING}}.Build(), + } + require.NoError(t, f.PutResourceTypes(ctx, resourceTypes...)) + + userTrait := v2.UserTrait_builder{ + Emails: []*v2.UserTrait_Email{ + v2.UserTrait_Email_builder{Address: "alice@acme.com", IsPrimary: true}.Build(), + v2.UserTrait_Email_builder{Address: "alice.work@acme.com"}.Build(), + }, + Login: "alice", + LoginAliases: []string{"alice.smith", "asmith"}, + EmployeeIds: []string{"E-1234"}, + AccountType: v2.UserTrait_ACCOUNT_TYPE_HUMAN, + Profile: &structpb.Struct{Fields: map[string]*structpb.Value{ + "department": structpb.NewStringValue("Engineering"), + "head_count": structpb.NewNumberValue(42), + }}, + Icon: v2.AssetRef_builder{Id: "icon-alice"}.Build(), + CreatedAt: timestamppb.New(time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC)), + LastLogin: timestamppb.New(time.Date(2024, 5, 1, 0, 0, 0, 0, time.UTC)), + StructuredName: v2.UserTrait_StructuredName_builder{ + GivenName: "Alice", FamilyName: "Smith", + }.Build(), + }.Build() + userTrait.SetStatus(v2.UserTrait_Status_builder{ + Status: v2.UserTrait_Status_STATUS_ENABLED, + Details: "active", + }.Build()) + + groupTrait := v2.GroupTrait_builder{ + Icon: v2.AssetRef_builder{Id: "icon-eng"}.Build(), + Profile: &structpb.Struct{Fields: map[string]*structpb.Value{"slug": structpb.NewStringValue("eng-team")}}, + }.Build() + + appTrait := v2.AppTrait_builder{ + HelpUrl: "https://acme.example.com/help", + Icon: v2.AssetRef_builder{Id: "icon-app"}.Build(), + Profile: &structpb.Struct{Fields: map[string]*structpb.Value{"vendor": structpb.NewStringValue("acme")}}, + Flags: []v2.AppTrait_AppFlag{v2.AppTrait_APP_FLAG_OIDC}, + }.Build() + + roleTrait := v2.RoleTrait_builder{ + Profile: &structpb.Struct{Fields: map[string]*structpb.Value{"name": structpb.NewStringValue("admin")}}, + RoleScopeConditions: v2.RoleScopeConditions_builder{ + Type: "expr", + Conditions: []*v2.RoleScopeCondition{v2.RoleScopeCondition_builder{Expression: "resource.id == 'r1'"}.Build()}, + }.Build(), + }.Build() + + secretTrait := v2.SecretTrait_builder{ + Profile: &structpb.Struct{Fields: map[string]*structpb.Value{"kid": structpb.NewStringValue("kid-1")}}, + CreatedById: v2.ResourceId_builder{ResourceType: "user", Resource: "alice"}.Build(), + IdentityId: v2.ResourceId_builder{ResourceType: "user", Resource: "alice"}.Build(), + CreatedAt: timestamppb.New(time.Date(2024, 2, 1, 0, 0, 0, 0, time.UTC)), + ExpiresAt: timestamppb.New(time.Date(2025, 2, 1, 0, 0, 0, 0, time.UTC)), + LastUsedAt: timestamppb.New(time.Date(2024, 4, 1, 0, 0, 0, 0, time.UTC)), + }.Build() + + licenseTrait := v2.LicenseProfileTrait_builder{ + LicenseName: "E3", + PurchasedSeats: 1000, + ConsumedSeats: 650, + CostPerUnitInCents: 1200, + Currency: "USD", + EntitlementIds: []string{"ent-license-e3"}, + }.Build() + + scopeTrait := v2.ScopeBindingTrait_builder{ + RoleId: v2.ResourceId_builder{ResourceType: "role", Resource: "admin"}.Build(), + ScopeResourceId: v2.ResourceId_builder{ResourceType: "app", Resource: "app-prod"}.Build(), + }.Build() + + resources := []*v2.Resource{ + v2.Resource_builder{ + Id: v2.ResourceId_builder{ResourceType: "user", Resource: "alice"}.Build(), + DisplayName: "Alice Smith", + Annotations: []*anypb.Any{mustAny(t, userTrait)}, + }.Build(), + v2.Resource_builder{ + Id: v2.ResourceId_builder{ResourceType: "user", Resource: "bob"}.Build(), + DisplayName: "Bob Jones", + Annotations: []*anypb.Any{mustAny(t, v2.UserTrait_builder{Login: "bob"}.Build())}, + }.Build(), + v2.Resource_builder{ + Id: v2.ResourceId_builder{ResourceType: "group", Resource: "engineering"}.Build(), + DisplayName: "Engineering", + ParentResourceId: v2.ResourceId_builder{ResourceType: "group", Resource: "all"}.Build(), + Annotations: []*anypb.Any{mustAny(t, groupTrait)}, + }.Build(), + v2.Resource_builder{ + Id: v2.ResourceId_builder{ResourceType: "group", Resource: "all"}.Build(), + DisplayName: "All", + Annotations: []*anypb.Any{mustAny(t, v2.GroupTrait_builder{}.Build())}, + }.Build(), + v2.Resource_builder{ + Id: v2.ResourceId_builder{ResourceType: "app", Resource: "app-prod"}.Build(), + DisplayName: "Production App", + Annotations: []*anypb.Any{mustAny(t, appTrait)}, + }.Build(), + v2.Resource_builder{ + Id: v2.ResourceId_builder{ResourceType: "role", Resource: "admin"}.Build(), + DisplayName: "Administrator", + Annotations: []*anypb.Any{mustAny(t, roleTrait)}, + }.Build(), + v2.Resource_builder{ + Id: v2.ResourceId_builder{ResourceType: "secret", Resource: "tok-1"}.Build(), + DisplayName: "API Token 1", + Annotations: []*anypb.Any{mustAny(t, secretTrait)}, + }.Build(), + v2.Resource_builder{ + Id: v2.ResourceId_builder{ResourceType: "license", Resource: "lic-e3"}.Build(), + DisplayName: "E3 License", + Annotations: []*anypb.Any{mustAny(t, licenseTrait)}, + }.Build(), + v2.Resource_builder{ + Id: v2.ResourceId_builder{ResourceType: "scope", Resource: "scope-1"}.Build(), + DisplayName: "Scope 1", + Annotations: []*anypb.Any{mustAny(t, scopeTrait)}, + }.Build(), + } + require.NoError(t, f.PutResources(ctx, resources...)) + + entitlements := []*v2.Entitlement{ + v2.Entitlement_builder{ + Id: "ent-app-member", + DisplayName: "Member of app-prod", + Description: "Production app membership", + Resource: resources[4], + Purpose: v2.Entitlement_PURPOSE_VALUE_ASSIGNMENT, + }.Build(), + v2.Entitlement_builder{ + Id: "ent-eng-member", + DisplayName: "Member of engineering", + Resource: resources[2], + Purpose: v2.Entitlement_PURPOSE_VALUE_ASSIGNMENT, + }.Build(), + v2.Entitlement_builder{ + Id: "ent-license-e3", + DisplayName: "Holds E3", + Resource: resources[7], + Purpose: v2.Entitlement_PURPOSE_VALUE_PERMISSION, + }.Build(), + } + require.NoError(t, f.PutEntitlements(ctx, entitlements...)) + + grants := []*v2.Grant{ + v2.Grant_builder{ + Id: "grant-alice-app", + Entitlement: entitlements[0], + Principal: resources[0], + }.Build(), + v2.Grant_builder{ + Id: "grant-alice-eng", + Entitlement: entitlements[1], + Principal: resources[0], + Sources: v2.GrantSources_builder{Sources: map[string]*v2.GrantSources_GrantSource{ + "ent-app-member": v2.GrantSources_GrantSource_builder{IsDirect: false}.Build(), + }}.Build(), + }.Build(), + v2.Grant_builder{ + Id: "grant-bob-app", + Entitlement: entitlements[0], + Principal: resources[1], + }.Build(), + v2.Grant_builder{ + Id: "grant-alice-license", + Entitlement: entitlements[2], + Principal: resources[0], + }.Build(), + } + require.NoError(t, f.PutGrants(ctx, grants...)) + + require.NoError(t, f.PutAsset(ctx, v2.AssetRef_builder{Id: "icon-alice"}.Build(), "image/png", []byte{0x89, 0x50, 0x4e, 0x47})) + require.NoError(t, f.PutAsset(ctx, v2.AssetRef_builder{Id: "icon-eng"}.Build(), "image/png", []byte{0x89, 0x50, 0x4e, 0x47})) + require.NoError(t, f.PutAsset(ctx, v2.AssetRef_builder{Id: "icon-app"}.Build(), "image/png", []byte{0x89, 0x50, 0x4e, 0x47})) + + require.NoError(t, f.EndSync(ctx)) + require.NoError(t, f.Close(ctx)) +} diff --git a/pkg/c1zsanitize/timestamp.go b/pkg/c1zsanitize/timestamp.go new file mode 100644 index 000000000..3e9f61b49 --- /dev/null +++ b/pkg/c1zsanitize/timestamp.go @@ -0,0 +1,34 @@ +package c1zsanitize + +import ( + "time" + + "google.golang.org/protobuf/types/known/timestamppb" +) + +// timestampShifter applies a constant offset Δ = anchor - tMax to +// every observed timestamp so the newest input timestamp lands on +// the anchor and all relative deltas are preserved exactly. +type timestampShifter struct { + delta time.Duration +} + +func newTimestampShifter(anchor, tMax time.Time) *timestampShifter { + if tMax.IsZero() { + return ×tampShifter{} + } + return ×tampShifter{delta: anchor.Sub(tMax)} +} + +// shift returns a new timestamp shifted by Δ. nil and zero +// timestamps map to themselves so "field unset" semantics survive. +func (s *timestampShifter) shift(ts *timestamppb.Timestamp) *timestamppb.Timestamp { + if ts == nil { + return nil + } + t := ts.AsTime() + if t.IsZero() { + return ts + } + return timestamppb.New(t.Add(s.delta)) +} diff --git a/pkg/c1zsanitize/timestamp_test.go b/pkg/c1zsanitize/timestamp_test.go new file mode 100644 index 000000000..5b3bc3e41 --- /dev/null +++ b/pkg/c1zsanitize/timestamp_test.go @@ -0,0 +1,38 @@ +package c1zsanitize + +import ( + "testing" + "time" + + "google.golang.org/protobuf/types/known/timestamppb" +) + +func TestShifterAnchorMapsTMaxToAnchor(t *testing.T) { + tMax := time.Date(2024, 6, 1, 12, 0, 0, 0, time.UTC) + anchor := time.Date(2026, 1, 1, 0, 0, 0, 0, time.UTC) + s := newTimestampShifter(anchor, tMax) + got := s.shift(timestamppb.New(tMax)).AsTime() + if !got.Equal(anchor) { + t.Fatalf("expected tMax to map to anchor; got %s want %s", got, anchor) + } +} + +func TestShifterPreservesDeltas(t *testing.T) { + tMax := time.Date(2024, 6, 1, 12, 0, 0, 0, time.UTC) + anchor := time.Date(2026, 1, 1, 0, 0, 0, 0, time.UTC) + s := newTimestampShifter(anchor, tMax) + t1 := time.Date(2024, 5, 1, 12, 0, 0, 0, time.UTC) + t2 := time.Date(2024, 5, 15, 12, 0, 0, 0, time.UTC) + g1 := s.shift(timestamppb.New(t1)).AsTime() + g2 := s.shift(timestamppb.New(t2)).AsTime() + if g2.Sub(g1) != t2.Sub(t1) { + t.Fatalf("expected deltas preserved; got %s want %s", g2.Sub(g1), t2.Sub(t1)) + } +} + +func TestShifterNilNil(t *testing.T) { + s := newTimestampShifter(time.Now(), time.Date(2024, 6, 1, 0, 0, 0, 0, time.UTC)) + if got := s.shift(nil); got != nil { + t.Fatalf("expected nil input -> nil output") + } +} diff --git a/pkg/c1zsanitize/transform.go b/pkg/c1zsanitize/transform.go new file mode 100644 index 000000000..a6dfc1d03 --- /dev/null +++ b/pkg/c1zsanitize/transform.go @@ -0,0 +1,260 @@ +package c1zsanitize + +import ( + "context" + "fmt" + + v2 "github.com/conductorone/baton-sdk/pb/c1/connector/v2" + "github.com/conductorone/baton-sdk/pkg/connectorstore" +) + +const listPageSize = 1000 + +func (s *sanitizer) copyResourceTypes( + ctx context.Context, + src connectorstore.Reader, + dst connectorstore.Writer, + srcSyncID string, + refs *assetRefSet, +) error { + pageToken := "" + for { + req := v2.ResourceTypesServiceListResourceTypesRequest_builder{ + PageSize: listPageSize, + PageToken: pageToken, + Annotations: syncIDAnnotations(srcSyncID), + }.Build() + resp, err := src.ListResourceTypes(ctx, req) + if err != nil { + return fmt.Errorf("list resource types: %w", err) + } + out := make([]*v2.ResourceType, 0, len(resp.GetList())) + for _, rt := range resp.GetList() { + out = append(out, s.transformResourceType(rt, refs)) + } + if len(out) > 0 { + if err := dst.PutResourceTypes(ctx, out...); err != nil { + return fmt.Errorf("put resource types: %w", err) + } + } + if resp.GetNextPageToken() == "" { + return nil + } + pageToken = resp.GetNextPageToken() + } +} + +func (s *sanitizer) copyResources( + ctx context.Context, + src connectorstore.Reader, + dst connectorstore.Writer, + srcSyncID string, + refs *assetRefSet, +) error { + pageToken := "" + for { + req := v2.ResourcesServiceListResourcesRequest_builder{ + PageSize: listPageSize, + PageToken: pageToken, + Annotations: syncIDAnnotations(srcSyncID), + }.Build() + resp, err := src.ListResources(ctx, req) + if err != nil { + return fmt.Errorf("list resources: %w", err) + } + out := make([]*v2.Resource, 0, len(resp.GetList())) + for _, r := range resp.GetList() { + out = append(out, s.transformResource(r, refs)) + } + if len(out) > 0 { + if err := dst.PutResources(ctx, out...); err != nil { + return fmt.Errorf("put resources: %w", err) + } + } + if resp.GetNextPageToken() == "" { + return nil + } + pageToken = resp.GetNextPageToken() + } +} + +func (s *sanitizer) copyEntitlements( + ctx context.Context, + src connectorstore.Reader, + dst connectorstore.Writer, + srcSyncID string, + refs *assetRefSet, +) error { + pageToken := "" + for { + req := v2.EntitlementsServiceListEntitlementsRequest_builder{ + PageSize: listPageSize, + PageToken: pageToken, + Annotations: syncIDAnnotations(srcSyncID), + }.Build() + resp, err := src.ListEntitlements(ctx, req) + if err != nil { + return fmt.Errorf("list entitlements: %w", err) + } + out := make([]*v2.Entitlement, 0, len(resp.GetList())) + for _, e := range resp.GetList() { + out = append(out, s.transformEntitlement(e, refs)) + } + if len(out) > 0 { + if err := dst.PutEntitlements(ctx, out...); err != nil { + return fmt.Errorf("put entitlements: %w", err) + } + } + if resp.GetNextPageToken() == "" { + return nil + } + pageToken = resp.GetNextPageToken() + } +} + +func (s *sanitizer) copyGrants( + ctx context.Context, + src connectorstore.Reader, + dst connectorstore.Writer, + srcSyncID string, + refs *assetRefSet, +) error { + pageToken := "" + for { + req := v2.GrantsServiceListGrantsRequest_builder{ + PageSize: listPageSize, + PageToken: pageToken, + Annotations: syncIDAnnotations(srcSyncID), + }.Build() + resp, err := src.ListGrants(ctx, req) + if err != nil { + return fmt.Errorf("list grants: %w", err) + } + out := make([]*v2.Grant, 0, len(resp.GetList())) + for _, g := range resp.GetList() { + out = append(out, s.transformGrant(g, refs)) + } + if len(out) > 0 { + if err := dst.PutGrants(ctx, out...); err != nil { + return fmt.Errorf("put grants: %w", err) + } + } + if resp.GetNextPageToken() == "" { + return nil + } + pageToken = resp.GetNextPageToken() + } +} + +// transformResourceType preserves the resource type's id and trait +// enum but rewrites display name, description, and annotations. +// resource_type.id is connector-defined (e.g. "user") and treated as +// non-tenant; see §7 question 1 in the investigation for the caveat. +func (s *sanitizer) transformResourceType(in *v2.ResourceType, refs *assetRefSet) *v2.ResourceType { + if in == nil { + return nil + } + annos := s.transformAnnotations(in.GetAnnotations(), refs) + return v2.ResourceType_builder{ + Id: in.GetId(), + DisplayName: s.id(in.GetDisplayName()), + Description: s.id(in.GetDescription()), + Traits: in.GetTraits(), + Annotations: annos, + SourcedExternally: in.GetSourcedExternally(), + }.Build() +} + +// transformResource rewrites the resource id (preserving the type +// portion), parent id, display name, description, and annotations. +// baton_resource is preserved as it's a flag, not identity. +func (s *sanitizer) transformResource(in *v2.Resource, refs *assetRefSet) *v2.Resource { + if in == nil { + return nil + } + annos := s.transformAnnotations(in.GetAnnotations(), refs) + return v2.Resource_builder{ + Id: s.transformResourceID(in.GetId()), + ParentResourceId: s.transformResourceID(in.GetParentResourceId()), + DisplayName: s.id(in.GetDisplayName()), + Description: s.id(in.GetDescription()), + Annotations: annos, + BatonResource: in.GetBatonResource(), + }.Build() +} + +func (s *sanitizer) transformResourceID(in *v2.ResourceId) *v2.ResourceId { + if in == nil { + return nil + } + if in.GetResource() == "" && in.GetResourceType() == "" { + return nil + } + return v2.ResourceId_builder{ + ResourceType: in.GetResourceType(), + Resource: s.id(in.GetResource()), + BatonResource: in.GetBatonResource(), + }.Build() +} + +func (s *sanitizer) transformEntitlement(in *v2.Entitlement, refs *assetRefSet) *v2.Entitlement { + if in == nil { + return nil + } + annos := s.transformAnnotations(in.GetAnnotations(), refs) + out := v2.Entitlement_builder{ + Id: s.id(in.GetId()), + Resource: s.transformResource(in.GetResource(), refs), + DisplayName: s.id(in.GetDisplayName()), + Description: s.id(in.GetDescription()), + GrantableTo: s.transformResourceTypeSlice(in.GetGrantableTo(), refs), + Purpose: in.GetPurpose(), + Slug: s.id(in.GetSlug()), + Annotations: annos, + }.Build() + return out +} + +func (s *sanitizer) transformResourceTypeSlice(in []*v2.ResourceType, refs *assetRefSet) []*v2.ResourceType { + if len(in) == 0 { + return nil + } + out := make([]*v2.ResourceType, 0, len(in)) + for _, rt := range in { + out = append(out, s.transformResourceType(rt, refs)) + } + return out +} + +func (s *sanitizer) transformGrant(in *v2.Grant, refs *assetRefSet) *v2.Grant { + if in == nil { + return nil + } + annos := s.transformAnnotations(in.GetAnnotations(), refs) + return v2.Grant_builder{ + Id: s.id(in.GetId()), + Entitlement: s.transformEntitlement(in.GetEntitlement(), refs), + Principal: s.transformResource(in.GetPrincipal(), refs), + Sources: s.transformGrantSources(in.GetSources()), + Annotations: annos, + }.Build() +} + +// transformGrantSources rebuilds the sources map with sanitized keys. +// Map keys are source entitlement IDs; values carry only is_direct. +func (s *sanitizer) transformGrantSources(in *v2.GrantSources) *v2.GrantSources { + if in == nil { + return nil + } + srcMap := in.GetSources() + if len(srcMap) == 0 { + return v2.GrantSources_builder{}.Build() + } + out := make(map[string]*v2.GrantSources_GrantSource, len(srcMap)) + for srcEntitlementID, gs := range srcMap { + out[s.id(srcEntitlementID)] = v2.GrantSources_GrantSource_builder{ + IsDirect: gs.GetIsDirect(), + }.Build() + } + return v2.GrantSources_builder{Sources: out}.Build() +}