From 18e3d56f9953eb41ac9413277af367b6e9d311f5 Mon Sep 17 00:00:00 2001 From: Sonu Preetam Date: Wed, 18 Mar 2026 18:15:26 -0400 Subject: [PATCH 01/11] feat(sync): add config parsing, path validation, and lock file management Signed-off-by: Sonu Preetam --- cmd/sync-content/config.go | 108 ++++++++++++++++++++++++++ cmd/sync-content/config_test.go | 118 ++++++++++++++++++++++++++++ cmd/sync-content/helpers_test.go | 39 ++++++++++ cmd/sync-content/lock.go | 70 +++++++++++++++++ cmd/sync-content/lock_test.go | 127 +++++++++++++++++++++++++++++++ cmd/sync-content/path.go | 74 ++++++++++++++++++ cmd/sync-content/path_test.go | 67 ++++++++++++++++ 7 files changed, 603 insertions(+) create mode 100644 cmd/sync-content/config.go create mode 100644 cmd/sync-content/config_test.go create mode 100644 cmd/sync-content/helpers_test.go create mode 100644 cmd/sync-content/lock.go create mode 100644 cmd/sync-content/lock_test.go create mode 100644 cmd/sync-content/path.go create mode 100644 cmd/sync-content/path_test.go diff --git a/cmd/sync-content/config.go b/cmd/sync-content/config.go new file mode 100644 index 0000000..c0c6346 --- /dev/null +++ b/cmd/sync-content/config.go @@ -0,0 +1,108 @@ +// SPDX-License-Identifier: Apache-2.0 + +package main + +import ( + "fmt" + "os" + + "gopkg.in/yaml.v3" +) + +// PeribolosConfig is the top-level structure parsed from peribolos.yaml +// in the org's .github repo. +type PeribolosConfig struct { + Orgs map[string]PeribolosOrg `yaml:"orgs"` +} + +// PeribolosOrg represents an organization entry in peribolos.yaml. +type PeribolosOrg struct { + Repos map[string]PeribolosRepo `yaml:"repos"` +} + +// PeribolosRepo holds per-repo metadata from peribolos.yaml. +type PeribolosRepo struct { + Description string `yaml:"description"` + DefaultBranch string `yaml:"default_branch"` +} + +// SyncConfig is the top-level structure parsed from sync-config.yaml. 
+type SyncConfig struct { + Defaults Defaults `yaml:"defaults"` + Sources []Source `yaml:"sources"` + Discovery Discovery `yaml:"discovery"` +} + +// Discovery configures automatic detection of new repos and doc files +// that are not yet declared in sources. +type Discovery struct { + IgnoreRepos []string `yaml:"ignore_repos"` + IgnoreFiles []string `yaml:"ignore_files"` + ScanPaths []string `yaml:"scan_paths"` +} + +// Defaults holds fallback values applied to every source unless overridden. +type Defaults struct { + Branch string `yaml:"branch"` +} + +// Source is a single upstream repository declared in the config file. +type Source struct { + Repo string `yaml:"repo"` + Branch string `yaml:"branch"` + SkipOrgSync bool `yaml:"skip_org_sync"` + Files []FileSpec `yaml:"files"` +} + +// FileSpec describes one file to fetch from a source repo and where to place it. +type FileSpec struct { + Src string `yaml:"src"` + Dest string `yaml:"dest"` + Transform Transform `yaml:"transform"` +} + +// Transform describes optional mutations applied to fetched content. +type Transform struct { + InjectFrontmatter map[string]any `yaml:"inject_frontmatter"` + RewriteLinks bool `yaml:"rewrite_links"` + StripBadges bool `yaml:"strip_badges"` +} + +// loadConfig reads a sync-config.yaml file and returns the parsed configuration. +// It applies default values (e.g. branch) and validates that every source has +// the required fields. 
+func loadConfig(path string) (*SyncConfig, error) { + data, err := os.ReadFile(path) + if err != nil { + return nil, fmt.Errorf("reading config %s: %w", path, err) + } + + var cfg SyncConfig + if err := yaml.Unmarshal(data, &cfg); err != nil { + return nil, fmt.Errorf("parsing config %s: %w", path, err) + } + + if cfg.Defaults.Branch == "" { + cfg.Defaults.Branch = "main" + } + + for i := range cfg.Sources { + src := &cfg.Sources[i] + if src.Repo == "" { + return nil, fmt.Errorf("config %s: source[%d] missing required field 'repo'", path, i) + } + if src.Branch == "" { + src.Branch = cfg.Defaults.Branch + } + for j, f := range src.Files { + if f.Src == "" { + return nil, fmt.Errorf("config %s: source[%d] (%s) file[%d] missing 'src'", path, i, src.Repo, j) + } + if f.Dest == "" { + return nil, fmt.Errorf("config %s: source[%d] (%s) file[%d] missing 'dest'", path, i, src.Repo, j) + } + } + } + + return &cfg, nil +} diff --git a/cmd/sync-content/config_test.go b/cmd/sync-content/config_test.go new file mode 100644 index 0000000..d3eba15 --- /dev/null +++ b/cmd/sync-content/config_test.go @@ -0,0 +1,118 @@ +// SPDX-License-Identifier: Apache-2.0 +package main + +import ( + "os" + "path/filepath" + "strings" + "testing" +) + +func TestLoadConfig(t *testing.T) { + t.Run("valid config", func(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "sync-config.yaml") + os.WriteFile(path, []byte(` +defaults: + branch: main +sources: + - repo: org/repo1 + files: + - src: README.md + dest: content/docs/projects/repo1/_index.md +`), 0o644) + + cfg, err := loadConfig(path) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if cfg.Defaults.Branch != "main" { + t.Errorf("branch = %q, want %q", cfg.Defaults.Branch, "main") + } + if len(cfg.Sources) != 1 { + t.Fatalf("sources count = %d, want 1", len(cfg.Sources)) + } + if cfg.Sources[0].Repo != "org/repo1" { + t.Errorf("repo = %q, want %q", cfg.Sources[0].Repo, "org/repo1") + } + if cfg.Sources[0].Branch != 
"main" { + t.Errorf("source branch = %q, want %q (inherited from defaults)", cfg.Sources[0].Branch, "main") + } + }) + + t.Run("default branch applied", func(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "cfg.yaml") + os.WriteFile(path, []byte(` +sources: + - repo: org/repo1 + files: + - src: README.md + dest: out/README.md +`), 0o644) + + cfg, err := loadConfig(path) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if cfg.Defaults.Branch != "main" { + t.Errorf("default branch = %q, want %q", cfg.Defaults.Branch, "main") + } + }) + + t.Run("malformed YAML", func(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "bad.yaml") + os.WriteFile(path, []byte(`{{{not yaml`), 0o644) + + _, err := loadConfig(path) + if err == nil { + t.Fatal("expected error for malformed YAML") + } + }) + + t.Run("missing file", func(t *testing.T) { + _, err := loadConfig("/nonexistent/path.yaml") + if err == nil { + t.Fatal("expected error for missing file") + } + }) + + t.Run("missing repo field", func(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "cfg.yaml") + os.WriteFile(path, []byte(` +sources: + - files: + - src: README.md + dest: out/README.md +`), 0o644) + + _, err := loadConfig(path) + if err == nil { + t.Fatal("expected error for missing repo") + } + if !strings.Contains(err.Error(), "missing required field 'repo'") { + t.Errorf("error = %q, want it to mention missing repo", err) + } + }) + + t.Run("missing src field", func(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "cfg.yaml") + os.WriteFile(path, []byte(` +sources: + - repo: org/repo1 + files: + - dest: out/README.md +`), 0o644) + + _, err := loadConfig(path) + if err == nil { + t.Fatal("expected error for missing src") + } + if !strings.Contains(err.Error(), "missing 'src'") { + t.Errorf("error = %q, want it to mention missing src", err) + } + }) +} diff --git a/cmd/sync-content/helpers_test.go b/cmd/sync-content/helpers_test.go new 
file mode 100644 index 0000000..52e8ec4 --- /dev/null +++ b/cmd/sync-content/helpers_test.go @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: Apache-2.0 +package main + +import ( + "encoding/base64" + "net/http" + "strings" +) + +// urlRewriter intercepts HTTP requests and redirects them to the test server, +// allowing the apiClient to use its hardcoded githubAPI constant while actually +// hitting the mock server. +type urlRewriter struct { + targetHost string + targetPort string +} + +func (r *urlRewriter) RoundTrip(req *http.Request) (*http.Response, error) { + req.URL.Scheme = "http" + req.URL.Host = r.targetHost + ":" + r.targetPort + return http.DefaultTransport.RoundTrip(req) +} + +func newTestClient(serverURL string) *apiClient { + parts := strings.TrimPrefix(serverURL, "http://") + hostPort := strings.SplitN(parts, ":", 2) + host, port := hostPort[0], hostPort[1] + + return &apiClient{ + token: "test-token", + http: &http.Client{ + Transport: &urlRewriter{targetHost: host, targetPort: port}, + }, + } +} + +func b64(s string) string { + return base64.StdEncoding.EncodeToString([]byte(s)) +} diff --git a/cmd/sync-content/lock.go b/cmd/sync-content/lock.go new file mode 100644 index 0000000..ff29612 --- /dev/null +++ b/cmd/sync-content/lock.go @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: Apache-2.0 + +package main + +import ( + "encoding/json" + "fmt" + "os" + "sort" +) + +// ContentLock tracks approved branch commit SHAs per repository. +// The lockfile is committed to version control and governs which upstream +// content versions the deploy workflow is allowed to sync. +type ContentLock struct { + Repos map[string]string `json:"repos"` +} + +// readLock loads a content lockfile from disk. If the file does not exist +// (bootstrap case), an empty lock is returned with no error. 
+func readLock(path string) (*ContentLock, error) { + data, err := os.ReadFile(path) + if os.IsNotExist(err) { + return &ContentLock{Repos: make(map[string]string)}, nil + } + if err != nil { + return nil, fmt.Errorf("reading lock %s: %w", path, err) + } + + var lock ContentLock + if err := json.Unmarshal(data, &lock); err != nil { + return nil, fmt.Errorf("parsing lock %s: %w", path, err) + } + if lock.Repos == nil { + lock.Repos = make(map[string]string) + } + return &lock, nil +} + +// writeLock persists the lockfile to disk with deterministic key ordering. +func writeLock(path string, lock *ContentLock) error { + ordered := make([]string, 0, len(lock.Repos)) + for k := range lock.Repos { + ordered = append(ordered, k) + } + sort.Strings(ordered) + + m := make(map[string]string, len(ordered)) + for _, k := range ordered { + m[k] = lock.Repos[k] + } + + wrapper := struct { + Repos map[string]string `json:"repos"` + }{Repos: m} + + data, err := json.MarshalIndent(wrapper, "", " ") + if err != nil { + return fmt.Errorf("marshaling lock: %w", err) + } + return os.WriteFile(path, append(data, '\n'), 0o644) +} + +// sha returns the approved branch SHA for a repo, or "" if not locked. 
+func (l *ContentLock) sha(repo string) string { + if l == nil || l.Repos == nil { + return "" + } + return l.Repos[repo] +} diff --git a/cmd/sync-content/lock_test.go b/cmd/sync-content/lock_test.go new file mode 100644 index 0000000..6494e9d --- /dev/null +++ b/cmd/sync-content/lock_test.go @@ -0,0 +1,127 @@ +// SPDX-License-Identifier: Apache-2.0 +package main + +import ( + "os" + "path/filepath" + "testing" +) + +func TestReadWriteLock_RoundTrip(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, ".content-lock.json") + + original := &ContentLock{Repos: map[string]string{ + "complyctl": "abc123", + "comply-scribe": "def456", + }} + if err := writeLock(path, original); err != nil { + t.Fatalf("writeLock: %v", err) + } + + loaded, err := readLock(path) + if err != nil { + t.Fatalf("readLock: %v", err) + } + + if len(loaded.Repos) != 2 { + t.Fatalf("repos count = %d, want 2", len(loaded.Repos)) + } + if loaded.Repos["complyctl"] != "abc123" { + t.Errorf("complyctl SHA = %q, want %q", loaded.Repos["complyctl"], "abc123") + } + if loaded.Repos["comply-scribe"] != "def456" { + t.Errorf("comply-scribe SHA = %q, want %q", loaded.Repos["comply-scribe"], "def456") + } +} + +func TestReadLock_MissingFile(t *testing.T) { + lock, err := readLock(filepath.Join(t.TempDir(), "nonexistent.json")) + if err != nil { + t.Fatalf("readLock should not error for missing file: %v", err) + } + if lock == nil || lock.Repos == nil { + t.Fatal("readLock should return empty lock with initialized map") + } + if len(lock.Repos) != 0 { + t.Errorf("repos count = %d, want 0", len(lock.Repos)) + } +} + +func TestReadLock_InvalidJSON(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "bad.json") + os.WriteFile(path, []byte("not json"), 0o644) + + _, err := readLock(path) + if err == nil { + t.Fatal("readLock should error on invalid JSON") + } +} + +func TestContentLock_SHA(t *testing.T) { + lock := &ContentLock{Repos: map[string]string{ + "complyctl": "abc123", + }} + 
+ if got := lock.sha("complyctl"); got != "abc123" { + t.Errorf("sha(complyctl) = %q, want %q", got, "abc123") + } + if got := lock.sha("unknown"); got != "" { + t.Errorf("sha(unknown) = %q, want empty", got) + } + + var nilLock *ContentLock + if got := nilLock.sha("anything"); got != "" { + t.Errorf("nil lock sha() = %q, want empty", got) + } +} + +func TestWriteLock_DeterministicOrder(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "lock.json") + + lock := &ContentLock{Repos: map[string]string{ + "zebra": "z1", + "alpha": "a1", + "mike": "m1", + }} + + if err := writeLock(path, lock); err != nil { + t.Fatalf("writeLock: %v", err) + } + + data, _ := os.ReadFile(path) + content := string(data) + + alphaIdx := indexOf(content, "alpha") + mikeIdx := indexOf(content, "mike") + zebraIdx := indexOf(content, "zebra") + + if alphaIdx > mikeIdx || mikeIdx > zebraIdx { + t.Errorf("keys should be sorted alphabetically, got:\n%s", content) + } +} + +func TestReadLock_NilReposInitialized(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "lock.json") + os.WriteFile(path, []byte(`{}`), 0o644) + + lock, err := readLock(path) + if err != nil { + t.Fatalf("readLock: %v", err) + } + if lock.Repos == nil { + t.Error("Repos should be initialized even when missing from JSON") + } +} + +func indexOf(s, substr string) int { + for i := range len(s) - len(substr) + 1 { + if s[i:i+len(substr)] == substr { + return i + } + } + return -1 +} diff --git a/cmd/sync-content/path.go b/cmd/sync-content/path.go new file mode 100644 index 0000000..3cb6c0c --- /dev/null +++ b/cmd/sync-content/path.go @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: Apache-2.0 + +package main + +import ( + "path/filepath" + "strings" +) + +func isValidRepoName(name string) bool { + if name == "" || name == "." || name == ".." 
{ + return false + } + if strings.ContainsAny(name, "/\\") { + return false + } + for _, seg := range strings.Split(name, string(filepath.Separator)) { + if seg == ".." { + return false + } + } + return true +} + +// isUnderDir reports whether target is under base after resolving symlinks and +// cleaning both paths. Both base and target are resolved through symlinks as +// far as the filesystem allows (the target may not fully exist yet). Prevents +// path traversal attacks from config dest fields, manifest entries, or +// API-sourced file paths that contain "../" sequences. +func isUnderDir(base, target string) bool { + absBase, err := filepath.Abs(base) + if err != nil { + return false + } + if resolved, err := filepath.EvalSymlinks(absBase); err == nil { + absBase = resolved + } + absTarget, err := evalDeepest(target) + if err != nil { + return false + } + absBase = filepath.Clean(absBase) + string(filepath.Separator) + absTarget = filepath.Clean(absTarget) + return strings.HasPrefix(absTarget, absBase) || absTarget == strings.TrimSuffix(absBase, string(filepath.Separator)) +} + +// evalDeepest resolves a path through symlinks as deeply as the filesystem +// allows. If the full path doesn't exist, it walks up to the deepest existing +// ancestor, resolves that, and appends the remaining unresolved tail. 
+func evalDeepest(path string) (string, error) { + abs, err := filepath.Abs(path) + if err != nil { + return "", err + } + if resolved, err := filepath.EvalSymlinks(abs); err == nil { + return resolved, nil + } + dir, remaining := filepath.Dir(abs), filepath.Base(abs) + for dir != filepath.Dir(dir) { + if resolved, err := filepath.EvalSymlinks(dir); err == nil { + return filepath.Join(resolved, remaining), nil + } + remaining = filepath.Join(filepath.Base(dir), remaining) + dir = filepath.Dir(dir) + } + return abs, nil +} + +func languageOrDefault(lang string) string { + if lang == "" { + return "Unknown" + } + return lang +} diff --git a/cmd/sync-content/path_test.go b/cmd/sync-content/path_test.go new file mode 100644 index 0000000..80f5770 --- /dev/null +++ b/cmd/sync-content/path_test.go @@ -0,0 +1,67 @@ +// SPDX-License-Identifier: Apache-2.0 +package main + +import ( + "os" + "path/filepath" + "testing" +) + +func TestIsValidRepoName(t *testing.T) { + valid := []string{"my-repo", "repo123", "a", "repo.name", "dotdot..name"} + for _, name := range valid { + if !isValidRepoName(name) { + t.Errorf("isValidRepoName(%q) = false, want true", name) + } + } + + invalid := []string{"", ".", "..", "path/sep", "back\\slash"} + for _, name := range invalid { + if isValidRepoName(name) { + t.Errorf("isValidRepoName(%q) = true, want false", name) + } + } +} + +func TestIsUnderDir(t *testing.T) { + base := t.TempDir() + + tests := []struct { + name string + target string + want bool + }{ + {"child file", filepath.Join(base, "content", "file.md"), true}, + {"same dir", base, true}, + {"traversal", filepath.Join(base, "..", "etc", "passwd"), false}, + {"double traversal", filepath.Join(base, "content", "..", "..", "etc"), false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := isUnderDir(base, tt.target) + if got != tt.want { + t.Errorf("isUnderDir(%q, %q) = %v, want %v", base, tt.target, got, tt.want) + } + }) + } +} + +func 
TestIsUnderDir_ResolvesBaseSymlinks(t *testing.T) { + real := t.TempDir() + parent := t.TempDir() + link := filepath.Join(parent, "symlink-to-real") + if err := os.Symlink(real, link); err != nil { + t.Skipf("symlinks not supported: %v", err) + } + + child := filepath.Join(link, "content", "file.md") + if !isUnderDir(link, child) { + t.Error("isUnderDir should allow child under symlinked base") + } + + escape := filepath.Join(link, "..", "etc", "passwd") + if isUnderDir(link, escape) { + t.Error("isUnderDir should reject traversal out of symlinked base") + } +} From a88af2bbe9ec4745d90e6beef0161d09bc8c3a51 Mon Sep 17 00:00:00 2001 From: Sonu Preetam Date: Wed, 18 Mar 2026 18:17:46 -0400 Subject: [PATCH 02/11] feat(sync): add GitHub API client, manifest tracking, and content discovery Signed-off-by: Sonu Preetam --- cmd/sync-content/github.go | 310 ++++++++++++++++++++++++++++ cmd/sync-content/github_test.go | 332 ++++++++++++++++++++++++++++++ cmd/sync-content/manifest.go | 135 ++++++++++++ cmd/sync-content/manifest_test.go | 155 ++++++++++++++ 4 files changed, 932 insertions(+) create mode 100644 cmd/sync-content/github.go create mode 100644 cmd/sync-content/github_test.go create mode 100644 cmd/sync-content/manifest.go create mode 100644 cmd/sync-content/manifest_test.go diff --git a/cmd/sync-content/github.go b/cmd/sync-content/github.go new file mode 100644 index 0000000..d1d5d2e --- /dev/null +++ b/cmd/sync-content/github.go @@ -0,0 +1,310 @@ +// SPDX-License-Identifier: Apache-2.0 + +package main + +import ( + "context" + "encoding/base64" + "encoding/json" + "fmt" + "io" + "log/slog" + "net/http" + "net/url" + "sort" + "strconv" + "strings" + "time" + + "gopkg.in/yaml.v3" +) + +const ( + githubAPI = "https://api.github.com" + maxRetries = 3 + maxResponseBytes = 10 << 20 // 10 MB safety ceiling for API response bodies + maxDirDepth = 10 +) + +// GitHub API response types + +type Repo struct { + Name string `json:"name"` + FullName string `json:"full_name"` + 
Description string `json:"description"` + Language string `json:"language"` + StargazersCount int `json:"stargazers_count"` + HTMLURL string `json:"html_url"` + DefaultBranch string `json:"default_branch"` + PushedAt string `json:"pushed_at"` + Archived bool `json:"archived"` + Fork bool `json:"fork"` + Topics []string `json:"topics"` +} + +type FileResponse struct { + Content string `json:"content"` + Encoding string `json:"encoding"` + SHA string `json:"sha"` +} + +type DirEntry struct { + Name string `json:"name"` + Path string `json:"path"` + Type string `json:"type"` +} + +type BranchResponse struct { + Commit struct { + SHA string `json:"sha"` + } `json:"commit"` +} + +// apiClient wraps net/http for authenticated GitHub REST API calls. +type apiClient struct { + token string + http *http.Client +} + +func (c *apiClient) do(ctx context.Context, url string) (*http.Response, error) { + req, err := http.NewRequestWithContext(ctx, "GET", url, nil) + if err != nil { + return nil, err + } + req.Header.Set("Accept", "application/vnd.github.v3+json") + if c.token != "" { + req.Header.Set("Authorization", "Bearer "+c.token) + } + return c.http.Do(req) +} + +// getJSON fetches a URL and decodes JSON, retrying on rate limit (403/429) +// with exponential backoff and respect for Retry-After / X-RateLimit-Reset. 
+func (c *apiClient) getJSON(ctx context.Context, url string, dst any) error { + var lastErr error + for attempt := range maxRetries + 1 { + resp, err := c.do(ctx, url) + if err != nil { + return err + } + + if resp.StatusCode == http.StatusOK { + limited := io.LimitReader(resp.Body, maxResponseBytes) + err = json.NewDecoder(limited).Decode(dst) + io.Copy(io.Discard, resp.Body) + resp.Body.Close() + return err + } + + body, _ := io.ReadAll(io.LimitReader(resp.Body, 4096)) + resp.Body.Close() + lastErr = fmt.Errorf("GET %s: %d %s", url, resp.StatusCode, body) + + if !isRateLimited(resp) || attempt == maxRetries { + return lastErr + } + + wait := retryWait(resp, attempt) + slog.Warn("rate limited, retrying", "url", url, "attempt", attempt+1, "wait", wait.Round(time.Second)) + select { + case <-ctx.Done(): + return ctx.Err() + case <-time.After(wait): + } + } + return lastErr +} + +func isRateLimited(resp *http.Response) bool { + if resp.StatusCode == http.StatusTooManyRequests { + return true + } + if resp.StatusCode == http.StatusForbidden { + return resp.Header.Get("X-RateLimit-Remaining") == "0" + } + return false +} + +func retryWait(resp *http.Response, attempt int) time.Duration { + if ra := resp.Header.Get("Retry-After"); ra != "" { + if seconds, err := strconv.Atoi(ra); err == nil { + return time.Duration(seconds) * time.Second + } + } + if reset := resp.Header.Get("X-RateLimit-Reset"); reset != "" { + if ts, err := strconv.ParseInt(reset, 10, 64); err == nil { + wait := time.Until(time.Unix(ts, 0)) + time.Second + if wait > 0 && wait < 5*time.Minute { + return wait + } + if wait >= 5*time.Minute { + slog.Warn("rate limit reset too far in future, using backoff", "reset_in", wait.Round(time.Second)) + } + } + } + return time.Duration(1<.repos. 
+func (c *apiClient) fetchPeribolosRepos(ctx context.Context, org string) ([]string, error) { + apiURL := fmt.Sprintf("%s/repos/%s/.github/contents/peribolos.yaml", + githubAPI, url.PathEscape(org)) + var f FileResponse + if err := c.getJSON(ctx, apiURL, &f); err != nil { + return nil, fmt.Errorf("fetching peribolos.yaml from %s/.github: %w", org, err) + } + content, err := decodeContent(f) + if err != nil { + return nil, fmt.Errorf("decoding peribolos.yaml: %w", err) + } + + var pc PeribolosConfig + if err := yaml.Unmarshal([]byte(content), &pc); err != nil { + return nil, fmt.Errorf("parsing peribolos.yaml: %w", err) + } + + orgData, ok := pc.Orgs[org] + if !ok { + return nil, fmt.Errorf("peribolos.yaml has no entry for org %q", org) + } + + names := make([]string, 0, len(orgData.Repos)) + for name := range orgData.Repos { + names = append(names, name) + } + sort.Strings(names) + return names, nil +} + +// getRepoMetadata fetches full metadata for a single repo from the GitHub API. +func (c *apiClient) getRepoMetadata(ctx context.Context, owner, name string) (*Repo, error) { + apiURL := fmt.Sprintf("%s/repos/%s/%s", + githubAPI, url.PathEscape(owner), url.PathEscape(name)) + var repo Repo + if err := c.getJSON(ctx, apiURL, &repo); err != nil { + return nil, err + } + return &repo, nil +} + +func (c *apiClient) getREADME(ctx context.Context, owner, repo, ref string) (string, string, error) { + apiURL := fmt.Sprintf("%s/repos/%s/%s/readme", + githubAPI, url.PathEscape(owner), url.PathEscape(repo)) + apiURL = appendRef(apiURL, ref) + var f FileResponse + if err := c.getJSON(ctx, apiURL, &f); err != nil { + return "", "", err + } + content, err := decodeContent(f) + return content, f.SHA, err +} + +func (c *apiClient) getFileContent(ctx context.Context, owner, repo, path, ref string) (string, string, error) { + apiURL := fmt.Sprintf("%s/repos/%s/%s/contents/%s", + githubAPI, url.PathEscape(owner), url.PathEscape(repo), escapePathSegments(path)) + apiURL = 
appendRef(apiURL, ref) + var f FileResponse + if err := c.getJSON(ctx, apiURL, &f); err != nil { + return "", "", err + } + content, err := decodeContent(f) + return content, f.SHA, err +} + +func (c *apiClient) listDir(ctx context.Context, owner, repo, path, ref string) ([]DirEntry, error) { + apiURL := fmt.Sprintf("%s/repos/%s/%s/contents/%s", + githubAPI, url.PathEscape(owner), url.PathEscape(repo), escapePathSegments(path)) + apiURL = appendRef(apiURL, ref) + var entries []DirEntry + if err := c.getJSON(ctx, apiURL, &entries); err != nil { + return nil, err + } + return entries, nil +} + +func (c *apiClient) getBranchSHA(ctx context.Context, owner, repo, branch string) (string, error) { + apiURL := fmt.Sprintf("%s/repos/%s/%s/branches/%s", + githubAPI, url.PathEscape(owner), url.PathEscape(repo), url.PathEscape(branch)) + var b BranchResponse + if err := c.getJSON(ctx, apiURL, &b); err != nil { + return "", err + } + return b.Commit.SHA, nil +} + +// listDirMD recursively lists .md files under a directory, reusing listDir. +// Returns paths relative to the repo root (e.g. "docs/guide.md"). +// Recursion is bounded to maxDirDepth levels to limit API calls on deeply +// nested repositories. 
+func (c *apiClient) listDirMD(ctx context.Context, owner, repo, dir, ref string) ([]string, error) { + return c.listDirMDDepth(ctx, owner, repo, dir, ref, 0) +} + +func (c *apiClient) listDirMDDepth(ctx context.Context, owner, repo, dir, ref string, depth int) ([]string, error) { + if depth >= maxDirDepth { + slog.Warn("max directory depth reached, skipping deeper levels", "repo", owner+"/"+repo, "dir", dir, "depth", depth) + return nil, nil + } + entries, err := c.listDir(ctx, owner, repo, dir, ref) + if err != nil { + return nil, err + } + var files []string + for _, e := range entries { + switch { + case e.Type == "file" && strings.HasSuffix(e.Name, ".md"): + if e.Path != "" { + files = append(files, e.Path) + } else { + files = append(files, dir+"/"+e.Name) + } + case e.Type == "dir": + subDir := dir + "/" + e.Name + if e.Path != "" { + subDir = e.Path + } + sub, err := c.listDirMDDepth(ctx, owner, repo, subDir, ref, depth+1) + if err != nil { + slog.Warn("could not list subdir", "repo", owner+"/"+repo, "dir", subDir, "error", err) + continue + } + files = append(files, sub...) 
+ } + } + return files, nil +} + +func decodeContent(f FileResponse) (string, error) { + if f.Encoding != "base64" { + return f.Content, nil + } + raw := strings.NewReplacer("\n", "", "\r", "").Replace(f.Content) + decoded, err := base64.StdEncoding.DecodeString(raw) + if err != nil { + return "", fmt.Errorf("base64 decode: %w", err) + } + return string(decoded), nil +} diff --git a/cmd/sync-content/github_test.go b/cmd/sync-content/github_test.go new file mode 100644 index 0000000..0b8663f --- /dev/null +++ b/cmd/sync-content/github_test.go @@ -0,0 +1,332 @@ +// SPDX-License-Identifier: Apache-2.0 +package main + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + "time" +) + +func TestListDirMD(t *testing.T) { + mux := http.NewServeMux() + + mux.HandleFunc("/repos/org/repo/contents/docs", func(w http.ResponseWriter, r *http.Request) { + json.NewEncoder(w).Encode([]DirEntry{ + {Name: "guide.md", Path: "docs/guide.md", Type: "file"}, + {Name: "image.png", Path: "docs/image.png", Type: "file"}, + {Name: "sub", Path: "docs/sub", Type: "dir"}, + }) + }) + + mux.HandleFunc("/repos/org/repo/contents/docs/sub", func(w http.ResponseWriter, r *http.Request) { + json.NewEncoder(w).Encode([]DirEntry{ + {Name: "nested.md", Path: "docs/sub/nested.md", Type: "file"}, + {Name: "data.json", Path: "docs/sub/data.json", Type: "file"}, + }) + }) + + server := httptest.NewServer(mux) + defer server.Close() + + gh := newTestClient(server.URL) + ctx := context.Background() + + files, err := gh.listDirMD(ctx, "org", "repo", "docs", "") + if err != nil { + t.Fatalf("listDirMD: %v", err) + } + + want := map[string]bool{ + "docs/guide.md": true, + "docs/sub/nested.md": true, + } + got := make(map[string]bool) + for _, f := range files { + got[f] = true + } + + if len(got) != len(want) { + t.Errorf("got %d files, want %d: %v", len(got), len(want), files) + } + for w := range want { + if !got[w] { + t.Errorf("missing expected file %q", w) + } + } +} + 
+func TestListDirMD_DepthLimit(t *testing.T) { + callCount := 0 + + mux := http.NewServeMux() + mux.HandleFunc("/repos/org/repo/contents/", func(w http.ResponseWriter, r *http.Request) { + callCount++ + json.NewEncoder(w).Encode([]DirEntry{ + {Name: "file.md", Path: r.URL.Path[len("/repos/org/repo/contents/"):] + "/file.md", Type: "file"}, + {Name: "deeper", Path: r.URL.Path[len("/repos/org/repo/contents/"):] + "/deeper", Type: "dir"}, + }) + }) + + server := httptest.NewServer(mux) + defer server.Close() + + gh := newTestClient(server.URL) + ctx := context.Background() + + files, err := gh.listDirMD(ctx, "org", "repo", "docs", "") + if err != nil { + t.Fatalf("listDirMD: %v", err) + } + + if callCount > maxDirDepth+1 { + t.Errorf("API calls = %d, expected at most %d (depth limit should cap recursion)", callCount, maxDirDepth+1) + } + + if len(files) == 0 { + t.Error("expected at least some .md files to be found") + } + if len(files) > maxDirDepth+1 { + t.Errorf("found %d files, expected at most %d", len(files), maxDirDepth+1) + } +} + +func TestEscapePathSegments(t *testing.T) { + tests := []struct { + input string + want string + }{ + {"docs/guide.md", "docs/guide.md"}, + {"docs/my file.md", "docs/my%20file.md"}, + {"path/with spaces/file#1.md", "path/with%20spaces/file%231.md"}, + } + for _, tt := range tests { + got := escapePathSegments(tt.input) + if got != tt.want { + t.Errorf("escapePathSegments(%q) = %q, want %q", tt.input, got, tt.want) + } + } +} + +func TestAppendRef(t *testing.T) { + tests := []struct { + url string + ref string + want string + }{ + {"https://api.github.com/repos/o/r/readme", "", "https://api.github.com/repos/o/r/readme"}, + {"https://api.github.com/repos/o/r/readme", "abc123", "https://api.github.com/repos/o/r/readme?ref=abc123"}, + {"https://api.github.com/repos/o/r/contents/docs?per_page=100", "def456", "https://api.github.com/repos/o/r/contents/docs?per_page=100&ref=def456"}, + } + for _, tt := range tests { + got := 
appendRef(tt.url, tt.ref) + if got != tt.want { + t.Errorf("appendRef(%q, %q) = %q, want %q", tt.url, tt.ref, got, tt.want) + } + } +} + +func TestGetREADME_WithRef(t *testing.T) { + var receivedRef string + + mux := http.NewServeMux() + mux.HandleFunc("/repos/org/repo/readme", func(w http.ResponseWriter, r *http.Request) { + receivedRef = r.URL.Query().Get("ref") + json.NewEncoder(w).Encode(FileResponse{ + Content: "VEVTVA==", + Encoding: "base64", + SHA: "sha123", + }) + }) + + server := httptest.NewServer(mux) + defer server.Close() + + gh := newTestClient(server.URL) + ctx := context.Background() + + _, _, err := gh.getREADME(ctx, "org", "repo", "locked-sha-abc") + if err != nil { + t.Fatalf("getREADME: %v", err) + } + if receivedRef != "locked-sha-abc" { + t.Errorf("ref = %q, want %q", receivedRef, "locked-sha-abc") + } + + receivedRef = "" + _, _, err = gh.getREADME(ctx, "org", "repo", "") + if err != nil { + t.Fatalf("getREADME (no ref): %v", err) + } + if receivedRef != "" { + t.Errorf("ref should be empty when not provided, got %q", receivedRef) + } +} + +func TestListDirMD_WithRef(t *testing.T) { + var receivedRef string + + mux := http.NewServeMux() + mux.HandleFunc("/repos/org/repo/contents/docs", func(w http.ResponseWriter, r *http.Request) { + receivedRef = r.URL.Query().Get("ref") + json.NewEncoder(w).Encode([]DirEntry{ + {Name: "guide.md", Path: "docs/guide.md", Type: "file"}, + }) + }) + + server := httptest.NewServer(mux) + defer server.Close() + + gh := newTestClient(server.URL) + ctx := context.Background() + + _, err := gh.listDirMD(ctx, "org", "repo", "docs", "pinned-sha") + if err != nil { + t.Fatalf("listDirMD: %v", err) + } + if receivedRef != "pinned-sha" { + t.Errorf("ref = %q, want %q", receivedRef, "pinned-sha") + } +} + +func TestFetchPeribolosRepos(t *testing.T) { + peribolosYAML := `orgs: + myorg: + repos: + alpha: + description: "first repo" + beta: + description: "second repo" + gamma: + description: "third repo" +` + + 
t.Run("success", func(t *testing.T) { + mux := http.NewServeMux() + mux.HandleFunc("/repos/myorg/.github/contents/peribolos.yaml", func(w http.ResponseWriter, r *http.Request) { + json.NewEncoder(w).Encode(FileResponse{ + Content: b64(peribolosYAML), + Encoding: "base64", + }) + }) + + server := httptest.NewServer(mux) + defer server.Close() + + gh := newTestClient(server.URL) + names, err := gh.fetchPeribolosRepos(context.Background(), "myorg") + if err != nil { + t.Fatalf("fetchPeribolosRepos: %v", err) + } + want := []string{"alpha", "beta", "gamma"} + if len(names) != len(want) { + t.Fatalf("got %d repos, want %d: %v", len(names), len(want), names) + } + for i, name := range names { + if name != want[i] { + t.Errorf("repo[%d] = %q, want %q", i, name, want[i]) + } + } + }) + + t.Run("missing org in peribolos", func(t *testing.T) { + mux := http.NewServeMux() + mux.HandleFunc("/repos/otherorg/.github/contents/peribolos.yaml", func(w http.ResponseWriter, r *http.Request) { + json.NewEncoder(w).Encode(FileResponse{ + Content: b64(peribolosYAML), + Encoding: "base64", + }) + }) + + server := httptest.NewServer(mux) + defer server.Close() + + gh := newTestClient(server.URL) + _, err := gh.fetchPeribolosRepos(context.Background(), "otherorg") + if err == nil { + t.Fatal("expected error for missing org") + } + }) + + t.Run("peribolos.yaml not found", func(t *testing.T) { + mux := http.NewServeMux() + mux.HandleFunc("/repos/noorg/.github/contents/peribolos.yaml", func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusNotFound) + w.Write([]byte(`{"message":"Not Found"}`)) + }) + + server := httptest.NewServer(mux) + defer server.Close() + + gh := newTestClient(server.URL) + _, err := gh.fetchPeribolosRepos(context.Background(), "noorg") + if err == nil { + t.Fatal("expected error when peribolos.yaml is missing") + } + }) +} + +func TestGetRepoMetadata(t *testing.T) { + mux := http.NewServeMux() + mux.HandleFunc("/repos/org/myrepo", func(w 
http.ResponseWriter, r *http.Request) { + json.NewEncoder(w).Encode(Repo{ + Name: "myrepo", + FullName: "org/myrepo", + Description: "A test repo", + HTMLURL: "https://github.com/org/myrepo", + }) + }) + + server := httptest.NewServer(mux) + defer server.Close() + + gh := newTestClient(server.URL) + repo, err := gh.getRepoMetadata(context.Background(), "org", "myrepo") + if err != nil { + t.Fatalf("getRepoMetadata: %v", err) + } + if repo.Name != "myrepo" { + t.Errorf("name = %q, want %q", repo.Name, "myrepo") + } + if repo.FullName != "org/myrepo" { + t.Errorf("full_name = %q, want %q", repo.FullName, "org/myrepo") + } + if repo.Description != "A test repo" { + t.Errorf("description = %q, want %q", repo.Description, "A test repo") + } +} + +func TestContextCancellationDuringRetry(t *testing.T) { + callCount := 0 + mux := http.NewServeMux() + mux.HandleFunc("/test-endpoint", func(w http.ResponseWriter, r *http.Request) { + callCount++ + w.Header().Set("Retry-After", "60") + w.WriteHeader(http.StatusTooManyRequests) + w.Write([]byte(`{"message":"rate limited"}`)) + }) + server := httptest.NewServer(mux) + defer server.Close() + + gh := newTestClient(server.URL) + + ctx, cancel := context.WithCancel(context.Background()) + go func() { + time.Sleep(100 * time.Millisecond) + cancel() + }() + + start := time.Now() + var result map[string]any + err := gh.getJSON(ctx, server.URL+"/test-endpoint", &result) + elapsed := time.Since(start) + + if err == nil { + t.Fatal("expected error from cancelled context") + } + if elapsed > 2*time.Second { + t.Errorf("cancellation took %v, expected < 2s", elapsed) + } +} diff --git a/cmd/sync-content/manifest.go b/cmd/sync-content/manifest.go new file mode 100644 index 0000000..e6b3293 --- /dev/null +++ b/cmd/sync-content/manifest.go @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: Apache-2.0 + +package main + +import ( + "encoding/json" + "log/slog" + "os" + "path/filepath" + "sort" + "strings" + + "gopkg.in/yaml.v3" +) + +const 
manifestFile = ".sync-manifest.json" + +// readExistingState reads source_sha and readme_sha from existing project +// pages to enable two-tier change detection: branch SHA as a fast pre-filter, +// readme SHA for content-level comparison. +func readExistingState(outputDir string) map[string]repoState { + state := make(map[string]repoState) + dir := filepath.Join(outputDir, "content", "docs", "projects") + entries, err := os.ReadDir(dir) + if err != nil { + return state + } + for _, e := range entries { + if !e.IsDir() { + continue + } + repoName := e.Name() + indexPath := filepath.Join(dir, repoName, "_index.md") + params := readFrontmatterParams(indexPath) + branchSHA, _ := params["source_sha"].(string) + if branchSHA != "" { + readmeSHA, _ := params["readme_sha"].(string) + state[repoName] = repoState{ + branchSHA: branchSHA, + readmeSHA: readmeSHA, + } + } + } + return state +} + +// carryForwardManifest records files from the previous manifest that belong to +// a repo being skipped on the fast path, preventing orphan cleanup from +// deleting them. +func carryForwardManifest(result *syncResult, repoName string, oldManifest map[string]bool) { + prefix := "content/docs/projects/" + repoName + "/" + for relPath := range oldManifest { + if strings.HasPrefix(relPath, prefix) { + result.recordFile(relPath) + } + } +} + +// buildDocPagesIndex pre-computes which repos have doc pages (files other than +// _index.md) in the manifest. This avoids an O(manifest) scan per repo during +// the concurrent worker loop. 
+func buildDocPagesIndex(manifest map[string]bool) map[string]bool { + index := make(map[string]bool) + const prefix = "content/docs/projects/" + for relPath := range manifest { + if !strings.HasPrefix(relPath, prefix) { + continue + } + tail := relPath[len(prefix):] + if slash := strings.IndexByte(tail, '/'); slash > 0 { + repoName := tail[:slash] + if filepath.Base(relPath) != "_index.md" { + index[repoName] = true + } + } + } + return index +} + +// readFrontmatterParams reads the YAML frontmatter from a Hugo content file +// and returns the "params" map. Returns nil if the file cannot be read or has +// no frontmatter/params section. +func readFrontmatterParams(path string) map[string]any { + data, err := os.ReadFile(path) + if err != nil { + return nil + } + content := string(data) + if !strings.HasPrefix(content, "---\n") { + return nil + } + endIdx := strings.Index(content[4:], "\n---") + if endIdx < 0 { + return nil + } + fmBytes := content[4 : 4+endIdx] + + var fm map[string]any + if err := yaml.Unmarshal([]byte(fmBytes), &fm); err != nil { + return nil + } + params, _ := fm["params"].(map[string]any) + return params +} + +// readManifest loads the set of files written by the previous sync run. +// Returns nil if no manifest exists (e.g. first run). +func readManifest(outputDir string) map[string]bool { + data, err := os.ReadFile(filepath.Join(outputDir, manifestFile)) + if err != nil { + return nil + } + var files []string + if err := json.Unmarshal(data, &files); err != nil { + slog.Warn("could not parse sync manifest", "error", err) + return nil + } + m := make(map[string]bool, len(files)) + for _, f := range files { + m[f] = true + } + return m +} + +// writeManifest persists the list of files written during this sync run. 
+func writeManifest(outputDir string, files []string) error { + sorted := make([]string, len(files)) + copy(sorted, files) + sort.Strings(sorted) + data, err := json.MarshalIndent(sorted, "", " ") + if err != nil { + return err + } + return os.WriteFile(filepath.Join(outputDir, manifestFile), append(data, '\n'), 0o644) +} diff --git a/cmd/sync-content/manifest_test.go b/cmd/sync-content/manifest_test.go new file mode 100644 index 0000000..45a07a2 --- /dev/null +++ b/cmd/sync-content/manifest_test.go @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: Apache-2.0 +package main + +import ( + "os" + "path/filepath" + "testing" +) + +func TestManifestRoundTrip(t *testing.T) { + dir := t.TempDir() + files := []string{ + "content/docs/projects/complyctl/_index.md", + "content/docs/projects/complyctl/quick-start.md", + } + + if err := writeManifest(dir, files); err != nil { + t.Fatalf("writeManifest: %v", err) + } + + got := readManifest(dir) + if got == nil { + t.Fatal("readManifest returned nil") + } + for _, f := range files { + if !got[f] { + t.Errorf("manifest missing %q", f) + } + } + if len(got) != len(files) { + t.Errorf("manifest has %d entries, want %d", len(got), len(files)) + } +} + +func TestReadManifest_Missing(t *testing.T) { + dir := t.TempDir() + got := readManifest(dir) + if got != nil { + t.Errorf("readManifest for missing file should return nil, got %v", got) + } +} + +func TestReadFrontmatterParams(t *testing.T) { + t.Run("reads params from generated frontmatter", func(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "_index.md") + os.WriteFile(path, []byte("---\ntitle: \"test\"\nparams:\n source_sha: \"abc123\"\n readme_sha: \"def456\"\n---\n"), 0o644) + + params := readFrontmatterParams(path) + if params == nil { + t.Fatal("params should not be nil") + } + if v, _ := params["source_sha"].(string); v != "abc123" { + t.Errorf("source_sha = %q, want %q", v, "abc123") + } + if v, _ := params["readme_sha"].(string); v != "def456" { + 
t.Errorf("readme_sha = %q, want %q", v, "def456") + } + }) + + t.Run("does not match similarly-prefixed keys", func(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "_index.md") + os.WriteFile(path, []byte("---\ntitle: \"test\"\nparams:\n source_sha_v2: \"wrong\"\n source_sha: \"correct\"\n---\n"), 0o644) + + params := readFrontmatterParams(path) + if v, _ := params["source_sha"].(string); v != "correct" { + t.Errorf("source_sha = %q, want %q", v, "correct") + } + if v, _ := params["source_sha_v2"].(string); v != "wrong" { + t.Errorf("source_sha_v2 = %q, want %q (should be separate key)", v, "wrong") + } + }) + + t.Run("missing file returns nil", func(t *testing.T) { + params := readFrontmatterParams("/nonexistent/path.md") + if params != nil { + t.Errorf("expected nil for missing file, got %v", params) + } + }) + + t.Run("no frontmatter returns nil", func(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "plain.md") + os.WriteFile(path, []byte("# No frontmatter\nBody."), 0o644) + + params := readFrontmatterParams(path) + if params != nil { + t.Errorf("expected nil for file without frontmatter, got %v", params) + } + }) + + t.Run("no params section returns nil", func(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "no-params.md") + os.WriteFile(path, []byte("---\ntitle: test\n---\nBody."), 0o644) + + params := readFrontmatterParams(path) + if params != nil { + t.Errorf("expected nil for frontmatter without params, got %v", params) + } + }) +} + +func TestReadExistingState_UsesYAMLParsing(t *testing.T) { + dir := t.TempDir() + repoDir := filepath.Join(dir, "content", "docs", "projects", "test-repo") + os.MkdirAll(repoDir, 0o755) + os.WriteFile(filepath.Join(repoDir, "_index.md"), []byte( + "---\ntitle: \"test-repo\"\nparams:\n source_sha: \"branch-sha-123\"\n readme_sha: \"readme-sha-456\"\n---\n", + ), 0o644) + + state := readExistingState(dir) + if len(state) != 1 { + t.Fatalf("state has %d entries, want 1", 
len(state)) + } + s := state["test-repo"] + if s.branchSHA != "branch-sha-123" { + t.Errorf("branchSHA = %q, want %q", s.branchSHA, "branch-sha-123") + } + if s.readmeSHA != "readme-sha-456" { + t.Errorf("readmeSHA = %q, want %q", s.readmeSHA, "readme-sha-456") + } +} + +func TestBuildDocPagesIndex(t *testing.T) { + manifest := map[string]bool{ + "content/docs/projects/complyctl/_index.md": true, + "content/docs/projects/complyctl/overview.md": true, + "content/docs/projects/complyctl/installation.md": true, + "content/docs/projects/complyscribe/_index.md": true, + "content/docs/projects/collector/_index.md": true, + "content/docs/projects/collector/docs/guide.md": true, + "data/projects.json": true, + } + + index := buildDocPagesIndex(manifest) + + if !index["complyctl"] { + t.Error("complyctl should be in index (has overview.md and installation.md)") + } + if index["complyscribe"] { + t.Error("complyscribe should NOT be in index (only has _index.md)") + } + if !index["collector"] { + t.Error("collector should be in index (has docs/guide.md)") + } +} + +func TestBuildDocPagesIndex_NilManifest(t *testing.T) { + index := buildDocPagesIndex(nil) + if len(index) != 0 { + t.Errorf("nil manifest should produce empty index, got %d entries", len(index)) + } +} From c0498cbddb3f8c81dd635bf3efd5c3354eecbbd1 Mon Sep 17 00:00:00 2001 From: Sonu Preetam Date: Wed, 18 Mar 2026 18:23:08 -0400 Subject: [PATCH 03/11] feat(sync): add Hugo frontmatter generation, content transforms, and cleanup Signed-off-by: Sonu Preetam --- cmd/sync-content/cleanup.go | 51 +++ cmd/sync-content/cleanup_test.go | 116 +++++ cmd/sync-content/hugo.go | 216 ++++++++++ cmd/sync-content/hugo_test.go | 170 ++++++++ cmd/sync-content/transform.go | 163 +++++++ cmd/sync-content/transform_test.go | 359 ++++++++++++++++ config/_default/params.toml | 2 +- content/docs/projects/_index.md | 137 +----- content/docs/projects/complyctl/_index.md | 142 ------- .../docs/projects/complyctl/installation.md | 41 -- 
.../docs/projects/complyctl/plugin-guide.md | 99 ----- .../docs/projects/complyctl/quick-start.md | 80 ---- content/docs/projects/complyscribe/_index.md | 88 ---- .../projects/complyscribe/troubleshooting.md | 8 - .../complyscribe/tutorials/authoring.md | 64 --- .../projects/complyscribe/tutorials/github.md | 136 ------ .../tutorials/sync-cac-content.md | 146 ------- .../tutorials/sync-oscal-content.md | 119 ------ .../complytime-collector-components/_index.md | 151 ------- .../attributes/compliance.md | 68 --- .../attributes/policy.md | 36 -- .../complytime-collector-components/design.md | 152 ------- .../development.md | 399 ------------------ .../integration/sync-evidence-hyperproof.md | 163 ------- layouts/_default/_markup/render-heading.html | 4 + layouts/_partials/main/edit-page.html | 48 +++ layouts/home.html | 54 +-- layouts/shortcodes/project-cards.html | 31 ++ 28 files changed, 1179 insertions(+), 2064 deletions(-) create mode 100644 cmd/sync-content/cleanup.go create mode 100644 cmd/sync-content/cleanup_test.go create mode 100644 cmd/sync-content/hugo.go create mode 100644 cmd/sync-content/hugo_test.go create mode 100644 cmd/sync-content/transform.go create mode 100644 cmd/sync-content/transform_test.go delete mode 100644 content/docs/projects/complyctl/_index.md delete mode 100644 content/docs/projects/complyctl/installation.md delete mode 100644 content/docs/projects/complyctl/plugin-guide.md delete mode 100644 content/docs/projects/complyctl/quick-start.md delete mode 100644 content/docs/projects/complyscribe/_index.md delete mode 100644 content/docs/projects/complyscribe/troubleshooting.md delete mode 100644 content/docs/projects/complyscribe/tutorials/authoring.md delete mode 100644 content/docs/projects/complyscribe/tutorials/github.md delete mode 100644 content/docs/projects/complyscribe/tutorials/sync-cac-content.md delete mode 100644 content/docs/projects/complyscribe/tutorials/sync-oscal-content.md delete mode 100644 
content/docs/projects/complytime-collector-components/_index.md delete mode 100644 content/docs/projects/complytime-collector-components/attributes/compliance.md delete mode 100644 content/docs/projects/complytime-collector-components/attributes/policy.md delete mode 100644 content/docs/projects/complytime-collector-components/design.md delete mode 100644 content/docs/projects/complytime-collector-components/development.md delete mode 100644 content/docs/projects/complytime-collector-components/integration/sync-evidence-hyperproof.md create mode 100644 layouts/_default/_markup/render-heading.html create mode 100644 layouts/_partials/main/edit-page.html create mode 100644 layouts/shortcodes/project-cards.html diff --git a/cmd/sync-content/cleanup.go b/cmd/sync-content/cleanup.go new file mode 100644 index 0000000..9aea47a --- /dev/null +++ b/cmd/sync-content/cleanup.go @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: Apache-2.0 + +package main + +import ( + "log/slog" + "os" + "path/filepath" +) + +// cleanOrphanedFiles removes files present in the old manifest but absent from +// the current sync run. After each removal it prunes empty parent directories +// up to outputDir. 
+func cleanOrphanedFiles(outputDir string, oldManifest map[string]bool, currentFiles []string) int { + current := make(map[string]bool, len(currentFiles)) + for _, f := range currentFiles { + current[f] = true + } + removed := 0 + for relPath := range oldManifest { + if current[relPath] { + continue + } + fullPath := filepath.Join(outputDir, relPath) + if !isUnderDir(outputDir, fullPath) { + slog.Warn("skipping orphaned file outside output dir", "path", relPath) + continue + } + if err := os.Remove(fullPath); err != nil { + if !os.IsNotExist(err) { + slog.Warn("could not remove orphaned file", "path", fullPath, "error", err) + } + continue + } + slog.Info("removed orphaned file", "path", relPath) + removed++ + dir := filepath.Dir(fullPath) + absOutput := filepath.Clean(outputDir) + for dir != absOutput && dir != "." && dir != "/" { + if !isUnderDir(outputDir, dir) { + break + } + if err := os.Remove(dir); err != nil { + break + } + slog.Info("removed empty directory", "path", dir) + dir = filepath.Dir(dir) + } + } + return removed +} diff --git a/cmd/sync-content/cleanup_test.go b/cmd/sync-content/cleanup_test.go new file mode 100644 index 0000000..535cf83 --- /dev/null +++ b/cmd/sync-content/cleanup_test.go @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: Apache-2.0 +package main + +import ( + "os" + "path/filepath" + "testing" +) + +func TestCleanOrphanedFiles(t *testing.T) { + dir := t.TempDir() + + staleFile := filepath.Join(dir, "content", "docs", "projects", "complyctl", "quick-start.md") + keptFile := filepath.Join(dir, "content", "docs", "projects", "complyctl", "_index.md") + otherFile := filepath.Join(dir, "content", "docs", "projects", "complyscribe", "_index.md") + + for _, f := range []string{staleFile, keptFile, otherFile} { + os.MkdirAll(filepath.Dir(f), 0o755) + os.WriteFile(f, []byte("test"), 0o644) + } + + oldManifest := map[string]bool{ + "content/docs/projects/complyctl/_index.md": true, + "content/docs/projects/complyctl/quick-start.md": true, + 
"content/docs/projects/complyscribe/_index.md": true, + } + + currentFiles := []string{ + "content/docs/projects/complyctl/_index.md", + "content/docs/projects/complyscribe/_index.md", + } + + removed := cleanOrphanedFiles(dir, oldManifest, currentFiles) + + if removed != 1 { + t.Errorf("removed = %d, want 1", removed) + } + if _, err := os.Stat(staleFile); !os.IsNotExist(err) { + t.Error("stale file quick-start.md should have been removed") + } + if _, err := os.Stat(keptFile); err != nil { + t.Error("kept file _index.md should still exist") + } + if _, err := os.Stat(otherFile); err != nil { + t.Error("other repo file should still exist") + } +} + +func TestCleanOrphanedFiles_PrunesEmptyDirs(t *testing.T) { + dir := t.TempDir() + + staleDir := filepath.Join(dir, "content", "docs", "projects", "removed-repo") + staleFile := filepath.Join(staleDir, "_index.md") + os.MkdirAll(staleDir, 0o755) + os.WriteFile(staleFile, []byte("test"), 0o644) + + oldManifest := map[string]bool{ + "content/docs/projects/removed-repo/_index.md": true, + } + + removed := cleanOrphanedFiles(dir, oldManifest, nil) + + if removed != 1 { + t.Errorf("removed = %d, want 1", removed) + } + if _, err := os.Stat(staleDir); !os.IsNotExist(err) { + t.Error("empty directory should have been pruned") + } +} + +func TestCleanOrphanedFiles_TraversalBlocked(t *testing.T) { + dir := t.TempDir() + + outsideDir := t.TempDir() + outsideFile := filepath.Join(outsideDir, "should-survive.txt") + os.WriteFile(outsideFile, []byte("protected"), 0o644) + + relTraversal, err := filepath.Rel(dir, outsideFile) + if err != nil { + t.Fatalf("could not compute relative path: %v", err) + } + + oldManifest := map[string]bool{ + relTraversal: true, + } + + removed := cleanOrphanedFiles(dir, oldManifest, nil) + + if removed != 0 { + t.Errorf("removed = %d, want 0 (traversal should be blocked)", removed) + } + if _, err := os.Stat(outsideFile); err != nil { + t.Errorf("file outside output dir was deleted: %v", err) + } +} + 
+func TestCleanOrphanedFiles_LegitimateRemoval(t *testing.T) { + dir := t.TempDir() + + legitFile := filepath.Join(dir, "content", "docs", "projects", "old-repo", "_index.md") + os.MkdirAll(filepath.Dir(legitFile), 0o755) + os.WriteFile(legitFile, []byte("stale"), 0o644) + + oldManifest := map[string]bool{ + "content/docs/projects/old-repo/_index.md": true, + } + + removed := cleanOrphanedFiles(dir, oldManifest, nil) + + if removed != 1 { + t.Errorf("removed = %d, want 1 (legitimate orphan should be cleaned)", removed) + } + if _, err := os.Stat(legitFile); !os.IsNotExist(err) { + t.Error("legitimate orphan should have been removed") + } +} diff --git a/cmd/sync-content/hugo.go b/cmd/sync-content/hugo.go new file mode 100644 index 0000000..5d33a31 --- /dev/null +++ b/cmd/sync-content/hugo.go @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: Apache-2.0 + +package main + +import ( + "fmt" + "path/filepath" + "strings" +) + +// ProjectCard is the structure written to data/projects.json for landing page templates. +type ProjectCard struct { + Name string `json:"name"` + Language string `json:"language"` + Type string `json:"type"` + Description string `json:"description"` + URL string `json:"url"` + Repo string `json:"repo"` + Stars int `json:"stars"` +} + +// deriveProjectType infers a human-readable type label from repo topics and description. 
+func deriveProjectType(r Repo) string {
+	topics := make(map[string]bool, len(r.Topics))
+	for _, t := range r.Topics {
+		topics[strings.ToLower(t)] = true
+	}
+	desc := strings.ToLower(r.Description)
+
+	// First matching case wins, so order encodes priority: CLI > Automation >
+	// Observability > Framework, with "Library" as the fallback label.
+	switch {
+	case topics["cli"] || strings.Contains(desc, "command-line") || strings.Contains(desc, " cli"):
+		return "CLI Tool"
+	// "automat" already matches "automation", "automated", "automates", etc.,
+	// so a separate Contains(desc, "automation") check would be redundant.
+	case topics["automation"] || strings.Contains(desc, "automat"):
+		return "Automation"
+	case topics["observability"] || strings.Contains(desc, "observability") || strings.Contains(desc, "collector"):
+		return "Observability"
+	case topics["framework"] || strings.Contains(desc, "framework") || strings.Contains(desc, "bridging"):
+		return "Framework"
+	default:
+		return "Library"
+	}
+}
+
+// buildSectionIndex generates a lightweight Hugo section index (_index.md) for a
+// project. Contains only frontmatter metadata so the Doks sidebar renders the
+// section heading as a collapsible toggle with child pages listed underneath.
+func buildSectionIndex(repo Repo, sha, readmeSHA string) string { + lang := languageOrDefault(repo.Language) + title := formatRepoTitle(repo.Name) + + var b strings.Builder + b.WriteString("---\n") + fmt.Fprintf(&b, "title: %q\n", title) + fmt.Fprintf(&b, "linkTitle: %q\n", repo.Name) + fmt.Fprintf(&b, "description: %q\n", repo.Description) + fmt.Fprintf(&b, "date: %s\n", repo.PushedAt) + fmt.Fprintf(&b, "lastmod: %s\n", repo.PushedAt) + b.WriteString("draft: false\n") + b.WriteString("toc: false\n") + b.WriteString("params:\n") + fmt.Fprintf(&b, " language: %q\n", lang) + fmt.Fprintf(&b, " stars: %d\n", repo.StargazersCount) + fmt.Fprintf(&b, " repo: %q\n", repo.HTMLURL) + fmt.Fprintf(&b, " source_sha: %q\n", sha) + fmt.Fprintf(&b, " readme_sha: %q\n", readmeSHA) + b.WriteString(" seo:\n") + fmt.Fprintf(&b, " title: %q\n", title+" | ComplyTime") + fmt.Fprintf(&b, " description: %q\n", repo.Description) + b.WriteString("---\n") + + return b.String() +} + +// buildOverviewPage generates the README content as a child page (overview.md) +// so it appears as a navigable sidebar link in the Doks theme. +func buildOverviewPage(repo Repo, readme string) string { + editURL := fmt.Sprintf("https://github.com/%s/edit/%s/README.md", repo.FullName, repo.DefaultBranch) + + var b strings.Builder + b.WriteString("---\n") + fmt.Fprintf(&b, "title: %q\n", "Overview") + fmt.Fprintf(&b, "description: %q\n", repo.Description) + fmt.Fprintf(&b, "date: %s\n", repo.PushedAt) + fmt.Fprintf(&b, "lastmod: %s\n", repo.PushedAt) + b.WriteString("draft: false\n") + b.WriteString("toc: true\n") + fmt.Fprintf(&b, "weight: %d\n", 1) + b.WriteString("params:\n") + fmt.Fprintf(&b, " editURL: %q\n", editURL) + b.WriteString("---\n\n") + b.WriteString(readme) + + return b.String() +} + +// knownAcronyms maps lowercase tokens to their canonical uppercase form. +// Used by smartTitle to preserve intended casing for common technical terms. 
+var knownAcronyms = map[string]string{ + "api": "API", + "apis": "APIs", + "cac": "CAC", + "ci": "CI", + "cd": "CD", + "cli": "CLI", + "cpu": "CPU", + "css": "CSS", + "dns": "DNS", + "faq": "FAQ", + "grpc": "gRPC", + "html": "HTML", + "http": "HTTP", + "https": "HTTPS", + "id": "ID", + "io": "I/O", + "ip": "IP", + "json": "JSON", + "jwt": "JWT", + "k8s": "K8s", + "oauth": "OAuth", + "openid": "OpenID", + "oscal": "OSCAL", + "rbac": "RBAC", + "rest": "REST", + "sdk": "SDK", + "sql": "SQL", + "ssh": "SSH", + "sso": "SSO", + "tcp": "TCP", + "tls": "TLS", + "toml": "TOML", + "ui": "UI", + "uri": "URI", + "url": "URL", + "uuid": "UUID", + "vm": "VM", + "xml": "XML", + "yaml": "YAML", +} + +// smartTitle capitalises the first letter of each word, but preserves +// canonical casing for known acronyms (e.g. "api" → "API", "cac" → "CAC"). +func smartTitle(words []string) string { + for i, w := range words { + if canonical, ok := knownAcronyms[strings.ToLower(w)]; ok { + words[i] = canonical + continue + } + if len(w) > 0 { + words[i] = strings.ToUpper(w[:1]) + strings.ToLower(w[1:]) + } + } + return strings.Join(words, " ") +} + +// formatRepoTitle converts a GitHub repo name (typically lowercase/kebab-case) +// into a human-readable title for Hugo frontmatter. +// E.g. "complyctl" → "Complyctl", "oscal-sdk" → "OSCAL SDK". +func formatRepoTitle(repoName string) string { + words := strings.FieldsFunc(repoName, func(r rune) bool { + return r == '-' || r == '_' + }) + return smartTitle(words) +} + +// titleFromFilename converts a Markdown filename stem to a human-readable title. +// E.g. "quick-start" → "Quick Start", "sync_cac_content" → "Sync CAC Content". 
+func titleFromFilename(name string) string {
+	name = strings.TrimSuffix(name, filepath.Ext(name))
+	name = strings.NewReplacer("-", " ", "_", " ").Replace(name)
+	words := strings.Fields(name)
+	return smartTitle(words)
+}
+
+// buildDocPage generates a Hugo doc page with auto-generated frontmatter
+// derived from the file path. The title comes from the filename, the
+// description combines the repo description with the title, and a provenance
+// comment is inserted after the frontmatter closing delimiter.
+func buildDocPage(filePath, repoFullName, repoDescription, pushedAt, branch, sha, content string) string {
+	title := titleFromFilename(filepath.Base(filePath))
+
+	// Abbreviate the commit SHA for the provenance comment; a full 40-char
+	// SHA is noisy and 12 hex chars are ample for identification.
+	shortSHA := sha
+	if len(shortSHA) > 12 {
+		shortSHA = shortSHA[:12]
+	}
+
+	editURL := fmt.Sprintf("https://github.com/%s/edit/%s/%s", repoFullName, branch, filePath)
+
+	var b strings.Builder
+	b.WriteString("---\n")
+	fmt.Fprintf(&b, "title: %q\n", title)
+	fmt.Fprintf(&b, "description: %q\n", repoDescription+" — "+title)
+	fmt.Fprintf(&b, "date: %s\n", pushedAt)
+	fmt.Fprintf(&b, "lastmod: %s\n", pushedAt)
+	b.WriteString("draft: false\n")
+	fmt.Fprintf(&b, "weight: %d\n", 10)
+	b.WriteString("params:\n")
+	fmt.Fprintf(&b, "  editURL: %q\n", editURL)
+	b.WriteString("---\n")
+	// Provenance comment after the closing frontmatter delimiter.
+	// NOTE(review): the previous format string here contained four arguments
+	// but no formatting verbs ("\n\n") — a guaranteed `go vet` printf failure
+	// that would emit %!(EXTRA ...) into every generated page. The HTML
+	// comment body appears to have been stripped from the patch in transit;
+	// reconstructed with the original argument order — confirm exact wording
+	// against the upstream source.
+	fmt.Fprintf(&b, "\n<!-- Synced from %s/%s (branch %s, commit %s). Do not edit directly. -->\n\n", repoFullName, filePath, branch, shortSHA)
+	b.WriteString(content)
+
+	return b.String()
+}
+
+// buildProjectCard constructs a ProjectCard from repo metadata.
+func buildProjectCard(repo Repo) ProjectCard { + return ProjectCard{ + Name: repo.Name, + Language: languageOrDefault(repo.Language), + Type: deriveProjectType(repo), + Description: repo.Description, + URL: fmt.Sprintf("/docs/projects/%s/", repo.Name), + Repo: repo.HTMLURL, + Stars: repo.StargazersCount, + } +} diff --git a/cmd/sync-content/hugo_test.go b/cmd/sync-content/hugo_test.go new file mode 100644 index 0000000..f21f962 --- /dev/null +++ b/cmd/sync-content/hugo_test.go @@ -0,0 +1,170 @@ +// SPDX-License-Identifier: Apache-2.0 +package main + +import ( + "strings" + "testing" +) + +func TestFormatRepoTitle(t *testing.T) { + cases := []struct { + input string + want string + }{ + {"complyctl", "Complyctl"}, + {"oscal-sdk", "OSCAL SDK"}, + {"cac-content-sync", "CAC Content Sync"}, + {"my-cli-tool", "My CLI Tool"}, + {"rest-api-server", "REST API Server"}, + {"simple", "Simple"}, + {"json-yaml-converter", "JSON YAML Converter"}, + {"k8s-operator", "K8s Operator"}, + {"oauth-grpc-bridge", "OAuth gRPC Bridge"}, + } + + for _, tc := range cases { + t.Run(tc.input, func(t *testing.T) { + got := formatRepoTitle(tc.input) + if got != tc.want { + t.Errorf("formatRepoTitle(%q) = %q, want %q", tc.input, got, tc.want) + } + }) + } +} + +func TestTitleFromFilename(t *testing.T) { + cases := []struct { + input string + want string + }{ + {"quick-start.md", "Quick Start"}, + {"sync_cac_content.md", "Sync CAC Content"}, + {"api-reference.md", "API Reference"}, + {"installation.md", "Installation"}, + {"cli-usage.md", "CLI Usage"}, + {"rest-api.md", "REST API"}, + {"getting-started", "Getting Started"}, + {"CONTRIBUTING.md", "Contributing"}, + {"PLUGIN_GUIDE.md", "Plugin Guide"}, + {"RELEASE-PROCESS.md", "Release Process"}, + } + + for _, tc := range cases { + t.Run(tc.input, func(t *testing.T) { + got := titleFromFilename(tc.input) + if got != tc.want { + t.Errorf("titleFromFilename(%q) = %q, want %q", tc.input, got, tc.want) + } + }) + } +} + +func TestSmartTitle(t 
*testing.T) { + cases := []struct { + name string + input []string + want string + }{ + {"plain words", []string{"hello", "world"}, "Hello World"}, + {"acronym api", []string{"my", "api"}, "My API"}, + {"mixed case via acronym map", []string{"OAuth", "setup"}, "OAuth Setup"}, + {"already uppercase acronym", []string{"CLI"}, "CLI"}, + {"h6 cap", []string{"some", "uuid", "generator"}, "Some UUID Generator"}, + {"all caps normalised", []string{"CONTRIBUTING"}, "Contributing"}, + {"all caps multi-word", []string{"PLUGIN", "GUIDE"}, "Plugin Guide"}, + {"mixed all-caps and acronym", []string{"OSCAL", "QUICK", "START"}, "OSCAL Quick Start"}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + input := make([]string, len(tc.input)) + copy(input, tc.input) + got := smartTitle(input) + if got != tc.want { + t.Errorf("smartTitle(%v) = %q, want %q", tc.input, got, tc.want) + } + }) + } +} + +func TestBuildSectionIndex(t *testing.T) { + repo := Repo{ + Name: "oscal-sdk", + FullName: "complytime/oscal-sdk", + Description: "OSCAL SDK for Go", + Language: "Go", + StargazersCount: 10, + HTMLURL: "https://github.com/complytime/oscal-sdk", + PushedAt: "2025-06-01T00:00:00Z", + } + + result := buildSectionIndex(repo, "sha-branch", "sha-readme") + + if !strings.Contains(result, `title: "OSCAL SDK"`) { + t.Error("section index title should use formatRepoTitle (OSCAL SDK)") + } + if !strings.Contains(result, `linkTitle: "oscal-sdk"`) { + t.Error("section index should have linkTitle with raw repo name for sidebar") + } + if !strings.Contains(result, `seo:`) { + t.Error("section index should have seo params") + } + if !strings.Contains(result, `title: "OSCAL SDK | ComplyTime"`) { + t.Error("SEO title should use formatted repo title") + } + if !strings.Contains(result, "readme_sha:") { + t.Error("section index should contain readme_sha") + } +} + +func TestBuildDocPage(t *testing.T) { + content := "## Getting Started\n\nSome content here." 
+ result := buildDocPage( + "docs/api-reference.md", + "complytime/complyctl", + "A CLI tool", + "2025-06-01T00:00:00Z", + "main", + "abc123def456789", + content, + ) + + if !strings.Contains(result, `title: "API Reference"`) { + t.Error("doc page title should use titleFromFilename with acronym handling") + } + if !strings.Contains(result, `description: "A CLI tool — API Reference"`) { + t.Error("description should combine repo description with title") + } + if !strings.Contains(result, "\n") + result := string(insertAfterFrontmatter(content, insert)) + + if !strings.Contains(result, "---\n") { + t.Errorf("provenance should appear after closing ---, got:\n%s", result) + } + if !strings.Contains(result, "Body text") { + t.Error("body should be preserved") + } + }) + + t.Run("without frontmatter", func(t *testing.T) { + content := []byte("# Hello\n\nBody text") + insert := []byte("\n") + result := string(insertAfterFrontmatter(content, insert)) + + if !strings.HasPrefix(result, "") { + t.Errorf("provenance should be prepended when no frontmatter, got:\n%s", result) + } + if !strings.Contains(result, "# Hello") { + t.Error("content should be preserved") + } + }) +} diff --git a/config/_default/params.toml b/config/_default/params.toml index fcf0878..8073a31 100644 --- a/config/_default/params.toml +++ b/config/_default/params.toml @@ -44,7 +44,7 @@ mainSections = ["docs"] sectionNav = ["docs"] toTopButton = true breadcrumbTrail = true - headlineHash = true + headlineHash = false scrollSpy = true # Multilingual diff --git a/content/docs/projects/_index.md b/content/docs/projects/_index.md index ac8ede7..734877b 100644 --- a/content/docs/projects/_index.md +++ b/content/docs/projects/_index.md @@ -8,142 +8,19 @@ draft: false images: [] weight: 200 toc: true +cascade: + - sidebar: + collapsed: true + _target: + kind: section + path: "{/docs/projects/*}" --- ## Core Projects ComplyTime consists of several interconnected projects, each serving a specific purpose in the 
compliance automation workflow. -### Command Line Tools - - - -### Frameworks & Libraries - - - -### Observability & Collection - - - -### AI & Automation - - - -### Demos & Examples - - +{{< project-cards >}} ## Getting Involved diff --git a/content/docs/projects/complyctl/_index.md b/content/docs/projects/complyctl/_index.md deleted file mode 100644 index 276956f..0000000 --- a/content/docs/projects/complyctl/_index.md +++ /dev/null @@ -1,142 +0,0 @@ ---- -description: A command-line tool for streamlining end-to-end compliance workflows. -title: complyctl -weight: 10 ---- - - -# complyctl - -ComplyCTL leverages [OSCAL](https://github.com/usnistgov/OSCAL/) to perform compliance assessment activities, using plugins for each stage of the lifecycle. - -## Documentation - -:paperclip: [Installation](https://github.com/complytime/complyctl/blob/main/docs/INSTALLATION.md)\ -:paperclip: [Quick Start](https://github.com/complytime/complyctl/blob/main/docs/QUICK_START.md)\ -:paperclip: [Sample Component Definition](https://github.com/complytime/complyctl/blob/main/docs/samples/sample-component-definition.json) - -### Basic Usage - -Determine the baseline you want to run a scan for and create an OSCAL [Assessment Plan](https://pages.nist.gov/OSCAL/learn/concepts/layer/assessment/assessment-plan/). The Assessment -Plan will act as configuration to guide the complyctl generation and scanning operations. - -### `list` command - -```bash -complyctl list -... -# Table appears with options. Look at the Framework ID column. -``` - -### `info` command - -```bash -complyctl info -# Display information about a framework's controls and rules. - -complyctl info --control -# Display details about a specific control. - -complyctl info --rule -# Display details about a specific rule. - -complyctl info --parameter -# Display details about a specific parameter. -``` - -### `plan` command - -```bash -complyctl plan -... 
-# The file will be written out to assessment-plan.json in the specified workspace. -# Defaults to current working directory. - -cat complytime/assessment-plan.json -# The default assessment-plan.json will be available in the complytime workspace (complytime/assessment-plan.json). - -complyctl plan --dry-run -# See the default contents of the assessment-plan.json. -``` - -Use a scope config file to customize the assessment plan: - -```bash -complyctl plan --dry-run --out config.yml -# Customize the assessment-plan.json with the 'out' flag. Updates can be made to the config.yml. -``` - -Open the `config.yml` file in a text editor and modify the YAML as desired. The example below shows various options for including and excluding rules. - -The `selectParameters` YAML key sets parameters for the `controlId`. If you try to use a value that isn't supported, an error will occur, and the valid alternative values will be displayed. To fix this, update the `value` in the `config.yml` file, and then run the command with the `--scope-config ` flag. This will generate a new `assessment-plan.json` file with the updated values. 
- -```yaml -frameworkId: example-framework -includeControls: -- controlId: control-01 - controlTitle: Title of Control 01 - includeRules: - - "*" # all rules included by default - selectParameters: - - name: param-1-id - value: param-1-value - - name: param-2-id - value: param-2-value -- controlId: control-02 - controlTitle: Title of Control 02 - includeRules: - - "rule-02" # only rule-02 will be included for this control - waiveRules: - - "rule-01" # rule-01 will be waived for this control -- controlId: control-03 - controlTitle: Title of Control 03 - includeRules: - - "*" - selectParameters: - - name: param-1-id - value: param-1-value - - name: param-5-id - value: param-5-value # update the value with available alternatives - excludeRules: - - "rule-03" # exclude rule-03 specific rule from control-03 -globalExcludeRules: - - "rule-99" # will be excluded for all controls, this takes priority over any includeRules, waiveRules, and globalWaiveRules clauses above -globalWaiveRules: - - "rule-50" # will be waived for all controls, this takes priority over any includeRules clauses above -``` - -The edited `config.yml` can then be used with the `plan` command to customize the assessment plan. - -```bash -complyctl plan --scope-config config.yml -# The config.yml will be loaded by passing '--scope-config' to customize the assessment-plan.json. -``` - -### `generate` command - -```bash -complyctl generate -# Run the `generate` command to generate the plugin specific policy artifacts in the workspace. -``` - -### `scan` command - -```bash -complyctl scan -# Run the `scan` command to execute the PVP plugins and create results artifacts. The results will be written to assessment-results.json in the specified workspace. - -complyctl scan --with-md -# Results can also be created in Markdown format by passing the `--with-md` flag. 
-``` - -## Plugin Interaction - -plugin-interaction - -## Contributing - -:paperclip: Read the [contributing guidelines](https://github.com/complytime/complyctl/blob/main/docs/CONTRIBUTING.md)\ -:paperclip: Read the [style guide](https://github.com/complytime/complyctl/blob/main/docs/STYLE_GUIDE.md)\ -:paperclip: Read and agree to the [Code of Conduct](https://github.com/complytime/complyctl/blob/main/docs/CODE_OF_CONDUCT.md) - -*Interested in writing a plugin?* See the [plugin guide](https://github.com/complytime/complyctl/blob/main/docs/PLUGIN_GUIDE.md). diff --git a/content/docs/projects/complyctl/installation.md b/content/docs/projects/complyctl/installation.md deleted file mode 100644 index 6e0f532..0000000 --- a/content/docs/projects/complyctl/installation.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -description: Install complyctl on your system. -title: Installation -weight: 30 ---- - - -# Installation - -## Binary - -- The latest binary release can be downloaded from . -- The release signature can be verified with: - ``` - cosign verify-blob --certificate complyctl_*_checksums.txt.pem --signature complyctl_*_checksums.txt.sig complytime_*_checksums.txt --certificate-oidc-issuer=https://token.actions.githubusercontent.com --certificate-identity=https://github.com/complytime/complyctl/.github/workflows/release.yml@refs/heads/main - ``` - - -## From Source - -### Prerequisites - -- **Go** version 1.20 or higher -- **Make** (optional, for using the `Makefile` if included) -- **pandoc** (optional, for generating man pages using the `make man`) - -### Clone the repository - -```bash -git clone https://github.com/complytime/complyctl.git -cd complyctl -``` - -### Build Instructions -To compile complyctl and openscap-plugin: - -```bash -make build -``` - -The binaries can be found in the `bin/` directory in the local repo. Add it to your PATH and you are all set! 
diff --git a/content/docs/projects/complyctl/plugin-guide.md b/content/docs/projects/complyctl/plugin-guide.md deleted file mode 100644 index e8e50de..0000000 --- a/content/docs/projects/complyctl/plugin-guide.md +++ /dev/null @@ -1,99 +0,0 @@ ---- -description: Discover, install, and manage complyctl plugins. -title: Plugin Guide -weight: 40 ---- - - -# Plugin Authoring - -Complyctl can be extended to support desired policy engines (PVPs) by the use of plugins. -The plugin acts as the integration between complyctl and the PVPs native interface. -Each plugin is responsible for converting the policy content described in OSCAL into the input format expected by the PVP. -In addition, the plugin converts the raw results provided by the PVP into the schema used by complyctl to generate OSCAL output. - -Plugins communicate with complyctl via gRPC and can be authored using any preferred language. -The plugin acts as the gRPC server while the complyctl CLI acts as the client. -When a `complyctl` command is run, it invokes the appropriate method served by the plugin. - -Complyctl is built on [compliance-to-policy-go](https://github.com/oscal-compass/compliance-to-policy-go/ which provides a flexible plugin framework for leveraging OSCAL with various PVPs. For developers choosing Golang, the same SDK can be used for plugin authoring. - -## Plugin Discovery - -Complyctl performs automated plugin discovery using the compliance-to-policy-go [plugin manager](https://github.com/complytime/compliance-to-policy-go/blob/CPLYTM-272/plugin/discovery.go). -Plugins are defined using manifest files placed in the `c2p-plugins` directory. -The plugin manifest is a JSON file that provides metadata about the plugin. -Check the quick start [guide](https://github.com/complytime/complyctl/blob/main/docs/QUICK_START.md) to see an example. 
- -**Note:** the plugin manifest file must have the following syntax for automatic discovery: `c2p--manifest.json` - -### Example Plugin Manifest - -``` -{ - “id”: “myplugin”, - “description”: “my example plugin”, - “version”: “0.1”, - “type”: [“pvp”], - “executablePath”: "myplugin" // in relation to the plugin directory - “sha256”: “23f…” // sha256 of executable - "configuration": [ - { - "name": "config_name", - "description": "Config description", - "default": "default_value", - "required": true - }, - ] -} -``` - -### Directory Naming Conventions - -In order to support automated aggregation of output files from multiple plugins the following directory names are expected by complyctl : - -**Note:** The `workspace` path will be provided by complyctl via the [configuration](https://github.com/complytime/complyctl/blob/6cf2e92aff852119bba83e579e2c6d8700e4bcec/internal/complytime/plugins.go#L72) and represents the user's desired working directory for all complyctl activities. - -- `{workspace}/{plugin name}/results` # files for evidence collection -- `{workspace}/{plugin name}/remediations` # files for automated remediation - -### Plugin Selection - -Complyctl generates a mapping of plugins to validation components at runtime. -This mapping uses the `title` of the validation component to find a matching plugin with that ID (defined in manifest). - -```json -{ - ... - “uuid”: “701c7...”, - “type”: “validation, - “title”: “myplugin”, // name must match plugin ID in manifest -} -``` - -## Example - -Below shows an example template for authoring a Golang plugin. - -```go - -import "github.com/oscal-compass/compliance-to-policy-go/v2/policy" - -type PluginServer struct {} - -func (s PluginServer) Generate(p policy.Policy) error { - - // PluginServer should implement the Generate() method to provide logic for - // translating OSCAL to the PVPs expected input format. Note: this may not be - // applicable to all PVPs. 
- -} - -func (s PluginServer) GetResults(p policy.Policy) (policy.PVPResult, error) { - - // PluginServer should implement the GetResults() method to provide logic to - // collect results from the PVP for a given policy. Note: if the PVP requires input - // from Generate() then the policy input here may be ignored. - -} -``` diff --git a/content/docs/projects/complyctl/quick-start.md b/content/docs/projects/complyctl/quick-start.md deleted file mode 100644 index e729f9a..0000000 --- a/content/docs/projects/complyctl/quick-start.md +++ /dev/null @@ -1,80 +0,0 @@ ---- -description: Get up and running with complyctl in minutes. -title: Quick Start -weight: 20 ---- - - -# Quick Start - -To get started with the `complyctl` CLI, at least one plugin must be installed with a corresponding OSCAL [Component Definition](https://pages.nist.gov/OSCAL/learn/concepts/layer/implementation/component-definition/). - -> Note: Some of these steps are manual. The [quick_start.sh](https://github.com/complytime/complyctl/blob/main/scripts/quick_start/quick_start.sh) automates the process below. - -## Step 1: Install Complyctl - -See [INSTALLATION.md](https://github.com/complytime/complyctl/blob/main/docs/INSTALLATION.md) - -## Step 2: Add configuration - -After running `complyctl list` for the first time, the complytime -directory should be created under $HOME/.local/share - -```markdown -complytime -├── bundles -└── plugins -└── controls -``` - -You will need an OSCAL Component Definition that defines an OSCAL Component for your target system and an OSCAL Component the corresponding -policy validation plugin. See `docs/samples/` for example configuration for the `myplugin` plugin. - -```bash -cp docs/samples/sample-component-definition.json ~/.local/share/complytime/bundles -cp docs/samples/sample-profile.json docs/samples/sample-catalog.json ~/.local/share/complytime/controls -``` - -## Step 3: Install a plugin - -Each plugin requires a plugin manifest. 
For more information about plugin discovery see [PLUGIN_GUIDE.md](https://github.com/complytime/complyctl/blob/main/docs/PLUGIN_GUIDE.md). - -```bash -plugin_dir="$HOME/.local/share/complytime/plugins" -cp "bin/openscap-plugin" "docs/samples/c2p-openscap-manifest.json" "$plugin_dir" -checksum=$(sha256sum ~/.local/share/complytime/plugins/openscap-plugin | awk '{ print $1 }' ) -version=$(bin/complyctl version | head -n1 | awk '{ print $2 }' | sed -E 's/^v([0-9]+\.[0-9]+\.[0-9]+).*/\1/') -sed -i -e "s|checksum_placeholder|$checksum|" -e "s|version_placeholder|$version|" "$plugin_dir/c2p-openscap-manifest.json" -``` - -## Step 4: Edit plugin configuration (optional) -```bash -mkdir -p /etc/complyctl/config.d -cp ~/.local/share/complytime/plugins/c2p-openscap-manifest.json /etc/complyctl/config.d -``` - -Edit `/etc/complyctl/config.d/c2p-openscap-manifest.json` to keep only the desired changes. e.g.: -```json -{ - "configuration": [ - { - "name": "policy", - "default": "custom_tailoring_policy.xml", - }, - { - "name": "arf", - "default": "custom_arf.xml", - }, - { - "name": "results", - "default": "custom_results.xml", - } - ] -} -``` - -### Using with the openscap-plugin - -If using the openscap-plugin, there are two prerequisites: -- **openscap-scanner** package installed -- **scap-security-guide** package installed diff --git a/content/docs/projects/complyscribe/_index.md b/content/docs/projects/complyscribe/_index.md deleted file mode 100644 index eae3b11..0000000 --- a/content/docs/projects/complyscribe/_index.md +++ /dev/null @@ -1,88 +0,0 @@ ---- -description: A workflow automation tool for compliance content authoring. -title: complyscribe -weight: 10 ---- - - -# complyscribe - -ComplyScribe is a CLI tool that assists users in leveraging [Compliance-Trestle](https://github.com/oscal-compass/compliance-trestle) in CI/CD workflows for [OSCAL](https://github.com/usnistgov/OSCAL) formatted compliance content management. 
- -> WARNING: This project is currently under initial development. APIs may be changed incompatibly from one commit to another. - -## Getting Started - -### Available Commands - -The `autosync` command will sync trestle-generated Markdown files to OSCAL JSON files in a trestle workspace. All content under the provided markdown directory will be transformed when the action is run. This action supports all top-level models [supported by compliance-trestle for authoring](https://oscal-compass.github.io/compliance-trestle/tutorials/ssp_profile_catalog_authoring/ssp_profile_catalog_authoring/). - -The `rules-transform` command can be used when managing [OSCAL Component Definitions](https://pages.nist.gov/OSCAL-Reference/models/v1.1.1/component-definition/json-outline/) in a trestle workspace. The action will transform rules defined in the rules YAML view to an OSCAL Component Definition JSON file. - -The `create compdef` command can be used to create a new [OSCAL Component Definition](https://pages.nist.gov/OSCAL-Reference/models/v1.1.1/component-definition/json-outline/) in a trestle workspace. The action will create a new Component Definition JSON file and corresponding directories that contain rules YAML files and trestle-generated Markdown files. This action prepares the workspace for use with the `rules-transform` and `autosync` actions. - -The `sync-upstreams` command can be used to sync and validate upstream OSCAL content stored in a git repository to a local trestle workspace. The inputs `include_models` and `exclude_models` determine which content is synced to the trestle workspace. - -The `create ssp` command can be used to create a new [OSCAL System Security Plans](https://pages.nist.gov/OSCAL-Reference/models/v1.1.1/system-security-plan/json-outline/) (SSP) in a trestle workspace. The action will create a new SSP JSON file and corresponding directories that contain trestle-generated Markdown files. 
This action prepares the workspace for use with the `autosync` action by creating or updating the `ssp-index.json` file. The `ssp-index.json` file is used to track the relationships between the SSP and the other OSCAL content in the workspace for the `autosync` action. - -The `sync-cac-content` command supports transforming the [CaC content](https://github.com/ComplianceAsCode/content) to OSCAL models in a trestle workspace. For detailed documentation on how to use, see the [sync-cac-content.md](https://github.com/complytime/complyscribe/blob/main/docs/tutorials/sync-cac-content.md). - -The `sync-oscal-content` command supports sync OSCAL models to the [CaC content](https://github.com/ComplianceAsCode/content) in a trestle workspace. For detailed documentation on how to use, see the [sync-oscal-content.md](https://github.com/complytime/complyscribe/blob/main/docs/tutorials/sync-oscal-content.md). - - -Below is a table of the available commands and their current availability as a GitHub Action: - -| Command | Available as a GitHub Action | -|-------------------------------------------|------------------------------| -| `autosync` | ✓ | -| `rules-transform` | ✓ | -| `create compdef` | ✓ | -| `sync-upstreams` | ✓ | -| `create ssp` | | -| `sync-cac-content component-definition` | | -| `sync-cac-content profile` | | -| `sync-cac-content catalog` | | -| `sync-oscal-content component-definition` | | -| `sync-oscal-content profile` | | -| `sync-oscal-content catalog` | | - - -For detailed documentation on how to use each action, see the README.md in each folder under [actions](https://github.com/complytime/complyscribe/blob/main/actions). - - -### Supported Git Providers - -> Note: Only applicable if using `complyscribe` to create pull requests. Automatically detecting the git -provider information is supported for GitHub Actions (GitHub) and GitLab CI (GitLab). 
- -- GitHub -- GitLab - -### Run as a Container - -> Note: When running the commands in a container, all are prefixed with `complyscribe` (e.g. `complyscribe autosync`). The default entrypoint for the container is the autosync command. - -Build and run the container locally: - -```bash -podman build -f Dockerfile -t complyscribe . -podman run -v $(pwd):/data -w /data complyscribe -``` - -Container images are available in `quay.io`: - -```bash -podman run -v $(pwd):/data -w /data quay.io/continuouscompliance/complyscribe: -``` - -## Contributing - -For information about contributing to complyscribe, see the [CONTRIBUTING.md](https://github.com/complytime/complyscribe/blob/main/CONTRIBUTING.md) file. - -## License - -This project is licensed under the Apache 2.0 License - see the [LICENSE.md](https://github.com/complytime/complyscribe/blob/main/LICENSE) file for details. - -## Troubleshooting - -See [TROUBLESHOOTING.md](https://github.com/complytime/complyscribe/blob/main/TROUBLESHOOTING.md) for troubleshooting tips. diff --git a/content/docs/projects/complyscribe/troubleshooting.md b/content/docs/projects/complyscribe/troubleshooting.md deleted file mode 100644 index 17ea6b3..0000000 --- a/content/docs/projects/complyscribe/troubleshooting.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -description: Common issues and solutions for complyscribe. -title: Troubleshooting -weight: 30 ---- - - -Check [TROUBLESHOOTING.md](https://github.com/complytime/complyscribe/blob/main/TROUBLESHOOTING.md) \ No newline at end of file diff --git a/content/docs/projects/complyscribe/tutorials/authoring.md b/content/docs/projects/complyscribe/tutorials/authoring.md deleted file mode 100644 index 6919f1c..0000000 --- a/content/docs/projects/complyscribe/tutorials/authoring.md +++ /dev/null @@ -1,64 +0,0 @@ ---- -description: Tutorial on authoring compliance content with complyscribe. 
-title: Authoring Content -weight: 41 ---- - - -# Authoring Tutorial - -This tutorial provides an overview of the authoring process using `complyscribe`. We will use the component definition created in the [GitHub tutorial](https://redhatproductsecurity.github.io/complyscribe/tutorials/github/) as our starting point. This tutorial will demonstrate the workflow for updating Markdown content and syncing those changes to OSCAL. - -## 1. Prerequisites - -- Complete the [GitHub tutorial](https://complytime.github.io/complyscribe/tutorials/github/) - - -## 2. Edit in Markdown - -We will begin where we left off at the end of the [GitHub tutorial](https://redhatproductsecurity.github.io/complyscribe/tutorials/github/). Our repository has a newly created component definition named `my-first-compdef` with corresponding content in the `markdown/` and `component-definitions/` directories. We will now demonstrate how to author changes in Markdown and produce updated OSCAL content. - -1. Navigate to the `markdown/component-definitions/my-first-compdef/test-component/nist_rev5_800_53/ac` directory and select the `ac-1.md` file. -2. Click the `Edit this file` (pencil) icon. -3. Scroll down to the section titled `## What is the solution and how is it implemented?` and add a new line of text with a brief comment. For example: - -``` -## What is the solution and how is it implemented? - -Here is where details should be added by the author. -``` - -4. Click the `Commit changes..` button -5. Select the `Create a new branch for this commit and start a pull request` radio button -6. Click `Propose changes` - - -The `Open a pull request` page now opens. Enter any additional details about your changes into the description box. - -7. Click `Create pull request` -8. For demo purposes, we will go ahead and merge the pull request ourselves. In a production setting the pull request process should be used for review, discussion and approval of the proposed changes. 
Click `Merge pull request` and then `Confirm merge`. - - -## Autosync - -Once the pull request has been merged the `complyscribe rules-transform and autosync` GitHub action will be triggered. We will now validate that action was successful. - -1. Navigate to the `Actions` tab of your GitHub repository. -2. The top entry in the list of workflow runs should be titled `Merge pull request # from `. This action should be either running or have just successfully completed. -3. [Optional] Clicking this entry will allow you to view the detailed steps and log output. -4. Once the action is completed successfully, navigate back to the source code by clicking the `Code` tab of the repo. -5. Click the `component-definitions` folder and navigate to `my-first-compdef/component-definition.json`. -5. The `Last commit date` should align with the time the action completed. -6. Click the `component-definitions.json` file and then click the `History` icon to view the commit history. -7. Ensure the latest commit performed by the GitHub action reflects the changes made in Markdown as shown below: - -``` - "description": "", - "description": "Here is where details should be added by the author", -``` - -You will also notice the `"last-modified"` timestamp has been updated. - - -Congrats! You've successfully authored a change by modifying a Markdown file and letting complyscribe sync those changes back to the OSCAL content. - diff --git a/content/docs/projects/complyscribe/tutorials/github.md b/content/docs/projects/complyscribe/tutorials/github.md deleted file mode 100644 index 62a3c31..0000000 --- a/content/docs/projects/complyscribe/tutorials/github.md +++ /dev/null @@ -1,136 +0,0 @@ ---- -description: Using complyscribe with GitHub Actions. -title: GitHub Integration -weight: 42 ---- - - -# GitHub Tutorial - -This tutorial provides an introduction to using `complyscribe` with GitHub. 
We will be using a single GitHub repository for our trestle authoring workspace and executing the `complyscribe` commands as GitHub actions. Note, each repo is intended to support authoring a single OSCAL model type (SSP, component definition, etc.). If authoring more than one OSCAL model type, then a dedicated repository should be used for each model. - - -### 1. Prerequisites - -Before moving on, please ensure the following is completed: - -1. Create a new (or use an existing) empty GitHub repository -2. Clone the repo to a local workstation -3. Install complyscribe - * Option 1: Clone the [complyscribe](https://github.com/complytime/complyscribe/tree/main) repo to a local workstation and run `poetry install` - * Option 2: Use the [complyscribe container image](https://github.com/complytime/complyscribe?tab=readme-ov-file#run-as-a-container) - - -### 2. Set Permissions for GitHub Actions - -The `complyscribe` commands will be run inside of GitHub actions. These commands often perform `write` level operations against the repo contents. The GitHub workflows generated in this tutorial make use of [automatic token authentication.](https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication) To ensure this is configured correct the following repo settings need to be in place. - -*Note: If an alternative method is choosen to provide repo access, such as personal access tokens or GitHub apps, the following steps can be skipped.* - -1. Click the `Settings` tab for your GitHub repo -2. Select `Actions` -> `General` from the left-hand menu -3. Scroll down to `Workflow permissions` -4. Ensure `Read repository contents and packages permissions` is selected -5. Ensure `Allow GitHub Actions to create and approve pull requests` is checked - - -### 3. Initialize complyscribe Workspace - -The `complyscribe init` command will initialize the empty GitHub repository. 
Unlike other complyscribe commands, this command is run on the local workstation. The complyscribe commands can be installed by cloning the [complyscribe](https://github.com/complytime/complyscribe/tree/main) repo and running `poetry install`. Alternatively these commands can be run using the [complyscribe container image](https://github.com/complytime/complyscribe?tab=readme-ov-file#run-as-a-container). - -For this tutorial example, we will be authoring a component-definition. - -1a. Running complyscribe init using a locally installed complyscribe: - -``` -complyscribe init --repo-path -``` - -1b. Running complyscribe init using a complyscribe container image: - - * *Note: latest image version tag can be found in the [continuouscompliance repo on quay.io](https://quay.io/repository/continuouscompliance/complyscribe?tab=tags).* - -``` -podman run -v :/data:rw complyscribe: --oscal-model compdef --working-dir /data -``` - - * If the local workstation is in SELinux enforcing mode and a permissions error occurs, then the following command should be used instead: -``` -podman run -v :/data:Z complyscribe: --oscal-model compdef --working-dir /data -``` - - * Once the initiatization runs successfully, the following directories will be created within the local copy of the repository. - -```bash -. -├── assessment-plans -├── assessment-results -├── catalogs -├── component-definitions -├── markdown -├── plan-of-action-and-milestones -├── profiles -└── system-security-plans -``` - -2. Any catalog or profile content needed for the authoring process can now be added. - - * For this example, we will add the NIST SP 800-53 Rev. 5 catalog to our `/catalogs` directory. - -``` -mkdir catalogs/nist_rev5_800_53 -wget https://raw.githubusercontent.com/usnistgov/oscal-content/release-v1.0.5-update/nist.gov/SP800-53/rev5/json/NIST_SP-800-53_rev5_catalog.json -O catalogs/nist_rev5_800_53/catalog.json -``` - - * We will also add the NIST SP 800-53 Rev. 
5 High Baseline profile to our `profiles/` directory. - -``` -mkdir profiles/nist_rev5_800_53 -wget https://raw.githubusercontent.com/usnistgov/oscal-content/release-v1.0.5-update/nist.gov/SP800-53/rev5/json/NIST_SP-800-53_rev5_HIGH-baseline_profile.json -O profiles/nist_rev5_800_53/profile.json -``` - -3. Our `profile.json` file contains a reference to our `catalog.json` file. By default, this path is not resolvable by compliance-trestle, so we need to run the following command to update the `href` value in the JSON. - -``` -sed -i 's/NIST_SP-800-53_rev5_catalog.json/trestle:\/\/catalogs\/nist_rev5_800_53\/catalog.json/g' profiles/nist_rev5_800_53/profile.json -``` - -4. Ready-made CI/CD workflows can be copied from the `.github/workflows/` directory within the upstream `trestle-demo` repository into the local trestle workspace. These are the complyscribe actions that will run as changes are made to the repo contents. - - * Copy the required template workflows from the `trestle-demo` repository into the new workspace repository. -``` -mkdir -p .github/workflows -wget -O .github/workflows/complyscribe-rules-transform.yml https://raw.githubusercontent.com/RedHatProductSecurity/trestle-demo/refs/heads/main/.github/workflows/complyscribe-rules-transform.yml -wget -O .github/workflows/complyscribe-create-component-definition.yml https://raw.githubusercontent.com/RedHatProductSecurity/trestle-demo/refs/heads/main/.github/workflows/complyscribe-create-component-definition.yml -``` - -5. ComplyScribe initial content is now created locally within the new trestle authoring workspace. This content can now be pushed to the remote GitHub repository. -``` -git add . -git commit -m "added example NIST SP 800-53 profile and component definition authoring workflow" -git push -``` - *Note: if this is the first git push to the remote GitHub repository, then use `git push -u origin main` rather than `git push`.* - - -### 4. 
Create a New Component Definition - -Now it's time to run our first complyscribe action within GitHub! We will go ahead and create our first component definition. - -1. Open the new remote GitHub repository in a web browser. -2. Click to the `Actions` tab from the top menu. -3. Click the `ComplyScribe create component definition` action from the left-hand menu. -4. Click `Run Workflow` which will open up a dialog box. -5. Enter the following values: - -* _Name of the Trestle profile to use for the component definition:_ `nist_rev5_800_53` -* _Name of the component definition to create:_ `my-first-compdef` -* _Name of the component to create in the generated component definition:_ `test-component` -* _Type of the component (e.g. service, policy, physical, validation, etc.):_ `service` -* _Description of the component to create:_ `Testing complyscribe init` - -6. Click `Run Workflow` - -Once the workflow job has completed, there will be a new Pull Request containing the files complyscribe generated for the component definition. After reviewing the committed changes, the Pull Request can then be merged into the main branch! - -**Congratulations! We have successfully created a new complyscribe workspace and have an authoring environment!** diff --git a/content/docs/projects/complyscribe/tutorials/sync-cac-content.md b/content/docs/projects/complyscribe/tutorials/sync-cac-content.md deleted file mode 100644 index fd34d6b..0000000 --- a/content/docs/projects/complyscribe/tutorials/sync-cac-content.md +++ /dev/null @@ -1,146 +0,0 @@ ---- -description: Synchronize Compliance-as-Code content with complyscribe. -title: Sync CaC Content -weight: 43 ---- - - -# The complyscribe command line sync-cac-content Tutorial - -This tutorial provides how to use `complyscribe sync-cac-content` transform [cac-content](https://github.com/ComplianceAsCode/content) to OSCAL models. 
-This command has three sub-commands `catalog`, `profile` and `component-definition` - -> **WARNING:** There is a sequential order when transformed, first Catalog, then Profile, last Component Definition. -> Because Profile depends on Catalog, and Component Definition depends on Profile. - -## catalog - -This command is to generate OSCAL Catalog according to CaC content policy - -### 1. Prerequisites - -- Initialize the [complyscribe workspace](../tutorials/github.md#3-initialize-complyscribe-workspace) if you do not have one. - -- Clone the [cac-content repository](https://github.com/ComplianceAsCode/content). - -### 2. Run the CLI sync-cac-content catalog - -A real world example, if we want to transform [cis_rhel8](https://github.com/ComplianceAsCode/content/blob/master/controls/cis_rhel8.yml) -to OSCAL Catalog, we run command like below,`cac-policy-id` is [control file id](https://github.com/ComplianceAsCode/content/blob/master/controls/cis_rhel8.yml#L4), -`oscal-catalog` is OSCAL Catalog directory name we will use when generating the OSCAL Catalog. - -```shell -poetry run complyscribe sync-cac-content catalog \ ---dry-run \ ---repo-path $complyscribe_workspace_root_dir \ ---committer-email tester@redhat.com \ ---committer-name tester \ ---branch main \ ---cac-policy-id cis_rhel8 \ ---oscal-catalog cis_rhel8 \ ---cac-content-root $cac_content_root_dir -``` - -After successfully running above command, will generate [catalogs/cis_rhel8/catalog.json](https://github.com/ComplianceAsCode/oscal-content/blob/main/catalogs/cis_rhel8/catalog.json) - -For more details about these options and additional flags, you can use the `--help` flag: -`poetry run complyscribe sync-cac-content catalog --help` -This will display a full list of available options and their descriptions. - -After running the CLI with the right options, you would successfully generate an OSCAL Catalog under -`$complyscribe_workspace_root_dir/catalogs`. 
- - -## profile - -This command is to generate OSCAL Profile according to content policy - -### 1. Prerequisites - -- Initialize the [complyscribe workspace](../tutorials/github.md#3-initialize-complyscribe-workspace) if you do not have one. - -- Clone the [cac-content repository](https://github.com/ComplianceAsCode/content). - -### 2. Run the CLI sync-cac-content profile - -A real world example, if we want to transform [rhel8 product](https://github.com/ComplianceAsCode/content/tree/master/products/rhel8) -that using [cis_rhel8 control file](https://github.com/ComplianceAsCode/content/blob/master/controls/cis_rhel8.yml) to OSCAL Profile, -we run command like below, `product` is [product name](https://github.com/ComplianceAsCode/content/blob/master/products/rhel8/product.yml#L1), -`oscal-catalog` is OSCAL [catalog directory name](https://github.com/ComplianceAsCode/oscal-content/tree/main/catalogs/cis_rhel8), -`cac-policy-id` is [control file id](https://github.com/ComplianceAsCode/content/blob/master/controls/cis_rhel8.yml#L4) - -```shell -poetry run complyscribe sync-cac-content profile \ ---dry-run \ ---repo-path $complyscribe_workspace_root_dir \ ---committer-email tester@redhat.com \ ---committer-name tester \ ---branch main \ ---cac-content-root $cac_content_root_dir \ ---product rhel8 \ ---oscal-catalog cis_rhel8 \ ---cac-policy-id cis_rhel8 -``` - -After successfully running above command, you will generate four OSCAL -Profiles([rhel8-cis_rhel8-l1_server](https://github.com/ComplianceAsCode/oscal-content/blob/main/profiles/rhel8-cis_rhel8-l1_server/profile.json) -,[rhel8-cis_rhel8-l2_server](https://github.com/ComplianceAsCode/oscal-content/blob/main/profiles/rhel8-cis_rhel8-l2_server/profile.json), -[rhel8-cis_rhel8-l1_workstation](https://github.com/ComplianceAsCode/oscal-content/blob/main/profiles/rhel8-cis_rhel8-l1_workstation/profile.json), 
-[rhel8-cis_rhel8-l2_workstation](https://github.com/ComplianceAsCode/oscal-content/blob/main/profiles/rhel8-cis_rhel8-l2_workstation/profile.json)), -every [level](https://github.com/ComplianceAsCode/content/blob/master/controls/cis_rhel8.yml#L8) has its own Profile. - -For more details about these options and additional flags, you can use the `--help` flag: -`poetry run complyscribe sync-cac-content profile --help` -This will display a full list of available options and their descriptions. - -After running the CLI with the right options, you would successfully generate an OSCAL Profile -under `$complyscribe_workspace_root_dir/profiles/$product_$cac-policy-id_$level`. - -## component-definition - -This command creates OSCAL Component Definitions by transforming CaC content control files. - -The CLI performs the following transformations: - -- Populate CaC product information to OSCAL component title and description -- Ensure OSCAL component control mappings are populated with rule and rule parameter data from CaC control files -- Create a validation component from SSG rules to check mappings -- Ensure OSCAL Component Definition implemented requirements are populated from control notes in the control file -- Ensure implementation status of an implemented requirement in OSCAL Component Definitions are populated with the status from CaC control files - -### 1. Prerequisites - -- Initialize the [complyscribe workspace](../tutorials/github.md#3-initialize-complyscribe-workspace). - -- Clone the [cac-content repository](https://github.com/ComplianceAsCode/content). - -### 2. Run the CLI sync-cac-content component-definition - -A real world example. If we want to transform [cis_server_l1.profile](https://github.com/ComplianceAsCode/content/blob/master/products/rhel8/profiles/cis_server_l1.profile) -to an OSCAL Component Definition, we run command like below. 
`product` is [product name](https://github.com/ComplianceAsCode/content/blob/master/products/rhel8/product.yml#L1), -`cac-profile` is [CaC content profile file name](https://github.com/ComplianceAsCode/content/blob/master/products/rhel8/profiles/cis_server_l1.profile) you need transform, -`oscal-profile` is [OSCAL profile directory name](https://github.com/ComplianceAsCode/oscal-content/blob/main/profiles/rhel8-cis_rhel8-l1_server/profile.json) corresponding -to CaC content profile, `component-definition-type` is [a category describing the purpose of the component](https://pages.nist.gov/OSCAL-Reference/models/v1.1.3/component-definition/json-reference/#/component-definition/components/type). - -```shell -poetry run complyscribe sync-cac-content component-definition \ ---dry-run \ ---repo-path $complyscribe_workspace_root_dir \ ---committer-email tester@redhat.com \ ---committer-name tester \ ---branch main \ ---cac-content-root $cac_content_root_dir \ ---product rhel8 \ ---component-definition-type software \ ---oscal-profile rhel8-cis_rhel8-l1_server \ ---cac-profile cis_server_l1 -``` - -After successfully running above command, will generate an OSCAL [Component Definition](https://github.com/ComplianceAsCode/oscal-content/blob/main/component-definitions/rhel8/rhel8-cis_rhel8-l1_server/component-definition.json) - -For more details about these options and additional flags, you can use the `--help` flag: -`poetry run complyscribe sync-cac-content component-definition --help` -This will display a full list of available options and their descriptions. - -After running the CLI with the right options, you would successfully generate an OSCAL Component Definition -under $complyscribe_workspace_root_dir/component-definitions/$product_name/$OSCAL-profile-name. 
diff --git a/content/docs/projects/complyscribe/tutorials/sync-oscal-content.md b/content/docs/projects/complyscribe/tutorials/sync-oscal-content.md deleted file mode 100644 index de4f78e..0000000 --- a/content/docs/projects/complyscribe/tutorials/sync-oscal-content.md +++ /dev/null @@ -1,119 +0,0 @@ ---- -description: Synchronize OSCAL content with complyscribe. -title: Sync OSCAL Content -weight: 44 ---- - - -# The complyscribe command line sync-oscal-content Tutorial - -This tutorial explains how to use `complyscribe sync-oscal-content` to sync OSCAL models to [cac-content](https://github.com/ComplianceAsCode/content). - -Currently, this command has three sub-commands: `component-definition`, `profile`, and `catalog` - -## component-definition - -This command syncs OSCAL Component Definition information to the CaC content side. - -The CLI performs the following sync: - -- Sync OSCAL Component Definition parameters/rules changes to CaC content profile file -- Sync OSCAL Component Definition parameters/rules changes to CaC content control file -- Add a hint comment to the control file when a missing rule is found in the CaC content repo. -- Sync OSCAL Component Definition control status changes to CaC content control file. Since the status mapping between -CaC and OSCAL is a many-to-many relationship, if the status cannot be determined during sync, a comment is added to let the user -decide. Discussion details in [doc](https://github.com/complytime/complyscribe/discussions/511) -- Add a new option to the CaC var file when the variable exists but is missing the option we sync. -- Sync OSCAL Component Definition statements field to CaC control notes field - -### 1. Prerequisites - -- Initialize the [complyscribe workspace](../tutorials/github.md#3-initialize-complyscribe-workspace). - -- Clone the [cac-content repository](https://github.com/ComplianceAsCode/content). 
- -- Have an OSCAL Component Definition file (transformed from CaC content using the `sync-cac-content component-definition` command) - -### 2. Run the CLI sync-oscal-content component-definition -```shell -poetry run complyscribe sync-oscal-content component-definition \ ---branch main \ ---cac-content-root $cac_content_root_dir \ ---committer-name tester \ ---committer-email tester@redhat.com \ ---dry-run \ ---repo-path $complyscribe_workspace_root_dir \ ---product $product-name \ ---oscal-profile $oscal-profile-name -``` - -For more details about these options and additional flags, you can use the --help flag: -`poetry run complyscribe sync-oscal-content component-definition --help` -This will display a full list of available options and their descriptions. - - -## profile - -This command syncs OSCAL Profile information to the CaC content side. - -The CLI performs the following sync: - -- Sync OSCAL Profile control level changes to CaC control files - -### 1. Prerequisites - -- Initialize the [complyscribe workspace](../tutorials/github.md#3-initialize-complyscribe-workspace). - -- Clone the [cac-content repository](https://github.com/ComplianceAsCode/content). - -- Have an OSCAL Profile file (transformed from CaC content using the `sync-cac-content profile` command) - -### 2. Run the CLI sync-oscal-content profile - -```shell -poetry run complyscribe sync-oscal-content profile \ ---dry-run \ ---repo-path $complyscribe_workspace_root_dir \ ---committer-email tester@redhat.com \ ---committer-name tester \ ---branch main \ ---cac-content-root $cac_content_root_dir \ ---cac-policy-id cis_rhel8 \ ---product rhel8 -``` - -For more details about these options and additional flags, you can use the --help flag: -`poetry run complyscribe sync-oscal-content profile --help` -This will display a full list of available options and their descriptions. - -## catalog - -This command syncs OSCAL Catalog information to the CaC content side. 
- -The CLI performs the following sync: - -- Sync OSCAL Catalog control parts field change to CaC control files control description field - -### 1. Prerequisites - -- Initialize the [complyscribe workspace](../tutorials/github.md#3-initialize-complyscribe-workspace). - -- Clone the [cac-content repository](https://github.com/ComplianceAsCode/content). - -- An OSCAL Catalog file, (transformed from CaC content using `sync-cac-content catalog` cmd) - -### 2. Run the CLI sync-oscal-content catalog -```shell -poetry run complyscribe sync-oscal-content catalog \ ---cac-policy-id nist_ocp4 \ ---cac-content-root $cac_content_root_dir \ ---repo-path $complyscribe_workspace_root_dir \ ---committer-name tester \ ---committer-email tester@redhat.com \ ---branch main \ ---dry-run -``` - -For more details about these options and additional flags, you can use the --help flag: -`poetry run complyscribe sync-oscal-content catalog --help` -This will display a full list of available options and their descriptions. \ No newline at end of file diff --git a/content/docs/projects/complytime-collector-components/_index.md b/content/docs/projects/complytime-collector-components/_index.md deleted file mode 100644 index 98c87ba..0000000 --- a/content/docs/projects/complytime-collector-components/_index.md +++ /dev/null @@ -1,151 +0,0 @@ ---- -description: OpenTelemetry-based observability toolkit for compliance evidence collection. -title: complytime-collector-components -weight: 10 ---- - - -# ComplyBeacon - -**ComplyBeacon** is an open-source observability toolkit designed to collect, normalize, and enrich compliance evidence, extending the OpenTelemetry (OTEL) standard. - -By bridging the gap between raw policy scanner output and modern logging pipelines, it provides a unified, enriched, and auditable data stream for security and compliance analysis. - ---- - -⚠️ **WARNING:** All components are under initial development and are **not** ready for production use. 
- ---- - -## The ComplyBeacon Architecture - -ComplyBeacon is a policy-driven observability toolkit composed of four main components that work together to process and enrich compliance data. - -### 1. ProofWatch - -An instrumentation library that accepts and emits pre-normalized compliance evidence as an OpenTelemetry log stream, while also instrumenting metrics for real-time observability. - -### 2. Beacon - -A custom OpenTelemetry Collector distribution that acts as the pipeline's host, receiving log records from ProofWatch and preparing them for the next stage of enrichment. - -### 3. TruthBeam - -A custom OpenTelemetry Collector processor that enriches log records with compliance and risk data by integrating with the Compass service. - -### 4. Compass - -A central enrichment service that provides risk, threat, and compliance framework attributes based on policy lookup data. - -#### Supported Compass Mappers - -| Mapper | Description | -|---------|----------------------------------------------------| -| `basic` | Maps to the `gemara` model based on log attributes | - -## Quick Start - -Before Deploying: Please read the following **NOTE**. - -⚠️ **NOTE:** -To enable evidence log synchronization to AWS S3 and Hyperproof, you must configure the following environment variables. The collector will fail to start if the S3 configuration is invalid. - -For more detailed information, please refer to the integration guide: [Sync_Evidence2Hyperproof](https://github.com/complytime/complytime-collector-components/blob/main/docs/integration/Sync_Evidence2Hyperproof.md). - -| Environment Variable | Description | -|------------------------|---------------------------------------------------------| -| `AWS_REGION` | The AWS region where your S3 bucket is hosted | -| `S3_BUCKETNAME` | The name of the target S3 bucket. 
| -| `S3_OBJ_DIR` | The folder path (prefix) for bucket subjects | -| `AWS_ACCESS_KEY_ID` | The AWS Access Key ID with permissions to the bucket | -| `AWS_SECRET_ACCESS_KEY`| The AWS Secret Access Key corresponding to the ID. | - - -If you do not wish to use the AWS S3 integration, you can disable it by modifying the configuration files: - -A. **In [hack/demo/demo-config.yaml](https://github.com/complytime/complytime-collector-components/blob/main/hack/demo/demo-config.yaml)** change the exporters line from: - -`exporters: [debug, otlphttp/logs, awss3/logs, signaltometrics]` - -to - -`exporters: [debug, otlphttp/logs, signaltometrics]` - -The `awss3/logs` configuration in `exporters` section should also be commented. - -```yaml -exporters: - debug: - verbosity: detailed - otlphttp/logs: - endpoint: "http://loki:3100/otlp" - tls: - insecure: true - # File exporter: writes metrics as JSON for filelog receiver - file/metrics: - path: /data/metrics.jsonl - format: json - awss3/logs: - s3uploader: - region: ${AWS_REGION} - s3_bucket: ${S3_BUCKETNAME} - s3_prefix: ${S3_OBJ_DIR} - s3_partition_format: "" -``` - -to - -```yaml -exporters: - debug: - verbosity: detailed - otlphttp/logs: - endpoint: "http://loki:3100/otlp" - tls: - insecure: true - # File exporter: writes metrics as JSON for filelog receiver - file/metrics: - path: /data/metrics.jsonl - format: json -# awss3/logs: -# s3uploader: -# region: ${AWS_REGION} -# s3_bucket: ${S3_BUCKETNAME} -# s3_prefix: ${S3_OBJ_DIR} -# s3_partition_format: "" -``` - -B. **Comment collector.environment part of [compose.yml](https://github.com/complytime/complytime-collector-components/blob/main/compose.yaml)** as the AWS S3 environment variables will no longer be needed. - -Once you've reviewed the **NOTE** above, follow these steps to deploy the infrastructure and test the pipeline. - -1. 
**Deploy the Stack:** - This command builds and starts the full infrastructure, including Grafana, Loki, the custom collector (`Beacon`), and the `Compass` service. - ```bash - podman-compose up --build - ``` - -2. **Test the Pipeline:** - Send sample compliance data to the webhook receiver to test the pipeline's functionality. - ```bash - curl -X POST http://localhost:8088/eventsource/receiver -H "Content-Type: application/json" -d @hack/sampledata/evidence.json - ``` - -3. **Enable grafana dashboard:** - If you want to configure loki as default datasource on grafana and enable pre-build grafana dashboard, refer to [README.md](https://github.com/complytime/complytime-collector-components/blob/main/hack/demo/terraform/README.md) - -## Project Design - -For additional details on the planned design and roadmap, see [`DESIGN.md`](https://github.com/complytime/complytime-collector-components/blob/main/docs/DESIGN.md). - -## Updating the Semantic Conventions - -Update semantic convention under `model/` - -Validate with `make weaver-check` - -Update docs and code: -`make weaver-docsgen` -`make weaver-codegen` - ---- diff --git a/content/docs/projects/complytime-collector-components/attributes/compliance.md b/content/docs/projects/complytime-collector-components/attributes/compliance.md deleted file mode 100644 index 502e5d2..0000000 --- a/content/docs/projects/complytime-collector-components/attributes/compliance.md +++ /dev/null @@ -1,68 +0,0 @@ ---- -description: Reference for compliance-related OpenTelemetry attributes. -title: Compliance Attributes -weight: 41 ---- - - - - - -# Compliance - -## Compliance Assessment Attributes - -Attributes added by compliance assessment tools to map policy results to compliance frameworks. Provides compliance context, risk assessment, and regulatory mapping for audit and reporting. Maps to GEMARA Layer 5 (Enforcement) for Policy-as-Code workflows. 
- -| Attribute | Type | Description | Examples | Stability | -|---|---|---|---|---| -| `compliance.assessment.id` | string | Unique identifier for the compliance assessment run or session. Used to group findings from the same assessment execution. | `assessment-2024-001`; `scan-run-abc123`; `compliance-check-xyz789` | ![Development](https://img.shields.io/badge/-development-blue) | -| `compliance.control.applicability` | string[] | Environments or contexts where this control applies. | `["Production", "Staging"]`; `["All Environments"]`; `["Kubernetes", "AWS"]` | ![Development](https://img.shields.io/badge/-development-blue) | -| `compliance.control.catalog.id` | string | Unique identifier for the security control catalog or framework. | `OSPS-B`; `CCC`; `CIS` | ![Development](https://img.shields.io/badge/-development-blue) | -| `compliance.control.category` | string | Category or family that the security control belongs to. | `Access Control`; `Quality` | ![Development](https://img.shields.io/badge/-development-blue) | -| `compliance.control.id` | string | Unique identifier for the security control and assessment requirement being assessed. | `OSPS-QA-07.01` | ![Development](https://img.shields.io/badge/-development-blue) | -| `compliance.enrichment.status` | string | Result of the compliance framework mapping and enrichment process, indicating whether compliance context was successfully added to the event. | `Success`; `Unmapped`; `Partial` | ![Development](https://img.shields.io/badge/-development-blue) | -| `compliance.frameworks` | string[] | Regulatory or industry standards being evaluated for compliance. | `["NIST-800-53", "ISO-27001"]` | ![Development](https://img.shields.io/badge/-development-blue) | -| `compliance.remediation.action` | string | Remediation action determined by the policy engine in response to the compliance assessment result. 
| `Block`; `Allow`; `Remediate` | ![Development](https://img.shields.io/badge/-development-blue) | -| `compliance.remediation.description` | string | Description of the recommended remediation strategy for this control. | `This is a short description of the remediation strategy for this control.` | ![Development](https://img.shields.io/badge/-development-blue) | -| `compliance.remediation.exception.active` | boolean | Whether the exception is active for this enforcement. | `true`; `false` | ![Development](https://img.shields.io/badge/-development-blue) | -| `compliance.remediation.exception.id` | string | Unique identifier for the approved exception, if applicable. | `EX-2025-10-001`; `WAIVE-AC-1-001` | ![Development](https://img.shields.io/badge/-development-blue) | -| `compliance.remediation.status` | string | Outcome of the remediation action execution, indicating whether the remediation was successfully applied. | `Success`; `Fail`; `Skipped` | ![Development](https://img.shields.io/badge/-development-blue) | -| `compliance.requirements` | string[] | Compliance requirement identifiers from the frameworks impacted. | `["AC-1", "A.9.1.1"]` | ![Development](https://img.shields.io/badge/-development-blue) | -| `compliance.risk.level` | string | Severity classification of the risk posed by non-compliance with the control requirement. | `Critical`; `High`; `Medium` | ![Development](https://img.shields.io/badge/-development-blue) | -| `compliance.status` | string | Overall compliance determination for the assessed resource or control, indicating whether it meets the compliance requirements. | `Compliant`; `Non-Compliant`; `Exempt` | ![Development](https://img.shields.io/badge/-development-blue) | - ---- - -`compliance.enrichment.status` has the following list of well-known values. If one of them applies, then the respective value MUST be used; otherwise, a custom value MAY be used. 
- -| Value | Description | Stability | -|---|---|---| - ---- - -`compliance.remediation.action` has the following list of well-known values. If one of them applies, then the respective value MUST be used; otherwise, a custom value MAY be used. - -| Value | Description | Stability | -|---|---|---| - ---- - -`compliance.remediation.status` has the following list of well-known values. If one of them applies, then the respective value MUST be used; otherwise, a custom value MAY be used. - -| Value | Description | Stability | -|---|---|---| - ---- - -`compliance.risk.level` has the following list of well-known values. If one of them applies, then the respective value MUST be used; otherwise, a custom value MAY be used. - -| Value | Description | Stability | -|---|---|---| - ---- - -`compliance.status` has the following list of well-known values. If one of them applies, then the respective value MUST be used; otherwise, a custom value MAY be used. - -| Value | Description | Stability | -|---|---|---| diff --git a/content/docs/projects/complytime-collector-components/attributes/policy.md b/content/docs/projects/complytime-collector-components/attributes/policy.md deleted file mode 100644 index 059da03..0000000 --- a/content/docs/projects/complytime-collector-components/attributes/policy.md +++ /dev/null @@ -1,36 +0,0 @@ ---- -description: Reference for policy-related OpenTelemetry attributes. -title: Policy Attributes -weight: 42 ---- - - - - - -# Policy - -## Policy Engine Attributes - -Attributes emitted by policy engines (OPA, Gatekeeper, etc.) during policy evaluation and enforcement. Maps to GEMARA Layer 4 (Evaluation) for Policy-as-Code workflows. - -| Attribute | Type | Description | Examples | Stability | -|---|---|---|---|---| -| `policy.engine.name` | string | Name of the policy engine that performed the evaluation or enforcement action. 
| `OPA`; `Gatekeeper`; `Conftest`; `Sentinel` | ![Development](https://img.shields.io/badge/-development-blue) | -| `policy.engine.version` | string | Version of the policy engine. | `v3.14.0`; `v0.45.0`; `v1.2.3`; `v2.0.1` | ![Development](https://img.shields.io/badge/-development-blue) | -| `policy.evaluation.message` | string | Additional context about the policy evaluation result. | `The policy evaluation failed due to a missing attribute.` | ![Development](https://img.shields.io/badge/-development-blue) | -| `policy.evaluation.result` | string | Outcome of the policy rule evaluation, indicating the result of the policy check. | `Not Run`; `Passed`; `Failed` | ![Development](https://img.shields.io/badge/-development-blue) | -| `policy.rule.id` | string | Unique identifier for the policy rule being evaluated or enforced. | `deny-root-user`; `require-encryption`; `check-labels` | ![Development](https://img.shields.io/badge/-development-blue) | -| `policy.rule.name` | string | Human-readable name of the policy rule. | `Deny Root User`; `Require Encryption`; `Check Resource Labels` | ![Development](https://img.shields.io/badge/-development-blue) | -| `policy.rule.uri` | string | Source control URL and version of the policy-as-code file for auditability. | `github.com/org/policy-repo/b8a7c2e`; `gitlab.com/company/policies@v1.2.3` | ![Development](https://img.shields.io/badge/-development-blue) | -| `policy.target.environment` | string | Environment where the target resource or entity exists. | `production`; `staging`; `development` | ![Development](https://img.shields.io/badge/-development-blue) | -| `policy.target.id` | string | Unique identifier for the resource or entity being evaluated or enforced against. | `deployment-123`; `resource-456`; `user-789` | ![Development](https://img.shields.io/badge/-development-blue) | -| `policy.target.name` | string | Human-readable name of the resource or entity being evaluated or enforced against. 
| `frontend-deployment`; `s3-bucket-secrets`; `admin-user` | ![Development](https://img.shields.io/badge/-development-blue) | -| `policy.target.type` | string | Type of the resource or entity being evaluated or enforced against. | `deployment`; `resource`; `user`; `configuration` | ![Development](https://img.shields.io/badge/-development-blue) | - ---- - -`policy.evaluation.result` has the following list of well-known values. If one of them applies, then the respective value MUST be used; otherwise, a custom value MAY be used. - -| Value | Description | Stability | -|---|---|---| diff --git a/content/docs/projects/complytime-collector-components/design.md b/content/docs/projects/complytime-collector-components/design.md deleted file mode 100644 index 268b9b3..0000000 --- a/content/docs/projects/complytime-collector-components/design.md +++ /dev/null @@ -1,152 +0,0 @@ ---- -description: Architecture and design decisions for Collector Components. -title: Design -weight: 20 ---- - - -# ComplyBeacon Design Documentation - -## Key Features - -- **OpenTelemetry Native**: Built on the OpenTelemetry standard for seamless integration with existing observability pipelines. -- **Automated Enrichment**: Enriches raw evidence with risk scores, threat mappings, and regulatory requirements via the Compass service. -- **Composability**: Components are designed as a toolkit; they are not required to be used together, and users can compose their own pipelines. -- **Compliance-as-Code**: Leverages the `gemara` model for a robust, auditable, and automated approach to risk assessment. - -## Architecture Overview - -### Design Principles - -* **Modularity:** The system is composed of small, focused, and interchangeable services. - -* **Standardization:** The architecture is built on OpenTelemetry to ensure broad compatibility and interoperability. 
- -* **Operational Experience:** The toolkit is built for easy deployment, configuration, and maintenance using familiar cloud-native practices and protocols. - - -### Data Flow - -The ComplyBeacon architecture is centered around a unified enrichment pipeline that processes and enriches compliance evidence. The primary data flow begins with a source that generates OpenTelemetry-compliant logs. - -1. **Log Ingestion**: A source generates compliance evidence and sends it as a structured log record to the `Beacon` collector, typically using `ProofWatch` to handle the emission. This can also be done by an OpenTelemetry collector agent. -2. **Enrichment Request**: The log record is received by the `Beacon` collector and forwarded to the `truthbeam` processor. `truthbeam` extracts key attributes from the record and sends an enrichment request to the `Compass` API. -3. **Enrichment Lookup**: The `Compass` service performs a lookup based on the provided attributes and returns a response containing compliance-related context (e.g., impacted baselines, requirements, and risk). -4. **Attribute Injection**: `truthbeam` adds these new attributes from `Compass` to the original log record. -5. **Export**: The now-enriched log record is exported from the `Beacon` collector to a final destination (e.g., a SIEM, logging backend, or data lake) for analysis and correlation. 
- -``` -┌───────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ -│ │ -│ │ -│ ┌─────────────────────────┐ │ -│ │ │ │ -│ │ Beacon Collector Distro │ │ -│ ┌────────────────────┐ ┌───────────────────┐ │ │ │ -│ │ │ │ │ ├─────────────────────────┤ │ -│ │ ├───┤ ProofWatch ├───┼────┐ │ │ -│ │ │ │ │ │ │ │ │ -│ │ Policy Log │ └───────────────────┘ │ ┌┴─────────────────┐ │ │ -│ │ Source App │ │ │ │ │ │ -│ │ │ │ │ OTLP │ │ │ -│ │ │ │ │ Reciever │ │ │ -│ │ │ ┌────────────────────────┼───┤ │ │ │ -│ └────────────────────┘ │ │ └────────┬─────────┘ │ ┌─────────────┐ │ -│ │ │ │ │ │ │ │ -│ │ │ ┌────────┴─────────┐ │ │ │ │ -│ │ │ │ │ │ │ Compass API │ │ -│ │ │ │ TruthBeam │──┼──────────────►│ │ │ -│ ┌───────────────────────┴───┐ │ │ Processor │ │ │ │ │ -│ │ │ │ │ │ │ └─────────────┘ │ -│ │ │ │ └────────┬─────────┘ │ │ -│ │ OpenTelemetry │ │ │ │ │ -│ │ Collector Agent │ │ ┌────────┴─────────┐ │ │ -│ │ │ │ │ Exporter │ │ │ -│ │ │ │ │ (e.g. Loki, │ │ │ -│ │ │ │ │ Splunk, AWSS3) │ │ │ -│ │ │ │ └──────────────────┘ │ │ -│ │ │ └─────────────────────────┘ │ -│ └───────────────────────────┘ │ -│ │ -└───────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ -``` - -### Deployment Patterns - -ComplyBeacon is designed to be a flexible toolkit. Its components can be used in different combinations to fit a variety of operational needs. - -* **Full Pipeline**: The most common use case where `ProofWatch` emits events to the `Beacon` collector, which in turn uses `TruthBeam` and `Compass` to enrich and export logs to a final destination. -* **Integrating `TruthBeam`**: `TruthBeam` can be included in an existing OpenTelemetry Collector distribution, allowing you to add enrichment capabilities to your current observability pipeline. 
-* **Standalone `Compass`**: The `Compass` service can be deployed as an independent API, enabling it to be called by any application or a different enrichment processor within an existing OpenTelemetry or custom logging pipeline. - -## Component Analysis - -### 1. ProofWatch - -**Purpose**: An instrumentation library for collecting and emitting compliance evidence as OpenTelemetry log streams. It provides a standardized interface for tracking policy evaluation events and compliance evidence in real-time. - -**Key Responsibilities**: -* Converts compliance evidence data into standardized OpenTelemetry log records. -* Emits log records to the OpenTelemetry Collector using the OTLP (OpenTelemetry Protocol). -* Provides metrics and tracing for evidence collection and processing. - -`proofwatch` attributes defined [here](https://github.com/complytime/complytime-collector-components/blob/main/docs/attributes) - -_Example code snippet_ -```go -import ( - "context" - "log" - - "go.opentelemetry.io/otel/log" - "github.com/complytime/complybeacon/proofwatch" -) - -// Create a new ProofWatch instance -pw, err := proofwatch.NewProofWatch() -if err != nil { - log.Fatal(err) -} - -// Create evidence (example with GemaraEvidence) -evidence := proofwatch.GemaraEvidence{ - // ... populate evidence fields -} - -// Log evidence with default severity -err = pw.Log(ctx, evidence) -if err != nil { - return fmt.Errorf("error logging evidence: %w", err) -} - -// Or log with specific severity -err = pw.LogWithSeverity(ctx, evidence, olog.SeverityWarn) -``` - -### 2. Beacon Collector Distro - -**Purpose**: A minimal OpenTelemetry Collector distribution that acts as the runtime environment for the `complybeacon` evidence pipeline, specifically by hosting the `truthbeam` processor. - -**Key Responsibilities**: -* Receiving log records from sources like `proofwatch` -* Running the `truthbeam` log processor on each log record. -* Exporting the processed, enriched logs to a configured backend. 
- -### 3. TruthBeam - -**Purpose**: To enrich log records with compliance-related context by querying the `compass` service. This is the core logic that transforms a simple policy check into an actionable compliance event. - -**Key Responsibilities**: -* Maintains a local, in-memory cache of previously enriched data to reduce API calls and improve performance. -* Queries the Compass API for enrichment data based on attributes in the log record. -* Skips enrichment on API failures, tagging the log record with an enrichment_status: skipped attribute to enable graceful degradation. -* Adds the returned enrichment data as new attributes to the log record. - -### 4. Compass - -**Purpose**: A centralized lookup service that provides compliance context. It's the source of truth for mapping policies to standards and risk attributes. - -**Key Responsibilities**: -* Receiving an EnrichmentRequest from `truthbeam`. -* Performing a lookup based on the policy details. -* Returning an EnrichmentResponse with compliance attributes. diff --git a/content/docs/projects/complytime-collector-components/development.md b/content/docs/projects/complytime-collector-components/development.md deleted file mode 100644 index 14ca904..0000000 --- a/content/docs/projects/complytime-collector-components/development.md +++ /dev/null @@ -1,399 +0,0 @@ ---- -description: Developer guide for building and extending Collector Components. -title: Development -weight: 30 ---- - - -# ComplyBeacon Development Guide - -This guide provides comprehensive instructions for setting up, building, and testing the ComplyBeacon project. -It complements the [DESIGN.md](https://github.com/complytime/complytime-collector-components/blob/main/docs/DESIGN.md) document by focusing on the practical aspects of development. 
- - -* [ComplyBeacon Development Guide](#complybeacon-development-guide) - * [Prerequisites](#prerequisites) - * [Required Software](#required-software) - * [Development Environment Setup](#development-environment-setup) - * [1. Clone the Repository](#1-clone-the-repository) - * [2. Install podman-compose (if needed)](#2-install-podman-compose-if-needed) - * [3. Initialize Go Workspace](#3-initialize-go-workspace) - * [4. Install Dependencies](#4-install-dependencies) - * [5. Verify Installation](#5-verify-installation) - * [Project Structure](#project-structure) - * [Testing](#testing) - * [Running Tests](#running-tests) - * [Integration Testing](#integration-testing) - * [Component Development](#component-development) - * [1. ProofWatch Development](#1-proofwatch-development) - * [2. Compass Development](#2-compass-development) - * [3. TruthBeam Development](#3-truthbeam-development) - * [4. Beacon Distro Development](#4-beacon-distro-development) - * [Debugging and Troubleshooting](#debugging-and-troubleshooting) - * [Debugging Tools](#debugging-tools) - * [Code Generation](#code-generation) - * [1. API Code Generation](#1-api-code-generation) - * [2. OpenTelemetry Semantic Conventions](#2-opentelemetry-semantic-conventions) - * [3. Manual Code Generation](#3-manual-code-generation) - * [Deployment and Demo](#deployment-and-demo) - * [Local Development Demo](#local-development-demo) - * [Additional Resources](#additional-resources) - - -## Prerequisites - -### Required Software - -- **Go 1.24+**: The project uses Go 1.24.0 with toolchain 1.24.5 -- **Podman**: For containerized development and deployment -- **podman-compose**: For orchestrating multi-container development environments -- **Make**: For build automation -- **Git**: For version control -- **openssl** Cryptography toolkit - -## Development Environment Setup - -### 1. Clone the Repository - -```bash -git clone https://github.com/complytime/complybeacon.git -cd complybeacon -``` - -### 2. 
Install podman-compose (if needed) - -The project uses `podman-compose` for container orchestration. Install it if you don't have it: - -```bash -# Install podman-compose -pip install podman-compose - -# alternatively for Fedora: -dnf install podman-compose - -# Verify installation -podman-compose --version -``` - -### 3. Initialize Go Workspace - -The project uses Go workspaces to manage multiple modules: - -```bash -make workspace -``` - -This creates a `go.work` file that includes all project modules: -- `./compass` -- `./proofwatch` -- `./truthbeam` - -### 4. Install Dependencies - -Dependencies are managed per module. Install them for all modules: - -```bash -# Install dependencies for all modules -for module in compass proofwatch truthbeam; do - cd $module && go mod download && cd .. -done -``` - -### 5. Verify Installation - -```bash -# Run tests to verify everything works -make test - -# Build all binaries -make build -``` - -## Project Structure - -``` -complybeacon/ -├── api.yaml # OpenAPI specification for Compass service -├── compose.yaml # podman-compose configuration for demo environment -├── Makefile # Build automation -├── docs/ # Documentation -│ ├── DESIGN.md # Architecture and design documentation -│ ├── DEVELOPMENT.md # This file -│ └── attributes/ # Attribute documentation -├── model/ # OpenTelemetry semantic conventions -│ ├── attributes.yaml # Attribute definitions -│ └── entities.yaml # Entity definitions -├── compass/ # Compass service module -│ ├── cmd/compass/ # Main application -│ ├── api/ # Generated API code -│ ├── mapper/ # Enrichment mappers -│ └── service/ # Business logic -├── proofwatch/ # ProofWatch instrumentation library -│ ├── attributes.go # Attribute definitions -│ ├── evidence.go # Evidence types -│ └── proofwatch.go # Main library -├── truthbeam/ # TruthBeam processor module -│ ├── internal/ # Internal packages -│ ├── config.go # Configuration -│ └── processor.go # Main processor logic -├── beacon-distro/ # OpenTelemetry 
Collector distribution -│ ├── config.yaml # Collector configuration -│ └── Containerfile.collector # Container definition -├── hack/ # Development utilities -│ ├── demo/ # Demo configurations -│ ├── sampledata/ # Sample data for testing -│ └── self-signed-cert/ # self signed cert, testing/development purpose -└── bin/ # Built binaries (created by make build) -``` - -## Testing - -### Running Tests - -```bash -# Run all tests -make test - -# Run tests for specific module -cd compass && go test -v ./... -cd proofwatch && go test -v ./... -cd truthbeam && go test -v ./... -``` - -### Integration Testing - -The project includes integration tests using the demo environment: - -```bash -# Start the demo environment -make deploy - -# Test the pipeline -curl -X POST http://localhost:8088/eventsource/receiver \ - -H "Content-Type: application/json" \ - -d @hack/sampledata/evidence.json - -# Check logs in Grafana at http://localhost:3000 -# Check Compass API at http://localhost:8081/v1/enrich -``` - -## Component Development - -### 1. ProofWatch Development - -ProofWatch is an instrumentation library for emitting compliance evidence. - -**Key Files:** -- `proofwatch/proofwatch.go` - Main library interface -- `proofwatch/evidence.go` - Evidence type definition -- `proofwatch/attributes.go` - OpenTelemetry attributes - -**Development Workflow:** -```bash -cd proofwatch - -# Run tests -go test -v ./... - -# Check for linting issues -go vet ./... - -# Format code -go fmt ./... -``` - -### 2. Compass Development - -Compass is the enrichment service that provides compliance context. 
- -**Key Files:** -- `compass/cmd/compass/main.go` - Service entry point -- `compass/service/service.go` - Business logic -- `compass/mapper/` - Enrichment mappers -- `api.yaml` - OpenAPI specification - -**Development Workflow:** -```bash -cd compass - -# Run the service locally -go run ./cmd/compass --config hack/demo/config.yaml --catalog hack/sampledata/osps.yaml --port 8081 --skip-tls - -# Test the API -curl -X POST http://localhost:8081/v1/metadata \ - -H "Content-Type: application/json" \ - -d '{"policy": {"policyEngineName": "OPA", "policyRuleId": "deny-root-user"}}' -``` - -**Adding New Mappers:** -1. Create a new mapper in `compass/mapper/plugins/` -2. Implement the `Mapper` interface -3. Register the mapper in the factory -4. Add configuration options - -### 3. TruthBeam Development - -TruthBeam is an OpenTelemetry Collector processor for enriching logs. - -**Key Files:** -- `truthbeam/processor.go` - Main processor logic -- `truthbeam/config.go` - Configuration structures -- `truthbeam/factory.go` - Processor factory - -**Development Workflow:** -```bash -cd truthbeam - -# Run tests -go test -v ./... - -# Test with collector (requires beacon-distro) -cd ../beacon-distro -# Modify config to use local truthbeam -# Run collector with local processor -``` - -**Local development config** - -If you want locally test the TruthBeam, remember to change the [manifest.yaml](https://github.com/complytime/complytime-collector-components/blob/main/beacon-distro/manifest.yaml) - -Add replace directive at the end of [manifest.yaml](https://github.com/complytime/complytime-collector-components/blob/main/beacon-distro/manifest.yaml), to make sure collector use your `truthbeam` code. 
Default collector will use `- gomod: github.com/complytime/complybeacon/truthbeam main` - -For example: -```yaml -replaces: - - github.com/complytime/complybeacon/truthbeam => github.com/AlexXuan233/complybeacon/truthbeam 52e4a76ea0f72a7049e73e7a5d67d988116a3892 -``` -or -```yaml -replaces: - - github.com/complytime/complybeacon/truthbeam => github.com/AlexXuan233/complybeacon/truthbeam main -``` - -### 4. Beacon Distro Development - -The Beacon distribution is a custom OpenTelemetry Collector. - -**Key Files:** -- `beacon-distro/config.yaml` - Collector configuration -- `beacon-distro/Containerfile.collector` - Container definition - -**Development Workflow:** -```bash -cd beacon-distro - -# Build the collector image -podman build -f Containerfile.collector -t complybeacon-beacon-distro:latest . - -# Test with local configuration -podman run --rm -p 4317:4317 -p 8088:8088 \ - -v $(pwd)/config.yaml:/etc/otel-collector.yaml:Z \ - complybeacon-beacon-distro:latest -``` - -## Debugging and Troubleshooting - -### Debugging Tools - -```bash -# View container logs -podman-compose logs -f compass -podman-compose logs -f collector -``` - -## Code Generation - -The project uses several code generation tools: - -### 1. API Code Generation - -Generate Go code from OpenAPI specification: - -```bash -make api-codegen -``` - -This generates: -- `compass/api/types.gen.go` - Request/response types -- `compass/api/server.gen.go` - Server interfaces - -### 2. OpenTelemetry Semantic Conventions - -Generate documentation and Go code from semantic convention models: - -```bash -# Generate documentation -make weaver-docsgen - -# Generate Go code -make weaver-codegen - -# Validate models -make weaver-check -``` - -### 3. 
Manual Code Generation - -If you modify the OpenAPI spec or semantic conventions: - -```bash -# Update API spec -vim api.yaml - -# Regenerate API code -make api-codegen - -# Update semantic conventions -vim model/attributes.yaml -vim model/entities.yaml - -# Regenerate semantic convention code -make weaver-codegen -``` - -## Deployment and Demo - -### Local Development Demo - -The demo environment uses `podman-compose` to orchestrate multiple containers. Ensure you have `podman-compose` installed before proceeding. - -1. **Generate self-signed certificate** - -Since compass and truthbeam enabled TLS by default, first we need to generate self-signed certificate for testing/development - -```shell -make generate-self-signed-cert -``` - -2. **Start the full stack:** -```bash -make deploy -``` - -3. **Test the pipeline:** -```bash -curl -X POST http://localhost:8088/eventsource/receiver \ - -H "Content-Type: application/json" \ - -d @hack/sampledata/evidence.json -``` - -4. **View results:** -- Grafana: http://localhost:3000 - -5. **Stop the stack:** -```bash -make undeploy -``` - ---- - -## Additional Resources - -- [OpenTelemetry Documentation](https://opentelemetry.io/docs/) -- [Go Documentation](https://golang.org/doc/) -- [Podman Documentation](https://docs.podman.io/) -- [Project Design Document](https://github.com/complytime/complytime-collector-components/blob/main/docs/DESIGN.md) -- [Attribute Documentation](https://github.com/complytime/complytime-collector-components/blob/main/docs/attributes) -- [Containers Guide](https://github.com/complytime/community/blob/main/CONTAINERS_GUIDE.md) - -For questions or support, please open an issue in the GitHub repository. 
diff --git a/content/docs/projects/complytime-collector-components/integration/sync-evidence-hyperproof.md b/content/docs/projects/complytime-collector-components/integration/sync-evidence-hyperproof.md deleted file mode 100644 index 67a5f4e..0000000 --- a/content/docs/projects/complytime-collector-components/integration/sync-evidence-hyperproof.md +++ /dev/null @@ -1,163 +0,0 @@ ---- -description: Integration guide for syncing compliance evidence to Hyperproof. -title: Sync Evidence to Hyperproof -weight: 50 ---- - - -# Auto-Sync Evidence to Hyperproof - -## 1. Objective and Value -The purpose of this document is to detail the architecture and workflow for automatically syncing compliance evidence into [Hyperproof](https://hyperproof.io/). This process automates the "last mile" of the compliance journey: delivering collected, enriched, and verified evidence directly into the organisation's GRC (Governance, Risk, and Compliance) platform. - ---- - -### **Business Value** -Implementing this workflow closes the loop between technical operations and compliance auditing, achieving: - -* Continuous Compliance: Transforms evidence collection from a periodic, manual scramble into a continuous, automated flow. -* Audit Readiness: Ensures evidence is instantly available to auditors and stakeholders within [Hyperproof](https://hyperproof.io/). -* End-to-End Automation: Fully automates the pipeline from code check-in (or system event) to auditor review. - ---- - -## 2. Technical Architecture & Workflow -The automation pipeline uses an event-driven architecture hosted on [AWS](https://docs.aws.amazon.com/) to bridge [Complybeacon](https://github.com/complytime/complybeacon) and [Hyperproof](https://hyperproof.io/). - - - -### **The Step-by-Step Workflow** - -| Step | Component | Action | Details | -| :--- | :--- | :--- | :--- | -| Export | Complybeacon | Output | Complybeacon completes evidence collection and exports the finalized logs. 
| -| Ingestion | AWS S3 | Secure Storage | The evidence logs are deposited into the designated S3 Bucket. | -| Trigger | S3 Event | Event-Driven | The creation of a new object in S3 automatically triggers the linked AWS Lambda Function. | -| Processing | AWS Lambda | Transformation/Push | The function executes a Python script that retrieves the Hyperproof secrets from AWS SSM, authenticates via the Hyperproof API, and pushes the evidence data. | -| Verification | AWS / Hyperproof | Validation | Inspect CloudWatch Logs for successful execution. Then, check Hyperproof to verify the evidence appears in the expected location. | - ---- - -## 3. Preparation & Prerequisites -Before configuring the automation, the following components and credentials must be provisioned. - -### **3.1 Hyperproof Configuration** - -1. **Provision API Credentials:** Create an API client within Hyperproof to allow external access. - * *Path:* `Administrator -> Setting -> API Client` -2. **Record Credentials:** Securely note the `CLIENT_ID` and `CLIENT_SECRET`. - -### **3.2 AWS Infrastructure Setup** - -#### **A. IAM & [S3 Bucket](https://docs.aws.amazon.com/s3/?icmpid=docs_homepage_featuredsvcs) (Storage)** - -1. Create S3 Bucket: Provision a new AWS S3 bucket for evidence ingestion. Note the Bucket Name. -2. [Create IAM Policy](https://docs.hyperproof.io/cm/en/integrations/hp-amazon-s3): Create an IAM Policy granting write access to this specific S3 bucket (for Complybeacon). 
- - _Example Policy snippet_ - ``` - { - "Version": "2012-10-17", - "Statement": [ - { - "Sid": "VisualEditor0", - "Effect": "Allow", - "Action": [ - "s3:GetObject" - ], - "Resource": [ - "arn:aws:s3:::sw-s3-hyperproof/*", # Update the S3 bucket name - "arn:aws:s3:::sw-s3-hyperproof" # Update the S3 bucket name - ] - }, - { - "Sid": "VisualEditor1", - "Effect": "Allow", - "Action": [ - "s3:ListAllMyBuckets", - "s3:ListBucket" - ], - "Resource": "*" - }, - { - "Sid": "VisualEditor2", - "Effect": "Allow", - "Action": "s3:PutObject", - "Resource": "arn:aws:s3:::sw-s3-hyperproof/*" # Update the S3 bucket name - } - ] - } - ``` -3. [Create IAM User](https://docs.hyperproof.io/cm/en/integrations/hp-amazon-s3): Create an IAM User (for Complybeacon), attach the policy, and generate the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. - -#### **B. [Systems Manager](https://docs.aws.amazon.com/systems-manager/?icmpid=docs_homepage_mgmtgov) (Secrets Management)** - -Create new **`SecureString`** parameters in the AWS Systems Manager (SSM) Parameter Store to securely hold the Hyperproof credentials. - -* `/hyperproof/CLIENT_ID` -* `/hyperproof/CLIENT_SECRET` - -#### **C. [Lambda Function](https://docs.aws.amazon.com/lambda/?icmpid=docs_homepage_featuredsvcs)** - -1. **Create Function:** Initialise a new AWS Lambda function (using Python runtime). -2. **Configure Triggers:** Add an S3 trigger linking it to the bucket from step **3.2 A**, configured to fire ***only*** on `s3:ObjectCreated:Put` and `s3:ObjectCreated:Post` events(Very important). -3. **Configure IAM Execution Role:** - * Attach the managed policy `AmazonS3ReadOnlyAccess` (to allow Lambda to read the evidence logs). - * Create and attach an inline policy granting `ssm:GetParameter` and `kms:Decrypt` permission to read the specific SSM parameters (`/hyperproof/CLIENT_ID`, `/hyperproof/CLIENT_SECRET`). 
- - _Example Policy snippet_ - ```json - { - "Version": "2012-10-17", - "Statement": [ - { - "Sid": "VisualEditor0", - "Effect": "Allow", - "Action": [ - "s3:GetObject" - ], - "Resource": "arn:aws:s3:::alex-hyperproof-test/*" - } - ] - } - ``` - - ```json - { - "Version": "2012-10-17", - "Statement": [ - { - "Sid": "VisualEditor0", - "Effect": "Allow", - "Action": "kms:Decrypt", - "Resource": "*" - }, - { - "Sid": "VisualEditor1", - "Effect": "Allow", - "Action": "ssm:GetParameter", - "Resource": [ - "arn:aws:ssm:eu-north-1:725106756198:parameter/hyperproof/CLIENT_ID", - "arn:aws:ssm:eu-north-1:725106756198:parameter/hyperproof/CLIENT_SECRET" - ] - } - ] - } - ``` - -4. **Dependencies & Layers:** Create and attach a Lambda Layer containing the necessary Python libraries (`requests`). -5. **Set Environment Variables:** Configure the following (for the Python script to use): - * `CLIENT_ID`: `/hyperproof/CLIENT_ID` - * `CLIENT_SECRET`: `/hyperproof/CLIENT_SECRET` -6. **Deploy Code:** Deploy the actual [sync code](https://gitlab.cee.redhat.com/product-security/continuous-compliance/SyncEvidence2Hyperproof/-/blob/main/lambda_function.py?ref_type=heads) (which reads S3, retrieves secrets from SSM, and calls the Hyperproof API) into the Lambda Function editor. -7. **Setup timeout** Go to Configuration->General configuration, increase timeout value to a bigger value, for example 10s(default is 3). - ---- - -## 4. Execution -Once all prerequisites are complete, the pipeline is activated automatically: - -1. The Complybeacon exports the evidence log. -2. The evidence log is written to the configured S3 bucket. -3. The S3 write event immediately triggers the Lambda function. -4. The Lambda function executes, pushing the evidence log to Hyperproof. 
\ No newline at end of file diff --git a/layouts/_default/_markup/render-heading.html b/layouts/_default/_markup/render-heading.html new file mode 100644 index 0000000..e6dbff4 --- /dev/null +++ b/layouts/_default/_markup/render-heading.html @@ -0,0 +1,4 @@ + + {{- .Text -}} + # + diff --git a/layouts/_partials/main/edit-page.html b/layouts/_partials/main/edit-page.html new file mode 100644 index 0000000..437af16 --- /dev/null +++ b/layouts/_partials/main/edit-page.html @@ -0,0 +1,48 @@ +{{- /* Use per-page editURL (set by sync tool for upstream content) when available, + otherwise fall back to the default docsRepo-based URL. */ -}} + +{{ $url := "" }} + +{{ with .Params.editURL }} + {{ $url = . }} +{{ else }} + {{ $parts := slice site.Params.doks.docsRepo }} + + {{ if (eq site.Params.doks.repoHost "GitHub") }} + {{ $parts = $parts | append "blob" site.Params.doks.docsRepoBranch }} + {{ else if (eq site.Params.doks.repoHost "Gitea") }} + {{ $parts = $parts | append "_edit" site.Params.doks.docsRepoBranch }} + {{ else if (eq site.Params.doks.repoHost "GitLab") }} + {{ $parts = $parts | append "-/blob" site.Params.doks.docsRepoBranch }} + {{ else if (eq site.Params.doks.repoHost "Bitbucket") }} + {{ $parts = $parts | append "src" site.Params.doks.docsRepoBranch }} + {{ else if (eq site.Params.doks.repoHost "BitbucketServer") }} + {{ $parts = $parts | append "browse" site.Params.doks.docsRepoBranch }} + {{ end }} + + {{ if isset .Site.Params "docsreposubpath" }} + {{ if not (eq site.Params.doks.docsRepoSubPath "") }} + {{ $parts = $parts | append site.Params.doks.docsRepoSubPath }} + {{ end }} + {{ end }} + + {{ $filePath := replace .File.Path "\\" "/" }} + + {{ $lang := "" }} + {{ if site.Params.doks.multilingualMode }} + {{ $lang = .Lang }} + {{ end }} + + {{ $parts = $parts | append "content" $lang $filePath }} + + {{ $url = delimit $parts "/" }} +{{ end }} + + diff --git a/layouts/home.html b/layouts/home.html index 0c08d86..4ab7c50 100644 --- a/layouts/home.html 
+++ b/layouts/home.html @@ -110,56 +110,28 @@

Our Projects

A suite of tools designed to streamline compliance workflows from code to audit.

+ {{- $colors := dict "Go" "primary" "Python" "warning" "Shell" "secondary" "TypeScript" "info" -}} + {{- $projects := site.Data.projects | default slice -}} + {{- if $projects }} + {{- $projects = sort $projects "stars" "desc" -}} + {{- end }} diff --git a/layouts/shortcodes/project-cards.html b/layouts/shortcodes/project-cards.html new file mode 100644 index 0000000..d4b5d4b --- /dev/null +++ b/layouts/shortcodes/project-cards.html @@ -0,0 +1,31 @@ +{{- $colors := dict "Go" "primary" "Python" "warning" "Shell" "secondary" "TypeScript" "info" -}} +{{- $projects := site.Data.projects | default slice -}} +{{- if $projects }} +{{- $projects = sort $projects "stars" "desc" -}} +{{- $types := slice -}} +{{- range $projects -}} + {{- $types = $types | append .type -}} +{{- end -}} +{{- $types = $types | uniq -}} + +{{- range $type := $types }} +

{{ $type }}

+ +{{- end }} +{{- end }} From ef193f9445dfdd30f46fd77538f0a74187684850 Mon Sep 17 00:00:00 2001 From: Sonu Preetam Date: Wed, 18 Mar 2026 18:26:03 -0400 Subject: [PATCH 04/11] refactor(sync): extract sync orchestration Signed-off-by: Sonu Preetam --- cmd/sync-content/README.md | 551 +++++++++++++++++++++++ cmd/sync-content/main.go | 438 ++++++++++++++++++ cmd/sync-content/sync.go | 538 ++++++++++++++++++++++ cmd/sync-content/sync_test.go | 824 ++++++++++++++++++++++++++++++++++ go.mod | 5 + go.sum | 4 + 6 files changed, 2360 insertions(+) create mode 100644 cmd/sync-content/README.md create mode 100644 cmd/sync-content/main.go create mode 100644 cmd/sync-content/sync.go create mode 100644 cmd/sync-content/sync_test.go create mode 100644 go.mod create mode 100644 go.sum diff --git a/cmd/sync-content/README.md b/cmd/sync-content/README.md new file mode 100644 index 0000000..1dcdcb7 --- /dev/null +++ b/cmd/sync-content/README.md @@ -0,0 +1,551 @@ +# sync-content + +A Go CLI tool that pulls documentation from upstream GitHub repositories into the +ComplyTime website's Hugo content tree. It reads the org's governance registry +(`peribolos.yaml` in the `.github` repo) to determine which repositories exist, +enriches each with GitHub API metadata, generates per-project documentation pages +and landing-page card data, then layers precise config-driven file syncs on top. + +**No generated content is committed to git.** The tool runs at build time (in CI) +or on-demand (locally) to populate the site. This keeps the repository lean and +ensures documentation is always sourced from upstream. + +## How It Works + +The tool operates in **hybrid mode** with two complementary phases: + +### Phase 1: Governance-Driven Discovery (automatic) + +Fetches `peribolos.yaml` from the org's `.github` repo to get the authoritative +list of repositories, then enriches each with metadata from the GitHub REST API. +For each eligible (non-archived, non-fork) repo it: + +1. 
Fetches the README and branch HEAD SHA. +2. Generates two Hugo pages per project: + - `content/docs/projects/{repo}/_index.md` — a section index with metadata + frontmatter (title, description, dates, language, stars, SEO metadata, + source/README SHAs). Contains no body content; the Doks sidebar renders + this as a collapsible section heading. + - `content/docs/projects/{repo}/overview.md` — the README content as a + navigable child page with edit URL. +3. Normalises casing: ALL CAPS filenames (e.g. `CONTRIBUTING.md`) and headings become Title Case (`Contributing`); known acronyms (API, OSCAL, CLI, …) are preserved. +4. Shifts all Markdown headings down one level (H1→H2, H2→H3, …) so Hugo's page title is the sole H1. +5. Strips CI badge lines from the top of the README. +6. Rewrites relative Markdown links and images to absolute GitHub URLs. +7. Scans for doc pages under configurable `scan_paths` (e.g. `docs/`). +8. Builds a `ProjectCard` for the landing page. + +After processing all repos, the tool writes `data/projects.json` — an array of +`ProjectCard` objects that Hugo templates use to render the "Our Projects" section. + +### Phase 2: Config Sync (opt-in) + +Reads `sync-config.yaml` and pulls specific files with per-file transforms: + +- **Frontmatter injection** — prepend YAML frontmatter with arbitrary key-value + pairs, or replace existing frontmatter. +- **Link rewriting** — convert relative Markdown links to absolute GitHub blob + URLs and relative images to raw.githubusercontent URLs. +- **Badge stripping** — remove CI/status badge lines from the top of content. 
+ +Config sources can operate alongside or instead of the org scan per-repo: + +| `skip_org_sync` | Org scan page | Config files | ProjectCard | +|-----------------|---------------|--------------|-------------| +| `false` (default) | Generated from README | Synced as additional content | Yes | +| `true` | Suppressed | Synced as primary content | Yes | + +## Quick Start + +### Prerequisites + +- **Go 1.25+** — the sync tool is pure Go with one dependency (`gopkg.in/yaml.v3`) +- **Node.js 22+** — for the Hugo/Doks theme build (`npm ci`) +- **Hugo extended** — the static site generator +- **`GITHUB_TOKEN`** (recommended) — unauthenticated rate limit is 60 requests/hour + +### 1. Dry-run (preview without writing) + +```bash +go run ./cmd/sync-content --org complytime --config sync-config.yaml +``` + +Logs every action the tool would take but creates zero files. This is the default +mode — you must explicitly opt in to writes. + +### 2. Write mode (generate content) + +```bash +go run ./cmd/sync-content --org complytime --config sync-config.yaml --write +``` + +Produces: + +| Output | Path | +|--------|------| +| Per-repo section index | `content/docs/projects/{repo}/_index.md` | +| Per-repo README page | `content/docs/projects/{repo}/overview.md` | +| Auto-discovered doc pages | `content/docs/projects/{repo}/*.md` | +| Landing page card data | `data/projects.json` | +| Sync manifest | `.sync-manifest.json` | +| Content lockfile (with `--update-lock`) | `.content-lock.json` | + +### 3. Start Hugo + +```bash +npm run dev +``` + +Navigate to `http://localhost:1313/`. Project pages appear at `/docs/projects/`. + +### 4. Build for production + +```bash +# Local dev (fetches HEAD): +go run ./cmd/sync-content --org complytime --config sync-config.yaml --write + +# Production (fetches at approved SHAs): +go run ./cmd/sync-content --org complytime --config sync-config.yaml --lock .content-lock.json --write + +hugo --minify --gc +``` + +Output is in `public/`. 
The `--lock` flag ensures content matches the approved +SHAs in `.content-lock.json`. Omit it for local development to fetch latest HEAD. + +## CLI Reference + +| Flag | Default | Description | +|------|---------|-------------| +| `--org` | `complytime` | GitHub organization (reads `peribolos.yaml` from `{org}/.github` repo) | +| `--token` | `$GITHUB_TOKEN` | GitHub API token (or set the env var) | +| `--config` | _(none)_ | Path to `sync-config.yaml` for config-driven file syncs | +| `--write` | `false` | Apply changes to disk (without this flag, everything is a dry-run) | +| `--output` | `.` | Hugo site root directory | +| `--repo` | _(none)_ | Sync only this repo, e.g. `complytime/complyctl` | +| `--include` | _(all)_ | Comma-separated repo allowlist (empty = all eligible repos) | +| `--exclude` | _(see config)_ | Comma-separated repo names to skip; merged with `discovery.ignore_repos` in `sync-config.yaml` | +| `--workers` | `5` | Maximum concurrent repo processing goroutines | +| `--timeout` | `3m` | Overall timeout for all API operations | +| `--summary` | _(none)_ | Write a Markdown change summary to this file (for PR bodies) | +| `--lock` | _(none)_ | Path to `.content-lock.json` for content approval gating | +| `--update-lock` | `false` | Write current upstream SHAs to the lockfile (requires `--lock`) | + +## Common Tasks + +### Sync a single repository + +```bash +go run ./cmd/sync-content --repo complytime/complyctl --config sync-config.yaml --write +``` + +### Generate a change summary for PR review + +```bash +go run ./cmd/sync-content --org complytime --config sync-config.yaml --write \ + --summary sync-report.md +``` + +The summary file contains a Markdown report with new/updated/removed repos and +stats. + +### Increase concurrency for faster syncs + +```bash +go run ./cmd/sync-content --org complytime --workers 10 --write +``` + +## Configuration + +The config file `sync-config.yaml` lives at the repository root. 
It has three +sections: + +### `defaults` + +Fallback values applied to every source unless overridden. + +```yaml +defaults: + branch: main +``` + +### `discovery` + +Controls repo filtering and automatic doc page scanning. + +```yaml +discovery: + ignore_repos: + - .github # repo names to exclude from sync + - website + scan_paths: + - docs # directories to scan for .md files + ignore_files: + - CHANGELOG.md # filenames to skip during scanning + - CODE_OF_CONDUCT.md +``` + +`ignore_repos` filters repos out of the peribolos-driven list. When `scan_paths` +is set, the tool recursively lists `.md` files under each path for every eligible +repo and syncs them as doc pages at +`content/docs/projects/{repo}/{relative-path}`. Files already declared in +`sources` or listed in `ignore_files` are skipped. + +### `sources` + +Declares specific files to sync with fine-grained control. + +```yaml +sources: + - repo: complytime/complyctl + branch: main # optional, inherits from defaults + skip_org_sync: true # suppress auto-generated README page + files: + - src: README.md + dest: content/docs/projects/complyctl/_index.md + transform: + inject_frontmatter: + title: "complyctl" + description: "A compliance CLI tool." + weight: 10 + rewrite_links: true + strip_badges: true + + - src: docs/QUICK_START.md + dest: content/docs/projects/complyctl/quick-start.md + transform: + inject_frontmatter: + title: "Quick Start" + description: "Getting started with complyctl." + weight: 20 + rewrite_links: true +``` + +Each `files` entry maps one upstream file (`src`) to one local destination +(`dest`) with optional transforms. 
+ +## Architecture + +### Data Flow + +``` +GitHub REST API + │ + ├─ GET /repos/{org}/.github/contents/peribolos.yaml → governance registry + ├─ GET /repos/{owner}/{repo} → per-repo metadata enrichment + ├─ GET /repos/{owner}/{repo}/readme → fetch README content + SHA + ├─ GET /repos/{owner}/{repo}/branches/{branch} → branch HEAD SHA + ├─ GET /repos/{owner}/{repo}/contents/{path} → fetch config-declared files + └─ GET /repos/{owner}/{repo}/contents/{dir} → list docs/ for doc page scanning + │ + ▼ + ┌─────────────────────────────────────────────┐ + │ sync-content │ + │ │ + │ Governance Discovery ──┐ │ + │ • read peribolos.yaml│ │ + │ • enrich via API ├─→ Project Pages │ + │ • fetch READMEs │ ProjectCards │ + │ • scan doc pages │ │ + │ │ │ + │ Config Sync ───────────┤ │ + │ • fetch declared ├─→ Config Files │ + │ files │ (with transforms)│ + │ • apply transforms │ │ + │ │ │ + │ Change Detection ──────┤ │ + │ • branch SHA cache ├─→ Skip unchanged │ + │ • README blob SHA │ │ + │ • byte-level dedup │ │ + │ │ │ + │ Orphan Cleanup ────────┘ │ + │ • manifest diffing ──→ Remove stale │ + │ • empty dir pruning files │ + └─────────────────────────────────────────────┘ + │ + ▼ + Hugo Content Tree + ├─ content/docs/projects/{repo}/_index.md (section index) + ├─ content/docs/projects/{repo}/overview.md (README content) + ├─ content/docs/projects/{repo}/*.md (discovered docs) + ├─ data/projects.json + ├─ .sync-manifest.json + └─ .content-lock.json (committed, updated by --update-lock) +``` + +### Key Design Decisions + +**Dry-run by default.** The tool never writes to disk unless `--write` is passed. +This makes it safe to run in CI for validation and locally for exploration. + +**Two-tier change detection.** On each run the tool reads `source_sha` and +`readme_sha` from existing project page frontmatter. If the branch HEAD SHA +hasn't changed, all fetches for that repo are skipped (fast path). 
If the branch +moved but the README blob SHA is identical, the repo is classified as unchanged. +This minimizes API calls and disk writes. + +**Manifest-based orphan cleanup.** A `.sync-manifest.json` file tracks every file +written during a sync run. On the next run, files in the old manifest but absent +from the current run are deleted, and empty parent directories are pruned. This +handles repos being archived, renamed, or removed from the org. + +**Idempotent writes.** Before writing a file, the tool reads the existing content +and compares bytes. If identical, the write is skipped entirely. This means +running the tool twice in succession produces zero disk I/O on the second run. + +**Provenance comments.** Every synced file includes an HTML comment after the +frontmatter: + +``` + +``` + +This makes it trivial to trace any page back to its upstream source and commit. + +**Bounded concurrency with rate-limit awareness.** A worker pool (default 5, +configurable via `--workers`) processes repos concurrently. The API client retries +on HTTP 403/429 with exponential backoff, respecting `Retry-After` and +`X-RateLimit-Reset` headers. A global context timeout (default 3 minutes) prevents +runaway execution. + +**Content approval gate.** A committed `.content-lock.json` file pins each repo +to an approved branch SHA. The deploy workflow fetches content at locked SHAs — +not HEAD. A weekly check workflow (`sync-content-check.yml`) detects upstream +changes and opens a PR to update the lockfile. This prevents broken or +undesirable content from reaching production without human review. + +**Single package, single dependency.** The entire tool lives in `package main` within `cmd/sync-content/` — domain-organised source files, one third-party dependency (`gopkg.in/yaml.v3`). No separate packages, no interfaces, no abstractions beyond what the problem requires. 
+ +### Output Entities + +#### ProjectCard (`data/projects.json`) + +```json +{ + "name": "complyctl", + "language": "Go", + "type": "CLI Tool", + "description": "A compliance CLI tool for Kubernetes.", + "url": "/docs/projects/complyctl/", + "repo": "https://github.com/complytime/complyctl", + "stars": 42 +} +``` + +The `type` field is derived from repo topics and description using keyword +matching: + +| Keywords | Type | +|----------|------| +| `cli` topic, "command-line" or " cli" in description | CLI Tool | +| `automation` topic, "automat" in description | Automation | +| `observability` topic, "observability" or "collector" in description | Observability | +| `framework` topic, "framework" or "bridging" in description | Framework | +| _(default)_ | Library | + +#### Section Index Frontmatter (`_index.md`) + +```yaml +--- +title: "complyctl" +description: "A compliance CLI tool for Kubernetes." +date: 2026-03-10T18:30:00Z +lastmod: 2026-03-10T18:30:00Z +draft: false +toc: false +params: + language: "Go" + stars: 42 + repo: "https://github.com/complytime/complyctl" + source_sha: "abc123def456" + readme_sha: "def789abc012" + seo: + title: "complyctl | ComplyTime" + description: "A compliance CLI tool for Kubernetes." +--- +``` + +#### Overview Page Frontmatter (`overview.md`) + +```yaml +--- +title: "Overview" +description: "A compliance CLI tool for Kubernetes." +date: 2026-03-10T18:30:00Z +lastmod: 2026-03-10T18:30:00Z +draft: false +toc: true +weight: 1 +params: + editURL: "https://github.com/complytime/complyctl/edit/main/README.md" +--- +``` + +#### Auto-Discovered Doc Page Frontmatter + +```yaml +--- +title: "Quick Start" +description: "A compliance CLI tool for Kubernetes. 
— Quick Start" +date: 2026-03-10T18:30:00Z +lastmod: 2026-03-10T18:30:00Z +draft: false +weight: 10 +params: + editURL: "https://github.com/complytime/complyctl/edit/main/docs/quick-start.md" +--- + +``` + +### Content Transforms + +| Transform | What it does | +|-----------|-------------| +| `stripLeadingH1` | Removes the first H1 heading from the content body — the title is already captured in frontmatter, so the leading H1 would be a duplicate | +| `shiftHeadings` | Bumps every Markdown heading down one level (H1→H2, H2→H3, …) so Hugo's page title is the sole H1 | +| `titleCaseHeadings` | Applies acronym-aware Title Case to all in-page heading text (e.g. `## getting started` → `## Getting Started`, `## api reference` → `## API Reference`, `## CONTRIBUTING` → `## Contributing`); normalises ALL CAPS words while preserving known acronyms; ensures page headings and Hugo's TableOfContents match | +| `stripBadges` | Removes `[![alt](img)](link)` badge patterns from the start of content | +| `rewriteRelativeLinks` | Converts `[text](path)` to `[text](https://github.com/.../blob/main/path)` and `![alt](img)` to `![alt](https://raw.githubusercontent.com/.../img)` | +| `injectFrontmatter` | Prepends or replaces YAML frontmatter with declared key-value pairs | + +## CI/CD Integration + +### Three-Workflow Model + +The tool integrates with three GitHub Actions workflows (Constitution XV v1.3.0): + +**1. CI (`ci.yml`)** — PR validation (syncs content and builds the site to catch breakage): + +```yaml +- name: Sync content + run: go run ./cmd/sync-content --org complytime --config sync-config.yaml --lock .content-lock.json --write + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} +``` + +**2. 
Content Sync Check (`sync-content-check.yml`)** — weekly upstream detection: + +```yaml +- name: Check for upstream changes + run: go run ./cmd/sync-content --org complytime --config sync-config.yaml --lock .content-lock.json --update-lock --summary sync-summary.md +``` + +Checks upstream SHAs and creates/updates a PR with lockfile changes when content has moved. Since peribolos provides the authoritative repo list, separate discovery is unnecessary. + +**3. Deploy (`deploy-gh-pages.yml`)** — production build: + +```yaml +- name: Sync content + run: go run ./cmd/sync-content --org complytime --config sync-config.yaml --lock .content-lock.json --write + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + +- name: Build site + run: hugo --minify --gc +``` + +Upstream content changes require a reviewed PR before reaching production — no +unreviewed content is deployed. + +### Structured Outputs + +When running in GitHub Actions, the tool writes structured data to +`$GITHUB_OUTPUT` and `$GITHUB_STEP_SUMMARY`: + +**`GITHUB_OUTPUT`:** + +``` +has_changes=true +changed_count=3 +error_count=0 +``` + +**`GITHUB_STEP_SUMMARY`:** A Markdown table with new/updated/removed repos and +sync stats. + +**`--summary` flag:** Writes the same Markdown report to a file, useful for +automated PR body generation. + +### Exit Codes + +| Code | Meaning | +|------|---------| +| 0 | Success (all repos synced or dry-run complete) | +| 1 | One or more errors occurred (API failures, write errors) | + +## Testing + +Tests are split across 10 `*_test.go` files that mirror the source files. A +shared `helpers_test.go` provides common utilities. + +```bash +# Run all tests +go test ./cmd/sync-content/... + +# Run with race detector +go test -race ./cmd/sync-content/... + +# Run with verbose output +go test -v ./cmd/sync-content/... 
+``` + +### Test Coverage + +| Category | What's tested | +|----------|---------------| +| Config loading | Valid YAML, malformed YAML, missing file, default values, missing required fields | +| Frontmatter injection | Prepend to bare content, replace existing frontmatter, empty content | +| Badge stripping | Line-start badges removed, inline badges preserved, no-badge passthrough | +| Heading shifting | All headings bumped down one level (H1→H2, H2→H3, …) so Hugo page title is the sole H1 | +| Heading casing | ALL CAPS normalised to Title Case, acronyms preserved, mixed-case normalised, multi-word headings | +| Title from filename | ALL CAPS filenames (`CONTRIBUTING.md` → `Contributing`), hyphen/underscore splitting, acronym preservation | +| Link rewriting | Relative to absolute, images to raw URLs, absolute URLs unchanged, anchors unchanged, `./` prefix | +| Repo name validation | Valid names, empty, `.`, `..`, path separators | +| `processRepo` integration | Mock API server, project page written with correct frontmatter, headings shifted, README SHA recorded | +| Branch-unchanged fast path | No README fetch when branch SHA matches, manifest carry-forward | +| Branch-changed README-unchanged | Two-tier detection classifies as unchanged | +| `syncConfigSource` | All transforms applied, provenance comment inserted, dry-run writes nothing | +| Doc page scanning | Auto-syncs `docs/*.md`, skips config-tracked files, generates section indexes | +| Manifest round-trip | Write and read manifest, orphan cleanup, empty directory pruning | +| Concurrent access | Race-safe `syncResult` mutations, concurrent `recordFile` | +| Peribolos integration | Governance registry fetch, repo validation, missing org handling | + +All integration tests use `net/http/httptest` to mock the GitHub API. No real API +calls are made during testing. 
+ +## File Inventory + +``` +cmd/sync-content/ +├── main.go # Entry point and orchestration (~440 lines) +├── config.go # Config types and loading +├── github.go # GitHub API client and types +├── transform.go # Markdown transforms (links, badges, frontmatter) +├── hugo.go # Hugo page and card generation +├── sync.go # Sync logic, result tracking, repo processing +├── manifest.go # Manifest I/O and state tracking +├── cleanup.go # Orphan and stale content removal +├── path.go # Path validation utilities +├── lock.go # Content lockfile read/write/query +├── *_test.go # Tests mirror source files (10 files) +└── README.md # This file + +sync-config.yaml # Declarative sync config (repo root) +.content-lock.json # Approved upstream SHAs per repo (committed) +go.mod # Go module: github.com/complytime/website +go.sum # Dependency checksums +``` + +### Generated Files (gitignored, not committed) + +``` +content/docs/projects/{repo}/_index.md # Section index (metadata only) +content/docs/projects/{repo}/overview.md # README content page +content/docs/projects/{repo}/*.md # Auto-discovered doc pages +data/projects.json # Landing page card data +.sync-manifest.json # Orphan tracking manifest +``` + +## License + +SPDX-License-Identifier: Apache-2.0 diff --git a/cmd/sync-content/main.go b/cmd/sync-content/main.go new file mode 100644 index 0000000..41e7334 --- /dev/null +++ b/cmd/sync-content/main.go @@ -0,0 +1,438 @@ +// SPDX-License-Identifier: Apache-2.0 + +// Command sync-content pulls documentation from upstream GitHub repositories +// into the website's Hugo content tree. It operates in hybrid mode: +// +// 1. Governance-driven discovery — reads the org's peribolos.yaml governance +// registry to determine which repos exist, then enriches each with GitHub +// API metadata. Generates project pages and landing-page cards. +// 2. Config sync — reads sync-config.yaml and pulls specific files with +// transforms (frontmatter injection, link rewriting, badge stripping). 
+// +// The tool is fully idempotent: unchanged files are not rewritten. A sync +// manifest (.sync-manifest.json) tracks written files so orphaned content +// from previous runs is cleaned up automatically. +// +// Usage: +// +// go run ./cmd/sync-content --org complytime --config sync-config.yaml # dry-run +// go run ./cmd/sync-content --org complytime --config sync-config.yaml --write # apply +// go run ./cmd/sync-content --config sync-config.yaml --repo complytime/complyctl --write # single repo +// +// Environment: +// +// GITHUB_TOKEN GitHub API token (recommended; unauthenticated rate limit is 60 req/hr) +package main + +import ( + "context" + "encoding/json" + "flag" + "log/slog" + "net/http" + "os" + "path/filepath" + "sort" + "strings" + "sync" + "time" +) + +func main() { + org := flag.String("org", "complytime", "GitHub organization name") + token := flag.String("token", "", "GitHub API token (or set GITHUB_TOKEN env var)") + output := flag.String("output", ".", "Hugo site root directory") + include := flag.String("include", "", "Comma-separated repo allowlist (empty = auto-discover all)") + exclude := flag.String("exclude", "", "Comma-separated repo names to skip (merged with config discovery.ignore_repos)") + write := flag.Bool("write", false, "Apply changes to disk (default: dry-run)") + summaryFile := flag.String("summary", "", "Write markdown change summary to this file (for PR body)") + timeout := flag.Duration("timeout", 3*time.Minute, "Overall timeout for all API operations") + workers := flag.Int("workers", defaultWorkers, "Maximum concurrent repo processing goroutines") + configPath := flag.String("config", "", "Path to sync-config.yaml for declarative file syncs") + repoFilter := flag.String("repo", "", "Sync only this repo (e.g. 
complytime/complyctl)") + lockPath := flag.String("lock", "", "Path to .content-lock.json for content approval gating") + updateLock := flag.Bool("update-lock", false, "Write updated upstream SHAs to the lockfile (requires --lock)") + flag.Parse() + + slog.SetDefault(slog.New(slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{Level: slog.LevelInfo}))) + + if *workers < 1 { + slog.Error("--workers must be at least 1", "value", *workers) + os.Exit(1) + } + + apiToken := *token + if apiToken != "" { + slog.Warn("--token flag is visible in process listings and shell history; prefer GITHUB_TOKEN env var") + } else { + apiToken = os.Getenv("GITHUB_TOKEN") + } + if apiToken == "" { + slog.Warn("no GitHub token provided; API rate limits will be restrictive") + } + + includeSet := parseNameList(*include) + excludeSet := parseNameList(*exclude) + + if *repoFilter != "" { + parts := strings.SplitN(*repoFilter, "/", 2) + shortName := *repoFilter + if len(parts) == 2 { + shortName = parts[1] + } + includeSet = map[string]bool{shortName: true} + delete(excludeSet, shortName) + slog.Info("filtering to single repo", "repo", *repoFilter) + } + + ctx, cancel := context.WithTimeout(context.Background(), *timeout) + defer cancel() + + gh := &apiClient{ + token: apiToken, + http: &http.Client{Timeout: 30 * time.Second}, + } + + var cfg *SyncConfig + if *configPath != "" { + var err error + cfg, err = loadConfig(*configPath) + if err != nil { + slog.Error("error loading config", "error", err) + os.Exit(1) + } + slog.Info("loaded sync config", "path", *configPath, "sources", len(cfg.Sources)) + for _, r := range cfg.Discovery.IgnoreRepos { + if !includeSet[r] { + excludeSet[r] = true + } + } + } + + if *updateLock && *lockPath == "" { + slog.Error("--update-lock requires --lock") + os.Exit(1) + } + + var lock *ContentLock + if *lockPath != "" { + var err error + lock, err = readLock(*lockPath) + if err != nil { + slog.Error("error loading lockfile", "error", err) + os.Exit(1) + } + 
slog.Info("loaded content lock", "path", *lockPath, "repos", len(lock.Repos)) + } + + // lockGate is true when the lock should constrain which repos are synced + // and pin content to approved SHAs. Disabled during --update-lock, which + // scans HEAD to propose lockfile updates. + lockGate := lock != nil && len(lock.Repos) > 0 && !*updateLock + + configSources := make(map[string]Source) + if cfg != nil { + for _, src := range cfg.Sources { + if *repoFilter != "" && src.Repo != *repoFilter { + continue + } + configSources[src.Repo] = src + } + } + + oldState := readExistingState(*output) + oldManifest := readManifest(*output) + + slog.Info("fetching governance registry", "org", *org) + repoNames, err := gh.fetchPeribolosRepos(ctx, *org) + if err != nil { + slog.Error("error fetching peribolos.yaml", "error", err) + os.Exit(1) + } + slog.Info("found repos in governance registry", "count", len(repoNames)) + + peribolosSet := make(map[string]bool, len(repoNames)) + for _, name := range repoNames { + peribolosSet[name] = true + } + + if *repoFilter != "" { + parts := strings.SplitN(*repoFilter, "/", 2) + shortName := *repoFilter + if len(parts) == 2 { + shortName = parts[1] + } + if !peribolosSet[shortName] { + slog.Error("--repo target is not in the governance registry (peribolos.yaml)", "repo", *repoFilter) + os.Exit(1) + } + } + + var repos []Repo + for _, name := range repoNames { + if *repoFilter != "" && !includeSet[name] { + continue + } + r, err := gh.getRepoMetadata(ctx, *org, name) + if err != nil { + slog.Warn("could not fetch repo metadata, skipping", "repo", name, "error", err) + continue + } + repos = append(repos, *r) + } + slog.Info("found repositories", "count", len(repos)) + + sort.Slice(repos, func(i, j int) bool { + return repos[i].Name < repos[j].Name + }) + + var result syncResult + newState := make(map[string]bool) + + var eligible []Repo + for _, repo := range repos { + if !isValidRepoName(repo.Name) { + slog.Warn("skipping repo with invalid 
name", "name", repo.Name) + result.skipped++ + continue + } + included := len(includeSet) == 0 || includeSet[repo.Name] + if repo.Archived || repo.Fork || !included || excludeSet[repo.Name] { + result.skipped++ + continue + } + newState[repo.Name] = true + eligible = append(eligible, repo) + } + + if len(eligible) == 0 && len(oldState) > 0 && *write && len(includeSet) == 0 { + slog.Error("refusing to clean: zero eligible repos from API but previous state has entries — possible API outage or misconfiguration", + "old_repos", len(oldState)) + os.Exit(1) + } + + ignoreFiles := make(map[string]bool) + if cfg != nil { + for _, f := range cfg.Discovery.IgnoreFiles { + ignoreFiles[f] = true + } + } + + docPagesIndex := buildDocPagesIndex(oldManifest) + + var upstreamSHAs sync.Map + + sem := make(chan struct{}, *workers) + var wg sync.WaitGroup + var cardsMu sync.Mutex + var cards []ProjectCard + var processedConfigMu sync.Mutex + processedConfig := make(map[string]bool) + + for _, repo := range eligible { + if ctx.Err() != nil { + slog.Warn("context cancelled, stopping repo dispatch") + break + } + wg.Add(1) + sem <- struct{}{} + go func(r Repo) { + defer wg.Done() + defer func() { <-sem }() + + slog.Info("processing repo", "repo", r.Name) + + lockedSHA := "" + if lockGate { + lockedSHA = lock.sha(r.Name) + if lockedSHA == "" { + slog.Info("repo not in lockfile, skipping (unapproved)", "repo", r.Name) + result.mu.Lock() + result.skipped++ + result.mu.Unlock() + return + } + } + + cfgSrc, inConfig := configSources[r.FullName] + skipReadme := inConfig && cfgSrc.SkipOrgSync + + work := processRepo(ctx, gh, *org, *output, r, *write, skipReadme, &result, oldState, oldManifest, lockedSHA) + if work != nil { + upstreamSHAs.Store(r.Name, work.sha) + cardsMu.Lock() + cards = append(cards, work.card) + cardsMu.Unlock() + } + + configTracked := make(map[string]bool) + if inConfig { + for _, f := range cfgSrc.Files { + configTracked[f.Src] = true + } + } + + // Determine ref for doc 
pages and config sources. + fetchRef := "" + if work != nil && lockedSHA != "" && lockedSHA != work.sha { + fetchRef = lockedSHA + } + + if work != nil && !skipReadme && cfg != nil && len(cfg.Discovery.ScanPaths) > 0 { + if !work.unchanged || !docPagesIndex[r.Name] { + syncRepoDocPages(ctx, gh, *org, r, *output, *write, cfg.Discovery, ignoreFiles, configTracked, &result, fetchRef) + } + } + + if inConfig { + syncConfigSource(ctx, gh, cfgSrc, cfg.Defaults, *output, *write, &result, fetchRef) + processedConfigMu.Lock() + processedConfig[r.FullName] = true + processedConfigMu.Unlock() + } + }(repo) + } + + wg.Wait() + + if cfg != nil { + for _, src := range cfg.Sources { + if processedConfig[src.Repo] { + continue + } + + parts := strings.SplitN(src.Repo, "/", 2) + if len(parts) != 2 { + slog.Error("config source repo must be in owner/name format", "repo", src.Repo) + result.addError() + continue + } + shortName := parts[1] + + if !peribolosSet[shortName] { + slog.Error("config source repo is not in the governance registry (peribolos.yaml), skipping", "repo", src.Repo) + result.addError() + continue + } + + if lockGate && lock.sha(src.Repo) == "" { + slog.Info("config-only source not in lockfile, skipping (unapproved)", "repo", src.Repo) + result.mu.Lock() + result.skipped++ + result.mu.Unlock() + continue + } + + slog.Info("processing config-only source", "repo", src.Repo) + + cfgRef := "" + if lockGate { + sha, err := gh.getBranchSHA(ctx, parts[0], parts[1], src.Branch) + if err == nil { + upstreamSHAs.Store(src.Repo, sha) + locked := lock.sha(src.Repo) + if locked != "" && locked != sha { + cfgRef = locked + } + } + } else if *updateLock { + sha, err := gh.getBranchSHA(ctx, parts[0], parts[1], src.Branch) + if err == nil { + upstreamSHAs.Store(src.Repo, sha) + } + } + + syncConfigSource(ctx, gh, src, cfg.Defaults, *output, *write, &result, cfgRef) + + prefix := filepath.Join("content", "docs", "projects", shortName) + string(filepath.Separator) + for _, f := range 
src.Files { + if strings.HasPrefix(f.Dest, prefix) { + newState[shortName] = true + break + } + } + } + } + + sort.Slice(cards, func(i, j int) bool { + return cards[i].Name < cards[j].Name + }) + + for name := range oldState { + if !newState[name] { + result.removed = append(result.removed, name) + } + } + sort.Strings(result.removed) + + if *write { + if oldManifest != nil { + orphans := cleanOrphanedFiles(*output, oldManifest, result.writtenFiles) + if orphans > 0 { + slog.Info("cleaned orphaned files from previous sync", "count", orphans) + } + } + if err := writeManifest(*output, result.writtenFiles); err != nil { + slog.Warn("could not write sync manifest", "error", err) + result.addWarning() + } + } + + if *updateLock { + newLock := &ContentLock{Repos: make(map[string]string)} + upstreamSHAs.Range(func(key, value any) bool { + newLock.Repos[key.(string)] = value.(string) + return true + }) + if err := writeLock(*lockPath, newLock); err != nil { + slog.Error("error writing lockfile", "path", *lockPath, "error", err) + os.Exit(1) + } + slog.Info("updated content lock", "path", *lockPath, "repos", len(newLock.Repos)) + } + + if *write { + jsonData, err := json.MarshalIndent(cards, "", " ") + if err != nil { + slog.Error("error marshaling projects.json", "error", err) + os.Exit(1) + } + jsonPath := filepath.Join(*output, "data", "projects.json") + written, err := writeFileSafe(jsonPath, append(jsonData, '\n')) + if err != nil { + slog.Error("error writing projects.json", "error", err) + os.Exit(1) + } + if written { + slog.Info("wrote projects.json", "count", len(cards)) + } else { + slog.Info("projects.json unchanged, skipped write") + } + } + + result.printSummary() + writeGitHubOutputs(&result) + + if *summaryFile != "" { + if !isUnderDir(*output, *summaryFile) { + slog.Error("summary file path is outside output directory", "path", *summaryFile) + } else { + md := result.toMarkdown() + if _, err := writeFileSafe(*summaryFile, []byte(md)); err != nil { + 
slog.Error("error writing summary file", "path", *summaryFile, "error", err) + } else { + slog.Info("wrote change summary", "path", *summaryFile) + } + } + } + + if !*write { + slog.Info("dry run complete, no files were written") + } else { + slog.Info("sync complete") + } + + if result.errors > 0 { + os.Exit(1) + } +} diff --git a/cmd/sync-content/sync.go b/cmd/sync-content/sync.go new file mode 100644 index 0000000..a6efe35 --- /dev/null +++ b/cmd/sync-content/sync.go @@ -0,0 +1,538 @@ +// SPDX-License-Identifier: Apache-2.0 + +package main + +import ( + "bytes" + "context" + "fmt" + "log/slog" + "os" + "path/filepath" + "strings" + "sync" +) + +const defaultWorkers = 5 + +// repoState holds the SHAs read from an existing project page for two-tier +// change detection: branchSHA is a fast pre-filter, readmeSHA enables +// content-level comparison when the branch has moved. +type repoState struct { + branchSHA string + readmeSHA string +} + +// syncResult tracks outcomes for the final summary and exit code. +type syncResult struct { + mu sync.Mutex + synced int + skipped int + warnings int + errors int + added []string + updated []string + removed []string + unchanged []string + writtenFiles []string +} + +// recordFile appends a relative file path to the manifest of files written +// during this sync run. Thread-safe. 
+func (r *syncResult) recordFile(relPath string) { + r.mu.Lock() + r.writtenFiles = append(r.writtenFiles, relPath) + r.mu.Unlock() +} + +func (r *syncResult) addError() { r.mu.Lock(); r.errors++; r.mu.Unlock() } +func (r *syncResult) addWarning() { r.mu.Lock(); r.warnings++; r.mu.Unlock() } +func (r *syncResult) addSynced() { r.mu.Lock(); r.synced++; r.mu.Unlock() } + +func (r *syncResult) hasChanges() bool { + return len(r.added) > 0 || len(r.updated) > 0 || len(r.removed) > 0 +} + +func (r *syncResult) toMarkdown() string { + var b strings.Builder + b.WriteString("## Content Sync Summary\n\n") + + if len(r.added) > 0 { + b.WriteString("### New Repositories\n\n") + for _, name := range r.added { + fmt.Fprintf(&b, "- `%s`\n", name) + } + b.WriteString("\n") + } + if len(r.updated) > 0 { + b.WriteString("### Updated\n\n") + for _, name := range r.updated { + fmt.Fprintf(&b, "- `%s`\n", name) + } + b.WriteString("\n") + } + if len(r.removed) > 0 { + b.WriteString("### Removed\n\n") + for _, name := range r.removed { + fmt.Fprintf(&b, "- `%s`\n", name) + } + b.WriteString("\n") + } + if !r.hasChanges() { + b.WriteString("No changes detected.\n\n") + } + + fmt.Fprintf(&b, "**Stats**: %d synced, %d skipped", + r.synced, r.skipped) + if r.warnings > 0 { + fmt.Fprintf(&b, ", %d warnings", r.warnings) + } + if r.errors > 0 { + fmt.Fprintf(&b, ", %d errors", r.errors) + } + b.WriteString("\n") + + return b.String() +} + +func (r *syncResult) printSummary() { + slog.Info("sync summary", + "synced", r.synced, + "skipped", r.skipped, + "warnings", r.warnings, + "errors", r.errors, + ) + if len(r.added) > 0 { + slog.Info("new repos", "repos", strings.Join(r.added, ", ")) + } + if len(r.updated) > 0 { + slog.Info("updated repos", "repos", strings.Join(r.updated, ", ")) + } + if len(r.removed) > 0 { + slog.Info("removed repos", "repos", strings.Join(r.removed, ", ")) + } + if !r.hasChanges() { + slog.Info("no content changes detected") + } +} + +// writeFileSafe writes data to 
path, skipping the write if the file already +// exists with identical content. Returns true if the file was actually written. +func writeFileSafe(path string, data []byte) (bool, error) { + existing, err := os.ReadFile(path) + if err == nil && bytes.Equal(existing, data) { + return false, nil + } + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + return false, err + } + return true, os.WriteFile(path, data, 0o644) +} + +// syncConfigSource processes all FileSpec entries for a single config source, +// fetching each declared file from GitHub, applying the requested transforms, +// and writing the result to the output directory. When ref is non-empty, +// content is fetched at that specific commit SHA. +func syncConfigSource(ctx context.Context, gh *apiClient, src Source, defaults Defaults, output string, write bool, result *syncResult, ref string) { + parts := strings.SplitN(src.Repo, "/", 2) + if len(parts) != 2 { + slog.Error("invalid repo format in config, expected owner/name", "repo", src.Repo) + result.addError() + return + } + owner, repoName := parts[0], parts[1] + logger := slog.With("config_repo", src.Repo) + + for _, file := range src.Files { + content, fileSHA, err := gh.getFileContent(ctx, owner, repoName, file.Src, ref) + if err != nil { + logger.Error("could not fetch config file", "src", file.Src, "error", err) + result.addError() + continue + } + + if file.Transform.StripBadges { + content = stripBadges(content) + } + content = stripLeadingH1(content) + content = shiftHeadings(content) + content = titleCaseHeadings(content) + if file.Transform.RewriteLinks { + content = rewriteRelativeLinks(content, owner, repoName, src.Branch) + } + + out := []byte(content) + if len(file.Transform.InjectFrontmatter) > 0 { + var fmErr error + out, fmErr = injectFrontmatter(out, file.Transform.InjectFrontmatter) + if fmErr != nil { + logger.Error("frontmatter injection failed", "src", file.Src, "error", fmErr) + result.addError() + continue + } + } + 
+		shortSHA := fileSHA
+		if len(shortSHA) > 12 {
+			shortSHA = shortSHA[:12]
+		}
+		provenance := fmt.Sprintf(
+			"<!-- synced from %s/%s (branch %s, commit %s) -->\n",
+			src.Repo, file.Src, src.Branch, shortSHA,
+		)
+		out = insertAfterFrontmatter(out, []byte(provenance))
+
+		destPath := filepath.Join(output, file.Dest)
+
+		if !isUnderDir(output, destPath) {
+			logger.Error("path traversal blocked", "dest", file.Dest, "resolved", destPath)
+			result.addError()
+			continue
+		}
+
+		if !write {
+			logger.Info("would write config file (dry-run)", "src", file.Src, "dest", destPath)
+			result.addSynced()
+			continue
+		}
+
+		written, err := writeFileSafe(destPath, out)
+		if err != nil {
+			logger.Error("error writing config file", "src", file.Src, "dest", destPath, "error", err)
+			result.addError()
+			continue
+		}
+
+		result.recordFile(file.Dest)
+
+		if written {
+			logger.Info("wrote config file", "src", file.Src, "dest", destPath)
+		} else {
+			logger.Info("config file unchanged", "src", file.Src, "dest", destPath)
+		}
+
+		result.addSynced()
+	}
+}
+
+func parseNameList(raw string) map[string]bool {
+	set := make(map[string]bool)
+	for _, name := range strings.Split(raw, ",") {
+		name = strings.TrimSpace(name)
+		if name != "" {
+			set[name] = true
+		}
+	}
+	return set
+}
+
+// repoWork holds the inputs and outputs for processing a single repo.
+type repoWork struct {
+	repo      Repo
+	sha       string
+	card      ProjectCard
+	unchanged bool
+}
+
+// processRepo handles a single repository: fetches content, writes pages.
+// When skipReadme is true, README fetching and project page generation are
+// skipped but the ProjectCard is still produced.
+//
+// lockedSHA, when non-empty, pins content fetches to the approved commit.
+// If the upstream branch has moved past the lock, content is still fetched
+// at the locked version so only reviewed content reaches production.
+//
+// Two-tier change detection:
+// 1. Branch SHA unchanged → skip all fetches (fast path).
+// 2. 
Branch SHA changed → fetch README, compare blob SHA for accurate +// content-level change reporting. +// +// All shared state mutations go through result.mu. +func processRepo(ctx context.Context, gh *apiClient, org, output string, repo Repo, write bool, skipReadme bool, result *syncResult, oldState map[string]repoState, oldManifest map[string]bool, lockedSHA string) *repoWork { + logger := slog.With("repo", repo.Name) + + sha, err := gh.getBranchSHA(ctx, org, repo.Name, repo.DefaultBranch) + if err != nil { + logger.Warn("could not get branch SHA", "error", err) + sha = "unknown" + result.addWarning() + } + + old, existed := oldState[repo.Name] + + // Fast path: branch hasn't changed since last sync — skip all fetches. + if existed && old.branchSHA == sha { + result.mu.Lock() + result.unchanged = append(result.unchanged, repo.Name) + result.mu.Unlock() + result.addSynced() + + if !write { + logger.Info("unchanged (branch SHA match), skipping", "sha", sha) + return &repoWork{repo: repo, sha: sha, card: buildProjectCard(repo), unchanged: true} + } + + logger.Info("unchanged (branch SHA match), skipping fetches", "sha", sha) + if oldManifest != nil { + carryForwardManifest(result, repo.Name, oldManifest) + } + + return &repoWork{repo: repo, sha: sha, card: buildProjectCard(repo), unchanged: true} + } + + // Dry-run: report what would happen without fetching content. + if !write { + result.mu.Lock() + if !existed { + result.added = append(result.added, repo.Name) + } else { + result.updated = append(result.updated, repo.Name) + } + result.mu.Unlock() + result.addSynced() + logger.Info("would sync (dry-run)", "sha", sha) + return &repoWork{repo: repo, sha: sha, card: buildProjectCard(repo)} + } + + // Slow path: branch SHA changed — fetch content and compare file-level SHAs. + // When a lock is active, fetch at the locked commit rather than HEAD. 
+ fetchRef := "" + if lockedSHA != "" && lockedSHA != sha { + fetchRef = lockedSHA + } + + contentChanged := !existed + var readmeSHA string + + if !skipReadme { + readme, rSHA, err := gh.getREADME(ctx, org, repo.Name, fetchRef) + readmeSHA = rSHA + if err != nil { + logger.Warn("no README found", "error", err) + result.addWarning() + } + + if existed && old.readmeSHA != "" && old.readmeSHA == readmeSHA { + logger.Info("README unchanged despite branch update", "branch_sha", sha, "readme_sha", readmeSHA) + } else { + contentChanged = true + } + + if readme != "" { + readme = stripLeadingH1(readme) + readme = shiftHeadings(readme) + readme = titleCaseHeadings(readme) + readme = stripBadges(readme) + readme = rewriteRelativeLinks(readme, org, repo.Name, repo.DefaultBranch) + } else { + readme = fmt.Sprintf( + "*No README available.* Visit the [repository on GitHub](%s) for more information.\n", + repo.HTMLURL, + ) + } + + indexPage := buildSectionIndex(repo, sha, readmeSHA) + indexRel := filepath.Join("content", "docs", "projects", repo.Name, "_index.md") + indexPath := filepath.Join(output, indexRel) + if !isUnderDir(output, indexPath) { + logger.Error("path traversal blocked", "path", indexRel) + result.addError() + return nil + } + written, err := writeFileSafe(indexPath, []byte(indexPage)) + if err != nil { + logger.Error("error writing section index", "path", indexPath, "error", err) + result.addError() + return nil + } + result.recordFile(indexRel) + if written { + logger.Info("wrote section index", "path", indexPath) + } else { + logger.Info("section index unchanged", "path", indexPath) + } + + overviewPage := buildOverviewPage(repo, readme) + overviewRel := filepath.Join("content", "docs", "projects", repo.Name, "overview.md") + overviewPath := filepath.Join(output, overviewRel) + if !isUnderDir(output, overviewPath) { + logger.Error("path traversal blocked", "path", overviewRel) + result.addError() + return nil + } + written, err = writeFileSafe(overviewPath, 
[]byte(overviewPage)) + if err != nil { + logger.Error("error writing overview page", "path", overviewPath, "error", err) + result.addError() + return nil + } + result.recordFile(overviewRel) + if written { + logger.Info("wrote overview page", "path", overviewPath) + } else { + logger.Info("overview page unchanged", "path", overviewPath) + } + } + + result.mu.Lock() + if !existed { + result.added = append(result.added, repo.Name) + } else if contentChanged { + result.updated = append(result.updated, repo.Name) + } else { + result.unchanged = append(result.unchanged, repo.Name) + } + result.mu.Unlock() + result.addSynced() + + return &repoWork{repo: repo, sha: sha, card: buildProjectCard(repo)} +} + +// syncRepoDocPages auto-syncs Markdown files found under each scan_path in the +// discovery config. Files already tracked by explicit config sources or listed +// in ignoreFiles are skipped. Intermediate directories get auto-generated +// _index.md section pages. When ref is non-empty, content is fetched at that +// specific commit SHA. 
+func syncRepoDocPages(ctx context.Context, gh *apiClient, org string, repo Repo, output string, write bool, discovery Discovery, ignoreFiles map[string]bool, configTracked map[string]bool, result *syncResult, ref string) { + logger := slog.With("repo", repo.Name, "phase", "doc-pages") + + for _, scanPath := range discovery.ScanPaths { + files, err := gh.listDirMD(ctx, org, repo.Name, scanPath, ref) + if err != nil { + logger.Debug("scan path not found", "path", scanPath, "error", err) + continue + } + + neededDirs := make(map[string]bool) + + for _, filePath := range files { + baseName := filepath.Base(filePath) + if ignoreFiles[baseName] { + continue + } + if configTracked[filePath] { + continue + } + + relPath := strings.TrimPrefix(filePath, scanPath+"/") + destRel := filepath.Join("content", "docs", "projects", repo.Name, relPath) + destPath := filepath.Join(output, destRel) + + if !isUnderDir(output, destPath) { + logger.Error("path traversal blocked", "src", filePath, "dest", destRel) + result.addError() + continue + } + + dir := filepath.Dir(relPath) + for dir != "." 
&& dir != "" { + neededDirs[dir] = true + dir = filepath.Dir(dir) + } + + if !write { + logger.Info("would write doc page (dry-run)", "src", filePath, "dest", destRel) + result.addSynced() + continue + } + + content, sha, err := gh.getFileContent(ctx, org, repo.Name, filePath, ref) + if err != nil { + logger.Warn("could not fetch doc file", "path", filePath, "error", err) + result.addWarning() + continue + } + + content = stripBadges(content) + content = stripLeadingH1(content) + content = shiftHeadings(content) + content = titleCaseHeadings(content) + fileDir := filepath.Dir(filePath) + content = rewriteRelativeLinks(content, org, repo.Name, repo.DefaultBranch, fileDir) + + page := buildDocPage(filePath, repo.FullName, repo.Description, repo.PushedAt, repo.DefaultBranch, sha, content) + + written, err := writeFileSafe(destPath, []byte(page)) + if err != nil { + logger.Error("error writing doc page", "path", destPath, "error", err) + result.addError() + continue + } + + result.recordFile(destRel) + if written { + logger.Info("wrote doc page", "src", filePath, "dest", destPath) + } else { + logger.Info("doc page unchanged", "src", filePath, "dest", destPath) + } + + result.addSynced() + } + + for dir := range neededDirs { + indexRel := filepath.Join("content", "docs", "projects", repo.Name, dir, "_index.md") + indexPath := filepath.Join(output, indexRel) + + if !isUnderDir(output, indexPath) { + logger.Error("path traversal blocked for section index", "path", indexRel) + result.addError() + continue + } + + if _, err := os.Stat(indexPath); err == nil { + result.recordFile(indexRel) + continue + } + + if !write { + continue + } + + title := titleFromFilename(filepath.Base(dir)) + var b strings.Builder + b.WriteString("---\n") + fmt.Fprintf(&b, "title: %q\n", title) + fmt.Fprintf(&b, "description: %q\n", repo.Description+" — "+title) + fmt.Fprintf(&b, "date: %s\n", repo.PushedAt) + fmt.Fprintf(&b, "lastmod: %s\n", repo.PushedAt) + b.WriteString("draft: false\n") + 
b.WriteString("---\n") + + written, err := writeFileSafe(indexPath, []byte(b.String())) + if err != nil { + logger.Error("error writing section index", "path", indexPath, "error", err) + continue + } + + result.recordFile(indexRel) + if written { + logger.Info("wrote section index", "path", indexPath) + } + } + } +} + +// writeGitHubOutputs writes structured outputs for GitHub Actions integration. +func writeGitHubOutputs(result *syncResult) { + if ghOutput := os.Getenv("GITHUB_OUTPUT"); ghOutput != "" { + f, err := os.OpenFile(ghOutput, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0o644) + if err == nil { + defer f.Close() + hasChanges := "false" + if result.hasChanges() { + hasChanges = "true" + } + fmt.Fprintf(f, "has_changes=%s\n", hasChanges) + fmt.Fprintf(f, "changed_count=%d\n", len(result.added)+len(result.updated)) + fmt.Fprintf(f, "error_count=%d\n", result.errors) + } + } + + if summaryPath := os.Getenv("GITHUB_STEP_SUMMARY"); summaryPath != "" { + f, err := os.OpenFile(summaryPath, os.O_APPEND|os.O_WRONLY|os.O_CREATE, 0o644) + if err == nil { + defer f.Close() + fmt.Fprint(f, result.toMarkdown()) + } + } +} diff --git a/cmd/sync-content/sync_test.go b/cmd/sync-content/sync_test.go new file mode 100644 index 0000000..bfb346c --- /dev/null +++ b/cmd/sync-content/sync_test.go @@ -0,0 +1,824 @@ +// SPDX-License-Identifier: Apache-2.0 +package main + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "strings" + "sync" + "testing" +) + +func TestProcessRepo(t *testing.T) { + readmeContent := "# test-repo\n\nThis is a test README." 
+ branchSHA := "abc123def456" + + mux := http.NewServeMux() + + mux.HandleFunc("/repos/testorg/test-repo/readme", func(w http.ResponseWriter, r *http.Request) { + json.NewEncoder(w).Encode(FileResponse{ + Content: b64(readmeContent), + Encoding: "base64", + SHA: "sha-readme", + }) + }) + mux.HandleFunc("/repos/testorg/test-repo/branches/main", func(w http.ResponseWriter, r *http.Request) { + resp := BranchResponse{} + resp.Commit.SHA = branchSHA + json.NewEncoder(w).Encode(resp) + }) + server := httptest.NewServer(mux) + defer server.Close() + + gh := newTestClient(server.URL) + output := t.TempDir() + ctx := context.Background() + + repo := Repo{ + Name: "test-repo", + FullName: "testorg/test-repo", + Description: "A test repository", + Language: "Go", + HTMLURL: "https://github.com/testorg/test-repo", + DefaultBranch: "main", + } + + result := &syncResult{} + oldState := map[string]repoState{} + + work := processRepo(ctx, gh, "testorg", output, repo, true, false, result, oldState, nil, "") + + if work == nil { + t.Fatal("processRepo returned nil") + } + if work.card.Name != "test-repo" { + t.Errorf("card.Name = %q, want %q", work.card.Name, "test-repo") + } + if work.card.Language != "Go" { + t.Errorf("card.Language = %q, want %q", work.card.Language, "Go") + } + if work.card.Description != "A test repository" { + t.Errorf("card.Description = %q, want %q", work.card.Description, "A test repository") + } + + indexPath := filepath.Join(output, "content", "docs", "projects", "test-repo", "_index.md") + data, err := os.ReadFile(indexPath) + if err != nil { + t.Fatalf("section index not written: %v", err) + } + index := string(data) + if !strings.Contains(index, `title: "Test Repo"`) { + t.Error("section index title should use formatRepoTitle") + } + if !strings.Contains(index, `linkTitle: "test-repo"`) { + t.Error("section index should have linkTitle with raw repo name") + } + if !strings.Contains(index, "readme_sha:") { + t.Error("section index should contain 
readme_sha in frontmatter") + } + if !strings.Contains(index, "sha-readme") { + t.Error("section index should contain the README blob SHA value") + } + if strings.Contains(index, "This is a test README.") { + t.Error("section index should be frontmatter-only, no README body") + } + + overviewPath := filepath.Join(output, "content", "docs", "projects", "test-repo", "overview.md") + overviewData, err := os.ReadFile(overviewPath) + if err != nil { + t.Fatalf("overview page not written: %v", err) + } + overview := string(overviewData) + if !strings.Contains(overview, "This is a test README.") { + t.Error("overview page should contain README body") + } + if strings.Contains(overview, "# test-repo") || strings.Contains(overview, "## Test-repo") { + t.Error("leading H1 should be stripped — title is already in frontmatter") + } + if !strings.Contains(overview, `title: "Overview"`) { + t.Error("overview page should have title 'Overview'") + } + if work.unchanged { + t.Error("unchanged should be false for new repos") + } +} + +func TestProcessRepo_BranchUnchanged(t *testing.T) { + branchSHA := "abc123def456" + readmeCalls := 0 + + mux := http.NewServeMux() + mux.HandleFunc("/repos/testorg/test-repo/branches/main", func(w http.ResponseWriter, r *http.Request) { + resp := BranchResponse{} + resp.Commit.SHA = branchSHA + json.NewEncoder(w).Encode(resp) + }) + mux.HandleFunc("/repos/testorg/test-repo/readme", func(w http.ResponseWriter, r *http.Request) { + readmeCalls++ + json.NewEncoder(w).Encode(FileResponse{ + Content: b64("# test-repo\n\nContent"), + Encoding: "base64", + SHA: "sha-readme", + }) + }) + + server := httptest.NewServer(mux) + defer server.Close() + + gh := newTestClient(server.URL) + output := t.TempDir() + ctx := context.Background() + + repo := Repo{ + Name: "test-repo", + FullName: "testorg/test-repo", + Description: "A test repository", + Language: "Go", + HTMLURL: "https://github.com/testorg/test-repo", + DefaultBranch: "main", + } + + oldState := 
map[string]repoState{ + "test-repo": {branchSHA: branchSHA, readmeSHA: "sha-readme"}, + } + oldManifest := map[string]bool{ + "content/docs/projects/test-repo/_index.md": true, + } + + result := &syncResult{} + work := processRepo(ctx, gh, "testorg", output, repo, true, false, result, oldState, oldManifest, "") + + if work == nil { + t.Fatal("processRepo returned nil for unchanged repo in write mode") + } + if work.card.Name != "test-repo" { + t.Errorf("card.Name = %q, want %q", work.card.Name, "test-repo") + } + if readmeCalls != 0 { + t.Errorf("README was fetched %d times, want 0 (fast path should skip)", readmeCalls) + } + if !work.unchanged { + t.Error("unchanged should be true when branch SHA matches") + } + if len(result.unchanged) != 1 || result.unchanged[0] != "test-repo" { + t.Errorf("unchanged = %v, want [test-repo]", result.unchanged) + } + if len(result.writtenFiles) != 1 { + t.Errorf("writtenFiles = %d, want 1 (carried forward from manifest)", len(result.writtenFiles)) + } +} + +func TestProcessRepo_BranchChangedReadmeUnchanged(t *testing.T) { + readmeContent := "# test-repo\n\nThis is a test README." 
+ readmeSHA := "sha-readme-stable" + + mux := http.NewServeMux() + mux.HandleFunc("/repos/testorg/test-repo/readme", func(w http.ResponseWriter, r *http.Request) { + json.NewEncoder(w).Encode(FileResponse{ + Content: b64(readmeContent), + Encoding: "base64", + SHA: readmeSHA, + }) + }) + mux.HandleFunc("/repos/testorg/test-repo/branches/main", func(w http.ResponseWriter, r *http.Request) { + resp := BranchResponse{} + resp.Commit.SHA = "new-branch-sha" + json.NewEncoder(w).Encode(resp) + }) + server := httptest.NewServer(mux) + defer server.Close() + + gh := newTestClient(server.URL) + output := t.TempDir() + ctx := context.Background() + + repo := Repo{ + Name: "test-repo", + FullName: "testorg/test-repo", + Description: "A test repository", + Language: "Go", + HTMLURL: "https://github.com/testorg/test-repo", + DefaultBranch: "main", + } + + oldState := map[string]repoState{ + "test-repo": {branchSHA: "old-branch-sha", readmeSHA: readmeSHA}, + } + + result := &syncResult{} + work := processRepo(ctx, gh, "testorg", output, repo, true, false, result, oldState, nil, "") + + if work == nil { + t.Fatal("processRepo returned nil") + } + if len(result.unchanged) != 1 || result.unchanged[0] != "test-repo" { + t.Errorf("repo should be classified as unchanged when README SHA matches, got unchanged=%v updated=%v", result.unchanged, result.updated) + } +} + +func TestSyncConfigSource(t *testing.T) { + fileContent := "[![badge](https://img.svg)](https://ci)\n\n# complyctl\n\nSome [link](docs/guide.md) here." 
+ + mux := http.NewServeMux() + mux.HandleFunc("/repos/org/complyctl/contents/README.md", func(w http.ResponseWriter, r *http.Request) { + json.NewEncoder(w).Encode(FileResponse{ + Content: b64(fileContent), + Encoding: "base64", + SHA: "sha-file", + }) + }) + + server := httptest.NewServer(mux) + defer server.Close() + + gh := newTestClient(server.URL) + output := t.TempDir() + ctx := context.Background() + + src := Source{ + Repo: "org/complyctl", + Branch: "main", + Files: []FileSpec{ + { + Src: "README.md", + Dest: "content/docs/projects/complyctl/_index.md", + Transform: Transform{ + InjectFrontmatter: map[string]any{ + "title": "complyctl", + "description": "CLI tool", + "weight": 10, + }, + RewriteLinks: true, + StripBadges: true, + }, + }, + }, + } + + t.Run("write mode applies transforms", func(t *testing.T) { + result := &syncResult{} + syncConfigSource(ctx, gh, src, Defaults{Branch: "main"}, output, true, result, "") + + if result.errors > 0 { + t.Fatalf("syncConfigSource had %d errors", result.errors) + } + if result.synced != 1 { + t.Errorf("synced = %d, want 1", result.synced) + } + + destPath := filepath.Join(output, "content", "docs", "projects", "complyctl", "_index.md") + data, err := os.ReadFile(destPath) + if err != nil { + t.Fatalf("config file not written: %v", err) + } + content := string(data) + + if !strings.Contains(content, "title: complyctl") { + t.Error("injected frontmatter should contain title") + } + if strings.Contains(content, "[![badge") { + t.Error("badges should be stripped") + } + if strings.Contains(content, "](docs/guide.md)") { + t.Error("relative links should be rewritten") + } + if !strings.Contains(content, "https://github.com/org/complyctl/blob/main/docs/guide.md") { + t.Error("relative link should become absolute GitHub URL") + } + if strings.Contains(content, "# complyctl") || strings.Contains(content, "## Complyctl") { + t.Error("leading H1 should be stripped — title is already in frontmatter") + } + }) + + 
t.Run("dry-run writes nothing", func(t *testing.T) { + dryOutput := t.TempDir() + result := &syncResult{} + syncConfigSource(ctx, gh, src, Defaults{Branch: "main"}, dryOutput, false, result, "") + + if result.synced != 1 { + t.Errorf("dry-run synced = %d, want 1", result.synced) + } + + destPath := filepath.Join(dryOutput, "content", "docs", "projects", "complyctl", "_index.md") + if _, err := os.Stat(destPath); !os.IsNotExist(err) { + t.Error("dry-run should not create files") + } + }) +} + +func TestConcurrentSyncResult(t *testing.T) { + result := &syncResult{} + var wg sync.WaitGroup + + for range 100 { + wg.Add(1) + go func() { + defer wg.Done() + result.addSynced() + }() + } + + for range 50 { + wg.Add(1) + go func() { + defer wg.Done() + result.addError() + }() + } + + for range 25 { + wg.Add(1) + go func() { + defer wg.Done() + result.addWarning() + }() + } + + wg.Wait() + + if result.synced != 100 { + t.Errorf("synced = %d, want 100", result.synced) + } + if result.errors != 50 { + t.Errorf("errors = %d, want 50", result.errors) + } + if result.warnings != 25 { + t.Errorf("warnings = %d, want 25", result.warnings) + } +} + +func TestRecordFile(t *testing.T) { + result := &syncResult{} + var wg sync.WaitGroup + for i := range 50 { + wg.Add(1) + go func(n int) { + defer wg.Done() + result.recordFile(fmt.Sprintf("file-%d.md", n)) + }(i) + } + wg.Wait() + if len(result.writtenFiles) != 50 { + t.Errorf("writtenFiles = %d, want 50", len(result.writtenFiles)) + } +} + +func TestSyncConfigSourceProvenance(t *testing.T) { + fileContent := "# complyctl\n\nSome content." 
+ fileSHA := "abc123def456789" + + mux := http.NewServeMux() + mux.HandleFunc("/repos/org/complyctl/contents/README.md", func(w http.ResponseWriter, r *http.Request) { + json.NewEncoder(w).Encode(FileResponse{ + Content: b64(fileContent), + Encoding: "base64", + SHA: fileSHA, + }) + }) + + server := httptest.NewServer(mux) + defer server.Close() + + gh := newTestClient(server.URL) + output := t.TempDir() + ctx := context.Background() + + src := Source{ + Repo: "org/complyctl", + Branch: "main", + Files: []FileSpec{ + { + Src: "README.md", + Dest: "content/docs/projects/complyctl/_index.md", + Transform: Transform{ + InjectFrontmatter: map[string]any{"title": "complyctl"}, + }, + }, + }, + } + + result := &syncResult{} + syncConfigSource(ctx, gh, src, Defaults{Branch: "main"}, output, true, result, "") + + destPath := filepath.Join(output, "content", "docs", "projects", "complyctl", "_index.md") + data, err := os.ReadFile(destPath) + if err != nil { + t.Fatalf("file not written: %v", err) + } + content := string(data) + + if !strings.Contains(content, "") { + t.Errorf("provenance comment missing or incorrect, got:\n%s", content) + } +} + +func TestSyncRepoDocPages(t *testing.T) { + mux := http.NewServeMux() + + mux.HandleFunc("/repos/testorg/test-repo/contents/docs", func(w http.ResponseWriter, r *http.Request) { + json.NewEncoder(w).Encode([]DirEntry{ + {Name: "installation.md", Path: "docs/installation.md", Type: "file"}, + {Name: "usage.md", Path: "docs/usage.md", Type: "file"}, + }) + }) + mux.HandleFunc("/repos/testorg/test-repo/contents/docs/installation.md", func(w http.ResponseWriter, r *http.Request) { + json.NewEncoder(w).Encode(FileResponse{ + Content: b64("# Installation\n\nRun `go install`."), + Encoding: "base64", + SHA: "sha-install", + }) + }) + mux.HandleFunc("/repos/testorg/test-repo/contents/docs/usage.md", func(w http.ResponseWriter, r *http.Request) { + json.NewEncoder(w).Encode(FileResponse{ + Content: b64("# Usage\n\nRun the CLI tool."), + 
Encoding: "base64", + SHA: "sha-usage", + }) + }) + + server := httptest.NewServer(mux) + defer server.Close() + + gh := newTestClient(server.URL) + output := t.TempDir() + ctx := context.Background() + + repo := Repo{ + Name: "test-repo", + FullName: "testorg/test-repo", + Description: "A test repository", + Language: "Go", + HTMLURL: "https://github.com/testorg/test-repo", + DefaultBranch: "main", + PushedAt: "2025-01-15T00:00:00Z", + } + + discovery := Discovery{ScanPaths: []string{"docs"}} + result := &syncResult{} + syncRepoDocPages(ctx, gh, "testorg", repo, output, true, discovery, nil, nil, result, "") + + if result.errors != 0 { + t.Fatalf("errors = %d, want 0", result.errors) + } + if result.synced != 2 { + t.Errorf("synced = %d, want 2", result.synced) + } + + cases := []struct { + relPath string + title string + provSrc string + }{ + { + relPath: "content/docs/projects/test-repo/installation.md", + title: "Installation", + provSrc: "testorg/test-repo/docs/installation.md@main", + }, + { + relPath: "content/docs/projects/test-repo/usage.md", + title: "Usage", + provSrc: "testorg/test-repo/docs/usage.md@main", + }, + } + + for _, tc := range cases { + fullPath := filepath.Join(output, tc.relPath) + data, err := os.ReadFile(fullPath) + if err != nil { + t.Fatalf("file not written: %s: %v", tc.relPath, err) + } + content := string(data) + + if !strings.Contains(content, fmt.Sprintf("title: %q", tc.title)) { + t.Errorf("%s: missing title %q in frontmatter:\n%s", tc.relPath, tc.title, content) + } + if !strings.Contains(content, "draft: false") { + t.Errorf("%s: missing draft: false", tc.relPath) + } + if !strings.Contains(content, "weight: 10") { + t.Errorf("%s: missing weight: 10", tc.relPath) + } + if !strings.Contains(content, "date: 2025-01-15T00:00:00Z") { + t.Errorf("%s: missing or wrong date", tc.relPath) + } + if !strings.Contains(content, "