diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 72e30782..700a2c28 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -4,6 +4,7 @@ on: push: branches: - main + - release/* paths: - VERSION - server/** @@ -181,7 +182,7 @@ jobs: test: runs-on: ubuntu-latest - needs: [build-cli, build-server] + needs: [build-cli] timeout-minutes: 15 concurrency: group: ci-test-${{ github.ref }} @@ -189,6 +190,25 @@ jobs: steps: - uses: actions/checkout@v4 + - name: Set up Erlang/Elixir + uses: erlef/setup-beam@v1 + with: + otp-version: "27" + elixir-version: "1.18" + + - name: Cache Mix dependencies + uses: actions/cache@v4 + with: + path: | + server/deps + server/_build + key: mix-${{ runner.os }}-${{ hashFiles('server/mix.lock') }} + restore-keys: mix-${{ runner.os }}- + + - name: Compile server + working-directory: server + run: mix deps.get && mix compile + - name: Set up Python uses: actions/setup-python@v5 with: @@ -203,19 +223,10 @@ jobs: - name: Make CLI executable run: chmod +x cli/coflux-linux-amd64 - - name: Download server image - uses: actions/download-artifact@v4 - with: - name: coflux-image - - - name: Load server image - run: gunzip -c coflux-image.tar.gz | docker load - - name: Install test dependencies - run: pip install pytest pytest-xdist + run: pip install pytest pytest-xdist PyJWT cryptography - name: Run E2E tests env: COFLUX_BIN: cli/coflux-linux-amd64 - COFLUX_IMAGE: coflux:ci run: pytest tests/ -n auto diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index cdb2cdc4..8a945ba7 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -311,14 +311,17 @@ jobs: env: VERSION: ${{ needs.prepare.outputs.version }} API_VERSION: ${{ needs.prepare.outputs.api_version }} + TARGET_BRANCH: ${{ needs.prepare.outputs.target_branch }} run: | gunzip -c coflux-image.tar.gz | docker load docker tag coflux:ci ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:$VERSION docker tag coflux:ci ${{ 
env.REGISTRY }}/${{ env.IMAGE_NAME }}:$API_VERSION - docker tag coflux:ci ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest docker push ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:$VERSION docker push ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:$API_VERSION - docker push ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest + if [ "$TARGET_BRANCH" = "main" ]; then + docker tag coflux:ci ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest + docker push ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:latest + fi # PyPI - name: Publish to PyPI diff --git a/cli/README.md b/cli/README.md index 58576187..341daeaa 100644 --- a/cli/README.md +++ b/cli/README.md @@ -86,14 +86,12 @@ coflux blobs get Create a `coflux.toml` file (or use `coflux setup`): ```toml +host = "localhost:7777" +# token = "your-token" # or use --token flag workspace = "default" modules = ["myapp.workflows", "myapp.tasks"] concurrency = 8 -[server] -host = "localhost:7777" -# token = "your-token" # or use --token flag - [blobs] threshold = 200 # bytes, values larger stored as blobs diff --git a/cli/cmd/coflux/follow.go b/cli/cmd/coflux/follow.go index e7251c8b..aefd12bc 100644 --- a/cli/cmd/coflux/follow.go +++ b/cli/cmd/coflux/follow.go @@ -24,6 +24,19 @@ type stepState struct { Detail string } +// groupMember records a child step's membership in a group, keyed by the +// parent execution that defined the group. +type groupMember struct { + parentExecID string // execution ID of the parent + groupID string // group ID within that execution + groupName string // human-readable group name (may be empty) +} + +// groupSummary is a rendered summary line for a collapsed group. +type groupSummary struct { + text string // pre-formatted summary text +} + // watchRun subscribes to the run topic and renders a live-updating tree of // step statuses (TTY only). Waits for all steps to complete. Returns exit code. 
func watchRun(ctx context.Context, host string, secure bool, token string, runID string, workspaceID string) (int, error) { @@ -146,6 +159,8 @@ func renderStepTree(data map[string]any, lastRendered map[string]stepState, line // Build current state for every step, and an executionID -> stepID index currentStates := make(map[string]stepState, len(steps)) execToStep := map[string]string{} // executionID -> stepID + // groupMembers maps child stepID -> groupMember info + groupMembers := map[string]groupMember{} for stepID, raw := range steps { stepData, ok := raw.(map[string]any) if !ok { @@ -156,11 +171,49 @@ func renderStepTree(data map[string]any, lastRendered map[string]stepState, line parentID, _ := stepData["parentId"].(string) executions, _ := stepData["executions"].(map[string]any) - // Index all execution IDs for this step + // Index all execution IDs for this step, and parse group info for _, execRaw := range executions { - if e, ok := execRaw.(map[string]any); ok { - if eid, ok := e["executionId"].(string); ok { - execToStep[eid] = stepID + e, ok := execRaw.(map[string]any) + if !ok { + continue + } + eid, _ := e["executionId"].(string) + if eid != "" { + execToStep[eid] = stepID + } + + // Parse groups map (groupID -> name) and children array + groups, _ := e["groups"].(map[string]any) + childrenArr, _ := e["children"].([]any) + for _, childRaw := range childrenArr { + child, ok := childRaw.(map[string]any) + if !ok { + continue + } + childStepID, _ := child["stepId"].(string) + if childStepID == "" { + continue + } + // groupId may be a float64 from JSON or nil + var gidStr string + switch gid := child["groupId"].(type) { + case float64: + gidStr = fmt.Sprintf("%d", int(gid)) + case string: + gidStr = gid + default: + continue // no group + } + groupName := "" + if groups != nil { + if name, ok := groups[gidStr].(string); ok { + groupName = name + } + } + groupMembers[childStepID] = groupMember{ + parentExecID: eid, + groupID: gidStr, + groupName: 
groupName, } } } @@ -208,16 +261,62 @@ func renderStepTree(data map[string]any, lastRendered map[string]stepState, line sort.Strings(children[k]) } - // Walk tree depth-first to build output lines + // Walk tree depth-first to build output lines, collapsing groups type line struct { - prefix string - stepID string + prefix string + stepID string + summary *groupSummary // non-nil for group summary lines } var lines []line var walkChildren func(stepIDs []string, prefix string) walkChildren = func(stepIDs []string, prefix string) { - for i, id := range stepIDs { - isLast := i == len(stepIDs)-1 + // Partition children into groups and ungrouped, preserving order. + // groupKey = parentExecID + ":" + groupID + type groupEntry struct { + key string + name string + stepIDs []string + } + groupMap := map[string]*groupEntry{} + var orderedItems []any // either string (stepID) or *groupEntry (first occurrence) + for _, id := range stepIDs { + gm, inGroup := groupMembers[id] + if !inGroup { + orderedItems = append(orderedItems, id) + continue + } + key := gm.parentExecID + ":" + gm.groupID + if ge, ok := groupMap[key]; ok { + ge.stepIDs = append(ge.stepIDs, id) + } else { + ge = &groupEntry{key: key, name: gm.groupName, stepIDs: []string{id}} + groupMap[key] = ge + orderedItems = append(orderedItems, ge) + } + } + + // Flatten to a list of items to render + type renderItem struct { + stepID string + group *groupEntry // non-nil means this is a group (render first child + summary) + } + var items []renderItem + for _, item := range orderedItems { + switch v := item.(type) { + case string: + items = append(items, renderItem{stepID: v}) + case *groupEntry: + if len(v.stepIDs) == 1 { + // Single-member group: render normally + items = append(items, renderItem{stepID: v.stepIDs[0]}) + } else { + items = append(items, renderItem{group: v}) + } + } + } + + for i, item := range items { + isLast := i == len(items)-1 var connector, childPrefix string if isLast { connector = prefix + 
"└─ " @@ -226,9 +325,26 @@ func renderStepTree(data map[string]any, lastRendered map[string]stepState, line connector = prefix + "├─ " childPrefix = prefix + "│ " } - lines = append(lines, line{prefix: connector, stepID: id}) - if kids, ok := children[id]; ok { - walkChildren(kids, childPrefix) + + if item.group != nil { + ge := item.group + firstID := ge.stepIDs[0] + // Render the group header as a tree layer + summaryText := buildGroupSummary(ge.name, ge.stepIDs, currentStates, children) + lines = append(lines, line{ + prefix: connector, + summary: &groupSummary{text: summaryText}, + }) + // Render the first child nested under the group + lines = append(lines, line{prefix: childPrefix + "└─ ", stepID: firstID}) + if kids, ok := children[firstID]; ok { + walkChildren(kids, childPrefix+" ") + } + } else { + lines = append(lines, line{prefix: connector, stepID: item.stepID}) + if kids, ok := children[item.stepID]; ok { + walkChildren(kids, childPrefix) + } } } } @@ -254,6 +370,10 @@ func renderStepTree(data map[string]any, lastRendered map[string]stepState, line // Render for _, l := range lines { + if l.summary != nil { + fmt.Printf("\r%s%s\033[K\n", l.prefix, l.summary.text) + continue + } st := currentStates[l.stepID] label := "" if st.StepNum != "" && st.Attempt != "" { @@ -300,6 +420,101 @@ func renderStepTree(data map[string]any, lastRendered map[string]stepState, line return drawn } +// buildGroupSummary builds a summary string like: +// "group_name: 1 of 22 (5 pending, 2 running, 1 error)" +// The group name is shown in normal text, the counter is dim, and status +// counts are colored to match their status indicators. Each child's status +// is its "branch status" — if any descendant is still running/pending, the +// branch counts as running, regardless of the child's own terminal state. 
+func buildGroupSummary(groupName string, stepIDs []string, states map[string]stepState, childrenMap map[string][]string) string { + total := len(stepIDs) + + // Count branch statuses + counts := map[string]int{} + for _, id := range stepIDs { + counts[branchStatus(id, states, childrenMap)]++ + } + + // Build colored status parts (non-completed/non-cached only) + statusOrder := []string{"pending", "running", "error", "cancelled", "abandoned", "suspended", "deferred"} + var parts []string + for _, s := range statusOrder { + if c, ok := counts[s]; ok && c > 0 { + color := statusColor(s) + parts = append(parts, fmt.Sprintf("%s%d %s%s", color, c, s, colorReset)) + } + } + + var summary string + if groupName != "" { + summary = groupName + " " + } + summary += colorDim + fmt.Sprintf("1 of %d", total) + colorReset + if len(parts) > 0 { + summary += " (" + strings.Join(parts, ", ") + ")" + } + + return summary +} + +// branchStatus returns the effective status for a step considering all its +// descendants. Active states (running, pending) take precedence over terminal +// states so that a branch with a running grandchild under an errored child +// shows as "running" until all descendants settle. +func branchStatus(stepID string, states map[string]stepState, childrenMap map[string][]string) string { + st, ok := states[stepID] + if !ok { + return "pending" + } + + // Collect this step's status plus all descendant statuses + statuses := collectBranchStatuses(stepID, states, childrenMap) + + // Priority: active states first, then terminal failures + for _, s := range []string{"running", "pending", "error", "cancelled", "abandoned", "suspended", "deferred"} { + for _, status := range statuses { + if status == s { + return s + } + } + } + + // All completed/cached + return st.Status +} + +// collectBranchStatuses recursively collects the status of a step and all its +// descendants. 
+func collectBranchStatuses(stepID string, states map[string]stepState, childrenMap map[string][]string) []string { + st, ok := states[stepID] + if !ok { + return []string{"pending"} + } + result := []string{st.Status} + for _, childID := range childrenMap[stepID] { + result = append(result, collectBranchStatuses(childID, states, childrenMap)...) + } + return result +} + +// statusColor returns the ANSI color code for a given status string. +func statusColor(status string) string { + switch status { + case "completed": + return colorGreen + case "cached": + return colorDimGreen + case "running": + return colorBlue + case "error", "abandoned": + return colorRed + case "cancelled": + return colorYellow + default: + return colorDim + } +} + // statesEqual returns true if two state maps are identical. func statesEqual(a, b map[string]stepState) bool { if len(a) != len(b) { @@ -337,10 +552,25 @@ func latestExecutionStatus(executions map[string]any) (string, string, string) { } // computeExecutionStatus determines the status string, detail, and timestamp -// for a single execution from its topic data. +// for a single execution from its topic data. "Spawned" results are unwrapped +// to show the status of the inner result (matching Studio behaviour). func computeExecutionStatus(exec map[string]any) (status, detail string, ts int64) { result, _ := exec["result"].(map[string]any) + // Unwrap spawned: look through to the inner result. If the spawn is + // async and hasn't resolved yet, inner result will be absent — treat + // as running (result = nil). 
+ if result != nil { + if rt, _ := result["type"].(string); rt == "spawned" { + inner, ok := result["result"].(map[string]any) + if ok { + result = inner + } else { + result = nil + } + } + } + if result == nil { if exec["assignedAt"] != nil { if assignedAt, ok := exec["assignedAt"].(float64); ok { @@ -384,15 +614,6 @@ func computeExecutionStatus(exec map[string]any) (status, detail string, ts int6 return "cached", "", ts case "deferred": return "deferred", "", ts - case "spawned": - if execRef, ok := result["execution"].(map[string]any); ok { - if eid, ok := execRef["executionId"].(string); ok { - if idx := strings.Index(eid, ":"); idx > 0 { - detail = eid[:idx] - } - } - } - return "spawned", detail, ts default: return resultType, "", ts } @@ -439,7 +660,7 @@ func checkAllComplete(data map[string]any) (exitCode int, done bool) { resultType, _ := result["type"].(string) switch resultType { - case "value", "cached": + case "value", "cached", "spawned": // ok default: hasFailure = true @@ -492,7 +713,7 @@ func checkRootComplete(data map[string]any) (exitCode int, done bool) { resultType, _ := result["type"].(string) switch resultType { - case "value", "cached": + case "value", "cached", "spawned": return 0, true default: return 1, true diff --git a/cli/cmd/coflux/main.go b/cli/cmd/coflux/main.go index 39482381..00ce791c 100644 --- a/cli/cmd/coflux/main.go +++ b/cli/cmd/coflux/main.go @@ -43,8 +43,9 @@ workers, and authenticating with Studio.`, func init() { // Set defaults (before config file is read) // Priority: defaults < config file < env vars < flags - viper.SetDefault("server.host", "localhost:7777") + viper.SetDefault("host", "localhost:7777") viper.SetDefault("workspace", "default") + viper.SetDefault("worker.adapter", []string{}) viper.SetDefault("worker.concurrency", min(runtime.NumCPU()+4, 32)) viper.SetDefault("blobs.threshold", 100) viper.SetDefault("logs.batch_size", 100) @@ -61,8 +62,8 @@ func init() { rootCmd.PersistentFlags().StringP("output", "o", 
"", "Output format (json)") // Bind flags to viper - viper.BindPFlag("server.host", rootCmd.PersistentFlags().Lookup("host")) - viper.BindPFlag("server.token", rootCmd.PersistentFlags().Lookup("token")) + viper.BindPFlag("host", rootCmd.PersistentFlags().Lookup("host")) + viper.BindPFlag("token", rootCmd.PersistentFlags().Lookup("token")) viper.BindPFlag("team", rootCmd.PersistentFlags().Lookup("team")) viper.BindPFlag("workspace", rootCmd.PersistentFlags().Lookup("workspace")) viper.BindPFlag("log_level", rootCmd.PersistentFlags().Lookup("log-level")) @@ -138,17 +139,17 @@ func loadConfig() (*config.Config, error) { } // Handle secure flag default (depends on host value) - if cfg.Server.Secure == nil { - secure := !config.IsLocalhost(cfg.Server.Host) - cfg.Server.Secure = &secure + if cfg.Secure == nil { + secure := !config.IsLocalhost(cfg.Host) + cfg.Secure = &secure } return cfg, nil } -// getHost returns the resolved host with secure detection +// getHost returns the resolved host func getHost() string { - return viper.GetString("server.host") + return viper.GetString("host") } // getWorkspace returns the resolved workspace @@ -163,13 +164,13 @@ func getTeam() string { // getToken returns the resolved token func getToken() string { - return viper.GetString("server.token") + return viper.GetString("token") } // isSecure determines if HTTPS should be used func isSecure() bool { - if viper.IsSet("server.secure") { - return viper.GetBool("server.secure") + if viper.IsSet("secure") { + return viper.GetBool("secure") } return !config.IsLocalhost(getHost()) } diff --git a/cli/cmd/coflux/pools.go b/cli/cmd/coflux/pools.go index 48a02c05..ddcdf1cc 100644 --- a/cli/cmd/coflux/pools.go +++ b/cli/cmd/coflux/pools.go @@ -140,12 +140,27 @@ func runPoolsGet(cmd *cobra.Command, args []string) error { if image := getString(launcher, "image"); image != "" { fmt.Printf("Image: %s\n", image) } + if dir := getString(launcher, "directory"); dir != "" { + fmt.Printf("Directory: 
%s\n", dir) + } if dockerHost := getString(launcher, "dockerHost"); dockerHost != "" { fmt.Printf("Docker host: %s\n", dockerHost) } if serverHost := getString(launcher, "serverHost"); serverHost != "" { fmt.Printf("Server host: %s\n", serverHost) } + if adapter := getStringSlice(launcher, "adapter"); len(adapter) > 0 { + fmt.Printf("Adapter: %s\n", strings.Join(adapter, " ")) + } + if concurrency := getFloat64(launcher, "concurrency"); concurrency > 0 { + fmt.Printf("Concurrency: %d\n", int(concurrency)) + } + if env, ok := launcher["env"].(map[string]any); ok && len(env) > 0 { + fmt.Printf("Environment:\n") + for k, v := range env { + fmt.Printf(" %s=%s\n", k, v) + } + } } return nil @@ -428,6 +443,42 @@ func formatMillis(ms int64) string { return t.Format("2006-01-02 15:04:05 UTC") } +func hasCommonLauncherFlags() bool { + return poolsUpdateServerHost != "" || poolsUpdateNoServerHost || + poolsUpdateAdapter != nil || poolsUpdateNoAdapter || + poolsUpdateConcurrency > 0 || poolsUpdateNoConcurrency || + poolsUpdateEnv != nil || poolsUpdateNoEnv +} + +func applyCommonLauncherFlags(launcher map[string]any) { + if poolsUpdateServerHost != "" { + launcher["serverHost"] = poolsUpdateServerHost + } else if poolsUpdateNoServerHost { + delete(launcher, "serverHost") + } + if poolsUpdateAdapter != nil { + launcher["adapter"] = poolsUpdateAdapter + } else if poolsUpdateNoAdapter { + delete(launcher, "adapter") + } + if poolsUpdateConcurrency > 0 { + launcher["concurrency"] = poolsUpdateConcurrency + } else if poolsUpdateNoConcurrency { + delete(launcher, "concurrency") + } + if poolsUpdateEnv != nil { + env := make(map[string]any) + for _, e := range poolsUpdateEnv { + if key, value, ok := strings.Cut(e, "="); ok { + env[key] = value + } + } + launcher["env"] = env + } else if poolsUpdateNoEnv { + delete(launcher, "env") + } +} + func getFloat64(m map[string]any, key string) float64 { if v, ok := m[key]; ok { if f, ok := v.(float64); ok { @@ -437,15 +488,39 @@ func 
getFloat64(m map[string]any, key string) float64 { return 0 } +func getStringSlice(m map[string]any, key string) []string { + v, ok := m[key] + if !ok { + return nil + } + if arr, ok := v.([]any); ok { + result := make([]string, 0, len(arr)) + for _, item := range arr { + if s, ok := item.(string); ok { + result = append(result, s) + } + } + return result + } + return nil +} + // pools update var ( - poolsUpdateModules []string - poolsUpdateProvides []string - poolsUpdateDockerImage string - poolsUpdateDockerHost string - poolsUpdateNoDockerHost bool - poolsUpdateServerHost string - poolsUpdateNoServerHost bool + poolsUpdateModules []string + poolsUpdateProvides []string + poolsUpdateDockerImage string + poolsUpdateDockerHost string + poolsUpdateNoDockerHost bool + poolsUpdateProcessDir string + poolsUpdateServerHost string + poolsUpdateNoServerHost bool + poolsUpdateAdapter []string + poolsUpdateNoAdapter bool + poolsUpdateConcurrency int + poolsUpdateNoConcurrency bool + poolsUpdateEnv []string + poolsUpdateNoEnv bool ) var poolsUpdateCmd = &cobra.Command{ @@ -461,10 +536,24 @@ func init() { poolsUpdateCmd.Flags().StringVar(&poolsUpdateDockerImage, "docker-image", "", "Docker image") poolsUpdateCmd.Flags().StringVar(&poolsUpdateDockerHost, "docker-host", "", "Docker host") poolsUpdateCmd.Flags().BoolVar(&poolsUpdateNoDockerHost, "no-docker-host", false, "Unset Docker host (use default socket)") + poolsUpdateCmd.Flags().StringVar(&poolsUpdateProcessDir, "process-dir", "", "Directory for process launcher") poolsUpdateCmd.Flags().StringVar(&poolsUpdateServerHost, "server-host", "", "Coflux server host (overrides server default)") poolsUpdateCmd.Flags().BoolVar(&poolsUpdateNoServerHost, "no-server-host", false, "Unset server host (use server default)") + poolsUpdateCmd.Flags().StringSliceVar(&poolsUpdateAdapter, "adapter", nil, "Adapter command (e.g., --adapter python,-m,coflux)") + poolsUpdateCmd.Flags().BoolVar(&poolsUpdateNoAdapter, "no-adapter", false, "Unset 
adapter (use worker default)") + poolsUpdateCmd.Flags().IntVar(&poolsUpdateConcurrency, "concurrency", 0, "Max concurrent executions per worker") + poolsUpdateCmd.Flags().BoolVar(&poolsUpdateNoConcurrency, "no-concurrency", false, "Unset concurrency (use worker default)") + poolsUpdateCmd.Flags().StringArrayVar(&poolsUpdateEnv, "env", nil, "Environment variable (e.g., --env KEY=VALUE)") + poolsUpdateCmd.Flags().BoolVar(&poolsUpdateNoEnv, "no-env", false, "Clear all custom environment variables") poolsUpdateCmd.MarkFlagsMutuallyExclusive("docker-host", "no-docker-host") poolsUpdateCmd.MarkFlagsMutuallyExclusive("server-host", "no-server-host") + poolsUpdateCmd.MarkFlagsMutuallyExclusive("adapter", "no-adapter") + poolsUpdateCmd.MarkFlagsMutuallyExclusive("concurrency", "no-concurrency") + poolsUpdateCmd.MarkFlagsMutuallyExclusive("env", "no-env") + // Process and Docker flags are mutually exclusive + poolsUpdateCmd.MarkFlagsMutuallyExclusive("docker-image", "process-dir") + poolsUpdateCmd.MarkFlagsMutuallyExclusive("docker-host", "process-dir") + poolsUpdateCmd.MarkFlagsMutuallyExclusive("no-docker-host", "process-dir") } func runPoolsUpdate(cmd *cobra.Command, args []string) error { @@ -480,7 +569,7 @@ func runPoolsUpdate(cmd *cobra.Command, args []string) error { return err } - workspaceID, err := resolveWorkspaceID(cmd.Context(), client, workspace) + workspaceID, err := ensureWorkspaceID(cmd.Context(), client, workspace) if err != nil { return err } @@ -498,7 +587,14 @@ func runPoolsUpdate(cmd *cobra.Command, args []string) error { if poolsUpdateProvides != nil { pool["provides"] = parseProvides(poolsUpdateProvides) } - if poolsUpdateDockerImage != "" || poolsUpdateDockerHost != "" || poolsUpdateNoDockerHost || poolsUpdateServerHost != "" || poolsUpdateNoServerHost { + if poolsUpdateProcessDir != "" { + launcher := map[string]any{ + "type": "process", + "directory": poolsUpdateProcessDir, + } + applyCommonLauncherFlags(launcher) + pool["launcher"] = launcher + } 
else if poolsUpdateDockerImage != "" || poolsUpdateDockerHost != "" || poolsUpdateNoDockerHost { launcher, ok := pool["launcher"].(map[string]any) if !ok || getString(launcher, "type") != "docker" { launcher = map[string]any{"type": "docker"} @@ -511,12 +607,14 @@ func runPoolsUpdate(cmd *cobra.Command, args []string) error { } else if poolsUpdateNoDockerHost { delete(launcher, "dockerHost") } - if poolsUpdateServerHost != "" { - launcher["serverHost"] = poolsUpdateServerHost - } else if poolsUpdateNoServerHost { - delete(launcher, "serverHost") - } + applyCommonLauncherFlags(launcher) pool["launcher"] = launcher + } else if hasCommonLauncherFlags() { + // Update common launcher fields on an existing launcher + if launcher, ok := pool["launcher"].(map[string]any); ok { + applyCommonLauncherFlags(launcher) + pool["launcher"] = launcher + } } if err := client.UpdatePool(cmd.Context(), workspaceID, name, pool); err != nil { diff --git a/cli/cmd/coflux/server.go b/cli/cmd/coflux/server.go index cba30db0..933a6a21 100644 --- a/cli/cmd/coflux/server.go +++ b/cli/cmd/coflux/server.go @@ -1,13 +1,17 @@ package main import ( + "crypto/sha256" + "encoding/hex" "fmt" "os" "os/exec" "path/filepath" + "strings" "github.com/bitroot/coflux/cli/internal/version" "github.com/spf13/cobra" + "github.com/spf13/viper" ) var serverCmd = &cobra.Command{ @@ -18,23 +22,47 @@ var serverCmd = &cobra.Command{ This is a convenience wrapper around Docker (which must be installed and running), useful for running the server in a development environment. +Server options can be set via flags, the [server] section in coflux.toml, or +COFLUX_SERVER_* environment variables. 
+ Examples: coflux server coflux server --port 8080 - coflux server --data-dir ./my-data`, + coflux server --data-dir ./my-data + coflux server --super-token mytoken --no-auth`, RunE: runServer, } var ( - serverPort int - serverDataDir string - serverImage string + serverNoAuth bool + serverSuperToken string + serverSuperTokenHash string ) func init() { - serverCmd.Flags().IntVarP(&serverPort, "port", "p", 7777, "Port to run server on") - serverCmd.Flags().StringVarP(&serverDataDir, "data-dir", "d", "./data", "Directory to store data") - serverCmd.Flags().StringVar(&serverImage, "image", getDefaultImage(), "Docker image to run") + serverCmd.Flags().IntP("port", "p", 0, "Port to run server on (default 7777)") + serverCmd.Flags().StringP("data-dir", "d", "", "Directory to store data (default ./data)") + serverCmd.Flags().String("image", "", "Docker image to run") + serverCmd.Flags().String("project", "", "Restrict server to a single project") + serverCmd.Flags().String("public-host", "", "Public-facing host (use % prefix for subdomain routing)") + serverCmd.Flags().BoolVar(&serverNoAuth, "no-auth", false, "Disable authentication") + serverCmd.Flags().StringVar(&serverSuperToken, "super-token", "", "Super token (will be hashed)") + serverCmd.Flags().StringVar(&serverSuperTokenHash, "super-token-hash", "", "Pre-hashed super token (SHA-256 hex)") + serverCmd.Flags().String("secret", "", "Server secret for signing service tokens") + serverCmd.Flags().StringSlice("studio-teams", nil, "Team IDs allowed for Studio auth") + serverCmd.Flags().StringSlice("launcher-types", nil, "Allowed launcher types (docker, process)") + + serverCmd.MarkFlagsMutuallyExclusive("super-token", "super-token-hash") + + // Bind flags to viper under the server.* namespace + viper.BindPFlag("server.port", serverCmd.Flags().Lookup("port")) + viper.BindPFlag("server.data_dir", serverCmd.Flags().Lookup("data-dir")) + viper.BindPFlag("server.image", serverCmd.Flags().Lookup("image")) + 
viper.BindPFlag("server.project", serverCmd.Flags().Lookup("project")) + viper.BindPFlag("server.public_host", serverCmd.Flags().Lookup("public-host")) + viper.BindPFlag("server.secret", serverCmd.Flags().Lookup("secret")) + viper.BindPFlag("server.studio_teams", serverCmd.Flags().Lookup("studio-teams")) + viper.BindPFlag("server.launcher_types", serverCmd.Flags().Lookup("launcher-types")) } // getDefaultImage returns the default Docker image name. @@ -43,21 +71,36 @@ func init() { func getDefaultImage() string { apiVersion := version.APIVersion() if apiVersion != "dev" && apiVersion != "" { - return fmt.Sprintf("ghcr.io/cofluxlabs/coflux:%s", apiVersion) + return fmt.Sprintf("ghcr.io/bitroot/coflux:%s", apiVersion) } - return "ghcr.io/cofluxlabs/coflux:latest" + return "ghcr.io/bitroot/coflux:latest" } func runServer(cmd *cobra.Command, args []string) error { + port := viper.GetInt("server.port") + if port == 0 { + port = 7777 + } + + dataDir := viper.GetString("server.data_dir") + if dataDir == "" { + dataDir = "./data" + } + + image := viper.GetString("server.image") + if image == "" { + image = getDefaultImage() + } + // Resolve data directory to absolute path - dataDir, err := resolveDataDir(serverDataDir) + absDataDir, err := resolveDataDir(dataDir) if err != nil { return err } // Determine pull policy based on image name pullPolicy := "always" - if len(serverImage) >= 7 && serverImage[:7] == "sha256:" { + if len(image) >= 7 && image[:7] == "sha256:" { pullPolicy = "missing" } @@ -65,13 +108,60 @@ func runServer(cmd *cobra.Command, args []string) error { dockerArgs := []string{ "run", "--pull", pullPolicy, - "--publish", fmt.Sprintf("%d:7777", serverPort), - "--volume", fmt.Sprintf("%s:/data", dataDir), - serverImage, + "--publish", fmt.Sprintf("%d:7777", port), + "--volume", fmt.Sprintf("%s:/data", absDataDir), } - fmt.Printf("Starting Coflux server on port %d...\n", serverPort) - fmt.Printf("Data directory: %s\n", dataDir) + // Add environment variables for 
server configuration + if project := viper.GetString("server.project"); project != "" { + dockerArgs = append(dockerArgs, "--env", "COFLUX_PROJECT="+project) + } + if publicHost := viper.GetString("server.public_host"); publicHost != "" { + dockerArgs = append(dockerArgs, "--env", "COFLUX_PUBLIC_HOST="+publicHost) + } + if serverNoAuth { + dockerArgs = append(dockerArgs, "--env", "COFLUX_REQUIRE_AUTH=false") + } + + // Handle super token: flag takes precedence, then config + tokenHash := serverSuperTokenHash + if serverSuperToken != "" { + tokenHash = hashToken(serverSuperToken) + } else if tokenHash == "" { + // Check config file values + if t := viper.GetString("server.super_token"); t != "" { + tokenHash = hashToken(t) + } else if h := viper.GetString("server.super_token_hash"); h != "" { + tokenHash = h + } + } + if tokenHash != "" { + dockerArgs = append(dockerArgs, "--env", "COFLUX_SUPER_TOKEN_HASH="+tokenHash) + } + + if secret := viper.GetString("server.secret"); secret != "" { + dockerArgs = append(dockerArgs, "--env", "COFLUX_SECRET="+secret) + } + if teams := viper.GetStringSlice("server.studio_teams"); len(teams) > 0 { + dockerArgs = append(dockerArgs, "--env", "COFLUX_STUDIO_TEAMS="+strings.Join(teams, ",")) + } + if types := viper.GetStringSlice("server.launcher_types"); len(types) > 0 { + dockerArgs = append(dockerArgs, "--env", "COFLUX_LAUNCHER_TYPES="+strings.Join(types, ",")) + } + + // Check config-level auth setting (--no-auth flag handled above) + if !serverNoAuth { + if auth := viper.Get("server.auth"); auth != nil { + if authBool, ok := auth.(bool); ok && !authBool { + dockerArgs = append(dockerArgs, "--env", "COFLUX_REQUIRE_AUTH=false") + } + } + } + + dockerArgs = append(dockerArgs, image) + + fmt.Printf("Starting Coflux server on port %d...\n", port) + fmt.Printf("Data directory: %s\n", absDataDir) // Run docker dockerCmd := exec.Command("docker", dockerArgs...) 
@@ -89,6 +179,12 @@ func runServer(cmd *cobra.Command, args []string) error { return nil } +// hashToken returns the SHA-256 hex digest of a token +func hashToken(token string) string { + h := sha256.Sum256([]byte(token)) + return hex.EncodeToString(h[:]) +} + // resolveDataDir resolves and creates the data directory func resolveDataDir(dir string) (string, error) { // Get absolute path diff --git a/cli/cmd/coflux/setup.go b/cli/cmd/coflux/setup.go index d1af95ba..dd792e91 100644 --- a/cli/cmd/coflux/setup.go +++ b/cli/cmd/coflux/setup.go @@ -118,10 +118,8 @@ func runSetup(cmd *cobra.Command, args []string) error { // Get existing values for defaults existingHost := "localhost:7777" - if server, ok := existingConfig["server"].(map[string]any); ok { - if h, ok := server["host"].(string); ok && h != "" { - existingHost = h - } + if h, ok := existingConfig["host"].(string); ok && h != "" { + existingHost = h } existingWorkspace := filepath.Base(mustGetwd()) if w, ok := existingConfig["workspace"].(string); ok && w != "" { @@ -234,10 +232,7 @@ func runSetup(cmd *cobra.Command, args []string) error { } // Update config - if existingConfig["server"] == nil { - existingConfig["server"] = make(map[string]any) - } - existingConfig["server"].(map[string]any)["host"] = host + existingConfig["host"] = host existingConfig["workspace"] = workspace if len(adapterCmd) > 0 { if existingConfig["worker"] == nil { diff --git a/cli/cmd/coflux/worker.go b/cli/cmd/coflux/worker.go index cdd1c310..4d25d42f 100644 --- a/cli/cmd/coflux/worker.go +++ b/cli/cmd/coflux/worker.go @@ -84,7 +84,7 @@ func runWorker(cmd *cobra.Command, args []string) error { if token == "" && session != "" { token = session } - cfg.Server.Token = token + cfg.Token = token modules := args @@ -133,7 +133,7 @@ func runWorker(cmd *cobra.Command, args []string) error { // Run worker logger.Info("starting worker", "workspace", cfg.Workspace, - "host", cfg.Server.Host, + "host", cfg.Host, "modules", modules, 
"concurrency", cfg.Worker.Concurrency, "register", shouldRegister, @@ -183,7 +183,7 @@ func runWorkerWithWatch( if err != nil { return fmt.Errorf("failed to refresh token: %w", err) } - cfg.Server.Token = token + cfg.Token = token // Create a cancellable context for this worker run runCtx, runCancel := context.WithCancel(ctx) @@ -196,7 +196,7 @@ func runWorkerWithWatch( w := worker.New(cfg, cmdAdapter, session, logger) logger.Info("starting worker", "workspace", cfg.Workspace, - "host", cfg.Server.Host, + "host", cfg.Host, "concurrency", cfg.Worker.Concurrency, "register", shouldRegister, ) diff --git a/cli/internal/api/client.go b/cli/internal/api/client.go index ebaff607..59aceaea 100644 --- a/cli/internal/api/client.go +++ b/cli/internal/api/client.go @@ -34,16 +34,13 @@ func NewClient(host string, secure bool, token string) *Client { } // CreateSession creates a new worker session -func (c *Client) CreateSession(ctx context.Context, workspaceID string, provides map[string][]string, concurrency int) (string, error) { +func (c *Client) CreateSession(ctx context.Context, workspaceID string, provides map[string][]string) (string, error) { body := map[string]any{ "workspaceId": workspaceID, } if provides != nil { body["provides"] = provides } - if concurrency > 0 { - body["concurrency"] = concurrency - } var result struct { SessionID string `json:"sessionId"` diff --git a/cli/internal/config/config.go b/cli/internal/config/config.go index 5c174149..32e8ed6a 100644 --- a/cli/internal/config/config.go +++ b/cli/internal/config/config.go @@ -5,6 +5,9 @@ import "strings" // Config represents the coflux.toml configuration file. // Defaults are set via viper.SetDefault() in cmd/coflux/main.go. 
type Config struct { + Host string `mapstructure:"host"` + Token string `mapstructure:"token"` + Secure *bool `mapstructure:"secure"` Workspace string `mapstructure:"workspace"` Team string `mapstructure:"team"` Server ServerConfig `mapstructure:"server"` @@ -15,11 +18,19 @@ type Config struct { Output string `mapstructure:"output"` } -// ServerConfig holds server connection settings +// ServerConfig holds settings for running a local server via `coflux server`. type ServerConfig struct { - Host string `mapstructure:"host"` - Token string `mapstructure:"token"` - Secure *bool `mapstructure:"secure"` + Port int `mapstructure:"port"` + DataDir string `mapstructure:"data_dir"` + Image string `mapstructure:"image"` + Project string `mapstructure:"project"` + PublicHost string `mapstructure:"public_host"` + Auth *bool `mapstructure:"auth"` + SuperToken string `mapstructure:"super_token"` + SuperTokenHash string `mapstructure:"super_token_hash"` + Secret string `mapstructure:"secret"` + StudioTeams []string `mapstructure:"studio_teams"` + LauncherTypes []string `mapstructure:"launcher_types"` } // WorkerConfig holds worker-specific settings @@ -73,11 +84,11 @@ type LogsConfig struct { // IsSecure determines if the connection should use TLS func (c *Config) IsSecure() bool { - if c.Server.Secure != nil { - return *c.Server.Secure + if c.Secure != nil { + return *c.Secure } // Default: localhost uses HTTP, others use HTTPS - return !IsLocalhost(c.Server.Host) + return !IsLocalhost(c.Host) } // IsLocalhost checks if the host is localhost-like. 
@@ -119,7 +130,7 @@ func (c *Config) HTTPURL() string { if c.IsSecure() { scheme = "https" } - return scheme + "://" + c.Server.Host + return scheme + "://" + c.Host } // WebSocketURL returns the WebSocket URL for the server @@ -128,5 +139,5 @@ func (c *Config) WebSocketURL() string { if c.IsSecure() { scheme = "wss" } - return scheme + "://" + c.Server.Host + return scheme + "://" + c.Host } diff --git a/cli/internal/worker/worker.go b/cli/internal/worker/worker.go index 3f38745e..a3cc4e3c 100644 --- a/cli/internal/worker/worker.go +++ b/cli/internal/worker/worker.go @@ -113,7 +113,7 @@ func (w *Worker) requireConn() (*api.Connection, error) { // Run starts the worker func (w *Worker) Run(ctx context.Context, modules []string, register bool) error { // Create API client - w.client = api.NewClient(w.cfg.Server.Host, w.cfg.IsSecure(), w.cfg.Server.Token) + w.client = api.NewClient(w.cfg.Host, w.cfg.IsSecure(), w.cfg.Token) // Resolve workspace name to external ID workspaceID, err := w.resolveWorkspaceID(ctx) @@ -130,6 +130,10 @@ func (w *Worker) Run(ctx context.Context, modules []string, register bool) error } w.logger.Debug("discovered targets", "count", len(manifest.Targets)) + if len(manifest.Targets) == 0 { + return fmt.Errorf("no targets found in modules %v", modules) + } + // Register manifests if requested (before connecting) if register { w.logger.Debug("registering manifests") @@ -151,7 +155,7 @@ func (w *Worker) Run(ctx context.Context, modules []string, register bool) error w.logger.Debug("creating session", "workspace", w.cfg.Workspace) provides := config.ParseProvides(w.cfg.Worker.Provides) var err error - sessionID, err = w.client.CreateSession(ctx, w.workspaceID, provides, w.cfg.Worker.Concurrency) + sessionID, err = w.client.CreateSession(ctx, w.workspaceID, provides) if err != nil { return fmt.Errorf("failed to create session: %w", err) } @@ -261,7 +265,7 @@ func (w *Worker) runWithReconnect(ctx context.Context, targets map[string]map[st func (w 
*Worker) runConnection(ctx context.Context, targets map[string]map[string][]string) (bool, error) { // Create new connection conn := api.NewConnection( - w.cfg.Server.Host, + w.cfg.Host, w.cfg.IsSecure(), w.workspaceID, w.sessionID, @@ -288,8 +292,8 @@ func (w *Worker) runConnection(ctx context.Context, targets map[string]map[strin errCh <- conn.Run(ctx) }() - // Declare targets via WebSocket (now that write loop is running) - if err := conn.Notify("declare_targets", targets); err != nil { + // Declare targets and concurrency via WebSocket (now that write loop is running) + if err := conn.Notify("declare_targets", targets, w.cfg.Worker.Concurrency); err != nil { return true, err } @@ -297,9 +301,9 @@ func (w *Worker) runConnection(ctx context.Context, targets map[string]map[strin hasExecutions := len(w.executions) > 0 w.mu.RUnlock() if hasExecutions { - w.logger.Info("reconnected", "host", w.cfg.Server.Host) + w.logger.Info("reconnected", "host", w.cfg.Host) } else { - w.logger.Info("connected", "host", w.cfg.Server.Host) + w.logger.Info("connected", "host", w.cfg.Host) } // Start heartbeat for this connection diff --git a/docs/docs/assets.md b/docs/docs/assets.md index 7ee08653..cc7e6a97 100644 --- a/docs/docs/assets.md +++ b/docs/docs/assets.md @@ -1,10 +1,10 @@ # Assets -An asset is a collection of files, which can be shared between tasks and inspected in the UI. Individual files are uploaded to the configured [blob store](/blobs). The listing and metadata are uploaded to the Coflux server. +An asset is a collection of files, which can be shared between tasks, inspected in Studio, and downloaded with the CLI. Individual files are uploaded to the configured [blob store](/blobs). The listing and metadata are uploaded to the Coflux server. ## Creating assets -An asset is created by calling `cf.asset()`. This will return an `Asset` object, which can be passed to other tasks (or returned), and then used to restore some or all of the files. 
+An asset is created by calling `cf.asset(...)`. This will return an `Asset` object, which can be passed to other tasks (or returned), and then used to restore some or all of the files. Each execution is started in a temporary directory. By default, `cf.asset()` will collect all files in the directory. diff --git a/docs/docs/authentication.md b/docs/docs/authentication.md new file mode 100644 index 00000000..6be49ba3 --- /dev/null +++ b/docs/docs/authentication.md @@ -0,0 +1,76 @@ +# Authentication + +By default, the Coflux server requires authentication. There are several authentication methods available, which can be used independently or combined. + +## Disabling authentication + +For local development, authentication can be disabled by starting the server with the `--no-auth` flag: + +```bash +coflux server --no-auth --project myproject +``` + +This allows anonymous access to all endpoints. This is not recommended for production use. + +## Super token + +A super token provides full access to the server. It is configured when starting the server: + +```bash +coflux server --super-token "my-secret-token" --project myproject +``` + +The token can then be used with the CLI: + +```bash +coflux submit --token "my-secret-token" mymodule/my_workflow +``` + +Or in `coflux.toml`: + +```toml +token = "my-secret-token" +``` + +If configuring the server via environment variables or a configuration file, you can provide the SHA-256 hash of the token instead of the plaintext token using `COFLUX_SUPER_TOKEN_HASH`. This is recommended for production environments where the token shouldn't appear in configuration files. 
+ +## Service tokens + +Service tokens can be created and scoped to specific workspaces, making them suitable for CI/CD pipelines, production workers, or as an alternative to Studio authentication for individual users: + +```bash +coflux tokens create --name "CI" --workspaces "production/*" +``` + +Service tokens require `COFLUX_SECRET` to be configured on the server. This secret is used to derive per-project signing keys. + +To manage tokens: + +```bash +coflux tokens list +coflux tokens revoke +``` + +## Studio authentication + +Studio authentication uses a device authorization flow. Run `coflux login`, approve in the browser, and the CLI manages token refresh automatically: + +```bash +coflux login +``` + +This requires `COFLUX_STUDIO_TEAMS` to be configured on the server with the allowed team IDs: + +```bash +coflux server --studio-teams "team-id-1,team-id-2" --project myproject +``` + +To log out: + +```bash +coflux logout +``` + +## Worker authentication + +Workers authenticate using the same token mechanisms. When a token is configured (via `coflux.toml`, the `--token` flag, or Studio login), it is used for all connections to the server, including WebSocket connections and blob/log uploads. diff --git a/docs/docs/blobs.md b/docs/docs/blobs.md index 8b868a49..62845341 100644 --- a/docs/docs/blobs.md +++ b/docs/docs/blobs.md @@ -1,4 +1,4 @@ -# Blobs +# Blob stores Blob stores are used to store non-trivial amounts of data - this includes execution results, arguments passed to other executions, and asset data. @@ -6,7 +6,7 @@ Separating the storage of data means that the Coflux server doesn't need to have By default Coflux will use a blob store embedded in the Coflux server, which saves blobs to the filesystem. 
-This can be configured explicitly in the CLI configuration file (which is used by workers; the configuration file can be initialised with the CLI using `coflux configure`): +This can be configured explicitly in the CLI configuration file (which is used by workers; the configuration file can be initialized with the CLI using `coflux setup`): ```toml [[blobs.stores]] @@ -17,7 +17,7 @@ protocol = "http" ## Blob threshold -To determine when to store data in the blob store, a blob 'threshold' is used. If the serialised data takes more than this number of bytes, the blob store will be used, and a reference to the blob is substituted - otherwise the raw data is sent to the Coflux server. The default threshold is 200 bytes. This can be specified in the configuration file: +To determine when to store data in the blob store, a blob 'threshold' is used. If the serialized data takes more than this number of bytes, the blob store will be used, and a reference to the blob is substituted - otherwise the raw data is sent to the Coflux server. The default threshold is 200 bytes. This can be specified in the configuration file: ```toml [blobs] @@ -46,6 +46,6 @@ Multiple blob stores can be configured - the first will be considered the 'prima This is useful when adding a new store, as the original store can still be read from. Blobs can be manually migrated to the new store, and then the original store can be removed from configuration. -## UI +## Studio -This configuration is only used by the CLI (e.g., for running workers). To support loading blobs in the UI, stores can be configured from the project settings dialog. Settings entered in the UI (including access keys) are stored in the browser in local storage. When blobs are loaded in the UI, they're cached in the browser in session storage. +This configuration is only used by the CLI (e.g., for running workers). To support loading blobs in Studio, stores can be configured from the project settings dialog. 
When blobs are loaded in Studio, they're cached in the browser in session storage. diff --git a/docs/docs/caching.md b/docs/docs/caching.md index d9e3d7ff..34916ca9 100644 --- a/docs/docs/caching.md +++ b/docs/docs/caching.md @@ -88,7 +88,7 @@ def fetch_product(product_id, url): :::note The names of arguments can be changed without affecting the cache - this is because the names are translated to indexes. -Additionally, if the order of parameters needs to be changed, the cache can be maintained by specifying (or rearranging) the `params`. In the following three versions of `my_task` the addition of a parameter, and then rearranging, won't effect the cache: +Additionally, if the order of parameters needs to be changed, the cache can be maintained by specifying (or rearranging) the `params`. In the following three versions of `my_task` the addition of a parameter, and then rearranging, won't affect the cache: ```python # before change @@ -152,7 +152,7 @@ If you need to re-evaluate a task that's cached, you can do so by 're-running' t ## Cache hit requirements -To summarise, the requirements for a cache hit (i.e., for a previous result to be reused, instead of executing a step) are that: +To summarize, the requirements for a cache hit (i.e., for a previous result to be reused, instead of executing a step) are that: 1. The result must be in the same workspace, or an ancestral workspace, within the same project. 2. The result must also have had caching enabled. diff --git a/docs/docs/cli_config.md b/docs/docs/cli_config.md new file mode 100644 index 00000000..04a939e5 --- /dev/null +++ b/docs/docs/cli_config.md @@ -0,0 +1,98 @@ +# CLI configuration + +The CLI is configured through a combination of a configuration file, environment variables, and command-line flags. The priority order is: flags > environment variables > config file > defaults. + +## Configuration file + +The default configuration file is `coflux.toml` in the current directory. 
Use `coflux setup` to create one interactively: + +```bash +coflux setup +``` + +A typical configuration file: + +```toml +host = "localhost:7777" +workspace = "default" +modules = ["myapp.workflows", "myapp.tasks"] + +[worker] +concurrency = 8 +adapter = ["python", "-m", "coflux"] + +[blobs] +threshold = 100 + +[[blobs.stores]] +type = "http" +url = "http://localhost:7777/blobs" +``` + +### Reference + +| Key | Default | Description | +|-----|---------|-------------| +| `host` | `localhost:7777` | Server host | +| `token` | _(none)_ | Authentication token | +| `workspace` | `default` | Workspace name | +| `modules` | `[]` | Modules to load | +| `secure` | _(auto)_ | Use TLS (defaults to `true` for non-localhost hosts) | +| `team` | _(none)_ | Team ID for Studio authentication | +| `output` | _(none)_ | Output format (`json` for machine-readable output) | +| `log_level` | `info` | Log level (`debug`, `info`, `warn`, `error`) | + +### Worker settings + +```toml +[worker] +concurrency = 8 +adapter = ["python", "-m", "coflux"] +provides = ["gpu:A100", "region:eu"] +``` + +| Key | Default | Description | +|-----|---------|-------------| +| `worker.concurrency` | _(CPU count + 4, max 32)_ | Maximum concurrent executions | +| `worker.adapter` | `[]` | Adapter command for executing Python code | +| `worker.provides` | `[]` | Features this worker provides (for pool matching) | + +### Blob storage + +See [Blobs](./blobs.md) for detailed blob store configuration. + +### Log storage + +```toml +[logs] +type = "http" +url = "http://localhost:7777/logs" +batch_size = 100 +flush_interval = 0.5 +``` + +## Environment variables + +All configuration keys can be set via environment variables with the `COFLUX_` prefix: + +```bash +export COFLUX_HOST=localhost:7777 +export COFLUX_TOKEN=my-token +export COFLUX_WORKSPACE=production +``` + +Nested keys use underscores: `COFLUX_WORKER_CONCURRENCY`, `COFLUX_BLOBS_THRESHOLD`. 
+ +## Global flags + +These flags are available on all commands: + +| Flag | Description | +|------|-------------| +| `--config`, `-c` | Path to configuration file (default: `coflux.toml`) | +| `--host` | Server host | +| `--token` | Authentication token | +| `--workspace`, `-w` | Workspace name | +| `--team`, `-t` | Team ID | +| `--output`, `-o` | Output format (`json`) | +| `--log-level` | Log level | diff --git a/docs/docs/concepts.md b/docs/docs/concepts.md index bdf4a414..6e9e074d 100644 --- a/docs/docs/concepts.md +++ b/docs/docs/concepts.md @@ -6,45 +6,45 @@ This page outlines the main concepts in Coflux. A Coflux _server_ can host multiple _projects_. Data for each project is isolated from other projects, and orchestration is handled by a dedicated process for each project. -You should use a separate project when: +Consider using separate projects when: -1. Data needs to be kept separate for reasons of security or privacy. -2. Throughput is -3. There's a logical separation of concerns. +1. There's a logical separation of concerns. +2. Data needs to be kept separate for compliance reasons. +3. Throughput needs to be improved (i.e., by partitioning). ## Workspaces -A individual project can contain multiple workspaces. All workspaces within a project are controlled by the same orchestration process, and some level of separation is provided between workspaces, but workspace inheritance allows controlled data sharing. Workspaces might be mapped to deployment environments (e.g., production, staging, development), or separated further - for example a workspace per customer in a production environment, or a workspace per developer in a development environment. Or even more granular separation is possible - for example using temporary workspaces which correspond with a Git branch, to work on fixing a bug or building a new feature. +An individual project can contain multiple workspaces.
All workspaces within a project are controlled by the same orchestration process, and some level of separation is provided between workspaces, but workspace inheritance allows controlled sharing of data. ### Workspace inheritance -By default there is isolation between workspaces within a project - for example, workflows, runs, results are separated. But workspaces can be arranged into a hierarchy. This allows: +By default there is isolation between workspaces within a project — for example, workflows, runs, and results are separated. But workspaces can be arranged into a hierarchy. This allows: -1. Cached (or memoised) results to be inherited from parent workspaces. -2. Steps to be _re-run_ in a 'child' workspaces. +1. Cached (or memoized) results to be inherited from parent workspaces. +2. Steps to be _re-run_ in child workspaces. -For example, a `development` workspace can inherit from a `production` workspace, allowing you to re-run whole workflows, or specific steps within a workflow, in a development workspace, experimenting with changes to the code without having to re-run the whole workflow from scratch. When working with a team on a shared project, you might choose to set up separate workspace for each engineer, or even create workspaces temporarily to work on specific features. - -This makes it easier to diagnose issues that arise in a production workspace by retrying individual steps locally, and trying out code changes safely. +For example, a `development` workspace could inherit from a `production` workspace, allowing you to re-run whole workflows, or specific steps within a run, in a development workspace. This lets you experiment with changes to the code using real data without having to re-run the whole workflow from scratch. When working with a team on a shared project, separate workspaces can be used by each engineer. Or separate short-lived workspaces can be used to work on individual features, or investigate bugs.
## Workers -An _worker_ is a process that hosts _modules_ (collections workflows/tasks). An worker connects to the server and is associated with a specific project and workspace. The worker waits for commands from the server telling it to execute specific tasks, and the worker monitors and reports progress of these executions back to the server. +A _worker_ is a process that hosts _modules_ (collections of workflows/tasks). A worker connects to the server and is associated with a specific project and workspace. The worker waits for commands from the server telling it to execute specific tasks, and reports progress of these executions back to the server. + +This model of having workers connect to the server provides flexibility over where and how workers are run. During development a worker can run locally on a laptop, restarting automatically as code changes are made. Or multiple workers can run in the cloud, or on dedicated machines — or a combination. A worker can be started with specific environment variables associated with the deployment environment (e.g., production access keys). -This model of having workers connect to the server provides flexibility over where and how workers are run. During development a worker can run locally on a laptop, restarting automatically as code changes are made. Or multiple workers can run in the cloud, or on dedicated machines - or a combination. An worker can be started with specific environment variables associated with the deployment environment (e.g., production access keys). +Each execution is run in an isolated process. The worker creates a number of 'warm' executor processes (with the module code loaded) ready to handle executions, so executions start in milliseconds. ## Workflows A _workflow_ is defined in a module, in code. Additionally, _tasks_ can be defined, and called from workflows (or other tasks). 
-Workflows and tasks are collectively referred to as _targets_, although workflows are really just special forms of tasks, from which runs can be started. You can think of the distinction between workflows and tasks a bit like the distinction between public and private functions in a module. +Workflows and tasks are collectively referred to as _targets_, although workflows are really just special forms of tasks, from which runs can be started. You can think of the distinction between workflows and tasks like the distinction between public and private functions in a module. -Workflows need to be registered with a project and workspace so that they appear in the UI. This can be done explicitly (e.g., for a production workspace as part of a build process), or automatically by a worker when it starts/restarts (using the `--register` or `--dev` flag). +Workflows need to be registered with a project and workspace so that they appear in Studio. This can be done explicitly (e.g., for a production workspace as part of a build process), or automatically by a worker when it starts/restarts (using the `--register` or `--dev` flag). ## Runs -When a workflow is submitted, this initiates a _run_. A run is made up of _steps_, which each correspond to a target to be executed. The target (a workflow or task) can call other tasks, which cause those to scheduled as steps. Each step has at least one associated _execution_. Steps can be retried (manually or automatically), which will lead to multiple executions being associated with the step. +When a workflow is submitted, this initiates a _run_. A run is made up of _steps_, which each correspond to a target to be executed. The target (a workflow or task) can call other tasks, which cause those to be scheduled as steps. Each step has at least one associated _execution_. Steps can be retried (manually or automatically), which will lead to multiple executions being associated with the step. 
-# Assets +## Assets -Executions can persist _assets_ (a collection of files) which can be passed between executions and restored as needed, or viewed in the UI. +Executions can persist _assets_ (a collection of files) which can be passed between executions and restored as needed, or viewed in Studio. diff --git a/docs/docs/concurrency.md b/docs/docs/concurrency.md index 35b9eb88..105cd313 100644 --- a/docs/docs/concurrency.md +++ b/docs/docs/concurrency.md @@ -1,8 +1,8 @@ # Concurrency -By default, when a task is called from another task (or workflow), execution will block while waiting for the called task to complete. This is more intuitive for beginners, and also makes code more portable. +By default, when a task is called from another task (or workflow) - e.g., with `my_task()` - execution will block while waiting for the called task to complete. This is more intuitive for beginners, and also makes code more portable. -Often, however, you'll want to be able to execute tasks in parallel, and collect the results later on. Or trigger a task without waiting for the result. This can be done by 'submitting' the task (using `.submit(...)`) instead of calling it. This returns an `Execution`, which is a 'future'-like object that can be used to wait for the result (using `.result()`), when needed: +Often, however, you'll want to be able to execute tasks in parallel, and collect the results later on. Or trigger a task without waiting for the result. This can be done by 'submitting' the task - e.g., with `my_task.submit(...)`. This returns an `Execution` object, which is a 'future'-like object that can be used to wait for the result (using `.result()`), when needed: ```python @cf.task() @@ -118,4 +118,4 @@ def my_workflow(): execution.cancel() ``` -In this case `my_workflow` submits `another_workflow` (causing a separate run to be started), but then cancels it. The effect is the same as if the run had been cancelled in the UI. 
+In this case `my_workflow` submits `another_workflow` (causing a separate run to be started), but then cancels it. The effect is the same as if the run had been cancelled in Studio. diff --git a/docs/docs/deferring.md b/docs/docs/deferring.md index 72c05534..77ffe937 100644 --- a/docs/docs/deferring.md +++ b/docs/docs/deferring.md @@ -19,7 +19,7 @@ Delaying tasks is useful in combination with 'deferring' as a way to de-duplicat For example, you might want to be able to send a notification to a user to notify of them of updates to a document. If there are lots of updates to the document within a short period of time, you wouldn't want to send notifications for every change. Instead, you can configure a delay, as above, and enable deferring. This is done by specifying the `defer` option on the task: ```python -@cf.task(delay=60, defer=True): +@cf.task(delay=60, defer=True) def send_notification(user_id, document_id): ... ``` @@ -38,4 +38,4 @@ def send_notification(user_id, document_id, update): ... ``` -In this case, subsequent calls for the same user and document would be de-duplicated, even though the update is different each time. Initial calls to `send_notification` would be discarded, in favour of the latest call. +In this case, subsequent calls for the same user and document would be de-duplicated, even though the update is different each time. Initial calls to `send_notification` would be discarded, in favor of the latest call. diff --git a/docs/docs/examples.md b/docs/docs/examples.md index 53de5405..4a48e403 100644 --- a/docs/docs/examples.md +++ b/docs/docs/examples.md @@ -1,3 +1,3 @@ # Example workflows -See the [examples](https://github.com/bitroot/coflux/tree/main/examples) directory on GitHub for some example workflows. +See the [examples directory](https://github.com/bitroot/coflux/tree/main/examples) on GitHub for example workflows. 
diff --git a/docs/docs/executions.md b/docs/docs/executions.md index a68d59ba..596c1287 100644 --- a/docs/docs/executions.md +++ b/docs/docs/executions.md @@ -17,12 +17,10 @@ graph TB; assigned-->cancelled[Cancelled] ``` -When an execution is first scheduled, it starts in the _Queued_ state. +When an execution is first scheduled, it starts in the _Queued_ state — unless caching is enabled and there is a cache hit, in which case it transitions straight to _Cached_. -(unless caching is enabled and there is a cache hit, in which case it transitions straight to _Cached_). +From the _Queued_ state it will transition to _Assigned_ once it is due and a suitable worker is available to run it, unless: caching is enabled and there's a cache hit; it refers to a workflow, in which case that is 'spawned' as a separate run; or it becomes 'deferred' to another execution in the meantime. -From the _Queued_ state it will transition to _Assigned_ once it is due, and a suitable worker is available to run it, unless: caching is enabled and there's a cache hit; it's referring to a workflow, in which case that is 'spawned' as a separate run; it become 'deferred' to another execution in the meantime. +Once assigned, the worker will generally execute it until it succeeds (_Succeeded_) or raises an exception (_Failed_). If contact is lost with a worker for more than the timeout period, the tasks that are running will be marked as _Abandoned_ (we don't know whether they completed successfully). Executions may be _Cancelled_ while they're running (or before they've been assigned). An execution may choose to suspend itself (either explicitly, or from timing out while waiting for another execution to complete) — in this case it will be automatically re-run. -Once assigned, the worker will generally execute it until it succeeds (_Succeeded_) or raises an exception (_Failed_). 
If contact is lost with a worker for more than the timeout period, the task that are running will be marked as _Abandoned_ (we don't know whether it completed successfully). Executions may be _Cancelled_ while they're running (or before they've been assigned). An execution may choose to suspend itself (either explicitly, or from timing out while waiting for another execution to complete) - in this case it will be automatically re-run. - -Steps maybe be configured to automatically retry (from a failed or abandoned state), or they may be re-run manually, in which case a new execution will be started. +Steps may be configured to automatically retry (from a failed or abandoned state), or they may be re-run manually, in which case a new execution will be started. diff --git a/docs/docs/getting_started/install.md b/docs/docs/getting_started/install.md index 7721506f..ebb70248 100644 --- a/docs/docs/getting_started/install.md +++ b/docs/docs/getting_started/install.md @@ -1,11 +1,19 @@ -# 1. Installing +# 1. Installing the CLI -Coflux is available as a Python package on PyPI. You can install it using `pip` or similar: +The quickest way to install the Coflux CLI on macOS or Linux: ```bash -pip install coflux +curl -fsSL https://coflux.com/install.sh | sh ``` -The package includes the SDK for defining workflows as well as the CLI. +This detects your OS and architecture, downloads the latest release, and installs the binary to `/usr/local/bin`. + +Alternatively, you can download the binary directly from the [GitHub releases](https://github.com/bitroot/coflux/releases) page. + +Once installed, verify it's working: + +```bash +coflux --help +``` Next, we'll start the server... diff --git a/docs/docs/getting_started/runs.md b/docs/docs/getting_started/runs.md index bfe86011..83f45e57 100644 --- a/docs/docs/getting_started/runs.md +++ b/docs/docs/getting_started/runs.md @@ -2,31 +2,24 @@ We've defined our workflow, started the Coflux server, and started a worker. 
The final step is to submit a run of our workflow. -## Using the web UI - -We can do this in the web UI: - -1. Select the `print_greeting` workflow in the sidebar. -2. Click the 'Run...' button. -3. Enter a name (it must be _JSON-encoded_ - e.g., `"Joe"`, in quotes). -4. Click 'Run'. +## Using the CLI -In the web UI, you'll see the run graph appear as the run executes. +Submit a run using the CLI: -### Exploring the run +```bash +coflux submit hello/print_greeting '"Joe"' +``` -From the graph you can see the relationship between steps. You can also switch to _timeline_ and _logs_ views. And select steps to see details related to the specific step. You should be able to find the result from the `build_greeting` step, and this result being logged by the `print_greeting` step. +The target is specified in the format `module/target`. Arguments are passed as JSON strings (note the need to quote the argument). -## Using the CLI +By default, the command waits for the workflow to complete, showing a live-updating tree of step statuses. Use `--no-wait` to submit and exit immediately. -You can also submit runs using the CLI: +You can also inspect runs and retrieve results: ```bash -coflux submit hello.py print_greeting '"Joe"' +coflux runs inspect +coflux runs result +coflux logs ``` -(Note the need to triple-quote the argument.) - -## Next steps - -Congratulations on defining and starting your first run. Continue with the documentation or try defining another workflow. +Next, we can connect from Studio to explore runs in a browser... diff --git a/docs/docs/getting_started/server.md b/docs/docs/getting_started/server.md index 6b45c59d..6f691e7b 100644 --- a/docs/docs/getting_started/server.md +++ b/docs/docs/getting_started/server.md @@ -3,11 +3,13 @@ Use the CLI to start the server locally: ```bash -coflux server +coflux server --no-auth --project myproject ``` +The `--project` flag configures the server for single-project mode. 
A _project_ is a top-level unit of isolation — it has its own data, orchestration process, and set of workspaces. The `--no-auth` flag disables authentication, which simplifies getting started. See the [authentication documentation](/authentication) for how to set up authentication for production use. + :::note -The command is just a wrapper around `docker run`, so you'll need to have Docker installed and running. +The command is a wrapper around `docker run`, so you'll need to have Docker installed and running. Alternatively you can start the server with Docker directly: @@ -16,20 +18,12 @@ docker run \ --pull always \ -p 7777:7777 \ -v $(pwd):/data \ + -e COFLUX_PROJECT=myproject \ + -e COFLUX_REQUIRE_AUTH=false \ ghcr.io/bitroot/coflux ``` ::: -Open up the web UI at http://localhost:7777. - -## Setting up a project - -Before we can connect a worker, we need to create a Coflux project and a workspace. - -In the web UI, click 'New project...', enter a project name, and click 'Create'. - -Now that you have an empty project, you'll be prompted to add a workspace. Enter a name (or use the suggested one), and click 'Create'. - -Take note of the project ID and workspace name in the instructions. +The server is now running on `localhost:7777`. Next, we can define a workflow... diff --git a/docs/docs/getting_started/studio.md b/docs/docs/getting_started/studio.md new file mode 100644 index 00000000..8379db48 --- /dev/null +++ b/docs/docs/getting_started/studio.md @@ -0,0 +1,34 @@ +# 6. Connecting from Studio + +Coflux Studio provides a web UI for monitoring and exploring workflow runs, managing access to projects, and sharing them within a team. It connects to your self-hosted Coflux server at runtime — your data stays in your infrastructure. + +## Connecting + +1. Visit [studio.coflux.com](https://studio.coflux.com). +2. Create a project, entering your server address (`localhost:7777`). +3.
You should see your project and workspace, along with the workflow you registered. + +## Exploring a run + +Select the `print_greeting` workflow in the sidebar and find the run you submitted. From here you can: + +- View the **run graph** showing the relationship between steps. +- Switch to **timeline** and **logs** views. +- Select individual steps to see their details, results, and logs. +- Start new runs. + +You should be able to find the result from the `build_greeting` step, and this result being logged by the `print_greeting` step. + +:::note +Studio can be used without creating an account. Creating an account allows you to use Studio for authentication and to share access to projects with your team. +::: + +## Next steps + +Now that you have a workflow running, here are some areas to explore: + +- [Concurrency](/concurrency) — run tasks in parallel using `.submit()`. +- [Caching](/caching) — avoid re-computing results that haven't changed. +- [Automatic retries](/retries) — handle transient failures gracefully. +- [Assets](/assets) — persist and share files between tasks. +- [Concepts](/concepts) — a deeper look at projects, workspaces, and workspace inheritance. diff --git a/docs/docs/getting_started/workers.md b/docs/docs/getting_started/workers.md index 953e809c..614e0382 100644 --- a/docs/docs/getting_started/workers.md +++ b/docs/docs/getting_started/workers.md @@ -1,33 +1,33 @@ # 4. Starting workers -Modules are hosted by _workers_ - each worker can have its own package dependencies, and be deployed within your infrastructure as needed - for example one worker could be deployed on an on-premise bare-metal server with a GPU, and another worker could be deployed as a Docker image on an auto-scaling cloud cluster. +Modules are hosted by _workers_. A worker is a process that connects to the server and executes the code required by your workflows. 
Each worker can have its own package dependencies, and be deployed within your infrastructure as needed — for example, one worker could run on an on-premise server with a GPU, while another runs as a Docker image on an auto-scaling cloud cluster. -An worker is a process that's responsible for executing the code required by your workflow - it will: +A worker will: -1. Listen for commands from the orchestrator. -2. Invoke and monitor executions of operations (in forked sub-processes). -3. Report the status (including results, errors, etc.) of executions back to the orchestrator. +1. Listen for commands from the server. +2. Execute operations in isolated sub-processes. +3. Report the status (including results, errors, etc.) of executions back to the server. -Importantly, they can be run locally, automatically watching for code changes, restarting, and registering workflows as needed. +Importantly, workers can be run locally, automatically watching for code changes, restarting, and registering workflows as needed. -## Initialise +## Set up -Use the `configure` command to populate a configuration file. A configuration file isn't necessary, but avoids having to specify configuration manually in the following commands. Run the following command: +Use the `setup` command to populate a configuration file (`coflux.toml`). A configuration file isn't necessary, but avoids having to specify configuration manually in the following commands. Run the following command: ```bash -coflux configure +coflux setup ``` -You will be prompted to enter the host (`localhost:7777`), the project ID, and the workspace name. +You will be prompted to enter the host (`localhost:7777`), the _workspace_ name, and the adapter command for your Python environment. A workspace is an environment within a project (e.g., `production`, `development/joe`) — see [Concepts](/concepts#workspaces) for more detail. Use `--detect` to auto-detect your Python environment. ## Run Now the worker can be started. 
Run the following command: ```bash -coflux worker --dev hello.py +coflux worker --dev hello ``` -In the web UI you will be able to see your workflow appear in the sidebar. +The `--dev` flag (equivalent to specifying `--watch` and `--register`) enables development mode, which watches for code changes, automatically restarts the worker, and registers workflows with the server. Without it, modules need to be registered separately (e.g., using `coflux manifests register`), and the worker needs to be restarted manually after making code changes. -Next, let's initiate a run... +Next, let's submit a run... diff --git a/docs/docs/getting_started/workflows.md b/docs/docs/getting_started/workflows.md index f39225c9..2a88d408 100644 --- a/docs/docs/getting_started/workflows.md +++ b/docs/docs/getting_started/workflows.md @@ -4,6 +4,14 @@ Workflows are defined in code, using Python functions, which are decorated to in The decorators are intended to be unimposing so that functions can be executed outside of Coflux. +## Installing the Python package + +The `coflux` Python package provides the decorators and runtime for defining and executing workflows. Install it into the environment that your workflow code will run in: + +```bash +pip install coflux +``` + ## An example Here's a simple example: @@ -30,7 +38,7 @@ Workflows are defined in _modules_. Typically these are Python modules, but they Put the workflow above into `hello.py`. :::tip -The docstring of a workflow will be available in the UI when running a workflow. This is a great place to explain what the workflow does. +The docstring of a workflow will be available in Studio when running a workflow. This is a great place to explain what the workflow does. ::: Before coming back to more advanced features, let's see how to get this workflow running...
diff --git a/docs/docs/groups.md b/docs/docs/groups.md index 9f11c94e..a05b74a4 100644 --- a/docs/docs/groups.md +++ b/docs/docs/groups.md @@ -15,7 +15,7 @@ def my_workflow(n: int): my_task(i) ``` -But starting a large number of tasks can make it difficult to navigate the graph in the UI - especially when those tasks are themselves starting other tasks. To make the graphs easier to navigate, Coflux has the concept of _task groups_. +But starting a large number of tasks can make it difficult to navigate the graph in Studio — especially when those tasks are themselves starting other tasks. To make the graphs easier to navigate, Coflux has the concept of _task groups_. A group can be created using a context manager: @@ -27,14 +27,14 @@ def my_workflow(n: int): my_task(i) ``` -Now all of the steps will be assigned to the group. In the UI only one step from the group will be displayed at a time. +Now all of the steps will be assigned to the group. In Studio, only one step from the group will be displayed at a time. A group -The name passed to `cf.group(...)` is optional, and simply serves as a way to label the group in the UI. +The name passed to `cf.group(...)` is optional, and simply serves as a way to label the group in Studio. :::note -Note that steps can be run in parallel by 'submitting' them : +Note that steps can be run in parallel by 'submitting' them: ```python @cf.workflow() @@ -70,7 +70,7 @@ Tasks that are called within a group don't need to be the same: ```python with cf.group("My tasks"): for i in range(n): - if n % 2 == 0: + if i % 2 == 0: even_task(i) else: odd_task(i) diff --git a/docs/docs/intro.md b/docs/docs/intro.md index 8a2e925e..f52abfc9 100644 --- a/docs/docs/intro.md +++ b/docs/docs/intro.md @@ -4,16 +4,24 @@ slug: '/' # Introduction -Coflux is an open-source workflow engine. It can be used to orchestrate and observe computational workflows, which are defined in plain Python. 
It aims to maximise developer productivity and be easy to try out and adopt. +Coflux is an open-source workflow engine for orchestrating and observing computational workflows defined in plain Python. You define workflows using simple decorators — no DSLs, no YAML, no DAG definitions — and Coflux handles scheduling, retries, caching, and observability. -It can be used to build data pipelines, coordinate background tasks, or orchestrate real-time workflows. +You can use it to build data pipelines, coordinate background tasks, or orchestrate real-time workflows. + +### Why Coflux? + +- **Plain Python**: Workflows are regular Python functions with decorators. They remain callable outside of Coflux, making them easy to test and debug. +- **Self-hosted**: You run the server. Your data stays in your infrastructure. +- **Workspace inheritance**: Branch your production environment into development workspaces, re-run specific steps with real data, and experiment without affecting production. +- **Live observation**: Studio provides a web UI for exploring run graphs, timelines, logs, and results in real time. ## Getting started -The first section in this guide describes the steps involved in defining and running a workflow: +The following guide walks you through defining and running your first workflow: -1. [Installing the Python package](./getting_started/install.md) +1. [Installing the CLI](./getting_started/install.md) 2. [Starting the server](./getting_started/server.md) 3. [Defining a workflow](./getting_started/workflows.md) 4. [Running a worker](./getting_started/workers.md) -5. [Submitting and observing a run](./getting_started/runs.md) +5. [Submitting a run](./getting_started/runs.md) +6. 
[Connecting from Studio](./getting_started/studio.md) (optional) diff --git a/docs/docs/logging.md b/docs/docs/logging.md index 6245ee25..e2cb75c9 100644 --- a/docs/docs/logging.md +++ b/docs/docs/logging.md @@ -11,7 +11,7 @@ cf.log_info( ) ``` -These messages will appear in the web UI like this: +These messages will appear in Studio like this: Log messages diff --git a/docs/docs/memoising.md b/docs/docs/memoizing.md similarity index 58% rename from docs/docs/memoising.md rename to docs/docs/memoizing.md index e88fa4e8..29fb6f1b 100644 --- a/docs/docs/memoising.md +++ b/docs/docs/memoizing.md @@ -1,10 +1,10 @@ -# Memoising +# Memoizing -Memoising is similar to [caching](/caching), however it only applies to steps within a run, and serves a subtly different purpose. With caching, a cache hit still results in a new step entity, but the result will be shared. Memoising is more lightweight because the existing execution is referenced directly, rather than creating a new step which references the existing result. +Memoizing is similar to [caching](/caching); however, it only applies to steps within a run, and serves a subtly different purpose. With caching, a cache hit still results in a new step entity, but the result will be shared. Memoizing is more lightweight because the existing execution is referenced directly, rather than creating a new step which references the existing result. -Memoising can be used as a way of optimising runs (by sharing a result), and also to make debugging runs easier. +Memoizing can be used as a way of optimizing runs (by sharing a result), and also to make debugging runs easier. -Enable memoising of a task with the `memo` option: +Enable memoizing of a task with the `memo` option: ```python @task(memo=True) @@ -12,25 +12,25 @@ def fetch_user(user_id): ... ``` -Memoised steps are indicated in the web UI with a pin icon. +Memoized steps are indicated in Studio with a pin icon.
-As with caching, explicitly clicking the 're-run' button for a step will force the step to be re-run, even if it's memoised. Then subsequent memoising will use the new step execution. +As with caching, explicitly clicking the 're-run' button for a step will force the step to be re-run, even if it's memoized. Then subsequent memoizing will use the new step execution. -If a step is manually re-run in a child workspace, the memoised results will be used (but memoised results from the child workspace aren't available to the parent). This follows the same rules as caching. +If a step is manually re-run in a child workspace, the memoized results will be used (but memoized results from the child workspace aren't available to the parent). This follows the same rules as caching. ## For debugging -Memoising provides several benefits for debugging: +Memoizing provides several benefits for debugging: -1. Memoising a task with side effects (e.g., sending a notification e-mail) means you can re-run the whole run (or part of it) without that side-effect happening. +1. Memoizing a task with side effects (e.g., sending a notification email) means you can re-run the whole run (or part of it) without that side-effect happening. -2. Memoising slow tasks allows you to fix bugs that are occurring elsewhere in the workflow. +2. Memoizing slow tasks allows you to fix bugs that are occurring elsewhere in the workflow. -This is particularly useful when re-running a workflow from a production workspace in a development workspace (assuming the production workspace is configured as an ancestor of the development workspace). By liberally memo-ising tasks, specific steps can be re-run in the development workspace without re-running downstream steps. +This is particularly useful when re-running a workflow from a production workspace in a development workspace (assuming the production workspace is configured as an ancestor of the development workspace). 
By liberally memoizing tasks, specific steps can be re-run in the development workspace without re-running downstream steps. -## For optimisation +## For optimization -Memoising can also be used as an optimisation for workflows. For example, if a resource needs to be used in multiple parts of a workflow, rather than passing around that resource, the task to fetch it can be memoised: +Memoizing can also be used as an optimization for workflows. For example, if a resource needs to be used in multiple parts of a workflow, rather than passing around that resource, the task to fetch it can be memoized: ```python @task(memo=True) @@ -52,11 +52,11 @@ In this case, the `fetch_user` task will only be executed once for the run, even ## Memo parameters -As with caching, by default the memoisation considers all arguments. This can be changed by specifying the individual parameters: +As with caching, by default the memoization considers all arguments. This can be changed by specifying the individual parameters: ```python @task(memo=["machine_id"]) -def apply_configuration(machine_id, config) +def apply_configuration(machine_id, config): ... ``` diff --git a/docs/docs/pools.md b/docs/docs/pools.md new file mode 100644 index 00000000..7d2eeba3 --- /dev/null +++ b/docs/docs/pools.md @@ -0,0 +1,104 @@ +# Pools + +:::note +Pools are an early preview feature. The concept is functional but will become more useful as additional launcher types are added. +::: + +A _pool_ defines a configuration for automatically launching workers in a workspace. Instead of manually starting workers, you can configure a pool and the server will launch and manage workers on your behalf. + +## Configuring a pool + +Pools are configured using the CLI: + +```bash +coflux pools update mypool \ + --module myapp.workflows \ + --module myapp.tasks \ + --docker-image myorg/myapp:latest +``` + +### Launcher types + +Each pool has a _launcher_ that determines how workers are started. 
The server must be configured to allow the relevant launcher type (`COFLUX_LAUNCHER_TYPES`). + +#### Docker launcher + +Launches workers as Docker containers: + +```bash +coflux pools update mypool \ + --docker-image myorg/myapp:latest \ + --docker-host tcp://docker:2375 +``` + +| Option | Description | +|--------|-------------| +| `--docker-image` | Docker image to run | +| `--docker-host` | Docker host (default: local socket) | + +#### Process launcher + +Launches workers as local processes: + +```bash +coflux pools update mypool \ + --process-dir /path/to/project +``` + +| Option | Description | +|--------|-------------| +| `--process-dir` | Working directory for the worker process | + +### Common options + +These options apply to all launcher types: + +| Option | Description | +|--------|-------------| +| `--module`, `-m` | Modules to host (can be specified multiple times) | +| `--provides` | Features that workers provide (e.g., `gpu:A100`) | +| `--server-host` | Server host override for launched workers | +| `--adapter` | Adapter command | +| `--concurrency` | Maximum concurrent executions per worker | +| `--env` | Environment variables (e.g., `--env KEY=VALUE`) | + +## Managing pools + +```bash +# List pools in a workspace +coflux pools list + +# Get pool configuration +coflux pools get mypool + +# View launched workers +coflux pools launches mypool + +# Watch launches in real-time +coflux pools launches mypool --watch + +# Delete a pool +coflux pools delete mypool +``` + +## Provides and requires + +Workers can declare features they _provide_, and targets can _require_ specific features. This allows routing executions to appropriate workers — for example, GPU-intensive tasks to GPU-equipped workers. 
+ +On the worker side, configure `provides` on the pool: + +```bash +coflux pools update gpu-pool \ + --docker-image myorg/gpu-worker:latest \ + --provides "gpu:A100" +``` + +On the task side, specify `requires` in the decorator: + +```python +@cf.task(requires={"gpu": "A100"}) +def train_model(data): + ... +``` + +The `requires` parameter accepts a dictionary where keys are feature names and values can be a specific value (`"A100"`), a list of acceptable values (`["A100", "H100"]`), or `True` to require the feature with any value. diff --git a/docs/docs/recurring.md b/docs/docs/recurring.md index 6c741bc4..4bf048a6 100644 --- a/docs/docs/recurring.md +++ b/docs/docs/recurring.md @@ -10,7 +10,7 @@ def poll_for_updates(): process_update.submit(update) ``` -This continues indefinitely until the run is cancelled. +This continues indefinitely until the run is cancelled or an error occurs (without a successful retry). ## Delay diff --git a/docs/docs/retries.md b/docs/docs/retries.md index f2327290..a9776c18 100644 --- a/docs/docs/retries.md +++ b/docs/docs/retries.md @@ -59,3 +59,47 @@ def critical_task(): ``` With unlimited retries, each retry uses a random delay between the min and max backoff seconds. + +## Retry conditions + +By default, retries apply to all errors. The `when` parameter allows you to control which errors trigger a retry — errors that don't match fail immediately without consuming retry attempts. + +The condition is evaluated on the worker at the time the exception is raised, so it has access to the full exception object. + +### Exception class + +Specify a single exception type to only retry on that error: + +```python +@cf.task(retries=cf.Retries(limit=3, when=ConnectionError)) +def call_api(): + ... +``` + +### Tuple of exception classes + +Specify multiple exception types as a tuple: + +```python +@cf.task(retries=cf.Retries(limit=3, when=(ConnectionError, TimeoutError))) +def call_api(): + ... 
+``` + +### Callback function + +For more complex logic, pass a function that receives the exception and returns whether to retry: + +```python +@cf.task( + retries=cf.Retries( + limit=5, + backoff=(1, 30), + when=lambda e: getattr(e, "status_code", 0) >= 500, + ), +) +def call_api(): + ... +``` + +This is useful for distinguishing between transient errors (e.g., 5xx server errors) that are worth retrying and permanent errors (e.g., 4xx client errors) that should fail immediately. diff --git a/docs/docs/serialization.md b/docs/docs/serialization.md new file mode 100644 index 00000000..8bacdb24 --- /dev/null +++ b/docs/docs/serialization.md @@ -0,0 +1,54 @@ +# Serialization + +Values passed between executions (arguments and results) are automatically serialized and deserialized. This page describes the types that are supported and how more complex types are handled. + +## Supported types + +The following Python types are supported natively: + +| Type | Notes | +|------|-------| +| `None`, `bool`, `int`, `float`, `str` | Basic types | +| `list` | Elements serialized recursively | +| `dict` | Supports non-string keys | +| `set`, `frozenset` | | +| `tuple` | Type preserved (not converted to list) | +| `datetime`, `date`, `time` | | +| `timedelta` | | +| `Decimal` | Precision preserved | +| `UUID` | | +| `bytes`, `bytearray` | Stored as fragments (see [Blobs](./blobs.md)) | + +## Pydantic models + +Pydantic models are automatically detected and serialized. They can be passed between executions (even across different workers) as long as the model class is available in both environments.
+ +```python +from pydantic import BaseModel + +class Order(BaseModel): + item: str + quantity: int + +@cf.task() +def create_order(item: str, quantity: int) -> Order: + return Order(item=item, quantity=quantity) + +@cf.task() +def process_order(order: Order): + print(f"Processing {order.quantity}x {order.item}") +``` + +## Pandas DataFrames + +Pandas DataFrames are serialized using the Parquet format, which preserves column types and is efficient for large datasets. Pandas must be installed in the worker environment. + +## Pickle fallback + +Any type not explicitly handled above is serialized using Python's `pickle` module. This provides broad compatibility but comes with the usual caveats: + +- Both sides must have the same class definition available. +- Pickle is Python-specific and not portable across languages. +- Be cautious about unpickling data from untrusted sources. + +The type name is recorded in metadata, which is displayed in Studio to help identify pickled values. diff --git a/docs/docs/server_config.md b/docs/docs/server_config.md new file mode 100644 index 00000000..8c551c64 --- /dev/null +++ b/docs/docs/server_config.md @@ -0,0 +1,68 @@ +# Server configuration + +The Coflux server is distributed as a Docker image and can be started using the CLI or directly with Docker. + +## Starting the server + +```bash +coflux server +``` + +This is a convenience wrapper around `docker run`. Docker must be installed and running. 
+ +### Options + +| Flag | Default | Description | +|------|---------|-------------| +| `--project` | _(none)_ | Restrict the server to a single project | +| `--port` | `7777` | Port to run the server on | +| `--data-dir` | `./data` | Directory for persistent data | +| `--no-auth` | `false` | Disable authentication | +| `--super-token` | _(none)_ | Set a super token for authentication | +| `--image` | _(auto)_ | Docker image to use | + +## Projects + +The server can operate in two modes: + +### Single-project mode + +Use `--project` to restrict the server to a single project. All requests are routed to this project regardless of the hostname used to connect: + +```bash +coflux server --project myproject +``` + +### Multi-project mode + +Configure `COFLUX_PUBLIC_HOST` with a `%` placeholder to enable subdomain-based routing, where the project is extracted from the subdomain: + +```bash +docker run \ + -e COFLUX_PUBLIC_HOST=%.localhost:7777 \ + ghcr.io/bitroot/coflux +``` + +Workers and Studio connect using the project as a subdomain (e.g., `myproject.localhost:7777`). + +## Environment variables + +The server is configured via environment variables. When using `coflux server`, CLI flags are mapped to these variables automatically. 
+ +| Variable | Default | Description | +|----------|---------|-------------| +| `COFLUX_PROJECT` | _(none)_ | Restrict to a single project | +| `COFLUX_PUBLIC_HOST` | `localhost:PORT` | Public host (use a `%` placeholder for subdomain-based routing) | +| `COFLUX_REQUIRE_AUTH` | `true` | Whether authentication is required | +| `COFLUX_SUPER_TOKEN_HASH` | _(none)_ | SHA-256 hex hash of the super token | +| `COFLUX_SECRET` | _(none)_ | Server secret for signing service tokens | +| `COFLUX_STUDIO_TEAMS` | _(none)_ | Comma-separated team IDs for Studio auth | +| `COFLUX_STUDIO_URL` | `https://studio.coflux.com` | Studio URL for JWKS | +| `COFLUX_DATA_DIR` | `./data` | Data directory path | +| `COFLUX_ALLOW_ORIGINS` | `https://studio.coflux.com` | Comma-separated CORS origins | +| `COFLUX_LAUNCHER_TYPES` | _(none)_ | Allowed launcher types (e.g., `docker,process`) | +| `COFLUX_CLI_PATH` | `coflux` | CLI binary path for process launcher | + +## Data storage + +The server stores data in the configured data directory. Each project gets its own SQLite database. Data is organized into rotating epochs, which allows the server to manage data growth without losing access to historical runs. diff --git a/docs/docs/stubs.md b/docs/docs/stubs.md deleted file mode 100644 index d2910027..00000000 --- a/docs/docs/stubs.md +++ /dev/null @@ -1,40 +0,0 @@ -# Stubs - -A stub allows you to define a reference to a step that's in another module (in a separate codebase) by its name. Using stubs can make it easier to separate 3rd-party dependencies since you don't need to `import` the target module into into the module you're calling it from.
- -For example, given an `other.workflows` module with a random number generator: - -```python -# other/workflows.py - -@cf.task() -def random_int(max: int) -> int: - return random.randint(1, max) -``` - -Another module could reference this function with a stub, and then call it: - -```python -# example/workflows.py - -@cf.stub("other.workflows") -def random_int(max: int) -> int: - ... - -@cf.workflow() -def roll_die(): - if random_int(6).result() == 6: - print("You won") - else: - print("You lost") -``` - -## Stub implementations - -When you call the stub in the context of a workflow, the function itself won't be executed, so the body of the function isn't important. However, being able to implement the function is useful when you want to be able to run your code outside of the context of a workflow. For example, as part of a test, you could return some dummy data. - -```python -@cf.stub("other.workflows") -def random_int(max: int) -> int: - return 4 # Dummy value for testing -``` diff --git a/docs/docs/suspense.md b/docs/docs/suspense.md index e6ee4d29..863f13f7 100644 --- a/docs/docs/suspense.md +++ b/docs/docs/suspense.md @@ -1,14 +1,14 @@ # Suspense -Suspense is a way of putting the task to sleep - the current execution will be stopped and a new execution will be started, executing _from the beginning of the task_. +Suspense is a way of putting a task to sleep — the current execution will be stopped and a new execution will be started, executing _from the beginning of the task_. -The suspense can be either _explicit_ or _implicit_. In either case, it's important that it's safe for the code up to the point of suspense can be re-executed - i.e., any side-effects need to be idempotent. (An easy way to achieve this is to ensure that any tasks called by the execution are [memoised](/memoising).) +The suspense can be either _explicit_ or _implicit_. 
In either case, it's important that the code up to the point of suspense is safe to re-execute — i.e., any side-effects need to be idempotent. (An easy way to achieve this is to ensure that any tasks called by the execution are [memoized](/memoizing).) Suspense is useful as a way of freeing up resources used by a waiting execution. ## Explicit suspense -To explicitly suspend an execution, simply call the `suspend` function, passing either a delay (as a number of seconds, or as a `datetime.timedelta`), or a future timestamp (as a `datetime.datetime`): +To explicitly suspend an execution, call the `suspend` function, passing either a delay (as a number of seconds, or as a `datetime.timedelta`), or a future timestamp (as a `datetime.datetime`): ```python @cf.workflow() @@ -47,5 +47,5 @@ with cf.suspense(10): ``` :::warning -It's important that any tasks that are called within the suspense block _or before it_ are [memoised](/memoising) (or cached). Otherwise the task is likely to keep suspending as a new task will be spawned on each execution. +It's important that any tasks called within the suspense block _or before it_ are [memoized](/memoizing) (or cached). Otherwise the task is likely to keep suspending as a new task will be spawned on each execution. ::: diff --git a/docs/docs/workflows.md b/docs/docs/workflows.md new file mode 100644 index 00000000..6815b276 --- /dev/null +++ b/docs/docs/workflows.md @@ -0,0 +1,98 @@ +# Defining workflows + +Workflows are defined in Python using decorators provided by the `coflux` package. This page covers how to define workflows, tasks, and modules in more detail. + +## Workflows and tasks + +A function decorated with `@cf.workflow()` is the entry point for a run. A function decorated with `@cf.task()` is an operation that can be called from a workflow or another task. + +```python +import coflux as cf + +@cf.task() +def fetch_data(url: str): + ... + +@cf.workflow() +def process(url: str): + data = fetch_data(url) + ... 
+``` + +Workflows can call tasks, tasks can call other tasks, and tasks can call workflows (which will submit a separate run). Workflows and tasks are collectively referred to as _targets_. + +You can think of the distinction between workflows and tasks like the distinction between public and private functions — workflows are the entry points that can be submitted, while tasks are internal operations. + +### Docstrings + +The docstring of a workflow is displayed in Studio when submitting a run. This is a good place to explain what the workflow does and what arguments it expects. + +```python +@cf.workflow() +def process(url: str): + """ + Fetches data from `url` and processes it. + """ + ... +``` + +### Running outside Coflux + +The decorators are designed to be unimposing — decorated functions can be called directly outside of a Coflux context (e.g., in tests or scripts). When called outside of an execution context, tasks execute directly rather than being scheduled as steps. + +## Modules + +Targets are defined in _modules_. Typically these correspond to Python modules (i.e., `.py` files). + +Modules are specified when starting a worker: + +```bash +coflux worker --dev myapp.workflows myapp.tasks +``` + +Or in `coflux.toml`: + +```toml +modules = ["myapp.workflows", "myapp.tasks"] +``` + +Each module's targets (workflows and tasks) are declared by the worker when it connects to the server so that the server knows what targets the worker is able to handle. The workflows can also be registered with the server so they appear in Studio and can be submitted. + +## Stubs + +A _stub_ allows you to reference a target in another module without importing it. This is useful for separating dependencies between modules that may run on different workers or have different package requirements. 
+ +For example, given an `other.workflows` module with a task: + +```python +# other/workflows.py + +@cf.task() +def random_int(max: int) -> int: + return random.randint(1, max) +``` + +Another module can reference this task with a stub: + +```python +# example/workflows.py + +@cf.stub("other.workflows") +def random_int(max: int) -> int: + ... + +@cf.workflow() +def roll_die(): + if random_int(6).result() == 6: + print("You won") + else: + print("You lost") +``` + +When called in the context of a workflow, the stub schedules the real target for execution — the stub's function body is not executed. However, the body can be useful for providing a dummy implementation when running outside of Coflux (e.g., in tests): + +```python +@cf.stub("other.workflows") +def random_int(max: int) -> int: + return 4 # Dummy value for testing +``` diff --git a/docs/sidebars.ts b/docs/sidebars.ts index 2c4e3804..7bd22786 100644 --- a/docs/sidebars.ts +++ b/docs/sidebars.ts @@ -12,29 +12,52 @@ const sidebars: SidebarsConfig = { "getting_started/workflows", "getting_started/workers", "getting_started/runs", + "getting_started/studio", ], }, "concepts", - "executions", - "concurrency", - "retries", - "recurring", - "caching", - "groups", - "logging", { type: "category", - label: "Advanced", + label: "Workflows", items: [ - "suspense", + "workflows", + "executions", + "concurrency", + "groups", + ], + }, + { + type: "category", + label: "Execution", + items: [ + "retries", + "recurring", + "caching", + "memoizing", "deferring", - "memoising", - "assets", - "stubs", + "suspense", + ], + }, + { + type: "category", + label: "Data & storage", + items: [ + "serialization", "blobs", + "assets", + "logging", + ], + }, + { + type: "category", + label: "Configuration", + items: [ + "server_config", + "cli_config", + "authentication", + "pools", ], }, - "examples", ], }; diff --git a/server/lib/coflux/application.ex b/server/lib/coflux/application.ex index 45a5708f..1dc91830 100644 --- 
a/server/lib/coflux/application.ex +++ b/server/lib/coflux/application.ex @@ -12,6 +12,7 @@ defmodule Coflux.Application do children = [ # TODO: separate launch supervisor per project? (and specify max_children?) {Task.Supervisor, name: Coflux.LauncherSupervisor}, + {DynamicSupervisor, name: Coflux.ProcessLauncher.Supervisor, strategy: :one_for_one}, Orchestration.Supervisor, {Registry, keys: :unique, name: Coflux.Logs.Registry}, Logs.Supervisor, diff --git a/server/lib/coflux/config.ex b/server/lib/coflux/config.ex index 1779d8ba..cd71c216 100644 --- a/server/lib/coflux/config.ex +++ b/server/lib/coflux/config.ex @@ -42,6 +42,12 @@ defmodule Coflux.Config do token support. Should be a long random string, kept consistent across restarts. - **COFLUX_STUDIO_TEAMS**: Comma-separated list of team IDs allowed for Studio auth - **COFLUX_STUDIO_URL**: Studio URL for JWKS (default: https://studio.coflux.com) + - **COFLUX_LAUNCHER_TYPES**: Comma-separated list of allowed launcher types + (e.g. `"docker"`, `"process"`, `"docker,process"`). Defaults to none (no + launcher types enabled). Pools without an enabled launcher type cannot be + created or updated. + - **COFLUX_CLI_PATH**: Path to the Coflux CLI binary for the process launcher + (default: `"coflux"`, assuming the CLI is on PATH). """ @doc """ @@ -58,6 +64,8 @@ defmodule Coflux.Config do :persistent_term.put(:coflux_studio_url, parse_studio_url()) :persistent_term.put(:coflux_super_token_hash, parse_super_token()) :persistent_term.put(:coflux_secret, parse_secret()) + :persistent_term.put(:coflux_launcher_types, parse_launcher_types()) + :persistent_term.put(:coflux_cli_path, parse_cli_path()) :ok end @@ -257,4 +265,46 @@ defmodule Coflux.Config do secret -> secret end end + + @doc """ + Returns the set of allowed launcher types (e.g. `MapSet<:docker | :process>`). + + Defaults to an empty set (no launcher types enabled). Set via + `COFLUX_LAUNCHER_TYPES` as a comma-separated list of type names + (e.g. 
`"docker,process"`). + """ + def launcher_types do + :persistent_term.get(:coflux_launcher_types) + end + + @doc """ + Returns the CLI binary path for the process launcher. + + Defaults to `"coflux"` (assumes the CLI is on PATH). Set via `COFLUX_CLI_PATH`. + """ + def cli_path do + :persistent_term.get(:coflux_cli_path) + end + + defp parse_cli_path do + System.get_env("COFLUX_CLI_PATH", "coflux") + end + + @valid_launcher_types MapSet.new([:docker, :process]) + + defp parse_launcher_types do + case System.get_env("COFLUX_LAUNCHER_TYPES") do + nil -> + MapSet.new() + + value -> + value + |> String.split(",") + |> Enum.map(&String.trim/1) + |> Enum.filter(&(&1 in ["docker", "process"])) + |> Enum.map(&String.to_existing_atom/1) + |> MapSet.new() + |> MapSet.intersection(@valid_launcher_types) + end + end end diff --git a/server/lib/coflux/handlers/api.ex b/server/lib/coflux/handlers/api.ex index b5fe3fe2..22403a2b 100644 --- a/server/lib/coflux/handlers/api.ex +++ b/server/lib/coflux/handlers/api.ex @@ -499,15 +499,13 @@ defmodule Coflux.Handlers.Api do req, %{workspace_id: "workspaceId"}, %{ - provides: {"provides", &parse_tag_set/1}, - concurrency: {"concurrency", &parse_integer(&1, optional: true)} + provides: {"provides", &parse_tag_set/1} } ) do {:ok, arguments, req} -> opts = [ - provides: arguments[:provides], - concurrency: arguments[:concurrency] + provides: arguments[:provides] ] |> Enum.reject(fn {_, v} -> is_nil(v) end) @@ -755,7 +753,6 @@ defmodule Coflux.Handlers.Api do defp parse_docker_launcher(value) do image = Map.get(value, "image") docker_host = Map.get(value, "dockerHost") - server_host = Map.get(value, "serverHost") cond do not is_binary(image) or String.length(image) > 200 -> @@ -764,29 +761,95 @@ defmodule Coflux.Handlers.Api do not is_nil(docker_host) and (not is_binary(docker_host) or String.length(docker_host) > 200) -> {:error, :invalid} - not is_nil(server_host) and (not is_binary(server_host) or String.length(server_host) > 200) -> - {:error, :invalid} -
true -> launcher = %{type: :docker, image: image} launcher = if docker_host, do: Map.put(launcher, :docker_host, docker_host), else: launcher + {:ok, launcher} + end + end + + defp parse_process_launcher(value) do + directory = Map.get(value, "directory") + + cond do + not is_binary(directory) or String.length(directory) > 500 -> + {:error, :invalid} + + true -> + {:ok, %{type: :process, directory: directory}} + end + end + + defp parse_common_launcher_fields(launcher, value) do + server_host = Map.get(value, "serverHost") + adapter = Map.get(value, "adapter") + concurrency = Map.get(value, "concurrency") + env = Map.get(value, "env") + + cond do + not is_nil(server_host) and (not is_binary(server_host) or String.length(server_host) > 200) -> + {:error, :invalid} + + not is_nil(adapter) and + (not is_list(adapter) or adapter == [] or + Enum.any?(adapter, &(not is_binary(&1)))) -> + {:error, :invalid} + + not is_nil(concurrency) and (not is_integer(concurrency) or concurrency < 1) -> + {:error, :invalid} + + not is_nil(env) and not is_map(env) -> + {:error, :invalid} + + not is_nil(env) and + Enum.any?(env, fn {k, v} -> + not is_binary(k) or not is_binary(v) or String.starts_with?(k, "COFLUX_") + end) -> + {:error, :invalid} + + true -> launcher = if server_host, do: Map.put(launcher, :server_host, server_host), else: launcher + launcher = if adapter, do: Map.put(launcher, :adapter, adapter), else: launcher + + launcher = + if concurrency, do: Map.put(launcher, :concurrency, concurrency), else: launcher + + launcher = if env, do: Map.put(launcher, :env, env), else: launcher {:ok, launcher} end end defp parse_launcher(value) do + allowed = Coflux.Config.launcher_types() + cond do is_map(value) -> case Map.fetch(value, "type") do - {:ok, "docker"} -> parse_docker_launcher(value) - {:ok, _other} -> {:error, :invalid} - :error -> {:error, :invalid} + {:ok, type} when type in ["docker", "process"] -> + type_atom = String.to_existing_atom(type) + + if 
MapSet.member?(allowed, type_atom) do + with {:ok, launcher} <- + (case type do + "docker" -> parse_docker_launcher(value) + "process" -> parse_process_launcher(value) + end) do + parse_common_launcher_fields(launcher, value) + end + else + {:error, :invalid} + end + + {:ok, _other} -> + {:error, :invalid} + + :error -> + {:error, :invalid} end is_nil(value) -> diff --git a/server/lib/coflux/handlers/worker.ex b/server/lib/coflux/handlers/worker.ex index d9076359..d58d05ea 100644 --- a/server/lib/coflux/handlers/worker.ex +++ b/server/lib/coflux/handlers/worker.ex @@ -87,12 +87,13 @@ defmodule Coflux.Handlers.Worker do case message["request"] do "declare_targets" -> - [targets] = message["params"] + [targets, concurrency] = message["params"] case Orchestration.declare_targets( state.project_id, state.session_id, - parse_targets(targets) + parse_targets(targets), + concurrency ) do :ok -> {[], state} diff --git a/server/lib/coflux/launchers/docker.ex b/server/lib/coflux/launchers/docker.ex index 81cc64b0..78e20778 100644 --- a/server/lib/coflux/launchers/docker.ex +++ b/server/lib/coflux/launchers/docker.ex @@ -3,9 +3,10 @@ defmodule Coflux.DockerLauncher do @log_tail_lines 20 @log_max_bytes 1024 - def launch(project_id, workspace_name, session_token, modules, config \\ %{}) do + def launch(env, modules, config) do docker_conn = parse_docker_host(config[:docker_host]) - coflux_host = config[:server_host] || Coflux.Config.server_host(project_id) + + container_env = Enum.map(env, fn {k, v} -> "#{k}=#{v}" end) with {:ok, %{"Id" => container_id}} <- create_container( @@ -14,11 +15,7 @@ defmodule Coflux.DockerLauncher do "Image" => Map.fetch!(config, :image), "HostConfig" => %{"NetworkMode" => "host"}, "Cmd" => modules, - "Env" => [ - "COFLUX_SERVER_HOST=#{coflux_host}", - "COFLUX_WORKSPACE=#{workspace_name}", - "COFLUX_SESSION=#{session_token}" - ] + "Env" => container_env } ), :ok <- start_container(docker_conn, container_id) do diff --git 
a/server/lib/coflux/launchers/process.ex b/server/lib/coflux/launchers/process.ex new file mode 100644 index 00000000..ad239a65 --- /dev/null +++ b/server/lib/coflux/launchers/process.ex @@ -0,0 +1,143 @@ +defmodule Coflux.ProcessLauncher do + @log_tail_lines 20 + @log_max_bytes 1024 + + def launch(env, modules, config) do + cli_path = Coflux.Config.cli_path() + directory = Map.fetch!(config, :directory) + + # Use `exec` so the shell is replaced by the command, ensuring + # the port's OS process IS the worker (not a wrapper shell). + argv = Enum.map_join([cli_path, "worker" | modules], " ", &shell_escape/1) + shell_cmd = "exec #{argv}" + + port_env = + Enum.map(env, fn {k, v} -> {String.to_charlist(k), String.to_charlist(v)} end) + + port_opts = + [ + :binary, + :exit_status, + :stderr_to_stdout, + {:env, port_env}, + {:args, ["-c", shell_cmd]}, + {:cd, String.to_charlist(directory)} + ] + + case DynamicSupervisor.start_child( + Coflux.ProcessLauncher.Supervisor, + {Coflux.ProcessLauncher.Worker, port_opts} + ) do + {:ok, pid} -> + {:ok, %{pid: pid}} + + {:error, reason} -> + {:error, "failed to start process: #{inspect(reason)}"} + end + end + + def stop(%{pid: pid}) do + if Process.alive?(pid) do + GenServer.call(pid, :stop) + end + + :ok + end + + def poll(%{pid: pid}) do + if Process.alive?(pid) do + case GenServer.call(pid, :status) do + :running -> + {:ok, true} + + {:exited, exit_status, output} -> + GenServer.stop(pid, :normal) + error = if exit_status != 0, do: "exit_code:#{exit_status}" + logs = if error, do: format_logs(output) + {:ok, false, error, logs} + end + else + {:ok, false, "process_lost", nil} + end + end + + defp format_logs(""), do: nil + + defp format_logs(content) do + content + |> tail_lines(@log_tail_lines) + |> truncate_bytes(@log_max_bytes) + end + + defp tail_lines(string, n) do + string + |> String.split("\n") + |> Enum.take(-n) + |> Enum.join("\n") + end + + defp truncate_bytes(string, max_bytes) when byte_size(string) <= max_bytes, 
do: string + + defp truncate_bytes(string, max_bytes) do + string + |> binary_part(byte_size(string) - max_bytes, max_bytes) + |> String.replace(~r/^[^\n]*\n/, "") + end + + defp shell_escape(arg) do + "'" <> String.replace(arg, "'", "'\\''") <> "'" + end +end + +defmodule Coflux.ProcessLauncher.Worker do + use GenServer, restart: :temporary + + def start_link(port_opts) do + GenServer.start_link(__MODULE__, port_opts) + end + + @impl true + def init(port_opts) do + port = Port.open({:spawn_executable, "/bin/sh"}, port_opts) + {:ok, %{port: port, output: [], exit_status: nil, stop_requested: false}} + end + + @impl true + def handle_info({port, {:data, data}}, %{port: port} = state) do + {:noreply, %{state | output: [state.output, data]}} + end + + def handle_info({port, {:exit_status, status}}, %{port: port} = state) do + {:noreply, %{state | exit_status: status}} + end + + @impl true + def handle_call(:status, _from, state) do + if is_nil(state.exit_status) do + {:reply, :running, state} + else + logs = IO.iodata_to_binary(state.output) + # Treat as clean exit if stop was requested (even if exit code is non-zero + # due to SIGTERM producing exit code 143). 
+ exit_status = if state.stop_requested, do: 0, else: state.exit_status + {:reply, {:exited, exit_status, logs}, state} + end + end + + def handle_call(:stop, _from, %{exit_status: nil} = state) do + {:os_pid, os_pid} = Port.info(state.port, :os_pid) + System.cmd("kill", [Integer.to_string(os_pid)], stderr_to_stdout: true) + {:reply, :ok, %{state | stop_requested: true}} + end + + def handle_call(:stop, _from, state) do + {:reply, :ok, %{state | stop_requested: true}} + end + + @impl true + def terminate(_reason, %{exit_status: nil, port: port}) do + Port.close(port) + end + + def terminate(_reason, _state), do: :ok +end diff --git a/server/lib/coflux/orchestration.ex b/server/lib/coflux/orchestration.ex index 9016fef7..f903a269 100644 --- a/server/lib/coflux/orchestration.ex +++ b/server/lib/coflux/orchestration.ex @@ -107,8 +107,8 @@ defmodule Coflux.Orchestration do call_server(project_id, {:create_session, workspace_id, access, opts}) end - def declare_targets(project_id, session_id, targets) do - call_server(project_id, {:declare_targets, session_id, targets}) + def declare_targets(project_id, session_id, targets, concurrency) do + call_server(project_id, {:declare_targets, session_id, targets, concurrency}) end def start_run(project_id, module, target, type, arguments, access \\ nil, opts \\ []) do diff --git a/server/lib/coflux/orchestration/epoch.ex b/server/lib/coflux/orchestration/epoch.ex index fc04b830..a4eff5a9 100644 --- a/server/lib/coflux/orchestration/epoch.ex +++ b/server/lib/coflux/orchestration/epoch.ex @@ -668,7 +668,7 @@ defmodule Coflux.Orchestration.Epoch do {:ok, rows} = query(old_db, """ SELECT s.id, s.external_id, s.workspace_id, s.worker_id, s.provides_tag_set_id, - s.concurrency, s.activation_timeout, s.reconnection_timeout, s.secret_hash, + s.activation_timeout, s.reconnection_timeout, s.secret_hash, s.created_at, s.created_by FROM sessions AS s LEFT JOIN session_expirations AS se ON se.session_id = s.id @@ -676,8 +676,8 @@ defmodule 
Coflux.Orchestration.Epoch do """) Enum.reduce(rows, %{}, fn {old_id, ext_id, old_ws_id, old_worker_id, old_tag_set_id, - concurrency, activation_timeout, reconnection_timeout, secret_hash, - created_at, created_by}, + activation_timeout, reconnection_timeout, secret_hash, created_at, + created_by}, acc -> new_ws_id = Map.fetch!(workspace_ids, old_ws_id) new_worker_id = if old_worker_id, do: Map.fetch!(worker_ids, old_worker_id) @@ -690,7 +690,6 @@ defmodule Coflux.Orchestration.Epoch do workspace_id: new_ws_id, worker_id: new_worker_id, provides_tag_set_id: new_tag_set_id, - concurrency: concurrency, activation_timeout: activation_timeout, reconnection_timeout: reconnection_timeout, secret_hash: if(secret_hash, do: {:blob, secret_hash}), diff --git a/server/lib/coflux/orchestration/server.ex b/server/lib/coflux/orchestration/server.ex index b7b7c935..67e7f740 100644 --- a/server/lib/coflux/orchestration/server.ex +++ b/server/lib/coflux/orchestration/server.ex @@ -203,7 +203,7 @@ defmodule Coflux.Orchestration.Server do Enum.reduce( active_sessions, state, - fn {session_id, external_id, workspace_id, worker_id, provides_tag_set_id, concurrency, + fn {session_id, external_id, workspace_id, worker_id, provides_tag_set_id, activation_timeout, reconnection_timeout, secret_hash, created_at, activated_at}, state -> provides = @@ -226,7 +226,7 @@ defmodule Coflux.Orchestration.Server do queue: [], starting: MapSet.new(), executing: MapSet.new(), - concurrency: concurrency, + concurrency: 0, workspace_id: workspace_id, provides: provides, worker_id: worker_id, @@ -419,6 +419,7 @@ defmodule Coflux.Orchestration.Server do |> put_in([Access.key(:workspaces), workspace_id], workspace) |> put_in([Access.key(:workspace_names), workspace.name], workspace_id) |> put_in([Access.key(:workspace_external_ids), workspace.external_id], workspace_id) + |> put_in([Access.key(:pools), workspace_id], %{}) |> notify_listeners( :workspaces, {:workspace, workspace_id, @@ -817,7 +818,6 @@ 
defmodule Coflux.Orchestration.Server do def handle_call({:create_session, workspace_external_id, access, opts}, _from, state) do provides = Keyword.get(opts, :provides, %{}) - concurrency = Keyword.get(opts, :concurrency, 0) activation_timeout = Keyword.get(opts, :activation_timeout, @default_activation_timeout_ms) reconnection_timeout = @@ -827,7 +827,6 @@ defmodule Coflux.Orchestration.Server do require_workspace(state, workspace_external_id, access) do db_opts = [ provides: provides, - concurrency: concurrency, activation_timeout: activation_timeout, reconnection_timeout: reconnection_timeout, created_by: access[:principal_id] @@ -843,7 +842,7 @@ defmodule Coflux.Orchestration.Server do queue: [], starting: MapSet.new(), executing: MapSet.new(), - concurrency: concurrency, + concurrency: 0, workspace_id: workspace_id, provides: provides, worker_id: nil, @@ -971,10 +970,13 @@ defmodule Coflux.Orchestration.Server do end end - def handle_call({:declare_targets, external_id, targets}, _from, state) do + def handle_call({:declare_targets, external_id, targets, concurrency}, _from, state) do session_id = Map.fetch!(state.session_ids, external_id) - state = assign_targets(state, targets, session_id) + state = + state + |> assign_targets(targets, session_id) + |> put_in([Access.key(:sessions), session_id, :concurrency], concurrency) session = Map.fetch!(state.sessions, session_id) @@ -1751,7 +1753,7 @@ defmodule Coflux.Orchestration.Server do def handle_call({:subscribe_pool, workspace_external_id, pool_name, pid}, _from, state) do case resolve_workspace_external_id(state, workspace_external_id) do {:ok, workspace_id} -> - pool = Map.get(state.pools[workspace_id], pool_name) + pool = state.pools |> Map.get(workspace_id, %{}) |> Map.get(pool_name) {:ok, pool_workers} = Workers.get_pool_workers(state.db, pool_name) # TODO: include 'active' workers that aren't in this (potentially limited) list @@ -2280,11 +2282,28 @@ defmodule Coflux.Orchestration.Server do state = if 
Enum.any?(unassigned) do - latest_pool_launch_at = + # Track the most recent worker creation per pool, and which pools + # already have a worker that isn't ready to accept work. We skip + # launching for pools that have a worker still pending activation + # or that activated but hasn't registered any targets yet (e.g. + # due to a misconfigured command or working directory). + {latest_pool_launch_at, pools_with_pending_worker} = state.workers |> Map.values() - |> Enum.reduce(%{}, fn worker, latest -> - Map.update(latest, worker.pool_id, worker.created_at, &max(&1, worker.created_at)) + |> Enum.reduce({%{}, MapSet.new()}, fn worker, {latest, pending} -> + latest = + Map.update(latest, worker.pool_id, worker.created_at, &max(&1, worker.created_at)) + + pending = + with session_id when not is_nil(session_id) <- worker.session_id, + {:ok, session} <- Map.fetch(state.sessions, session_id), + false <- session.activated_at != nil and Enum.any?(session.targets) do + MapSet.put(pending, worker.pool_id) + else + _ -> pending + end + + {latest, pending} end) unassigned @@ -2301,13 +2320,14 @@ defmodule Coflux.Orchestration.Server do end) |> Enum.reject(&is_nil/1) |> Enum.uniq() + |> Enum.reject(&MapSet.member?(pools_with_pending_worker, &1)) |> Enum.filter(&(now - Map.get(latest_pool_launch_at, &1, 0) > 10_000)) |> Enum.reduce(state, fn pool_id, state -> case Workers.create_worker(state.db, pool_id) do {:ok, worker_id, worker_external_id, created_at} -> {pool_name, pool} = Enum.find( - state.pools[workspace_id], + Map.get(state.pools, workspace_id, %{}), &(elem(&1, 1).id == pool_id) ) @@ -2354,11 +2374,9 @@ defmodule Coflux.Orchestration.Server do pool.launcher, :launch, [ - state.project_id, - state.workspaces[workspace_id].name, - token, + build_launcher_env(state, workspace_id, token, pool.launcher), pool.modules, - Map.delete(pool.launcher, :type) + pool.launcher ], fn state, result -> {data, error} = @@ -5092,10 +5110,38 @@ defmodule Coflux.Orchestration.Server do |> 
Map.update!(:launcher_tasks, &Map.delete(&1, task_ref)) end + defp build_launcher_env(state, workspace_id, token, launcher) do + coflux_host = launcher[:server_host] || Coflux.Config.server_host(state.project_id) + + base = %{ + "COFLUX_HOST" => coflux_host, + "COFLUX_WORKSPACE" => state.workspaces[workspace_id].name, + "COFLUX_SESSION" => token + } + + base = + case Map.get(launcher, :adapter) do + nil -> base + adapter -> Map.put(base, "COFLUX_WORKER_ADAPTER", Enum.join(adapter, ",")) + end + + base = + case Map.get(launcher, :concurrency) do + nil -> base + concurrency -> Map.put(base, "COFLUX_WORKER_CONCURRENCY", Integer.to_string(concurrency)) + end + + case Map.get(launcher, :env) do + nil -> base + env -> Map.merge(base, env) + end + end + defp call_launcher(state, launcher, fun, args, callback) do module = case launcher.type do :docker -> Coflux.DockerLauncher + :process -> Coflux.ProcessLauncher end task = Task.Supervisor.async_nolink(Coflux.LauncherSupervisor, module, fun, args) @@ -5133,6 +5179,14 @@ defmodule Coflux.Orchestration.Server do state = Map.update!(state, :worker_external_ids, &Map.delete(&1, worker.external_id)) + # Expire the worker's session so it can't reconnect to a deactivated worker. 
+ state = + if worker.session_id && Map.has_key?(state.sessions, worker.session_id) do + remove_session(state, worker.session_id) + else + state + end + notify_listeners( state, {:pool, workspace_external_id(state, worker.workspace_id), worker.pool_name}, diff --git a/server/lib/coflux/orchestration/sessions.ex b/server/lib/coflux/orchestration/sessions.ex index 369fcc06..abd00299 100644 --- a/server/lib/coflux/orchestration/sessions.ex +++ b/server/lib/coflux/orchestration/sessions.ex @@ -5,7 +5,6 @@ defmodule Coflux.Orchestration.Sessions do def create_session(db, workspace_id, worker_id, opts \\ []) do provides = Keyword.get(opts, :provides) - concurrency = Keyword.get(opts, :concurrency, 0) activation_timeout = Keyword.get(opts, :activation_timeout) reconnection_timeout = Keyword.get(opts, :reconnection_timeout) created_by = Keyword.get(opts, :created_by) @@ -32,7 +31,6 @@ defmodule Coflux.Orchestration.Sessions do workspace_id: workspace_id, worker_id: worker_id, provides_tag_set_id: provides_tag_set_id, - concurrency: concurrency, activation_timeout: activation_timeout, reconnection_timeout: reconnection_timeout, secret_hash: {:blob, secret_hash}, @@ -101,7 +99,6 @@ defmodule Coflux.Orchestration.Sessions do s.workspace_id, s.worker_id, s.provides_tag_set_id, - s.concurrency, s.activation_timeout, s.reconnection_timeout, s.secret_hash, diff --git a/server/lib/coflux/orchestration/workspaces.ex b/server/lib/coflux/orchestration/workspaces.ex index d06c6ad1..404e89ab 100644 --- a/server/lib/coflux/orchestration/workspaces.ex +++ b/server/lib/coflux/orchestration/workspaces.ex @@ -672,13 +672,15 @@ defmodule Coflux.Orchestration.Workspaces do defp encode_launcher_type(type) do case type do - :docker -> 0 + :docker -> 0 + :process -> 1 end end defp decode_launcher_type(value) do case value do - 0 -> :docker + 0 -> :docker + 1 -> :process end end diff --git a/server/lib/coflux/topics/module.ex b/server/lib/coflux/topics/module.ex index eb25ad79..45956b24 100644
--- a/server/lib/coflux/topics/module.ex +++ b/server/lib/coflux/topics/module.ex @@ -13,30 +13,31 @@ defmodule Coflux.Topics.Module do module = Map.fetch!(params, :module) workspace_id = Map.fetch!(params, :workspace_id) - {:ok, executions, ref} = - Orchestration.subscribe_module(project_id, module, workspace_id, self()) + case Orchestration.subscribe_module(project_id, module, workspace_id, self()) do + {:ok, executions, ref} -> + value = + Map.new(executions, fn {target_name, external_run_id, step_number, attempt, + execute_after, created_at, assigned_at} -> + execution_id = "#{external_run_id}:#{step_number}:#{attempt}" - value = - Map.new(executions, fn {target_name, external_run_id, step_number, attempt, execute_after, - created_at, assigned_at} -> - execution_id = "#{external_run_id}:#{step_number}:#{attempt}" + {execution_id, + %{ + target: target_name, + runId: external_run_id, + stepId: "#{external_run_id}:#{step_number}", + stepNumber: step_number, + attempt: attempt, + executeAfter: execute_after, + createdAt: created_at, + assignedAt: assigned_at + }} + end) - {execution_id, - %{ - target: target_name, - runId: external_run_id, - stepId: "#{external_run_id}:#{step_number}", - stepNumber: step_number, - attempt: attempt, - executeAfter: execute_after, - createdAt: created_at, - assignedAt: assigned_at - }} - end) + {:ok, Topic.new(value, %{ref: ref})} - topic = Topic.new(value, %{ref: ref}) - - {:ok, topic} + {:error, :workspace_invalid} -> + {:error, :not_found} + end end def handle_info({:topic, _ref, notifications}, topic) do diff --git a/server/lib/coflux/topics/modules.ex b/server/lib/coflux/topics/modules.ex index 718b5da5..ad626654 100644 --- a/server/lib/coflux/topics/modules.ex +++ b/server/lib/coflux/topics/modules.ex @@ -11,36 +11,39 @@ defmodule Coflux.Topics.Modules do project_id = Map.fetch!(params, :project) workspace_id = Map.fetch!(params, :workspace_id) - {:ok, manifests, executions, ref} = - Orchestration.subscribe_modules(project_id, 
workspace_id, self()) - - value = - Map.new(manifests, fn {module, workflows} -> - result = %{ - workflows: Map.keys(workflows), - executing: 0, - scheduled: 0, - nextDueAt: nil - } - - result = - case Map.fetch(executions, module) do - {:ok, {executing, scheduled}} -> - next_due_at = scheduled |> Map.values() |> Enum.min(fn -> nil end) - - result - |> Map.put(:executing, MapSet.size(executing)) - |> Map.put(:scheduled, map_size(scheduled)) - |> Map.put(:nextDueAt, next_due_at) - - :error -> - result - end - - {module, result} - end) - - {:ok, Topic.new(value, %{ref: ref, executions: executions})} + case Orchestration.subscribe_modules(project_id, workspace_id, self()) do + {:ok, manifests, executions, ref} -> + value = + Map.new(manifests, fn {module, workflows} -> + result = %{ + workflows: Map.keys(workflows), + executing: 0, + scheduled: 0, + nextDueAt: nil + } + + result = + case Map.fetch(executions, module) do + {:ok, {executing, scheduled}} -> + next_due_at = scheduled |> Map.values() |> Enum.min(fn -> nil end) + + result + |> Map.put(:executing, MapSet.size(executing)) + |> Map.put(:scheduled, map_size(scheduled)) + |> Map.put(:nextDueAt, next_due_at) + + :error -> + result + end + + {module, result} + end) + + {:ok, Topic.new(value, %{ref: ref, executions: executions})} + + {:error, :workspace_invalid} -> + {:error, :not_found} + end end def handle_info({:topic, _ref, notifications}, topic) do diff --git a/server/lib/coflux/topics/pool.ex b/server/lib/coflux/topics/pool.ex index fa6856bb..480de478 100644 --- a/server/lib/coflux/topics/pool.ex +++ b/server/lib/coflux/topics/pool.ex @@ -28,6 +28,9 @@ defmodule Coflux.Topics.Pool do {:error, :not_found} -> {:error, :not_found} + + {:error, :workspace_invalid} -> + {:error, :not_found} end end @@ -90,14 +93,21 @@ defmodule Coflux.Topics.Pool do end defp build_launcher(launcher) do - case launcher.type do - :docker -> - base = %{type: "docker", image: launcher.image} - - base - |> maybe_put(:dockerHost, 
Map.get(launcher, :docker_host)) - |> maybe_put(:serverHost, Map.get(launcher, :server_host)) - end + base = + case launcher.type do + :docker -> + %{type: "docker", image: launcher.image} + |> maybe_put(:dockerHost, Map.get(launcher, :docker_host)) + + :process -> + %{type: "process", directory: launcher.directory} + end + + base + |> maybe_put(:serverHost, Map.get(launcher, :server_host)) + |> maybe_put(:adapter, Map.get(launcher, :adapter)) + |> maybe_put(:concurrency, Map.get(launcher, :concurrency)) + |> maybe_put(:env, Map.get(launcher, :env)) end defp maybe_put(map, _key, nil), do: map diff --git a/server/lib/coflux/topics/pools.ex b/server/lib/coflux/topics/pools.ex index 648dd8b5..f43360db 100644 --- a/server/lib/coflux/topics/pools.ex +++ b/server/lib/coflux/topics/pools.ex @@ -11,12 +11,13 @@ defmodule Coflux.Topics.Pools do project_id = Map.fetch!(params, :project) workspace_id = Map.fetch!(params, :workspace_id) - {:ok, pools, ref} = - Orchestration.subscribe_pools(project_id, workspace_id, self()) + case Orchestration.subscribe_pools(project_id, workspace_id, self()) do + {:ok, pools, ref} -> + {:ok, Topic.new(build_value(pools), %{ref: ref})} - value = build_value(pools) - - {:ok, Topic.new(value, %{ref: ref})} + {:error, :workspace_invalid} -> + {:error, :not_found} + end end def handle_info({:topic, _ref, notifications}, topic) do @@ -47,8 +48,23 @@ defmodule Coflux.Topics.Pools do end defp build_launcher(launcher) do - case launcher.type do - :docker -> Map.take(launcher, [:type, :image]) - end + type_fields = + case launcher.type do + :docker -> + %{type: "docker", image: launcher.image} + |> maybe_put(:dockerHost, Map.get(launcher, :docker_host)) + + :process -> + %{type: "process", directory: launcher.directory} + end + + type_fields + |> maybe_put(:serverHost, Map.get(launcher, :server_host)) + |> maybe_put(:adapter, Map.get(launcher, :adapter)) + |> maybe_put(:concurrency, Map.get(launcher, :concurrency)) + |> maybe_put(:env, 
Map.get(launcher, :env)) end + + defp maybe_put(map, _key, nil), do: map + defp maybe_put(map, key, value), do: Map.put(map, key, value) end diff --git a/server/lib/coflux/topics/search.ex b/server/lib/coflux/topics/search.ex index f31b759f..943eacb1 100644 --- a/server/lib/coflux/topics/search.ex +++ b/server/lib/coflux/topics/search.ex @@ -15,6 +15,9 @@ defmodule Coflux.Topics.Search do {:ok, targets, _ref} -> topic = Topical.Topic.new(nil, %{targets: targets}) {:ok, topic} + + {:error, :workspace_invalid} -> + {:error, :not_found} end end diff --git a/server/lib/coflux/topics/sessions.ex b/server/lib/coflux/topics/sessions.ex index 16f5f49a..5bbdab57 100644 --- a/server/lib/coflux/topics/sessions.ex +++ b/server/lib/coflux/topics/sessions.ex @@ -11,14 +11,18 @@ defmodule Coflux.Topics.Sessions do project_id = Map.fetch!(params, :project) workspace_id = Map.fetch!(params, :workspace_id) - {:ok, sessions, ref} = Orchestration.subscribe_sessions(project_id, workspace_id, self()) + case Orchestration.subscribe_sessions(project_id, workspace_id, self()) do + {:ok, sessions, ref} -> + sessions = + Map.new(sessions, fn {session_external_id, session} -> + {session_external_id, build_session(session)} + end) - sessions = - Map.new(sessions, fn {session_external_id, session} -> - {session_external_id, build_session(session)} - end) + {:ok, Topic.new(sessions, %{ref: ref})} - {:ok, Topic.new(sessions, %{ref: ref})} + {:error, :workspace_invalid} -> + {:error, :not_found} + end end def handle_info({:topic, _ref, notifications}, topic) do diff --git a/server/lib/coflux/topics/workflow.ex b/server/lib/coflux/topics/workflow.ex index bed36807..8f99bf6c 100644 --- a/server/lib/coflux/topics/workflow.ex +++ b/server/lib/coflux/topics/workflow.ex @@ -39,6 +39,9 @@ defmodule Coflux.Topics.Workflow do {:error, :not_found} -> {:error, :not_found} + + {:error, :workspace_invalid} -> + {:error, :not_found} end end diff --git a/server/priv/migrations/orchestration/1.sql 
b/server/priv/migrations/orchestration/1.sql index d2b95891..01892f8d 100644 --- a/server/priv/migrations/orchestration/1.sql +++ b/server/priv/migrations/orchestration/1.sql @@ -218,7 +218,6 @@ CREATE TABLE sessions ( workspace_id INTEGER NOT NULL, worker_id INTEGER, provides_tag_set_id INTEGER, - concurrency INTEGER NOT NULL DEFAULT 0, activation_timeout INTEGER, reconnection_timeout INTEGER, secret_hash BLOB, diff --git a/tests/conftest.py b/tests/conftest.py index 764a5e0c..69ac31a5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,4 @@ import json -import subprocess import tempfile import time import uuid @@ -7,7 +6,7 @@ import pytest import support.cli as cli -from support.helpers import ADAPTER_SCRIPT, managed_worker +from support.helpers import ADAPTER_SCRIPT, managed_worker, poll_result from support.server import ManagedServer @@ -39,22 +38,9 @@ def submit(self, module, target, *arguments, idempotency_key=None): def result(self, run_id, timeout=10): """Poll for a run result.""" - deadline = time.time() + timeout - last_error = None - interval = 0.05 - while time.time() < deadline: - try: - return cli.runs_result(run_id, host=self.host, workspace=self.workspace) - except ( - subprocess.CalledProcessError, - json.JSONDecodeError, - ) as e: - last_error = e - time.sleep(interval) - interval = min(interval * 2, 0.5) - raise TimeoutError( - f"run {run_id} did not complete within {timeout}s" - f" (last error: {last_error})" + return poll_result( + run_id, self.host, workspace=self.workspace, + timeout=timeout, interval=0.05, max_interval=0.5, ) def inspect(self, run_id): @@ -118,24 +104,24 @@ def logs( timeout=5, ): """Fetch logs, polling until min_entries are available.""" - deadline = time.time() + timeout if min_entries else 0 + kwargs = dict( + step_attempt=step_attempt, + from_ts=from_ts, + host=self.host, + workspace=self.workspace, + json_output=json_output, + ) + if not min_entries: + return cli.logs_get(run_id, **kwargs) + deadline = 
time.time() + timeout while True: - data = cli.logs_get( - run_id, - step_attempt=step_attempt, - from_ts=from_ts, - host=self.host, - workspace=self.workspace, - json_output=json_output, - ) - if not min_entries or time.time() >= deadline: + data = cli.logs_get(run_id, **kwargs) + if json_output and len(data.get("logs", [])) >= min_entries: + return data + if not json_output and data.strip(): + return data + if time.time() >= deadline: return data - if json_output: - if len(data.get("logs", [])) >= min_entries: - return data - else: - if data.strip(): - return data time.sleep(0.05) def run(self, module, target, *arguments): @@ -199,6 +185,25 @@ def project_id(): return f"test-{uuid.uuid4().hex[:12]}" +@pytest.fixture +def isolated_server(tmp_path): + """A dedicated server + project for tests that need their own server instance. + + Yields ``(server, host, project_id)``. The default workspace is created + automatically. Tests may call ``server.stop()`` / ``server.start()`` to + simulate restarts — the fixture cleans up on exit. 
+ """ + pid = f"test-{uuid.uuid4().hex[:12]}" + srv = ManagedServer(str(tmp_path / "data")) + srv.start() + host = f"{pid}.localhost:{srv.port}" + cli.workspaces_create("default", host=host) + try: + yield srv, host, pid + finally: + srv.stop() + + @pytest.fixture def worker(server, project_id, tmp_path): _worker_count = 0 diff --git a/tests/support/cli.py b/tests/support/cli.py index 9cfae3f5..12d78fe4 100644 --- a/tests/support/cli.py +++ b/tests/support/cli.py @@ -160,6 +160,50 @@ def logs_get( return result.stdout +def pools_update( + name, modules=None, provides=None, process_dir=None, + docker_image=None, adapter=None, concurrency=None, env=None, host=None, workspace="default", +): + args = ["pools", "update", name] + if modules: + for m in modules: + args.extend(["--module", m]) + if provides: + for key, values in provides.items(): + args.extend(["--provides", ",".join(f"{key}:{v}" for v in values)]) + if process_dir: + args.extend(["--process-dir", process_dir]) + if docker_image: + args.extend(["--docker-image", docker_image]) + if adapter: + args.extend(["--adapter", ",".join(adapter)]) + if concurrency: + args.extend(["--concurrency", str(concurrency)]) + if env: + for k, v in env.items(): + args.extend(["--env", f"{k}={v}"]) + _coflux(*args, host=host, workspace=workspace, output=None) + + +def pools_list(host=None, workspace="default"): + result = _coflux("pools", "list", host=host, workspace=workspace) + return json.loads(result.stdout) + + +def pools_get(name, host=None, workspace="default"): + result = _coflux("pools", "get", name, host=host, workspace=workspace) + return json.loads(result.stdout) + + +def pools_delete(name, host=None, workspace="default"): + _coflux("pools", "delete", name, host=host, workspace=workspace, output=None) + + +def pools_launches(name, host=None, workspace="default"): + result = _coflux("pools", "launches", name, host=host, workspace=workspace) + return json.loads(result.stdout) + + def worker( modules, adapter, diff 
--git a/tests/support/helpers.py b/tests/support/helpers.py index 6366b7ef..84e5b3a9 100644 --- a/tests/support/helpers.py +++ b/tests/support/helpers.py @@ -5,26 +5,49 @@ import signal import subprocess import time +import urllib.request from contextlib import contextmanager from . import cli from .executor import Executor from .manifest import manifest +from .server import SUPER_TOKEN ADAPTER_SCRIPT = os.path.join(os.path.dirname(__file__), "adapter.py") -def poll_result(run_id, host, workspace="default", timeout=15): +def poll_result(run_id, host, workspace="default", timeout=15, interval=0.1, max_interval=1.0): """Poll for a run result until it completes or times out.""" deadline = time.time() + timeout - interval = 0.1 + last_error = None while time.time() < deadline: try: return cli.runs_result(run_id, host=host, workspace=workspace) - except (subprocess.CalledProcessError, json.JSONDecodeError): + except (subprocess.CalledProcessError, json.JSONDecodeError) as e: + last_error = e time.sleep(interval) - interval = min(interval * 2, 1.0) - raise TimeoutError(f"run {run_id} did not complete within {timeout}s") + interval = min(interval * 2, max_interval) + raise TimeoutError( + f"run {run_id} did not complete within {timeout}s" + f" (last error: {last_error})" + ) + + +def api_post(port, project_id, path, token=None): + """POST to a server management API endpoint.""" + if token is None: + token = SUPER_TOKEN + url = f"http://{project_id}.localhost:{port}/api/{path}" + req = urllib.request.Request( + url, + method="POST", + data=b"{}", + headers={ + "Content-Type": "application/json", + "Authorization": f"Bearer {token}", + }, + ) + urllib.request.urlopen(req, timeout=10) @contextmanager diff --git a/tests/support/jwks.py b/tests/support/jwks.py new file mode 100644 index 00000000..0922443b --- /dev/null +++ b/tests/support/jwks.py @@ -0,0 +1,127 @@ +"""Helpers for testing Studio JWT authentication. 
+ +Provides an in-process JWKS HTTP server and JWT minting utilities so that +E2E tests can exercise the server's Studio auth path without a real Studio +instance. +""" + +import base64 +import json +import threading +import time +from http.server import HTTPServer, BaseHTTPRequestHandler + +import jwt as pyjwt +from cryptography.hazmat.primitives.asymmetric.ed25519 import Ed25519PrivateKey +from cryptography.hazmat.primitives.serialization import Encoding, PublicFormat + + +def _b64url(data: bytes) -> str: + """Base64url-encode without padding.""" + return base64.urlsafe_b64encode(data).rstrip(b"=").decode() + + +def generate_keypair(kid="test-key-1"): + """Generate an Ed25519 keypair and return ``(private_key, jwk_dict)``.""" + private_key = Ed25519PrivateKey.generate() + raw_public = private_key.public_key().public_bytes(Encoding.Raw, PublicFormat.Raw) + jwk = { + "kty": "OKP", + "crv": "Ed25519", + "x": _b64url(raw_public), + "kid": kid, + "alg": "EdDSA", + "use": "sig", + } + return private_key, jwk + + +def mint_jwt( + private_key, + *, + kid="test-key-1", + issuer, + subject="test-user-1", + team_id, + host, + workspaces=None, + expires_in=3600, + not_before=None, + extra_claims=None, +): + """Create a signed JWT matching the format the Coflux server expects. + + Returns the encoded JWT string. 
+ """ + now = int(time.time()) + claims = { + "iss": issuer, + "sub": subject, + "aud": f"{team_id}:{host}", + "exp": now + expires_in, + "iat": now, + } + if workspaces is not None: + claims["workspaces"] = workspaces + if not_before is not None: + claims["nbf"] = not_before + if extra_claims: + claims.update(extra_claims) + return pyjwt.encode( + claims, private_key, algorithm="EdDSA", headers={"kid": kid} + ) + + +class JWKSHandler(BaseHTTPRequestHandler): + """Serves ``/.well-known/jwks.json``.""" + + def do_GET(self): + if self.path == "/.well-known/jwks.json": + body = json.dumps(self.server.jwks_data).encode() + self.send_response(200) + self.send_header("Content-Type", "application/json") + self.send_header("Content-Length", str(len(body))) + self.end_headers() + self.wfile.write(body) + else: + self.send_error(404) + + def log_message(self, format, *args): + pass # Silence request logging during tests + + +class JWKSServer: + """Tiny HTTP server that serves a JWKS document. + + Usage:: + + srv = JWKSServer([jwk_dict]) + srv.start() + # ... run tests against srv.url ... + srv.stop() + + The ``keys`` list can be mutated between requests to simulate key + rotation. 
+ """ + + def __init__(self, keys): + self._httpd = HTTPServer(("127.0.0.1", 0), JWKSHandler) + self._httpd.jwks_data = {"keys": keys} + self.port = self._httpd.server_address[1] + self.url = f"http://127.0.0.1:{self.port}" + self._thread = None + + def start(self): + self._thread = threading.Thread( + target=self._httpd.serve_forever, daemon=True + ) + self._thread.start() + + def set_keys(self, keys): + """Replace the served keys (takes effect on next request).""" + self._httpd.jwks_data = {"keys": keys} + + def stop(self): + self._httpd.shutdown() + if self._thread: + self._thread.join(timeout=5) diff --git a/tests/support/server.py b/tests/support/server.py index 6eec66d5..e1a1c5db 100644 --- a/tests/support/server.py +++ b/tests/support/server.py @@ -18,15 +18,27 @@ def _find_free_port(): return s.getsockname()[1] -def _wait_for_ready(port, timeout=15): - """Wait until the server is accepting HTTP requests.""" +def _wait_for_ready(port, token=None, timeout=15): + """Wait until the server is accepting HTTP requests. + + When the server has ``COFLUX_REQUIRE_AUTH=true``, the discover endpoint + returns 401 without a token. We accept any non-connection-error response + (including 401) as proof the server is up, but if *token* is supplied we + send it so we get a clean 200. + """ url = f"http://127.0.0.1:{port}/api/discover" deadline = time.time() + timeout while time.time() < deadline: try: - req = urllib.request.Request(url, headers={"Host": f"healthcheck.localhost:{port}"}) + headers = {"Host": f"healthcheck.localhost:{port}"} + if token: + headers["Authorization"] = f"Bearer {token}" + req = urllib.request.Request(url, headers=headers) urllib.request.urlopen(req, timeout=1) return + except urllib.error.HTTPError: + # Any HTTP response (including 401/403) means the server is up. 
+ return except (urllib.error.URLError, OSError): time.sleep(0.1) raise TimeoutError(f"server not ready on port {port} within {timeout}s") @@ -42,14 +54,18 @@ class ManagedServer: or via Docker when COFLUX_IMAGE is set (e.g. in CI). Auth is disabled so tests don't need tokens for normal operations. A super token is configured for management endpoints (rotate, etc.). + + Pass ``extra_env`` to override or extend the default environment + variables (e.g. to enable authentication). """ - def __init__(self, data_dir, port=None): + def __init__(self, data_dir, port=None, extra_env=None): self.port = port or _find_free_port() self.data_dir = data_dir self._proc = None self._container = None self._image = os.environ.get("COFLUX_IMAGE") + self._extra_env = extra_env or {} def start(self, timeout=30): if self._image: @@ -58,6 +74,7 @@ def start(self, timeout=30): self._start_local(timeout) def _start_local(self, timeout): + cli_path = os.path.abspath(os.environ.get("COFLUX_BIN", "coflux")) env = { "PATH": os.environ["PATH"], "HOME": os.environ.get("HOME", "/tmp"), @@ -65,7 +82,10 @@ def _start_local(self, timeout): "COFLUX_DATA_DIR": self.data_dir, "COFLUX_PUBLIC_HOST": "%.localhost:" + str(self.port), "COFLUX_REQUIRE_AUTH": "false", + "COFLUX_LAUNCHER_TYPES": "process,docker", "COFLUX_SUPER_TOKEN_HASH": hashlib.sha256(SUPER_TOKEN.encode()).hexdigest(), + "COFLUX_CLI_PATH": cli_path, + **self._extra_env, } self._proc = subprocess.Popen( ["elixir", "-S", "mix", "run", "--no-halt"], diff --git a/tests/test_assets.py b/tests/test_assets.py index 40c0f82a..89e69089 100644 --- a/tests/test_assets.py +++ b/tests/test_assets.py @@ -1,12 +1,11 @@ """Tests for asset persistence and retrieval.""" import os -import tempfile from support.manifest import task, workflow -def test_persist_and_get_asset(worker): +def test_persist_and_get_asset(worker, tmp_path): """Executor persists a file as an asset, then retrieves it in another step.""" targets = [ workflow("test", "main"), @@ -26,14 
+25,13 @@ def test_persist_and_get_asset(worker): ex1 = ctx.executor.next_execute() # Create a temp file to persist as an asset - fd, tmp_path = tempfile.mkstemp(suffix=".txt") - with open(fd, "w") as f: - f.write("hello asset") + asset_file = tmp_path / "asset.txt" + asset_file.write_text("hello asset") # Persist the file as an asset asset_result = ex1.conn.persist_asset( ex1.execution_id, - [tmp_path], + [str(asset_file)], metadata={"name": "my_asset"}, ) assert "asset_id" in asset_result @@ -66,10 +64,6 @@ def test_persist_and_get_asset(worker): ex0.conn.complete(ex0.execution_id, value="done") assert ctx.result(run_id)["value"]["data"] == "done" - # Clean up - if os.path.exists(tmp_path): - os.unlink(tmp_path) - def test_asset_inspect_and_download(worker, tmp_path): """CLI can inspect and download a persisted asset.""" diff --git a/tests/test_auth.py b/tests/test_auth.py new file mode 100644 index 00000000..45757b21 --- /dev/null +++ b/tests/test_auth.py @@ -0,0 +1,345 @@ +"""Tests for server authentication mechanisms. + +Covers Studio JWT auth, super token auth, and unauthenticated access +control. A dedicated server instance is started with authentication +enabled so these tests don't interfere with the rest of the suite. 
+""" + +import json +import tempfile +import time +import urllib.error +import urllib.request +import uuid + +import jwt as pyjwt +import pytest +from support.jwks import JWKSServer, generate_keypair, mint_jwt +from support.server import SUPER_TOKEN, ManagedServer + +TEAM_ID = "test-team-1" +KID = "test-key-1" + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- + + +@pytest.fixture(scope="module") +def keypair(): + """Ed25519 keypair for the whole module.""" + return generate_keypair(kid=KID) + + +@pytest.fixture(scope="module") +def jwks_server(keypair): + """JWKS HTTP server serving the test public key.""" + _private_key, jwk = keypair + srv = JWKSServer([jwk]) + srv.start() + yield srv + srv.stop() + + +@pytest.fixture(scope="module") +def auth_server(jwks_server): + """A Coflux server with authentication enabled.""" + data_dir = tempfile.mkdtemp(prefix="coflux-test-auth-") + srv = ManagedServer( + data_dir, + extra_env={ + "COFLUX_REQUIRE_AUTH": "true", + "COFLUX_STUDIO_TEAMS": TEAM_ID, + "COFLUX_STUDIO_URL": jwks_server.url, + "COFLUX_SECRET": "test-secret-for-service-tokens", + }, + ) + srv.start(timeout=30) + yield srv + srv.stop() + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _api_request(server_port, project_id, path, *, token=None, body=None): + """Make an HTTP request to the server API. + + Returns ``(status, parsed_json_or_None)``. 
+ """ + url = f"http://127.0.0.1:{server_port}/api/{path}" + headers = {"Host": f"{project_id}.localhost:{server_port}"} + if token: + headers["Authorization"] = f"Bearer {token}" + data = None + if body is not None: + data = json.dumps(body).encode() + headers["Content-Type"] = "application/json" + req = urllib.request.Request(url, data=data, headers=headers) + try: + resp = urllib.request.urlopen(req, timeout=5) + raw = resp.read() + return resp.status, json.loads(raw) if raw else None + except urllib.error.HTTPError as e: + raw = e.read() + try: + resp_body = json.loads(raw) if raw else None + except Exception: + resp_body = None + return e.code, resp_body + + +def _discover(server_port, project_id, token=None): + """Call the discover endpoint.""" + return _api_request(server_port, project_id, "discover", token=token) + + +def _make_host(project_id, port): + return f"{project_id}.localhost:{port}" + + +def _mint(keypair, auth_server, project_id, **kwargs): + """Convenience: mint a JWT for the given project against the test server.""" + private_key, _jwk = keypair + defaults = dict( + kid=KID, + issuer=auth_server._extra_env["COFLUX_STUDIO_URL"], + team_id=TEAM_ID, + host=_make_host(project_id, auth_server.port), + ) + defaults.update(kwargs) + return mint_jwt(private_key, **defaults) + + +# --------------------------------------------------------------------------- +# JWT Authentication — Happy Path +# --------------------------------------------------------------------------- + + +class TestJWTHappyPath: + def test_valid_jwt_accepted(self, keypair, auth_server): + """A correctly signed JWT with valid claims is accepted.""" + project_id = f"auth-{uuid.uuid4().hex[:8]}" + token = _mint(keypair, auth_server, project_id) + status, body = _discover(auth_server.port, project_id, token=token) + assert status == 200 + assert "version" in body + assert body["access"]["workspaces"] == ["*"] + + def test_jwt_workspace_restriction(self, keypair, auth_server): + """The 
``workspaces`` claim limits the reported access.""" + project_id = f"auth-{uuid.uuid4().hex[:8]}" + token = _mint(keypair, auth_server, project_id, workspaces=["staging", "dev/*"]) + status, body = _discover(auth_server.port, project_id, token=token) + assert status == 200 + assert set(body["access"]["workspaces"]) == {"staging", "dev/*"} + + def test_jwt_creates_principal(self, keypair, auth_server): + """Repeated requests with the same ``sub`` claim are idempotent.""" + project_id = f"auth-{uuid.uuid4().hex[:8]}" + subject = f"user-{uuid.uuid4().hex[:8]}" + for _ in range(2): + token = _mint(keypair, auth_server, project_id, subject=subject) + status, _body = _discover(auth_server.port, project_id, token=token) + assert status == 200 + + def test_jwt_default_workspaces(self, keypair, auth_server): + """When the ``workspaces`` claim is absent, all workspaces are accessible.""" + project_id = f"auth-{uuid.uuid4().hex[:8]}" + token = _mint(keypair, auth_server, project_id) + status, body = _discover(auth_server.port, project_id, token=token) + assert status == 200 + assert body["access"]["workspaces"] == ["*"] + + +# --------------------------------------------------------------------------- +# JWT Authentication — Rejection Cases +# --------------------------------------------------------------------------- + + +class TestJWTRejection: + def test_expired_jwt(self, keypair, auth_server): + """A JWT whose ``exp`` is in the past is rejected.""" + project_id = f"auth-{uuid.uuid4().hex[:8]}" + token = _mint(keypair, auth_server, project_id, expires_in=-60) + status, body = _discover(auth_server.port, project_id, token=token) + assert status == 401 + assert body["error"] == "unauthorized" + + def test_not_yet_valid_jwt(self, keypair, auth_server): + """A JWT whose ``nbf`` is in the future is rejected.""" + project_id = f"auth-{uuid.uuid4().hex[:8]}" + token = _mint( + keypair, + auth_server, + project_id, + not_before=int(time.time()) + 3600, + ) + status, body = 
_discover(auth_server.port, project_id, token=token) + assert status == 401 + + def test_wrong_issuer(self, keypair, auth_server): + """A JWT with a non-matching ``iss`` is rejected.""" + project_id = f"auth-{uuid.uuid4().hex[:8]}" + token = _mint( + keypair, + auth_server, + project_id, + issuer="https://evil.example.com", + ) + status, body = _discover(auth_server.port, project_id, token=token) + assert status == 401 + + def test_wrong_team(self, keypair, auth_server): + """A JWT with a team ID not in COFLUX_STUDIO_TEAMS is rejected.""" + project_id = f"auth-{uuid.uuid4().hex[:8]}" + token = _mint(keypair, auth_server, project_id, team_id="unknown-team") + status, body = _discover(auth_server.port, project_id, token=token) + assert status == 401 + + def test_wrong_host(self, keypair, auth_server): + """A JWT whose ``aud`` host doesn't match the request host is rejected.""" + project_id = f"auth-{uuid.uuid4().hex[:8]}" + private_key, _jwk = keypair + token = mint_jwt( + private_key, + kid=KID, + issuer=auth_server._extra_env["COFLUX_STUDIO_URL"], + team_id=TEAM_ID, + host="wrong-project.localhost:9999", + ) + status, body = _discover(auth_server.port, project_id, token=token) + assert status == 401 + + def test_wrong_signature(self, keypair, auth_server): + """A JWT signed with a different key is rejected.""" + project_id = f"auth-{uuid.uuid4().hex[:8]}" + other_private_key, _other_jwk = generate_keypair(kid=KID) + token = mint_jwt( + other_private_key, + kid=KID, + issuer=auth_server._extra_env["COFLUX_STUDIO_URL"], + team_id=TEAM_ID, + host=_make_host(project_id, auth_server.port), + ) + status, body = _discover(auth_server.port, project_id, token=token) + assert status == 401 + + def test_unknown_kid(self, keypair, auth_server): + """A JWT with an unknown ``kid`` header is rejected.""" + project_id = f"auth-{uuid.uuid4().hex[:8]}" + token = _mint(keypair, auth_server, project_id, kid="nonexistent-key") + status, body = _discover(auth_server.port, project_id, 
token=token) + assert status == 401 + + def test_missing_audience(self, keypair, auth_server): + """A JWT without an ``aud`` claim is rejected.""" + project_id = f"auth-{uuid.uuid4().hex[:8]}" + private_key, _jwk = keypair + now = int(time.time()) + claims = { + "iss": auth_server._extra_env["COFLUX_STUDIO_URL"], + "sub": "test-user", + "exp": now + 3600, + "iat": now, + } + token = pyjwt.encode( + claims, private_key, algorithm="EdDSA", headers={"kid": KID} + ) + status, body = _discover(auth_server.port, project_id, token=token) + assert status == 401 + + +# --------------------------------------------------------------------------- +# Super Token Authentication +# --------------------------------------------------------------------------- + + +class TestSuperToken: + def test_super_token_accepted(self, auth_server): + """The super token still works when Studio auth is also enabled.""" + project_id = f"auth-{uuid.uuid4().hex[:8]}" + status, body = _discover(auth_server.port, project_id, token=SUPER_TOKEN) + assert status == 200 + assert body["access"]["workspaces"] == ["*"] + + def test_invalid_super_token_rejected(self, auth_server): + """A token that isn't the super token and isn't a JWT is rejected.""" + project_id = f"auth-{uuid.uuid4().hex[:8]}" + status, body = _discover( + auth_server.port, project_id, token="not-the-right-token" + ) + assert status == 401 + + def test_super_token_required_for_rotate_epoch(self, keypair, auth_server): + """Admin endpoints reject non-super tokens (including valid JWTs).""" + project_id = f"auth-{uuid.uuid4().hex[:8]}" + + # JWT should be forbidden + jwt_token = _mint(keypair, auth_server, project_id) + status, body = _api_request( + auth_server.port, + project_id, + "rotate_epoch", + token=jwt_token, + body={}, + ) + assert status == 403 + + # Super token should work + status, _body = _api_request( + auth_server.port, + project_id, + "rotate_epoch", + token=SUPER_TOKEN, + body={}, + ) + assert status == 204 or status == 200 
+ + +# --------------------------------------------------------------------------- +# Unauthenticated Access +# --------------------------------------------------------------------------- + + +class TestUnauthenticated: + def test_no_token_rejected(self, auth_server): + """Without a token, ``COFLUX_REQUIRE_AUTH=true`` returns 401.""" + project_id = f"auth-{uuid.uuid4().hex[:8]}" + status, body = _discover(auth_server.port, project_id) + assert status == 401 + assert body["error"] == "unauthorized" + + +# --------------------------------------------------------------------------- +# JWKS Key Rotation +# --------------------------------------------------------------------------- + + +class TestKeyRotation: + def test_new_key_accepted_after_rotation(self, keypair, auth_server, jwks_server): + """When the JWKS is updated with a new key, JWTs signed with + the new key are accepted (the server re-fetches on cache miss).""" + project_id = f"auth-{uuid.uuid4().hex[:8]}" + new_kid = "rotated-key-1" + new_private_key, new_jwk = generate_keypair(kid=new_kid) + _orig_private_key, orig_jwk = keypair + + # Add the new key to the JWKS (keep the old one too) + jwks_server.set_keys([orig_jwk, new_jwk]) + + token = mint_jwt( + new_private_key, + kid=new_kid, + issuer=auth_server._extra_env["COFLUX_STUDIO_URL"], + team_id=TEAM_ID, + host=_make_host(project_id, auth_server.port), + ) + status, body = _discover(auth_server.port, project_id, token=token) + assert status == 200 + + # Restore original keys for other tests + jwks_server.set_keys([orig_jwk]) diff --git a/tests/test_epochs.py b/tests/test_epochs.py index 53b45a60..b0d18a84 100644 --- a/tests/test_epochs.py +++ b/tests/test_epochs.py @@ -1,133 +1,94 @@ """Tests for epoch database rotation and cross-epoch behavior.""" -import urllib.request -import uuid - import support.cli as cli -from support.helpers import managed_worker, poll_result +from support.helpers import api_post, managed_worker, poll_result from support.manifest 
import task, workflow from support.protocol import json_args -from support.server import ManagedServer, SUPER_TOKEN def _rotate_epoch(port, project_id): """Force an epoch rotation via the management API.""" - url = f"http://{project_id}.localhost:{port}/api/rotate_epoch" - req = urllib.request.Request( - url, - method="POST", - data=b"{}", - headers={ - "Content-Type": "application/json", - "Authorization": f"Bearer {SUPER_TOKEN}", - }, - ) - urllib.request.urlopen(req, timeout=10) - - -def test_epoch_rotation_creates_new_epoch(tmp_path): - """Rotation creates a new epoch file and server remains functional.""" - project_id = f"test-{uuid.uuid4().hex[:12]}" - targets = [workflow("test", "my_workflow")] + api_post(port, project_id, "rotate_epoch") - server = ManagedServer(str(tmp_path / "data")) - server.start() - try: - host = f"{project_id}.localhost:{server.port}" - cli.workspaces_create("default", host=host) +def test_epoch_rotation_creates_new_epoch(isolated_server, tmp_path): + """Rotation creates a new epoch file and server remains functional.""" + server, host, project_id = isolated_server + targets = [workflow("test", "my_workflow")] - with managed_worker(targets, host, tmp_path) as executor: - # Submit and complete a workflow - resp = cli.submit("test/my_workflow", host=host) - run_id = resp["runId"] + with managed_worker(targets, host, tmp_path) as executor: + # Submit and complete a workflow + resp = cli.submit("test/my_workflow", host=host) + run_id = resp["runId"] - ex = executor.next_execute() - assert ex.target == "my_workflow" - ex.conn.complete(ex.execution_id, value=42) + ex = executor.next_execute() + assert ex.target == "my_workflow" + ex.conn.complete(ex.execution_id, value=42) - result = poll_result(run_id, host) - assert result["type"] == "value" - assert result["value"]["data"] == 42 + result = poll_result(run_id, host) + assert result["type"] == "value" + assert result["value"]["data"] == 42 - # Force rotation - _rotate_epoch(server.port, 
project_id) + # Force rotation + _rotate_epoch(server.port, project_id) - # Server still works after rotation — submit another workflow - resp2 = cli.submit("test/my_workflow", host=host) - run_id2 = resp2["runId"] + # Server still works after rotation — submit another workflow + resp2 = cli.submit("test/my_workflow", host=host) + run_id2 = resp2["runId"] - ex2 = executor.next_execute() - assert ex2.target == "my_workflow" - ex2.conn.complete(ex2.execution_id, value=99) + ex2 = executor.next_execute() + assert ex2.target == "my_workflow" + ex2.conn.complete(ex2.execution_id, value=99) - result2 = poll_result(run_id2, host) - assert result2["type"] == "value" - assert result2["value"]["data"] == 99 - finally: - server.stop() + result2 = poll_result(run_id2, host) + assert result2["type"] == "value" + assert result2["value"]["data"] == 99 -def test_rerun_across_epoch_boundary(tmp_path): +def test_rerun_across_epoch_boundary(isolated_server, tmp_path): """Re-running a step whose data is in an old epoch works (copy-on-reference).""" - project_id = f"test-{uuid.uuid4().hex[:12]}" + server, host, project_id = isolated_server targets = [workflow("test", "my_workflow")] - server = ManagedServer(str(tmp_path / "data")) - server.start() - - try: - host = f"{project_id}.localhost:{server.port}" - cli.workspaces_create("default", host=host) + with managed_worker(targets, host, tmp_path) as executor: + resp = cli.submit("test/my_workflow", host=host) + run_id = resp["runId"] + step_id = resp["stepId"] - with managed_worker(targets, host, tmp_path) as executor: - resp = cli.submit("test/my_workflow", host=host) - run_id = resp["runId"] - step_id = resp["stepId"] - - ex = executor.next_execute() - assert ex.target == "my_workflow" - ex.conn.complete(ex.execution_id, value=42) + ex = executor.next_execute() + assert ex.target == "my_workflow" + ex.conn.complete(ex.execution_id, value=42) - result = poll_result(run_id, host) - assert result["type"] == "value" - assert 
result["value"]["data"] == 42 + result = poll_result(run_id, host) + assert result["type"] == "value" + assert result["value"]["data"] == 42 - # Force rotation — original run is now in old epoch - _rotate_epoch(server.port, project_id) + # Force rotation — original run is now in old epoch + _rotate_epoch(server.port, project_id) - # Re-run the step (this triggers copy-on-reference from old epoch) - rerun_resp = cli.runs_rerun(step_id, host=host) - assert rerun_resp["attempt"] == 2 + # Re-run the step (this triggers copy-on-reference from old epoch) + rerun_resp = cli.runs_rerun(step_id, host=host) + assert rerun_resp["attempt"] == 2 - # Execute the re-run - ex2 = executor.next_execute() - assert ex2.target == "my_workflow" - ex2.conn.complete(ex2.execution_id, value=99) + # Execute the re-run + ex2 = executor.next_execute() + assert ex2.target == "my_workflow" + ex2.conn.complete(ex2.execution_id, value=99) - result2 = poll_result(run_id, host) - assert result2["type"] == "value" - assert result2["value"]["data"] == 99 - finally: - server.stop() + result2 = poll_result(run_id, host) + assert result2["type"] == "value" + assert result2["value"]["data"] == 99 -def test_cache_hit_across_epoch_boundary(tmp_path): +def test_cache_hit_across_epoch_boundary(isolated_server, tmp_path): """Cached task result from old epoch is found after rotation.""" - project_id = f"test-{uuid.uuid4().hex[:12]}" + server, host, project_id = isolated_server targets = [ workflow("test", "main"), task("test", "expensive", parameters=["x"]), ] - server = ManagedServer(str(tmp_path / "data")) - server.start() - - try: - host = f"{project_id}.localhost:{server.port}" - cli.workspaces_create("default", host=host) - - with managed_worker(targets, host, tmp_path, concurrency=2) as executor: + with managed_worker(targets, host, tmp_path, concurrency=2) as executor: # First workflow: execute a cached task resp1 = cli.submit("test/main", host=host) run_id1 = resp1["runId"] @@ -174,26 +135,17 @@ def 
test_cache_hit_across_epoch_boundary(tmp_path): result2 = poll_result(run_id2, host) assert result2["type"] == "value" assert result2["value"]["data"] == "done2" - finally: - server.stop() -def test_parent_child_run_across_epoch_boundary(tmp_path): +def test_parent_child_run_across_epoch_boundary(isolated_server, tmp_path): """Rerunning a child step copies its parent run from an old epoch (recursive copy).""" - project_id = f"test-{uuid.uuid4().hex[:12]}" + server, host, project_id = isolated_server targets = [ workflow("test", "main"), workflow("test", "child"), ] - server = ManagedServer(str(tmp_path / "data")) - server.start() - - try: - host = f"{project_id}.localhost:{server.port}" - cli.workspaces_create("default", host=host) - - with managed_worker(targets, host, tmp_path, concurrency=2) as executor: + with managed_worker(targets, host, tmp_path, concurrency=2) as executor: # Submit parent workflow resp = cli.submit("test/main", host=host) run_id = resp["runId"] @@ -233,26 +185,17 @@ def test_parent_child_run_across_epoch_boundary(tmp_path): ex = executor.next_execute() assert ex.target == "child" ex.conn.complete(ex.execution_id, value=99) - finally: - server.stop() -def test_resolve_execution_reference_from_old_epoch(tmp_path): +def test_resolve_execution_reference_from_old_epoch(isolated_server, tmp_path): """Resolving an execution reference from an old epoch finds and copies the run.""" - project_id = f"test-{uuid.uuid4().hex[:12]}" + server, host, project_id = isolated_server targets = [ workflow("test", "main"), task("test", "producer", parameters=["x"]), ] - server = ManagedServer(str(tmp_path / "data")) - server.start() - - try: - host = f"{project_id}.localhost:{server.port}" - cli.workspaces_create("default", host=host) - - with managed_worker(targets, host, tmp_path, concurrency=2) as executor: + with managed_worker(targets, host, tmp_path, concurrency=2) as executor: # First workflow: submit a task and capture the execution reference resp1 = 
cli.submit("test/main", host=host) run_id1 = resp1["runId"] @@ -293,26 +236,17 @@ def test_resolve_execution_reference_from_old_epoch(tmp_path): result2 = poll_result(run_id2, host) assert result2["type"] == "value" assert result2["value"]["data"] == "done2" - finally: - server.stop() -def test_multiple_rotations_cache_hit_from_oldest_epoch(tmp_path): +def test_multiple_rotations_cache_hit_from_oldest_epoch(isolated_server, tmp_path): """Cache lookup across 3 epochs finds the match in the oldest one via bloom filter.""" - project_id = f"test-{uuid.uuid4().hex[:12]}" + server, host, project_id = isolated_server targets = [ workflow("test", "main"), task("test", "expensive", parameters=["x"]), ] - server = ManagedServer(str(tmp_path / "data")) - server.start() - - try: - host = f"{project_id}.localhost:{server.port}" - cli.workspaces_create("default", host=host) - - with managed_worker(targets, host, tmp_path, concurrency=2) as executor: + with managed_worker(targets, host, tmp_path, concurrency=2) as executor: # --- Epoch 1: cached task with args(42) → value 84 --- resp1 = cli.submit("test/main", host=host) run_id1 = resp1["runId"] @@ -380,27 +314,18 @@ def test_multiple_rotations_cache_hit_from_oldest_epoch(tmp_path): ex_wf3.conn.complete(ex_wf3.execution_id, value="done3") result3 = poll_result(run_id3, host) assert result3["value"]["data"] == "done3" - finally: - server.stop() -def test_asset_reference_across_epoch_boundary(tmp_path): +def test_asset_reference_across_epoch_boundary(isolated_server, tmp_path): """Assets are preserved when a run is copied from an old epoch via rerun.""" - project_id = f"test-{uuid.uuid4().hex[:12]}" + server, host, project_id = isolated_server targets = [ workflow("test", "main"), task("test", "producer"), task("test", "consumer"), ] - server = ManagedServer(str(tmp_path / "data")) - server.start() - - try: - host = f"{project_id}.localhost:{server.port}" - cli.workspaces_create("default", host=host) - - with 
managed_worker(targets, host, tmp_path, concurrency=2) as executor: + with managed_worker(targets, host, tmp_path, concurrency=2) as executor: # Submit workflow resp = cli.submit("test/main", host=host) run_id = resp["runId"] @@ -477,23 +402,14 @@ def test_asset_reference_across_epoch_boundary(tmp_path): assert rerun_entry[1] == original_size ex.conn.complete(ex.execution_id, value="consumed again") - finally: - server.stop() -def test_idempotency_across_epoch_boundary(tmp_path): +def test_idempotency_across_epoch_boundary(isolated_server, tmp_path): """Idempotency key is found in archived epoch after rotation.""" - project_id = f"test-{uuid.uuid4().hex[:12]}" + server, host, project_id = isolated_server targets = [workflow("test", "my_workflow")] - server = ManagedServer(str(tmp_path / "data")) - server.start() - - try: - host = f"{project_id}.localhost:{server.port}" - cli.workspaces_create("default", host=host) - - with managed_worker(targets, host, tmp_path) as executor: + with managed_worker(targets, host, tmp_path) as executor: # Submit with idempotency key and complete resp1 = cli.submit( "test/my_workflow", idempotency_key="epoch-key", host=host @@ -517,5 +433,3 @@ def test_idempotency_across_epoch_boundary(tmp_path): run_id2 = resp2["runId"] assert run_id1 == run_id2 - finally: - server.stop() diff --git a/tests/test_execution.py b/tests/test_execution.py index aa9b8b90..af816132 100644 --- a/tests/test_execution.py +++ b/tests/test_execution.py @@ -1,8 +1,6 @@ """Tests for core workflow and task execution mechanics.""" import json -import os -import tempfile from support.manifest import task, workflow from support.protocol import execution_error, execution_result, json_args @@ -371,7 +369,7 @@ def test_workflow_calls_workflow(worker): assert ctx.result(run_id)["value"]["data"] == "outer done" -def test_blob_argument_round_trip(worker): +def test_blob_argument_round_trip(worker, tmp_path): """Large file argument is uploaded as blob and received as file on 
the other end.""" targets = [ workflow("test", "main"), @@ -386,35 +384,30 @@ def test_blob_argument_round_trip(worker): # Create a temp file larger than blob threshold (100 bytes) content = "x" * 200 - fd, tmp_path = tempfile.mkstemp(suffix=".json") - try: - with open(fd, "w") as f: - json.dump(content, f) - - # Submit task with file-type argument - file_arg = [{"type": "file", "format": "json", "path": tmp_path}] - ref = ex0.conn.submit_task(ex0.execution_id, "test", "process", file_arg) - - # Receiving executor should get the argument as a file - ex1 = ctx.executor.next_execute() - assert ex1.target == "process" - assert len(ex1.arguments) == 1 - assert ex1.arguments[0]["type"] == "file" - assert "path" in ex1.arguments[0] - - # Read the file to verify content survived the blob round-trip - with open(ex1.arguments[0]["path"]) as f: - received = json.load(f) - assert received == content - - ex1.conn.complete(ex1.execution_id, value="processed") - assert ex0.conn.resolve(ex0.execution_id, ref)["value"] == "processed" - - ex0.conn.complete(ex0.execution_id, value="done") - assert ctx.result(run_id)["value"]["data"] == "done" - finally: - if os.path.exists(tmp_path): - os.unlink(tmp_path) + blob_file = tmp_path / "blob_arg.json" + blob_file.write_text(json.dumps(content)) + + # Submit task with file-type argument + file_arg = [{"type": "file", "format": "json", "path": str(blob_file)}] + ref = ex0.conn.submit_task(ex0.execution_id, "test", "process", file_arg) + + # Receiving executor should get the argument as a file + ex1 = ctx.executor.next_execute() + assert ex1.target == "process" + assert len(ex1.arguments) == 1 + assert ex1.arguments[0]["type"] == "file" + assert "path" in ex1.arguments[0] + + # Read the file to verify content survived the blob round-trip + with open(ex1.arguments[0]["path"]) as f: + received = json.load(f) + assert received == content + + ex1.conn.complete(ex1.execution_id, value="processed") + assert ex0.conn.resolve(ex0.execution_id, 
ref)["value"] == "processed" + + ex0.conn.complete(ex0.execution_id, value="done") + assert ctx.result(run_id)["value"]["data"] == "done" def test_idempotency_same_key_returns_same_run(worker): diff --git a/tests/test_logging.py b/tests/test_logging.py index cb118966..d5e368b0 100644 --- a/tests/test_logging.py +++ b/tests/test_logging.py @@ -1,12 +1,10 @@ """Tests for logging and execution groups.""" -import json import time -import urllib.request +from support.helpers import api_post from support.manifest import task, workflow from support.protocol import json_args, log_message, register_group_notification -from support.server import SUPER_TOKEN def test_log_messages(worker): @@ -253,17 +251,7 @@ def test_log_display_format(worker): def _rotate_logs(port, project_id): """Force a log partition rotation via the management API.""" - url = f"http://{project_id}.localhost:{port}/api/rotate_logs" - req = urllib.request.Request( - url, - method="POST", - data=b"{}", - headers={ - "Content-Type": "application/json", - "Authorization": f"Bearer {SUPER_TOKEN}", - }, - ) - urllib.request.urlopen(req, timeout=10) + api_post(port, project_id, "rotate_logs") def test_logs_across_partition_boundary(worker, server): diff --git a/tests/test_pools.py b/tests/test_pools.py new file mode 100644 index 00000000..570bf440 --- /dev/null +++ b/tests/test_pools.py @@ -0,0 +1,363 @@ +"""Tests for pool management and the process launcher. + +A pool with a process launcher causes the server to automatically start +worker processes when executions are submitted for matching modules. +""" + +import json + +import pytest +import support.cli as cli +from support.executor import Executor +from support.helpers import ADAPTER_SCRIPT, poll_result +from support.manifest import manifest, workflow, task +from support.protocol import json_args + + +# Launcher-managed workers are slower to start than direct workers. 
+_LAUNCH_TIMEOUT = 30 # wait_connections: launcher startup + worker init +_EXEC_TIMEOUT = 15 # next_execute: execution dispatch after worker is ready +_RESULT_TIMEOUT = 15 # poll_result: result propagation + + +@pytest.fixture +def pool_env(server, project_id, tmp_path): + """Provide helpers for pool-based tests. + + Sets up an Executor and manifest file, and yields a context dict + with everything needed to configure a pool and interact with + launched workers. + """ + host = f"{project_id}.localhost:{server.port}" + worker_dir = tmp_path / "pool-worker" + worker_dir.mkdir() + socket_path = str(worker_dir / "executor.sock") + manifest_path = str(worker_dir / "manifest.json") + + executor = Executor(socket_path) + executor.start() + + try: + yield { + "host": host, + "worker_dir": worker_dir, + "socket_path": socket_path, + "manifest_path": manifest_path, + "executor": executor, + } + finally: + executor.close() + + +def _setup_pool(pool_env, targets, modules=None, pool_name="test-pool", provides=None, **kwargs): + """Write manifest and create a process-launcher pool. + + Creates the workspace (if needed) and configures a pool whose launcher + starts a ``coflux worker`` process pointing at the test adapter. + Extra keyword arguments are forwarded to ``cli.pools_update``. 
+ """ + modules = modules or ["test"] + manifest_path = pool_env["manifest_path"] + socket_path = pool_env["socket_path"] + host = pool_env["host"] + + with open(manifest_path, "w") as f: + json.dump(manifest(targets), f) + + cli.pools_update( + pool_name, + modules=modules, + provides=provides, + process_dir=str(pool_env["worker_dir"]), + adapter=["python3", ADAPTER_SCRIPT, "--manifest", manifest_path, "--socket", socket_path], + host=host, + **kwargs, + ) + + +class TestPoolLifecycle: + def test_pool_create_and_list(self, pool_env): + """Creating a pool makes it visible in the pool list.""" + host = pool_env["host"] + targets = [workflow("test", "my_workflow")] + _setup_pool(pool_env, targets) + + pools = cli.pools_list(host=host) + assert "test-pool" in pools + + def test_pool_get(self, pool_env): + """Pool details can be retrieved by name.""" + host = pool_env["host"] + targets = [workflow("test", "my_workflow")] + _setup_pool(pool_env, targets) + + pool = cli.pools_get("test-pool", host=host) + assert pool["launcher"]["type"] == "process" + assert pool["launcher"]["directory"] == str(pool_env["worker_dir"]) + assert "test" in pool["modules"] + + def test_pool_delete(self, pool_env): + """Deleting a pool removes it from the list.""" + host = pool_env["host"] + targets = [workflow("test", "my_workflow")] + _setup_pool(pool_env, targets) + + cli.pools_delete("test-pool", host=host) + pools = cli.pools_list(host=host) + assert "test-pool" not in pools + + +class TestProcessLauncher: + def test_auto_launch_worker(self, pool_env): + """Submitting a workflow for a pooled module auto-launches a worker + that executes the workflow and returns the result.""" + host = pool_env["host"] + executor = pool_env["executor"] + targets = [workflow("test", "greet", parameters=["name"])] + _setup_pool(pool_env, targets) + + resp = cli.submit("test/greet", '"world"', host=host) + + # The server should launch a worker process which connects to our + # executor. 
Wait for that connection. + executor.wait_connections(1, timeout=_LAUNCH_TIMEOUT) + + ex = executor.next_execute(timeout=_EXEC_TIMEOUT) + assert ex.target == "greet" + assert ex.arguments[0]["value"] == "world" + ex.conn.complete(ex.execution_id, value="hello world") + + result = poll_result(resp["runId"], host, timeout=_RESULT_TIMEOUT) + assert result["type"] == "value" + assert result["value"]["data"] == "hello world" + + def test_multiple_executions(self, pool_env): + """A pool-launched worker can handle multiple sequential executions.""" + host = pool_env["host"] + executor = pool_env["executor"] + targets = [workflow("test", "add", parameters=["a", "b"])] + _setup_pool(pool_env, targets) + + # Submit first + resp1 = cli.submit("test/add", "1", "2", host=host) + executor.wait_connections(1, timeout=_LAUNCH_TIMEOUT) + ex1 = executor.next_execute(timeout=_EXEC_TIMEOUT) + ex1.conn.complete(ex1.execution_id, value=3) + result1 = poll_result(resp1["runId"], host, timeout=_RESULT_TIMEOUT) + assert result1["value"]["data"] == 3 + + # Submit second (reuses existing worker) + resp2 = cli.submit("test/add", "10", "20", host=host) + ex2 = executor.next_execute(timeout=_EXEC_TIMEOUT) + ex2.conn.complete(ex2.execution_id, value=30) + result2 = poll_result(resp2["runId"], host, timeout=_RESULT_TIMEOUT) + assert result2["value"]["data"] == 30 + + def test_workflow_with_child_task(self, pool_env): + """A pool-launched worker can submit child tasks during execution.""" + host = pool_env["host"] + executor = pool_env["executor"] + targets = [ + workflow("test", "orchestrator"), + task("test", "double", parameters=["x"]), + ] + _setup_pool(pool_env, targets) + + resp = cli.submit("test/orchestrator", host=host) + executor.wait_connections(1, timeout=_LAUNCH_TIMEOUT) + + # Handle the orchestrator: submit a child task + ex0 = executor.next_execute(timeout=_EXEC_TIMEOUT) + assert ex0.target == "orchestrator" + ref = ex0.conn.submit_task(ex0.execution_id, "test", "double", 
json_args(5)) + + # Handle the child task + ex1 = executor.next_execute(timeout=_EXEC_TIMEOUT) + assert ex1.target == "double" + assert ex1.arguments[0]["value"] == 5 + ex1.conn.complete(ex1.execution_id, value=10) + + # Resolve and complete the orchestrator + resolved = ex0.conn.resolve(ex0.execution_id, ref) + assert resolved["value"] == 10 + ex0.conn.complete(ex0.execution_id, value="done") + + result = poll_result(resp["runId"], host, timeout=_RESULT_TIMEOUT) + assert result["value"]["data"] == "done" + + def test_error_propagation(self, pool_env): + """Errors from pool-launched workers are reported correctly.""" + host = pool_env["host"] + executor = pool_env["executor"] + targets = [workflow("test", "failing")] + _setup_pool(pool_env, targets) + + resp = cli.submit("test/failing", host=host) + executor.wait_connections(1, timeout=_LAUNCH_TIMEOUT) + + ex = executor.next_execute(timeout=_EXEC_TIMEOUT) + ex.conn.fail(ex.execution_id, "RuntimeError", "something broke") + + result = poll_result(resp["runId"], host, timeout=_RESULT_TIMEOUT) + assert result["type"] == "error" + assert result["error"]["type"] == "RuntimeError" + assert result["error"]["message"] == "something broke" + + def test_pool_with_provides(self, pool_env): + """A pool with provides tags matches executions that require them.""" + host = pool_env["host"] + executor = pool_env["executor"] + targets = [ + workflow("test", "gpu_job", requires={"gpu": ["A100"]}), + ] + _setup_pool(pool_env, targets, provides={"gpu": ["A100"]}) + + resp = cli.submit("test/gpu_job", host=host) + executor.wait_connections(1, timeout=_LAUNCH_TIMEOUT) + + ex = executor.next_execute(timeout=_EXEC_TIMEOUT) + assert ex.target == "gpu_job" + ex.conn.complete(ex.execution_id, value="computed") + + result = poll_result(resp["runId"], host, timeout=_RESULT_TIMEOUT) + assert result["value"]["data"] == "computed" + + +class TestCommonLauncherFields: + def test_get_returns_adapter(self, pool_env): + """Adapter configured on a 
pool is returned in pool details.""" + host = pool_env["host"] + targets = [workflow("test", "my_workflow")] + _setup_pool(pool_env, targets) + + pool = cli.pools_get("test-pool", host=host) + adapter = pool["launcher"]["adapter"] + assert adapter[0] == "python3" + assert "--manifest" in adapter + assert "--socket" in adapter + + def test_get_returns_concurrency(self, pool_env): + """Concurrency configured on a pool is returned in pool details.""" + host = pool_env["host"] + targets = [workflow("test", "my_workflow")] + _setup_pool(pool_env, targets, pool_name="conc-pool", concurrency=4) + + pool = cli.pools_get("conc-pool", host=host) + assert pool["launcher"]["concurrency"] == 4 + + def test_get_returns_env(self, pool_env): + """Custom env vars configured on a pool are returned in pool details.""" + host = pool_env["host"] + targets = [workflow("test", "my_workflow")] + _setup_pool( + pool_env, targets, pool_name="env-pool", + env={"MY_VAR": "hello", "OTHER_VAR": "world"}, + ) + + pool = cli.pools_get("env-pool", host=host) + assert pool["launcher"]["env"]["MY_VAR"] == "hello" + assert pool["launcher"]["env"]["OTHER_VAR"] == "world" + + def test_update_common_fields(self, pool_env): + """Common launcher fields can be updated on an existing pool.""" + host = pool_env["host"] + targets = [workflow("test", "my_workflow")] + _setup_pool(pool_env, targets) + + # Update concurrency and env on the existing pool + cli.pools_update("test-pool", concurrency=8, env={"EXTRA": "val"}, host=host) + + pool = cli.pools_get("test-pool", host=host) + assert pool["launcher"]["concurrency"] == 8 + assert pool["launcher"]["env"]["EXTRA"] == "val" + # Original fields should be preserved + assert pool["launcher"]["type"] == "process" + assert pool["launcher"]["directory"] == str(pool_env["worker_dir"]) + + def test_env_reaches_worker(self, pool_env): + """Custom env vars set on a pool are visible in the launched worker.""" + host = pool_env["host"] + worker_dir = pool_env["worker_dir"] 
+ executor = pool_env["executor"] + + # Use a marker file to prove env vars reach the adapter process. + # The adapter script doesn't use env vars directly, but we can + # verify the worker launches successfully with them set, since + # the server injects them into the process environment. + marker = str(worker_dir / "env_marker.txt") + targets = [workflow("test", "check_env")] + + # Create a small adapter wrapper that writes an env var to a file + # before delegating to the real adapter. + wrapper_script = str(worker_dir / "env_wrapper.py") + manifest_path = pool_env["manifest_path"] + socket_path = pool_env["socket_path"] + + with open(wrapper_script, "w") as f: + f.write( + "import os, sys, subprocess\n" + f"with open({marker!r}, 'w') as f:\n" + " f.write(os.environ.get('TEST_POOL_VAR', ''))\n" + "result = subprocess.run(\n" + f" ['python3', {ADAPTER_SCRIPT!r}] + sys.argv[1:],\n" + " stdin=sys.stdin, stdout=sys.stdout, stderr=sys.stderr\n" + ")\n" + "sys.exit(result.returncode)\n" + ) + + with open(manifest_path, "w") as f: + json.dump(manifest(targets), f) + + cli.pools_update( + "env-worker-pool", + modules=["test"], + process_dir=str(worker_dir), + adapter=["python3", wrapper_script, "--manifest", manifest_path, "--socket", socket_path], + env={"TEST_POOL_VAR": "pool-env-works"}, + host=host, + ) + + resp = cli.submit("test/check_env", host=host) + executor.wait_connections(1, timeout=_LAUNCH_TIMEOUT) + + ex = executor.next_execute(timeout=_EXEC_TIMEOUT) + ex.conn.complete(ex.execution_id, value="ok") + poll_result(resp["runId"], host, timeout=_RESULT_TIMEOUT) + + with open(marker) as f: + assert f.read() == "pool-env-works" + + def test_multiple_modules(self, pool_env): + """A pool with multiple modules handles targets from both.""" + host = pool_env["host"] + executor = pool_env["executor"] + targets = [ + workflow("module_a", "job_a"), + workflow("module_b", "job_b"), + ] + _setup_pool(pool_env, targets, modules=["module_a", "module_b"]) + + pool = 
cli.pools_get("test-pool", host=host) + assert "module_a" in pool["modules"] + assert "module_b" in pool["modules"] + + # Submit to first module + resp_a = cli.submit("module_a/job_a", host=host) + executor.wait_connections(1, timeout=_LAUNCH_TIMEOUT) + + ex_a = executor.next_execute(timeout=_EXEC_TIMEOUT) + assert ex_a.target == "job_a" + ex_a.conn.complete(ex_a.execution_id, value="from_a") + + result_a = poll_result(resp_a["runId"], host, timeout=_RESULT_TIMEOUT) + assert result_a["value"]["data"] == "from_a" + + # Submit to second module (reuses the same worker) + resp_b = cli.submit("module_b/job_b", host=host) + + ex_b = executor.next_execute(timeout=_EXEC_TIMEOUT) + assert ex_b.target == "job_b" + ex_b.conn.complete(ex_b.execution_id, value="from_b") + + result_b = poll_result(resp_b["runId"], host, timeout=_RESULT_TIMEOUT) + assert result_b["value"]["data"] == "from_b" diff --git a/tests/test_reconnect.py b/tests/test_reconnect.py index 5dd536ca..0301cf74 100644 --- a/tests/test_reconnect.py +++ b/tests/test_reconnect.py @@ -7,7 +7,6 @@ from support.helpers import managed_worker, poll_result from support.manifest import workflow from support.proxy import TCPProxy -from support.server import ManagedServer def test_result_buffered_on_disconnect(server, tmp_path): @@ -81,39 +80,30 @@ def test_error_buffered_on_disconnect(server, tmp_path): proxy.close() -def test_result_buffered_across_server_restart(tmp_path): +def test_result_buffered_across_server_restart(isolated_server, tmp_path): """Result completed while server is down is delivered after server restart.""" - project_id = f"test-{uuid.uuid4().hex[:12]}" + server, host, project_id = isolated_server targets = [workflow("test", "my_workflow")] - server = ManagedServer(str(tmp_path / "data")) - server.start() - - try: - host = f"{project_id}.localhost:{server.port}" - cli.workspaces_create("default", host=host) + with managed_worker(targets, host, tmp_path) as executor: + resp = 
cli.submit("test/my_workflow", host=host) + run_id = resp["runId"] - with managed_worker(targets, host, tmp_path) as executor: - resp = cli.submit("test/my_workflow", host=host) - run_id = resp["runId"] - - ex = executor.next_execute() - assert ex.target == "my_workflow" + ex = executor.next_execute() + assert ex.target == "my_workflow" - # Kill server - server.stop() - time.sleep(0.5) + # Kill server + server.stop() + time.sleep(0.5) - # Complete execution while server is down - ex.conn.complete(ex.execution_id, value=42) + # Complete execution while server is down + ex.conn.complete(ex.execution_id, value=42) - # Restart server — same port, same data directory - server.start() + # Restart server — same port, same data directory + server.start() - # Worker reconnects, server restores session from DB, - # sends execution IDs, worker flushes buffered result - result = poll_result(run_id, host, timeout=20) - assert result["type"] == "value" - assert result["value"]["data"] == 42 - finally: - server.stop() + # Worker reconnects, server restores session from DB, + # sends execution IDs, worker flushes buffered result + result = poll_result(run_id, host, timeout=20) + assert result["type"] == "value" + assert result["value"]["data"] == 42