diff --git a/README.md b/README.md index cff85e6..54d6c5a 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ fluid diagnose my-dataset -n default --archive |---------|-------------| | `fluid inspect` | List Pods, Runtimes, PVCs, and related resources for a Dataset | | `fluid diagnose` | Collect a support bundle (YAML, logs, events) for a Dataset | +| `fluid diagnose config` | Manage AI/LLM settings for diagnose (`~/.fluid/config`) | | `fluid version` | Print CLI version | For flags and examples, use `--help` on any command: @@ -31,6 +32,21 @@ fluid inspect --help fluid diagnose --help ``` +## AI-assisted diagnosis + +`fluid diagnose` can call an OpenAI-compatible LLM API to analyze collected cluster context, or export prompt files for manual copy/paste. + +```bash +fluid diagnose config set llm-endpoint https://api.openai.com/v1 +export FLUID_LLM_API_KEY=sk-... + +fluid diagnose my-dataset -n default -o dir +``` + +Artifact directory includes `context.json`, `prompt.txt`, and `llm-analysis.txt` when LLM analysis runs. Use `--llm-skip` to collect prompts only without calling the API. + +See [Diagnose guide](docs/guides/diagnose.md) for details. + ## Documentation - [Install](docs/install.md) diff --git a/cmd/fluid/root/diagnose/cmd.go b/cmd/fluid/root/diagnose/cmd.go index 3bee26e..be31726 100644 --- a/cmd/fluid/root/diagnose/cmd.go +++ b/cmd/fluid/root/diagnose/cmd.go @@ -48,8 +48,10 @@ type Options struct { includeControllerLogs bool since string - // LLM/AI output (Phase 3 stub) - promptFile string + promptFile string + llmEndpoint string + llmModel string + llmSkip bool } func NewDiagnoseCommand(configFlags *genericclioptions.ConfigFlags) *cobra.Command { @@ -79,7 +81,14 @@ into a tar.gz archive.`, fluid diagnose my-dataset -n default --output-dir /tmp/diag --no-logs # Only collect events from the last hour - fluid diagnose my-dataset -n default --since 1h`, + fluid diagnose my-dataset -n default --since 1h + + # Configure LLM settings (OpenAI-compatible API) + fluid diagnose config set llm-endpoint https://api.openai.com/v1 + export FLUID_LLM_API_KEY=sk-... + + # Collect artifacts and request LLM analysis + fluid diagnose my-dataset -n default -o dir`, Args: cobra.ExactArgs(1), SilenceUsage: true, SilenceErrors: true, @@ -111,9 +120,13 @@ into a tar.gz archive.`, cmd.Flags().BoolVar(&o.includeControllerLogs, "include-controller-logs", false, "Also collect Fluid controller logs from fluid-system namespace") cmd.Flags().StringVar(&o.since, "since", "", "Only collect logs/events newer than this duration (e.g. 1h, 30m)") - // Phase 3 stub - cmd.Flags().StringVar(&o.promptFile, "prompt-file", "", "[Phase 3] Write prompt-ready diagnostic context to this file") - _ = cmd.Flags().MarkHidden("prompt-file") + // AI-assisted diagnosis + cmd.Flags().StringVar(&o.promptFile, "prompt-file", "", "Also write prompt-ready diagnostic text to this file") + cmd.Flags().StringVar(&o.llmEndpoint, "llm-endpoint", "", "LLM API base URL (overrides FLUID_LLM_ENDPOINT and ~/.fluid/config)") + cmd.Flags().StringVar(&o.llmModel, "llm-model", "", "LLM model name (overrides FLUID_LLM_MODEL and ~/.fluid/config)") + cmd.Flags().BoolVar(&o.llmSkip, "llm-skip", false, "Skip LLM analysis (when endpoint is configured, analysis runs by default)") + + cmd.AddCommand(newConfigCommand()) return cmd } @@ -138,6 +151,11 @@ func (o *Options) run(cmd *cobra.Command) error { return fmt.Errorf("invalid output mode %q, expected tui|dir|stdout", o.output) } + llmSettings, err := diagpkg.ResolveLLMSettings(o.llmEndpoint, o.llmModel, o.llmSkip, cmd.Flags().Changed("llm-skip")) + if err != nil { + return err + } + runOpts := diagpkg.Options{ DatasetName: o.datasetName, Namespace: o.namespace, @@ -147,6 +165,12 @@ func (o *Options) run(cmd *cobra.Command) error { NoLogs: o.noLogs, IncludeControllerLogs: o.includeControllerLogs, Since: o.since, + PromptFile: o.promptFile, + LLMEndpoint: llmSettings.Endpoint, + LLMAPIKey: llmSettings.APIKey, + LLMModel: llmSettings.Model, + LLMSkip: llmSettings.Skip, + Stderr: cmd.ErrOrStderr(), } if o.output == "tui" { runOpts.Output = "dir" @@ -184,6 +208,15 @@ func (o *Options) run(cmd *cobra.Command) error { } fmt.Fprintf(cmd.OutOrStdout(), "diagnose: collected artifacts at %s\n", result.OutputPath) + if result.ContextPath != "" { + fmt.Fprintf(cmd.OutOrStdout(), "diagnose: diagnostic context written to %s\n", result.ContextPath) + } + if result.PromptPath != "" { + fmt.Fprintf(cmd.OutOrStdout(), "diagnose: prompt written to %s\n", result.PromptPath) + } + if result.LLMAnalysisPath != "" { + fmt.Fprintf(cmd.OutOrStdout(), "diagnose: LLM analysis written to %s\n", result.LLMAnalysisPath) + } if result.ArchivePath != "" { fmt.Fprintf(cmd.OutOrStdout(), "diagnose: archive written to %s\n", result.ArchivePath) } diff --git a/cmd/fluid/root/diagnose/config.go b/cmd/fluid/root/diagnose/config.go new file mode 100644 index 0000000..ba889b3 --- /dev/null +++ b/cmd/fluid/root/diagnose/config.go @@ -0,0 +1,219 @@ +// Copyright 2026 The Fluid Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package diagnose + +import ( + "fmt" + "os" + + diagpkg "github.com/fluid-cloudnative/fluid-cli/pkg/diagnose" + "github.com/spf13/cobra" + "sigs.k8s.io/yaml" +) + +func newConfigCommand() *cobra.Command { + cmd := &cobra.Command{ + Use: "config", + Short: "Manage diagnose AI/LLM settings", + Long: `Manage Fluid diagnose AI settings stored in ~/.fluid/config. + +Settings are used for OpenAI-compatible LLM analysis during fluid diagnose. +Prefer FLUID_LLM_API_KEY for secrets instead of storing apiKey in the config file.`, + } + + cmd.AddCommand( + newConfigSetCommand(), + newConfigGetCommand(), + newConfigUnsetCommand(), + newConfigViewCommand(), + ) + + return cmd +} + +func newConfigSetCommand() *cobra.Command { + cmd := &cobra.Command{ + Use: "set", + Short: "Set a diagnose LLM configuration value", + } + cmd.AddCommand( + &cobra.Command{ + Use: "llm-endpoint ", + Short: "Set the default LLM API base URL", + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + if err := diagpkg.SetLLMEndpoint(args[0]); err != nil { + return err + } + fmt.Fprintf(cmd.OutOrStdout(), "diagnose config: saved llm endpoint\n") + return nil + }, + }, + &cobra.Command{ + Use: "llm-api-key ", + Short: "Set the LLM API key in ~/.fluid/config (prefer FLUID_LLM_API_KEY env)", + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + if err := diagpkg.SetLLMAPIKey(args[0]); err != nil { + return err + } + fmt.Fprintf(cmd.OutOrStdout(), "diagnose config: saved llm api key\n") + return nil + }, + }, + &cobra.Command{ + Use: "llm-model ", + Short: "Set the default LLM model name", + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + if err := diagpkg.SetLLMModel(args[0]); err != nil { + return err + } + fmt.Fprintf(cmd.OutOrStdout(), "diagnose config: saved llm model\n") + return nil + }, + }, + ) + return cmd +} + +func newConfigGetCommand() *cobra.Command { + cmd := &cobra.Command{ + Use: "get", + Short: "Print a diagnose LLM configuration value", + } + cmd.AddCommand( + &cobra.Command{ + Use: "llm-endpoint", + Short: "Print the configured LLM API endpoint", + Args: cobra.NoArgs, + RunE: func(cmd *cobra.Command, args []string) error { + endpoint, err := diagpkg.GetLLMEndpoint() + if err != nil { + return err + } + fmt.Fprintln(cmd.OutOrStdout(), endpoint) + return nil + }, + }, + &cobra.Command{ + Use: "llm-api-key", + Short: "Print whether an LLM API key is configured", + Args: cobra.NoArgs, + RunE: func(cmd *cobra.Command, args []string) error { + ok, err := diagpkg.GetLLMAPIKeyConfigured() + if err != nil { + return err + } + if ok { + fmt.Fprintln(cmd.OutOrStdout(), "configured") + } + return nil + }, + }, + &cobra.Command{ + Use: "llm-model", + Short: "Print the configured LLM model", + Args: cobra.NoArgs, + RunE: func(cmd *cobra.Command, args []string) error { + model, err := diagpkg.GetLLMModel() + if err != nil { + return err + } + fmt.Fprintln(cmd.OutOrStdout(), model) + return nil + }, + }, + ) + return cmd +} + +func newConfigUnsetCommand() *cobra.Command { + cmd := &cobra.Command{ + Use: "unset", + Short: "Remove a diagnose LLM configuration value", + } + cmd.AddCommand( + &cobra.Command{ + Use: "llm-endpoint", + Short: "Remove the configured LLM API endpoint", + Args: cobra.NoArgs, + RunE: func(cmd *cobra.Command, args []string) error { + if err := diagpkg.UnsetLLMEndpoint(); err != nil { + return err + } + fmt.Fprintf(cmd.OutOrStdout(), "diagnose config: removed llm endpoint\n") + return nil + }, + }, + &cobra.Command{ + Use: "llm-api-key", + Short: "Remove the LLM API key from ~/.fluid/config", + Args: cobra.NoArgs, + RunE: func(cmd *cobra.Command, args []string) error { + if err := diagpkg.UnsetLLMAPIKey(); err != nil { + return err + } + fmt.Fprintf(cmd.OutOrStdout(), "diagnose config: removed llm api key\n") + return nil + }, + }, + &cobra.Command{ + Use: "llm-model", + Short: "Remove the configured LLM model", + Args: cobra.NoArgs, + RunE: func(cmd *cobra.Command, args []string) error { + if err := diagpkg.UnsetLLMModel(); err != nil { + return err + } + fmt.Fprintf(cmd.OutOrStdout(), "diagnose config: removed llm model\n") + return nil + }, + }, + ) + return cmd +} + +func newConfigViewCommand() *cobra.Command { + cmd := &cobra.Command{ + Use: "view", + Short: "Show the diagnose section of ~/.fluid/config", + Args: cobra.NoArgs, + RunE: func(cmd *cobra.Command, args []string) error { + cfg, err := diagpkg.LoadUserConfig() + if err != nil { + return err + } + if cfg.Diagnose.LLM.APIKey != "" { + cfg.Diagnose.LLM.APIKey = "" + } + path, err := diagpkg.ConfigPath() + if err != nil { + return err + } + if _, err := os.Stat(path); os.IsNotExist(err) { + fmt.Fprintf(cmd.OutOrStdout(), "# %s (not created yet)\n", path) + return nil + } + data, err := yaml.Marshal(cfg) + if err != nil { + return fmt.Errorf("marshaling config: %w", err) + } + fmt.Fprintf(cmd.OutOrStdout(), "# %s\n%s", path, string(data)) + return nil + }, + } + return cmd +} diff --git a/cmd/fluid/root/diagnose/config_test.go b/cmd/fluid/root/diagnose/config_test.go new file mode 100644 index 0000000..1cf72e8 --- /dev/null +++ b/cmd/fluid/root/diagnose/config_test.go @@ -0,0 +1,50 @@ +package diagnose + +import ( + "bytes" + "os" + "testing" + + diagpkg "github.com/fluid-cloudnative/fluid-cli/pkg/diagnose" +) + +func TestDiagnoseConfig_SetAndGet(t *testing.T) { + dir := t.TempDir() + t.Setenv("HOME", dir) + + root := bytes.NewBuffer(nil) + setCmd := newConfigSetCommand() + setCmd.SetOut(root) + setCmd.SetErr(root) + setCmd.SetArgs([]string{"llm-endpoint", "https://api.example.com/v1"}) + if err := setCmd.Execute(); err != nil { + t.Fatalf("config set: %v", err) + } + + got, err := diagpkg.GetLLMEndpoint() + if err != nil { + t.Fatalf("GetLLMEndpoint: %v", err) + } + if got != "https://api.example.com/v1" { + t.Fatalf("got %q", got) + } + + out := &bytes.Buffer{} + getCmd := newConfigGetCommand() + getCmd.SetOut(out) + getCmd.SetArgs([]string{"llm-endpoint"}) + if err := getCmd.Execute(); err != nil { + t.Fatalf("config get: %v", err) + } + if out.String() != "https://api.example.com/v1\n" { + t.Fatalf("get output: %q", out.String()) + } + + path, err := diagpkg.ConfigPath() + if err != nil { + t.Fatalf("ConfigPath: %v", err) + } + if _, err := os.Stat(path); err != nil { + t.Fatalf("config file missing: %v", err) + } +} diff --git a/docs/guides/diagnose.md b/docs/guides/diagnose.md index 0c284ab..0798709 100644 --- a/docs/guides/diagnose.md +++ b/docs/guides/diagnose.md @@ -38,6 +38,9 @@ Artifacts are written under a directory such as `fluid-diagnose-- [flags] # Only collect events from the last hour fluid diagnose my-dataset -n default --since 1h + + # Configure LLM settings (OpenAI-compatible API) + fluid diagnose config set llm-endpoint https://api.openai.com/v1 + export FLUID_LLM_API_KEY=sk-... + + # Collect artifacts and request LLM analysis + fluid diagnose my-dataset -n default -o dir ``` ### Options @@ -40,9 +47,13 @@ fluid diagnose [flags] --archive Package artifacts into a tar.gz archive -h, --help help for diagnose --include-controller-logs Also collect Fluid controller logs from fluid-system namespace + --llm-endpoint string LLM API base URL (overrides FLUID_LLM_ENDPOINT and ~/.fluid/config) + --llm-model string LLM model name (overrides FLUID_LLM_MODEL and ~/.fluid/config) + --llm-skip Skip LLM analysis (when endpoint is configured, analysis runs by default) --no-logs Skip collecting pod logs (useful in large clusters) -o, --output string Output mode: tui|dir|stdout (default "tui") --output-dir string Directory to write artifacts (default: fluid-diagnose--) + --prompt-file string Also write prompt-ready diagnostic text to this file --since string Only collect logs/events newer than this duration (e.g. 1h, 30m) ``` @@ -72,4 +83,5 @@ fluid diagnose [flags] ### SEE ALSO * [fluid](fluid.md) - Inspect and diagnose Fluid-managed datasets +* [fluid diagnose config](fluid_diagnose_config.md) - Manage diagnose AI/LLM settings diff --git a/docs/reference/fluid_diagnose_config.md b/docs/reference/fluid_diagnose_config.md new file mode 100644 index 0000000..9fed649 --- /dev/null +++ b/docs/reference/fluid_diagnose_config.md @@ -0,0 +1,48 @@ +## fluid diagnose config + +Manage diagnose AI/LLM settings + +### Synopsis + +Manage Fluid diagnose AI settings stored in ~/.fluid/config. + +Settings are used for OpenAI-compatible LLM analysis during fluid diagnose. +Prefer FLUID_LLM_API_KEY for secrets instead of storing apiKey in the config file. + +### Options + +``` + -h, --help help for config +``` + +### Options inherited from parent commands + +``` + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. + --as-group stringArray Group to impersonate for the operation, this flag can be repeated to specify multiple groups. + --as-uid string UID to impersonate for the operation. + --cache-dir string Default cache directory (default "~/.kube/cache") + --certificate-authority string Path to a cert file for the certificate authority + --client-certificate string Path to a client certificate file for TLS + --client-key string Path to a client key file for TLS + --cluster string The name of the kubeconfig cluster to use + --context string The name of the kubeconfig context to use + --disable-compression If true, opt-out of response compression for all requests to the server + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + -n, --namespace string If present, the namespace scope for this CLI request + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0") + -s, --server string The address and port of the Kubernetes API server + --tls-server-name string Server name to use for server certificate validation. If it is not provided, the hostname used to contact the server is used + --token string Bearer token for authentication to the API server + --user string The name of the kubeconfig user to use +``` + +### SEE ALSO + +* [fluid diagnose](fluid_diagnose.md) - Collect diagnostic data for a Fluid Dataset and its Runtime(s) +* [fluid diagnose config get](fluid_diagnose_config_get.md) - Print a diagnose LLM configuration value +* [fluid diagnose config set](fluid_diagnose_config_set.md) - Set a diagnose LLM configuration value +* [fluid diagnose config unset](fluid_diagnose_config_unset.md) - Remove a diagnose LLM configuration value +* [fluid diagnose config view](fluid_diagnose_config_view.md) - Show the diagnose section of ~/.fluid/config + diff --git a/docs/reference/fluid_diagnose_config_get.md b/docs/reference/fluid_diagnose_config_get.md new file mode 100644 index 0000000..6efe116 --- /dev/null +++ b/docs/reference/fluid_diagnose_config_get.md @@ -0,0 +1,40 @@ +## fluid diagnose config get + +Print a diagnose LLM configuration value + +### Options + +``` + -h, --help help for get +``` + +### Options inherited from parent commands + +``` + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. + --as-group stringArray Group to impersonate for the operation, this flag can be repeated to specify multiple groups. + --as-uid string UID to impersonate for the operation. + --cache-dir string Default cache directory (default "~/.kube/cache") + --certificate-authority string Path to a cert file for the certificate authority + --client-certificate string Path to a client certificate file for TLS + --client-key string Path to a client key file for TLS + --cluster string The name of the kubeconfig cluster to use + --context string The name of the kubeconfig context to use + --disable-compression If true, opt-out of response compression for all requests to the server + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + -n, --namespace string If present, the namespace scope for this CLI request + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0") + -s, --server string The address and port of the Kubernetes API server + --tls-server-name string Server name to use for server certificate validation. If it is not provided, the hostname used to contact the server is used + --token string Bearer token for authentication to the API server + --user string The name of the kubeconfig user to use +``` + +### SEE ALSO + +* [fluid diagnose config](fluid_diagnose_config.md) - Manage diagnose AI/LLM settings +* [fluid diagnose config get llm-api-key](fluid_diagnose_config_get_llm-api-key.md) - Print whether an LLM API key is configured +* [fluid diagnose config get llm-endpoint](fluid_diagnose_config_get_llm-endpoint.md) - Print the configured LLM API endpoint +* [fluid diagnose config get llm-model](fluid_diagnose_config_get_llm-model.md) - Print the configured LLM model + diff --git a/docs/reference/fluid_diagnose_config_get_llm-api-key.md b/docs/reference/fluid_diagnose_config_get_llm-api-key.md new file mode 100644 index 0000000..d17d346 --- /dev/null +++ b/docs/reference/fluid_diagnose_config_get_llm-api-key.md @@ -0,0 +1,41 @@ +## fluid diagnose config get llm-api-key + +Print whether an LLM API key is configured + +``` +fluid diagnose config get llm-api-key [flags] +``` + +### Options + +``` + -h, --help help for llm-api-key +``` + +### Options inherited from parent commands + +``` + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. + --as-group stringArray Group to impersonate for the operation, this flag can be repeated to specify multiple groups. + --as-uid string UID to impersonate for the operation. + --cache-dir string Default cache directory (default "~/.kube/cache") + --certificate-authority string Path to a cert file for the certificate authority + --client-certificate string Path to a client certificate file for TLS + --client-key string Path to a client key file for TLS + --cluster string The name of the kubeconfig cluster to use + --context string The name of the kubeconfig context to use + --disable-compression If true, opt-out of response compression for all requests to the server + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + -n, --namespace string If present, the namespace scope for this CLI request + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0") + -s, --server string The address and port of the Kubernetes API server + --tls-server-name string Server name to use for server certificate validation. If it is not provided, the hostname used to contact the server is used + --token string Bearer token for authentication to the API server + --user string The name of the kubeconfig user to use +``` + +### SEE ALSO + +* [fluid diagnose config get](fluid_diagnose_config_get.md) - Print a diagnose LLM configuration value + diff --git a/docs/reference/fluid_diagnose_config_get_llm-endpoint.md b/docs/reference/fluid_diagnose_config_get_llm-endpoint.md new file mode 100644 index 0000000..0b1f133 --- /dev/null +++ b/docs/reference/fluid_diagnose_config_get_llm-endpoint.md @@ -0,0 +1,41 @@ +## fluid diagnose config get llm-endpoint + +Print the configured LLM API endpoint + +``` +fluid diagnose config get llm-endpoint [flags] +``` + +### Options + +``` + -h, --help help for llm-endpoint +``` + +### Options inherited from parent commands + +``` + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. + --as-group stringArray Group to impersonate for the operation, this flag can be repeated to specify multiple groups. + --as-uid string UID to impersonate for the operation. + --cache-dir string Default cache directory (default "~/.kube/cache") + --certificate-authority string Path to a cert file for the certificate authority + --client-certificate string Path to a client certificate file for TLS + --client-key string Path to a client key file for TLS + --cluster string The name of the kubeconfig cluster to use + --context string The name of the kubeconfig context to use + --disable-compression If true, opt-out of response compression for all requests to the server + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + -n, --namespace string If present, the namespace scope for this CLI request + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0") + -s, --server string The address and port of the Kubernetes API server + --tls-server-name string Server name to use for server certificate validation. If it is not provided, the hostname used to contact the server is used + --token string Bearer token for authentication to the API server + --user string The name of the kubeconfig user to use +``` + +### SEE ALSO + +* [fluid diagnose config get](fluid_diagnose_config_get.md) - Print a diagnose LLM configuration value + diff --git a/docs/reference/fluid_diagnose_config_get_llm-model.md b/docs/reference/fluid_diagnose_config_get_llm-model.md new file mode 100644 index 0000000..b39e7fc --- /dev/null +++ b/docs/reference/fluid_diagnose_config_get_llm-model.md @@ -0,0 +1,41 @@ +## fluid diagnose config get llm-model + +Print the configured LLM model + +``` +fluid diagnose config get llm-model [flags] +``` + +### Options + +``` + -h, --help help for llm-model +``` + +### Options inherited from parent commands + +``` + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. + --as-group stringArray Group to impersonate for the operation, this flag can be repeated to specify multiple groups. + --as-uid string UID to impersonate for the operation. + --cache-dir string Default cache directory (default "~/.kube/cache") + --certificate-authority string Path to a cert file for the certificate authority + --client-certificate string Path to a client certificate file for TLS + --client-key string Path to a client key file for TLS + --cluster string The name of the kubeconfig cluster to use + --context string The name of the kubeconfig context to use + --disable-compression If true, opt-out of response compression for all requests to the server + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + -n, --namespace string If present, the namespace scope for this CLI request + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0") + -s, --server string The address and port of the Kubernetes API server + --tls-server-name string Server name to use for server certificate validation. If it is not provided, the hostname used to contact the server is used + --token string Bearer token for authentication to the API server + --user string The name of the kubeconfig user to use +``` + +### SEE ALSO + +* [fluid diagnose config get](fluid_diagnose_config_get.md) - Print a diagnose LLM configuration value + diff --git a/docs/reference/fluid_diagnose_config_set.md b/docs/reference/fluid_diagnose_config_set.md new file mode 100644 index 0000000..486dfc7 --- /dev/null +++ b/docs/reference/fluid_diagnose_config_set.md @@ -0,0 +1,40 @@ +## fluid diagnose config set + +Set a diagnose LLM configuration value + +### Options + +``` + -h, --help help for set +``` + +### Options inherited from parent commands + +``` + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. + --as-group stringArray Group to impersonate for the operation, this flag can be repeated to specify multiple groups. + --as-uid string UID to impersonate for the operation. + --cache-dir string Default cache directory (default "~/.kube/cache") + --certificate-authority string Path to a cert file for the certificate authority + --client-certificate string Path to a client certificate file for TLS + --client-key string Path to a client key file for TLS + --cluster string The name of the kubeconfig cluster to use + --context string The name of the kubeconfig context to use + --disable-compression If true, opt-out of response compression for all requests to the server + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + -n, --namespace string If present, the namespace scope for this CLI request + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0") + -s, --server string The address and port of the Kubernetes API server + --tls-server-name string Server name to use for server certificate validation. If it is not provided, the hostname used to contact the server is used + --token string Bearer token for authentication to the API server + --user string The name of the kubeconfig user to use +``` + +### SEE ALSO + +* [fluid diagnose config](fluid_diagnose_config.md) - Manage diagnose AI/LLM settings +* [fluid diagnose config set llm-api-key](fluid_diagnose_config_set_llm-api-key.md) - Set the LLM API key in ~/.fluid/config (prefer FLUID_LLM_API_KEY env) +* [fluid diagnose config set llm-endpoint](fluid_diagnose_config_set_llm-endpoint.md) - Set the default LLM API base URL +* [fluid diagnose config set llm-model](fluid_diagnose_config_set_llm-model.md) - Set the default LLM model name + diff --git a/docs/reference/fluid_diagnose_config_set_llm-api-key.md b/docs/reference/fluid_diagnose_config_set_llm-api-key.md new file mode 100644 index 0000000..1deb139 --- /dev/null +++ b/docs/reference/fluid_diagnose_config_set_llm-api-key.md @@ -0,0 +1,41 @@ +## fluid diagnose config set llm-api-key + +Set the LLM API key in ~/.fluid/config (prefer FLUID_LLM_API_KEY env) + +``` +fluid diagnose config set llm-api-key [flags] +``` + +### Options + +``` + -h, --help help for llm-api-key +``` + +### Options inherited from parent commands + +``` + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. + --as-group stringArray Group to impersonate for the operation, this flag can be repeated to specify multiple groups. + --as-uid string UID to impersonate for the operation. + --cache-dir string Default cache directory (default "~/.kube/cache") + --certificate-authority string Path to a cert file for the certificate authority + --client-certificate string Path to a client certificate file for TLS + --client-key string Path to a client key file for TLS + --cluster string The name of the kubeconfig cluster to use + --context string The name of the kubeconfig context to use + --disable-compression If true, opt-out of response compression for all requests to the server + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + -n, --namespace string If present, the namespace scope for this CLI request + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0") + -s, --server string The address and port of the Kubernetes API server + --tls-server-name string Server name to use for server certificate validation. If it is not provided, the hostname used to contact the server is used + --token string Bearer token for authentication to the API server + --user string The name of the kubeconfig user to use +``` + +### SEE ALSO + +* [fluid diagnose config set](fluid_diagnose_config_set.md) - Set a diagnose LLM configuration value + diff --git a/docs/reference/fluid_diagnose_config_set_llm-endpoint.md b/docs/reference/fluid_diagnose_config_set_llm-endpoint.md new file mode 100644 index 0000000..3573dca --- /dev/null +++ b/docs/reference/fluid_diagnose_config_set_llm-endpoint.md @@ -0,0 +1,41 @@ +## fluid diagnose config set llm-endpoint + +Set the default LLM API base URL + +``` +fluid diagnose config set llm-endpoint [flags] +``` + +### Options + +``` + -h, --help help for llm-endpoint +``` + +### Options inherited from parent commands + +``` + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. + --as-group stringArray Group to impersonate for the operation, this flag can be repeated to specify multiple groups. + --as-uid string UID to impersonate for the operation. + --cache-dir string Default cache directory (default "~/.kube/cache") + --certificate-authority string Path to a cert file for the certificate authority + --client-certificate string Path to a client certificate file for TLS + --client-key string Path to a client key file for TLS + --cluster string The name of the kubeconfig cluster to use + --context string The name of the kubeconfig context to use + --disable-compression If true, opt-out of response compression for all requests to the server + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + -n, --namespace string If present, the namespace scope for this CLI request + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0") + -s, --server string The address and port of the Kubernetes API server + --tls-server-name string Server name to use for server certificate validation. If it is not provided, the hostname used to contact the server is used + --token string Bearer token for authentication to the API server + --user string The name of the kubeconfig user to use +``` + +### SEE ALSO + +* [fluid diagnose config set](fluid_diagnose_config_set.md) - Set a diagnose LLM configuration value + diff --git a/docs/reference/fluid_diagnose_config_set_llm-model.md b/docs/reference/fluid_diagnose_config_set_llm-model.md new file mode 100644 index 0000000..8a3d48b --- /dev/null +++ b/docs/reference/fluid_diagnose_config_set_llm-model.md @@ -0,0 +1,41 @@ +## fluid diagnose config set llm-model + +Set the default LLM model name + +``` +fluid diagnose config set llm-model [flags] +``` + +### Options + +``` + -h, --help help for llm-model +``` + +### Options inherited from parent commands + +``` + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. + --as-group stringArray Group to impersonate for the operation, this flag can be repeated to specify multiple groups. + --as-uid string UID to impersonate for the operation. + --cache-dir string Default cache directory (default "~/.kube/cache") + --certificate-authority string Path to a cert file for the certificate authority + --client-certificate string Path to a client certificate file for TLS + --client-key string Path to a client key file for TLS + --cluster string The name of the kubeconfig cluster to use + --context string The name of the kubeconfig context to use + --disable-compression If true, opt-out of response compression for all requests to the server + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + -n, --namespace string If present, the namespace scope for this CLI request + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0") + -s, --server string The address and port of the Kubernetes API server + --tls-server-name string Server name to use for server certificate validation. If it is not provided, the hostname used to contact the server is used + --token string Bearer token for authentication to the API server + --user string The name of the kubeconfig user to use +``` + +### SEE ALSO + +* [fluid diagnose config set](fluid_diagnose_config_set.md) - Set a diagnose LLM configuration value + diff --git a/docs/reference/fluid_diagnose_config_unset.md b/docs/reference/fluid_diagnose_config_unset.md new file mode 100644 index 0000000..236b2c2 --- /dev/null +++ b/docs/reference/fluid_diagnose_config_unset.md @@ -0,0 +1,40 @@ +## fluid diagnose config unset + +Remove a diagnose LLM configuration value + +### Options + +``` + -h, --help help for unset +``` + +### Options inherited from parent commands + +``` + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. + --as-group stringArray Group to impersonate for the operation, this flag can be repeated to specify multiple groups. + --as-uid string UID to impersonate for the operation. + --cache-dir string Default cache directory (default "~/.kube/cache") + --certificate-authority string Path to a cert file for the certificate authority + --client-certificate string Path to a client certificate file for TLS + --client-key string Path to a client key file for TLS + --cluster string The name of the kubeconfig cluster to use + --context string The name of the kubeconfig context to use + --disable-compression If true, opt-out of response compression for all requests to the server + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + -n, --namespace string If present, the namespace scope for this CLI request + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0") + -s, --server string The address and port of the Kubernetes API server + --tls-server-name string Server name to use for server certificate validation. If it is not provided, the hostname used to contact the server is used + --token string Bearer token for authentication to the API server + --user string The name of the kubeconfig user to use +``` + +### SEE ALSO + +* [fluid diagnose config](fluid_diagnose_config.md) - Manage diagnose AI/LLM settings +* [fluid diagnose config unset llm-api-key](fluid_diagnose_config_unset_llm-api-key.md) - Remove the LLM API key from ~/.fluid/config +* [fluid diagnose config unset llm-endpoint](fluid_diagnose_config_unset_llm-endpoint.md) - Remove the configured LLM API endpoint +* [fluid diagnose config unset llm-model](fluid_diagnose_config_unset_llm-model.md) - Remove the configured LLM model + diff --git a/docs/reference/fluid_diagnose_config_unset_llm-api-key.md b/docs/reference/fluid_diagnose_config_unset_llm-api-key.md new file mode 100644 index 0000000..81100b0 --- /dev/null +++ b/docs/reference/fluid_diagnose_config_unset_llm-api-key.md @@ -0,0 +1,41 @@ +## fluid diagnose config unset llm-api-key + +Remove the LLM API key from ~/.fluid/config + +``` +fluid diagnose config unset llm-api-key [flags] +``` + +### Options + +``` + -h, --help help for llm-api-key +``` + +### Options inherited from parent commands + +``` + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. + --as-group stringArray Group to impersonate for the operation, this flag can be repeated to specify multiple groups. + --as-uid string UID to impersonate for the operation. + --cache-dir string Default cache directory (default "~/.kube/cache") + --certificate-authority string Path to a cert file for the certificate authority + --client-certificate string Path to a client certificate file for TLS + --client-key string Path to a client key file for TLS + --cluster string The name of the kubeconfig cluster to use + --context string The name of the kubeconfig context to use + --disable-compression If true, opt-out of response compression for all requests to the server + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + -n, --namespace string If present, the namespace scope for this CLI request + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0") + -s, --server string The address and port of the Kubernetes API server + --tls-server-name string Server name to use for server certificate validation. If it is not provided, the hostname used to contact the server is used + --token string Bearer token for authentication to the API server + --user string The name of the kubeconfig user to use +``` + +### SEE ALSO + +* [fluid diagnose config unset](fluid_diagnose_config_unset.md) - Remove a diagnose LLM configuration value + diff --git a/docs/reference/fluid_diagnose_config_unset_llm-endpoint.md b/docs/reference/fluid_diagnose_config_unset_llm-endpoint.md new file mode 100644 index 0000000..327c91f --- /dev/null +++ b/docs/reference/fluid_diagnose_config_unset_llm-endpoint.md @@ -0,0 +1,41 @@ +## fluid diagnose config unset llm-endpoint + +Remove the configured LLM API endpoint + +``` +fluid diagnose config unset llm-endpoint [flags] +``` + +### Options + +``` + -h, --help help for llm-endpoint +``` + +### Options inherited from parent commands + +``` + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. + --as-group stringArray Group to impersonate for the operation, this flag can be repeated to specify multiple groups. + --as-uid string UID to impersonate for the operation. + --cache-dir string Default cache directory (default "~/.kube/cache") + --certificate-authority string Path to a cert file for the certificate authority + --client-certificate string Path to a client certificate file for TLS + --client-key string Path to a client key file for TLS + --cluster string The name of the kubeconfig cluster to use + --context string The name of the kubeconfig context to use + --disable-compression If true, opt-out of response compression for all requests to the server + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + -n, --namespace string If present, the namespace scope for this CLI request + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0") + -s, --server string The address and port of the Kubernetes API server + --tls-server-name string Server name to use for server certificate validation. If it is not provided, the hostname used to contact the server is used + --token string Bearer token for authentication to the API server + --user string The name of the kubeconfig user to use +``` + +### SEE ALSO + +* [fluid diagnose config unset](fluid_diagnose_config_unset.md) - Remove a diagnose LLM configuration value + diff --git a/docs/reference/fluid_diagnose_config_unset_llm-model.md b/docs/reference/fluid_diagnose_config_unset_llm-model.md new file mode 100644 index 0000000..724e2fd --- /dev/null +++ b/docs/reference/fluid_diagnose_config_unset_llm-model.md @@ -0,0 +1,41 @@ +## fluid diagnose config unset llm-model + +Remove the configured LLM model + +``` +fluid diagnose config unset llm-model [flags] +``` + +### Options + +``` + -h, --help help for llm-model +``` + +### Options inherited from parent commands + +``` + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. + --as-group stringArray Group to impersonate for the operation, this flag can be repeated to specify multiple groups. + --as-uid string UID to impersonate for the operation. + --cache-dir string Default cache directory (default "~/.kube/cache") + --certificate-authority string Path to a cert file for the certificate authority + --client-certificate string Path to a client certificate file for TLS + --client-key string Path to a client key file for TLS + --cluster string The name of the kubeconfig cluster to use + --context string The name of the kubeconfig context to use + --disable-compression If true, opt-out of response compression for all requests to the server + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + -n, --namespace string If present, the namespace scope for this CLI request + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0") + -s, --server string The address and port of the Kubernetes API server + --tls-server-name string Server name to use for server certificate validation. If it is not provided, the hostname used to contact the server is used + --token string Bearer token for authentication to the API server + --user string The name of the kubeconfig user to use +``` + +### SEE ALSO + +* [fluid diagnose config unset](fluid_diagnose_config_unset.md) - Remove a diagnose LLM configuration value + diff --git a/docs/reference/fluid_diagnose_config_view.md b/docs/reference/fluid_diagnose_config_view.md new file mode 100644 index 0000000..4118f56 --- /dev/null +++ b/docs/reference/fluid_diagnose_config_view.md @@ -0,0 +1,41 @@ +## fluid diagnose config view + +Show the diagnose section of ~/.fluid/config + +``` +fluid diagnose config view [flags] +``` + +### Options + +``` + -h, --help help for view +``` + +### Options inherited from parent commands + +``` + --as string Username to impersonate for the operation. User could be a regular user or a service account in a namespace. + --as-group stringArray Group to impersonate for the operation, this flag can be repeated to specify multiple groups. + --as-uid string UID to impersonate for the operation. + --cache-dir string Default cache directory (default "~/.kube/cache") + --certificate-authority string Path to a cert file for the certificate authority + --client-certificate string Path to a client certificate file for TLS + --client-key string Path to a client key file for TLS + --cluster string The name of the kubeconfig cluster to use + --context string The name of the kubeconfig context to use + --disable-compression If true, opt-out of response compression for all requests to the server + --insecure-skip-tls-verify If true, the server's certificate will not be checked for validity. This will make your HTTPS connections insecure + --kubeconfig string Path to the kubeconfig file to use for CLI requests. + -n, --namespace string If present, the namespace scope for this CLI request + --request-timeout string The length of time to wait before giving up on a single server request. Non-zero values should contain a corresponding time unit (e.g. 1s, 2m, 3h). A value of zero means don't timeout requests. (default "0") + -s, --server string The address and port of the Kubernetes API server + --tls-server-name string Server name to use for server certificate validation. If it is not provided, the hostname used to contact the server is used + --token string Bearer token for authentication to the API server + --user string The name of the kubeconfig user to use +``` + +### SEE ALSO + +* [fluid diagnose config](fluid_diagnose_config.md) - Manage diagnose AI/LLM settings + diff --git a/pkg/diagnose/ai_outputs.go b/pkg/diagnose/ai_outputs.go new file mode 100644 index 0000000..4022c55 --- /dev/null +++ b/pkg/diagnose/ai_outputs.go @@ -0,0 +1,104 @@ +// Copyright 2026 The Fluid Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package diagnose + +import ( + "context" + "fmt" + "io" + "os" + "path/filepath" + "strings" +) + +const ( + defaultContextFile = "context.json" + defaultPromptFile = "prompt.txt" + defaultAnalysisFile = "llm-analysis.txt" +) + +type aiOutputPaths struct { + ContextPath string + PromptPath string + AnalysisPath string +} + +// writeAIOutputs builds DiagnosticContext and writes context.json, prompt, and optional LLM analysis. +func writeAIOutputs(ctx context.Context, baseDir string, in BuildContextInput, opts Options, stderr io.Writer) (aiOutputPaths, error) { + diagCtx := BuildContext(in) + contextBytes, err := ContextAsJSON(diagCtx) + if err != nil { + return aiOutputPaths{}, fmt.Errorf("marshaling diagnostic context: %w", err) + } + prompt := FormatPrompt(diagCtx) + + paths := aiOutputPaths{} + if baseDir != "" { + contextPath := filepath.Join(baseDir, defaultContextFile) + if err := os.WriteFile(contextPath, contextBytes, 0o644); err != nil { + return aiOutputPaths{}, fmt.Errorf("writing %s: %w", defaultContextFile, err) + } + paths.ContextPath = contextPath + + promptPath := filepath.Join(baseDir, defaultPromptFile) + if err := os.WriteFile(promptPath, []byte(prompt), 0o644); err != nil { + return aiOutputPaths{}, fmt.Errorf("writing %s: %w", defaultPromptFile, err) + } + paths.PromptPath = promptPath + } + + extraPrompt := strings.TrimSpace(opts.PromptFile) + if extraPrompt != "" { + if err := os.WriteFile(extraPrompt, []byte(prompt), 0o644); err != nil { + return aiOutputPaths{}, fmt.Errorf("writing prompt file %q: %w", extraPrompt, err) + } + if paths.PromptPath == "" { + paths.PromptPath = extraPrompt + } + } + + if opts.LLMSkip || strings.TrimSpace(opts.LLMEndpoint) == "" { + return paths, nil + } + + client := NewLLMClient(opts.LLMEndpoint, opts.LLMSkip) + analysis, err := client.Diagnose(ctx, LLMRequest{ + Endpoint: opts.LLMEndpoint, + APIKey: opts.LLMAPIKey, + Model: opts.LLMModel, + Prompt: prompt, + }) + if err != nil { + return paths, fmt.Errorf("LLM diagnosis: %w", err) + } + + analysisPath := extraPrompt + if baseDir != "" { + analysisPath = filepath.Join(baseDir, defaultAnalysisFile) + } + if analysisPath == "" { + return paths, fmt.Errorf("LLM analysis produced output but no path to write it (use --output=dir or --prompt-file)") + } + if err := os.WriteFile(analysisPath, []byte(analysis+"\n"), 0o644); err != nil { + return paths, fmt.Errorf("writing LLM analysis: %w", err) + } + paths.AnalysisPath = analysisPath + + if stderr != nil { + fmt.Fprintf(stderr, "diagnose: LLM analysis written to %s\n", analysisPath) + } + + return paths, nil +} diff --git a/pkg/diagnose/ai_outputs_test.go b/pkg/diagnose/ai_outputs_test.go new file mode 100644 index 0000000..7372268 --- /dev/null +++ b/pkg/diagnose/ai_outputs_test.go @@ -0,0 +1,54 @@ +package diagnose + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "strings" + "testing" + "time" + + fluidv1alpha1 "github.com/fluid-cloudnative/fluid/api/v1alpha1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func TestWriteAIOutputs_LLMAnalysis(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _ = json.NewEncoder(w).Encode(chatCompletionResponse{ + Choices: []struct { + Message chatMessage `json:"message"` + }{{Message: chatMessage{Content: "Analysis result"}}}, + }) + })) + defer server.Close() + + dir := t.TempDir() + paths, err := writeAIOutputs(context.Background(), dir, BuildContextInput{ + GeneratedAt: time.Now(), + Dataset: &fluidv1alpha1.Dataset{ + ObjectMeta: metav1.ObjectMeta{Name: "demo", Namespace: "default"}, + }, + }, Options{ + LLMEndpoint: server.URL, + LLMAPIKey: "key", + LLMModel: "test", + LLMSkip: false, + }, nil) + if err != nil { + t.Fatalf("writeAIOutputs: %v", err) + } + analysisPath := filepath.Join(dir, defaultAnalysisFile) + if paths.AnalysisPath != analysisPath { + t.Fatalf("AnalysisPath: got %q want %q", paths.AnalysisPath, analysisPath) + } + data, err := os.ReadFile(analysisPath) + if err != nil { + t.Fatalf("read analysis: %v", err) + } + if !strings.Contains(string(data), "Analysis result") { + t.Fatalf("analysis content: %q", string(data)) + } +} diff --git a/pkg/diagnose/config.go b/pkg/diagnose/config.go new file mode 100644 index 0000000..2e355df --- /dev/null +++ b/pkg/diagnose/config.go @@ -0,0 +1,283 @@ +// Copyright 2026 The Fluid Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package diagnose + +import ( + "fmt" + "net/url" + "os" + "path/filepath" + "strings" + + "sigs.k8s.io/yaml" +) + +const ( + envLLMEndpoint = "FLUID_LLM_ENDPOINT" + envLLMAPIKey = "FLUID_LLM_API_KEY" + envLLMModel = "FLUID_LLM_MODEL" +) + +// UserConfig is the on-disk Fluid CLI configuration (~/.fluid/config). +type UserConfig struct { + Diagnose DiagnoseConfig `json:"diagnose" yaml:"diagnose"` +} + +type DiagnoseConfig struct { + LLM LLMConfig `json:"llm" yaml:"llm"` +} + +type LLMConfig struct { + Endpoint string `json:"endpoint,omitempty" yaml:"endpoint,omitempty"` + APIKey string `json:"apiKey,omitempty" yaml:"apiKey,omitempty"` + Model string `json:"model,omitempty" yaml:"model,omitempty"` +} + +// LLMSettings are resolved settings used at diagnose runtime. +type LLMSettings struct { + Endpoint string + APIKey string + Model string + Skip bool +} + +// ConfigPath returns the default user config file path (~/.fluid/config). +func ConfigPath() (string, error) { + home, err := os.UserHomeDir() + if err != nil { + return "", fmt.Errorf("resolving home directory: %w", err) + } + return filepath.Join(home, ".fluid", "config"), nil +} + +// LoadUserConfig reads ~/.fluid/config. A missing file yields an empty config. +func LoadUserConfig() (UserConfig, error) { + path, err := ConfigPath() + if err != nil { + return UserConfig{}, err + } + data, err := os.ReadFile(path) + if err != nil { + if os.IsNotExist(err) { + return UserConfig{}, nil + } + return UserConfig{}, fmt.Errorf("reading config %q: %w", path, err) + } + var cfg UserConfig + if err := yaml.Unmarshal(data, &cfg); err != nil { + return UserConfig{}, fmt.Errorf("parsing config %q: %w", path, err) + } + return cfg, nil +} + +// SaveUserConfig writes ~/.fluid/config with mode 0600. +func SaveUserConfig(cfg UserConfig) error { + path, err := ConfigPath() + if err != nil { + return err + } + if err := os.MkdirAll(filepath.Dir(path), 0o700); err != nil { + return fmt.Errorf("creating config directory: %w", err) + } + data, err := yaml.Marshal(cfg) + if err != nil { + return fmt.Errorf("marshaling config: %w", err) + } + if err := os.WriteFile(path, data, 0o600); err != nil { + return fmt.Errorf("writing config %q: %w", path, err) + } + return nil +} + +// ValidateLLMEndpoint checks that endpoint is a valid HTTP(S) URL with a host. +func ValidateLLMEndpoint(endpoint string) error { + endpoint = strings.TrimSpace(endpoint) + if endpoint == "" { + return fmt.Errorf("endpoint is required") + } + u, err := url.Parse(endpoint) + if err != nil { + return fmt.Errorf("invalid endpoint URL: %w", err) + } + if u.Scheme != "http" && u.Scheme != "https" { + return fmt.Errorf("endpoint must use http or https scheme") + } + if u.Host == "" { + return fmt.Errorf("endpoint must include a host") + } + return nil +} + +// ResolveLLMSettings resolves LLM settings with precedence: +// endpoint: flag > FLUID_LLM_ENDPOINT > ~/.fluid/config +// api_key: FLUID_LLM_API_KEY > ~/.fluid/config +// model: flag > FLUID_LLM_MODEL > ~/.fluid/config > defaultLLMModel +// skip: explicit --llm-skip; when endpoint is set and --llm-skip not passed, LLM is enabled +func ResolveLLMSettings(flagEndpoint, flagModel string, flagSkip, flagSkipSet bool) (LLMSettings, error) { + settings := LLMSettings{Skip: true, Model: defaultLLMModel} + + cfg, err := LoadUserConfig() + if err != nil { + return LLMSettings{}, err + } + + flagEndpoint = strings.TrimSpace(flagEndpoint) + if flagEndpoint != "" { + if err := ValidateLLMEndpoint(flagEndpoint); err != nil { + return LLMSettings{}, err + } + settings.Endpoint = flagEndpoint + } else if envEndpoint := strings.TrimSpace(os.Getenv(envLLMEndpoint)); envEndpoint != "" { + if err := ValidateLLMEndpoint(envEndpoint); err != nil { + return LLMSettings{}, fmt.Errorf("%s: %w", envLLMEndpoint, err) + } + settings.Endpoint = envEndpoint + } else if fileEndpoint := strings.TrimSpace(cfg.Diagnose.LLM.Endpoint); fileEndpoint != "" { + if err := ValidateLLMEndpoint(fileEndpoint); err != nil { + return LLMSettings{}, fmt.Errorf("config file endpoint: %w", err) + } + settings.Endpoint = fileEndpoint + } + + if envKey := strings.TrimSpace(os.Getenv(envLLMAPIKey)); envKey != "" { + settings.APIKey = envKey + } else if fileKey := strings.TrimSpace(cfg.Diagnose.LLM.APIKey); fileKey != "" { + settings.APIKey = fileKey + } + + flagModel = strings.TrimSpace(flagModel) + if flagModel != "" { + settings.Model = flagModel + } else if envModel := strings.TrimSpace(os.Getenv(envLLMModel)); envModel != "" { + settings.Model = envModel + } else if fileModel := strings.TrimSpace(cfg.Diagnose.LLM.Model); fileModel != "" { + settings.Model = fileModel + } + + if settings.Endpoint != "" { + if flagSkipSet { + settings.Skip = flagSkip + } else { + settings.Skip = false + } + } + + if !settings.Skip && settings.Endpoint != "" && settings.APIKey == "" { + return LLMSettings{}, fmt.Errorf("LLM API key is required when LLM analysis is enabled (set %s or `fluid diagnose config set llm-api-key`)", envLLMAPIKey) + } + + return settings, nil +} + +// SetLLMEndpoint persists the LLM endpoint in user config. +func SetLLMEndpoint(endpoint string) error { + if err := ValidateLLMEndpoint(endpoint); err != nil { + return err + } + cfg, err := LoadUserConfig() + if err != nil { + return err + } + cfg.Diagnose.LLM.Endpoint = strings.TrimSpace(endpoint) + return SaveUserConfig(cfg) +} + +// GetLLMEndpoint returns the configured LLM endpoint, or empty if unset. +func GetLLMEndpoint() (string, error) { + cfg, err := LoadUserConfig() + if err != nil { + return "", err + } + return strings.TrimSpace(cfg.Diagnose.LLM.Endpoint), nil +} + +// UnsetLLMEndpoint removes the LLM endpoint from user config. +func UnsetLLMEndpoint() error { + cfg, err := LoadUserConfig() + if err != nil { + return err + } + cfg.Diagnose.LLM.Endpoint = "" + return SaveUserConfig(cfg) +} + +// SetLLMAPIKey persists the LLM API key in user config. +func SetLLMAPIKey(apiKey string) error { + apiKey = strings.TrimSpace(apiKey) + if apiKey == "" { + return fmt.Errorf("api key is required") + } + cfg, err := LoadUserConfig() + if err != nil { + return err + } + cfg.Diagnose.LLM.APIKey = apiKey + return SaveUserConfig(cfg) +} + +// GetLLMAPIKey returns whether an API key is configured (never returns the secret value). +func GetLLMAPIKeyConfigured() (bool, error) { + if strings.TrimSpace(os.Getenv(envLLMAPIKey)) != "" { + return true, nil + } + cfg, err := LoadUserConfig() + if err != nil { + return false, err + } + return strings.TrimSpace(cfg.Diagnose.LLM.APIKey) != "", nil +} + +// UnsetLLMAPIKey removes the LLM API key from user config. +func UnsetLLMAPIKey() error { + cfg, err := LoadUserConfig() + if err != nil { + return err + } + cfg.Diagnose.LLM.APIKey = "" + return SaveUserConfig(cfg) +} + +// SetLLMModel persists the default LLM model name. +func SetLLMModel(model string) error { + model = strings.TrimSpace(model) + if model == "" { + return fmt.Errorf("model is required") + } + cfg, err := LoadUserConfig() + if err != nil { + return err + } + cfg.Diagnose.LLM.Model = model + return SaveUserConfig(cfg) +} + +// GetLLMModel returns the configured model or empty if unset. +func GetLLMModel() (string, error) { + cfg, err := LoadUserConfig() + if err != nil { + return "", err + } + return strings.TrimSpace(cfg.Diagnose.LLM.Model), nil +} + +// UnsetLLMModel removes the configured model from user config. +func UnsetLLMModel() error { + cfg, err := LoadUserConfig() + if err != nil { + return err + } + cfg.Diagnose.LLM.Model = "" + return SaveUserConfig(cfg) +} diff --git a/pkg/diagnose/config_test.go b/pkg/diagnose/config_test.go new file mode 100644 index 0000000..e692e57 --- /dev/null +++ b/pkg/diagnose/config_test.go @@ -0,0 +1,149 @@ +package diagnose + +import ( + "os" + "path/filepath" + "testing" +) + +func TestLoadSaveUserConfig_RoundTrip(t *testing.T) { + dir := t.TempDir() + t.Setenv("HOME", dir) + + cfg := UserConfig{ + Diagnose: DiagnoseConfig{ + LLM: LLMConfig{Endpoint: "https://api.example.com/v1"}, + }, + } + if err := SaveUserConfig(cfg); err != nil { + t.Fatalf("SaveUserConfig: %v", err) + } + + loaded, err := LoadUserConfig() + if err != nil { + t.Fatalf("LoadUserConfig: %v", err) + } + if loaded.Diagnose.LLM.Endpoint != cfg.Diagnose.LLM.Endpoint { + t.Fatalf("endpoint: got %q want %q", loaded.Diagnose.LLM.Endpoint, cfg.Diagnose.LLM.Endpoint) + } + + path, err := ConfigPath() + if err != nil { + t.Fatalf("ConfigPath: %v", err) + } + info, err := os.Stat(path) + if err != nil { + t.Fatalf("stat config: %v", err) + } + if info.Mode().Perm() != 0o600 { + t.Fatalf("config mode: got %o want 0600", info.Mode().Perm()) + } +} + +func TestResolveLLMSettings_Precedence(t *testing.T) { + dir := t.TempDir() + t.Setenv("HOME", dir) + if err := SaveUserConfig(UserConfig{ + Diagnose: DiagnoseConfig{LLM: LLMConfig{Endpoint: "https://file.example/v1"}}, + }); err != nil { + t.Fatalf("SaveUserConfig: %v", err) + } + + t.Setenv(envLLMEndpoint, "https://env.example/v1") + t.Setenv(envLLMAPIKey, "env-key") + settings, err := ResolveLLMSettings("", "", false, false) + if err != nil { + t.Fatalf("ResolveLLMSettings env: %v", err) + } + if settings.Endpoint != "https://env.example/v1" { + t.Fatalf("env endpoint: got %q", settings.Endpoint) + } + if settings.APIKey != "env-key" { + t.Fatalf("env api key: got %q", settings.APIKey) + } + if settings.Skip { + t.Fatal("expected skip=false when endpoint configured without --llm-skip") + } + + t.Setenv(envLLMAPIKey, "") + _, err = ResolveLLMSettings("https://flag.example/v1", "", false, false) + if err == nil { + t.Fatal("expected error when endpoint set without API key") + } + + settings, err = ResolveLLMSettings("https://flag.example/v1", "", false, true) + if err == nil { + t.Fatal("expected error when skip=false without API key") + } + _ = settings + + settings, err = ResolveLLMSettings("https://flag.example/v1", "custom-model", true, true) + if err != nil { + t.Fatalf("ResolveLLMSettings explicit skip: %v", err) + } + if !settings.Skip { + t.Fatal("expected skip=true when --llm-skip=true") + } + if settings.Model != "custom-model" { + t.Fatalf("model: got %q", settings.Model) + } +} + +func TestValidateLLMEndpoint(t *testing.T) { + t.Parallel() + + cases := []struct { + endpoint string + wantErr bool + }{ + {"https://api.example.com/v1", false}, + {"http://localhost:8080", false}, + {"", true}, + {"ftp://bad.example", true}, + {"not-a-url", true}, + } + for _, tc := range cases { + err := ValidateLLMEndpoint(tc.endpoint) + if tc.wantErr && err == nil { + t.Fatalf("expected error for %q", tc.endpoint) + } + if !tc.wantErr && err != nil { + t.Fatalf("unexpected error for %q: %v", tc.endpoint, err) + } + } +} + +func TestSetGetUnsetLLMEndpoint(t *testing.T) { + dir := t.TempDir() + t.Setenv("HOME", dir) + + if err := SetLLMEndpoint("https://api.example.com/v1"); err != nil { + t.Fatalf("SetLLMEndpoint: %v", err) + } + got, err := GetLLMEndpoint() + if err != nil { + t.Fatalf("GetLLMEndpoint: %v", err) + } + if got != "https://api.example.com/v1" { + t.Fatalf("got %q", got) + } + + if err := UnsetLLMEndpoint(); err != nil { + t.Fatalf("UnsetLLMEndpoint: %v", err) + } + got, err = GetLLMEndpoint() + if err != nil { + t.Fatalf("GetLLMEndpoint after unset: %v", err) + } + if got != "" { + t.Fatalf("expected empty endpoint, got %q", got) + } + + path, _ := ConfigPath() + if _, err := os.Stat(path); err != nil { + t.Fatalf("config file should exist: %v", err) + } + if filepath.Base(path) != "config" { + t.Fatalf("unexpected config basename: %s", path) + } +} diff --git a/pkg/diagnose/context_test.go b/pkg/diagnose/context_test.go new file mode 100644 index 0000000..404e071 --- /dev/null +++ b/pkg/diagnose/context_test.go @@ -0,0 +1,87 @@ +package diagnose + +import ( + "encoding/json" + "strings" + "testing" + "time" + + fluidv1alpha1 "github.com/fluid-cloudnative/fluid/api/v1alpha1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func TestBuildContext_DeterministicAndGuards(t *testing.T) { + t.Parallel() + + now := time.Date(2026, 5, 20, 12, 0, 0, 0, time.UTC) + ds := &fluidv1alpha1.Dataset{ + ObjectMeta: metav1.ObjectMeta{ + Name: "demo", + Namespace: "default", + Labels: map[string]string{ + "token": "secret-value", + "team": "fluid", + }, + }, + Status: fluidv1alpha1.DatasetStatus{ + Phase: fluidv1alpha1.BoundDatasetPhase, + Conditions: []fluidv1alpha1.DatasetCondition{ + {Type: fluidv1alpha1.DatasetReady, Status: corev1.ConditionFalse, Reason: "X", Message: strings.Repeat("m", 600)}, + }, + }, + } + + events := make([]corev1.Event, 0, 60) + for i := 0; i < 60; i++ { + events = append(events, corev1.Event{ + Type: corev1.EventTypeWarning, + Reason: "Failed", + Message: strings.Repeat("e", 600), + LastTimestamp: metav1.Time{Time: now.Add(time.Duration(i) * time.Minute)}, + }) + } + + ctx := BuildContext(BuildContextInput{ + GeneratedAt: now, + Dataset: ds, + Pods: []corev1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "b", Namespace: "default"}, Status: corev1.PodStatus{Phase: corev1.PodRunning}}, + {ObjectMeta: metav1.ObjectMeta{Name: "a", Namespace: "default"}, Status: corev1.PodStatus{Phase: corev1.PodPending}}, + }, + Events: events, + }) + + if ctx.Dataset.Labels["token"] != "" { + t.Fatalf("expected redacted token label") + } + if len(ctx.Events) != maxPromptEvents { + t.Fatalf("events: got %d want %d", len(ctx.Events), maxPromptEvents) + } + if !strings.Contains(ctx.Events[0].Message, "...(truncated)") { + t.Fatalf("expected truncated event message") + } + if ctx.Pods[0].Name != "a" || ctx.Pods[1].Name != "b" { + t.Fatalf("pods not sorted: %+v", ctx.Pods) + } + + b1, err := json.Marshal(ctx) + if err != nil { + t.Fatalf("marshal: %v", err) + } + b2, err := json.Marshal(BuildContext(BuildContextInput{ + GeneratedAt: now, + Dataset: ds, + Pods: []corev1.Pod{ + {ObjectMeta: metav1.ObjectMeta{Name: "b", Namespace: "default"}, Status: corev1.PodStatus{Phase: corev1.PodRunning}}, + {ObjectMeta: metav1.ObjectMeta{Name: "a", Namespace: "default"}, Status: corev1.PodStatus{Phase: corev1.PodPending}}, + }, + Events: events, + })) + if err != nil { + t.Fatalf("marshal 2: %v", err) + } + if string(b1) != string(b2) { + t.Fatal("expected deterministic JSON output") + } +} diff --git a/pkg/diagnose/llm.go b/pkg/diagnose/llm.go index 111f6b7..0191ab6 100644 --- a/pkg/diagnose/llm.go +++ b/pkg/diagnose/llm.go @@ -12,15 +12,25 @@ // See the License for the specific language governing permissions and // limitations under the License. +// LLM integration uses an OpenAI-compatible chat completions API (net/http only). +// +// POST {endpoint}/v1/chat/completions +// Authorization: Bearer +// Body: { "model": "...", "messages": [ { "role": "system", ... }, { "role": "user", ... } ] } + package diagnose import ( "context" - "fmt" + "strings" ) +const defaultLLMModel = "gpt-4o-mini" + type LLMRequest struct { Endpoint string + APIKey string + Model string Prompt string } @@ -28,9 +38,17 @@ type LLMClient interface { Diagnose(ctx context.Context, req LLMRequest) (string, error) } -// NoopLLMClient is a Phase 3 placeholder. HTTP integration is intentionally deferred. +// NoopLLMClient skips remote calls. type NoopLLMClient struct{} -func (NoopLLMClient) Diagnose(_ context.Context, req LLMRequest) (string, error) { - return "", fmt.Errorf("llm diagnose is not implemented (endpoint=%q)", req.Endpoint) +func (NoopLLMClient) Diagnose(_ context.Context, _ LLMRequest) (string, error) { + return "", nil +} + +// NewLLMClient returns an HTTP client when endpoint is set and skip is false. +func NewLLMClient(endpoint string, skip bool) LLMClient { + if skip || strings.TrimSpace(endpoint) == "" { + return NoopLLMClient{} + } + return &HTTPLLMClient{} } diff --git a/pkg/diagnose/llm_http.go b/pkg/diagnose/llm_http.go new file mode 100644 index 0000000..4f8cba9 --- /dev/null +++ b/pkg/diagnose/llm_http.go @@ -0,0 +1,129 @@ +// Copyright 2026 The Fluid Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package diagnose + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "strings" + "time" +) + +const defaultLLMHTTPTimeout = 120 * time.Second + +type chatMessage struct { + Role string `json:"role"` + Content string `json:"content"` +} + +type chatCompletionRequest struct { + Model string `json:"model"` + Messages []chatMessage `json:"messages"` +} + +type chatCompletionResponse struct { + Choices []struct { + Message chatMessage `json:"message"` + } `json:"choices"` + Error *struct { + Message string `json:"message"` + Type string `json:"type"` + } `json:"error,omitempty"` +} + +// HTTPLLMClient calls an OpenAI-compatible chat completions API. +type HTTPLLMClient struct { + HTTPClient *http.Client +} + +func (c *HTTPLLMClient) Diagnose(ctx context.Context, req LLMRequest) (string, error) { + if strings.TrimSpace(req.Endpoint) == "" { + return "", fmt.Errorf("llm endpoint is required") + } + if strings.TrimSpace(req.APIKey) == "" { + return "", fmt.Errorf("llm API key is required (set %s or diagnose.llm.api_key in ~/.fluid/config)", envLLMAPIKey) + } + model := strings.TrimSpace(req.Model) + if model == "" { + model = defaultLLMModel + } + + system, user := SplitPromptForChat(req.Prompt) + body, err := json.Marshal(chatCompletionRequest{ + Model: model, + Messages: []chatMessage{ + {Role: "system", Content: system}, + {Role: "user", Content: user}, + }, + }) + if err != nil { + return "", fmt.Errorf("marshaling chat request: %w", err) + } + + url := chatCompletionsURL(req.Endpoint) + httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body)) + if err != nil { + return "", fmt.Errorf("creating request: %w", err) + } + httpReq.Header.Set("Content-Type", "application/json") + httpReq.Header.Set("Authorization", "Bearer "+req.APIKey) + + client := c.HTTPClient + if client == nil { + client = &http.Client{Timeout: defaultLLMHTTPTimeout} + } + + resp, err := client.Do(httpReq) + if err != nil { + return "", fmt.Errorf("calling LLM endpoint %q: %w", url, err) + } + defer resp.Body.Close() + + respBody, err := io.ReadAll(io.LimitReader(resp.Body, 8<<20)) + if err != nil { + return "", fmt.Errorf("reading LLM response: %w", err) + } + + var parsed chatCompletionResponse + if err := json.Unmarshal(respBody, &parsed); err != nil { + return "", fmt.Errorf("decoding LLM response (HTTP %d): %w", resp.StatusCode, err) + } + if parsed.Error != nil && parsed.Error.Message != "" { + return "", fmt.Errorf("LLM API error: %s", parsed.Error.Message) + } + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + msg := strings.TrimSpace(string(respBody)) + if len(msg) > 500 { + msg = msg[:500] + "..." + } + return "", fmt.Errorf("LLM API returned HTTP %d: %s", resp.StatusCode, msg) + } + if len(parsed.Choices) == 0 || strings.TrimSpace(parsed.Choices[0].Message.Content) == "" { + return "", fmt.Errorf("LLM API returned no choices") + } + return strings.TrimSpace(parsed.Choices[0].Message.Content), nil +} + +func chatCompletionsURL(base string) string { + base = strings.TrimRight(strings.TrimSpace(base), "/") + if strings.HasSuffix(base, "/v1") { + return base + "/chat/completions" + } + return base + "/v1/chat/completions" +} diff --git a/pkg/diagnose/llm_http_test.go b/pkg/diagnose/llm_http_test.go new file mode 100644 index 0000000..986707b --- /dev/null +++ b/pkg/diagnose/llm_http_test.go @@ -0,0 +1,75 @@ +package diagnose + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "strings" + "testing" +) + +func TestHTTPLLMClient_Diagnose(t *testing.T) { + t.Parallel() + + var gotReq chatCompletionRequest + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/v1/chat/completions" { + t.Fatalf("path: got %s", r.URL.Path) + } + if auth := r.Header.Get("Authorization"); auth != "Bearer test-key" { + t.Fatalf("authorization: got %q", auth) + } + if err := json.NewDecoder(r.Body).Decode(&gotReq); err != nil { + t.Fatalf("decode request: %v", err) + } + _ = json.NewEncoder(w).Encode(chatCompletionResponse{ + Choices: []struct { + Message chatMessage `json:"message"` + }{{Message: chatMessage{Role: "assistant", Content: "Likely mount failure on fuse pod."}}}, + }) + })) + defer server.Close() + + client := &HTTPLLMClient{HTTPClient: server.Client()} + analysis, err := client.Diagnose(context.Background(), LLMRequest{ + Endpoint: server.URL, + APIKey: "test-key", + Model: "test-model", + Prompt: "## Instructions\n- diagnose only\n\n## Dataset\n- Name: demo\n", + }) + if err != nil { + t.Fatalf("Diagnose: %v", err) + } + if analysis != "Likely mount failure on fuse pod." { + t.Fatalf("analysis: got %q", analysis) + } + if gotReq.Model != "test-model" { + t.Fatalf("model: got %q", gotReq.Model) + } + if len(gotReq.Messages) != 2 { + t.Fatalf("messages: got %d", len(gotReq.Messages)) + } + if gotReq.Messages[0].Role != "system" || !strings.Contains(gotReq.Messages[0].Content, "diagnosis only") { + t.Fatalf("system message: %+v", gotReq.Messages[0]) + } + if gotReq.Messages[1].Role != "user" || !strings.Contains(gotReq.Messages[1].Content, "## Dataset") { + t.Fatalf("user message: %+v", gotReq.Messages[1]) + } +} + +func TestChatCompletionsURL(t *testing.T) { + t.Parallel() + + cases := map[string]string{ + "https://api.openai.com": "https://api.openai.com/v1/chat/completions", + "https://api.openai.com/": "https://api.openai.com/v1/chat/completions", + "https://api.openai.com/v1": "https://api.openai.com/v1/chat/completions", + "https://api.openai.com/v1/": "https://api.openai.com/v1/chat/completions", + } + for in, want := range cases { + if got := chatCompletionsURL(in); got != want { + t.Fatalf("%q: got %q want %q", in, got, want) + } + } +} diff --git a/pkg/diagnose/prompt.go b/pkg/diagnose/prompt.go index c0fc1b3..6f95b09 100644 --- a/pkg/diagnose/prompt.go +++ b/pkg/diagnose/prompt.go @@ -42,11 +42,33 @@ func ContextAsJSON(ctx *DiagnosticContext) ([]byte, error) { return json.MarshalIndent(ctx, "", " ") } +const modelInstructionsText = `- Focus on diagnosis only: unhealthy signals, evidence correlation, ranked hypotheses, and uncertainties. +- Do not provide remediation instructions, shell commands, or kubectl/helm operations. +- If data is insufficient, state what additional observations would increase confidence.` + func writeModelInstructions(b *strings.Builder) { b.WriteString("## Instructions\n") - b.WriteString("- Focus on diagnosis only: unhealthy signals, evidence correlation, ranked hypotheses, and uncertainties.\n") - b.WriteString("- Do not provide remediation instructions, shell commands, or kubectl/helm operations.\n") - b.WriteString("- If data is insufficient, state what additional observations would increase confidence.\n\n") + b.WriteString(modelInstructionsText) + b.WriteString("\n\n") +} + +// SplitPromptForChat separates system instructions from user diagnostic content. +func SplitPromptForChat(prompt string) (system string, user string) { + system = "You are a Kubernetes and Fluid storage expert assisting with dataset diagnosis.\n\n" + modelInstructionsText + marker := "## Instructions" + if idx := strings.Index(prompt, marker); idx >= 0 { + rest := prompt[idx+len(marker):] + if nl := strings.Index(rest, "\n"); nl >= 0 { + rest = strings.TrimLeft(rest[nl:], "\n") + } + user = strings.TrimSpace(rest) + } else { + user = strings.TrimSpace(prompt) + } + if user == "" { + user = prompt + } + return system, user } func writeDatasetSection(b *strings.Builder, ds DatasetContext) { diff --git a/pkg/diagnose/prompt_test.go b/pkg/diagnose/prompt_test.go new file mode 100644 index 0000000..4040f8b --- /dev/null +++ b/pkg/diagnose/prompt_test.go @@ -0,0 +1,79 @@ +package diagnose + +import ( + "os" + "path/filepath" + "strings" + "testing" + "time" + + fluidv1alpha1 "github.com/fluid-cloudnative/fluid/api/v1alpha1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +func TestFormatPrompt_Golden(t *testing.T) { + t.Parallel() + + now := time.Date(2026, 5, 20, 12, 0, 0, 0, time.UTC) + ctx := BuildContext(BuildContextInput{ + GeneratedAt: now, + Dataset: &fluidv1alpha1.Dataset{ + ObjectMeta: metav1.ObjectMeta{Name: "demo", Namespace: "default"}, + Status: fluidv1alpha1.DatasetStatus{Phase: fluidv1alpha1.BoundDatasetPhase}, + }, + Pods: []corev1.Pod{ + { + ObjectMeta: metav1.ObjectMeta{Name: "fuse", Namespace: "default"}, + Status: corev1.PodStatus{ + Phase: corev1.PodRunning, + ContainerStatuses: []corev1.ContainerStatus{ + {Name: "fuse", Ready: true, RestartCount: 0}, + }, + }, + }, + }, + Events: []corev1.Event{ + { + Type: corev1.EventTypeWarning, + Reason: "FailedMount", + Message: "mount failed", + LastTimestamp: metav1.Time{Time: now}, + InvolvedObject: corev1.ObjectReference{Kind: "Pod", Name: "fuse"}, + }, + }, + }) + + got := FormatPrompt(ctx) + golden := filepath.Join("testdata", "prompt_golden.txt") + if os.Getenv("UPDATE_GOLDEN") == "1" { + if err := os.MkdirAll(filepath.Dir(golden), 0o755); err != nil { + t.Fatalf("mkdir testdata: %v", err) + } + if err := os.WriteFile(golden, []byte(got), 0o644); err != nil { + t.Fatalf("write golden: %v", err) + } + } + + wantBytes, err := os.ReadFile(golden) + if err != nil { + // First run without golden file: assert key sections instead + for _, section := range []string{ + "Version: fluid-diagnose-prompt/v1", + "## Instructions", + "## Dataset", + "## Pods", + "## Events", + "## Summary", + "Do not provide remediation", + } { + if !strings.Contains(got, section) { + t.Fatalf("prompt missing %q:\n%s", section, got) + } + } + return + } + if got != string(wantBytes) { + t.Fatalf("prompt mismatch with golden file %s (run UPDATE_GOLDEN=1 go test to refresh)", golden) + } +} diff --git a/pkg/diagnose/runner.go b/pkg/diagnose/runner.go index dbd7ecd..8feb926 100644 --- a/pkg/diagnose/runner.go +++ b/pkg/diagnose/runner.go @@ -49,6 +49,14 @@ type Options struct { NoLogs bool IncludeControllerLogs bool Since string + + // AI-assisted diagnosis + PromptFile string + LLMEndpoint string + LLMAPIKey string + LLMModel string + LLMSkip bool + Stderr io.Writer } type Result struct { @@ -56,6 +64,9 @@ type Result struct { ArchivePath string PartialFailureCount int Stdout string + ContextPath string + PromptPath string + LLMAnalysisPath string } type Runner struct { @@ -129,6 +140,7 @@ func (r *Runner) Run(ctx context.Context, opts Options) (*Result, error) { r.writeYAML(baseDir, "dataset.yaml", dataset, m) r.writeText(baseDir, "dataset.describe.txt", describeDataset(dataset), m) + var runtimeObjs []*unstructured.Unstructured for _, rt := range dataset.Status.Runtimes { runtimeNS := rt.Namespace if runtimeNS == "" { @@ -151,6 +163,7 @@ func (r *Runner) Run(ctx context.Context, opts Options) (*Result, error) { } r.writeYAML(baseDir, rtPathPrefix+".yaml", u.Object, m) r.writeText(baseDir, rtPathPrefix+".describe.txt", describeUnstructuredRuntime(u), m) + runtimeObjs = append(runtimeObjs, u.DeepCopy()) } inspector := inspect.New(r.client) @@ -162,6 +175,7 @@ func (r *Runner) Run(ctx context.Context, opts Options) (*Result, error) { seenPods := map[string]struct{}{} seenPVCs := map[string]struct{}{} seenPVs := map[string]struct{}{} + var collectedPods []corev1.Pod for _, rr := range report.Runtimes { for _, row := range rr.Resources { @@ -172,7 +186,9 @@ func (r *Runner) Run(ctx context.Context, opts Options) (*Result, error) { continue } seenPods[key] = struct{}{} - r.collectPod(ctx, baseDir, row.Namespace, row.Name, opts, m) + if pod, ok := r.collectPod(ctx, baseDir, row.Namespace, row.Name, opts, m); ok { + collectedPods = append(collectedPods, pod) + } case "PVC": key := row.Namespace + "/" + row.Name if _, ok := seenPVCs[key]; ok { @@ -202,7 +218,7 @@ func (r *Runner) Run(ctx context.Context, opts Options) (*Result, error) { r.collectPV(ctx, baseDir, canonicalPVName, m) } - r.collectEvents(ctx, baseDir, opts, m) + collectedEvents := r.collectEvents(ctx, baseDir, opts, m) if opts.IncludeControllerLogs && !opts.NoLogs { r.collectControllerLogs(ctx, baseDir, opts, m) } @@ -226,9 +242,26 @@ func (r *Runner) Run(ctx context.Context, opts Options) (*Result, error) { return nil, fmt.Errorf("writing manifest: %w", err) } + aiPaths, err := writeAIOutputs(ctx, baseDir, BuildContextInput{ + GeneratedAt: r.nowFn(), + Dataset: dataset, + Report: report, + RuntimeObjs: runtimeObjs, + Pods: collectedPods, + Events: collectedEvents, + NoLogs: opts.NoLogs, + Since: opts.Since, + }, opts, opts.Stderr) + if err != nil { + return nil, err + } + result := &Result{ OutputPath: baseDir, PartialFailureCount: failures, + ContextPath: aiPaths.ContextPath, + PromptPath: aiPaths.PromptPath, + LLMAnalysisPath: aiPaths.AnalysisPath, } if opts.Archive { archivePath := baseDir + ".tar.gz" @@ -343,24 +376,78 @@ func (r *Runner) runStdout(ctx context.Context, opts Options) (*Result, error) { } } - return &Result{ - Stdout: b.String(), - }, nil + stdout := b.String() + result := &Result{Stdout: stdout} + + if strings.TrimSpace(opts.PromptFile) != "" { + var runtimeObjs []*unstructured.Unstructured + for _, rt := range dataset.Status.Runtimes { + runtimeNS := rt.Namespace + if runtimeNS == "" { + runtimeNS = opts.Namespace + } + u := &unstructured.Unstructured{} + u.SetGroupVersionKind(schema.GroupVersionKind{ + Group: "data.fluid.io", + Version: "v1alpha1", + Kind: rt.Type, + }) + if err := r.client.Get(ctx, types.NamespacedName{Name: rt.Name, Namespace: runtimeNS}, u); err == nil { + runtimeObjs = append(runtimeObjs, u.DeepCopy()) + } + } + + var collectedPods []corev1.Pod + for _, rr := range report.Runtimes { + for _, row := range rr.Resources { + if row.Kind != "Pod" { + continue + } + pod := &corev1.Pod{} + if err := r.client.Get(ctx, types.NamespacedName{Name: row.Name, Namespace: row.Namespace}, pod); err == nil { + collectedPods = append(collectedPods, *pod) + } + } + } + + var collectedEvents []corev1.Event + if evList, err := r.kubeClient.CoreV1().Events(opts.Namespace).List(ctx, metav1.ListOptions{}); err == nil { + collectedEvents = filterEvents(evList.Items, opts.Since, r.nowFn()) + } + + aiPaths, err := writeAIOutputs(ctx, "", BuildContextInput{ + GeneratedAt: r.nowFn(), + Dataset: dataset, + Report: report, + RuntimeObjs: runtimeObjs, + Pods: collectedPods, + Events: collectedEvents, + NoLogs: opts.NoLogs, + Since: opts.Since, + }, opts, opts.Stderr) + if err != nil { + return nil, err + } + result.PromptPath = aiPaths.PromptPath + result.LLMAnalysisPath = aiPaths.AnalysisPath + } + + return result, nil } -func (r *Runner) collectPod(ctx context.Context, baseDir, namespace, name string, opts Options, m *manifest) { +func (r *Runner) collectPod(ctx context.Context, baseDir, namespace, name string, opts Options, m *manifest) (corev1.Pod, bool) { podPrefix := fmt.Sprintf("pods/%s/%s", namespace, name) pod := &corev1.Pod{} if err := r.client.Get(ctx, types.NamespacedName{Name: name, Namespace: namespace}, pod); err != nil { m.Artifacts = append(m.Artifacts, manifestEntry{Path: podPrefix + "/pod.yaml", Status: "failed", Reason: err.Error()}) - return + return corev1.Pod{}, false } r.writeYAML(baseDir, podPrefix+"/pod.yaml", pod, m) r.writeText(baseDir, podPrefix+"/describe.txt", describePod(pod), m) if opts.NoLogs { m.Artifacts = append(m.Artifacts, manifestEntry{Path: podPrefix + "/logs.txt", Status: "skipped", Reason: "--no-logs enabled"}) - return + return *pod, true } var sincePtr *metav1.Time @@ -388,6 +475,7 @@ func (r *Runner) collectPod(ctx context.Context, baseDir, namespace, name string b.WriteString("\n") } r.writeText(baseDir, podPrefix+"/logs.txt", b.String(), m) + return *pod, true } func copyPodLog(b *strings.Builder, stream io.ReadCloser) { @@ -418,20 +506,26 @@ func (r *Runner) collectPV(ctx context.Context, baseDir, name string, m *manifes r.writeText(baseDir, fmt.Sprintf("storage/pv-%s.describe.txt", name), describePV(pv), m) } -func (r *Runner) collectEvents(ctx context.Context, baseDir string, opts Options, m *manifest) { +func (r *Runner) collectEvents(ctx context.Context, baseDir string, opts Options, m *manifest) []corev1.Event { evList, err := r.kubeClient.CoreV1().Events(opts.Namespace).List(ctx, metav1.ListOptions{}) if err != nil { m.Artifacts = append(m.Artifacts, manifestEntry{Path: "events/events.yaml", Status: "failed", Reason: err.Error()}) - return + return nil } - items := make([]corev1.Event, 0, len(evList.Items)) + items := filterEvents(evList.Items, opts.Since, r.nowFn()) + r.writeYAML(baseDir, "events/events.yaml", map[string]any{"items": items}, m) + return items +} + +func filterEvents(events []corev1.Event, since string, now time.Time) []corev1.Event { + items := make([]corev1.Event, 0, len(events)) sinceCutoff := time.Time{} - if opts.Since != "" { - if d, err := time.ParseDuration(opts.Since); err == nil { - sinceCutoff = r.nowFn().Add(-d) + if since != "" { + if d, err := time.ParseDuration(since); err == nil { + sinceCutoff = now.Add(-d) } } - for _, ev := range evList.Items { + for _, ev := range events { eventTs := eventTimestamp(ev) if !sinceCutoff.IsZero() && !eventTs.IsZero() && eventTs.Before(sinceCutoff) { continue @@ -441,7 +535,7 @@ func (r *Runner) collectEvents(ctx context.Context, baseDir string, opts Options sort.Slice(items, func(i, j int) bool { return items[i].LastTimestamp.Time.Before(items[j].LastTimestamp.Time) }) - r.writeYAML(baseDir, "events/events.yaml", map[string]any{"items": items}, m) + return items } func eventTimestamp(ev corev1.Event) time.Time { diff --git a/pkg/diagnose/runner_test.go b/pkg/diagnose/runner_test.go index a7f4092..bcc25a3 100644 --- a/pkg/diagnose/runner_test.go +++ b/pkg/diagnose/runner_test.go @@ -250,3 +250,54 @@ func TestCollectPVC_PathIncludesNamespace(t *testing.T) { } } } + +func TestRun_WritesAIOutputs(t *testing.T) { + t.Parallel() + + scheme := runtime.NewScheme() + if err := fluidv1alpha1.AddToScheme(scheme); err != nil { + t.Fatalf("add fluid scheme: %v", err) + } + if err := corev1.AddToScheme(scheme); err != nil { + t.Fatalf("add corev1 scheme: %v", err) + } + + dataset := &fluidv1alpha1.Dataset{ + ObjectMeta: metav1.ObjectMeta{ + Name: "demo", + Namespace: "default", + }, + } + + c := ctrlclientfake.NewClientBuilder().WithScheme(scheme).WithObjects(dataset).Build() + kubeClient := kubefake.NewSimpleClientset() + runner := NewRunner(c, kubeClient) + runner.nowFn = func() time.Time { return time.Date(2026, 5, 20, 12, 0, 0, 0, time.UTC) } + + outputDir := t.TempDir() + result, err := runner.Run(context.Background(), Options{ + DatasetName: "demo", + Namespace: "default", + Output: "dir", + OutputDir: outputDir, + NoLogs: true, + }) + if err != nil { + t.Fatalf("Run: %v", err) + } + + contextPath := filepath.Join(outputDir, "context.json") + if _, err := os.Stat(contextPath); err != nil { + t.Fatalf("context.json missing: %v", err) + } + promptPath := filepath.Join(outputDir, "prompt.txt") + if _, err := os.Stat(promptPath); err != nil { + t.Fatalf("prompt.txt missing: %v", err) + } + if result.ContextPath != contextPath { + t.Fatalf("ContextPath: got %q want %q", result.ContextPath, contextPath) + } + if result.PromptPath != promptPath { + t.Fatalf("PromptPath: got %q want %q", result.PromptPath, promptPath) + } +}