From 92be9b7b8751ffe44f707cf03e0fe80f5b4ff0b4 Mon Sep 17 00:00:00 2001 From: casey-coreweave Date: Thu, 30 Apr 2026 13:52:01 -0700 Subject: [PATCH] fix(tilt): Full tilt integration rewrite --- .gitignore | 3 +- README.md | 46 +- Tiltfile | 934 +++++++++++++++++++------------- docs/design/wandb_v2/tilt.md | 231 +++----- docs/monitoring.md | 9 +- hack/scripts/tilt-dev-clean.sh | 8 +- hack/tilt/endpoint-anchors.yaml | 39 -- hack/tilt/wandbcr/main.go | 422 +++++++++++++++ hack/tilt/wandbcr/main_test.go | 273 ++++++++++ tilt-settings.sample.star | 70 +-- 10 files changed, 1414 insertions(+), 621 deletions(-) delete mode 100644 hack/tilt/endpoint-anchors.yaml create mode 100644 hack/tilt/wandbcr/main.go create mode 100644 hack/tilt/wandbcr/main_test.go diff --git a/.gitignore b/.gitignore index 7057c279..b6968e79 100644 --- a/.gitignore +++ b/.gitignore @@ -41,8 +41,7 @@ junit.xml tilt-settings.json tilt-settings.star -hack/testing-manifests/wandb/.generated/kustomization.yaml -hack/testing-manifests/wandb/.generated/wandb-cr.yaml +hack/testing-manifests/wandb/.generated/*.yaml /tilt_bin /dist diff --git a/README.md b/README.md index 20aedfbf..53ffb948 100644 --- a/README.md +++ b/README.md @@ -35,8 +35,7 @@ kind create cluster This will create a new kind cluster with the name `kind`. The kubernetes context will be called `kind-kind`. -Alternatively, you can use the provided scripts to manage the kind cluster, uses kindClusterName from -`tilt-settings.star`, if present. +Alternatively, you can use the provided scripts to manage the kind cluster. ```bash # Create cluster @@ -74,9 +73,36 @@ brew install kustomize #### Tilt Settings -There are settings for Tilt that can be configured using a `tilt-settings.star` file. The settings file is not checked -into source control. A sample settings file is provided in `tilt-settings.sample.star`. To use the sample settings file, -copy it to `tilt-settings.star` +Tilt reads local settings from `tilt-settings.star`. The file is not checked +into source control; start from `tilt-settings.sample.star` and keep local +overrides there. + +The default Tilt setup follows the normal operator install path: + +- installs one `wandb-operator` Helm release in `wandb-operators` +- builds the local controller image as `controller:latest` +- creates a `WeightsAndBiases` CR in `wandb` +- uses `networkMode="gateway"` with `http://localhost:8080` +- uses the published server manifest repository by default +- keeps telemetry off unless `observabilityMode="full"` is set + +Common W&B CR settings are scalar values such as `wandbHostname`, +`wandbVersion`, `size`, `retentionPolicy`, `licenseFile`, `manifestSource`, +and `networkMode`. +Set `networkMode="ingress"` to use the local ingress-nginx path instead of +Gateway API; if `wandbHostname` is not set explicitly, ingress mode uses +`http://wandb.localhost:8080`. + +Tilt defaults `manifestSource="published"`, which leaves +`spec.wandb.manifestRepository` empty so the W&B CR webhook applies the same +published OCI repository default as production installs. To test repo-local server manifest +definitions, set `manifestSource="local"` and keep +`localManifestPath="hack/testing-manifests/server-manifest"`. The default local +manifest path currently contains `0.79.0`, so also set `wandbVersion="0.79.0"` +when using that local source. + +Use `crFile` for custom CR shapes; Tilt treats it as a base CR and still +applies the scalar settings above. By default, Tilt is configured to only allow connections to the following Kubernetes contexts: @@ -84,6 +110,7 @@ By default, Tilt is configured to only allow connections to the following Kubern - `kind-kind` - `kind-wandb-operator` - `minikube` +- `orbstack` Please add any additional contexts to the `allowedContexts` list in your `tilt-settings.star` file. @@ -99,7 +126,8 @@ tilt up fully reset the cluster. The following are expected to survive a normal `tilt down`: - `cert-manager` and its namespace -- operator CRDs, including the W&B CRDs and third-party operator CRDs +- operator CRDs, including the W&B CRDs and operator dependency CRDs +- `wandb-operators` and dependency namespaces - dev PVC-backed data unless the backing operator deletes it For a true dev reset, use the helper script instead: @@ -123,11 +151,11 @@ then run `tilt down`. ### Locally testing external infra -1. Install the WandB CR with Tilt **without** the `purge-retention` `wandbOverlay` in `tilt-settings.star`. +1. Install the WandB CR with Tilt using the default `retentionPolicy="detach"` in `tilt-settings.star`. 2. Delete the WandB CR — infra should be detached but remain in place. 3. Run `./hack/scripts/managed-connections-to-external.sh` to convert the managed connection secrets into external ones. -4. Install the WandB CR with Tilt with the following `wandbOverlay`s: `external-mysql`, `external-redis`, -`external-kafka`, `external-objectstore`, `external-clickhouse`. +4. Install the WandB CR with Tilt using a custom `crFile` that points at a CR + with the external infra connection specs. 5. WandB should now run with externally managed infra. ### Counterfeiter diff --git a/Tiltfile b/Tiltfile index 005747d3..5adbc1f5 100644 --- a/Tiltfile +++ b/Tiltfile @@ -1,27 +1,55 @@ -# default values +# Local operator development. +# +# Tilt keeps the fast local controller loop while installing the operator +# through the same Helm chart path as a normal install. + +GENERATED_DIR = "hack/testing-manifests/wandb/.generated" +GENERATED_WANDB_CR = GENERATED_DIR + "/tilt-wandb-cr.yaml" +GENERATED_OPERATOR_VALUES = GENERATED_DIR + "/tilt-operator-values.yaml" + +GATEWAY_API_CRDS_URL = "https://github.com/kubernetes-sigs/gateway-api/releases/download/v1.4.0/standard-install.yaml" +IMG = "controller:latest" + +GROUP_DEPENDENCIES = "Dependencies" +GROUP_WANDB_APP = "Wandb-App" +GROUP_TELEMETRY = "Telemetry" +GROUP_WANDB_OPERATOR = "Wandb-Operator" + settings = { "allowedContexts": [ "docker-desktop", "minikube", "kind-kind", + "kind-wandb-operator", "orbstack", - "crc-admin", ], - "installWandb": True, - "wandbCR": "hack/testing-manifests/wandb/.generated/wandb-cr.yaml", - "wandbOverlays": [], - "installTelemetry": True, - "installIngressNginx": True, - "installNginxGateway": True, - "logFormat": "pretty", # pretty, text, json - "openshiftSCC": False, + + # Operator install settings. + "operatorNamespace": "wandb-operators", + + # W&B CR settings. + "includeCR": True, + "crFile": "", + "wandbName": "wandb", + "wandbNamespace": "wandb", + "wandbHostname": "http://localhost:8080", + "wandbVersion": "0.80.0", + "size": "dev", + "retentionPolicy": "detach", "licenseFile": "", -} + "manifestSource": "published", # published or local + "localManifestPath": "hack/testing-manifests/server-manifest", + "networkMode": "gateway", # gateway or ingress + "gatewayClass": "nginx", + "ingressClass": "nginx", + "createCA": True, + "issuerName": "", -GENERATED_WANDB_CR = "hack/testing-manifests/wandb/.generated/wandb-cr.yaml" -LICENSE_PATCH = "hack/testing-manifests/wandb/kustomize/overlays/license-file/patch.yaml" -LOCAL_INGRESS_OVERLAY = "networking-ingress-local" -LOCAL_GATEWAY_OVERLAY = "networking-gateway-local" + # off, full, forward. + "observabilityMode": "off", + + "logFormat": "pretty", # pretty, text, json +} if os.path.exists("tilt-settings.json"): fail("tilt-settings.json is no longer supported. Migrate to tilt-settings.star (see tilt-settings.sample.star).") @@ -32,96 +60,167 @@ if not os.path.exists("tilt-settings.star"): load("./tilt-settings.star", "SETTINGS") settings.update(SETTINGS) -# Configure global watch settings with a 2-second debounce -watch_settings(ignore=["**/.git", "**/*.out", "hack/testing-manifests/wandb/.generated/**"]) -# Increase timeout for helm installations and apply operations -update_settings(k8s_upsert_timeout_secs=300) +def warn(message): + print("WARNING: " + message) -currentContext = k8s_context() -if currentContext in settings.get("allowedContexts"): - print("Context is allowed") -else: - fail("Selected context is not in allow list") +def as_bool(value): + if value == True: + return True + if value == False or value == None: + return False + return str(value).lower() in ["true", "yes", "1", "on"] -allow_k8s_contexts(settings.get("allowed_k8s_contexts")) -IS_CRC = 'crc' in currentContext or 'api-crc-testing' in currentContext -if IS_CRC: - settings['openshiftSCC'] = True - default_registry( - 'default-route-openshift-image-registry.apps-crc.testing/operator-system', - host_from_cluster='image-registry.openshift-image-registry.svc:5000/operator-system', - ) +def bool_string(value): + if value: + return "true" + return "false" -os.putenv('PATH', './bin:' + os.getenv('PATH')) -load('ext://restart_process', 'docker_build_with_restart') -load('ext://helm_resource', 'helm_repo', 'helm_resource') +def normalize_observability_mode(): + mode = str(settings.get("observabilityMode", "off")).lower() + if mode in ["off", "full", "forward"]: + return mode -DOCKERFILE = ''' -FROM registry.access.redhat.com/ubi9/ubi + fail("observabilityMode must be one of: off, full, forward") -ADD tilt_bin/manager /manager -ADD hack/testing-manifests/server-manifest /server-manifest -RUN mkdir -p /helm/.cache/helm /helm/.config/helm /helm/.local/share/helm +def normalize_network_mode(): + mode = str(settings.get("networkMode", "gateway")).lower() + if mode in ["gateway", "ingress"]: + return mode -ENV HELM_CACHE_HOME=/helm/.cache/helm -ENV HELM_CONFIG_HOME=/helm/.config/helm -ENV HELM_DATA_HOME=/helm/.local/share/helm -''' + fail("networkMode must be one of: gateway, ingress") -DOCKERFILE_OPENSHIFT = ''' -FROM registry.access.redhat.com/ubi9/ubi -ADD tilt_bin/manager /manager -ADD hack/testing-manifests/server-manifest /server-manifest +def normalize_manifest_source(): + source = str(settings.get("manifestSource", "published")).lower() + if source in ["published", "local"]: + return source -RUN mkdir -p /helm/.cache/helm /helm/.config/helm /helm/.local/share/helm && \ - chgrp -R 0 /helm && chmod -R g=u /helm + fail("manifestSource must be one of: published, local") -ENV HELM_CACHE_HOME=/helm/.cache/helm -ENV HELM_CONFIG_HOME=/helm/.config/helm -ENV HELM_DATA_HOME=/helm/.local/share/helm -USER 1001 -''' -DOMAIN = "wandb.com" -GROUP = "apps" -VERSION = "v1" -KIND = "wandb" -IMG = 'controller:latest' -CONTROLLERGEN = 'rbac:roleName=manager-role crd:allowDangerousTypes=true,generateEmbeddedObjectMeta=true,maxDescLen=0 webhook paths="{./api/v1,./api/v2,./internal/controller/...}" output:crd:artifacts:config=config/crd/bases' -DISABLE_SECURITY_CONTEXT = True +def shell_quote(value): + return "'" + str(value).replace("'", "'\"'\"'") + "'" + + +def write_file_cmd(path, contents): + return "cat > %s <<'EOF'\n%s\nEOF" % (path, contents) + + +def k8s_yaml_object(obj): + k8s_yaml(encode_yaml(obj)) + + +def repo_path(path): + path = str(path) + if path.startswith("./"): + return path + return "./" + path + + +def validate_local_manifest_path(path): + path = str(path) + if path.startswith("/") or path.startswith("../") or "/../" in path or path == "..": + fail("localManifestPath must be a repo-relative Docker build-context path.") + if " " in path: + fail("localManifestPath cannot contain spaces because it is used in a Dockerfile ADD instruction.") + if not os.path.exists(path): + fail("manifestSource='local' requires localManifestPath to exist: %s" % path) + return path -GROUP_WANDB_APP = "Wandb-App" -GROUP_TELEMETRY = "Telemetry" -GROUP_WANDB_OPERATOR = "Wandb-Operator" -GROUP_THIRD_PARTY_OPERATORS = "Third-Party-Operators" -def manifests(): - return 'make manifests' +def write_generated_yaml(path, obj): + local("mkdir -p " + GENERATED_DIR) + local(write_file_cmd(path, encode_yaml(obj))) + return path -def generate(): - return 'make generate' +def helm_supports_take_ownership(): + return str(local("helm upgrade --help | grep -q -- '--take-ownership' && echo true || echo false")).strip() == "true" -def vetfmt(): - return 'go vet ./...; go fmt ./...' +def url_host(url): + rest = str(url) + if "://" in rest: + rest = rest.split("://", 1)[1] + host_port = rest.split("/", 1)[0] + if "@" in host_port: + host_port = host_port.split("@", 1)[1] + parts = host_port.split(":") + return parts[0] + + +def url_port(url): + rest = str(url) + if "://" in rest: + rest = rest.split("://", 1)[1] + host_port = rest.split("/", 1)[0] + parts = host_port.split(":") + if len(parts) > 1: + return int(parts[len(parts) - 1]) + if str(url).startswith("https://"): + return 443 + return 80 + + +settings["networkMode"] = normalize_network_mode() +settings["observabilityMode"] = normalize_observability_mode() +settings["manifestSource"] = normalize_manifest_source() +if settings["manifestSource"] == "local": + settings["localManifestPath"] = validate_local_manifest_path(settings.get("localManifestPath")) + +watch_settings(ignore=["**/.git", "**/*.out", GENERATED_DIR + "/**"]) +update_settings(k8s_upsert_timeout_secs=300) + +currentContext = k8s_context() +if currentContext in settings.get("allowedContexts"): + print("Context is allowed") +else: + fail("Selected context is not in allow list") + +allow_k8s_contexts(settings.get("allowedContexts")) + +os.putenv("PATH", "./bin:" + os.getenv("PATH")) + +load("ext://restart_process", "docker_build_with_restart") +load("ext://helm_resource", "helm_repo", "helm_resource") + +def operator_dockerfile(): + lines = [ + "FROM registry.access.redhat.com/ubi9/ubi", + "", + "ADD tilt_bin/manager /manager", + ] + + if settings.get("manifestSource") == "local": + lines.append("ADD %s /server-manifest" % settings.get("localManifestPath")) + + lines += [ + "", + "RUN mkdir -p /helm/.cache/helm /helm/.config/helm /helm/.local/share/helm", + "", + "ENV HELM_CACHE_HOME=/helm/.cache/helm", + "ENV HELM_CONFIG_HOME=/helm/.config/helm", + "ENV HELM_DATA_HOME=/helm/.local/share/helm", + "", + ] + + return "\n".join(lines) -# build to tilt_bin because kubebuilder has a dockerignore for bin/ def binary(): - return 'CGO_ENABLED=0 GOOS=linux GO111MODULE=on go build -o tilt_bin/manager cmd/main.go' + return "CGO_ENABLED=0 GOOS=linux GO111MODULE=on go build -o tilt_bin/manager cmd/main.go" -def managed_endpoint_resource(name, anchor_object, deps, local_port, remote_port, link_name, pod_selector, labels, local_host='localhost'): + +def managed_endpoint_resource(name, anchor_object, deps, local_port, remote_port, link_name, pod_selector, labels, local_host="localhost"): k8s_resource( new_name=name, objects=[anchor_object], - discovery_strategy='selectors-only', + discovery_strategy="selectors-only", extra_pod_selectors=[pod_selector], resource_deps=deps, port_forwards=[ @@ -130,411 +229,480 @@ def managed_endpoint_resource(name, anchor_object, deps, local_port, remote_port labels=labels, ) -GENERATED_DIR = 'hack/testing-manifests/wandb/.generated' -def build_wandb_cr(): - local('mkdir -p ' + GENERATED_DIR) - overlays = [] + settings.get('wandbOverlays', []) - if settings.get('licenseFile', ''): - overlays.append('license-file') - license = str(read_file(settings.get('licenseFile'))).strip().replace('\n', '\n ') - local("cat > %s << 'LEOF'\napiVersion: apps.wandb.com/v2\nkind: WeightsAndBiases\nmetadata:\n name: wandb\nspec:\n wandb:\n license: |-\n %s\nLEOF" % (LICENSE_PATCH, license)) +def endpoint_anchor(name): + return { + "apiVersion": "v1", + "kind": "ConfigMap", + "metadata": { + "name": name, + "namespace": "default", + }, + "data": { + "managed-by": "tilt", + }, + } + - components_lines = '' - for o in overlays: - components_lines += ' - ../kustomize/overlays/' + o + '\n' +def build_endpoint_anchors(names): + for name in names: + k8s_yaml_object(endpoint_anchor(name)) - kustomization = 'apiVersion: kustomize.config.k8s.io/v1beta1\nkind: Kustomization\nresources:\n - ../kustomize/base\n' - if components_lines: - kustomization += 'components:\n' + components_lines - local("cat > %s/kustomization.yaml << 'KEOF'\n%sKEOF" % (GENERATED_DIR, kustomization)) - local('kustomize build %s > %s/wandb-cr.yaml' % (GENERATED_DIR, GENERATED_DIR)) +def build_wandb_namespace(namespace): + k8s_yaml_object({ + "apiVersion": "v1", + "kind": "Namespace", + "metadata": { + "name": namespace, + "labels": { + "app.kubernetes.io/managed-by": "tilt", + }, + }, + }) -def local_networking_mode(): - build_wandb_cr() - wandbCR = settings.get('wandbCR') - crContent = read_yaml(wandbCR) - networkingMode = crContent.get('spec', {}).get('networking', {}).get('mode', '') - return networkingMode +def build_operator_values(telemetry_namespace): + telemetry_enabled = settings.get("observabilityMode") != "off" + grafana_enabled = settings.get("observabilityMode") == "full" -LOCAL_NETWORKING_MODE = local_networking_mode() + return write_generated_yaml(GENERATED_OPERATOR_VALUES, { + "wandb": { + "install": False, + }, + "wandb-operator": { + "image": { + "pullPolicy": "IfNotPresent", + }, + "containers": { + "operator": { + "command": [], + }, + }, + }, + "victoria-metrics-operator": { + "enabled": telemetry_enabled, + "admissionWebhooks": { + "enabled": False, + }, + }, + "grafana-operator": { + "enabled": grafana_enabled, + }, + "telemetry": { + "mode": settings.get("observabilityMode"), + "namespace": telemetry_namespace, + }, + }) -if LOCAL_NETWORKING_MODE == 'ingress' and not settings.get('installIngressNginx'): - fail('The networking-ingress-local overlay requires installIngressNginx=True.') -if LOCAL_NETWORKING_MODE == 'gateway' and not settings.get('installNginxGateway'): - fail('The networking-gateway-local overlay requires installNginxGateway=True.') +def helper_flag(name, value): + if value == None or value == "": + return "" + return " --%s %s" % (name, shell_quote(value)) -installed = local("which kubebuilder") -print("kubebuilder is present:", installed) +def helper_bool_flag(name, value): + return " --%s=%s" % (name, bool_string(as_bool(value))) -DIRNAME = os.path.basename(os. getcwd()) -local_resource("Operator-Manifests", manifests(), labels=[GROUP_WANDB_OPERATOR]) -local_resource("Operator-Generate", generate(), labels=[GROUP_WANDB_OPERATOR]) +def build_wandb_cr(): + cmd = "go run ./hack/tilt/wandbcr" + cmd += helper_flag("out", GENERATED_WANDB_CR) + cmd += helper_flag("cr-file", settings.get("crFile", "")) + cmd += helper_flag("name", settings.get("wandbName")) + cmd += helper_flag("namespace", settings.get("wandbNamespace")) + cmd += helper_flag("hostname", settings.get("wandbHostname")) + cmd += helper_flag("version", settings.get("wandbVersion")) + cmd += helper_flag("size", settings.get("size")) + cmd += helper_flag("retention-policy", settings.get("retentionPolicy")) + cmd += helper_flag("license-file", settings.get("licenseFile", "")) + cmd += helper_flag("manifest-source", settings.get("manifestSource")) + cmd += helper_flag("observability-mode", settings.get("observabilityMode")) + cmd += helper_flag("network-mode", settings.get("networkMode")) + cmd += helper_flag("gateway-class", settings.get("gatewayClass")) + cmd += helper_flag("ingress-class", settings.get("ingressClass")) + cmd += helper_bool_flag("create-ca", settings.get("createCA")) + cmd += helper_flag("issuer-name", settings.get("issuerName", "")) + local(cmd) + + return GENERATED_WANDB_CR + + +def build_wandb_ca(name, namespace): + root_cert_name = name + "-root-cert" + selfsigned_issuer_name = name + "-selfsigned-issuer" + ca_issuer_name = name + "-ca-issuer" + + k8s_yaml_object({ + "apiVersion": "cert-manager.io/v1", + "kind": "Issuer", + "metadata": { + "name": selfsigned_issuer_name, + "namespace": namespace, + }, + "spec": { + "selfSigned": {}, + }, + }) + k8s_yaml_object({ + "apiVersion": "cert-manager.io/v1", + "kind": "Certificate", + "metadata": { + "name": root_cert_name, + "namespace": namespace, + }, + "spec": { + "secretName": root_cert_name, + "isCA": True, + "commonName": "wandb-ca", + "duration": "210240h", + "issuerRef": { + "name": selfsigned_issuer_name, + "kind": "Issuer", + "group": "cert-manager.io", + }, + }, + }) + k8s_yaml_object({ + "apiVersion": "cert-manager.io/v1", + "kind": "Issuer", + "metadata": { + "name": ca_issuer_name, + "namespace": namespace, + }, + "spec": { + "ca": { + "secretName": root_cert_name, + }, + }, + }) + k8s_resource( + new_name="WandB-CA", + objects=[ + "%s:issuer:%s" % (selfsigned_issuer_name, namespace), + "%s:certificate:%s" % (root_cert_name, namespace), + "%s:issuer:%s" % (ca_issuer_name, namespace), + ], + resource_deps=["cert-manager"], + labels=[GROUP_WANDB_APP], + ) -helm_resource( - 'cert-manager', - chart='oci://quay.io/jetstack/charts/cert-manager', - namespace='cert-manager', - flags=[ - '--create-namespace', - '--version=v1.20.2', - '--set', - 'crds.enabled=true', - '--set', - 'config.enableGatewayAPI=true', - ], - labels=["Cert-Manager"], + +WANDB_CR = build_wandb_cr() if as_bool(settings.get("includeCR")) else "" +WANDB_CR_CONTENT = read_yaml(WANDB_CR) if as_bool(settings.get("includeCR")) else {} +WANDB_NAME = WANDB_CR_CONTENT.get("metadata", {}).get("name", settings.get("wandbName")) +WANDB_NAMESPACE = WANDB_CR_CONTENT.get("metadata", {}).get("namespace", settings.get("wandbNamespace")) +WANDB_HOSTNAME = WANDB_CR_CONTENT.get("spec", {}).get("wandb", {}).get("hostname", settings.get("wandbHostname")) +OPERATOR_VALUES = build_operator_values(WANDB_NAMESPACE) +LOCAL_NETWORKING_MODE = WANDB_CR_CONTENT.get("spec", {}).get("networking", {}).get("mode", settings.get("networkMode")) +CREATE_WANDB_NAMESPACE = as_bool(settings.get("includeCR")) or settings.get("observabilityMode") != "off" + +endpoint_anchors = [] +if as_bool(settings.get("includeCR")): + if LOCAL_NETWORKING_MODE in ["gateway", "ingress"]: + endpoint_anchors.append("wandb-endpoint-anchor") + +if settings.get("observabilityMode") == "full": + endpoint_anchors += [ + "telemetry-grafana-endpoint-anchor", + "telemetry-victoria-metrics-endpoint-anchor", + "telemetry-victoria-logs-endpoint-anchor", + "telemetry-victoria-traces-endpoint-anchor", + ] + +if endpoint_anchors: + build_endpoint_anchors(endpoint_anchors) + +if CREATE_WANDB_NAMESPACE: + build_wandb_namespace(WANDB_NAMESPACE) + k8s_resource( + new_name="WandB-Namespace", + objects=["%s:namespace" % WANDB_NAMESPACE], + labels=[GROUP_DEPENDENCIES], + ) + +local_resource( + "Operator-Codegen", + "make manifests generate", + labels=[GROUP_WANDB_OPERATOR], +) + +local_resource( + "Operator-Build", + binary(), + deps=["internal", "pkg", "api", "cmd"], + resource_deps=["Operator-Codegen"], + ignore=["*/*/zz_generated.deepcopy.go"], + labels=[GROUP_WANDB_OPERATOR], ) local_resource( - 'selfsigned-issuer', - 'kubectl apply -f hack/testing-manifests/cert-manager/selfsigned-issuer.yaml', - resource_deps=['cert-manager'], - labels=["Cert-Manager"], + "Operator-Chart-Deps", + "helm dependency build ./deploy/operator --skip-refresh", + deps=["deploy/operator/Chart.yaml", "deploy/operator/Chart.lock", "deploy/telemetry/Chart.yaml"], + labels=[GROUP_DEPENDENCIES], ) local_resource( - 'ca-certificate', - 'kubectl apply -f hack/testing-manifests/cert-manager/ca-certificate.yaml', - resource_deps=['cert-manager'], - labels=["Cert-Manager"], + "WandB-CRDs-Apply", + "kubectl apply --server-side=true --force-conflicts --field-manager=helm " + + "-f config/crd/bases/apps.wandb.com_applications.yaml " + + "-f config/crd/bases/apps.wandb.com_weightsandbiases.yaml", + resource_deps=["Operator-Codegen"], + labels=[GROUP_DEPENDENCIES], ) local_resource( - 'ca-issuer', - 'kubectl apply -f hack/testing-manifests/cert-manager/ca-issuer.yaml', - resource_deps=['cert-manager'], - labels=["Cert-Manager"], + "WandB-CRDs-Ready", + "kubectl wait --for=condition=established --timeout=120s " + + "crd/applications.apps.wandb.com " + + "crd/weightsandbiases.apps.wandb.com", + resource_deps=["WandB-CRDs-Apply"], + labels=[GROUP_DEPENDENCIES], ) -if settings.get("installNginxGateway"): +helm_resource( + "cert-manager", + chart="oci://quay.io/jetstack/charts/cert-manager", + namespace="cert-manager", + flags=[ + "--create-namespace", + "--version=v1.20.2", + "--set=crds.enabled=true", + "--set=config.enableGatewayAPI=true", + "--set=startupapicheck.enabled=false", + ], + labels=[GROUP_DEPENDENCIES], +) + +if LOCAL_NETWORKING_MODE == "gateway": local_resource( - 'gateway-api-crds', - 'kubectl apply -f https://github.com/kubernetes-sigs/gateway-api/releases/download/v1.2.1/standard-install.yaml', - labels=["Gateway"], + "gateway-api-crds", + "kubectl apply -f " + GATEWAY_API_CRDS_URL, + labels=[GROUP_DEPENDENCIES], ) + + nginx_gateway_flags = [ + "--create-namespace", + "--version=2.5.1", + ] + if currentContext.startswith("kind-"): + nginx_gateway_flags += [ + "--set=nginx.service.type=NodePort", + "--set=nginx.service.nodePorts[0].port=31437", + "--set=nginx.service.nodePorts[0].listenerPort=8080", + "--set=nginx.service.nodePorts[1].port=30478", + "--set=nginx.service.nodePorts[1].listenerPort=8443", + ] + helm_resource( - 'nginx-gateway-fabric', - chart='oci://ghcr.io/nginx/charts/nginx-gateway-fabric', - namespace='nginx-gateway', - flags=[ - '--create-namespace', - '--version=2.5.1', - ], - resource_deps=['gateway-api-crds'], - labels=["Gateway"], + "nginx-gateway-fabric", + chart="oci://ghcr.io/nginx/charts/nginx-gateway-fabric", + namespace="nginx-gateway", + flags=nginx_gateway_flags, + resource_deps=["gateway-api-crds"], + labels=[GROUP_DEPENDENCIES], ) -if settings.get("installIngressNginx"): +if LOCAL_NETWORKING_MODE == "ingress": helm_repo( - 'ingress-nginx', - 'https://kubernetes.github.io/ingress-nginx', - resource_name='ingress-nginx-repo', - labels=["Ingress"], + "ingress-nginx", + "https://kubernetes.github.io/ingress-nginx", + resource_name="ingress-nginx-repo", + labels=[GROUP_DEPENDENCIES], ) helm_resource( - 'ingress-nginx-controller', - chart='ingress-nginx/ingress-nginx', - release_name='ingress-nginx', - namespace='ingress-nginx', + "ingress-nginx-controller", + chart="ingress-nginx/ingress-nginx", + release_name="ingress-nginx", + namespace="ingress-nginx", flags=[ - '--create-namespace', - '--version=4.14.1', - '--set-string=controller.ingressClass=nginx', - '--set-string=controller.ingressClassResource.name=nginx', - '--set-string=controller.service.type=ClusterIP', + "--create-namespace", + "--version=4.14.1", + "--set-string=controller.ingressClass=%s" % settings.get("ingressClass"), + "--set-string=controller.ingressClassResource.name=%s" % settings.get("ingressClass"), + "--set-string=controller.service.type=ClusterIP", ], - resource_deps=['ingress-nginx-repo'], - labels=["Ingress"], + resource_deps=["ingress-nginx-repo"], + labels=[GROUP_DEPENDENCIES], ) -local_resource( - 'ThirdParty-Chart-Deps', - 'helm dependency update ./deploy/operator', - labels=[GROUP_THIRD_PARTY_OPERATORS], -) - -third_party_operator_flags = [ - '--set=wandb-operator.enabled=false', - '--set=telemetry.mode=off', - '--create-namespace', -] +operator_deps = ["Operator-Chart-Deps", "Operator-Build", "WandB-CRDs-Ready"] +operator_deps.append("cert-manager") +if LOCAL_NETWORKING_MODE == "gateway": + operator_deps.append("nginx-gateway-fabric") +if settings.get("observabilityMode") != "off": + operator_deps.append("WandB-Namespace") -if settings.get('openshiftSCC'): - third_party_operator_flags += [ - '--set=altinity-clickhouse-operator.crdHook.enabled=false', - ] +operator_flags = ["--create-namespace"] +if helm_supports_take_ownership(): + operator_flags.append("--take-ownership") +else: + warn("helm does not support --take-ownership; legacy CRD ownership may require Dev-Clean before the operator release can install.") -if settings.get("installTelemetry"): - third_party_operator_flags += [ - '--set=victoria-metrics-operator.enabled=true', - '--set=grafana-operator.enabled=true', - ] +operator_flags += [ + "-f", + OPERATOR_VALUES, +] helm_resource( - 'ThirdParty-Operators', - chart='./deploy/operator', - release_name='third-party-operators', - resource_deps=['ThirdParty-Chart-Deps', 'WandB-CRDs-Apply', 'cert-manager'], - namespace='wandb-operator', - flags=third_party_operator_flags, - labels=[GROUP_THIRD_PARTY_OPERATORS], -) - -KUSTOMIZE_OVERLAY = 'config/openshift-dev' if settings.get('openshiftSCC') else 'config/tilt-dev' -k8s_yaml(local('kustomize build ' + KUSTOMIZE_OVERLAY)) -k8s_yaml('hack/tilt/endpoint-anchors.yaml') - -k8s_resource( - new_name='Operator-Certs', - objects=[ - 'operator-system:namespace', - 'operator-metrics-certs:certificate', - 'operator-serving-cert:certificate', - 'operator-selfsigned-issuer:issuer', - ], - resource_deps=["cert-manager"], - # deploy_cert_manager() runs local() commands and registers no Tilt resource, - # so a resource_dep cannot be declared here. Tilt retries on failure. - labels=[GROUP_WANDB_OPERATOR], -) - -local_resource( - 'WandB-CRDs-Apply', - 'kubectl apply --server-side=true --force-conflicts --field-manager=helm ' + - '-f deploy/operator/crds/apps.wandb.com_applications.yaml ' + - '-f deploy/operator/crds/apps.wandb.com_weightsandbiases.yaml', - resource_deps=["ThirdParty-Chart-Deps"], - labels=[GROUP_WANDB_OPERATOR], -) - -k8s_resource( - new_name='Operator-RBAC', - objects=[ - 'operator-manager-role:clusterrole', - 'operator-manager-rolebinding:clusterrolebinding', - 'operator-leader-election-role:role', - 'operator-leader-election-rolebinding:rolebinding', - 'operator-application-admin-role:clusterrole', - 'operator-application-editor-role:clusterrole', - 'operator-application-viewer-role:clusterrole', - 'operator-metrics-auth-role:clusterrole', - 'operator-metrics-reader:clusterrole', - 'operator-weightsandbiases-admin-role:clusterrole', - 'operator-weightsandbiases-editor-role:clusterrole', - 'operator-weightsandbiases-viewer-role:clusterrole', - 'operator-metrics-auth-rolebinding:clusterrolebinding', - ], - resource_deps=["Operator-Manifests", "Operator-Generate"], - labels=[GROUP_WANDB_OPERATOR], -) - -local_resource( - 'WandB-CRDs-Ready', - 'kubectl wait --for=condition=established --timeout=120s ' + - 'crd/applications.apps.wandb.com ' + - 'crd/weightsandbiases.apps.wandb.com', - resource_deps=["WandB-CRDs-Apply"], - labels=[GROUP_WANDB_OPERATOR], -) - -k8s_resource( - workload='operator-controller-manager', - new_name='Operator-Controller', - objects=[ - 'operator-mutating-webhook-configuration:mutatingwebhookconfiguration', - 'operator-validating-webhook-configuration:validatingwebhookconfiguration', - 'operator-controller-manager:serviceaccount', + "wandb-operator", + chart="./deploy/operator", + release_name="wandb-operator", + namespace=settings.get("operatorNamespace"), + flags=operator_flags, + image_deps=[IMG], + image_keys=[("wandb-operator.image.repository", "wandb-operator.image.tag")], + deps=[ + OPERATOR_VALUES, + "deploy/operator/Chart.yaml", + "deploy/operator/values.yaml", ], - # manifests/generate transitively satisfied via WandB-CRDs-Ready → WandB-CRDs-Apply - resource_deps=["WandB-CRDs-Ready", "ThirdParty-Operators"], - labels=[GROUP_WANDB_OPERATOR], -) - -deps = ['internal', 'pkg', 'api', 'cmd'] - -local_resource( - 'Operator-Build', - binary(), - deps=deps, - resource_deps=["Operator-Manifests", "Operator-Generate"], - ignore=['*/*/zz_generated.deepcopy.go'], + resource_deps=operator_deps, labels=[GROUP_WANDB_OPERATOR], ) local_resource( - 'Operator-Webhook-Ready', - cmd='until kubectl get mutatingwebhookconfiguration operator-mutating-webhook-configuration -o jsonpath=\'{.webhooks[0].clientConfig.caBundle}\' | grep -q .; do echo "Waiting for webhook CA bundle to be injected..."; sleep 2; done && echo "Webhook is ready!"', - resource_deps=["Operator-Controller"], + "Operator-Webhook-Ready", + cmd="kubectl wait --for=condition=available --timeout=300s -n %s deploy/wandb-operator && " % settings.get("operatorNamespace") + + "until kubectl get mutatingwebhookconfiguration wandb-operator-mutating-webhook-configuration " + + "-o jsonpath='{.webhooks[0].clientConfig.caBundle}' | grep -q .; " + + "do echo 'Waiting for webhook CA bundle to be injected...'; sleep 2; done && echo 'Webhook is ready!'", + resource_deps=["wandb-operator"], labels=[GROUP_WANDB_OPERATOR], ) -# Dev-only cleanup helper. Use this before `tilt down` when you want a truly clean -# rebuild of stateful services. It waits for W&B finalizers while operators are still up. local_resource( - 'Dev-Clean', - './hack/scripts/tilt-dev-clean.sh', + "Dev-Clean", + "./hack/scripts/tilt-dev-clean.sh", auto_init=False, labels=[GROUP_WANDB_APP], ) -if settings.get("installWandb"): - build_wandb_cr() - wandbCR = settings.get('wandbCR') - crName = read_yaml(wandbCR)['metadata']['name'] +if as_bool(settings.get("includeCR")): + wandb_deps = ["Operator-Webhook-Ready", "WandB-Namespace"] + if LOCAL_NETWORKING_MODE == "gateway": + wandb_deps.append("nginx-gateway-fabric") + if LOCAL_NETWORKING_MODE == "ingress": + wandb_deps.append("ingress-nginx-controller") + + if str(WANDB_HOSTNAME).startswith("https://") and as_bool(settings.get("createCA")): + build_wandb_ca(WANDB_NAME, WANDB_NAMESPACE) + wandb_deps.append("WandB-CA") - k8s_yaml(wandbCR) + k8s_yaml(WANDB_CR) k8s_resource( - new_name='Wandb', - objects=[ - '%s:weightsandbiases' % crName - ], - resource_deps=["Operator-Webhook-Ready"], + new_name="Wandb", + objects=["%s:weightsandbiases:%s" % (WANDB_NAME, WANDB_NAMESPACE)], + resource_deps=wandb_deps, labels=[GROUP_WANDB_APP], ) - if LOCAL_NETWORKING_MODE == 'ingress': + + endpoint_port = url_port(WANDB_HOSTNAME) + endpoint_host = url_host(WANDB_HOSTNAME) + + if LOCAL_NETWORKING_MODE == "gateway": managed_endpoint_resource( - name='Wandb-Endpoint', - anchor_object='wandb-endpoint-anchor:configmap:default', - deps=['Wandb', 'ingress-nginx-controller'], - local_port=8080, - remote_port=80, - link_name='W&B ingress', - local_host='wandb.localhost', + name="Wandb-Endpoint", + anchor_object="wandb-endpoint-anchor:configmap:default", + deps=["Wandb", "nginx-gateway-fabric"], + local_port=endpoint_port, + remote_port=endpoint_port, + link_name="W&B gateway", + local_host=endpoint_host, pod_selector={ - 'app.kubernetes.io/component': 'controller', - 'app.kubernetes.io/instance': 'ingress-nginx', - 'app.kubernetes.io/name': 'ingress-nginx', + "app.kubernetes.io/instance": "nginx-gateway-fabric", + "app.kubernetes.io/name": "nginx-gateway-fabric", }, labels=[GROUP_WANDB_APP], ) - elif LOCAL_NETWORKING_MODE == 'gateway': + elif LOCAL_NETWORKING_MODE == "ingress": managed_endpoint_resource( - name='Wandb-Endpoint', - anchor_object='wandb-endpoint-anchor:configmap:default', - deps=['Wandb', 'nginx-gateway-fabric'], - local_port=8080, + name="Wandb-Endpoint", + anchor_object="wandb-endpoint-anchor:configmap:default", + deps=["Wandb", "ingress-nginx-controller"], + local_port=endpoint_port, remote_port=80, - link_name='W&B gateway', - local_host='wandb.localhost', + link_name="W&B ingress", + local_host=endpoint_host, pod_selector={ - 'gateway.networking.k8s.io/gateway-name': crName + '-gateway', + "app.kubernetes.io/component": "controller", + "app.kubernetes.io/instance": "ingress-nginx", + "app.kubernetes.io/name": "ingress-nginx", }, labels=[GROUP_WANDB_APP], ) - else: - managed_endpoint_resource( - name='Wandb-Endpoint', - anchor_object='wandb-endpoint-anchor:configmap:default', - deps=['Wandb'], - local_port=8080, - remote_port=8080, - link_name='W&B nginx', - pod_selector={'app.kubernetes.io/name': crName + '-nginx-proxy'}, - labels=[GROUP_WANDB_APP], - ) - -if settings.get("installTelemetry"): - local_resource( - 'Telemetry-CRDs-Ready', - 'kubectl wait --for=condition=established --timeout=120s ' + - 'crd/vmsingles.operator.victoriametrics.com ' + - 'crd/vmagents.operator.victoriametrics.com ' + - 'crd/vlsingles.operator.victoriametrics.com ' + - 'crd/vtsingles.operator.victoriametrics.com ' + - 'crd/vmservicescrapes.operator.victoriametrics.com ' + - 'crd/vmpodscrapes.operator.victoriametrics.com ' + - 'crd/vmnodescrapes.operator.victoriametrics.com ' + - 'crd/grafanas.grafana.integreatly.org ' + - 'crd/grafanadatasources.grafana.integreatly.org ' + - 'crd/grafanadashboards.grafana.integreatly.org' + - ' && kubectl wait --for=condition=available --timeout=180s -n wandb-operator ' + - 'deploy/third-party-operators-victoria-metrics-operator ' + - 'deploy/third-party-operators-grafana-operator', - resource_deps=["ThirdParty-Operators"], - labels=[GROUP_TELEMETRY], - ) - helm_resource( - 'Telemetry-Stack', - chart='./deploy/telemetry', - release_name='telemetry-stack', - namespace='wandb-operator', - flags=[ - '--set=mode=full', - '--set=namespace=default', - '--create-namespace', - ], - resource_deps=["Telemetry-CRDs-Ready"], - labels=[GROUP_TELEMETRY], - ) +if settings.get("observabilityMode") == "full": managed_endpoint_resource( - name='Telemetry-Endpoint-Grafana', - anchor_object='telemetry-grafana-endpoint-anchor:configmap:default', - deps=['Telemetry-Stack'], + name="Telemetry-Endpoint-Grafana", + anchor_object="telemetry-grafana-endpoint-anchor:configmap:default", + deps=["wandb-operator"], local_port=3000, remote_port=3000, - link_name='Grafana', - pod_selector={'app': 'grafana'}, + link_name="Grafana", + pod_selector={"app": "grafana"}, labels=[GROUP_TELEMETRY], ) managed_endpoint_resource( - name='Telemetry-Endpoint-VictoriaMetrics', - anchor_object='telemetry-victoria-metrics-endpoint-anchor:configmap:default', - deps=['Telemetry-Stack'], + name="Telemetry-Endpoint-VictoriaMetrics", + anchor_object="telemetry-victoria-metrics-endpoint-anchor:configmap:default", + deps=["wandb-operator"], local_port=8428, remote_port=8429, - link_name='VictoriaMetrics UI', + link_name="VictoriaMetrics UI", pod_selector={ - 'app.kubernetes.io/name': 'vmsingle', - 'app.kubernetes.io/instance': 'victoria-instance', + "app.kubernetes.io/name": "vmsingle", + "app.kubernetes.io/instance": "victoria-instance", }, labels=[GROUP_TELEMETRY], ) managed_endpoint_resource( - name='Telemetry-Endpoint-VictoriaLogs', - anchor_object='telemetry-victoria-logs-endpoint-anchor:configmap:default', - deps=['Telemetry-Stack'], + name="Telemetry-Endpoint-VictoriaLogs", + anchor_object="telemetry-victoria-logs-endpoint-anchor:configmap:default", + deps=["wandb-operator"], local_port=9428, remote_port=9428, - link_name='VictoriaLogs', + link_name="VictoriaLogs", pod_selector={ - 'app.kubernetes.io/name': 'vlsingle', - 'app.kubernetes.io/instance': 'victoria-logs', + "app.kubernetes.io/name": "vlsingle", + "app.kubernetes.io/instance": "victoria-logs", }, labels=[GROUP_TELEMETRY], ) managed_endpoint_resource( - name='Telemetry-Endpoint-VictoriaTraces', - anchor_object='telemetry-victoria-traces-endpoint-anchor:configmap:default', - deps=['Telemetry-Stack'], + name="Telemetry-Endpoint-VictoriaTraces", + anchor_object="telemetry-victoria-traces-endpoint-anchor:configmap:default", + deps=["wandb-operator"], local_port=10428, remote_port=10428, - link_name='VictoriaTraces', + link_name="VictoriaTraces", pod_selector={ - 'app.kubernetes.io/name': 'vtsingle', - 'app.kubernetes.io/instance': 'victoria-traces', + "app.kubernetes.io/name": "vtsingle", + "app.kubernetes.io/instance": "victoria-traces", }, labels=[GROUP_TELEMETRY], ) -manager_entrypoint = ['/manager', '--log-format=' + settings['logFormat']] -if settings.get("installTelemetry"): - manager_entrypoint += [ - '--telemetry-enabled=true', - ] +manager_entrypoint = ["/manager", "--log-format=" + settings.get("logFormat")] +if settings.get("observabilityMode") != "off": + manager_entrypoint += ["--telemetry-enabled=true"] + +docker_only = ["./tilt_bin/manager"] +live_update_steps = [ + sync("./tilt_bin/manager", "/manager"), +] + +if settings.get("manifestSource") == "local": + docker_only.append(repo_path(settings.get("localManifestPath"))) + live_update_steps.append(sync(settings.get("localManifestPath"), "/server-manifest")) docker_build_with_restart( - IMG, '.', - dockerfile_contents=DOCKERFILE_OPENSHIFT if settings.get('openshiftSCC') else DOCKERFILE, + IMG, + ".", + dockerfile_contents=operator_dockerfile(), entrypoint=manager_entrypoint, - only=['./tilt_bin/manager', './hack/testing-manifests/server-manifest'], - live_update=[ - sync('./tilt_bin/manager', '/manager'), - ], + only=docker_only, + live_update=live_update_steps, ) diff --git a/docs/design/wandb_v2/tilt.md b/docs/design/wandb_v2/tilt.md index 42195e30..ddf0936a 100644 --- a/docs/design/wandb_v2/tilt.md +++ b/docs/design/wandb_v2/tilt.md @@ -1,159 +1,90 @@ # Tilt Resource Dependency Graph -Resources shown with their labels in parentheses. Telemetry resources are only -active when `installTelemetry: true` and Wandb is only active when -`installWandb: true` in `tilt-settings.star`. +Resources shown with their labels in parentheses. The default path installs one +`wandb-operator` Helm release, Gateway API networking, an optional W&B CR, and +telemetry disabled. -## Agent Instructions +Conditional resources: -To regenerate this graph, read `Tiltfile` at the repo root and apply the rules below. +- `Wandb`, `Wandb-Endpoint`, and `WandB-CA` only appear when `includeCR=True`. +- `WandB-Namespace` appears when Tilt needs a W&B namespace for the CR or telemetry resources. +- `gateway-api-crds` and `nginx-gateway-fabric` appear when `networkMode="gateway"`. +- `ingress-nginx-*` appears when `networkMode="ingress"`. +- `Telemetry-Endpoint-*` appears only when `observabilityMode="full"`. -### Deriving Nodes and Edges - -Each `local_resource(name, ...)`, `k8s_resource(new_name=name, ...)`, and -`helm_resource(name, ...)` call defines a node. Draw one directed edge per entry -in its `resource_deps=[...]` list. `deploy_cert_manager()` produces a node named -`cert-manager`. - -### Node Label Format - -``` -nodeId["resource-name\n(Label-Group)"] -``` - -Use the Tilt resource name as display text and the first value of `labels=[...]` -as the group. Node IDs are snake_case versions of the resource name. - -### Subgraphs - -Group nodes into a `subgraph` when they share identical incoming and outgoing -edges — i.e., every parent points to all members and all members point to the -same children. Replace the redundant per-node edges with single edges to/from -the subgraph. - -Current subgraphs: -- `codegen["Code Generation"]` — `manifests`, `generate` -- `victoria_stack["Victoria Stack"]` — `Victoria-Metrics`, `Victoria-Logs`, `Victoria-Traces` - -### Arrow Styles - -- `-->` for edges between individual nodes -- `==>` (thick) for any edge where either endpoint is a subgraph - -### Conditional Resources - -Include all resources in the graph unconditionally. The intro text above the -diagram already describes the `installTelemetry` / `installWandb` conditions. -Use `%% comments` to mark conditional sections in the Mermaid source. - -### Class Assignments - -| Class | Color | Assigned to | -|--------------|--------|-------------| -| `bootstrap` | grey | `cert-manager`, `helm-dep-update`, `manifests`, `generate` | -| `operator` | blue | CRDs, RBAC, certs, controller, webhook, `Watch&Compile` | -| `thirdparty` | green | `third-party-operators` | -| `gate` | yellow | readiness-gate resources (`*-ready`, `*-crds-ready`) | -| `telemetry` | pink | all telemetry stack resources | -| `wandb` | purple | `Wandb` CR | - -### Transitive Dependencies - -Do not draw edges that are transitively implied by another declared dep. -When a dep is omitted from `resource_deps` for this reason, the Tiltfile -contains a comment of the form `# X transitively satisfied via A → B → C`. -Do not add those edges to the graph. +Tilt generates the W&B CR through `go run ./hack/tilt/wandbcr`, then reads the +typed YAML back for the resource name, namespace, networking mode, and endpoint +hostname. The default CR omits `spec.wandb.manifestRepository` so the webhook +uses the published server manifest repository; `manifestSource="local"` mounts +`localManifestPath` into the operator image at `/server-manifest` and writes +`file:///server-manifest` into the generated CR. ```mermaid graph TD - %% ── Bootstrapping ────────────────────────────────────────────── - cert_manager["cert-manager\n(ext)"] - helm_dep_update["helm-dep-update\n(Helm-Repos)"] - - subgraph codegen["Code Generation"] - manifests["manifests"] - generate["generate"] - end - - %% ── Third-Party Operators ────────────────────────────────────── - third_party["third-party-operators\n(Third-Party-Operators)"] - helm_dep_update --> third_party - - %% ── Operator CRDs & RBAC ─────────────────────────────────────── - app_crd["Application CRD\n(Operator-Resources)"] - wandb_crd["Wandb CRD\n(Operator-Resources)"] - rbac["RBAC\n(Operator-Resources)"] - operator_certs["Operator-Certs\n(Operator-Resources)"] - codegen ==> app_crd - codegen ==> wandb_crd - codegen ==> rbac - - %% ── CRD Readiness Gate ───────────────────────────────────────── - crds_ready["operator-crds-ready\n(Operator-Resources)"] - app_crd --> crds_ready - wandb_crd --> crds_ready - - %% ── Operator Controller ──────────────────────────────────────── - watch_compile["Watch&Compile\n(Operator-Resources)"] - controller["operator-controller-manager\n(Operator-Resources)"] - codegen ==> watch_compile - crds_ready --> controller - third_party --> controller - - %% ── Webhook Readiness ────────────────────────────────────────── - webhook_ready["webhook-ready\n(Operator-Resources)"] - controller --> webhook_ready - - %% ── Wandb CR (installWandb) ──────────────────────────────────── - wandb["Wandb\n(Operator-Resources)"] - webhook_ready --> wandb - - %% ── Telemetry: CRD & Operator Gates ─────────────────────────── - vm_crds["vm-crds-ready\n(Telemetry)"] - grafana_crds["grafana-crds-ready\n(Telemetry)"] - vm_operator["vm-operator-ready\n(Telemetry)"] - third_party --> vm_crds - third_party --> grafana_crds - vm_crds --> vm_operator - - %% ── Victoria Stack ───────────────────────────────────────────── - subgraph victoria_stack["Victoria Stack"] - victoria_metrics["Victoria-Metrics\n(Telemetry)"] - victoria_logs["Victoria-Logs\n(Telemetry)"] - victoria_traces["Victoria-Traces\n(Telemetry)"] - end - vm_operator ==> victoria_stack - - %% ── Telemetry Dependents ─────────────────────────────────────── - otel["OTEL-Connection-Secret\n(Telemetry)"] - kube_metrics["Kubernetes-Metrics\n(Telemetry)"] - op_metrics["Operator-Metrics\n(Telemetry)"] - infra_metrics["Infrastructure-Metrics\n(Telemetry)"] - victoria_stack ==> otel - victoria_metrics --> kube_metrics - victoria_metrics --> op_metrics - victoria_metrics --> infra_metrics - - %% ── Grafana Stack ────────────────────────────────────────────── - grafana["Grafana\n(Telemetry)"] - grafana_ds["Grafana-Datasources\n(Telemetry)"] - grafana_crds --> grafana - grafana_crds --> grafana_ds - grafana --> grafana_ds - victoria_stack ==> grafana_ds - - %% ── Styles ───────────────────────────────────────────────────── - classDef bootstrap fill:#f5f5f5,stroke:#999 - classDef operator fill:#dbeafe,stroke:#3b82f6 - classDef thirdparty fill:#dcfce7,stroke:#16a34a - classDef gate fill:#fef9c3,stroke:#ca8a04 - classDef telemetry fill:#fce7f3,stroke:#db2777 - classDef wandb fill:#ede9fe,stroke:#7c3aed - - class manifests,generate,cert_manager,helm_dep_update bootstrap - class app_crd,wandb_crd,rbac,operator_certs,watch_compile,controller,webhook_ready operator - class third_party thirdparty - class crds_ready,vm_crds,grafana_crds,vm_operator gate - class victoria_metrics,victoria_logs,victoria_traces,otel,kube_metrics,op_metrics,infra_metrics,grafana,grafana_ds telemetry - class wandb wandb + %% Bootstrap and dependencies + operator_codegen["Operator-Codegen\n(Wandb-Operator)"] + operator_build["Operator-Build\n(Wandb-Operator)"] + operator_chart_deps["Operator-Chart-Deps\n(Dependencies)"] + wandb_crds_apply["WandB-CRDs-Apply\n(Dependencies)"] + wandb_crds_ready["WandB-CRDs-Ready\n(Dependencies)"] + cert_manager["cert-manager\n(Dependencies)"] + gateway_api_crds["gateway-api-crds\n(Dependencies)"] + nginx_gateway_fabric["nginx-gateway-fabric\n(Dependencies)"] + ingress_nginx_repo["ingress-nginx-repo\n(Dependencies)"] + ingress_nginx_controller["ingress-nginx-controller\n(Dependencies)"] + wandb_namespace["WandB-Namespace\n(Dependencies)"] + + operator_codegen --> operator_build + operator_codegen --> wandb_crds_apply + wandb_crds_apply --> wandb_crds_ready + gateway_api_crds --> nginx_gateway_fabric + ingress_nginx_repo --> ingress_nginx_controller + + %% Operator install + wandb_operator["wandb-operator\n(Wandb-Operator)"] + operator_chart_deps --> wandb_operator + operator_build --> wandb_operator + wandb_crds_ready --> wandb_operator + cert_manager --> wandb_operator + nginx_gateway_fabric --> wandb_operator + wandb_namespace --> wandb_operator + + %% Webhook and W&B CR + operator_webhook_ready["Operator-Webhook-Ready\n(Wandb-Operator)"] + wandb_ca["WandB-CA\n(Wandb-App)"] + wandb["Wandb\n(Wandb-App)"] + wandb_endpoint["Wandb-Endpoint\n(Wandb-App)"] + dev_clean["Dev-Clean\n(Wandb-App)"] + + wandb_operator --> operator_webhook_ready + cert_manager --> wandb_ca + operator_webhook_ready --> wandb + wandb_namespace --> wandb + wandb_ca --> wandb + nginx_gateway_fabric --> wandb + ingress_nginx_controller --> wandb + wandb --> wandb_endpoint + nginx_gateway_fabric --> wandb_endpoint + ingress_nginx_controller --> wandb_endpoint + + %% Telemetry endpoint port-forwards + telemetry_grafana["Telemetry-Endpoint-Grafana\n(Telemetry)"] + telemetry_metrics["Telemetry-Endpoint-VictoriaMetrics\n(Telemetry)"] + telemetry_logs["Telemetry-Endpoint-VictoriaLogs\n(Telemetry)"] + telemetry_traces["Telemetry-Endpoint-VictoriaTraces\n(Telemetry)"] + + wandb_operator --> telemetry_grafana + wandb_operator --> telemetry_metrics + wandb_operator --> telemetry_logs + wandb_operator --> telemetry_traces + + classDef dependencies fill:#f5f5f5,stroke:#777 + classDef operator fill:#dbeafe,stroke:#2563eb + classDef wandb fill:#ede9fe,stroke:#7c3aed + classDef telemetry fill:#fce7f3,stroke:#db2777 + + class operator_chart_deps,wandb_crds_apply,wandb_crds_ready,cert_manager,gateway_api_crds,nginx_gateway_fabric,ingress_nginx_repo,ingress_nginx_controller,wandb_namespace dependencies + class operator_codegen,operator_build,wandb_operator,operator_webhook_ready operator + class wandb_ca,wandb,wandb_endpoint,dev_clean wandb + class telemetry_grafana,telemetry_metrics,telemetry_logs,telemetry_traces telemetry ``` diff --git a/docs/monitoring.md b/docs/monitoring.md index c0a08529..a5b402ce 100644 --- a/docs/monitoring.md +++ b/docs/monitoring.md @@ -72,11 +72,12 @@ telemetry: ## Tilt Usage -Set `"installTelemetry": True` in `tilt-settings.star`. +Telemetry is off by default in Tilt. Set `"observabilityMode": "full"` in +`tilt-settings.star` for the local full stack. -Tilt treats `installTelemetry=True` as the local `full` mode. It installs the -VictoriaMetrics and Grafana operators, then installs the standalone telemetry -chart with `mode=full`. +Tilt renders the operator chart with `telemetry.mode=full`, enables the +VictoriaMetrics and Grafana operator dependencies, and turns on the controller's +telemetry flag. Tilt exposes endpoints for: - Grafana diff --git a/hack/scripts/tilt-dev-clean.sh b/hack/scripts/tilt-dev-clean.sh index 5da28230..c10d127e 100755 --- a/hack/scripts/tilt-dev-clean.sh +++ b/hack/scripts/tilt-dev-clean.sh @@ -3,7 +3,8 @@ set -euo pipefail SCRIPT_NAME="$(basename "$0")" -OPERATOR_NAMESPACE="wandb-operator" +OPERATOR_NAMESPACE="${OPERATOR_NAMESPACE:-wandb-operators}" +LEGACY_OPERATOR_NAMESPACE="wandb-operator" WAIT_TIMEOUT="10m" DRY_RUN="false" APP_NAMESPACE="" @@ -168,8 +169,9 @@ for app in "${apps[@]}"; do delete_app_and_wait "${namespace}" "${name}" done -uninstall_release "${OPERATOR_NAMESPACE}" "telemetry-stack" -uninstall_release "${OPERATOR_NAMESPACE}" "third-party-operators" +uninstall_release "${OPERATOR_NAMESPACE}" "wandb-operator" +uninstall_release "${LEGACY_OPERATOR_NAMESPACE}" "telemetry-stack" +uninstall_release "${LEGACY_OPERATOR_NAMESPACE}" "third-party-operators" for app in "${apps[@]}"; do namespace="${app%%$'\t'*}" diff --git a/hack/tilt/endpoint-anchors.yaml b/hack/tilt/endpoint-anchors.yaml deleted file mode 100644 index 64b115c3..00000000 --- a/hack/tilt/endpoint-anchors.yaml +++ /dev/null @@ -1,39 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: wandb-endpoint-anchor - namespace: default -data: - managed-by: tilt ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: telemetry-grafana-endpoint-anchor - namespace: default -data: - managed-by: tilt ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: telemetry-victoria-metrics-endpoint-anchor - namespace: default -data: - managed-by: tilt ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: telemetry-victoria-logs-endpoint-anchor - namespace: default -data: - managed-by: tilt ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: telemetry-victoria-traces-endpoint-anchor - namespace: default -data: - managed-by: tilt diff --git a/hack/tilt/wandbcr/main.go b/hack/tilt/wandbcr/main.go new file mode 100644 index 00000000..cf3388c8 --- /dev/null +++ b/hack/tilt/wandbcr/main.go @@ -0,0 +1,422 @@ +package main + +import ( + "encoding/json" + "flag" + "fmt" + "os" + "path/filepath" + "strings" + + v2 "github.com/wandb/operator/api/v2" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/yaml" +) + +const ( + defaultOutPath = "hack/testing-manifests/wandb/.generated/tilt-wandb-cr.yaml" + defaultName = "wandb" + defaultNamespace = "wandb" + defaultHostname = "http://localhost:8080" + defaultIngressHostname = "http://wandb.localhost:8080" + localManifestRepository = "file:///server-manifest" + defaultManifestSource = "published" + defaultVersion = "0.80.0" + defaultSize = v2.SizeDev + defaultRetentionPolicy = v2.DetachOnDelete +) + +type Options struct { + OutPath string + CRFile string + Name string + Namespace string + Hostname string + Version string + Size string + RetentionPolicy string + LicenseFile string + ManifestSource string + ObservabilityMode string + NetworkMode string + GatewayClass string + IngressClass string + CreateCA bool + CreateCASet bool + IssuerName string +} + +func main() { + opts := Options{} + flag.StringVar(&opts.OutPath, "out", defaultOutPath, "Path to write the generated WeightsAndBiases CR YAML") + flag.StringVar(&opts.CRFile, "cr-file", "", "Optional base WeightsAndBiases CR YAML to patch") + flag.StringVar(&opts.Name, "name", defaultName, "WeightsAndBiases resource name") + flag.StringVar(&opts.Namespace, "namespace", defaultNamespace, "WeightsAndBiases resource namespace") + flag.StringVar(&opts.Hostname, "hostname", defaultHostname, "W&B hostname") + flag.StringVar(&opts.Version, "version", defaultVersion, "W&B version") + flag.StringVar(&opts.Size, "size", string(defaultSize), "W&B size") + flag.StringVar(&opts.RetentionPolicy, "retention-policy", string(defaultRetentionPolicy), "Retention policy on delete") + flag.StringVar(&opts.LicenseFile, "license-file", "", "Path to W&B license file") + flag.StringVar(&opts.ManifestSource, "manifest-source", defaultManifestSource, "Server manifest source: published or local") + flag.StringVar(&opts.ObservabilityMode, "observability-mode", "off", "Observability mode: off, full, or forward") + flag.StringVar(&opts.NetworkMode, "network-mode", "gateway", "Networking mode: gateway or ingress") + flag.StringVar(&opts.GatewayClass, "gateway-class", "nginx", "GatewayClass name for gateway mode") + flag.StringVar(&opts.IngressClass, "ingress-class", "nginx", "IngressClass name for ingress mode") + flag.BoolVar(&opts.CreateCA, "create-ca", true, "Use the generated W&B CA issuer for HTTPS hostnames") + flag.StringVar(&opts.IssuerName, "issuer-name", "", "Existing cert-manager issuer for HTTPS hostnames") + flag.Parse() + flag.Visit(func(f *flag.Flag) { + if f.Name == "create-ca" { + opts.CreateCASet = true + } + }) + + if err := Run(opts); err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } +} + +func Run(opts Options) error { + cr, err := BuildCR(opts) + if err != nil { + return err + } + + data, err := marshalCRYAML(cr) + if err != nil { + return fmt.Errorf("marshal CR YAML: %w", err) + } + + if err := os.MkdirAll(filepath.Dir(opts.OutPath), 0o755); err != nil { + return fmt.Errorf("create generated directory: %w", err) + } + if err := os.WriteFile(opts.OutPath, data, 0o644); err != nil { + return fmt.Errorf("write generated CR: %w", err) + } + return nil +} + +func marshalCRYAML(cr *v2.WeightsAndBiases) ([]byte, error) { + data, err := json.Marshal(cr) + if err != nil { + return nil, err + } + + obj := map[string]interface{}{} + if err := json.Unmarshal(data, &obj); err != nil { + return nil, err + } + delete(obj, "status") + + pruned, _ := pruneEmpty(obj) + return yaml.Marshal(pruned) +} + +func pruneEmpty(value interface{}) (interface{}, bool) { + switch typed := value.(type) { + case nil: + return nil, false + case string: + return typed, typed != "" + case []interface{}: + items := []interface{}{} + for _, item := range typed { + if pruned, keep := pruneEmpty(item); keep { + items = append(items, pruned) + } + } + return items, len(items) > 0 + case map[string]interface{}: + for key, item := range typed { + if pruned, keep := pruneEmpty(item); keep { + typed[key] = pruned + } else { + delete(typed, key) + } + } + return typed, len(typed) > 0 + default: + return typed, true + } +} + +func BuildCR(opts Options) (*v2.WeightsAndBiases, error) { + applyDefaults(&opts) + + cr, err := baseCR(opts.CRFile) + if err != nil { + return nil, err + } + + ensureTypeMeta(cr) + patchMetadata(cr, opts) + patchScalarSpec(cr, opts) + if err := patchManifestRepository(cr, opts.ManifestSource); err != nil { + return nil, err + } + if err := patchLicense(cr, opts.LicenseFile); err != nil { + return nil, err + } + if err := patchNetworking(cr, opts); err != nil { + return nil, err + } + if err := patchTelemetry(cr, opts.ObservabilityMode); err != nil { + return nil, err + } + + return cr, nil +} + +func applyDefaults(opts *Options) { + if opts.OutPath == "" { + opts.OutPath = defaultOutPath + } + if opts.Name == "" { + opts.Name = defaultName + } + if opts.Namespace == "" { + opts.Namespace = defaultNamespace + } + if opts.Hostname == "" { + opts.Hostname = defaultHostname + } + if opts.Version == "" { + opts.Version = defaultVersion + } + if opts.Size == "" { + opts.Size = string(defaultSize) + } + if opts.RetentionPolicy == "" { + opts.RetentionPolicy = string(defaultRetentionPolicy) + } + if opts.ManifestSource == "" { + opts.ManifestSource = defaultManifestSource + } + if opts.ObservabilityMode == "" { + opts.ObservabilityMode = "off" + } + if opts.NetworkMode == "" { + opts.NetworkMode = "gateway" + } + if opts.GatewayClass == "" { + opts.GatewayClass = "nginx" + } + if opts.IngressClass == "" { + opts.IngressClass = "nginx" + } + if !opts.CreateCASet { + opts.CreateCA = true + } +} + +func baseCR(crFile string) (*v2.WeightsAndBiases, error) { + if crFile != "" { + data, err := os.ReadFile(crFile) + if err != nil { + return nil, fmt.Errorf("read cr-file: %w", err) + } + + cr := &v2.WeightsAndBiases{} + if err := yaml.Unmarshal(data, cr); err != nil { + return nil, fmt.Errorf("parse cr-file: %w", err) + } + return cr, nil + } + + return &v2.WeightsAndBiases{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "apps.wandb.com/v2", + Kind: "WeightsAndBiases", + }, + Spec: v2.WeightsAndBiasesSpec{ + Wandb: v2.WandbAppSpec{ + Features: map[string]bool{"proxy": true}, + InternalServiceAuth: v2.InternalServiceAuth{Enabled: boolPtr(false)}, + }, + MySQL: v2.MySQLSpec{ManagedMysql: &v2.ManagedMysqlSpec{}}, + Redis: v2.RedisSpec{ManagedRedis: &v2.ManagedRedisSpec{}}, + Kafka: v2.KafkaSpec{ManagedKafka: &v2.ManagedKafkaSpec{}}, + ObjectStore: v2.ObjectStoreSpec{ManagedObjectStore: &v2.ManagedObjectStoreSpec{}}, + ClickHouse: v2.ClickHouseSpec{ManagedClickHouse: &v2.ManagedClickHouseSpec{}}, + }, + }, nil +} + +func ensureTypeMeta(cr *v2.WeightsAndBiases) { + if cr.APIVersion == "" { + cr.APIVersion = "apps.wandb.com/v2" + } + if cr.Kind == "" { + cr.Kind = "WeightsAndBiases" + } +} + +func patchMetadata(cr *v2.WeightsAndBiases, opts Options) { + cr.Name = opts.Name + cr.Namespace = opts.Namespace + if cr.Labels == nil { + cr.Labels = map[string]string{} + } + cr.Labels["app.kubernetes.io/managed-by"] = "tilt" +} + +func patchScalarSpec(cr *v2.WeightsAndBiases, opts Options) { + cr.Spec.Wandb.Hostname = effectiveHostname(opts) + cr.Spec.Wandb.Version = opts.Version + if cr.Spec.Wandb.Features == nil { + cr.Spec.Wandb.Features = map[string]bool{} + } + cr.Spec.Wandb.Features["proxy"] = true + cr.Spec.Wandb.InternalServiceAuth = v2.InternalServiceAuth{Enabled: boolPtr(false)} + cr.Spec.Size = v2.Size(opts.Size) + cr.Spec.RetentionPolicy.OnDelete = v2.OnDeletePolicy(opts.RetentionPolicy) +} + +func patchManifestRepository(cr *v2.WeightsAndBiases, manifestSource string) error { + switch normalizeManifestSource(manifestSource) { + case "published": + cr.Spec.Wandb.ManifestRepository = "" + case "local": + cr.Spec.Wandb.ManifestRepository = localManifestRepository + default: + return fmt.Errorf("manifest-source must be one of: published, local") + } + return nil +} + +func patchLicense(cr *v2.WeightsAndBiases, licenseFile string) error { + if licenseFile == "" { + return nil + } + + data, err := os.ReadFile(licenseFile) + if err != nil { + return fmt.Errorf("read license-file: %w", err) + } + cr.Spec.Wandb.License = strings.TrimSpace(string(data)) + return nil +} + +func patchNetworking(cr *v2.WeightsAndBiases, opts Options) error { + mode := normalizeNetworkMode(opts.NetworkMode) + switch mode { + case "gateway": + if opts.GatewayClass == "" { + return fmt.Errorf("gateway network mode requires gateway-class") + } + cr.Spec.Networking.Mode = v2.NetworkingModeGatewayAPI + cr.Spec.Networking.Ingress = nil + cr.Spec.Networking.GatewayAPI = &v2.GatewayAPIConfig{ + Gateway: v2.GatewayConfig{ + Managed: true, + GatewayClassName: stringPtr(opts.GatewayClass), + }, + } + case "ingress": + if opts.IngressClass == "" { + return fmt.Errorf("ingress network mode requires ingress-class") + } + cr.Spec.Networking.Mode = v2.NetworkingModeIngress + cr.Spec.Networking.GatewayAPI = nil + cr.Spec.Networking.Ingress = &v2.IngressConfig{ + IngressClassName: stringPtr(opts.IngressClass), + } + default: + return fmt.Errorf("network-mode must be one of: gateway, ingress") + } + + if strings.HasPrefix(cr.Spec.Wandb.Hostname, "https://") { + if opts.CreateCA { + cr.Spec.Networking.TLS = &v2.TLSConfig{ + SecretName: opts.Name + "-tls-secret", + CertManager: &v2.CertManagerConfig{ + Issuer: opts.Name + "-ca-issuer", + }, + } + } else if opts.IssuerName != "" { + cr.Spec.Networking.TLS = &v2.TLSConfig{ + SecretName: opts.Name + "-tls-secret", + CertManager: &v2.CertManagerConfig{ + Issuer: opts.IssuerName, + }, + } + } else { + return fmt.Errorf("https hostname requires create-ca=true or issuer-name") + } + } else { + cr.Spec.Networking.TLS = nil + } + + return nil +} + +func patchTelemetry(cr *v2.WeightsAndBiases, observabilityMode string) error { + mode := normalizeObservabilityMode(observabilityMode) + var enabled bool + switch mode { + case "off": + enabled = false + case "full", "forward": + enabled = true + default: + return fmt.Errorf("observability-mode must be one of: off, full, forward") + } + + if cr.Spec.MySQL.ManagedMysql != nil { + cr.Spec.MySQL.ManagedMysql.Telemetry.Enabled = enabled + } + if cr.Spec.Redis.ManagedRedis != nil { + cr.Spec.Redis.ManagedRedis.Telemetry.Enabled = enabled + } + if cr.Spec.Kafka.ManagedKafka != nil { + cr.Spec.Kafka.ManagedKafka.Telemetry.Enabled = enabled + } + if cr.Spec.ObjectStore.ManagedObjectStore != nil { + cr.Spec.ObjectStore.ManagedObjectStore.Telemetry.Enabled = enabled + } + if cr.Spec.ClickHouse.ManagedClickHouse != nil { + cr.Spec.ClickHouse.ManagedClickHouse.Telemetry.Enabled = enabled + } + return nil +} + +func effectiveHostname(opts Options) string { + if normalizeNetworkMode(opts.NetworkMode) == "ingress" && opts.Hostname == defaultHostname { + return defaultIngressHostname + } + return opts.Hostname +} + +func normalizeNetworkMode(mode string) string { + switch strings.ToLower(mode) { + case "gateway": + return "gateway" + case "ingress": + return "ingress" + default: + return mode + } +} + +func normalizeObservabilityMode(mode string) string { + if mode == "" { + return "off" + } + return strings.ToLower(mode) +} + +func normalizeManifestSource(source string) string { + if source == "" { + return defaultManifestSource + } + return strings.ToLower(source) +} + +func boolPtr(value bool) *bool { + return &value +} + +func stringPtr(value string) *string { + return &value +} diff --git a/hack/tilt/wandbcr/main_test.go b/hack/tilt/wandbcr/main_test.go new file mode 100644 index 00000000..0f5ee303 --- /dev/null +++ b/hack/tilt/wandbcr/main_test.go @@ -0,0 +1,273 @@ +package main + +import ( + "os" + "path/filepath" + "strings" + "testing" + + v2 "github.com/wandb/operator/api/v2" + "sigs.k8s.io/yaml" +) + +func TestBuildCRDefaultGateway(t *testing.T) { + cr, err := BuildCR(Options{}) + if err != nil { + t.Fatal(err) + } + + if cr.APIVersion != "apps.wandb.com/v2" || cr.Kind != "WeightsAndBiases" { + t.Fatalf("unexpected type meta: %s %s", cr.APIVersion, cr.Kind) + } + if cr.Name != defaultName || cr.Namespace != defaultNamespace { + t.Fatalf("unexpected metadata: %s/%s", cr.Namespace, cr.Name) + } + if cr.Spec.Wandb.ManifestRepository != "" { + t.Fatalf("manifest repository = %q", cr.Spec.Wandb.ManifestRepository) + } + if cr.Spec.Wandb.Version != defaultVersion { + t.Fatalf("version = %q", cr.Spec.Wandb.Version) + } + if !cr.Spec.Wandb.Features["proxy"] { + t.Fatalf("proxy feature not enabled") + } + if cr.Spec.Wandb.InternalServiceAuth.Enabled == nil || *cr.Spec.Wandb.InternalServiceAuth.Enabled { + t.Fatalf("internal service auth should be explicitly disabled") + } + if cr.Spec.MySQL.ManagedMysql == nil || cr.Spec.MySQL.ManagedMysql.Telemetry.Enabled { + t.Fatalf("mysql telemetry should be disabled by default") + } + if cr.Spec.Redis.ManagedRedis == nil || cr.Spec.Redis.ManagedRedis.Telemetry.Enabled { + t.Fatalf("redis telemetry should be disabled by default") + } + if cr.Spec.Kafka.ManagedKafka == nil || cr.Spec.Kafka.ManagedKafka.Telemetry.Enabled { + t.Fatalf("kafka telemetry should be disabled by default") + } + if cr.Spec.ObjectStore.ManagedObjectStore == nil || cr.Spec.ObjectStore.ManagedObjectStore.Telemetry.Enabled { + t.Fatalf("object store telemetry should be disabled by default") + } + if cr.Spec.ClickHouse.ManagedClickHouse == nil || cr.Spec.ClickHouse.ManagedClickHouse.Telemetry.Enabled { + t.Fatalf("clickhouse telemetry should be disabled by default") + } + if cr.Spec.Networking.Mode != v2.NetworkingModeGatewayAPI { + t.Fatalf("networking mode = %q", cr.Spec.Networking.Mode) + } + if cr.Spec.Networking.GatewayAPI == nil || cr.Spec.Networking.GatewayAPI.Gateway.GatewayClassName == nil || *cr.Spec.Networking.GatewayAPI.Gateway.GatewayClassName != "nginx" { + t.Fatalf("gateway class was not set") + } +} + +func TestBuildCRLocalManifestSource(t *testing.T) { + cr, err := BuildCR(Options{ManifestSource: "local"}) + if err != nil { + t.Fatal(err) + } + + if cr.Spec.Wandb.ManifestRepository != localManifestRepository { + t.Fatalf("manifest repository = %q", cr.Spec.Wandb.ManifestRepository) + } +} + +func TestBuildCRInvalidManifestSourceReturnsError(t *testing.T) { + _, err := BuildCR(Options{ManifestSource: "testing-manifests"}) + if err == nil { + t.Fatal("expected manifest source error") + } + if !strings.Contains(err.Error(), "manifest-source") { + t.Fatalf("unexpected error: %v", err) + } +} + +func TestBuildCRIngressRewritesDefaultHostname(t *testing.T) { + cr, err := BuildCR(Options{NetworkMode: "ingress"}) + if err != nil { + t.Fatal(err) + } + + if cr.Spec.Wandb.Hostname != defaultIngressHostname { + t.Fatalf("hostname = %q", cr.Spec.Wandb.Hostname) + } + if cr.Spec.Networking.Mode != v2.NetworkingModeIngress { + t.Fatalf("networking mode = %q", cr.Spec.Networking.Mode) + } + if cr.Spec.Networking.Ingress == nil || cr.Spec.Networking.Ingress.IngressClassName == nil || *cr.Spec.Networking.Ingress.IngressClassName != "nginx" { + t.Fatalf("ingress class was not set") + } + if cr.Spec.Networking.GatewayAPI != nil { + t.Fatalf("gateway config should be cleared in ingress mode") + } +} + +func TestBuildCRInvalidNetworkModeReturnsError(t *testing.T) { + _, err := BuildCR(Options{NetworkMode: "networking-ingress-local"}) + if err == nil { + t.Fatal("expected network mode error") + } + if !strings.Contains(err.Error(), "network-mode") { + t.Fatalf("unexpected error: %v", err) + } +} + +func TestBuildCRFullObservabilityEnablesManagedTelemetry(t *testing.T) { + cr, err := BuildCR(Options{ObservabilityMode: "full"}) + if err != nil { + t.Fatal(err) + } + + if !cr.Spec.MySQL.ManagedMysql.Telemetry.Enabled || + !cr.Spec.Redis.ManagedRedis.Telemetry.Enabled || + !cr.Spec.Kafka.ManagedKafka.Telemetry.Enabled || + !cr.Spec.ObjectStore.ManagedObjectStore.Telemetry.Enabled || + !cr.Spec.ClickHouse.ManagedClickHouse.Telemetry.Enabled { + t.Fatalf("managed telemetry was not enabled") + } +} + +func TestBuildCRInvalidObservabilityModeReturnsError(t *testing.T) { + _, err := BuildCR(Options{ObservabilityMode: "on"}) + if err == nil { + t.Fatal("expected observability mode error") + } + if !strings.Contains(err.Error(), "observability-mode") { + t.Fatalf("unexpected error: %v", err) + } +} + +func TestBuildCRPatchesBaseCRAndPreservesUnrelatedFields(t *testing.T) { + dir := t.TempDir() + crFile := filepath.Join(dir, "base.yaml") + base := `apiVersion: apps.wandb.com/v2 +kind: WeightsAndBiases +metadata: + name: custom + namespace: custom-ns + labels: + keep: me +spec: + requireLimits: true + wandb: + hostname: http://old.example + manifestRepository: file:///old-manifest + version: old + additionalHostnames: + - extra.example + networking: + mode: ingress + ingress: + ingressClassName: old +` + if err := os.WriteFile(crFile, []byte(base), 0o644); err != nil { + t.Fatal(err) + } + + cr, err := BuildCR(Options{ + CRFile: crFile, + Name: "patched", + Namespace: "patched-ns", + Hostname: "http://new.example", + Version: "1.2.3", + Size: "micro", + RetentionPolicy: "purge", + NetworkMode: "gateway", + GatewayClass: "custom-gateway", + }) + if err != nil { + t.Fatal(err) + } + + if cr.Name != "patched" || cr.Namespace != "patched-ns" { + t.Fatalf("metadata was not patched: %s/%s", cr.Namespace, cr.Name) + } + if cr.Labels["keep"] != "me" || cr.Labels["app.kubernetes.io/managed-by"] != "tilt" { + t.Fatalf("labels not preserved/patched: %#v", cr.Labels) + } + if !cr.Spec.RequireLimits { + t.Fatalf("unrelated spec field was not preserved") + } + if len(cr.Spec.Wandb.AdditionalHostnames) != 1 || cr.Spec.Wandb.AdditionalHostnames[0] != "extra.example" { + t.Fatalf("additional hostnames were not preserved: %#v", cr.Spec.Wandb.AdditionalHostnames) + } + if cr.Spec.Wandb.Hostname != "http://new.example" || cr.Spec.Wandb.Version != "1.2.3" { + t.Fatalf("wandb fields not patched: %#v", cr.Spec.Wandb) + } + if cr.Spec.Wandb.ManifestRepository != "" { + t.Fatalf("manifest repository should be cleared for published source: %q", cr.Spec.Wandb.ManifestRepository) + } + if cr.Spec.Size != v2.SizeMicro || cr.Spec.RetentionPolicy.OnDelete != v2.PurgeOnDelete { + t.Fatalf("size/retention not patched: %q %q", cr.Spec.Size, cr.Spec.RetentionPolicy.OnDelete) + } + if cr.Spec.Networking.Mode != v2.NetworkingModeGatewayAPI || cr.Spec.Networking.Ingress != nil { + t.Fatalf("networking not patched to gateway: %#v", cr.Spec.Networking) + } +} + +func TestBuildCRUnreadableLicenseFileReturnsError(t *testing.T) { + _, err := BuildCR(Options{LicenseFile: filepath.Join(t.TempDir(), "missing-license")}) + if err == nil { + t.Fatal("expected license file error") + } +} + +func TestBuildCRHTTPSUsesGeneratedCAIssuer(t *testing.T) { + cr, err := BuildCR(Options{Hostname: "https://wandb.example"}) + if err != nil { + t.Fatal(err) + } + + if cr.Spec.Networking.TLS == nil || cr.Spec.Networking.TLS.CertManager == nil { + t.Fatalf("TLS config was not set") + } + if cr.Spec.Networking.TLS.SecretName != "wandb-tls-secret" { + t.Fatalf("secret name = %q", cr.Spec.Networking.TLS.SecretName) + } + if cr.Spec.Networking.TLS.CertManager.Issuer != "wandb-ca-issuer" { + t.Fatalf("issuer = %q", cr.Spec.Networking.TLS.CertManager.Issuer) + } +} + +func TestBuildCRHTTPSUsesExplicitIssuer(t *testing.T) { + cr, err := BuildCR(Options{ + Name: "custom", + Hostname: "https://wandb.example", + CreateCA: false, + CreateCASet: true, + IssuerName: "existing-issuer", + }) + if err != nil { + t.Fatal(err) + } + + if cr.Spec.Networking.TLS == nil || cr.Spec.Networking.TLS.CertManager == nil { + t.Fatalf("TLS config was not set") + } + if cr.Spec.Networking.TLS.SecretName != "custom-tls-secret" { + t.Fatalf("secret name = %q", cr.Spec.Networking.TLS.SecretName) + } + if cr.Spec.Networking.TLS.CertManager.Issuer != "existing-issuer" { + t.Fatalf("issuer = %q", cr.Spec.Networking.TLS.CertManager.Issuer) + } +} + +func TestRunWritesStableYAML(t *testing.T) { + out := filepath.Join(t.TempDir(), "generated", "wandb.yaml") + if err := Run(Options{OutPath: out}); err != nil { + t.Fatal(err) + } + + data, err := os.ReadFile(out) + if err != nil { + t.Fatal(err) + } + + var cr v2.WeightsAndBiases + if err := yaml.Unmarshal(data, &cr); err != nil { + t.Fatal(err) + } + if cr.Name != defaultName || cr.Spec.Networking.Mode != v2.NetworkingModeGatewayAPI { + t.Fatalf("unexpected generated CR: %s %#v", cr.Name, cr.Spec.Networking) + } + rendered := string(data) + if strings.Contains(rendered, "\nstatus:") || strings.Contains(rendered, "\noidc:") || strings.Contains(rendered, "manifestRepository") { + t.Fatalf("generated CR contains empty runtime/defaulted fields:\n%s", rendered) + } +} diff --git a/tilt-settings.sample.star b/tilt-settings.sample.star index bd93588e..ae0ada91 100644 --- a/tilt-settings.sample.star +++ b/tilt-settings.sample.star @@ -1,34 +1,42 @@ SETTINGS = { - "allowedContexts": ["docker-desktop", "minikube", "kind-kind", "kind-wandb-operator"], - "kindClusterName": "wandb-operator", - "installMinio": False, - "installTelemetry": True, - "installIngressNginx": True, - "logFormat": "pretty", + "allowedContexts": ["docker-desktop", "minikube", "kind-kind", "kind-wandb-operator", "orbstack"], + + # Operator install settings. + "operatorNamespace": "wandb-operators", + + # W&B instance settings. + "includeCR": True, + + # Optional base WeightsAndBiases CR YAML. Tilt patches it with the scalar + # settings below. + "crFile": "", + "wandbName": "wandb", + "wandbNamespace": "wandb", + "wandbHostname": "http://localhost:8080", + "wandbVersion": "0.80.0", + "size": "dev", + "retentionPolicy": "detach", + "licenseFile": "", + + # Default to the published server manifest repository. Use + # local mode only when developing against repo-local manifest definitions. + "manifestSource": "published", + "localManifestPath": "hack/testing-manifests/server-manifest", - # Path to the WandB CR manifest (default: generated by kustomize from base + overlays) - # Override this value to use your own non-kustomized CR - "wandbCR": "hack/testing-manifests/wandb/.generated/wandb-cr.yaml", - - # Optional path to a local license file. When set, Tilt reads the file and - # injects its trimmed content into spec.wandb.license in the generated CR. - # "licenseFile": "/absolute/path/to/license.txt", - - # Overlays to stack on top of the base CR (hack/testing-manifests/wandb/kustomize/overlays/) - # Available overrides: - # sizes: "size-small", "size-micro", - # local networking: "networking-ingress-local", "networking-gateway-local" - # retention policies: "purge-retention", - # server version: "server-version-0.78.0" - # external infra: "external-mysql", "external-redis", "external-kafka", - # "external-objectstore", "external-clickhouse" - # disable infra: "disable-mysql", "disable-redis", "disable-kafka", - # "disable-objectstore", "disable-clickhouse" - # Default (empty list) uses base which is size=dev, retentionPolicy.onDelete=detach, version=0.79.0 - "wandbOverlays": [], - - # For CRC (Red Hat OpenShift Local), add "crc-admin" to allowedContexts. - # CRC auto-enables openshiftSCC and configures the internal registry. - # Run hack/scripts/setup_crc.sh first to expose the registry and login Docker. - # "allowedContexts": ["crc-admin"], + # Choose the local networking path. Tilt installs the matching local + # dependency automatically: nginx-gateway-fabric for "gateway", or + # ingress-nginx for "ingress". Ingress mode defaults to + # http://wandb.localhost:8080 unless wandbHostname is set explicitly. + "networkMode": "gateway", + + # Defaults for the generated CR. These usually only need to change when + # matching an existing local GatewayClass or IngressClass. + "gatewayClass": "nginx", + "ingressClass": "nginx", + + # off, full, or forward. "full" enables VictoriaMetrics/Grafana operators + # and exposes local telemetry endpoint resources. + "observabilityMode": "off", + + "logFormat": "pretty", }