From 34d36627cef33f95bcb83518d1438ae265a7d53b Mon Sep 17 00:00:00 2001 From: kbhos Date: Fri, 12 Jun 2026 09:37:56 +0530 Subject: [PATCH 01/10] feat(AIP-3938): AI tier openshift support --- config/configs/applications.yaml | 125 +- config/configs/features/saia.yaml | 7 +- tools/cluster_setup/artifacts.yaml | 67 +- .../openshift-cluster-config.yaml | 105 ++ tools/cluster_setup/openshift_with_stack.sh | 1417 +++++++++++++++++ 5 files changed, 1571 insertions(+), 150 deletions(-) create mode 100644 tools/cluster_setup/openshift-cluster-config.yaml create mode 100755 tools/cluster_setup/openshift_with_stack.sh diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index a9a0869f..dbb0c5c6 100644 --- a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -34,123 +34,6 @@ applications: SERVICE_NAME: "ai_platform_models" SKIP_VERIFICATION: "true" USE_SYSTEM_PERMISSIONS: "true" - - args: - application_name: Gemma431bIt - deployment_configs: - LLMDeployment: - gpu_type_options_override: - H100: - autoscaling_config: - max_replicas: {{.Replicas.Gemma431bIt}} - min_replicas: {{.Replicas.Gemma431bIt}} - target_ongoing_requests: 6 - max_ongoing_requests: 8 - ray_actor_options: - num_gpus: 1 - L40S: - autoscaling_config: - max_replicas: {{.Replicas.Gemma431bIt}} - min_replicas: {{.Replicas.Gemma431bIt}} - target_ongoing_requests: 4 - max_ongoing_requests: 6 - ray_actor_options: - num_gpus: 2 - options: - autoscaling_config: - max_replicas: {{.Replicas.Gemma431bIt}} - min_replicas: {{.Replicas.Gemma431bIt}} - deployment_type: text_gen_model_deployment - gpu_types: '["{{.AcceleratorType}}"]' - model_definition: - gpu_type_model_config_override: - H100: - engine_args: - dtype: bfloat16 - gpu_memory_utilization: 0.9 - max_model_len: 32768 - max_num_batched_tokens: 4096 - tensor_parallel_size: 1 - L40S: - engine_args: - dtype: bfloat16 - gpu_memory_utilization: 0.85 - max_model_len: 120000 - max_num_batched_tokens: 4096 - max_num_seqs: 
2 - tensor_parallel_size: 2 - model_config: - openai_serving_config: - chat: - enable_auto_tools: true - reasoning_parser: gemma4 - tool_parser: gemma4 - responses: - enable_auto_tools: true - reasoning_parser: gemma4 - tool_parser: gemma4 - model_id: gemma4_31b_it - model_loader: - blob_storage: - blob_prefix: model_artifacts/gemma-4-31b-it - tokenizer_definition: - model_id: gemma4_31b_it - model_loader: - blob_storage: - artifacts_list: - - chat_template.jinja - - config.json - - processor_config.json - - tokenizer_config.json - - tokenizer.json - blob_prefix: model_artifacts/gemma-4-31b-it - name: Gemma431bIt - import_path: main:create_serve_app - route_prefix: /gemma4_31b_it - runtime_env: - working_dir: "file:///home/ray/ray/applications/generic_application.zip" - env_vars: - API_VERSION: "v1" - APPLICATION_NAME: gemma4_31b_it - VLLM_ATTENTION_BACKEND: TRITON_ATTN - ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" - S3_BUCKET: "{{.ArtifactBucketName}}" - ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" - CLOUD_PROVIDER: "{{.CloudProvider}}" - S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" - S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" - S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" - # AWS / boto3 standard credential names — populated whenever the - # operator can load credentials from spec.objectStorage.secretRef. For - # CLOUD_PROVIDER=aws these are the values boto3 reads (the S3COMPAT_* - # names above are only consumed by the s3compat shim). Both code paths - # share the same source-of-truth Secret keys (s3_access_key / - # s3_secret_key) so emitting both pairs is safe — each provider only - # reads its own. AWS_REGION lets boto3 resolve the default regional S3 - # endpoint when no AWS_ENDPOINT_URL is set; required for any AWS S3 - # bucket outside us-east-1 to avoid PermanentRedirect on the first call. 
- AWS_ACCESS_KEY_ID: "{{.S3CompatObjectStoreAccessKey}}" - AWS_SECRET_ACCESS_KEY: "{{.S3CompatObjectStoreSecretKey}}" - AWS_REGION: "{{.Region}}" - AWS_DEFAULT_REGION: "{{.Region}}" - ENABLE_AUTHN: "false" - ENABLE_AUTHZ: "false" - SERVICE_EXTERNAL_NAME: "ai-platform-models" - SERVICE_INTERNAL_NAME: "ai_platform_models" - SERVICE_NAME: "ai_platform_models" - SKIP_VERIFICATION: "true" - USE_SYSTEM_PERMISSIONS: "true" - VLLM_WORKER_MULTIPROC_METHOD: spawn - # Disable the Redis-backed Responses API store (see ai-platform-models - # commit c1f9aef3: "feat: add a no-op store"). When True, the vLLM - # TextGen deployment constructs NoOpOpenAIServingResponses instead of - # RedisOpenAIServingResponses, so /v1/responses works without a Redis - # infra. Without this flag the deployment raises - # RuntimeError: Responses Redis URL not set - # on every request, which surfaces as an empty SSE stream and the SAIA - # v2 /query path fails with "An error occurred processing your request". - # Airgap k0s has no Redis; cloud sets this to "False" and wires - # RESPONSES_REDIS_ADDRESS to its in-namespace Redis StatefulSet. 
- DISABLE_RESPONSES_API_REDIS: "True" - args: application_name: GptOss20b deployment_configs: @@ -162,6 +45,14 @@ applications: L40S: ray_actor_options: num_gpus: 1 + RTX_PRO_6000_BLACKWELL: + autoscaling_config: + max_replicas: {{.Replicas.GptOss20b}} + min_replicas: {{.Replicas.GptOss20b}} + target_ongoing_requests: 4 + max_ongoing_requests: 8 + ray_actor_options: + num_gpus: 1 options: autoscaling_config: max_replicas: {{.Replicas.GptOss20b}} diff --git a/config/configs/features/saia.yaml b/config/configs/features/saia.yaml index bfe5d96d..4e94b27d 100644 --- a/config/configs/features/saia.yaml +++ b/config/configs/features/saia.yaml @@ -4,7 +4,6 @@ applicationScale: CrossEncoder: 1 E5LanguageClassifier: 1 Entrypoint: 1 - Gemma431bIt: 1 GptOss20b: 1 MbartTranslator: 1 PromptInjectionClassifier: 1 @@ -23,4 +22,8 @@ instanceScale: h100-1-gpu: 2 H100_NVL: h100-nvl-0-gpu: 1 - h100-nvl-1-gpu: 2 \ No newline at end of file + h100-nvl-1-gpu: 2 + RTX_PRO_6000_BLACKWELL: + rtx-pro-6000-blackwell-0-gpu: 1 + rtx-pro-6000-blackwell-1-gpu: 1 + rtx-pro-6000-blackwell-2-gpu: 0 \ No newline at end of file diff --git a/tools/cluster_setup/artifacts.yaml b/tools/cluster_setup/artifacts.yaml index c6953e7b..f2347653 100644 --- a/tools/cluster_setup/artifacts.yaml +++ b/tools/cluster_setup/artifacts.yaml @@ -1061,11 +1061,18 @@ spec: items: description: FeatureSpec defines the features to enable in the AIPlatform properties: + env: + additionalProperties: + type: string + description: Env specifies environment variables to propagate + to the child AIService. + type: object name: description: Name of the feature, e.g. 
"saia" or "seca" enum: - saia - seca + - weaviate-service type: string scaleFactor: description: ScaleFactor is the desired fixed number of replicas @@ -2085,6 +2092,11 @@ spec: type: object x-kubernetes-map-type: atomic type: array + otelImage: + default: otel/opentelemetry-collector-contrib:0.122.1 + description: OTelImage is the OpenTelemetry Collector sidecar + image + type: string rayHeadGroupImage: description: Ray head group image, e.g. "rayproject/ray-head:latest" type: string @@ -2225,7 +2237,8 @@ spec: type: object objectStorage: description: |- - ObjectStorage defines the object storage configuration for AI artifacts, tasks, and models + ObjectStorage defines the object storage configuration for AI artifacts, tasks, and models. + It is optional for platforms that only enable features that do not require object storage. Supported providers: S3, GCS, Azure Blob Storage, MinIO properties: endpoint: @@ -2237,8 +2250,8 @@ spec: path: description: |- Remote volume URI in the format s3://bucketname/, gs://bucketname/, - azure://containername/, minio://bucketname/, seaweedfs://bucketname/, or s3compat://bucketname/ - pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ + azure://containername/, or minio://bucketname/ + pattern: ^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$ type: string region: description: Region of the remote storage volume. Required for @@ -2908,8 +2921,6 @@ spec: pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ type: string type: object - required: - - objectStorage type: object status: description: AIPlatformStatus defines observed state @@ -4084,11 +4095,18 @@ spec: features: description: Feature defines the features to be enabled for the AIService properties: + env: + additionalProperties: + type: string + description: Env specifies environment variables to propagate + to the child AIService. + type: object name: description: Name of the feature, e.g. 
"saia" or "seca" enum: - saia - seca + - weaviate-service type: string scaleFactor: description: ScaleFactor is the desired fixed number of replicas @@ -4866,27 +4884,15 @@ spec: properties: endpoint: description: |- - Optional override endpoint (only needed for S3-compatible services like MinIO, SeaweedFS) - Must be a valid HTTP/HTTPS URL. When set with s3:// path, backend is treated as S3-compatible (MinIO, SeaweedFS, etc.) + Optional override endpoint (only needed for S3-compatible services like MinIO) + Must be a valid HTTP/HTTPS URL pattern: ^https?://.*$ type: string path: description: |- Remote volume URI in the format s3://bucketname/, gs://bucketname/, - azure://containername/, s3compat://bucketname/ (generic S3-compatible), minio://, or seaweedfs:// - pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ - type: string - provider: - description: |- - Provider is an optional hint for documentation and tooling. Operator derives behavior from path scheme and endpoint. - Values: aws, minio, seaweedfs, s3compat, gcs, azure - enum: - - aws - - minio - - seaweedfs - - s3compat - - gcs - - azure + azure://containername/, or minio://bucketname/ + pattern: ^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$ type: string region: description: Region of the remote storage volume. Required for @@ -4894,8 +4900,7 @@ spec: minLength: 1 type: string secretRef: - description: Secret name containing storage credentials (e.g. 
- s3_access_key, s3_secret_key for S3-compatible backends) + description: Secret name containing storage credentials maxLength: 253 minLength: 1 type: string @@ -5682,19 +5687,19 @@ spec: fieldRef: fieldPath: metadata.name - name: RELATED_IMAGE_RAY_HEAD - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-head:build-v2-010 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-head:build-953 - name: RELATED_IMAGE_RAY_WORKER - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-worker-gpu:build-v2-010 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-worker-gpu:build-953 - name: RELATED_IMAGE_WEAVIATE value: docker.io/semitechnologies/weaviate:stable-v1.28-007846a + - name: RELATED_IMAGE_WEAVIATE_SERVICE + value: docker.io/semitechnologies/weaviate:stable-v1.28-007846a - name: RELATED_IMAGE_SAIA_API - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api:build-v2-012 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api:build-v2-main-c3b489d - name: RELATED_IMAGE_SAIA_API_V2 - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api-v2:build-v2-012 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api-v2:build-v2-main-c3b489d - name: RELATED_IMAGE_POST_INSTALL_HOOK - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-data-loader:build-v2-012 - - name: SPLUNK_METRICS_INDEX_NAME - value: _metrics + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-data-loader:build-v2-main-c3b489d - name: RELATED_IMAGE_FLUENT_BIT value: docker.io/fluent/fluent-bit:1.9.6 - name: RELATED_IMAGE_OTEL_COLLECTOR @@ -5705,7 +5710,7 @@ spec: value: v0.3.14-36-g1549f5a - name: RAY_VERSION value: 2.53.0 - image: 658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.29 + image: 
658391232643.dkr.ecr.us-east-2.amazonaws.com/kiran/splunk/splunk-ai-operator:openshift-0.1 livenessProbe: httpGet: path: /healthz diff --git a/tools/cluster_setup/openshift-cluster-config.yaml b/tools/cluster_setup/openshift-cluster-config.yaml new file mode 100644 index 00000000..2eb1c105 --- /dev/null +++ b/tools/cluster_setup/openshift-cluster-config.yaml @@ -0,0 +1,105 @@ +# ============================================================================= +# OpenShift Cluster Config for Splunk AI Platform +# ============================================================================= +# Used by openshift_with_stack.sh +# ============================================================================= + +kubernetes: + namespace: ai-platform + +# OpenShift-specific settings +openshift: + # Grant privileged SCC to Ray worker and operator service accounts. + # Required when running GPU workloads (nvidia.com/gpu requests). + # Set to "false" only if your cluster policy already covers this. + grantPrivilegedSCC: "true" + + # Node labeling for splunk.ai/* workload selectors. + # The operator schedules weaviate/ray-head on cpu nodes and Ray workers on gpu nodes. + # Use "auto" to detect by nvidia.com/gpu.present label (works when GPU Operator is installed). + # Use "manual" to specify node names explicitly below. + nodeLabelStrategy: "manual" + + # L40S nodes handle CPU workloads (weaviate, ray-head, saia-api). + # RTX 6000 Blackwell node is dedicated to GPU model pods (ray-worker). 
+ nodes: + cpu: + - 00-25-b5-b5-00-31 + - 00-25-b5-b5-00-33 + gpu: + - cc-40-f3-9f-e2-3c + +images: + # Registry prefix applied to images that are not fully qualified + registry: "658391232643.dkr.ecr.us-east-2.amazonaws.com" + + operator: + image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/kiran/splunk/splunk-ai-operator:openshift-0.1" + + ray: + headImage: "ml-platform/ray/ray-head:build-953" + workerImage: "ml-platform/ray/ray-worker-gpu:build-953" + + weaviate: + image: "docker.io/semitechnologies/weaviate:stable-v1.28-007846a" + + saia: + apiImage: "ml-platform/saia/saia-api:build-v2-main-c3b489d" + apiV2Image: "ml-platform/saia/saia-api-v2:build-v2-main-c3b489d" + dataLoaderImage: "ml-platform/saia/saia-data-loader:build-v2-main-c3b489d" + + splunk: + image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/splunk/splunk:10-2-ai-custom" + operatorImage: "docker.io/splunk/splunk-operator:3.0.0" + + fluentBit: + image: "docker.io/fluent/fluent-bit:1.9.6" + + otelCollector: + image: "docker.io/otel/opentelemetry-collector-contrib:0.122.1" + + nginx: + image: "docker.io/library/nginx:1.27-alpine" + +storage: + storageClass: "local-path" + vectorDbSize: "50Gi" + objectStore: + type: "minio" # aws | s3compat | minio | seaweedfs + bucket: "ai-platform-bucket" + endpoint: "http://18.116.39.79:8333" + auth: + rootUser: "minioadmin" + rootPassword: "minioadmin" + +splunk: + standaloneName: splunk-standalone + +aiPlatform: + name: "openshift-ai-platform" + defaultAcceleratorType: "RTX_PRO_6000_BLACKWELL" + workerGroupConfig: + imageRegistry: "" + serviceTemplate: + type: NodePort + nodePort: 30080 + features: + - name: "saia" + version: "1.1.0" + +operators: + ray: + modelVersion: "v0.3.14-36-g1549f5a" + rayVersion: "2.53.0" + +files: + aiPlatform: "./artifacts.yaml" + splunkOperator: "./splunk-operator-cluster.yaml" + +# ECR pull secret — created automatically in all relevant namespaces during install. +# Requires AWS credentials in the environment (e.g. 
AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY +# or an instance profile). Set enabled: false for non-ECR registries. +ecr: + enabled: true + account: "658391232643" + region: "us-east-2" diff --git a/tools/cluster_setup/openshift_with_stack.sh b/tools/cluster_setup/openshift_with_stack.sh new file mode 100755 index 00000000..ced27236 --- /dev/null +++ b/tools/cluster_setup/openshift_with_stack.sh @@ -0,0 +1,1417 @@ +#!/bin/bash +set -euo pipefail + +# ============================================================================= +# OpenShift Cluster Setup Script for Splunk AI Platform +# ============================================================================= +# Installs/removes the Splunk AI Operator stack onto an existing OpenShift +# cluster. Assumes you are already logged in via `oc login` or have a valid +# KUBECONFIG pointing at the cluster. +# +# Usage: +# ./openshift_with_stack.sh [install|delete] +# +# The script reads openshift-cluster-config.yaml in the same directory. +# Override with: CONFIG_FILE=/path/to/config.yaml ./openshift_with_stack.sh +# ============================================================================= + +export PAGER=cat +export LANG=C LC_ALL=C + +# ====== CONFIG FILE LOCATION ====== +CONFIG_FILE="${CONFIG_FILE:-$(dirname "$0")/openshift-cluster-config.yaml}" + +# ====== SESSION LOG ====== +LOG_DIR="${LOG_DIR:-$(dirname "$0")/logs}" +mkdir -p "${LOG_DIR}" +LOG_FILE="${LOG_DIR}/openshift-install-$(date '+%Y-%m-%d_%H-%M-%S').log" +exec > >(tee -a "${LOG_FILE}") 2>&1 +echo "[LOG] Session log: ${LOG_FILE}" + +# ====== COLORS & LOGGING ====== +log() { echo -e "\033[1;36m[INFO]\033[0m $*" >&2; } +warn() { echo -e "\033[1;33m[WARN]\033[0m $*" >&2; } +err() { echo -e "\033[1;31m[ERROR]\033[0m $*" >&2; exit 1; } +need() { command -v "$1" >/dev/null 2>&1 || err "Missing $1 in PATH"; } + +# ====== LOAD CONFIGURATION ====== +load_config() { + log "Loading configuration from: ${CONFIG_FILE}" + [[ -f "${CONFIG_FILE}" ]] || err "Config file not 
found: ${CONFIG_FILE}" + + if command -v yq >/dev/null 2>&1; then + local yq_err + if ! yq_err=$(yq eval '.' "${CONFIG_FILE}" 2>&1 >/dev/null); then + err "Config file ${CONFIG_FILE} has YAML syntax errors: +${yq_err}" + fi + fi + + AI_NS=$(yq eval '.kubernetes.namespace // "ai-platform"' "${CONFIG_FILE}" 2>/dev/null || echo "ai-platform") + IMAGE_REGISTRY=$(yq eval '.images.registry // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + OPERATOR_IMAGE=$(yq eval '.images.operator.image // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + RAY_HEAD_IMAGE=$(yq eval '.images.ray.headImage // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + RAY_WORKER_IMAGE=$(yq eval '.images.ray.workerImage // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + WEAVIATE_IMAGE=$(yq eval '.images.weaviate.image // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + SAIA_API_IMAGE=$(yq eval '.images.saia.apiImage // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + SAIA_API_V2_IMAGE=$(yq eval '.images.saia.apiV2Image // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + SAIA_DATALOADER_IMAGE=$(yq eval '.images.saia.dataLoaderImage // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + SPLUNK_IMAGE=$(yq eval '.images.splunk.image // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + SPLUNK_OPERATOR_IMAGE=$(yq eval '.images.splunk.operatorImage // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + FLUENT_BIT_IMAGE=$(yq eval '.images.fluentBit.image // "fluent/fluent-bit:1.9.6"' "${CONFIG_FILE}" 2>/dev/null || echo "fluent/fluent-bit:1.9.6") + OTEL_COLLECTOR_IMAGE=$(yq eval '.images.otelCollector.image // "otel/opentelemetry-collector-contrib:0.122.1"' "${CONFIG_FILE}" 2>/dev/null || echo "otel/opentelemetry-collector-contrib:0.122.1") + NGINX_IMAGE=$(yq eval '.images.nginx.image // "docker.io/library/nginx:1.27-alpine"' "${CONFIG_FILE}" 2>/dev/null || echo "docker.io/library/nginx:1.27-alpine") + MODEL_VERSION=$(yq eval '.operators.ray.modelVersion // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + RAY_RUNTIME_VERSION=$(yq eval 
'.operators.ray.rayVersion // "2.44.0"' "${CONFIG_FILE}" 2>/dev/null || echo "2.44.0") + SPLUNK_AI_FILE=$(yq eval '.files.aiPlatform // "./artifacts.yaml"' "${CONFIG_FILE}" 2>/dev/null || echo "./artifacts.yaml") + SPLUNK_OPERATOR_FILE=$(yq eval '.files.splunkOperator // "./splunk-operator-cluster.yaml"' "${CONFIG_FILE}" 2>/dev/null || echo "./splunk-operator-cluster.yaml") + + # OpenShift-specific + # Whether to grant the operator service account privileged SCC. + # Required for Ray worker pods that request nvidia.com/gpu resources. + GRANT_PRIVILEGED_SCC=$(yq eval '.openshift.grantPrivilegedSCC // "true"' "${CONFIG_FILE}" 2>/dev/null || echo "true") + + NODE_LABEL_STRATEGY=$(yq eval '.openshift.nodeLabelStrategy // "auto"' "${CONFIG_FILE}" 2>/dev/null || echo "auto") + + ECR_ENABLED=$(yq eval '.ecr.enabled // "false"' "${CONFIG_FILE}" 2>/dev/null || echo "false") + ECR_ACCOUNT=$(yq eval '.ecr.account // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + ECR_REGION=$(yq eval '.ecr.region // "us-east-2"' "${CONFIG_FILE}" 2>/dev/null || echo "us-east-2") + + AI_PLATFORM_NAME=$(yq eval '.aiPlatform.name // "openshift-ai-platform"' "${CONFIG_FILE}" 2>/dev/null || echo "openshift-ai-platform") + DEFAULT_ACCELERATOR=$(yq eval '.aiPlatform.defaultAcceleratorType // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + WORKER_IMAGE_REGISTRY=$(yq eval '.aiPlatform.workerGroupConfig.imageRegistry // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + STORAGE_CLASS=$(yq eval '.storage.storageClass // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + VECTORDB_SIZE=$(yq eval '.storage.vectorDbSize // "50Gi"' "${CONFIG_FILE}" 2>/dev/null || echo "50Gi") + OBJ_STORE_TYPE=$(yq eval '.storage.objectStore.type // "minio"' "${CONFIG_FILE}" 2>/dev/null || echo "minio") + OBJ_STORE_BUCKET=$(yq eval '.storage.objectStore.bucket // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + OBJ_STORE_ENDPOINT=$(yq eval '.storage.objectStore.endpoint // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + 
MINIO_ROOT_USER=$(yq eval '.storage.objectStore.auth.rootUser // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + MINIO_ROOT_PASSWORD=$(yq eval '.storage.objectStore.auth.rootPassword // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + AI_STANDALONE_NAME=$(yq eval '.splunk.standaloneName // "splunk-standalone"' "${CONFIG_FILE}" 2>/dev/null || echo "splunk-standalone") + + log "Configuration loaded: namespace=${AI_NS}, accelerator=${DEFAULT_ACCELERATOR}" +} + +# ====== IMAGE HELPERS ====== +build_image_url() { + local registry="$1" + local image_path="$2" + # If the image is already fully qualified (contains a registry host) return as-is + if [[ "$image_path" =~ ^([a-zA-Z0-9.-]+\.[a-zA-Z]{2,}|[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(:[0-9]+)?)/.*:.+ ]]; then + echo "$image_path" + return 0 + fi + if [[ -n "$registry" && "$registry" != "null" ]]; then + echo "${registry}/${image_path}" + else + echo "$image_path" + fi +} + +validate_image_config() { + log "Validating image configuration..." + [[ -z "$OPERATOR_IMAGE" || "$OPERATOR_IMAGE" == "null" ]] && err "REQUIRED: images.operator.image must be set in config" + [[ -z "$RAY_HEAD_IMAGE" || "$RAY_HEAD_IMAGE" == "null" ]] && err "REQUIRED: images.ray.headImage must be set in config" + [[ -z "$RAY_WORKER_IMAGE" || "$RAY_WORKER_IMAGE" == "null" ]] && err "REQUIRED: images.ray.workerImage must be set in config" + [[ -z "$WEAVIATE_IMAGE" || "$WEAVIATE_IMAGE" == "null" ]] && err "REQUIRED: images.weaviate.image must be set in config" + [[ -z "$SAIA_API_IMAGE" || "$SAIA_API_IMAGE" == "null" ]] && err "REQUIRED: images.saia.apiImage must be set in config" + [[ -z "$SAIA_API_V2_IMAGE" || "$SAIA_API_V2_IMAGE" == "null" ]] && err "REQUIRED: images.saia.apiV2Image must be set in config" + [[ -z "$SAIA_DATALOADER_IMAGE" || "$SAIA_DATALOADER_IMAGE" == "null" ]] && err "REQUIRED: images.saia.dataLoaderImage must be set in config" + [[ -z "$SPLUNK_IMAGE" || "$SPLUNK_IMAGE" == "null" ]] && err "REQUIRED: images.splunk.image must be set in config" + 
[[ -z "$MODEL_VERSION" || "$MODEL_VERSION" == "null" ]] && { MODEL_VERSION="v0.3.14-36-g1549f5a"; log "Using default MODEL_VERSION: $MODEL_VERSION"; } + log "✓ Image configuration validated" +} + +configure_images() { + log "Patching image references in manifest files..." + + [[ -f "${SPLUNK_AI_FILE}" ]] || err "Manifest not found: ${SPLUNK_AI_FILE}" + + if [[ ! -f "${SPLUNK_AI_FILE}.original" ]]; then + cp "$SPLUNK_AI_FILE" "${SPLUNK_AI_FILE}.original" + fi + cp "${SPLUNK_AI_FILE}.original" "$SPLUNK_AI_FILE" + + local operator_full ray_head_full ray_worker_full weaviate_full + local saia_api_full saia_api_v2_full saia_dataloader_full + local fluent_bit_full otel_collector_full nginx_full + + operator_full=$(build_image_url "$IMAGE_REGISTRY" "$OPERATOR_IMAGE") + ray_head_full=$(build_image_url "$IMAGE_REGISTRY" "$RAY_HEAD_IMAGE") + ray_worker_full=$(build_image_url "$IMAGE_REGISTRY" "$RAY_WORKER_IMAGE") + weaviate_full=$(build_image_url "$IMAGE_REGISTRY" "$WEAVIATE_IMAGE") + saia_api_full=$(build_image_url "$IMAGE_REGISTRY" "$SAIA_API_IMAGE") + saia_api_v2_full=$(build_image_url "$IMAGE_REGISTRY" "$SAIA_API_V2_IMAGE") + saia_dataloader_full=$(build_image_url "$IMAGE_REGISTRY" "$SAIA_DATALOADER_IMAGE") + fluent_bit_full=$(build_image_url "$IMAGE_REGISTRY" "$FLUENT_BIT_IMAGE") + otel_collector_full=$(build_image_url "$IMAGE_REGISTRY" "$OTEL_COLLECTOR_IMAGE") + nginx_full=$(build_image_url "$IMAGE_REGISTRY" "$NGINX_IMAGE") + + # BSD (macOS) sed requires an explicit backup-suffix arg after -i. 
+ local SED_INPLACE + if [[ "$OSTYPE" == "darwin"* ]]; then + SED_INPLACE=(sed -i "") + else + SED_INPLACE=(sed -i) + fi + + local ray_head_esc ray_worker_esc weaviate_esc saia_api_esc saia_api_v2_esc + local saia_dl_esc fluent_esc otel_esc nginx_esc operator_esc + + ray_head_esc=$(echo "$ray_head_full" | sed 's/[\/&]/\\&/g') + ray_worker_esc=$(echo "$ray_worker_full" | sed 's/[\/&]/\\&/g') + weaviate_esc=$(echo "$weaviate_full" | sed 's/[\/&]/\\&/g') + saia_api_esc=$(echo "$saia_api_full" | sed 's/[\/&]/\\&/g') + saia_api_v2_esc=$(echo "$saia_api_v2_full" | sed 's/[\/&]/\\&/g') + saia_dl_esc=$(echo "$saia_dataloader_full" | sed 's/[\/&]/\\&/g') + fluent_esc=$(echo "$fluent_bit_full" | sed 's/[\/&]/\\&/g') + otel_esc=$(echo "$otel_collector_full" | sed 's/[\/&]/\\&/g') + nginx_esc=$(echo "$nginx_full" | sed 's/[\/&]/\\&/g') + operator_esc=$(echo "$operator_full" | sed 's/[\/&]/\\&/g') + + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_RAY_HEAD/,/value:/ s|value:.*|value: ${ray_head_esc}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_RAY_WORKER/,/value:/ s|value:.*|value: ${ray_worker_esc}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_WEAVIATE/,/value:/ s|value:.*|value: ${weaviate_esc}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_SAIA_API$/,/value:/ s|value:.*|value: ${saia_api_esc}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_SAIA_API_V2/,/value:/ s|value:.*|value: ${saia_api_v2_esc}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_POST_INSTALL_HOOK/,/value:/ s|value:.*|value: ${saia_dl_esc}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_FLUENT_BIT/,/value:/ s|value:.*|value: ${fluent_esc}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_OTEL_COLLECTOR/,/value:/ s|value:.*|value: ${otel_esc}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RELATED_IMAGE_NGINX/,/value:/ s|value:.*|value: ${nginx_esc}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: 
MODEL_VERSION/,/value:/ s|value:.*|value: ${MODEL_VERSION}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "/name: RAY_VERSION/,/value:/ s|value:.*|value: ${RAY_RUNTIME_VERSION}|" "$SPLUNK_AI_FILE" + "${SED_INPLACE[@]}" "s|image: .*splunk.*ai.*operator.*|image: ${operator_esc}|I" "$SPLUNK_AI_FILE" + + log " ✓ RELATED_IMAGE_RAY_HEAD: $ray_head_full" + log " ✓ RELATED_IMAGE_RAY_WORKER: $ray_worker_full" + log " ✓ RELATED_IMAGE_WEAVIATE: $weaviate_full" + log " ✓ RELATED_IMAGE_SAIA_API: $saia_api_full" + log " ✓ RELATED_IMAGE_SAIA_API_V2: $saia_api_v2_full" + log " ✓ RELATED_IMAGE_POST_INSTALL_HOOK: $saia_dataloader_full" + log " ✓ RELATED_IMAGE_FLUENT_BIT: $fluent_bit_full" + log " ✓ RELATED_IMAGE_OTEL_COLLECTOR: $otel_collector_full" + log " ✓ RELATED_IMAGE_NGINX: $nginx_full" + log " ✓ Operator image: $operator_full" + log " ✓ MODEL_VERSION: $MODEL_VERSION" + log " ✓ RAY_VERSION: $RAY_RUNTIME_VERSION" +} + +# ====== PREFLIGHT CHECKS ====== +preflight_checks() { + log "Running preflight checks..." + + for tool in oc yq; do + command -v "$tool" >/dev/null 2>&1 && log " ✓ $tool found" || err "Missing $tool in PATH" + done + + # Verify we are connected to the cluster + if ! oc whoami &>/dev/null; then + err "Not logged in to OpenShift. Run: oc login " + fi + log " ✓ Logged in as: $(oc whoami)" + + # Verify cluster admin access (needed to install CRDs and grant SCCs) + if ! oc auth can-i create clusterrolebinding --all-namespaces &>/dev/null; then + warn " May not have cluster-admin; CRD and SCC operations might fail" + else + log " ✓ Cluster-admin access confirmed" + fi + + [[ -f "${SPLUNK_AI_FILE}" ]] && log " ✓ Manifest: ${SPLUNK_AI_FILE}" || err "Manifest not found: ${SPLUNK_AI_FILE}" + + log "Preflight checks passed" +} + +# ====== WAIT FOR CRD ====== +wait_for_crd() { + local crd_name="$1" + local timeout="${2:-300}" + log "Waiting for CRD ${crd_name} (timeout: ${timeout}s)..." + local elapsed=0 + while ! 
oc get crd "${crd_name}" >/dev/null 2>&1; do + sleep 5 + elapsed=$((elapsed + 5)) + if [[ ${elapsed} -ge ${timeout} ]]; then + err "Timeout waiting for CRD ${crd_name}" + fi + done + log " ✓ CRD ${crd_name} ready" +} + +# ====== ENSURE NAMESPACE ====== +ensure_namespace() { + local ns="$1" + if ! oc get namespace "${ns}" &>/dev/null; then + log "Creating namespace ${ns}..." + oc create namespace "${ns}" + fi +} + +# ====== OPENSHIFT: GRANT PRIVILEGED SCC ====== +# Ray worker pods request nvidia.com/gpu resources and run as non-root. +# On OpenShift the default restricted SCC blocks this — privileged SCC is needed. +grant_privileged_scc() { + if [[ "${GRANT_PRIVILEGED_SCC}" != "true" ]]; then + log "Skipping privileged SCC grant (openshift.grantPrivilegedSCC=false)" + return 0 + fi + + local ai_operator_ns="splunk-ai-operator-system" + log "Granting SCC policies to service account groups in ${ai_operator_ns} and ${AI_NS}..." + + # Use `oc adm policy add-scc-to-group` which modifies the SCC's groups list directly + # and is honored by OCP SCC admission (unlike ClusterRoleBinding which can be ignored). + # + # - privileged: operator namespace (webhook + leader election need elevated perms) + # - anyuid: AI platform namespace so operator-created SAs (saia-sa, weaviate, + # raycluster-*) run as the UID defined in their images, not OCP's random UID range. + # - privileged: also on AI platform so Splunk Standalone can write to hostPath PVCs. + oc adm policy add-scc-to-group privileged \ + "system:serviceaccounts:${ai_operator_ns}" 2>/dev/null || true + oc adm policy add-scc-to-group anyuid \ + "system:serviceaccounts:${AI_NS}" 2>/dev/null || true + oc adm policy add-scc-to-group privileged \ + "system:serviceaccounts:${AI_NS}" 2>/dev/null || true + # Splunk Operator pod adds NET_BIND_SERVICE capability which anyuid blocks; needs privileged. 
+    oc adm policy add-scc-to-group privileged \
+        "system:serviceaccounts:splunk-operator" 2>/dev/null || true
+
+    log " ✓ anyuid + privileged SCC granted to all SAs in ${AI_NS} and splunk-operator"
+}
+
+# ====== WAIT FOR AN OLM SUBSCRIPTION'S CSV ======
+# Polls until the ClusterServiceVersion installed by Subscription $2 in namespace $1
+# reaches phase Succeeded. $3 is a display name used in log lines.
+# The CSV is resolved through the Subscription's .status.installedCSV rather than
+# taking .items[0] of the namespace: OLM copies the CSVs of cluster-wide operators
+# into every namespace, so the first item may belong to a different operator.
+# Returns 0 on success, 1 after 36 attempts (10s apart).
+_wait_for_subscription_csv() {
+    local ns="$1" sub="$2" label="$3"
+    local attempts=36 attempt=0 csv phase
+
+    while (( attempt < attempts )); do
+        csv=$(oc get subscription "${sub}" -n "${ns}" \
+            -o jsonpath='{.status.installedCSV}' 2>/dev/null || echo "")
+        phase=""
+        if [[ -n "${csv}" ]]; then
+            phase=$(oc get csv "${csv}" -n "${ns}" \
+                -o jsonpath='{.status.phase}' 2>/dev/null || echo "")
+        fi
+        if [[ "${phase}" == "Succeeded" ]]; then
+            return 0
+        fi
+        sleep 10
+        attempt=$(( attempt + 1 ))
+        log " Waiting for ${label} CSV... (${attempt}/${attempts}, phase=${phase:-pending})"
+    done
+    return 1
+}
+
+# ====== LIST NODE NAMES BY LABEL SELECTOR ======
+# Prints one node name per line for label selector $1 (the "node/" prefix is stripped).
+# Prints nothing when the query fails or matches no node.
+_nodes_with_selector() {
+    oc get nodes -l "$1" -o name 2>/dev/null | sed 's|node/||' || true
+}
+
+# ====== INSTALL NFD (Node Feature Discovery) via OLM ======
+# NFD labels nodes with hardware capabilities including nvidia.com/gpu.present=true.
+# The GPU Operator depends on NFD labels to know which nodes to target.
+#
+# Idempotent per step: the Subscription and the NodeFeatureDiscovery CR are checked
+# separately, so a previous run that created the Subscription but stopped before
+# the CR was created is completed on re-run instead of being skipped.
+install_nfd() {
+    log "Installing Node Feature Discovery Operator (NFD)..."
+
+    local have_subscription=false have_instance=false
+    if oc get subscription nfd -n openshift-nfd &>/dev/null; then
+        have_subscription=true
+    fi
+    if oc get nodefeaturediscovery nfd-instance -n openshift-nfd &>/dev/null; then
+        have_instance=true
+    fi
+
+    if [[ "${have_subscription}" == "true" && "${have_instance}" == "true" ]]; then
+        log " ✓ NFD subscription and NodeFeatureDiscovery CR already exist, skipping"
+        return 0
+    fi
+
+    if [[ "${have_subscription}" != "true" ]]; then
+        oc apply -f - <<'EOF'
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: openshift-nfd
+---
+apiVersion: operators.coreos.com/v1
+kind: OperatorGroup
+metadata:
+  name: openshift-nfd
+  namespace: openshift-nfd
+spec:
+  targetNamespaces:
+    - openshift-nfd
+---
+apiVersion: operators.coreos.com/v1alpha1
+kind: Subscription
+metadata:
+  name: nfd
+  namespace: openshift-nfd
+spec:
+  channel: stable
+  name: nfd
+  source: redhat-operators
+  sourceNamespace: openshift-marketplace
+  installPlanApproval: Automatic
+EOF
+    fi
+
+    log "Waiting for NFD CSV to succeed..."
+    if _wait_for_subscription_csv openshift-nfd nfd "NFD"; then
+        log " ✓ NFD operator ready"
+    else
+        # Best effort, as before: keep going, but make the timeout visible.
+        warn "NFD CSV did not reach Succeeded in time — continuing; check: oc get csv -n openshift-nfd"
+    fi
+
+    # Create the NodeFeatureDiscovery CR to start labeling nodes
+    # NOTE(review): the operand image tag is pinned to v4.21 — confirm it matches the
+    # OpenShift version of the target cluster.
+    if [[ "${have_instance}" != "true" ]]; then
+        log "Creating NodeFeatureDiscovery CR..."
+        oc apply -f - <<'EOF'
+apiVersion: nfd.openshift.io/v1
+kind: NodeFeatureDiscovery
+metadata:
+  name: nfd-instance
+  namespace: openshift-nfd
+spec:
+  operand:
+    image: registry.redhat.io/openshift4/ose-node-feature-discovery-rhel9:v4.21
+    imagePullPolicy: Always
+  workerConfig:
+    configData: |
+      core:
+        sleepInterval: 60s
+      sources:
+        pci:
+          deviceClassWhitelist:
+            - "03"
+          deviceLabelFields:
+            - "vendor"
+EOF
+    fi
+
+    log " ✓ NFD installed"
+}
+
+# ====== INSTALL NVIDIA GPU OPERATOR via OLM ======
+# Installs driver, container toolkit, device plugin, and DCGM on GPU nodes.
+# Uses OCP Driver Toolkit (use_ocp_driver_toolkit: true) so no SSH to nodes needed.
+#
+# Idempotent per step: the function returns early only when both the Subscription
+# and the ClusterPolicy already exist; a missing ClusterPolicy is created even if
+# the Subscription is already there.
+install_nvidia_gpu_operator() {
+    log "Installing NVIDIA GPU Operator..."
+
+    local have_subscription=false have_policy=false
+    if oc get subscription gpu-operator-certified -n nvidia-gpu-operator &>/dev/null; then
+        have_subscription=true
+    fi
+    if oc get clusterpolicy gpu-cluster-policy &>/dev/null; then
+        have_policy=true
+    fi
+
+    if [[ "${have_subscription}" == "true" && "${have_policy}" == "true" ]]; then
+        log " ✓ GPU Operator subscription and ClusterPolicy already exist, skipping"
+        return 0
+    fi
+
+    if [[ "${have_subscription}" != "true" ]]; then
+        oc apply -f - <<'EOF'
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: nvidia-gpu-operator
+---
+apiVersion: operators.coreos.com/v1
+kind: OperatorGroup
+metadata:
+  name: nvidia-gpu-operator
+  namespace: nvidia-gpu-operator
+spec:
+  targetNamespaces:
+    - nvidia-gpu-operator
+---
+apiVersion: operators.coreos.com/v1alpha1
+kind: Subscription
+metadata:
+  name: gpu-operator-certified
+  namespace: nvidia-gpu-operator
+spec:
+  channel: v26.3
+  name: gpu-operator-certified
+  source: certified-operators
+  sourceNamespace: openshift-marketplace
+  installPlanApproval: Automatic
+EOF
+    fi
+
+    log "Waiting for GPU Operator CSV to succeed..."
+    if _wait_for_subscription_csv nvidia-gpu-operator gpu-operator-certified "GPU Operator"; then
+        log " ✓ GPU Operator CSV ready"
+    else
+        warn "GPU Operator CSV did not reach Succeeded in time — continuing; check: oc get csv -n nvidia-gpu-operator"
+    fi
+
+    # Create ClusterPolicy to trigger driver + toolkit + device-plugin rollout
+    if [[ "${have_policy}" != "true" ]]; then
+        log "Creating ClusterPolicy CR..."
+        oc apply -f - <<'EOF'
+apiVersion: nvidia.com/v1
+kind: ClusterPolicy
+metadata:
+  name: gpu-cluster-policy
+spec:
+  operator: {}
+  daemonsets: {}
+  driver:
+    enabled: true
+    use_ocp_driver_toolkit: true
+  toolkit:
+    enabled: true
+  devicePlugin:
+    enabled: true
+  dcgm:
+    enabled: true
+  dcgmExporter:
+    enabled: true
+  gfd:
+    enabled: true
+  nodeStatusExporter:
+    enabled: true
+  validator:
+    enabled: true
+EOF
+    fi
+
+    # Wait for nvidia.com/gpu.present=true to appear on at least one worker node.
+    # This confirms NFD + GFD have finished their discovery pass.
+    log "Waiting for GPU nodes to be labeled by GPU Operator / GFD..."
+    local retries=0 count labeled=false
+    while (( retries < 60 )); do
+        # `|| true` keeps a transient oc failure from aborting the script under pipefail.
+        count=$(oc get nodes -l nvidia.com/gpu.present=true --no-headers 2>/dev/null | wc -l | tr -d ' ' || true)
+        if (( ${count:-0} > 0 )); then
+            log " ✓ ${count} GPU node(s) labeled with nvidia.com/gpu.present=true"
+            labeled=true
+            break
+        fi
+        sleep 15
+        retries=$(( retries + 1 ))
+        log " Waiting for GPU node labels... (${retries}/60)"
+    done
+
+    if [[ "${labeled}" != "true" ]]; then
+        warn "GPU nodes not labeled after 15m — label_nodes will fall back to 0 GPU workers.
+ Check: oc get pods -n nvidia-gpu-operator
+ oc get clusterpolicy gpu-cluster-policy -o yaml"
+    fi
+
+    log " ✓ NVIDIA GPU Operator installed"
+}
+
+# ====== NODE LABELING ======
+# Applies splunk.ai/* labels that the operator uses to schedule workloads.
+# Without these labels all operator-managed pods (weaviate, ray-head, ray-worker)
+# will stay Pending forever because their nodeSelectors won't match any node.
+# Runs after install_nvidia_gpu_operator so nvidia.com/gpu.present=true is already set.
+#
+# Reads NODE_LABEL_STRATEGY ("auto" or "manual") and, for "manual", the node lists
+# under openshift.nodes.cpu / openshift.nodes.gpu in CONFIG_FILE.
+# Calls err when the strategy is unknown or when a worker node is still missing
+# splunk.ai/workload-type after labeling.
+label_nodes() {
+    log "Applying splunk.ai/* node labels (strategy: ${NODE_LABEL_STRATEGY})..."
+
+    local cpu_nodes=() gpu_nodes=() control_nodes=()
+    local node
+
+    # Always label master/control-plane nodes
+    while IFS= read -r node; do
+        [[ -n "${node}" ]] && control_nodes+=("${node}")
+    done < <(_nodes_with_selector node-role.kubernetes.io/master)
+    # Fall back to the control-plane role label when no node carries the master label.
+    if (( ${#control_nodes[@]} == 0 )); then
+        while IFS= read -r node; do
+            [[ -n "${node}" ]] && control_nodes+=("${node}")
+        done < <(_nodes_with_selector node-role.kubernetes.io/control-plane)
+    fi
+
+    case "${NODE_LABEL_STRATEGY}" in
+        auto)
+            # GPU nodes: detected by nvidia.com/gpu.present=true (set by NVIDIA GPU Operator / NFD)
+            while IFS= read -r node; do
+                [[ -n "${node}" ]] && gpu_nodes+=("${node}")
+            done < <(_nodes_with_selector 'nvidia.com/gpu.present=true,node-role.kubernetes.io/worker')
+
+            # CPU nodes: worker nodes without GPU label
+            while IFS= read -r node; do
+                [[ -n "${node}" ]] && cpu_nodes+=("${node}")
+            done < <(_nodes_with_selector '!nvidia.com/gpu.present,node-role.kubernetes.io/worker')
+            ;;
+
+        manual)
+            # Read both lists from the config with the same index-based yq lookups.
+            local kind count idx entry
+            for kind in cpu gpu; do
+                count=$(yq eval ".openshift.nodes.${kind} | length" "${CONFIG_FILE}" 2>/dev/null || echo "0")
+                idx=0
+                while [[ ${idx} -lt ${count} ]]; do
+                    entry=$(yq eval ".openshift.nodes.${kind}[${idx}]" "${CONFIG_FILE}" 2>/dev/null || echo "")
+                    if [[ -n "${entry}" && "${entry}" != "null" ]]; then
+                        if [[ "${kind}" == "cpu" ]]; then
+                            cpu_nodes+=("${entry}")
+                        else
+                            gpu_nodes+=("${entry}")
+                        fi
+                    fi
+                    idx=$((idx + 1))
+                done
+            done
+            ;;
+
+        *)
+            err "Unknown nodeLabelStrategy: ${NODE_LABEL_STRATEGY}. Use 'auto' or 'manual'."
+            ;;
+    esac
+
+    # The ${arr[@]+"${arr[@]}"} form expands to nothing for an empty array. A plain
+    # "${arr[@]}" on an empty array is an "unbound variable" error under `set -u`
+    # on bash older than 4.4, which would abort on any cluster without GPU nodes.
+
+    # Label control-plane nodes
+    for node in ${control_nodes[@]+"${control_nodes[@]}"}; do
+        log " Labeling control-plane node: ${node}"
+        oc label node "${node}" \
+            splunk.ai/node-role=controller \
+            splunk.ai/workload-type=control-plane \
+            --overwrite
+    done
+
+    # Label CPU worker nodes
+    for node in ${cpu_nodes[@]+"${cpu_nodes[@]}"}; do
+        log " Labeling CPU worker node: ${node}"
+        oc label node "${node}" \
+            splunk.ai/node-role=worker \
+            splunk.ai/workload-type=cpu \
+            splunk.ai/instance-type=cpu-worker \
+            --overwrite
+    done
+
+    # Label GPU worker nodes
+    for node in ${gpu_nodes[@]+"${gpu_nodes[@]}"}; do
+        log " Labeling GPU worker node: ${node}"
+        oc label node "${node}" \
+            splunk.ai/node-role=worker \
+            splunk.ai/workload-type=gpu \
+            splunk.ai/instance-type=gpu-worker \
+            --overwrite
+        # Taint GPU nodes so non-GPU workloads don't land on them
+        oc adm taint node "${node}" nvidia.com/gpu=true:NoSchedule --overwrite 2>/dev/null || true
+    done
+
+    # Verify no worker node is left unlabeled — unlabeled workers cause silent Pending forever.
+    # A "!key" label selector asks the API server directly for workers lacking the label,
+    # so the check no longer depends on python3 being installed.
+    local unlabeled
+    unlabeled=$(_nodes_with_selector 'node-role.kubernetes.io/worker,!splunk.ai/workload-type')
+
+    if [[ -n "${unlabeled}" ]]; then
+        err "Worker node(s) still missing splunk.ai/workload-type after labeling:
+$(echo "${unlabeled}" | sed 's/^/ /')
+
+If using nodeLabelStrategy: auto, check that the NVIDIA GPU Operator is installed
+and nodes have nvidia.com/gpu.present=true, or switch to nodeLabelStrategy: manual
+and list nodes explicitly under openshift.nodes.cpu / openshift.nodes.gpu in the config."
+    fi
+
+    log " ✓ Control-plane nodes: ${#control_nodes[@]}"
+    log " ✓ CPU worker nodes: ${#cpu_nodes[@]}"
+    log " ✓ GPU worker nodes: ${#gpu_nodes[@]}"
+    log "Node labeling complete"
+}
+
+# ====== INSTALL CERT-MANAGER ======
+# Applies the cert-manager v1.13.0 release manifest, grants anyuid to its three
+# service accounts, waits for the three deployments to roll out, then probes the
+# admission webhook until a test Issuer is accepted.
+# Skipped entirely when the cert-manager deployment already exists.
+install_cert_manager() {
+    log "Installing cert-manager..."
+
+    if oc get deployment cert-manager -n cert-manager &>/dev/null; then
+        log " ✓ cert-manager already installed, skipping"
+        return 0
+    fi
+
+    oc apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.0/cert-manager.yaml
+
+    # On OpenShift, cert-manager pods may need anyuid SCC. Grant it BEFORE waiting
+    # for readiness: if the pods do need it, waiting first can only time out.
+    local component
+    for component in cert-manager cert-manager-cainjector cert-manager-webhook; do
+        oc adm policy add-scc-to-user anyuid \
+            -z "${component}" -n cert-manager 2>/dev/null || true
+    done
+
+    # Wait on the deployments rather than on pods selected by label: right after
+    # `oc apply` the pods may not exist yet, and `oc wait` then fails immediately
+    # with "no matching resources found".
+    log "Waiting for cert-manager to be ready..."
+    for component in cert-manager cert-manager-cainjector cert-manager-webhook; do
+        oc rollout status "deployment/${component}" -n cert-manager --timeout=300s
+    done
+
+    log "Waiting for cert-manager webhook to be reachable with a valid TLS certificate..."
+    # The webhook endpoint being ready is not enough — the TLS cert has a notBefore
+    # timestamp ~30s in the future right after issuance. Probe by applying a test
+    # Issuer and retrying until the x509 clock-skew error clears.
+    # NOTE: heredoc inside $(...) is unreliable under set -euo pipefail; use a temp file.
+    # The mktemp template ends in the X's because BSD/macOS mktemp only substitutes
+    # trailing X characters; `oc apply -f <file>` does not need a .yaml extension.
+    local probe_file
+    probe_file=$(mktemp "${TMPDIR:-/tmp}/cert-manager-probe.XXXXXX")
+    cat > "${probe_file}" <<'EOF'
+apiVersion: cert-manager.io/v1
+kind: Issuer
+metadata:
+  name: cert-manager-webhook-probe
+  namespace: cert-manager
+spec:
+  selfSigned: {}
+EOF
+    local retries=0 out webhook_ready=false
+    while (( retries < 60 )); do
+        out=$(oc apply -f "${probe_file}" 2>&1) || true
+        if ! echo "${out}" | grep -q "x509: certificate\|failed to call webhook\|i/o timeout"; then
+            webhook_ready=true
+            break
+        fi
+        sleep 5
+        retries=$((retries + 1))
+        if (( retries % 6 == 0 )); then
+            log " Still waiting for cert-manager webhook TLS... (${retries}/60)"
+        fi
+    done
+    oc delete issuer cert-manager-webhook-probe -n cert-manager --ignore-not-found=true 2>/dev/null || true
+    rm -f "${probe_file}"
+
+    if [[ "${webhook_ready}" != "true" ]]; then
+        warn "cert-manager webhook still rejecting requests after 5m — later installs may need a retry"
+    fi
+    log " ✓ cert-manager installed"
+}
+
+# ====== INSTALL LOCAL-PATH PROVISIONER ======
+# k0s installs this as part of cluster setup. OpenShift has no default storage
+# class on bare-metal, so we install local-path-provisioner the same way.
+install_local_path_provisioner() {
+    if oc get storageclass 2>/dev/null | grep -q "(default)"; then
+        log " ✓ Default storage class already exists, skipping local-path install"
+        oc get storageclass
+        return 0
+    fi
+
+    log "Installing local-path-provisioner (no default storage class found)..."
+    oc apply -f https://raw.githubusercontent.com/rancher/local-path-provisioner/v0.0.26/deploy/local-path-storage.yaml
+
+    log "Waiting for local-path-provisioner to be ready..."
+    oc rollout status deployment local-path-provisioner -n local-path-storage --timeout=120s || true
+
+    log "Setting local-path as default storage class..."
+    oc patch storageclass local-path \
+        -p '{"metadata": {"annotations":{"storageclass.kubernetes.io/is-default-class":"true"}}}'
+
+    # The main provisioner pod and the helper pod it spawns both need privileged SCC.
+    # The main provisioner runs as local-path-provisioner-service-account.
+    # The helper pod runs as the namespace's default SA (no serviceAccountName set).
+    oc create clusterrolebinding local-path-provisioner-privileged \
+        --clusterrole=system:openshift:scc:privileged \
+        --serviceaccount=local-path-storage:local-path-provisioner-service-account \
+        2>/dev/null || true
+    oc create clusterrolebinding local-path-helper-privileged \
+        --clusterrole=system:openshift:scc:privileged \
+        --serviceaccount=local-path-storage:default \
+        2>/dev/null || true
+
+    # Patch the helper pod template to run privileged and relabel the created directory
+    # with container_file_t so containers can read/write it (SELinux on OpenShift).
+    # Without the chcon, directories get var_t which containers cannot access.
+    oc patch configmap local-path-config -n local-path-storage --type=merge -p "$(cat <<'PATCH'
+{
+  "data": {
+    "helperPod.yaml": "apiVersion: v1\nkind: Pod\nmetadata:\n  name: helper-pod\nspec:\n  priorityClassName: system-node-critical\n  tolerations:\n    - key: node.kubernetes.io/disk-pressure\n      operator: Exists\n      effect: NoSchedule\n  containers:\n  - name: helper-pod\n    image: busybox\n    imagePullPolicy: IfNotPresent\n    securityContext:\n      privileged: true\n",
+    "setup": "#!/bin/sh\nset -eu\nmkdir -m 0777 -p \"$VOL_DIR\"\nchcon -Rt container_file_t -l s0 \"$VOL_DIR\" 2>/dev/null || true\n"
+  }
+}
+PATCH
+    )"
+
+    # Restart the provisioner so it picks up the new helper pod template
+    oc rollout restart deployment local-path-provisioner -n local-path-storage
+    oc rollout status deployment local-path-provisioner -n local-path-storage --timeout=60s || true
+
+    log " ✓ local-path-provisioner installed and set as default storage class"
+}
+
+# ====== RELABEL WORKER NODE HOST PATHS FOR SELINUX ======
+# On OpenShift with SELinux enforcing, hostPath directories created by root get
+# var_t label which containers cannot access. Relabel to container_file_t:s0
+# (no MCS categories) so any container can read/write the volume.
+#
+# Runs on every node that carries neither the master nor the control-plane role
+# label. Each node is best effort: a failed `oc debug` is ignored.
+relabel_worker_nodes_for_selinux() {
+    log "Relabeling /opt/local-path-provisioner on worker nodes for SELinux..."
+    local workers node
+    workers=$(oc get nodes -l '!node-role.kubernetes.io/master,!node-role.kubernetes.io/control-plane' \
+        -o jsonpath='{.items[*].metadata.name}' 2>/dev/null)
+    # Unquoted on purpose: the jsonpath output is a space-separated list of names.
+    for node in ${workers}; do
+        log " Relabeling node ${node}..."
+        # First attempt uses an explicit debug image and the /host mount; the
+        # fallback uses the default debug image and chroots into the host.
+        oc debug "node/${node}" --image=registry.access.redhat.com/ubi8/ubi-minimal -- \
+            sh -c "mkdir -p /host/opt/local-path-provisioner && \
+                chcon -Rt container_file_t -l s0 /host/opt/local-path-provisioner/ 2>/dev/null || true; \
+                echo relabeled" 2>/dev/null || \
+        oc debug "node/${node}" -- \
+            chroot /host sh -c "mkdir -p /opt/local-path-provisioner && \
+                chcon -Rt container_file_t -l s0 /opt/local-path-provisioner/ 2>/dev/null || true" 2>/dev/null || true
+    done
+    log " ✓ SELinux labels set on worker nodes"
+}
+
+# ====== INSTALL OPENTELEMETRY OPERATOR ======
+# Installs the opentelemetry-operator Helm chart into opentelemetry-operator-system,
+# retrying while the cert-manager webhook is not serving yet, then waits for the
+# deployment rollout and the OpenTelemetryCollector CRD.
+install_otel_operator() {
+    log "Installing OpenTelemetry Operator..."
+
+    # Check the same deployment name that the rollout commands below operate on,
+    # so a re-run is actually skipped.
+    if oc get deployment opentelemetry-operator \
+        -n opentelemetry-operator-system &>/dev/null; then
+        log " ✓ OpenTelemetry Operator already installed, skipping"
+        return 0
+    fi
+
+    helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts 2>/dev/null || true
+    helm repo update open-telemetry 2>/dev/null || true
+
+    # Grant privileged SCC before pods start (runs as UID 65532 which is outside OCP's range).
+    # Created ahead of the Helm install: with --wait=false Helm returns while pods are
+    # still being scheduled. NOTE(review): relies on the binding being accepted before
+    # the service account exists — confirm on the target cluster.
+    oc create clusterrolebinding otel-operator-privileged \
+        --clusterrole=system:openshift:scc:privileged \
+        --serviceaccount=opentelemetry-operator-system:opentelemetry-operator \
+        2>/dev/null || true
+
+    local otel_retries=0 otel_out helm_rc installed=false
+    while (( otel_retries < 6 )); do
+        # Capture the exit code instead of letting `set -e` abort on a failed helm
+        # call: a webhook-not-ready failure is exactly what this loop retries.
+        helm_rc=0
+        otel_out=$(helm upgrade --install opentelemetry-operator open-telemetry/opentelemetry-operator \
+            --namespace opentelemetry-operator-system --create-namespace \
+            --set manager.collectorImage.repository=otel/opentelemetry-collector-contrib \
+            --set admissionWebhooks.certManager.enabled=true \
+            --wait=false --timeout=10m 2>&1) || helm_rc=$?
+        if echo "${otel_out}" | grep -q "x509: certificate\|failed to call webhook\|i/o timeout"; then
+            otel_retries=$((otel_retries + 1))
+            warn "cert-manager webhook not ready yet, waiting 10s (${otel_retries}/6)..."
+            sleep 10
+            continue
+        fi
+        echo "${otel_out}"
+        if (( helm_rc != 0 )); then
+            err "helm install of opentelemetry-operator failed (exit ${helm_rc})"
+        fi
+        installed=true
+        break
+    done
+
+    if [[ "${installed}" != "true" ]]; then
+        err "opentelemetry-operator could not be installed: cert-manager webhook never became ready"
+    fi
+
+    oc rollout status deployment opentelemetry-operator \
+        -n opentelemetry-operator-system --timeout=5m || \
+    oc rollout restart deployment opentelemetry-operator \
+        -n opentelemetry-operator-system
+
+    wait_for_crd opentelemetrycollectors.opentelemetry.io 300
+    log " ✓ OpenTelemetry Operator installed"
+}
+
+# ====== INSTALL KUBERAY OPERATOR ======
+# Installs the kuberay-operator Helm chart (version 1.2.2) into ray-system and
+# waits for the RayService and RayCluster CRDs. Skipped when the deployment exists.
+install_ray_operator() {
+    log "Installing KubeRay Operator..."
+
+    if oc get deployment kuberay-operator -n ray-system &>/dev/null; then
+        log " ✓ KubeRay Operator already installed, skipping"
+        return 0
+    fi
+
+    helm repo add kuberay https://ray-project.github.io/kuberay-helm/ 2>/dev/null || true
+    helm repo update kuberay
+
+    helm upgrade --install kuberay-operator kuberay/kuberay-operator \
+        --namespace ray-system --create-namespace \
+        --version 1.2.2 \
+        --set image.repository=quay.io/kuberay/operator \
+        --set image.tag=v1.2.2 \
+        --wait --timeout=10m
+
+    wait_for_crd rayservices.ray.io 300
+    wait_for_crd rayclusters.ray.io 300
+
+    log " ✓ KubeRay Operator installed"
+}
+
+# ====== ECR PULL SECRET ======
+# Creates ecr-registry-secret in every namespace that pulls ECR images.
+# Uses --dry-run=client | apply so it is idempotent (safe to re-run).
+ensure_ecr_pull_secret() {
+    if [[ "${ECR_ENABLED}" != "true" ]]; then
+        log "ECR pull secret disabled (ecr.enabled=false), skipping"
+        return 0
+    fi
+
+    log "Creating ECR pull secret (account=${ECR_ACCOUNT}, region=${ECR_REGION})..."
+
+    if ! aws sts get-caller-identity &>/dev/null; then
+        warn "AWS credentials not available — skipping ECR secret creation."
+        warn "Pods pulling from ECR will fail. Export AWS credentials and re-run install."
+        return 0
+    fi
+
+    local ecr_password
+    if ! ecr_password=$(aws ecr get-login-password --region "${ECR_REGION}" 2>/dev/null); then
+        warn "Failed to get ECR token — skipping secret creation"
+        return 0
+    fi
+
+    local server="${ECR_ACCOUNT}.dkr.ecr.${ECR_REGION}.amazonaws.com"
+    for ns in splunk-ai-operator-system "${AI_NS}"; do
+        ensure_namespace "${ns}"
+        oc create secret docker-registry ecr-registry-secret \
+            --docker-server="${server}" \
+            --docker-username=AWS \
+            --docker-password="${ecr_password}" \
+            --namespace="${ns}" \
+            --dry-run=client -o yaml | oc apply -f -
+
+        # Patch the default SA so pods without explicit imagePullSecrets also pull correctly
+        oc patch serviceaccount default -n "${ns}" \
+            -p '{"imagePullSecrets": [{"name": "ecr-registry-secret"}]}' 2>/dev/null || true
+
+        log " ✓ ecr-registry-secret created in ${ns}"
+    done
+
+    # Also patch the operator SA specifically
+    oc patch serviceaccount splunk-ai-operator-controller-manager \
+        -n splunk-ai-operator-system \
+        -p '{"imagePullSecrets": [{"name": "ecr-registry-secret"}]}' 2>/dev/null || true
+}
+
+# ====== INSTALL SPLUNK AI OPERATOR ======
+# Applies the operator manifest SPLUNK_AI_FILE (server-side apply, one retry when
+# cert-manager is not ready), injects the local instance.yaml as a ConfigMap mount,
+# wires the ECR pull secret when ECR_ENABLED is "true", restarts the deployment and
+# waits for the rollout and for the webhook service endpoints.
+# Returns 0 without doing anything when the manifest file does not exist.
+install_splunk_ai_operator() {
+    log "Installing Splunk AI Operator from ${SPLUNK_AI_FILE}..."
+
+    [[ -f "${SPLUNK_AI_FILE}" ]] || { warn "Manifest not found: ${SPLUNK_AI_FILE}"; return 0; }
+
+    local ai_operator_ns="splunk-ai-operator-system"
+    local operator_deploy="splunk-ai-operator-controller-manager"
+    ensure_namespace "${ai_operator_ns}"
+
+    # Grant SCCs before applying manifests so pods start on first attempt
+    grant_privileged_scc
+
+    log "Applying Splunk AI Operator manifests (server-side apply)..."
+    local apply_output
+    apply_output=$(oc apply --server-side --force-conflicts -f "${SPLUNK_AI_FILE}" 2>&1) || true
+    echo "${apply_output}"
+
+    # Retry if cert-manager webhook not ready OR if cert-manager CRD mapping was missing.
+    # Certificate/Issuer resources silently fail with "resource mapping not found" when
+    # cert-manager pods are up but CRDs haven't been registered in the API server yet.
+    if echo "${apply_output}" | grep -qi "webhook.*cert-manager\|failed calling webhook.*cert-manager\|i/o timeout\|resource mapping not found\|no matches for kind.*cert-manager"; then
+        warn "cert-manager CRDs not ready, waiting 20s and retrying full apply..."
+        sleep 20
+        oc apply --server-side --force-conflicts -f "${SPLUNK_AI_FILE}" 2>&1 || true
+    fi
+
+    # Inject the local instance.yaml so the operator knows about RTX_PRO_6000_BLACKWELL
+    # and other accelerators that may not be baked into the operator image.
+    local instance_src
+    instance_src="$(dirname "${SPLUNK_AI_FILE}")/../../config/configs/instance.yaml"
+    if [[ ! -f "${instance_src}" ]]; then
+        instance_src="$(cd "$(dirname "$0")/../.." && pwd)/config/configs/instance.yaml"
+    fi
+    if [[ -f "${instance_src}" ]]; then
+        oc create configmap splunk-ai-operator-instance-yaml \
+            -n "${ai_operator_ns}" \
+            --from-file=instance.yaml="${instance_src}" \
+            --dry-run=client -o yaml | oc -n "${ai_operator_ns}" apply -f -
+
+        # The JSON patch below appends to three lists, so applying it twice would
+        # add a duplicate volume. Skip it when the volume is already present, and
+        # report success only when the patch was really accepted.
+        local existing_volumes
+        existing_volumes=$(oc get deployment "${operator_deploy}" -n "${ai_operator_ns}" \
+            -o jsonpath='{.spec.template.spec.volumes[*].name}' 2>/dev/null || echo "")
+        if [[ " ${existing_volumes} " == *" instance-yaml "* ]]; then
+            log " ✓ instance.yaml volume already mounted in operator deployment"
+        # Mount the ConfigMap and set INSTANCE_FILE so the operator uses it
+        elif oc patch deployment "${operator_deploy}" \
+            -n "${ai_operator_ns}" --type=json -p='[
+              {"op":"add","path":"/spec/template/spec/volumes/-","value":{"name":"instance-yaml","configMap":{"name":"splunk-ai-operator-instance-yaml"}}},
+              {"op":"add","path":"/spec/template/spec/containers/0/volumeMounts/-","value":{"name":"instance-yaml","mountPath":"/etc/instance","readOnly":true}},
+              {"op":"add","path":"/spec/template/spec/containers/0/env/-","value":{"name":"INSTANCE_FILE","value":"/etc/instance/instance.yaml"}}
+            ]' 2>/dev/null; then
+            log " ✓ instance.yaml ConfigMap injected into operator"
+        else
+            warn "Could not patch ${operator_deploy} to mount instance.yaml — defaultAcceleratorType may not resolve"
+        fi
+    else
+        warn "instance.yaml not found at ${instance_src} — defaultAcceleratorType may not resolve"
+    fi
+
+    # Patch the operator SA and deployment with ECR pull secret AFTER the manifest apply
+    # (the SA is created by the manifest; patching before apply silently does nothing).
+    if [[ "${ECR_ENABLED}" == "true" ]]; then
+        oc patch serviceaccount splunk-ai-operator-controller-manager \
+            -n "${ai_operator_ns}" \
+            -p '{"imagePullSecrets": [{"name": "ecr-registry-secret"}]}' 2>/dev/null || true
+        oc patch deployment "${operator_deploy}" \
+            -n "${ai_operator_ns}" --type=json \
+            -p='[{"op":"add","path":"/spec/template/spec/imagePullSecrets","value":[{"name":"ecr-registry-secret"}]}]' \
+            2>/dev/null || true
+        log " ✓ ECR pull secret patched into operator SA and deployment"
+    fi
+
+    # Rollout restart so the deployment picks up pull secrets and instance.yaml.
+    oc rollout restart deployment "${operator_deploy}" \
+        -n "${ai_operator_ns}" 2>/dev/null || true
+
+    # Wait for operator deployment to be ready — use the deployment name directly,
+    # not a label selector, to avoid matching stale ReplicaSets.
+    # Each attempt waits up to 30s for the rollout and then sleeps 10s, so 40
+    # attempts give up after roughly 27 minutes in the worst case.
+    log "Waiting for Splunk AI Operator deployment to be ready..."
+    local retries=0 rolled_out=false terminating pod
+    while (( retries < 40 )); do
+        if oc rollout status "deployment/${operator_deploy}" \
+            -n "${ai_operator_ns}" --timeout=30s 2>/dev/null; then
+            rolled_out=true
+            break
+        fi
+        # If the pod is stuck terminating, force-delete it to unblock the rollout
+        terminating=$(oc get pods -n "${ai_operator_ns}" \
+            --field-selector=status.phase=Running \
+            -l control-plane=controller-manager \
+            -o jsonpath='{.items[?(@.metadata.deletionTimestamp)].metadata.name}' 2>/dev/null || true)
+        # The jsonpath result is a space-separated list; delete each pod by its own
+        # name instead of passing the whole list as one quoted argument.
+        for pod in ${terminating}; do
+            log " Force-deleting stuck terminating pod: ${pod}"
+            oc delete pod "${pod}" -n "${ai_operator_ns}" --grace-period=0 --force 2>/dev/null || true
+        done
+        sleep 10
+        retries=$((retries + 1))
+        if (( retries % 3 == 0 )); then
+            log " Waiting for operator... (${retries}/40)"
+        fi
+    done
+    if [[ "${rolled_out}" != "true" ]]; then
+        warn "Splunk AI Operator deployment did not finish rolling out — continuing"
+    fi
+
+    # Wait for the webhook service to have endpoints — the pod being Running is not
+    # enough; the API server needs to register the endpoint before we apply CRs.
+    log "Waiting for Splunk AI Operator webhook endpoint to be ready..."
+    local wh_retries=0 ep_count webhook_ready=false
+    while (( wh_retries < 60 )); do
+        # `|| true` keeps a failing `oc get` from aborting the script under pipefail.
+        ep_count=$(oc get endpoints splunk-ai-operator-webhook-service \
+            -n "${ai_operator_ns}" -o jsonpath='{.subsets[*].addresses}' 2>/dev/null | wc -w | tr -d ' ' || true)
+        if [[ "${ep_count:-0}" -gt 0 ]]; then
+            log " ✓ Webhook endpoint ready"
+            webhook_ready=true
+            break
+        fi
+        sleep 5
+        wh_retries=$((wh_retries + 1))
+        if (( wh_retries % 6 == 0 )); then
+            log " Still waiting for webhook endpoint... (${wh_retries}/60)"
+        fi
+    done
+    if [[ "${webhook_ready}" != "true" ]]; then
+        warn "Splunk AI Operator webhook service has no endpoints after 5m — CR applies may fail"
+    fi
+
+    log " ✓ Splunk AI Operator installed"
+}
+
+# ====== INSTALL SPLUNK OPERATOR ======
+# Creates (or force-replaces) the resources in SPLUNK_OPERATOR_FILE, grants the
+# privileged SCC to the splunk-operator service accounts, wires the ECR pull secret
+# into the operator deployment and waits for the Standalone CRD.
+# Returns 0 without doing anything when the manifest file does not exist.
+install_splunk_operator() {
+    log "Installing Splunk Operator..."
+
+    [[ -f "${SPLUNK_OPERATOR_FILE}" ]] || { warn "Splunk operator file not found: ${SPLUNK_OPERATOR_FILE}, skipping"; return 0; }
+
+    local splunk_operator_ns="splunk-operator"
+    ensure_namespace "${splunk_operator_ns}"
+
+    # NOTE(review): `oc replace --force` deletes and recreates every object in the
+    # manifest on each re-run — confirm that is intended for an existing install.
+    if oc create -f "${SPLUNK_OPERATOR_FILE}" 2>/dev/null; then
+        log " Splunk Operator resources created"
+    else
+        log " Resources already exist, updating..."
+        oc replace --force -f "${SPLUNK_OPERATOR_FILE}" 2>&1 | grep -v "Warning: --force is deprecated" || true
+    fi
+
+    # Create ECR pull secret in splunk-operator namespace.
+    # Done AFTER the manifest step: replace --force recreates the namespace, which
+    # would delete a secret created beforehand and leave the deployment below
+    # without a pull secret to reference.
+    if [[ "${ECR_ENABLED}" == "true" ]]; then
+        local ecr_password
+        if ecr_password=$(aws ecr get-login-password --region "${ECR_REGION}" 2>/dev/null); then
+            oc create secret docker-registry ecr-registry-secret \
+                --docker-server="${ECR_ACCOUNT}.dkr.ecr.${ECR_REGION}.amazonaws.com" \
+                --docker-username=AWS \
+                --docker-password="${ecr_password}" \
+                --namespace="${splunk_operator_ns}" \
+                --dry-run=client -o yaml | oc apply -f -
+        else
+            warn "Failed to get ECR token — Splunk Operator pull secret not created"
+        fi
+    fi
+
+    # Grant privileged SCC to the whole namespace group — this is the pattern OCP SCC admission
+    # actually honours. The operator pod adds NET_BIND_SERVICE which anyuid blocks; privileged
+    # covers both. group-based grant survives replace --force (which recreates the namespace).
+    oc adm policy add-scc-to-group privileged \
+        "system:serviceaccounts:${splunk_operator_ns}" 2>/dev/null || true
+    # Force pod recreation so it picks up the new SCC grant
+    oc delete replicaset -n "${splunk_operator_ns}" --all 2>/dev/null || true
+
+    # Patch deployment with pull secret if present
+    local dep_name
+    dep_name=$(oc -n "${splunk_operator_ns}" get deploy -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
+    if [[ -n "${dep_name}" ]] && oc get secret ecr-registry-secret -n "${splunk_operator_ns}" &>/dev/null; then
+        oc -n "${splunk_operator_ns}" patch deployment "${dep_name}" \
+            --type='json' \
+            -p='[{"op":"add","path":"/spec/template/spec/imagePullSecrets","value":[{"name":"ecr-registry-secret"}]}]' \
+            2>/dev/null || true
+        oc rollout restart deployment "${dep_name}" -n "${splunk_operator_ns}" 2>/dev/null || true
+    fi
+
+    wait_for_crd standalones.enterprise.splunk.com 300
+    log " ✓ Splunk Operator installed"
+}
+
+# ====== INSTALL SPLUNK STANDALONE ======
+install_splunk_standalone() {
+    log "Installing Splunk Standalone: ${AI_STANDALONE_NAME} in ${AI_NS}..."
+ + ensure_namespace "${AI_NS}" + wait_for_crd standalones.enterprise.splunk.com 600 + + # Object storage credentials secret + oc -n "${AI_NS}" create secret generic minio-credentials \ + --from-literal=AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" \ + --from-literal=AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" \ + --from-literal=s3_access_key="${MINIO_ROOT_USER}" \ + --from-literal=s3_secret_key="${MINIO_ROOT_PASSWORD}" \ + --from-literal=MINIO_ACCESS_KEY="${MINIO_ROOT_USER}" \ + --from-literal=MINIO_SECRET_KEY="${MINIO_ROOT_PASSWORD}" \ + --dry-run=client -o yaml | oc -n "${AI_NS}" apply -f - + + # Derive S3 endpoint for Splunk appRepo (endpoint is required by the Splunk Operator) + local minio_endpoint="${OBJ_STORE_ENDPOINT}" + if [[ -z "${minio_endpoint}" && "${OBJ_STORE_TYPE}" == "aws" ]]; then + minio_endpoint="https://s3.${ECR_REGION}.amazonaws.com" + log " type=aws: using S3 endpoint ${minio_endpoint}" + fi + [[ -z "${minio_endpoint}" ]] && err "storage.objectStore.endpoint must be set for type=${OBJ_STORE_TYPE}" + + oc apply --server-side --force-conflicts -f - </dev/null || true + oc delete pods -n "${AI_NS}" --field-selector status.phase=Failed --wait=false 2>/dev/null || true + + # Build imagePullSecrets block + local secrets_yaml="" + for secret_name in ecr-registry-secret; do + oc get secret "${secret_name}" -n "${AI_NS}" &>/dev/null && \ + secrets_yaml+=" - name: ${secret_name}"$'\n' + done + local image_pull_secrets="" + [[ -n "${secrets_yaml}" ]] && image_pull_secrets=" imagePullSecrets:"$'\n'"${secrets_yaml}" + + # Object storage path and endpoint + local obj_path obj_endpoint + case "${OBJ_STORE_TYPE}" in + aws) obj_path="s3://${OBJ_STORE_BUCKET}"; obj_endpoint="" ;; + s3compat) obj_path="s3compat://${OBJ_STORE_BUCKET}"; obj_endpoint="${OBJ_STORE_ENDPOINT}" ;; + minio) obj_path="minio://${OBJ_STORE_BUCKET}"; obj_endpoint="${OBJ_STORE_ENDPOINT}" ;; + seaweedfs) obj_path="seaweedfs://${OBJ_STORE_BUCKET}";obj_endpoint="${OBJ_STORE_ENDPOINT}" ;; + *) err 
"Unsupported objectStore.type: ${OBJ_STORE_TYPE}" ;; + esac + + # Features + local features_yaml="" + local feature_count + feature_count=$(yq eval '.aiPlatform.features | length' "${CONFIG_FILE}" 2>/dev/null || echo "0") + if [[ "${feature_count}" -gt 0 ]]; then + local i=0 + while [[ $i -lt $feature_count ]]; do + local fname fver + fname=$(yq eval ".aiPlatform.features[$i].name" "${CONFIG_FILE}") + fver=$(yq eval ".aiPlatform.features[$i].version // \"1.0.0\"" "${CONFIG_FILE}") + [[ -n "$fname" && "$fname" != "null" ]] && \ + features_yaml+=" - name: ${fname}"$'\n'" version: \"${fver}\""$'\n' + i=$((i + 1)) + done + else + features_yaml=" - name: saia"$'\n'" version: \"1.1.0\""$'\n' + fi + + # Service template + local svc_template_yaml="" + local svc_type + svc_type=$(yq eval '.aiPlatform.serviceTemplate.type // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + if [[ -n "${svc_type}" && "${svc_type}" != "null" && "${svc_type}" != "ClusterIP" ]]; then + local svc_node_port + svc_node_port=$(yq eval '.aiPlatform.serviceTemplate.nodePort // ""' "${CONFIG_FILE}" 2>/dev/null || echo "") + svc_template_yaml=" serviceTemplate:"$'\n'" spec:"$'\n'" type: ${svc_type}"$'\n' + if [[ -n "${svc_node_port}" && "${svc_type}" == "NodePort" ]]; then + svc_template_yaml+=" ports:"$'\n'" - name: http"$'\n'" port: 8080"$'\n'" targetPort: 8080"$'\n'" nodePort: ${svc_node_port}"$'\n' + fi + fi + + # The operator looks up splunk--secret for the HEC token. + # Extract it from the Splunk standalone secret created by the Splunk Operator. + local splunk_ns_secret="splunk-${AI_NS}-secret" + local standalone_secret="splunk-${AI_STANDALONE_NAME}-standalone-secret-v1" + log " Waiting for Splunk standalone secret ${standalone_secret}..." 
+ local retries=0 + while (( retries < 60 )); do + if oc get secret "${standalone_secret}" -n "${AI_NS}" &>/dev/null; then + local hec_token + hec_token=$(oc get secret "${standalone_secret}" -n "${AI_NS}" \ + -o jsonpath='{.data.hec_token}' 2>/dev/null || echo "") + if [[ -n "${hec_token}" ]]; then + oc -n "${AI_NS}" create secret generic "${splunk_ns_secret}" \ + --from-literal=hec_token="$(echo "${hec_token}" | base64 -d)" \ + --dry-run=client -o yaml | oc apply -f - + log " ✓ ${splunk_ns_secret} created" + break + fi + fi + sleep 10 + retries=$(( retries + 1 )) + log " Waiting for Splunk secret... (${retries}/60)" + done + if (( retries >= 60 )); then + warn "Splunk secret not ready after 10m — AIPlatform reconcile will retry automatically" + fi + + local storage_yaml="" + if [[ -n "${STORAGE_CLASS}" && "${STORAGE_CLASS}" != "null" ]]; then + storage_yaml=" storage:"$'\n'" vectorDB:"$'\n'" size: ${VECTORDB_SIZE}"$'\n'" storageClassName: ${STORAGE_CLASS}"$'\n' + fi + + # Probe the AIPlatform webhook TLS cert immediately before applying. + # cert-manager issues certs with notBefore ~30-60s in the future (clock skew); + # retry until the x509 error clears. Using --dry-run=server hits the exact + # same webhook (maiplatform-v1.kb.io) without creating anything. 
+ local ai_operator_ns="splunk-ai-operator-system" + local tls_probe_file + tls_probe_file=$(mktemp /tmp/aiplatform-tls-probe-XXXXXX.yaml) + cat > "${tls_probe_file}" <<'PROBE_EOF' +apiVersion: ai.splunk.com/v1 +kind: AIPlatform +metadata: + name: webhook-tls-probe + namespace: splunk-ai-operator-system +spec: + defaultAcceleratorType: L40S + objectStorage: + path: s3://probe/probe +PROBE_EOF + local tls_retries=0 + while (( tls_retries < 60 )); do + local tls_out + tls_out=$(oc apply --dry-run=server -f "${tls_probe_file}" 2>&1) || true + if echo "${tls_out}" | grep -q "x509:\|not yet valid\|certificate has expired\|failed to verify certificate\|failed to call webhook"; then + sleep 5 + tls_retries=$((tls_retries + 1)) + (( tls_retries % 6 == 0 )) && log " Still waiting for operator webhook TLS cert... (${tls_retries}/60)" + continue + fi + log " ✓ Operator webhook TLS certificate valid" + break + done + rm -f "${tls_probe_file}" 2>/dev/null || true + + oc -n "${AI_NS}" apply --server-side --force-conflicts -f - </dev/null 2>&1; do + sleep 5; elapsed=$((elapsed + 5)) + [[ ${elapsed} -ge ${timeout} ]] && { warn "Timeout waiting for AIPlatform CR"; break; } + done + + oc get aiplatform "${AI_PLATFORM_NAME}" -n "${AI_NS}" -o wide || true + log " ✓ AIPlatform CR installed" +} + +# ====== MAIN INSTALL ====== +main_install() { + log "============================================" + log " Splunk AI Platform — OpenShift Install" + log "============================================" + + load_config + preflight_checks + validate_image_config + configure_images + install_nfd + install_nvidia_gpu_operator + label_nodes + install_local_path_provisioner + relabel_worker_nodes_for_selinux + install_cert_manager + install_otel_operator + install_ray_operator + ensure_ecr_pull_secret + install_splunk_ai_operator + install_splunk_operator + install_splunk_standalone + install_ai_platform_cr + + log "============================================" + log " Install complete" + log 
"============================================" + log "" + log "Next steps:" + log " 1. Create an AIPlatform CR in namespace '${AI_NS}'" + log " 2. Check operator logs:" + log " oc logs -n splunk-ai-operator-system -l control-plane=controller-manager -f" + log " 3. Watch resources:" + log " oc get aiplatform,raycluster,rayservice -n ${AI_NS}" + log "" + log "Log file: ${LOG_FILE}" +} + +# ====== MAIN DELETE ====== +main_delete() { + log "============================================" + log " Splunk AI Platform — OpenShift Delete" + log "============================================" + + load_config + + if ! oc whoami &>/dev/null; then + err "Not logged in to OpenShift. Run: oc login " + fi + + local ai_operator_ns="splunk-ai-operator-system" + local splunk_operator_ns="splunk-operator" + + # ── 1. AI Platform CRs (trigger operator finalizers before namespace delete) ── + log "Removing AIPlatform CR and waiting for finalizers..." + oc delete aiplatform --all -n "${AI_NS}" --timeout=120s 2>/dev/null || true + oc delete standalone --all -n "${AI_NS}" --timeout=60s 2>/dev/null || true + + # ── 2. AI Platform namespace (cascades all pods, PVCs, services, etc.) ── + log "Deleting namespace ${AI_NS}..." + oc delete namespace "${AI_NS}" --timeout=180s 2>/dev/null || true + + # ── 3. Splunk AI Operator ── + log "Removing Splunk AI Operator..." + oc delete namespace "${ai_operator_ns}" --timeout=60s 2>/dev/null || true + # Remove cluster-scoped resources (CRDs, ClusterRoles, webhooks) from manifests + [[ -f "${SPLUNK_AI_FILE}" ]] && \ + oc delete -f "${SPLUNK_AI_FILE}" --ignore-not-found=true 2>/dev/null || true + + # ── 4. Splunk Operator ── + log "Removing Splunk Operator..." + oc delete namespace "${splunk_operator_ns}" --timeout=60s 2>/dev/null || true + [[ -f "${SPLUNK_OPERATOR_FILE}" ]] && \ + oc delete -f "${SPLUNK_OPERATOR_FILE}" --ignore-not-found=true 2>/dev/null || true + + # ── 5. KubeRay Operator (helm) ── + log "Removing KubeRay Operator..." 
+ helm uninstall kuberay-operator -n ray-system 2>/dev/null || true + oc delete namespace ray-system --timeout=60s 2>/dev/null || true + + # ── 6. OpenTelemetry Operator (helm) ── + log "Removing OpenTelemetry Operator..." + helm uninstall opentelemetry-operator -n opentelemetry-operator-system 2>/dev/null || true + oc delete namespace opentelemetry-operator-system --timeout=60s 2>/dev/null || true + + # ── 7. cert-manager (helm) ── + log "Removing cert-manager..." + helm uninstall cert-manager -n cert-manager 2>/dev/null || true + oc delete namespace cert-manager --timeout=60s 2>/dev/null || true + # Remove CRDs left by cert-manager (helm uninstall doesn't remove CRDs by default) + oc get crd -o name 2>/dev/null | grep cert-manager | xargs -r oc delete --ignore-not-found=true 2>/dev/null || true + + # ── 8. local-path-provisioner ── + log "Removing local-path-provisioner..." + oc delete -f https://raw.githubusercontent.com/rancher/local-path-provisioner/v0.0.26/deploy/local-path-storage.yaml \ + --ignore-not-found=true 2>/dev/null || true + oc delete namespace local-path-storage --timeout=60s 2>/dev/null || true + oc delete storageclass local-path --ignore-not-found=true 2>/dev/null || true + + # ── 9. NVIDIA GPU Operator ── + log "Removing NVIDIA GPU Operator..." + oc delete clusterpolicy gpu-cluster-policy --ignore-not-found=true 2>/dev/null || true + oc delete subscription gpu-operator-certified -n nvidia-gpu-operator --ignore-not-found=true 2>/dev/null || true + oc delete csv -n nvidia-gpu-operator --all --ignore-not-found=true 2>/dev/null || true + oc delete namespace nvidia-gpu-operator --timeout=60s 2>/dev/null || true + + # ── 10. NFD ── + log "Removing Node Feature Discovery..." 
+ oc delete nodefeaturediscovery nfd-instance -n openshift-nfd --ignore-not-found=true 2>/dev/null || true + oc delete subscription nfd -n openshift-nfd --ignore-not-found=true 2>/dev/null || true + oc delete csv -n openshift-nfd --all --ignore-not-found=true 2>/dev/null || true + oc delete namespace openshift-nfd --timeout=60s 2>/dev/null || true + + # ── 11. Node labels and taints added by label_nodes() ── + log "Removing splunk.ai/* node labels and GPU taint..." + for node in $(oc get nodes -l 'splunk.ai/workload-type' -o name 2>/dev/null); do + oc label "${node}" splunk.ai/workload-type- 2>/dev/null || true + oc taint "${node}" nvidia.com/gpu=true:NoSchedule- 2>/dev/null || true + done + + # ── 12. SCC grants added during install ── + if [[ "${GRANT_PRIVILEGED_SCC}" == "true" ]]; then + log "Removing SCC grants..." + oc adm policy remove-scc-from-group privileged \ + "system:serviceaccounts:${ai_operator_ns}" 2>/dev/null || true + oc adm policy remove-scc-from-group anyuid \ + "system:serviceaccounts:${AI_NS}" 2>/dev/null || true + oc adm policy remove-scc-from-group privileged \ + "system:serviceaccounts:${AI_NS}" 2>/dev/null || true + oc adm policy remove-scc-from-group privileged \ + "system:serviceaccounts:local-path-storage" 2>/dev/null || true + oc adm policy remove-scc-from-group privileged \ + "system:serviceaccounts:splunk-operator" 2>/dev/null || true + fi + + # Remove individual ClusterRoleBindings created during install + for crb in \ + local-path-provisioner-privileged \ + local-path-helper-privileged \ + splunk-standalone-privileged \ + splunk-operator-privileged \ + splunk-operator-anyuid \ + otel-operator-privileged \ + otel-operator-anyuid \ + scc-privileged-ai-platform-all \ + scc-privileged-splunk-ai-operator-system-default \ + scc-privileged-splunk-ai-operator-system-splunk-ai-operator-controller-manager; do + oc delete clusterrolebinding "${crb}" --ignore-not-found=true 2>/dev/null || true + done + + # ── 13. 
ECR pull secret ClusterRoleBindings ── + oc delete clusterrolebinding ecr-registry-secret-updater 2>/dev/null || true + + log "============================================" + log " Delete complete" + log "============================================" + log "" + log "Cluster itself is untouched — only the AI Platform stack was removed." + log "Log file: ${LOG_FILE}" +} + +# ====== USAGE ====== +usage() { + cat < + - oc, yq in PATH + - artifacts.yaml (operator manifests) in the same directory, or set files.aiPlatform in config +EOF +} + +# ====== MAIN ====== +case "${1:-install}" in + install) + main_install + ;; + delete) + main_delete + ;; + *) + usage + exit 1 + ;; +esac From 64420d827cebc45b36b860a9142b8c4c1d9bd47f Mon Sep 17 00:00:00 2001 From: kbhos Date: Mon, 15 Jun 2026 11:54:19 +0530 Subject: [PATCH 02/10] UPdate test to reflect Gemma431bIt removal --- pkg/ai/raybuilder/configmap_apps_test.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pkg/ai/raybuilder/configmap_apps_test.go b/pkg/ai/raybuilder/configmap_apps_test.go index 4beb1b4e..fbdad4ef 100644 --- a/pkg/ai/raybuilder/configmap_apps_test.go +++ b/pkg/ai/raybuilder/configmap_apps_test.go @@ -86,9 +86,11 @@ func Test_ApplicationsYAML_DisableResponsesRedis(t *testing.T) { } } - expectedTextGenApps := []string{"Gemma431bIt", "GptOss20b"} + expectedTextGenApps := []string{"GptOss20b"} - // We expect exactly two text-gen apps today (Gemma431bIt, GptOss20b). + // We expect exactly one text-gen app today (GptOss20b). Gemma431bIt was + // removed — it requires 2× RTX PRO 6000 GPUs and cannot run alongside + // GptOss20b on a single 2-GPU node. // If this count changes, someone added a new text-gen model; they MUST // also add DISABLE_RESPONSES_API_REDIS to the new app. 
require.Len(t, textGenApps, len(expectedTextGenApps), From 377ff1dfbf9a2d9aa4741e6a98d060bc17f3c1d5 Mon Sep 17 00:00:00 2001 From: kbhos Date: Mon, 15 Jun 2026 12:34:57 +0530 Subject: [PATCH 03/10] =?UTF-8?q?fix(AIP-3938):=20Revert=20artifacts.yaml?= =?UTF-8?q?=20to=20main=20=E2=80=94=20upstream=20changes=20leaked=20via=20?= =?UTF-8?q?stash?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tools/cluster_setup/artifacts.yaml | 67 ++++++++++++++---------------- 1 file changed, 31 insertions(+), 36 deletions(-) diff --git a/tools/cluster_setup/artifacts.yaml b/tools/cluster_setup/artifacts.yaml index f2347653..c6953e7b 100644 --- a/tools/cluster_setup/artifacts.yaml +++ b/tools/cluster_setup/artifacts.yaml @@ -1061,18 +1061,11 @@ spec: items: description: FeatureSpec defines the features to enable in the AIPlatform properties: - env: - additionalProperties: - type: string - description: Env specifies environment variables to propagate - to the child AIService. - type: object name: description: Name of the feature, e.g. "saia" or "seca" enum: - saia - seca - - weaviate-service type: string scaleFactor: description: ScaleFactor is the desired fixed number of replicas @@ -2092,11 +2085,6 @@ spec: type: object x-kubernetes-map-type: atomic type: array - otelImage: - default: otel/opentelemetry-collector-contrib:0.122.1 - description: OTelImage is the OpenTelemetry Collector sidecar - image - type: string rayHeadGroupImage: description: Ray head group image, e.g. "rayproject/ray-head:latest" type: string @@ -2237,8 +2225,7 @@ spec: type: object objectStorage: description: |- - ObjectStorage defines the object storage configuration for AI artifacts, tasks, and models. - It is optional for platforms that only enable features that do not require object storage. 
+ ObjectStorage defines the object storage configuration for AI artifacts, tasks, and models Supported providers: S3, GCS, Azure Blob Storage, MinIO properties: endpoint: @@ -2250,8 +2237,8 @@ spec: path: description: |- Remote volume URI in the format s3://bucketname/, gs://bucketname/, - azure://containername/, or minio://bucketname/ - pattern: ^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$ + azure://containername/, minio://bucketname/, seaweedfs://bucketname/, or s3compat://bucketname/ + pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ type: string region: description: Region of the remote storage volume. Required for @@ -2921,6 +2908,8 @@ spec: pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ type: string type: object + required: + - objectStorage type: object status: description: AIPlatformStatus defines observed state @@ -4095,18 +4084,11 @@ spec: features: description: Feature defines the features to be enabled for the AIService properties: - env: - additionalProperties: - type: string - description: Env specifies environment variables to propagate - to the child AIService. - type: object name: description: Name of the feature, e.g. "saia" or "seca" enum: - saia - seca - - weaviate-service type: string scaleFactor: description: ScaleFactor is the desired fixed number of replicas @@ -4884,15 +4866,27 @@ spec: properties: endpoint: description: |- - Optional override endpoint (only needed for S3-compatible services like MinIO) - Must be a valid HTTP/HTTPS URL + Optional override endpoint (only needed for S3-compatible services like MinIO, SeaweedFS) + Must be a valid HTTP/HTTPS URL. When set with s3:// path, backend is treated as S3-compatible (MinIO, SeaweedFS, etc.) 
pattern: ^https?://.*$ type: string path: description: |- Remote volume URI in the format s3://bucketname/, gs://bucketname/, - azure://containername/, or minio://bucketname/ - pattern: ^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$ + azure://containername/, s3compat://bucketname/ (generic S3-compatible), minio://, or seaweedfs:// + pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ + type: string + provider: + description: |- + Provider is an optional hint for documentation and tooling. Operator derives behavior from path scheme and endpoint. + Values: aws, minio, seaweedfs, s3compat, gcs, azure + enum: + - aws + - minio + - seaweedfs + - s3compat + - gcs + - azure type: string region: description: Region of the remote storage volume. Required for @@ -4900,7 +4894,8 @@ spec: minLength: 1 type: string secretRef: - description: Secret name containing storage credentials + description: Secret name containing storage credentials (e.g. + s3_access_key, s3_secret_key for S3-compatible backends) maxLength: 253 minLength: 1 type: string @@ -5687,19 +5682,19 @@ spec: fieldRef: fieldPath: metadata.name - name: RELATED_IMAGE_RAY_HEAD - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-head:build-953 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-head:build-v2-010 - name: RELATED_IMAGE_RAY_WORKER - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-worker-gpu:build-953 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-worker-gpu:build-v2-010 - name: RELATED_IMAGE_WEAVIATE value: docker.io/semitechnologies/weaviate:stable-v1.28-007846a - - name: RELATED_IMAGE_WEAVIATE_SERVICE - value: docker.io/semitechnologies/weaviate:stable-v1.28-007846a - name: RELATED_IMAGE_SAIA_API - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api:build-v2-main-c3b489d + value: 
658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api:build-v2-012 - name: RELATED_IMAGE_SAIA_API_V2 - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api-v2:build-v2-main-c3b489d + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api-v2:build-v2-012 - name: RELATED_IMAGE_POST_INSTALL_HOOK - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-data-loader:build-v2-main-c3b489d + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-data-loader:build-v2-012 + - name: SPLUNK_METRICS_INDEX_NAME + value: _metrics - name: RELATED_IMAGE_FLUENT_BIT value: docker.io/fluent/fluent-bit:1.9.6 - name: RELATED_IMAGE_OTEL_COLLECTOR @@ -5710,7 +5705,7 @@ spec: value: v0.3.14-36-g1549f5a - name: RAY_VERSION value: 2.53.0 - image: 658391232643.dkr.ecr.us-east-2.amazonaws.com/kiran/splunk/splunk-ai-operator:openshift-0.1 + image: 658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.29 livenessProbe: httpGet: path: /healthz From 87982bb5332e2f6756254d716f2b84b630bd53f5 Mon Sep 17 00:00:00 2001 From: kbhos Date: Mon, 15 Jun 2026 16:13:02 +0530 Subject: [PATCH 04/10] successfully deploy gpt-oss20b model --- config/configs/features/saia.yaml | 2 +- config/configs/instance.yaml | 33 ++++++++- tools/cluster_setup/artifacts.yaml | 67 ++++++++++--------- .../openshift-cluster-config.yaml | 2 +- 4 files changed, 70 insertions(+), 34 deletions(-) diff --git a/config/configs/features/saia.yaml b/config/configs/features/saia.yaml index 4e94b27d..aea4a115 100644 --- a/config/configs/features/saia.yaml +++ b/config/configs/features/saia.yaml @@ -25,5 +25,5 @@ instanceScale: h100-nvl-1-gpu: 2 RTX_PRO_6000_BLACKWELL: rtx-pro-6000-blackwell-0-gpu: 1 - rtx-pro-6000-blackwell-1-gpu: 1 + rtx-pro-6000-blackwell-1-gpu: 2 rtx-pro-6000-blackwell-2-gpu: 0 \ No newline at end of file diff --git a/config/configs/instance.yaml b/config/configs/instance.yaml index 
71ea8e78..3183dbf9 100644 --- a/config/configs/instance.yaml +++ b/config/configs/instance.yaml @@ -85,4 +85,35 @@ H100_NVL: cpu: "16" memory: "48Gi" ephemeral-storage: "100Gi" - nvidia.com/gpu: "1" \ No newline at end of file + nvidia.com/gpu: "1" +RTX_PRO_6000_BLACKWELL: + - tier: rtx-pro-6000-blackwell-0-gpu + gpusPerPod: 0 + resources: + limits: + cpu: "16" + memory: "24Gi" + ephemeral-storage: "50Gi" + nvidia.com/gpu: "0" + requests: + cpu: "4" + - tier: rtx-pro-6000-blackwell-1-gpu + gpusPerPod: 1 + resources: + requests: + cpu: "4" + limits: + cpu: "16" + memory: "48Gi" + ephemeral-storage: "200Gi" + nvidia.com/gpu: "1" + - tier: rtx-pro-6000-blackwell-2-gpu + gpusPerPod: 2 + resources: + requests: + cpu: "4" + limits: + cpu: "8" + memory: "96Gi" + ephemeral-storage: "400Gi" + nvidia.com/gpu: "2" \ No newline at end of file diff --git a/tools/cluster_setup/artifacts.yaml b/tools/cluster_setup/artifacts.yaml index c6953e7b..cb480386 100644 --- a/tools/cluster_setup/artifacts.yaml +++ b/tools/cluster_setup/artifacts.yaml @@ -1061,11 +1061,18 @@ spec: items: description: FeatureSpec defines the features to enable in the AIPlatform properties: + env: + additionalProperties: + type: string + description: Env specifies environment variables to propagate + to the child AIService. + type: object name: description: Name of the feature, e.g. "saia" or "seca" enum: - saia - seca + - weaviate-service type: string scaleFactor: description: ScaleFactor is the desired fixed number of replicas @@ -2085,6 +2092,11 @@ spec: type: object x-kubernetes-map-type: atomic type: array + otelImage: + default: otel/opentelemetry-collector-contrib:0.122.1 + description: OTelImage is the OpenTelemetry Collector sidecar + image + type: string rayHeadGroupImage: description: Ray head group image, e.g. 
"rayproject/ray-head:latest" type: string @@ -2225,7 +2237,8 @@ spec: type: object objectStorage: description: |- - ObjectStorage defines the object storage configuration for AI artifacts, tasks, and models + ObjectStorage defines the object storage configuration for AI artifacts, tasks, and models. + It is optional for platforms that only enable features that do not require object storage. Supported providers: S3, GCS, Azure Blob Storage, MinIO properties: endpoint: @@ -2237,8 +2250,8 @@ spec: path: description: |- Remote volume URI in the format s3://bucketname/, gs://bucketname/, - azure://containername/, minio://bucketname/, seaweedfs://bucketname/, or s3compat://bucketname/ - pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ + azure://containername/, or minio://bucketname/ + pattern: ^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$ type: string region: description: Region of the remote storage volume. Required for @@ -2908,8 +2921,6 @@ spec: pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ type: string type: object - required: - - objectStorage type: object status: description: AIPlatformStatus defines observed state @@ -4084,11 +4095,18 @@ spec: features: description: Feature defines the features to be enabled for the AIService properties: + env: + additionalProperties: + type: string + description: Env specifies environment variables to propagate + to the child AIService. + type: object name: description: Name of the feature, e.g. "saia" or "seca" enum: - saia - seca + - weaviate-service type: string scaleFactor: description: ScaleFactor is the desired fixed number of replicas @@ -4866,27 +4884,15 @@ spec: properties: endpoint: description: |- - Optional override endpoint (only needed for S3-compatible services like MinIO, SeaweedFS) - Must be a valid HTTP/HTTPS URL. When set with s3:// path, backend is treated as S3-compatible (MinIO, SeaweedFS, etc.) 
+ Optional override endpoint (only needed for S3-compatible services like MinIO) + Must be a valid HTTP/HTTPS URL pattern: ^https?://.*$ type: string path: description: |- Remote volume URI in the format s3://bucketname/, gs://bucketname/, - azure://containername/, s3compat://bucketname/ (generic S3-compatible), minio://, or seaweedfs:// - pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ - type: string - provider: - description: |- - Provider is an optional hint for documentation and tooling. Operator derives behavior from path scheme and endpoint. - Values: aws, minio, seaweedfs, s3compat, gcs, azure - enum: - - aws - - minio - - seaweedfs - - s3compat - - gcs - - azure + azure://containername/, or minio://bucketname/ + pattern: ^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$ type: string region: description: Region of the remote storage volume. Required for @@ -4894,8 +4900,7 @@ spec: minLength: 1 type: string secretRef: - description: Secret name containing storage credentials (e.g. 
- s3_access_key, s3_secret_key for S3-compatible backends) + description: Secret name containing storage credentials maxLength: 253 minLength: 1 type: string @@ -5682,19 +5687,19 @@ spec: fieldRef: fieldPath: metadata.name - name: RELATED_IMAGE_RAY_HEAD - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-head:build-v2-010 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-head:build-953 - name: RELATED_IMAGE_RAY_WORKER - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-worker-gpu:build-v2-010 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-worker-gpu:build-953 - name: RELATED_IMAGE_WEAVIATE value: docker.io/semitechnologies/weaviate:stable-v1.28-007846a + - name: RELATED_IMAGE_WEAVIATE_SERVICE + value: docker.io/semitechnologies/weaviate:stable-v1.28-007846a - name: RELATED_IMAGE_SAIA_API - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api:build-v2-012 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api:build-v2-main-c3b489d - name: RELATED_IMAGE_SAIA_API_V2 - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api-v2:build-v2-012 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api-v2:build-v2-main-c3b489d - name: RELATED_IMAGE_POST_INSTALL_HOOK - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-data-loader:build-v2-012 - - name: SPLUNK_METRICS_INDEX_NAME - value: _metrics + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-data-loader:build-v2-main-c3b489d - name: RELATED_IMAGE_FLUENT_BIT value: docker.io/fluent/fluent-bit:1.9.6 - name: RELATED_IMAGE_OTEL_COLLECTOR @@ -5705,7 +5710,7 @@ spec: value: v0.3.14-36-g1549f5a - name: RAY_VERSION value: 2.53.0 - image: 658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.29 + image: 
658391232643.dkr.ecr.us-east-2.amazonaws.com/kiran/splunk/splunk-ai-operator:openshift-0.2 livenessProbe: httpGet: path: /healthz diff --git a/tools/cluster_setup/openshift-cluster-config.yaml b/tools/cluster_setup/openshift-cluster-config.yaml index 2eb1c105..6eaab469 100644 --- a/tools/cluster_setup/openshift-cluster-config.yaml +++ b/tools/cluster_setup/openshift-cluster-config.yaml @@ -34,7 +34,7 @@ images: registry: "658391232643.dkr.ecr.us-east-2.amazonaws.com" operator: - image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/kiran/splunk/splunk-ai-operator:openshift-0.1" + image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/kiran/splunk/splunk-ai-operator:openshift-0.2" ray: headImage: "ml-platform/ray/ray-head:build-953" From b8ac604b8b5cc5416d5fe1ac493c4062f1b376bc Mon Sep 17 00:00:00 2001 From: kbhos Date: Tue, 16 Jun 2026 09:59:31 +0530 Subject: [PATCH 05/10] misc --- config/configs/applications.yaml | 66 ++++ config/configs/features/saia.yaml | 4 +- tools/cluster_setup/artifacts.yaml | 2 +- .../openshift-cluster-config.yaml | 4 +- tools/cluster_setup/openshift_with_stack.sh | 355 +++++++++++++++++- 5 files changed, 416 insertions(+), 15 deletions(-) diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index dbb0c5c6..eb29355d 100644 --- a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -53,6 +53,9 @@ applications: max_ongoing_requests: 8 ray_actor_options: num_gpus: 1 + resources: + "gpu_count:2": 0.001 + "accelerator_type:RTX_PRO_6000_BLACKWELL": 0.001 options: autoscaling_config: max_replicas: {{.Replicas.GptOss20b}} @@ -142,6 +145,12 @@ applications: L40S: ray_actor_options: num_gpus: 0.075 + RTX_PRO_6000_BLACKWELL: + ray_actor_options: + num_gpus: 0.031 + resources: + "gpu_count:2": 0.001 + "accelerator_type:RTX_PRO_6000_BLACKWELL": 0.001 options: autoscaling_config: max_replicas: {{.Replicas.UaeLarge}} @@ -157,6 +166,9 @@ applications: L40S: engine_args: gpu_memory_utilization: 0.075 + 
RTX_PRO_6000_BLACKWELL: + engine_args: + gpu_memory_utilization: 0.031 model_config: engine_args: gpu_memory_utilization: 0.15 @@ -208,6 +220,12 @@ applications: H100: ray_actor_options: num_gpus: 0.005 + RTX_PRO_6000_BLACKWELL: + ray_actor_options: + num_gpus: 0.004 + resources: + "gpu_count:2": 0.001 + "accelerator_type:RTX_PRO_6000_BLACKWELL": 0.001 options: autoscaling_config: max_replicas: {{.Replicas.AllMinilmL6V2}} @@ -220,6 +238,9 @@ applications: H100: engine_args: gpu_memory_utilization: 0.005 + RTX_PRO_6000_BLACKWELL: + engine_args: + gpu_memory_utilization: 0.004 model_config: engine_args: gpu_memory_utilization: 0.01 @@ -271,6 +292,12 @@ applications: H100: ray_actor_options: num_gpus: 0.005 + RTX_PRO_6000_BLACKWELL: + ray_actor_options: + num_gpus: 0.004 + resources: + "gpu_count:2": 0.001 + "accelerator_type:RTX_PRO_6000_BLACKWELL": 0.001 options: autoscaling_config: max_replicas: {{.Replicas.BiEncoder}} @@ -338,6 +365,12 @@ applications: L40S: ray_actor_options: num_gpus: 0.1 + RTX_PRO_6000_BLACKWELL: + ray_actor_options: + num_gpus: 0.05 + resources: + "gpu_count:2": 0.001 + "accelerator_type:RTX_PRO_6000_BLACKWELL": 0.001 options: autoscaling_config: max_replicas: {{.Replicas.MbartTranslator}} @@ -391,6 +424,12 @@ applications: L40S: ray_actor_options: num_gpus: 0.05 + RTX_PRO_6000_BLACKWELL: + ray_actor_options: + num_gpus: 0.021 + resources: + "gpu_count:2": 0.001 + "accelerator_type:RTX_PRO_6000_BLACKWELL": 0.001 options: autoscaling_config: max_replicas: {{.Replicas.XlmRobertaLanguageClassifier}} @@ -406,6 +445,9 @@ applications: L40S: engine_args: gpu_memory_utilization: 0.05 + RTX_PRO_6000_BLACKWELL: + engine_args: + gpu_memory_utilization: 0.021 model_config: engine_args: gpu_memory_utilization: 0.1 @@ -496,6 +538,12 @@ applications: H100: ray_actor_options: num_gpus: 0.005 + RTX_PRO_6000_BLACKWELL: + ray_actor_options: + num_gpus: 0.004 + resources: + "gpu_count:2": 0.001 + "accelerator_type:RTX_PRO_6000_BLACKWELL": 0.001 options: 
autoscaling_config: max_replicas: {{.Replicas.CrossEncoder}} @@ -508,6 +556,9 @@ applications: H100: engine_args: gpu_memory_utilization: 0.005 + RTX_PRO_6000_BLACKWELL: + engine_args: + gpu_memory_utilization: 0.004 model_config: engine_args: gpu_memory_utilization: 0.01 @@ -563,6 +614,12 @@ applications: L40S: ray_actor_options: num_gpus: 0.05 + RTX_PRO_6000_BLACKWELL: + ray_actor_options: + num_gpus: 0.021 + resources: + "gpu_count:2": 0.001 + "accelerator_type:RTX_PRO_6000_BLACKWELL": 0.001 options: autoscaling_config: max_replicas: {{.Replicas.E5LanguageClassifier}} @@ -578,6 +635,9 @@ applications: L40S: engine_args: gpu_memory_utilization: 0.05 + RTX_PRO_6000_BLACKWELL: + engine_args: + gpu_memory_utilization: 0.021 model_config: engine_args: gpu_memory_utilization: 0.1 @@ -632,6 +692,12 @@ applications: L40S: ray_actor_options: num_gpus: 0.025 + RTX_PRO_6000_BLACKWELL: + ray_actor_options: + num_gpus: 0.013 + resources: + "gpu_count:2": 0.001 + "accelerator_type:RTX_PRO_6000_BLACKWELL": 0.001 options: autoscaling_config: max_replicas: {{.Replicas.PromptInjectionCrossEncoder}} diff --git a/config/configs/features/saia.yaml b/config/configs/features/saia.yaml index aea4a115..69528f49 100644 --- a/config/configs/features/saia.yaml +++ b/config/configs/features/saia.yaml @@ -25,5 +25,5 @@ instanceScale: h100-nvl-1-gpu: 2 RTX_PRO_6000_BLACKWELL: rtx-pro-6000-blackwell-0-gpu: 1 - rtx-pro-6000-blackwell-1-gpu: 2 - rtx-pro-6000-blackwell-2-gpu: 0 \ No newline at end of file + rtx-pro-6000-blackwell-1-gpu: 0 + rtx-pro-6000-blackwell-2-gpu: 1 \ No newline at end of file diff --git a/tools/cluster_setup/artifacts.yaml b/tools/cluster_setup/artifacts.yaml index cb480386..48710094 100644 --- a/tools/cluster_setup/artifacts.yaml +++ b/tools/cluster_setup/artifacts.yaml @@ -5710,7 +5710,7 @@ spec: value: v0.3.14-36-g1549f5a - name: RAY_VERSION value: 2.53.0 - image: 658391232643.dkr.ecr.us-east-2.amazonaws.com/kiran/splunk/splunk-ai-operator:openshift-0.2 + image: 
658391232643.dkr.ecr.us-east-2.amazonaws.com/kiran/splunk/splunk-ai-operator:openshift-0.5 livenessProbe: httpGet: path: /healthz diff --git a/tools/cluster_setup/openshift-cluster-config.yaml b/tools/cluster_setup/openshift-cluster-config.yaml index 6eaab469..072de5c6 100644 --- a/tools/cluster_setup/openshift-cluster-config.yaml +++ b/tools/cluster_setup/openshift-cluster-config.yaml @@ -34,7 +34,7 @@ images: registry: "658391232643.dkr.ecr.us-east-2.amazonaws.com" operator: - image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/kiran/splunk/splunk-ai-operator:openshift-0.2" + image: "658391232643.dkr.ecr.us-east-2.amazonaws.com/kiran/splunk/splunk-ai-operator:openshift-0.5" ray: headImage: "ml-platform/ray/ray-head:build-953" @@ -65,7 +65,7 @@ storage: storageClass: "local-path" vectorDbSize: "50Gi" objectStore: - type: "minio" # aws | s3compat | minio | seaweedfs + type: "seaweedfs" # aws | s3compat | minio | seaweedfs bucket: "ai-platform-bucket" endpoint: "http://18.116.39.79:8333" auth: diff --git a/tools/cluster_setup/openshift_with_stack.sh b/tools/cluster_setup/openshift_with_stack.sh index ced27236..5533bb0c 100755 --- a/tools/cluster_setup/openshift_with_stack.sh +++ b/tools/cluster_setup/openshift_with_stack.sh @@ -28,11 +28,182 @@ LOG_FILE="${LOG_DIR}/openshift-install-$(date '+%Y-%m-%d_%H-%M-%S').log" exec > >(tee -a "${LOG_FILE}") 2>&1 echo "[LOG] Session log: ${LOG_FILE}" +# ====== LOG ROTATION (keep last 10 logs) ====== +_rotate_logs() { + local keep=10 + local logs=() + while IFS= read -r f; do logs+=("$f"); done < <(ls -1t "${LOG_DIR}"/openshift-install-*.log 2>/dev/null) + local excess=$(( ${#logs[@]} - keep )) + if (( excess > 0 )); then + for (( i=${#logs[@]}-1; i>=${#logs[@]}-excess; i-- )); do + rm -f "${logs[$i]}" + done + fi +} +_rotate_logs + # ====== COLORS & LOGGING ====== -log() { echo -e "\033[1;36m[INFO]\033[0m $*" >&2; } -warn() { echo -e "\033[1;33m[WARN]\033[0m $*" >&2; } -err() { echo -e "\033[1;31m[ERROR]\033[0m $*" >&2; exit 
1; } -need() { command -v "$1" >/dev/null 2>&1 || err "Missing $1 in PATH"; } +_ts() { date '+%Y-%m-%d %H:%M:%S'; } +log() { echo -e "\033[1;36m[$(_ts) INFO]\033[0m $*" >&2; } +warn() { echo -e "\033[1;33m[$(_ts) WARN]\033[0m $*" >&2; } +err() { + echo -e "\033[1;31m[$(_ts) ERROR]\033[0m $*" >&2 + echo -e "\033[1;31m[$(_ts) ERROR]\033[0m Log file: ${LOG_FILE}" >&2 + echo -e "\033[1;31m[$(_ts) ERROR]\033[0m Run '$0 diagnose' to collect a full support bundle." >&2 + exit 1 +} + +# ====== TOOL CHECKER ====== +need() { + command -v "$1" >/dev/null 2>&1 && return 0 + local install_hint="" + case "$1" in + oc) install_hint="https://docs.openshift.com/container-platform/latest/cli_reference/openshift_cli/getting-started-cli.html" ;; + helm) install_hint="brew install helm OR https://helm.sh/docs/intro/install/" ;; + yq) install_hint="brew install yq OR wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/local/bin/yq && chmod +x /usr/local/bin/yq" ;; + jq) install_hint="brew install jq OR apt-get install jq OR dnf install jq" ;; + curl) install_hint="apt-get install curl OR brew install curl" ;; + aws) install_hint="https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html" ;; + git) install_hint="brew install git OR apt-get install git" ;; + *) install_hint="install '$1' via your system package manager" ;; + esac + err "Required tool not found: $1 + Install: ${install_hint}" +} + +# ====== STEP PROGRESS TRACKER ====== +declare -a _STEP_NAMES=() +declare -a _STEP_STATUS=() +_STEP_CURRENT="" + +step_start() { + _STEP_CURRENT="$1" + _STEP_NAMES+=("$1") + _STEP_STATUS+=("running") + local n=${#_STEP_NAMES[@]} + echo -e "\n\033[1;34m[$(_ts) ── STEP ${n}: $1 ──]\033[0m" >&2 +} + +step_ok() { + local last=$(( ${#_STEP_STATUS[@]} - 1 )) + _STEP_STATUS[$last]="ok" +} + +step_fail() { + local last=$(( ${#_STEP_STATUS[@]} - 1 )) + _STEP_STATUS[$last]="fail:${1:-unknown error}" +} + +step_skip() { + local last=$(( 
${#_STEP_STATUS[@]} - 1 )) + _STEP_STATUS[$last]="skip:${1:-}" +} + +show_step_summary() { + echo -e "\n\033[1;34m[$(_ts) ════ INSTALL SUMMARY ════]\033[0m" >&2 + local total=${#_STEP_NAMES[@]} ok=0 fail=0 skip=0 + for i in "${!_STEP_NAMES[@]}"; do + local s="${_STEP_STATUS[$i]}" + local icon color label + case "${s%%:*}" in + ok) icon="✔"; color="\033[1;32m"; label="OK"; ok=$((ok+1)) ;; + fail) icon="✖"; color="\033[1;31m"; label="${s#fail:}"; fail=$((fail+1)) ;; + skip) icon="–"; color="\033[1;33m"; label="${s#skip:}"; skip=$((skip+1)) ;; + running) icon="?"; color="\033[1;33m"; label="interrupted"; fail=$((fail+1)) ;; + *) icon="?"; color="\033[0m"; label="${s}" ;; + esac + printf " ${color}${icon}\033[0m %-45s %s\n" "${_STEP_NAMES[$i]}" "${label}" >&2 + done + echo "" >&2 + if (( fail == 0 )); then + echo -e " \033[1;32mAll ${total} steps completed successfully.\033[0m" >&2 + else + echo -e " \033[1;31m${fail} step(s) failed, ${ok} succeeded, ${skip} skipped.\033[0m" >&2 + echo -e " \033[1;31mSee log: ${LOG_FILE}\033[0m" >&2 + fi + echo "" >&2 +} + +# ====== PHASE SECTION MARKERS ====== +phase_start() { echo -e "\n\033[1;35m[$(_ts) ════════ PHASE: $* ════════]\033[0m" >&2; } +phase_end() { echo -e "\033[1;35m[$(_ts) ════════ END: $* ════════]\033[0m\n" >&2; } + +# ====== WAIT FOR DEPENDENCY (interactive pause-and-retry) ====== +wait_for_dependency() { + local description="$1" + local check_cmd="$2" + local max_wait="${3:-600}" + local elapsed=0 interval=30 + + log "Waiting for external dependency: ${description}" + log " Max wait: ${max_wait}s. Press Enter at any time to retry immediately." + + while (( elapsed < max_wait )); do + if eval "${check_cmd}" >/dev/null 2>&1; then + log " ✔ ${description} — ready" + return 0 + fi + local remaining=$(( max_wait - elapsed )) + warn " ${description} not ready yet. Retrying in ${interval}s (${remaining}s remaining)." + warn " Press Enter to retry now, or wait..." 
+ if [[ -t 0 ]] && read -t "${interval}" -r 2>/dev/null; then + log " Retrying immediately..." + elif [[ ! -t 0 ]]; then sleep "${interval}"; fi # no TTY: read returns at once on EOF, so sleep to honour the interval + elapsed=$(( elapsed + interval )) + done + + err "Timed out after ${max_wait}s waiting for: ${description} + Resolve the issue, then re-run the installer." +} + +# ====== SHOW INSTALL PLAN ====== +show_install_plan() { + echo -e "\n\033[1;34m╔══════════════════════════════════════════════════════════╗\033[0m" >&2 + echo -e "\033[1;34m║ SPLUNK AI PLATFORM — OPENSHIFT INSTALL PLAN ║\033[0m" >&2 + echo -e "\033[1;34m╚══════════════════════════════════════════════════════════╝\033[0m" >&2 + echo "" >&2 + echo -e " \033[1mNamespace :\033[0m ${AI_NS}" >&2 + echo -e " \033[1mConfig file :\033[0m ${CONFIG_FILE}" >&2 + echo -e " \033[1mLog file :\033[0m ${LOG_FILE}" >&2 + echo "" >&2 + echo -e " \033[1mAccelerator type :\033[0m ${DEFAULT_ACCELERATOR:-}" >&2 + echo -e " \033[1mNode label strat :\033[0m ${NODE_LABEL_STRATEGY}" >&2 + echo -e " \033[1mOperator image :\033[0m ${OPERATOR_IMAGE}" >&2 + echo -e " \033[1mImage registry :\033[0m ${IMAGE_REGISTRY:-}" >&2 + echo -e " \033[1mECR enabled :\033[0m ${ECR_ENABLED}" >&2 + echo "" >&2 + echo -e " \033[1mObject store :\033[0m type=${OBJ_STORE_TYPE} bucket=${OBJ_STORE_BUCKET:-}" >&2 + echo -e " \033[1mObject endpoint :\033[0m ${OBJ_STORE_ENDPOINT:-}" >&2 + echo "" >&2 + echo -e " \033[1mSteps that will run:\033[0m" >&2 + echo -e " 1. Preflight checks (oc login, tools, manifest files)" >&2 + echo -e " 2. NFD Operator (OLM)" >&2 + echo -e " 3. NVIDIA GPU Operator (OLM)" >&2 + echo -e " 4. Node labeling (splunk.ai/workload-type)" >&2 + echo -e " 5. local-path-provisioner + SELinux relabeling" >&2 + echo -e " 6. cert-manager (Helm)" >&2 + echo -e " 7. OpenTelemetry Operator (Helm)" >&2 + echo -e " 8. KubeRay Operator (Helm)" >&2 + echo -e " 9. ECR pull secrets" >&2 + echo -e " 10. Splunk AI Operator" >&2 + echo -e " 11. Splunk Operator" >&2 + echo -e " 12. Splunk Standalone CR" >&2 + echo -e " 13.
AIPlatform CR" >&2 + echo "" >&2 + + if [[ "${AUTO_APPROVE:-false}" == "true" ]]; then + log "AUTO_APPROVE=true — skipping confirmation." + return 0 + fi + + echo -e " \033[1mReview the plan above. Type 'yes' to proceed, anything else to abort:\033[0m" >&2 + local answer + read -r answer + if [[ "${answer}" != "yes" ]]; then + echo "Aborted by user." >&2 + exit 0 + fi +} # ====== LOAD CONFIGURATION ====== load_config() { @@ -1001,6 +1172,14 @@ install_splunk_standalone() { ensure_namespace "${AI_NS}" wait_for_crd standalones.enterprise.splunk.com 600 + # Wait for the object store to answer with a real HTTP status before creating the credentials secret (curl prints 000 when no response arrives, so 000 must not count as ready) + if [[ -n "${OBJ_STORE_ENDPOINT}" ]]; then + wait_for_dependency \ + "object store (${OBJ_STORE_TYPE}) at ${OBJ_STORE_ENDPOINT}" \ + "curl -sL --connect-timeout 5 --max-time 10 -o /dev/null -w '%{http_code}' '${OBJ_STORE_ENDPOINT}' 2>/dev/null | grep -qE '^[1-5][0-9]{2}$'" \ + 300 + fi + # Object storage credentials secret oc -n "${AI_NS}" create secret generic minio-credentials \ --from-literal=AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" \ @@ -1073,7 +1252,7 @@ install_ai_platform_cr() { aws) obj_path="s3://${OBJ_STORE_BUCKET}"; obj_endpoint="" ;; s3compat) obj_path="s3compat://${OBJ_STORE_BUCKET}"; obj_endpoint="${OBJ_STORE_ENDPOINT}" ;; minio) obj_path="minio://${OBJ_STORE_BUCKET}"; obj_endpoint="${OBJ_STORE_ENDPOINT}" ;; - seaweedfs) obj_path="seaweedfs://${OBJ_STORE_BUCKET}";obj_endpoint="${OBJ_STORE_ENDPOINT}" ;; + seaweedfs) obj_path="minio://${OBJ_STORE_BUCKET}"; obj_endpoint="${OBJ_STORE_ENDPOINT}" ;; *) err "Unsupported objectStore.type: ${OBJ_STORE_TYPE}" ;; esac @@ -1230,22 +1409,73 @@ main_install() { log "============================================" load_config - preflight_checks validate_image_config configure_images + + show_install_plan + + phase_start "Preflight" + step_start "Preflight checks" + preflight_checks + step_ok + phase_end "Preflight" + + phase_start "Infrastructure" + step_start "NFD Operator" install_nfd + step_ok
+ + step_start "NVIDIA GPU Operator" install_nvidia_gpu_operator + step_ok + + step_start "Node labeling" label_nodes + step_ok + + step_start "local-path-provisioner + SELinux" install_local_path_provisioner relabel_worker_nodes_for_selinux + step_ok + phase_end "Infrastructure" + + phase_start "Operators" + step_start "cert-manager" install_cert_manager + step_ok + + step_start "OpenTelemetry Operator" install_otel_operator + step_ok + + step_start "KubeRay Operator" install_ray_operator + step_ok + + step_start "ECR pull secrets" ensure_ecr_pull_secret + step_ok + + step_start "Splunk AI Operator" install_splunk_ai_operator + step_ok + + step_start "Splunk Operator" install_splunk_operator + step_ok + phase_end "Operators" + + phase_start "AI Platform Stack" + step_start "Splunk Standalone CR" install_splunk_standalone + step_ok + + step_start "AIPlatform CR" install_ai_platform_cr + step_ok + phase_end "AI Platform Stack" + + show_step_summary log "============================================" log " Install complete" @@ -1273,6 +1503,28 @@ main_delete() { err "Not logged in to OpenShift. Run: oc login " fi + log " Namespace : ${AI_NS}" + log " Cluster : $(oc whoami --show-server 2>/dev/null || echo '')" + log "============================================" + log "" + warn "This will DELETE the AI Platform stack from the OpenShift cluster." + warn "The cluster nodes themselves will remain running." + warn "This action CANNOT be undone." + log "" + + if [[ "${AUTO_APPROVE:-false}" != "true" ]]; then + echo -e " \033[1;31mType 'yes' to confirm deletion, or Ctrl-C to abort:\033[0m" >&2 + local confirm_input + read -r confirm_input + if [[ "${confirm_input}" != "yes" ]]; then + echo "Aborted — confirmation not given." >&2 + exit 0 + fi + log "Confirmed. Proceeding with deletion..." + else + log "AUTO_APPROVE=true — skipping confirmation prompt." 
+ fi + local ai_operator_ns="splunk-ai-operator-system" local splunk_operator_ns="splunk-operator" @@ -1384,20 +1636,100 @@ main_delete() { log "Log file: ${LOG_FILE}" } +# ====== DIAGNOSE SUBCOMMAND ====== +diagnose() { + load_config 2>/dev/null || true + + local bundle_dir + bundle_dir="$(mktemp -d)/splunk-ai-diagnose-$(date '+%Y%m%d-%H%M%S')" + mkdir -p "${bundle_dir}" + + log "=== Collecting support bundle into ${bundle_dir} ===" + + # 1. Installer logs + log "Collecting installer logs..." + cp "${LOG_DIR}"/openshift-install-*.log "${bundle_dir}/" 2>/dev/null || true + + # 2. Cluster state (best-effort — cluster may be unreachable) + if timeout 10 oc cluster-info &>/dev/null 2>&1; then + log "Collecting cluster state..." + oc get nodes -o wide > "${bundle_dir}/nodes.txt" 2>&1 || true + oc get pods --all-namespaces -o wide > "${bundle_dir}/pods.txt" 2>&1 || true + oc get events --all-namespaces --sort-by='.lastTimestamp' > "${bundle_dir}/events.txt" 2>&1 || true + oc get pvc --all-namespaces > "${bundle_dir}/pvcs.txt" 2>&1 || true + oc get svc --all-namespaces > "${bundle_dir}/services.txt" 2>&1 || true + oc describe nodes > "${bundle_dir}/node-details.txt" 2>&1 || true + + # Per-namespace pod logs for failing pods + log "Collecting logs from non-Running pods..." 
+ local ns pod + while IFS= read -r line; do + ns=$(echo "${line}" | awk '{print $1}') + pod=$(echo "${line}" | awk '{print $2}') + mkdir -p "${bundle_dir}/pod-logs/${ns}" + oc logs "${pod}" -n "${ns}" --tail=200 \ + > "${bundle_dir}/pod-logs/${ns}/${pod}.log" 2>&1 || true + oc logs "${pod}" -n "${ns}" --previous --tail=100 \ + > "${bundle_dir}/pod-logs/${ns}/${pod}.previous.log" 2>&1 || true + done < <(oc get pods --all-namespaces --no-headers 2>/dev/null \ + | awk '$4 != "Running" && $4 != "Completed" {print $1, $2}') + + # AI Platform specific resources + oc describe aiplatform --all -n "${AI_NS:-ai-platform}" > "${bundle_dir}/aiplatform-cr.txt" 2>&1 || true + oc describe aiservice --all -n "${AI_NS:-ai-platform}" > "${bundle_dir}/aiservice-cr.txt" 2>&1 || true + + # Operator logs + oc logs -n splunk-ai-operator-system -l control-plane=controller-manager --tail=500 \ + > "${bundle_dir}/operator-logs.txt" 2>&1 || true + else + warn "Cluster not reachable — skipping oc diagnostics." + echo "Cluster unreachable at time of diagnose run." > "${bundle_dir}/CLUSTER_UNREACHABLE.txt" + fi + + # 3. Config file (redact credentials; sed -E keeps the key alternation portable to BSD/macOS sed, where the GNU-only \| matches nothing and secrets would leak into the bundle) + if [[ -f "${CONFIG_FILE}" ]]; then + log "Including config file (credentials redacted)..." + sed -E 's/(rootUser|rootPassword|AWS_ACCESS_KEY_ID|AWS_SECRET_ACCESS_KEY|accessKey|secretKey):.*/\1: /g' \ + "${CONFIG_FILE}" > "${bundle_dir}/cluster-config-redacted.yaml" + fi + + # 4. Tool versions + { + echo "=== Tool versions ===" + oc version 2>/dev/null || true + helm version 2>/dev/null || true + yq --version 2>/dev/null || true + echo "=== OS ===" + uname -a + } > "${bundle_dir}/versions.txt" 2>&1 + + # 5. Pack into tar.gz + local bundle_tar="${bundle_dir}.tar.gz" + tar -czf "${bundle_tar}" -C "$(dirname "${bundle_dir}")" "$(basename "${bundle_dir}")" 2>/dev/null + rm -rf "${bundle_dir}" + + log "=== Support bundle ready: ${bundle_tar} ===" + log "Attach this file to your support ticket or share with the team."
+} + # ====== USAGE ====== usage() { cat < - - oc, yq in PATH + - oc, yq, helm in PATH - artifacts.yaml (operator manifests) in the same directory, or set files.aiPlatform in config EOF } @@ -1410,6 +1742,9 @@ case "${1:-install}" in delete) main_delete ;; + diagnose) + diagnose + ;; *) usage exit 1 From 4f6c8207a474de9d4190d061b3b875ac93dd2e9b Mon Sep 17 00:00:00 2001 From: kbhos Date: Tue, 16 Jun 2026 12:41:09 +0530 Subject: [PATCH 06/10] add back gemma model --- config/configs/applications.yaml | 128 ++++++++++++++++++++++- config/configs/features/saia.yaml | 1 + pkg/ai/raybuilder/configmap_apps_test.go | 6 +- 3 files changed, 129 insertions(+), 6 deletions(-) diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index eb29355d..72745a90 100644 --- a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -34,6 +34,131 @@ applications: SERVICE_NAME: "ai_platform_models" SKIP_VERIFICATION: "true" USE_SYSTEM_PERMISSIONS: "true" + - args: + application_name: Gemma431bIt + deployment_configs: + LLMDeployment: + gpu_type_options_override: + H100: + autoscaling_config: + max_replicas: {{.Replicas.Gemma431bIt}} + min_replicas: {{.Replicas.Gemma431bIt}} + target_ongoing_requests: 6 + max_ongoing_requests: 8 + ray_actor_options: + num_gpus: 1 + L40S: + autoscaling_config: + max_replicas: {{.Replicas.Gemma431bIt}} + min_replicas: {{.Replicas.Gemma431bIt}} + target_ongoing_requests: 4 + max_ongoing_requests: 6 + ray_actor_options: + num_gpus: 2 + RTX_PRO_6000_BLACKWELL: + autoscaling_config: + max_replicas: {{.Replicas.Gemma431bIt}} + min_replicas: {{.Replicas.Gemma431bIt}} + target_ongoing_requests: 4 + max_ongoing_requests: 10 + ray_actor_options: + num_gpus: 2 + resources: + "gpu_count:2": 0.001 + "accelerator_type:RTX_PRO_6000_BLACKWELL": 0.001 + options: + autoscaling_config: + max_replicas: {{.Replicas.Gemma431bIt}} + min_replicas: {{.Replicas.Gemma431bIt}} + deployment_type: text_gen_model_deployment + gpu_types: 
'["{{.AcceleratorType}}"]' + model_definition: + gpu_type_model_config_override: + H100: + engine_args: + dtype: bfloat16 + gpu_memory_utilization: 0.9 + max_model_len: 32768 + max_num_batched_tokens: 4096 + tensor_parallel_size: 1 + L40S: + engine_args: + dtype: bfloat16 + gpu_memory_utilization: 0.85 + max_model_len: 120000 + max_num_batched_tokens: 4096 + max_num_seqs: 2 + tensor_parallel_size: 2 + RTX_PRO_6000_BLACKWELL: + engine_args: + dtype: bfloat16 + gpu_memory_utilization: 0.85 + max_model_len: 240000 + max_num_batched_tokens: 4096 + max_num_seqs: 1 + tensor_parallel_size: 2 + model_config: + openai_serving_config: + chat: + enable_auto_tools: true + reasoning_parser: gemma4 + tool_parser: gemma4 + responses: + enable_auto_tools: true + reasoning_parser: gemma4 + tool_parser: gemma4 + model_id: gemma4_31b_it + model_loader: + blob_storage: + blob_prefix: model_artifacts/gemma-4-31b-it + tokenizer_definition: + model_id: gemma4_31b_it + model_loader: + blob_storage: + artifacts_list: + - chat_template.jinja + - config.json + - processor_config.json + - tokenizer_config.json + - tokenizer.json + blob_prefix: model_artifacts/gemma-4-31b-it + name: Gemma431bIt + import_path: main:create_serve_app + route_prefix: /gemma4_31b_it + runtime_env: + working_dir: "file:///home/ray/ray/applications/generic_application.zip" + env_vars: + API_VERSION: "v1" + APPLICATION_NAME: gemma4_31b_it + VLLM_ATTENTION_BACKEND: TRITON_ATTN + ARTIFACTS_S3_BUCKET: "{{.ArtifactBucketName}}" + S3_BUCKET: "{{.ArtifactBucketName}}" + ARTIFACTS_PROVIDER: "{{.ArtifactsProvider}}" + CLOUD_PROVIDER: "{{.CloudProvider}}" + S3COMPAT_OBJECT_STORE_ENDPOINT_URL: "{{.S3CompatObjectStoreEndpointUrl}}" + S3COMPAT_OBJECT_STORE_ACCESS_KEY: "{{.S3CompatObjectStoreAccessKey}}" + S3COMPAT_OBJECT_STORE_SECRET_KEY: "{{.S3CompatObjectStoreSecretKey}}" + # AWS / boto3 standard credential names — populated whenever the + # operator can load credentials from spec.objectStorage.secretRef. 
For + # CLOUD_PROVIDER=aws these are the values boto3 reads (the S3COMPAT_* + # names above are only consumed by the s3compat shim). Both code paths + # share the same source-of-truth Secret keys (s3_access_key / + # s3_secret_key) so emitting both pairs is safe — each provider only + # reads its own. AWS_REGION lets boto3 resolve the default regional S3 + # bucket outside us-east-1 to avoid PermanentRedirect on the first call. + AWS_ACCESS_KEY_ID: "{{.S3CompatObjectStoreAccessKey}}" + AWS_SECRET_ACCESS_KEY: "{{.S3CompatObjectStoreSecretKey}}" + AWS_REGION: "{{.Region}}" + AWS_DEFAULT_REGION: "{{.Region}}" + ENABLE_AUTHN: "false" + ENABLE_AUTHZ: "false" + SERVICE_EXTERNAL_NAME: "ai-platform-models" + SERVICE_INTERNAL_NAME: "ai_platform_models" + SERVICE_NAME: "ai_platform_models" + SKIP_VERIFICATION: "true" + USE_SYSTEM_PERMISSIONS: "true" + VLLM_WORKER_MULTIPROC_METHOD: spawn + DISABLE_RESPONSES_API_REDIS: "True" - args: application_name: GptOss20b deployment_configs: @@ -131,8 +256,7 @@ applications: SKIP_VERIFICATION: "true" USE_SYSTEM_PERMISSIONS: "true" VLLM_WORKER_MULTIPROC_METHOD: spawn - # See Gemma431bIt above for rationale. Must be "True" in airgap (no - # Redis) so vLLM uses NoOpOpenAIServingResponses. + # Must be "True" in airgap (no Redis) so vLLM uses NoOpOpenAIServingResponses. 
DISABLE_RESPONSES_API_REDIS: "True" - args: application_name: UaeLarge diff --git a/config/configs/features/saia.yaml b/config/configs/features/saia.yaml index 69528f49..73df509d 100644 --- a/config/configs/features/saia.yaml +++ b/config/configs/features/saia.yaml @@ -4,6 +4,7 @@ applicationScale: CrossEncoder: 1 E5LanguageClassifier: 1 Entrypoint: 1 + Gemma431bIt: 1 GptOss20b: 1 MbartTranslator: 1 PromptInjectionClassifier: 1 diff --git a/pkg/ai/raybuilder/configmap_apps_test.go b/pkg/ai/raybuilder/configmap_apps_test.go index fbdad4ef..4beb1b4e 100644 --- a/pkg/ai/raybuilder/configmap_apps_test.go +++ b/pkg/ai/raybuilder/configmap_apps_test.go @@ -86,11 +86,9 @@ func Test_ApplicationsYAML_DisableResponsesRedis(t *testing.T) { } } - expectedTextGenApps := []string{"GptOss20b"} + expectedTextGenApps := []string{"Gemma431bIt", "GptOss20b"} - // We expect exactly one text-gen app today (GptOss20b). Gemma431bIt was - // removed — it requires 2× RTX PRO 6000 GPUs and cannot run alongside - // GptOss20b on a single 2-GPU node. + // We expect exactly two text-gen apps today (Gemma431bIt, GptOss20b). // If this count changes, someone added a new text-gen model; they MUST // also add DISABLE_RESPONSES_API_REDIS to the new app. require.Len(t, textGenApps, len(expectedTextGenApps), From 5b330f95122635c7d019a07c6302b46da62f18e1 Mon Sep 17 00:00:00 2001 From: kbhos Date: Tue, 16 Jun 2026 12:49:18 +0530 Subject: [PATCH 07/10] clean-ups --- config/configs/applications.yaml | 14 ++++++- tools/cluster_setup/artifacts.yaml | 67 ++++++++++++++---------------- 2 files changed, 44 insertions(+), 37 deletions(-) diff --git a/config/configs/applications.yaml b/config/configs/applications.yaml index 72745a90..c8a38601 100644 --- a/config/configs/applications.yaml +++ b/config/configs/applications.yaml @@ -145,6 +145,7 @@ applications: # share the same source-of-truth Secret keys (s3_access_key / # s3_secret_key) so emitting both pairs is safe — each provider only # reads its own. 
AWS_REGION lets boto3 resolve the default regional S3 + # endpoint when no AWS_ENDPOINT_URL is set; required for any AWS S3 # bucket outside us-east-1 to avoid PermanentRedirect on the first call. AWS_ACCESS_KEY_ID: "{{.S3CompatObjectStoreAccessKey}}" AWS_SECRET_ACCESS_KEY: "{{.S3CompatObjectStoreSecretKey}}" @@ -158,6 +159,16 @@ applications: SKIP_VERIFICATION: "true" USE_SYSTEM_PERMISSIONS: "true" VLLM_WORKER_MULTIPROC_METHOD: spawn + # Disable the Redis-backed Responses API store (see ai-platform-models + # commit c1f9aef3: "feat: add a no-op store"). When True, the vLLM + # TextGen deployment constructs NoOpOpenAIServingResponses instead of + # RedisOpenAIServingResponses, so /v1/responses works without a Redis + # infra. Without this flag the deployment raises + # RuntimeError: Responses Redis URL not set + # on every request, which surfaces as an empty SSE stream and the SAIA + # v2 /query path fails with "An error occurred processing your request". + # Airgap k0s has no Redis; cloud sets this to "False" and wires + # RESPONSES_REDIS_ADDRESS to its in-namespace Redis StatefulSet. DISABLE_RESPONSES_API_REDIS: "True" - args: application_name: GptOss20b @@ -256,7 +267,8 @@ applications: SKIP_VERIFICATION: "true" USE_SYSTEM_PERMISSIONS: "true" VLLM_WORKER_MULTIPROC_METHOD: spawn - # Must be "True" in airgap (no Redis) so vLLM uses NoOpOpenAIServingResponses. + # See Gemma431bIt above for rationale. Must be "True" in airgap (no + # Redis) so vLLM uses NoOpOpenAIServingResponses. 
DISABLE_RESPONSES_API_REDIS: "True" - args: application_name: UaeLarge diff --git a/tools/cluster_setup/artifacts.yaml b/tools/cluster_setup/artifacts.yaml index 48710094..c6953e7b 100644 --- a/tools/cluster_setup/artifacts.yaml +++ b/tools/cluster_setup/artifacts.yaml @@ -1061,18 +1061,11 @@ spec: items: description: FeatureSpec defines the features to enable in the AIPlatform properties: - env: - additionalProperties: - type: string - description: Env specifies environment variables to propagate - to the child AIService. - type: object name: description: Name of the feature, e.g. "saia" or "seca" enum: - saia - seca - - weaviate-service type: string scaleFactor: description: ScaleFactor is the desired fixed number of replicas @@ -2092,11 +2085,6 @@ spec: type: object x-kubernetes-map-type: atomic type: array - otelImage: - default: otel/opentelemetry-collector-contrib:0.122.1 - description: OTelImage is the OpenTelemetry Collector sidecar - image - type: string rayHeadGroupImage: description: Ray head group image, e.g. "rayproject/ray-head:latest" type: string @@ -2237,8 +2225,7 @@ spec: type: object objectStorage: description: |- - ObjectStorage defines the object storage configuration for AI artifacts, tasks, and models. - It is optional for platforms that only enable features that do not require object storage. + ObjectStorage defines the object storage configuration for AI artifacts, tasks, and models Supported providers: S3, GCS, Azure Blob Storage, MinIO properties: endpoint: @@ -2250,8 +2237,8 @@ spec: path: description: |- Remote volume URI in the format s3://bucketname/, gs://bucketname/, - azure://containername/, or minio://bucketname/ - pattern: ^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$ + azure://containername/, minio://bucketname/, seaweedfs://bucketname/, or s3compat://bucketname/ + pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ type: string region: description: Region of the remote storage volume. 
Required for @@ -2921,6 +2908,8 @@ spec: pattern: ^[a-z0-9]([-a-z0-9]*[a-z0-9])?$ type: string type: object + required: + - objectStorage type: object status: description: AIPlatformStatus defines observed state @@ -4095,18 +4084,11 @@ spec: features: description: Feature defines the features to be enabled for the AIService properties: - env: - additionalProperties: - type: string - description: Env specifies environment variables to propagate - to the child AIService. - type: object name: description: Name of the feature, e.g. "saia" or "seca" enum: - saia - seca - - weaviate-service type: string scaleFactor: description: ScaleFactor is the desired fixed number of replicas @@ -4884,15 +4866,27 @@ spec: properties: endpoint: description: |- - Optional override endpoint (only needed for S3-compatible services like MinIO) - Must be a valid HTTP/HTTPS URL + Optional override endpoint (only needed for S3-compatible services like MinIO, SeaweedFS) + Must be a valid HTTP/HTTPS URL. When set with s3:// path, backend is treated as S3-compatible (MinIO, SeaweedFS, etc.) pattern: ^https?://.*$ type: string path: description: |- Remote volume URI in the format s3://bucketname/, gs://bucketname/, - azure://containername/, or minio://bucketname/ - pattern: ^(s3|gs|azure|minio)://[a-zA-Z0-9.\-_]+(/.*)?$ + azure://containername/, s3compat://bucketname/ (generic S3-compatible), minio://, or seaweedfs:// + pattern: ^(s3|gs|azure|minio|seaweedfs|s3compat)://[a-zA-Z0-9.\-_]+(/.*)?$ + type: string + provider: + description: |- + Provider is an optional hint for documentation and tooling. Operator derives behavior from path scheme and endpoint. + Values: aws, minio, seaweedfs, s3compat, gcs, azure + enum: + - aws + - minio + - seaweedfs + - s3compat + - gcs + - azure type: string region: description: Region of the remote storage volume. 
Required for @@ -4900,7 +4894,8 @@ spec: minLength: 1 type: string secretRef: - description: Secret name containing storage credentials + description: Secret name containing storage credentials (e.g. + s3_access_key, s3_secret_key for S3-compatible backends) maxLength: 253 minLength: 1 type: string @@ -5687,19 +5682,19 @@ spec: fieldRef: fieldPath: metadata.name - name: RELATED_IMAGE_RAY_HEAD - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-head:build-953 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-head:build-v2-010 - name: RELATED_IMAGE_RAY_WORKER - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-worker-gpu:build-953 + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/ray/ray-worker-gpu:build-v2-010 - name: RELATED_IMAGE_WEAVIATE value: docker.io/semitechnologies/weaviate:stable-v1.28-007846a - - name: RELATED_IMAGE_WEAVIATE_SERVICE - value: docker.io/semitechnologies/weaviate:stable-v1.28-007846a - name: RELATED_IMAGE_SAIA_API - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api:build-v2-main-c3b489d + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api:build-v2-012 - name: RELATED_IMAGE_SAIA_API_V2 - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api-v2:build-v2-main-c3b489d + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-api-v2:build-v2-012 - name: RELATED_IMAGE_POST_INSTALL_HOOK - value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-data-loader:build-v2-main-c3b489d + value: 658391232643.dkr.ecr.us-east-2.amazonaws.com/ml-platform/saia/saia-data-loader:build-v2-012 + - name: SPLUNK_METRICS_INDEX_NAME + value: _metrics - name: RELATED_IMAGE_FLUENT_BIT value: docker.io/fluent/fluent-bit:1.9.6 - name: RELATED_IMAGE_OTEL_COLLECTOR @@ -5710,7 +5705,7 @@ spec: value: v0.3.14-36-g1549f5a - name: RAY_VERSION value: 2.53.0 - image: 
658391232643.dkr.ecr.us-east-2.amazonaws.com/kiran/splunk/splunk-ai-operator:openshift-0.5 + image: 658391232643.dkr.ecr.us-east-2.amazonaws.com/arif/splunk/splunk-ai-operator:v0.1.29 livenessProbe: httpGet: path: /healthz From 2a78e36bad21a404b8ec8072c51f9b4af33d6a78 Mon Sep 17 00:00:00 2001 From: kbhos Date: Tue, 16 Jun 2026 14:27:06 +0530 Subject: [PATCH 08/10] config map to store the issuer url --- tools/cluster_setup/openshift_with_stack.sh | 29 +++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tools/cluster_setup/openshift_with_stack.sh b/tools/cluster_setup/openshift_with_stack.sh index 5533bb0c..53f79650 100755 --- a/tools/cluster_setup/openshift_with_stack.sh +++ b/tools/cluster_setup/openshift_with_stack.sh @@ -1198,6 +1198,30 @@ install_splunk_standalone() { fi [[ -z "${minio_endpoint}" ]] && err "storage.objectStore.endpoint must be set for type=${OBJ_STORE_TYPE}" + # Configure Splunk to use the service URL as the token issuer so that JWT + # tokens have iss=https://splunk-splunk-standalone-standalone-service:8089, + # matching SAIA's SPLUNK_ISSUERS. Without this, Splunk uses the pod hostname + # as issuer (e.g. splunk-splunk-standalone-standalone-0) and SAIA rejects + # tokens with "Issuer not allowed". 
+ cat <<'YAML' | oc -n "${AI_NS}" apply -f - +apiVersion: v1 +kind: ConfigMap +metadata: + name: splunk-defaults +data: + default.yml: | + splunk: + conf: + - key: authentication + value: + directory: /opt/splunk/etc/system/local + content: + oauth2_settings: + issuer_uri: https://splunk-splunk-standalone-standalone-service:8089 + certFile: $SPLUNK_HOME/etc/auth/server.pem + sslPassword: password +YAML + oc apply --server-side --force-conflicts -f - < Date: Tue, 16 Jun 2026 21:44:41 +0530 Subject: [PATCH 09/10] resolve copilot comments --- config/configs/instance.yaml | 2 ++ .../openshift-cluster-config.yaml | 6 ++--- tools/cluster_setup/openshift_with_stack.sh | 22 ++++++++++++------- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/config/configs/instance.yaml b/config/configs/instance.yaml index 3183dbf9..bf012a00 100644 --- a/config/configs/instance.yaml +++ b/config/configs/instance.yaml @@ -89,6 +89,8 @@ H100_NVL: RTX_PRO_6000_BLACKWELL: - tier: rtx-pro-6000-blackwell-0-gpu gpusPerPod: 0 + env: + NVIDIA_VISIBLE_DEVICES: void resources: limits: cpu: "16" diff --git a/tools/cluster_setup/openshift-cluster-config.yaml b/tools/cluster_setup/openshift-cluster-config.yaml index 072de5c6..9af62f2b 100644 --- a/tools/cluster_setup/openshift-cluster-config.yaml +++ b/tools/cluster_setup/openshift-cluster-config.yaml @@ -67,10 +67,10 @@ storage: objectStore: type: "seaweedfs" # aws | s3compat | minio | seaweedfs bucket: "ai-platform-bucket" - endpoint: "http://18.116.39.79:8333" + endpoint: "" auth: - rootUser: "minioadmin" - rootPassword: "minioadmin" + rootUser: "" + rootPassword: "" splunk: standaloneName: splunk-standalone diff --git a/tools/cluster_setup/openshift_with_stack.sh b/tools/cluster_setup/openshift_with_stack.sh index 53f79650..b090f408 100755 --- a/tools/cluster_setup/openshift_with_stack.sh +++ b/tools/cluster_setup/openshift_with_stack.sh @@ -371,8 +371,8 @@ configure_images() { preflight_checks() { log "Running preflight checks..." 
- for tool in oc yq; do - command -v "$tool" >/dev/null 2>&1 && log " ✓ $tool found" || err "Missing $tool in PATH" + for tool in oc yq helm aws curl jq base64 tar; do + command -v "$tool" >/dev/null 2>&1 && log " ✓ $tool found" || err "Missing required tool: $tool" done # Verify we are connected to the cluster @@ -988,9 +988,14 @@ ensure_ecr_pull_secret() { --namespace="${ns}" \ --dry-run=client -o yaml | oc apply -f - - # Patch the default SA so pods without explicit imagePullSecrets also pull correctly - oc patch serviceaccount default -n "${ns}" \ - -p '{"imagePullSecrets": [{"name": "ecr-registry-secret"}]}' 2>/dev/null || true + # Append ecr-registry-secret to the default SA only if not already present. + # JSON-patch "add" on an existing key REPLACES its value, so the empty list is seeded only when the SA has no pull secrets yet; otherwise we append to what is there. + if ! oc get serviceaccount default -n "${ns}" -o jsonpath='{.imagePullSecrets[*].name}' 2>/dev/null | grep -qw ecr-registry-secret; then + [[ -n "$(oc get serviceaccount default -n "${ns}" -o jsonpath='{.imagePullSecrets[*].name}' 2>/dev/null)" ]] || oc patch serviceaccount default -n "${ns}" --type=json \ + -p='[{"op":"add","path":"/imagePullSecrets","value":[]}]' 2>/dev/null || true + oc patch serviceaccount default -n "${ns}" --type=json \ + -p='[{"op":"add","path":"/imagePullSecrets/-","value":{"name":"ecr-registry-secret"}}]' 2>/dev/null || true + fi log " ✓ ecr-registry-secret created in ${ns}" done @@ -1511,11 +1516,12 @@ main_install() { log "============================================" log "" log "Next steps:" - log " 1. Create an AIPlatform CR in namespace '${AI_NS}'" + log " 1. Verify resources:" + log " oc get aiplatform,aiservice,raycluster,rayservice -n ${AI_NS}" log " 2. Check operator logs:" log " oc logs -n splunk-ai-operator-system -l control-plane=controller-manager -f" - log " 3. Watch resources:" - log " oc get aiplatform,raycluster,rayservice -n ${AI_NS}" + log " 3.
Watch Ray cluster:" + log " oc get raycluster,rayservice -n ${AI_NS} -w" log "" log "Log file: ${LOG_FILE}" } From cc16e49f15984747e983b280460696dbafaa27ff Mon Sep 17 00:00:00 2001 From: kbhos Date: Thu, 18 Jun 2026 15:50:12 +0530 Subject: [PATCH 10/10] clean up --- tools/cluster_setup/openshift_with_stack.sh | 26 +-------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/tools/cluster_setup/openshift_with_stack.sh b/tools/cluster_setup/openshift_with_stack.sh index b090f408..f0f49730 100755 --- a/tools/cluster_setup/openshift_with_stack.sh +++ b/tools/cluster_setup/openshift_with_stack.sh @@ -1032,30 +1032,6 @@ install_splunk_ai_operator() { oc apply --server-side --force-conflicts -f "${SPLUNK_AI_FILE}" 2>&1 || true fi - # Inject the local instance.yaml so the operator knows about RTX_PRO_6000_BLACKWELL - # and other accelerators that may not be baked into the operator image. - local instance_src - instance_src="$(dirname "${SPLUNK_AI_FILE}")/../../config/configs/instance.yaml" - if [[ ! -f "${instance_src}" ]]; then - instance_src="$(cd "$(dirname "$0")/../.." 
&& pwd)/config/configs/instance.yaml" - fi - if [[ -f "${instance_src}" ]]; then - oc create configmap splunk-ai-operator-instance-yaml \ - -n "${ai_operator_ns}" \ - --from-file=instance.yaml="${instance_src}" \ - --dry-run=client -o yaml | oc -n "${ai_operator_ns}" apply -f - - # Mount the ConfigMap and set INSTANCE_FILE so the operator uses it - oc patch deployment splunk-ai-operator-controller-manager \ - -n "${ai_operator_ns}" --type=json -p='[ - {"op":"add","path":"/spec/template/spec/volumes/-","value":{"name":"instance-yaml","configMap":{"name":"splunk-ai-operator-instance-yaml"}}}, - {"op":"add","path":"/spec/template/spec/containers/0/volumeMounts/-","value":{"name":"instance-yaml","mountPath":"/etc/instance","readOnly":true}}, - {"op":"add","path":"/spec/template/spec/containers/0/env/-","value":{"name":"INSTANCE_FILE","value":"/etc/instance/instance.yaml"}} - ]' 2>/dev/null || true - log " ✓ instance.yaml ConfigMap injected into operator" - else - warn "instance.yaml not found at ${instance_src} — defaultAcceleratorType may not resolve" - fi - # Patch the operator SA and deployment with ECR pull secret AFTER the manifest apply # (the SA is created by the manifest; patching before apply silently does nothing). if [[ "${ECR_ENABLED}" == "true" ]]; then @@ -1069,7 +1045,7 @@ install_splunk_ai_operator() { log " ✓ ECR pull secret patched into operator SA and deployment" fi - # Rollout restart so the deployment picks up pull secrets and instance.yaml. + # Rollout restart so the deployment picks up the updated pull secrets. oc rollout restart deployment splunk-ai-operator-controller-manager \ -n "${ai_operator_ns}" 2>/dev/null || true