apache · ayushtkn · Jun 19, 2026 · May 25, 2026 · May 26, 2026 · May 28, 2026
diff --git a/packaging/src/kubernetes/README.md b/packaging/src/kubernetes/README.md
diff --git a/packaging/src/kubernetes/helm/hive-operator/crds/hiveclusters.hive.apache.org-v1.yml b/packaging/src/kubernetes/helm/hive-operator/crds/hiveclusters.hive.apache.org-v1.yml
diff --git a/packaging/src/kubernetes/helm/hive-operator/templates/clusterrole.yaml b/packaging/src/kubernetes/helm/hive-operator/templates/clusterrole.yaml
@@ -30,6 +30,10 @@ rules:
   - apiGroups: ["apps"]
     resources: ["deployments", "statefulsets"]
     verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
+  # Scale subresource for operator-driven autoscaling
+  - apiGroups: ["apps"]
+    resources: ["deployments/scale", "statefulsets/scale"]
+    verbs: ["get", "update", "patch"]
   # Jobs for schema initialization
   - apiGroups: ["batch"]
     resources: ["jobs"]
@@ -46,7 +50,11 @@ rules:
   - apiGroups: [""]
     resources: ["events"]
     verbs: ["create", "patch"]
-  # Pods: read-only for readiness checking
+  # Pods: read + patch (patch needed for pod-deletion-cost annotation)
   - apiGroups: [""]
     resources: ["pods"]
-    verbs: ["get", "list", "watch"]
+    verbs: ["get", "list", "watch", "patch"]
+  # PodDisruptionBudgets for graceful autoscaling
+  - apiGroups: ["policy"]
+    resources: ["poddisruptionbudgets"]
+    verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
diff --git a/packaging/src/kubernetes/helm/hive-operator/templates/hivecluster.yaml b/packaging/src/kubernetes/helm/hive-operator/templates/hivecluster.yaml
@@ -67,6 +67,18 @@ spec:
     extraVolumeMounts:
       {{- toYaml .Values.cluster.metastore.extraVolumeMounts | nindent 6 }}
     {{- end }}
+    {{- if and .Values.cluster.metastore.autoscaling .Values.cluster.metastore.autoscaling.enabled }}
+    autoscaling:
+      enabled: true
+      minReplicas: {{ .Values.cluster.metastore.autoscaling.minReplicas }}
+      scaleUpThreshold: {{ .Values.cluster.metastore.autoscaling.scaleUpThreshold }}
+      scaleUpStabilizationSeconds: {{ .Values.cluster.metastore.autoscaling.scaleUpStabilizationSeconds }}
+      scaleDownStabilizationSeconds: {{ .Values.cluster.metastore.autoscaling.scaleDownStabilizationSeconds }}
+      gracePeriodSeconds: {{ .Values.cluster.metastore.autoscaling.gracePeriodSeconds }}
+      metricsScrapeIntervalSeconds: {{ .Values.cluster.metastore.autoscaling.metricsScrapeIntervalSeconds | default 10 }}
+      cpuScaleUpThreshold: {{ .Values.cluster.metastore.autoscaling.cpuScaleUpThreshold | default 90 }}
+      cpuScaleDownThreshold: {{ .Values.cluster.metastore.autoscaling.cpuScaleDownThreshold | default 30 }}
+    {{- end }}
     {{- else }}
     {{- if .Values.cluster.metastore.externalUri }}
     externalUri: {{ .Values.cluster.metastore.externalUri | quote }}
@@ -96,6 +108,18 @@ spec:
     extraVolumeMounts:
       {{- toYaml .Values.cluster.hiveServer2.extraVolumeMounts | nindent 6 }}
     {{- end }}
+    {{- if and .Values.cluster.hiveServer2.autoscaling .Values.cluster.hiveServer2.autoscaling.enabled }}
+    autoscaling:
+      enabled: true
+      minReplicas: {{ .Values.cluster.hiveServer2.autoscaling.minReplicas }}
+      scaleUpThreshold: {{ .Values.cluster.hiveServer2.autoscaling.scaleUpThreshold }}
+      scaleUpStabilizationSeconds: {{ .Values.cluster.hiveServer2.autoscaling.scaleUpStabilizationSeconds }}
+      scaleDownStabilizationSeconds: {{ .Values.cluster.hiveServer2.autoscaling.scaleDownStabilizationSeconds }}
+      gracePeriodSeconds: {{ .Values.cluster.hiveServer2.autoscaling.gracePeriodSeconds }}
+      metricsScrapeIntervalSeconds: {{ .Values.cluster.hiveServer2.autoscaling.metricsScrapeIntervalSeconds | default 10 }}
+      cpuScaleUpThreshold: {{ .Values.cluster.hiveServer2.autoscaling.cpuScaleUpThreshold | default 90 }}
+      cpuScaleDownThreshold: {{ .Values.cluster.hiveServer2.autoscaling.cpuScaleDownThreshold | default 30 }}
+    {{- end }}
 
   llap:
     enabled: {{ .Values.cluster.llap.enabled }}
@@ -120,6 +144,16 @@ spec:
     extraVolumeMounts:
       {{- toYaml .Values.cluster.llap.extraVolumeMounts | nindent 6 }}
     {{- end }}
+    {{- if and .Values.cluster.llap.autoscaling .Values.cluster.llap.autoscaling.enabled }}
+    autoscaling:
+      enabled: true
+      minReplicas: {{ .Values.cluster.llap.autoscaling.minReplicas }}
+      scaleUpThreshold: {{ .Values.cluster.llap.autoscaling.scaleUpThreshold }}
+      scaleUpStabilizationSeconds: {{ .Values.cluster.llap.autoscaling.scaleUpStabilizationSeconds }}
+      scaleDownStabilizationSeconds: {{ .Values.cluster.llap.autoscaling.scaleDownStabilizationSeconds }}
+      gracePeriodSeconds: {{ .Values.cluster.llap.autoscaling.gracePeriodSeconds }}
+      metricsScrapeIntervalSeconds: {{ .Values.cluster.llap.autoscaling.metricsScrapeIntervalSeconds | default 10 }}
+    {{- end }}
     {{- end }}
 
   tezAm:
@@ -146,6 +180,15 @@ spec:
     extraVolumeMounts:
       {{- toYaml .Values.cluster.tezAm.extraVolumeMounts | nindent 6 }}
     {{- end }}
+    {{- if and .Values.cluster.tezAm.autoscaling .Values.cluster.tezAm.autoscaling.enabled }}
+    autoscaling:
+      enabled: true
+      minReplicas: {{ .Values.cluster.tezAm.autoscaling.minReplicas }}
+      scaleUpStabilizationSeconds: {{ .Values.cluster.tezAm.autoscaling.scaleUpStabilizationSeconds }}
+      scaleDownStabilizationSeconds: {{ .Values.cluster.tezAm.autoscaling.scaleDownStabilizationSeconds }}
+      gracePeriodSeconds: {{ .Values.cluster.tezAm.autoscaling.gracePeriodSeconds }}
+      metricsScrapeIntervalSeconds: {{ .Values.cluster.tezAm.autoscaling.metricsScrapeIntervalSeconds | default 10 }}
+    {{- end }}
     {{- end }}
 
   zookeeper:
@@ -176,4 +219,15 @@ spec:
   volumeMounts:
     {{- toYaml .Values.cluster.storage.volumeMounts | nindent 4 }}
   {{- end }}
+
+  {{- if and .Values.cluster.autoSuspend .Values.cluster.autoSuspend.enabled }}
+  autoSuspend:
+    enabled: true
+    idleTimeoutMinutes: {{ .Values.cluster.autoSuspend.idleTimeoutMinutes | default 15 }}
+    {{- if hasKey .Values.cluster.autoSuspend "includeMetastore" }}
+    includeMetastore: {{ .Values.cluster.autoSuspend.includeMetastore }}
+    {{- end }}
+  {{- end }}
+
+  suspend: false
 {{- end }}
diff --git a/packaging/src/kubernetes/helm/hive-operator/values.yaml b/packaging/src/kubernetes/helm/hive-operator/values.yaml
@@ -101,6 +101,20 @@ cluster:
       #   mountPath: /etc/gcs
       #   readOnly: true
 
+  # ---------------------------------------------------------------------------
+  # AUTO-SUSPEND — fully hibernates the cluster after idle timeout
+  # ---------------------------------------------------------------------------
+  # When enabled, the operator scales the cluster to 0 replicas once all components
+  # have been idle for idleTimeoutMinutes. Every active component needs autoscaling
+  # (Metastore exempt if includeMetastore is false). Manual suspend/wake via kubectl:
+  #   kubectl patch hivecluster hive --type=merge -p '{"spec":{"suspend":true}}'
+  #   kubectl patch hivecluster hive --type=merge -p '{"spec":{"suspend":false}}'
+  autoSuspend:
+    enabled: false
+    idleTimeoutMinutes: 15
+    # Set to false to keep the Metastore (HMS) running during suspend; HMS autoscaling is then not required
+    includeMetastore: true
+
   # ---------------------------------------------------------------------------
   # METASTORE — defaults to enabled, 2 replicas (HA)
   # ---------------------------------------------------------------------------
@@ -112,6 +126,19 @@ cluster:
     configOverrides: {}
     extraVolumes: []
     extraVolumeMounts: []
+    # Autoscaling (operator-driven, no external dependencies)
+    # The operator scrapes JMX Exporter metrics from pods directly.
+    # When enabled, 'replicas' above acts as the max replica ceiling.
+    autoscaling:
+      enabled: false
+      minReplicas: 1
+      scaleUpThreshold: 100
+      scaleUpStabilizationSeconds: 60
+      scaleDownStabilizationSeconds: 300
+      gracePeriodSeconds: 60
+      metricsScrapeIntervalSeconds: 10
+      cpuScaleUpThreshold: 90
+      cpuScaleDownThreshold: 30
     # Set to use an external Metastore instead of deploying one:
     # enabled: false
     # externalUri: "thrift://external-metastore:9083"
@@ -127,6 +154,18 @@ cluster:
     externalJars: []
     extraVolumes: []
     extraVolumeMounts: []
+    # Autoscaling (operator-driven, no external dependencies)
+    # When enabled, 'replicas' above acts as the max replica ceiling
+    autoscaling:
+      enabled: false
+      minReplicas: 1
+      scaleUpThreshold: 100
+      scaleUpStabilizationSeconds: 60
+      scaleDownStabilizationSeconds: 600
+      gracePeriodSeconds: 300
+      metricsScrapeIntervalSeconds: 10
+      cpuScaleUpThreshold: 90
+      cpuScaleDownThreshold: 30
 
   # ---------------------------------------------------------------------------
   # LLAP — enabled by default for full-HA
@@ -141,6 +180,17 @@ cluster:
     configOverrides: {}
     extraVolumes: []
     extraVolumeMounts: []
+    # Autoscaling (operator-driven, no external dependencies)
+    # minReplicas: 0 enables scale-to-zero — scales up when HS2 has active sessions
+    # When enabled, 'replicas' above acts as the max replica ceiling
+    autoscaling:
+      enabled: false
+      minReplicas: 0
+      scaleUpThreshold: 10
+      scaleUpStabilizationSeconds: 60
+      scaleDownStabilizationSeconds: 900
+      gracePeriodSeconds: 600
+      metricsScrapeIntervalSeconds: 10
 
   # ---------------------------------------------------------------------------
   # TEZ AM — enabled by default for full-HA
@@ -154,3 +204,15 @@ cluster:
     configOverrides: {}
     extraVolumes: []
     extraVolumeMounts: []
+    # Autoscaling (operator-driven, no external dependencies)
+    # minReplicas: 0 enables scale-to-zero — wakes when HS2 receives queries
+    # When enabled, 'replicas' above acts as the max replica ceiling
+    # TezAM scales demand-based: max(totalSessions, hs2Pods * sessionsPerQueue)
+    # No scaleUpThreshold needed — scaling is 1:1 with session demand
+    autoscaling:
+      enabled: false
+      minReplicas: 0
+      scaleUpStabilizationSeconds: 60
+      scaleDownStabilizationSeconds: 600
+      gracePeriodSeconds: 120
+      metricsScrapeIntervalSeconds: 10
diff --git a/packaging/src/kubernetes/pom.xml b/packaging/src/kubernetes/pom.xml
@@ -26,6 +26,10 @@
   <description>Kubernetes operator for managing Apache Hive clusters</description>
   <properties>
     <hive.path.to.root>../../..</hive.path.to.root>
+    <!-- The operator is a standalone shaded JAR and requires SLF4J 2.x
+         for the log4j-slf4j2-impl binding. This dedicated property intentionally
+         pins slf4j-api to 2.x instead of the parent's slf4j.version (1.7.x). -->
+    <slf4j2.version>2.0.16</slf4j2.version>
   </properties>
   <dependencies>
     <dependency>
@@ -48,6 +52,10 @@
       <artifactId>kubernetes-httpclient-vertx</artifactId>
       <version>${fabric8.version}</version>
     </dependency>
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-lang3</artifactId>
+    </dependency>
     <dependency>
       <groupId>io.github.java-diff-utils</groupId>
       <artifactId>java-diff-utils</artifactId>
@@ -65,9 +73,14 @@
       <version>${fabric8.version}</version>
       <scope>provided</scope>
     </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-api</artifactId>
+      <version>${slf4j2.version}</version>
+    </dependency>
     <dependency>
       <groupId>org.apache.logging.log4j</groupId>
-      <artifactId>log4j-slf4j-impl</artifactId>
+      <artifactId>log4j-slf4j2-impl</artifactId>
       <version>${log4j2.version}</version>
     </dependency>
     <dependency>
@@ -189,6 +202,7 @@
                   <executable>docker</executable>
                   <arguments>
                     <argument>build</argument>
+                    <argument>--no-cache</argument>
                     <argument>-t</argument>
                     <argument>apache/hive:operator-${project.version}</argument>
                     <argument>.</argument>

diff --git a/packaging/src/kubernetes/src/java/org/apache/hive/kubernetes/operator/HiveOperatorMain.java b/packaging/src/kubernetes/src/java/org/apache/hive/kubernetes/operator/HiveOperatorMain.java
@@ -19,7 +19,11 @@
 package org.apache.hive.kubernetes.operator;
 
 import io.javaoperatorsdk.operator.Operator;
+import io.javaoperatorsdk.operator.api.config.ControllerConfiguration;
+import io.javaoperatorsdk.operator.api.config.ResolvedControllerConfiguration;
+import org.apache.hive.kubernetes.operator.model.HiveCluster;
 import org.apache.hive.kubernetes.operator.reconciler.HiveClusterReconciler;
+import org.apache.hive.kubernetes.operator.reconciler.HiveWorkflowSpec;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -36,7 +40,16 @@ private HiveOperatorMain() {
   public static void main(String[] args) {
     LOG.info("Starting Hive Kubernetes Operator");
     Operator operator = new Operator();
-    operator.register(new HiveClusterReconciler());
+    HiveClusterReconciler reconciler = new HiveClusterReconciler();
+    // Get the annotation-derived base config, then inject our programmatic workflow spec.
+    ControllerConfiguration<HiveCluster> baseConfig =
+        operator.getConfigurationService().getConfigurationFor(reconciler);
+    HiveWorkflowSpec workflowSpec = new HiveWorkflowSpec();
+    ((ResolvedControllerConfiguration<HiveCluster>) baseConfig)
+        .setWorkflowSpec(workflowSpec);
+    LOG.info("Registered workflow with {} dependent resource specs",
+        workflowSpec.getDependentResourceSpecs().size());
+    operator.register(reconciler, baseConfig);
     operator.start();
     LOG.info("Hive Kubernetes Operator started successfully");
   }