diff --git a/google/cloud/aiplatform/models.py b/google/cloud/aiplatform/models.py
index e186b140ba..eab5c98a71 100644
--- a/google/cloud/aiplatform/models.py
+++ b/google/cloud/aiplatform/models.py
@@ -1367,6 +1367,9 @@ def deploy(
         autoscaling_target_cpu_utilization: Optional[int] = None,
         autoscaling_target_accelerator_duty_cycle: Optional[int] = None,
         autoscaling_target_request_count_per_minute: Optional[int] = None,
+        autoscaling_target_dcgm_fi_dev_gpu_util: Optional[int] = None,
+        autoscaling_target_vllm_gpu_cache_usage_perc: Optional[int] = None,
+        autoscaling_target_vllm_num_requests_waiting: Optional[int] = None,
         autoscaling_target_pubsub_num_undelivered_messages: Optional[int] = None,
         autoscaling_pubsub_subscription_labels: Optional[Dict[str, str]] = None,
         enable_access_logging=False,
@@ -1467,6 +1470,13 @@ def deploy(
             autoscaling_target_request_count_per_minute (int):
                 Optional. The target number of requests per minute for autoscaling.
                 If set, the model will be scaled based on the number of requests it receives.
+            autoscaling_target_dcgm_fi_dev_gpu_util (int):
+                Optional. Target DCGM metrics for GPU utilization.
+            autoscaling_target_vllm_gpu_cache_usage_perc (int):
+                Optional. Target vLLM metrics for GPU KV cache usage percentage.
+            autoscaling_target_vllm_num_requests_waiting (int):
+                Optional. Target vLLM metrics for number of inference requests
+                currently waiting in the queue.
             autoscaling_target_pubsub_num_undelivered_messages (int):
                 Optional. The target number of pubsub undelivered messages for autoscaling.
                 If set, the model will be scaled based on the pubsub queue size.
@@ -1555,6 +1565,9 @@ def deploy(
             autoscaling_target_cpu_utilization=autoscaling_target_cpu_utilization,
             autoscaling_target_accelerator_duty_cycle=autoscaling_target_accelerator_duty_cycle,
             autoscaling_target_request_count_per_minute=autoscaling_target_request_count_per_minute,
+            autoscaling_target_dcgm_fi_dev_gpu_util=autoscaling_target_dcgm_fi_dev_gpu_util,
+            autoscaling_target_vllm_gpu_cache_usage_perc=autoscaling_target_vllm_gpu_cache_usage_perc,
+            autoscaling_target_vllm_num_requests_waiting=autoscaling_target_vllm_num_requests_waiting,
             autoscaling_target_pubsub_num_undelivered_messages=autoscaling_target_pubsub_num_undelivered_messages,
             autoscaling_pubsub_subscription_labels=autoscaling_pubsub_subscription_labels,
             spot=spot,
@@ -1591,6 +1604,9 @@ def _deploy(
         autoscaling_target_cpu_utilization: Optional[int] = None,
         autoscaling_target_accelerator_duty_cycle: Optional[int] = None,
         autoscaling_target_request_count_per_minute: Optional[int] = None,
+        autoscaling_target_dcgm_fi_dev_gpu_util: Optional[int] = None,
+        autoscaling_target_vllm_gpu_cache_usage_perc: Optional[int] = None,
+        autoscaling_target_vllm_num_requests_waiting: Optional[int] = None,
         autoscaling_target_pubsub_num_undelivered_messages: Optional[int] = None,
         autoscaling_pubsub_subscription_labels: Optional[Dict[str, str]] = None,
         spot: bool = False,
@@ -1694,6 +1710,13 @@ def _deploy(
             autoscaling_target_request_count_per_minute (int):
                 Optional. The target number of requests per minute for autoscaling.
                 If set, the model will be scaled based on the number of requests it receives.
+            autoscaling_target_dcgm_fi_dev_gpu_util (int):
+                Optional. Target DCGM metrics for GPU utilization.
+            autoscaling_target_vllm_gpu_cache_usage_perc (int):
+                Optional. Target vLLM metrics for GPU KV cache usage percentage.
+            autoscaling_target_vllm_num_requests_waiting (int):
+                Optional. Target vLLM metrics for number of inference requests
+                currently waiting in the queue.
             autoscaling_target_pubsub_num_undelivered_messages (int):
                 Optional. The target number of pubsub undelivered messages for autoscaling.
                 If set, the model will be scaled based on the pubsub queue size.
@@ -1759,6 +1782,9 @@ def _deploy(
             autoscaling_target_cpu_utilization=autoscaling_target_cpu_utilization,
             autoscaling_target_accelerator_duty_cycle=autoscaling_target_accelerator_duty_cycle,
             autoscaling_target_request_count_per_minute=autoscaling_target_request_count_per_minute,
+            autoscaling_target_dcgm_fi_dev_gpu_util=autoscaling_target_dcgm_fi_dev_gpu_util,
+            autoscaling_target_vllm_gpu_cache_usage_perc=autoscaling_target_vllm_gpu_cache_usage_perc,
+            autoscaling_target_vllm_num_requests_waiting=autoscaling_target_vllm_num_requests_waiting,
             autoscaling_target_pubsub_num_undelivered_messages=autoscaling_target_pubsub_num_undelivered_messages,
             autoscaling_pubsub_subscription_labels=autoscaling_pubsub_subscription_labels,
             spot=spot,
@@ -1802,6 +1828,9 @@ def _deploy_call(
         autoscaling_target_cpu_utilization: Optional[int] = None,
         autoscaling_target_accelerator_duty_cycle: Optional[int] = None,
         autoscaling_target_request_count_per_minute: Optional[int] = None,
+        autoscaling_target_dcgm_fi_dev_gpu_util: Optional[int] = None,
+        autoscaling_target_vllm_gpu_cache_usage_perc: Optional[int] = None,
+        autoscaling_target_vllm_num_requests_waiting: Optional[int] = None,
         autoscaling_target_pubsub_num_undelivered_messages: Optional[int] = None,
         autoscaling_pubsub_subscription_labels: Optional[Dict[str, str]] = None,
         spot: bool = False,
@@ -1911,6 +1940,13 @@ def _deploy_call(
                 A default value of 60 will be used if not specified.
             autoscaling_target_request_count_per_minute (int):
                 Optional. Target request count per minute per instance.
+            autoscaling_target_dcgm_fi_dev_gpu_util (int):
+                Optional. Target DCGM metrics for GPU utilization.
+            autoscaling_target_vllm_gpu_cache_usage_perc (int):
+                Optional. Target vLLM metrics for GPU KV cache usage percentage.
+            autoscaling_target_vllm_num_requests_waiting (int):
+                Optional. Target vLLM metrics for number of inference requests
+                currently waiting in the queue.
             autoscaling_target_pubsub_num_undelivered_messages (int):
                 Optional. Target pubsub queue size per instance.
             autoscaling_pubsub_subscription_labels (Dict[str, str]):
@@ -2006,6 +2042,9 @@ def _deploy_call(
             or autoscaling_target_accelerator_duty_cycle
             or autoscaling_target_cpu_utilization
             or autoscaling_target_request_count_per_minute
+            or autoscaling_target_dcgm_fi_dev_gpu_util
+            or autoscaling_target_vllm_gpu_cache_usage_perc
+            or autoscaling_target_vllm_num_requests_waiting
             or autoscaling_target_pubsub_num_undelivered_messages
             or autoscaling_pubsub_subscription_labels
         )
@@ -2017,6 +2056,9 @@ def _deploy_call(
                     "autoscaling_target_accelerator_duty_cycle, "
                     "autoscaling_target_cpu_utilization, "
                     "autoscaling_target_request_count_per_minute, "
+                    "autoscaling_target_dcgm_fi_dev_gpu_util, "
+                    "autoscaling_target_vllm_gpu_cache_usage_perc, "
+                    "autoscaling_target_vllm_num_requests_waiting, "
                     "autoscaling_target_pubsub_num_undelivered_messages, "
                     "autoscaling_pubsub_subscription_labels parameters "
                     "may not be set when `deployment_resource_pool` is "
@@ -2078,6 +2120,9 @@ def _deploy_call(
             or autoscaling_target_accelerator_duty_cycle
             or autoscaling_target_cpu_utilization
             or autoscaling_target_request_count_per_minute
+            or autoscaling_target_dcgm_fi_dev_gpu_util
+            or autoscaling_target_vllm_gpu_cache_usage_perc
+            or autoscaling_target_vllm_num_requests_waiting
             or autoscaling_target_pubsub_num_undelivered_messages
             or autoscaling_pubsub_subscription_labels
         )
@@ -2095,6 +2140,9 @@ def _deploy_call(
                     "autoscaling_target_accelerator_duty_cycle, "
                     "autoscaling_target_cpu_utilization, "
                     "autoscaling_target_request_count_per_minute, "
+                    "autoscaling_target_dcgm_fi_dev_gpu_util, "
+                    "autoscaling_target_vllm_gpu_cache_usage_perc, "
+                    "autoscaling_target_vllm_num_requests_waiting, "
                     "autoscaling_target_pubsub_num_undelivered_messages, "
                     "autoscaling_pubsub_subscription_labels parameters "
                     "are ignored."
@@ -2156,6 +2204,48 @@ def _deploy_call(
                     [autoscaling_metric_spec]
                 )

+            if autoscaling_target_dcgm_fi_dev_gpu_util:
+                autoscaling_metric_spec = (
+                    gca_machine_resources_compat.AutoscalingMetricSpec(
+                        metric_name=(
+                            "prometheus.googleapis.com/"
+                            "vertex_dcgm_fi_dev_gpu_util"
+                        ),
+                        target=autoscaling_target_dcgm_fi_dev_gpu_util,
+                    )
+                )
+                dedicated_resources.autoscaling_metric_specs.extend(
+                    [autoscaling_metric_spec]
+                )
+
+            if autoscaling_target_vllm_gpu_cache_usage_perc:
+                autoscaling_metric_spec = (
+                    gca_machine_resources_compat.AutoscalingMetricSpec(
+                        metric_name=(
+                            "prometheus.googleapis.com/"
+                            "vertex_vllm_gpu_cache_usage_perc"
+                        ),
+                        target=autoscaling_target_vllm_gpu_cache_usage_perc,
+                    )
+                )
+                dedicated_resources.autoscaling_metric_specs.extend(
+                    [autoscaling_metric_spec]
+                )
+
+            if autoscaling_target_vllm_num_requests_waiting:
+                autoscaling_metric_spec = (
+                    gca_machine_resources_compat.AutoscalingMetricSpec(
+                        metric_name=(
+                            "prometheus.googleapis.com/"
+                            "vertex_vllm_num_requests_waiting"
+                        ),
+                        target=autoscaling_target_vllm_num_requests_waiting,
+                    )
+                )
+                dedicated_resources.autoscaling_metric_specs.extend(
+                    [autoscaling_metric_spec]
+                )
+
             if autoscaling_target_pubsub_num_undelivered_messages:
                 autoscaling_metric_spec = gca_machine_resources.AutoscalingMetricSpec(
                     metric_name=(
@@ -4492,6 +4582,9 @@ def deploy(
         autoscaling_target_cpu_utilization: Optional[int] = None,
         autoscaling_target_accelerator_duty_cycle: Optional[int] = None,
         autoscaling_target_request_count_per_minute: Optional[int] = None,
+        autoscaling_target_dcgm_fi_dev_gpu_util: Optional[int] = None,
+        autoscaling_target_vllm_gpu_cache_usage_perc: Optional[int] = None,
+        autoscaling_target_vllm_num_requests_waiting: Optional[int] = None,
         autoscaling_target_pubsub_num_undelivered_messages: Optional[int] = None,
         autoscaling_pubsub_subscription_labels: Optional[Dict[str, str]] = None,
     ) -> None:
@@ -4673,6 +4766,9 @@ def deploy(
             autoscaling_target_cpu_utilization=autoscaling_target_cpu_utilization,
             autoscaling_target_accelerator_duty_cycle=autoscaling_target_accelerator_duty_cycle,
             autoscaling_target_request_count_per_minute=autoscaling_target_request_count_per_minute,
+            autoscaling_target_dcgm_fi_dev_gpu_util=autoscaling_target_dcgm_fi_dev_gpu_util,
+            autoscaling_target_vllm_gpu_cache_usage_perc=autoscaling_target_vllm_gpu_cache_usage_perc,
+            autoscaling_target_vllm_num_requests_waiting=autoscaling_target_vllm_num_requests_waiting,
             autoscaling_target_pubsub_num_undelivered_messages=autoscaling_target_pubsub_num_undelivered_messages,
             autoscaling_pubsub_subscription_labels=autoscaling_pubsub_subscription_labels,
         )
@@ -5748,6 +5844,9 @@ def deploy(
         autoscaling_target_cpu_utilization: Optional[int] = None,
         autoscaling_target_accelerator_duty_cycle: Optional[int] = None,
         autoscaling_target_request_count_per_minute: Optional[int] = None,
+        autoscaling_target_dcgm_fi_dev_gpu_util: Optional[int] = None,
+        autoscaling_target_vllm_gpu_cache_usage_perc: Optional[int] = None,
+        autoscaling_target_vllm_num_requests_waiting: Optional[int] = None,
         autoscaling_target_pubsub_num_undelivered_messages: Optional[int] = None,
         autoscaling_pubsub_subscription_labels: Optional[Dict[str, str]] = None,
         enable_access_logging=False,
@@ -5870,6 +5969,13 @@ def deploy(
             autoscaling_target_request_count_per_minute (int):
                 Optional. The target number of requests per minute for autoscaling.
                 If set, the model will be scaled based on the number of requests it receives.
+            autoscaling_target_dcgm_fi_dev_gpu_util (int):
+                Optional. Target DCGM metrics for GPU utilization.
+            autoscaling_target_vllm_gpu_cache_usage_perc (int):
+                Optional. Target vLLM metrics for GPU KV cache usage percentage.
+            autoscaling_target_vllm_num_requests_waiting (int):
+                Optional. Target vLLM metrics for number of inference requests
+                currently waiting in the queue.
             autoscaling_target_pubsub_num_undelivered_messages (int):
                 Optional. The target number of pubsub undelivered messages for autoscaling.
                 If set, the model will be scaled based on the pubsub queue size.
@@ -5929,6 +6035,13 @@ def deploy(
             autoscaling_target_request_count_per_minute (int):
                 Optional. The target number of requests per minute for autoscaling.
                 If set, the model will be scaled based on the number of requests it receives.
+            autoscaling_target_dcgm_fi_dev_gpu_util (int):
+                Optional. Target DCGM metrics for GPU utilization.
+            autoscaling_target_vllm_gpu_cache_usage_perc (int):
+                Optional. Target vLLM metrics for GPU KV cache usage percentage.
+            autoscaling_target_vllm_num_requests_waiting (int):
+                Optional. Target vLLM metrics for number of inference requests
+                currently waiting in the queue.
             autoscaling_target_pubsub_num_undelivered_messages (int):
                 Optional. The target number of pubsub undelivered messages for autoscaling.
                 If set, the model will be scaled based on the pubsub queue size.
@@ -6001,6 +6114,9 @@ def deploy(
             autoscaling_target_cpu_utilization=autoscaling_target_cpu_utilization,
             autoscaling_target_accelerator_duty_cycle=autoscaling_target_accelerator_duty_cycle,
             autoscaling_target_request_count_per_minute=autoscaling_target_request_count_per_minute,
+            autoscaling_target_dcgm_fi_dev_gpu_util=autoscaling_target_dcgm_fi_dev_gpu_util,
+            autoscaling_target_vllm_gpu_cache_usage_perc=autoscaling_target_vllm_gpu_cache_usage_perc,
+            autoscaling_target_vllm_num_requests_waiting=autoscaling_target_vllm_num_requests_waiting,
             autoscaling_target_pubsub_num_undelivered_messages=autoscaling_target_pubsub_num_undelivered_messages,
             autoscaling_pubsub_subscription_labels=autoscaling_pubsub_subscription_labels,
             spot=spot,
@@ -6047,6 +6163,9 @@ def _deploy(
         autoscaling_target_cpu_utilization: Optional[int] = None,
         autoscaling_target_accelerator_duty_cycle: Optional[int] = None,
         autoscaling_target_request_count_per_minute: Optional[int] = None,
+        autoscaling_target_dcgm_fi_dev_gpu_util: Optional[int] = None,
+        autoscaling_target_vllm_gpu_cache_usage_perc: Optional[int] = None,
+        autoscaling_target_vllm_num_requests_waiting: Optional[int] = None,
         autoscaling_target_pubsub_num_undelivered_messages: Optional[int] = None,
         autoscaling_pubsub_subscription_labels: Optional[Dict[str, str]] = None,
         spot: bool = False,
@@ -6171,6 +6290,13 @@ def _deploy(
             autoscaling_target_request_count_per_minute (int):
                 Optional. The target number of requests per minute for autoscaling.
                 If set, the model will be scaled based on the number of requests it receives.
+            autoscaling_target_dcgm_fi_dev_gpu_util (int):
+                Optional. Target DCGM metrics for GPU utilization.
+            autoscaling_target_vllm_gpu_cache_usage_perc (int):
+                Optional. Target vLLM metrics for GPU KV cache usage percentage.
+            autoscaling_target_vllm_num_requests_waiting (int):
+                Optional. Target vLLM metrics for number of inference requests
+                currently waiting in the queue.
             autoscaling_target_pubsub_num_undelivered_messages (int):
                 Optional. The target number of pubsub undelivered messages for autoscaling.
                 If set, the model will be scaled based on the pubsub queue size.
@@ -6267,6 +6393,9 @@ def _deploy(
             autoscaling_target_cpu_utilization=autoscaling_target_cpu_utilization,
             autoscaling_target_accelerator_duty_cycle=autoscaling_target_accelerator_duty_cycle,
             autoscaling_target_request_count_per_minute=autoscaling_target_request_count_per_minute,
+            autoscaling_target_dcgm_fi_dev_gpu_util=autoscaling_target_dcgm_fi_dev_gpu_util,
+            autoscaling_target_vllm_gpu_cache_usage_perc=autoscaling_target_vllm_gpu_cache_usage_perc,
+            autoscaling_target_vllm_num_requests_waiting=autoscaling_target_vllm_num_requests_waiting,
             autoscaling_target_pubsub_num_undelivered_messages=autoscaling_target_pubsub_num_undelivered_messages,
             autoscaling_pubsub_subscription_labels=autoscaling_pubsub_subscription_labels,
             spot=spot,
diff --git a/google/cloud/aiplatform/preview/models.py b/google/cloud/aiplatform/preview/models.py
index b3cb9f9ba8..64714f6abe 100644
--- a/google/cloud/aiplatform/preview/models.py
+++ b/google/cloud/aiplatform/preview/models.py
@@ -783,6 +783,9 @@ def deploy(
         autoscaling_target_cpu_utilization: Optional[int] = None,
         autoscaling_target_accelerator_duty_cycle: Optional[int] = None,
         autoscaling_target_request_count_per_minute: Optional[int] = None,
+        autoscaling_target_dcgm_fi_dev_gpu_util: Optional[int] = None,
+        autoscaling_target_vllm_gpu_cache_usage_perc: Optional[int] = None,
+        autoscaling_target_vllm_num_requests_waiting: Optional[int] = None,
         autoscaling_target_pubsub_num_undelivered_messages: Optional[int] = None,
         autoscaling_pubsub_subscription_labels: Optional[Dict[str, str]] = None,
         deployment_resource_pool: Optional[DeploymentResourcePool] = None,
@@ -871,8 +874,14 @@ def deploy(
                 specified. A default value of 60 will be used if not specified.
             autoscaling_target_request_count_per_minute (int): Target request
                 count per minute per instance.
-            autoscaling_target_pubsub_num_undelivered_messages (int): Target
-                number of pubsub undelivered messages per instance.
+            autoscaling_target_dcgm_fi_dev_gpu_util (int): Target DCGM metrics for
+                GPU utilization.
+            autoscaling_target_vllm_gpu_cache_usage_perc (int): Target vLLM metrics
+                for GPU KV cache usage percentage.
+            autoscaling_target_vllm_num_requests_waiting (int): Target vLLM metrics
+                for number of inference requests currently waiting in the queue.
+            autoscaling_target_pubsub_num_undelivered_messages (int): Target number
+                of pubsub undelivered messages per instance.
             autoscaling_pubsub_subscription_labels (Dict[str, str]): Optional.
                 Monitored resource labels as key value pairs for metric filtering
                 for pubsub_num_undelivered_messages.
@@ -961,6 +970,9 @@ def deploy(
             autoscaling_target_cpu_utilization=autoscaling_target_cpu_utilization,
             autoscaling_target_accelerator_duty_cycle=autoscaling_target_accelerator_duty_cycle,
             autoscaling_target_request_count_per_minute=autoscaling_target_request_count_per_minute,
+            autoscaling_target_dcgm_fi_dev_gpu_util=autoscaling_target_dcgm_fi_dev_gpu_util,
+            autoscaling_target_vllm_gpu_cache_usage_perc=autoscaling_target_vllm_gpu_cache_usage_perc,
+            autoscaling_target_vllm_num_requests_waiting=autoscaling_target_vllm_num_requests_waiting,
             autoscaling_target_pubsub_num_undelivered_messages=autoscaling_target_pubsub_num_undelivered_messages,
             autoscaling_pubsub_subscription_labels=autoscaling_pubsub_subscription_labels,
             deployment_resource_pool=deployment_resource_pool,
@@ -996,6 +1008,9 @@ def _deploy(
         autoscaling_target_cpu_utilization: Optional[int] = None,
         autoscaling_target_accelerator_duty_cycle: Optional[int] = None,
         autoscaling_target_request_count_per_minute: Optional[int] = None,
+        autoscaling_target_dcgm_fi_dev_gpu_util: Optional[int] = None,
+        autoscaling_target_vllm_gpu_cache_usage_perc: Optional[int] = None,
+        autoscaling_target_vllm_num_requests_waiting: Optional[int] = None,
         autoscaling_target_pubsub_num_undelivered_messages: Optional[int] = None,
         autoscaling_pubsub_subscription_labels: Optional[Dict[str, str]] = None,
         deployment_resource_pool: Optional[DeploymentResourcePool] = None,
@@ -1078,8 +1093,14 @@ def _deploy(
                 specified. A default value of 60 will be used if not specified.
             autoscaling_target_request_count_per_minute (int): Target request
                 count per minute per instance.
-            autoscaling_target_pubsub_num_undelivered_messages (int): Target
-                number of pubsub undelivered messages per instance.
+            autoscaling_target_dcgm_fi_dev_gpu_util (int): Target DCGM metrics for
+                GPU utilization.
+            autoscaling_target_vllm_gpu_cache_usage_perc (int): Target vLLM metrics
+                for GPU KV cache usage percentage.
+            autoscaling_target_vllm_num_requests_waiting (int): Target vLLM metrics
+                for number of inference requests currently waiting in the queue.
+            autoscaling_target_pubsub_num_undelivered_messages (int): Target number
+                of pubsub undelivered messages per instance.
             autoscaling_pubsub_subscription_labels (Dict[str, str]): Optional.
                 Monitored resource labels as key value pairs for metric filtering
                 for pubsub_num_undelivered_messages.
@@ -1154,6 +1175,9 @@ def _deploy(
             autoscaling_target_cpu_utilization=autoscaling_target_cpu_utilization,
             autoscaling_target_accelerator_duty_cycle=autoscaling_target_accelerator_duty_cycle,
             autoscaling_target_request_count_per_minute=autoscaling_target_request_count_per_minute,
+            autoscaling_target_dcgm_fi_dev_gpu_util=autoscaling_target_dcgm_fi_dev_gpu_util,
+            autoscaling_target_vllm_gpu_cache_usage_perc=autoscaling_target_vllm_gpu_cache_usage_perc,
+            autoscaling_target_vllm_num_requests_waiting=autoscaling_target_vllm_num_requests_waiting,
             autoscaling_target_pubsub_num_undelivered_messages=autoscaling_target_pubsub_num_undelivered_messages,
             autoscaling_pubsub_subscription_labels=autoscaling_pubsub_subscription_labels,
             deployment_resource_pool=deployment_resource_pool,
@@ -1196,6 +1220,9 @@ def _deploy_call(
         autoscaling_target_cpu_utilization: Optional[int] = None,
         autoscaling_target_accelerator_duty_cycle: Optional[int] = None,
         autoscaling_target_request_count_per_minute: Optional[int] = None,
+        autoscaling_target_dcgm_fi_dev_gpu_util: Optional[int] = None,
+        autoscaling_target_vllm_gpu_cache_usage_perc: Optional[int] = None,
+        autoscaling_target_vllm_num_requests_waiting: Optional[int] = None,
         autoscaling_target_pubsub_num_undelivered_messages: Optional[int] = None,
         autoscaling_pubsub_subscription_labels: Optional[Dict[str, str]] = None,
         deployment_resource_pool: Optional[DeploymentResourcePool] = None,
@@ -1285,6 +1312,13 @@ def _deploy_call(
                 not specified.
             autoscaling_target_request_count_per_minute (int):
                 Optional. Target request count per minute per instance.
+            autoscaling_target_dcgm_fi_dev_gpu_util (int): Optional. Target DCGM
+                metrics for GPU utilization.
+            autoscaling_target_vllm_gpu_cache_usage_perc (int): Optional. Target
+                vLLM metrics for GPU KV cache usage percentage.
+            autoscaling_target_vllm_num_requests_waiting (int): Optional. Target
+                vLLM metrics for number of inference requests currently waiting in the
+                queue.
             autoscaling_target_pubsub_num_undelivered_messages (int): Optional.
                 Target number of pubsub undelivered messages per instance.
             autoscaling_pubsub_subscription_labels (Dict[str, str]): Optional.
@@ -1385,6 +1419,9 @@ def _deploy_call(
             or autoscaling_target_accelerator_duty_cycle
             or autoscaling_target_request_count_per_minute
             or autoscaling_target_cpu_utilization
+            or autoscaling_target_dcgm_fi_dev_gpu_util
+            or autoscaling_target_vllm_gpu_cache_usage_perc
+            or autoscaling_target_vllm_num_requests_waiting
             or autoscaling_target_pubsub_num_undelivered_messages
             or autoscaling_pubsub_subscription_labels
         )
@@ -1402,6 +1439,9 @@ def _deploy_call(
                     "autoscaling_target_accelerator_duty_cycle, "
                     "autoscaling_target_cpu_utilization, "
                     "autoscaling_target_request_count_per_minute, "
+                    "autoscaling_target_dcgm_fi_dev_gpu_util, "
+                    "autoscaling_target_vllm_gpu_cache_usage_perc, "
+                    "autoscaling_target_vllm_num_requests_waiting, "
                     "autoscaling_target_pubsub_num_undelivered_messages, "
                     "autoscaling_pubsub_subscription_labels parameters "
                     "are ignored."
@@ -1482,11 +1522,51 @@ def _deploy_call(
                     [autoscaling_metric_spec]
                 )

+            if autoscaling_target_dcgm_fi_dev_gpu_util:
+                autoscaling_metric_spec = (
+                    gca_machine_resources_compat.AutoscalingMetricSpec(
+                        metric_name=(
+                            "prometheus.googleapis.com/vertex_dcgm_fi_dev_gpu_util"
+                        ),
+                        target=autoscaling_target_dcgm_fi_dev_gpu_util,
+                    )
+                )
+                dedicated_resources.autoscaling_metric_specs.extend(
+                    [autoscaling_metric_spec]
+                )
+
+            if autoscaling_target_vllm_gpu_cache_usage_perc:
+                autoscaling_metric_spec = (
+                    gca_machine_resources_compat.AutoscalingMetricSpec(
+                        metric_name=(
+                            "prometheus.googleapis.com/"
+                            "vertex_vllm_gpu_cache_usage_perc"
+                        ),
+                        target=autoscaling_target_vllm_gpu_cache_usage_perc,
+                    )
+                )
+                dedicated_resources.autoscaling_metric_specs.extend(
+                    [autoscaling_metric_spec]
+                )
+
+            if autoscaling_target_vllm_num_requests_waiting:
+                autoscaling_metric_spec = (
+                    gca_machine_resources_compat.AutoscalingMetricSpec(
+                        metric_name=(
+                            "prometheus.googleapis.com/"
+                            "vertex_vllm_num_requests_waiting"
+                        ),
+                        target=autoscaling_target_vllm_num_requests_waiting,
+                    )
+                )
+                dedicated_resources.autoscaling_metric_specs.extend(
+                    [autoscaling_metric_spec]
+                )
+
             if autoscaling_target_pubsub_num_undelivered_messages:
                 autoscaling_metric_spec = gca_machine_resources_compat.AutoscalingMetricSpec(
                     metric_name=(
-                        "pubsub.googleapis.com/subscription/"
-                        "num_undelivered_messages"
+                        "pubsub.googleapis.com/subscription/num_undelivered_messages"
                     ),
                     target=autoscaling_target_pubsub_num_undelivered_messages,
                     monitored_resource_labels=autoscaling_pubsub_subscription_labels,
@@ -1542,6 +1622,9 @@ def _deploy_call(
             or autoscaling_target_accelerator_duty_cycle
             or autoscaling_target_cpu_utilization
             or autoscaling_target_request_count_per_minute
+            or autoscaling_target_dcgm_fi_dev_gpu_util
+            or autoscaling_target_vllm_gpu_cache_usage_perc
+            or autoscaling_target_vllm_num_requests_waiting
             or autoscaling_target_pubsub_num_undelivered_messages
             or autoscaling_pubsub_subscription_labels
         )
@@ -1553,6 +1636,9 @@ def _deploy_call(
                     "autoscaling_target_accelerator_duty_cycle, "
                     "autoscaling_target_cpu_utilization, "
                     "autoscaling_target_request_count_per_minute, "
+                    "autoscaling_target_dcgm_fi_dev_gpu_util, "
+                    "autoscaling_target_vllm_gpu_cache_usage_perc, "
+                    "autoscaling_target_vllm_num_requests_waiting, "
                     "autoscaling_target_pubsub_num_undelivered_messages, "
                     "autoscaling_pubsub_subscription_labels parameters "
                     "may not be set when `deployment_resource_pool` is "
@@ -1814,6 +1900,9 @@ def deploy(
         autoscaling_target_cpu_utilization: Optional[int] = None,
         autoscaling_target_accelerator_duty_cycle: Optional[int] = None,
         autoscaling_target_request_count_per_minute: Optional[int] = None,
+        autoscaling_target_dcgm_fi_dev_gpu_util: Optional[int] = None,
+        autoscaling_target_vllm_gpu_cache_usage_perc: Optional[int] = None,
+        autoscaling_target_vllm_num_requests_waiting: Optional[int] = None,
         autoscaling_target_pubsub_num_undelivered_messages: Optional[int] = None,
         autoscaling_pubsub_subscription_labels: Optional[Dict[str, str]] = None,
         deployment_resource_pool: Optional[DeploymentResourcePool] = None,
@@ -1923,6 +2012,13 @@ def deploy(
                 not specified.
             autoscaling_target_request_count_per_minute (int):
                 Optional. Target request count per minute per instance.
+            autoscaling_target_dcgm_fi_dev_gpu_util (int): Optional. Target DCGM
+                metrics for GPU utilization.
+            autoscaling_target_vllm_gpu_cache_usage_perc (int): Optional. Target
+                vLLM metrics for GPU KV cache usage percentage.
+            autoscaling_target_vllm_num_requests_waiting (int): Optional. Target
+                vLLM metrics for number of inference requests currently waiting in the
+                queue.
             autoscaling_target_pubsub_num_undelivered_messages (int): Optional.
                 Target number of pubsub undelivered messages per instance.
             autoscaling_pubsub_subscription_labels (Dict[str, str]): Optional.
@@ -2030,6 +2126,9 @@ def deploy(
             autoscaling_target_cpu_utilization=autoscaling_target_cpu_utilization,
             autoscaling_target_accelerator_duty_cycle=autoscaling_target_accelerator_duty_cycle,
             autoscaling_target_request_count_per_minute=autoscaling_target_request_count_per_minute,
+            autoscaling_target_dcgm_fi_dev_gpu_util=autoscaling_target_dcgm_fi_dev_gpu_util,
+            autoscaling_target_vllm_gpu_cache_usage_perc=autoscaling_target_vllm_gpu_cache_usage_perc,
+            autoscaling_target_vllm_num_requests_waiting=autoscaling_target_vllm_num_requests_waiting,
             autoscaling_target_pubsub_num_undelivered_messages=autoscaling_target_pubsub_num_undelivered_messages,
             autoscaling_pubsub_subscription_labels=autoscaling_pubsub_subscription_labels,
             deployment_resource_pool=deployment_resource_pool,
@@ -2071,6 +2170,9 @@ def _deploy(
         autoscaling_target_cpu_utilization: Optional[int] = None,
         autoscaling_target_accelerator_duty_cycle: Optional[int] = None,
         autoscaling_target_request_count_per_minute: Optional[int] = None,
+        autoscaling_target_dcgm_fi_dev_gpu_util: Optional[int] = None,
+        autoscaling_target_vllm_gpu_cache_usage_perc: Optional[int] = None,
+        autoscaling_target_vllm_num_requests_waiting: Optional[int] = None,
         autoscaling_target_pubsub_num_undelivered_messages: Optional[int] = None,
         autoscaling_pubsub_subscription_labels: Optional[Dict[str, str]] = None,
         deployment_resource_pool: Optional[DeploymentResourcePool] = None,
@@ -2172,6 +2274,13 @@ def _deploy(
                 not specified.
             autoscaling_target_request_count_per_minute (int):
                 Optional. Target request count per minute per instance.
+            autoscaling_target_dcgm_fi_dev_gpu_util (int): Optional. Target DCGM
+                metrics for GPU utilization.
+            autoscaling_target_vllm_gpu_cache_usage_perc (int): Optional. Target
+                vLLM metrics for GPU KV cache usage percentage.
+            autoscaling_target_vllm_num_requests_waiting (int): Optional. Target
+                vLLM metrics for number of inference requests currently waiting in the
+                queue.
             autoscaling_target_pubsub_num_undelivered_messages (int): Optional.
                 Target number of pubsub undelivered messages per instance.
             autoscaling_pubsub_subscription_labels (Dict[str, str]): Optional.
@@ -2281,6 +2390,9 @@ def _deploy(
             autoscaling_target_cpu_utilization=autoscaling_target_cpu_utilization,
             autoscaling_target_accelerator_duty_cycle=autoscaling_target_accelerator_duty_cycle,
             autoscaling_target_request_count_per_minute=autoscaling_target_request_count_per_minute,
+            autoscaling_target_dcgm_fi_dev_gpu_util=autoscaling_target_dcgm_fi_dev_gpu_util,
+            autoscaling_target_vllm_gpu_cache_usage_perc=autoscaling_target_vllm_gpu_cache_usage_perc,
+            autoscaling_target_vllm_num_requests_waiting=autoscaling_target_vllm_num_requests_waiting,
             autoscaling_target_pubsub_num_undelivered_messages=autoscaling_target_pubsub_num_undelivered_messages,
             autoscaling_pubsub_subscription_labels=autoscaling_pubsub_subscription_labels,
             deployment_resource_pool=deployment_resource_pool,
diff --git a/tests/unit/aiplatform/test_endpoints.py b/tests/unit/aiplatform/test_endpoints.py
index 085139464a..89b53f6aef 100644
--- a/tests/unit/aiplatform/test_endpoints.py
+++ b/tests/unit/aiplatform/test_endpoints.py
@@ -146,6 +146,15 @@
 _TEST_METRIC_NAME_REQUEST_COUNT = (
     "aiplatform.googleapis.com/prediction/online/request_count"
 )
+_TEST_METRIC_NAME_DCGM_METRICS_GPU_UTILIZATION = (
+    "prometheus.googleapis.com/vertex_dcgm_fi_dev_gpu_util"
+)
+_TEST_METRIC_NAME_VLLM_METRICS_GPU_CACHE_USAGE_PERCENTAGE = (
+    "prometheus.googleapis.com/vertex_vllm_gpu_cache_usage_perc"
+)
+_TEST_METRIC_NAME_VLLM_METRICS_NUM_REQUESTS_WAITING = (
+    "prometheus.googleapis.com/vertex_vllm_num_requests_waiting"
+)
 _TEST_METRIC_NAME_PUBSUB_NUM_UNDELIVERED_MESSAGE = (
     "pubsub.googleapis.com/subscription/num_undelivered_messages"
 )
@@ -2241,6 +2250,130 @@ def test_deploy_with_autoscaling_target_request_count_per_minute_preview(
             timeout=None,
         )

+    @pytest.mark.usefixtures("get_endpoint_mock", "get_model_mock")
+    @pytest.mark.parametrize("sync", [True, False])
+    def test_deploy_with_autoscaling_target_dcgm_vllm_metrics(
+        self, deploy_model_mock, sync
+    ):
+        test_endpoint = models.Endpoint(_TEST_ENDPOINT_NAME)
+        test_model = models.Model(_TEST_ID)
+        test_model._gca_resource.supported_deployment_resources_types.append(
+            aiplatform.gapic.Model.DeploymentResourcesType.DEDICATED_RESOURCES
+        )
+        test_endpoint.deploy(
+            model=test_model,
+            machine_type=_TEST_MACHINE_TYPE,
+            service_account=_TEST_SERVICE_ACCOUNT,
+            sync=sync,
+            deploy_request_timeout=None,
+            autoscaling_target_dcgm_fi_dev_gpu_util=60,
+            autoscaling_target_vllm_gpu_cache_usage_perc=50,
+            autoscaling_target_vllm_num_requests_waiting=10,
+        )
+
+        if not sync:
+            test_endpoint.wait()
+
+        expected_dedicated_resources = gca_machine_resources.DedicatedResources(
+            machine_spec=gca_machine_resources.MachineSpec(
+                machine_type=_TEST_MACHINE_TYPE,
+            ),
+            min_replica_count=1,
+            max_replica_count=1,
+            autoscaling_metric_specs=[
+                gca_machine_resources.AutoscalingMetricSpec(
+                    metric_name=_TEST_METRIC_NAME_DCGM_METRICS_GPU_UTILIZATION,
+                    target=60,
+                ),
+                gca_machine_resources.AutoscalingMetricSpec(
+                    metric_name=_TEST_METRIC_NAME_VLLM_METRICS_GPU_CACHE_USAGE_PERCENTAGE,
+                    target=50,
+                ),
+                gca_machine_resources.AutoscalingMetricSpec(
+                    metric_name=_TEST_METRIC_NAME_VLLM_METRICS_NUM_REQUESTS_WAITING,
+                    target=10,
+                ),
+            ],
+        )
+
+        expected_deployed_model = gca_endpoint.DeployedModel(
+            dedicated_resources=expected_dedicated_resources,
+            model=test_model.resource_name,
+            display_name=None,
+            service_account=_TEST_SERVICE_ACCOUNT,
+        )
+        deploy_model_mock.assert_called_once_with(
+            endpoint=test_endpoint.resource_name,
+            deployed_model=expected_deployed_model,
+            traffic_split={"0": 100},
+            metadata=(),
+            timeout=None,
+        )
+
+    @pytest.mark.usefixtures(
+        "get_endpoint_mock", "get_model_mock", "preview_deploy_model_mock"
+    )
+    @pytest.mark.parametrize("sync", [True, False])
+    def test_deploy_with_autoscaling_target_dcgm_vllm_metrics_preview(
+        self, preview_deploy_model_mock, sync
+    ):
+        test_endpoint = preview_models.Endpoint(_TEST_ENDPOINT_NAME)
+        test_model = preview_models.Model(_TEST_ID)
+        test_model._gca_resource.supported_deployment_resources_types.append(
+            aiplatform.gapic.Model.DeploymentResourcesType.DEDICATED_RESOURCES
+        )
+        test_endpoint.deploy(
+            model=test_model,
+            machine_type=_TEST_MACHINE_TYPE,
+            service_account=_TEST_SERVICE_ACCOUNT,
+            sync=sync,
+            deploy_request_timeout=None,
+            autoscaling_target_dcgm_fi_dev_gpu_util=60,
+            autoscaling_target_vllm_gpu_cache_usage_perc=50,
+            autoscaling_target_vllm_num_requests_waiting=10,
+        )
+
+        if not sync:
+            test_endpoint.wait()
+
+        expected_dedicated_resources = gca_machine_resources_v1beta1.DedicatedResources(
+            machine_spec=gca_machine_resources_v1beta1.MachineSpec(
+                machine_type=_TEST_MACHINE_TYPE,
+            ),
+            min_replica_count=1,
+            max_replica_count=1,
+            autoscaling_metric_specs=[
+                gca_machine_resources_v1beta1.AutoscalingMetricSpec(
+                    metric_name=_TEST_METRIC_NAME_DCGM_METRICS_GPU_UTILIZATION,
+                    target=60,
+                ),
+                gca_machine_resources_v1beta1.AutoscalingMetricSpec(
+                    metric_name=_TEST_METRIC_NAME_VLLM_METRICS_GPU_CACHE_USAGE_PERCENTAGE,
+                    target=50,
+                ),
+                gca_machine_resources_v1beta1.AutoscalingMetricSpec(
+                    metric_name=_TEST_METRIC_NAME_VLLM_METRICS_NUM_REQUESTS_WAITING,
+                    target=10,
+                ),
+            ],
+        )
+
+        expected_deployed_model = gca_endpoint_v1beta1.DeployedModel(
+            dedicated_resources=expected_dedicated_resources,
+            model=test_model.resource_name,
+            display_name=None,
+            service_account=_TEST_SERVICE_ACCOUNT,
+            enable_container_logging=True,
+            faster_deployment_config=gca_endpoint_v1beta1.FasterDeploymentConfig(),
+        )
+        preview_deploy_model_mock.assert_called_once_with(
+            endpoint=test_endpoint.resource_name,
+            deployed_model=expected_deployed_model,
+            traffic_split={"0": 100},
+            metadata=(),
+            timeout=None,
+        )
+
     @pytest.mark.usefixtures(
         "get_endpoint_mock", "get_model_mock", "preview_deploy_model_mock"
     )
diff --git a/tests/unit/aiplatform/test_models.py b/tests/unit/aiplatform/test_models.py
index 5af84e24ea..5cf874db98 100644
--- a/tests/unit/aiplatform/test_models.py
+++ b/tests/unit/aiplatform/test_models.py
@@ -524,6 +524,15 @@
 _TEST_METRIC_NAME_REQUEST_COUNT = (
     "aiplatform.googleapis.com/prediction/online/request_count"
 )
+_TEST_METRIC_NAME_DCGM_METRICS_GPU_UTILIZATION = (
+    "prometheus.googleapis.com/vertex_dcgm_fi_dev_gpu_util"
+)
+_TEST_METRIC_NAME_VLLM_METRICS_GPU_CACHE_USAGE_PERCENTAGE = (
+    "prometheus.googleapis.com/vertex_vllm_gpu_cache_usage_perc"
+)
+_TEST_METRIC_NAME_VLLM_METRICS_NUM_REQUESTS_WAITING = (
+    "prometheus.googleapis.com/vertex_vllm_num_requests_waiting"
+)
 _TEST_METRIC_NAME_PUBSUB_NUM_UNDELIVERED_MESSAGE = (
     "pubsub.googleapis.com/subscription/num_undelivered_messages"
 )
@@ -2559,6 +2568,140 @@ def test_preview_deploy_no_endpoint_dedicated_resources_autoscaling_request_coun
             timeout=None,
         )

+    @pytest.mark.usefixtures(
+        "get_model_mock",
+        "create_endpoint_mock",
+        "get_endpoint_mock",
+    )
+    @pytest.mark.parametrize("sync", [True, False])
+    def test_deploy_no_endpoint_dedicated_resources_autoscaling_dcgm_vllm_metrics(
+        self, deploy_model_mock, sync
+    ):
+        test_model = models.Model(_TEST_ID)
+        test_model._gca_resource.supported_deployment_resources_types.append(
+            aiplatform.gapic.Model.DeploymentResourcesType.DEDICATED_RESOURCES
+        )
+
+        test_endpoint = test_model.deploy(
+            machine_type=_TEST_MACHINE_TYPE,
+            accelerator_type=_TEST_ACCELERATOR_TYPE,
+            accelerator_count=_TEST_ACCELERATOR_COUNT,
+            sync=sync,
+            deploy_request_timeout=None,
+            system_labels=_TEST_LABELS,
+            autoscaling_target_dcgm_fi_dev_gpu_util=60,
+            autoscaling_target_vllm_gpu_cache_usage_perc=50,
+            autoscaling_target_vllm_num_requests_waiting=10,
+        )
+
+        if not sync:
+            test_endpoint.wait()
+
+        expected_dedicated_resources = gca_machine_resources.DedicatedResources(
+            machine_spec=gca_machine_resources.MachineSpec(
+                machine_type=_TEST_MACHINE_TYPE,
+                accelerator_type=_TEST_ACCELERATOR_TYPE,
+                accelerator_count=_TEST_ACCELERATOR_COUNT,
+            ),
+            min_replica_count=1,
+            max_replica_count=1,
+            autoscaling_metric_specs=[
+                gca_machine_resources.AutoscalingMetricSpec(
+                    metric_name=_TEST_METRIC_NAME_DCGM_METRICS_GPU_UTILIZATION,
+                    target=60,
+                ),
+                gca_machine_resources.AutoscalingMetricSpec(
+                    metric_name=_TEST_METRIC_NAME_VLLM_METRICS_GPU_CACHE_USAGE_PERCENTAGE,
+                    target=50,
+                ),
+                gca_machine_resources.AutoscalingMetricSpec(
+                    metric_name=_TEST_METRIC_NAME_VLLM_METRICS_NUM_REQUESTS_WAITING,
+                    target=10,
+                ),
+            ],
+        )
+        expected_deployed_model = gca_endpoint.DeployedModel(
+            dedicated_resources=expected_dedicated_resources,
+            model=test_model.resource_name,
+            display_name=None,
+            system_labels=_TEST_LABELS,
+        )
+        deploy_model_mock.assert_called_once_with(
+            endpoint=test_endpoint.resource_name,
+            deployed_model=expected_deployed_model,
+            traffic_split={"0": 100},
+            metadata=(),
+            timeout=None,
+        )
+
+    @pytest.mark.usefixtures(
+        "get_model_mock",
+        "create_endpoint_mock",
+        "get_endpoint_mock",
+    )
+    @pytest.mark.parametrize("sync", [True, False])
+    def test_preview_deploy_no_endpoint_dedicated_resources_autoscaling_dcgm_vllm_metrics(
+        self, preview_deploy_model_mock, sync
+    ):
+        test_model = preview_models.Model(_TEST_ID).preview
+        test_model._gca_resource.supported_deployment_resources_types.append(
+            aiplatform.gapic.Model.DeploymentResourcesType.DEDICATED_RESOURCES
+        )
+
+        test_endpoint = test_model.deploy(
+            machine_type=_TEST_MACHINE_TYPE,
+            accelerator_type=_TEST_ACCELERATOR_TYPE,
+            accelerator_count=_TEST_ACCELERATOR_COUNT,
+            sync=sync,
+            deploy_request_timeout=None,
+            system_labels=_TEST_LABELS,
+            autoscaling_target_dcgm_fi_dev_gpu_util=60,
+            autoscaling_target_vllm_gpu_cache_usage_perc=50,
+            autoscaling_target_vllm_num_requests_waiting=10,
+        )
+
+        if not sync:
+            test_endpoint.wait()
+
+        expected_dedicated_resources = gca_machine_resources_v1beta1.DedicatedResources(
+            machine_spec=gca_machine_resources_v1beta1.MachineSpec(
+                machine_type=_TEST_MACHINE_TYPE,
+                accelerator_type=_TEST_ACCELERATOR_TYPE,
+                accelerator_count=_TEST_ACCELERATOR_COUNT,
+            ),
+            min_replica_count=1,
+            max_replica_count=1,
+            autoscaling_metric_specs=[
+                gca_machine_resources_v1beta1.AutoscalingMetricSpec(
+                    metric_name=_TEST_METRIC_NAME_DCGM_METRICS_GPU_UTILIZATION,
+                    target=60,
+                ),
+                gca_machine_resources_v1beta1.AutoscalingMetricSpec(
+                    metric_name=_TEST_METRIC_NAME_VLLM_METRICS_GPU_CACHE_USAGE_PERCENTAGE,
+                    target=50,
+                ),
+                gca_machine_resources_v1beta1.AutoscalingMetricSpec(
+                    metric_name=_TEST_METRIC_NAME_VLLM_METRICS_NUM_REQUESTS_WAITING,
+                    target=10,
+                ),
+            ],
+        )
+        expected_deployed_model = gca_endpoint_v1beta1.DeployedModel(
+            dedicated_resources=expected_dedicated_resources,
+            model=test_model.resource_name,
+            display_name=None,
+            enable_container_logging=True,
+            faster_deployment_config=gca_endpoint_v1beta1.FasterDeploymentConfig(),
+            system_labels=_TEST_LABELS,
+        )
+        preview_deploy_model_mock.assert_called_once_with(
+            endpoint=test_endpoint.resource_name,
+            deployed_model=expected_deployed_model,
+            traffic_split={"0": 100},
+            metadata=(),
+            timeout=None,
+        )
+
     @pytest.mark.usefixtures(
         "get_model_mock",
         "create_endpoint_mock",
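
Illustrative usage sketch (not part of the diff above): the new keyword arguments flow through Model.deploy() / Endpoint.deploy() into AutoscalingMetricSpec entries on the deployed model's DedicatedResources. The project, location, model ID, machine type, and target values below are placeholder assumptions, not values taken from this change.

# Sketch: deploying a model with the new DCGM / vLLM autoscaling targets.
# All concrete values here (project, model ID, machine type, targets) are
# hypothetical and only illustrate how the parameters added in this diff
# would be passed.
from google.cloud import aiplatform

aiplatform.init(project="my-project", location="us-central1")

model = aiplatform.Model("1234567890")  # hypothetical model ID
endpoint = model.deploy(
    machine_type="a2-highgpu-1g",
    accelerator_type="NVIDIA_TESLA_A100",
    accelerator_count=1,
    min_replica_count=1,
    max_replica_count=4,
    # Scale on DCGM-reported GPU utilization (percent).
    autoscaling_target_dcgm_fi_dev_gpu_util=80,
    # Scale on vLLM GPU KV cache usage (percent).
    autoscaling_target_vllm_gpu_cache_usage_perc=70,
    # Scale on the number of requests waiting in the vLLM queue.
    autoscaling_target_vllm_num_requests_waiting=10,
)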