diff --git a/.gitignore b/.gitignore index 72c382291..b31c30d56 100644 --- a/.gitignore +++ b/.gitignore @@ -94,3 +94,8 @@ perf-test.py values-*.yaml helm/examples + +# perf-test +/perf-test/server-startup/.venv +/perf-test/server-startup/__pycache__/ +/perf-test/outputs/ diff --git a/deploy/README.md b/deploy/README.md new file mode 100644 index 000000000..ee99312b4 --- /dev/null +++ b/deploy/README.md @@ -0,0 +1,301 @@ +## Deploying the stack via Kubectl + +## Table of Contents + +- [Table of Contents](#table-of-contents) +- [Prerequisites](#prerequisites) +- [Steps](#steps) + - [1. Deploy VLLM router](#1-deploy-vllm-router) + - [2. Deploy VLLM app](#2-deploy-vllm-app) + - [3. Check GPUs allocated to the deployed VLLM application](#3-check-gpus-allocated-to-the-deployed-vllm-application) + - [4. Send a Query to the Stack](#4-send-a-query-to-the-stack) + - [4.1. Forward the Service Port](#41-forward-the-service-port) + - [4.2. Query the OpenAI-Compatible API to list the available models](#42-query-the-openai-compatible-api-to-list-the-available-models) + - [4.3. Query the OpenAI Completion Endpoint](#43-query-the-openai-completion-endpoint) + - [5. Uninstall](#5-uninstall) + + +### Prerequisites + +- A running Kubernetes (K8s) environment with GPUs + +### Steps + +Each component of the vLLM Production Stack can be individually deployed using yaml as following: + +Starting from a local directory containing this git repo: + +#### 1. Deploy VLLM router + + First, set the variable `NAMESPACE` to the name of a target namespace (e.g., `vllm-test`). 
For example: + + ```bash + NS=vllm-test + ``` + + Then, deploy the router app: + + ```bash + kubectl create ns vllm-test + sed s/%TARGET_NS%/$NS/g router-master.yaml | kubectl -n $NS apply -f - + ``` + + Optionally, check the deployment of the router app: + + ```bash + kubectl -n $NS get all + ``` + + Sample output: + + ```console + NAME READY STATUS RESTARTS AGE + pod/vllm-deployment-router-xx-yy 1/1 Running 0 12s + + NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE + service/vllm-router-service ClusterIP 172.21.165.147 80/TCP 19s + + NAME READY UP-TO-DATE AVAILABLE AGE + deployment.apps/vllm-deployment-router 1/1 1 1 19s + + NAME DESIRED CURRENT READY AGE + replicaset.apps/vllm-deployment-router-xx 1 1 1 12s + ``` + + +#### 2. Deploy VLLM app + + First, set the variables `INSTANCE_NAME` and `MODEL_URL` to the given name and model URL for a VLLM instance. For example: + + ```bash + INSTANCE_NAME=vllm-1 + MODEL_URL=facebook/opt-125m + ``` + + Then, deploy the vllm instance in the target namespace (e.g., `vllm-test`): + + ```bash + sed -e s,%INSTANCE_NAME%,$INSTANCE_NAME,g -e s,%MODEL_URL%,$MODEL_URL,g vllm-master.yaml | kubectl -n $NS apply -f - + ``` + + Deploy a second vllm instance with a `gpt2` model: + + ```bash + INSTANCE_NAME=vllm-2 + MODEL_URL=openai-community/gpt2 + sed -e s,%INSTANCE_NAME%,$INSTANCE_NAME,g -e s,%MODEL_URL%,$MODEL_URL,g vllm-master.yaml | kubectl -n $NS apply -f - + ``` + + Optionally, check the deployment of the vllm apps: + + ```bash + kubectl -n $NS get pods -l environment=test + ``` + + Sample output: + ```console + NAME READY STATUS RESTARTS AGE + vllm-1-deployment-vllm-abcd-yyy 1/1 Running 0 6m28s + vllm-2-deployment-vllm-abcd-xyx 1/1 Running 0 112s + ``` + +#### 3. 
Check GPUs allocated to the deployed VLLM application + +In certain situations, if your cluster offers various GPU types, you can determine which GPU was chosen for a deployed VLLM instance with a specific model by using the tool [nvidia-smi](https://developer.download.nvidia.com/compute/DCGM/docs/nvidia-smi-367.38.pdf). + +This can be done with any pod capable of utilizing a GPU and with the open source vllm image from `vllm/vllm-openai:version`. Proceed as following: + +Exec into the vllm pod and list the NVIDIA GPUs in the system: + +a) For `vllm-1` instance: + + ```bash + kubectl -n $NS exec -it vllm-1-deployment-vllm-abcd-yyy -- nvidia-smi -L + ``` + Sample output: + + ```console + GPU 0: Tesla V100-PCIE-16GB (UUID: GPU-xxxxxx-d4ef-5ee2-3995-yyyyyyy) + ``` + +b) For `vllm-2` instance: + + ```bash + kubectl -n $NS exec -it vllm-2-deployment-vllm-abcd-xyx -- nvidia-smi -L + ``` + Sample output: + + ```console + GPU 0: Tesla V100-PCIE-16GB (UUID: GPU-yyyyyy-8224-b69d-3b55-xxxxx) + ``` + +Furthermore, you can also query for more detailed information about NVIDIA GPUs in the system (i.e, memory usage, etc.): + +```bash +kubectl -n $NS exec -it vllm-1-deployment-vllm-abcd-yyy -- nvidia-smi +``` + +Sample output: + +```console +Mon Feb 10 11:49:44 2025 ++-----------------------------------------------------------------------------------------+ +| NVIDIA-SMI 550.54.15 Driver Version: 550.54.15 CUDA Version: 12.4 | +|-----------------------------------------+------------------------+----------------------+ +| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | +| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | +| | | MIG M. 
| +|=========================================+========================+======================| +| 0 Tesla V100-PCIE-16GB On | 00000000:00:1E.0 Off | 0 | +| N/A 31C P0 38W / 250W | 14555MiB / 16384MiB | 0% Default | +| | | N/A | ++-----------------------------------------+------------------------+----------------------+ + ++-----------------------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=========================================================================================| ++-----------------------------------------------------------------------------------------+ +``` + + +#### 4. Send a Query to the Stack + +#### 4.1: Forward the Service Port + +Expose the `vllm-router-service` port to the host machine: + +```bash +sudo kubectl -n $NS port-forward svc/vllm-router-service 30080:80 +``` + +#### 4.2: Query the OpenAI-Compatible API to list the available models + +Open a new terminal and test the stack's OpenAI-compatible API by querying the available models: + +```bash +curl -o- http://localhost:30080/models +``` + +Expected output: + +```json +{ + "object": "list", + "data": [ + { + "id": "facebook/opt-125m", + "object": "model", + "created": 1739156592, + "owned_by": "vllm", + "root": null + }, + { + "id": "openai-community/gpt2", + "object": "model", + "created": 1739156824, + "owned_by": "vllm", + "root": null + } + ] +} +``` + +#### 4.3: Query the OpenAI Completion Endpoint + +a) Send a query to the OpenAI `/completion` endpoint to generate a completion for a prompt using the `facebook/opt-125m` model: + +```bash +curl -X POST http://localhost:30080/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "facebook/opt-125m", + "prompt": "The capital of France is,", + "max_tokens": 10 + }' \ + | jq +``` + +Sample output: + +```json +{ + "id": "cmpl-8e6eb10325484c52ab5e7666d452c14c", + "object": "text_completion", + "created": 
1739157366, + "model": "facebook/opt-125m", + "choices": [ + { + "index": 0, + "text": " surprisingly, a French swim pool. Seriously, no", + "logprobs": null, + "finish_reason": "length", + "stop_reason": null, + "prompt_logprobs": null + } + ], + "usage": { + "prompt_tokens": 7, + "total_tokens": 17, + "completion_tokens": 10, + "prompt_tokens_details": null + } +} +``` + +b) Send a query to the OpenAI `/completion` endpoint to generate a completion for a prompt using the `openai-community/gpt2` model: + +```bash +curl -s http://localhost:30080/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "openai-community/gpt2", + "prompt": "The capital of France is", + "max_tokens": 17, + "temperature": 0.2 + }' \ + | jq +``` + +Sample output: + +```json +{ + "id": "cmpl-0d5c63917f0f4ebb8eca62170fd1b4a4", + "object": "text_completion", + "created": 1739157195, + "model": "openai-community/gpt2", + "choices": [ + { + "index": 0, + "text": " the capital of the French Republic. The French Republic is the largest state in the world", + "logprobs": null, + "finish_reason": "length", + "stop_reason": null, + "prompt_logprobs": null + } + ], + "usage": { + "prompt_tokens": 5, + "total_tokens": 22, + "completion_tokens": 17, + "prompt_tokens_details": null + } +} +``` + +This demonstrates the model generating a continuation for the provided prompt for both `facebook/opt-125m` and `openai-community/gpt2` models. + +#### 5. 
Uninstall + +To remove the deployment, run: + +```bash +kubectl -n $NS delete -f router-master.yaml +kubectl -n $NS delete all -l environment=test +kubectl delete ns $NS +``` + + + diff --git a/deploy/router-master.yaml b/deploy/router-master.yaml new file mode 100644 index 000000000..4150a70e3 --- /dev/null +++ b/deploy/router-master.yaml @@ -0,0 +1,113 @@ +--- +apiVersion: policy/v1 +kind: PodDisruptionBudget +metadata: + name: "vllm-pdb" +spec: + maxUnavailable: 1 + +--- +apiVersion: v1 +kind: Service +metadata: + name: "vllm-router-service" + labels: + environment: router + release: router +spec: + type: ClusterIP + ports: + - name: "router-sport" + port: 80 + targetPort: 8000 + protocol: TCP + selector: + environment: router + release: router + +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: "vllm-router-service-account" + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: "vllm-pod-reader" +rules: +- apiGroups: [""] # "" indicates the core API group + resources: ["pods"] + verbs: ["get", "watch", "list"] + + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: vllm-deployment-access-binding +subjects: + - kind: ServiceAccount + name: vllm-router-service-account +roleRef: + kind: Role + name: vllm-pod-reader + apiGroup: rbac.authorization.k8s.io + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: "vllm-deployment-router" + labels: + environment: router + release: router +spec: + replicas: 1 + selector: + matchLabels: + environment: router + release: router + template: + metadata: + labels: + environment: router + release: router + spec: + serviceAccountName: vllm-router-service-account + containers: + - name: router-container + image: lmcache/lmstack-router:latest + args: # TODO: update here + - "--host" + - "0.0.0.0" + - "--port" + - "8000" + - "--service-discovery" + - "k8s" + - "--k8s-namespace" + - "%TARGET_NS%" + - "--k8s-label-selector" + - environment=test,release=test 
+ - "--routing-logic" + - "roundrobin" + - "--engine-stats-interval" + - "15" + - "--request-stats-window" + - "60" + resources: # TODO: update here + requests: + cpu: "4" + memory: "16G" + ports: + - name: "router-cport" + containerPort: 8000 + + livenessProbe: + initialDelaySeconds: 30 + periodSeconds: 5 + failureThreshold: 3 + httpGet: + path: /health + port: 8000 diff --git a/deploy/sample-vllm-deployment.yaml b/deploy/sample-vllm-deployment.yaml new file mode 100644 index 000000000..2e95fd390 --- /dev/null +++ b/deploy/sample-vllm-deployment.yaml @@ -0,0 +1,101 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-granite-3-0-2b-instruct + namespace: serverless-workstream +spec: + replicas: 1 + revisionHistoryLimit: 10 + selector: + matchLabels: + app: vllm-granite + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: "100%" + maxUnavailable: "0%" + template: + metadata: + labels: + app: vllm-granite + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: nvidia.com/gpu.product + operator: In + values: + - NVIDIA-L40S + containers: + - name: vllm + image: vllm/vllm-openai:latest + imagePullPolicy: IfNotPresent + command: + - vllm + - serve + - "ibm-granite/granite-3.0-3b-a800m-instruct" + - --host + - "0.0.0.0" + - --port + - "8000" + env: + - name: HF_HOME + value: "/data" + - name: XDG_CONFIG_HOME + value: "/data/vllm" + - name: TRITON_CACHE_DIR + value: "/data/triton" + - name: TORCH_HOME + value: "/data/torch" + - name: XDG_CACHE_HOME + value: "/data/cache" + - name: VLLM_USE_V1 + value: "1" + ports: + - containerPort: 8000 + name: container-port + protocol: TCP + livenessProbe: + httpGet: + path: /health + port: 8000 + scheme: HTTP + initialDelaySeconds: 15 + periodSeconds: 10 + timeoutSeconds: 1 + successThreshold: 1 + failureThreshold: 3 + startupProbe: + httpGet: + path: /health + port: 8000 + scheme: HTTP + initialDelaySeconds: 15 + periodSeconds: 10 
+ timeoutSeconds: 1 + successThreshold: 1 + failureThreshold: 60 + resources: + limits: + cpu: "10" + memory: 16Gi + nvidia.com/gpu: "1" + requests: + cpu: "10" + memory: 16Gi + nvidia.com/gpu: "1" + securityContext: + runAsNonRoot: false + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - name: vllm-storage + mountPath: /data + restartPolicy: Always + terminationGracePeriodSeconds: 30 + volumes: + - name: vllm-storage + persistentVolumeClaim: + claimName: vllm-cache-pvc2 \ No newline at end of file diff --git a/deploy/sample-vllm-pvc.yaml b/deploy/sample-vllm-pvc.yaml new file mode 100644 index 000000000..6dc438895 --- /dev/null +++ b/deploy/sample-vllm-pvc.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: vllm-cache-pvc2 + namespace: serverless-workstream +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 15Gi \ No newline at end of file diff --git a/deploy/vllm-master.yaml b/deploy/vllm-master.yaml new file mode 100644 index 000000000..a17214a87 --- /dev/null +++ b/deploy/vllm-master.yaml @@ -0,0 +1,111 @@ +--- +apiVersion: v1 +kind: Service +metadata: + name: "%INSTANCE_NAME%-engine-service" + labels: + environment: test + release: test +spec: + type: ClusterIP + ports: + - name: "service-port" + port: 80 + targetPort: "container-port" + protocol: TCP + selector: + environment: test + release: test + +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: "%INSTANCE_NAME%-storage-claim" + labels: + environment: test + release: test +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: "%INSTANCE_NAME%-deployment-vllm" + labels: + environment: test + release: test +spec: + replicas: 1 + strategy: + rollingUpdate: + maxSurge: 100% + maxUnavailable: 0 + selector: + matchLabels: + environment: test + release: test + progressDeadlineSeconds: 1200 + 
template: + metadata: + labels: + environment: test + release: test + spec: + containers: + - name: "vllm" + image: "vllm/vllm-openai:latest" + + command: + - "vllm" + - "serve" + - "%MODEL_URL%" + - "--host" + - "0.0.0.0" + - "--port" + - "8000" + securityContext: + runAsNonRoot: false + imagePullPolicy: IfNotPresent + env: + - name: HF_HOME + value: /data + ports: + - name: "container-port" + containerPort: 8000 + startupProbe: + failureThreshold: 60 + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 15 + periodSeconds: 10 + livenessProbe: + failureThreshold: 3 + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 15 + periodSeconds: 10 + resources: + requests: + memory: "16Gi" + cpu: "6" + nvidia.com/gpu: "1" + limits: + memory: "16Gi" + cpu: "6" + nvidia.com/gpu: "1" + volumeMounts: + - name: vllm-storage + mountPath: /data + + volumes: + - name: vllm-storage + persistentVolumeClaim: + claimName: "%INSTANCE_NAME%-storage-claim" diff --git a/observability/INSTRUCTIONS.md b/observability/INSTRUCTIONS.md new file mode 100644 index 000000000..2a514f1e7 --- /dev/null +++ b/observability/INSTRUCTIONS.md @@ -0,0 +1,111 @@ +## Setting up Grafana on Red Hat OpenShift Container Platform (OCP) + +1. Deploy the Grafana operator: + + Open the OpenShift web console and switch to the OperatorHub section. Search for the Grafana operator provided by the community. Follow the instructios and deploy it - you can use all the default values or specify your own namespace (e.g., `grafana-operator`) in the configuration parameters: + + + + Then, check that the Grafana operator pod is running: + + ```bash + oc kubectl -n grafana-operator get pods + ``` + + Sample output: + + ```console + NAME READY STATUS RESTARTS AGE + grafana-operator-controller-manager-v5-xxxxxx-yyy 1/1 Running 0 3d15h + ``` + +2. Create the Grafana instance: + + From Installed Operators, select the Grafana Operator. 
+ Navigate to Grafana tab and select Create Grafana to create the Grafana instance. You can replace the default name for the Grafana instance. + + + + + Then, check that the Grafana pod is running: + + ```bash + oc -n grafana-operator get pods + ``` + + Sample output: + ```console + NAME READY STATUS RESTARTS AGE + grafana-a-deployment-758694d5c8-5dfd7 1/1 Running 0 16s + grafana-operator-controller-manager-v5-596559bbcc-fh2pk 1/1 Running 0 46h + ``` + +3. Create the Grafana router: + + ```bash + oc -n grafana-operator create route edge grafana-ui --service=grafana-a-service + ``` + +4. Create the Grafana data source: + + a) Give the Grafana service account the cluster-monitoring-view role. This account was created alongside the Grafana instance: + + ```bash + oc project grafana-operator + oc adm policy add-cluster-role-to-user cluster-monitoring-view -z grafana-a-sa + ``` + + b) Create a bearer token for the grafana service account. The token is used for the service account to authenticate with thanos-querier in the openshift-monitoring namespace: + + ```bash + BEARER_TOKEN=$(kubectl create token grafana-a-sa --duration 129600m --namespace grafana-operator) + ``` + + c) Deploy the prometheus data source: + + ```bash + cat << EOF | oc apply -f - + apiVersion: grafana.integreatly.org/v1beta1 + kind: GrafanaDatasource + metadata: + name: prometheus-test + namespace: grafana-operator + spec: + datasource: + jsonData: + httpHeaderName1: 'Authorization' + timeInterval: 5s + tlsSkipVerify: true + secureJsonData: + httpHeaderValue1: 'Bearer ' + access: proxy + isDefault: true + name: prometheus + type: prometheus + url: 'https://thanos-querier.openshift-monitoring.svc.cluster.local:9091' + instanceSelector: + matchLabels: + dashboards: grafana-a + plugins: + - name: grafana-clock-panel + version: 1.3.0 + resyncPeriod: 5m + EOF + ``` + + +5. 
Get the URL for Grafana: + + ```bash + oc -n grafana-operator get route grafana-ui -o jsonpath='{"https://"}{.spec.host}{"\n"}' + ``` + +6. Browse to the URL from above, and sign in with default administrator credentials. + + You can obtain the default administrator from the following secret: + + ```bash + oc -n grafana-operator get secret grafana-a-admin-credentials -o yaml + ``` + + Then, use `base64 -d` to decode and obtain the raw text of the credentials. This makes available the Grafana administrator features of creating dashboards. \ No newline at end of file diff --git a/observability/README.md b/observability/README.md index b08373426..eb82843ae 100644 --- a/observability/README.md +++ b/observability/README.md @@ -4,24 +4,77 @@ ## Deploy the observability stack -The observability stack is based on [kube-prom-stack](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/README.md). +1. If using plain K8s clusters: -To launch the observability stack: + The observability stack is based on [kube-prom-stack](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/README.md). -```bash -sudo bash install.sh -``` + To launch the observability stack: + + ```bash + sudo bash install.sh + ``` + + After installing, the dashboard can be accessed through the service `service/kube-prom-stack-grafana` in the `monitoring` namespace. + +2. If using OpenShift clusters: + + Follow the instructions [here](INSTRUCTIONS.md) if you need to set up Grafana on Red Hat OpenShift Container Platform (OCP). 
+ + Then, configure Prometheus to scrape metrics from a running vllm instance: + + ```bash + kubectl -n k -n openshift-monitoring create -f vllm-sm.yaml + ``` + + Optionally, check the prometheus service monitor object was created: + + ```bash + kubectl -n openshift-monitoring get servicemonitor -l app=vllm + ``` + + Sample output: + + ```bash + NAME AGE + test-vllm-monitor2 6d3h + ``` + + Lastly, Grafana dashboard can be accessed through the openshift route in the Grafana namespace. -After installing, the dashboard can be accessed through the service `service/kube-prom-stack-grafana` in the `monitoring` namespace. ## Access the Grafana dashboard -Forward the Grafana dashboard port to the local node-port +1. If using plain K8s clusters: + + Forward the Grafana dashboard port to the local node-port + + ```bash + sudo kubectl --namespace monitoring port-forward svc/kube-prom-stack-grafana 3000:80 --address 0.0.0.0 + ``` + + Open the webpage at `http://:3000` to access the Grafana web page. The default user name is `admin` and the password can be configured in `values.yaml` (default is `prom-operator`). + +2. If using OpenShift clusters: + + Get the URL for Grafana: + + ```bash + oc -n grafana-operator get route grafana-ui -o jsonpath='{"https://"}{.spec.host}{"\n"}' + ``` + + Browse to the URL from above, and sign in with default administrator credentials. + + You can obtain the default administrator from the following secret: + + ```bash + oc -n grafana-operator get secret grafana-a-admin-credentials -o yaml + ``` -```bash -sudo kubectl --namespace monitoring port-forward svc/kube-prom-stack-grafana 3000:80 --address 0.0.0.0 -``` + Then, use `base64 -d` to decode and obtain the raw text of the credentials -Open the webpage at `http://:3000` to access the Grafana web page. The default user name is `admin` and the password can be configured in `values.yaml` (default is `prom-operator`). +3. 
Create the Grafana dashboard for vllm: -Import the dashboard using the `vllm-dashboard.json` in this folder. + ```bash + kubectl -n grafana-operator create -f vllm-dashboard.yaml + ``` + The above command uses the dashboard `vllm-dashboard.yaml` in this folder. diff --git a/observability/images/grafana-instance-setup.png b/observability/images/grafana-instance-setup.png new file mode 100644 index 000000000..38b5122ce Binary files /dev/null and b/observability/images/grafana-instance-setup.png differ diff --git a/observability/images/grafana-operator-setup.png b/observability/images/grafana-operator-setup.png new file mode 100644 index 000000000..71ff3b566 Binary files /dev/null and b/observability/images/grafana-operator-setup.png differ diff --git a/observability/vllm-dashboard.yaml b/observability/vllm-dashboard.yaml new file mode 100644 index 000000000..754cd811c --- /dev/null +++ b/observability/vllm-dashboard.yaml @@ -0,0 +1,665 @@ +apiVersion: grafana.integreatly.org/v1beta1 +kind: GrafanaDashboard +metadata: + labels: + app: grafana + name: cluster-metrics +spec: + instanceSelector: + matchLabels: + dashboards: grafana-a + json: | + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 27, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "description": "Number of healthy vLLM instances", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 5, + "x": 0, + "y": 0 + }, + "id": 1, + 
"options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "auto", + "percentChangeColorMode": "standard", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showPercentChange": false, + "textMode": "auto", + "wideLayout": true + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "disableTextWrap": false, + "editorMode": "builder", + "expr": "count by(endpoint) (vllm:cpu_cache_usage_perc)", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "vLLM instances", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Available vLLM instances", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 9, + "x": 5, + "y": 0 + }, + "id": 6, + "options": { + "displayMode": "gradient", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "exemplar": false, + "expr": "sum by(le) (vllm:e2e_request_latency_seconds_bucket)", + "format": "heatmap", + "fullMetaSearch": false, + "includeNullMetadata": true, + "instant": false, + "legendFormat": "{{le}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], 
+ "title": "Request latency distribution", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 10, + "x": 14, + "y": 0 + }, + "id": 7, + "options": { + "displayMode": "gradient", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "sum by(le) (vllm:time_to_first_token_seconds_bucket)", + "format": "heatmap", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "__auto", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Request TTFT distribution", + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + 
"stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "vllm:num_requests_running", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "{{instance}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "Number of running requests", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + 
"displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "vllm:gpu_cache_usage_perc", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "{{instance}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "GPU KV Usage percent", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 15 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "vllm:num_requests_waiting", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "{{instance}}", + "range": true, + "refId": "A", + "useBackend": 
false + } + ], + "title": "Number of pending requests", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 15 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.4.0", + "targets": [ + { + "disableTextWrap": false, + "editorMode": "builder", + "expr": "vllm:gpu_prefix_cache_hit_rate", + "fullMetaSearch": false, + "includeNullMetadata": true, + "legendFormat": "{{instance}}", + "range": true, + "refId": "A", + "useBackend": false + } + ], + "title": "GPU KV cache hit rate", + "type": "timeseries" + } + ], + "preload": false, + "refresh": "auto", + "schemaVersion": 40, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "browser", + "title": "vllm dashboard", + "uid": "ee9i0i4y606psc", + "version": 17, + "weekStart": "" + } + \ No newline 
at end of file diff --git a/observability/vllm-sm.yaml b/observability/vllm-sm.yaml new file mode 100644 index 000000000..27d0cb6ab --- /dev/null +++ b/observability/vllm-sm.yaml @@ -0,0 +1,16 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: vllm-monitoring + labels: + app: vllm +spec: + endpoints: + - port: "service-port" + namespaceSelector: + matchNames: + - default + selector: + matchLabels: + environment: test + release: test \ No newline at end of file diff --git a/perf-test/README.md b/perf-test/README.md new file mode 100644 index 000000000..76176157e --- /dev/null +++ b/perf-test/README.md @@ -0,0 +1,8 @@ +## Measure End-to-End VLLM StartUp Latency + + +Use the following instructions to compute the e2e latency for a vllm workload in a kubernetes cluster: + +a)[Instructions to Measure VLLM PodStartupLatency ](pod-startup/README.md) + +b)[Instructions for Measure VLLM Serve StartupLatency](server-startup/README.md) diff --git a/perf-test/pod-startup/README.md b/perf-test/pod-startup/README.md new file mode 100644 index 000000000..42bfcf22f --- /dev/null +++ b/perf-test/pod-startup/README.md @@ -0,0 +1,97 @@ +## Measure VLLM PodStartupLatency + +### Prerequisites +- A running Kubernetes (K8s) environment with GPUs + + +In this tutorial, we will use [clusterloader2](https://github.com/kubernetes/perf-tests/tree/master/clusterloader2) tool to measure the VLLM pod startup latency as following: + +#### 1. Clone the following clusterloader2 repo: + + ```bash + git clone -b vllm-exp https://github.com/dumb0002/perf-tests.git + ``` + +#### 2. 
Configure clusterloader2: + + Starting from a local directory containing this repo, run the following script to set-up your environment: + + a) cd into the `/production-stack/perf-test` directory from your local copy of the vllm production stack repo, for example: + + ```bash + cd $HOME/production-stack/perf-test + ``` + + b) set-up your environment: + + First, set the variable `CL2_DIR` to the path of the subdirectory `clusterloader2/` of the cloned repo in step 1. For example: + + ```bash + export CL2_DIR=$HOME/perf-tests/clusterloader2 + ``` + + Then, run the set-up script as following: + ```bash + ./setup-clusterloader2.sh + ``` + +#### 3. Configure the parameters of your workload: + + a) cd into the load configuration directory. + + ```bash + cd $CL2_DIR/testing/load/ + ``` + + b) configure the parameters to create the workload traffic. + + ```bash + vi vllm-config.yaml + ``` + + More specifically, configure the following parameter: + + - *namespace*: name prefix for the target namespace (e.g., `vllm-test`) + - *deploymentReplicas*: number of VLLM deployment replicas (default value: `1`) + - *modelName*: name of the VLLM model to be served by the instance (default value: `gpt2`) + - *modelRepo*: repo of the LLM model (default value: `openai-community/gpt2`) + - *podReplicas*: number of pod replicas per deployment (default value: `1`) + + +#### 4. Deploy your first VLLM app workload and measure the PodStartupLatency: + + Use a command of the following form to create the workload. The value given to the `--kubeconfig` flag should be a pathname of the kubeconfig of your k8s cluster. 
+ + ```bash + cd $CL2_DIR + go run cmd/clusterloader.go --testconfig=./testing/load/vllm-config.yaml --kubeconfig=${KUBECONFIG:-$HOME/.kube/config} --provider=ks --v=2 + ``` + + At the end of clusterloader output you should see pod startup latency: + + ```console + { + "data": { + "Perc50": 28571.352, + "Perc90": 28571.352, + "Perc99": 28571.352 + }, + "unit": "ms", + "labels": { + "Metric": "pod_startup" + } + }, + ``` + + `pod_startup` measures time since pod was created until it was observed via watch as running. However, this time does not include the time it takes to bring up the vllm server with a model. More precisely, how long it takes for a `running` pod to reach the `ready` state + + You should also see that test succeeded: + + ```console + -------------------------------------------------------------------------------- + Test Finished + Test: ./testing/load/vllm-config.yaml + Status: Success + -------------------------------------------------------------------------------- + ``` + diff --git a/perf-test/pod-startup/measure.sh b/perf-test/pod-startup/measure.sh new file mode 100755 index 000000000..a3636842a --- /dev/null +++ b/perf-test/pod-startup/measure.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +# This script should be called from the root dir of the cloned repository. 
+ +NAMESPACE="serverless-workstream" +DEPLOYMENT_NAME="vllm-test" +LABEL="app=vllm-test" +CLEANUP=${CLEANUP:-false} + +# For MacOS, `brew install coreutils` can make gdate available, if necessary +START_TIME=$(gdate +%s%3N) + +kubectl apply -f ./perf-test/pod-startup/vllm-pvc-no-template.yaml -n $NAMESPACE +kubectl apply -f ./perf-test/pod-startup/vllm-deployment-no-template.yaml -n $NAMESPACE + +# Wait until at least one pod exists before proceeding +while [[ -z $(kubectl get pods -n $NAMESPACE -l $LABEL -o jsonpath='{.items[0].metadata.name}') ]]; do + sleep 0.1 +done + +# Get a pod name +POD_NAME=$(kubectl get pods -n $NAMESPACE -l $LABEL -o jsonpath='{.items[0].metadata.name}') + +# Wait for the pod to be "Running" +kubectl wait --for=jsonpath='{.status.phase}'=Running pod/$POD_NAME -n $NAMESPACE --timeout=3600s + +# Record the end time in milliseconds +END_TIME=$(gdate +%s%3N) + +# Calculate startup latency in milliseconds +LATENCY_MS=$((END_TIME - START_TIME)) + +# Convert milliseconds to seconds and show it +LATENCY_SEC=$(echo "scale=3; $LATENCY_MS / 1000" | bc) +echo "Pod startup latency: ${LATENCY_SEC} seconds" + +# Optional cleanup +if [ "$CLEANUP" = "true" ]; then + kubectl delete -f ./perf-test/pod-startup/vllm-deployment-no-template.yaml -n $NAMESPACE --wait + kubectl delete -f ./perf-test/pod-startup/vllm-pvc-no-template.yaml -n $NAMESPACE --wait +fi diff --git a/perf-test/pod-startup/setup-clusterloader2.sh b/perf-test/pod-startup/setup-clusterloader2.sh new file mode 100755 index 000000000..8a5c3c09f --- /dev/null +++ b/perf-test/pod-startup/setup-clusterloader2.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + + +set -x # echo so that users can understand what is happening +set -e # exit on error + +: +: ------------------------------------------------------------------------- +: "Configuring clusterloader2 to measure VLLM PodStartupLatency" + +if [ $CL2_DIR == "" ];then + echo "Set the variable CL2_DIR to the path of the subdirectory clusterloader2/ of a 
cloned `https://github.com/kubernetes/perf-tests/` repo." + exit 1; +fi + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) +cp $SCRIPT_DIR/vllm-config.yaml $CL2_DIR/testing/load +cp $SCRIPT_DIR/vllm-deployment.yaml $CL2_DIR/testing/load +cp $SCRIPT_DIR/vllm-service.yaml $CL2_DIR/testing/load +cp $SCRIPT_DIR/vllm-pvc.yaml $CL2_DIR/testing/load + diff --git a/perf-test/pod-startup/vllm-config.yaml b/perf-test/pod-startup/vllm-config.yaml new file mode 100644 index 000000000..fe9675289 --- /dev/null +++ b/perf-test/pod-startup/vllm-config.yaml @@ -0,0 +1,69 @@ +# {{$namespace := DefaultParam .namespace "default"}} +# {{$deploymentReplicas:= DefaultParam .deploymentReplicas 1}} +# {{$modelName := DefaultParam .modelName "gpt2"}} +# {{$modelRepo := DefaultParam .modelName "openai-community/gpt2"}} +# {{$podReplicas := DefaultParam .podReplicas "1"}} + +{{$namespace := DefaultParam .namespace "serverless-workstream"}} +{{$deploymentReplicas:= DefaultParam .deploymentReplicas 1}} +{{$modelName := DefaultParam .modelName "granite-3-0-2b-instruct"}} +{{$modelRepo := DefaultParam .modelName "ibm-granite/granite-3.0-3b-a800m-instruct"}} +{{$podReplicas := DefaultParam .podReplicas "1"}} + + +name: test +automanagedNamespaces: 0 +namespace: + number: 1 + +tuningSets: +- name: Uniform1qps + qpsLoad: + qps: 1 + +steps: +- name: Start measurements + measurements: + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: start + labelSelector: performance = test + threshold: 3600s + - Identifier: WaitForControlledPodsRunning + Method: WaitForControlledPodsRunning + Params: + action: start + apiVersion: apps/v1 + kind: Deployment + labelSelector: performance = test + operationTimeout: 3600s +- name: Create VLLM deployment + phases: + - NamespaceList: + - {{$namespace}} + replicasPerNamespace: {{$deploymentReplicas}} + tuningSet: Uniform1qps + objectBundle: + - basename: {{$modelName}} + objectTemplatePath: "vllm-pvc.yaml" + - 
basename: {{$modelName}} + objectTemplatePath: "vllm-deployment.yaml" + templateFillMap: + podReplicas: {{$podReplicas}} + claimName: {{$modelName}} + modelRepo: {{$modelRepo}} + - basename: {{$modelName}} + objectTemplatePath: "vllm-service.yaml" +- name: Wait for pods to be running + measurements: + - Identifier: WaitForControlledPodsRunning + Method: WaitForControlledPodsRunning + Params: + action: gather +- name: Measure pod startup latency + measurements: + - Identifier: PodStartupLatency + Method: PodStartupLatency + Params: + action: gather diff --git a/perf-test/pod-startup/vllm-deployment-no-template.yaml b/perf-test/pod-startup/vllm-deployment-no-template.yaml new file mode 100644 index 000000000..f5fd09c9a --- /dev/null +++ b/perf-test/pod-startup/vllm-deployment-no-template.yaml @@ -0,0 +1,109 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vllm-test + labels: + environment: test + release: test + performance: test +spec: + replicas: 1 + revisionHistoryLimit: 10 + selector: + matchLabels: + app: vllm-test + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: "100%" + maxUnavailable: "0%" + template: + metadata: + labels: + app: vllm-test + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: nvidia.com/gpu.product + operator: In + values: + - NVIDIA-L40S + # - NVIDIA-A100-SXM4-80GB + containers: + - name: vllm + image: vllm/vllm-openai:latest + imagePullPolicy: IfNotPresent + command: + - vllm + - serve + - ibm-granite/granite-3.1-8b-instruct + # - meta-llama/Llama-3.1-8B-Instruct + # - meta-llama/Llama-3.1-70B-Instruct + - --host + - "0.0.0.0" + - --port + - "8000" + env: + - name: HF_HOME + value: "/data" + - name: HF_TOKEN + value: + - name: XDG_CONFIG_HOME + value: "/data/vllm" + - name: TRITON_CACHE_DIR + value: "/data/triton" + - name: TORCH_HOME + value: "/data/torch" + - name: XDG_CACHE_HOME + value: "/data/cache" + - name: VLLM_USE_V1 + 
value: "1" + ports: + - containerPort: 8000 + name: container-port + protocol: TCP + livenessProbe: + httpGet: + path: /health + port: 8000 + scheme: HTTP + initialDelaySeconds: 15 + periodSeconds: 10 + timeoutSeconds: 1 + successThreshold: 1 + failureThreshold: 3 + startupProbe: + httpGet: + path: /health + port: 8000 + scheme: HTTP + initialDelaySeconds: 15 + periodSeconds: 10 + timeoutSeconds: 1 + successThreshold: 1 + failureThreshold: 60 + resources: + limits: + cpu: "10" + memory: 16Gi + nvidia.com/gpu: "1" + requests: + cpu: "10" + memory: 16Gi + nvidia.com/gpu: "1" + securityContext: + runAsNonRoot: false + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - name: vllm-storage + mountPath: /data + restartPolicy: Always + terminationGracePeriodSeconds: 30 + volumes: + - name: vllm-storage + persistentVolumeClaim: + claimName: vllm-test diff --git a/perf-test/pod-startup/vllm-deployment.yaml b/perf-test/pod-startup/vllm-deployment.yaml new file mode 100644 index 000000000..c2d77985d --- /dev/null +++ b/perf-test/pod-startup/vllm-deployment.yaml @@ -0,0 +1,104 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{.Name}} + labels: + environment: test + release: test + performance: test +spec: + replicas: {{.podReplicas}} + revisionHistoryLimit: 10 + selector: + matchLabels: + app: vllm-granite + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: "100%" + maxUnavailable: "0%" + template: + metadata: + labels: + app: vllm-granite + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: nvidia.com/gpu.product + operator: In + values: + - NVIDIA-L40S + containers: + - name: vllm + image: vllm/vllm-openai:latest + imagePullPolicy: IfNotPresent + command: + - vllm + - serve + - {{.modelRepo}} + - --host + - "0.0.0.0" + - --port + - "8000" + env: + - name: HF_HOME + value: "/data" + - name: XDG_CONFIG_HOME + value: 
"/data/vllm" + - name: TRITON_CACHE_DIR + value: "/data/triton" + - name: TORCH_HOME + value: "/data/torch" + - name: XDG_CACHE_HOME + value: "/data/cache" + - name: VLLM_USE_V1 + value: "1" + ports: + - containerPort: 8000 + name: container-port + protocol: TCP + livenessProbe: + httpGet: + path: /health + port: 8000 + scheme: HTTP + initialDelaySeconds: 15 + periodSeconds: 10 + timeoutSeconds: 1 + successThreshold: 1 + failureThreshold: 3 + startupProbe: + httpGet: + path: /health + port: 8000 + scheme: HTTP + initialDelaySeconds: 15 + periodSeconds: 10 + timeoutSeconds: 1 + successThreshold: 1 + failureThreshold: 60 + resources: + limits: + cpu: "10" + memory: 16Gi + nvidia.com/gpu: "1" + requests: + cpu: "10" + memory: 16Gi + nvidia.com/gpu: "1" + securityContext: + runAsNonRoot: false + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - name: vllm-storage + mountPath: /data + restartPolicy: Always + terminationGracePeriodSeconds: 30 + volumes: + - name: vllm-storage + persistentVolumeClaim: + claimName: {{.claimName}}-{{.Index}} \ No newline at end of file diff --git a/perf-test/pod-startup/vllm-pvc-no-template.yaml b/perf-test/pod-startup/vllm-pvc-no-template.yaml new file mode 100644 index 000000000..9127092be --- /dev/null +++ b/perf-test/pod-startup/vllm-pvc-no-template.yaml @@ -0,0 +1,11 @@ +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: vllm-test +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 20Gi diff --git a/perf-test/pod-startup/vllm-pvc.yaml b/perf-test/pod-startup/vllm-pvc.yaml new file mode 100644 index 000000000..950fdf88f --- /dev/null +++ b/perf-test/pod-startup/vllm-pvc.yaml @@ -0,0 +1,12 @@ +--- +# Source: vllm-stack/templates/pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{.Name}} +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 15Gi \ No newline at end of file diff --git 
a/perf-test/pod-startup/vllm-service.yaml b/perf-test/pod-startup/vllm-service.yaml new file mode 100644 index 000000000..203155bdf --- /dev/null +++ b/perf-test/pod-startup/vllm-service.yaml @@ -0,0 +1,19 @@ +--- +# Source: vllm-stack/templates/service-vllm.yaml +apiVersion: v1 +kind: Service +metadata: + name: {{.Name}} + labels: + environment: test + release: test +spec: + type: ClusterIP + ports: + - name: "service-port" + port: 80 + targetPort: "container-port" + protocol: TCP + selector: + environment: test + release: test diff --git a/perf-test/server-startup/README.md b/perf-test/server-startup/README.md new file mode 100644 index 000000000..9fd0b4e81 --- /dev/null +++ b/perf-test/server-startup/README.md @@ -0,0 +1,114 @@ +## Measure VLLM Server StartupLatency + + +### Prerequisites: +- A running Kubernetes (K8s) environment with GPUs +- [python3](https://www.python.org/downloads/) with all the dependencies listed [here](requirements.txt) installed. We recommend to create a python virtual environment `.venv` under `perf-test/server-startup`, for example: + +```bash +cd perf-test/server-startup +python3 -m venv .venv +. .venv/bin/activate +pip3 install -r requirements.txt +``` + +In this tutorial, we will compute the breakdown of the e2e latency for a VLLM server running in a Kubernetes cluster as following: + +### Steps + +1. 
Open a new terminal and cd into the `perf-test/server-startup` directory from your local copy of this repo repo, for example: + + ```bash + cd $HOME/production-stack/perf-test/server-startup + ``` + + Then, run the metrics collection script: + + ```bash + python3 vllm-serverStartupLatency.py + ``` + + For example: + + ```bash + python3 vllm-serverStartupLatency.py $HOME/.kube/config wec1 environment=test vllm-test $HOME/data + ``` + + Below is a detailed explanation of the input parameters: + - `kubeconfig`: path to the kubeconfig file, e.g., `$HOME/.kube/config` + - `k8s-context-name`: name of the context for the k8s cluster, e.g., `wec1` + - `pod-label-selector`: label of the vllm pods, e.g., `environment=test` + - `namespace`: namespace for the vllm pods, e.g., `vllm-test` + - `output-directory`: path to the directory for the output data files, e.g., `$HOME/data` + + + The input parameter `kubeconfig` and `k8s-context-name` are optional. Setting both parameters to `None`, we will use the current context set in the kubeconfig file set by the `KUBECONFIG` env variable. For example: + + ```bash + python3 vllm-serverStartupLatency.py None None environment=test vllm-test $HOME/data + ``` + + The generated output from the script are two files. The first is a tab-delimited file named `vllm-server-startup-latency.txt` with the following structure: + + ```console + + ``` + + Also, the unit of measurement for each value is `seconds` in the generated output file, except for the value in the first colum which corresponds to the VLLM pod name, and the `` which has unit of GB. + + For example: + + ```bash + cd $HOME/data + cat vllm-server-startup-latency.txt + ``` + + Sample output: + + ```console + vllm-granite-3-0-2b-instruct-6f9cb88444-2czv 13 14 6.1501 2.97 30.03 18 0 + ``` + + The second file contains the logs of the vllm pods. 
For example: + + ```bash + cat vllm-granite-3-0-2b-instruct-6f9cb88444-2czvd-log.txt + ``` + + Sample output: + ``` bash + INFO 02-13 15:01:07 gpu_model_runner.py:872] Loading model weights took 6.1501 GB + INFO 02-13 15:01:13 backends.py:579] Using cache directory: /data/cache/vllm/torch_compile_cache/02bf430320/rank_0 for vLLM's torch.compile + INFO 02-13 15:01:13 backends.py:587] Dynamo bytecode transform time: 6.77 s + INFO 02-13 15:01:16 backends.py:311] Cache the graph of shape None for later use + INFO 02-13 15:01:37 backends.py:323] Compiling a graph for general shape takes 23.26 s + WARNING 02-13 15:01:39 fused_moe.py:806] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=40,N=512,device_name=NVIDIA_L40S.json + INFO 02-13 15:01:40 monitor.py:33] torch.compile takes 30.03 s in total + INFO 02-13 15:01:40 kv_cache_utils.py:407] # GPU blocks: 33660 + INFO 02-13 15:01:40 kv_cache_utils.py:410] Maximum concurrency for 4096 tokens per request: 131.48x + INFO 02-13 15:01:59 gpu_model_runner.py:1043] Graph capturing finished in 18 secs, took 0.62 GiB + INFO 02-13 15:01:59 core.py:91] init engine (profile, create kv cache, warmup model) took 51.89 seconds + INFO 02-13 15:01:59 api_server.py:756] Using supplied chat template:^M + INFO 02-13 15:01:59 api_server.py:756] None + INFO 02-13 15:01:59 launcher.py:21] Available routes are: + INFO 02-13 15:01:59 launcher.py:29] Route: /openapi.json, Methods: HEAD, GET + INFO 02-13 15:01:59 launcher.py:29] Route: /docs, Methods: HEAD, GET + INFO 02-13 15:01:59 launcher.py:29] Route: /docs/oauth2-redirect, Methods: HEAD, GET + INFO 02-13 15:01:59 launcher.py:29] Route: /redoc, Methods: HEAD, GET + INFO 02-13 15:01:59 launcher.py:29] Route: /health, Methods: GET + INFO 02-13 15:01:59 launcher.py:29] Route: /ping, Methods: GET, POST + ``` + + +2. 
#!/usr/bin/env python3
"""Collect a breakdown of vLLM server startup latency from running pods.

Reads the logs of every vLLM pod selected by a label in a namespace and
writes one tab-delimited row per ready pod to
<output-dir>/vllm-server-startup-latency.txt, plus one <pod>-log.txt file
containing the raw pod logs.

Usage:
    python3 vllm-serverStartupLatency.py <kubeconfig> <k8s-context-name> \
        <pod-label-selector> <namespace> <output-directory>

Pass the literal string "None" for <kubeconfig> / <k8s-context-name> to use
the current context from the KUBECONFIG environment variable.
"""

import os
import sys

import kubernetes
from kubernetes import client, config

from vllmLogParser import (
    get_apiserver_init_time,
    get_before_torch_compile_time,
    get_cuda_graph_time,
    get_engine_init_time,
    get_model_load_time,
    get_model_weight_gb,
    get_torch_compile_time,
)


class Collector:
    """Thin wrapper around the Kubernetes API clients used by this script."""

    def __init__(self, kubeconfig=None, k8s_context=None):
        self.kubeconfig = kubeconfig
        self.context = k8s_context
        (self.core_v1_api,
         self.app_v1_api,
         self.custom_objects_api) = self.load_kubeconfig(kubeconfig, k8s_context)

    def load_kubeconfig(self, kubeconfig, context):
        """Load the kube config and return (CoreV1Api, AppsV1Api, CustomObjectsApi).

        Falls back to the default kube config (KUBECONFIG env var / current
        context) when no explicit kubeconfig is given.
        """
        if kubeconfig is not None and context is None:
            # BUG FIX: the original referenced an undefined name `kubeconfigg`,
            # raising NameError whenever a kubeconfig was given without context.
            config.load_kube_config(config_file=kubeconfig)
        elif kubeconfig is not None and context is not None:
            config.load_kube_config(config_file=kubeconfig, context=context)
        else:
            config.load_kube_config()

        api_client = kubernetes.client.ApiClient()
        return (kubernetes.client.CoreV1Api(api_client),
                kubernetes.client.AppsV1Api(api_client),
                kubernetes.client.CustomObjectsApi(api_client))

    def read_pod_logs(self, pod_name, namespace="default"):
        """Return the logs of a pod as one string, or None on API error."""
        try:
            return self.core_v1_api.read_namespaced_pod_log(
                name=pod_name, namespace=namespace)
        except client.exceptions.ApiException as e:
            print(f"Error reading logs: {e}")
            return None

    def find_pod_by_label(self, label_selector, namespace="default"):
        """Return the list of pods in `namespace` matching `label_selector`."""
        return self.core_v1_api.list_namespaced_pod(
            namespace, label_selector=label_selector).items

    def get_pod_readiness_time(self, pod_name, namespace="default"):
        """Return the last transition time of the pod's Ready condition.

        Returns None when the pod has no Ready condition.
        """
        # BUG FIX: the original iterated over an undefined variable `pod`;
        # fetch the pod object before inspecting its conditions.
        pod = self.core_v1_api.read_namespaced_pod(
            name=pod_name, namespace=namespace)
        for cond in pod.status.conditions or []:
            if cond.type == "Ready":
                return cond.last_transition_time
        return None


def _optional_arg(value):
    """Map the CLI convention of a literal "None" argument to Python None."""
    return None if value == "None" else value


if __name__ == "__main__":
    kubeconfig = _optional_arg(sys.argv[1])    # path to the kubeconfig file
    k8s_context = _optional_arg(sys.argv[2])   # name of the k8s cluster context
    label_selector = sys.argv[3]               # e.g. 'environment=test'
    namespace = sys.argv[4]                    # e.g. 'vllm-test'
    output_dir = sys.argv[5]                   # directory for the output files

    # BUG FIX: the original called Collector() with no arguments and silently
    # ignored the kubeconfig/context command-line parameters.
    c = Collector(kubeconfig, k8s_context)
    pods = c.find_pod_by_label(label_selector, namespace)

    if not pods:
        print("No pods found with label: ", label_selector)
        sys.exit(0)

    with open(os.path.join(output_dir, "vllm-server-startup-latency.txt"), "w") as fout:
        for pod in pods:
            name = pod.metadata.name

            # Only ready, running pods have complete startup logs.
            is_ready = pod.status.phase == "Running" and any(
                cond.status == "True"
                for cond in pod.status.conditions if cond.type == "Ready")
            if not is_ready:
                # BUG FIX: the original had no skip branch for non-ready pods,
                # so execution fell through to fout.write() below and raised
                # NameError on the undefined t1..t7.
                print("Pod is not running or condition status is not 'ready': ", name)
                continue
            print("Pod is running and condition status is 'ready': ", name)

            pod_logs = c.read_pod_logs(name, namespace)
            if not pod_logs:
                continue

            # Save the raw logs next to the latency summary.
            with open(os.path.join(output_dir, name + "-log.txt"), "w") as f:
                f.write(pod_logs)

            # Compute the breakdown of the e2e startup latency for the server.
            temp_logs = pod_logs.splitlines()

            print("Extracting engine initialization latency ...")
            t1 = get_engine_init_time(temp_logs)

            print("Extracting Model Loading latency ...")
            t2 = get_model_load_time(temp_logs)

            print("Extracting Model Weight Loading GB ...")
            t3 = get_model_weight_gb(temp_logs)

            print("Extracting time before torch.compile ...")
            tx = get_before_torch_compile_time(temp_logs)

            print("Extracting torch.compile time ...")
            t5 = get_torch_compile_time(temp_logs)
            # Time between the end of weight loading and torch.compile itself.
            t4 = round(float(tx) - float(t5), 3)

            print("Extracting CUDA graph instantiation latency ...")
            t6 = get_cuda_graph_time(temp_logs)

            print("Extracting API Server Init latency ...")
            t7 = get_apiserver_init_time(temp_logs)

            print('--------------------------------------')
            fout.write("\t".join(
                [name] + [str(v) for v in (t1, t2, t3, t4, t5, t6, t7)]) + "\n")
            fout.flush()
#!/usr/bin/env python3
"""Parsers extracting startup-latency phases from vLLM server log lines.

Log lines look like:

    INFO 02-13 15:01:07 gpu_model_runner.py:872] Loading model weights took 6.1501 GB

The third whitespace-separated token is an HH:MM:SS timestamp, and the
human-readable message follows the first ']'.  Two kinds of extraction are
needed, factored into two private helpers:

  * elapsed seconds between the first occurrences of two marker messages;
  * the first numeric token in the message of a marker line (returned as a
    string; callers convert with float()).

All functions return 0 when the expected marker(s) are absent.
"""
import re
from datetime import datetime, timedelta

# Log-line timestamps carry no date, only wall-clock time.
_TIME_FORMAT = "%H:%M:%S"
# Matches signed integers and decimals; compiled once instead of per line.
_NUMBER_RE = re.compile(r"-?\d+\.\d+|-?\d+")


def _line_timestamp(sentence):
    """Return the HH:MM:SS token (third whitespace field) of a log line."""
    return sentence.split(" ")[2]


def _elapsed_seconds(logs, start_pattern, end_pattern):
    """Whole seconds between the first line matching `start_pattern` and the
    first line matching `end_pattern`; 0 if either marker is missing.

    Timestamps have no date component, so a negative delta is treated as a
    midnight rollover and one day is added (the original returned a bogus
    value of `delta.seconds` on a negative timedelta).
    """
    start = end = None
    for line in logs:
        sentence = line.strip("\n")
        if start is None and re.search(start_pattern, sentence):
            start = _line_timestamp(sentence)
        elif end is None and re.search(end_pattern, sentence):
            end = _line_timestamp(sentence)
        if start is not None and end is not None:
            delta = (datetime.strptime(end, _TIME_FORMAT)
                     - datetime.strptime(start, _TIME_FORMAT))
            if delta < timedelta(0):
                delta += timedelta(days=1)  # crossed midnight
            return delta.seconds
    return 0


def _first_number_after_bracket(logs, pattern):
    """First numeric token after the first ']' on the first line matching
    `pattern`; 0 if no such line (or no number) is found.

    The number is returned as a string, matching the original contract.
    """
    for line in logs:
        sentence = line.strip("\n")
        if re.search(pattern, sentence):
            numbers = _NUMBER_RE.findall(sentence.split("]")[1])
            if numbers:
                return numbers[0]
    return 0


def get_engine_init_time(logs):
    """Seconds from platform detection until model loading starts."""
    return _elapsed_seconds(logs,
                            "Automatically detected platform cuda.",
                            "Starting to load model")


def get_model_load_time(logs):
    """Seconds spent loading the model weights."""
    return _elapsed_seconds(logs,
                            "Starting to load model",
                            "Loading model weights took")


def get_model_weight_gb(logs):
    """Size of the loaded model weights in GB (string), 0 if absent."""
    return _first_number_after_bracket(logs, "Loading model weights took")


def get_torch_compile_time(logs):
    """Total torch.compile time in seconds (string), 0 if absent."""
    return _first_number_after_bracket(logs, "torch.compile takes")


def get_before_torch_compile_time(logs):
    """Seconds from the end of weight loading to the torch.compile summary."""
    return _elapsed_seconds(logs,
                            "Loading model weights took",
                            "torch.compile takes")


def get_cuda_graph_time(logs):
    """CUDA graph capture time in seconds (string), 0 if absent."""
    return _first_number_after_bracket(logs, "Graph capturing finished")


def get_apiserver_init_time(logs):
    """Seconds from engine init completion until the API routes are listed.

    NOTE(review): keyed on the '/invocations' route line; logs from servers
    that do not expose that route will yield 0 — confirm against the target
    vLLM version.
    """
    return _elapsed_seconds(logs,
                            "init engine",
                            "Route: /invocations, Methods: POST")


def get_memory_profile_time(logs):
    """Memory profiling time in seconds (string), 0 if absent."""
    return _first_number_after_bracket(logs, "Memory profiling takes")


def get_model_weight_time(logs):
    """Weight download time (string) from the 'Time took to download weights'
    line; 0 if absent.  The number follows the first ':' of the message."""
    for line in logs:
        sentence = line.strip("\n")
        if re.search("Time took to download weights for", sentence):
            segment = (sentence.split("]")[1]).split(":")[1]
            numbers = _NUMBER_RE.findall(segment)
            if numbers:
                return numbers[0]
    return 0


def convert_string_to_datetime(date_string, format_string):
    """Parse `date_string` with `format_string`; return None on failure."""
    try:
        return datetime.strptime(date_string, format_string)
    except ValueError:
        return None