diff --git a/docs/airgap/mirror-docker-images.md b/docs/airgap/mirror-docker-images.md index b65d29dcc..ccd04df47 100644 --- a/docs/airgap/mirror-docker-images.md +++ b/docs/airgap/mirror-docker-images.md @@ -42,9 +42,8 @@ Then, for each image you want to download, you should pull the image from the re In this example, we're saving all our Docker images to `/tmp/images`: ``` -$ docker pull nvidia/cuda:11.1-devel-ubuntu20.04 -$ docker save -o /tmp/images/nvidia-cuda-11.1-devel-ubuntu20.04.tar nvidia/cuda:11.1-devel-ubuntu20.04 ->>>>>>> e1d0a775 (Airgap documentation update.) +$ docker pull nvcr.io/nvidia/cuda:12.4.1-base-ubuntu22.04 +$ docker save -o /tmp/images/nvidia-cuda-12.4.1-base-ubuntu22.04.tar nvcr.io/nvidia/cuda:12.4.1-base-ubuntu22.04 ``` Additionally, you should download and save the [`registry` image](https://hub.docker.com/_/registry) so that you can deploy a local registry on the offline network. @@ -76,13 +75,8 @@ Additionally, we assume that the `registry` image was included when you transfer Load the registry image into the Docker image cache of your container registry host: -<<<<<<< HEAD ```bash -docker load < /tmp/images/registry-2.7.tar -======= -``` -$ docker load -i /tmp/images/registry-2.7.tar ->>>>>>> e1d0a775 (Airgap documentation update.) 
+docker load -i /tmp/images/registry-3.1.1.tar ``` Then create a Docker volume to store your container images: @@ -99,7 +93,7 @@ docker run -d \ --restart=always \ --name registry \ -v registry-images:/var/lib/registry \ - registry:2.7 + registry:3.1.1 ``` ## Configuring your hosts to use the offline container registry @@ -121,7 +115,7 @@ docker_insecure_registries: Once your registry is running and you've configured your hosts to access it, you can load additional images and push them to the offline registry: ```bash -docker load < /tmp/images/nvidia-cuda-11.1-devel-ubuntu20.04.tar -docker tag nvidia/cuda:11.1-devel-ubuntu20.04 registry-host:5000/nvidia/cuda:11.1-devel-ubuntu20.04 -docker push registry-host:5000/nvidia/cuda:11.1-devel-ubuntu20.04 +docker load -i /tmp/images/nvidia-cuda-12.4.1-base-ubuntu22.04.tar +docker tag nvcr.io/nvidia/cuda:12.4.1-base-ubuntu22.04 registry-host:5000/nvidia/cuda:12.4.1-base-ubuntu22.04 +docker push registry-host:5000/nvidia/cuda:12.4.1-base-ubuntu22.04 ``` diff --git a/docs/airgap/ngc-ready.md b/docs/airgap/ngc-ready.md index 1acebe12a..5917d3e75 100644 --- a/docs/airgap/ngc-ready.md +++ b/docs/airgap/ngc-ready.md @@ -37,9 +37,9 @@ For instructions on setting up an HTTP mirror, see the [doc on HTTP mirrors](./m Container images are only needed if you want to run the tests built into the playbook: -- nvcr.io/nvidia/cuda:10.1-base-ubuntu18.04 -- nvcr.io/nvidia/pytorch:18.10-py3 -- nvcr.io/nvidia/tensorflow:18.10-py3 +- nvcr.io/nvidia/cuda:12.4.1-base-ubuntu22.04 +- nvcr.io/nvidia/pytorch:24.04-py3 +- nvcr.io/nvidia/tensorflow:24.04-tf2-py3 For instructions on setting up a Docker registry mirror, see the [doc on Docker mirrors](./mirror-docker-images.md). 
@@ -63,9 +63,9 @@ For instructions on setting up an HTTP mirror, see the [doc on HTTP mirrors](./m Container images (how to mirror) are only needed if you want to run the tests built into the playbook: -- nvcr.io/nvidia/cuda:10.1-base-ubuntu18.04 -- nvcr.io/nvidia/pytorch:18.10-py3 -- nvcr.io/nvidia/tensorflow:18.10-py3 +- nvcr.io/nvidia/cuda:12.4.1-base-ubuntu22.04 +- nvcr.io/nvidia/pytorch:24.04-py3 +- nvcr.io/nvidia/tensorflow:24.04-tf2-py3 For instructions on setting up a Docker registry mirror, see the [doc on Docker mirrors](./mirror-docker-images.md). @@ -182,9 +182,9 @@ dcgm_rpm_package: "/path/to/datacenter-gpu-manager.rpm" If running the container tests as part of the NGC-Ready playbook, set the following variables in your DeepOps configuration: ```bash -ngc_ready_cuda_container: "/nvidia/cuda:10.1-base-ubuntu18.04" -ngc_ready_pytorch: "/nvidia/pytorch:18.10-py3" -ngc_ready_tensorflow: "/nvidia/tensorflow:18.10-py3" +ngc_ready_cuda_container: "/nvidia/cuda:12.4.1-base-ubuntu22.04" +ngc_ready_pytorch: "/nvidia/pytorch:24.04-py3" +ngc_ready_tensorflow: "/nvidia/tensorflow:24.04-tf2-py3" ``` ## Running the NGC-Ready playbook diff --git a/virtual/scripts/setup_k8s.sh b/virtual/scripts/setup_k8s.sh index 20860d8b3..a38777eab 100755 --- a/virtual/scripts/setup_k8s.sh +++ b/virtual/scripts/setup_k8s.sh @@ -40,7 +40,7 @@ source "${VIRT_DIR}/k8s_environment.sh" # Verify that the cluster is up file ${K8S_CONFIG_DIR}/artifacts/kubectl && chmod +x ${K8S_CONFIG_DIR}/artifacts/kubectl kubectl get nodes -#kubectl run gpu-test --rm -t -i --restart=Never --image=nvcr.io/nvidia/cuda:10.1-base-ubuntu18.04 --limits=nvidia.com/gpu=1 -- nvidia-smi +#kubectl run gpu-test --rm -t -i --restart=Never --image=nvcr.io/nvidia/cuda:12.4.1-base-ubuntu22.04 --limits=nvidia.com/gpu=1 -- nvidia-smi # Install helm "${ROOT_DIR}/scripts/k8s/install_helm.sh" diff --git a/workloads/examples/k8s/cluster-gpu-test-job.yml b/workloads/examples/k8s/cluster-gpu-test-job.yml index 
fd4af4904..8946601c7 100644 --- a/workloads/examples/k8s/cluster-gpu-test-job.yml +++ b/workloads/examples/k8s/cluster-gpu-test-job.yml @@ -10,7 +10,7 @@ spec: spec: containers: - name: cluster-gpu-tests - image: nvcr.io/nvidia/cuda:9.0-base + image: nvcr.io/nvidia/cuda:12.4.1-base-ubuntu22.04 command: ["/bin/bash","-c","nvidia-smi && sleep 10"] args: resources: diff --git a/workloads/examples/k8s/gpu-test-job.yml b/workloads/examples/k8s/gpu-test-job.yml index 7a0abdb71..63da1a375 100644 --- a/workloads/examples/k8s/gpu-test-job.yml +++ b/workloads/examples/k8s/gpu-test-job.yml @@ -5,11 +5,10 @@ metadata: spec: containers: - name: cuda-container - image: nvcr.io/nvidia/cuda:10.0-devel + image: nvcr.io/nvidia/cuda:12.4.1-base-ubuntu22.04 command: ["sleep", "6000"] args: resources: limits: nvidia.com/gpu: 1 restartPolicy: Never - diff --git a/workloads/jenkins/scripts/run-gpu-job.sh b/workloads/jenkins/scripts/run-gpu-job.sh index b1e2be648..470109ff1 100644 --- a/workloads/jenkins/scripts/run-gpu-job.sh +++ b/workloads/jenkins/scripts/run-gpu-job.sh @@ -32,7 +32,7 @@ fi # Occassionally this gpu-test fails and/or hangs. To ease debugging of this we run a describe several seconds into the launch. 
sleep 10 && kubectl describe pods gpu-test & -timeout 300 kubectl run gpu-test --rm -t -i --restart=Never --image=nvcr.io/nvidia/cuda:10.1-base-ubuntu18.04 --limits=nvidia.com/gpu=1 -- nvidia-smi +timeout 300 kubectl run gpu-test --rm -t -i --restart=Never --image=nvcr.io/nvidia/cuda:12.4.1-base-ubuntu22.04 --limits=nvidia.com/gpu=1 -- nvidia-smi # Run multi-GPU test if [ ${DEEPOPS_FULL_INSTALL} ]; then diff --git a/workloads/jenkins/scripts/test-slurm-enroot-job.sh b/workloads/jenkins/scripts/test-slurm-enroot-job.sh index d8168ec94..2504bbce7 100644 --- a/workloads/jenkins/scripts/test-slurm-enroot-job.sh +++ b/workloads/jenkins/scripts/test-slurm-enroot-job.sh @@ -9,5 +9,5 @@ ssh -v \ -i "${HOME}/.ssh/id_rsa" \ "10.0.0.5${GPU01}" \ srun -N1 -G1 \ - --container-image="nvcr.io#nvidia/cuda:10.2-base-ubuntu18.04" \ + --container-image="nvcr.io#nvidia/cuda:12.4.1-base-ubuntu22.04" \ nvidia-smi -L