Add Red Hat family (EL8 and newer) support to the nvidia_container_toolkit
role, route EL8+ hosts to that role from the nvidia-docker playbook, and
refresh the air-gap RPM mirroring guide for EL8/EL9.

REVIEW NOTE (layout): line breaks and indentation in this patch were
reconstructed from a whitespace-collapsed copy. Hunk line counts were
re-checked against every hunk header, but the exact leading whitespace of
context and removed lines in pre-existing files (nvidia-docker.yml, the role's
tasks/main.yml) is inferred and should be confirmed against the tree before
applying.

REVIEW NOTE (docs fix): in docs/airgap/mirror-rpm-repos.md the repository ID
`libnvidia-container` is dropped from the ID list and from the sample
`ls /var/repos/` output. The only NVIDIA container repo file the updated guide
installs is nvidia-container-toolkit.repo, whose repository ID is
`nvidia-container-toolkit`; nothing in the updated list provides
`libnvidia-container`, so `reposync` for that ID would fail. The new-side hunk
offsets are adjusted accordingly. The post-image blob hash on that file's
`index` line is left as submitted and is therefore stale (it is not consulted
by a plain `git apply`).

diff --git a/docs/airgap/mirror-rpm-repos.md b/docs/airgap/mirror-rpm-repos.md
index bef7a7b28..653f327f2 100644
--- a/docs/airgap/mirror-rpm-repos.md
+++ b/docs/airgap/mirror-rpm-repos.md
@@ -25,17 +25,15 @@ If you do not already have mirrors of the distribution repositories available, p
 The following additional RPM repositories are commonly used for GPU-enabled systems deployed by DeepOps:
 
 - [Fedora Extra Packages for Enterprise Linux (EPEL)](https://fedoraproject.org/wiki/EPEL)
-- NVIDIA CUDA repository: [repo file for EL7](https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo), [repo file for EL8](https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel8.repo)
-- NVIDIA container repositories: [repo file for EL7](https://raw.githubusercontent.com/NVIDIA/nvidia-docker/gh-pages/centos7/nvidia-docker.repo), [repo file for EL8](https://raw.githubusercontent.com/NVIDIA/nvidia-docker/gh-pages/centos8/nvidia-docker.repo)
+- NVIDIA CUDA repository: [repo file for EL8](https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo), [repo file for EL9](https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo)
+- NVIDIA Container Toolkit repository: [repo file for RPM-based distributions](https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo)
 - Docker CE repository: [repo file](https://download.docker.com/linux/centos/docker-ce.repo)
 
 These repo files provide the following repository IDs, which will be needed by `reposync` below:
 
 - epel
-- cuda-rhel7-x86_64 or cuda-rhel8-x86_64
-- libnvidia-container
-- nvidia-container-runtime
-- nvidia-docker
+- cuda-rhel8-x86_64 or cuda-rhel9-x86_64
+- nvidia-container-toolkit
 - docker-ce-stable
 
 To discover a complete list of repositories needed for your particular workload,
@@ -49,10 +47,11 @@ On a RHEL or CentOS machine with Internet access, install the `yum-utils` and `c
 sudo yum install yum-utils createrepo
 ```
 
-Then install the EPEL repository:
+Then install the EPEL repository if your workload requires EPEL packages.
+For example, on EL9:
 
 ```bash
-sudo yum install https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm
+sudo yum install https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm
 ```
 
 Then, for each of the other repo files, install the file into the `/etc/yum.repos.d` directory.
@@ -61,8 +60,8 @@ For example, if using the list of repositories from the previous section:
 ```bash
 cd /etc/yum.repos.d
 sudo wget https://download.docker.com/linux/centos/docker-ce.repo
-sudo wget https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
-sudo wget https://raw.githubusercontent.com/NVIDIA/nvidia-docker/gh-pages/centos7/nvidia-docker.repo
+sudo wget https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
+sudo wget https://nvidia.github.io/libnvidia-container/stable/rpm/nvidia-container-toolkit.repo
 ```
 
 For each of the repositories you wish to mirror, run the `reposync` command to download the contents of the repository.
@@ -80,7 +79,7 @@ At this point, you should have one subdirectory for each of the repositories you
 
 ```bash
 ls /var/repos/
-docker-ce-stable nvidia-docker
+docker-ce-stable nvidia-container-toolkit
 ```
 
 For each of these directories, run the `createrepo` command to generate repository metadata:
diff --git a/playbooks/container/nvidia-docker.yml b/playbooks/container/nvidia-docker.yml
index b15edc99f..5c2b9733d 100644
--- a/playbooks/container/nvidia-docker.yml
+++ b/playbooks/container/nvidia-docker.yml
@@ -16,12 +16,15 @@
         state: absent
       when: docker_install | default('yes')
 
-    - name: install NVIDIA Container Toolkit on Ubuntu 24.04 and newer
+    - name: install NVIDIA Container Toolkit on current OS releases
       include_role:
         name: nvidia_container_toolkit
       when:
-        - ansible_local['gpus']['count'] and ansible_distribution == "Ubuntu"
-        - ansible_distribution_version is version('24.04', '>=')
+        - ansible_local['gpus']['count']
+        - >
+          (ansible_distribution == "Ubuntu" and ansible_distribution_version is version('24.04', '>='))
+          or
+          (ansible_os_family == "RedHat" and ansible_distribution_major_version is version('8', '>='))
         - docker_install | default('yes')
 
     - name: install nvidia-docker
@@ -29,6 +32,11 @@
         name: nvidia.nvidia_docker
       when:
         - ansible_local['gpus']['count'] and (ansible_distribution == "Ubuntu" or ansible_os_family == "RedHat")
-        - not (ansible_distribution == "Ubuntu" and ansible_distribution_version is version('24.04', '>='))
+        - >
+          not (
+          (ansible_distribution == "Ubuntu" and ansible_distribution_version is version('24.04', '>='))
+          or
+          (ansible_os_family == "RedHat" and ansible_distribution_major_version is version('8', '>='))
+          )
         - docker_install | default('yes')
   environment: "{{ proxy_env if proxy_env is defined else {} }}"
diff --git a/roles/nvidia_container_toolkit/defaults/main.yml b/roles/nvidia_container_toolkit/defaults/main.yml
index 87577f72c..6159a7a68 100644
--- a/roles/nvidia_container_toolkit/defaults/main.yml
+++ b/roles/nvidia_container_toolkit/defaults/main.yml
@@ -4,6 +4,10 @@ nvidia_container_toolkit_repo_gpg_url: "{{ nvidia_container_toolkit_repo_base_ur
 nvidia_container_toolkit_keyring_ascii_path: "/usr/share/keyrings/nvidia-container-toolkit-keyring.asc"
 nvidia_container_toolkit_keyring_path: "/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg"
 nvidia_container_toolkit_apt_source_path: "/etc/apt/sources.list.d/nvidia-container-toolkit.list"
+nvidia_container_toolkit_rpm_repo_url: "{{ nvidia_container_toolkit_repo_base_url }}/stable/rpm/nvidia-container-toolkit.repo"
+nvidia_container_toolkit_yum_repo_path: "/etc/yum.repos.d/nvidia-container-toolkit.repo"
+nvidia_container_toolkit_rpm_prerequisites:
+  - ca-certificates
 nvidia_container_toolkit_package: "nvidia-container-toolkit"
 nvidia_container_toolkit_configure_docker: true
 nvidia_container_toolkit_set_as_default_runtime: true
diff --git a/roles/nvidia_container_toolkit/tasks/docker.yml b/roles/nvidia_container_toolkit/tasks/docker.yml
new file mode 100644
index 000000000..432bc3e1c
--- /dev/null
+++ b/roles/nvidia_container_toolkit/tasks/docker.yml
@@ -0,0 +1,32 @@
+---
+- name: Docker | ensure Docker configuration directory exists
+  ansible.builtin.file:
+    path: /etc/docker
+    state: directory
+    owner: root
+    group: root
+    mode: "0755"
+
+- name: Docker | check NVIDIA runtime configuration
+  ansible.builtin.command:
+    cmd: >-
+      python3 -c 'import json, pathlib, sys;
+      p = pathlib.Path("/etc/docker/daemon.json");
+      required_default = {{ nvidia_container_toolkit_set_as_default_runtime | bool | ternary("True", "False") }};
+      data = {};
+      text = p.read_text().strip() if p.exists() else "";
+      data = json.loads(text) if text else {};
+      runtime = data.get("runtimes", {}).get("nvidia", {});
+      ok = runtime.get("path") in ("nvidia-container-runtime", "/usr/bin/nvidia-container-runtime");
+      ok = ok and (not required_default or data.get("default-runtime") == "nvidia");
+      sys.exit(0 if ok else 1)'
+  register: nvidia_container_toolkit_docker_runtime
+  failed_when: false
+  changed_when: false
+
+- name: Docker | configure NVIDIA runtime
+  ansible.builtin.command:
+    cmd: "nvidia-ctk runtime configure --runtime=docker{{ ' --set-as-default' if nvidia_container_toolkit_set_as_default_runtime | bool else '' }}"
+  when: nvidia_container_toolkit_docker_runtime.rc != 0
+  changed_when: true
+  notify: restart docker
diff --git a/roles/nvidia_container_toolkit/tasks/main.yml b/roles/nvidia_container_toolkit/tasks/main.yml
index c681d88e6..202a08390 100644
--- a/roles/nvidia_container_toolkit/tasks/main.yml
+++ b/roles/nvidia_container_toolkit/tasks/main.yml
@@ -1,114 +1,18 @@
 ---
-- name: Ubuntu | verify supported distribution
+- name: Verify supported distribution
   ansible.builtin.assert:
     that:
-      - ansible_distribution == "Ubuntu"
-    fail_msg: "The nvidia_container_toolkit role currently supports Ubuntu only."
-
-- name: Ubuntu | set package architecture
-  ansible.builtin.set_fact:
-    nvidia_container_toolkit_deb_arch: "{{ _nvidia_container_toolkit_arch_map.get(ansible_architecture, ansible_architecture) }}"
-  vars:
-    _nvidia_container_toolkit_arch_map:
-      aarch64: arm64
-      arm64: arm64
-      x86_64: amd64
-
-- name: Ubuntu | install repository prerequisites
-  ansible.builtin.apt:
-    name:
-      - ca-certificates
-      - gnupg
-    state: present
-    update_cache: true
-  when: nvidia_container_toolkit_repo_base_url | length > 0
-
-- name: Ubuntu | ensure keyring directory exists
-  ansible.builtin.file:
-    path: "{{ nvidia_container_toolkit_keyring_path | dirname }}"
-    state: directory
-    owner: root
-    group: root
-    mode: "0755"
-
-- name: Ubuntu | check NVIDIA Container Toolkit keyring
-  ansible.builtin.stat:
-    path: "{{ nvidia_container_toolkit_keyring_path }}"
-  register: nvidia_container_toolkit_keyring
-
-- name: Ubuntu | download NVIDIA Container Toolkit GPG key
-  ansible.builtin.get_url:
-    url: "{{ nvidia_container_toolkit_repo_gpg_url }}"
-    dest: "{{ nvidia_container_toolkit_keyring_ascii_path }}"
-    owner: root
-    group: root
-    mode: "0644"
-  register: nvidia_container_toolkit_key
-  environment: "{{ proxy_env if proxy_env is defined else {} }}"
-
-- name: Ubuntu | install NVIDIA Container Toolkit GPG keyring
-  ansible.builtin.command:
-    cmd: "gpg --dearmor --yes -o {{ nvidia_container_toolkit_keyring_path }} {{ nvidia_container_toolkit_keyring_ascii_path }}"
-  when: nvidia_container_toolkit_key.changed or not nvidia_container_toolkit_keyring.stat.exists
-  register: nvidia_container_toolkit_keyring_install
-  changed_when: true
-
-- name: Ubuntu | set NVIDIA Container Toolkit GPG keyring permissions
-  ansible.builtin.file:
-    path: "{{ nvidia_container_toolkit_keyring_path }}"
-    owner: root
-    group: root
-    mode: "0644"
-
-- name: Ubuntu | configure NVIDIA Container Toolkit APT repository
-  ansible.builtin.copy:
-    content: |
-      deb [signed-by={{ nvidia_container_toolkit_keyring_path }}] {{ nvidia_container_toolkit_repo_base_url }}/stable/deb/{{ nvidia_container_toolkit_deb_arch }} /
-    dest: "{{ nvidia_container_toolkit_apt_source_path }}"
-    owner: root
-    group: root
-    mode: "0644"
-  register: nvidia_container_toolkit_apt_source
+      - ansible_distribution == "Ubuntu" or ansible_os_family == "RedHat"
+    fail_msg: "The nvidia_container_toolkit role supports Ubuntu and Red Hat family hosts only."
 
 - name: Ubuntu | install NVIDIA Container Toolkit
-  ansible.builtin.apt:
-    name: "{{ nvidia_container_toolkit_package }}"
-    state: present
-    update_cache: "{{ nvidia_container_toolkit_apt_source.changed or nvidia_container_toolkit_keyring_install.changed | default(false) }}"
-  environment: "{{ proxy_env if proxy_env is defined else {} }}"
-
-- name: Docker | ensure Docker configuration directory exists
-  ansible.builtin.file:
-    path: /etc/docker
-    state: directory
-    owner: root
-    group: root
-    mode: "0755"
-  when: nvidia_container_toolkit_configure_docker | bool
+  ansible.builtin.include_tasks: ubuntu.yml
+  when: ansible_distribution == "Ubuntu"
 
-- name: Docker | check NVIDIA runtime configuration
-  ansible.builtin.command:
-    cmd: >-
-      python3 -c 'import json, pathlib, sys;
-      p = pathlib.Path("/etc/docker/daemon.json");
-      required_default = {{ nvidia_container_toolkit_set_as_default_runtime | bool | ternary("True", "False") }};
-      data = {};
-      text = p.read_text().strip() if p.exists() else "";
-      data = json.loads(text) if text else {};
-      runtime = data.get("runtimes", {}).get("nvidia", {});
-      ok = runtime.get("path") in ("nvidia-container-runtime", "/usr/bin/nvidia-container-runtime");
-      ok = ok and (not required_default or data.get("default-runtime") == "nvidia");
-      sys.exit(0 if ok else 1)'
-  register: nvidia_container_toolkit_docker_runtime
-  failed_when: false
-  changed_when: false
-  when: nvidia_container_toolkit_configure_docker | bool
+- name: Red Hat | install NVIDIA Container Toolkit
+  ansible.builtin.include_tasks: redhat.yml
+  when: ansible_os_family == "RedHat"
 
 - name: Docker | configure NVIDIA runtime
-  ansible.builtin.command:
-    cmd: "nvidia-ctk runtime configure --runtime=docker{{ ' --set-as-default' if nvidia_container_toolkit_set_as_default_runtime | bool else '' }}"
-  when:
-    - nvidia_container_toolkit_configure_docker | bool
-    - nvidia_container_toolkit_docker_runtime.rc != 0
-  changed_when: true
-  notify: restart docker
+  ansible.builtin.include_tasks: docker.yml
+  when: nvidia_container_toolkit_configure_docker | bool
diff --git a/roles/nvidia_container_toolkit/tasks/redhat.yml b/roles/nvidia_container_toolkit/tasks/redhat.yml
new file mode 100644
index 000000000..74744e41e
--- /dev/null
+++ b/roles/nvidia_container_toolkit/tasks/redhat.yml
@@ -0,0 +1,23 @@
+---
+- name: Red Hat | install repository prerequisites
+  ansible.builtin.dnf:
+    name: "{{ nvidia_container_toolkit_rpm_prerequisites }}"
+    state: present
+  when: nvidia_container_toolkit_repo_base_url | length > 0
+
+- name: Red Hat | configure NVIDIA Container Toolkit Yum repository
+  ansible.builtin.get_url:
+    url: "{{ nvidia_container_toolkit_rpm_repo_url }}"
+    dest: "{{ nvidia_container_toolkit_yum_repo_path }}"
+    owner: root
+    group: root
+    mode: "0644"
+  register: nvidia_container_toolkit_yum_repo
+  environment: "{{ proxy_env if proxy_env is defined else {} }}"
+
+- name: Red Hat | install NVIDIA Container Toolkit
+  ansible.builtin.dnf:
+    name: "{{ nvidia_container_toolkit_package }}"
+    state: present
+    update_cache: "{{ nvidia_container_toolkit_yum_repo.changed }}"
+  environment: "{{ proxy_env if proxy_env is defined else {} }}"
diff --git a/roles/nvidia_container_toolkit/tasks/ubuntu.yml b/roles/nvidia_container_toolkit/tasks/ubuntu.yml
new file mode 100644
index 000000000..e446a8ced
--- /dev/null
+++ b/roles/nvidia_container_toolkit/tasks/ubuntu.yml
@@ -0,0 +1,72 @@
+---
+- name: Ubuntu | set package architecture
+  ansible.builtin.set_fact:
+    nvidia_container_toolkit_deb_arch: "{{ _nvidia_container_toolkit_arch_map.get(ansible_architecture, ansible_architecture) }}"
+  vars:
+    _nvidia_container_toolkit_arch_map:
+      aarch64: arm64
+      arm64: arm64
+      x86_64: amd64
+
+- name: Ubuntu | install repository prerequisites
+  ansible.builtin.apt:
+    name:
+      - ca-certificates
+      - gnupg
+    state: present
+    update_cache: true
+  when: nvidia_container_toolkit_repo_base_url | length > 0
+
+- name: Ubuntu | ensure keyring directory exists
+  ansible.builtin.file:
+    path: "{{ nvidia_container_toolkit_keyring_path | dirname }}"
+    state: directory
+    owner: root
+    group: root
+    mode: "0755"
+
+- name: Ubuntu | check NVIDIA Container Toolkit keyring
+  ansible.builtin.stat:
+    path: "{{ nvidia_container_toolkit_keyring_path }}"
+  register: nvidia_container_toolkit_keyring
+
+- name: Ubuntu | download NVIDIA Container Toolkit GPG key
+  ansible.builtin.get_url:
+    url: "{{ nvidia_container_toolkit_repo_gpg_url }}"
+    dest: "{{ nvidia_container_toolkit_keyring_ascii_path }}"
+    owner: root
+    group: root
+    mode: "0644"
+  register: nvidia_container_toolkit_key
+  environment: "{{ proxy_env if proxy_env is defined else {} }}"
+
+- name: Ubuntu | install NVIDIA Container Toolkit GPG keyring
+  ansible.builtin.command:
+    cmd: "gpg --dearmor --yes -o {{ nvidia_container_toolkit_keyring_path }} {{ nvidia_container_toolkit_keyring_ascii_path }}"
+  when: nvidia_container_toolkit_key.changed or not nvidia_container_toolkit_keyring.stat.exists
+  register: nvidia_container_toolkit_keyring_install
+  changed_when: true
+
+- name: Ubuntu | set NVIDIA Container Toolkit GPG keyring permissions
+  ansible.builtin.file:
+    path: "{{ nvidia_container_toolkit_keyring_path }}"
+    owner: root
+    group: root
+    mode: "0644"
+
+- name: Ubuntu | configure NVIDIA Container Toolkit APT repository
+  ansible.builtin.copy:
+    content: |
+      deb [signed-by={{ nvidia_container_toolkit_keyring_path }}] {{ nvidia_container_toolkit_repo_base_url }}/stable/deb/{{ nvidia_container_toolkit_deb_arch }} /
+    dest: "{{ nvidia_container_toolkit_apt_source_path }}"
+    owner: root
+    group: root
+    mode: "0644"
+  register: nvidia_container_toolkit_apt_source
+
+- name: Ubuntu | install NVIDIA Container Toolkit
+  ansible.builtin.apt:
+    name: "{{ nvidia_container_toolkit_package }}"
+    state: present
+    update_cache: "{{ nvidia_container_toolkit_apt_source.changed or nvidia_container_toolkit_keyring_install.changed | default(false) }}"
+  environment: "{{ proxy_env if proxy_env is defined else {} }}"