diff --git a/README.md b/README.md index 68df95a20..84edf7657 100644 --- a/README.md +++ b/README.md @@ -51,7 +51,8 @@ The cluster nodes will follow the requirements described by Slurm or Kubernetes. - CentOS 7, 8 - Red Hat Enterprise Linux / Rocky Linux 8 and 9 for the DGX software stack through the `nvidia-dgx` role -You may also install a supported operating system on all servers via a 3rd-party solution (i.e. [MAAS](https://maas.io/), [Foreman](https://www.theforeman.org/)) or utilize the provided [OS install container](docs/pxe/minimal-pxe-container.md). +You may also install a supported operating system on all servers via a 3rd-party solution such as [MAAS](https://maas.io/) or [Foreman](https://www.theforeman.org/), or via an existing site-standard automated installer. +For new Ubuntu 24.04 or DGX OS 7 deployments, prefer Ubuntu autoinstall/cloud-init or MAAS and then apply DeepOps roles after the OS is present. For DGX platform software installation on top of vanilla Ubuntu or Red Hat family operating systems, see the [DGX software stack role guide](docs/deepops/dgx-software-stack.md). ### Kubernetes diff --git a/config.example/group_vars/all.yml b/config.example/group_vars/all.yml index b7de1ed73..d98664bdc 100644 --- a/config.example/group_vars/all.yml +++ b/config.example/group_vars/all.yml @@ -253,7 +253,9 @@ maas_adminusers: maas_dns_domain: 'deepops.local' maas_region_controller: '192.168.1.1' maas_region_controller_url: 'http://{{ maas_region_controller }}:5240/MAAS' -maas_repo: 'ppa:maas/3.5' +# MAAS 3.7 is the current Ubuntu 24.04 line. Keep the 3.5 PPA when the MAAS +# controller itself still runs Ubuntu 22.04. 
+maas_repo: "{{ 'ppa:maas/3.7' if ansible_distribution_version is version('24.04', '>=') else 'ppa:maas/3.5' }}" # Defines if maas user should generate ssh keys # Usable for remote KVM/libvirt power actions diff --git a/config.example/helm/dgxie.yml b/config.example/helm/dgxie.yml deleted file mode 100644 index 8417e9c22..000000000 --- a/config.example/helm/dgxie.yml +++ /dev/null @@ -1,62 +0,0 @@ -# -# DGXie configuration: -# - -# Default DGX boot mode -# Options: -# "DGX": boot DGX to install media -# "local": boot DGX to local disk -bootMode: DGX - -# DGX network interface to use during install -dgxNetInt: enp1s0f0 - -# DGX disk to use during install -dgxDisk: sda - -# DGX Keyboard layout to use -dgxKbd: us - -# Extra kernel parameters to pass during DGX OS install -# i.e. "rebuild-raid" -dgxKernExtra: '' - -# Network interface of public network on management servers -mgmtIntPub: eth0 - -# Network interface of private network where DGX are connected on management servers -mgmtIntPrv: eth1 - -# Network domain -netDomain: local - -# IP address of private network interface on management server -netPrvIp: 192.168.1.1 - -# Private Network -netPrvNet: 192.168.1.0 - -# Private network netmask -netPrvNetmask: 255.255.255.0 - -# Private network gateway -netPrvGateway: 192.168.1.1 - -# DNS nameservers -netPrvDns1: 8.8.8.8 -netPrvDns2: 8.8.4.4 - -# DHCP dynamic address range -netPrvDhcpStart: 192.168.1.100 -netPrvDhcpEnd: 192.168.1.199 - -#ntp: 192.168.1.10 - -# DHCP lease time -netPrvDhcpLease: 7200 - -# HTTPS proxy to use in preseed -#httpsProxy: "http://192.168.2.1:3128" - -# Install extra packages -extraPackages: "aptitude ubuntu-minimal dirmngr" diff --git a/config.example/pxe/dnsmasq.extra.conf b/config.example/pxe/dnsmasq.extra.conf deleted file mode 100644 index b46aeb66b..000000000 --- a/config.example/pxe/dnsmasq.extra.conf +++ /dev/null @@ -1,9 +0,0 @@ -# -# Additional DNSMASQ configuration -# - -# If the dhcp-ignore flag is specified in this fashion, only hosts 
configured with dhcp-host will be given DHCP -#dhcp-ignore=tag:!known - -# Example static IP; note this will not work for bonded interfaces -#dhcp-host=12:34:56:78,server-01,192.168.1.23 diff --git a/config.example/pxe/env b/config.example/pxe/env deleted file mode 100644 index dafa98528..000000000 --- a/config.example/pxe/env +++ /dev/null @@ -1,49 +0,0 @@ -## Docker Compose settings -COMPOSE_PROJECT_NAME=deepops -COMPOSE_FILE=src/containers/dgxie/docker-compose.yml - -## DHCP/PXE server settings -DHCP_ENABLE=1 -DHCP_INT=eth1 -NETWORK=192.168.1.0 -NETMASK=255.255.255.0 -GATEWAY=192.168.1.1 -DNS1=8.8.8.8 -DNS2=8.8.4.4 -DHCP_START=192.168.1.2 -DHCP_END=192.168.1.254 -LEASETIME=7200 -DOMAIN=local -NTP=pool.ntp.org - -## NAT settings -NAT_ENABLE=1 -NAT_INT_PUB=eth0 -NAT_INT_PRV=eth1 - -## HTTP server settings -HTTP_INT=eth1 -HTTP_PORT=13370 - -## NFS server settings -# Directory on host containing ISOs, drivers, etc. -DATA_DIR=/home/ubuntu - -## VMware install settings -VMW_NFS_IP=192.168.1.1 -VMW_GPU_VIB=NVIDIA-VMware_ESXi_6.7_Host_Driver-418.66-1OEM.670.0.0.8169922.x86_64.vib -VMW_VM_SIZE=100g - -## DGX OS install settings -# DGX boot interface -DGX_INT=enp1s0f0 -# DGX boot disk -DGX_DISK=sda -# DGX keyboard -DGX_KBD=us -# DGX extra kernel params -DGX_KERN_EXTRA="" -# DGX extra packages to install -DGX_EXTRA_PACKAGES="" -# DGX install proxy -DGX_HTTPS_PROXY="" diff --git a/config.example/pxe/ipmi.conf b/config.example/pxe/ipmi.conf deleted file mode 100644 index 7cd5ea95a..000000000 --- a/config.example/pxe/ipmi.conf +++ /dev/null @@ -1,4 +0,0 @@ -# This configuration file is used while rebooting DGX servers into PXE boot -# This information is used to connect to the DGX BMC -IPMI_USERNAME=dgxuser -IPMI_PASSWORD=dgxuser \ No newline at end of file diff --git a/config.example/pxe/ipmi_host_list b/config.example/pxe/ipmi_host_list deleted file mode 100644 index be3af39a6..000000000 --- a/config.example/pxe/ipmi_host_list +++ /dev/null @@ -1,4 +0,0 @@ -# This 
configuration file is used while rebooting DGX servers into PXE boot -# This information is used to connect to the DGX BMC -10.0.0.1 -10.0.0.2 \ No newline at end of file diff --git a/config.example/pxe/machines/machines.json b/config.example/pxe/machines/machines.json deleted file mode 100644 index b03b94ed7..000000000 --- a/config.example/pxe/machines/machines.json +++ /dev/null @@ -1,50 +0,0 @@ -{ - "esxi-example": { - "mac": "54:ab:3a.*", - "kernel": "http://localhost:$HTTP_PORT/vmware/mboot.efi", - "cmdline": "-c {{ URL \"http://localhost:$HTTP_PORT/vmware/boot.cfg\" }} rdblacklist=nouveau ip=dhcp nomodeset rw console=tty0 console=ttyS0,115200n8", - "message": "VMware ESXi" - }, - "dgx-station-example": { - "mac": "2c:56:dc.*", - "kernel": "http://localhost:$HTTP_PORT/iso/install/netboot/ubuntu-installer/amd64/linux", - "initrd": ["http://localhost:$HTTP_PORT/iso/install/netboot/ubuntu-installer/amd64/initrd.gz"], - "cmdline": "quiet url={{ URL \"http://localhost:$HTTP_PORT/dgx.seed\" }} ramdisk_size=100000 locale=en_US.UTF-8 auto=true priority=critical kbd-chooser/method=us netcfg/choose_interface=enp1s0f0 netcfg/get_hostname=dgx-server netcfg/dhcp_timeout=120", - "message": "DGX Station 4.1.0" - }, - "dgx-1-example": { - "mac": "d8:c4:97.*", - "kernel": "http://localhost:$HTTP_PORT/iso/install/netboot/ubuntu-installer/amd64/linux", - "initrd": ["http://localhost:$HTTP_PORT/iso/install/netboot/ubuntu-installer/amd64/initrd.gz"], - "cmdline": "quiet url={{ URL \"http://localhost:$HTTP_PORT/dgx.seed\" }} ramdisk_size=100000 locale=en_US.UTF-8 auto=true priority=critical kbd-chooser/method=us netcfg/choose_interface=enp1s0f0 netcfg/get_hostname=dgx-server netcfg/dhcp_timeout=120", - "message": "DGX-1 Server 4.1.0" - }, - "dgx-2-example": { - "mac": "5c:ff:35.*", - "kernel": "http://localhost:$HTTP_PORT/iso/install/netboot/ubuntu-installer/amd64/linux", - "initrd": ["http://localhost:$HTTP_PORT/iso/install/netboot/ubuntu-installer/amd64/initrd.gz"], - "cmdline": 
"quiet url={{ URL \"http://localhost:$HTTP_PORT/dgx.seed\" }} ramdisk_size=100000 locale=en_US.UTF-8 auto=true priority=critical kbd-chooser/method=us netcfg/choose_interface=enp1s0f0 netcfg/get_hostname=dgx-server netcfg/dhcp_timeout=120", - "message": "DGX-2 Server 4.1.0" - }, - "64-bit-ubuntu-example": { - "mac": "2c:56:dc:47:f5:1f", - "kernel": "http://archive.ubuntu.com/ubuntu/dists/xenial-updates/main/installer-amd64/current/images/netboot/ubuntu-installer/amd64/linux", - "initrd": ["http://archive.ubuntu.com/ubuntu/dists/xenial-updates/main/installer-amd64/current/images/netboot/ubuntu-installer/amd64/initrd.gz"], - "cmdline": "locale=en_US.UTF-8 priority=critical kbd-chooser/method=us", - "message": "Ubuntu 16.04" - }, - "64-bit-coreos-example": { - "mac": "2c:56:dc:47:f5:1f", - "kernel": "https://alpha.release.core-os.net/amd64-usr/current/coreos_production_pxe.vmlinuz", - "initrd": ["https://alpha.release.core-os.net/amd64-usr/current/coreos_production_pxe_image.cpio.gz"], - "cmdline": "coreos.autologin", - "message": "CoreOS Alpha" - }, - "32-bit-ubuntu-example": { - "mac": "00:24:e8:de:bf:53", - "kernel": "http://archive.ubuntu.com/ubuntu/dists/xenial-updates/main/installer-i386/current/images/netboot/ubuntu-installer/i386/linux", - "initrd": ["http://archive.ubuntu.com/ubuntu/dists/xenial-updates/main/installer-i386/current/images/netboot/ubuntu-installer/i386/initrd.gz"], - "cmdline": "locale=en_US.UTF-8 priority=critical kbd-chooser/method=us", - "message": "all your netbooks are belong to us" - } -} diff --git a/docs/airgap/mirror-apt-repos.md b/docs/airgap/mirror-apt-repos.md index 93064f704..d6d6d685e 100644 --- a/docs/airgap/mirror-apt-repos.md +++ b/docs/airgap/mirror-apt-repos.md @@ -212,7 +212,7 @@ deb http://archive.ubuntu.com/ubuntu noble-security main restricted universe mul deb http://archive.ubuntu.com/ubuntu noble-updates main restricted universe multiverse deb http://archive.ubuntu.com/ubuntu noble-proposed main restricted universe 
multiverse deb http://archive.ubuntu.com/ubuntu noble-backports main restricted universe multiverse -deb http://ppa.launchpad.net/maas/3.5/ubuntu noble main +deb http://ppa.launchpad.net/maas/3.7/ubuntu noble main deb http://archive.canonical.com/ubuntu noble partner deb-src http://archive.ubuntu.com/ubuntu noble main restricted universe multiverse diff --git a/docs/k8s-cluster/README.md b/docs/k8s-cluster/README.md index b7f1e32b1..3230f24a5 100644 --- a/docs/k8s-cluster/README.md +++ b/docs/k8s-cluster/README.md @@ -35,7 +35,8 @@ Instructions for deploying a GPU cluster with Kubernetes 1. Install a supported operating system on all nodes. - Install a supported operating system on all servers via a 3rd-party solution (i.e. [MAAS](https://maas.io/), [Foreman](https://www.theforeman.org/)) or utilize the provided [OS install container](../pxe). + Install a supported operating system on all servers via a 3rd-party solution such as [MAAS](https://maas.io/) or [Foreman](https://www.theforeman.org/), or via an existing site-standard automated installer. + For new Ubuntu 24.04 or DGX OS 7 deployments, prefer Ubuntu autoinstall/cloud-init or MAAS and then apply DeepOps after the OS is present. 2. Set up your provisioning machine. diff --git a/docs/k8s-cluster/roce-perf-k8s.md b/docs/k8s-cluster/roce-perf-k8s.md index 6f5d12c1b..13eb0911d 100644 --- a/docs/k8s-cluster/roce-perf-k8s.md +++ b/docs/k8s-cluster/roce-perf-k8s.md @@ -71,7 +71,8 @@ add switch PFC, ECN configuration 2. Install a supported operating system on all nodes. - Install a supported operating system on all servers utilizing the [DGXie](/docs/pxe/dgxie-container.md) provisioning container, via a 3rd-party solution (i.e. [MAAS](https://maas.io/), [Foreman](https://www.theforeman.org/)), or server BMC/console. 
+ Install a supported operating system on all servers via a 3rd-party solution such as [MAAS](https://maas.io/) or [Foreman](https://www.theforeman.org/), via an existing site-standard automated installer, or through server BMC/console. + For new Ubuntu 24.04 or DGX OS 7 deployments, prefer Ubuntu autoinstall/cloud-init or MAAS. > NOTE: During OS installation, it is ideal if the identical user/password is configured. Otherwise, follow step 4 below to create an identical user across all nodes in the cluster. diff --git a/docs/pxe/README.md b/docs/pxe/README.md index cf0ea0b5a..3c22bd47c 100644 --- a/docs/pxe/README.md +++ b/docs/pxe/README.md @@ -14,10 +14,10 @@ Most of the playbooks in DeepOps are agnostic to the OS install tooling, assumin For example, DeepOps can be used to deploy a [Slurm cluster](../slurm-cluster/) or a [Kubernetes cluster](../k8s-cluster) regardless of how the OS was installed. This makes it relatively easy to integrate with an existing datacenter environment. -However, DeepOps does provide tooling for several PXE installation mechanisms which can be used if an existing tool isn't already deployed. -These include: +DeepOps does not try to replace a site provisioning system. +For environments without an existing bare-metal provisioning workflow, DeepOps provides MAAS setup guidance: - [MAAS](./maas.md), an open-source bare-metal provisioning tool developed by [Canonical](https://canonical.com/) -- [DGXIE](./dgxie-container.md), a containerized deployment tool developed specifically to deploy NVIDIA DGX OS - - [DGXIE on Kubernetes](./dgxie-on-k8s.md) -- A minimal [PXE container](./minimal-pxe-container.md) which wraps [Pixiecore](https://github.com/danderson/netboot/tree/master/pixiecore), an open source tool for network booting + +For new Ubuntu 24.04 or DGX OS 7 cluster deployments, prefer MAAS, an existing site provisioning system, or Ubuntu autoinstall/cloud-init. 
+NVIDIA DGX OS 7 supports installing the DGX Software Stack on regular Ubuntu 24.04 for cluster deployments, which is a better fit for current automated installation tooling than the retired legacy DGX OS installer workflows. diff --git a/docs/pxe/dgxie-container.md b/docs/pxe/dgxie-container.md deleted file mode 100644 index 98640da0c..000000000 --- a/docs/pxe/dgxie-container.md +++ /dev/null @@ -1,62 +0,0 @@ -# DGXIE Container - -DGXie is an all-in-one container for DHCP, DNS, and PXE, specifically tailored to the DGX Base OS. - -- [DGXIE Container](#dgxie-container) - - [Download DGX ISO](#download-dgx-iso) - - [Configure](#configure) - - [Deploy DGXie container](#deploy-dgxie-container) - - [Testing the DGXie PXE service](#testing-the-dgxie-pxe-service) - - [PXE booting the DGX](#pxe-booting-the-dgx) - - [Making updates](#making-updates) - -## Download DGX ISO - -You will need to download the official DGX Base OS ISO image to your provisioning machine. The latest DGX Base OS is available via the NVIDIA Enterprise Support Portal (ESP). - -Update the `DATA_DIR` specified in `config/pxe/env` and copy the DGX Base OS ISO there. - -## Configure - -Configuration information for DGXie is located in `config/pxe`. - -Update the `config/pxe/dnsmasq.extra.conf` with additional options, such as assigning static IPs by MAC address. - -DGXie uses docker-compose to build and run. The `src/containers/dgxie/docker-compose-yml` file consumes several environment variables that are defined in `config/pxe/env`. Changes to the DHCP range, network used for serving up PXE files, and other values can be updated there. Be sure to update the `eth1` and `eth0` values to match your machine interfaces or the DGXie will fail to start. - -> Note: This assumes you have run the setup.sh script. If you have not, you must manually copy the example config and install docker/docker-compose. 
- -## Deploy DGXie container - -```bash -./scripts/pxe/build_and_restart_dgxie.sh -``` - -## Testing the DGXie PXE service - -If the default HTTP_PORT or machines.json file have not been changed, the below curl call should verify that the PXE API is responding: - -```bash -curl localhost:13370/v1/boot/d8:c4:97:00:00:00 -``` - -## PXE booting the DGX - -The DGX servers can be PXE booted manually through the console. The DeepOps repo also provides the `dgxctl.sh` tool to automate this process using IPMI. - -Update the `config/pxe/ipmi_host_list` file with a list of BMC IPs. -Update the `config/pxe/ipmi.conf` file with the proper username and password. - -Run: - -```bash -./scripts/pxe/dgxctl.sh -i -``` - -> Note: This tool assumes all DGX systems are configured with the same username and password. - -## Making updates - -To make configuration changes or ISO updates, update the config files or ISO followed by re-running `./scripts/pxe/build_and_restart_dgxie.sh`. This will tear down the old DGXie and start a new one with the configuration changes. - -Updates to the machines.json file do not require a restart. diff --git a/docs/pxe/dgxie-on-k8s.md b/docs/pxe/dgxie-on-k8s.md deleted file mode 100644 index 5aadfc772..000000000 --- a/docs/pxe/dgxie-on-k8s.md +++ /dev/null @@ -1,87 +0,0 @@ -# DGXIE on Kubernetes - -DGXie is an all-in-one container for DHCP, DNS, and PXE, specifically tailored to the DGX Base OS. - -- [DGXIE on Kubernetes](#dgxie-on-kubernetes) - - [Setup](#setup) - - [Configure](#configure) - - [Deploy DGXie service](#deploy-dgxie-service) - - [Updating DHCP Configuration](#updating-dhcp-configuration) - - [Updating PXE Machines](#updating-pxe-machines) - -## Setup - -You will need to download the official DGX Base OS ISO image to your provisioning machine. The latest DGX Base OS is available via the NVIDIA Enterprise Support Portal (ESP). 
- -Copy the DGX Base OS ISO to shared storage via a container running in Kubernetes, substituting the path to the DGX ISO you downloaded (be sure to wait for the `iso-loader` POD to be in the _Running_ state before attempting to copy the ISO): - -```bash -kubectl apply -f workloads/services/k8s/iso-loader.yml -kubectl cp /local/DGXServer-4.0.2.180925_6acd9c.iso $(kubectl get pod -l app=iso-loader -o custom-columns=:metadata.name --no-headers):/data/iso/ -``` - -> Note: If the `iso-loader` POD fails to mount the CephFS volume, you may need to restart the kubelet service on the master node(s): `ansible mgmt -b -a "systemctl restart kubelet"` -> You may see an error that looks like this in your syslog file: `failed to get Plugin from volumeSpec for volume "cephfs" err=no volume plugin matched` - -## Configure - -Modify the DGXie configuration in `config/helm/dgxie.yml` to set values for the DHCP server and DGX install process. - -Modify `config/dhcpd.hosts.conf` to add a static IP lease for each login node and DGX server in the cluster if required. IP addresses should match those used in the `config/inventory` file. You may also add other valid configuration options for dnsmasq to this file. - -```bash -grep TODO config/* -``` - -> Note: There are several `TODO` comments in these configuration files that will likely need to be changed. Depending on the system architecture there may be additional required config changes. - -You can get the MAC address of DGX system interfaces via the BMC, for example: - -```bash -# interface 1 -ipmitool -I lanplus -U -P -H raw 0x30 0x19 0x00 0x02 | tail -c 18 | tr ' ' ':' -# interface 2 -ipmitool -I lanplus -U -P -H raw 0x30 0x19 0x00 0x12 | tail -c 18 | tr ' ' ':' -``` - -Modify `config/machines.json` to add a PXE entry for each DGX. Copy the `dgx-example` section and modify the MAC address for each DGX you would like to boot. You can modify boot parameters or install alternate operating systems if required. 
- -Store the config files as config-maps in Kubernetes, even if you have not made any changes (the DGXie container will try to mount these config maps): - -```bash -kubectl create configmap dhcpd --from-file=config/dhcpd.hosts.conf -kubectl create configmap pxe-machines --from-file=config/machines.json -``` - -## Deploy DGXie service - -Launch the DGXie service: - -```bash -helm install --values config/helm/dgxie.yml workloads/services/k8s/dgxie -``` - -Check the DGXie logs to make sure the services were started without errors: - -```bash -kubectl logs -l app=dgxie -``` - -> NOTE: If you later make changes to `config/dhcpd.hosts.conf` or `machines.json` you can follow the [steps](#updating-pxe-machines) to update the dgxie service. - -## Updating DHCP Configuration - -If you make changes to `config/dhcpd.hosts.conf`, you can update the file in Kubernetes and restart the service with: - -```bash -kubectl create configmap dhcpd --from-file=config/dhcpd.hosts.conf -o yaml --dry-run | kubectl replace -f - -kubectl delete pod -l app=dgxie -``` - -## Updating PXE Machines - -If you make changes to `machines.json`, you can update the file without having to restart the DGXie POD: - -```bash -kubectl create configmap pxe-machines --from-file=config/machines.json -o yaml --dry-run | kubectl replace -f - -``` diff --git a/docs/pxe/maas.md b/docs/pxe/maas.md index 1deb6dad1..08a22418b 100644 --- a/docs/pxe/maas.md +++ b/docs/pxe/maas.md @@ -3,7 +3,8 @@ OS Provisioning with MAAS - [MAAS](#maas) - - [Summary](#summary) + - [Introduction](#introduction) + - [DeepOps and MAAS operating model](#deepops-and-maas-operating-model) - [Pre-requisites](#pre-requisites) - [Installing MAAS with DeepOps](#installing-maas-with-deepops) - [Configuring MAAS](#configuring-maas) @@ -33,11 +34,29 @@ This guide was originally written using MAAS 2.8; current MAAS releases are 3.x. MAAS has a lot of different configuration options which are outside the scope of this guide. 
For the best reference on how to use MAAS in general, see the [documentation on maas.io](https://maas.io/docs). +## DeepOps and MAAS operating model + +DeepOps should stay modular and simple: MAAS is the source of truth for bare-metal provisioning state, while DeepOps consumes that state through small, understandable scripts and Ansible inventory. +MAAS owns machine lifecycle, power control, PXE/DHCP behavior, OS images, commissioning, deployment, release, pools, zones, and machine tags. +DeepOps should not run a second reconciliation loop for those responsibilities or become a replacement for NVIDIA Base Command Manager (BCM). + +The DeepOps-owned state should stay deliberately small: + +- MAAS tags that map deployed machines into DeepOps inventory groups, such as `kube_control_plane`, `kube_node`, `slurm-master`, and `slurm-node`. +- A dynamic inventory view from `scripts/maas_inventory.py`, derived from deployed MAAS machines and tags. +- Explicit deploy, tag, status, and release operations through `scripts/maas_deploy.sh`. +- Ansible run artifacts and validation results that record what DeepOps last applied and observed. + +This keeps DeepOps useful for lightweight cluster setup without competing with larger cluster managers. +It also keeps the contribution surface accessible: tags, inventory output, and playbook artifacts are easy for new contributors to inspect, reproduce, and improve. +If a site already uses BCM or another fleet manager, keep that system authoritative and use DeepOps only for the roles and playbooks the site intentionally delegates. + ## Pre-requisites In order to set up and use MAAS, you should at minimum have the following components: -- An Ubuntu 22.04 or 24.04 server which you can use to run MAAS +- An Ubuntu 22.04 or 24.04 server which you can use to run MAAS. + MAAS 3.7 is the current Ubuntu 24.04 line; use MAAS 3.5 if the controller itself remains on Ubuntu 22.04. 
- One or more servers which you will manage using MAAS - A network connection between all the servers on which you can safely run DHCP. This is needed so that MAAS can provision IP addresses to the nodes it manages. - A network connection which you can use to log into the MAAS server. This may be the same network as the inter-node network, or it may be a separate network. @@ -73,8 +92,10 @@ Please consult your hypervisor documentation for instructions on doing this. maas_dns_domain: 'deepops.local' maas_region_controller: '192.168.1.1' maas_region_controller_url: 'http://{{ maas_region_controller }}:5240/MAAS' - maas_repo: 'ppa:maas/3.5' + maas_repo: "{{ 'ppa:maas/3.7' if ansible_distribution_version is version('24.04', '>=') else 'ppa:maas/3.5' }}" ``` + If you know every MAAS controller host is running Ubuntu 24.04, you can set this directly to `ppa:maas/3.7`. + If the controller is still Ubuntu 22.04, keep `ppa:maas/3.5`. 1. Run the Ansible playbook to install: ```bash ansible-playbook -l playbooks/provisioning/maas.yml diff --git a/docs/pxe/minimal-pxe-container.md b/docs/pxe/minimal-pxe-container.md deleted file mode 100644 index bf68ab39e..000000000 --- a/docs/pxe/minimal-pxe-container.md +++ /dev/null @@ -1,69 +0,0 @@ -# Minimal PXE Container - -Minimal containers for OS installation - -- [Minimal PXE Container](#minimal-pxe-container) - - [Requirements](#requirements) - - [Installation Steps](#installation-steps) - - [IPMI Command Reference](#ipmi-command-reference) - -## Requirements - -- Control machine connected to the same VLAN/subnet as target machines -- Docker installed on control machine - -## Installation Steps - -This process should run from a Linux system on the same network segment as the target nodes. - -1. Install docker. - - ```bash - ./scripts/generic/install_docker.sh - ``` - -2. (Optional) Start DHCP server. 
- - If you have an existing DHCP server, skip this step - - ```bash - # Modify listen interface, DHCP range, and network gateway IP - docker-compose -f src/containers/pxe/docker-compose.yml run -d dhcp dnsmasq -d --interface=ens192 --dhcp-range=192.168.1.100,192.168.1.199,7200 --dhcp-option=6,8.8.8.8 --dhcp-option=3,192.168.1.1 - ``` - -3. (Optional) Configure NAT routing. - - If you have an existing network gateway, skip this step - - ```bash - # Set eth0 and eth1 to your public and private interfaces, respectively - ./scripts/pxe/setup_nat.sh eth0 eth1 - ``` - -4. Start PXE server. - - ```bash - docker-compose -f src/containers/pxe/docker-compose.yml up -d pxe - ``` - -5. Install OS. - - Set servers to boot from the network for the next boot only (to avoid re-install loops) and reboot them to install the OS. - - The default credentials are: - - - Username: `ubuntu` - - Password: `deepops` - -## IPMI Command Reference - -```bash -# Set to boot from disk, always -# Dell -chassis bootdev disk options=persistent -# DGX-1 -raw 0x00 0x08 0x05 0xe0 0x08 0x00 0x00 0x00 - -# Set to boot from the network, next boot only -chassis bootdev pxe options=efiboot -``` diff --git a/docs/slurm-cluster/README.md b/docs/slurm-cluster/README.md index 3db80e343..e5842cd4f 100644 --- a/docs/slurm-cluster/README.md +++ b/docs/slurm-cluster/README.md @@ -27,7 +27,8 @@ Instructions for deploying a GPU cluster with Slurm 1. Install a supported operating system on all nodes. - Install a supported operating system on all servers via a 3rd-party solution (i.e. [MAAS](https://maas.io/), [Foreman](https://www.theforeman.org/)) or utilize the provided [OS install container](../pxe). + Install a supported operating system on all servers via a 3rd-party solution such as [MAAS](https://maas.io/) or [Foreman](https://www.theforeman.org/), or via an existing site-standard automated installer. 
+ For new Ubuntu 24.04 or DGX OS 7 deployments, prefer Ubuntu autoinstall/cloud-init or MAAS and then apply DeepOps after the OS is present. 2. Set up your provisioning machine. diff --git a/docs/slurm-cluster/slurm-perf-cluster.md b/docs/slurm-cluster/slurm-perf-cluster.md index 804fcc985..6b252f0d8 100644 --- a/docs/slurm-cluster/slurm-perf-cluster.md +++ b/docs/slurm-cluster/slurm-perf-cluster.md @@ -44,7 +44,8 @@ These packages have been installed and tested with the following Linux distribut 1. Install a supported operating system on all nodes. - Install a supported operating system on all servers utilizing the [DGXie](/docs/pxe/dgxie-container.md) provisioning container, via a 3rd-party solution (i.e. [MAAS](https://maas.io/), [Foreman](https://www.theforeman.org/)), or server BMC/console. + Install a supported operating system on all servers via a 3rd-party solution such as [MAAS](https://maas.io/) or [Foreman](https://www.theforeman.org/), via an existing site-standard automated installer, or through server BMC/console. + For new Ubuntu 24.04 or DGX OS 7 deployments, prefer Ubuntu autoinstall/cloud-init or MAAS. > NOTE: During OS installation, it is ideal if the identical user/password is configured. Otherwise, follow step 4 below to create an identical user across all nodes in the cluster. diff --git a/scripts/pxe/build_and_restart_dgxie.sh b/scripts/pxe/build_and_restart_dgxie.sh deleted file mode 100755 index f0d0ca46c..000000000 --- a/scripts/pxe/build_and_restart_dgxie.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash -set -xe - -source config/pxe/env - -compose_directory_cmd="" #"--project-directory ." 
-compose_cmd="docker-compose --env-file ./config/pxe/env ${compose_directory} -f ${COMPOSE_FILE}" - - -function tear_down() { - ${compose_cmd} down -} - -function build() { - ${compose_cmd} build -} - -function bring_up() { - ${compose_cmd} up -d -} - - -tear_down -build -bring_up diff --git a/scripts/pxe/dgxctl.sh b/scripts/pxe/dgxctl.sh deleted file mode 100755 index c875d6218..000000000 --- a/scripts/pxe/dgxctl.sh +++ /dev/null @@ -1,270 +0,0 @@ -#!/usr/bin/env bash - -# Set configuration -. config/pxe/ipmi.conf - - -OPTIND=1 -IPMI_HOST_LIST="config/pxe/ipmi_host_list" -install=0 -progress=0 -upgrade=0 -config_host= -host_list=0 -install_log=0 -ssh_check=0 -show_bmc=0 -validate=0 -ipmi= -retry=0 -fw_target=all - -usage () { - echo "Manage DGX install/upgrade/configure/check process" - echo - echo "Usage: $0 [arguments]" - echo - echo "General Arguments:" - echo " -h Show help" - echo " -x Show list of DGX host IP addresses from DHCP server" - echo " -y Show DGX server install log from provisioning container" - echo " -z Verify DGX are available via SSH connection" - echo " -b Show DGX BMC IP matched with host IP (use with -z)" - echo " -w Control DGX power status via IPMI (options: on, off)" - echo - echo "Install Arguments:" - echo " -i Run install process" - echo " -p Show install progress" - echo " -f Use alternate IP file (default: ${IPMI_HOST_LIST})" - echo - echo "Update/Configuration arguments:" - echo " -u Run upgrade and configure process on all hosts" - echo " -l Run upgrade and configure process on single host" - echo " -q Re-run only failed upgrade/configuration tasks" - echo " -r Update FRU (default: only update SBIOS and BMC)" - echo - echo "Validation/check arguments:" - echo " -v Run the validation checks on all hosts" - echo " -l Run the validation checks on a single host" - exit 0 -} - -while getopts "h?pif:ul:rxyzbvw:q" opt; do - case "$opt" in - h|\?) 
- usage - ;; - i) install=1 - ;; - p) progress=1 - ;; - f) IPMI_HOST_LIST="${OPTARG}" - ;; - u) upgrade=1 - ;; - l) config_host="${OPTARG}" - ;; - r) fw_target="FRU" - ;; - x) host_list=1 - ;; - y) install_log=1 - ;; - z) ssh_check=1 - ;; - b) show_bmc=1 - ;; - v) validate=1 - ;; - w) ipmi="${OPTARG}" - ;; - q) retry=1 - ;; - *) - usage - ;; - esac -done -shift $((OPTIND-1)) -[ "$1" == "--" ] && shift - -test -f "${IPMI_HOST_LIST}" -if [ $? -ne 0 ] ; then - echo File not found: "${IPMI_HOST_LIST}" - exit 1 -fi - -if [ "${progress}" -eq 1 ] ; then - # Chassis identify information: - # $ sudo ipmitool -I lanplus -H -U dgxuser -P dgxuser chassis identify force - # Chassis identify interval: indefinite - # $ sudo ipmitool -I lanplus -H -U dgxuser -P dgxuser raw 0x00 0x01 - # 41 10 60 10 - # $ sudo ipmitool -I lanplus -H -U dgxuser -P dgxuser chassis identify 0 - # Chassis identify interval: off - # $ sudo ipmitool -I lanplus -H -U dgxuser -P dgxuser raw 0x00 0x01 - # 41 10 40 10 - echo "Install progress (host list: ${IPMI_HOST_LIST})": - echo - while read -u10 IPMI_HOST_IP ; do - echo -n "${IPMI_HOST_IP}: " - chassis_ident_state=$(sudo ipmitool -I lanplus -U ${IPMI_USERNAME} -P ${IPMI_PASSWORD} -H ${IPMI_HOST_IP} raw 0x00 0x01 | awk '{print $3}') - if [ "${chassis_ident_state}" == "60" ] ; then - echo installing... - elif [ "${chassis_ident_state}" == "40" ] ; then - echo finished - else - echo status unknown... - fi - done 10<${IPMI_HOST_LIST} -elif [ "${install}" -eq 1 ] ; then - echo Initiating PXE install process via BMC host list: ${IPMI_HOST_LIST} - while read -u10 IPMI_HOST_IP ; do - echo -n "${IPMI_HOST_IP}: " - - # make sure BMC is reachable - sudo ipmitool -I lanplus -U ${IPMI_USERNAME} -P ${IPMI_PASSWORD} -H ${IPMI_HOST_IP} bmc info >/dev/null 2>&1 - if [ $? 
-ne 0 ] ; then - echo "Error communicating with BMC" - continue - fi - echo -n "available | " - - # disable IPMI boot device selection 60s timeout - sudo ipmitool -I lanplus -U ${IPMI_USERNAME} -P ${IPMI_PASSWORD} -H ${IPMI_HOST_IP} raw 0x00 0x08 0x03 0x08 >/dev/null 2>&1 - if [ $? -ne 0 ] ; then - echo -n "config ERROR" - continue - fi - echo -n "config(1) | " - - # set baseline (boot to disk, efi, persistent) - sudo ipmitool -I lanplus -U ${IPMI_USERNAME} -P ${IPMI_PASSWORD} -H ${IPMI_HOST_IP} raw 0x00 0x08 0x05 0xe0 0x08 0x00 0x00 0x00 >/dev/null 2>&1 - if [ $? -ne 0 ] ; then - echo -n "config ERROR" - continue - fi - echo -n "config(2) | " - - # set boot device to PXE, EFI, next boot only. Needed when defaulting to install vs boot local disk - sudo ipmitool -I lanplus -U ${IPMI_USERNAME} -P ${IPMI_PASSWORD} -H ${IPMI_HOST_IP} chassis bootdev pxe options=efiboot >/dev/null 2>&1 - if [ $? -ne 0 ] ; then - echo -n "pxe ERROR" - continue - fi - echo -n "pxe | " - - # check that we have the correct bitmask - a004000000 - BOOT_CODE=a004000000 - boot_param=$(sudo ipmitool -I lanplus -U ${IPMI_USERNAME} -P ${IPMI_PASSWORD} -H ${IPMI_HOST_IP} chassis bootparam get 0x05 | egrep "^Boot parameter data:" | awk '{print $4}') - if [ "${boot_param}" != "${BOOT_CODE}" ] ; then - echo "Error: boot parameter incorrect (${boot_param})" - continue - fi - - # power off/on host - sudo ipmitool -I lanplus -U ${IPMI_USERNAME} -P ${IPMI_PASSWORD} -H ${IPMI_HOST_IP} power off >/dev/null 2>&1 - if [ $? -ne 0 ] ; then - echo "power off ERROR" - continue - fi - sleep 5 - sudo ipmitool -I lanplus -U ${IPMI_USERNAME} -P ${IPMI_PASSWORD} -H ${IPMI_HOST_IP} power on >/dev/null 2>&1 - if [ $? -ne 0 ] ; then - echo "power on ERROR" - continue - fi - echo "reset | installing..." 
- - # turn on chassis identifier light to indicate installation is in progress - sudo ipmitool -I lanplus -U ${IPMI_USERNAME} -P ${IPMI_PASSWORD} -H ${IPMI_HOST_IP} chassis identify force >/dev/null 2>&1 - - done 10<${IPMI_HOST_LIST} - echo The install process will take approximately 15 minutes -elif [ "${upgrade}" -eq 1 ] || [ "${validate}" -eq 1 ] ; then - # Get host list and generate ansible inventory file - host_list=$(curl -s localhost/hosts | grep 54:ab:3a | awk '{print $2}') - - inventory_file=$(mktemp) - echo "[all:vars]" > "${inventory_file}" - echo "ansible_user=${DGX_USERNAME}" >> "${inventory_file}" - echo "ansible_ssh_pass=${DGX_PASSWORD}" >> "${inventory_file}" - echo "ansible_sudo_pass=${DGX_PASSWORD}" >> "${inventory_file}" - echo "[hosts]" >> "${inventory_file}" >> "${inventory_file}" - - for host in ${host_list} ; do - echo "${host}" >> "${inventory_file}" - # remove stale host key in case we did a re-install - ssh-keygen -f "~/.ssh/known_hosts" -R "${host}" >/dev/null 2>&1 - done - - # Run configuration scripts - if [ "${upgrade}" -eq 1 ] ; then - echo "The upgrade process will take upwards of 30 minutes" - echo "Start time: $(date)" - - if [ "${retry}" -eq 1 ] ; then - ansible-playbook -i "${inventory_file}" -l "@${ANSIBLE_REPO}/playbook.retry" "${ANSIBLE_REPO}/playbook.yml" - elif [ "x${config_host}" != "x" ] ; then - ansible-playbook -i "${inventory_file}" -l "${config_host}" "${ANSIBLE_REPO}/playbook.yml" --extra-vars \"target_fw=${fw_target}\" - else - ansible-playbook -i "${inventory_file}" -l hosts "${ANSIBLE_REPO}/playbook.yml" --extra-vars \"target_fw=${fw_target}\" - fi - elif [ "${validate}" -eq 1 ] ; then - echo "The validation process will take approximately 10 minutes" - echo "Start time: $(date)" - - if [ "x${config_host}" != "x" ] ; then - echo ansible-playbook -i "${inventory_file}" -l "${config_host}" "${ANSIBLE_REPO}/playbook.yml" - else - echo ansible-playbook -i "${inventory_file}" -l hosts "${ANSIBLE_REPO}/playbook.yml" - fi 
- fi - - rm -f "${inventory_file}" - echo "End time: $(date)" -elif [ "${host_list}" -eq 1 ] ; then - curl localhost/hosts -elif [ "${install_log}" -eq 1 ] ; then - curl localhost/log -elif [ "${ssh_check}" -eq 1 ] ; then - # Get host list - host_list=$(curl -s localhost/hosts | grep 54:ab:3a | awk '{print $2}') - # Check hosts - for host in ${host_list} ; do - echo -n "${host}: " - ping -c1 "${host}" >/dev/null 2>&1 - if [ $? -ne 0 ] ; then - echo "unavailable" - continue - fi - # remove stale host key in case we did a re-install - ssh-keygen -f "${HOME}/.ssh/known_hosts" -R "${host}" >/dev/null 2>&1 - if [ "${show_bmc}" -eq 1 ] ; then - sshpass -p "${DGX_PASSWORD}" ssh -oStrictHostKeyChecking=no dgxuser@"${host}" "echo ${DGX_PASSWORD} | sudo -S ipmitool lan print 1 2>/dev/null | egrep '^IP Address' | tail -1 | awk '{print \$4}'" 2>&1 | grep -v Warning - else - sshpass -p "${DGX_PASSWORD}" ssh -oStrictHostKeyChecking=no dgxuser@"${host}" uptime 2>&1 | grep -v Warning - fi - done -elif [ "x${ipmi}" != "x" ] ; then - # Power on/off DGX via IPMI - if [ "x${config_host}" != "x" ] ; then - echo -n "${config_host}: " - if [ "${ipmi}" == "on" ] ; then - sudo ipmitool -I lanplus -U ${IPMI_USERNAME} -P ${IPMI_PASSWORD} -H "${config_host}" power on - elif [ "${ipmi}" == "off" ] ; then - sudo ipmitool -I lanplus -U ${IPMI_USERNAME} -P ${IPMI_PASSWORD} -H "${config_host}" power off - fi - else - while read -u10 IPMI_HOST_IP ; do - echo -n "${IPMI_HOST_IP}: " - if [ "${ipmi}" == "on" ] ; then - sudo ipmitool -I lanplus -U ${IPMI_USERNAME} -P ${IPMI_PASSWORD} -H ${IPMI_HOST_IP} power on - elif [ "${ipmi}" == "off" ] ; then - sudo ipmitool -I lanplus -U ${IPMI_USERNAME} -P ${IPMI_PASSWORD} -H ${IPMI_HOST_IP} power off - fi - done 10<${IPMI_HOST_LIST} - fi -else - usage -fi diff --git a/scripts/pxe/install_dgxie_prereqs.sh b/scripts/pxe/install_dgxie_prereqs.sh deleted file mode 100755 index 1895c4e92..000000000 --- a/scripts/pxe/install_dgxie_prereqs.sh +++ /dev/null @@ -1,43 
+0,0 @@ -#!/usr/bin/env bash - -# install required software -sudo apt-get update -sudo apt-get install -y git ipmitool vim software-properties-common sshpass - -# install docker -type docker >/dev/null 2>&1 -if [ $? -ne 0 ] ; then - sudo apt-get install -y apt-transport-https ca-certificates curl software-properties-common - curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - - sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" - sudo apt-get update - sudo apt-get install -y docker-ce - docker --version -fi - -# install docker-compose -type docker-compose >/dev/null 2>&1 -if [ $? -ne 0 ] ; then - sudo curl -L https://github.com/docker/compose/releases/download/1.17.0/docker-compose-`uname -s`-`uname -m` -o /usr/local/bin/docker-compose - sudo chmod +x /usr/local/bin/docker-compose - docker-compose --version -fi - -# install ansible -type ansible >/dev/null 2>&1 -if [ $? -ne 0 ] ; then - sudo apt-add-repository -y ppa:ansible/ansible - sudo apt-get update - sudo apt-get -y install ansible - ansible --version -fi - -# configure ISO mount -grep DGXServer /etc/fstab -if [ $? -ne 0 ] ; then - mkdir -p /mnt/3.1.2 - echo "${HOME}/DGXServer-3.1.2.170902_f8777e.iso /mnt/3.1.2 iso9660 loop 0 0" | sudo tee -a /etc/fstab -fi - -# remove SUDO password requirement -sudo sed -i "s/^\%sudo.*/\%sudo ALL=\(ALL:ALL\) NOPASSWD: ALL/g" /etc/sudoers diff --git a/scripts/pxe/setup_nat.sh b/scripts/pxe/setup_nat.sh deleted file mode 100755 index 9d6183579..000000000 --- a/scripts/pxe/setup_nat.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env bash - -export HOST_INT_PUB="${1}" -export HOST_INT_PRV="${2}" - -ip a show dev "${HOST_INT_PUB}" -if [ $? -ne 0 ] ; then - exit 1 -fi - -ip a show dev "${HOST_INT_PRV}" -if [ $? 
-ne 0 ] ; then - exit 1 -fi - -set -x -sudo /sbin/iptables -t nat -A POSTROUTING -o ${HOST_INT_PUB} -j MASQUERADE -sudo /sbin/iptables -A FORWARD -i ${HOST_INT_PUB} -o ${HOST_INT_PRV} -m state --state RELATED,ESTABLISHED -j ACCEPT -sudo /sbin/iptables -A FORWARD -i ${HOST_INT_PRV} -o ${HOST_INT_PUB} -j ACCEPT -sudo sysctl -w net.ipv4.ip_forward=1 -set +x diff --git a/src/containers/dgxie/Dockerfile b/src/containers/dgxie/Dockerfile deleted file mode 100644 index b5498f9c3..000000000 --- a/src/containers/dgxie/Dockerfile +++ /dev/null @@ -1,28 +0,0 @@ -FROM ubuntu:16.04 - -MAINTAINER Douglas Holt - -RUN apt-get update && \ - apt-get -y install apt-transport-https curl && \ - curl -L https://packagecloud.io/danderson/pixiecore/gpgkey | apt-key add - && \ - echo "deb https://packagecloud.io/danderson/pixiecore/debian stretch main" >/etc/apt/sources.list.d/pixiecore.list && \ - apt-get update && \ - apt-get -y install pixiecore nginx vsftpd iptables dnsmasq python-flask - -RUN mkdir -p /www /var/run/vsftpd/empty /www/vmware - -COPY get_hosts.py /usr/local/bin -COPY rest_api.py /usr/local/bin -COPY api.py /api.py -COPY start /usr/sbin/start -COPY nginx.conf /etc/nginx/nginx.conf -COPY dnsmasq.conf /etc/dnsmasq.conf -COPY vsftpd.conf /etc/vsftpd.conf - -COPY kickstart.cfg /www/vmware -COPY mboot.efi /www/vmware - -VOLUME /etc/dnsmasq.d - -ENTRYPOINT ["/bin/bash"] -CMD ["/usr/sbin/start"] diff --git a/src/containers/dgxie/README.md b/src/containers/dgxie/README.md deleted file mode 100644 index 046388588..000000000 --- a/src/containers/dgxie/README.md +++ /dev/null @@ -1,461 +0,0 @@ - -DGXie: PXE Boot DGX Install Environment -=== - -This repo is intended to be used to automate provisioning of DGX-1 Servers over the network. The tools will automatically configure, reboot, re-install and re-configure the DGX with minimal commands. - -You will need a computer (i.e. a laptop) running Ubuntu 16.04 which has two network connections, i.e built-in and USB/Ethernet adapter. 
- -If you are setting up a laptop from scratch, see the "Setting up a new laptop" section at the end of the README - -Otherwise, follow each of the sections below in order: - -## Connect the laptop to the network - -Connect the built-in ethernet to the private network where the DGX are connected, and the usb-ethernet dongle to a public, internet accessible network, if available. - The install process does not require an internet connection, but it may be useful later for updating or installing extra software. - -This assumes the built-in ethernet interface is the "public" network and the usb-ethernet dongle is the "private" network where the DGX are connected - -In addition to the network connection to the DGX data network, the laptop will need to have access to the management network where the DGX BMCs are connected. - This can be shared through the private interface connected to the DGX data network or through the second interface connected to the public network. - -In some cases you may need to: -* Disconnect the network uplink from DGX data networks -* Set the DHCP helper IP in the ToR switches to point to the private interface on the laptop where DGXie runs its servers - -The private interface on the laptop should have a static IP/netmask/etc. (i.e 192.168.1.1/24) - -To configure the private interface on the laptop, you can either use the Network Manager GUI in Ubuntu Desktop, or configure - via `nmcli` in a terminal: - -List network devices: - -```console -nmcli con show -``` - -Modify settings of private interface: - -```console -sudo nmcli c modify "Belkin USB-ethernet adapter" ipv4.addresses 192.168.1.1/24 ipv4.dns 8.8.8.8 ipv4.gateway 192.168.1.1 -``` - -Show device properties: - -```console -nmcli con show "Belkin USB-ethernet adapter" -``` - -Activate interface: - -```console -sudo nmcli con up "Belkin USB-ethernet adapter" -``` - -### BMC considerations - -You will need a list of BMC IP addresses for each DGX Server to be provisioned. 
- -The laptop will likely need a second network connection to the BMC network, this can be accomplished by adding a second IP address to the network interface - used to connect to the DGX network, or by connecting a second network interface, such as the USB-ethernet adapter, to the network and assigning an IP in - the BMC subnet. - -If the DHCP server for the BMC network is disconnected, a logical choice for IP address would be the gateway IP for the BMC subnet. You can get this IP - from Penguin or from a DGX BMC via IPMI. - -## Run the Dgxie service container - -If you need to modify the default network settings, modify `docker-compose.yml` to edit the environment variables to configure DGXie. - See the section at the end of the README for all configuration options. You may need to configure the DGXie container with - public and private interface names, and IP information to configure the DHCP server if you are not using 192.168.1.0/24. - -Run containers: - -```console -sudo docker-compose build -sudo docker-compose up -d -``` - -Make sure the containers are running with: `sudo docker ps -a` - -You can check the container logs with (you may have to substitute a different container name): `sudo docker logs deepops_dgxie_1` - -## Provision DGX - -The DGX install process will take approximately **15** minutes. - -**Steps:** -* Modify the *ipmi_host_list* script to contain the BMC IP address of each DGX which needs to be re-imaged, one per line -* Modify the *configuration* file to contain the username and password of the DGX BMCs. This file also contains the default username and password of the DGX ISO, - which should not need to be changed. -* Run *dgxctl.sh* with the `-i` flag to start the install process (see example below) - -The *dgxctl.sh* script will iterate over each BMC IP address, one at a time, making sure it's up, disabling the boot order timeout, setting the system to PXE boot - and power-cycling the system via BMC. 
- -The DGX will immediately power-cycle and attempt to boot from the connected network interface. The DGXie container provides the DHCP and PXE server which the DGX - will use to automatically run the install process without user intervention. When the install process is finished, the DGX will automatically reboot and boot to - the first hard disk, which now contains the DGX Server OS. - -The *dgxctl.sh* script will turn on the DGX chassis identification light during the install. If there is a DGX with a suspected install problem, you can either - check the DGX via virtual console on the BMC IP address or connect a physical console to the affected DGX, identified by a lit chassis identification light. - -The default BMC credentials are: - -```console -Username: dgxuser -Password: dgxuser -``` - -The default DGX OS user login credentials are: - -```console -Username: dgxuser -Password: DgxUser123 -``` - -Here's an example of running *dgxctl.sh* and the expected output: - -```console -$ ./dgxctl.sh -i -Initiating PXE install process via BMC host list: ipmi_host_list - -10.0.1.1: available | config(1) | config(2) | pxe | reset | installing... -10.0.1.2: available | config(1) | config(2) | pxe | reset | installing... -``` - -If a system fails, you'll see an error: - -```console -$ ./dgxctl.sh -i -Initiating PXE install process via BMC host list: ipmi_host_list - -10.0.1.1: Error communicating with BMC on host: 10.0.1.1 -``` - -You can check the install progress with the `-p` flag: - -```console -$ ./dgxctl.sh -p -Install progress (host list: ipmi_host_list): - -10.0.1.1: installing... -10.0.1.2: installing... - -$ ./dgxctl.sh -p -Install progress (host list: ipmi_host_list): - -10.0.1.1: finished -10.0.1.2: finished -``` - -If you want to specify a different BMC IP file, use the `-f ` flag. You can see the help options with the `-h` flag. 
- -### Provisioning API - -DGXie provides several methods to obtain data, these should be run from the laptop outside of the DGXie container - -Get list of hosts: - -```console -$ ./dgxctl.sh -x -+------------------------------------------------------------------------------ -| DHCPD ACTIVE LEASES REPORT -+-----------------+-------------------+----------------------+----------------- -| IP Address | MAC Address | Expires (days,H:M:S) | Client Hostname -+-----------------+-------------------+----------------------+----------------- -| 192.168.1.3 | 54:ab:3a:d6:61:9d | 11:43:56 | -| 192.168.1.4 | 54:ab:3a:da:c4:8b | 11:43:53 | -+-----------------+-------------------+----------------------+----------------- -| Total Active Leases: 2 -| Report generated (UTC): 2017-11-09 15:16:51 -+------------------------------------------------------------------------------ -``` - -Get list of finished installs: - -```console -$ ./dgxctl.sh -y -== LOG OPENED == -2017-11-09 15:02:23.464795: start - 192.168.1.4 -2017-11-09 15:02:23.798555: start - 192.168.1.3 -2017-11-09 15:08:43.272009: end - 192.168.1.4 -2017-11-09 15:08:52.148569: end - 192.168.1.3 -``` - -### Monitoring the DGX install and confirming it has completed - -Once the output of `./dgxctl.sh -y` and `./dgxctl.sh -p` show that the install has ended/finished, it will take a few additional minutes for the DGX - to reboot and boot into the new operating system on the disk. 
- -You can run the command below to check whether the DGX are ready and available to move on to the next steps: - -```console -$ ./dgxctl.sh -z -192.168.1.3: 07:29:51 up 10:44, 0 users, load average: 0.16, 0.16, 0.11 -192.168.1.4: 07:29:51 up 10:45, 0 users, load average: 0.19, 0.14, 0.15 -``` - -## End - -## Misc Tasks - -**Power on/off all DGX via IPMI** - -```console -$ ./dgxctl.sh -w on -10.0.1.1: Chassis Power Control: Up/On -10.0.1.2: Chassis Power Control: Up/On -``` - -**Power on/off a single DGX via IPMI** - -```console -$ ./dgxctl.sh -w off -l 10.0.1.1 -10.0.1.1: Chassis Power Control: Down/Off -``` - -# Optional information - -This information is not required if you are using a pre-configured laptop. It's left here for reference if you are starting from scratch. - -## Setting up a new laptop - -### Setting up a system to run the DGXie tools: - -You will need a laptop running Ubuntu 16.04, which has two network connections, i.e wireless and USB/Ethernet adapter. - One connection should have access to the internet (public), - while the other connection should be to a dedicated network (private) containing the DGX to be provisioned. - -Download the DGX Server ISO from the Enterprise Support Portal: https://nvidia-esp.custhelp.com - -*Currently tested with DGX Server 3.1.2 170902 f8777e* - -Place the ISO in the user home directory, e.g. 
`${HOME}/DGXServer-3.1.2.170902_f8777e.iso` - -Run the *install_prereqs.sh* script on the Ubuntu 16.04 Linux laptop: - -```console -./install_prereqs.sh -``` - -DGXie: PXE Boot DGX Install Environment - container components -=== - -DGXie is a Docker container application for remotely installing the official DGX Server operating system over the network - -DGXie contains: - * DHCP server - * Provides PXE boot environment and DGX network settings - * TFTP server - * Provides PXE bootstrap files - * FTP server - * Provides a repo for the official DGX install ISO - * HTTP server - * Provides additional files such as a modified install pre-seed - * NAT setup - * Provides internet access to the DGX network through the system running DGXie - * REST API - * Provides list of host IP addresses from DHCP leases - -DGXie should be run on a system connected to a network of DGX servers. - The DGX servers are set to boot from the network interface connected to this network and will present a menu of boot options. - Current boot options are to boot to the local disk (default) or to install the DGX operating system on the DGX. - __There are no additional prompts after the menu, and the DGX will be completely erased during the install process.__ - -## Network topology - -(public network)------[DGXie system]------(private network)------[DGX server systems] - -The computer running this container can be on either a single network or two networks (public/private). - -The container will attempt to set up NAT routing from a private to public subnet. 
- -## Prerequisites - -The computer running DGXie needs Docker and should be capable of running IPtables for NAT to work (Linux) - -Download the DGX Server ISO from the Enterprise Support Portal: https://nvidia-esp.custhelp.com - -Disable the boot order update timeout (required): - -```console -# disable IPMI boot device selection 60s timeout -ipmitool -I lanplus -U -P -H raw 0x00 0x08 0x03 0x08 -``` - -Set your DGX to boot from the network (PXE) for the next boot only: - -```console -# set boot device to PXE, EFI, next boot only. Needed when defaulting to install vs boot local disk -ipmitool -I lanplus -U -P -H chassis bootdev pxe options=efiboot -``` - -### Optional/Misc - -You can also run the IPMI commands directly on the DGX via `ipmitool`: - -```console -sudo ipmitool raw ... -``` - -To set the DGX to boot from disk first: - -```console -# set boot device to first disk, EFI, persistent -ipmitool -I lanplus -U -P -H raw 0x00 0x08 0x05 0xe0 0x08 0x00 0x00 0x00 -``` - -You can set the DGX to boot from the network every time, but if DGXie is set to default to install, this can create a re-install loop - -```console -# set boot device to PXE, EFI, persistent -ipmitool -I lanplus -U -P -H raw 0x00 0x08 0x05 0xe0 0x04 0x00 0x00 0x00 -``` - -## Setup and running - -Mount the DGX Server ISO as a volume when running the container. - -*Tested with DGX Base OS 3.1.2* - -```console -sudo mkdir -p /mnt/3.1.2 -sudo mount -o loop DGXServer-3.1.2.170902_f8777e.iso /mnt/3.1.2 -``` - -Add an IP to your private interface (DGX network) if required - -```console -sudo ip addr add 192.168.1.1/24 broadcast 192.168.1.255 dev ens192 -``` - -Build and run the container - -```console -docker build -t dgxie . -docker run -d --privileged --net=host -v /mnt/3.1.2:/iso:ro --name dgxie dgxie -``` - -The `--privileged` and `--net=host` flags are required to manipulate IPTABLES on the host. 
- -### DGXie configuration options - -#### Provisioning host configuration - -Specify DGXie server public network interface - -```console -# default: eth0 --e HOST_INT_PUB=ens160 -``` - -Specify DGXie server private network interface - -```console -# default: eth1 --e HOST_INT_PRV=ens192 -``` - -#### Provisioning network configuration - -Options to configure the DHCP server subnet, these options are probably required - -Specify DHCP/PXE server IP address (IP of machine running DGXie on DGX network) - -```console -# default: 192.168.1.1 --e IP=10.0.0.1 -``` - -Specify DHCP/PXE server network subnet - -```console -# default: 192.168.1.0 --e NETWORK=10.0.0.0 -``` - -Specify DHCP/PXE server subnet netmask - -```console -# default: 255.255.255.0 --e NETMASK=255.255.255.0 -``` - -Specify DHCP/PXE server subnet gateway - -```console -# default: (192.168.1.1) --e GATEWAY=10.0.0.254 -``` - -Specify DHCP/PXE server DNS - -```console -# default: 8.8.8.8, 8.8.4.4 --e DNS1=10.1.1.1 -e DNS2=10.2.2.2 -``` - -Specify DHCP/PXE server lease range start/end - -```console -# default: 192.168.1.2 192.168.1.254 --e DHCP_START=10.0.0.2 -e DHCP_END=10.0.0.254 -``` - -#### DGX install options - -These options are probably optional unless you're installing on something other than a DGX - -Use a different interface on DGX clients - -```console -# default: enp1s0f0 --e INT=eth0 -``` - -Use a different disk for the root partition on DGX clients - -```console -# default: sda --e DISK=xvda -``` - -### Examples: - -DGXie container running on a VM and provisioning DGX with the default subnet options. The VM has a public and private interface on two different VLANs. 
- -```console -sudo docker run --rm -ti --net=host --privileged -v /mnt/3.1.2:/iso:ro -e HOST_INT_PUB=ens160 -e HOST_INT_PRV=ens192 --name dgxie dgxie -``` - -Attach to a running container: - -```console -docker exec -ti dgxie /bin/sh -``` - -Show current DGX boot flags: - -```console -sudo ipmitool chassis bootparam get 0x05 -``` - -## Default user/pass - -DGX installations will default to these login credentials: - -user: dgxuser - -pass: DgxUser123 - -## REST API - -DGXie will output a list of uniq IP address of DGX-1 servers via REST API. Adjust `localhost` to the host running DGXie if not running from the same machine: - -```console -curl http://localhost/hosts -``` - -DHCPD lease file parse script source: https://askubuntu.com/questions/219609/how-do-i-show-active-dhcp-leases diff --git a/src/containers/dgxie/api.py b/src/containers/dgxie/api.py deleted file mode 100755 index f839bb685..000000000 --- a/src/containers/dgxie/api.py +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/python -from flask import Flask, abort, request -import json -import datetime -import re -import os - -app = Flask(__name__) - -@app.route('/v1/boot/') -def pxe(mac): - '''See https://github.com/danderson/netboot/blob/master/pixiecore/README.api.md for API specs''' - # load machine profiles for each call so we can re-load changes from disk - jf = open('/etc/machines/machines.json', 'r') - machines = json.load(jf) - jf.close() - - if "HTTP_PORT" in os.environ.keys(): - http_port = os.environ['HTTP_PORT'] - else: - http_port = "13370" - - # return profile in json for matching machine - for machine in machines: - if 'mac' in machines[machine] and re.match(machines[machine]['mac'], mac): - machines[machine]['mac'] = mac - - machines[machine]['kernel'] = machines[machine]['kernel'].replace("$HTTP_PORT", http_port) - if 'cmdline' in machines[machine]: - machines[machine]['cmdline'] = machines[machine]['cmdline'].replace("$HTTP_PORT", http_port) - if 'initrd' in machines[machine]: - for i in 
range(len(machines[machine]['initrd'])): - machines[machine]['initrd'][i] = machines[machine]['initrd'][i].replace("$HTTP_PORT", http_port) - - return json.dumps(machines[machine]) - abort(404) - -@app.route('/install', methods=['POST']) -def install(): - if request.method == 'POST': - timestamp = datetime.datetime.now() - ip = request.environ.get('HTTP_X_REAL_IP', request.remote_addr) - action = request.form['action'] - print timestamp, ip - return 'done' - -if __name__ == '__main__': - app.run(port=9090, threaded=True) diff --git a/src/containers/dgxie/dnsmasq.conf b/src/containers/dgxie/dnsmasq.conf deleted file mode 100644 index 2d434177b..000000000 --- a/src/containers/dgxie/dnsmasq.conf +++ /dev/null @@ -1,24 +0,0 @@ -domain-needed -bogus-priv -strict-order -no-resolv -no-poll -expand-hosts -cache-size=2048 -bind-interfaces - -server=#DNS1# -server=#DNS2# -domain=#DOMAIN# -interface=#DHCP_INT# - -log-queries -log-dhcp -log-facility=/var/log/dnsmasq.log - -dhcp-authoritative -dhcp-range=#DHCP_START#,#DHCP_END#,#LEASETIME# -dhcp-option=tag:green,option:domain-search,#DOMAIN# -dhcp-option=3,#GATEWAY# - -conf-dir=/etc/dnsmasq.d,*.conf diff --git a/src/containers/dgxie/docker-compose.yml b/src/containers/dgxie/docker-compose.yml deleted file mode 100644 index 2b018fcd1..000000000 --- a/src/containers/dgxie/docker-compose.yml +++ /dev/null @@ -1,49 +0,0 @@ -# sudo docker run --rm -ti --net=host --privileged -v /mnt/:/iso:ro -e HOST_INT_PUB=ens160 -e HOST_INT_PRV=ens192 --name dgxie dgxie -# sudo docker run -d --name nfs --privileged --net=host -v /home/ubuntu/:/master -e SHARED_DIRECTORY=/master nfs-server:latest -version: '3.4' -services: - dgxie: - build: - context: . 
- network: "host" - network_mode: "host" - privileged: true - volumes: - - ${DATA_DIR}:/data:ro # path to ISO files - - ../../config/pxe/machines:/etc/machines:ro - - ../../config/pxe/dnsmasq.extra.conf:/etc/dnsmasq.d/dnsmasq.extra.conf:ro - environment: - - DHCP_ENABLE=${DHCP_ENABLE} - - DHCP_INT=${DHCP_INT} - - NETWORK=${NETWORK} - - NETMASK=${NETMASK} - - GATEWAY=${GATEWAY} - - DNS1=${DNS1} - - DNS2=${DNS2} - - DHCP_START=${DHCP_START} - - DHCP_END=${DHCP_END} - - LEASETIME=${LEASETIME} - - DOMAIN=${DOMAIN} - - NTP=${NTP} - - NAT_ENABLE=${NAT_ENABLE} - - NAT_INT_PUB=${NAT_INT_PUB} - - NAT_INT_PRV=${NAT_INT_PRV} - - HTTP_INT=${HTTP_INT} - - HTTP_PORT=${HTTP_PORT} - - DGX_INT=${DGX_INT} - - DGX_DISK=${DGX_DISK} - - DGX_KBD=${DGX_KBD} - - DGX_KERN_EXTRA=${DGX_KERN_EXTRA} - - DGX_EXTRA_PACKAGES=${DGX_EXTRA_PACKAGES} - - DGX_HTTPS_PROXY=${DGX_HTTPS_PROXY} - - VMW_NFS_IP=${VMW_NFS_IP} - - VMW_GPU_VIB=${VMW_GPU_VIB} - - VMW_VM_SIZE=${VMW_VM_SIZE} - nfs-server: - image: itsthenetwork/nfs-server-alpine:latest - network_mode: "host" - privileged: true - volumes: - - ${DATA_DIR}:/shared - environment: - - SHARED_DIRECTORY=/shared diff --git a/src/containers/dgxie/get_hosts.py b/src/containers/dgxie/get_hosts.py deleted file mode 100755 index 2b594bcc1..000000000 --- a/src/containers/dgxie/get_hosts.py +++ /dev/null @@ -1,258 +0,0 @@ -#!/usr/bin/python -import datetime, bisect - -def parse_timestamp(raw_str): - tokens = raw_str.split() - - if len(tokens) == 1: - if tokens[0].lower() == 'never': - return 'never'; - - else: - raise Exception('Parse error in timestamp') - - elif len(tokens) == 3: - return datetime.datetime.strptime(' '.join(tokens[1:]), - '%Y/%m/%d %H:%M:%S') - - else: - raise Exception('Parse error in timestamp') - - -def timestamp_is_ge(t1, t2): - if t1 == 'never': - return True - - elif t2 == 'never': - return False - - else: - return t1 >= t2 - - -def timestamp_is_lt(t1, t2): - if t1 == 'never': - return False - - elif t2 == 'never': - return t1 != 'never' 
- - else: - return t1 < t2 - - -def timestamp_is_between(t, tstart, tend): - return timestamp_is_ge(t, tstart) and timestamp_is_lt(t, tend) - - -def parse_hardware(raw_str): - tokens = raw_str.split() - - if len(tokens) == 2: - return tokens[1] - - else: - raise Exception('Parse error in hardware') - - -def strip_endquotes(raw_str): - return raw_str.strip('"') - - -def identity(raw_str): - return raw_str - - -def parse_binding_state(raw_str): - tokens = raw_str.split() - - if len(tokens) == 2: - return tokens[1] - - else: - raise Exception('Parse error in binding state') - - -def parse_next_binding_state(raw_str): - tokens = raw_str.split() - - if len(tokens) == 3: - return tokens[2] - - else: - raise Exception('Parse error in next binding state') - - -def parse_rewind_binding_state(raw_str): - tokens = raw_str.split() - - if len(tokens) == 3: - return tokens[2] - - else: - raise Exception('Parse error in next binding state') - - -def parse_leases_file(leases_file): - valid_keys = { - 'starts': parse_timestamp, - 'ends': parse_timestamp, - 'tstp': parse_timestamp, - 'tsfp': parse_timestamp, - 'atsfp': parse_timestamp, - 'cltt': parse_timestamp, - 'hardware': parse_hardware, - 'binding': parse_binding_state, - 'next': parse_next_binding_state, - 'rewind': parse_rewind_binding_state, - 'uid': strip_endquotes, - 'client-hostname': strip_endquotes, - 'option': identity, - 'set': identity, - 'on': identity, - 'abandoned': None, - 'bootp': None, - 'reserved': None, - } - - leases_db = {} - - lease_rec = {} - in_lease = False - in_failover = False - - for line in leases_file: - if line.lstrip().startswith('#'): - continue - - tokens = line.split() - - if len(tokens) == 0: - continue - - key = tokens[0].lower() - - if key == 'lease': - if not in_lease: - ip_address = tokens[1] - - lease_rec = {'ip_address' : ip_address} - in_lease = True - - else: - raise Exception('Parse error in leases file') - - elif key == 'failover': - in_failover = True - elif key == '}': - if 
in_lease: - for k in valid_keys: - if callable(valid_keys[k]): - lease_rec[k] = lease_rec.get(k, '') - else: - lease_rec[k] = False - - ip_address = lease_rec['ip_address'] - - if ip_address in leases_db: - leases_db[ip_address].insert(0, lease_rec) - - else: - leases_db[ip_address] = [lease_rec] - - lease_rec = {} - in_lease = False - - elif in_failover: - in_failover = False - continue - else: - raise Exception('Parse error in leases file') - - elif key in valid_keys: - if in_lease: - value = line[(line.index(key) + len(key)):] - value = value.strip().rstrip(';').rstrip() - - if callable(valid_keys[key]): - lease_rec[key] = valid_keys[key](value) - else: - lease_rec[key] = True - - else: - raise Exception('Parse error in leases file') - - else: - if in_lease: - raise Exception('Parse error in leases file') - - if in_lease: - raise Exception('Parse error in leases file') - - return leases_db - - -def round_timedelta(tdelta): - return datetime.timedelta(tdelta.days, - tdelta.seconds + (0 if tdelta.microseconds < 500000 else 1)) - - -def timestamp_now(): - n = datetime.datetime.utcnow() - return datetime.datetime(n.year, n.month, n.day, n.hour, n.minute, - n.second + (0 if n.microsecond < 500000 else 1)) - - -def lease_is_active(lease_rec, as_of_ts): - return timestamp_is_between(as_of_ts, lease_rec['starts'], - lease_rec['ends']) - - -def ipv4_to_int(ipv4_addr): - parts = ipv4_addr.split('.') - return (int(parts[0]) << 24) + (int(parts[1]) << 16) + \ - (int(parts[2]) << 8) + int(parts[3]) - - -def select_active_leases(leases_db, as_of_ts): - retarray = [] - sortedarray = [] - - for ip_address in leases_db: - lease_rec = leases_db[ip_address][0] - - if lease_is_active(lease_rec, as_of_ts): - ip_as_int = ipv4_to_int(ip_address) - insertpos = bisect.bisect(sortedarray, ip_as_int) - sortedarray.insert(insertpos, ip_as_int) - retarray.insert(insertpos, lease_rec) - - return retarray - - -############################################################################## - - 
-myfile = open('/var/lib/dhcp/dhcpd.leases', 'r') -leases = parse_leases_file(myfile) -myfile.close() - -now = timestamp_now() -report_dataset = select_active_leases(leases, now) - -print('+------------------------------------------------------------------------------') -print('| DHCPD ACTIVE LEASES REPORT') -print('+-----------------+-------------------+----------------------+-----------------') -print('| IP Address | MAC Address | Expires (days,H:M:S) | Client Hostname ') -print('+-----------------+-------------------+----------------------+-----------------') - -for lease in report_dataset: - print('| ' + format(lease['ip_address'], '<15') + ' | ' + \ - format(lease['hardware'], '<17') + ' | ' + \ - format(str((lease['ends'] - now) if lease['ends'] != 'never' else 'never'), '>20') + ' | ' + \ - lease['client-hostname']) - -print('+-----------------+-------------------+----------------------+-----------------') -print('| Total Active Leases: ' + str(len(report_dataset))) -print('| Report generated (UTC): ' + str(now)) -print('+------------------------------------------------------------------------------') diff --git a/src/containers/dgxie/kickstart.cfg b/src/containers/dgxie/kickstart.cfg deleted file mode 100644 index cdc149cc9..000000000 --- a/src/containers/dgxie/kickstart.cfg +++ /dev/null @@ -1,97 +0,0 @@ -#Accept the VMware End User License Agreement -vmaccepteula - -#clear paritions and install -clearpart --firstdisk --overwritevmfs -install --firstdisk --overwritevmfs - -#set the root password -rootpw d33pops! 
- -#Host Network Settings -network --bootproto=dhcp --addvmportgroup=1 - -reboot - -#Firstboot section 1 -%firstboot --interpreter=busybox - - -#Enable & start remote ESXi Shell (SSH) -vim-cmd hostsvc/enable_ssh -vim-cmd hostsvc/start_ssh - - -#Enable & start ESXi Shell (TSM) -vim-cmd hostsvc/enable_esx_shell -vim-cmd hostsvc/start_esx_shell - - -#Suppress Shell Warning -esxcli system settings advanced set -o /UserVars/SuppressShellWarning -i 1 -esxcli system settings advanced set -o /UserVars/ESXiShellTimeOut -i 1 - -#Disable ipv6 -esxcli network ip set --ipv6-enabled=0 - -# NTP Configuration -cat > /etc/ntp.conf << __NTP_CONFIG__ -restrict default kod nomodify notrap noquerynopeer -restrict 127.0.0.1 -server 0.us.pool.ntp.org -server 1.us.pool.ntp.org -server 2.us.pool.ntp.org -__NTP_CONFIG__ -/sbin/chkconfig ntpd on - -# enable High Performance -esxcli system settings advanced set --option=/Power/CpuPolicy --string-value="High Performance" - -# Disable CEIP -esxcli system settings advanced set -o /UserVars/HostClientCEIPOptIn -i 2 - -#Mount NFS -esxcli storage nfs41 add --hosts #VMW_NFS_IP# --share=/ --volume-name=nfsstore - -#Enable maintaince mode -esxcli system maintenanceMode set -e true - -#copy GRID driver -mkdir /vmfs/volumes/datastore1/vGPU_Driver -cp /vmfs/volumes/nfsstore/#VMW_GPU_VIB# /vmfs/volumes/datastore1/vGPU_Driver/ - -#install vGPU driver -esxcli software vib install -v /vmfs/volumes/datastore1/vGPU_Driver/#VMW_GPU_VIB# - -#enable vGPU mode -esxcli graphics host set --default-type=SharedPassthru - -#disable ECC -nvidia-smi -e 0 - -#Disable maintaince mode -esxcli system maintenanceMode set -e false - -#copy vmx files -mkdir /vmfs/volumes/datastore1/imported -cp /vmfs/volumes/nfsstore/*.vmx /vmfs/volumes/datastore1/imported/ - -#Create VMDK 001 -for disk in $(find /vmfs/volumes/datastore1/imported/ -type f -name \*.vmx -exec grep vmdk {} \; | awk '{print $NF}' | sed 's/\"//g') ; do - vmkfstools -c #VMW_VM_SIZE# 
/vmfs/volumes/datastore1/imported/${disk} -d thin -done - -#Register VMs to Host -find /vmfs/volumes/datastore1/imported/ -type f -name \*.vmx -exec vim-cmd solo/registervm {} \; - -#copy %first boot script logs to persisted datastore -cp /var/log/hostd.log "/vmfs/volumes/datastore1/firstboot-hostd.log" -cp /var/log/esxi_install.log "/vmfs/volumes/datastore1/firstboot-esxi_install.log" - -reboot - -#power on first VM -#vim-cmd vmsvc/power.on 1 - -#power on second VM -#vim-cmd vmsvc/power.on 2 diff --git a/src/containers/dgxie/mboot.efi b/src/containers/dgxie/mboot.efi deleted file mode 100644 index 52edcf5ea..000000000 Binary files a/src/containers/dgxie/mboot.efi and /dev/null differ diff --git a/src/containers/dgxie/nginx.conf b/src/containers/dgxie/nginx.conf deleted file mode 100644 index 82affda43..000000000 --- a/src/containers/dgxie/nginx.conf +++ /dev/null @@ -1,43 +0,0 @@ -#user www; -worker_processes auto; -#daemon on; - -error_log /var/log/nginx/error.log warn; -pid /var/run/nginx.pid; - -events { - worker_connections 1024; -} - -http { - include /etc/nginx/mime.types; - default_type application/octet-stream; - sendfile on; - access_log /var/log/nginx/access.log; - keepalive_timeout 3000; - server { - listen #HTTP_PORT#; - root /www; - index index.html index.htm; - autoindex on; - server_name localhost; - client_max_body_size 32m; - error_page 500 502 503 504 /50x.html; - location /50x.html { - root /var/lib/nginx/html; - } - location /hosts { - proxy_pass http://127.0.0.1:5000; - } - location /log { - proxy_pass http://127.0.0.1:5000; - } - location /install { - proxy_pass http://127.0.0.1:5000; - proxy_set_header X-Real-IP $remote_addr; - } - location /v1 { - proxy_pass http://127.0.0.1:9090; - } - } -} diff --git a/src/containers/dgxie/start b/src/containers/dgxie/start deleted file mode 100755 index 27d0d229f..000000000 --- a/src/containers/dgxie/start +++ /dev/null @@ -1,127 +0,0 @@ -#!/bin/sh - -### CONFIG -MNT=${MNT:-/data} -ISO=${ISO:-/iso} 
-VISO=${VISO:-/vmware_iso} -PRESEED=${ISO}/preseed/pxe.template.seed -DGX_INT=${DGX_INT:-enp1s0f0} -DGX_DISK=${DGX_DISK:-sda} -DGX_KBD=${DGX_KBD:-us} -DGX_KERN_EXTRA=${DGX_KERN_EXTRA:-} -DGX_EXTRA_PACKAGES=${DGX_EXTRA_PACKAGES:-} -DGX_HTTPS_PROXY=${DGX_HTTPS_PROXY:-} -HTTP_INT=${HTTP_INT:-eth1} -HTTP_PORT=${HTTP_PORT:-13370} -DHCP_INT=${DHCP_INT:-eth1} -NAT_INT_PUB=${NAT_INT_PUB:-eth0} -NAT_INT_PRV=${NAT_INT_PRV:-eth1} -NETWORK=${NETWORK:-192.168.1.0} -NETMASK=${NETMASK:-255.255.255.0} -GATEWAY=${GATEWAY:-192.168.1.1} -DNS1=${DNS1:-8.8.8.8} -DNS2=${DNS2:-8.8.4.4} -DHCP_START=${DHCP_START:-192.168.1.100} -DHCP_END=${DHCP_END:-192.168.1.199} -LEASETIME=${LEASETIME:-7200} -DOMAIN=${DOMAIN:-local} -NTP=${NTP:-pool.ntp.org} -VMW_NFS_IP=${VMW_NFS_IP:-192.168.1.1} -VMW_GPU_VIB=${VMW_GPU_VIB:-NVIDIA-VMware_ESXi_6.7_Host_Driver-418.66-1OEM.670.0.0.8169922.x86_64.vib} -VMW_VM_SIZE=${VMW_VM_SIZE:-100g} -### - -HTTP_IP=$(ip -4 addr show ${HTTP_INT} | grep -oP '(?<=inet\s)\d+(\.\d+){3}') - -mkdir -p "${ISO}" -mkdir -p "${VISO}" - -# DGX installer ISO -dgx_iso="$(find ${MNT} -name DGXServer\*.iso)" -if [ "z${dgx_iso}" != "z" ] ; then - mount -o loop -t iso9660 "$dgx_iso" "${ISO}" - cp ${PRESEED} /www/dgx.seed - PRESEED=/www/dgx.seed - ln -sf ${ISO} /www/iso - - # Modify preseed - sed -i "s//${HTTP_IP}/g" ${PRESEED} - sed -i "s/^d-i live-installer.*/d-i live-installer\/net-image string ftp:\/\/${HTTP_IP}\/install\/filesystem.squashfs/g" ${PRESEED} - sed -i "s/^d-i mirror\/ftp\/hostname.*/d-i mirror\/ftp\/hostname string ${HTTP_IP}/g" ${PRESEED} - sed -i "s/^d-i apt-setup\/local0\/repository.*/d-i apt-setup\/local0\/repository string deb ftp:\/\/${HTTP_IP}\/ bionic main multiverse restricted universe/g" ${PRESEED} - sed -i "s/^d-i mirror\/ftp\/directory.*/d-i mirror\/ftp\/directory string \//g" ${PRESEED} - sed -i "s/^d-i apt-setup\/security_host.*/d-i apt-setup\/security_host string ${HTTP_IP}/g" ${PRESEED} - - # Try to get install process to fix BIOS boot order on pre-3.18 BMC 
firmwares; sets DGX to boot from first disk - # Turn off chassis identify light - sed -i "s/in-target sh -c \"logger preseed\/late_command: end\"\;/modprobe -v ipmi_devintf\; \\\\\n\\t in-target sh -c \"sudo ipmitool raw 0x00 0x08 0x05 0xe0 0x08 0x00 0x00 0x00\"\; \\\\\n\\t in-target sh -c \"sudo ipmitool chassis identify 0\"\; \\\\\n\\t in-target sh -c \"curl -X POST -d action=end ${HTTP_IP}:${HTTP_PORT}\/install\"\; \\\\\n\\tin-target sh -c \"logger preseed\/late_command: end\"\;/g" ${PRESEED} - sed -i "s~^d-i mirror/https/proxy.*~d-i mirror/https/proxy string ${DGX_HTTPS_PROXY}~g" ${PRESEED} - sed -i "s~^d-i pkgsel/include string ~d-i pkgsel/include string ${DGX_EXTRA_PACKAGES} ~g" ${PRESEED} - - # Add early command to log install start - #sed -i "s/^# Use default/d-i preseed\/early_command string wget --post-data action=start ${HTTP_IP}\/install\\n\\n# Use default/g" ${PRESEED} - #sed -i "s/early_command string/early_command string wget --post-data action=start 192.168.1.1\/install \;/g" ${PRESEED} - - # Useful for Xen VM - sed -i "s/\/dev\/sda/\/dev\/${DGX_DISK}/g" ${PRESEED} - sed -i "s/enp1s0f0/${DGX_INT}/g" ${PRESEED} - - # FTP - sed -i "s~/srv/ftp~${ISO}~g" /etc/passwd -fi - -# Mount VMware ESXi installer -vmware_iso="$(find ${MNT} -name VMware-VMvisor-Installer\*.iso)" -if [ "z${vmware_iso}" != "z" ] ; then - mount -o loop -t iso9660 "$vmware_iso" "${VISO}" - ln -sf ${VISO} /www/viso - cp "${VISO}/boot.cfg" /www/vmware/boot.cfg - sed -i "s~^kernel=~kernel=http://${HTTP_IP}:${HTTP_PORT}/viso~g" /www/vmware/boot.cfg - sed -i "s~^kernelopt=.*~kernelopt=ks=http://${HTTP_IP}:${HTTP_PORT}/vmware/kickstart.cfg~g" /www/vmware/boot.cfg - sed -i "s~^modules=~modules=http://${HTTP_IP}:${HTTP_PORT}/viso~g" /www/vmware/boot.cfg - sed -i "s~--- /~--- http://${HTTP_IP}:${HTTP_PORT}/viso/~g" /www/vmware/boot.cfg - sed -i "s/\#VMW_NFS_IP\#/${VMW_NFS_IP}/g" /www/vmware/kickstart.cfg - sed -i "s/\#VMW_GPU_VIB\#/${VMW_GPU_VIB}/g" /www/vmware/kickstart.cfg - sed -i 
"s/\#VMW_VM_SIZE\#/${VMW_VM_SIZE}/g" /www/vmware/kickstart.cfg -fi - -# DHCP Server -sed -i "s/\#DHCP_INT\#/${DHCP_INT}/g" /etc/dnsmasq.conf -sed -i "s/\#LEASETIME\#/${LEASETIME}/g" /etc/dnsmasq.conf -sed -i "s/\#DOMAIN\#/${DOMAIN}/g" /etc/dnsmasq.conf -sed -i "s/\#NETWORK\#/${NETWORK}/g" /etc/dnsmasq.conf -sed -i "s/\#NETMASK\#/${NETMASK}/g" /etc/dnsmasq.conf -sed -i "s/\#GATEWAY\#/${GATEWAY}/g" /etc/dnsmasq.conf -sed -i "s/\#DNS1\#/${DNS1}/g" /etc/dnsmasq.conf -sed -i "s/\#DNS2\#/${DNS2}/g" /etc/dnsmasq.conf -sed -i "s/\#DHCP_START\#/${DHCP_START}/g" /etc/dnsmasq.conf -sed -i "s/\#DHCP_END\#/${DHCP_END}/g" /etc/dnsmasq.conf -if [ ! -z "${NTP}" ]; then - sed -i "s/\#NTP\#/${NTP}/g" /etc/dnsmasq.conf - sed -i "s/\#dhcp-option=42,/dhcp-option=42,/g" /etc/dnsmasq.conf -fi - -# HTTP Server -sed -i "s/\#HTTP_PORT\#/${HTTP_PORT}/g" /etc/nginx/nginx.conf - -# Set up NAT -if [ "${NAT_ENABLE}" -ne 0 ]; then - # NAT - /sbin/iptables -t nat -A POSTROUTING -o ${NAT_INT_PUB} -j MASQUERADE - /sbin/iptables -A FORWARD -i ${NAT_INT_PUB} -o ${NAT_INT_PRV} -m state --state RELATED,ESTABLISHED -j ACCEPT - /sbin/iptables -A FORWARD -i ${NAT_INT_PRV} -o ${NAT_INT_PUB} -j ACCEPT - sysctl -w net.ipv4.ip_forward=1 -fi - -# Run some servers -nginx & -/usr/sbin/vsftpd /etc/vsftpd.conf & -/usr/local/bin/rest_api.py >/dev/null 2>&1 & -python /api.py & -/usr/bin/pixiecore api http://127.0.0.1:${HTTP_PORT} --dhcp-no-bind --debug --port 81 |& tee -a /var/log/pixiecore.log & -if [ ${DHCP_ENABLE} -ne 0 ] ; then - dnsmasq - tail -f /var/log/dnsmasq.log /var/log/pixiecore.log -else - tail -f /var/log/nginx/* /var/log/pixiecore.log -fi diff --git a/src/containers/dgxie/vsftpd.conf b/src/containers/dgxie/vsftpd.conf deleted file mode 100644 index 54ed0efee..000000000 --- a/src/containers/dgxie/vsftpd.conf +++ /dev/null @@ -1,123 +0,0 @@ -# Example config file /etc/vsftpd.conf -# -# The default compiled in settings are fairly paranoid. 
This sample file -# loosens things up a bit, to make the ftp daemon more usable. -# Please see vsftpd.conf.5 for all compiled in defaults. -# -# READ THIS: This example file is NOT an exhaustive list of vsftpd options. -# Please read the vsftpd.conf.5 manual page to get a full idea of vsftpd's -# capabilities. -# -# Allow anonymous FTP? (Beware - allowed by default if you comment this out). -anonymous_enable=YES -# -# Uncomment this to allow local users to log in. -#local_enable=YES -# -# Uncomment this to enable any form of FTP write command. -#write_enable=YES -# -# Default umask for local users is 077. You may wish to change this to 022, -# if your users expect that (022 is used by most other ftpd's) -#local_umask=022 -# -# Uncomment this to allow the anonymous FTP user to upload files. This only -# has an effect if the above global write enable is activated. Also, you will -# obviously need to create a directory writable by the FTP user. -#anon_upload_enable=YES -# -# Uncomment this if you want the anonymous FTP user to be able to create -# new directories. -#anon_mkdir_write_enable=YES -# -# Activate directory messages - messages given to remote users when they -# go into a certain directory. -dirmessage_enable=YES -# -# Activate logging of uploads/downloads. -xferlog_enable=YES -# -# Make sure PORT transfer connections originate from port 20 (ftp-data). -connect_from_port_20=YES -# -# If you want, you can arrange for uploaded anonymous files to be owned by -# a different user. Note! Using "root" for uploaded files is not -# recommended! -#chown_uploads=YES -#chown_username=whoever -# -# You may override where the log file goes if you like. The default is shown -# below. -#xferlog_file=/var/log/vsftpd.log -# -# If you want, you can have your log file in standard ftpd xferlog format. -# Note that the default log file location is /var/log/xferlog in this case. -#xferlog_std_format=YES -# -# You may change the default value for timing out an idle session. 
-#idle_session_timeout=600 -# -# You may change the default value for timing out a data connection. -#data_connection_timeout=120 -# -# It is recommended that you define on your system a unique user which the -# ftp server can use as a totally isolated and unprivileged user. -#nopriv_user=ftpsecure -# -# Enable this and the server will recognise asynchronous ABOR requests. Not -# recommended for security (the code is non-trivial). Not enabling it, -# however, may confuse older FTP clients. -#async_abor_enable=YES -# -# By default the server will pretend to allow ASCII mode but in fact ignore -# the request. Turn on the below options to have the server actually do ASCII -# mangling on files when in ASCII mode. -# Beware that on some FTP servers, ASCII support allows a denial of service -# attack (DoS) via the command "SIZE /big/file" in ASCII mode. vsftpd -# predicted this attack and has always been safe, reporting the size of the -# raw file. -# ASCII mangling is a horrible feature of the protocol. -#ascii_upload_enable=YES -#ascii_download_enable=YES -# -# You may fully customise the login banner string: -#ftpd_banner=Welcome to blah FTP service. -# -# You may specify a file of disallowed anonymous e-mail addresses. Apparently -# useful for combatting certain DoS attacks. -#deny_email_enable=YES -# (default follows) -#banned_email_file=/etc/vsftpd.banned_emails -# -# You may specify an explicit list of local users to chroot() to their home -# directory. If chroot_local_user is YES, then this list becomes a list of -# users to NOT chroot(). -# (Warning! chroot'ing can be very dangerous. If using chroot, make sure that -# the user does not have write access to the top level directory within the -# chroot) -#chroot_local_user=YES -#chroot_list_enable=YES -# (default follows) -#chroot_list_file=/etc/vsftpd.chroot_list -# -# You may activate the "-R" option to the builtin ls. 
This is disabled by -# default to avoid remote users being able to cause excessive I/O on large -# sites. However, some broken FTP clients such as "ncftp" and "mirror" assume -# the presence of the "-R" option, so there is a strong case for enabling it. -#ls_recurse_enable=YES -# -# When "listen" directive is enabled, vsftpd runs in standalone mode and -# listens on IPv4 sockets. This directive cannot be used in conjunction -# with the listen_ipv6 directive. -listen=YES -# -# This directive enables listening on IPv6 sockets. To listen on IPv4 and IPv6 -# sockets, you must run two copies of vsftpd with two configuration files. -# Make sure, that one of the listen options is commented !! -#listen_ipv6=YES -local_enable=YES -passwd_chroot_enable=yes -seccomp_sandbox=NO -pasv_enable=Yes -pasv_max_port=10100 -pasv_min_port=10090 diff --git a/src/containers/pixiecore/Dockerfile b/src/containers/pixiecore/Dockerfile deleted file mode 100644 index d609ed998..000000000 --- a/src/containers/pixiecore/Dockerfile +++ /dev/null @@ -1,26 +0,0 @@ -FROM golang:1.10.2 - -ENV COMMIT_HASH 03425a0d6ae36852d5ea7b446571bbcd3829d717 -ENV CUSTOM_FORK_AUTHOR deepops -RUN apt-get update -RUN apt-get install -qy --no-install-recommends wget git -RUN [ -d ${GOPATH}/bin ] || mkdir ${GOPATH}/bin -RUN go get -u github.com/golang/dep/cmd/dep -RUN mkdir -p ${GOPATH}/src/go.universe.tf -WORKDIR /go/src/go.universe.tf -RUN git clone https://github.com/google/netboot.git -WORKDIR /go/src/go.universe.tf/netboot -RUN git remote add ${CUSTOM_FORK_AUTHOR} https://github.com/${CUSTOM_FORK_AUTHOR}/netboot.git && git fetch ${CUSTOM_FORK_AUTHOR} && git checkout ${COMMIT_HASH} -RUN dep ensure -RUN ls -al ./vendor -WORKDIR /go/src -RUN CGO_ENABLED=0 GOOS=linux go build -o /bin/pixiecore -ldflags "-w -s -v -extldflags -static" go.universe.tf/netboot/cmd/pixiecore - -FROM alpine:3.6 -MAINTAINER Douglas Holt - -RUN apk add --no-cache ca-certificates -COPY --from=0 /bin/pixiecore /usr/bin/pixiecore -RUN chmod +x 
/usr/bin/pixiecore - -ENTRYPOINT ["/usr/bin/pixiecore"] diff --git a/src/containers/pxe/dhcp/Dockerfile b/src/containers/pxe/dhcp/Dockerfile deleted file mode 100644 index 3555906ab..000000000 --- a/src/containers/pxe/dhcp/Dockerfile +++ /dev/null @@ -1,11 +0,0 @@ -FROM ubuntu:16.04 - -MAINTAINER Douglas Holt - -RUN apt-get update && \ - apt-get -y install dnsmasq - -VOLUME /etc/dnsmasq.d - -#ENTRYPOINT ["dnsmasq"] -CMD ["dnsmasq", "-d"] diff --git a/src/containers/pxe/dhcp/dnsmasq.conf b/src/containers/pxe/dhcp/dnsmasq.conf deleted file mode 100644 index cd6f8d9e5..000000000 --- a/src/containers/pxe/dhcp/dnsmasq.conf +++ /dev/null @@ -1,18 +0,0 @@ -domain-needed -bogus-priv -strict-order -no-resolv -no-poll -expand-hosts -cache-size=2048 -bind-interfaces - -server=8.8.8.8 -server=8.8.4.4 -domain=localdomain - -log-queries -log-dhcp -log-facility=/var/log/dnsmasq.log - -conf-dir=/etc/dnsmasq.d,*.conf diff --git a/src/containers/pxe/docker-compose.yml b/src/containers/pxe/docker-compose.yml deleted file mode 100644 index c03428223..000000000 --- a/src/containers/pxe/docker-compose.yml +++ /dev/null @@ -1,28 +0,0 @@ -version: '3.2' -services: - dhcp: - build: dhcp - network_mode: "host" - privileged: true - volumes: - - type: bind - source: ./dhcp/dnsmasq.conf - target: /etc/dnsmasq.conf - pxe-coreos: - network_mode: "host" - image: danderson/pixiecore - command: ["boot", - "https://alpha.release.core-os.net/amd64-usr/current/coreos_production_pxe.vmlinuz", - "https://alpha.release.core-os.net/amd64-usr/current/coreos_production_pxe_image.cpio.gz", - "--cmdline", - "coreos.autologin", - "--dhcp-no-bind"] - pxe: - network_mode: "host" - image: deepops/pixiecore - command: ["boot", - "http://archive.ubuntu.com/ubuntu/dists/bionic-updates/main/installer-amd64/current/images/netboot/ubuntu-installer/amd64/linux", - "http://archive.ubuntu.com/ubuntu/dists/bionic-updates/main/installer-amd64/current/images/netboot/ubuntu-installer/amd64/initrd.gz", - "--cmdline", - 
"auto=true priority=critical preseed/url=https://bit.ly/nvdeepseed", - "--dhcp-no-bind"] diff --git a/src/containers/pxe/preseed b/src/containers/pxe/preseed deleted file mode 100644 index 67a256a11..000000000 --- a/src/containers/pxe/preseed +++ /dev/null @@ -1,491 +0,0 @@ -#### Contents of the preconfiguration file (for stretch) -### Localization -# Preseeding only locale sets language, country and locale. -d-i debian-installer/locale string en_US - -# The values can also be preseeded individually for greater flexibility. -#d-i debian-installer/language string en -#d-i debian-installer/country string NL -#d-i debian-installer/locale string en_US.UTF-8 -# Optionally specify additional locales to be generated. -#d-i localechooser/supported-locales multiselect en_US.UTF-8, nl_NL.UTF-8 - -# Keyboard selection. -# Disable automatic (interactive) keymap detection. -d-i console-setup/ask_detect boolean false -d-i keyboard-configuration/xkb-keymap select us -# To select a variant of the selected layout: -#d-i keyboard-configuration/xkb-keymap select us(dvorak) -# d-i keyboard-configuration/toggle select No toggling - -### Network configuration -# Disable network configuration entirely. This is useful for cdrom -# installations on non-networked devices where the network questions, -# warning and long timeouts are a nuisance. -#d-i netcfg/enable boolean false - -# netcfg will choose an interface that has link if possible. This makes it -# skip displaying a list if there is more than one interface. -d-i netcfg/choose_interface select auto - -# To set a different link detection timeout (default is 3 seconds). -# Values are interpreted as seconds. -#d-i netcfg/link_wait_timeout string 10 - -# If you have a slow dhcp server and the installer times out waiting for -# it, this might be useful. 
-#d-i netcfg/dhcp_timeout string 60 -#d-i netcfg/dhcpv6_timeout string 60 - -# If you prefer to configure the network manually, uncomment this line and -# the static network configuration below. -#d-i netcfg/disable_autoconfig boolean true - -# If you want the preconfiguration file to work on systems both with and -# without a dhcp server, uncomment these lines and the static network -# configuration below. -#d-i netcfg/dhcp_failed note -#d-i netcfg/dhcp_options select Configure network manually - -# Static network configuration. -# -# IPv4 example -#d-i netcfg/get_ipaddress string 192.168.1.42 -#d-i netcfg/get_netmask string 255.255.255.0 -#d-i netcfg/get_gateway string 192.168.1.1 -#d-i netcfg/get_nameservers string 192.168.1.1 -#d-i netcfg/confirm_static boolean true -# -# IPv6 example -#d-i netcfg/get_ipaddress string fc00::2 -#d-i netcfg/get_netmask string ffff:ffff:ffff:ffff:: -#d-i netcfg/get_gateway string fc00::1 -#d-i netcfg/get_nameservers string fc00::1 -#d-i netcfg/confirm_static boolean true - -# Any hostname and domain names assigned from dhcp take precedence over -# values set here. However, setting the values still prevents the questions -# from being shown, even if values come from dhcp. -d-i netcfg/get_hostname string unassigned-hostname -d-i netcfg/get_domain string unassigned-domain - -# If you want to force a hostname, regardless of what either the DHCP -# server returns or what the reverse DNS entry for the IP is, uncomment -# and adjust the following line. -#d-i netcfg/hostname string ubuntu - -# Disable that annoying WEP key dialog. -d-i netcfg/wireless_wep string -# The wacky dhcp hostname that some ISPs use as a password of sorts. -#d-i netcfg/dhcp_hostname string radish - -# If non-free firmware is needed for the network or other hardware, you can -# configure the installer to always try to load it, without prompting. Or -# change to false to disable asking. 
-#d-i hw-detect/load_firmware boolean true - -### Network console -# Use the following settings if you wish to make use of the network-console -# component for remote installation over SSH. This only makes sense if you -# intend to perform the remainder of the installation manually. -#d-i anna/choose_modules string network-console -#d-i network-console/authorized_keys_url string http://10.0.0.1/openssh-key -#d-i network-console/password password r00tme -#d-i network-console/password-again password r00tme -# Use this instead if you prefer to use key-based authentication -#d-i network-console/authorized_keys_url http://host/authorized_keys - -### Mirror settings -# If you select ftp, the mirror/country string does not need to be set. -#d-i mirror/protocol string ftp -d-i mirror/country string manual -d-i mirror/http/hostname string archive.ubuntu.com -d-i mirror/http/directory string /ubuntu -d-i mirror/http/proxy string - -# Alternatively: by default, the installer uses CC.archive.ubuntu.com where -# CC is the ISO-3166-2 code for the selected country. You can preseed this -# so that it does so without asking. -#d-i mirror/http/mirror select CC.archive.ubuntu.com - -# Suite to install. -#d-i mirror/suite string stretch -# Suite to use for loading installer components (optional). -#d-i mirror/udeb/suite string stretch -# Components to use for loading installer components (optional). -#d-i mirror/udeb/components multiselect main, restricted - -### Account setup -# Skip creation of a root account (normal user account will be able to -# use sudo). The default is false; preseed this to true if you want to set -# a root password. -#d-i passwd/root-login boolean false -# Alternatively, to skip creation of a normal user account. -#d-i passwd/make-user boolean false - -# Root password, either in clear text -#d-i passwd/root-password password r00tme -#d-i passwd/root-password-again password r00tme -# or encrypted using a crypt(3) hash. 
-#d-i passwd/root-password-crypted password [crypt(3) hash] - -# To create a normal user account. -d-i passwd/user-fullname string Ubuntu User -d-i passwd/username string ubuntu -# Normal user's password, either in clear text -d-i passwd/user-password password deepops -d-i passwd/user-password-again password deepops -# or encrypted using a crypt(3) hash. -#d-i passwd/user-password-crypted password [crypt(3) hash] -# Create the first user with the specified UID instead of the default. -#d-i passwd/user-uid string 1010 -# The installer will warn about weak passwords. If you are sure you know -# what you're doing and want to override it, uncomment this. -d-i user-setup/allow-password-weak boolean true - -# The user account will be added to some standard initial groups. To -# override that, use this. -#d-i passwd/user-default-groups string audio cdrom video - -# Set to true if you want to encrypt the first user's home directory. -d-i user-setup/encrypt-home boolean false - -### Clock and time zone setup -# Controls whether or not the hardware clock is set to UTC. -d-i clock-setup/utc boolean true - -# You may set this to any valid setting for $TZ; see the contents of -# /usr/share/zoneinfo/ for valid values. -d-i time/zone string US/Pacific - -# Controls whether to use NTP to set the clock during the install -d-i clock-setup/ntp boolean true -# NTP server to use. The default is almost always fine here. 
-#d-i clock-setup/ntp-server string ntp.example.com - -### i386 specific disk storage -# Activate DASD disks -#d-i s390-dasd/dasd string 0.0.0200,0.0.0300,0.0.0400 - -# DASD configuration; by default dasdfmt (low-level format) if needed -#d-i s390-dasd/auto-format boolean true -#d-i s390-dasd/force-format boolean true - -# zFCP activation and configuration -# d-i s390-zfcp/zfcp string 0.0.1b34:0x400870075678a1b2:0x201480c800000000, \ -# 0.0.1b34:0x400870075679a1b2:0x201480c800000000 - -### Partitioning -## Partitioning example -# If the system has free space you can choose to only partition that space. -# This is only honoured if partman-auto/method (below) is not set. -# Alternatives: custom, some_device, some_device_crypto, some_device_lvm. -#d-i partman-auto/init_automatically_partition select biggest_free - -# Alternatively, you may specify a disk to partition. If the system has only -# one disk the installer will default to using that, but otherwise the device -# name must be given in traditional, non-devfs format (so e.g. /dev/sda -# and not e.g. /dev/discs/disc0/disc). -# For example, to use the first SCSI/SATA hard disk: -#d-i partman-auto/disk string /dev/sda -# In addition, you'll need to specify the method to use. -# The presently available methods are: -# - regular: use the usual partition types for your architecture -# - lvm: use LVM to partition the disk -# - crypto: use LVM within an encrypted partition -d-i partman-auto/method string lvm - -# If one of the disks that are going to be automatically partitioned -# contains an old LVM configuration, the user will normally receive a -# warning. This can be preseeded away... -d-i partman-lvm/device_remove_lvm boolean true -# The same applies to pre-existing software RAID array: -d-i partman-md/device_remove_md boolean true -# And the same goes for the confirmation to write the lvm partitions. 
-d-i partman-lvm/confirm boolean true -d-i partman-lvm/confirm_nooverwrite boolean true - -# For LVM partitioning, you can select how much of the volume group to use -# for logical volumes. -d-i partman-auto-lvm/guided_size string max -#d-i partman-auto-lvm/guided_size string 10GB -#d-i partman-auto-lvm/guided_size string 50% - -# You can choose one of the three predefined partitioning recipes: -# - atomic: all files in one partition -# - home: separate /home partition -# - multi: separate /home, /var, and /tmp partitions -d-i partman-auto/choose_recipe select atomic - -# Or provide a recipe of your own... -# If you have a way to get a recipe file into the d-i environment, you can -# just point at it. -#d-i partman-auto/expert_recipe_file string /hd-media/recipe - -# If not, you can put an entire recipe into the preconfiguration file in one -# (logical) line. This example creates a small /boot partition, suitable -# swap, and uses the rest of the space for the root partition: -#d-i partman-auto/expert_recipe string \ -# boot-root :: \ -# 40 50 100 ext3 \ -# $primary{ } $bootable{ } \ -# method{ format } format{ } \ -# use_filesystem{ } filesystem{ ext3 } \ -# mountpoint{ /boot } \ -# . \ -# 500 10000 1000000000 ext3 \ -# method{ format } format{ } \ -# use_filesystem{ } filesystem{ ext3 } \ -# mountpoint{ / } \ -# . \ -# 64 512 300% linux-swap \ -# method{ swap } format{ } \ -# . - -# If you just want to change the default filesystem from ext3 to something -# else, you can do that without providing a full recipe. -#d-i partman/default_filesystem string ext4 - -# The full recipe format is documented in the file partman-auto-recipe.txt -# included in the 'debian-installer' package or available from D-I source -# repository. This also documents how to specify settings such as file -# system labels, volume group names and which physical devices to include -# in a volume group. 
- -# This makes partman automatically partition without confirmation, provided -# that you told it what to do using one of the methods above. -d-i partman-partitioning/confirm_write_new_label boolean true -d-i partman/choose_partition select finish -d-i partman/confirm boolean true -d-i partman/confirm_nooverwrite boolean true - -## Partitioning using RAID -# The method should be set to "raid". -#d-i partman-auto/method string raid -# Specify the disks to be partitioned. They will all get the same layout, -# so this will only work if the disks are the same size. -#d-i partman-auto/disk string /dev/sda /dev/sdb - -# Next you need to specify the physical partitions that will be used. -#d-i partman-auto/expert_recipe string \ -# multiraid :: \ -# 1000 5000 4000 raid \ -# $primary{ } method{ raid } \ -# . \ -# 64 512 300% raid \ -# method{ raid } \ -# . \ -# 500 10000 1000000000 raid \ -# method{ raid } \ -# . - -# Last you need to specify how the previously defined partitions will be -# used in the RAID setup. Remember to use the correct partition numbers -# for logical partitions. RAID levels 0, 1, 5, 6 and 10 are supported; -# devices are separated using "#". -# Parameters are: -# \ -# - -#d-i partman-auto-raid/recipe string \ -# 1 2 0 ext3 / \ -# /dev/sda1#/dev/sdb1 \ -# . \ -# 1 2 0 swap - \ -# /dev/sda5#/dev/sdb5 \ -# . \ -# 0 2 0 ext3 /home \ -# /dev/sda6#/dev/sdb6 \ -# . - -# For additional information see the file partman-auto-raid-recipe.txt -# included in the 'debian-installer' package or available from D-I source -# repository. - -# This makes partman automatically partition without confirmation. 
-d-i partman-md/confirm boolean true -d-i partman-partitioning/confirm_write_new_label boolean true -d-i partman/choose_partition select finish -d-i partman/confirm boolean true -d-i partman/confirm_nooverwrite boolean true - -## Controlling how partitions are mounted -# The default is to mount by UUID, but you can also choose "traditional" to -# use traditional device names, or "label" to try filesystem labels before -# falling back to UUIDs. -#d-i partman/mount_style select uuid - -### Base system installation -# Configure a path to the preconfigured base filesystem. This can be used to -# specify a path for the installer to retrieve the filesystem image that will -# be deployed to disk and used as a base system for the installation. -#d-i live-installer/net-image string /install/filesystem.squashfs - -# Configure APT to not install recommended packages by default. Use of this -# option can result in an incomplete system and should only be used by very -# experienced users. -#d-i base-installer/install-recommends boolean false - -# The kernel image (meta) package to be installed; "none" can be used if no -# kernel is to be installed. -#d-i base-installer/kernel/image string linux-generic - -### Apt setup -# You can choose to install restricted and universe software, or to install -# software from the backports repository. -#d-i apt-setup/restricted boolean true -#d-i apt-setup/universe boolean true -#d-i apt-setup/backports boolean true -# Uncomment this if you don't want to use a network mirror. -#d-i apt-setup/use_mirror boolean false -# Select which update services to use; define the mirrors to be used. -# Values shown below are the normal defaults. 
-#d-i apt-setup/services-select multiselect security -#d-i apt-setup/security_host string security.ubuntu.com -#d-i apt-setup/security_path string /ubuntu - -# Additional repositories, local[0-9] available -#d-i apt-setup/local0/repository string \ -# http://local.server/ubuntu stretch main -#d-i apt-setup/local0/comment string local server -# Enable deb-src lines -#d-i apt-setup/local0/source boolean true -# URL to the public key of the local repository; you must provide a key or -# apt will complain about the unauthenticated repository and so the -# sources.list line will be left commented out -#d-i apt-setup/local0/key string http://local.server/key - -# By default the installer requires that repositories be authenticated -# using a known gpg key. This setting can be used to disable that -# authentication. Warning: Insecure, not recommended. -#d-i debian-installer/allow_unauthenticated boolean true - -# Uncomment this to add multiarch configuration for i386 -#d-i apt-setup/multiarch string i386 - - -### Package selection -tasksel tasksel/first multiselect server -#tasksel tasksel/first multiselect ubuntu-desktop -#tasksel tasksel/first multiselect lamp-server, print-server -#tasksel tasksel/first multiselect kubuntu-desktop - -# Individual additional packages to install -d-i pkgsel/include string openssh-server -# Whether to upgrade packages after debootstrap. -# Allowed values: none, safe-upgrade, full-upgrade -d-i pkgsel/upgrade select none - -# Language pack selection -#d-i pkgsel/language-packs multiselect de, en, zh - -# Policy for applying updates. May be "none" (no automatic updates), -# "unattended-upgrades" (install security updates automatically), or -# "landscape" (manage system with Landscape). -d-i pkgsel/update-policy select none - -# Some versions of the installer can report back on what software you have -# installed, and what software you use. 
The default is not to report back, -# but sending reports helps the project determine what software is most -# popular and include it on CDs. -popularity-contest popularity-contest/participate boolean false - -# By default, the system's locate database will be updated after the -# installer has finished installing most packages. This may take a while, so -# if you don't want it, you can set this to "false" to turn it off. -d-i pkgsel/updatedb boolean false - -### Boot loader installation -# Grub is the default boot loader (for x86). If you want lilo installed -# instead, uncomment this: -#d-i grub-installer/skip boolean true -# To also skip installing lilo, and install no bootloader, uncomment this -# too: -#d-i lilo-installer/skip boolean true - - -# This is fairly safe to set, it makes grub install automatically to the MBR -# if no other operating system is detected on the machine. -d-i grub-installer/only_debian boolean true - -# This one makes grub-installer install to the MBR if it also finds some other -# OS, which is less safe as it might not be able to boot that other OS. 
-d-i grub-installer/with_other_os boolean true - -# Due notably to potential USB sticks, the location of the MBR can not be -# determined safely in general, so this needs to be specified: -#d-i grub-installer/bootdev string /dev/sda -# To install to the first device (assuming it is not a USB stick): -#d-i grub-installer/bootdev string default - -# Alternatively, if you want to install to a location other than the mbr, -# uncomment and edit these lines: -#d-i grub-installer/only_debian boolean false -#d-i grub-installer/with_other_os boolean false -#d-i grub-installer/bootdev string (hd0,1) -# To install grub to multiple disks: -#d-i grub-installer/bootdev string (hd0,1) (hd1,1) (hd2,1) - -# Optional password for grub, either in clear text -#d-i grub-installer/password password r00tme -#d-i grub-installer/password-again password r00tme -# or encrypted using an MD5 hash, see grub-md5-crypt(8). -#d-i grub-installer/password-crypted password [MD5 hash] - -# Use the following option to add additional boot parameters for the -# installed system (if supported by the bootloader installer). -# Note: options passed to the installer will be added automatically. -d-i debian-installer/add-kernel-opts string nouveau.modeset=0 rd.driver.blacklist=nouveau - -### Finishing up the installation -# During installations from serial console, the regular virtual consoles -# (VT1-VT6) are normally disabled in /etc/inittab. Uncomment the next -# line to prevent this. -#d-i finish-install/keep-consoles boolean true - -# Avoid that last message about the install being complete. -d-i finish-install/reboot_in_progress note - -# This will prevent the installer from ejecting the CD during the reboot, -# which is useful in some situations. -#d-i cdrom-detect/eject boolean false - -# This is how to make the installer shutdown when finished, but not -# reboot into the installed system. -#d-i debian-installer/exit/halt boolean true -# This will power off the machine instead of just halting it. 
-#d-i debian-installer/exit/poweroff boolean true - -### Preseeding other packages -# Depending on what software you choose to install, or if things go wrong -# during the installation process, it's possible that other questions may -# be asked. You can preseed those too, of course. To get a list of every -# possible question that could be asked during an install, do an -# installation, and then run these commands: -# debconf-get-selections --installer > file -# debconf-get-selections >> file - - -#### Advanced options -### Running custom commands during the installation -## i386 Preseed Example -# d-i preseeding is inherently not secure. Nothing in the installer checks -# for attempts at buffer overflows or other exploits of the values of a -# preconfiguration file like this one. Only use preconfiguration files from -# trusted locations! To drive that home, and because it's generally useful, -# here's a way to run any shell command you'd like inside the installer, -# automatically. - -# This first command is run as early as possible, just after -# preseeding is read. -#d-i preseed/early_command string anna-install some-udeb -# This command is run immediately before the partitioner starts. It may be -# useful to apply dynamic partitioner preseeding that depends on the state -# of the disks (which may not be visible when preseed/early_command runs). -#d-i partman/early_command \ -# string debconf-set partman-auto/disk "$(list-devices disk | head -n1)" -# This command is run just before the install finishes, but when there is -# still a usable /target directory. You can chroot to /target and use it -# directly, or use the apt-install and in-target commands to easily install -# packages and run commands in the target system. 
-#d-i preseed/late_command string apt-install zsh; in-target chsh -s /bin/zsh diff --git a/workloads/examples/k8s/services/nfs-dgx-iso.yml b/workloads/examples/k8s/services/nfs-dgx-iso.yml deleted file mode 100644 index 367fa37cb..000000000 --- a/workloads/examples/k8s/services/nfs-dgx-iso.yml +++ /dev/null @@ -1,24 +0,0 @@ -apiVersion: v1 -kind: PersistentVolume -metadata: - name: nfs-dgx-iso -spec: - capacity: - storage: 5Gi - accessModes: - - ReadOnlyMany - nfs: - server: # - path: "/path/to/iso/DGXServer-3.1.2.170902_f8777e" ---- -apiVersion: v1 -kind: PersistentVolumeClaim -metadata: - name: nfs-dgx-iso -spec: - accessModes: - - ReadOnlyMany - storageClassName: "" - resources: - requests: - storage: 5Gi diff --git a/workloads/examples/k8s/services/pxe.yml b/workloads/examples/k8s/services/pxe.yml deleted file mode 100644 index f4a607324..000000000 --- a/workloads/examples/k8s/services/pxe.yml +++ /dev/null @@ -1,27 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: pxe-server -spec: - hostNetwork: true - containers: - - name: pxe-server - image: deepops/provision/pxe # change me - volumeMounts: - - name: config-volume - mountPath: /data - - name: nfs - mountPath: "/iso" - imagePullSecrets: - - name: secret # change me - volumes: - - name: config-volume - configMap: - name: pxe - items: - - key: machines.json - path: machines.json - - name: nfs - persistentVolumeClaim: - claimName: nfs-dgx-iso - restartPolicy: Never diff --git a/workloads/services/k8s/dgxie/.helmignore b/workloads/services/k8s/dgxie/.helmignore deleted file mode 100644 index f0c131944..000000000 --- a/workloads/services/k8s/dgxie/.helmignore +++ /dev/null @@ -1,21 +0,0 @@ -# Patterns to ignore when building packages. -# This supports shell glob matching, relative path matching, and -# negation (prefixed with !). Only one pattern per line. 
-.DS_Store -# Common VCS dirs -.git/ -.gitignore -.bzr/ -.bzrignore -.hg/ -.hgignore -.svn/ -# Common backup files -*.swp -*.bak -*.tmp -*~ -# Various IDEs -.project -.idea/ -*.tmproj diff --git a/workloads/services/k8s/dgxie/Chart.yaml b/workloads/services/k8s/dgxie/Chart.yaml deleted file mode 100644 index 867116268..000000000 --- a/workloads/services/k8s/dgxie/Chart.yaml +++ /dev/null @@ -1,5 +0,0 @@ -apiVersion: v1 -appVersion: "1.0" -description: A Helm chart for Kubernetes -name: dgxie -version: 0.1.2 diff --git a/workloads/services/k8s/dgxie/templates/NOTES.txt b/workloads/services/k8s/dgxie/templates/NOTES.txt deleted file mode 100644 index e69de29bb..000000000 diff --git a/workloads/services/k8s/dgxie/templates/_helpers.tpl b/workloads/services/k8s/dgxie/templates/_helpers.tpl deleted file mode 100644 index 2474f1dc3..000000000 --- a/workloads/services/k8s/dgxie/templates/_helpers.tpl +++ /dev/null @@ -1,32 +0,0 @@ -{{/* vim: set filetype=mustache: */}} -{{/* -Expand the name of the chart. -*/}} -{{- define "dgxie.name" -}} -{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} -{{- end -}} - -{{/* -Create a default fully qualified app name. -We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). -If release name contains chart name it will be used as a full name. -*/}} -{{- define "dgxie.fullname" -}} -{{- if .Values.fullnameOverride -}} -{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} -{{- else -}} -{{- $name := default .Chart.Name .Values.nameOverride -}} -{{- if contains $name .Release.Name -}} -{{- .Release.Name | trunc 63 | trimSuffix "-" -}} -{{- else -}} -{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} -{{- end -}} -{{- end -}} -{{- end -}} - -{{/* -Create chart name and version as used by the chart label. 
-*/}} -{{- define "dgxie.chart" -}} -{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} -{{- end -}} diff --git a/workloads/services/k8s/dgxie/templates/deployment.yaml b/workloads/services/k8s/dgxie/templates/deployment.yaml deleted file mode 100644 index 6c4a30e8a..000000000 --- a/workloads/services/k8s/dgxie/templates/deployment.yaml +++ /dev/null @@ -1,139 +0,0 @@ -apiVersion: apps/v1beta2 -kind: Deployment -metadata: - name: {{ template "dgxie.fullname" . }} - labels: - app: {{ template "dgxie.name" . }} - chart: {{ template "dgxie.chart" . }} - release: {{ .Release.Name }} - heritage: {{ .Release.Service }} -spec: - replicas: {{ .Values.replicaCount }} - selector: - matchLabels: - app: {{ template "dgxie.name" . }} - release: {{ .Release.Name }} - template: - metadata: - labels: - app: {{ template "dgxie.name" . }} - release: {{ .Release.Name }} - spec: - hostNetwork: true - containers: - - name: {{ .Chart.Name }} - image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}" - imagePullPolicy: {{ .Values.image.pullPolicy }} - securityContext: - privileged: true - volumeMounts: - - name: dhcp-config-volume - mountPath: "/etc/dnsmasq.d" - - name: pxe-machines-vol - mountPath: "/etc/machines/" - - name: dhcp-leases - mountPath: "/var/lib/misc/" - - name: cephfs - mountPath: "{{ .Values.mntPath }}" - env: - # Path in container to storage containing ISO image - - name: MNT - value: "{{ .Values.mntPath }}" - # Path in container to mounted DGX ISO - - name: ISO - value: "{{ .Values.isoPath }}" - # Default DGX boot mode - # Options: - # "DGX": boot DGX to install media - # "local": boot DGX to local disk - - name: DEFAULT - value: "{{ .Values.bootMode }}" - # DGX network interface to use during install - - name: INT - value: "{{ .Values.dgxNetInt }}" - # DGX disk to use during install - - name: DISK - value: "{{ .Values.dgxDisk }}" - # DGX Keyboard layout to use - - name: KBD - value: "{{ .Values.dgxKbd }}" - # Extra 
kernel parameters to pass during DGX OS install - # i.e. "rebuild-raid" - - name: KERN_EXTRA - value: "{{ .Values.dgxKernExtra }}" - # Network interface of public network on management servers - - name: HOST_INT_PUB - value: "{{ .Values.mgmtIntPub }}" - # Network interface of private network where DGX are connected on management servers - - name: HOST_INT_PRV - value: "{{ .Values.mgmtIntPrv }}" - # Network domain - - name: DOMAIN - value: "{{ .Values.netDomain }}" - # IP address of private network interface on management server - - name: IP - value: "{{ .Values.netPrvIp }}" - # Private Network - - name: NETWORK - value: "{{ .Values.netPrvNet }}" - # Private network netmask - - name: NETMASK - value: "{{ .Values.netPrvNetmask }}" - # Private network gateway - - name: GATEWAY - value: "{{ .Values.netPrvGateway }}" - # DNS nameservers - - name: DNS1 - value: "{{ .Values.netPrvDns1 }}" - - name: DNS2 - value: "{{ .Values.netPrvDns2 }}" - # DHCP dynamic address range - - name: DHCP_START - value: "{{ .Values.netPrvDhcpStart }}" - - name: DHCP_END - value: "{{ .Values.netPrvDhcpEnd }}" - # DHCP lease time - - name: LEASETIME - value: "{{ .Values.netPrvDhcpLease }}" - # HTTP port for HTTP server - - name: HTTP_PORT - value: "{{ .Values.httpPort }}" - - name: HTTPS_PROXY - value: "{{ .Values.httpsProxy }}" - - name: EXTRA_PACKAGES - value: "{{ .Values.extraPackages }}" - - name: NTP - value: "{{ .Values.ntp }}" - resources: -{{ toYaml .Values.resources | indent 12 }} - {{- with .Values.nodeSelector }} - nodeSelector: -{{ toYaml . | indent 8 }} - {{- end }} - {{- with .Values.affinity }} - affinity: -{{ toYaml . | indent 8 }} - {{- end }} - {{- with .Values.tolerations }} - tolerations: -{{ toYaml . 
| indent 8 }} - {{- end }} - volumes: - - name: dhcp-config-volume - configMap: - name: dhcpd - - name: pxe-machines-vol - configMap: - name: pxe-machines - - name: cephfs - flexVolume: - driver: ceph.rook.io/rook - fsType: ceph - options: - fsName: cephfs - clusterNamespace: rook-ceph - path: /iso - - name: dhcp-leases - hostPath: - path: /var/lib/dhcp - type: DirectoryOrCreate diff --git a/workloads/services/k8s/dgxie/values.yaml b/workloads/services/k8s/dgxie/values.yaml deleted file mode 100644 index bacfcb832..000000000 --- a/workloads/services/k8s/dgxie/values.yaml +++ /dev/null @@ -1,56 +0,0 @@ -# Default values for dgxie. -# This is a YAML-formatted file. -# Declare variables to be passed into your templates. - -# DGXie config -bootMode: DGX -mntPath: /data -isoPath: /iso -dgxNetInt: enp1s0f0 -dgxDisk: sda -dgxKbd: us -dgxKernExtra: '' -mgmtIntPub: eth0 -mgmtIntPrv: eth1 -netDomain: local -netPrvIp: 192.168.1.1 -netPrvNet: 192.168.1.0 -netPrvNetmask: 255.255.255.0 -netPrvGateway: 192.168.1.1 -netPrvDns1: 192.168.1.1 -netPrvDns2: 8.8.8.8 -netPrvDhcpStart: 192.168.1.100 -netPrvDhcpEnd: 192.168.1.199 -netPrvDhcpLease: 7200 -httpPort: 13370 -httpsProxy: '' -extraPackages: '' - -# Deployment config -replicaCount: 1 - -ntp: '' - -image: - repository: deepops/dgxie - tag: latest - pullPolicy: IfNotPresent - -resources: {} - # We usually recommend not to specify default resources and to leave this as a conscious - # choice for the user. This also increases chances charts run on environments with little - # resources, such as Minikube. If you do want to specify resources, uncomment the following - # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 
- # limits: - # cpu: 100m - # memory: 128Mi - # requests: - # cpu: 100m - # memory: 128Mi - -nodeSelector: - node-role.kubernetes.io/control-plane: "" - -tolerations: [] - -affinity: {} diff --git a/workloads/services/k8s/iso-loader.yml b/workloads/services/k8s/iso-loader.yml deleted file mode 100644 index 33558d2dd..000000000 --- a/workloads/services/k8s/iso-loader.yml +++ /dev/null @@ -1,32 +0,0 @@ -apiVersion: extensions/v1beta1 -kind: Deployment -metadata: - name: iso-loader -spec: - replicas: 1 - template: - metadata: - labels: - app: iso-loader - spec: - containers: - - name: iso-loader - image: ubuntu:16.04 - # /bin/bash -c "trap : TERM INT; sleep infinity & wait" - command: ["/bin/bash", "-c", "--"] - args: ["trap : TERM INT; mkdir -p /data/iso; sleep infinity & wait"] - resources: - limits: - cpu: 100m - memory: 100Mi - volumeMounts: - - name: cephfs - mountPath: /data - volumes: - - name: cephfs - flexVolume: - driver: ceph.rook.io/rook - fsType: ceph - options: - fsName: cephfs - clusterNamespace: rook-ceph