diff --git a/.github/workflows/validate-components.yml b/.github/workflows/validate-components.yml index 5a6be9dc683..d48e4c4b2a6 100644 --- a/.github/workflows/validate-components.yml +++ b/.github/workflows/validate-components.yml @@ -53,3 +53,17 @@ jobs: set -ex go test -v -run ^Test_Version_Consistency_GPU_Managed_Components$ . + dcgm-compatibility: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: actions/setup-go@v6 + with: + go-version-file: e2e/go.mod + cache-dependency-path: e2e/go.sum + - name: Verify DCGM Exporter package compatibility + working-directory: ./e2e + run: | + set -ex + go test -v -run ^TestDCGMExporterCompatibility$ ./components/ + diff --git a/aks-node-controller/go.mod b/aks-node-controller/go.mod index 7daefb5c643..4120031d768 100644 --- a/aks-node-controller/go.mod +++ b/aks-node-controller/go.mod @@ -10,6 +10,7 @@ require ( github.com/google/go-cmp v0.7.0 github.com/stretchr/testify v1.11.1 google.golang.org/protobuf v1.36.6 + gopkg.in/yaml.v3 v3.0.1 ) require ( @@ -29,7 +30,6 @@ require ( github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/vincent-petithory/dataurl v1.0.0 // indirect golang.org/x/sys v0.40.0 // indirect - gopkg.in/yaml.v3 v3.0.1 // indirect ) replace github.com/Azure/agentbaker => ../ diff --git a/e2e/components/components_test.go b/e2e/components/components_test.go index 72470af01c4..e3eb71567a3 100644 --- a/e2e/components/components_test.go +++ b/e2e/components/components_test.go @@ -1,10 +1,22 @@ package components import ( + "archive/tar" + "bufio" + "compress/gzip" "fmt" + "io" + "net/http" + "os" + "regexp" + "strings" "testing" + "time" "github.com/Azure/agentbaker/e2e/config" + "github.com/blakesmith/ar" + "github.com/cavaliergopher/rpm" + "github.com/klauspost/compress/zstd" "github.com/stretchr/testify/require" ) @@ -115,6 +127,248 @@ func TestWindowsImagesHaveServercoreAndNanoserverSpecified(t *testing.T) { } } +func TestDCGMExporterCompatibility(t *testing.T) { + type testCase struct { + name string + os string + osVersion string + downloadURL string + parseDeps func(t *testing.T, path string) (coreVersion, propVersion string) + } + + testCases := []testCase{ + { + name: "Ubuntu2204", + os: "ubuntu", + osVersion: "r2204", + downloadURL: "https://packages.microsoft.com/repos/microsoft-ubuntu-jammy-prod/pool/main/d/dcgm-exporter/dcgm-exporter_%s_amd64.deb", + parseDeps: parseDebDeps, + }, + { + name: "Ubuntu2404", + os: "ubuntu", + osVersion: "r2404", + downloadURL: "https://packages.microsoft.com/repos/microsoft-ubuntu-noble-prod/pool/main/d/dcgm-exporter/dcgm-exporter_%s_amd64.deb", + parseDeps: parseDebDeps, + }, + { + name: "AzureLinux3", + os: "azurelinux", + osVersion: "v3.0", + downloadURL: "https://packages.microsoft.com/azurelinux/3.0/prod/cloud-native/x86_64/Packages/d/dcgm-exporter-%s.x86_64.rpm", + parseDeps: parseRPMDeps, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + // Get expected versions from components.json + dcgmExporterVersions := GetExpectedPackageVersions("dcgm-exporter", tc.os, tc.osVersion) + require.NotEmpty(t, dcgmExporterVersions, "dcgm-exporter not found in components.json") + dcgmExporterVersion := dcgmExporterVersions[0] + + coreVersions := GetExpectedPackageVersions("datacenter-gpu-manager-4-core", tc.os, tc.osVersion) + require.NotEmpty(t, coreVersions, "datacenter-gpu-manager-4-core not found in components.json") + expectedCoreVersion := coreVersions[0] + + propVersions := GetExpectedPackageVersions("datacenter-gpu-manager-4-proprietary", tc.os, tc.osVersion) + require.NotEmpty(t, propVersions, "datacenter-gpu-manager-4-proprietary not found in components.json") + expectedPropVersion := propVersions[0] + + t.Logf("Expected versions from components.json:") + t.Logf(" dcgm-exporter: %s", dcgmExporterVersion) + t.Logf(" datacenter-gpu-manager-4-core: %s", expectedCoreVersion) + t.Logf(" datacenter-gpu-manager-4-proprietary: %s", expectedPropVersion) + + // Download the dcgm-exporter package + url := fmt.Sprintf(tc.downloadURL, dcgmExporterVersion) + t.Logf("Downloading dcgm-exporter package from %s", url) + + tmpFile, err := os.CreateTemp("", "dcgm-exporter-*") + require.NoError(t, err) + defer os.Remove(tmpFile.Name()) + + resp := downloadWithRetry(t, url, 3) + defer resp.Body.Close() + require.Equal(t, http.StatusOK, resp.StatusCode, "Failed to download dcgm-exporter package from %s", url) + + _, err = io.Copy(tmpFile, resp.Body) + require.NoError(t, err) + require.NoError(t, tmpFile.Close()) + + // Parse dependencies from the package + actualCoreVersion, actualPropVersion := tc.parseDeps(t, tmpFile.Name()) + + t.Logf("Actual versions from dcgm-exporter package:") + t.Logf(" datacenter-gpu-manager-4-core: %s", actualCoreVersion) + t.Logf(" datacenter-gpu-manager-4-proprietary: %s", actualPropVersion) + + // Verify versions match + require.Equalf(t, expectedCoreVersion, actualCoreVersion, + "datacenter-gpu-manager-4-core version mismatch: components.json has %s but dcgm-exporter requires %s", + expectedCoreVersion, actualCoreVersion) + + require.Equalf(t, expectedPropVersion, actualPropVersion, + "datacenter-gpu-manager-4-proprietary version mismatch: components.json has %s but dcgm-exporter requires %s", + expectedPropVersion, actualPropVersion) + + t.Logf("✅ Version compatibility verified: dcgm-exporter %s is compatible with DCGM packages %s", + dcgmExporterVersion, expectedCoreVersion) + }) + } +} + +// downloadWithRetry downloads a URL with a timeout and retries on transient failures. +func downloadWithRetry(t *testing.T, url string, maxRetries int) *http.Response { + t.Helper() + client := &http.Client{Timeout: 60 * time.Second} + var lastErr error + for attempt := range maxRetries { + resp, err := client.Get(url) + if err == nil { + return resp + } + lastErr = err + t.Logf("Download attempt %d/%d failed: %v", attempt+1, maxRetries, err) + time.Sleep(time.Duration(attempt+1) * 2 * time.Second) + } + require.NoError(t, lastErr, "All %d download attempts failed for %s", maxRetries, url) + return nil // unreachable +} + +// parseDebDeps extracts datacenter-gpu-manager-4-core and datacenter-gpu-manager-4-proprietary +// versions from a .deb package's control file. +func parseDebDeps(t *testing.T, path string) (string, string) { + t.Helper() + + f, err := os.Open(path) + require.NoError(t, err) + defer f.Close() + + reader := ar.NewReader(f) + for { + header, err := reader.Next() + require.NoError(t, err, "control file not found in .deb package") + + if !strings.HasPrefix(header.Name, "control.tar") { + continue + } + + var tarReader *tar.Reader + if strings.HasSuffix(header.Name, ".gz") { + gz, err := gzip.NewReader(reader) + require.NoError(t, err) + defer gz.Close() + tarReader = tar.NewReader(gz) + } else if strings.HasSuffix(header.Name, ".zst") { + zr, err := zstd.NewReader(reader) + require.NoError(t, err) + defer zr.Close() + tarReader = tar.NewReader(zr) + } else { + tarReader = tar.NewReader(reader) + } + + for { + th, err := tarReader.Next() + require.NoError(t, err, "control file not found in control.tar") + + if th.Name == "./control" || th.Name == "control" { + data, err := io.ReadAll(tarReader) + require.NoError(t, err) + + // Parse Depends field, handling RFC822 continuation lines + // (subsequent lines starting with space/tab are part of the same field) + dependsValue := parseDebControlField(string(data), "Depends") + require.NotEmpty(t, dependsValue, "Depends field not found in control file") + + coreRegex := regexp.MustCompile(`datacenter-gpu-manager-4-core \(= ([^)]+)\)`) + propRegex := regexp.MustCompile(`datacenter-gpu-manager-4-proprietary \(= ([^)]+)\)`) + + coreMatches := coreRegex.FindStringSubmatch(dependsValue) + require.Len(t, coreMatches, 2, "Failed to extract datacenter-gpu-manager-4-core version from Depends") + + propMatches := propRegex.FindStringSubmatch(dependsValue) + require.Len(t, propMatches, 2, "Failed to extract datacenter-gpu-manager-4-proprietary version from Depends") + + return coreMatches[1], propMatches[1] + } + } + } +} + +// parseDebControlField extracts the value of an RFC822-style field from a Debian control file, +// handling continuation lines (lines starting with space or tab). +func parseDebControlField(control, field string) string { + prefix := field + ":" + var result strings.Builder + found := false + scanner := bufio.NewScanner(strings.NewReader(control)) + for scanner.Scan() { + line := scanner.Text() + if found { + if len(line) > 0 && (line[0] == ' ' || line[0] == '\t') { + result.WriteString(" ") + result.WriteString(strings.TrimSpace(line)) + continue + } + break + } + if strings.HasPrefix(line, prefix) { + found = true + result.WriteString(strings.TrimSpace(strings.TrimPrefix(line, prefix))) + } + } + return result.String() +} + +// parseRPMDeps extracts datacenter-gpu-manager-4-core and datacenter-gpu-manager-4-proprietary +// versions from an .rpm package's Requires metadata. +func parseRPMDeps(t *testing.T, path string) (string, string) { + t.Helper() + + f, err := os.Open(path) + require.NoError(t, err) + defer f.Close() + + pkg, err := rpm.Read(f) + require.NoError(t, err) + + var coreVersion, propVersion string + + for _, req := range pkg.Requires() { + name := req.Name() + if name == "datacenter-gpu-manager-4-core" { + t.Logf("RPM dependency %s: epoch=%d version=%s release=%s", name, req.Epoch(), req.Version(), req.Release()) + coreVersion = formatRPMVersion(req) + } + if name == "datacenter-gpu-manager-4-proprietary" { + t.Logf("RPM dependency %s: epoch=%d version=%s release=%s", name, req.Epoch(), req.Version(), req.Release()) + propVersion = formatRPMVersion(req) + } + } + + require.NotEmpty(t, coreVersion, "datacenter-gpu-manager-4-core dependency not found in RPM Requires") + require.NotEmpty(t, propVersion, "datacenter-gpu-manager-4-proprietary dependency not found in RPM Requires") + + return coreVersion, propVersion +} + +// formatRPMVersion formats an RPM dependency's version as "epoch:version-release", +// matching the version format used in components.json. +func formatRPMVersion(dep rpm.Dependency) string { + epoch := dep.Epoch() + version := dep.Version() + release := dep.Release() + if epoch > 0 { + return fmt.Sprintf("%d:%s-%s", epoch, version, release) + } + if release != "" { + return fmt.Sprintf("%s-%s", version, release) + } + return version +} + type versionCheck struct { input string expected string diff --git a/e2e/go.mod b/e2e/go.mod index af7f4d39af0..f62938adfb1 100644 --- a/e2e/go.mod +++ b/e2e/go.mod @@ -17,11 +17,14 @@ require ( github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources/v3 v3.0.1 github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/storage/armstorage/v3 v3.0.0 github.com/Azure/azure-sdk-for-go/sdk/storage/azblob v1.6.2 + github.com/blakesmith/ar v0.0.0-20190502131153-809d4375e1fb github.com/blang/semver v3.5.1+incompatible github.com/bramvdbogaerde/go-scp v1.6.0 github.com/caarlos0/env/v11 v11.3.1 + github.com/cavaliergopher/rpm v1.3.0 github.com/coder/websocket v1.8.14 github.com/joho/godotenv v1.5.1 + github.com/klauspost/compress v1.18.5 github.com/samber/lo v1.52.0 github.com/sanity-io/litter v1.5.5 github.com/stretchr/testify v1.11.1 diff --git a/e2e/go.sum b/e2e/go.sum index 24b8552e84f..e1a66baba18 100644 --- a/e2e/go.sum +++ b/e2e/go.sum @@ -52,12 +52,16 @@ github.com/aws/aws-sdk-go-v2 v1.38.2 h1:QUkLO1aTW0yqW95pVzZS0LGFanL71hJ0a49w4TJL github.com/aws/aws-sdk-go-v2 v1.38.2/go.mod h1:sDioUELIUO9Znk23YVmIk86/9DOpkbyyVb1i/gUNFXY= github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df h1:GSoSVRLoBaFpOOds6QyY1L8AX7uoY+Ln3BHc22W40X0= github.com/barkimedes/go-deepcopy v0.0.0-20220514131651-17c30cfc62df/go.mod h1:hiVxq5OP2bUGBRNS3Z/bt/reCLFNbdcST6gISi1fiOM= +github.com/blakesmith/ar v0.0.0-20190502131153-809d4375e1fb h1:m935MPodAbYS46DG4pJSv7WO+VECIWUQ7OJYSoTrMh4= +github.com/blakesmith/ar v0.0.0-20190502131153-809d4375e1fb/go.mod h1:PkYb9DJNAwrSvRx5DYA+gUcOIgTGVMNkfSCbZM8cWpI= github.com/blang/semver v3.5.1+incompatible h1:cQNTCjp13qL8KC3Nbxr/y2Bqb63oX6wdnnjpJbkM4JQ= github.com/blang/semver v3.5.1+incompatible/go.mod h1:kRBLl5iJ+tD4TcOOxsy/0fnwebNt5EWlYSAyrTnjyyk= github.com/bramvdbogaerde/go-scp v1.6.0 h1:lDh0lUuz1dbIhJqlKLwWT7tzIRONCp1Mtx3pgQVaLQo= github.com/bramvdbogaerde/go-scp v1.6.0/go.mod h1:on2aH5AxaFb2G0N5Vsdy6B0Ml7k9HuHSwfo1y0QzAbQ= github.com/caarlos0/env/v11 v11.3.1 h1:cArPWC15hWmEt+gWk7YBi7lEXTXCvpaSdCiZE2X5mCA= github.com/caarlos0/env/v11 v11.3.1/go.mod h1:qupehSf/Y0TUTsxKywqRt/vJjN5nz6vauiYEUUr8P4U= +github.com/cavaliergopher/rpm v1.3.0 h1:UHX46sasX8MesUXXQ+UbkFLUX4eUWTlEcX8jcnRBIgI= +github.com/cavaliergopher/rpm v1.3.0/go.mod h1:vEumo1vvtrHM1Ov86f6+k8j7zNKOxQfHDCAIcR/36ZI= github.com/clarketm/json v1.17.1 h1:U1IxjqJkJ7bRK4L6dyphmoO840P6bdhPdbbLySourqI= github.com/clarketm/json v1.17.1/go.mod h1:ynr2LRfb0fQU34l07csRNBTcivjySLLiY1YzQqKVfdo= github.com/coder/websocket v1.8.14 h1:9L0p0iKiNOibykf283eHkKUHHrpG7f65OE3BhhO7v9g= @@ -125,6 +129,8 @@ github.com/keybase/go-keychain v0.0.1 h1:way+bWYa6lDppZoZcgMbYsvC7GxljxrskdNInRt github.com/keybase/go-keychain v0.0.1/go.mod h1:PdEILRW3i9D8JcdM+FmY6RwkHGnhHxXwkPPMeUgOK1k= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= +github.com/klauspost/compress v1.18.5 h1:/h1gH5Ce+VWNLSWqPzOVn6XBO+vJbCNGvjoaGBFW2IE= +github.com/klauspost/compress v1.18.5/go.mod h1:cwPg85FWrGar70rWktvGQj8/hthj3wpl0PGDogxkrSQ= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=