import json
import traceback
import urllib.request

import boto3

ec2 = boto3.client("ec2")

# Tag key linking the patched AMI (and, right before deletion, its snapshots)
# to the stack that built it.
STACK_TAG = "parallelcluster:ami-patching-stack"
# Upper bound on the failure reason so the response body stays small.
MAX_REASON_LENGTH = 512
# Seconds to wait on the response URL, so a stalled connection does not consume
# the whole Lambda timeout.
RESPONSE_TIMEOUT = 30


def respond(event, status, data=None, reason=None):
    """Send the custom resource result back to CloudFormation.

    CloudFormation correlates the response to the request via StackId,
    RequestId and LogicalResourceId, and tracks the resource via
    PhysicalResourceId, so all four are always sent. Reason is only sent on
    failure and Data only when there is something to return.

    Args:
        event: The custom resource request received by the Lambda.
        status: "SUCCESS" or "FAILED".
        data: Optional dict exposed to the template through !GetAtt.
        reason: Optional failure description shown in the stack events;
            defaults to a pointer to CloudWatch Logs.
    """
    body = {
        "Status": status,
        "PhysicalResourceId": event.get("PhysicalResourceId", "ami-patching-helper"),
        "StackId": event["StackId"],
        "RequestId": event["RequestId"],
        "LogicalResourceId": event["LogicalResourceId"],
    }
    if status == "FAILED":
        body["Reason"] = (reason or "See CloudWatch Logs")[:MAX_REASON_LENGTH]
    if data:
        body["Data"] = data
    payload = json.dumps(body).encode()
    req = urllib.request.Request(
        event["ResponseURL"],
        data=payload,
        method="PUT",
        # NOTE(review): the empty content-type is presumably what the pre-signed
        # response URL expects; confirm before changing it.
        headers={"content-type": "", "content-length": str(len(payload))},
    )
    # Bound the wait and close the HTTP response once the PUT has completed.
    with urllib.request.urlopen(req, timeout=RESPONSE_TIMEOUT):
        pass


def cleanup(stack_name):
    """Deregister the patched AMI(s) built by this stack and delete their snapshots.

    The snapshots are tagged first so DeleteSnapshot is allowed by the
    (tag-scoped) IAM policy. Nothing is done when stack_name is empty.
    """
    if not stack_name:
        return
    images = ec2.describe_images(
        Owners=["self"],
        Filters=[{"Name": f"tag:{STACK_TAG}", "Values": [stack_name]}],
    ).get("Images", [])
    for img in images:
        snaps = [
            m["Ebs"]["SnapshotId"]
            for m in img.get("BlockDeviceMappings", [])
            if m.get("Ebs", {}).get("SnapshotId")
        ]
        if snaps:
            ec2.create_tags(Resources=snaps, Tags=[{"Key": STACK_TAG, "Value": stack_name}])
        # The image is deregistered before its snapshots are deleted.
        ec2.deregister_image(ImageId=img["ImageId"])
        for snap in snaps:
            ec2.delete_snapshot(SnapshotId=snap)


def handler(event, context):
    """Lambda entry point of the AMI helper custom resource.

    On Delete it cleans up the AMI(s) built by the stack. On any other request
    type it looks up the source AMI and returns its (truncated) name as the
    SourceName attribute. Any error is reported to CloudFormation as FAILED,
    together with the error message.
    """
    try:
        p = event.get("ResourceProperties", {})
        if event["RequestType"] == "Delete":
            cleanup(p.get("StackName"))
            return respond(event, "SUCCESS")
        src = p["SourceAmi"]
        image = ec2.describe_images(ImageIds=[src])["Images"][0]
        # The distributed AMI name is "<SourceName>-patched-<buildDate>" and AMI
        # names are capped at 128 chars. Image Builder renders buildDate as
        # "YYYY-MM-DD'T'HH-MM-SS'Z'" (20 chars); with the "-patched-" separator
        # (9 chars) the suffix is up to 29 chars, so truncate the source name to
        # 88 (128 - 40) to stay safely within the limit.
        name = image.get("Name", src)[:88]
        return respond(event, "SUCCESS", {"SourceName": name})
    except Exception as e:
        print("Error: %s" % e)
        traceback.print_exc()
        # Surface the actual error in the stack events instead of a generic message.
        return respond(event, "FAILED", reason=f"{type(e).__name__}: {e} (see CloudWatch Logs)")
+ # =========================================================================== + + PatchedImage: + Type: AWS::ImageBuilder::Image + DependsOn: RecipeLogGroup + Properties: + ImageRecipeArn: !Ref PatchImageRecipe + InfrastructureConfigurationArn: !Ref PatchInfrastructureConfiguration + DistributionConfigurationArn: !Ref PatchDistributionConfiguration + ImageTestsConfiguration: + ImageTestsEnabled: false + + PatchImageRecipe: + Type: AWS::ImageBuilder::ImageRecipe + Properties: + Name: !Sub pcluster-ami-patching-recipe-${AWS::StackName} + Version: 1.0.0 + ParentImage: !Ref ParentImage + Components: + - ComponentArn: !Ref PatchComponent + + PatchComponent: + Type: AWS::ImageBuilder::Component + Properties: + Name: !Sub pcluster-ami-patching-${AWS::StackName} + Platform: Linux + Version: 1.0.0 + Description: Apply OS security patches (kernel bump allowed) to the parent image. + Data: !Sub | + name: PatchNodeSecurityUpdates + description: Apply OS security patches to the parent image, allowing kernel bumps. 
+ schemaVersion: 1.0 + phases: + - name: build + steps: + - name: PrePatchingChecks + action: ExecuteBash + inputs: + commands: + - echo "Active kernel:" + - uname -r + - echo "Active kernel modules:" + - lsmod + - name: ApplyPatches + action: ExecuteBash + inputs: + commands: + - aws s3 cp ${PatchScriptS3Uri} /usr/local/sbin/patch_node.sh + - sudo chown root:root /usr/local/sbin/patch_node.sh + - sudo chmod 0744 /usr/local/sbin/patch_node.sh + - sudo /usr/local/sbin/patch_node.sh + - name: Reboot + action: Reboot + - name: PostRebootChecks + action: ExecuteBash + inputs: + commands: + - echo "Active kernel:" + - uname -r + - echo "Active kernel modules:" + - lsmod + - name: Cleanup + action: ExecuteBash + inputs: + commands: + - /usr/local/sbin/ami_cleanup.sh + + RecipeLogGroup: + Type: AWS::Logs::LogGroup + DeletionPolicy: Retain + UpdateReplacePolicy: Retain + Properties: + LogGroupName: !Sub /aws/imagebuilder/pcluster-ami-patching-recipe-${AWS::StackName} + RetentionInDays: 7 + + PatchInfrastructureConfiguration: + Type: AWS::ImageBuilder::InfrastructureConfiguration + Properties: + Name: !Sub pcluster-ami-patching-config-${AWS::StackName} + InstanceProfileName: !Ref BuildInstanceProfile + InstanceTypes: + - !Ref InstanceType + SubnetId: !Ref SubnetId + SecurityGroupIds: + - !Ref BuildSecurityGroup + TerminateInstanceOnFailure: true + InstanceMetadataOptions: + HttpTokens: required + + PatchDistributionConfiguration: + Type: AWS::ImageBuilder::DistributionConfiguration + Properties: + Name: !Sub pcluster-ami-patching-distribution-${AWS::StackName} + Distributions: + - Region: !Ref AWS::Region + AmiDistributionConfiguration: + Name: !Sub + - "${SourceName}-patched-{{ imagebuilder:buildDate }}" + - SourceName: !GetAtt AmiHelper.SourceName + AmiTags: + parallelcluster:ami-patching-stack: !Ref AWS::StackName + parallelcluster:source-ami: !Ref ParentImage + + BuildInstanceProfile: + Type: AWS::IAM::InstanceProfile + Properties: + Roles: + - !Ref BuildInstanceRole + 
+ BuildInstanceRole: + Type: AWS::IAM::Role + Properties: + AssumeRolePolicyDocument: + Version: 2012-10-17 + Statement: + - Effect: Allow + Principal: + Service: !Sub ec2.${AWS::URLSuffix} + Action: sts:AssumeRole + ManagedPolicyArns: + - !Sub arn:${AWS::Partition}:iam::aws:policy/EC2InstanceProfileForImageBuilder + - !Sub arn:${AWS::Partition}:iam::aws:policy/AmazonSSMManagedInstanceCore + Policies: + - PolicyName: read-patch-script + PolicyDocument: + Version: 2012-10-17 + Statement: + - Effect: Allow + Action: s3:GetObject + Resource: !Sub + - arn:${AWS::Partition}:s3:::${BucketAndKey} + - BucketAndKey: !Select [1, !Split ["s3://", !Ref PatchScriptS3Uri]] + + BuildSecurityGroup: + Type: AWS::EC2::SecurityGroup + Properties: + GroupDescription: Security group for the patched-AMI Image Builder build instance + VpcId: !Ref VpcId + SecurityGroupEgress: + - CidrIp: 0.0.0.0/0 + Description: Allow all outbound traffic + IpProtocol: "-1" + +Outputs: + AmiId: + Description: The id of the patched AMI produced by Image Builder. 
+ Value: !GetAtt PatchedImage.ImageId diff --git a/tests/integration-tests/configs/develop.yaml b/tests/integration-tests/configs/develop.yaml index 8e1e2a4698..afa3190df5 100644 --- a/tests/integration-tests/configs/develop.yaml +++ b/tests/integration-tests/configs/develop.yaml @@ -173,6 +173,17 @@ test-suites: - regions: ["us-east-1"] instances: ["g4dn.2xlarge"] oss: [{{ OS_X86_1 }}] + patching: + test_patching.py::test_patching_cluster: + dimensions: + - regions: [{{ g4dn_8xlarge_CAPACITY_RESERVATION_3_INSTANCES_2_HOURS_NOPG_rhel9 }}] + instances: ["g4dn.8xlarge"] + oss: {{ RHEL_OS_X86 }} + schedulers: ["slurm"] + - regions: [{{ g4dn_8xlarge_CAPACITY_RESERVATION_3_INSTANCES_2_HOURS_NOPG_ubuntu2404 }}] + instances: ["g4dn.8xlarge"] + oss: {{ NO_RHEL_OS_X86 }} + schedulers: ["slurm"] custom_resource: test_cluster_custom_resource.py::test_cluster_create: dimensions: diff --git a/tests/integration-tests/configs/released.yaml b/tests/integration-tests/configs/released.yaml index ec423469c9..961ef8edcd 100644 --- a/tests/integration-tests/configs/released.yaml +++ b/tests/integration-tests/configs/released.yaml @@ -113,6 +113,19 @@ test-suites: - regions: ["ca-central-1"] instances: {{ common.INSTANCES_DEFAULT_X86 }} oss: ["alinux2023"] + # The patching test is currently expected to fail; it will be reintroduced once the + # failure is fixed. 
@pytest.fixture()
def patched_ami_factory(region, vpc_stack, test_datadir, request, cfn_stacks_factory, s3_bucket_factory):
    """
    Provide a callable that bakes a security-patched AMI out of a base AMI.

    The AMI is built by the CloudFormation stack defined in
    cloudformation/patching/ami-patching.yaml; the id of the patched AMI is read
    from the stack's AmiId output.

    The callable takes the base AMI id and the builder instance type and returns
    the patched AMI id. When --patch-ami-stack is given, the named stack is reused:
    only its AmiId output is read and nothing is created or deleted. Otherwise the
    patch script is uploaded to a fresh bucket, a new stack is created, and on
    teardown the stack is deleted unless --no-delete is set.
    """
    # Path is relative to the integration-tests working directory.
    with open("../../cloudformation/patching/ami-patching.yaml", encoding="utf-8") as template_handle:
        patching_template = template_handle.read()
    existing_stack = request.config.getoption("patch_ami_stack")
    # (ami_id, stack_name) pairs for the stacks this fixture created and must delete.
    created_stacks = []

    def _build_patched_ami(base_ami, builder_instance):
        if existing_stack:
            # Reuse mode: read the output of the pre-deployed stack and stop here.
            logging.info("Reusing existing patch-infra stack %s", existing_stack)
            reused = CfnStack(name=existing_stack, region=region, template=patching_template)
            return reused.cfn_outputs["AmiId"]

        logging.info("Starting patching of AMI %s using a %s builder instance", base_ami, builder_instance)
        script_bucket = s3_bucket_factory()
        s3_bucket = boto3.resource("s3", region_name=region).Bucket(script_bucket)
        s3_bucket.upload_file(str(test_datadir / "patch_node.sh"), "scripts/patch_node.sh")

        new_stack_name = generate_stack_name(
            "integ-tests-patching-builder", request.config.getoption("stackname_suffix")
        )
        parameter_values = {
            "ParentImage": base_ami,
            "InstanceType": builder_instance,
            "SubnetId": vpc_stack.get_public_subnet(),
            "VpcId": vpc_stack.cfn_outputs["VpcId"],
            "PatchScriptS3Uri": f"s3://{script_bucket}/scripts/patch_node.sh",
        }
        builder_stack = CfnStack(
            name=new_stack_name,
            region=region,
            template=patching_template,
            parameters=[
                {"ParameterKey": key, "ParameterValue": value} for key, value in parameter_values.items()
            ],
            capabilities=["CAPABILITY_IAM"],
        )
        # create_stack blocks until CREATE_COMPLETE, i.e. until Image Builder has
        # finished building the patched AMI.
        cfn_stacks_factory.create_stack(builder_stack)
        patched_ami_id = builder_stack.cfn_outputs["AmiId"]
        created_stacks.append((patched_ami_id, new_stack_name))
        logging.info("Patched AMI %s is available", patched_ami_id)
        return patched_ami_id

    yield _build_patched_ami

    # Teardown: keep everything with --no-delete; a reused stack is never in
    # created_stacks, so it is never deleted here.
    if request.config.getoption("no_delete"):
        logging.info("--no-delete specified: retaining patched AMI(s) and stack(s): %s", created_stacks)
        return
    for patched_ami_id, new_stack_name in created_stacks:
        logging.info("Deleting patch-infra stack %s (removes patched AMI %s)", new_stack_name, patched_ami_id)
        cfn_stacks_factory.delete_stack(new_stack_name, region)
" + "When provided, the patching tests reuse this stack instead of creating and deleting a new one.", + default=TEST_DEFAULTS.get("patch_ami_stack"), + ) debug_group.add_argument( "--build-image-roles-stack", help="Name of CFN stack providing build image permissions.", @@ -772,6 +779,9 @@ def _set_custom_stack_args(args, pytest_args): # noqa: C901 if args.proxy_stack: pytest_args.extend(["--proxy-stack", args.proxy_stack]) + if args.patch_ami_stack: + pytest_args.extend(["--patch-ami-stack", args.patch_ami_stack]) + if args.build_image_roles_stack: pytest_args.extend(["--build-image-roles-stack", args.build_image_roles_stack]) diff --git a/tests/integration-tests/tests/basic/test_essential_features.py b/tests/integration-tests/tests/basic/test_essential_features.py index 39a3029944..cbe128f99a 100644 --- a/tests/integration-tests/tests/basic/test_essential_features.py +++ b/tests/integration-tests/tests/basic/test_essential_features.py @@ -27,7 +27,7 @@ wait_instance_replaced_or_terminating, ) from tests.common.mpi_common import _test_mpi -from tests.common.utils import fetch_instance_slots, run_system_analyzer +from tests.common.utils import GPU_JOB_SCRIPT, fetch_instance_slots, run_system_analyzer def test_essential_features( @@ -344,7 +344,7 @@ def _test_gpu_workload(cluster, scheduler_commands_factory, test_datadir): for sample in samples: logging.info("Submitting CUDA sample job for %s", sample) result = scheduler_commands.submit_script( - str(test_datadir / "gpu_job.sh"), + str(GPU_JOB_SCRIPT), script_args=[sample], partition="gpu", nodes=1, diff --git a/tests/integration-tests/tests/basic/test_essential_features/test_essential_features/gpu_job.sh b/tests/integration-tests/tests/common/data/gpu_job.sh similarity index 100% rename from tests/integration-tests/tests/basic/test_essential_features/test_essential_features/gpu_job.sh rename to tests/integration-tests/tests/common/data/gpu_job.sh diff --git a/tests/integration-tests/tests/common/utils.py 
@retry(stop_max_delay=minutes(3), wait_fixed=seconds(15))
def wait_node_reachable(cluster, node_ip):
    """Block until `uname -r` can be executed on the node over SSH.

    The attempt is repeated every 15 seconds for up to 3 minutes, which is meant
    to absorb a reboot window. The kernel release printed by the node is logged.
    """
    login_user = get_username_for_os(cluster.os)
    ssh_command = " ".join(
        [
            "ssh",
            "-o StrictHostKeyChecking=no",
            "-o UserKnownHostsFile=/dev/null",
            "-o ConnectTimeout=10",
            f"-i {cluster.ssh_key}",
            f"{login_user}@{node_ip}",
            "uname -r",
        ]
    )
    # NOTE(review): presumably run_command raises when ssh fails, which is what
    # triggers the retry; confirm against run_command.
    probe = run_command(ssh_command, timeout=30, shell=True)
    running_kernel = probe.stdout.strip()
    logging.info("Node %s reachable over SSH; running kernel: %s", node_ip, running_kernel)


@retry(stop_max_delay=minutes(5), wait_fixed=seconds(10), retry_on_result=lambda ami: ami is None)
def retrieve_cluster_head_node_ami(cluster, region):
    """Return the ImageId of the head node launch template in the cluster stack.

    The value is read from the HeadNodeLaunchTemplate resource of the cluster
    CloudFormation template, so it does not require the head node instance to be
    up. An empty or missing template body yields None, which is retried every
    10 seconds for up to 5 minutes.
    """
    cfn_client = boto3.client("cloudformation", region_name=region)
    template_body = cfn_client.get_template(StackName=cluster.cfn_name).get("TemplateBody")
    # The body may come back as text; parse it in that case.
    if isinstance(template_body, str):
        template_body = yaml.safe_load(template_body)
    if not template_body:
        return None
    node = template_body
    for key in ("Resources", "HeadNodeLaunchTemplate", "Properties", "LaunchTemplateData", "ImageId"):
        node = node[key]
    return node
def test_patching_cluster(
    region,
    os,
    instance,
    scheduler,
    vpc_stack,
    pcluster_config_reader,
    clusters_factory,
    test_datadir,
    scheduler_commands_factory,
    patched_ami_factory,
    request,
):
    """
    Validate that users can self-patch their clusters.

    Flow:
      1. Start the cluster creation without waiting for it.
      2. Read the AMI the cluster uses from its CloudFormation stack template.
      3. Bake a patched AMI from that AMI.
      4. Wait for the cluster creation to complete.
      5. Snapshot the loaded kernel modules on each node type.
      6. Run a baseline GPU workload from the head node and from the login node.
      7. Stop the login nodes.
      8. Update the cluster to the patched AMI and wait for nodes to be replaced.
      9. Patch and reboot the head node, then wait for it to be reachable over SSH.
      10. Re-run the GPU workload from the head node and from the login node.
      11. Assert that every kernel module loaded before patching is still loaded, on each node type.
    """
    ec2 = boto3.client("ec2", region_name=region)

    # Start the cluster creation but do not block on it: the AMI patching below
    # runs concurrently while the cluster comes up.
    create_config = pcluster_config_reader(output_file="pcluster.config.create.yaml", login_nodes_count=1)
    cluster = clusters_factory(create_config, wait=False)

    # Use the exact AMI the cluster uses as the source for patching, read from the
    # cluster stack template instead of resolving it with a separate AMI lookup.
    base_ami = retrieve_cluster_head_node_ami(cluster, region)
    logging.info("Cluster is running on AMI %s", base_ami)

    # Bake the patched AMI while the cluster is still being created. The builder
    # instance uses the same instance type as the cluster nodes.
    patched_ami = patched_ami_factory(base_ami, instance)

    # Wait for the cluster creation to complete before using it.
    logging.info("Waiting for cluster %s to reach CREATE_COMPLETE", cluster.name)
    cluster.wait_cluster_status("CREATE_COMPLETE")

    # Snapshot the loaded kernel modules on the head, compute and login nodes before
    # patching so we can later assert the same modules remain loaded.
    kernel_modules_before = _collect_loaded_kernel_modules(cluster, scheduler_commands_factory)
    logging.info("Kernel modules loaded before patching: %s", kernel_modules_before)

    # GPU workload BEFORE patching, from the head node and login node (baseline).
    _run_gpu_workload(cluster, scheduler_commands_factory, use_login_node=False)
    _run_gpu_workload(cluster, scheduler_commands_factory, use_login_node=True)

    # Stop the login nodes (required before changing the login pool image).
    stop_login_config = pcluster_config_reader(output_file="pcluster.config.stop-login.yaml", login_nodes_count=0)
    cluster.update(str(stop_login_config))
    wait_for_login_fleet_stop(cluster)
    logging.info("Login nodes stopped")

    # Update the cluster so login and compute nodes use the patched AMI.
    update_config = pcluster_config_reader(
        output_file="pcluster.config.update-ami.yaml", login_nodes_count=1, patched_ami=patched_ami
    )
    cluster.update(str(update_config))

    # With QueueUpdateStrategy DRAIN the static compute node is drained and replaced
    # asynchronously after the update completes, and the login pool is recreated, so
    # wait for both to come back running the patched AMI.
    logging.info("Waiting for compute and login nodes to be replaced with the patched AMI")
    _wait_instances_using_ami(ec2, cluster, "Compute", patched_ami)
    _wait_instances_using_ami(ec2, cluster, "LoginNode", patched_ami)

    # Patch the head node in place (same script used to bake the AMI) and reboot it.
    remote_command_executor = RemoteCommandExecutor(cluster)
    logging.info("Patching the head node")
    patch_result = remote_command_executor.run_remote_script(
        str(test_datadir / "patch_node.sh"), run_as_root=True, timeout=PATCHING_TIMEOUT
    )
    logging.info("Head node patching script output:\n%s", patch_result.stdout)
    reboot_head_node(cluster)

    # Verify the head node is reachable over SSH again after the reboot before
    # exercising the cluster further.
    wait_node_reachable(cluster, cluster.head_node_ip)

    # GPU workload AFTER patching, from the head node and login node.
    _run_gpu_workload(cluster, scheduler_commands_factory, use_login_node=False)
    _run_gpu_workload(cluster, scheduler_commands_factory, use_login_node=True)

    # Snapshot and log the kernel modules loaded after patching, then assert (softly,
    # so every node type is reported even if one fails) that every module loaded
    # before patching is still loaded on the head, compute and login nodes.
    # Modules that only appear after patching are not treated as a failure.
    kernel_modules_after = _collect_loaded_kernel_modules(cluster, scheduler_commands_factory)
    logging.info("Kernel modules loaded after patching: %s", kernel_modules_after)
    with soft_assertions():
        for node_type in NODE_TYPES:
            missing = kernel_modules_before[node_type] - kernel_modules_after[node_type]
            assert_that(missing).described_as(f"kernel modules no longer loaded on the {node_type}").is_empty()
@retry(stop_max_delay=minutes(15), wait_fixed=seconds(30), retry_on_result=lambda done: not done)
def _wait_instances_using_ami(ec2, cluster, node_type, expected_ami):
    """Poll until every instance of node_type runs expected_ami.

    Returns False (and is therefore retried, every 30 seconds for up to 15
    minutes) while no instance of that type exists or while any instance still
    reports a different AMI; returns True once all of them report expected_ami.
    """
    instance_ids = cluster.get_cluster_instance_ids(node_type=node_type)
    if not instance_ids:
        return False

    observed_amis = set()
    for instance_id in instance_ids:
        described = ec2.describe_instances(InstanceIds=[instance_id])
        observed_amis.add(described["Reservations"][0]["Instances"][0]["ImageId"])
    logging.info("%s instances %s on AMIs %s (expected %s)", node_type, instance_ids, observed_amis, expected_ami)

    # Old and new instances may coexist for a while: only an exact match counts.
    if observed_amis != {expected_ami}:
        return False
    logging.info(
        "Detected new %s node(s) %s now running the patched AMI %s",
        node_type,
        instance_ids,
        expected_ami,
    )
    return True


def _run_gpu_workload(cluster, scheduler_commands_factory, use_login_node):
    """Run the deviceQuery CUDA sample on partition q1 and assert it succeeds.

    The job is submitted from the login node when use_login_node is True,
    otherwise from the head node.
    """
    if use_login_node:
        origin = "login node"
    else:
        origin = "head node"
    logging.info("Submitting GPU validation job from the %s", origin)

    executor = RemoteCommandExecutor(cluster, use_login_node=use_login_node)
    commands = scheduler_commands_factory(executor)
    submission = commands.submit_script(
        str(GPU_JOB_SCRIPT),
        script_args=["1_Utilities/deviceQuery"],
        partition="q1",
        nodes=1,
        slots=1,
    )
    job_id = commands.assert_job_submitted(submission.stdout)
    commands.wait_job_completed(job_id, timeout=20)
    commands.assert_job_succeeded(job_id)
    logging.info("GPU validation job %s submitted from the %s succeeded", job_id, origin)


def _collect_loaded_kernel_modules(cluster, scheduler_commands_factory):
    """Map each node type in NODE_TYPES to the set of kernel modules loaded on it."""
    modules_by_node_type = {}
    for node_type in NODE_TYPES:
        executor = _node_executor(cluster, scheduler_commands_factory, node_type)
        modules_by_node_type[node_type] = _loaded_kernel_modules(executor)
    return modules_by_node_type


def _node_executor(cluster, scheduler_commands_factory, node_type):
    """Build a RemoteCommandExecutor targeting the given node type.

    The login node and the first compute node reported by the scheduler get a
    dedicated executor; any other node type gets the default executor for the
    cluster.
    """
    if node_type == LOGIN_NODE:
        return RemoteCommandExecutor(cluster, use_login_node=True)
    if node_type != COMPUTE_NODE:
        return RemoteCommandExecutor(cluster)

    # Compute node: resolve its address through the scheduler first.
    commands = scheduler_commands_factory(RemoteCommandExecutor(cluster))
    first_compute_node = commands.get_compute_nodes()[0]
    compute_address = commands.get_node_addr(first_compute_node)
    return RemoteCommandExecutor(cluster, compute_node_ip=compute_address)


def _loaded_kernel_modules(remote_command_executor):
    """Return the names in the first lsmod column (header skipped) as a set."""
    listing = remote_command_executor.run_remote_command("lsmod | tail -n +2 | awk '{print $1}'")
    module_names = listing.stdout.split()
    return set(module_names)
# The kernel after the reboot is reported separately once the node has
# rebooted (the reboot is mandatory to activate any new kernel).
echo "Kernel before patching: $(uname -r)"

# Pick the first package manager found on the PATH, in the order dnf, yum,
# apt-get, and apply security updates with it.
if command -v dnf >/dev/null 2>&1; then
  echo "Detected dnf package manager"
  sudo dnf clean all
  # Metadata refresh is best-effort: a failure here does not abort the script.
  sudo dnf makecache --refresh || true
  # Apply only security errata. Kernel packages are allowed to be upgraded.
  sudo dnf upgrade --security -y
elif command -v yum >/dev/null 2>&1; then
  echo "Detected yum package manager"
  sudo yum clean all
  # Metadata refresh is best-effort: a failure here does not abort the script.
  sudo yum makecache || true
  # update-minimal --security applies the smallest set of security errata.
  # Kernel bumps are allowed (no --exclude=kernel*).
  sudo yum update-minimal --security -y
elif command -v apt-get >/dev/null 2>&1; then
  echo "Detected apt package manager"
  # sudo resets the environment by default (env_reset), so an exported
  # DEBIAN_FRONTEND does not reach apt/dpkg when they run under sudo. The
  # variable is therefore also set explicitly, via env, on every sudo command.
  export DEBIAN_FRONTEND=noninteractive
  sudo env DEBIAN_FRONTEND=noninteractive apt-get update -y
  # unattended-upgrades applies only the security pocket by default and will
  # upgrade linux-image-* (kernel) packages when needed.
  sudo env DEBIAN_FRONTEND=noninteractive apt-get install -y unattended-upgrades
  sudo env DEBIAN_FRONTEND=noninteractive unattended-upgrade -v
else
  echo "ERROR: no supported package manager found (dnf/yum/apt-get)" >&2
  exit 1
fi

echo "===== System security patching completed on $(hostname) ====="