From 4547e36672c44ecc7a62aee210ea17538cbc4c41 Mon Sep 17 00:00:00 2001 From: Tarun-SnoDx Date: Tue, 17 Feb 2026 10:48:35 -0800 Subject: [PATCH 01/22] Update model_checkpoint.py asyncronous|asynchronous --- resiliency/callbacks/model_checkpoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resiliency/callbacks/model_checkpoint.py b/resiliency/callbacks/model_checkpoint.py index fccfb4c..8daa581 100644 --- a/resiliency/callbacks/model_checkpoint.py +++ b/resiliency/callbacks/model_checkpoint.py @@ -46,7 +46,7 @@ class ModelCheckpoint(PTLModelCheckpoint): """Light wrapper around Lightning's ModelCheckpoint to force a saved checkpoint on train_end. - Adds support for asyncronous checkpointing and provides some additional logic + Adds support for asynchronous checkpointing and provides some additional logic to clean up invalid checkpoints Args: From 4a42b47ebb1b8b43ac36cd2312f336f8f5c75793 Mon Sep 17 00:00:00 2001 From: Tarun-SnoDx Date: Tue, 17 Feb 2026 15:16:39 -0800 Subject: [PATCH 02/22] Update launcher.py attemps|attempts --- third_party/nvidia-resiliency-ext/launcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/nvidia-resiliency-ext/launcher.py b/third_party/nvidia-resiliency-ext/launcher.py index 9da6c34..4bb3bae 100644 --- a/third_party/nvidia-resiliency-ext/launcher.py +++ b/third_party/nvidia-resiliency-ext/launcher.py @@ -1978,7 +1978,7 @@ def get_use_env(args) -> bool: def _get_logs_specs_class(logs_specs_name: Optional[str]) -> Type[LogsSpecs]: - """Attemps to load `torchrun.logs_spec` entrypoint with key of `logs_specs_name` param. + """attempts to load `torchrun.logs_spec` entrypoint with key of `logs_specs_name` param. Provides plugin mechanism to provide custom implementation of LogsSpecs. 
From 76fdd0e2f69111e155fd565982812ab99318f786 Mon Sep 17 00:00:00 2001 From: Tarun-SnoDx Date: Tue, 17 Feb 2026 15:17:24 -0800 Subject: [PATCH 03/22] Update launcher.py attemps|attempts --- third_party/nvidia-resiliency-ext/v0.4.1/launcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/nvidia-resiliency-ext/v0.4.1/launcher.py b/third_party/nvidia-resiliency-ext/v0.4.1/launcher.py index e191e21..f2bdbdc 100644 --- a/third_party/nvidia-resiliency-ext/v0.4.1/launcher.py +++ b/third_party/nvidia-resiliency-ext/v0.4.1/launcher.py @@ -2017,7 +2017,7 @@ def get_use_env(args) -> bool: def _get_logs_specs_class(logs_specs_name: Optional[str]) -> Type[LogsSpecs]: """ - Attemps to load `torchrun.logs_spec` entrypoint with key of `logs_specs_name` param. + Attempts to load `torchrun.logs_spec` entrypoint with key of `logs_specs_name` param. Provides plugin mechanism to provide custom implementation of LogsSpecs. Returns `DefaultLogsSpecs` when logs_spec_name is None. From d5ab1e73ef56e3fdcbdf0c3002848961735f5fab Mon Sep 17 00:00:00 2001 From: Tarun-SnoDx Date: Tue, 17 Feb 2026 15:20:07 -0800 Subject: [PATCH 04/22] Update launcher.py attemps|attempts --- third_party/nvidia-resiliency-ext/launcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/nvidia-resiliency-ext/launcher.py b/third_party/nvidia-resiliency-ext/launcher.py index 4bb3bae..565645f 100644 --- a/third_party/nvidia-resiliency-ext/launcher.py +++ b/third_party/nvidia-resiliency-ext/launcher.py @@ -1978,7 +1978,7 @@ def get_use_env(args) -> bool: def _get_logs_specs_class(logs_specs_name: Optional[str]) -> Type[LogsSpecs]: - """attempts to load `torchrun.logs_spec` entrypoint with key of `logs_specs_name` param. + """Attempts to load `torchrun.logs_spec` entrypoint with key of `logs_specs_name` param. Provides plugin mechanism to provide custom implementation of LogsSpecs. 
From c0fc1cc65eea9ee9378a2219c3892df5fe45890b Mon Sep 17 00:00:00 2001 From: Tarun-SnoDx Date: Tue, 17 Feb 2026 15:21:18 -0800 Subject: [PATCH 05/22] Update torchrun.py attemps|attempts --- third_party/torch/torchrun.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/torch/torchrun.py b/third_party/torch/torchrun.py index 4622b95..4630945 100644 --- a/third_party/torch/torchrun.py +++ b/third_party/torch/torchrun.py @@ -1476,7 +1476,7 @@ def get_use_env(args) -> bool: def _get_logs_specs_class(logs_specs_name: Optional[str]) -> type[LogsSpecs]: """ - Attemps to load `torchrun.logs_spec` entrypoint with key of `logs_specs_name` param. + Attempts to load `torchrun.logs_spec` entrypoint with key of `logs_specs_name` param. Provides plugin mechanism to provide custom implementation of LogsSpecs. Returns `DefaultLogsSpecs` when logs_spec_name is None. From 7b674b0cbdf3bf7dda59028902ae52be36bf4e3c Mon Sep 17 00:00:00 2001 From: Tarun-SnoDx Date: Tue, 17 Feb 2026 15:22:58 -0800 Subject: [PATCH 06/22] Update launcher.py behaviour|behavior --- third_party/nvidia-resiliency-ext/v0.4.1/launcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/nvidia-resiliency-ext/v0.4.1/launcher.py b/third_party/nvidia-resiliency-ext/v0.4.1/launcher.py index f2bdbdc..2aeb747 100644 --- a/third_party/nvidia-resiliency-ext/v0.4.1/launcher.py +++ b/third_party/nvidia-resiliency-ext/v0.4.1/launcher.py @@ -374,7 +374,7 @@ def _invoke_run_with_any_failed_policy(self, role: str = DEFAULT_ROLE) -> RunRes else: self._stop_workers(self._worker_group) self._worker_group.state = WorkerState.FAILED - # to preserve torchrun's behaviour, should not return WorkerState.UNHEALTHY. + # to preserve torchrun's behavior, should not return WorkerState.UNHEALTHY. # we use WorkerState.UNHEALTHY to denote a worker group that is still # running but has some failed workers. 
torchrun does not use WorkerState.UNHEALTHY run_result = self._monitor_workers(self._worker_group) From e5d37aa0211f7baa4ecc73b8670a1567f2f0cf17 Mon Sep 17 00:00:00 2001 From: Tarun-SnoDx Date: Tue, 17 Feb 2026 15:24:05 -0800 Subject: [PATCH 07/22] Update README.md benifits|benefits --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 21d37bc..1a97a2f 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ The Resiliency library provides resilient training code used to improve training The scale of large distributed AI training jobs results in frequent failures and interruptions. This library exists to help minimize the impact of failures and interruptions on training, ultimately improving training goodput for training workloads on Google Cloud. ### Hierarchy of ML frameworks within this library -This library intertwines several popular ML frameworks and adds features on top of them. The following diagram provides a visual representation of the ML resiliency stack and the benifits that come with each layer: +This library intertwines several popular ML frameworks and adds features on top of them. The following diagram provides a visual representation of the ML resiliency stack and the benefits that come with each layer: ![Diagram of the ML Resiliency Stack](docs/media/ml_resiliency_stack_diagram.png) @@ -86,4 +86,4 @@ python3 benchmark/benchmark.py --help ``` ## Help -If you run into any issues when using this library, please file an issue or create a pull request. \ No newline at end of file +If you run into any issues when using this library, please file an issue or create a pull request. 
From 9a620163adb91d241957921f52682b39f48f6351 Mon Sep 17 00:00:00 2001 From: Tarun-SnoDx Date: Tue, 17 Feb 2026 15:25:51 -0800 Subject: [PATCH 08/22] Update launcher.py cutomized|customized --- third_party/nvidia-resiliency-ext/launcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/nvidia-resiliency-ext/launcher.py b/third_party/nvidia-resiliency-ext/launcher.py index 565645f..3a95c04 100644 --- a/third_party/nvidia-resiliency-ext/launcher.py +++ b/third_party/nvidia-resiliency-ext/launcher.py @@ -595,7 +595,7 @@ def _start_workers(self, worker_group: WorkerGroup) -> Dict[int, Any]: return self._pcontext.pids() def _patch_pcontext_close(self, pcontext: PContext) -> None: - # replace PContext._close with our version that has cutomized timeout + # replace PContext._close with our version that has customized timeout # this ensures that the workers have enough time between SIGTERM and SIGKILL orig_close = pcontext._close From 58aaf85c870a49880a996b4283783e5274a62d35 Mon Sep 17 00:00:00 2001 From: Tarun-SnoDx Date: Tue, 17 Feb 2026 15:26:51 -0800 Subject: [PATCH 09/22] Update async_utils.py explictily|explicitly --- third_party/megatron/async_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/megatron/async_utils.py b/third_party/megatron/async_utils.py index 3582861..47f93aa 100644 --- a/third_party/megatron/async_utils.py +++ b/third_party/megatron/async_utils.py @@ -431,7 +431,7 @@ def async_loop( """Main function for the persistent checkpoint worker The persisent worker is created once and terminated at exit or - when application calls `close()` explictily + when application calls `close()` explicitly This routine receives `AsyncRequest` and does `preload_fn` first and put the integer value in `preload_q` to inform the trainer to proceed. 
From 7999611625d7847ba53433fd87d53e2991b5a2bb Mon Sep 17 00:00:00 2001 From: Tarun-SnoDx Date: Tue, 17 Feb 2026 15:28:02 -0800 Subject: [PATCH 10/22] Update weekend-exp-mt.md experment|experiment --- docs/weekend-exp-mt.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/weekend-exp-mt.md b/docs/weekend-exp-mt.md index 28e1880..1748a43 100644 --- a/docs/weekend-exp-mt.md +++ b/docs/weekend-exp-mt.md @@ -1,4 +1,4 @@ -# Weekend Experment +# Weekend Experiment - Create a bug under this [component](https://b.corp.google.com/issues/new?component=1784216&template=0), the bug should include head commit hash of the repo, the cmd started the exp, any cmd used to restart the exp, and the final goodput analysis report. From 11ef706b892935db3835854985efd127168c8a90 Mon Sep 17 00:00:00 2001 From: Tarun-SnoDx Date: Tue, 17 Feb 2026 15:28:53 -0800 Subject: [PATCH 11/22] Update README.md initated|initiated --- examples/supervisor_quickstart/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/supervisor_quickstart/README.md b/examples/supervisor_quickstart/README.md index 6986829..9207843 100644 --- a/examples/supervisor_quickstart/README.md +++ b/examples/supervisor_quickstart/README.md @@ -144,7 +144,7 @@ RUN pip install google-cloud-resiliency-supervisor Note: This example assumes a Python 3.10 environment. The path to local_elastic_agent.py might differ slightly based on your Python version or base image. The ARG command dynamically finds the correct path. ## Step 3: Run Your PyTorch Workload -With the patched Docker image, you can now run your training job. To activate the supervisor client, you must set the `GCP_HOST_DAEMON_PORT` environment variable when launching your workload. If `GCP_HOST_DAEMON_PORT`, the supervisor client will not be initated by default. +With the patched Docker image, you can now run your training job. 
To activate the supervisor client, you must set the `GCP_HOST_DAEMON_PORT` environment variable when launching your workload. If `GCP_HOST_DAEMON_PORT` is not set, the supervisor client will not be initiated by default. The client code uses this port to connect to the `supervisor` daemon running on the same node. From 2d3e401b8145cc28141479e231ad17360a879914 Mon Sep 17 00:00:00 2001 From: Tarun-SnoDx Date: Tue, 17 Feb 2026 15:30:05 -0800 Subject: [PATCH 12/22] Update replication_utils.py laod|load --- resiliency/plugins/replication_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resiliency/plugins/replication_utils.py b/resiliency/plugins/replication_utils.py index 8ee9739..4d0f20a 100644 --- a/resiliency/plugins/replication_utils.py +++ b/resiliency/plugins/replication_utils.py @@ -470,7 +470,7 @@ def read_data( target_tensor.copy_(tensor) planner.commit_tensor(req, target_tensor) elif self.replication_coordinator is not None: - # For every request, this rank will need to receive data before being able to laod it. + # For every request, this rank will need to receive data before being able to load it. # This is a no-op if rank does not have any data to receive. for req in reqs: if req.type == LoadItemType.BYTE_IO: From 8520469775fd41235fdbd281046b99502777866a Mon Sep 17 00:00:00 2001 From: Tarun-SnoDx Date: Tue, 17 Feb 2026 15:31:11 -0800 Subject: [PATCH 13/22] Update high_scale_ckpt_utils.py laoder|loader --- resiliency/high_scale_ckpt_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resiliency/high_scale_ckpt_utils.py b/resiliency/high_scale_ckpt_utils.py index bd5ad47..36f9aa9 100644 --- a/resiliency/high_scale_ckpt_utils.py +++ b/resiliency/high_scale_ckpt_utils.py @@ -136,7 +136,7 @@ def _extract_step(f): _wait_config_file_disappear(replicator_config_path, timeout_s) handle_replicator_fail_situation(directory) - # TODO: will use this feature when enabling data laoder ckpt. 
+ # TODO: will use this feature when enabling data loader ckpt. process_replicator_prerestore(directory) for _ in range(timeout_s): From 30decbf09e661abe56e8e498423015c5fd56d6fa Mon Sep 17 00:00:00 2001 From: Tarun-SnoDx Date: Tue, 17 Feb 2026 15:32:20 -0800 Subject: [PATCH 14/22] Update benchmark.py loging|logging --- benchmark/benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/benchmark.py b/benchmark/benchmark.py index ff03f9e..099f1cb 100644 --- a/benchmark/benchmark.py +++ b/benchmark/benchmark.py @@ -481,7 +481,7 @@ def main(): # Add the new handler to the logger pl_logger.addHandler(console_handler) - # loging to /tmp folder by default + # logging to /tmp folder by default log_file_dir = f"/tmp/{args.job_name}/log" if args.log_to_remote_storage: log_file_dir = Path(args.log_dir) / args.job_name / "log" From a90bf0c22841bb074a71410f6c982357e672b8dc Mon Sep 17 00:00:00 2001 From: Tarun-SnoDx Date: Tue, 17 Feb 2026 15:34:56 -0800 Subject: [PATCH 15/22] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1a97a2f..a550f48 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ - [Help](#help) ## Overview -The Resiliency library provides resilient training code used to improve training goodput (aka training efficiency) of distributed workloads. For those unfamiliar with the term "goodput", please read [Google Cloud Goodput Blogpost](https://cloud.google.com/blog/products/ai-machine-learning/goodput-metric-as-measure-of-ml-productivity). This library provides resilency features built on-top of [NVIDIA NeMo 2.0](https://github.com/NVIDIA/NeMo) training recipes. Reproducible benchmarks demonstrating high training goodput using this library are defined in [AI-Hypercomputer/gpu-recipes](https://github.com/AI-Hypercomputer/gpu-recipes). 
+The Resiliency library provides resilient training code used to improve training goodput (aka training efficiency) of distributed workloads. For those unfamiliar with the term "goodput", please read [Google Cloud Goodput Blogpost](https://cloud.google.com/blog/products/ai-machine-learning/goodput-metric-as-measure-of-ml-productivity). This library provides resiliency features built on-top of [NVIDIA NeMo 2.0](https://github.com/NVIDIA/NeMo) training recipes. Reproducible benchmarks demonstrating high training goodput using this library are defined in [AI-Hypercomputer/gpu-recipes](https://github.com/AI-Hypercomputer/gpu-recipes). ### Motivation The scale of large distributed AI training jobs results in frequent failures and interruptions. This library exists to help minimize the impact of failures and interruptions on training, ultimately improving training goodput for training workloads on Google Cloud. @@ -45,7 +45,7 @@ Holistic cluster control is supported via the Supervisor. The supervisor compris This library includes the client facing portion of the Supervisor in [`supervisor`](supervisor). One instance of the sensor, controller, and actuator should be deployed on CPU hosts per cluster. Each training host (GPUs) adjacent to the training workload should also run [`host_daemon.py`](supervisor/host_daemon.py) as a daemon deployment. The host daemon deployment on each host is responsible for monitoring the training processes on the same host and will relay all signal to the sensor deployment. -The Supervisor builds upon the resiliency support provided by [NVIDIA's NvRx library](https://github.com/NVIDIA/nvidia-resiliency-ext). In particular, it utilizes the heartbeating mechansisms defined in NvRx as a signal to whether each node is training as expected. When interruptions occur, the training job relies on the in-job restart support provided by NvRx. 
+The Supervisor builds upon the resiliency support provided by [NVIDIA's NvRx library](https://github.com/NVIDIA/nvidia-resiliency-ext). In particular, it utilizes the heartbeating mechanisms defined in NvRx as a signal to whether each node is training as expected. When interruptions occur, the training job relies on the in-job restart support provided by NvRx. Once the Supervisor is deployed, please include launcher flag `--use-supervisor` to attach the training job to the Supervisor. ## Tools From e8c9dcabbe56c0484afdd41a4d1e05f776804e7a Mon Sep 17 00:00:00 2001 From: Tarun-SnoDx Date: Tue, 17 Feb 2026 15:36:49 -0800 Subject: [PATCH 16/22] Update combined_functionality.py occuring|occurring --- resiliency/plugins/combined_functionality.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resiliency/plugins/combined_functionality.py b/resiliency/plugins/combined_functionality.py index 2a1b3e4..ee9054f 100644 --- a/resiliency/plugins/combined_functionality.py +++ b/resiliency/plugins/combined_functionality.py @@ -618,7 +618,7 @@ def load( pyt_state_dict = mcore_to_pyt_state_dict(sharded_state_dict, True) # Define planner. If replication coordinator is present, use FlexibleLoadPlanner. - # This will prevent errors from occuring if local checkpoint is missing. + # This will prevent errors from occurring if local checkpoint is missing. 
if self.replication_coordinator is not None: planner = ReplicationLoadPlanner( shapes_validation_sharded_tensors=flexible_shape_sharded_tensors, From 79cba9eabacb8a84faaaf2efd262426abce96599 Mon Sep 17 00:00:00 2001 From: Tarun-SnoDx Date: Tue, 17 Feb 2026 15:37:38 -0800 Subject: [PATCH 17/22] Update in_cluster_local_ckpt.py occuring|occurring --- resiliency/plugins/in_cluster_local_ckpt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resiliency/plugins/in_cluster_local_ckpt.py b/resiliency/plugins/in_cluster_local_ckpt.py index 68a1a8f..4f75975 100644 --- a/resiliency/plugins/in_cluster_local_ckpt.py +++ b/resiliency/plugins/in_cluster_local_ckpt.py @@ -563,7 +563,7 @@ def load( pyt_state_dict = mcore_to_pyt_state_dict(sharded_state_dict, True) # Define planner. If replication coordinator is present, use FlexibleLoadPlanner. - # This will prevent errors from occuring if local checkpoint is missing. + # This will prevent errors from occurring if local checkpoint is missing. if self.replication_coordinator is not None: planner = ReplicationLoadPlanner( shapes_validation_sharded_tensors=flexible_shape_sharded_tensors, From 01f7c1d44f742be8bd78fa5a470072802898f39f Mon Sep 17 00:00:00 2001 From: Tarun-SnoDx Date: Tue, 17 Feb 2026 15:42:19 -0800 Subject: [PATCH 18/22] Update replication_utils.py throguh|through --- resiliency/plugins/replication_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resiliency/plugins/replication_utils.py b/resiliency/plugins/replication_utils.py index 4d0f20a..4659330 100644 --- a/resiliency/plugins/replication_utils.py +++ b/resiliency/plugins/replication_utils.py @@ -629,7 +629,7 @@ def get_replication_coordinator( 3) (1) and (2) is collected for all ranks. 4) Determine mapping of ranks <-> local checkpoint rank 5) Determine all other ranks have replicated state of this rank. - 6) With this information, we iterate throguh all ranks again. 
For each rank, + 6) With this information, we iterate through all ranks again. For each rank, calculate: a) if rank already has access to its own checkpoint: this rank does not need to receive. From c6af40859edaa3c057bb12d92e1e5dd8d2717808 Mon Sep 17 00:00:00 2001 From: Tarun-SnoDx Date: Tue, 17 Feb 2026 15:43:11 -0800 Subject: [PATCH 19/22] Update replication_utils.py repliated|replicated --- resiliency/plugins/replication_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resiliency/plugins/replication_utils.py b/resiliency/plugins/replication_utils.py index 4659330..18655d8 100644 --- a/resiliency/plugins/replication_utils.py +++ b/resiliency/plugins/replication_utils.py @@ -639,7 +639,7 @@ def get_replication_coordinator( i) if this rank's checkpoint exists somewhere (using (4)), determine that rank with checkpoint as source ii) if this rank's checkpoint does not exist somewhere (using (4)), - determine smallest rank with repliated state as source + determine smallest rank with replicated state as source smallest peer rank with replicated state will be source of broadcast to all ranks that need replicated state NOTE: if no replicated state has source, then we cannot rely on From 4275a37067420f9c0460b9c40aa1aa9dc3fcbf36 Mon Sep 17 00:00:00 2001 From: Tarun-SnoDx Date: Tue, 17 Feb 2026 15:44:17 -0800 Subject: [PATCH 20/22] Update model_checkpoint.py reinintialize|reinitialize --- resiliency/callbacks/model_checkpoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resiliency/callbacks/model_checkpoint.py b/resiliency/callbacks/model_checkpoint.py index 8daa581..b6f60f8 100644 --- a/resiliency/callbacks/model_checkpoint.py +++ b/resiliency/callbacks/model_checkpoint.py @@ -69,7 +69,7 @@ class ModelCheckpoint(PTLModelCheckpoint): save_optim_on_train_end: Whether to include the optimizer states in the final checkpoint at the end of training. Only applicable when save_weights_only is ``False``. 
- always_save_context: Whether to dump the artifacts needed to reinintialize + always_save_context: Whether to dump the artifacts needed to reinitialize the current model, trainer, and dataloader to allow for reproducibility of experiments. save_context_on_train_end: Whether to dump the artifacts on_train_end From ee2e382c06799e0291d1111076972852dd59698d Mon Sep 17 00:00:00 2001 From: Tarun-SnoDx Date: Tue, 17 Feb 2026 15:45:06 -0800 Subject: [PATCH 21/22] Update comm_overlap.py overriden|overridden --- resiliency/callbacks/comm_overlap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resiliency/callbacks/comm_overlap.py b/resiliency/callbacks/comm_overlap.py index bd91089..6d92400 100644 --- a/resiliency/callbacks/comm_overlap.py +++ b/resiliency/callbacks/comm_overlap.py @@ -148,7 +148,7 @@ def _get_model_comm_overlap_cfgs( if vp_size is None: vp_size = 1 - # Optimizations disabled by default, can be overriden by user + # Optimizations disabled by default, can be overridden by user comm_overlap_cfg.tp_comm_overlap = False comm_overlap_cfg.tp_comm_overlap_cfg = None comm_overlap_cfg.tp_comm_bootstrap_backend = None From 48ab1c02c27d5a28662402839bf442983b7b7923 Mon Sep 17 00:00:00 2001 From: Tarun-SnoDx Date: Tue, 17 Feb 2026 15:45:59 -0800 Subject: [PATCH 22/22] Update async_utils.py persisent|persistent --- third_party/megatron/async_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/megatron/async_utils.py b/third_party/megatron/async_utils.py index 47f93aa..bf664d6 100644 --- a/third_party/megatron/async_utils.py +++ b/third_party/megatron/async_utils.py @@ -430,7 +430,7 @@ def async_loop( ): """Main function for the persistent checkpoint worker - The persisent worker is created once and terminated at exit or + The persistent worker is created once and terminated at exit or when application calls `close()` explicitly This routine receives `AsyncRequest` and does `preload_fn` first and