From a0ae635b930535129ec2d2e04bc671ce3febb81d Mon Sep 17 00:00:00 2001
From: Jaime Fraustro <jaime.fraustro.valdez@intel.com>
Date: Fri, 21 Feb 2025 11:42:11 -0600
Subject: [PATCH 1/6] Add support for Intel GPU to MNIST examples

* Add support for Intel GPU to MNIST example
* Add support for Intel GPU to MNIST Forward-Forward example
* Add support for Intel GPU to MNIST using RNN example and update README with optional arguments
* Refactor argument parsing in MNIST examples. There is no need to use `default=False` with `store_true`

Signed-off-by: jafraustro <jaime.fraustro.valdez@intel.com>
---
 mnist/main.py                   | 13 +++++++++----
 mnist_forward_forward/README.md |  1 +
 mnist_forward_forward/main.py   | 12 ++++++++----
 mnist_rnn/README.md             | 17 +++++++++++++++++
 mnist_rnn/main.py               | 10 +++++++---
 5 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/mnist/main.py b/mnist/main.py
index 184dc4744f..09487639d4 100644
--- a/mnist/main.py
+++ b/mnist/main.py
@@ -82,21 +82,24 @@ def main():
                         help='learning rate (default: 1.0)')
     parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                         help='Learning rate step gamma (default: 0.7)')
-    parser.add_argument('--no-cuda', action='store_true', default=False,
+    parser.add_argument('--no-cuda', action='store_true',
                         help='disables CUDA training')
-    parser.add_argument('--no-mps', action='store_true', default=False,
+    parser.add_argument('--no-mps', action='store_true',
                         help='disables macOS GPU training')
-    parser.add_argument('--dry-run', action='store_true', default=False,
+    parser.add_argument('--no-xpu', action='store_true',
+                        help='disables Intel GPU training')
+    parser.add_argument('--dry-run', action='store_true',
                         help='quickly check a single pass')
     parser.add_argument('--seed', type=int, default=1, metavar='S',
                         help='random seed (default: 1)')
     parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                         help='how many batches to wait before logging training status')
-    parser.add_argument('--save-model', action='store_true', default=False,
+    parser.add_argument('--save-model', action='store_true',
                         help='For Saving the current Model')
     args = parser.parse_args()
     use_cuda = not args.no_cuda and torch.cuda.is_available()
     use_mps = not args.no_mps and torch.backends.mps.is_available()
+    use_xpu = not args.no_xpu and torch.xpu.is_available()
 
     torch.manual_seed(args.seed)
 
@@ -104,6 +107,8 @@ def main():
         device = torch.device("cuda")
     elif use_mps:
         device = torch.device("mps")
+    elif use_xpu:
+        device = torch.device("xpu")
     else:
         device = torch.device("cpu")
 
diff --git a/mnist_forward_forward/README.md b/mnist_forward_forward/README.md
index f6ae12e56d..8857c9a6fb 100644
--- a/mnist_forward_forward/README.md
+++ b/mnist_forward_forward/README.md
@@ -18,6 +18,7 @@ optional arguments:
   --lr LR               learning rate (default: 0.03)
   --no_cuda             disables CUDA training
   --no_mps              disables MPS training
+  --no_xpu              disables XPU training
   --seed SEED           random seed (default: 1)
   --save_model          For saving the current Model
   --train_size TRAIN_SIZE
diff --git a/mnist_forward_forward/main.py b/mnist_forward_forward/main.py
index a175126067..e6c2902ed8 100644
--- a/mnist_forward_forward/main.py
+++ b/mnist_forward_forward/main.py
@@ -102,10 +102,13 @@ def train(self, x_pos, x_neg):
         help="learning rate (default: 0.03)",
     )
     parser.add_argument(
-        "--no_cuda", action="store_true", default=False, help="disables CUDA training"
+        "--no_cuda", action="store_true", help="disables CUDA training"
     )
     parser.add_argument(
-        "--no_mps", action="store_true", default=False, help="disables MPS training"
+        "--no_mps", action="store_true", help="disables MPS training"
+    )
+    parser.add_argument(
+        "--no_xpu", action="store_true", help="disables XPU training"
     )
     parser.add_argument(
         "--seed", type=int, default=1, metavar="S", help="random seed (default: 1)"
@@ -113,7 +116,6 @@ def train(self, x_pos, x_neg):
     parser.add_argument(
         "--save_model",
         action="store_true",
-        default=False,
         help="For saving the current Model",
     )
     parser.add_argument(
@@ -126,7 +128,6 @@ def train(self, x_pos, x_neg):
     parser.add_argument(
         "--save-model",
         action="store_true",
-        default=False,
         help="For Saving the current Model",
     )
     parser.add_argument(
@@ -139,10 +140,13 @@ def train(self, x_pos, x_neg):
     args = parser.parse_args()
     use_cuda = not args.no_cuda and torch.cuda.is_available()
     use_mps = not args.no_mps and torch.backends.mps.is_available()
+    use_xpu = not args.no_xpu and torch.xpu.is_available()
     if use_cuda:
         device = torch.device("cuda")
     elif use_mps:
         device = torch.device("mps")
+    elif use_xpu:
+        device = torch.device("xpu")
     else:
         device = torch.device("cpu")
 
diff --git a/mnist_rnn/README.md b/mnist_rnn/README.md
index c879cb367f..ba63513711 100644
--- a/mnist_rnn/README.md
+++ b/mnist_rnn/README.md
@@ -8,3 +8,20 @@ pip install -r requirements.txt
 python main.py
 # CUDA_VISIBLE_DEVICES=2 python main.py  # to specify GPU id to ex. 2
 ```
+
+```bash
+optional arguments:
+  -h, --help            show this help message and exit
+  --batch_size          input batch size for training (default: 64)
+  --testing_batch_size  input batch size for testing (default: 1000)
+  --epochs EPOCHS       number of epochs to train (default: 14)
+  --lr LR               learning rate (default: 0.1)
+  --gamma               learning rate step gamma (default: 0.7)
+  --cuda                enables CUDA training
+  --xpu                 enables XPU training
+  --mps                 enables macOS GPU training
+  --seed SEED           random seed (default: 1)
+  --save-model          For saving the current Model
+  --log-interval        how many batches to wait before logging training status
+  --dry-run             quickly check a single pass
+```
\ No newline at end of file
diff --git a/mnist_rnn/main.py b/mnist_rnn/main.py
index 2fa64c00d6..f6c1ff3d48 100644
--- a/mnist_rnn/main.py
+++ b/mnist_rnn/main.py
@@ -93,15 +93,17 @@ def main():
                         help='learning rate step gamma (default: 0.7)')
     parser.add_argument('--cuda', action='store_true', default=False,
                         help='enables CUDA training')
-    parser.add_argument('--mps', action="store_true", default=False,
+    parser.add_argument('--mps', action="store_true", 
                         help="enables MPS training")
-    parser.add_argument('--dry-run', action='store_true', default=False,
+    parser.add_argument('--xpu', action='store_true',
+                        help='enables XPU training')
+    parser.add_argument('--dry-run', action='store_true',
                         help='quickly check a single pass')
     parser.add_argument('--seed', type=int, default=1, metavar='S',
                         help='random seed (default: 1)')
     parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                         help='how many batches to wait before logging training status')
-    parser.add_argument('--save-model', action='store_true', default=False,
+    parser.add_argument('--save-model', action='store_true',
                         help='for Saving the current Model')
     args = parser.parse_args()
 
@@ -109,6 +111,8 @@ def main():
         device = "cuda"
     elif args.mps and not args.cuda:
         device = "mps"
+    elif args.xpu:
+        device = "xpu"
     else:
         device = "cpu"
 

From 4a2e3e30abd3dc91a76585d2ca47b84a8824b6b7 Mon Sep 17 00:00:00 2001
From: eromomon <edgar.romo.montiel@intel.com>
Date: Fri, 21 Feb 2025 11:44:24 -0600
Subject: [PATCH 2/6] Add support for Intel GPU to Basic VAE example

* Add support for Intel GPU to Basic VAE example and update README with optional arguments
* Remove `default=False` from `store_true` arguments
* Fix typo in Readme
---
 vae/README.md |  7 ++++---
 vae/main.py   | 11 +++++++++--
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/vae/README.md b/vae/README.md
index cda6a33672..e2a432fd1e 100644
--- a/vae/README.md
+++ b/vae/README.md
@@ -14,8 +14,9 @@ The main.py script accepts the following arguments:
 optional arguments:
   --batch-size		input batch size for training (default: 128)
   --epochs		number of epochs to train (default: 10)
-  --no-cuda		enables CUDA training
-  --mps         enables GPU on macOS
+  --no-cuda		disables CUDA training
+  --no-mps		disables GPU on macOS
+  --no-xpu		disables XPU training on Intel GPUs
   --seed		random seed (default: 1)
   --log-interval	how many batches to wait before logging training status
-```
\ No newline at end of file
+```
diff --git a/vae/main.py b/vae/main.py
index d69833fbe0..f7915b9ced 100644
--- a/vae/main.py
+++ b/vae/main.py
@@ -13,10 +13,12 @@
                     help='input batch size for training (default: 128)')
 parser.add_argument('--epochs', type=int, default=10, metavar='N',
                     help='number of epochs to train (default: 10)')
-parser.add_argument('--no-cuda', action='store_true', default=False,
+parser.add_argument('--no-cuda', action='store_true',
                     help='disables CUDA training')
-parser.add_argument('--no-mps', action='store_true', default=False,
+parser.add_argument('--no-mps', action='store_true',
                         help='disables macOS GPU training')
+parser.add_argument('--no-xpu', action='store_true',
+                        help='disables Intel XPU training')
 parser.add_argument('--seed', type=int, default=1, metavar='S',
                     help='random seed (default: 1)')
 parser.add_argument('--log-interval', type=int, default=10, metavar='N',
@@ -24,6 +26,7 @@
 args = parser.parse_args()
 args.cuda = not args.no_cuda and torch.cuda.is_available()
 use_mps = not args.no_mps and torch.backends.mps.is_available()
+use_xpu = not args.no_xpu and torch.xpu.is_available()
 
 torch.manual_seed(args.seed)
 
@@ -31,9 +34,13 @@
     device = torch.device("cuda")
 elif use_mps:
     device = torch.device("mps")
+elif use_xpu:
+    device = torch.device("xpu")
 else:
     device = torch.device("cpu")
 
+print('Device to use: ', device)
+
 kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
 train_loader = torch.utils.data.DataLoader(
     datasets.MNIST('../data', train=True, download=True,

From 82129918d78d1b593f11ce66f827122734c77b71 Mon Sep 17 00:00:00 2001
From: eromomon <edgar.romo.montiel@intel.com>
Date: Fri, 21 Feb 2025 11:47:29 -0600
Subject: [PATCH 3/6] Add support for Intel GPU to Siamese Network example

---
 siamese_network/README.md | 37 ++++++++++++++++++++++++++++++++++++-
 siamese_network/main.py   | 15 +++++++++++----
 2 files changed, 47 insertions(+), 5 deletions(-)

diff --git a/siamese_network/README.md b/siamese_network/README.md
index 973a0414a4..19b19f0e76 100644
--- a/siamese_network/README.md
+++ b/siamese_network/README.md
@@ -1,7 +1,42 @@
 # Siamese Network Example
 
+Siamese network for image similarity estimation.
+The network is composed of two identical networks, one for each input.
+The output of each network is concatenated and passed to a linear layer.
+The output of the linear layer is passed through a sigmoid function.
+[FaceNet](https://arxiv.org/pdf/1503.03832.pdf) is a variant of the Siamese network.
+This implementation varies from FaceNet as we use the `ResNet-18` model from
+[Deep Residual Learning for Image Recognition](https://arxiv.org/pdf/1512.03385.pdf) as our feature extractor.
+In addition, we aren't using `TripletLoss` as the MNIST dataset is simple, so `BCELoss` can do the trick.
+
 ```bash
 pip install -r requirements.txt
 python main.py
-# CUDA_VISIBLE_DEVICES=2 python main.py  # to specify GPU id to ex. 2
 ```
+
+Optionally, you can add the following arguments to customize your execution.
+
+```bash
+--batch-size            input batch size for training (default: 64)
+--test-batch-size       input batch size for testing (default: 1000)
+--epochs                number of epochs to train (default: 14)
+--lr                    learning rate (default: 1.0)
+--gamma                 learning rate step gamma (default: 0.7)
+--no-cuda               disables CUDA training
+--no-xpu                disables XPU training
+--no-mps                disables macOS GPU training
+--dry-run               quickly check a single pass
+--seed                  random seed (default: 1)
+--log-interval          how many batches to wait before logging training status
+--save-model            Saving the current Model
+```
+
+If a GPU device (CUDA, XPU, or MPS) is detected, the example will be executed on the GPU by default; otherwise, it will run on the CPU.
+
+To disable the GPU option, add the appropriate argument to the command. For example:
+
+```bash
+python main.py --no-xpu
+```
+
+This command will execute the example on the CPU even if your system successfully detects an XPU.
diff --git a/siamese_network/main.py b/siamese_network/main.py
index 8f420a9b01..6bd55235e2 100644
--- a/siamese_network/main.py
+++ b/siamese_network/main.py
@@ -247,32 +247,39 @@ def main():
                         help='learning rate (default: 1.0)')
     parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                         help='Learning rate step gamma (default: 0.7)')
-    parser.add_argument('--no-cuda', action='store_true', default=False,
+    parser.add_argument('--no-cuda', action='store_true',
                         help='disables CUDA training')
-    parser.add_argument('--no-mps', action='store_true', default=False,
+    parser.add_argument('--no-xpu', action='store_true',
+                        help='disables XPU training')
+    parser.add_argument('--no-mps', action='store_true',
                         help='disables macOS GPU training')
-    parser.add_argument('--dry-run', action='store_true', default=False,
+    parser.add_argument('--dry-run', action='store_true',
                         help='quickly check a single pass')
     parser.add_argument('--seed', type=int, default=1, metavar='S',
                         help='random seed (default: 1)')
     parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                         help='how many batches to wait before logging training status')
-    parser.add_argument('--save-model', action='store_true', default=False,
+    parser.add_argument('--save-model', action='store_true',
                         help='For Saving the current Model')
     args = parser.parse_args()
     
     use_cuda = not args.no_cuda and torch.cuda.is_available()
+    use_xpu = not args.no_xpu and torch.xpu.is_available()
     use_mps = not args.no_mps and torch.backends.mps.is_available()
 
     torch.manual_seed(args.seed)
 
     if use_cuda:
         device = torch.device("cuda")
+    elif use_xpu:
+        device = torch.device("xpu")
     elif use_mps:
         device = torch.device("mps")
     else:
         device = torch.device("cpu")
 
+    print('Device to use: ', device)
+
     train_kwargs = {'batch_size': args.batch_size}
     test_kwargs = {'batch_size': args.test_batch_size}
     if use_cuda:

From dcaff04e5138f6d1238d0c08bf0ec720a6447b2d Mon Sep 17 00:00:00 2001
From: eromomon <edgar.romo.montiel@intel.com>
Date: Fri, 21 Feb 2025 17:03:23 -0600
Subject: [PATCH 4/6] Add support for Intel GPU to Fast Neural Style example

---
 fast_neural_style/README.md                   | 10 ++++++----
 .../neural_style/neural_style.py              | 19 ++++++++++++++++++-
 2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/fast_neural_style/README.md b/fast_neural_style/README.md
index 8057847214..c7fbe80320 100644
--- a/fast_neural_style/README.md
+++ b/fast_neural_style/README.md
@@ -26,8 +26,9 @@ python neural_style/neural_style.py eval --content-image </path/to/content/image
 - `--model`: saved model to be used for stylizing the image (eg: `mosaic.pth`)
 - `--output-image`: path for saving the output image.
 - `--content-scale`: factor for scaling down the content image if memory is an issue (eg: value of 2 will halve the height and width of content-image)
-- `--cuda`: set it to 1 for running on GPU, 0 for CPU.
-- `--mps`: set it to 1 for running on macOS GPU
+- `--cuda 0|1`: set it to 1 for running on GPU, 0 for CPU.
+- `--mps`: use MPS device backend.
+- `--xpu`: use XPU device backend.
 
 Train model
 
@@ -40,8 +41,9 @@ There are several command line arguments, the important ones are listed below
 - `--dataset`: path to training dataset, the path should point to a folder containing another folder with all the training images. I used COCO 2014 Training images dataset [80K/13GB] [(download)](https://cocodataset.org/#download).
 - `--style-image`: path to style-image.
 - `--save-model-dir`: path to folder where trained model will be saved.
-- `--cuda`: set it to 1 for running on GPU, 0 for CPU.
-- `--mps`: set it to 1 for running on macOS GPU
+- `--cuda 0|1`: set it to 1 for running on GPU, 0 for CPU.
+- `--mps`: use MPS device backend.
+- `--xpu`: use XPU device backend.
 
 Refer to `neural_style/neural_style.py` for other command line arguments. For training new models you might have to tune the values of `--content-weight` and `--style-weight`. The mosaic style model shown above was trained with `--content-weight 1e5` and `--style-weight 1e10`. The remaining 3 models were also trained with similar order of weight parameters with slight variation in the `--style-weight` (`5e10` or `1e11`).
 
diff --git a/fast_neural_style/neural_style/neural_style.py b/fast_neural_style/neural_style/neural_style.py
index 91bf642d82..e51007c157 100644
--- a/fast_neural_style/neural_style/neural_style.py
+++ b/fast_neural_style/neural_style/neural_style.py
@@ -33,9 +33,13 @@ def train(args):
         device = torch.device("cuda")
     elif args.mps:
         device = torch.device("mps")
+    elif args.xpu:
+        device = torch.device("xpu")
     else:
         device = torch.device("cpu")
 
+    print("Device to use: ", device)
+
     np.random.seed(args.seed)
     torch.manual_seed(args.seed)
 
@@ -126,6 +130,9 @@ def train(args):
 
 def stylize(args):
     device = torch.device("cuda" if args.cuda else "cpu")
+    if args.xpu and not args.cuda:  # CUDA keeps priority over XPU
+        device = torch.device("xpu")
+    print("Device to use: ", device)
 
     content_image = utils.load_image(args.content_image, scale=args.content_scale)
     content_transform = transforms.Compose([
@@ -219,6 +226,10 @@ def main():
                                   help="number of images after which the training loss is logged, default is 500")
     train_arg_parser.add_argument("--checkpoint-interval", type=int, default=2000,
                                   help="number of batches after which a checkpoint of the trained model will be created")
+    train_arg_parser.add_argument('--mps', action='store_true',
+                                  help='enable macOS GPU training')
+    train_arg_parser.add_argument('--xpu', action='store_true',
+                                  help='enable Intel XPU training')
 
     eval_arg_parser = subparsers.add_parser("eval", help="parser for evaluation/stylizing arguments")
     eval_arg_parser.add_argument("--content-image", type=str, required=True,
@@ -233,7 +244,11 @@ def main():
                                  help="set it to 1 for running on cuda, 0 for CPU")
     eval_arg_parser.add_argument("--export_onnx", type=str,
                                  help="export ONNX model to a given file")
-    eval_arg_parser.add_argument('--mps', action='store_true', default=False, help='enable macOS GPU training')
+    eval_arg_parser.add_argument('--mps', action='store_true',
+                                 help='enable macOS GPU evaluation')
+    eval_arg_parser.add_argument('--xpu', action='store_true',
+                                 help='enable Intel XPU evaluation')
+
 
     args = main_arg_parser.parse_args()
 
@@ -245,6 +260,8 @@ def main():
         sys.exit(1)
     if not args.mps and torch.backends.mps.is_available():
         print("WARNING: mps is available, run with --mps to enable macOS GPU")
+    if not args.xpu and torch.xpu.is_available():
+        print("WARNING: XPU is available, run with --xpu to enable Intel XPU")
 
     if args.subcommand == "train":
         check_paths(args)

From 78c48abd8fd96d1120a6247bb4cb3c054420b608 Mon Sep 17 00:00:00 2001
From: Jaime Fraustro <jaime.fraustro.valdez@intel.com>
Date: Mon, 3 Mar 2025 19:02:49 -0600
Subject: [PATCH 5/6] Add support for Intel GPU to GAT example

Signed-off-by: jafraustro <jaime.fraustro.valdez@intel.com>
---
 gat/README.md |  1 +
 gat/main.py   | 13 +++++++++----
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/gat/README.md b/gat/README.md
index 7bb71bc17b..d7ae967379 100644
--- a/gat/README.md
+++ b/gat/README.md
@@ -89,6 +89,7 @@ options:
                         epochs to wait for print training and validation evaluation (default: 20)
   --no-cuda             disables CUDA training
   --no-mps              disables macOS GPU training
+  --no-xpu              disables XPU training
   --dry-run             quickly check a single pass
   --seed S              random seed (default: 13)
 ```
diff --git a/gat/main.py b/gat/main.py
index 9c143af8ec..cba703de5c 100644
--- a/gat/main.py
+++ b/gat/main.py
@@ -303,15 +303,17 @@ def test(model, criterion, input, target, mask):
                         help='dimension of the hidden representation (default: 64)')
     parser.add_argument('--num-heads', type=int, default=8,
                         help='number of the attention heads (default: 4)')
-    parser.add_argument('--concat-heads', action='store_true', default=False,
+    parser.add_argument('--concat-heads', action='store_true',
                         help='wether to concatinate attention heads, or average over them (default: False)')
     parser.add_argument('--val-every', type=int, default=20,
                         help='epochs to wait for print training and validation evaluation (default: 20)')
-    parser.add_argument('--no-cuda', action='store_true', default=False,
+    parser.add_argument('--no-cuda', action='store_true',
                         help='disables CUDA training')
-    parser.add_argument('--no-mps', action='store_true', default=False,
+    parser.add_argument('--no-xpu', action='store_true',
+                        help='disables XPU training')
+    parser.add_argument('--no-mps', action='store_true',
                         help='disables macOS GPU training')
-    parser.add_argument('--dry-run', action='store_true', default=False,
+    parser.add_argument('--dry-run', action='store_true',
                         help='quickly check a single pass')
     parser.add_argument('--seed', type=int, default=13, metavar='S',
                         help='random seed (default: 13)')
@@ -320,12 +322,15 @@ def test(model, criterion, input, target, mask):
     torch.manual_seed(args.seed)
     use_cuda = not args.no_cuda and torch.cuda.is_available()
     use_mps = not args.no_mps and torch.backends.mps.is_available()
+    use_xpu = not args.no_xpu and torch.xpu.is_available()
 
     # Set the device to run on
     if use_cuda:
         device = torch.device('cuda')
     elif use_mps:
         device = torch.device('mps')
+    elif use_xpu:
+        device = torch.device('xpu')
     else:
         device = torch.device('cpu')
     print(f'Using {device} device')

From 27a4fd99995a46ae43f0cbca44fa5fb9f833b1f2 Mon Sep 17 00:00:00 2001
From: eromomon <edgar.romo.montiel@intel.com>
Date: Mon, 19 May 2025 16:27:21 -0700
Subject: [PATCH 6/6] Add Accelerator Api to Imagenet Example

Signed-off-by: eromomon <edgar.romo.montiel@intel.com>
---
 imagenet/README.md        |   7 ++-
 imagenet/main.py          | 111 ++++++++++++++++++++++----------------
 imagenet/requirements.txt |   4 +-
 3 files changed, 71 insertions(+), 51 deletions(-)

diff --git a/imagenet/README.md b/imagenet/README.md
index 9b280f087e..e3f66429b9 100644
--- a/imagenet/README.md
+++ b/imagenet/README.md
@@ -33,7 +33,9 @@ python main.py -a resnet18 --dummy
 
 ## Multi-processing Distributed Data Parallel Training
 
-You should always use the NCCL backend for multi-processing distributed training since it currently provides the best distributed training performance.
+If running on CUDA, you should always use the NCCL backend for multi-processing distributed training since it currently provides the best distributed training performance.
+
+For XPU, multiprocessing is not supported as of PyTorch 2.6.
 
 ### Single node, multiple GPUs:
 
@@ -59,7 +61,7 @@ python main.py -a resnet50 --dist-url 'tcp://IP_OF_NODE0:FREEPORT' --dist-backen
 
 ```bash
 usage: main.py [-h] [-a ARCH] [-j N] [--epochs N] [--start-epoch N] [-b N] [--lr LR] [--momentum M] [--wd W] [-p N] [--resume PATH] [-e] [--pretrained] [--world-size WORLD_SIZE] [--rank RANK]
-               [--dist-url DIST_URL] [--dist-backend DIST_BACKEND] [--seed SEED] [--gpu GPU] [--multiprocessing-distributed] [--dummy]
+               [--dist-url DIST_URL] [--dist-backend DIST_BACKEND] [--seed SEED] [--gpu GPU] [--no-accel] [--multiprocessing-distributed] [--dummy]
                [DIR]
 
 PyTorch ImageNet Training
@@ -96,6 +98,7 @@ optional arguments:
                         distributed backend
   --seed SEED           seed for initializing training.
   --gpu GPU             GPU id to use.
+  --no-accel            disables accelerator
   --multiprocessing-distributed
                         Use multi-processing distributed training to launch N processes per node, which has N GPUs. This is the fastest way to use PyTorch for either single node or multi node data parallel
                         training
diff --git a/imagenet/main.py b/imagenet/main.py
index cc32d50733..dd33470908 100644
--- a/imagenet/main.py
+++ b/imagenet/main.py
@@ -71,6 +71,8 @@
                     help='seed for initializing training. ')
 parser.add_argument('--gpu', default=None, type=int,
                     help='GPU id to use.')
+parser.add_argument('--no-accel', action='store_true',
+                    help='disables accelerator')
 parser.add_argument('--multiprocessing-distributed', action='store_true',
                     help='Use multi-processing distributed training to launch '
                          'N processes per node, which has N GPUs. This is the '
@@ -104,8 +106,17 @@ def main():
 
     args.distributed = args.world_size > 1 or args.multiprocessing_distributed
 
-    if torch.cuda.is_available():
-        ngpus_per_node = torch.cuda.device_count()
+    use_accel = not args.no_accel and torch.accelerator.is_available()
+
+    if use_accel:
+        device = torch.accelerator.current_accelerator()
+    else:
+        device = torch.device("cpu")
+
+    print(f"Using device: {device}")
+
+    if device.type =='cuda':
+        ngpus_per_node = torch.accelerator.device_count()
         if ngpus_per_node == 1 and args.dist_backend == "nccl":
             warnings.warn("nccl backend >=2.5 requires GPU count>1, see https://github.com/NVIDIA/nccl/issues/103 perhaps use 'gloo'")
     else:
@@ -127,8 +138,15 @@ def main_worker(gpu, ngpus_per_node, args):
     global best_acc1
     args.gpu = gpu
 
-    if args.gpu is not None:
-        print("Use GPU: {} for training".format(args.gpu))
+    use_accel = not args.no_accel and torch.accelerator.is_available()
+
+    if use_accel:
+        if args.gpu is not None:
+            torch.accelerator.set_device_index(args.gpu)
+            print("Use GPU: {} for training".format(args.gpu))
+        device = torch.accelerator.current_accelerator()
+    else:
+        device = torch.device("cpu")
 
     if args.distributed:
         if args.dist_url == "env://" and args.rank == -1:
@@ -147,16 +165,16 @@ def main_worker(gpu, ngpus_per_node, args):
         print("=> creating model '{}'".format(args.arch))
         model = models.__dict__[args.arch]()
 
-    if not torch.cuda.is_available() and not torch.backends.mps.is_available():
+    if not use_accel:
         print('using CPU, this will be slow')
     elif args.distributed:
         # For multiprocessing distributed, DistributedDataParallel constructor
         # should always set the single device scope, otherwise,
         # DistributedDataParallel will use all available devices.
-        if torch.cuda.is_available():
+        if device.type == 'cuda':
             if args.gpu is not None:
                 torch.cuda.set_device(args.gpu)
-                model.cuda(args.gpu)
+                model.cuda(device)
                 # When using a single GPU per process and per
                 # DistributedDataParallel, we need to divide the batch size
                 # ourselves based on the total number of GPUs of the current node.
@@ -168,29 +186,17 @@ def main_worker(gpu, ngpus_per_node, args):
                 # DistributedDataParallel will divide and allocate batch_size to all
                 # available GPUs if device_ids are not set
                 model = torch.nn.parallel.DistributedDataParallel(model)
-    elif args.gpu is not None and torch.cuda.is_available():
-        torch.cuda.set_device(args.gpu)
-        model = model.cuda(args.gpu)
-    elif torch.backends.mps.is_available():
-        device = torch.device("mps")
-        model = model.to(device)
-    else:
+    elif device.type == 'cuda':
         # DataParallel will divide and allocate batch_size to all available GPUs
         if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
             model.features = torch.nn.DataParallel(model.features)
             model.cuda()
         else:
             model = torch.nn.DataParallel(model).cuda()
-
-    if torch.cuda.is_available():
-        if args.gpu:
-            device = torch.device('cuda:{}'.format(args.gpu))
-        else:
-            device = torch.device("cuda")
-    elif torch.backends.mps.is_available():
-        device = torch.device("mps")
     else:
-        device = torch.device("cpu")
+        model.to(device)
+
+
     # define loss function (criterion), optimizer, and learning rate scheduler
     criterion = nn.CrossEntropyLoss().to(device)
 
@@ -207,9 +213,9 @@ def main_worker(gpu, ngpus_per_node, args):
             print("=> loading checkpoint '{}'".format(args.resume))
             if args.gpu is None:
                 checkpoint = torch.load(args.resume)
-            elif torch.cuda.is_available():
+            else:
                 # Map model to be loaded to specified single gpu.
-                loc = 'cuda:{}'.format(args.gpu)
+                loc = f'{device.type}:{args.gpu}'
                 checkpoint = torch.load(args.resume, map_location=loc)
             args.start_epoch = checkpoint['epoch']
             best_acc1 = checkpoint['best_acc1']
@@ -302,11 +308,14 @@ def main_worker(gpu, ngpus_per_node, args):
 
 
 def train(train_loader, model, criterion, optimizer, epoch, device, args):
-    batch_time = AverageMeter('Time', ':6.3f')
-    data_time = AverageMeter('Data', ':6.3f')
-    losses = AverageMeter('Loss', ':.4e')
-    top1 = AverageMeter('Acc@1', ':6.2f')
-    top5 = AverageMeter('Acc@5', ':6.2f')
+    
+    use_accel = not args.no_accel and torch.accelerator.is_available()
+
+    batch_time = AverageMeter('Time', use_accel, ':6.3f', Summary.NONE)
+    data_time = AverageMeter('Data', use_accel, ':6.3f', Summary.NONE)
+    losses = AverageMeter('Loss', use_accel, ':.4e', Summary.NONE)
+    top1 = AverageMeter('Acc@1', use_accel, ':6.2f', Summary.NONE)
+    top5 = AverageMeter('Acc@5', use_accel, ':6.2f', Summary.NONE)
     progress = ProgressMeter(
         len(train_loader),
         [batch_time, data_time, losses, top1, top5],
@@ -349,18 +358,27 @@ def train(train_loader, model, criterion, optimizer, epoch, device, args):
 
 def validate(val_loader, model, criterion, args):
 
+    use_accel = not args.no_accel and torch.accelerator.is_available()
+
     def run_validate(loader, base_progress=0):
+
+        if use_accel:
+            device = torch.accelerator.current_accelerator()
+        else:
+            device = torch.device("cpu")
+
         with torch.no_grad():
             end = time.time()
             for i, (images, target) in enumerate(loader):
                 i = base_progress + i
-                if args.gpu is not None and torch.cuda.is_available():
-                    images = images.cuda(args.gpu, non_blocking=True)
-                if torch.backends.mps.is_available():
-                    images = images.to('mps')
-                    target = target.to('mps')
-                if torch.cuda.is_available():
-                    target = target.cuda(args.gpu, non_blocking=True)
+                if use_accel:
+                    if args.gpu is not None and device.type == 'cuda':
+                        torch.accelerator.set_device_index(args.gpu)
+                        images = images.cuda(args.gpu, non_blocking=True)
+                        target = target.cuda(args.gpu, non_blocking=True)
+                    else:
+                        images = images.to(device)
+                        target = target.to(device)
 
                 # compute output
                 output = model(images)
@@ -379,10 +397,10 @@ def run_validate(loader, base_progress=0):
                 if i % args.print_freq == 0:
                     progress.display(i + 1)
 
-    batch_time = AverageMeter('Time', ':6.3f', Summary.NONE)
-    losses = AverageMeter('Loss', ':.4e', Summary.NONE)
-    top1 = AverageMeter('Acc@1', ':6.2f', Summary.AVERAGE)
-    top5 = AverageMeter('Acc@5', ':6.2f', Summary.AVERAGE)
+    batch_time = AverageMeter('Time', use_accel, ':6.3f', Summary.NONE)
+    losses = AverageMeter('Loss', use_accel, ':.4e', Summary.NONE)
+    top1 = AverageMeter('Acc@1', use_accel, ':6.2f', Summary.AVERAGE)
+    top5 = AverageMeter('Acc@5', use_accel, ':6.2f', Summary.AVERAGE)
     progress = ProgressMeter(
         len(val_loader) + (args.distributed and (len(val_loader.sampler) * args.world_size < len(val_loader.dataset))),
         [batch_time, losses, top1, top5],
@@ -422,8 +440,9 @@ class Summary(Enum):
 
 class AverageMeter(object):
     """Computes and stores the average and current value"""
-    def __init__(self, name, fmt=':f', summary_type=Summary.AVERAGE):
+    def __init__(self, name, use_accel, fmt=':f', summary_type=Summary.AVERAGE):
         self.name = name
+        self.use_accel = use_accel
         self.fmt = fmt
         self.summary_type = summary_type
         self.reset()
@@ -440,11 +459,9 @@ def update(self, val, n=1):
         self.count += n
         self.avg = self.sum / self.count
 
-    def all_reduce(self):
-        if torch.cuda.is_available():
-            device = torch.device("cuda")
-        elif torch.backends.mps.is_available():
-            device = torch.device("mps")
+    def all_reduce(self):
+        if self.use_accel:  # flag stored by __init__; a bare name is undefined here
+            device = torch.accelerator.current_accelerator()
         else:
             device = torch.device("cpu")
         total = torch.tensor([self.sum, self.count], dtype=torch.float32, device=device)
diff --git a/imagenet/requirements.txt b/imagenet/requirements.txt
index 6cec7414dc..9a083ba390 100644
--- a/imagenet/requirements.txt
+++ b/imagenet/requirements.txt
@@ -1,2 +1,2 @@
-torch
-torchvision==0.20.0
+torch>=2.6
+torchvision