Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 6 additions & 4 deletions fast_neural_style/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,9 @@ python neural_style/neural_style.py eval --content-image </path/to/content/image
- `--model`: saved model to be used for stylizing the image (eg: `mosaic.pth`)
- `--output-image`: path for saving the output image.
- `--content-scale`: factor for scaling down the content image if memory is an issue (eg: value of 2 will halve the height and width of content-image)
- `--cuda`: set it to 1 for running on GPU, 0 for CPU.
- `--mps`: set it to 1 for running on macOS GPU
- `--cuda 0|1`: set it to 1 for running on GPU, 0 for CPU.
- `--mps`: use MPS device backend.
- `--xpu`: use XPU device backend.

Train model

Expand All @@ -40,8 +41,9 @@ There are several command line arguments, the important ones are listed below
- `--dataset`: path to training dataset, the path should point to a folder containing another folder with all the training images. I used COCO 2014 Training images dataset [80K/13GB] [(download)](https://cocodataset.org/#download).
- `--style-image`: path to style-image.
- `--save-model-dir`: path to folder where trained model will be saved.
- `--cuda`: set it to 1 for running on GPU, 0 for CPU.
- `--mps`: set it to 1 for running on macOS GPU
- `--cuda 0|1`: set it to 1 for running on GPU, 0 for CPU.
- `--mps`: use MPS device backend.
- `--xpu`: use XPU device backend.

Refer to `neural_style/neural_style.py` for other command line arguments. For training new models you might have to tune the values of `--content-weight` and `--style-weight`. The mosaic style model shown above was trained with `--content-weight 1e5` and `--style-weight 1e10`. The remaining 3 models were also trained with similar order of weight parameters with slight variation in the `--style-weight` (`5e10` or `1e11`).

Expand Down
19 changes: 18 additions & 1 deletion fast_neural_style/neural_style/neural_style.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,13 @@ def train(args):
device = torch.device("cuda")
elif args.mps:
device = torch.device("mps")
elif args.xpu:
device = torch.device("xpu")
else:
device = torch.device("cpu")

print("Device to use: ", device)

np.random.seed(args.seed)
torch.manual_seed(args.seed)

Expand Down Expand Up @@ -126,6 +130,9 @@ def train(args):

def stylize(args):
device = torch.device("cuda" if args.cuda else "cpu")
device = torch.device("xpu" if args.xpu else "cpu")

print("Device to use: ", device)

content_image = utils.load_image(args.content_image, scale=args.content_scale)
content_transform = transforms.Compose([
Expand Down Expand Up @@ -219,6 +226,10 @@ def main():
help="number of images after which the training loss is logged, default is 500")
train_arg_parser.add_argument("--checkpoint-interval", type=int, default=2000,
help="number of batches after which a checkpoint of the trained model will be created")
train_arg_parser.add_argument('--mps', action='store_true',
help='enable macOS GPU training')
train_arg_parser.add_argument('--xpu', action='store_true',
help='enable Intel XPU training')

eval_arg_parser = subparsers.add_parser("eval", help="parser for evaluation/stylizing arguments")
eval_arg_parser.add_argument("--content-image", type=str, required=True,
Expand All @@ -233,7 +244,11 @@ def main():
help="set it to 1 for running on cuda, 0 for CPU")
eval_arg_parser.add_argument("--export_onnx", type=str,
help="export ONNX model to a given file")
eval_arg_parser.add_argument('--mps', action='store_true', default=False, help='enable macOS GPU training')
eval_arg_parser.add_argument('--mps', action='store_true',
help='enable macOS GPU evaluation')
eval_arg_parser.add_argument('--xpu', action='store_true',
help='enable Intel XPU evaluation')


args = main_arg_parser.parse_args()

Expand All @@ -245,6 +260,8 @@ def main():
sys.exit(1)
if not args.mps and torch.backends.mps.is_available():
print("WARNING: mps is available, run with --mps to enable macOS GPU")
if not args.xpu and torch.xpu.is_available():
print("WARNING: XPU is available, run with --xpu to enable Intel XPU")

if args.subcommand == "train":
check_paths(args)
Expand Down
1 change: 1 addition & 0 deletions gat/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ options:
epochs to wait for print training and validation evaluation (default: 20)
--no-cuda disables CUDA training
--no-mps disables macOS GPU training
--no-xpu disables XPU training
--dry-run quickly check a single pass
--seed S random seed (default: 13)
```
Expand Down
13 changes: 9 additions & 4 deletions gat/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,15 +303,17 @@ def test(model, criterion, input, target, mask):
help='dimension of the hidden representation (default: 64)')
parser.add_argument('--num-heads', type=int, default=8,
help='number of the attention heads (default: 4)')
parser.add_argument('--concat-heads', action='store_true', default=False,
parser.add_argument('--concat-heads', action='store_true',
help='wether to concatinate attention heads, or average over them (default: False)')
parser.add_argument('--val-every', type=int, default=20,
help='epochs to wait for print training and validation evaluation (default: 20)')
parser.add_argument('--no-cuda', action='store_true', default=False,
parser.add_argument('--no-cuda', action='store_true',
help='disables CUDA training')
parser.add_argument('--no-mps', action='store_true', default=False,
parser.add_argument('--no-xpu', action='store_true',
help='disables XPU training')
parser.add_argument('--no-mps', action='store_true',
help='disables macOS GPU training')
parser.add_argument('--dry-run', action='store_true', default=False,
parser.add_argument('--dry-run', action='store_true',
help='quickly check a single pass')
parser.add_argument('--seed', type=int, default=13, metavar='S',
help='random seed (default: 13)')
Expand All @@ -320,12 +322,15 @@ def test(model, criterion, input, target, mask):
torch.manual_seed(args.seed)
use_cuda = not args.no_cuda and torch.cuda.is_available()
use_mps = not args.no_mps and torch.backends.mps.is_available()
use_xpu = not args.no_xpu and torch.xpu.is_available()

# Set the device to run on
if use_cuda:
device = torch.device('cuda')
elif use_mps:
device = torch.device('mps')
elif use_xpu:
device = torch.device('xpu')
else:
device = torch.device('cpu')
print(f'Using {device} device')
Expand Down
7 changes: 5 additions & 2 deletions imagenet/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ python main.py -a resnet18 --dummy

## Multi-processing Distributed Data Parallel Training

You should always use the NCCL backend for multi-processing distributed training since it currently provides the best distributed training performance.
If running on CUDA, you should always use the NCCL backend for multi-processing distributed training since it currently provides the best distributed training performance.

For XPU, multiprocessing is not supported as of PyTorch 2.6.

### Single node, multiple GPUs:

Expand All @@ -59,7 +61,7 @@ python main.py -a resnet50 --dist-url 'tcp://IP_OF_NODE0:FREEPORT' --dist-backen

```bash
usage: main.py [-h] [-a ARCH] [-j N] [--epochs N] [--start-epoch N] [-b N] [--lr LR] [--momentum M] [--wd W] [-p N] [--resume PATH] [-e] [--pretrained] [--world-size WORLD_SIZE] [--rank RANK]
[--dist-url DIST_URL] [--dist-backend DIST_BACKEND] [--seed SEED] [--gpu GPU] [--multiprocessing-distributed] [--dummy]
[--dist-url DIST_URL] [--dist-backend DIST_BACKEND] [--seed SEED] [--gpu GPU] [--no-accel] [--multiprocessing-distributed] [--dummy]
[DIR]

PyTorch ImageNet Training
Expand Down Expand Up @@ -96,6 +98,7 @@ optional arguments:
distributed backend
--seed SEED seed for initializing training.
--gpu GPU GPU id to use.
--no-accel disables accelerator
--multiprocessing-distributed
Use multi-processing distributed training to launch N processes per node, which has N GPUs. This is the fastest way to use PyTorch for either single node or multi node data parallel
training
Expand Down
111 changes: 64 additions & 47 deletions imagenet/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@
help='seed for initializing training. ')
parser.add_argument('--gpu', default=None, type=int,
help='GPU id to use.')
parser.add_argument('--no-accel', action='store_true',
help='disables accelerator')
parser.add_argument('--multiprocessing-distributed', action='store_true',
help='Use multi-processing distributed training to launch '
'N processes per node, which has N GPUs. This is the '
Expand Down Expand Up @@ -104,8 +106,17 @@ def main():

args.distributed = args.world_size > 1 or args.multiprocessing_distributed

if torch.cuda.is_available():
ngpus_per_node = torch.cuda.device_count()
use_accel = not args.no_accel and torch.accelerator.is_available()

if use_accel:
device = torch.accelerator.current_accelerator()
else:
device = torch.device("cpu")

print(f"Using device: {device}")

if device.type =='cuda':
ngpus_per_node = torch.accelerator.device_count()
if ngpus_per_node == 1 and args.dist_backend == "nccl":
warnings.warn("nccl backend >=2.5 requires GPU count>1, see https://github.com/NVIDIA/nccl/issues/103 perhaps use 'gloo'")
else:
Expand All @@ -127,8 +138,15 @@ def main_worker(gpu, ngpus_per_node, args):
global best_acc1
args.gpu = gpu

if args.gpu is not None:
print("Use GPU: {} for training".format(args.gpu))
use_accel = not args.no_accel and torch.accelerator.is_available()

if use_accel:
if args.gpu is not None:
torch.accelerator.set_device_index(args.gpu)
print("Use GPU: {} for training".format(args.gpu))
device = torch.accelerator.current_accelerator()
else:
device = torch.device("cpu")

if args.distributed:
if args.dist_url == "env://" and args.rank == -1:
Expand All @@ -147,16 +165,16 @@ def main_worker(gpu, ngpus_per_node, args):
print("=> creating model '{}'".format(args.arch))
model = models.__dict__[args.arch]()

if not torch.cuda.is_available() and not torch.backends.mps.is_available():
if not use_accel:
print('using CPU, this will be slow')
elif args.distributed:
# For multiprocessing distributed, DistributedDataParallel constructor
# should always set the single device scope, otherwise,
# DistributedDataParallel will use all available devices.
if torch.cuda.is_available():
if device.type == 'cuda':
if args.gpu is not None:
torch.cuda.set_device(args.gpu)
model.cuda(args.gpu)
model.cuda(device)
# When using a single GPU per process and per
# DistributedDataParallel, we need to divide the batch size
# ourselves based on the total number of GPUs of the current node.
Expand All @@ -168,29 +186,17 @@ def main_worker(gpu, ngpus_per_node, args):
# DistributedDataParallel will divide and allocate batch_size to all
# available GPUs if device_ids are not set
model = torch.nn.parallel.DistributedDataParallel(model)
elif args.gpu is not None and torch.cuda.is_available():
torch.cuda.set_device(args.gpu)
model = model.cuda(args.gpu)
elif torch.backends.mps.is_available():
device = torch.device("mps")
model = model.to(device)
else:
elif device.type == 'cuda':
# DataParallel will divide and allocate batch_size to all available GPUs
if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
model.features = torch.nn.DataParallel(model.features)
model.cuda()
else:
model = torch.nn.DataParallel(model).cuda()

if torch.cuda.is_available():
if args.gpu:
device = torch.device('cuda:{}'.format(args.gpu))
else:
device = torch.device("cuda")
elif torch.backends.mps.is_available():
device = torch.device("mps")
else:
device = torch.device("cpu")
model.to(device)


# define loss function (criterion), optimizer, and learning rate scheduler
criterion = nn.CrossEntropyLoss().to(device)

Expand All @@ -207,9 +213,9 @@ def main_worker(gpu, ngpus_per_node, args):
print("=> loading checkpoint '{}'".format(args.resume))
if args.gpu is None:
checkpoint = torch.load(args.resume)
elif torch.cuda.is_available():
else:
# Map model to be loaded to specified single gpu.
loc = 'cuda:{}'.format(args.gpu)
loc = f'{device.type}:{args.gpu}'
checkpoint = torch.load(args.resume, map_location=loc)
args.start_epoch = checkpoint['epoch']
best_acc1 = checkpoint['best_acc1']
Expand Down Expand Up @@ -302,11 +308,14 @@ def main_worker(gpu, ngpus_per_node, args):


def train(train_loader, model, criterion, optimizer, epoch, device, args):
batch_time = AverageMeter('Time', ':6.3f')
data_time = AverageMeter('Data', ':6.3f')
losses = AverageMeter('Loss', ':.4e')
top1 = AverageMeter('Acc@1', ':6.2f')
top5 = AverageMeter('Acc@5', ':6.2f')

use_accel = not args.no_accel and torch.accelerator.is_available()

batch_time = AverageMeter('Time', use_accel, ':6.3f', Summary.NONE)
data_time = AverageMeter('Data', use_accel, ':6.3f', Summary.NONE)
losses = AverageMeter('Loss', use_accel, ':.4e', Summary.NONE)
top1 = AverageMeter('Acc@1', use_accel, ':6.2f', Summary.NONE)
top5 = AverageMeter('Acc@5', use_accel, ':6.2f', Summary.NONE)
progress = ProgressMeter(
len(train_loader),
[batch_time, data_time, losses, top1, top5],
Expand Down Expand Up @@ -349,18 +358,27 @@ def train(train_loader, model, criterion, optimizer, epoch, device, args):

def validate(val_loader, model, criterion, args):

use_accel = not args.no_accel and torch.accelerator.is_available()

def run_validate(loader, base_progress=0):

if use_accel:
device = torch.accelerator.current_accelerator()
else:
device = torch.device("cpu")

with torch.no_grad():
end = time.time()
for i, (images, target) in enumerate(loader):
i = base_progress + i
if args.gpu is not None and torch.cuda.is_available():
images = images.cuda(args.gpu, non_blocking=True)
if torch.backends.mps.is_available():
images = images.to('mps')
target = target.to('mps')
if torch.cuda.is_available():
target = target.cuda(args.gpu, non_blocking=True)
if use_accel:
if args.gpu is not None and device.type=='cuda':
torch.accelerator.set_device_index(argps.gpu)
images = images.cuda(args.gpu, non_blocking=True)
target = target.cuda(args.gpu, non_blocking=True)
else:
images = images.to(device)
target = target.to(device)

# compute output
output = model(images)
Expand All @@ -379,10 +397,10 @@ def run_validate(loader, base_progress=0):
if i % args.print_freq == 0:
progress.display(i + 1)

batch_time = AverageMeter('Time', ':6.3f', Summary.NONE)
losses = AverageMeter('Loss', ':.4e', Summary.NONE)
top1 = AverageMeter('Acc@1', ':6.2f', Summary.AVERAGE)
top5 = AverageMeter('Acc@5', ':6.2f', Summary.AVERAGE)
batch_time = AverageMeter('Time', use_accel, ':6.3f', Summary.NONE)
losses = AverageMeter('Loss', use_accel, ':.4e', Summary.NONE)
top1 = AverageMeter('Acc@1', use_accel, ':6.2f', Summary.AVERAGE)
top5 = AverageMeter('Acc@5', use_accel, ':6.2f', Summary.AVERAGE)
progress = ProgressMeter(
len(val_loader) + (args.distributed and (len(val_loader.sampler) * args.world_size < len(val_loader.dataset))),
[batch_time, losses, top1, top5],
Expand Down Expand Up @@ -422,8 +440,9 @@ class Summary(Enum):

class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self, name, fmt=':f', summary_type=Summary.AVERAGE):
def __init__(self, name, use_accel, fmt=':f', summary_type=Summary.AVERAGE):
self.name = name
self.use_accel = use_accel
self.fmt = fmt
self.summary_type = summary_type
self.reset()
Expand All @@ -440,11 +459,9 @@ def update(self, val, n=1):
self.count += n
self.avg = self.sum / self.count

def all_reduce(self):
if torch.cuda.is_available():
device = torch.device("cuda")
elif torch.backends.mps.is_available():
device = torch.device("mps")
def all_reduce(self):
if use_accel:
device = torch.accelerator.current_accelerator()
else:
device = torch.device("cpu")
total = torch.tensor([self.sum, self.count], dtype=torch.float32, device=device)
Expand Down
4 changes: 2 additions & 2 deletions imagenet/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
torch
torchvision==0.20.0
torch>=2.6
torchvision
Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

torch>=2.6 on the line above.

Loading