Add support for Intel GPU to ImageNet example #1322

Open · wants to merge 1 commit into base: main
4 changes: 3 additions & 1 deletion imagenet/README.md
@@ -33,7 +33,9 @@ python main.py -a resnet18 --dummy

## Multi-processing Distributed Data Parallel Training

You should always use the NCCL backend for multi-processing distributed training since it currently provides the best distributed training performance.
If running on CUDA, you should always use the NCCL backend for multi-processing distributed training since it currently provides the best distributed training performance.

For XPU, multi-processing distributed training is not supported as of PyTorch 2.6.
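
A minimal sketch (not from this PR) of how a launcher could pick the backend in line with the note above; the helper name `pick_dist_backend` and the `gloo` fallback are assumptions here, not something this diff adds:

```python
# Hypothetical helper: choose a distributed backend by available device.
# NCCL on CUDA per the README; gloo as the generic fallback is an assumption.
import torch

def pick_dist_backend() -> str:
    if torch.cuda.is_available():
        return "nccl"  # best multi-GPU performance on CUDA
    return "gloo"  # portable fallback; XPU multiprocessing is unsupported as of PyTorch 2.6
```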

### Single node, multiple GPUs:

27 changes: 21 additions & 6 deletions imagenet/main.py
@@ -147,7 +147,7 @@ def main_worker(gpu, ngpus_per_node, args):
print("=> creating model '{}'".format(args.arch))
model = models.__dict__[args.arch]()

if not torch.cuda.is_available() and not torch.backends.mps.is_available():
if not torch.cuda.is_available() and not torch.backends.mps.is_available() and not torch.xpu.is_available():
print('using CPU, this will be slow')
elif args.distributed:
# For multiprocessing distributed, DistributedDataParallel constructor
@@ -171,6 +171,9 @@ def main_worker(gpu, ngpus_per_node, args):
elif args.gpu is not None and torch.cuda.is_available():
torch.cuda.set_device(args.gpu)
model = model.cuda(args.gpu)
elif torch.xpu.is_available():
device = torch.device("xpu")
model = model.to(device)
elif torch.backends.mps.is_available():
device = torch.device("mps")
model = model.to(device)
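
As a quick sanity check of the new XPU branch, a hedged sketch (assuming a PyTorch build with Intel GPU support, 2.4 or later) that places a model on the device and runs one forward pass:

```python
import torch
import torchvision.models as models

# Mirror the placement branch above, then verify with a dummy batch.
model = models.resnet18()
if torch.xpu.is_available():
    model = model.to(torch.device("xpu"))
    x = torch.randn(1, 3, 224, 224, device="xpu")
    with torch.no_grad():
        print(model(x).shape)  # expected: torch.Size([1, 1000])
```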
@@ -187,10 +190,15 @@ def main_worker(gpu, ngpus_per_node, args):
device = torch.device('cuda:{}'.format(args.gpu))
else:
device = torch.device("cuda")
elif torch.xpu.is_available():
device = torch.device("xpu")
elif torch.backends.mps.is_available():
device = torch.device("mps")
else:
device = torch.device("cpu")

print (f"Device to use: ", {device.type})

# define loss function (criterion), optimizer, and learning rate scheduler
criterion = nn.CrossEntropyLoss().to(device)

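The fallback chain above (CUDA, then XPU, then MPS, then CPU) could be read as a single helper; a sketch under that reading, with `pick_device` being our name rather than anything in main.py:

```python
import torch

def pick_device(gpu=None):
    # Same priority order as main_worker: CUDA (optionally a specific index),
    # then Intel XPU, then Apple MPS, then CPU.
    if torch.cuda.is_available():
        return torch.device(f"cuda:{gpu}") if gpu is not None else torch.device("cuda")
    if torch.xpu.is_available():
        return torch.device("xpu")
    if torch.backends.mps.is_available():
        return torch.device("mps")
    return torch.device("cpu")
```
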
@@ -354,14 +362,19 @@ def run_validate(loader, base_progress=0):
end = time.time()
for i, (images, target) in enumerate(loader):
i = base_progress + i
if args.gpu is not None and torch.cuda.is_available():
images = images.cuda(args.gpu, non_blocking=True)
if torch.backends.mps.is_available():
images = images.to('mps')
target = target.to('mps')

if torch.cuda.is_available():
if args.gpu is not None:
images = images.cuda(args.gpu, non_blocking=True)
target = target.cuda(args.gpu, non_blocking=True)

elif torch.xpu.is_available():
images = images.to("xpu")
target = target.to("xpu")
elif torch.backends.mps.is_available():
images = images.to('mps')
target = target.to('mps')

# compute output
output = model(images)
loss = criterion(output, target)
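
The per-batch moves above repeat one pattern; a hedged sketch of a helper (our `move_batch`, not the PR's) that keeps `non_blocking=True` only on the indexed-CUDA path, where it pairs with pinned host memory:

```python
import torch

def move_batch(images, target, gpu=None):
    # Mirrors the validation loop: indexed CUDA device when a gpu id is set
    # (the DataParallel path leaves tensors on the CPU for the wrapper to
    # scatter), whole-tensor .to() for XPU/MPS, and a no-op on CPU.
    if torch.cuda.is_available():
        if gpu is not None:
            return (images.cuda(gpu, non_blocking=True),
                    target.cuda(gpu, non_blocking=True))
        return images, target
    if torch.xpu.is_available():
        return images.to("xpu"), target.to("xpu")
    if torch.backends.mps.is_available():
        return images.to("mps"), target.to("mps")
    return images, target
```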
@@ -443,6 +456,8 @@ def update(self, val, n=1):
def all_reduce(self):
if torch.cuda.is_available():
device = torch.device("cuda")
elif torch.xpu.is_available():
device = torch.device("xpu")
elif torch.backends.mps.is_available():
device = torch.device("mps")
else:
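
To see the new branch in context, a standalone sketch of a device-aware all-reduce for a (sum, count) pair; `reduce_sum_count` is a hypothetical name, and it assumes the process group is already initialized:

```python
import torch
import torch.distributed as dist

def reduce_sum_count(total_sum, total_count):
    # Pick the reduction device with the same priority as the diff above.
    if torch.cuda.is_available():
        device = torch.device("cuda")
    elif torch.xpu.is_available():
        device = torch.device("xpu")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")
    total = torch.tensor([total_sum, total_count], dtype=torch.float32, device=device)
    dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False)
    return total.tolist()
```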