Skip to content

Commit 79865c2

Browse files
committed
Add support for enabling GPU access
Signed-off-by: YISH <[email protected]>
1 parent 33d7d35 commit 79865c2

File tree

4 files changed

+173
-0
lines changed

4 files changed

+173
-0
lines changed

.gitignore

+3
Original file line numberDiff line numberDiff line change
@@ -105,3 +105,6 @@ venv.bak/
105105

106106
# mypy
107107
.mypy_cache/
108+
109+
110+
.vscode
+11
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Compose file for checking GPU access: the single "test" service runs
# `nvidia-smi` in the NVIDIA CUDA base image and reserves one device with
# the "nvidia" driver and the "gpu" capability via deploy.resources.
services:
2+
test:
3+
image: nvidia/cuda:12.3.1-base-ubuntu20.04
4+
command: nvidia-smi
5+
deploy:
6+
resources:
7+
reservations:
8+
devices:
9+
- driver: nvidia
10+
count: 1
11+
capabilities: [gpu]

podman_compose.py

+56
Original file line numberDiff line numberDiff line change
@@ -635,6 +635,62 @@ def get_secret_args(compose, cnt, secret, podman_is_building=False):
635635

636636

637637
def container_to_res_args(cnt, podman_args):
    """Append the resource-related podman arguments for container ``cnt``.

    Runs the CPU converter first and the GPU converter second; each one
    receives the same ``cnt`` mapping and the same ``podman_args`` list.
    """
    for convert in (container_to_cpu_res_args, container_to_gpu_res_args):
        convert(cnt, podman_args)
640+
641+
642+
def container_to_gpu_res_args(cnt, podman_args):
    """Append podman arguments that expose NVIDIA GPUs to the container.

    Reads ``deploy.resources.reservations.devices`` from the container
    definition ``cnt``.  Every entry whose ``driver`` is ``"nvidia"`` and whose
    ``capabilities`` contain ``"gpu"`` is translated into one or more
    ``--device nvidia.com/gpu=<id>`` argument pairs:

    * a non-empty ``device_ids`` list yields one pair per listed id;
    * otherwise a numeric ``count`` N yields ids ``0 .. N-1``;
    * otherwise (``count`` / ``device_ids`` equal to ``"all"``, missing or
      null) a single ``nvidia.com/gpu=all`` pair is emitted.

    If at least one entry matched, ``--security-opt=label=disable`` is
    appended once at the end.

    Args:
        cnt: container (service) definition mapping.
        podman_args: list of podman arguments; extended in place.

    Raises:
        ValueError: if ``count`` is a string that is neither ``"all"`` nor an
            integer literal.
    """
    # https://docs.docker.com/compose/gpu-support/
    # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/cdi-support.html

    deploy = cnt.get("deploy", None) or {}
    res = deploy.get("resources", None) or {}
    reservations = res.get("reservations", None) or {}
    # A bare "devices:" key maps to None, so fall back to an empty list the
    # same way the lookups above fall back to an empty dict.
    devices = reservations.get("devices", None) or []
    gpu_on = False
    for device in devices:
        if not device:
            # empty or null list entry: nothing to inspect
            continue

        driver = device.get("driver", None)
        capabilities = device.get("capabilities", None)
        if driver != "nvidia" or capabilities is None or "gpu" not in capabilities:
            continue

        # A missing, null or empty device_ids means "no explicit selection",
        # in which case count decides.
        device_ids = device.get("device_ids", None) or "all"
        if device_ids != "all":
            gpu_ids = device_ids
        else:
            count = device.get("count", None)
            if count is None or count == "all":
                gpu_ids = ("all",)
            else:
                # count may arrive as a string such as "2"; int() accepts both
                # forms.
                gpu_ids = range(int(count))

        for gpu_id in gpu_ids:
            podman_args.extend(("--device", f"nvidia.com/gpu={gpu_id}"))
        gpu_on = True

    if gpu_on:
        podman_args.append("--security-opt=label=disable")
691+
692+
693+
def container_to_cpu_res_args(cnt, podman_args):
638694
# v2: https://docs.docker.com/compose/compose-file/compose-file-v2/#cpu-and-other-resources
639695
# cpus, cpu_shares, mem_limit, mem_reservation
640696
cpus_limit_v2 = try_float(cnt.get("cpus", None), None)

pytests/test_container_to_args.py

+103
Original file line numberDiff line numberDiff line change
@@ -325,3 +325,106 @@ async def test_env_file_obj_optional(self):
325325
"busybox",
326326
],
327327
)
328+
329+
async def test_gpu(self):
    """Verify the podman arguments produced for NVIDIA GPU reservations.

    The same container definition is reused for every scenario; only the
    first entry of ``deploy.resources.reservations.devices`` is replaced
    before the arguments are regenerated and compared.
    """
    compose = create_compose_mock()

    container = get_minimal_container()
    container["command"] = ["nvidia-smi"]
    container["deploy"] = {"resources": {"reservations": {"devices": [{}]}}}

    # Each scenario pairs a device reservation with the "--device" arguments
    # it is expected to produce.
    scenarios = [
        # count: all
        (
            {"driver": "nvidia", "count": "all", "capabilities": ["gpu"]},
            ["--device", "nvidia.com/gpu=all"],
        ),
        # count: 2
        (
            {"driver": "nvidia", "count": 2, "capabilities": ["gpu"]},
            ["--device", "nvidia.com/gpu=0", "--device", "nvidia.com/gpu=1"],
        ),
        # device_ids: all
        (
            {"driver": "nvidia", "device_ids": "all", "capabilities": ["gpu"]},
            ["--device", "nvidia.com/gpu=all"],
        ),
        # device_ids: 1,3
        (
            {"driver": "nvidia", "device_ids": [1, 3], "capabilities": ["gpu"]},
            ["--device", "nvidia.com/gpu=1", "--device", "nvidia.com/gpu=3"],
        ),
    ]

    for reservation, device_args in scenarios:
        container["deploy"]["resources"]["reservations"]["devices"][0] = reservation

        args = await container_to_args(compose, container)
        self.assertEqual(
            args,
            [
                "--name=project_name_service_name1",
                "-d",
                "--network=bridge",
                "--network-alias=service_name",
                *device_args,
                "--security-opt=label=disable",
                "busybox",
                "nvidia-smi",
            ],
        )

0 commit comments

Comments
 (0)