Skip to content

Commit 7c9e2c2

Browse files
committed
add amd gpu test
1 parent 034e069 commit 7c9e2c2

File tree

4 files changed

+143
-19
lines changed

4 files changed

+143
-19
lines changed

e2e/config/azure.go

+21-19
Original file line numberDiff line numberDiff line change
@@ -291,25 +291,27 @@ func (a *AzureClient) UploadAndGetSignedLink(ctx context.Context, blobName strin
291291
}
292292

293293
func (a *AzureClient) CreateVMManagedIdentity(ctx context.Context) (string, error) {
294-
identity, err := a.UserAssignedIdentities.CreateOrUpdate(ctx, ResourceGroupName, VMIdentityName, armmsi.Identity{
295-
Location: to.Ptr(Config.Location),
296-
}, nil)
297-
if err != nil {
298-
return "", fmt.Errorf("create managed identity: %w", err)
299-
}
300-
err = a.createBlobStorageAccount(ctx)
301-
if err != nil {
302-
return "", err
303-
}
304-
err = a.createBlobStorageContainer(ctx)
305-
if err != nil {
306-
return "", err
307-
}
308-
309-
if err := a.assignRolesToVMIdentity(ctx, identity.Properties.PrincipalID); err != nil {
310-
return "", err
311-
}
312-
return *identity.Properties.ClientID, nil
294+
// HACK: temporarily disabled so the test can run in a different subscription that lacks the required permissions
295+
return "", nil
296+
// identity, err := a.UserAssignedIdentities.CreateOrUpdate(ctx, ResourceGroupName, VMIdentityName, armmsi.Identity{
297+
// Location: to.Ptr(Config.Location),
298+
// }, nil)
299+
// if err != nil {
300+
// return "", fmt.Errorf("create managed identity: %w", err)
301+
// }
302+
// err = a.createBlobStorageAccount(ctx)
303+
// if err != nil {
304+
// return "", err
305+
// }
306+
// err = a.createBlobStorageContainer(ctx)
307+
// if err != nil {
308+
// return "", err
309+
// }
310+
311+
// if err := a.assignRolesToVMIdentity(ctx, identity.Properties.PrincipalID); err != nil {
312+
// return "", err
313+
// }
314+
// return *identity.Properties.ClientID, nil
313315
}
314316

315317
func (a *AzureClient) createBlobStorageAccount(ctx context.Context) error {

e2e/kube.go

+49
Original file line numberDiff line numberDiff line change
@@ -637,3 +637,52 @@ func nvidiaDevicePluginDaemonSet() *appsv1.DaemonSet {
637637
},
638638
}
639639
}
640+
641+
// podEnableAMDGPUResource builds the Pod that runs the AMD GPU device plugin
// container on the scenario's node.
//
// The pod is named "<KubeNodeName>-amdgpu-device-plugin", lives in
// defaultNamespace, uses the "system-node-critical" priority class and is
// pinned to the node through a "kubernetes.io/hostname" node selector. The
// host paths /var/lib/kubelet/device-plugins and /sys are mounted into the
// container at the same paths.
func podEnableAMDGPUResource(s *Scenario) *corev1.Pod {
642+
return &corev1.Pod{
643+
ObjectMeta: metav1.ObjectMeta{
644+
Name: fmt.Sprintf("%s-amdgpu-device-plugin", s.Runtime.KubeNodeName),
645+
Namespace: defaultNamespace,
646+
},
647+
Spec: corev1.PodSpec{
648+
PriorityClassName: "system-node-critical",
649+
// Schedule only onto the node under test.
NodeSelector: map[string]string{
650+
"kubernetes.io/hostname": s.Runtime.KubeNodeName,
651+
},
652+
Containers: []corev1.Container{
653+
{
654+
Name: "amdgpu-device-plugin-container",
655+
// NOTE(review): the image reference has no tag or digest, so the pulled version is not pinned.
Image: "rocm/k8s-device-plugin",
656+
VolumeMounts: []corev1.VolumeMount{
657+
{
658+
Name: "device-plugin",
659+
MountPath: "/var/lib/kubelet/device-plugins",
660+
},
661+
{
662+
Name: "sys",
663+
MountPath: "/sys",
664+
},
665+
},
666+
},
667+
},
668+
// Host path volumes backing the two mounts above; names must match the VolumeMounts.
Volumes: []corev1.Volume{
669+
{
670+
Name: "device-plugin",
671+
VolumeSource: corev1.VolumeSource{
672+
HostPath: &corev1.HostPathVolumeSource{
673+
Path: "/var/lib/kubelet/device-plugins",
674+
},
675+
},
676+
},
677+
{
678+
Name: "sys",
679+
VolumeSource: corev1.VolumeSource{
680+
HostPath: &corev1.HostPathVolumeSource{
681+
Path: "/sys",
682+
},
683+
},
684+
},
685+
},
686+
},
687+
}
688+
}

e2e/scenario_test.go

+61
Original file line numberDiff line numberDiff line change
@@ -1664,3 +1664,64 @@ func Test_Ubuntu2404ARM(t *testing.T) {
16641664
},
16651665
})
16661666
}
1667+
1668+
// Test_Ubuntu2204Gen2Containerd_AMDGPU_MI300 runs a GPU-tagged scenario on the
// Standard_ND96isr_MI300X_v5 VM size using config.VHDUbuntu2204Gen2Containerd
// on ClusterKubenet. The bootstrap config sets EnableAMDGPU and
// ConfigGPUDriverIfNeeded, the VMSS is given a 128 GB OS disk, and the node is
// checked with ValidateAMDGPU.
func Test_Ubuntu2204Gen2Containerd_AMDGPU_MI300(t *testing.T) {
1669+
//t.Skip("Provisioning of Standard_ND96isr_MI300X_v5 isn't reliable yet")
1670+
RunScenario(t, &Scenario{
1671+
// NOTE(review): the description mentions a MarinerV2 VHD, but the VHD below is Ubuntu 22.04 — looks copy-pasted; confirm.
Description: "Tests that a GPU-enabled node using a MarinerV2 VHD can be properly bootstrapped",
1672+
Tags: Tags{
1673+
GPU: true,
1674+
},
1675+
Config: Config{
1676+
Cluster: ClusterKubenet,
1677+
VHD: config.VHDUbuntu2204Gen2Containerd, //TODO: add support for older
1678+
BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
1679+
// VM size and distro are set on both the container service's first agent pool profile and nbc.AgentPoolProfile.
nbc.ContainerService.Properties.AgentPoolProfiles[0].VMSize = "Standard_ND96isr_MI300X_v5"
1680+
// NOTE(review): distro is "aks-cblmariner-v2-gen2" although the VHD is Ubuntu 22.04 — verify this is intended.
nbc.ContainerService.Properties.AgentPoolProfiles[0].Distro = "aks-cblmariner-v2-gen2"
1681+
nbc.AgentPoolProfile.VMSize = "Standard_ND96isr_MI300X_v5"
1682+
nbc.AgentPoolProfile.Distro = "aks-cblmariner-v2-gen2"
1683+
nbc.EnableAMDGPU = true
1684+
nbc.ConfigGPUDriverIfNeeded = true
1685+
},
1686+
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
1687+
// Keep the VMSS SKU in sync with the VMSize set in the bootstrap config above.
vmss.SKU.Name = to.Ptr("Standard_ND96isr_MI300X_v5")
1688+
vmss.Properties.VirtualMachineProfile.StorageProfile.OSDisk.DiskSizeGB = to.Ptr[int32](128) // drivers and gpu images are huge, give us some headroom
1689+
},
1690+
Validator: func(ctx context.Context, s *Scenario) {
1691+
ValidateAMDGPU(ctx, s)
1692+
},
1693+
},
1694+
})
1695+
}
1696+
1697+
// Test_Ubuntu2204Gen2Containerd_AMDGPU_V710 runs a GPU-tagged scenario on the
// Standard_NV4ads_V710_v5 VM size using config.VHDUbuntu2204Gen2Containerd on
// ClusterKubenet. The bootstrap config sets EnableAMDGPU and
// ConfigGPUDriverIfNeeded, the VMSS is given a 128 GB OS disk, and the node is
// checked with ValidateAMDGPU.
func Test_Ubuntu2204Gen2Containerd_AMDGPU_V710(t *testing.T) {
1698+
// the SKU isn't available in the subscription/region where we run tests
1699+
//t.Skip("Provisioning of NV4ads_V710_v5 isn't reliable yet")
1700+
// LOCATION=southcentralus
1701+
RunScenario(t, &Scenario{
1702+
// NOTE(review): the description mentions a MarinerV2 VHD, but the VHD below is Ubuntu 22.04 — looks copy-pasted; confirm.
Description: "Tests that a GPU-enabled node using a MarinerV2 VHD can be properly bootstrapped",
1703+
Tags: Tags{
1704+
GPU: true,
1705+
},
1706+
Config: Config{
1707+
Cluster: ClusterKubenet,
1708+
VHD: config.VHDUbuntu2204Gen2Containerd,
1709+
BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
1710+
// VM size and distro are set on both the container service's first agent pool profile and nbc.AgentPoolProfile.
nbc.ContainerService.Properties.AgentPoolProfiles[0].VMSize = "Standard_NV4ads_V710_v5"
1711+
// NOTE(review): distro is "aks-cblmariner-v2-gen2" although the VHD is Ubuntu 22.04 — verify this is intended.
nbc.ContainerService.Properties.AgentPoolProfiles[0].Distro = "aks-cblmariner-v2-gen2"
1712+
nbc.AgentPoolProfile.VMSize = "Standard_NV4ads_V710_v5"
1713+
nbc.AgentPoolProfile.Distro = "aks-cblmariner-v2-gen2"
1714+
nbc.EnableAMDGPU = true
1715+
nbc.ConfigGPUDriverIfNeeded = true
1716+
1717+
},
1718+
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
1719+
// Keep the VMSS SKU in sync with the VMSize set in the bootstrap config above.
vmss.SKU.Name = to.Ptr("Standard_NV4ads_V710_v5")
1720+
vmss.Properties.VirtualMachineProfile.StorageProfile.OSDisk.DiskSizeGB = to.Ptr[int32](128) // drivers and gpu images are huge, give us some headroom
1721+
},
1722+
Validator: func(ctx context.Context, s *Scenario) {
1723+
ValidateAMDGPU(ctx, s)
1724+
},
1725+
},
1726+
})
1727+
}

e2e/validators.go

+12
Original file line numberDiff line numberDiff line change
@@ -427,3 +427,15 @@ func GetFieldFromJsonObjectOnNode(ctx context.Context, s *Scenario, fileName str
427427

428428
return podExecResult.stdout.String()
429429
}
430+
431+
// ValidateAMDGPU checks the scenario's VM and node for AMD GPU support.
//
// It runs `lspci -k` through execScriptOnVMForScenario, requires the exit code
// to equal "0", and asserts that the output contains "amdgpu". It then calls
// ensurePod with the pod from podEnableAMDGPUResource and
// waitUntilResourceAvailable for the "amd.com/gpu" resource.
// Failures are reported through s.T.
func ValidateAMDGPU(ctx context.Context, s *Scenario) {
432+
s.T.Logf("validating pod using AMD GPU")
433+
434+
execResult := execScriptOnVMForScenario(ctx, s, "lspci -k")
435+
// The exit code is compared against the string "0" — presumably exitCode is a string field; confirm against the result type.
require.Equal(s.T, "0", execResult.exitCode, "expected to find lspci command, but did not")
436+
// assert (not require): a missing "amdgpu" match marks the test failed but the steps below still run.
assert.Contains(s.T, execResult.stdout.String(), "amdgpu", "expected to see amdgpu kernel module managing a PCI device, but did not")
437+
438+
ensurePod(ctx, s, podEnableAMDGPUResource(s))
439+
waitUntilResourceAvailable(ctx, s, "amd.com/gpu")
440+
//ensureJob(ctx, s, jobAMDGPUWorkload(s))
441+
}

0 commit comments

Comments
 (0)