Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion e2e/scenario_gpu_daemonset_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ func Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset(t *testing.T) {
ValidateNodeAdvertisesGPUResources(ctx, s, 1, "nvidia.com/gpu")

// Validate that GPU workloads can be scheduled
ValidateGPUWorkloadSchedulable(ctx, s, 1)
ValidateGPUWorkloadSchedulable(ctx, s, 1, "nvidia.com/gpu")

s.T.Logf("NVIDIA device plugin DaemonSet is functioning correctly")
},
Expand Down
66 changes: 61 additions & 5 deletions e2e/scenario_gpu_managed_experience_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,7 @@ func Test_Ubuntu2404_NvidiaDevicePluginRunning(t *testing.T) {
ValidateNodeAdvertisesGPUResources(ctx, s, 1, "nvidia.com/gpu")

// Validate that GPU workloads can be scheduled
ValidateGPUWorkloadSchedulable(ctx, s, 1)
ValidateGPUWorkloadSchedulable(ctx, s, 1, "nvidia.com/gpu")

// Validate that the NVIDIA DCGM packages were installed correctly
for _, packageName := range getDCGMPackageNames(os) {
Expand Down Expand Up @@ -360,7 +360,7 @@ func Test_Ubuntu2204_NvidiaDevicePluginRunning(t *testing.T) {
ValidateNodeAdvertisesGPUResources(ctx, s, 1, "nvidia.com/gpu")

// Validate that GPU workloads can be scheduled
ValidateGPUWorkloadSchedulable(ctx, s, 1)
ValidateGPUWorkloadSchedulable(ctx, s, 1, "nvidia.com/gpu")

for _, packageName := range getDCGMPackageNames(os) {
versions := components.GetExpectedPackageVersions(packageName, os, osVersion)
Expand Down Expand Up @@ -437,7 +437,7 @@ func Test_AzureLinux3_NvidiaDevicePluginRunning(t *testing.T) {
ValidateNodeAdvertisesGPUResources(ctx, s, 1, "nvidia.com/gpu")

// Validate that GPU workloads can be scheduled
ValidateGPUWorkloadSchedulable(ctx, s, 1)
ValidateGPUWorkloadSchedulable(ctx, s, 1, "nvidia.com/gpu")

for _, packageName := range getDCGMPackageNames(os) {
versions := components.GetExpectedPackageVersions(packageName, os, osVersion)
Expand Down Expand Up @@ -516,7 +516,7 @@ func Test_Ubuntu2404_NvidiaDevicePluginRunning_MIG(t *testing.T) {
ValidateNodeAdvertisesGPUResources(ctx, s, 3, "nvidia.com/gpu")

// Validate that MIG workloads can be scheduled
ValidateGPUWorkloadSchedulable(ctx, s, 3)
ValidateGPUWorkloadSchedulable(ctx, s, 3, "nvidia.com/gpu")

// Validate that the NVIDIA DCGM packages were installed correctly
for _, packageName := range getDCGMPackageNames(os) {
Expand Down Expand Up @@ -587,7 +587,7 @@ func Test_Ubuntu2204_NvidiaDevicePluginRunning_WithoutVMSSTag(t *testing.T) {
ValidateNodeAdvertisesGPUResources(ctx, s, 1, "nvidia.com/gpu")

// Validate that GPU workloads can be scheduled
ValidateGPUWorkloadSchedulable(ctx, s, 1)
ValidateGPUWorkloadSchedulable(ctx, s, 1, "nvidia.com/gpu")

for _, packageName := range getDCGMPackageNames(os) {
versions := components.GetExpectedPackageVersions(packageName, os, osVersion)
Expand Down Expand Up @@ -657,3 +657,59 @@ func Test_CreateVMExtensionLinuxAKSNode_Timing(t *testing.T) {
require.Equal(t, *ext.Properties.TypeHandlerVersion, *ext2.Properties.TypeHandlerVersion,
"both calls should return the same extension version")
}

func Test_Ubuntu2404_NvidiaDevicePluginRunning_MIG_Mixed(t *testing.T) {
RunScenario(t, &Scenario{
Description: "Tests that NVIDIA device plugin work with MIG Mixed mode on Ubuntu 24.04 GPU nodes",
Comment thread
runzhen marked this conversation as resolved.
Tags: Tags{
GPU: true,
},
Config: Config{
Cluster: ClusterKubenet,
VHD: config.VHDUbuntu2404Gen2Containerd,
WaitForSSHAfterReboot: 5 * time.Minute,
BootstrapConfigMutator: func(nbc *datamodel.NodeBootstrappingConfiguration) {
nbc.AgentPoolProfile.VMSize = "Standard_NC24ads_A100_v4"
nbc.ConfigGPUDriverIfNeeded = true
nbc.EnableGPUDevicePluginIfNeeded = true
nbc.EnableNvidia = true
nbc.GPUInstanceProfile = "MIG1g"
nbc.EnableManagedGPU = true
nbc.MigStrategy = "Mixed"
},
VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) {
vmss.SKU.Name = to.Ptr("Standard_NC24ads_A100_v4")

// Enable the AKS VM extension for GPU nodes
extension, err := createVMExtensionLinuxAKSNode(t.Context(), vmss.Location)
require.NoError(t, err, "creating AKS VM extension")
vmss.Properties = addVMExtensionToVMSS(vmss.Properties, extension)
},
Validator: func(ctx context.Context, s *Scenario) {
os := "ubuntu"
osVersion := "r2404"

// Validate that the NVIDIA device plugin binary was installed correctly
versions := components.GetExpectedPackageVersions("nvidia-device-plugin", os, osVersion)
require.Lenf(s.T, versions, 1, "Expected exactly one nvidia-device-plugin version for %s %s but got %d", os, osVersion, len(versions))
ValidateInstalledPackageVersion(ctx, s, "nvidia-device-plugin", versions[0])

// Validate that the NVIDIA device plugin systemd service is running
ValidateNvidiaDevicePluginServiceRunning(ctx, s)

// Validate that MIG mode is enabled via nvidia-smi
ValidateMIGModeEnabled(ctx, s)

// Validate that MIG instances are created
ValidateMIGInstancesCreated(ctx, s, "MIG 1g.10gb")

// Validate that MIG profile-specific GPU resources are advertised by the device plugin
migResourceName := "nvidia.com/mig-1g.10gb"
ValidateNodeAdvertisesGPUResources(ctx, s, 7, migResourceName)

// Validate that MIG workloads can be scheduled
ValidateGPUWorkloadSchedulable(ctx, s, 2, migResourceName)
},
},
})
}
6 changes: 3 additions & 3 deletions e2e/validators.go
Original file line number Diff line number Diff line change
Expand Up @@ -1622,12 +1622,12 @@ func ValidateNodeAdvertisesGPUResources(ctx context.Context, s *Scenario, gpuCou
s.T.Logf("node %s advertises %s=%d resources", nodeName, resourceName, gpuCount)
}

func ValidateGPUWorkloadSchedulable(ctx context.Context, s *Scenario, gpuCount int) {
func ValidateGPUWorkloadSchedulable(ctx context.Context, s *Scenario, gpuCount int, resourceName string) {
s.T.Helper()
s.T.Logf("validating that GPU workloads can be scheduled")

// Wait for resources to be available and add delay for device health
waitUntilResourceAvailable(ctx, s, "nvidia.com/gpu")
waitUntilResourceAvailable(ctx, s, resourceName)
time.Sleep(20 * time.Second) // Same delay as existing GPU tests

// Create a GPU test pod using the same pattern as podRunNvidiaWorkload
Expand All @@ -1646,7 +1646,7 @@ func ValidateGPUWorkloadSchedulable(ctx context.Context, s *Scenario, gpuCount i
},
Resources: corev1.ResourceRequirements{
Limits: corev1.ResourceList{
"nvidia.com/gpu": resource.MustParse(fmt.Sprintf("%d", gpuCount)),
corev1.ResourceName(resourceName): resource.MustParse(fmt.Sprintf("%d", gpuCount)),
},
},
},
Expand Down
Loading