From 5e25b565b4e3a80a38fde4d219c177c2c79d7c8a Mon Sep 17 00:00:00 2001 From: Runzhen Wang Date: Wed, 8 Apr 2026 00:38:57 +0000 Subject: [PATCH] e2e: add MIG mixed-strategy scenario and parameterize GPU workload resource name --- e2e/scenario_gpu_daemonset_test.go | 2 +- e2e/scenario_gpu_managed_experience_test.go | 66 +++++++++++++++++++-- e2e/validators.go | 6 +- 3 files changed, 65 insertions(+), 9 deletions(-) diff --git a/e2e/scenario_gpu_daemonset_test.go b/e2e/scenario_gpu_daemonset_test.go index 2b98e3d0384..139b199ee92 100644 --- a/e2e/scenario_gpu_daemonset_test.go +++ b/e2e/scenario_gpu_daemonset_test.go @@ -67,7 +67,7 @@ func Test_Ubuntu2204_NvidiaDevicePlugin_Daemonset(t *testing.T) { ValidateNodeAdvertisesGPUResources(ctx, s, 1, "nvidia.com/gpu") // Validate that GPU workloads can be scheduled - ValidateGPUWorkloadSchedulable(ctx, s, 1) + ValidateGPUWorkloadSchedulable(ctx, s, 1, "nvidia.com/gpu") s.T.Logf("NVIDIA device plugin DaemonSet is functioning correctly") }, diff --git a/e2e/scenario_gpu_managed_experience_test.go b/e2e/scenario_gpu_managed_experience_test.go index b16d265bbcf..56284886ec2 100644 --- a/e2e/scenario_gpu_managed_experience_test.go +++ b/e2e/scenario_gpu_managed_experience_test.go @@ -282,7 +282,7 @@ func Test_Ubuntu2404_NvidiaDevicePluginRunning(t *testing.T) { ValidateNodeAdvertisesGPUResources(ctx, s, 1, "nvidia.com/gpu") // Validate that GPU workloads can be scheduled - ValidateGPUWorkloadSchedulable(ctx, s, 1) + ValidateGPUWorkloadSchedulable(ctx, s, 1, "nvidia.com/gpu") // Validate that the NVIDIA DCGM packages were installed correctly for _, packageName := range getDCGMPackageNames(os) { @@ -360,7 +360,7 @@ func Test_Ubuntu2204_NvidiaDevicePluginRunning(t *testing.T) { ValidateNodeAdvertisesGPUResources(ctx, s, 1, "nvidia.com/gpu") // Validate that GPU workloads can be scheduled - ValidateGPUWorkloadSchedulable(ctx, s, 1) + ValidateGPUWorkloadSchedulable(ctx, s, 1, "nvidia.com/gpu") for _, packageName := range getDCGMPackageNames(os) { versions := 
components.GetExpectedPackageVersions(packageName, os, osVersion) @@ -437,7 +437,7 @@ func Test_AzureLinux3_NvidiaDevicePluginRunning(t *testing.T) { ValidateNodeAdvertisesGPUResources(ctx, s, 1, "nvidia.com/gpu") // Validate that GPU workloads can be scheduled - ValidateGPUWorkloadSchedulable(ctx, s, 1) + ValidateGPUWorkloadSchedulable(ctx, s, 1, "nvidia.com/gpu") for _, packageName := range getDCGMPackageNames(os) { versions := components.GetExpectedPackageVersions(packageName, os, osVersion) @@ -516,7 +516,7 @@ func Test_Ubuntu2404_NvidiaDevicePluginRunning_MIG(t *testing.T) { ValidateNodeAdvertisesGPUResources(ctx, s, 3, "nvidia.com/gpu") // Validate that MIG workloads can be scheduled - ValidateGPUWorkloadSchedulable(ctx, s, 3) + ValidateGPUWorkloadSchedulable(ctx, s, 3, "nvidia.com/gpu") // Validate that the NVIDIA DCGM packages were installed correctly for _, packageName := range getDCGMPackageNames(os) { @@ -587,7 +587,7 @@ func Test_Ubuntu2204_NvidiaDevicePluginRunning_WithoutVMSSTag(t *testing.T) { ValidateNodeAdvertisesGPUResources(ctx, s, 1, "nvidia.com/gpu") // Validate that GPU workloads can be scheduled - ValidateGPUWorkloadSchedulable(ctx, s, 1) + ValidateGPUWorkloadSchedulable(ctx, s, 1, "nvidia.com/gpu") for _, packageName := range getDCGMPackageNames(os) { versions := components.GetExpectedPackageVersions(packageName, os, osVersion) @@ -657,3 +657,59 @@ func Test_CreateVMExtensionLinuxAKSNode_Timing(t *testing.T) { require.Equal(t, *ext.Properties.TypeHandlerVersion, *ext2.Properties.TypeHandlerVersion, "both calls should return the same extension version") } + +func Test_Ubuntu2404_NvidiaDevicePluginRunning_MIG_Mixed(t *testing.T) { + RunScenario(t, &Scenario{ + Description: "Tests that NVIDIA device plugin work with MIG Mixed mode on Ubuntu 24.04 GPU nodes", + Tags: Tags{ + GPU: true, + }, + Config: Config{ + Cluster: ClusterKubenet, + VHD: config.VHDUbuntu2404Gen2Containerd, + WaitForSSHAfterReboot: 5 * time.Minute, + BootstrapConfigMutator: 
func(nbc *datamodel.NodeBootstrappingConfiguration) { + nbc.AgentPoolProfile.VMSize = "Standard_NC24ads_A100_v4" + nbc.ConfigGPUDriverIfNeeded = true + nbc.EnableGPUDevicePluginIfNeeded = true + nbc.EnableNvidia = true + nbc.GPUInstanceProfile = "MIG1g" + nbc.EnableManagedGPU = true + nbc.MigStrategy = "Mixed" + }, + VMConfigMutator: func(vmss *armcompute.VirtualMachineScaleSet) { + vmss.SKU.Name = to.Ptr("Standard_NC24ads_A100_v4") + + // Enable the AKS VM extension for GPU nodes + extension, err := createVMExtensionLinuxAKSNode(t.Context(), vmss.Location) + require.NoError(t, err, "creating AKS VM extension") + vmss.Properties = addVMExtensionToVMSS(vmss.Properties, extension) + }, + Validator: func(ctx context.Context, s *Scenario) { + os := "ubuntu" + osVersion := "r2404" + + // Validate that the NVIDIA device plugin binary was installed correctly + versions := components.GetExpectedPackageVersions("nvidia-device-plugin", os, osVersion) + require.Lenf(s.T, versions, 1, "Expected exactly one nvidia-device-plugin version for %s %s but got %d", os, osVersion, len(versions)) + ValidateInstalledPackageVersion(ctx, s, "nvidia-device-plugin", versions[0]) + + // Validate that the NVIDIA device plugin systemd service is running + ValidateNvidiaDevicePluginServiceRunning(ctx, s) + + // Validate that MIG mode is enabled via nvidia-smi + ValidateMIGModeEnabled(ctx, s) + + // Validate that MIG instances are created + ValidateMIGInstancesCreated(ctx, s, "MIG 1g.10gb") + + // Validate that MIG profile-specific GPU resources are advertised by the device plugin + migResourceName := "nvidia.com/mig-1g.10gb" + ValidateNodeAdvertisesGPUResources(ctx, s, 7, migResourceName) + + // Validate that MIG workloads can be scheduled + ValidateGPUWorkloadSchedulable(ctx, s, 2, migResourceName) + }, + }, + }) +} diff --git a/e2e/validators.go b/e2e/validators.go index b1c614cb3b4..fb07526d900 100644 --- a/e2e/validators.go +++ b/e2e/validators.go @@ -1622,12 +1622,12 @@ func 
ValidateNodeAdvertisesGPUResources(ctx context.Context, s *Scenario, gpuCou s.T.Logf("node %s advertises %s=%d resources", nodeName, resourceName, gpuCount) } -func ValidateGPUWorkloadSchedulable(ctx context.Context, s *Scenario, gpuCount int) { +func ValidateGPUWorkloadSchedulable(ctx context.Context, s *Scenario, gpuCount int, resourceName string) { s.T.Helper() s.T.Logf("validating that GPU workloads can be scheduled") // Wait for resources to be available and add delay for device health - waitUntilResourceAvailable(ctx, s, "nvidia.com/gpu") + waitUntilResourceAvailable(ctx, s, resourceName) time.Sleep(20 * time.Second) // Same delay as existing GPU tests // Create a GPU test pod using the same pattern as podRunNvidiaWorkload @@ -1646,7 +1646,7 @@ func ValidateGPUWorkloadSchedulable(ctx context.Context, s *Scenario, gpuCount i }, Resources: corev1.ResourceRequirements{ Limits: corev1.ResourceList{ - "nvidia.com/gpu": resource.MustParse(fmt.Sprintf("%d", gpuCount)), + corev1.ResourceName(resourceName): resource.MustParse(fmt.Sprintf("%d", gpuCount)), }, }, },