From 0b9522b872919bf502b58439089bf3887da83ed0 Mon Sep 17 00:00:00 2001 From: Suraj Deshmukh Date: Wed, 1 Apr 2026 16:21:35 -0700 Subject: [PATCH 1/3] fix(e2e): replace watch with polling in `WaitUntilNodeReady` to fix flaky node detection The Kubernetes API server closes watches after a random 5-10 minute timeout. `WaitUntilNodeReady` had no retry logic, so when the watch channel closed before the node appeared, the test immediately failed with "haven't appeared in k8s API server". Replace the bare watch with `wait.PollUntilContextTimeout` polling every 5s, matching the pattern used by `WaitUntilPodRunningWithRetry`. This also fixes a race where a node added between VMSS creation and watch establishment could be missed entirely. CSEs for the GPU nodes may take longer to become ready, so it is highly likely that the watch times out before the Kubelet can register itself with the kube-apiserver. Signed-off-by: Suraj Deshmukh --- e2e/kube.go | 70 ++++++++++++++++++++++++++--------------------------- 1 file changed, 34 insertions(+), 36 deletions(-) diff --git a/e2e/kube.go b/e2e/kube.go index 3b551f6f370..61e43039638 100644 --- a/e2e/kube.go +++ b/e2e/kube.go @@ -12,7 +12,6 @@ import ( "github.com/Azure/agentbaker/e2e/config" "github.com/Azure/agentbaker/e2e/toolkit" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" - "github.com/stretchr/testify/require" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" v1 "k8s.io/api/core/v1" @@ -20,7 +19,6 @@ import ( "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/wait" - "k8s.io/apimachinery/pkg/watch" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" @@ -147,53 +145,53 @@ func (k *Kubeclient) WaitUntilPodRunning(ctx context.Context, namespace string, func (k *Kubeclient) WaitUntilNodeReady(ctx context.Context, t testing.TB, vmssName string) string { defer toolkit.LogStepf(t, "waiting for node %s to be ready", 
vmssName)() - var node *corev1.Node = nil - watcher, err := k.Typed.CoreV1().Nodes().Watch(ctx, metav1.ListOptions{}) - require.NoError(t, err, "failed to start watching nodes") - defer watcher.Stop() - - for event := range watcher.ResultChan() { - if event.Type != watch.Added && event.Type != watch.Modified { - continue - } - var nodeFromEvent *corev1.Node - switch v := event.Object.(type) { - case *corev1.Node: - nodeFromEvent = v + var lastSeenNode *corev1.Node + var nodeName string - default: - t.Logf("skipping object type %T", event.Object) - continue + err := wait.PollUntilContextTimeout(ctx, 5*time.Second, 90*time.Minute, true, func(ctx context.Context) (bool, error) { + nodes, listErr := k.Typed.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + if listErr != nil { + t.Logf("failed to list nodes: %v", listErr) + return false, nil } - if !strings.HasPrefix(nodeFromEvent.Name, vmssName) { - continue - } + for i := range nodes.Items { + node := &nodes.Items[i] + if !strings.HasPrefix(node.Name, vmssName) { + continue + } - // found the right node. Use it! - node = nodeFromEvent - nodeTaints, _ := json.Marshal(node.Spec.Taints) - nodeConditions, _ := json.Marshal(node.Status.Conditions) + lastSeenNode = node + nodeTaints, _ := json.Marshal(node.Spec.Taints) + nodeConditions, _ := json.Marshal(node.Status.Conditions) - for _, cond := range node.Status.Conditions { - if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue { - t.Logf("node %s is ready. Taints: %s Conditions: %s", node.Name, string(nodeTaints), string(nodeConditions)) - return node.Name + for _, cond := range node.Status.Conditions { + if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue { + t.Logf("node %s is ready. Taints: %s Conditions: %s", node.Name, string(nodeTaints), string(nodeConditions)) + nodeName = node.Name + return true, nil + } } + + t.Logf("node %s is not ready. 
Taints: %s Conditions: %s", node.Name, string(nodeTaints), string(nodeConditions)) + return false, nil } - t.Logf("node %s is not ready. Taints: %s Conditions: %s", node.Name, string(nodeTaints), string(nodeConditions)) - } + return false, nil + }) - if node == nil { - t.Fatalf("%q haven't appeared in k8s API server", vmssName) + if err != nil { + if lastSeenNode == nil { + t.Fatalf("%q haven't appeared in k8s API server: %v", vmssName, err) + } else { + nodeString, _ := json.Marshal(lastSeenNode) + t.Fatalf("failed to wait for %q (%s) to be ready %+v. Detail: %s", vmssName, lastSeenNode.Name, lastSeenNode.Status, string(nodeString)) + } return "" } - nodeString, _ := json.Marshal(node) - t.Fatalf("failed to wait for %q (%s) to be ready %+v. Detail: %s", vmssName, node.Name, node.Status, string(nodeString)) - return node.Name + return nodeName } // GetPodNetworkDebugPodForNode returns a pod that's a member of the 'debugnonhost' daemonset running in the cluster - this will return From 04655129edc7e97518c3ae069707a9c92536c999 Mon Sep 17 00:00:00 2001 From: Suraj Deshmukh Date: Thu, 2 Apr 2026 14:51:15 -0700 Subject: [PATCH 2/3] fix(e2e): check all matching nodes in `WaitUntilNodeReady` poll loop - Replace `return false, nil` with `continue` when a matched node is NotReady so the loop checks all prefix-matched nodes before retrying, fixing a regression where capacity>1 VMSS would short-circuit on the first NotReady node and miss an already-Ready sibling - Use `node.DeepCopy()` for `lastSeenNode` to avoid retaining the full `NodeList` backing array across poll iterations - Include `err` in the timeout `Fatalf` message for easier diagnosis Signed-off-by: Suraj Deshmukh --- e2e/kube.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/e2e/kube.go b/e2e/kube.go index 61e43039638..14378dbce36 100644 --- a/e2e/kube.go +++ b/e2e/kube.go @@ -162,7 +162,7 @@ func (k *Kubeclient) WaitUntilNodeReady(ctx context.Context, t testing.TB, vmssN continue } - 
lastSeenNode = node + lastSeenNode = node.DeepCopy() nodeTaints, _ := json.Marshal(node.Spec.Taints) nodeConditions, _ := json.Marshal(node.Status.Conditions) @@ -175,7 +175,7 @@ func (k *Kubeclient) WaitUntilNodeReady(ctx context.Context, t testing.TB, vmssN } t.Logf("node %s is not ready. Taints: %s Conditions: %s", node.Name, string(nodeTaints), string(nodeConditions)) - return false, nil + continue } return false, nil @@ -186,7 +186,7 @@ func (k *Kubeclient) WaitUntilNodeReady(ctx context.Context, t testing.TB, vmssN t.Fatalf("%q haven't appeared in k8s API server: %v", vmssName, err) } else { nodeString, _ := json.Marshal(lastSeenNode) - t.Fatalf("failed to wait for %q (%s) to be ready %+v. Detail: %s", vmssName, lastSeenNode.Name, lastSeenNode.Status, string(nodeString)) + t.Fatalf("failed to wait for %q (%s) to be ready %+v. Detail: %s Err: %v", vmssName, lastSeenNode.Name, lastSeenNode.Status, string(nodeString), err) } return "" } From fa79411d83767c112dc9726ed5f5f3c328df7459 Mon Sep 17 00:00:00 2001 From: Suraj Deshmukh Date: Fri, 3 Apr 2026 13:45:32 -0700 Subject: [PATCH 3/3] fix(e2e): improve error handling in `WaitUntilNodeReady` - Replace `PollUntilContextTimeout` with `PollUntilContextCancel` to defer timeout to the caller's context deadline instead of hardcoding 90 minutes - Fail fast on non-retryable `Forbidden`/`Unauthorized` API errors instead of silently polling until timeout Signed-off-by: Suraj Deshmukh --- e2e/kube.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/e2e/kube.go b/e2e/kube.go index 14378dbce36..ab3593bf536 100644 --- a/e2e/kube.go +++ b/e2e/kube.go @@ -149,10 +149,13 @@ func (k *Kubeclient) WaitUntilNodeReady(ctx context.Context, t testing.TB, vmssN var lastSeenNode *corev1.Node var nodeName string - err := wait.PollUntilContextTimeout(ctx, 5*time.Second, 90*time.Minute, true, func(ctx context.Context) (bool, error) { + err := wait.PollUntilContextCancel(ctx, 5*time.Second, true, func(ctx context.Context) 
(bool, error) { nodes, listErr := k.Typed.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) if listErr != nil { t.Logf("failed to list nodes: %v", listErr) + if errorsk8s.IsForbidden(listErr) || errorsk8s.IsUnauthorized(listErr) { + return false, listErr + } return false, nil }