From 0b9522b872919bf502b58439089bf3887da83ed0 Mon Sep 17 00:00:00 2001 From: Suraj Deshmukh Date: Wed, 1 Apr 2026 16:21:35 -0700 Subject: [PATCH 1/3] fix(e2e): replace watch with polling in `WaitUntilNodeReady` to fix flaky node detection The Kubernetes API server closes watches after a random 5-10 minute timeout. `WaitUntilNodeReady` had no retry logic, so when the watch channel closed before the node appeared, the test immediately failed with "haven't appeared in k8s API server". Replace the bare watch with `wait.PollUntilContextTimeout` polling every 5s, matching the pattern used by `WaitUntilPodRunningWithRetry`. This also fixes a race where a node added between VMSS creation and watch establishment could be missed entirely. CSEs for the GPU nodes may take longer to become ready, so it is highly likely that the watch times out before the Kubelet can register itself with the kube-apiserver. Signed-off-by: Suraj Deshmukh --- e2e/kube.go | 70 ++++++++++++++++++++++++++--------------------------- 1 file changed, 34 insertions(+), 36 deletions(-) diff --git a/e2e/kube.go b/e2e/kube.go index 3b551f6f370..61e43039638 100644 --- a/e2e/kube.go +++ b/e2e/kube.go @@ -12,7 +12,6 @@ import ( "github.com/Azure/agentbaker/e2e/config" "github.com/Azure/agentbaker/e2e/toolkit" "github.com/Azure/azure-sdk-for-go/sdk/azcore/to" - "github.com/stretchr/testify/require" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" v1 "k8s.io/api/core/v1" @@ -20,7 +19,6 @@ import ( "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/wait" - "k8s.io/apimachinery/pkg/watch" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" @@ -147,53 +145,53 @@ func (k *Kubeclient) WaitUntilPodRunning(ctx context.Context, namespace string, func (k *Kubeclient) WaitUntilNodeReady(ctx context.Context, t testing.TB, vmssName string) string { defer toolkit.LogStepf(t, "waiting for node %s to be ready", 
vmssName)() - var node *corev1.Node = nil - watcher, err := k.Typed.CoreV1().Nodes().Watch(ctx, metav1.ListOptions{}) - require.NoError(t, err, "failed to start watching nodes") - defer watcher.Stop() - - for event := range watcher.ResultChan() { - if event.Type != watch.Added && event.Type != watch.Modified { - continue - } - var nodeFromEvent *corev1.Node - switch v := event.Object.(type) { - case *corev1.Node: - nodeFromEvent = v + var lastSeenNode *corev1.Node + var nodeName string - default: - t.Logf("skipping object type %T", event.Object) - continue + err := wait.PollUntilContextTimeout(ctx, 5*time.Second, 90*time.Minute, true, func(ctx context.Context) (bool, error) { + nodes, listErr := k.Typed.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + if listErr != nil { + t.Logf("failed to list nodes: %v", listErr) + return false, nil } - if !strings.HasPrefix(nodeFromEvent.Name, vmssName) { - continue - } + for i := range nodes.Items { + node := &nodes.Items[i] + if !strings.HasPrefix(node.Name, vmssName) { + continue + } - // found the right node. Use it! - node = nodeFromEvent - nodeTaints, _ := json.Marshal(node.Spec.Taints) - nodeConditions, _ := json.Marshal(node.Status.Conditions) + lastSeenNode = node + nodeTaints, _ := json.Marshal(node.Spec.Taints) + nodeConditions, _ := json.Marshal(node.Status.Conditions) - for _, cond := range node.Status.Conditions { - if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue { - t.Logf("node %s is ready. Taints: %s Conditions: %s", node.Name, string(nodeTaints), string(nodeConditions)) - return node.Name + for _, cond := range node.Status.Conditions { + if cond.Type == corev1.NodeReady && cond.Status == corev1.ConditionTrue { + t.Logf("node %s is ready. Taints: %s Conditions: %s", node.Name, string(nodeTaints), string(nodeConditions)) + nodeName = node.Name + return true, nil + } } + + t.Logf("node %s is not ready. 
Taints: %s Conditions: %s", node.Name, string(nodeTaints), string(nodeConditions)) + return false, nil } - t.Logf("node %s is not ready. Taints: %s Conditions: %s", node.Name, string(nodeTaints), string(nodeConditions)) - } + return false, nil + }) - if node == nil { - t.Fatalf("%q haven't appeared in k8s API server", vmssName) + if err != nil { + if lastSeenNode == nil { + t.Fatalf("%q haven't appeared in k8s API server: %v", vmssName, err) + } else { + nodeString, _ := json.Marshal(lastSeenNode) + t.Fatalf("failed to wait for %q (%s) to be ready %+v. Detail: %s", vmssName, lastSeenNode.Name, lastSeenNode.Status, string(nodeString)) + } return "" } - nodeString, _ := json.Marshal(node) - t.Fatalf("failed to wait for %q (%s) to be ready %+v. Detail: %s", vmssName, node.Name, node.Status, string(nodeString)) - return node.Name + return nodeName } // GetPodNetworkDebugPodForNode returns a pod that's a member of the 'debugnonhost' daemonset running in the cluster - this will return From 04655129edc7e97518c3ae069707a9c92536c999 Mon Sep 17 00:00:00 2001 From: Suraj Deshmukh Date: Thu, 2 Apr 2026 14:51:15 -0700 Subject: [PATCH 2/3] fix(e2e): check all matching nodes in `WaitUntilNodeReady` poll loop - Replace `return false, nil` with `continue` when a matched node is NotReady so the loop checks all prefix-matched nodes before retrying, fixing a regression where capacity>1 VMSS would short-circuit on the first NotReady node and miss an already-Ready sibling - Use `node.DeepCopy()` for `lastSeenNode` to avoid retaining the full `NodeList` backing array across poll iterations - Include `err` in the timeout `Fatalf` message for easier diagnosis Signed-off-by: Suraj Deshmukh --- e2e/kube.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/e2e/kube.go b/e2e/kube.go index 61e43039638..14378dbce36 100644 --- a/e2e/kube.go +++ b/e2e/kube.go @@ -162,7 +162,7 @@ func (k *Kubeclient) WaitUntilNodeReady(ctx context.Context, t testing.TB, vmssN continue } - 
lastSeenNode = node + lastSeenNode = node.DeepCopy() nodeTaints, _ := json.Marshal(node.Spec.Taints) nodeConditions, _ := json.Marshal(node.Status.Conditions) @@ -175,7 +175,7 @@ func (k *Kubeclient) WaitUntilNodeReady(ctx context.Context, t testing.TB, vmssN } t.Logf("node %s is not ready. Taints: %s Conditions: %s", node.Name, string(nodeTaints), string(nodeConditions)) - return false, nil + continue } return false, nil @@ -186,7 +186,7 @@ func (k *Kubeclient) WaitUntilNodeReady(ctx context.Context, t testing.TB, vmssN t.Fatalf("%q haven't appeared in k8s API server: %v", vmssName, err) } else { nodeString, _ := json.Marshal(lastSeenNode) - t.Fatalf("failed to wait for %q (%s) to be ready %+v. Detail: %s", vmssName, lastSeenNode.Name, lastSeenNode.Status, string(nodeString)) + t.Fatalf("failed to wait for %q (%s) to be ready %+v. Detail: %s Err: %v", vmssName, lastSeenNode.Name, lastSeenNode.Status, string(nodeString), err) } return "" } From fa79411d83767c112dc9726ed5f5f3c328df7459 Mon Sep 17 00:00:00 2001 From: Suraj Deshmukh Date: Fri, 3 Apr 2026 13:45:32 -0700 Subject: [PATCH 3/3] fix(e2e): improve error handling in `WaitUntilNodeReady` - Replace `PollUntilContextTimeout` with `PollUntilContextCancel` to defer timeout to the caller's context deadline instead of hardcoding 90 minutes - Fail fast on non-retryable `Forbidden`/`Unauthorized` API errors instead of silently polling until timeout Signed-off-by: Suraj Deshmukh --- e2e/kube.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/e2e/kube.go b/e2e/kube.go index 14378dbce36..ab3593bf536 100644 --- a/e2e/kube.go +++ b/e2e/kube.go @@ -149,10 +149,13 @@ func (k *Kubeclient) WaitUntilNodeReady(ctx context.Context, t testing.TB, vmssN var lastSeenNode *corev1.Node var nodeName string - err := wait.PollUntilContextTimeout(ctx, 5*time.Second, 90*time.Minute, true, func(ctx context.Context) (bool, error) { + err := wait.PollUntilContextCancel(ctx, 5*time.Second, true, func(ctx context.Context) 
(bool, error) { nodes, listErr := k.Typed.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) if listErr != nil { t.Logf("failed to list nodes: %v", listErr) + if errorsk8s.IsForbidden(listErr) || errorsk8s.IsUnauthorized(listErr) { + return false, listErr + } return false, nil }