diff --git a/pkg/monitortests/network/onpremhaproxy/monitortest.go b/pkg/monitortests/network/onpremhaproxy/monitortest.go index 617424039ce2..cf27966a159e 100644 --- a/pkg/monitortests/network/onpremhaproxy/monitortest.go +++ b/pkg/monitortests/network/onpremhaproxy/monitortest.go @@ -372,11 +372,21 @@ func findFullAPIOutageWindows(downIntervals monitorapi.Intervals, threshold int) return ret } +// installGracePeriod is the amount of time after the end of the last tolerated all-backends-down +// window (initially the expected install-time outage) during which the next all-down window is +// still tolerated. During cluster installation, kube-apiserver static pods roll through multiple +// revisions in quick succession and may briefly come up between revisions only to go back down, +// producing multiple short all-down windows that are all part of the same installation phase. +const installGracePeriod = 20 * time.Minute + // evaluateFullAPIOutages produces a junit result failing whenever a single haproxy instance // reported all kube-apiserver backends down at the same time. The first occurrence for every // haproxy instance is tolerated: when haproxy starts during the installation, all kube-apiservers -// are expected to be down until they come up for the first time. Any later occurrence means the -// API was completely unreachable through the on-prem loadbalancer. +// are expected to be down until they come up for the first time. Additional all-down windows +// that start within installGracePeriod after the end of the previous tolerated window are also +// tolerated, because installer revision rollouts can cause the apiservers to bounce multiple +// times before the control plane stabilises. Any occurrence outside that sliding grace period +// means the API was completely unreachable through the on-prem loadbalancer. 
func evaluateFullAPIOutages(downIntervals monitorapi.Intervals) []*junitapi.JUnitTestCase { const testName = "[Jira: Networking / On-Prem Host Networking] Haproxy should not encounter all kube apiservers down simultaneously" @@ -390,9 +400,25 @@ func evaluateFullAPIOutages(downIntervals monitorapi.Intervals) []*junitapi.JUni failures := []string{} for _, node := range nodes { + windows := outagesPerNode[node] + if len(windows) == 0 { + continue + } + // The first full outage observed by every haproxy instance is the initial state: when haproxy // starts during the installation, none of the kube-apiservers is up yet. - for _, window := range outagesPerNode[node][1:] { + // Additional all-down windows that start within the install grace period after the end of + // the first window are also part of the installation phase — installer revision rollouts + // can cause apiservers to bounce several times before the control plane stabilises. + graceDeadline := windows[0].to.Add(installGracePeriod) + for _, window := range windows[1:] { + if window.from.Before(graceDeadline) { + // Still within the install grace period — extend the deadline from the end of + // this window so that a chain of closely-spaced install-time bounces is fully + // covered. 
+ graceDeadline = window.to.Add(installGracePeriod) + continue + } failures = append(failures, fmt.Sprintf( "haproxy on node %s reported %d or more kube-apiserver backends down at the same time between %s and %s (%s)", node, fullOutageBackendThreshold, window.from.Format(time.RFC3339), window.to.Format(time.RFC3339), window.to.Sub(window.from))) diff --git a/pkg/monitortests/network/onpremhaproxy/monitortest_test.go b/pkg/monitortests/network/onpremhaproxy/monitortest_test.go index 4730f2131463..0abbc3460340 100644 --- a/pkg/monitortests/network/onpremhaproxy/monitortest_test.go +++ b/pkg/monitortests/network/onpremhaproxy/monitortest_test.go @@ -196,6 +196,68 @@ func TestEvaluateFullAPIOutages(t *testing.T) { ), expectFailure: false, }, + { + name: "install-time bounce within grace period is tolerated", + intervals: append(installOutage("master-0"), + // Second all-down window 90 seconds after the first one ends (at 420). + // This simulates kube-apiservers briefly coming up then going back down + // during installer revision rollouts — a common pattern on 5.0/RHEL-10. 
+ haproxyDownInterval("master-0", "masters/master-0", at(510), at(570)), + haproxyDownInterval("master-0", "masters/master-1", at(510), at(570)), + haproxyDownInterval("master-0", "masters/master-2", at(510), at(570)), + ), + expectFailure: false, + }, + { + name: "multiple install-time bounces within grace period are tolerated", + intervals: append(installOutage("master-0"), + // Second bounce 2 minutes after install outage ends + haproxyDownInterval("master-0", "masters/master-0", at(540), at(600)), + haproxyDownInterval("master-0", "masters/master-1", at(540), at(600)), + haproxyDownInterval("master-0", "masters/master-2", at(540), at(600)), + // Third bounce 3 minutes after second ends — still within the sliding + // grace window because the deadline extends from each tolerated window + haproxyDownInterval("master-0", "masters/master-0", at(780), at(840)), + haproxyDownInterval("master-0", "masters/master-1", at(780), at(840)), + haproxyDownInterval("master-0", "masters/master-2", at(780), at(840)), + ), + expectFailure: false, + }, + { + name: "outage well after install grace period fails", + intervals: append(installOutage("master-0"), + // All-down window 30 minutes after install outage ends (well past the + // 20 minute grace period) + haproxyDownInterval("master-0", "masters/master-0", at(2220), at(2280)), + haproxyDownInterval("master-0", "masters/master-1", at(2220), at(2280)), + haproxyDownInterval("master-0", "masters/master-2", at(2220), at(2280)), + ), + expectFailure: true, + expectedOutputs: []string{ + "haproxy on node master-0", + at(2220).Format(time.RFC3339), + at(2280).Format(time.RFC3339), + }, + }, + { + name: "install bounce tolerated but later outage still detected", + intervals: append(installOutage("master-0"), + // Bounce during install grace period — tolerated + haproxyDownInterval("master-0", "masters/master-0", at(510), at(570)), + haproxyDownInterval("master-0", "masters/master-1", at(510), at(570)), + 
haproxyDownInterval("master-0", "masters/master-2", at(510), at(570)), + // Real outage well after grace period — should fail + haproxyDownInterval("master-0", "masters/master-0", at(7200), at(7230)), + haproxyDownInterval("master-0", "masters/master-1", at(7200), at(7230)), + haproxyDownInterval("master-0", "masters/master-2", at(7200), at(7230)), + ), + expectFailure: true, + expectedOutputs: []string{ + "haproxy on node master-0", + at(7200).Format(time.RFC3339), + at(7230).Format(time.RFC3339), + }, + }, } for _, tt := range tests {