Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 29 additions & 3 deletions pkg/monitortests/network/onpremhaproxy/monitortest.go
Original file line number Diff line number Diff line change
Expand Up @@ -372,11 +372,21 @@ func findFullAPIOutageWindows(downIntervals monitorapi.Intervals, threshold int)
return ret
}

// installGracePeriod is how long after the end of a tolerated all-backends-down window a
// subsequent all-down window may still start and be tolerated. The period is first measured from
// the end of the first window (the expected install-time outage) and is restarted from the end of
// every further window that is tolerated, so a chain of closely spaced bounces is covered.
// During cluster installation, kube-apiserver static pods roll through multiple revisions in quick
// succession and may briefly come up between revisions only to go back down, producing multiple
// short all-down windows that are all part of the same installation phase.
const installGracePeriod = 20 * time.Minute

// evaluateFullAPIOutages produces a junit result failing whenever a single haproxy instance
// reported all kube-apiserver backends down at the same time. The first occurrence for every
// haproxy instance is tolerated: when haproxy starts during the installation, all kube-apiservers
// are expected to be down until they come up for the first time. Additional all-down windows
// that start within installGracePeriod after the end of the previous tolerated window are also
// tolerated, because installer revision rollouts can cause the apiservers to bounce multiple
// times before the control plane stabilises. Any occurrence after that grace period means the
// API was completely unreachable through the on-prem loadbalancer.
func evaluateFullAPIOutages(downIntervals monitorapi.Intervals) []*junitapi.JUnitTestCase {
const testName = "[Jira: Networking / On-Prem Host Networking] Haproxy should not encounter all kube apiservers down simultaneously"

Expand All @@ -390,9 +400,25 @@ func evaluateFullAPIOutages(downIntervals monitorapi.Intervals) []*junitapi.JUni

failures := []string{}
for _, node := range nodes {
windows := outagesPerNode[node]
if len(windows) == 0 {
continue
}

// The first full outage observed by every haproxy instance is the initial state: when haproxy
// starts during the installation, none of the kube-apiservers is up yet.
for _, window := range outagesPerNode[node][1:] {
// Additional all-down windows that start within the install grace period after the end of
// the first window are also part of the installation phase — installer revision rollouts
// can cause apiservers to bounce several times before the control plane stabilises.
graceDeadline := windows[0].to.Add(installGracePeriod)
for _, window := range windows[1:] {
if window.from.Before(graceDeadline) {
// Still within the install grace period — extend the deadline from the end of
// this window so that a chain of closely-spaced install-time bounces is fully
// covered.
graceDeadline = window.to.Add(installGracePeriod)
continue
}
failures = append(failures, fmt.Sprintf(
"haproxy on node %s reported %d or more kube-apiserver backends down at the same time between %s and %s (%s)",
node, fullOutageBackendThreshold, window.from.Format(time.RFC3339), window.to.Format(time.RFC3339), window.to.Sub(window.from)))
Expand Down
62 changes: 62 additions & 0 deletions pkg/monitortests/network/onpremhaproxy/monitortest_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,68 @@ func TestEvaluateFullAPIOutages(t *testing.T) {
),
expectFailure: false,
},
{
name: "install-time bounce within grace period is tolerated",
intervals: append(installOutage("master-0"),
// Second all-down window 90 seconds after the first one ends (at 420).
// This simulates kube-apiservers briefly coming up then going back down
// during installer revision rollouts — a common pattern on 5.0/RHEL-10.
haproxyDownInterval("master-0", "masters/master-0", at(510), at(570)),
Comment thread
mkowalski marked this conversation as resolved.
haproxyDownInterval("master-0", "masters/master-1", at(510), at(570)),
haproxyDownInterval("master-0", "masters/master-2", at(510), at(570)),
),
expectFailure: false,
},
{
name: "multiple install-time bounces within grace period are tolerated",
intervals: append(installOutage("master-0"),
// Second bounce 2 minutes after install outage ends
haproxyDownInterval("master-0", "masters/master-0", at(540), at(600)),
Comment thread
mkowalski marked this conversation as resolved.
haproxyDownInterval("master-0", "masters/master-1", at(540), at(600)),
haproxyDownInterval("master-0", "masters/master-2", at(540), at(600)),
// Third bounce 3 minutes after second ends — still within the sliding
// grace window because the deadline extends from each tolerated window
haproxyDownInterval("master-0", "masters/master-0", at(780), at(840)),
haproxyDownInterval("master-0", "masters/master-1", at(780), at(840)),
haproxyDownInterval("master-0", "masters/master-2", at(780), at(840)),
),
expectFailure: false,
},
{
name: "outage well after install grace period fails",
intervals: append(installOutage("master-0"),
// All-down window 30 minutes after install outage ends (well past the
// 20 minute grace period)
haproxyDownInterval("master-0", "masters/master-0", at(2220), at(2280)),
haproxyDownInterval("master-0", "masters/master-1", at(2220), at(2280)),
haproxyDownInterval("master-0", "masters/master-2", at(2220), at(2280)),
),
expectFailure: true,
expectedOutputs: []string{
"haproxy on node master-0",
at(2220).Format(time.RFC3339),
at(2280).Format(time.RFC3339),
},
},
{
name: "install bounce tolerated but later outage still detected",
intervals: append(installOutage("master-0"),
// Bounce during install grace period — tolerated
haproxyDownInterval("master-0", "masters/master-0", at(510), at(570)),
haproxyDownInterval("master-0", "masters/master-1", at(510), at(570)),
haproxyDownInterval("master-0", "masters/master-2", at(510), at(570)),
// Real outage well after grace period — should fail
haproxyDownInterval("master-0", "masters/master-0", at(7200), at(7230)),
haproxyDownInterval("master-0", "masters/master-1", at(7200), at(7230)),
haproxyDownInterval("master-0", "masters/master-2", at(7200), at(7230)),
),
expectFailure: true,
expectedOutputs: []string{
"haproxy on node master-0",
at(7200).Format(time.RFC3339),
at(7230).Format(time.RFC3339),
},
},
}

for _, tt := range tests {
Expand Down