Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
---
# ConfigMap carrying the cleanup monitor script.
# The single data key below ("cleanup-monitor.sh") becomes the file name when
# this ConfigMap is mounted as a volume; everything after the "|" is the
# literal Bash script body.
apiVersion: v1
kind: ConfigMap
metadata:
name: kubearchive-logging-cleanup-script
namespace: product-kubearchive-logging
data:
cleanup-monitor.sh: |
#!/bin/bash
#
# Loki Autoscaling Resource Cleanup Monitor
# This script monitors and cleans up resources during autoscaling operations
#
# Optional environment overrides:
#   NAMESPACE - namespace to inspect (default: product-kubearchive-logging)
#   LOG_FILE  - file that log() appends to (default: /logs/loki-cleanup.log)

# Defaults match the previously hard-coded values, so behavior is unchanged
# unless the caller exports one of these variables.
NAMESPACE="${NAMESPACE:-product-kubearchive-logging}"
LOG_FILE="${LOG_FILE:-/logs/loki-cleanup.log}"

# Emit one timestamped message on stdout and append the same line to LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Outputs:   "YYYY-MM-DD HH:MM:SS <message>" on stdout and in LOG_FILE
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '%s %s\n' "$stamp" "$1" | tee -a "$LOG_FILE"
}

#######################################
# Look for FailedToUpdateEndpointSlices events in NAMESPACE and, when any
# exist, force-delete pods that are still bound to a node object that no
# longer exists.
#
# A pod is only deleted when the API server positively reports the node as
# absent. Any other lookup failure (RBAC denial, timeout, missing CLI) is
# logged and the pod is left alone, because a force delete is destructive.
#
# Globals:   NAMESPACE (read)
# Arguments: none
# Outputs:   progress through log(); output of `oc delete` passes through
# Returns:   0 when the check ran to completion,
#            1 when events or pods could not be listed
#######################################
check_endpoint_slice_health() {
  local events pods failure_count pod_name node_name node_ref

  log "Checking EndpointSlice health..."

  # --no-headers makes every output line one event, so no header arithmetic
  # is needed. A failing query is reported instead of being read as "healthy".
  if ! events=$(oc get events -n "$NAMESPACE" \
      --field-selector reason=FailedToUpdateEndpointSlices \
      --no-headers 2>/dev/null); then
    log "ERROR: Unable to list events in namespace $NAMESPACE"
    return 1
  fi

  if [[ -z "$events" ]]; then
    log "✓ No recent EndpointSlice issues detected"
    return 0
  fi

  # $(( ... )) strips any padding some wc implementations add.
  failure_count=$(( $(wc -l <<<"$events") ))
  log "WARNING: Found $failure_count EndpointSlice update failures"

  # Ask for exactly the two fields needed. Parsing `-o wide` by column
  # position breaks when RESTARTS is rendered as "3 (5m ago)".
  if ! pods=$(oc get pods -n "$NAMESPACE" --no-headers \
      -o custom-columns=NAME:.metadata.name,NODE:.spec.nodeName); then
    log "ERROR: Unable to list pods in namespace $NAMESPACE"
    return 1
  fi

  # Check for pods on missing nodes
  while read -r pod_name node_name _; do
    # Skip blank lines and pods that are not scheduled onto any node.
    if [[ -z "$pod_name" || -z "$node_name" || "$node_name" == "<none>" ]]; then
      continue
    fi

    # With --ignore-not-found a missing node yields exit 0 and empty output;
    # a non-zero exit therefore means the lookup itself failed.
    if ! node_ref=$(oc get node "$node_name" --ignore-not-found -o name \
        2>/dev/null); then
      log "WARNING: Could not verify node $node_name for pod $pod_name - skipping"
      continue
    fi

    if [[ -z "$node_ref" ]]; then
      log "Found pod $pod_name on missing node $node_name - marking for cleanup"
      # Force delete the pod to trigger cleanup
      oc delete pod "$pod_name" -n "$NAMESPACE" --grace-period=0 --force \
        </dev/null
    fi
  done <<<"$pods"

  return 0
}

# Entry point: run one monitoring cycle, bracketed by start/finish log lines.
# Arguments: none are consumed
# Returns:   status of the final log call
main() {
  log "Starting Loki autoscaling cleanup monitor..."
  check_endpoint_slice_health
  log "Cleanup monitor cycle completed"
}

# Kick off a single cycle; the CronJob schedule provides the repetition.
main "$@"
---
# CronJob that runs /scripts/cleanup-monitor.sh on a fixed schedule using the
# loki-cleanup-sa service account.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: kubearchive-logging-cleanup
  namespace: product-kubearchive-logging
spec:
  schedule: "*/15 * * * *"  # Run every 15 minutes
  jobTemplate:
    spec:
      template:
        spec:
          serviceAccountName: loki-cleanup-sa
          containers:
            - name: cleanup-monitor
              # NOTE(review): the script shells out to `oc`. The stock UBI
              # base image presumably does not ship the OpenShift CLI, so
              # confirm it is present or switch to an image that bundles it.
              # Also consider pinning a tag or digest instead of `latest`.
              image: registry.redhat.io/ubi8/ubi:latest
              command:
                - /bin/bash
                - /scripts/cleanup-monitor.sh
              volumeMounts:
                - name: cleanup-script
                  mountPath: /scripts
                - name: log-storage
                  mountPath: /logs
              resources:
                requests:
                  cpu: 50m
                  memory: 64Mi
                limits:
                  cpu: 100m
                  memory: 128Mi
          volumes:
            - name: cleanup-script
              configMap:
                # Must equal metadata.name of the ConfigMap that holds
                # cleanup-monitor.sh; a mismatch leaves the pod unable to
                # mount the script.
                name: kubearchive-logging-cleanup-script
                defaultMode: 0755
            - name: log-storage
              persistentVolumeClaim:
                # NOTE(review): this PVC is not defined in this manifest;
                # presumably it is created elsewhere - confirm.
                claimName: kubearchive-logging-logs
          restartPolicy: OnFailure
---
# Service account referenced by the cleanup CronJob's pod template
# (serviceAccountName: loki-cleanup-sa).
apiVersion: v1
kind: ServiceAccount
metadata:
  name: loki-cleanup-sa
  namespace: product-kubearchive-logging
---
# Namespaced permissions for the cleanup monitor.
# NOTE(review): the script in this manifest only lists events and pods and
# deletes pods. The remaining grants (persistentvolumeclaims, jobs,
# horizontalpodautoscalers, poddisruptionbudgets, pod metrics, and delete on
# events) look unused - confirm and trim for least privilege.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: loki-cleanup-role
  namespace: product-kubearchive-logging
rules:
  - apiGroups: [""]
    resources: ["pods", "persistentvolumeclaims", "events"]
    verbs: ["get", "list", "delete"]
  - apiGroups: ["batch"]
    resources: ["jobs"]
    verbs: ["get", "list", "delete"]
  - apiGroups: ["autoscaling"]
    resources: ["horizontalpodautoscalers"]
    verbs: ["get", "list"]
  - apiGroups: ["policy"]
    resources: ["poddisruptionbudgets"]
    verbs: ["get", "list"]
  - apiGroups: ["metrics.k8s.io"]
    resources: ["pods"]
    verbs: ["get", "list"]
---
# Nodes are cluster-scoped, so a namespaced Role cannot grant access to them.
# The cleanup script looks nodes up by name to decide whether a pod's node
# still exists; without this grant every lookup is rejected as Forbidden.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: loki-cleanup-node-reader
rules:
  - apiGroups: [""]
    resources: ["nodes"]
    verbs: ["get"]
---
# Grants the node read permission above to the cleanup service account.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: loki-cleanup-node-reader-binding
subjects:
  - kind: ServiceAccount
    name: loki-cleanup-sa
    namespace: product-kubearchive-logging
roleRef:
  kind: ClusterRole
  name: loki-cleanup-node-reader
  apiGroup: rbac.authorization.k8s.io
---
# Binds loki-cleanup-role to the loki-cleanup-sa service account within the
# product-kubearchive-logging namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: loki-cleanup-binding
  namespace: product-kubearchive-logging
subjects:
  - kind: ServiceAccount
    name: loki-cleanup-sa
    namespace: product-kubearchive-logging
roleRef:
  kind: Role
  name: loki-cleanup-role
  apiGroup: rbac.authorization.k8s.io
Loading