redhat-appstudio · olegbet · Oct 27, 2025
diff --git a/...nts/vector-kubearchive-log-collector/development/kubearchive-logging-cleanup-monitor.yaml b/...nts/vector-kubearchive-log-collector/development/kubearchive-logging-cleanup-monitor.yaml
@@ -0,0 +1,139 @@
+---
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: kubearchive-logging-cleanup-script
+  namespace: product-kubearchive-logging
+data:
+  cleanup-monitor.sh: |
+    #!/bin/bash
+
+    # Loki Autoscaling Resource Cleanup Monitor
+    #
+    # Looks for FailedToUpdateEndpointSlices events in $NAMESPACE and, when any
+    # exist, force-deletes pods that are still bound to a node the API server
+    # reports as NotFound. Requires the `oc` CLI on PATH and permission to
+    # list events/pods, delete pods, and get nodes.
+
+    NAMESPACE="product-kubearchive-logging"
+    LOG_FILE="/logs/loki-cleanup.log"
+
+    # Print a timestamped message to stdout and append it to $LOG_FILE.
+    log() {
+        echo "$(date '+%Y-%m-%d %H:%M:%S') $1" | tee -a "$LOG_FILE"
+    }
+
+    # Return 0 only when the API server positively answers that node $1 does
+    # not exist. Any other failure (Forbidden, timeout, missing CLI, ...) is
+    # logged and treated as "unknown" so that healthy pods are never deleted
+    # just because the lookup itself could not be performed.
+    node_is_missing() {
+        local node="$1"
+        local err
+
+        # Capture stderr only; stdin is detached so the caller's read loop
+        # cannot have its input consumed by oc.
+        if err=$(oc get node "$node" -o name 2>&1 >/dev/null </dev/null); then
+            return 1
+        fi
+
+        case "$err" in
+            *"(NotFound)"*)
+                return 0
+                ;;
+        esac
+
+        log "WARNING: Could not verify node $node - skipping cleanup: $err"
+        return 1
+    }
+
+    # Function to check for EndpointSlice issues
+    check_endpoint_slice_health() {
+        log "Checking EndpointSlice health..."
+
+        # Count FailedToUpdateEndpointSlices events currently present in the
+        # namespace (no time window is applied). --no-headers makes the line
+        # count equal to the event count.
+        RECENT_ERRORS=$(oc get events -n "$NAMESPACE" --no-headers \
+            --field-selector reason=FailedToUpdateEndpointSlices 2>/dev/null | wc -l)
+
+        if [ "$RECENT_ERRORS" -gt 0 ]; then
+            log "WARNING: Found $((RECENT_ERRORS)) EndpointSlice update failures"
+
+            # Check for pods on missing nodes. custom-columns yields exactly
+            # "<pod> <node>" per line, independent of the -o wide layout;
+            # unscheduled pods show "<none>" in the node column.
+            oc get pods -n "$NAMESPACE" --no-headers \
+                -o custom-columns=NAME:.metadata.name,NODE:.spec.nodeName |
+            while read -r POD_NAME NODE_NAME; do
+                if [ -z "$NODE_NAME" ] || [ "$NODE_NAME" = "<none>" ]; then
+                    continue
+                fi
+
+                if node_is_missing "$NODE_NAME"; then
+                    log "Found pod $POD_NAME on missing node $NODE_NAME - marking for cleanup"
+                    # Force delete the pod to trigger cleanup
+                    oc delete pod "$POD_NAME" -n "$NAMESPACE" --grace-period=0 --force </dev/null
+                fi
+            done
+        else
+            log "✓ No recent EndpointSlice issues detected"
+        fi
+    }
+
+    # Main execution
+    main() {
+        log "Starting Loki autoscaling cleanup monitor..."
+
+        check_endpoint_slice_health
+
+        log "Cleanup monitor cycle completed"
+    }
+
+    # Run main function
+    main
+---
+# Runs the cleanup-monitor.sh script from the kubearchive-logging-cleanup-script
+# ConfigMap on a fixed schedule, using the loki-cleanup-sa ServiceAccount.
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: kubearchive-logging-cleanup
+  namespace: product-kubearchive-logging
+spec:
+  schedule: "*/15 * * * *"  # Run every 15 minutes
+  jobTemplate:
+    spec:
+      template:
+        spec:
+          serviceAccountName: loki-cleanup-sa
+          containers:
+          - name: cleanup-monitor
+            # The script shells out to `oc`, so the image must provide the
+            # OpenShift CLI; the plain UBI base image does not ship it.
+            image: registry.redhat.io/openshift4/ose-cli:latest
+            command:
+            - /bin/bash
+            - /scripts/cleanup-monitor.sh
+            volumeMounts:
+            - name: cleanup-script
+              mountPath: /scripts
+            - name: log-storage
+              mountPath: /logs
+            resources:
+              requests:
+                cpu: 50m
+                memory: 64Mi
+              limits:
+                cpu: 100m
+                memory: 128Mi
+          volumes:
+          - name: cleanup-script
+            configMap:
+              # Must match metadata.name of the ConfigMap that carries
+              # cleanup-monitor.sh, otherwise the pod cannot mount /scripts.
+              name: kubearchive-logging-cleanup-script
+              defaultMode: 0755
+          - name: log-storage
+            # NOTE(review): this PVC is not defined in this file - confirm it
+            # is created elsewhere in the overlay.
+            persistentVolumeClaim:
+              claimName: kubearchive-logging-logs
+          restartPolicy: OnFailure
+---
+# Identity under which the kubearchive-logging-cleanup CronJob pods run.
+kind: ServiceAccount
+apiVersion: v1
+metadata:
+  namespace: product-kubearchive-logging
+  name: loki-cleanup-sa
+---
+# Namespaced permissions for the cleanup monitor inside
+# product-kubearchive-logging.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: loki-cleanup-role
+  namespace: product-kubearchive-logging
+rules:
+- apiGroups: [""]
+  resources: ["pods", "persistentvolumeclaims", "events"]
+  verbs: ["get", "list", "delete"]
+- apiGroups: ["batch"]
+  resources: ["jobs"]
+  verbs: ["get", "list", "delete"]
+- apiGroups: ["autoscaling"]
+  resources: ["horizontalpodautoscalers"]
+  verbs: ["get", "list"]
+- apiGroups: ["policy"]
+  resources: ["poddisruptionbudgets"]
+  verbs: ["get", "list"]
+- apiGroups: ["metrics.k8s.io"]
+  resources: ["pods"]
+  verbs: ["get", "list"]
+---
+# Nodes are cluster-scoped, so a namespaced Role cannot grant access to them.
+# The cleanup script runs `oc get node <name>` to decide whether a pod's node
+# still exists; without this read-only grant every lookup is Forbidden.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: kubearchive-logging-cleanup-node-reader
+rules:
+- apiGroups: [""]
+  resources: ["nodes"]
+  verbs: ["get"]
+---
+# Grants the node read access above to the cleanup ServiceAccount only.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: kubearchive-logging-cleanup-node-reader
+subjects:
+- kind: ServiceAccount
+  name: loki-cleanup-sa
+  namespace: product-kubearchive-logging
+roleRef:
+  kind: ClusterRole
+  name: kubearchive-logging-cleanup-node-reader
+  apiGroup: rbac.authorization.k8s.io
+---
+# Binds loki-cleanup-role to the loki-cleanup-sa ServiceAccount within the
+# product-kubearchive-logging namespace.
+kind: RoleBinding
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+  namespace: product-kubearchive-logging
+  name: loki-cleanup-binding
+# Role being granted.
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: loki-cleanup-role
+# Identity receiving the role.
+subjects:
+- kind: ServiceAccount
+  namespace: product-kubearchive-logging
+  name: loki-cleanup-sa