Add safe-to-evict annotation to image builder job (#334)

ariefrahmansyah · web-flow · commit 3ebecf41f284 · 2023-04-18T21:11:36.000+08:00
* Add safe-to-evict annotation to image builder job

* Add safe-to-evict annotation to image builder job
diff --git a/api/config-dev.yaml b/api/config-dev.yaml
@@ -19,6 +19,7 @@ BatchEnsemblingConfig:
     BuildNamespace: default
     BuildTimeoutDuration: 20m
     DestinationRegistry: ghcr.io
+    SafeToEvict: false
     BaseImageRef:
       3.7.*: ghcr.io/caraml-dev/turing/pyfunc-ensembler-job-py3.7:v0.0.0-build.205-cf36433
       3.8.*: ghcr.io/caraml-dev/turing/pyfunc-ensembler-job-py3.8:v0.0.0-build.205-cf36433
@@ -69,7 +70,8 @@ RouterDefaults:
 ClusterConfig:
   InClusterConfig: false
   EnvironmentConfigPath: ""
-  EnsemblingServiceK8sConfig: {}
+  EnsemblingServiceK8sConfig:
+    {}
     # Example k8s_config to connect to cluster using gke-gcloud-auth-plugin
     # name: dev-cluster
     # cluster:
diff --git a/api/turing/cluster/job.go b/api/turing/cluster/job.go
@@ -11,6 +11,7 @@ type Job struct {
 	Name                    string
 	Namespace               string
 	Labels                  map[string]string
+	Annotations             map[string]string
 	Completions             *int32
 	BackOffLimit            *int32
 	TTLSecondsAfterFinished *int32
@@ -47,17 +48,19 @@ func (j *Job) Build() *batchv1.Job {
 
 	return &batchv1.Job{
 		ObjectMeta: metav1.ObjectMeta{
-			Name:      j.Name,
-			Namespace: j.Namespace,
-			Labels:    j.Labels,
+			Name:        j.Name,
+			Namespace:   j.Namespace,
+			Labels:      j.Labels,
+			Annotations: j.Annotations,
 		},
 		Spec: batchv1.JobSpec{
 			Completions:             j.Completions,
 			BackoffLimit:            j.BackOffLimit,
 			TTLSecondsAfterFinished: j.TTLSecondsAfterFinished,
 			Template: corev1.PodTemplateSpec{
 				ObjectMeta: metav1.ObjectMeta{
-					Labels: j.Labels,
+					Labels:      j.Labels,
+					Annotations: j.Annotations,
 				},
 				Spec: corev1.PodSpec{
 					RestartPolicy: j.RestartPolicy,
diff --git a/api/turing/cluster/job_test.go b/api/turing/cluster/job_test.go
@@ -17,6 +17,9 @@ var (
 	labels = map[string]string{
 		"foo": "bar",
 	}
+	annotations = map[string]string{
+		"key": "value",
+	}
 	jobCompletions            int32 = 1
 	jobBackOffLimit           int32 = 3
 	jobTTLSecondAfterComplete int32 = 3600 * 24
@@ -25,17 +28,19 @@ var (
 func TestJob(t *testing.T) {
 	expected := batchv1.Job{
 		ObjectMeta: metav1.ObjectMeta{
-			Name:      jobName,
-			Namespace: namespace,
-			Labels:    labels,
+			Name:        jobName,
+			Namespace:   namespace,
+			Labels:      labels,
+			Annotations: annotations,
 		},
 		Spec: batchv1.JobSpec{
 			Completions:             &jobCompletions,
 			BackoffLimit:            &jobBackOffLimit,
 			TTLSecondsAfterFinished: &jobTTLSecondAfterComplete,
 			Template: corev1.PodTemplateSpec{
 				ObjectMeta: metav1.ObjectMeta{
-					Labels: labels,
+					Labels:      labels,
+					Annotations: annotations,
 				},
 				Spec: corev1.PodSpec{
 					RestartPolicy: corev1.RestartPolicyNever,
@@ -67,6 +72,7 @@ func TestJob(t *testing.T) {
 		Name:                    jobName,
 		Namespace:               namespace,
 		Labels:                  labels,
+		Annotations:             annotations,
 		Completions:             &jobCompletions,
 		BackOffLimit:            &jobBackOffLimit,
 		TTLSecondsAfterFinished: &jobTTLSecondAfterComplete,
diff --git a/api/turing/config/config.go b/api/turing/config/config.go
@@ -176,6 +176,8 @@ type ImageBuildingConfig struct {
 	TolerationName *string
 	// NodeSelector restricts the running of image building jobs to nodes with the specified labels.
 	NodeSelector map[string]string
+	// SafeToEvict, when false, adds cluster-autoscaler.kubernetes.io/safe-to-evict: "false" to image building jobs.
+	SafeToEvict bool
 }
 
 // Resource contains the Kubernetes resource request and limits
@@ -636,7 +638,8 @@ func StringToQuantityHookFunc() mapstructure.DecodeHookFunc {
 	return func(
 		f reflect.Type,
 		t reflect.Type,
-		data interface{}) (interface{}, error) {
+		data interface{},
+	) (interface{}, error) {
 		if f.Kind() != reflect.String {
 			return data, nil
 		}
diff --git a/api/turing/config/example.yaml b/api/turing/config/example.yaml
@@ -6,7 +6,7 @@ LogLevel: DEBUG
 
 # Enable Cross-Origin Requests (CORS) for the following origins
 AllowedOrigins:
-- "*"
+  - "*"
 
 # Configuration to enable authorization using ORY Keto server
 AuthConfig:
@@ -42,6 +42,7 @@ BatchEnsemblingConfig:
     BuildNamespace: default
     BuildTimeoutDuration: 20m
     DestinationRegistry: ghcr.io
+    SafeToEvict: false
     BaseImageRef:
       3.7.*: ghcr.io/caraml-dev/turing/pyfunc-ensembler-job:latest
     KanikoConfig: &kanikoConfig
@@ -90,7 +91,7 @@ NewRelic:
   Enabled: true
   AppName: turing
   License: <your very secret license key here>
-  IgnoreStatusCodes: 
+  IgnoreStatusCodes:
     - 400
     - 401
     - 403
@@ -170,7 +171,7 @@ RouterDefaults:
 Sentry:
   Enabled: false
   DSN: https://public@sentry.example.com/1
-  Labels: 
+  Labels:
     labelkey1: labelvalue1
     labelkey2: labelvalue2
 
@@ -188,10 +189,10 @@ ClusterConfig:
 # such as user provided service account keys
 TuringEncryptionKey: secret
 
-# Alert configuration for Turing router 
+# Alert configuration for Turing router
 AlertConfig:
   Enabled: false
-  # Persist alert configuration in a GitLab repo with pre-configured 
+  # Persist alert configuration in a GitLab repo with pre-configured
   # continuous integration jobs that will register and activate the alerts
   # to a backend alert server
   GitLab:
@@ -218,7 +219,7 @@ MLPConfig:
 TuringUIConfig:
   ServingDirectory: /ui
   ServingPath: /turing
- 
+
 # Openapi validation configuration
 OpenapiConfig:
   ValidationEnabled: true
@@ -239,15 +240,15 @@ Experiment:
     config:
       op: seq
       seq:
-      - op: set
-        var: variant
-        value:
-          choices:
-            op: array
-            values:
-            - control
-            - treatment-1
-          unit:
-            op: get
-            var: customer
-          op: uniformChoice
+        - op: set
+          var: variant
+          value:
+            choices:
+              op: array
+              values:
+                - control
+                - treatment-1
+            unit:
+              op: get
+              var: customer
+            op: uniformChoice
diff --git a/api/turing/imagebuilder/imagebuilder.go b/api/turing/imagebuilder/imagebuilder.go
@@ -237,10 +237,20 @@ func (ib *imageBuilder) createKanikoJob(
 		"--single-snapshot",
 	}
 
+	annotations := make(map[string]string)
+	if !ib.imageBuildingConfig.SafeToEvict {
+		// The image-building jobs were timing out. One root cause we found is that the node pool got scaled down,
+		// which caused the image-building pods to be rescheduled.
+		// Set "cluster-autoscaler.kubernetes.io/safe-to-evict": "false" so the pod is not killed and rescheduled.
+		// https://kubernetes.io/docs/reference/labels-annotations-taints/#cluster-autoscaler-kubernetes-io-safe-to-evict
+		annotations["cluster-autoscaler.kubernetes.io/safe-to-evict"] = "false"
+	}
+
 	job := cluster.Job{
 		Name:                    kanikoJobName,
 		Namespace:               ib.imageBuildingConfig.BuildNamespace,
 		Labels:                  buildLabels,
+		Annotations:             annotations,
 		Completions:             &jobCompletions,
 		BackOffLimit:            &jobBackOffLimit,
 		TTLSecondsAfterFinished: &jobTTLSecondAfterComplete,
diff --git a/infra/charts/turing/values.yaml b/infra/charts/turing/values.yaml
@@ -31,11 +31,11 @@ turing:
     enabled: false
     # -- Set host value to enable name based virtual hosting. This allows routing
     # HTTP traffic to multiple host names at the same IP address. If no host is
-    # specified, the ingress rule applies to all inbound HTTP traffic through 
+    # specified, the ingress rule applies to all inbound HTTP traffic through
     # the IP address specified.
     # https://kubernetes.io/docs/concepts/services-networking/ingress/#name-based-virtual-hosting
     host: ""
-    # -- Ingress class annotation to add to this Ingress rule, 
+    # -- Ingress class annotation to add to this Ingress rule,
     # useful when there are multiple ingress controllers installed
     class: ""
     # -- Whether to use networking.k8s.io/v1 (k8s version >= 1.19) or networking.k8s.io/v1beta1 (1.16 >= k8s version >= 1.22)
@@ -65,7 +65,7 @@ turing:
   #   command: ['sh', '-c', "until nslookup myservice.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
 
   # -- List of sidecar containers to attach to the Pod. For example, you can
-  # attach sidecar container that forward logs or dynamically update some 
+  # attach sidecar container that forward logs or dynamically update some
   # configuration files.
   extraContainers: []
   # Example:
@@ -80,7 +80,7 @@ turing:
   #   - name: config-volume
   #     mountPath: /etc/fluentd-config
 
-  # -- Extra volumes to attach to the Pod. For example, you can mount 
+  # -- Extra volumes to attach to the Pod. For example, you can mount
   # additional secrets to these volumes
   extraVolumes: []
   # Example:
@@ -104,20 +104,19 @@ turing:
     # See api/environments-dev.yaml for example contents
     environmentConfigPath: "environments.yaml"
     # -- See ClusterConfig.EnsemblingServiceK8sConfig in api/config-dev.yaml for example structure
-    ensemblingServiceK8sConfig: 
+    ensemblingServiceK8sConfig:
       name: dev-cluster
       cluster: {}
       user: {}
 
   # -- Set this field to configure environment configs. See api/environments-dev.yaml for sample structure
-  environmentConfigs: 
+  environmentConfigs:
     - name: dev
       k8s_config:
         name: dev-cluster
         cluster: {}
         user: {}
 
-
   # -- Turing Experiment Engines configuration
   experimentEngines: []
   # Example:
@@ -143,6 +142,7 @@ turing:
         BuildNamespace: default
         BuildTimeoutDuration: 20m
         DestinationRegistry: ghcr.io
+        SafeToEvict: false
         BaseImageRef:
           3.7.*: ghcr.io/caraml-dev/turing/pyfunc-ensembler-service:latest
         KanikoConfig:
@@ -168,7 +168,7 @@ turing:
       FailureRetryInterval: 10
       PythonVersion: "3"
       TTLSecond: 86400
-    DeployConfig: { }
+    DeployConfig: {}
     DbConfig:
       # The maximum amount of time a connection may be idle. For more info see: https://pkg.go.dev/database/sql#DB.SetConnMaxIdleTime
       ConnMaxIdleTime: 0s
@@ -178,8 +178,8 @@ turing:
       MaxIdleConns: 0
       # The maximum number of open connections to the database. For more info see: https://pkg.go.dev/database/sql#DB.SetMaxOpenConns
       MaxOpenConns: 0
-    KubernetesLabelConfigs: { }
-    MLPConfig: { }
+    KubernetesLabelConfigs: {}
+    MLPConfig: {}
     Sentry:
       Enabled: false
     RouterDefaults:
@@ -190,7 +190,7 @@ turing:
   # -- Override OpenAPI spec as long as it follows the OAS3 specifications.
   # A common use for this is to set the enums of the ExperimentEngineType.
   # See api/api/override-sample.yaml for an example.
-  openApiSpecOverrides: { }
+  openApiSpecOverrides: {}
 
   # -- Turing UI configuration.
   # Please Refer to https://github.com/caraml-dev/turing/blob/main/ui/public/app.config.js
@@ -213,7 +213,7 @@ turing:
       batchEnsemblingEnabled: *batchEnsemblingEnabled
     authConfig:
       oauthClientId: ""
-    sentryConfig: { }
+    sentryConfig: {}
 
 sentry:
   # -- Sentry DSN value used by both Turing API and Turing UI
@@ -242,7 +242,7 @@ postgresql:
     # -- Password for Turing Postgresql database
     password: turing
   containerPorts:
-      postgresql: 5432
+    postgresql: 5432
   metrics:
     enabled: false
     serviceMonitor:
@@ -256,7 +256,6 @@ postgresql:
     numSynchronousReplicas: 2
     applicationName: turing
 
-
 mlp:
   # -- MLP API endpoint, used by the MLP UI for fetching data
   apiHost: /api/v1
@@ -282,11 +281,11 @@ mlp:
     # -- Name of MLP's Postgresql deployment
     nameOverride: postgresql-mlp
     containerPorts:
-      postgresql: 5432 
+      postgresql: 5432
   environmentConfigSecret:
     # -- Name refers to a secret that contains the K8sConfig. Set if there already exists a K8s secret
     # that turing should use. Key in secret defaults to environment.yaml
-    name: "" 
+    name: ""
     envKey: environment.yaml
 
 merlin:
@@ -303,7 +302,6 @@ merlin:
     # @default -- computed value
     apiHost: "http://{{ .Release.Name }}-mlp:8080/v1"
 
-
   # -- List of Merlin environment configs, available to Turing for deploying routers
   # By default, a new dev environment will automatically be created
   # @default -- computed value
@@ -343,7 +341,6 @@ global:
       # -- (string) Global MLP Encryption Key to be used by all MLP components
       key:
 
-
 # Tags are used to include/exclude chart dependencies
 tags:
   # -- Specifies if the necessary MLP components needs to be installed together with Turing