Skip to content

Commit 3ebecf4

Browse files
Add safe-to-evict annotation to image builder job (#334)
* Add safe-to-evict annotation to image builder job * Add safe-to-evict annotation to image builder job
1 parent 6c7b8be commit 3ebecf4

File tree

7 files changed

+68
-46
lines changed

7 files changed

+68
-46
lines changed

api/config-dev.yaml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ BatchEnsemblingConfig:
1919
BuildNamespace: default
2020
BuildTimeoutDuration: 20m
2121
DestinationRegistry: ghcr.io
22+
SafeToEvict: false
2223
BaseImageRef:
2324
3.7.*: ghcr.io/caraml-dev/turing/pyfunc-ensembler-job-py3.7:v0.0.0-build.205-cf36433
2425
3.8.*: ghcr.io/caraml-dev/turing/pyfunc-ensembler-job-py3.8:v0.0.0-build.205-cf36433
@@ -69,7 +70,8 @@ RouterDefaults:
6970
ClusterConfig:
7071
InClusterConfig: false
7172
EnvironmentConfigPath: ""
72-
EnsemblingServiceK8sConfig: {}
73+
EnsemblingServiceK8sConfig:
74+
{}
7375
# Example k8s_config to connect to cluster using gke-gcloud-auth-plugin
7476
# name: dev-cluster
7577
# cluster:

api/turing/cluster/job.go

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ type Job struct {
1111
Name string
1212
Namespace string
1313
Labels map[string]string
14+
Annotations map[string]string
1415
Completions *int32
1516
BackOffLimit *int32
1617
TTLSecondsAfterFinished *int32
@@ -47,17 +48,19 @@ func (j *Job) Build() *batchv1.Job {
4748

4849
return &batchv1.Job{
4950
ObjectMeta: metav1.ObjectMeta{
50-
Name: j.Name,
51-
Namespace: j.Namespace,
52-
Labels: j.Labels,
51+
Name: j.Name,
52+
Namespace: j.Namespace,
53+
Labels: j.Labels,
54+
Annotations: j.Annotations,
5355
},
5456
Spec: batchv1.JobSpec{
5557
Completions: j.Completions,
5658
BackoffLimit: j.BackOffLimit,
5759
TTLSecondsAfterFinished: j.TTLSecondsAfterFinished,
5860
Template: corev1.PodTemplateSpec{
5961
ObjectMeta: metav1.ObjectMeta{
60-
Labels: j.Labels,
62+
Labels: j.Labels,
63+
Annotations: j.Annotations,
6164
},
6265
Spec: corev1.PodSpec{
6366
RestartPolicy: j.RestartPolicy,

api/turing/cluster/job_test.go

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ var (
1717
labels = map[string]string{
1818
"foo": "bar",
1919
}
20+
annotations = map[string]string{
21+
"key": "value",
22+
}
2023
jobCompletions int32 = 1
2124
jobBackOffLimit int32 = 3
2225
jobTTLSecondAfterComplete int32 = 3600 * 24
@@ -25,17 +28,19 @@ var (
2528
func TestJob(t *testing.T) {
2629
expected := batchv1.Job{
2730
ObjectMeta: metav1.ObjectMeta{
28-
Name: jobName,
29-
Namespace: namespace,
30-
Labels: labels,
31+
Name: jobName,
32+
Namespace: namespace,
33+
Labels: labels,
34+
Annotations: annotations,
3135
},
3236
Spec: batchv1.JobSpec{
3337
Completions: &jobCompletions,
3438
BackoffLimit: &jobBackOffLimit,
3539
TTLSecondsAfterFinished: &jobTTLSecondAfterComplete,
3640
Template: corev1.PodTemplateSpec{
3741
ObjectMeta: metav1.ObjectMeta{
38-
Labels: labels,
42+
Labels: labels,
43+
Annotations: annotations,
3944
},
4045
Spec: corev1.PodSpec{
4146
RestartPolicy: corev1.RestartPolicyNever,
@@ -67,6 +72,7 @@ func TestJob(t *testing.T) {
6772
Name: jobName,
6873
Namespace: namespace,
6974
Labels: labels,
75+
Annotations: annotations,
7076
Completions: &jobCompletions,
7177
BackOffLimit: &jobBackOffLimit,
7278
TTLSecondsAfterFinished: &jobTTLSecondAfterComplete,

api/turing/config/config.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,8 @@ type ImageBuildingConfig struct {
176176
TolerationName *string
177177
// NodeSelector restricts the running of image building jobs to nodes with the specified labels.
178178
NodeSelector map[string]string
179+
// SafeToEvict, when false, adds the cluster-autoscaler.kubernetes.io/safe-to-evict: "false" annotation to image building jobs.
180+
SafeToEvict bool
179181
}
180182

181183
// Resource contains the Kubernetes resource request and limits
@@ -636,7 +638,8 @@ func StringToQuantityHookFunc() mapstructure.DecodeHookFunc {
636638
return func(
637639
f reflect.Type,
638640
t reflect.Type,
639-
data interface{}) (interface{}, error) {
641+
data interface{},
642+
) (interface{}, error) {
640643
if f.Kind() != reflect.String {
641644
return data, nil
642645
}

api/turing/config/example.yaml

Lines changed: 19 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ LogLevel: DEBUG
66

77
# Enable Cross-Origin Requests (CORS) for the following origins
88
AllowedOrigins:
9-
- "*"
9+
- "*"
1010

1111
# Configuration to enable authorization using ORY Keto server
1212
AuthConfig:
@@ -42,6 +42,7 @@ BatchEnsemblingConfig:
4242
BuildNamespace: default
4343
BuildTimeoutDuration: 20m
4444
DestinationRegistry: ghcr.io
45+
SafeToEvict: false
4546
BaseImageRef:
4647
3.7.*: ghcr.io/caraml-dev/turing/pyfunc-ensembler-job:latest
4748
KanikoConfig: &kanikoConfig
@@ -90,7 +91,7 @@ NewRelic:
9091
Enabled: true
9192
AppName: turing
9293
License: <your very secret license key here>
93-
IgnoreStatusCodes:
94+
IgnoreStatusCodes:
9495
- 400
9596
- 401
9697
- 403
@@ -170,7 +171,7 @@ RouterDefaults:
170171
Sentry:
171172
Enabled: false
172173
DSN: https://public@sentry.example.com/1
173-
Labels:
174+
Labels:
174175
labelkey1: labelvalue1
175176
labelkey2: labelvalue2
176177

@@ -188,10 +189,10 @@ ClusterConfig:
188189
# such as user provided service account keys
189190
TuringEncryptionKey: secret
190191

191-
# Alert configuration for Turing router
192+
# Alert configuration for Turing router
192193
AlertConfig:
193194
Enabled: false
194-
# Persist alert configuration in a GitLab repo with pre-configured
195+
# Persist alert configuration in a GitLab repo with pre-configured
195196
# continuous integration jobs that will register and activate the alerts
196197
# to a backend alert server
197198
GitLab:
@@ -218,7 +219,7 @@ MLPConfig:
218219
TuringUIConfig:
219220
ServingDirectory: /ui
220221
ServingPath: /turing
221-
222+
222223
# Openapi validation configuration
223224
OpenapiConfig:
224225
ValidationEnabled: true
@@ -239,15 +240,15 @@ Experiment:
239240
config:
240241
op: seq
241242
seq:
242-
- op: set
243-
var: variant
244-
value:
245-
choices:
246-
op: array
247-
values:
248-
- control
249-
- treatment-1
250-
unit:
251-
op: get
252-
var: customer
253-
op: uniformChoice
243+
- op: set
244+
var: variant
245+
value:
246+
choices:
247+
op: array
248+
values:
249+
- control
250+
- treatment-1
251+
unit:
252+
op: get
253+
var: customer
254+
op: uniformChoice

api/turing/imagebuilder/imagebuilder.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,10 +237,20 @@ func (ib *imageBuilder) createKanikoJob(
237237
"--single-snapshot",
238238
}
239239

240+
annotations := make(map[string]string)
241+
if !ib.imageBuildingConfig.SafeToEvict {
242+
// Image-building jobs were timing out. One root cause we found is that the node pool gets scaled down,
243+
// which causes the image-building pods to be evicted and rescheduled.
244+
// Setting "cluster-autoscaler.kubernetes.io/safe-to-evict": "false" keeps the autoscaler from evicting the pod.
245+
// https://kubernetes.io/docs/reference/labels-annotations-taints/#cluster-autoscaler-kubernetes-io-safe-to-evict
246+
annotations["cluster-autoscaler.kubernetes.io/safe-to-evict"] = "false"
247+
}
248+
240249
job := cluster.Job{
241250
Name: kanikoJobName,
242251
Namespace: ib.imageBuildingConfig.BuildNamespace,
243252
Labels: buildLabels,
253+
Annotations: annotations,
244254
Completions: &jobCompletions,
245255
BackOffLimit: &jobBackOffLimit,
246256
TTLSecondsAfterFinished: &jobTTLSecondAfterComplete,

infra/charts/turing/values.yaml

Lines changed: 15 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,11 @@ turing:
3131
enabled: false
3232
# -- Set host value to enable name based virtual hosting. This allows routing
3333
# HTTP traffic to multiple host names at the same IP address. If no host is
34-
# specified, the ingress rule applies to all inbound HTTP traffic through
34+
# specified, the ingress rule applies to all inbound HTTP traffic through
3535
# the IP address specified.
3636
# https://kubernetes.io/docs/concepts/services-networking/ingress/#name-based-virtual-hosting
3737
host: ""
38-
# -- Ingress class annotation to add to this Ingress rule,
38+
# -- Ingress class annotation to add to this Ingress rule,
3939
# useful when there are multiple ingress controllers installed
4040
class: ""
4141
# -- Whether to use networking.k8s.io/v1 (k8s version >= 1.19) or networking.k8s.io/v1beta1 (1.16 <= k8s version < 1.22)
@@ -65,7 +65,7 @@ turing:
6565
# command: ['sh', '-c', "until nslookup myservice.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"]
6666

6767
# -- List of sidecar containers to attach to the Pod. For example, you can
68-
# attach sidecar container that forward logs or dynamically update some
68+
# attach sidecar container that forward logs or dynamically update some
6969
# configuration files.
7070
extraContainers: []
7171
# Example:
@@ -80,7 +80,7 @@ turing:
8080
# - name: config-volume
8181
# mountPath: /etc/fluentd-config
8282

83-
# -- Extra volumes to attach to the Pod. For example, you can mount
83+
# -- Extra volumes to attach to the Pod. For example, you can mount
8484
# additional secrets to these volumes
8585
extraVolumes: []
8686
# Example:
@@ -104,20 +104,19 @@ turing:
104104
# See api/environments-dev.yaml for example contents
105105
environmentConfigPath: "environments.yaml"
106106
# -- See ClusterConfig.EnsemblingServiceK8sConfig in api/config-dev.yaml for example structure
107-
ensemblingServiceK8sConfig:
107+
ensemblingServiceK8sConfig:
108108
name: dev-cluster
109109
cluster: {}
110110
user: {}
111111

112112
# -- Set this field to configure environment configs. See api/environments-dev.yaml for sample structure
113-
environmentConfigs:
113+
environmentConfigs:
114114
- name: dev
115115
k8s_config:
116116
name: dev-cluster
117117
cluster: {}
118118
user: {}
119119

120-
121120
# -- Turing Experiment Engines configuration
122121
experimentEngines: []
123122
# Example:
@@ -143,6 +142,7 @@ turing:
143142
BuildNamespace: default
144143
BuildTimeoutDuration: 20m
145144
DestinationRegistry: ghcr.io
145+
SafeToEvict: false
146146
BaseImageRef:
147147
3.7.*: ghcr.io/caraml-dev/turing/pyfunc-ensembler-service:latest
148148
KanikoConfig:
@@ -168,7 +168,7 @@ turing:
168168
FailureRetryInterval: 10
169169
PythonVersion: "3"
170170
TTLSecond: 86400
171-
DeployConfig: { }
171+
DeployConfig: {}
172172
DbConfig:
173173
# The maximum amount of time a connection may be idle. For more info see: https://pkg.go.dev/database/sql#DB.SetConnMaxIdleTime
174174
ConnMaxIdleTime: 0s
@@ -178,8 +178,8 @@ turing:
178178
MaxIdleConns: 0
179179
# The maximum number of open connections to the database. For more info see: https://pkg.go.dev/database/sql#DB.SetMaxOpenConns
180180
MaxOpenConns: 0
181-
KubernetesLabelConfigs: { }
182-
MLPConfig: { }
181+
KubernetesLabelConfigs: {}
182+
MLPConfig: {}
183183
Sentry:
184184
Enabled: false
185185
RouterDefaults:
@@ -190,7 +190,7 @@ turing:
190190
# -- Override OpenAPI spec as long as it follows the OAS3 specifications.
191191
# A common use for this is to set the enums of the ExperimentEngineType.
192192
# See api/api/override-sample.yaml for an example.
193-
openApiSpecOverrides: { }
193+
openApiSpecOverrides: {}
194194

195195
# -- Turing UI configuration.
196196
# Please Refer to https://github.com/caraml-dev/turing/blob/main/ui/public/app.config.js
@@ -213,7 +213,7 @@ turing:
213213
batchEnsemblingEnabled: *batchEnsemblingEnabled
214214
authConfig:
215215
oauthClientId: ""
216-
sentryConfig: { }
216+
sentryConfig: {}
217217

218218
sentry:
219219
# -- Sentry DSN value used by both Turing API and Turing UI
@@ -242,7 +242,7 @@ postgresql:
242242
# -- Password for Turing Postgresql database
243243
password: turing
244244
containerPorts:
245-
postgresql: 5432
245+
postgresql: 5432
246246
metrics:
247247
enabled: false
248248
serviceMonitor:
@@ -256,7 +256,6 @@ postgresql:
256256
numSynchronousReplicas: 2
257257
applicationName: turing
258258

259-
260259
mlp:
261260
# -- MLP API endpoint, used by the MLP UI for fetching data
262261
apiHost: /api/v1
@@ -282,11 +281,11 @@ mlp:
282281
# -- Name of MLP's Postgresql deployment
283282
nameOverride: postgresql-mlp
284283
containerPorts:
285-
postgresql: 5432
284+
postgresql: 5432
286285
environmentConfigSecret:
287286
# -- Name refers to a secret that contains the K8sConfig. Set if there already exists a K8s secret
288287
# that turing should use. Key in secret defaults to environment.yaml
289-
name: ""
288+
name: ""
290289
envKey: environment.yaml
291290

292291
merlin:
@@ -303,7 +302,6 @@ merlin:
303302
# @default -- computed value
304303
apiHost: "http://{{ .Release.Name }}-mlp:8080/v1"
305304

306-
307305
# -- List of Merlin environment configs, available to Turing for deploying routers
308306
# By default, a new dev environment will automatically be created
309307
# @default -- computed value
@@ -343,7 +341,6 @@ global:
343341
# -- (string) Global MLP Encryption Key to be used by all MLP components
344342
key:
345343

346-
347344
# Tags are used to include/exclude chart dependencies
348345
tags:
349346
# -- Specifies if the necessary MLP components need to be installed together with Turing

0 commit comments

Comments
 (0)