From a608d9452d7418e399950fe504d648f860ed2824 Mon Sep 17 00:00:00 2001 From: alimaazamat Date: Thu, 30 Oct 2025 14:46:00 -0700 Subject: [PATCH 1/2] add validation for idleTimeoutSeconds config per worker groups Signed-off-by: alimaazamat --- .../controllers/ray/utils/validation.go | 21 ++ .../controllers/ray/utils/validation_test.go | 191 ++++++++++++++++++ 2 files changed, 212 insertions(+) diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index 1bcda9af903..06af8b3a6d3 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -104,6 +104,9 @@ func ValidateRayClusterSpec(spec *rayv1.RayClusterSpec, annotations map[string]s if err := validateRayGroupLabels(workerGroup.GroupName, workerGroup.RayStartParams, workerGroup.Labels); err != nil { return err } + if err := validateWorkerGroupIdleTimeout(workerGroup, spec); err != nil { + return err + } } if annotations[RayFTEnabledAnnotationKey] != "" && spec.GcsFaultToleranceOptions != nil { @@ -596,3 +599,21 @@ func validateLegacyDeletionPolicies(rayJob *rayv1.RayJob) error { return nil } + +// validateWorkerGroupIdleTimeout validates the idleTimeoutSeconds field in a worker group spec +func validateWorkerGroupIdleTimeout(workerGroup rayv1.WorkerGroupSpec, spec *rayv1.RayClusterSpec) error { + idleTimeoutSeconds := workerGroup.IdleTimeoutSeconds + if idleTimeoutSeconds != nil { + if *idleTimeoutSeconds < 0 { + return fmt.Errorf("idleTimeoutSeconds must be non-negative, got %d", *idleTimeoutSeconds) + } + + // idleTimeoutSeconds only allowed on autoscaler v2 + envVar, exists := EnvVarByName(RAY_ENABLE_AUTOSCALER_V2, spec.HeadGroupSpec.Template.Spec.Containers[RayContainerIndex].Env) + if !exists || (envVar.Value != "1" && envVar.Value != "true") { + return fmt.Errorf("worker group %s has idleTimeoutSeconds set, but %s environment variable is not set to 'true' in the head pod", workerGroup.GroupName, RAY_ENABLE_AUTOSCALER_V2) + } + } + + return nil +} diff --git a/ray-operator/controllers/ray/utils/validation_test.go b/ray-operator/controllers/ray/utils/validation_test.go index 014d0917d68..214b1408175 100644 --- a/ray-operator/controllers/ray/utils/validation_test.go +++ b/ray-operator/controllers/ray/utils/validation_test.go @@ -1882,3 +1882,194 @@ func TestValidateClusterUpgradeOptions(t *testing.T) { }) } } + +func TestValidateWorkerGroupIdleTimeout(t *testing.T) { + tests := map[string]struct { + expectedErr string + spec rayv1.RayClusterSpec + }{ + "should accept worker group with valid idleTimeoutSeconds": { + spec: rayv1.RayClusterSpec{ + EnableInTreeAutoscaling: ptr.To(true), + HeadGroupSpec: rayv1.HeadGroupSpec{ + Template: podTemplateSpec([]corev1.EnvVar{ + {Name: RAY_ENABLE_AUTOSCALER_V2, Value: "1"}, + }, nil), + }, + WorkerGroupSpecs: []rayv1.WorkerGroupSpec{ + { + GroupName: "worker-group-1", + Template: podTemplateSpec(nil, nil), + IdleTimeoutSeconds: ptr.To(int32(60)), + MinReplicas: ptr.To(int32(0)), + MaxReplicas: ptr.To(int32(10)), + }, + }, + }, + expectedErr: "", + }, + "should reject negative idleTimeoutSeconds": { + spec: rayv1.RayClusterSpec{ + EnableInTreeAutoscaling: ptr.To(true), + HeadGroupSpec: rayv1.HeadGroupSpec{ + Template: podTemplateSpec([]corev1.EnvVar{ + {Name: RAY_ENABLE_AUTOSCALER_V2, Value: "1"}, + }, nil), + }, + WorkerGroupSpecs: []rayv1.WorkerGroupSpec{ + { + GroupName: "worker-group-1", + Template: podTemplateSpec(nil, nil), + IdleTimeoutSeconds: ptr.To(int32(-10)), + MinReplicas: ptr.To(int32(0)), + MaxReplicas: ptr.To(int32(10)), + }, + }, + }, + expectedErr: "idleTimeoutSeconds must be non-negative, got -10", + }, + "should accept zero idleTimeoutSeconds": { + spec: rayv1.RayClusterSpec{ + EnableInTreeAutoscaling: ptr.To(true), + HeadGroupSpec: rayv1.HeadGroupSpec{ + Template: podTemplateSpec([]corev1.EnvVar{ + {Name: RAY_ENABLE_AUTOSCALER_V2, Value: "1"}, + }, nil), + }, + WorkerGroupSpecs: []rayv1.WorkerGroupSpec{ + { + GroupName: "worker-group-1", + Template: podTemplateSpec(nil, nil), + IdleTimeoutSeconds: ptr.To(int32(0)), + MinReplicas: ptr.To(int32(0)), + MaxReplicas: ptr.To(int32(10)), + }, + }, + }, + expectedErr: "", + }, + "should reject idleTimeoutSeconds when autoscaler version is not v2": { + spec: rayv1.RayClusterSpec{ + EnableInTreeAutoscaling: ptr.To(true), + HeadGroupSpec: rayv1.HeadGroupSpec{ + Template: podTemplateSpec(nil, nil), + }, + WorkerGroupSpecs: []rayv1.WorkerGroupSpec{ + { + GroupName: "worker-group-1", + Template: podTemplateSpec(nil, nil), + IdleTimeoutSeconds: ptr.To(int32(60)), + MinReplicas: ptr.To(int32(0)), + MaxReplicas: ptr.To(int32(10)), + }, + }, + }, + expectedErr: "worker group worker-group-1 has idleTimeoutSeconds set, but RAY_enable_autoscaler_v2 environment variable is not set to 'true' in the head pod", + }, + "should reject idleTimeoutSeconds when autoscaler version is not set": { + spec: rayv1.RayClusterSpec{ + EnableInTreeAutoscaling: ptr.To(true), + HeadGroupSpec: rayv1.HeadGroupSpec{ + Template: podTemplateSpec(nil, nil), + }, + WorkerGroupSpecs: []rayv1.WorkerGroupSpec{ + { + GroupName: "worker-group-1", + Template: podTemplateSpec(nil, nil), + IdleTimeoutSeconds: ptr.To(int32(60)), + MinReplicas: ptr.To(int32(0)), + MaxReplicas: ptr.To(int32(10)), + }, + }, + }, + expectedErr: "worker group worker-group-1 has idleTimeoutSeconds set, but RAY_enable_autoscaler_v2 environment variable is not set to 'true' in the head pod", + }, + "should reject idleTimeoutSeconds when AutoscalerOptions is nil": { + spec: rayv1.RayClusterSpec{ + EnableInTreeAutoscaling: ptr.To(true), + HeadGroupSpec: rayv1.HeadGroupSpec{ + Template: podTemplateSpec(nil, nil), + }, + WorkerGroupSpecs: []rayv1.WorkerGroupSpec{ + { + GroupName: "worker-group-1", + Template: podTemplateSpec(nil, nil), + IdleTimeoutSeconds: ptr.To(int32(60)), + MinReplicas: ptr.To(int32(0)), + MaxReplicas: ptr.To(int32(10)), + }, + }, + }, + expectedErr: "worker group worker-group-1 has idleTimeoutSeconds set, but RAY_enable_autoscaler_v2 environment variable is not set to 'true' in the head pod", + }, + "should reject idleTimeoutSeconds when env var is set to invalid value": { + spec: rayv1.RayClusterSpec{ + EnableInTreeAutoscaling: ptr.To(true), + HeadGroupSpec: rayv1.HeadGroupSpec{ + Template: podTemplateSpec([]corev1.EnvVar{ + {Name: RAY_ENABLE_AUTOSCALER_V2, Value: "false"}, + }, nil), + }, + WorkerGroupSpecs: []rayv1.WorkerGroupSpec{ + { + GroupName: "worker-group-1", + Template: podTemplateSpec(nil, nil), + IdleTimeoutSeconds: ptr.To(int32(60)), + MinReplicas: ptr.To(int32(0)), + MaxReplicas: ptr.To(int32(10)), + }, + }, + }, + expectedErr: "worker group worker-group-1 has idleTimeoutSeconds set, but RAY_enable_autoscaler_v2 environment variable is not set to 'true' in the head pod", + }, + "should accept worker group with idleTimeoutSeconds when env var is set to true": { + spec: rayv1.RayClusterSpec{ + EnableInTreeAutoscaling: ptr.To(true), + HeadGroupSpec: rayv1.HeadGroupSpec{ + Template: podTemplateSpec([]corev1.EnvVar{ + {Name: RAY_ENABLE_AUTOSCALER_V2, Value: "true"}, + }, nil), + }, + WorkerGroupSpecs: []rayv1.WorkerGroupSpec{ + { + GroupName: "worker-group-1", + Template: podTemplateSpec(nil, nil), + IdleTimeoutSeconds: ptr.To(int32(60)), + MinReplicas: ptr.To(int32(0)), + MaxReplicas: ptr.To(int32(10)), + }, + }, + }, + expectedErr: "", + }, + "should accept worker group without idleTimeoutSeconds and without autoscaler v2": { + spec: rayv1.RayClusterSpec{ + EnableInTreeAutoscaling: ptr.To(true), + HeadGroupSpec: rayv1.HeadGroupSpec{ + Template: podTemplateSpec(nil, nil), + }, + WorkerGroupSpecs: []rayv1.WorkerGroupSpec{ + { + GroupName: "worker-group-1", + Template: podTemplateSpec(nil, nil), + MinReplicas: ptr.To(int32(0)), + MaxReplicas: ptr.To(int32(10)), + }, + }, + }, + expectedErr: "", + }, + } + + for name, tc := range tests { + t.Run(name, func(t *testing.T) { + err := ValidateRayClusterSpec(&tc.spec, nil) + if tc.expectedErr != "" { + require.Error(t, err) + require.EqualError(t, err, tc.expectedErr) + } else { + require.NoError(t, err) + } + }) + } +} From d76a8619d457a9025bbb664a6644d1501e04913b Mon Sep 17 00:00:00 2001 From: Alima Azamat <92766804+alimaazamat@users.noreply.github.com> Date: Fri, 5 Dec 2025 10:01:46 -0800 Subject: [PATCH 2/2] Check spec version then fall back to env var Co-authored-by: Jun-Hao Wan Signed-off-by: Alima Azamat <92766804+alimaazamat@users.noreply.github.com> Signed-off-by: alimaazamat --- ray-operator/controllers/ray/utils/validation.go | 8 ++++++-- ray-operator/controllers/ray/utils/validation_test.go | 8 ++++---- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/ray-operator/controllers/ray/utils/validation.go b/ray-operator/controllers/ray/utils/validation.go index 06af8b3a6d3..07d4591afc1 100644 --- a/ray-operator/controllers/ray/utils/validation.go +++ b/ray-operator/controllers/ray/utils/validation.go @@ -609,10 +609,14 @@ func validateWorkerGroupIdleTimeout(workerGroup rayv1.WorkerGroupSpec, spec *ray } // idleTimeoutSeconds only allowed on autoscaler v2 + if IsAutoscalingV2Enabled(spec) { + return nil + } envVar, exists := EnvVarByName(RAY_ENABLE_AUTOSCALER_V2, spec.HeadGroupSpec.Template.Spec.Containers[RayContainerIndex].Env) - if !exists || (envVar.Value != "1" && envVar.Value != "true") { - return fmt.Errorf("worker group %s has idleTimeoutSeconds set, but %s environment variable is not set to 'true' in the head pod", workerGroup.GroupName, RAY_ENABLE_AUTOSCALER_V2) + if exists && (envVar.Value == "1" || envVar.Value == "true") { + return nil } + return fmt.Errorf("worker group %s has idleTimeoutSeconds set, but autoscaler v2 is not enabled. Please set .spec.autoscalerOptions.version to 'v2' (or set %s environment variable to 'true' in the head pod if using KubeRay < 1.4.0)", workerGroup.GroupName, RAY_ENABLE_AUTOSCALER_V2) } return nil diff --git a/ray-operator/controllers/ray/utils/validation_test.go b/ray-operator/controllers/ray/utils/validation_test.go index 214b1408175..43fa6af48e9 100644 --- a/ray-operator/controllers/ray/utils/validation_test.go +++ b/ray-operator/controllers/ray/utils/validation_test.go @@ -1964,7 +1964,7 @@ func TestValidateWorkerGroupIdleTimeout(t *testing.T) { }, }, }, - expectedErr: "worker group worker-group-1 has idleTimeoutSeconds set, but RAY_enable_autoscaler_v2 environment variable is not set to 'true' in the head pod", + expectedErr: fmt.Sprintf("worker group worker-group-1 has idleTimeoutSeconds set, but autoscaler v2 is not enabled. Please set .spec.autoscalerOptions.version to 'v2' (or set %s environment variable to 'true' in the head pod if using KubeRay < 1.4.0)", RAY_ENABLE_AUTOSCALER_V2), }, "should reject idleTimeoutSeconds when autoscaler version is not set": { spec: rayv1.RayClusterSpec{ @@ -1982,7 +1982,7 @@ func TestValidateWorkerGroupIdleTimeout(t *testing.T) { }, }, }, - expectedErr: "worker group worker-group-1 has idleTimeoutSeconds set, but RAY_enable_autoscaler_v2 environment variable is not set to 'true' in the head pod", + expectedErr: fmt.Sprintf("worker group worker-group-1 has idleTimeoutSeconds set, but autoscaler v2 is not enabled. Please set .spec.autoscalerOptions.version to 'v2' (or set %s environment variable to 'true' in the head pod if using KubeRay < 1.4.0)", RAY_ENABLE_AUTOSCALER_V2), }, "should reject idleTimeoutSeconds when AutoscalerOptions is nil": { spec: rayv1.RayClusterSpec{ @@ -2000,7 +2000,7 @@ func TestValidateWorkerGroupIdleTimeout(t *testing.T) { }, }, }, - expectedErr: "worker group worker-group-1 has idleTimeoutSeconds set, but RAY_enable_autoscaler_v2 environment variable is not set to 'true' in the head pod", + expectedErr: fmt.Sprintf("worker group worker-group-1 has idleTimeoutSeconds set, but autoscaler v2 is not enabled. Please set .spec.autoscalerOptions.version to 'v2' (or set %s environment variable to 'true' in the head pod if using KubeRay < 1.4.0)", RAY_ENABLE_AUTOSCALER_V2), }, "should reject idleTimeoutSeconds when env var is set to invalid value": { spec: rayv1.RayClusterSpec{ @@ -2020,7 +2020,7 @@ func TestValidateWorkerGroupIdleTimeout(t *testing.T) { }, }, }, - expectedErr: "worker group worker-group-1 has idleTimeoutSeconds set, but RAY_enable_autoscaler_v2 environment variable is not set to 'true' in the head pod", + expectedErr: fmt.Sprintf("worker group worker-group-1 has idleTimeoutSeconds set, but autoscaler v2 is not enabled. Please set .spec.autoscalerOptions.version to 'v2' (or set %s environment variable to 'true' in the head pod if using KubeRay < 1.4.0)", RAY_ENABLE_AUTOSCALER_V2), }, "should accept worker group with idleTimeoutSeconds when env var is set to true": { spec: rayv1.RayClusterSpec{