diff --git a/example/agent-runtime/agent-runtime.yaml b/example/agent-runtime/agent-runtime.yaml index 321c533c..c506017c 100644 --- a/example/agent-runtime/agent-runtime.yaml +++ b/example/agent-runtime/agent-runtime.yaml @@ -19,4 +19,7 @@ spec: image: busybox command: ["/bin/sh", "-c", "sleep 36000"] sessionTimeout: "15m" - maxSessionDuration: "8h" \ No newline at end of file + # maxSessionDuration is optional. When omitted, the sandbox runs indefinitely + # and is only cleaned up by the idle timeout (sessionTimeout) above. + # Uncomment the line below to set a hard upper limit on session lifetime. + # maxSessionDuration: "24h" \ No newline at end of file diff --git a/go.mod b/go.mod index f0a23c80..c637c9f7 100644 --- a/go.mod +++ b/go.mod @@ -7,8 +7,8 @@ toolchain go1.24.9 require ( github.com/agiledragon/gomonkey/v2 v2.13.0 github.com/alicebob/miniredis/v2 v2.35.0 - github.com/gin-contrib/gzip v1.0.1 github.com/fsnotify/fsnotify v1.9.0 + github.com/gin-contrib/gzip v1.0.1 github.com/gin-gonic/gin v1.10.0 github.com/golang-jwt/jwt/v5 v5.2.2 github.com/google/uuid v1.6.0 diff --git a/manifests/charts/base/crds/runtime.agentcube.volcano.sh_agentruntimes.yaml b/manifests/charts/base/crds/runtime.agentcube.volcano.sh_agentruntimes.yaml index a6508b34..18451c9d 100644 --- a/manifests/charts/base/crds/runtime.agentcube.volcano.sh_agentruntimes.yaml +++ b/manifests/charts/base/crds/runtime.agentcube.volcano.sh_agentruntimes.yaml @@ -44,10 +44,11 @@ spec: description: Spec defines the desired state of the AgentRuntime. properties: maxSessionDuration: - default: 8h description: |- MaxSessionDuration describes the maximum duration for a session. After this duration, the session will be terminated no matter active or inactive. + When omitted, sessions have no maximum duration and will only be cleaned up + by idle timeout (SessionTimeout). type: string podTemplate: description: PodTemplate describes the template that will be used @@ -8504,7 +8505,6 @@ spec: type: object type: array required: - - maxSessionDuration - podTemplate - sessionTimeout - targetPort diff --git a/pkg/apis/runtime/v1alpha1/agent_type.go b/pkg/apis/runtime/v1alpha1/agent_type.go index fe5413fa..26680781 100644 --- a/pkg/apis/runtime/v1alpha1/agent_type.go +++ b/pkg/apis/runtime/v1alpha1/agent_type.go @@ -53,8 +53,9 @@ type AgentRuntimeSpec struct { // MaxSessionDuration describes the maximum duration for a session. // After this duration, the session will be terminated no matter active or inactive. - // +kubebuilder:validation:Required - // +kubebuilder:default="8h" + // When omitted, sessions have no maximum duration and will only be cleaned up + // by idle timeout (SessionTimeout). + // +optional MaxSessionDuration *metav1.Duration `json:"maxSessionDuration,omitempty" protobuf:"bytes,3,opt,name=maxSessionDuration"` } diff --git a/pkg/store/store_redis.go b/pkg/store/store_redis.go index 8b6a0e8b..c533b1a5 100644 --- a/pkg/store/store_redis.go +++ b/pkg/store/store_redis.go @@ -155,16 +155,16 @@ func (rs *redisStore) StoreSandbox(ctx context.Context, sandboxRedis *types.Sand return fmt.Errorf("StoreSandbox: marshal sandbox failed: %w", err) } - if sandboxRedis.ExpiresAt.IsZero() { - return fmt.Errorf("StoreSandbox: sandbox expired at is zero") - } - pipe := rs.cli.Pipeline() pipe.SetNX(ctx, sessionKey, b, 0) - pipe.ZAdd(ctx, rs.expiryIndexKey, redisv9.Z{ - Score: float64(sandboxRedis.ExpiresAt.Unix()), - Member: sandboxRedis.SessionID, - }) + // Only add to expiry index when ExpiresAt is set. + // Zero ExpiresAt means the sandbox has no maximum lifetime. + if !sandboxRedis.ExpiresAt.IsZero() { + pipe.ZAdd(ctx, rs.expiryIndexKey, redisv9.Z{ + Score: float64(sandboxRedis.ExpiresAt.Unix()), + Member: sandboxRedis.SessionID, + }) + } pipe.ZAdd(ctx, rs.lastActivityIndexKey, redisv9.Z{ Score: float64(time.Now().Unix()), Member: sandboxRedis.SessionID, diff --git a/pkg/store/store_redis_test.go b/pkg/store/store_redis_test.go index 6c449fef..9436b525 100644 --- a/pkg/store/store_redis_test.go +++ b/pkg/store/store_redis_test.go @@ -343,13 +343,70 @@ func TestUpdateSandboxLastActivity(t *testing.T) { if int64(score) != newLastActivity.Unix() { t.Fatalf("unexpected lastActivity score after update: got %v, want %v", score, newLastActivity.Unix()) } +} + +// TestRedisStore_StoreSandbox_ZeroExpiresAt verifies the behavior for AgentRuntime +// sandboxes that have no MaxSessionDuration (ExpiresAt == zero time): +// - The session is still stored and retrievable by session ID. +// - The session is NOT added to the expiry sorted-set (no hard cap). +// - The session IS tracked in the last-activity index after UpdateSessionLastActivity, +// so idle-timeout GC still works correctly. +func TestRedisStore_StoreSandbox_ZeroExpiresAt(t *testing.T) { + ctx := context.Background() + c, mr := newTestRedisClient(t) + + // Zero ExpiresAt — simulates an AgentRuntime sandbox with no MaxSessionDuration. + sb := &types.SandboxInfo{ + SandboxID: "sb-no-expiry", + Name: "test-sandbox-no-expiry", + SessionID: "sess-no-expiry", + CreatedAt: time.Now().UTC(), + ExpiresAt: time.Time{}, // zero + Status: "running", + } + + // StoreSandbox must succeed even with zero ExpiresAt. + if err := c.StoreSandbox(ctx, sb); err != nil { + t.Fatalf("StoreSandbox with zero ExpiresAt failed: %v", err) + } + + // The session must be retrievable. + got, err := c.GetSandboxBySessionID(ctx, "sess-no-expiry") + if err != nil { + t.Fatalf("GetSandboxBySessionID failed: %v", err) + } + if got.SandboxID != "sb-no-expiry" { + t.Errorf("expected SandboxID sb-no-expiry, got %q", got.SandboxID) + } - // session not exists - err = c.UpdateSessionLastActivity(ctx, "sess-1-not-exist", newLastActivity) + // Must NOT be in the expiry sorted-set (zero ExpiresAt = no hard cap). + _, err = mr.ZScore(c.expiryIndexKey, "sess-no-expiry") if err == nil { - t.Fatalf("expected error for non-existent session") + t.Error("expected sess-no-expiry to be absent from expiry index, but ZScore succeeded") } - if !errors.Is(err, ErrNotFound) { - t.Fatalf("expected ErrNotFound, got %v", err) + + // Must NOT appear in ListExpiredSandboxes even at a very late cutoff. + farFuture := time.Now().Add(100 * 365 * 24 * time.Hour) + expired, err := c.ListExpiredSandboxes(ctx, farFuture, 100) + if err != nil { + t.Fatalf("ListExpiredSandboxes error: %v", err) + } + for _, s := range expired { + if s.SessionID == "sess-no-expiry" { + t.Error("sess-no-expiry must not appear in ListExpiredSandboxes") + } + } + + // After UpdateSessionLastActivity the session MUST be tracked in the activity index. + now := time.Now().UTC().Truncate(time.Second) + if err := c.UpdateSessionLastActivity(ctx, "sess-no-expiry", now); err != nil { + t.Fatalf("UpdateSessionLastActivity failed: %v", err) + } + score, err := mr.ZScore(c.lastActivityIndexKey, "sess-no-expiry") + if err != nil { + t.Fatalf("expected sess-no-expiry in last-activity index: %v", err) + } + if int64(score) != now.Unix() { + t.Errorf("unexpected last-activity score: got %v, want %v", int64(score), now.Unix()) } } diff --git a/pkg/store/store_valkey.go b/pkg/store/store_valkey.go index 5e68b20d..fd02d88f 100644 --- a/pkg/store/store_valkey.go +++ b/pkg/store/store_valkey.go @@ -174,16 +174,28 @@ func (vs *valkeyStore) StoreSandbox(ctx context.Context, sandboxStore *types.San return errors.New("StoreSandbox: sandbox is nil") } - if sandboxStore.ExpiresAt.IsZero() { - return fmt.Errorf("StoreSandbox: sandbox expires time is zero") - } - sessionKey := vs.sessionKey(sandboxStore.SessionID) b, err := json.Marshal(sandboxStore) if err != nil { return fmt.Errorf("StoreSandbox: marshal sandbox: %w", err) } + if sandboxStore.ExpiresAt.IsZero() { + // Zero ExpiresAt means no maximum lifetime — skip expiry index. + // The sandbox will still be tracked in the activity index for idle timeout. + commands := make(valkey.Commands, 0, 3) + commands = append(commands, vs.cli.B().Setnx().Key(sessionKey).Value(string(b)).Build()) + commands = append(commands, vs.cli.B().Zadd().Key(vs.lastActivityIndexKey).ScoreMember(). + ScoreMember(float64(time.Now().Unix()), sandboxStore.SessionID).Build()) + + for i, resp := range vs.cli.DoMulti(ctx, commands...) { + if err = resp.Error(); err != nil { + return fmt.Errorf("StoreSandbox: DoMulti failed: %w, command index: %v", err, i) + } + } + return nil + } + commands := make(valkey.Commands, 0, 5) commands = append(commands, vs.cli.B().Setnx().Key(sessionKey).Value(string(b)).Build()) commands = append(commands, vs.cli.B().Zadd().Key(vs.expiryIndexKey).ScoreMember(). diff --git a/pkg/workloadmanager/sandbox_helper.go b/pkg/workloadmanager/sandbox_helper.go index 322b5d8d..8f9c4572 100644 --- a/pkg/workloadmanager/sandbox_helper.go +++ b/pkg/workloadmanager/sandbox_helper.go @@ -51,8 +51,6 @@ func buildSandboxPlaceHolder(sandboxCR *sandboxv1alpha1.Sandbox, entry *sandboxE var expiresAt time.Time if sandboxCR.Spec.Lifecycle.ShutdownTime != nil { expiresAt = sandboxCR.Spec.Lifecycle.ShutdownTime.Time - } else { - expiresAt = time.Now().Add(DefaultSandboxTTL) } idleTimeout := entry.IdleTimeout if idleTimeout == 0 { @@ -71,7 +69,7 @@ func buildSandboxPlaceHolder(sandboxCR *sandboxv1alpha1.Sandbox, entry *sandboxE func buildSandboxInfo(sandbox *sandboxv1alpha1.Sandbox, podIP string, entry *sandboxEntry) *types.SandboxInfo { createdAt := sandbox.GetCreationTimestamp().Time - expiresAt := createdAt.Add(DefaultSandboxTTL) + var expiresAt time.Time if sandbox.Spec.Lifecycle.ShutdownTime != nil { expiresAt = sandbox.Spec.Lifecycle.ShutdownTime.Time } diff --git a/pkg/workloadmanager/sandbox_helper_test.go b/pkg/workloadmanager/sandbox_helper_test.go index 793108ad..4194d43b 100644 --- a/pkg/workloadmanager/sandbox_helper_test.go +++ b/pkg/workloadmanager/sandbox_helper_test.go @@ -40,7 +40,7 @@ func TestBuildSandboxPlaceHolder_TableDriven(t *testing.T) { validate func(t *testing.T, result *types.SandboxInfo) }{ { - name: "no ShutdownTime falls back to DefaultSandboxTTL", + name: "no ShutdownTime means no expiry (sandbox runs indefinitely)", setupSandbox: func() *sandboxv1alpha1.Sandbox { return &sandboxv1alpha1.Sandbox{ ObjectMeta: metav1.ObjectMeta{ @@ -54,8 +54,8 @@ func TestBuildSandboxPlaceHolder_TableDriven(t *testing.T) { SessionID: "session-123", }, validate: func(t *testing.T, result *types.SandboxInfo) { - expected := now.Add(DefaultSandboxTTL) - assert.WithinDuration(t, expected, result.ExpiresAt, 2*time.Second) + assert.True(t, result.ExpiresAt.IsZero(), + "ExpiresAt should be zero when no MaxSessionDuration is set") assert.Equal(t, "creating", result.Status) assert.Equal(t, "session-123", result.SessionID) }, diff --git a/pkg/workloadmanager/workload_builder.go b/pkg/workloadmanager/workload_builder.go index a6dbfb49..9c45dc1c 100644 --- a/pkg/workloadmanager/workload_builder.go +++ b/pkg/workloadmanager/workload_builder.go @@ -153,15 +153,10 @@ type buildSandboxClaimParams struct { // buildSandboxObject builds a Sandbox object from parameters func buildSandboxObject(params *buildSandboxParams) *sandboxv1alpha1.Sandbox { - if params.ttl == 0 { - params.ttl = DefaultSandboxTTL - } if params.idleTimeout == 0 { params.idleTimeout = DefaultSandboxIdleTimeout } - shutdownTime := metav1.NewTime(time.Now().Add(params.ttl)) - // Allocate fresh maps for copied metadata so we never mutate informer-cached input. // Annotations are only copied when params.podAnnotations is non-nil. podLabels := make(map[string]string, len(params.podLabels)+2) @@ -198,12 +193,18 @@ func buildSandboxObject(params *buildSandboxParams) *sandboxv1alpha1.Sandbox { Annotations: podAnnotations, }, }, - Lifecycle: sandboxv1alpha1.Lifecycle{ - ShutdownTime: &shutdownTime, - }, Replicas: ptr.To[int32](1), }, } + + // Only set ShutdownTime when MaxSessionDuration is explicitly configured. + // When omitted (ttl == 0), the sandbox runs indefinitely and is only + // cleaned up by idle timeout (SessionTimeout). + if params.ttl > 0 { + shutdownTime := metav1.NewTime(time.Now().Add(params.ttl)) + sandbox.Spec.Lifecycle.ShutdownTime = &shutdownTime + } + return sandbox } @@ -400,10 +401,15 @@ func buildSandboxByCodeInterpreter(namespace string, codeInterpreterName string, }, }, } + // CodeInterpreter always has a hard cap: use explicit MaxSessionDuration or + // fall back to DefaultSandboxTTL (8h). This differs from AgentRuntime where + // omitting MaxSessionDuration means no hard expiry. + ttl := DefaultSandboxTTL if codeInterpreterObj.Spec.MaxSessionDuration != nil { - shutdownTime := metav1.NewTime(time.Now().Add(codeInterpreterObj.Spec.MaxSessionDuration.Duration)) - simpleSandbox.Spec.Lifecycle.ShutdownTime = &shutdownTime + ttl = codeInterpreterObj.Spec.MaxSessionDuration.Duration } + shutdownTime := metav1.NewTime(time.Now().Add(ttl)) + simpleSandbox.Spec.Lifecycle.ShutdownTime = &shutdownTime sandboxEntry.Kind = types.SandboxClaimsKind return simpleSandbox, sandboxClaim, sandboxEntry, nil } @@ -443,8 +449,13 @@ func buildSandboxByCodeInterpreter(namespace string, codeInterpreterName string, idleTimeout: idleTimeout, } + // CodeInterpreter always has a hard cap: use explicit MaxSessionDuration or + // fall back to DefaultSandboxTTL (8h). This differs from AgentRuntime where + // omitting MaxSessionDuration means no hard expiry. if codeInterpreterObj.Spec.MaxSessionDuration != nil { buildParams.ttl = codeInterpreterObj.Spec.MaxSessionDuration.Duration + } else { + buildParams.ttl = DefaultSandboxTTL } sandbox := buildSandboxObject(buildParams) return sandbox, nil, sandboxEntry, nil diff --git a/pkg/workloadmanager/workload_builder_test.go b/pkg/workloadmanager/workload_builder_test.go index 1438cc94..b96cbc70 100644 --- a/pkg/workloadmanager/workload_builder_test.go +++ b/pkg/workloadmanager/workload_builder_test.go @@ -51,7 +51,7 @@ func TestBuildSandboxObject_DoesNotMutateCallerLabels(t *testing.T) { namespace: "default", sandboxName: "sandbox-abc", sessionID: "session-123", - ttl: DefaultSandboxTTL, + ttl: time.Hour, idleTimeout: DefaultSandboxIdleTimeout, podLabels: original, } @@ -115,6 +115,26 @@ func TestBuildSandboxObject_NilLabels(t *testing.T) { } } +// TestBuildSandboxObject_NoTTL verifies that when ttl==0 (AgentRuntime with no +// MaxSessionDuration configured), ShutdownTime is nil — the sandbox runs +// indefinitely and is only cleaned up by idle timeout. +func TestBuildSandboxObject_NoTTL(t *testing.T) { + params := &buildSandboxParams{ + namespace: "default", + sandboxName: "sandbox-no-ttl", + sessionID: "session-no-ttl", + ttl: 0, // explicitly no TTL — AgentRuntime without MaxSessionDuration + idleTimeout: 15 * time.Minute, + } + + sandbox := buildSandboxObject(params) + + if sandbox.Spec.Lifecycle.ShutdownTime != nil { + t.Errorf("expected ShutdownTime to be nil for zero-ttl sandbox, got %v", + sandbox.Spec.Lifecycle.ShutdownTime) + } +} + // TestBuildSandboxObject_WorkloadNameLabel verifies that the WorkloadNameLabelKey // label is correctly set on the Sandbox object metadata from the workloadName param. // This covers the bug where CodeInterpreter sandboxes were missing this label. diff --git a/test/e2e/echo_agent.yaml b/test/e2e/echo_agent.yaml index da662978..4994df98 100644 --- a/test/e2e/echo_agent.yaml +++ b/test/e2e/echo_agent.yaml @@ -100,3 +100,4 @@ spec: sessionTimeout: "15m" maxSessionDuration: "8h" status: {} + diff --git a/test/e2e/run_e2e.sh b/test/e2e/run_e2e.sh index 7c529607..b5f4055d 100755 --- a/test/e2e/run_e2e.sh +++ b/test/e2e/run_e2e.sh @@ -395,12 +395,16 @@ run_setup() { kubectl create namespace "${WORKLOAD_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f - # Create normal echo-agent apply_workload_fixture test/e2e/echo_agent.yaml - # Create echo-agent-short-ttl with short sessionTimeout for TTL testing + # Create echo-agent-short-ttl with short sessionTimeout for TTL testing. + # maxSessionDuration is intentionally omitted (deleted by sed) — the TTL test + # exercises idle-timeout (sessionTimeout) cleanup, not hard-expiry (maxSessionDuration). + # This also validates the PR change that makes maxSessionDuration optional for AgentRuntime. tmp_ttl_agent=$(mktemp) - sed 's/name: echo-agent/name: echo-agent-short-ttl/; s/app: echo-agent/app: echo-agent-short-ttl/; s/sessionTimeout: "15m"/sessionTimeout: "30s"/' test/e2e/echo_agent.yaml > "$tmp_ttl_agent" + sed 's/name: echo-agent/name: echo-agent-short-ttl/; s/app: echo-agent/app: echo-agent-short-ttl/; s/sessionTimeout: "15m"/sessionTimeout: "30s"/; /maxSessionDuration:/d' test/e2e/echo_agent.yaml > "$tmp_ttl_agent" apply_workload_fixture "$tmp_ttl_agent" rm -f "$tmp_ttl_agent" + step "Creating test CodeInterpreter..." # Create e2e-code-interpreter CodeInterpreter apply_workload_fixture test/e2e/e2e_code_interpreter.yaml