Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion example/agent-runtime/agent-runtime.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,7 @@ spec:
image: busybox
command: ["/bin/sh", "-c", "sleep 36000"]
sessionTimeout: "15m"
maxSessionDuration: "8h"
# maxSessionDuration is optional. When omitted, the sandbox runs indefinitely
# and is only cleaned up by the idle timeout (sessionTimeout) above.
# Uncomment the line below to set a hard upper limit on session lifetime.
# maxSessionDuration: "24h"
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ toolchain go1.24.9
require (
github.com/agiledragon/gomonkey/v2 v2.13.0
github.com/alicebob/miniredis/v2 v2.35.0
github.com/gin-contrib/gzip v1.0.1
github.com/fsnotify/fsnotify v1.9.0
github.com/gin-contrib/gzip v1.0.1
github.com/gin-gonic/gin v1.10.0
github.com/golang-jwt/jwt/v5 v5.2.2
github.com/google/uuid v1.6.0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,11 @@ spec:
description: Spec defines the desired state of the AgentRuntime.
properties:
maxSessionDuration:
default: 8h
description: |-
MaxSessionDuration describes the maximum duration for a session.
After this duration, the session will be terminated no matter active or inactive.
When omitted, sessions have no maximum duration and will only be cleaned up
by idle timeout (SessionTimeout).
type: string
podTemplate:
description: PodTemplate describes the template that will be used
Expand Down Expand Up @@ -8504,7 +8505,6 @@ spec:
type: object
type: array
required:
- maxSessionDuration
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also upd example/agent-runtime/agent-runtime.yaml

- podTemplate
- sessionTimeout
- targetPort
Expand Down
5 changes: 3 additions & 2 deletions pkg/apis/runtime/v1alpha1/agent_type.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,9 @@ type AgentRuntimeSpec struct {

// MaxSessionDuration describes the maximum duration for a session.
// After this duration, the session will be terminated no matter active or inactive.
// +kubebuilder:validation:Required
// +kubebuilder:default="8h"
// When omitted, sessions have no maximum duration and will only be cleaned up
// by idle timeout (SessionTimeout).
// +optional
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This needs the generated CRD artifact update as well. Right now the installed AgentRuntime CRD still has default: 8h and still lists maxSessionDuration as required, so the API server will keep defaulting omitted values to 8h and this change will not actually fix #303 in a deployed cluster.

MaxSessionDuration *metav1.Duration `json:"maxSessionDuration,omitempty" protobuf:"bytes,3,opt,name=maxSessionDuration"`
}

Expand Down
16 changes: 8 additions & 8 deletions pkg/store/store_redis.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,16 +155,16 @@ func (rs *redisStore) StoreSandbox(ctx context.Context, sandboxRedis *types.Sand
return fmt.Errorf("StoreSandbox: marshal sandbox failed: %w", err)
}

if sandboxRedis.ExpiresAt.IsZero() {
return fmt.Errorf("StoreSandbox: sandbox expired at is zero")
}

pipe := rs.cli.Pipeline()
pipe.SetNX(ctx, sessionKey, b, 0)
pipe.ZAdd(ctx, rs.expiryIndexKey, redisv9.Z{
Score: float64(sandboxRedis.ExpiresAt.Unix()),
Member: sandboxRedis.SessionID,
})
// Only add to expiry index when ExpiresAt is set.
// Zero ExpiresAt means the sandbox has no maximum lifetime.
if !sandboxRedis.ExpiresAt.IsZero() {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we add store tests for the zero ExpiresAt branch? The important behavior is that the session is still stored and tracked in the last-activity index, but is not added to the expiry index.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes ofc we can i was fixing the failure test cases till now i would add store test for the zero expireat now

pipe.ZAdd(ctx, rs.expiryIndexKey, redisv9.Z{
Comment on lines 159 to +163
Score: float64(sandboxRedis.ExpiresAt.Unix()),
Member: sandboxRedis.SessionID,
})
}
Comment on lines 158 to +167
pipe.ZAdd(ctx, rs.lastActivityIndexKey, redisv9.Z{
Score: float64(time.Now().Unix()),
Member: sandboxRedis.SessionID,
Expand Down
67 changes: 62 additions & 5 deletions pkg/store/store_redis_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -343,13 +343,70 @@ func TestUpdateSandboxLastActivity(t *testing.T) {
if int64(score) != newLastActivity.Unix() {
t.Fatalf("unexpected lastActivity score after update: got %v, want %v", score, newLastActivity.Unix())
}
}

// TestRedisStore_StoreSandbox_ZeroExpiresAt verifies the behavior for AgentRuntime
// sandboxes that have no MaxSessionDuration (ExpiresAt == zero time):
// - The session is still stored and retrievable by session ID.
// - The session is NOT added to the expiry sorted-set (no hard cap).
// - The session IS tracked in the last-activity index after UpdateSessionLastActivity,
// so idle-timeout GC still works correctly.
func TestRedisStore_StoreSandbox_ZeroExpiresAt(t *testing.T) {
ctx := context.Background()
c, mr := newTestRedisClient(t)

// Zero ExpiresAt — simulates an AgentRuntime sandbox with no MaxSessionDuration.
sb := &types.SandboxInfo{
SandboxID: "sb-no-expiry",
Name: "test-sandbox-no-expiry",
SessionID: "sess-no-expiry",
CreatedAt: time.Now().UTC(),
ExpiresAt: time.Time{}, // zero
Status: "running",
}

// StoreSandbox must succeed even with zero ExpiresAt.
if err := c.StoreSandbox(ctx, sb); err != nil {
t.Fatalf("StoreSandbox with zero ExpiresAt failed: %v", err)
}

// The session must be retrievable.
got, err := c.GetSandboxBySessionID(ctx, "sess-no-expiry")
if err != nil {
t.Fatalf("GetSandboxBySessionID failed: %v", err)
}
if got.SandboxID != "sb-no-expiry" {
t.Errorf("expected SandboxID sb-no-expiry, got %q", got.SandboxID)
}

// session not exists
err = c.UpdateSessionLastActivity(ctx, "sess-1-not-exist", newLastActivity)
// Must NOT be in the expiry sorted-set (zero ExpiresAt = no hard cap).
_, err = mr.ZScore(c.expiryIndexKey, "sess-no-expiry")
if err == nil {
t.Fatalf("expected error for non-existent session")
t.Error("expected sess-no-expiry to be absent from expiry index, but ZScore succeeded")
}
if !errors.Is(err, ErrNotFound) {
t.Fatalf("expected ErrNotFound, got %v", err)

// Must NOT appear in ListExpiredSandboxes even at a very late cutoff.
farFuture := time.Now().Add(100 * 365 * 24 * time.Hour)
expired, err := c.ListExpiredSandboxes(ctx, farFuture, 100)
if err != nil {
t.Fatalf("ListExpiredSandboxes error: %v", err)
}
for _, s := range expired {
if s.SessionID == "sess-no-expiry" {
t.Error("sess-no-expiry must not appear in ListExpiredSandboxes")
}
}

// After UpdateSessionLastActivity the session MUST be tracked in the activity index.
now := time.Now().UTC().Truncate(time.Second)
if err := c.UpdateSessionLastActivity(ctx, "sess-no-expiry", now); err != nil {
t.Fatalf("UpdateSessionLastActivity failed: %v", err)
}
score, err := mr.ZScore(c.lastActivityIndexKey, "sess-no-expiry")
if err != nil {
t.Fatalf("expected sess-no-expiry in last-activity index: %v", err)
}
if int64(score) != now.Unix() {
t.Errorf("unexpected last-activity score: got %v, want %v", int64(score), now.Unix())
}
}
20 changes: 16 additions & 4 deletions pkg/store/store_valkey.go
Original file line number Diff line number Diff line change
Expand Up @@ -174,16 +174,28 @@ func (vs *valkeyStore) StoreSandbox(ctx context.Context, sandboxStore *types.San
return errors.New("StoreSandbox: sandbox is nil")
}

if sandboxStore.ExpiresAt.IsZero() {
return fmt.Errorf("StoreSandbox: sandbox expires time is zero")
}

sessionKey := vs.sessionKey(sandboxStore.SessionID)
b, err := json.Marshal(sandboxStore)
if err != nil {
return fmt.Errorf("StoreSandbox: marshal sandbox: %w", err)
}

if sandboxStore.ExpiresAt.IsZero() {
// Zero ExpiresAt means no maximum lifetime — skip expiry index.
// The sandbox will still be tracked in the activity index for idle timeout.
commands := make(valkey.Commands, 0, 3)
commands = append(commands, vs.cli.B().Setnx().Key(sessionKey).Value(string(b)).Build())
Comment on lines +183 to +187
commands = append(commands, vs.cli.B().Zadd().Key(vs.lastActivityIndexKey).ScoreMember().
ScoreMember(float64(time.Now().Unix()), sandboxStore.SessionID).Build())

for i, resp := range vs.cli.DoMulti(ctx, commands...) {
if err = resp.Error(); err != nil {
return fmt.Errorf("StoreSandbox: DoMulti failed: %w, command index: %v", err, i)
}
Comment on lines +183 to +194
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am seeing this is a little different from redis, is that because valkey sdk does not align with redis?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the structural difference is purely due to the Valkey SDK's API design, not a logic difference.

Redis (go-redis/v9) has a Pipeline() abstraction that lets you queue a variable number of commands and skip some conditionally before calling Exec():

pipe := rs.cli.Pipeline()
pipe.SetNX(...)
if !sandboxRedis.ExpiresAt.IsZero() {
    pipe.ZAdd(...) // conditionally added
}
pipe.ZAdd(...) // lastActivity always added
pipe.Exec(ctx)

Valkey (valkey-go) uses DoMulti() which takes a fixed slice of commands built upfront. Since Go slices can't have "holes", the cleanest pattern with this SDK is to branch the two cases (zero vs non-zero ExpiresAt) into separate command slices with an early return, rather than trying to conditionally append inside a single block:

if sandboxStore.ExpiresAt.IsZero() {
    commands := [SETNX, ZADD lastActivity]
    vs.cli.DoMulti(ctx, commands...)
    return nil
}
commands := [SETNX, ZADD expiryIndex, ZADD lastActivity]
vs.cli.DoMulti(ctx, commands...)

The end result is identical behaviour — zero ExpiresAt skips the expiry index in both stores. The branching is an SDK-driven implementation detail, not a semantic difference.

}
return nil
}
Comment on lines +183 to +197

commands := make(valkey.Commands, 0, 5)
commands = append(commands, vs.cli.B().Setnx().Key(sessionKey).Value(string(b)).Build())
commands = append(commands, vs.cli.B().Zadd().Key(vs.expiryIndexKey).ScoreMember().
Expand Down
4 changes: 1 addition & 3 deletions pkg/workloadmanager/sandbox_helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,6 @@ func buildSandboxPlaceHolder(sandboxCR *sandboxv1alpha1.Sandbox, entry *sandboxE
var expiresAt time.Time
if sandboxCR.Spec.Lifecycle.ShutdownTime != nil {
expiresAt = sandboxCR.Spec.Lifecycle.ShutdownTime.Time
} else {
expiresAt = time.Now().Add(DefaultSandboxTTL)
}
idleTimeout := entry.IdleTimeout
if idleTimeout == 0 {
Expand All @@ -71,7 +69,7 @@ func buildSandboxPlaceHolder(sandboxCR *sandboxv1alpha1.Sandbox, entry *sandboxE

func buildSandboxInfo(sandbox *sandboxv1alpha1.Sandbox, podIP string, entry *sandboxEntry) *types.SandboxInfo {
createdAt := sandbox.GetCreationTimestamp().Time
expiresAt := createdAt.Add(DefaultSandboxTTL)
var expiresAt time.Time
if sandbox.Spec.Lifecycle.ShutdownTime != nil {
expiresAt = sandbox.Spec.Lifecycle.ShutdownTime.Time
}
Expand Down
6 changes: 3 additions & 3 deletions pkg/workloadmanager/sandbox_helper_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ func TestBuildSandboxPlaceHolder_TableDriven(t *testing.T) {
validate func(t *testing.T, result *types.SandboxInfo)
}{
{
name: "no ShutdownTime falls back to DefaultSandboxTTL",
name: "no ShutdownTime means no expiry (sandbox runs indefinitely)",
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why not remove all DefaultSandboxTTL in ut test like in workload_builder_test.go L44 since we are not using it anymore

setupSandbox: func() *sandboxv1alpha1.Sandbox {
return &sandboxv1alpha1.Sandbox{
ObjectMeta: metav1.ObjectMeta{
Expand All @@ -54,8 +54,8 @@ func TestBuildSandboxPlaceHolder_TableDriven(t *testing.T) {
SessionID: "session-123",
},
validate: func(t *testing.T, result *types.SandboxInfo) {
expected := now.Add(DefaultSandboxTTL)
assert.WithinDuration(t, expected, result.ExpiresAt, 2*time.Second)
assert.True(t, result.ExpiresAt.IsZero(),
"ExpiresAt should be zero when no MaxSessionDuration is set")
assert.Equal(t, "creating", result.Status)
assert.Equal(t, "session-123", result.SessionID)
},
Expand Down
31 changes: 21 additions & 10 deletions pkg/workloadmanager/workload_builder.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,15 +153,10 @@ type buildSandboxClaimParams struct {

// buildSandboxObject builds a Sandbox object from parameters
func buildSandboxObject(params *buildSandboxParams) *sandboxv1alpha1.Sandbox {
if params.ttl == 0 {
params.ttl = DefaultSandboxTTL
}
if params.idleTimeout == 0 {
params.idleTimeout = DefaultSandboxIdleTimeout
}

shutdownTime := metav1.NewTime(time.Now().Add(params.ttl))

// Allocate fresh maps for copied metadata so we never mutate informer-cached input.
// Annotations are only copied when params.podAnnotations is non-nil.
podLabels := make(map[string]string, len(params.podLabels)+2)
Expand Down Expand Up @@ -198,12 +193,18 @@ func buildSandboxObject(params *buildSandboxParams) *sandboxv1alpha1.Sandbox {
Annotations: podAnnotations,
},
},
Lifecycle: sandboxv1alpha1.Lifecycle{
ShutdownTime: &shutdownTime,
},
Replicas: ptr.To[int32](1),
},
}

// Only set ShutdownTime when MaxSessionDuration is explicitly configured.
// When omitted (ttl == 0), the sandbox runs indefinitely and is only
// cleaned up by idle timeout (SessionTimeout).
if params.ttl > 0 {
shutdownTime := metav1.NewTime(time.Now().Add(params.ttl))
sandbox.Spec.Lifecycle.ShutdownTime = &shutdownTime
}
Comment on lines +203 to +206
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

By removing the fallback to DefaultSandboxTTL when params.ttl == 0 inside buildSandboxObject, any CodeInterpreter created without an explicit MaxSessionDuration (for example, in unit tests or direct client-go calls where kubebuilder defaults are not applied) will now run indefinitely without a hard expiry.

To maintain the 8h default guarantee for CodeInterpreter across all environments, the fallback to DefaultSandboxTTL should be explicitly handled in buildSandboxByCodeInterpreter (both in the warm pool path and the standard path).

Comment on lines +200 to +206
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This makes ttl == 0 mean no hard expiry for every caller of buildSandboxObject. That is correct for AgentRuntime, but CodeInterpreter is supposed to keep the 8h max-session default from the issue discussion and PR description. Can we set DefaultSandboxTTL explicitly in the CodeInterpreter paths when Spec.MaxSessionDuration is nil, including the warm-pool placeholder path?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1 It's better to align with CodeInterpreter.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.


return sandbox
}

Expand Down Expand Up @@ -400,10 +401,15 @@ func buildSandboxByCodeInterpreter(namespace string, codeInterpreterName string,
},
},
}
// CodeInterpreter always has a hard cap: use explicit MaxSessionDuration or
// fall back to DefaultSandboxTTL (8h). This differs from AgentRuntime where
// omitting MaxSessionDuration means no hard expiry.
ttl := DefaultSandboxTTL
if codeInterpreterObj.Spec.MaxSessionDuration != nil {
shutdownTime := metav1.NewTime(time.Now().Add(codeInterpreterObj.Spec.MaxSessionDuration.Duration))
simpleSandbox.Spec.Lifecycle.ShutdownTime = &shutdownTime
ttl = codeInterpreterObj.Spec.MaxSessionDuration.Duration
}
shutdownTime := metav1.NewTime(time.Now().Add(ttl))
simpleSandbox.Spec.Lifecycle.ShutdownTime = &shutdownTime
sandboxEntry.Kind = types.SandboxClaimsKind
return simpleSandbox, sandboxClaim, sandboxEntry, nil
}
Expand Down Expand Up @@ -443,8 +449,13 @@ func buildSandboxByCodeInterpreter(namespace string, codeInterpreterName string,
idleTimeout: idleTimeout,
}

// CodeInterpreter always has a hard cap: use explicit MaxSessionDuration or
// fall back to DefaultSandboxTTL (8h). This differs from AgentRuntime where
// omitting MaxSessionDuration means no hard expiry.
if codeInterpreterObj.Spec.MaxSessionDuration != nil {
buildParams.ttl = codeInterpreterObj.Spec.MaxSessionDuration.Duration
} else {
buildParams.ttl = DefaultSandboxTTL
}
sandbox := buildSandboxObject(buildParams)
return sandbox, nil, sandboxEntry, nil
Expand Down
22 changes: 21 additions & 1 deletion pkg/workloadmanager/workload_builder_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ func TestBuildSandboxObject_DoesNotMutateCallerLabels(t *testing.T) {
namespace: "default",
sandboxName: "sandbox-abc",
sessionID: "session-123",
ttl: DefaultSandboxTTL,
ttl: time.Hour,
idleTimeout: DefaultSandboxIdleTimeout,
podLabels: original,
}
Expand Down Expand Up @@ -115,6 +115,26 @@ func TestBuildSandboxObject_NilLabels(t *testing.T) {
}
}

// TestBuildSandboxObject_NoTTL verifies that when ttl==0 (AgentRuntime with no
// MaxSessionDuration configured), ShutdownTime is nil — the sandbox runs
// indefinitely and is only cleaned up by idle timeout.
func TestBuildSandboxObject_NoTTL(t *testing.T) {
params := &buildSandboxParams{
namespace: "default",
sandboxName: "sandbox-no-ttl",
sessionID: "session-no-ttl",
ttl: 0, // explicitly no TTL — AgentRuntime without MaxSessionDuration
idleTimeout: 15 * time.Minute,
}

sandbox := buildSandboxObject(params)

if sandbox.Spec.Lifecycle.ShutdownTime != nil {
t.Errorf("expected ShutdownTime to be nil for zero-ttl sandbox, got %v",
sandbox.Spec.Lifecycle.ShutdownTime)
}
}

// TestBuildSandboxObject_WorkloadNameLabel verifies that the WorkloadNameLabelKey
// label is correctly set on the Sandbox object metadata from the workloadName param.
// This covers the bug where CodeInterpreter sandboxes were missing this label.
Expand Down
1 change: 1 addition & 0 deletions test/e2e/echo_agent.yaml
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

never used?

Original file line number Diff line number Diff line change
Expand Up @@ -100,3 +100,4 @@ spec:
sessionTimeout: "15m"
maxSessionDuration: "8h"
status: {}

8 changes: 6 additions & 2 deletions test/e2e/run_e2e.sh
Original file line number Diff line number Diff line change
Expand Up @@ -395,12 +395,16 @@ run_setup() {
kubectl create namespace "${WORKLOAD_NAMESPACE}" --dry-run=client -o yaml | kubectl apply -f -
# Create normal echo-agent
apply_workload_fixture test/e2e/echo_agent.yaml
# Create echo-agent-short-ttl with short sessionTimeout for TTL testing
# Create echo-agent-short-ttl with short sessionTimeout for TTL testing.
# maxSessionDuration is intentionally omitted (deleted by sed) — the TTL test
# exercises idle-timeout (sessionTimeout) cleanup, not hard-expiry (maxSessionDuration).
# This also validates the PR change that makes maxSessionDuration optional for AgentRuntime.
tmp_ttl_agent=$(mktemp)
sed 's/name: echo-agent/name: echo-agent-short-ttl/; s/app: echo-agent/app: echo-agent-short-ttl/; s/sessionTimeout: "15m"/sessionTimeout: "30s"/' test/e2e/echo_agent.yaml > "$tmp_ttl_agent"
sed 's/name: echo-agent/name: echo-agent-short-ttl/; s/app: echo-agent/app: echo-agent-short-ttl/; s/sessionTimeout: "15m"/sessionTimeout: "30s"/; /maxSessionDuration:/d' test/e2e/echo_agent.yaml > "$tmp_ttl_agent"
apply_workload_fixture "$tmp_ttl_agent"
rm -f "$tmp_ttl_agent"


step "Creating test CodeInterpreter..."
# Create e2e-code-interpreter CodeInterpreter
apply_workload_fixture test/e2e/e2e_code_interpreter.yaml
Expand Down
Loading