From f6de383bce252cea91f42ebe080831bb598e68d4 Mon Sep 17 00:00:00 2001 From: Mahil Patel Date: Wed, 27 May 2026 23:09:24 +0530 Subject: [PATCH 1/4] docs: add tutorial for securing internal traffic with SPIRE (mTLS) Signed-off-by: Mahil Patel --- .../docs/tutorials/internal-auth-spire.md | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 docs/agentcube/docs/tutorials/internal-auth-spire.md diff --git a/docs/agentcube/docs/tutorials/internal-auth-spire.md b/docs/agentcube/docs/tutorials/internal-auth-spire.md new file mode 100644 index 00000000..838bef75 --- /dev/null +++ b/docs/agentcube/docs/tutorials/internal-auth-spire.md @@ -0,0 +1,61 @@ +# Securing Internal Traffic with SPIRE (mTLS) + +This tutorial covers how we use SPIRE to establish zero-trust, mutually authenticated TLS (mTLS) for AgentCube's control plane. + +## Why Do We Need This? + +By default, internal components trust each other based on network reachability. With our new mTLS implementation, we're locking down the control plane so the Router and WorkloadManager cryptographically verify each other's SPIFFE identities before communicating. + +## 1. How the Architecture Works + +We implemented a robust mTLS abstraction layer that handles the heavy lifting for the control plane: + +- **Strict Identity Enforcement**: The Router and WorkloadManager have hardcoded SPIFFE IDs. + - The WorkloadManager accepts any client presenting a valid certificate signed by the trusted CA pool (authorization is handled later at the application layer). + - However, the Router strictly verifies it's actually talking to the `WorkloadManagerSPIFFEID` before it forwards any traffic, preventing spoofed servers. +- **Zero-Downtime Rotation**: A new `CertWatcher` actively monitors the certificates on disk (using `fsnotify`). When SPIRE rotates the short-lived certs, they are hot-reloaded without dropping any active connections. + +### What about Sandboxes? 
+ +You might wonder why we don't inject mTLS into the `PicoD` or `AgentRuntime` sandboxes: +- **Startup Latency**: Establishing a new TLS handshake for every short-lived sandbox adds significant latency. We opted to use our existing, blazing-fast JWT-based authentication for the `Router -> Sandbox` path instead. +- **User-Defined Runtimes**: `AgentRuntime` sandboxes are user-defined containers. By avoiding mTLS sidecar injection, we keep them clean and pure without forcing SPIRE dependencies on them. +- **WorkloadManager isolation**: The WorkloadManager never communicates directly with sandboxes over HTTP; it solely manages them via the secure Kubernetes API. + +## 2. Enabling mTLS on the Control Plane + +To turn on mTLS, you just need to pass the appropriate certificate paths to the binaries. They automatically enable mTLS when the CA bundle is provided alongside the cert and key. + +For the **Router**, use the `mtls` prefix: +```bash +--mtls-cert=/path/to/tls.crt +--mtls-key=/path/to/tls.key +--mtls-ca=/path/to/ca.crt +``` + +For the **WorkloadManager**, use the `tls` prefix: +```bash +--tls-cert=/path/to/tls.crt +--tls-key=/path/to/tls.key +--tls-ca=/path/to/ca.crt +``` + +When you deploy AgentCube via our Helm charts, you don't have to manually manage these certificates. Instead, the **`spiffe-helper` sidecar** runs alongside the Router and WorkloadManager containers in their respective pods. + +Here is what the `spiffe-helper` sidecar does in the background: +1. It securely authenticates with the local SPIRE Agent. +2. It fetches the short-lived SVIDs (certificates) for the control plane component. +3. It writes the certificates to a shared volume where the component's `CertWatcher` instantly picks them up. +4. It continuously handles rotation before the certificates expire. + +## 3. Verifying It Works + +Once you've applied the configuration: + +1. Check the logs for the Router and WorkloadManager. 
You'll see the `CertWatcher` output confirming it has successfully loaded the certificates. +2. Try deploying an agent and sending a request. +3. If everything is wired correctly, the Router will perform the mTLS handshake and verify the WorkloadManager's SPIFFE ID when provisioning the sandbox, and then seamlessly fall back to the low-latency JWT auth when proxying your request directly to the sandbox. + +## Next Steps + +Now that your control plane communications are locked down, your AgentCube deployment is running a zero-trust architecture. You can safely deploy sensitive agents in multi-tenant environments. \ No newline at end of file From ccb5fde14065f03efb37dab75494b2158528ef84 Mon Sep 17 00:00:00 2001 From: Mahil Patel Date: Thu, 28 May 2026 13:35:36 +0530 Subject: [PATCH 2/4] docs: enhanced tutorial for securing internal traffic with SPIRE (mTLS) Signed-off-by: Mahil Patel --- .../docs/tutorials/internal-auth-spire.md | 331 ++++++++++++++++-- 1 file changed, 294 insertions(+), 37 deletions(-) diff --git a/docs/agentcube/docs/tutorials/internal-auth-spire.md b/docs/agentcube/docs/tutorials/internal-auth-spire.md index 838bef75..cab0e857 100644 --- a/docs/agentcube/docs/tutorials/internal-auth-spire.md +++ b/docs/agentcube/docs/tutorials/internal-auth-spire.md @@ -1,61 +1,318 @@ # Securing Internal Traffic with SPIRE (mTLS) -This tutorial covers how we use SPIRE to establish zero-trust, mutually authenticated TLS (mTLS) for AgentCube's control plane. +This task shows you how to enable mutual TLS (mTLS) between AgentCube's +control-plane components using [SPIRE](https://spiffe.io/docs/latest/spire-about/spire-concepts/). +By the end, every request between the Router and WorkloadManager will be +cryptographically authenticated using short-lived X.509 certificates that rotate +automatically. -## Why Do We Need This? +## Before you begin -By default, internal components trust each other based on network reachability. 
With our new mTLS implementation, we're locking down the control plane so the Router and WorkloadManager cryptographically verify each other's SPIFFE identities before communicating. +1. Follow the [Getting Started](../getting-started.md) guide to install + AgentCube on your cluster. **Do not** enable SPIRE during the initial + installation - this tutorial walks through that step explicitly. -## 1. How the Architecture Works +2. Make sure you have the following tools installed: + - [`kubectl`](https://kubernetes.io/docs/tasks/tools/) (v1.25+) + - [`helm`](https://helm.sh/docs/intro/install/) (v3.12+) -We implemented a robust mTLS abstraction layer that handles the heavy lifting for the control plane: +3. Confirm AgentCube is running without SPIRE: -- **Strict Identity Enforcement**: The Router and WorkloadManager have hardcoded SPIFFE IDs. - - The WorkloadManager accepts any client presenting a valid certificate signed by the trusted CA pool (authorization is handled later at the application layer). - - However, the Router strictly verifies it's actually talking to the `WorkloadManagerSPIFFEID` before it forwards any traffic, preventing spoofed servers. -- **Zero-Downtime Rotation**: A new `CertWatcher` actively monitors the certificates on disk (using `fsnotify`). When SPIRE rotates the short-lived certs, they are hot-reloaded without dropping any active connections. + ```bash + kubectl get pods -n agentcube-system + ``` -### What about Sandboxes? + You should see the Router and WorkloadManager pods in `Running` state, each + showing `1/1` containers ready (no sidecar yet): -You might wonder why we don't inject mTLS into the `PicoD` or `AgentRuntime` sandboxes: -- **Startup Latency**: Establishing a new TLS handshake for every short-lived sandbox adds significant latency. We opted to use our existing, blazing-fast JWT-based authentication for the `Router -> Sandbox` path instead. -- **User-Defined Runtimes**: `AgentRuntime` sandboxes are user-defined containers. 
By avoiding mTLS sidecar injection, we keep them clean and pure without forcing SPIRE dependencies on them. -- **WorkloadManager isolation**: The WorkloadManager never communicates directly with sandboxes over HTTP; it solely manages them via the secure Kubernetes API. + ``` + NAME READY STATUS RESTARTS AGE + agentcube-router-5d8f9b7c4-xxxxx 1/1 Running 0 5m + workloadmanager-6b6bb75d98-xxxxx 1/1 Running 0 5m + ``` -## 2. Enabling mTLS on the Control Plane +> **Tip :** +> If you are running on a local [Kind](https://kind.sigs.k8s.io/) or +[Minikube](https://minikube.sigs.k8s.io/) cluster, you will need to pass two +extra overrides in the Helm upgrade command shown below. These are already +included in the instructions, so just keep them in.. -To turn on mTLS, you just need to pass the appropriate certificate paths to the binaries. They automatically enable mTLS when the CA bundle is provided alongside the cert and key. -For the **Router**, use the `mtls` prefix: +## What gets deployed + +When you enable SPIRE, the Helm chart creates the following additional resources +inside your cluster: + +| Resource | Kind | Purpose | +|---|---|---| +| `spire-server` | StatefulSet (1 replica) | Central certificate authority. Runs the SPIRE Controller Manager as a sidecar. | +| `spire-agent` | DaemonSet | Runs on every node. Attests workloads and delivers certificates. | +| `ClusterSPIFFEID` (×2) | CRD | Declarative identity registration for the Router and WorkloadManager. | +| `spiffe-helper` sidecar | Container (injected) | Fetches and rotates certificates inside the Router and WorkloadManager pods. | + +The Router and WorkloadManager pods will each go from `1/1` to `2/2` containers +(the main process + the `spiffe-helper` sidecar). + +## Step 1 - Install the SPIRE Controller Manager CRDs + +The SPIRE Controller Manager watches `ClusterSPIFFEID` custom resources. 
These +CRDs must be present in the cluster **before** the Helm upgrade, otherwise the +chart will fail to create them. + +```bash +kubectl apply -k "https://github.com/spiffe/spire-controller-manager/config/crd?ref=v0.6.4" +``` + +Verify the CRD was installed: + +```bash +kubectl get crd clusterspiffeids.spire.spiffe.io +``` + +Expected output: + +``` +NAME CREATED AT +clusterspiffeids.spire.spiffe.io 2025-XX-XXTXX:XX:XXZ +``` + +## Step 2 - Upgrade the Helm release with SPIRE enabled + +Run the Helm upgrade with `spire.enabled=true`. The extra `--set` flags for +`insecureBootstrap` and `skipKubeletVerification` are needed for local +development clusters (Kind / Minikube). On a production cluster with proper +kubelet certificates, you can omit them. + +```bash +helm upgrade agentcube manifests/charts/base \ + -n agentcube-system \ + --set spire.enabled=true \ + --set spire.agent.insecureBootstrap=true \ + --set spire.agent.skipKubeletVerification=true +``` + +This single command deploys the full SPIRE infrastructure **and** injects the +`spiffe-helper` sidecar into the Router and WorkloadManager pods. 
+ +Wait for everything to become ready: + +```bash +kubectl rollout status statefulset/spire-server -n agentcube-system --timeout=120s +kubectl rollout status daemonset/spire-agent -n agentcube-system --timeout=120s +kubectl rollout status deployment/agentcube-router -n agentcube-system --timeout=120s +kubectl rollout status deployment/workloadmanager -n agentcube-system --timeout=120s +``` + +## Step 3 - Verify SPIRE is healthy + +Check that the SPIRE Server is up and has registered agents: + ```bash ---mtls-cert=/path/to/tls.crt ---mtls-key=/path/to/tls.key ---mtls-ca=/path/to/ca.crt +kubectl exec -n agentcube-system statefulset/spire-server -c spire-server -- \ + /opt/spire/bin/spire-server agent list +``` + +You should see at least one agent entry (one per cluster node): + ``` +Found X attested agent(s): + +SPIFFE ID : spiffe://cluster.local/spire/agent/k8s_psat/agentcube-cluster/... +Attestation type : k8s_psat +... +``` + +Next, confirm the identity registrations were picked up from the +`ClusterSPIFFEID` resources: -For the **WorkloadManager**, use the `tls` prefix: ```bash ---tls-cert=/path/to/tls.crt ---tls-key=/path/to/tls.key ---tls-ca=/path/to/ca.crt +kubectl exec -n agentcube-system statefulset/spire-server -c spire-server -- \ + /opt/spire/bin/spire-server entry show ``` -When you deploy AgentCube via our Helm charts, you don't have to manually manage these certificates. Instead, the **`spiffe-helper` sidecar** runs alongside the Router and WorkloadManager containers in their respective pods. +You should see entries for both the Router and WorkloadManager, with SPIFFE IDs +following the format +`spiffe://cluster.local/ns/agentcube-system/sa/`: -Here is what the `spiffe-helper` sidecar does in the background: -1. It securely authenticates with the local SPIRE Agent. -2. It fetches the short-lived SVIDs (certificates) for the control plane component. -3. 
It writes the certificates to a shared volume where the component's `CertWatcher` instantly picks them up. -4. It continuously handles rotation before the certificates expire. +``` +Entry ID : ... +SPIFFE ID : spiffe://cluster.local/ns/agentcube-system/sa/agentcube-router +Parent ID : ... +Revision : 0 -## 3. Verifying It Works +Entry ID : ... +SPIFFE ID : spiffe://cluster.local/ns/agentcube-system/sa/workloadmanager +Parent ID : ... +Revision : 0 +``` -Once you've applied the configuration: +## Step 4 - Verify the sidecar and certificates -1. Check the logs for the Router and WorkloadManager. You'll see the `CertWatcher` output confirming it has successfully loaded the certificates. -2. Try deploying an agent and sending a request. -3. If everything is wired correctly, the Router will perform the mTLS handshake and verify the WorkloadManager's SPIFFE ID when provisioning the sandbox, and then seamlessly fall back to the low-latency JWT auth when proxying your request directly to the sandbox. +Confirm that both the Router and WorkloadManager pods now show `2/2` containers +(the main container + the `spiffe-helper` sidecar): -## Next Steps +```bash +kubectl get pods -n agentcube-system +``` + +Expected output: + +``` +NAME READY STATUS RESTARTS AGE +agentcube-router-7f8d4b9c6-xxxxx 2/2 Running 0 2m +workloadmanager-8c7dd85f9-xxxxx 2/2 Running 0 2m +spire-agent-xxxxx 1/1 Running 0 2m +spire-server-0 2/2 Running 0 2m +``` + +Check the Router logs to confirm mTLS is active. You should see a log line +indicating it is waiting for, and then successfully loading, the certificates: -Now that your control plane communications are locked down, your AgentCube deployment is running a zero-trust architecture. You can safely deploy sensitive agents in multi-tenant environments. 
\ No newline at end of file +```bash +kubectl logs -n agentcube-system deployment/agentcube-router -c agentcube-router | grep -i mtls +``` + +Expected output: + +``` +Waiting for Router mTLS cert/key/CA files +All mTLS cert/key/CA files are present +Router→WorkloadManager mTLS enabled: expecting server SPIFFE ID spiffe://cluster.local/ns/agentcube-system/sa/workloadmanager +``` + +Do the same for the WorkloadManager: + +```bash +kubectl logs -n agentcube-system deployment/workloadmanager -c workloadmanager | grep -i mtls +``` + +Expected output: + +``` +Waiting for WorkloadManager mTLS cert/key/CA files +All mTLS cert/key/CA files are present +WorkloadManager mTLS enabled: accepting clients with valid SPIRE-provisioned certificates +``` + +## Step 5 - Test it end-to-end + +Deploy a simple agent and invoke it through the Router to confirm the full +mTLS-secured path works: + +```bash +kubectl apply -f - < Date: Sat, 30 May 2026 21:36:54 +0530 Subject: [PATCH 3/4] fix(docs,mtls): address review comments on internal-auth-spire tutorial - Added klog.Infof to wait.go so expected output is consistent with what actually appears in logs, matching the tutorial's expected output - Helm upgrade removes SPIRE workloads and sidecars, not CRDs (those are removed separately via kubectl) - added --reuse-values flag to preserve the install-time values Signed-off-by: Mahil Patel --- docs/agentcube/docs/tutorials/internal-auth-spire.md | 11 ++++++++--- pkg/mtls/wait.go | 3 +++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/docs/agentcube/docs/tutorials/internal-auth-spire.md b/docs/agentcube/docs/tutorials/internal-auth-spire.md index cab0e857..c39a74a5 100644 --- a/docs/agentcube/docs/tutorials/internal-auth-spire.md +++ b/docs/agentcube/docs/tutorials/internal-auth-spire.md @@ -78,7 +78,9 @@ clusterspiffeids.spire.spiffe.io 2025-XX-XXTXX:XX:XXZ ## Step 2 - Upgrade the Helm release with SPIRE enabled -Run the Helm upgrade with `spire.enabled=true`. 
The extra `--set` flags for +Run the Helm upgrade with `spire.enabled=true`. Keep `--reuse-values` so your +existing install-time settings (for example Redis, images, RBAC, or service +accounts) are preserved while enabling SPIRE. The extra `--set` flags for `insecureBootstrap` and `skipKubeletVerification` are needed for local development clusters (Kind / Minikube). On a production cluster with proper kubelet certificates, you can omit them. @@ -86,6 +88,7 @@ kubelet certificates, you can omit them. ```bash helm upgrade agentcube manifests/charts/base \ -n agentcube-system \ + --reuse-values \ --set spire.enabled=true \ --set spire.agent.insecureBootstrap=true \ --set spire.agent.skipKubeletVerification=true @@ -305,11 +308,13 @@ plane components, run the Helm upgrade again with `spire.enabled=false`: ```bash helm upgrade agentcube manifests/charts/base \ -n agentcube-system \ + --reuse-values \ --set spire.enabled=false ``` -This removes all SPIRE resources (Server, Agent, CRDs, sidecars) and the -Router/WorkloadManager pods will restart with `1/1` containers. +This removes all SPIRE workloads (Server, Agent), sidecars, and ClusterSPIFFEID +resources from this Helm release. The Router/WorkloadManager pods will restart +with `1/1` containers. To also remove the SPIRE Controller Manager CRDs: diff --git a/pkg/mtls/wait.go b/pkg/mtls/wait.go index 79f5cf1e..71b25876 100644 --- a/pkg/mtls/wait.go +++ b/pkg/mtls/wait.go @@ -21,6 +21,8 @@ import ( "os" "strings" "time" + + "k8s.io/klog/v2" ) // DefaultCertificateFileWaitTimeout bounds the startup race while spiffe-helper writes the initial SVID files. 
@@ -41,6 +43,7 @@ func WaitForCertificateFiles(cfg Config, timeout time.Duration) error { return fmt.Errorf("failed to access mTLS cert/key/CA files: %w", err) } if exist { + klog.Infof("All mTLS cert/key/CA files are present") return nil } missing = currentMissing From bd05fe571f83f1c3de49ea6f08dadc18c75ce89a Mon Sep 17 00:00:00 2001 From: Mahil Patel Date: Mon, 1 Jun 2026 23:16:11 +0530 Subject: [PATCH 4/4] docs: update SPIRE tutorial with authentic outputs and fix namespace - Replaced generalized output placeholders (xxxxx pod hashes, ... UUIDs, and XX-XX timestamps) with actual outputs to prevent ambiguity. - Updated expected log outputs for Router and WorkloadManager to accurately reflect the format emitted by the codebase. - Fixed agentcube-system namespace inconsistencies across the documentation to align with the core getting-started guide. Signed-off-by: Mahil Patel --- .../docs/tutorials/internal-auth-spire.md | 105 +++++++++--------- pkg/mtls/loader_test.go | 4 +- pkg/mtls/spiffeid.go | 4 +- pkg/mtls/spiffeid_test.go | 4 +- pkg/router/session_manager_test.go | 2 +- test/e2e/run_e2e.sh | 2 +- 6 files changed, 63 insertions(+), 58 deletions(-) diff --git a/docs/agentcube/docs/tutorials/internal-auth-spire.md b/docs/agentcube/docs/tutorials/internal-auth-spire.md index c39a74a5..16fdf22f 100644 --- a/docs/agentcube/docs/tutorials/internal-auth-spire.md +++ b/docs/agentcube/docs/tutorials/internal-auth-spire.md @@ -19,23 +19,23 @@ automatically. 3. 
Confirm AgentCube is running without SPIRE: ```bash - kubectl get pods -n agentcube-system + kubectl get pods -n agentcube ``` You should see the Router and WorkloadManager pods in `Running` state, each showing `1/1` containers ready (no sidecar yet): - ``` - NAME READY STATUS RESTARTS AGE - agentcube-router-5d8f9b7c4-xxxxx 1/1 Running 0 5m - workloadmanager-6b6bb75d98-xxxxx 1/1 Running 0 5m - ``` +``` + NAME READY STATUS RESTARTS AGE + agentcube-router-7fbb7b54c-7khq5 1/1 Running 0 8s + workloadmanager-6c44454f68-zmfcc 1/1 Running 0 8s +``` -> **Tip :** +> **Tip:** > If you are running on a local [Kind](https://kind.sigs.k8s.io/) or -[Minikube](https://minikube.sigs.k8s.io/) cluster, you will need to pass two -extra overrides in the Helm upgrade command shown below. These are already -included in the instructions, so just keep them in.. +> [Minikube](https://minikube.sigs.k8s.io/) cluster, you will need to pass two +> extra overrides in the Helm upgrade command shown below. These are already +> included in the instructions, so just keep them in. ## What gets deployed @@ -73,7 +73,7 @@ Expected output: ``` NAME CREATED AT -clusterspiffeids.spire.spiffe.io 2025-XX-XXTXX:XX:XXZ +clusterspiffeids.spire.spiffe.io 2026-06-01T16:22:32Z ``` ## Step 2 - Upgrade the Helm release with SPIRE enabled @@ -87,7 +87,7 @@ kubelet certificates, you can omit them. 
```bash helm upgrade agentcube manifests/charts/base \ - -n agentcube-system \ + -n agentcube \ --reuse-values \ --set spire.enabled=true \ --set spire.agent.insecureBootstrap=true \ @@ -100,10 +100,10 @@ This single command deploys the full SPIRE infrastructure **and** injects the Wait for everything to become ready: ```bash -kubectl rollout status statefulset/spire-server -n agentcube-system --timeout=120s -kubectl rollout status daemonset/spire-agent -n agentcube-system --timeout=120s -kubectl rollout status deployment/agentcube-router -n agentcube-system --timeout=120s -kubectl rollout status deployment/workloadmanager -n agentcube-system --timeout=120s +kubectl rollout status statefulset/spire-server -n agentcube --timeout=120s +kubectl rollout status daemonset/spire-agent -n agentcube --timeout=120s +kubectl rollout status deployment/agentcube-router -n agentcube --timeout=120s +kubectl rollout status deployment/workloadmanager -n agentcube --timeout=120s ``` ## Step 3 - Verify SPIRE is healthy @@ -111,16 +111,16 @@ kubectl rollout status deployment/workloadmanager -n agentcube-system --timeout= Check that the SPIRE Server is up and has registered agents: ```bash -kubectl exec -n agentcube-system statefulset/spire-server -c spire-server -- \ +kubectl exec -n agentcube statefulset/spire-server -c spire-server -- \ /opt/spire/bin/spire-server agent list ``` You should see at least one agent entry (one per cluster node): ``` -Found X attested agent(s): +Found 1 attested agent(s): -SPIFFE ID : spiffe://cluster.local/spire/agent/k8s_psat/agentcube-cluster/... +SPIFFE ID : spiffe://cluster.local/spire/agent/k8s_psat/agentcube-cluster/67790303-3657-42d6-bf4f-c3833ec6dd5e Attestation type : k8s_psat ... 
``` @@ -129,23 +129,23 @@ Next, confirm the identity registrations were picked up from the `ClusterSPIFFEID` resources: ```bash -kubectl exec -n agentcube-system statefulset/spire-server -c spire-server -- \ +kubectl exec -n agentcube statefulset/spire-server -c spire-server -- \ /opt/spire/bin/spire-server entry show ``` You should see entries for both the Router and WorkloadManager, with SPIFFE IDs following the format -`spiffe://cluster.local/ns/agentcube-system/sa/`: +`spiffe://cluster.local/ns/agentcube/sa/`: ``` -Entry ID : ... -SPIFFE ID : spiffe://cluster.local/ns/agentcube-system/sa/agentcube-router -Parent ID : ... +Entry ID : bfd507ec-10d8-43e5-b984-861a3ff81167 +SPIFFE ID : spiffe://cluster.local/ns/agentcube/sa/agentcube-router +Parent ID : spiffe://cluster.local/spire/agent/k8s_psat/agentcube-cluster/67790303-3657-42d6-bf4f-c3833ec6dd5e Revision : 0 -Entry ID : ... -SPIFFE ID : spiffe://cluster.local/ns/agentcube-system/sa/workloadmanager -Parent ID : ... +Entry ID : 21e3ba6f-ad13-4076-9e08-90a2d4ff518f +SPIFFE ID : spiffe://cluster.local/ns/agentcube/sa/workloadmanager +Parent ID : spiffe://cluster.local/spire/agent/k8s_psat/agentcube-cluster/67790303-3657-42d6-bf4f-c3833ec6dd5e Revision : 0 ``` @@ -155,46 +155,47 @@ Confirm that both the Router and WorkloadManager pods now show `2/2` containers (the main container + the `spiffe-helper` sidecar): ```bash -kubectl get pods -n agentcube-system +kubectl get pods -n agentcube ``` Expected output: ``` -NAME READY STATUS RESTARTS AGE -agentcube-router-7f8d4b9c6-xxxxx 2/2 Running 0 2m -workloadmanager-8c7dd85f9-xxxxx 2/2 Running 0 2m -spire-agent-xxxxx 1/1 Running 0 2m -spire-server-0 2/2 Running 0 2m +NAME READY STATUS RESTARTS AGE +agentcube-router-574d98b76-tr2nr 2/2 Running 5 (2m24s ago) 3m17s +spire-agent-8r9jx 1/1 Running 3 (2m44s ago) 3m17s +spire-server-0 2/2 Running 0 3m17s +workloadmanager-5797888bd4-jm2qj 2/2 Running 3 (118s ago) 3m17s ``` Check the Router logs to confirm mTLS is active. 
You should see a log line indicating it is waiting for, and then successfully loading, the certificates: ```bash -kubectl logs -n agentcube-system deployment/agentcube-router -c agentcube-router | grep -i mtls +kubectl logs -n agentcube deployment/agentcube-router -c agentcube-router | grep -i mtls ``` Expected output: ``` -Waiting for Router mTLS cert/key/CA files -All mTLS cert/key/CA files are present -Router→WorkloadManager mTLS enabled: expecting server SPIFFE ID spiffe://cluster.local/ns/agentcube-system/sa/workloadmanager +I0601 16:25:21.444099 1 main.go:64] Waiting for Router mTLS cert/key/CA files +I0601 16:25:21.444259 1 wait.go:46] All mTLS cert/key/CA files are present +I0601 16:25:21.445161 1 session_manager.go:84] Using https:// for WORKLOAD_MANAGER_URL because mTLS is configured +I0601 16:25:21.445482 1 session_manager.go:93] Router→WorkloadManager mTLS enabled: expecting server SPIFFE ID spiffe://cluster.local/ns/agentcube/sa/workloadmanager ``` Do the same for the WorkloadManager: ```bash -kubectl logs -n agentcube-system deployment/workloadmanager -c workloadmanager | grep -i mtls +kubectl logs -n agentcube deployment/workloadmanager -c workloadmanager | grep -i mtls ``` Expected output: ``` -Waiting for WorkloadManager mTLS cert/key/CA files -All mTLS cert/key/CA files are present -WorkloadManager mTLS enabled: accepting clients with valid SPIRE-provisioned certificates +I0601 16:25:22.561316 1 main.go:80] Waiting for WorkloadManager mTLS cert/key/CA files +I0601 16:25:22.561931 1 wait.go:46] All mTLS cert/key/CA files are present +I0601 16:25:22.678777 1 server.go:218] WorkloadManager mTLS enabled: accepting clients with valid SPIRE-provisioned certificates ``` ## Step 5 - Test it end-to-end @@ -230,18 +231,22 @@ spec: EOF ``` -Port-forward the Router and send a request: +Open a new terminal and port-forward the Router: + +```bash +kubectl port-forward -n agentcube svc/agentcube-router 8080:8080 +``` + +In your original terminal, send a request 
to the root path of the sandbox: ```bash -kubectl port-forward -n agentcube-system svc/agentcube-router 8080:8080 & -curl -s -o /dev/null -w "%{http_code}" \ - http://localhost:8080/v1/namespaces/default/agent-runtimes/mtls-test/invocations/test/ +curl -i http://localhost:8080/v1/namespaces/default/agent-runtimes/mtls-test/invocations/ ``` If the mTLS handshake between Router and WorkloadManager succeeds, you will see -a `200` (or `502` while the sandbox is still booting - retry after a few -seconds). A TLS-related error in the Router logs would indicate a -misconfiguration. +a `200 OK` response with a directory listing from the Python server (or a `502` +while the sandbox is still booting - just retry after a few seconds). A +TLS-related error in the Router logs would indicate a misconfiguration. ## Understanding what changed @@ -262,7 +267,7 @@ The Helm chart passes these flags to the Router binary: When all three are present, the Router creates a dedicated HTTPS transport for its WorkloadManager connection. It verifies that the WorkloadManager's certificate contains the expected SPIFFE ID -(`spiffe://cluster.local/ns/agentcube-system/sa/workloadmanager`). +(`spiffe://cluster.local/ns/agentcube/sa/workloadmanager`). 
### WorkloadManager (mTLS server) @@ -307,7 +312,7 @@ plane components, run the Helm upgrade again with `spire.enabled=false`: ```bash helm upgrade agentcube manifests/charts/base \ - -n agentcube-system \ + -n agentcube \ --reuse-values \ --set spire.enabled=false ``` diff --git a/pkg/mtls/loader_test.go b/pkg/mtls/loader_test.go index 27517e10..793c75c6 100644 --- a/pkg/mtls/loader_test.go +++ b/pkg/mtls/loader_test.go @@ -263,7 +263,7 @@ func TestLoadServerConfig_InvalidCAPEM(t *testing.T) { // --- SPIFFE ID verification --- func TestVerifyServerCert_MatchingID(t *testing.T) { - spiffeID := "spiffe://cluster.local/ns/agentcube-system/sa/workloadmanager" + spiffeID := "spiffe://cluster.local/ns/agentcube/sa/workloadmanager" certFile, _, caFile := generateTestCertsWithSPIFFEID(t, spiffeID) rawCert := readRawCert(t, certFile) @@ -282,7 +282,7 @@ func TestVerifyServerCert_MatchingID(t *testing.T) { } func TestVerifyServerCert_UntrustedCA(t *testing.T) { - spiffeID := "spiffe://cluster.local/ns/agentcube-system/sa/workloadmanager" + spiffeID := "spiffe://cluster.local/ns/agentcube/sa/workloadmanager" certFile, _, _ := generateTestCertsWithSPIFFEID(t, spiffeID) // Use a DIFFERENT CA — chain verification should fail diff --git a/pkg/mtls/spiffeid.go b/pkg/mtls/spiffeid.go index 0e511ab0..287b7592 100644 --- a/pkg/mtls/spiffeid.go +++ b/pkg/mtls/spiffeid.go @@ -25,7 +25,7 @@ import ( const ( defaultTrustDomain = "cluster.local" trustDomainEnvVar = "AGENTCUBE_SPIFFE_TRUST_DOMAIN" - defaultNamespace = "agentcube-system" + defaultNamespace = "agentcube" namespaceEnvVar = "AGENTCUBE_NAMESPACE" ) @@ -33,7 +33,7 @@ const ( // These follow the Istio-convention format: spiffe://<trust-domain>/ns/<namespace>/sa/<service-account>. // The trust domain defaults to cluster.local and can be overridden with AGENTCUBE_SPIFFE_TRUST_DOMAIN // to match the SPIRE trust domain configured by deployment tooling. -// The namespace defaults to agentcube-system and can be overridden with AGENTCUBE_NAMESPACE. 
+// The namespace defaults to agentcube and can be overridden with AGENTCUBE_NAMESPACE. var ( // RouterSPIFFEID is the SPIFFE identity for the Router component. RouterSPIFFEID = componentSPIFFEID(configuredTrustDomain(), configuredNamespace(), "agentcube-router") diff --git a/pkg/mtls/spiffeid_test.go b/pkg/mtls/spiffeid_test.go index bdd328d1..71baede8 100644 --- a/pkg/mtls/spiffeid_test.go +++ b/pkg/mtls/spiffeid_test.go @@ -43,8 +43,8 @@ func TestConfiguredNamespace(t *testing.T) { } func TestComponentSPIFFEID(t *testing.T) { - got := componentSPIFFEID("example.org", "agentcube-system", "agentcube-router") - want := "spiffe://example.org/ns/agentcube-system/sa/agentcube-router" + got := componentSPIFFEID("example.org", "agentcube", "agentcube-router") + want := "spiffe://example.org/ns/agentcube/sa/agentcube-router" if got != want { t.Fatalf("componentSPIFFEID() = %q, want %q", got, want) } diff --git a/pkg/router/session_manager_test.go b/pkg/router/session_manager_test.go index 8b585f4b..0b61f28d 100644 --- a/pkg/router/session_manager_test.go +++ b/pkg/router/session_manager_test.go @@ -636,7 +636,7 @@ func generateTestCertsForRouter(t *testing.T, dir string) (certFile, keyFile, ca if err != nil { t.Fatalf("generate leaf key: %v", err) } - spiffeURL, _ := url.Parse("spiffe://cluster.local/ns/agentcube-system/sa/agentcube-router") + spiffeURL, _ := url.Parse("spiffe://cluster.local/ns/agentcube/sa/agentcube-router") leafTemplate := &x509.Certificate{ SerialNumber: big.NewInt(2), Subject: pkix.Name{Organization: []string{"Test Router"}}, diff --git a/test/e2e/run_e2e.sh b/test/e2e/run_e2e.sh index 0c215718..e14c1c5b 100755 --- a/test/e2e/run_e2e.sh +++ b/test/e2e/run_e2e.sh @@ -18,7 +18,7 @@ WORKLOAD_MANAGER_IMAGE=${WORKLOAD_MANAGER_IMAGE:-workloadmanager:latest} ROUTER_IMAGE=${ROUTER_IMAGE:-agentcube-router:latest} PICOD_IMAGE=${PICOD_IMAGE:-picod:latest} REDIS_IMAGE=${REDIS_IMAGE:-redis:7-alpine} -AGENTCUBE_NAMESPACE=${AGENTCUBE_NAMESPACE:-agentcube-system} 
+AGENTCUBE_NAMESPACE=${AGENTCUBE_NAMESPACE:-agentcube} WORKLOAD_NAMESPACE=${WORKLOAD_NAMESPACE:-agentcube} E2E_VENV_DIR=${E2E_VENV_DIR:-/tmp/agentcube-e2e-venv} MCP_K8S_LOCAL_PORT=${MCP_K8S_LOCAL_PORT:-19446}