diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 504f5b1..547d5f7 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -32,6 +32,22 @@ jobs: python-version: ${{ matrix.python-version }} cache: 'pip' + - name: Install latest opencode + env: + GH_TOKEN: ${{ github.token }} + run: | + version="$( + curl -fsSL \ + -H "Authorization: Bearer ${GH_TOKEN}" \ + -H "Accept: application/vnd.github+json" \ + https://api.github.com/repos/anomalyco/opencode/releases/latest \ + | python -c 'import json,sys; print(json.load(sys.stdin)["tag_name"].lstrip("v"))' + )" + + echo "Installing opencode latest: $version" + curl -fsSL https://opencode.ai/install | bash -s -- --version "$version" + "$HOME/.opencode/bin/opencode" --version + - name: Install dependencies run: | python -m pip install --upgrade pip @@ -127,4 +143,4 @@ jobs: issue_number: context.issue.number, body: body, }); - } \ No newline at end of file + } diff --git a/.project/migrate-to-opencode-serve.md b/.project/migrate-to-opencode-serve.md new file mode 100644 index 0000000..29bc162 --- /dev/null +++ b/.project/migrate-to-opencode-serve.md @@ -0,0 +1,413 @@ +# Migration Plan: run-agent.py from subprocess to opencode serve HTTP+SSE + +**Status:** ✅ Implemented +**Tests:** 29 new unit + E2E tests written in `tests/test_new_serve_stack.py` (all passing) +**Total test suite:** 216 passed, 4 skipped, 0 failed +**Date:** 2026-05-16 +**Target:** `tools/run-agent.py` +**Minimum OpenCode Version:** 1.14.50 +**Risk Level:** Medium (large refactor, affects all phase targets) + +--- + +## 1. Executive Summary + +Replace the current architecture where `run-agent.py` spawns `opencode run --format json` as a subprocess and parses ND-JSON from stdout, with a direct HTTP+SSE integration against a locally-managed `opencode serve` instance. + +For each `make phase-X` invocation, `run-agent.py` will: + +1. Start a dedicated `opencode serve` process on an ephemeral port +2. 
Create a session via `POST /session` +3. Send the phase prompt via `POST /session/{id}/prompt_async` +4. Consume real-time events from `GET /event` (Server-Sent Events) +5. Map SSE events to the existing ND-JSON format consumed by `render_event()` +6. Terminate the server on exit (leaving the session in the opencode DB) + +--- + +## 2. Motivation + +| Aspect | Current (`opencode run`) | New (`opencode serve`) | +|---|---|---| +| Coupling | Spawns CLI subprocess, relies on stdout format stability | Uses official HTTP API with OpenAPI spec | +| Real-time delivery | Buffered stdout pipe | Native SSE stream | +| Multi-client support | Single process only | Server can serve other clients (IDE, web UI) | +| Tool permissions | Interactive/non-interactive baked into CLI behavior | Explicit API control | +| Model resolution | Requires probe sessions via `opencode run` + export | Read from `GET /config` or observe session events | +| Decoupling | `opencode run` is the bottleneck for all features | Independent HTTP surface, clearer boundaries | + +--- + +## 3. Research Foundation + +This plan is based on a deep source-code analysis of OpenCode v1.14.50, including: + +- Extracting and reading the compiled Bun binary strings +- Fetching the official documentation (`https://opencode.ai/docs/server/`, `https://opencode.ai/docs/sdk`) +- Hands-on probing of all REST and SSE endpoints +- Source-code walkthrough of `packages/opencode/src/cli/cmd/run.ts`, `packages/opencode/src/server/routes/instance/httpapi/handlers/event.ts`, and SDK internals +- Identifying the exact mapping between SSE events and the ND-JSON events emitted by `opencode run --format json` + +Full research report: `.project/opencode-run-internals-report.md` + +--- + +## 4. 
Target Directory Structure + +``` +tools/ +├── run-agent.py # Entry point, orchestration loop +├── opencode/ # NEW: Server lifecycle management +│ ├── __init__.py +│ └── serve.py # ServerRunner (start/stop/health) + convenience CLI +├── events/ # NEW: SSE consume → map → emit +│ ├── __init__.py # EventLoop coordinator +│ ├── sse_client.py # SSE HTTP connection, reconnect, heartbeat +│ ├── state_tracker.py # Accumulate deltas, track part versions +│ ├── mapper.py # SSE → ND-JSON translation +│ └── emitters.py # Bridge to existing render_event() +└── _colors.py # Existing, unchanged +``` + +> Rationale: `opencode/` (containing `serve.py`) and `events/` are internal support modules, not callable CLI tools. Placing them in packages keeps the `tools/` namespace clean. + +--- + +## 5. Execution Flow (Per Phase) + +### 5.1 Server Startup + +```python +from opencode.serve import ServerRunner + +runner = ServerRunner() +server_info = runner.start( + hostname="127.0.0.1", + port=0, # random ephemeral port + log_level="WARN", +) +# server_info → { proc, base_url, pid, port } +``` + +`ServerRunner.start()` will: + +1. Spawn `opencode serve --port 0` (or `_find_free_port()` internally) along with other flags. +2. The server will bind to the assigned port and generate a random local authentication token. +3. Poll `GET /global/health` until `{healthy: true}` is returned, passing the generated token. +4. Return a `ServerInfo` dataclass (containing `port` and `password`). + +If startup fails, print error details and exit non-zero. **No auto-retry.** + +### 5.2 Session Creation + +```python +import requests + +resp = requests.post(f"{server_info.base_url}/session", json={ + "title": f"CodeCome Phase {args.phase}" +}) +session = resp.json() +session_id = session["id"] +``` + +### 5.3 Configure Session + +`POST /session` accepts `agent` and `model` directly, so the create call in +step 5.2 can pin them up front (the snippet above sends only `title`). 
If you need to change them later you can also +`PATCH /session/{id}`: + +```python +requests.patch(f"{server_info.base_url}/session/{session_id}", json={ + "agent": args.agent, + "model": resolved_model, # e.g. {"providerID": "github-copilot", "modelID": "gpt-5.4"} +}) +``` + +### 5.4 Send Prompt + +```python +prompt_text = load_prompt(prompt_file, args.finding, phase=args.phase) + +requests.post( + f"{server_info.base_url}/session/{session_id}/prompt_async", + json={ + "parts": [{"type": "text", "text": prompt_text}], + } +) +``` + +The `prompt_async` endpoint returns immediately (204 No Content). The model will process the prompt asynchronously and emit events on the global SSE stream. + +### 5.5 Consume SSE Events + +```python +from events import EventLoop + +event_loop = EventLoop( + base_url=server_info.base_url, + session_id=session_id, + console=console, + phase=args.phase, + label=args.label, +) + +result = event_loop.run() # blocks until session idle or terminal error +# result → { finish_reason, step_finish_count, last_finish_tokens, ... } +``` + +`EventLoop` internals: + +1. Open `GET /event` with `Accept: text/event-stream` +2. Parse SSE `data:` lines → JSON objects +3. Filter events by `session_id` +4. Distribute to `StateTracker` for accumulation +5. When `StateTracker` detects a finalized part, call `Mapper` → emit ND-JSON +6. `Emitters` forwards to existing `render_event(console, phase, label, ndjson_event)` + +### 5.6 Termination & Cleanup + +```python +runner.stop() +``` + +`ServerRunner.stop()` will: + +1. Send `SIGTERM` to the serve process +2. Wait up to 5 seconds for graceful exit +3. Send `SIGKILL` if still alive +4. **Does not delete the session** — left in the opencode DB for inspection + +--- + +## 6. SSE → ND-JSON Mapping (Critical Compatibility Layer) + +All existing rendering code (~4,000 lines in `run-agent.py`) expects these exact ND-JSON shapes. The Mapper module must produce identical output. 
+ +| SSE Event from `GET /event` | Condition | Mapped ND-JSON Event | +|---|---|---| +| `message.part.updated` | `part.type == "step-start"` | `{"type": "step_start", "part": part}` | +| `message.part.updated` | `part.type == "text"` and `part.time.end` exists | `{"type": "text", "part": part}` | +| `message.part.updated` | `part.type == "reasoning"` and `part.time.end` exists | `{"type": "reasoning", "part": part}` | +| `message.part.updated` | `part.type == "tool"` | `{"type": "tool_use", "part": part}` | +| `message.part.updated` | `part.type == "step-finish"` | `{"type": "step_finish", "part": part}` | +| `session.error` | `properties.sessionID == ours` | `{"type": "error", "error": err}` | + +### Text Accumulation Pattern + +The server sends `message.part.delta` events with tiny text fragments: + +```json +{"type":"message.part.delta","properties":{"sessionID":"...","partID":"...","field":"text","delta":"Hello"}} +``` + +`StateTracker` accumulates these by `partID`. When the corresponding `message.part.updated` arrives with `time.end`, the accumulated text is injected into `part.text` before mapping. + +> **TODO (Future):** Factor out `StateTracker` text accumulation so we can stream text fragments in real-time without waiting for `time.end`. + +--- + +## 7. Tool Permissions + +**Behavior:** Auto-reject all permission requests, with a visible warning to the user (same as current `opencode run` non-interactive behavior). + +```python +if event_type == "permission.asked" and session_id == ours: + perm_id = event["properties"]["id"] + requests.post( + f"{base_url}/permission/{perm_id}/reply", + json={"reply": "reject"} + ) + render_permission_error_plain(error_message) +``` + +The `permission.asked` event appears on the SSE stream when the model requests permission to run a tool. We must respond via `POST /permission/{requestID}/reply` to unblock the session. + +--- + +## 8. 
Termination Controls & Auto-Resume + +The existing termination logic in `run-agent.py` (lines ~4686–4864) must be preserved exactly: + +1. **Finish reason classification** + - `stop` → OK + - `tool-calls` → incomplete (mid-turn cutoff) + - `error`, `length`, `max_tokens` → failure + +2. **Graceful completion check** — `check_phase_graceful_completion()` + - If the model stopped mid-turn but the required artifacts were already written, treat the phase as complete and exit 0. + +3. **Auto-resume logic** + - If finish reason is `tool-calls` (iteration limit hit), build a resume prompt and send a new `prompt_async` to the same session. + - Budget: `CODECOME_MAX_ITERATION_RETRIES` env var (default 1). + +4. **Frontmatter validation auto-correction** + - After `session.status` → `idle`, run `tools/check-frontmatter.py`. + - If it fails, send a repair prompt via `prompt_async` (max 2 retries). + +5. **Step finish tracking** + - Count `step_finish` events to report in resume prompts. + - Track `last_finish_reason`, `last_finish_tokens`. + +`EventLoop.run()` will return a `RunResult` object containing all signals needed by the existing termination logic. + +--- + +## 9. Resilience & Reconnect + +1. **Auto-reconnect**: If the SSE connection drops, reconnect to `/event` with exponential backoff (3s → 30s max). +2. **Heartbeat monitoring**: If no `server.heartbeat` for >15s, treat as dead and reconnect. +3. **Missed events**: The server sets `id: undefined` on all SSE events, so `Last-Event-ID` replay **does not work**. On reconnect: + - Poll `GET /session/{session_id}/message` to get the full current message list. + - Compare with our `StateTracker`. + - Emit synthetic ND-JSON events for any finalized parts we missed. +4. **Server crash**: If `opencode serve` exits unexpectedly, print error and exit non-zero. + +--- + +## 10. `make show-model` Migration + +Currently, `show_model_table()` builds a dry-run command to probe the effective model. With the serve API: + +1. 
Start a **transient** `opencode serve` on a random port. +2. Query `GET /config` and `GET /provider` for defaults. +3. Apply existing precedence logic (OPENCODE_ARGS → env `CODECOME_MODEL` → `codecome.yml` → unknown). +4. Print the resolution table. +5. Stop the transient server. + +**Overhead:** ~2-3 seconds (acceptable for a diagnostic command). +**Benefit:** Removes the need for probe sessions; keeps resolution consistent with the actual runtime path. + +--- + +## 11. Files to Create / Modify + +### New Files + +| File | Lines (est.) | Description | +|---|---|---| +| `tools/opencode/__init__.py` | 5 | Package init | +| `tools/opencode/serve.py` | 200 | `ServerRunner` class: start/stop server, port discovery, health check, convenience CLI | +| `tools/events/__init__.py` | 50 | Package init, `EventLoop` coordinator class | +| `tools/events/sse_client.py` | 200 | HTTP SSE consumer: connection, parse, reconnect logic, heartbeat monitoring | +| `tools/events/state_tracker.py` | 250 | Accumulate `message.part.delta` fragments, track part versions, detect finalized parts | +| `tools/events/mapper.py` | 200 | Translate SSE events into ND-JSON compatible with existing `render_event()` | +| `tools/events/emitters.py` | 50 | Thin wrapper: calls existing `render_event(console, phase, label, event)` | + +### Modified Files + +| File | Lines Changed (est.) | Description | +|---|---|---| +| `tools/run-agent.py` | -500 (net reduction) | Replace `subprocess.Popen` loop with `ServerRunner` + `EventLoop`. Keep all `render_*()` functions. | +| `tests/test_run_agent.py` | +400 new/modified | Replace `FakePopen` fixtures with `FakeServer` + `FakeSSE`. Add mapper/state unit tests. | + +### No Changes Required + +| File | Why | +|---|---| +| `Makefile` | `make phase-X` commands stay identical from user perspective | +| `AGENTS.md` | No behavioral changes to agent contracts | +| `codecome.yml` | No config changes needed | +| `_colors.py` | Unchanged | + +--- + +## 12. 
Convenience CLI + +`tools/opencode/serve.py` will include a development CLI: + +```bash +# Start a server manually for debugging +python tools/opencode/serve.py start --port 8080 --log-level DEBUG + +# Stop a running server by PID +python tools/opencode/serve.py stop --pid 12345 +``` + +Implemented via `if __name__ == "__main__": argparse` at the bottom of the file. + +--- + +## 13. Backward Compatibility + +| Surface | Impact | +|---|---| +| `make phase-X` | No changes. All existing env vars (`CODECOME_MODEL`, `CODECOME_THINKING`, `PROMPT_EXTRA`, etc.) respected. | +| `make show-model` | Uses transient server instead of probe sessions. Output format identical. | +| `CODECOME_USE_WRAPPER=0` | Deprecated. There is no `opencode run` fallback anymore. Warn and exit. | +| Environment variables | All `CODECOME_*` env vars continue to work as before. | +| Rendering output | Visually identical. Same panels, colors, icons, truncation, diffs. | +| Transcripts | Still written to `tmp/last-phase-{N}-{finding}-attempt-{M}.jsonl` | + +**Minimum OpenCode version bump:** from `1.14.39` to `1.14.50` (required for stable SSE API surface). + +--- + +## 14. Risks & Mitigations + +| Risk | Likelihood | Impact | Mitigation | +|---|---|---|---| +| SSE mapping drifts from opencode source on upgrade | Medium | High | Research is documented in `.project/opencode-run-internals-report.md`; re-sync on opencode upgrades. Pin max version in `check_opencode_version()`. | +| Server startup overhead per phase | Low | Low | ~1-2s per phase; acceptable for research workflow. | +| Localhost network issues (SSE drop) | Low | Medium | Reconnect logic + state polling fallback. | +| opencode serve API changes in future releases | Medium | High | Version gate in `check_opencode_version()`. | +| Tool permission auto-reject breaks some agent workflows | Low | High | Same as current behavior; no regression. Can be made configurable later. 
| +| StateTracker misses deltas due to reconnect gap | Low | Medium | Poll `GET /session/{id}/message` on reconnect to fill gaps. | + +--- + +## 15. Testing Strategy + +1. **Unit tests** (new): + - `test_mapper.py` — verify each SSE event type maps to correct ND-JSON + - `test_state_tracker.py` — verify delta accumulation, finalization detection + - `test_sse_client.py` — verify reconnect, heartbeat, error handling + - `test_server_runner.py` — verify port parsing, health check, stop logic + +2. **Integration tests** (modify existing): + - Replace `FakePopen` with `FakeServer` that yields SSE events + - Verify `EventLoop.run()` produces identical output transcript + - Verify auto-resume still works with new architecture + - Verify frontmatter auto-correction still works + +3. **End-to-end smoke test**: + - Run `make show-model` with a real model configured + - Run a minimal `make phase-1` with a small prompt + - Verify panels render correctly (text, tool_use, step_start, step_finish) + - Verify transcript file matches expected ND-JSON format + +4. **Regression test**: + - Compare output of old `CODECOME_USE_WRAPPER=0 make phase-X` vs new implementation on same prompt + - Verify byte-identical rendering (modulo timestamps) + +--- + +## 16. Implementation Order + +1. Create `tools/opencode/` package with `serve.py` +2. Create `tools/events/` package with `sse_client.py`, `state_tracker.py`, `mapper.py`, `emitters.py` +3. Refactor `tools/run-agent.py` — replace subprocess loop with orchestration +4. Update `tests/test_run_agent.py` — new fixtures for server/SSE mocks +5. Run `make tests` → fix failures +6. Smoke test with real model prompt +7. Update `AGENTS.md` if any behavioral changes observed + +--- + +## 17. 
Decision Log + +| Question | Decision | Rationale | +|---|---|---| +| Module layout | `tools/opencode/serve.py` + `tools/events/` | Keeps `tools/` namespace clean; clearly identifies support modules | +| `make show-model` approach | Transient server | Most reliable; removes probe sessions; consistent with runtime | +| Auto-retry on server startup | No | User asked to show error and finish | +| Delete session on cleanup | No | Leave in opencode DB for inspection | +| Text streaming | Accumulate, finalize on `time.end` | Matches current behavior; TODO for future real-time streaming | +| Permission handling | Auto-reject with visible warning | Same as current `opencode run` non-interactive mode | +| Convenience CLI | Yes, in `serve.py` | Development/debugging aid | +| Minimum version bump | 1.14.39 → 1.14.50 | Required for stable SSE API | + +--- + +*Plan written: 2026-05-16* +*Approved by: pruiz* +*Next step: Begin implementation (Phase 1: `tools/opencode/serve.py`)* diff --git a/.project/mock-llm-parity-plan.md b/.project/mock-llm-parity-plan.md new file mode 100644 index 0000000..86ab37e --- /dev/null +++ b/.project/mock-llm-parity-plan.md @@ -0,0 +1,293 @@ +# Plan: Deterministic Mock-LLM Parity Testing for opencode run vs opencode serve + +**Status:** Implemented ✅ +**Date:** 2026-05-18 +**Author:** CodeCome Agent +**Target:** Replace `tools/opencode-parity.py` with deterministic mock-LLM approach +**Risk Level:** Low (new testing infrastructure, no production code changes) + +--- + +## 1. Problem Statement + +The existing parity verification in `tools/opencode-parity.py` compares event shapes between: +- `opencode run --format json` (subprocess → ND-JSON stdout) +- `opencode serve` HTTP+SSE (SSE stream → mapped ND-JSON) + +**Why it is insufficient:** +- Shape comparison alone does not validate that rendering logic (Rich/plain) handles both paths identically. 
+- Without a deterministic LLM backend, every run produces different tokens, making regression testing impossible. +- The mock must exercise the full pipeline: OpenCode consumes a deterministic LLM stream, produces ND-JSON/SSE events, and the client renders them. + +--- + +## 2. Goal + +Build a deterministic mock LLM **provider** (not replacing OpenCode) that: +1. Speaks standard OpenAI-compatible streaming (`POST /v1/chat/completions`). +2. Is registered as a `"test"` provider in `opencode.json`. +3. Reads a JSON script file defining the exact sequence of deltas and tool calls to emit. +4. Can be referenced as `-m test/mockmodel` in both `opencode run` and `opencode serve`. +5. Allows structural ND-JSON parity comparison between the two OpenCode paths. +6. Optionally captures rendered terminal output for human regression review. + +**Key insight:** CodeCome never talks to the MockLLM directly. CodeCome talks to OpenCode. OpenCode talks to the MockLLM via the standard OpenAI chat-completions protocol. The MockLLM only needs to emit deterministic `ChatCompletionChunk` SSE deltas. + +--- + +## 3. Architecture + +``` +┌─────────────────┐ OpenAI API ┌──────────────┐ ND-JSON stdout ┌─────────────┐ +│ MockLLM │◄──────────────────► │ opencode run │ ─────────────────────► │ CodeCome │ +│ (provider.test) │ (SSE chunks) │ │ │ parity │ +└─────────────────┘ └──────────────┘ │ checker │ + └─────────────┘ +┌─────────────────┐ OpenAI API ┌──────────────┐ HTTP+SSE ▲ +│ MockLLM │◄──────────────────► │opencode serve│ ──────────────────────┘ +│ (provider.test) │ (SSE chunks) │ │ +└─────────────────┘ └──────────────┘ +``` + +### 3.1 Mock LLM Server + +**Approach:** Small custom stdlib-only OpenAI-compatible mock server (`tools/mock-llm-server.py`). +- Reads a JSON script file at startup (e.g., `--script tools/mock_llm_scripts/basic.json`). +- Serves standard OpenAI-compatible endpoints: + - `POST /v1/chat/completions` — streaming SSE with deterministic deltas. 
+ - `GET /v1/models` — returns `[{"id":"mockmodel"}]`. +- Translates the JSON script into standard `ChatCompletionChunk` SSE events. +- No control endpoints needed; behavior is entirely determined by the script file. + +**Why custom instead of `mockllm`:** Minimal tokens, full control, no new dependencies (FastAPI/uvicorn already used elsewhere in the project). + +### 3.2 JSON Script Format + +The script file is a JSON array of LLM-side actions. The mock server translates these into standard OpenAI `ChatCompletionChunk` SSE events. + +```json +[ + {"type": "text_delta", "content": "Hello "}, + {"type": "text_delta", "content": "world!"}, + {"type": "reasoning_delta", "content": "Let me think..."}, + {"type": "tool_call", "id": "call_1", "name": "read_file", "arguments": {"path": "/tmp/foo.txt"}}, + {"type": "done"} +] +``` + +The mock server translates `tool_call` into the OpenAI `function_call` / `tool_calls` delta format. + +### 3.3 Permission Testing + +To trigger a `permission` event in OpenCode, the mock LLM emits a `tool_call` for a tool that is **not** auto-approved by the existing `permissions` block in `opencode.json`. +- The CodeCome harness already accepts or rejects permissions automatically based on context + `opencode.json` rules. +- No interactive prompting is required. +- The parity test verifies that both `opencode run` and `opencode serve` emit the permission event correctly before the tool executes. + +--- + +## 4. Provider Registration + +Add to `opencode.json`: + +```json +{ + "provider": { + "test": { + "type": "openai", + "baseURL": "http://localhost:9999/v1", + "apiKey": "sk-test", + "models": ["mockmodel"] + } + } +} +``` + +This allows `-m test/mockmodel` to resolve correctly in both CLI and server contexts. + +--- + +## 5. Test Orchestration + +### 5.1 Test Script: `tools/mock-llm-parity.py` + +Replaces `tools/opencode-parity.py`. Steps: + +1. 
**Start mock server** on ephemeral port (`python tools/mock-llm-server.py --port $PORT --script $SCRIPT`). +2. **Path A — opencode run:** + - Execute `opencode run --format json -m test/mockmodel -p "Test prompt"`. + - Capture stdout ND-JSON to `tmp/parity-run.jsonl`. +3. **Path B — opencode serve:** + - Execute `tools/run-agent.py` with the same model and prompt (which internally starts `opencode serve`). + - Instruct `run-agent.py` to dump its internal mapped ND-JSON stream to `tmp/parity-serve.jsonl`. + - Capture rendered terminal output to `tmp/parity-serve-rendered.txt` (optional). +4. **Compare:** + - Structural ND-JSON parity: compare `tmp/parity-run.jsonl` and `tmp/parity-serve.jsonl` line-by-line, ignoring `timestamp` and `session_id` fields. + - Rendered output parity (optional): if Rich/plain text is captured, strip ANSI sequences and compare. +5. **Report:** + - Exit code 0 if parity passes. + - Exit code 1 with diff if parity fails. + +### 5.2 Integration with `tests/test_new_serve_stack.py` + +Add a new test class `TestMockLLMParity` that: +- Auto-starts the mock server via `pytest.fixture(scope="session")`. +- Runs both paths inside the test process. +- Asserts ND-JSON parity. +- Runs in CI (non-interactive, no TTY required). + +--- + +## 6. Acceptance Criteria + +- [x] `tools/mock-llm-server.py` exists and serves deterministic OpenAI-compatible SSE streams from JSON script files. +- [x] `tools/mock_llm_scripts/` contains `basic.json`, `with_tool.json`, and `with_permission.json`. +- [x] `opencode.json` contains `provider.test` block. +- [x] `tools/mock-llm-parity.py` exists and can be invoked manually. +- [x] `tests/test_mock_llm_parity.py` exists and passes in CI. +- [x] Existing `tools/opencode-parity.py` is deleted. +- [x] Existing `tests/test_opencode_parity.py` is deleted. +- [x] `Makefile` has a `test-parity` target. +- [x] All 216+ existing tests continue to pass. + +--- + +## 7. Rollout & Decommissioning + +1. 
Implement mock server and provider registration. +2. Write new parity script and tests. +3. Run side-by-side with old parity script for one week. +4. Once new approach is trusted, delete `tools/opencode-parity.py` and `tests/test_opencode_parity.py`. +5. Update `.project/migrate-to-opencode-serve.md` to mark parity testing as completed. + +--- + +## 8. Open Questions / Future Work + +- **TODO:** Support multi-turn conversation scripts (for session state testing). +- **Permission event:** The mock script includes a `tool_call` that triggers a permission check under the existing `opencode.json` rules. +- **Rendered terminal output comparison:** Strip ANSI sequences before diffing; use ANSI-aware diffing if a lightweight library is available. +- **Make target:** Add `make test-parity` to the Makefile. + +--- + +## 9. Extension: Comprehensive Multi-Turn Parity Testing + +### 9.1 Problem: Current Scripts Are Too Simple + +The initial scripts (`basic.json`, `with_tool.json`, `with_permission.json`) only test: +- Single text turn +- One tool per turn +- Two-turn sessions (text → tool → text) + +Real CodeCome sessions (from 25 recorded fixtures) show: +- **3–5 tool calls per assistant message** (not 1) +- **No content text before pure tool turns** (content=False) +- **7–22 turns per session** (not 2) +- **Mixed tool types**: `read`, `glob`, `bash`, `write`, `edit` in same session + +### 9.2 Solution: Fix Turn-Splitting Heuristic + +The current `mock_llm_server.py` splits at the **first** `tool_call`. It must instead split at **turn boundaries**: + +**New heuristic:** +- A **turn** = optional leading `text` + **all consecutive** `tool_call`s that follow it. +- Turn ends when the next action is `text` (after tools) or `done`. +- `done` always ends with `finish_reason: "stop"`. +- A turn with tools ends with `finish_reason: "tool_calls"`. +- A turn without tools ends with `finish_reason: "stop"`. 
+ +**Multi-turn dispatch (stateless):** +Count `role: "tool"` messages in the incoming request to determine which turn to serve: +- 0 tool messages → Turn 1 +- N tool messages (where N = sum of tools in all prior turns) → Turn K + +This requires no per-client tracking. + +### 9.3 Comprehensive Script Design + +Instead of ~10 small scripts, use **2–3 comprehensive scripts** that combine many patterns: + +#### `comprehensive.json` — Full tool coverage +8 turns, exercises: `read` (multi), `glob`, `grep`, `write`, `edit`, `bash`, `todowrite`, `skill`. + +```json +[ + {"type": "text", "content": "I'll read files."}, + {"type": "tool_call", "id": "call_1", "name": "read", "arguments": {"filePath": "README.md"}}, + {"type": "tool_call", "id": "call_2", "name": "read", "arguments": {"filePath": "AGENTS.md"}}, + {"type": "text", "content": "Let me search."}, + {"type": "tool_call", "id": "call_3", "name": "glob", "arguments": {"pattern": "src/**/*.c"}}, + {"type": "tool_call", "id": "call_4", "name": "grep", "arguments": {"pattern": "main", "path": "src"}}, + {"type": "text", "content": "Now I'll write and edit."}, + {"type": "tool_call", "id": "call_5", "name": "write", "arguments": {"filePath": "tmp/parity-test.txt", "content": "original"}}, + {"type": "tool_call", "id": "call_6", "name": "edit", "arguments": {"filePath": "tmp/parity-test.txt", "oldString": "original", "newString": "modified"}}, + {"type": "text", "content": "Running a command."}, + {"type": "tool_call", "id": "call_7", "name": "bash", "arguments": {"command": "echo hello"}}, + {"type": "text", "content": "Creating todos."}, + {"type": "tool_call", "id": "call_8", "name": "todowrite", "arguments": {"todos": [{"content":"test","status":"completed","priority":"high"}]}}, + {"type": "text", "content": "Loading skill."}, + {"type": "tool_call", "id": "call_9", "name": "skill", "arguments": {"name": "source-recon"}}, + {"type": "text", "content": "Done!"}, + {"type": "done"} +] +``` + +#### 
`with_permission_multi.json` — Permission + allowed +3 turns: reads denied `.env` file (permission rejected), then reads allowed `README.md`. + +```json +[ + {"type": "text", "content": "Reading secret file."}, + {"type": "tool_call", "id": "call_1", "name": "read", "arguments": {"filePath": "secret.env"}}, + {"type": "text", "content": "Permission denied. Let me read allowed file."}, + {"type": "tool_call", "id": "call_2", "name": "read", "arguments": {"filePath": "README.md"}}, + {"type": "text", "content": "Done."}, + {"type": "done"} +] +``` + +#### `with_apply_patch.json` — Patch application +2–3 turns: writes a file, then applies a patch to it. + +```json +[ + {"type": "text", "content": "Writing base file."}, + {"type": "tool_call", "id": "call_1", "name": "write", "arguments": {"filePath": "tmp/patch-target.txt", "content": "line1\nline2\nline3\n"}}, + {"type": "text", "content": "Applying patch."}, + {"type": "tool_call", "id": "call_2", "name": "apply_patch", "arguments": {"patchText": "*** Begin Patch\n*** Update File: tmp/patch-target.txt\n--- a/tmp/patch-target.txt\n+++ b/tmp/patch-target.txt\n@@ -1,3 +1,3 @@\n line1\n-line2\n+line2_modified\n line3\n*** End Patch"}}, + {"type": "text", "content": "Done."}, + {"type": "done"} +] +``` + +### 9.4 Tools NOT Covered (Recordings Missing) + +| Tool | Coverage | Note | +|---|---|---| +| `task` | ❌ | Creates child session; requires subagent request detection. **TODO: add separate test once recordings available.** | + +### 9.5 Mock Server Changes + +1. **Fix `_build_chunks`:** Parse script into turns (group consecutive `tool_call`s under preceding `text`). +2. **Fix `do_POST`:** Count `role: "tool"` messages in request to determine turn index. +3. **Emit multi-tool chunks:** Tool calls in same turn use `index: 0, 1, 2...` in `choices[0].delta.tool_calls` array. + +### 9.6 Test Changes + +1. 
**Add scripts to `tests/test_mock_llm_parity.py`:** + - Replace small script list with `comprehensive.json`, `with_permission_multi.json`, `with_apply_patch.json`. +2. **Add unit test for multi-tool chunks:** + - Verify `test_chat_completions_streaming` returns `tool_calls` with correct `index` values. +3. **Estimated runtime:** 3 E2E tests × ~15s = ~45s total (vs. ~120s for 10 small scripts). + +### 9.7 Acceptance Criteria (Extension) + +- [ ] `mock_llm_server.py` supports multi-tool turns and stateless multi-turn dispatch. +- [ ] `comprehensive.json` covers `read`, `glob`, `grep`, `write`, `edit`, `bash`, `todowrite`, `skill`. +- [ ] `with_permission_multi.json` covers permission rejection + allowed tool in same session. +- [ ] `with_apply_patch.json` covers `write` + `apply_patch` stateful sequence. +- [ ] All 3 new scripts pass parity test (`opencode run` vs `opencode serve`). +- [ ] Unit tests verify multi-tool chunk indexing. +- [ ] Total test time < 60s for E2E parity suite. +- [ ] `task` tool documented as **TODO** with plan reference. diff --git a/.project/opencode-run-internals-report.md b/.project/opencode-run-internals-report.md new file mode 100644 index 0000000..08d6470 --- /dev/null +++ b/.project/opencode-run-internals-report.md @@ -0,0 +1,328 @@ +# OpenCode `run` Internals: Structured Research Report + +**Date:** 2026-05-16 +**Target:** OpenCode CLI v1.14.50 (installed via Homebrew) +**Source:** `github.com/anomalyco/opencode` (dev branch) +**Investigator:** Automated source-code analysis + +--- + +## 1. Executive Summary + +`opencode run` (non-interactive, non-attach mode) does **not** start a traditional HTTP server or bind to a port for its own use. Instead, it creates an **in-process** SDK client whose `fetch` implementation is monkey-patched to bypass the network entirely and call `Server.Default().app.fetch(request)` directly. 
The server code is still loaded and executed, but requests travel through an in-memory function call rather than TCP/HTTP. For `--attach` mode, the CLI uses standard HTTP SSE to a remote server. + +--- + +## 2. How `opencode run` Starts Its Server and Connects + +### 2.1 Non-Interactive Local Mode (default) +**File:** `packages/opencode/src/cli/cmd/run.ts` (lines 275-280) + +```typescript +const fetchFn = (async (input: RequestInfo | URL, init?: RequestInit) => { + const { Server } = await import("@/server/server") + const request = new Request(input, init) + return Server.Default().app.fetch(request) +}) as typeof globalThis.fetch + +const sdk = createOpencodeClient({ + baseUrl: "http://opencode.internal", // Fake URL; never hit over network + fetch: fetchFn, + directory, +}) +``` + +Key observations: +- `baseUrl` is a dummy (`http://opencode.internal`). It is only used for path resolution inside the SDK. +- The real transport is `fetchFn`, which imports the server module lazily and invokes its `app.fetch()` directly. +- The server module (`packages/opencode/src/server/server.ts`) creates a Bun/Node HTTP server when `listen()` is called, but in this local mode **no port is opened** for the CLI's own consumption. +- The `InstanceRef` Effect service is provided (because `instance: (args) => !args.attach`), which boots the full opencode runtime (DB, bus, agents, etc.) inside the same process. + +### 2.2 Interactive Local Mode (`--interactive` without `--attach`) +**File:** `packages/opencode/src/cli/cmd/run.ts` (lines 248-264) + +Same in-process fetch pattern, but calls `runInteractiveLocalMode()` instead of `execute()`. The TUI (Ink-based React renderer) renders in the same terminal, still talking to the in-process server via the fake-fetch bridge. 
+ +### 2.3 Attach Mode (`--attach`) +**File:** `packages/opencode/src/cli/cmd/run.ts` (lines 186-193) + +```typescript +const attachSDK = (dir?: string) => { + return createOpencodeClient({ + baseUrl: args.attach!, // e.g. http://localhost:4096 + directory: dir, + headers: attachHeaders, // Basic auth if provided + }) +} +``` + +- Standard HTTP transport (real `fetch` via the SDK). +- No local instance is booted (`instance: (args) => !args.attach`). + +--- + +## 3. SSE Event Consumption and ND-JSON Mapping + +### 3.1 Server-Side Event Production +**File:** `packages/opencode/src/server/routes/instance/httpapi/handlers/event.ts` + +```typescript +function eventData(data: unknown): Sse.Event { + return { + _tag: "Event", + event: "message", + id: undefined, // NO event ID is emitted + data: JSON.stringify(data), + } +} +``` + +The server: +- Subscribes to the global event bus (`bus.subscribeAll()`). +- Merges a heartbeat stream (every 10 seconds, type `server.heartbeat`). +- Sends an initial `server.connected` event. +- Encodes everything with `effect/unstable/encoding/Sse.encode()`. +- Response headers: + - `Content-Type: text/event-stream` + - `Cache-Control: no-cache, no-transform` + - `X-Accel-Buffering: no` + - `X-Content-Type-Options: nosniff` + +### 3.2 Client-Side SSE Parsing +**File:** `packages/sdk/js/src/v2/gen/core/serverSentEvents.gen.ts` + +The SDK's generated SSE client: +- Uses the standard Web Streams API (`response.body.pipeThrough(new TextDecoderStream()).getReader()`). +- Parses SSE fields (`data:`, `event:`, `id:`, `retry:`) manually. +- Joins multi-line `data:` fields with `\n`. +- Parses JSON from the joined data lines. +- Yields the parsed JSON object as the async generator's value. +- Supports **retry with exponential backoff**: + - Default retry delay: 3000 ms + - Max retry delay: 30000 ms + - Retries on network/parse errors until `sseMaxRetryAttempts` is reached. 
+- **Event ID tracking:** reads `id:` lines into `lastEventId` and sends it back as `Last-Event-ID` header on reconnect. + +### 3.3 ND-JSON Translation on Stdout +**File:** `packages/opencode/src/cli/cmd/run.ts` (lines inside `execute()` function) + +The CLI's event loop (`loop()` function) consumes `events.stream` and maps events to stdout: + +| Server SSE Event | Condition | stdout output (default) | stdout output (`--format json`) | +|---|---|---|---| +| `message.updated` | assistant role, first message | Prints header: `> agent · modelID` | `{"type":"message.updated",...}` | +| `message.part.updated` | `part.type === "text"` && `part.time?.end` | Prints plain text + newlines | `{"type":"text", "part":...}` | +| `message.part.updated` | `part.type === "reasoning"` && `part.time?.end` | Prints dim italic "Thinking: ..." | `{"type":"reasoning",...}` | +| `message.part.updated` | `part.type === "tool"` && status `running` | Inline icon + title | `{"type":"tool_use",...}` | +| `message.part.updated` | `part.type === "tool"` && status `completed` | Block with tool output | `{"type":"tool_use",...}` | +| `message.part.updated` | `part.type === "tool"` && status `error` | Error icon + title + error | `{"type":"tool_use",...}` | +| `message.part.updated` | `part.type === "step-start"` | — | `{"type":"step_start",...}` | +| `message.part.updated` | `part.type === "step-finish"` | — | `{"type":"step_finish",...}` | +| `session.error` | matching session | `UI.error(err)` | `{"type":"error", "error":...}` | +| `session.status` | `status.type === "idle"` | **Breaks loop, exits** | — | +| `permission.asked` | matching session | Prints warning + auto-rejects | `{"type":"permission.asked",...}` | + +The `emit()` helper writes ND-JSON: + +```typescript +process.stdout.write( + JSON.stringify({ + type, + timestamp: Date.now(), + sessionID, + ...data, + }) + EOL, +) +``` + +--- + +## 4. 
Tool Permission Handling + +### 4.1 Non-Interactive Mode (Default) +**File:** `packages/opencode/src/cli/cmd/run.ts` (lines 393-413) + +```typescript +if (event.type === "permission.asked") { + const permission = event.properties + if (permission.sessionID !== sessionID) continue + + if (args["dangerously-skip-permissions"]) { + await client.permission.reply({ requestID: permission.id, reply: "once" }) + } else { + UI.println("... auto-rejecting") + await client.permission.reply({ requestID: permission.id, reply: "reject" }) + } +} +``` + +Default behavior: +- **All permissions are auto-rejected** in non-interactive mode. +- If `--dangerously-skip-permissions` is passed, they are **auto-approved once** (`reply: "once"`). + +### 4.2 Interactive Mode (TUI) +**Files:** +- `packages/opencode/src/cli/cmd/run/footer.permission.tsx` — React/Ink UI component +- `packages/opencode/src/cli/cmd/run/permission.shared.ts` — Pure state machine + +Permission state machine stages: +1. **permission** → Options: `Allow once` / `Allow always` / `Reject` +2. **always** → Confirmation step: `Confirm` / `Cancel` +3. **reject** → Optional text input for rejection message + +Replies sent to the server: +- `reply: "once"` — allow one time +- `reply: "always"` — allow for this pattern until restart +- `reply: "reject"` — deny (with optional message) + +### 4.3 Server-Side Permission API +**File:** `packages/opencode/src/server/routes/instance/httpapi/handlers/permission.ts` + +```typescript +const reply = Effect.fn("PermissionHttpApi.reply")(function* (ctx: { + params: { requestID: PermissionID } + payload: Permission.ReplyBody +}) { + yield* svc.reply({ + requestID: ctx.params.requestID, + reply: ctx.payload.reply, + message: ctx.payload.message, + }) + return true +}) +``` + +Endpoint: `POST /permission/{requestID}/reply` (v2 API). + +--- + +## 5. Tool State Reconstruction + +### 5.1 Event Source +Tool state arrives via `message.part.updated` events where `part.type === "tool"`. 
+ +**File:** `packages/opencode/src/cli/cmd/run.ts` (lines 355-378) + +```typescript +if (part.type === "tool" && (part.state.status === "completed" || part.state.status === "error")) { + if (emit("tool_use", { part })) continue + if (part.state.status === "completed") { + await tool(part) + continue + } + await toolError(part) + UI.error(part.state.error) +} +``` + +### 5.2 Tool Part Structure +The `ToolPart` type (from SDK v2) contains: +- `tool`: tool name (e.g., `bash`, `write`, `edit`) +- `state`: `{ status: "running" | "completed" | "error", error?: string, ... }` +- `input`: the tool arguments +- `output`: the tool result (when completed) +- `id`: part ID +- `sessionID`: session ID + +### 5.3 Display Formatting +**File:** `packages/opencode/src/cli/cmd/run/tool.ts` + +Tools have custom renderers registered via a rule system: +- `toolInlineInfo(part)` → returns `{ icon, title, description, mode: "inline" | "block", body? }` +- `toolScroll(phase, ctx)` → returns scrollback text for different phases (`start`, `progress`, `final`) +- `toolPermissionInfo(name, input, meta, patterns)` → returns UI lines for the permission dialog + +Example tools with custom formatters: `bash`, `write`, `edit`, `task`, `WebFetch`, `Read`, etc. + +--- + +## 6. Event ID / Cursor / Resume Mechanisms + +### 6.1 Does the server support `lastEventId`? +**Partially, but it is not used.** + +The server-side `eventData()` function explicitly sets `id: undefined`: + +```typescript +function eventData(data: unknown): Sse.Event { + return { + _tag: "Event", + event: "message", + id: undefined, // ← always undefined + data: JSON.stringify(data), + } +} +``` + +The SDK client **does** implement the full `lastEventId` protocol: +- Tracks `lastEventId` from SSE `id:` lines. +- Sends `Last-Event-ID` header on reconnect attempts. +- However, because the server never emits IDs, resubscription always starts from the latest real-time event; **there is no server-side replay of missed events**. 
+ +### 6.2 Is there a session-specific events endpoint? +**No.** + +- The global event endpoint is `/event` (SDK: `client.event.subscribe()`). +- There is also `/global/event` (SDK: `client.event.event()`). +- There is **no** `/session/{id}/events` endpoint. +- Session filtering is done **client-side** inside the `loop()` function: + ```typescript + if (part.sessionID !== sessionID) continue + if (event.properties.sessionID !== sessionID) continue + ``` + +### 6.3 Session Resumption (`--continue` / `--session`) +- `--continue` looks up the last session via `sdk.session.list()` and picks the one without a `parentID`. +- `--session` fetches a specific session via `sdk.session.get({ sessionID })`. +- `--fork` creates a new forked session before continuing. +- Resumption does **not** replay historical events; it simply re-attaches to the existing session ID and starts consuming **new** events from the live bus. + +--- + +## 7. Endpoint Reference + +| Endpoint | SDK Method | Purpose | +|---|---|---| +| `GET /event` | `client.event.subscribe()` | Subscribe to global SSE stream | +| `GET /global/event` | `client.event.event()` | Subscribe to global SSE stream (legacy/alternate) | +| `POST /session` | `client.session.create()` | Create new session | +| `POST /session/{id}/prompt` | `client.session.prompt()` | Send a user message | +| `POST /session/{id}/command` | `client.session.command()` | Execute a slash command | +| `POST /permission/{requestID}/reply` | `client.permission.reply()` | Reply to a permission request | +| `GET /session/{id}` | `client.session.get()` | Get session metadata | +| `POST /session/{id}/fork` | `client.session.fork()` | Fork a session | + +--- + +## 8. Binary Analysis Notes + +- The Homebrew binary (`/opt/homebrew/bin/opencode`) is a **Mach-O 64-bit ARM64 executable** (101.6 MB). +- It is a **Bun-compiled single-file executable** with all JavaScript/TypeScript source bundled inside. +- No separate Node modules or source tree is exposed in the Cellar.
+- String analysis confirms embedded module paths like `packages/opencode/src/...` and `packages/sdk/js/src/...`. +- The binary contains the full Effect framework, SQLite (via `better-sqlite3` or Bun's built-in), and React/Ink TUI runtime. + +--- + +## 9. Security Observations + +1. **In-process fetch bypass:** The local mode never validates TLS, auth tokens, or same-origin policies because requests never leave the process. This is by design, but it means `Server.Default().app.fetch()` is a direct attack surface if an attacker can inject code into the CLI process. +2. **Permission auto-rejection:** Non-interactive mode auto-rejects all tool permissions, which is a safe default. The `--dangerously-skip-permissions` flag is required to weaken this. +3. **No event replay:** Because SSE events have no IDs and there is no session-specific replay endpoint, a client that disconnects and reconnects will miss events that occurred during the disconnection. This is a reliability limitation, not a direct security issue. +4. **Basic auth in attach mode:** The `--attach` mode supports `--username` / `--password` (or env vars `OPENCODE_SERVER_USERNAME` / `OPENCODE_SERVER_PASSWORD`). Credentials are sent via Basic Auth headers. + +--- + +## 10. 
Files Referenced + +- `/opt/homebrew/bin/opencode` (compiled Bun binary) +- `packages/opencode/src/cli/cmd/run.ts` +- `packages/opencode/src/cli/cmd/run/permission.shared.ts` +- `packages/opencode/src/cli/cmd/run/tool.ts` +- `packages/opencode/src/server/server.ts` +- `packages/opencode/src/server/routes/instance/httpapi/handlers/event.ts` +- `packages/opencode/src/server/routes/instance/httpapi/handlers/permission.ts` +- `packages/sdk/js/src/v2/gen/core/serverSentEvents.gen.ts` +- `packages/sdk/js/src/v2/gen/sdk.gen.ts` +- `packages/sdk/js/src/v2/client.ts` + diff --git a/.project/opencode-run-vs-serve.md b/.project/opencode-run-vs-serve.md new file mode 100644 index 0000000..f8292eb --- /dev/null +++ b/.project/opencode-run-vs-serve.md @@ -0,0 +1,189 @@ +# Using `serve` to get the same data as `run --format json` over HTTP + +## Architecture in 30 seconds + +opencode has a single internal **Bus** (`packages/opencode/src/bus/index.ts`) that every component publishes events to: the model loop, tool executions, sessions, permissions, MCP servers, LSP, PTY, file changes, etc. + +- **`run --format json`** subscribes to that bus, filters a small subset of events, flattens them, and writes them to stdout as NDJSON. +- **`serve`** exposes the bus more or less raw over an HTTP server-sent events stream. + +So you are not switching to a different data source — you are consuming the same firehose, just less filtered and over HTTP. + +## What `run --format json` actually emits + +From `packages/opencode/src/cli/cmd/run.ts:592-700`, the emit envelope is: + +```json +{ + "type": "tool_use | step_start | step_finish | text | reasoning | error", + "timestamp": 1737060000000, + "sessionID": "ses_...", + "part": { ... } // for non-error types + "error": { ... } // for "error" type +} +``` + +It is produced from only two bus event types: + +- `message.part.updated` — emitted as `text` (when `part.type === "text"` *and* `part.time.end` is set, i.e. 
the text is finalized), `reasoning` (same condition, `part.type === "reasoning"`), `tool_use` (when `part.type === "tool"` and `state.status` is `completed` or `error`), `step_start`, `step_finish`. +- `session.error` — emitted as `error`. + +It uses two more bus events but does *not* re-emit them to stdout: + +- `session.status` with `status.type === "idle"` for that session → loop exits, process returns. +- `permission.asked` → auto-replies via the SDK (`once` if `--dangerously-skip-permissions`, otherwise `reject`). + +This matters because if you replicate the behavior over HTTP, you have to do the same gating yourself. + +## The HTTP equivalent: `GET /event` (SSE) + +File: `packages/opencode/src/server/routes/instance/httpapi/handlers/event.ts` + +Characteristics: + +- `Content-Type: text/event-stream`, `Cache-Control: no-cache, no-transform`, `X-Accel-Buffering: no` +- Standard SSE framing: one event per `data: <json>\n\n` block. +- First event you will always receive: `{ "id": "...", "type": "server.connected", "properties": {} }`. +- Every 10 seconds you receive `{ "type": "server.heartbeat", "properties": {} }` — drop it. +- Stream ends when the instance is disposed (`Bus.InstanceDisposed`). +- Each business event has the shape `{ "id": "...", "type": "...", "properties": { ... } }`. The `properties` payload is the same object the bus carries internally and the same object `run` peeks at as `event.properties` / `event.properties.part`.
+ +Notable event types you will care about (all named exactly as they appear over the wire): + +| Type | When | Notable fields | +|---|---|---| +| `message.updated` | A whole assistant/user message changed | `properties.info` (role, agent, modelID, …), `properties.sessionID` | +| `message.part.updated` | A streamed part of a message changed | `properties.part` (with `type`: `text`/`reasoning`/`tool`/`step-start`/`step-finish`, `state`, `time`, `sessionID`) | +| `session.status` | Session changed state | `properties.sessionID`, `properties.status.type` (e.g. `idle`, `running`) | +| `session.error` | An error occurred during a turn | `properties.sessionID`, `properties.error` | +| `permission.asked` | A tool wants permission | `properties.sessionID`, `properties.id` (requestID), `properties.permission`, `properties.patterns` | +| `server.connected` / `server.heartbeat` / `server.disconnected` | Stream lifecycle | — | + +## Recommended workflow + +1. **Start the server** + ``` + OPENCODE_SERVER_PASSWORD=changeme \ + opencode serve --port 8080 --hostname 127.0.0.1 + ``` + +2. **Open the SSE stream first.** Do this before sending the prompt, otherwise you lose early events (the session is created and the model starts streaming immediately). + ``` + GET /event + Authorization: Basic base64("opencode:changeme") + x-opencode-directory: /absolute/path/to/your/project + ``` + Buffer events as they arrive and dispatch them in your main loop. + +3. **Create or pick a session.** + ``` + POST /session + ``` + Or list existing ones with `GET /session` and reuse one. Keep `sessionID`. + +4. **Send the prompt.** Two flavors (from `packages/opencode/src/server/routes/instance/httpapi/groups/session.ts:73-95`): + - `POST /session/:sessionID/message` — synchronous (HTTP response returns when the turn ends; you still get the live stream over `/event` in parallel). + - `POST /session/:sessionID/prompt_async` — returns immediately. 
For an "event-driven over SSE" architecture this is usually what you want; it removes the need to keep an HTTP request open while a long turn is running. + +5. **Drive your state machine off the SSE stream.** Recommended end-of-turn condition (this is exactly what `run` does, run.ts:702-708): + ``` + event.type === "session.status" + && event.properties.sessionID === <sessionID> + && event.properties.status.type === "idle" + ``` + Both `session.status { status.type: "idle" }` and the deprecated + `session.idle` event signal completion. Prefer the `session.status` + form because `session.idle` may be removed in future versions. + +6. **Handle permissions if they happen.** On `permission.asked`, reply via: + ``` + POST /session/:sessionID/permissions/:permissionID + { "response": "once" | "always" | "reject" } + ``` + If you do not reply, the turn stays blocked. `run` auto-rejects (or auto-approves with `--dangerously-skip-permissions`); decide your equivalent policy. + +7. **Abort if you need to**: `POST /session/:sessionID/abort`. + +## Mapping `run`'s JSON envelope to API events + +If your downstream consumer is already wired to the `run --format json` shape, you can keep it stable by transforming SSE events on your side. The rules: + +| run NDJSON type | Trigger condition on the API event | +|---|---| +| `text` | `message.part.updated` AND `part.type === "text"` AND `part.time?.end` truthy | +| `reasoning` | `message.part.updated` AND `part.type === "reasoning"` AND `part.time?.end` truthy | +| `tool_use` | `message.part.updated` AND `part.type === "tool"` AND `state.status` in (`completed`, `error`) | +| `step_start` | `message.part.updated` AND `part.type === "step-start"` | +| `step_finish` | `message.part.updated` AND `part.type === "step-finish"` | +| `error` | `session.error` | + +Always filter by `part.sessionID === <sessionID>` (or `properties.sessionID` for `session.*`), because `/event` is global — you get events for *every* session in the instance.
+ +## Authentication + +File: `packages/opencode/src/server/auth.ts` + +- Set `OPENCODE_SERVER_PASSWORD` and (optionally) `OPENCODE_SERVER_USERNAME` (default `opencode`). +- Send `Authorization: Basic <base64(username:password)>` on every request, including the SSE stream. +- Alternative for environments where you cannot set a header (e.g. an `EventSource` in a browser): use the `?auth_token=<token>` query parameter. +- If no password is set, the server still starts but logs a warning and listens unauthenticated — fine for local dev, not for anything else. + +## Multi-instance / workspace routing + +The server can serve multiple project directories simultaneously. Every request — *including* the SSE subscription — should carry: + +``` +x-opencode-directory: /absolute/path/to/the/project +``` + +Some routes accept it as a query parameter instead (`?directory=...`). Without it the server uses the default instance, which may or may not be the one you want. If your events look "empty" or unrelated to the work you triggered, this header is the usual culprit. + +## Things that will bite you + +- **Lost early events.** Open `/event` before issuing any `POST /session/...` call. SSE is not replay-able. +- **Heartbeats and connection events.** Do not forward `server.connected`, `server.heartbeat`, `server.disconnected` to your downstream consumer, but still use `server.heartbeat` as a liveness signal: if it stops, your connection is dead — reconnect. +- **Reconnect logic.** SSE is a long-lived TCP connection. Plan for transient drops: reconnect, but be aware that you might miss events emitted during the gap. There is no event-id replay protocol on this endpoint. +- **Idle detection is per-session.** `session.status { idle }` for *another* session is not your signal. Filter strictly on `properties.sessionID`. +- **`text` and `reasoning` parts stream incrementally.** Many `message.part.updated` events for the same `part.id` will arrive before `part.time.end` is set.
If you want the streaming experience, read `part.text` on every update (it is the accumulated text so far). If you want the final flush only — like `run --format json` — wait for `part.time.end`. +- **Tools have a lifecycle.** A `tool` part goes through `pending → running → completed | error`. `run` only emits `tool_use` at the terminal state. If you want progress, watch `state.status === "running"` too. +- **The API surface is much wider.** `/event` carries PTY output, file changes, LSP diagnostics, MCP server lifecycle, sync status, etc. Most of it is irrelevant for a "consume a prompt result" use case — filter aggressively by `type` to avoid drowning your consumer. +- **There is an official SDK.** `packages/sdk` is the TypeScript client (`sdk.event.subscribe()` is literally what `run` uses). If you are writing the consumer in Node/TS, use that — you get types for every event and every payload for free. For other languages, treat the SDK source as the canonical type reference and consume `/event` directly. + +## Minimal pseudocode (Node-ish) + +```ts +const auth = "Basic " + Buffer.from("opencode:changeme").toString("base64") +const dir = "/Users/pruiz/Develop/other/opencode" + +// 1. open SSE first +const sse = await fetch("http://127.0.0.1:8080/event", { + headers: { Authorization: auth, "x-opencode-directory": dir }, +}) + +// 2. create session +const sess = await fetch("http://127.0.0.1:8080/session", { + method: "POST", + headers: { Authorization: auth, "x-opencode-directory": dir, + "Content-Type": "application/json" }, + body: "{}", +}).then(r => r.json()) + +// 3. fire prompt asynchronously +await fetch(`http://127.0.0.1:8080/session/${sess.id}/prompt_async`, { + method: "POST", + headers: { Authorization: auth, "x-opencode-directory": dir, + "Content-Type": "application/json" }, + body: JSON.stringify({ /* PromptPayload: parts, modelID, providerID, ... */ }), +}) + +// 4. 
consume SSE, translate to run-style envelope, exit on idle +for await (const evt of parseSSE(sse.body)) { + if (evt.type === "server.heartbeat") continue + if (evt.type === "session.status" + && evt.properties.sessionID === sess.id + && evt.properties.status.type === "idle") break + // ...translate message.part.updated / session.error here... +} +``` + +That is genuinely all there is to it — the bulk of the integration is just deciding which bus events you care about and how strictly you want to mirror the `run --format json` envelope. diff --git a/.project/sync-recovery-plan.md b/.project/sync-recovery-plan.md new file mode 100644 index 0000000..55a3dcd --- /dev/null +++ b/.project/sync-recovery-plan.md @@ -0,0 +1,182 @@ +# SSE Sync Recovery Plan + +**Date:** 2026-05-20 +**Status:** Planned +**Branch:** `migrate-to-opencode-serve-api` + +## Problem Statement + +The current `_sync_session_messages()` implementation causes duplicate events in CI parity tests. The sync is triggered too frequently (every 0.5s via `server.heartbeat` and `session.diff`), and the deduplication has bugs that allow duplicates through. + +**Observed CI failure:** +``` +--- opencode-run ++++ opencode-serve ++{"part": {"snapshot": "a249ec52d7915bc7c077ce0408a80e53fd36186f", "type": "step-start"}, "type": "step_start"} ++{"part": {"text": "Done reading.", "type": "text"}, "type": "text"} +``` +Two events appear in serve path but not in run path, both with same snapshot hash. 
+ +## Research Findings + +### SSE Architecture (opencode source) +- SSE uses `bus.subscribeAll()` - all bus events go to all SSE clients +- `message.part.delta` → **direct bus publish** (fire-and-forget) +- `message.part.updated` → **sync layer** (database write + bus publish) +- This means deltas are "easier" to miss than updates during stream interruptions + +### SSE Reconnect Behavior (`packages/sdk/js/src/v2/gen/core/serverSentEvents.gen.ts`) +- `sseMaxRetryAttempts` defaults to `undefined` (infinite retries with 3s→30s backoff) +- When max retries exceeded: async generator **breaks silently** without throwing +- `onSseError` callback fires on each failure (including final), but loop still breaks +- After break: `Stream.runForEach` finishes, `ensuring` block fires `fail("global event stream closed")` +- `SseClient` in our code raises `SseClientError` when the stream iterator ends + +### Part ID Generation (`packages/core/src/util/identifier.ts`) +- Part IDs are globally unique: `prt` prefix + 6 bytes timestamp/counter + 14 random base62 chars +- **NOT a UUID** - timestamp-based monotonic + random suffix +- Part IDs are the primary key in `PartTable` (database deduplicates on `id` alone) +- **`(type, part_id)` is sufficient for deduplication** - no need for snapshot hash + +### Sync Triggering (current code - `tools/events/__init__.py:243`) +- **Immediate**: `session.idle`, `session.updated`, `todo.updated`, `session.status.type=="idle"` +- **Throttled (0.5s)**: `session.status`, `session.diff`, `server.heartbeat` +- `server.heartbeat` fires every 10s → triggers sync 6 times/minute +- `session.diff` fires on every model output delta → very frequent during active streaming + +### Deduplication Bugs +1. **StateTracker line 309**: When `has_seen()` returns True, code `continue`s but does NOT mark the part as seen. So the same part gets checked and skipped on every subsequent sync. +2.
**No EventLoop-level fingerprint**: Synthesized events from sync can duplicate events already emitted via SSE. + +## Plan + +### Approach 1: Sync Only On Reconnect (Not Periodically) + +**Principle:** Only sync when we have an actual disconnect/reconnect scenario, not on a timer. + +**Implementation:** +1. Add a `reconnect_callback` slot to `SseClient` that fires after successful reconnection +2. `EventLoop` registers this callback and uses it to trigger a one-time recovery sync +3. Remove `server.heartbeat`, `session.diff`, `session.status` from `_should_sync_session_messages()` triggers +4. Keep `session.idle` sync for end-of-session final catchup +5. When reconnect callback fires: set a flag that makes next `_should_sync_session_messages()` return True, then clear the flag after one sync + +**Why this works:** +- If SSE was reliable (no disconnect), no unnecessary syncs +- If SSE had a brief interruption, reconnect triggers a recovery sync to catch missed events +- End-of-session (`session.idle`) catches any final events SSE might have missed + +### Approach 2: Fix Deduplication at EventLoop Level + +**Principle:** Even if sync produces duplicates, only one gets emitted. + +**Implementation:** +1. **Fix StateTracker bug at line 309**: When skipping a part that's already seen, still call `mark_seen()` so future checks work correctly +2. **Add `_emitted_part_signatures` set in EventLoop**: Track `(part_id)` for every finalized event emitted +3. **Check before emit**: In `EventLoop.run()`, before emitting a finalized event (from `_tracker.ingest()` or sync), check if its `part.id` is in `_emitted_part_signatures` +4. 
**Key by `(event_type, part_id)`**: Even though `part_id` is globally unique, we key by `(event_type, part_id)` to be safe against any edge cases where the same ID could appear in different event types + +**Why this works:** +- Part ID alone is the deduplication key (per opencode DB schema) +- `(event_type, part_id)` is a belt-and-suspenders approach +- Prevents duplicates from either SSE or sync path + +### Key: Minimal Sync Surface + +After implementing Approaches 1 and 2, the sync should only trigger in two scenarios: +1. **On reconnect** - after SSE stream was interrupted and re-established +2. **On session.idle** - at end of session as a safety net + +Both are low-frequency (reconnect should be rare; session.idle is once per session). + +## Implementation Steps + +### Step 1: Fix StateTracker Bug +**File:** `tools/events/state_tracker.py` +**Change:** At line 309, when skipping a part that's already seen, add to `_seen_part_ids`: +```python +if isinstance(part_id, str) and self._tracker.has_seen(part_id): + # Already processed this part - mark as seen to avoid re-check + self._tracker.mark_seen(part_id) # NEW + continue +``` + +Or add a `mark_seen()` method to StateTracker if it doesn't exist.
+ +### Step 2: Add Reconnect Callback to SseClient +**File:** `tools/events/sse_client.py` +**Changes:** +- Add `on_reconnect: Callable[[], None] | None = None` parameter to `__init__` +- In `_open_stream()`, after successful reconnect (when we resume reading events after a retry), call `self.on_reconnect()` if set + +### Step 3: Add Recovery Sync Flag to EventLoop +**File:** `tools/events/__init__.py` +**Changes:** +- Add `_pending_recovery_sync: bool = False` instance variable +- Add `trigger_recovery_sync()` method that sets `_pending_recovery_sync = True` +- Register this callback in `SseClient` constructor: `self._client = SseClient(..., on_reconnect=self.trigger_recovery_sync)` + +### Step 4: Update Sync Trigger Logic +**File:** `tools/events/__init__.py` +**Changes:** +- Remove `server.heartbeat` and `session.diff` from throttled sync triggers (line 252) +- In `_should_sync_session_messages()`, check `_pending_recovery_sync` flag and return True if set +- Clear `_pending_recovery_sync = False` after one sync completes + +### Step 5: Add EventLoop-Level Dedup +**File:** `tools/events/__init__.py` +**Changes:** +- Add `_emitted_part_signatures: set[tuple[str, str]]` instance variable +- Before emitting a finalized event (after `_tracker.ingest()` returns), compute signature `(event_type, part.get("id", ""))` +- If signature already in set, skip emit +- If not, add to set and emit + +## Architecture After Changes + +``` +SseClient.events() + │ + │─ on reconnect success ─→ EventLoop.trigger_recovery_sync() + │ └─ sets _pending_recovery_sync = True + │ + └─ yield events ─→ EventLoop.run() + │ + ├─ _tracker.ingest(event) + │ └─ StateTracker: dedup via _seen_part_ids (FIXED) + │ + ├─ _should_sync_session_messages() + │ └─ returns True only if: + │ - _pending_recovery_sync (reconnect sync) + │ - session.idle (end-of-session sync) + │ + ├─ _sync_session_messages() → synthesize + │ └─ ingest(synthesized) → dedup + emit + │ + └─ emit to render_fn() + │ + └─ check 
fingerprint (EventLoop level dedup) +``` + +## Testing Strategy + +1. **Run parity tests 5 times locally** to confirm baseline passes +2. **Add a test that simulates SSE disconnect/reconnect** to verify recovery sync works +3. **Add a test for idempotency** - sync called twice with same events should not duplicate +4. **Monitor CI** for the 2 failing parity tests (script1 and script3) + +## Open Questions / Follow-ups + +1. **Long-term**: Investigate if SSE unreliability is a bug in opencode or expected behavior. If deltas always arrive reliably, we might not need sync at all. +2. **Metrics**: Consider adding a metric for "times sync ran" and "events emitted from sync" to understand sync frequency in production. +3. **Timeout**: What happens if reconnect never succeeds? With `max_reconnects=10` and 3s base backoff, that's ~60s total before giving up. Is that enough? + +## Files to Modify + +- `tools/events/state_tracker.py` - fix line 309 bug +- `tools/events/sse_client.py` - add on_reconnect callback +- `tools/events/__init__.py` - recovery sync flag, trigger, dedup fingerprint set + +## Related Files (Read-Only) + +- `tools/mock-llm-parity.py` - for understanding sync context +- `tools/events/emitters.py` - for understanding emit path \ No newline at end of file diff --git a/Makefile b/Makefile index 2278c32..1fb3f3d 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # Copyright (C) 2025-2026 Pablo Ruiz García # SPDX-License-Identifier: GPL-3.0-or-later OR AGPL-3.0-or-later -.PHONY: help venv venv-check check status next-id frontmatter tests itemdb-reset index report +.PHONY: help venv venv-check check status next-id frontmatter tests test-parity itemdb-reset index report .PHONY: findings findings-create findings-move findings-evidence findings-package .PHONY: phase-1 phase-2 phase-3 phase-4 phase-5 phase-6 validate-all exploit-all .PHONY: sandbox-setup sandbox-check sandbox-up sandbox-down sandbox-shell sandbox-logs sandbox-clean sandbox-reset sandbox-build 
sandbox-test @@ -12,6 +12,9 @@ export PATH := $(CURDIR)/.venv/bin:$(PATH) export PROMPT_EXTRA export PROMPT_EXTRA_FILE +# Pass --thinking to raw opencode run when CODECOME_THINKING=1 +OPENCODE_THINKING_FLAG := $(if $(filter 1,$(CODECOME_THINKING)),--thinking,) + ifndef NO_COLOR RED := \033[31m YELLOW := \033[33m @@ -52,8 +55,8 @@ help: @printf " $(BOLD)$(CYAN)Wrapper controls:$(RESET)\n" @printf "\n" @printf " $(BOLD)CODECOME_USE_WRAPPER=0$(RESET) Bypass styled wrapper and use raw opencode run\n" - @printf " $(BOLD)CODECOME_THINKING=1$(RESET) Enable --thinking in wrapper-driven phase runs\n" - @printf " $(BOLD)OPENCODE_ARGS='...'$(RESET) Extra flags passed through to opencode run\n" + @printf " $(BOLD)CODECOME_THINKING=1$(RESET) Show model reasoning/thinking blocks in output\n" + @printf " $(BOLD)OPENCODE_ARGS='...'$(RESET) Extra flags for opencode run (forwarded directly when CODECOME_USE_WRAPPER=0; in wrapper mode only --model, --variant and --thinking are used)\n" @printf " $(BOLD)CODECOME_MODEL=$(RESET) Pin the model per phase (e.g. anthropic/claude-opus-4-7)\n" @printf " $(BOLD)CODECOME_MODEL_VARIANT=$(RESET) Pin the model variant (e.g. 
high, max)\n" @printf " $(BOLD)PROMPT_EXTRA=\"...\"$(RESET) Append extra instructions to phase prompt\n" @@ -132,7 +135,7 @@ venv-check: phase-1: venv-check @$(PYTHON) tools/gate-check.py 1 @if [ "$$CODECOME_USE_WRAPPER" = "0" ]; then \ - opencode run --agent recon "$$(cat prompts/phase-1-recon.md)"; \ + opencode run --agent recon $(OPENCODE_THINKING_FLAG) "$$(cat prompts/phase-1-recon.md)"; \ else \ $(PYTHON) tools/run-agent.py --phase 1 --label "Target Reconnaissance + Sandbox Bootstrap" --agent recon --prompt-file prompts/phase-1-recon.md; \ fi @@ -145,7 +148,7 @@ phase-2: venv-check printf "Or override (not recommended): CODECOME_ALLOW_NO_SANDBOX=1 make phase-2\n\n" ; \ exit 1 ) @if [ "$$CODECOME_USE_WRAPPER" = "0" ]; then \ - opencode run --agent auditor "$$(cat prompts/phase-2-audit.md)"; \ + opencode run --agent auditor $(OPENCODE_THINKING_FLAG) "$$(cat prompts/phase-2-audit.md)"; \ else \ $(PYTHON) tools/run-agent.py --phase 2 --label "Hypothesis Generation" --agent auditor --prompt-file prompts/phase-2-audit.md; \ fi @@ -153,7 +156,7 @@ phase-2: venv-check phase-3: venv-check @$(PYTHON) tools/gate-check.py 3 @if [ "$$CODECOME_USE_WRAPPER" = "0" ]; then \ - opencode run --agent reviewer "$$(cat prompts/phase-3-review.md)"; \ + opencode run --agent reviewer $(OPENCODE_THINKING_FLAG) "$$(cat prompts/phase-3-review.md)"; \ else \ $(PYTHON) tools/run-agent.py --phase 3 --label "Counter-analysis" --agent reviewer --prompt-file prompts/phase-3-review.md; \ fi @@ -162,7 +165,7 @@ phase-4: venv-check @test -n "$(FINDING)" || (printf "\n$(BOLD)$(RED)[FAIL]$(RESET) Missing required FINDING argument for Phase 4 (Validation).\n\nSpecify which finding you want to validate:\n\n $(BOLD)make phase-4 FINDING=CC-0001$(RESET)\n\nTo list available pending findings: $(BOLD)make findings STATUS=PENDING$(RESET)\n\n" && exit 1) @$(PYTHON) tools/gate-check.py 4 $(FINDING) @if [ "$$CODECOME_USE_WRAPPER" = "0" ]; then \ - opencode run --agent validator "$$(sed 
's#FINDING_PATH_OR_ID#$(FINDING)#g' prompts/phase-4-validate.md)"; \ + opencode run --agent validator $(OPENCODE_THINKING_FLAG) "$$(sed 's#FINDING_PATH_OR_ID#$(FINDING)#g' prompts/phase-4-validate.md)"; \ else \ $(PYTHON) tools/run-agent.py --phase 4 --label "Validation" --agent validator --prompt-file prompts/phase-4-validate.md --finding "$(FINDING)"; \ fi @@ -171,7 +174,7 @@ phase-5: venv-check @test -n "$(FINDING)" || (printf "\n$(BOLD)$(RED)[FAIL]$(RESET) Missing required FINDING argument for Phase 5 (Exploitation).\n\nSpecify which finding you want to exploit:\n\n $(BOLD)make phase-5 FINDING=CC-0001$(RESET)\n\nTo list available confirmed findings: $(BOLD)make findings STATUS=CONFIRMED$(RESET)\n\n" && exit 1) @$(PYTHON) tools/gate-check.py 5 $(FINDING) @if [ "$$CODECOME_USE_WRAPPER" = "0" ]; then \ - opencode run --agent exploiter "$$(sed 's#FINDING_PATH_OR_ID#$(FINDING)#g' prompts/phase-5-exploit.md)"; \ + opencode run --agent exploiter $(OPENCODE_THINKING_FLAG) "$$(sed 's#FINDING_PATH_OR_ID#$(FINDING)#g' prompts/phase-5-exploit.md)"; \ else \ $(PYTHON) tools/run-agent.py --phase 5 --label "Exploit Development" --agent exploiter --prompt-file prompts/phase-5-exploit.md --finding "$(FINDING)"; \ fi @@ -179,7 +182,7 @@ phase-5: venv-check phase-6: venv-check @$(PYTHON) tools/gate-check.py 6 @if [ "$$CODECOME_USE_WRAPPER" = "0" ]; then \ - opencode run --agent reporter "$$(cat prompts/phase-6-report.md)"; \ + opencode run --agent reporter $(OPENCODE_THINKING_FLAG) "$$(cat prompts/phase-6-report.md)"; \ else \ $(PYTHON) tools/run-agent.py --phase 6 --label "Reporting" --agent reporter --prompt-file prompts/phase-6-report.md; \ fi @@ -240,6 +243,9 @@ tests: venv-check $(PYTHON) -m pytest -q tests $(PYTHON) tools/check-frontmatter.py +test-parity: venv-check + $(PYTHON) -m pytest tests/test_mock_llm_parity.py -v + itemdb-reset: venv-check rm -f itemdb/notes/*.md rm -rf itemdb/evidence/CC-* diff --git a/README.md b/README.md index 2b82d50..36081c2 100644 --- 
a/README.md +++ b/README.md @@ -471,9 +471,9 @@ CodeCome ships reusable phase prompts under `prompts/`: ### Wrapper environment variables CODECOME_USE_WRAPPER=0 # bypass the styled wrapper - CODECOME_THINKING=1 # force --thinking on - CODECOME_THINKING=0 # force --thinking off - CODECOME_RENDER_REASONING=0 # suppress on-screen Thinking panels + CODECOME_THINKING=1 # show model reasoning/thinking blocks in output + CODECOME_THINKING=0 # hide model reasoning/thinking blocks + CODECOME_RENDER_REASONING=0 # suppress on-screen Thinking panels (independent override) CODECOME_REASONING_MAX_CHARS=4000 # truncate long reasoning blocks CODECOME_SANDBOX_RENDER=0 # disable structured Sandbox panel CODECOME_SANDBOX_VALIDATE_STDERR_LINES=20 @@ -482,11 +482,11 @@ CodeCome ships reusable phase prompts under `prompts/`: CODECOME_BOOTSTRAP_DRY_RUN=1 # force --dry-run on sandbox apply/regenerate CODECOME_BASH_SHIM_RENDER=0 # disable rtk/cat/head/tail/rg/ls/find/tree routing CODECOME_BASH_SHIM_LS_STRIP_LONG_FORMAT=0 - OPENCODE_ARGS='...' # extra flags forwarded to opencode run + OPENCODE_ARGS='...' # extra flags for opencode run (forwarded directly when CODECOME_USE_WRAPPER=0; in wrapper mode only --model, --variant and --thinking are used) CODECOME_MODEL= # pin model per phase, e.g. anthropic/claude-opus-4-7 CODECOME_MODEL_VARIANT= # pin model variant, e.g. high, max -### Model resolution and the `--thinking` flag +### Model resolution and thinking display The wrapper resolves the effective model in this order: @@ -496,15 +496,15 @@ The wrapper resolves the effective model in this order: 4. the model used in your most recent OpenCode session for this project (best-effort, read from OpenCode's local DB) 5. unknown -The chosen value is shown in the phase header banner along with its source. When the value comes from env or YAML, the wrapper appends `--model` / `--variant` to `opencode run` so the banner is the truth. 
Discovered defaults (the last-session lookup) are display-only and are not enforced. +The chosen value is shown in the phase header banner along with its source. -Per-provider `--thinking` defaults: +Per-provider thinking-display defaults: - `anthropic/*` → off. Claude already interleaves thinking with normal `text` blocks via OpenCode's interleaved-thinking beta header, so `Assistant` panels already show the model's working. -- `openai/*`, `xai/*`, `github-copilot/*`, `groq/*`, `cerebras/*`, `google/*`, `google-vertex/*` → on. These providers hide reasoning unless `--thinking` is passed; without it the wrapper would only see one or zero `text` events per phase. +- `openai/*`, `xai/*`, `github-copilot/*`, `groq/*`, `cerebras/*`, `google/*`, `google-vertex/*` → on. - Anything else (unknown / future provider) → on. Cheaper to over-surface than under-surface in vulnerability research. -Override precedence: `--thinking` already in `OPENCODE_ARGS` > `CODECOME_THINKING` env > per-provider default. Some providers bill reasoning tokens; set `CODECOME_THINKING=0` per phase to opt out without losing the styled wrapper. +Override precedence: `CODECOME_THINKING` env > per-provider default. `CODECOME_RENDER_REASONING=0` acts as an independent escape hatch that suppresses rendering even when thinking is enabled. Some providers bill reasoning tokens; set `CODECOME_THINKING=0` per phase to opt out without losing the styled wrapper. Print the full resolution table for any agent without launching a phase: diff --git a/docs/workflow.md b/docs/workflow.md index f6ed1ed..3edab5d 100644 --- a/docs/workflow.md +++ b/docs/workflow.md @@ -442,8 +442,8 @@ All `make` targets that depend on Python tooling expect a repo-local `.venv/`. I Wrapper controls: CODECOME_USE_WRAPPER=0 # bypass wrapper and use raw opencode run - CODECOME_THINKING=1 # enable thinking blocks for wrapper-driven phase runs - OPENCODE_ARGS='...' 
# extra flags forwarded to opencode run + CODECOME_THINKING=1 # show model reasoning/thinking blocks in output + OPENCODE_ARGS='...' # extra flags for opencode run (forwarded directly when CODECOME_USE_WRAPPER=0; in wrapper mode only --model, --variant and --thinking are used) CODECOME_MODEL= # pin the model per phase CODECOME_MODEL_VARIANT= # pin the model variant diff --git a/opencode.json b/opencode.json index 054d77a..a117cbd 100644 --- a/opencode.json +++ b/opencode.json @@ -16,5 +16,23 @@ "sandbox/.env": "allow", "*/sandbox/.env": "allow" } + }, + "provider": { + "test": { + "type": "openai", + "options": { + "baseURL": "http://127.0.0.1:61167/v1", + "apiKey": "sk-test" + }, + "models": { + "mockmodel": {} + } + } + }, + "agent": { + "test": { + "temperature": 0, + "top_p": 1 + } } } diff --git a/pytest.ini b/pytest.ini index b0a2a4d..93a8e15 100644 --- a/pytest.ini +++ b/pytest.ini @@ -4,3 +4,4 @@ markers = unit: fast pure logic tests component: filesystem and script behavior tests compat_matrix: model-compatibility fixture matrix tests + slow: heavy e2e tests (invoke real opencode CLI) diff --git a/tests/test_mock_llm_parity.py b/tests/test_mock_llm_parity.py new file mode 100644 index 0000000..2802047 --- /dev/null +++ b/tests/test_mock_llm_parity.py @@ -0,0 +1,232 @@ +from __future__ import annotations + +import json +import socket +import subprocess +import sys +import time +import urllib.request +from pathlib import Path + +import pytest + +from conftest import ROOT + + +def load_parity_module(): + import importlib.util + path = ROOT / "tools" / "mock-llm-parity.py" + spec = importlib.util.spec_from_file_location("mock_llm_parity", path) + if spec is None or spec.loader is None: + raise RuntimeError(f"Cannot load module from {path}") + mod = importlib.util.module_from_spec(spec) + sys.modules["mock_llm_parity"] = mod + spec.loader.exec_module(mod) + return mod + + +def _find_free_port(host: str = "127.0.0.1") -> int: + with socket.socket(socket.AF_INET, 
socket.SOCK_STREAM) as s: + s.bind((host, 0)) + s.listen(1) + return int(s.getsockname()[1]) + + +class TestMockLLMServer: + """Unit tests for the mock LLM server.""" + + @pytest.fixture(scope="class") + def server_proc(self): + script = ROOT / "tools" / "mock_llm_scripts" / "basic.json" + port = _find_free_port() + proc = subprocess.Popen( + [sys.executable, str(ROOT / "tools" / "mock-llm-server.py"), "--port", str(port), "--script", str(script)], + stdout=subprocess.DEVNULL, + stderr=subprocess.PIPE, + text=True, + ) + # Health-check + deadline = time.time() + 5.0 + while time.time() < deadline: + if proc.poll() is not None: + stderr = proc.stderr.read() if proc.stderr else "" + pytest.fail(f"Mock server exited early (code {proc.returncode}). stderr: {stderr}") + try: + req = urllib.request.Request(f"http://127.0.0.1:{port}/v1/models", method="GET") + with urllib.request.urlopen(req, timeout=1.0) as resp: + if resp.status == 200: + break + except Exception: + pass + time.sleep(0.2) + else: + proc.terminate() + pytest.fail("Mock server failed to start") + yield proc, port + proc.terminate() + try: + proc.wait(timeout=5.0) + except subprocess.TimeoutExpired: + proc.kill() + proc.wait() + + def test_models_endpoint(self, server_proc): + _, port = server_proc + req = urllib.request.Request(f"http://127.0.0.1:{port}/v1/models", method="GET") + with urllib.request.urlopen(req, timeout=2.0) as resp: + data = json.loads(resp.read().decode()) + assert data["object"] == "list" + assert any(m["id"] == "mockmodel" for m in data["data"]) + + def test_chat_completions_streaming(self, server_proc): + _, port = server_proc + body = json.dumps({ + "model": "mockmodel", + "messages": [{"role": "user", "content": "hi"}], + "stream": True, + }).encode() + req = urllib.request.Request( + f"http://127.0.0.1:{port}/v1/chat/completions", + data=body, + headers={"Content-Type": "application/json"}, + method="POST", + ) + with urllib.request.urlopen(req, timeout=5.0) as resp: + lines = 
resp.read().decode().splitlines() + chunks = [json.loads(line[6:]) for line in lines if line.startswith("data: ") and line != "data: [DONE]"] + assert any(c["choices"][0]["delta"].get("role") == "assistant" for c in chunks) + assert any(c["choices"][0]["delta"].get("content") == "Hello world!" for c in chunks) + + def test_multi_tool_chunks_have_increasing_index(self, server_proc): + """Verify that multiple tools in one turn get index 0, 1, 2...""" + # Build chunks directly without going through the server process. + script = [ + {"type": "text", "content": "Reading files."}, + {"type": "tool_call", "id": "call_1", "name": "read", "arguments": {"filePath": "README.md"}}, + {"type": "tool_call", "id": "call_2", "name": "read", "arguments": {"filePath": "AGENTS.md"}}, + {"type": "text", "content": "Done."}, + {"type": "done"}, + ] + import importlib.util + server_path = ROOT / "tools" / "mock-llm-server.py" + spec = importlib.util.spec_from_file_location("mock_llm_server", server_path) + if spec is None or spec.loader is None: + raise RuntimeError(f"Cannot load module from {server_path}") + mock_llm_server = importlib.util.module_from_spec(spec) + sys.modules["mock_llm_server"] = mock_llm_server + spec.loader.exec_module(mock_llm_server) + turns = mock_llm_server._parse_script_into_turns(script) + chunks = mock_llm_server._build_chunks(turns, 0) + parsed = [json.loads(c) for c in chunks] + tool_chunks = [c for c in parsed if "tool_calls" in c["choices"][0]["delta"]] + assert len(tool_chunks) == 2, f"Expected 2 tool chunks, got {len(tool_chunks)}" + assert tool_chunks[0]["choices"][0]["delta"]["tool_calls"][0]["index"] == 0 + assert tool_chunks[1]["choices"][0]["delta"]["tool_calls"][0]["index"] == 1 + + +class TestNormalizeEvent: + """Unit tests for event normalization logic.""" + + def test_normalize_strips_timestamps_and_ids(self): + mod = load_parity_module() + ev = { + "type": "text", + "timestamp": 12345, + "sessionID": "ses_abc", + "id": "evt_123", + "part": { 
+ "id": "prt_1", + "messageID": "msg_1", + "sessionID": "ses_abc", + "text": "hello", + "type": "text", + "time": {"start": 1, "end": 2}, + }, + } + out = mod.normalize_event(ev) + assert "timestamp" not in out + assert "sessionID" not in out + assert "id" not in out + assert "time" not in out["part"] + assert "id" not in out["part"] + assert "messageID" not in out["part"] + assert out["part"]["text"] == "hello" + + def test_normalize_filters_serve_only_types(self): + mod = load_parity_module() + for t in mod._SERVE_ONLY_TYPES: + assert mod.normalize_event({"type": t}) is None + + def test_normalize_truncates_tool_output(self): + mod = load_parity_module() + long_preview = "x" * 500 + long_output = "y" * 500 + ev = { + "type": "tool_use", + "part": { + "type": "tool", + "state": { + "metadata": {"preview": long_preview}, + "output": long_output, + }, + }, + } + out = mod.normalize_event(ev) + state = out["part"]["state"] + assert state["metadata"]["preview"].startswith(" io.StringIO: - events = [ - {"sessionID": session_id, "type": "step_finish", - "part": {"reason": "stop", "tokens": {}}}, - ] - return io.StringIO("".join(json.dumps(e) + "\n" for e in events)) - - popen_calls: list[list] = [] - - class FakePopen: - def __init__(self, cmd, *args, **kwargs): - popen_calls.append(list(cmd)) - self.returncode = 0 - self.pid = 9999 - self.stdin = io.StringIO() - self.stdout = _make_stream("ses_test_abc") - def poll(self): - return 0 - def wait(self): - return None - - monkeypatch.setattr(module.subprocess, "Popen", FakePopen) - monkeypatch.setattr(module.os, "killpg", lambda *a: None) - - # subprocess.run: frontmatter fails on first check, passes on second + monkeypatch.setattr(module, "check_opencode_version", lambda: None) + monkeypatch.setattr(module, "ROOT", tmp_path) + + # Reset the attempt counter so transcript numbering is deterministic. 
+ if hasattr(module._run_single_attempt, "_attempt_counter"): + delattr(module._run_single_attempt, "_attempt_counter") + + calls: list[tuple] = [] + + def fake_run_single_attempt(args, console, prompt, model, variant, thinking_on, base_url, auth_token, workspace_dir, existing_session_id=None): + calls.append((existing_session_id, prompt)) + # Both attempts succeed with the same session. + return ( + 0, + "ses_test_abc", + module.RunResult( + any_step_finish_seen=True, + step_finish_count=1, + last_finish_reason="stop", + last_finish_tokens={}, + last_permission_error=None, + ), + tmp_path / f"transcript-{len(calls)}.jsonl", + ) + + monkeypatch.setattr(module, "_run_single_attempt", fake_run_single_attempt) + frontmatter_call_count = [0] class FakeResult: @@ -1550,52 +1489,39 @@ def fake_run(cmd, *args, **kwargs): rc = module.main() assert rc == 0 - - # The while loop should have called Popen twice: initial run + frontmatter resume - assert len(popen_calls) == 2, f"expected 2 Popen calls, got {len(popen_calls)}" - resume_cmd = popen_calls[1] - assert "--attach" in resume_cmd - assert "http://127.0.0.1:7777" in resume_cmd - assert "--port" in resume_cmd - assert "4317" in resume_cmd - assert "--session" in resume_cmd - assert "ses_test_abc" in resume_cmd - assert "--format" in resume_cmd - assert "json" in resume_cmd - assert "Repair only the reported YAML/frontmatter issues with minimal changes." in resume_cmd[-1] - assert "Phase 1 completion checklist:" in resume_cmd[-1] - assert "Ensure itemdb/notes/sandbox-plan.md documents the Phase 1b outcome." in resume_cmd[-1] + assert len(calls) == 2, f"expected 2 attempts, got {len(calls)}" + # First attempt is a fresh session; second reuses the same session ID. + assert calls[0][0] is None + assert calls[1][0] == "ses_test_abc" + # The second prompt should be the frontmatter repair prompt. 
+ assert "Repair only the reported YAML/frontmatter issues" in calls[1][1] @pytest.mark.component def test_frontmatter_failure_without_session_id_exits_nonzero(monkeypatch, tmp_path): """Frontmatter validation failures must not be reported as success when the wrapper cannot determine a resumable session ID.""" - import io, json - - module = load_tool_module("run_agent_frontmatter_no_session", "tools/run-agent.py") + module = load_tool_module("run_agent_frontmatter_no_session_serve", "tools/run-agent.py") monkeypatch.setattr(module, "HAVE_RICH", False) + monkeypatch.setattr(module, "check_opencode_version", lambda: None) + monkeypatch.setattr(module, "ROOT", tmp_path) - events = [ - {"type": "step_finish", "part": {"reason": "stop", "tokens": {}}}, - ] - - popen_calls: list[list] = [] - - class FakePopen: - def __init__(self, cmd, *args, **kwargs): - popen_calls.append(list(cmd)) - self.returncode = 0 - self.pid = 9999 - self.stdin = io.StringIO() - self.stdout = io.StringIO("".join(json.dumps(e) + "\n" for e in events)) - def poll(self): - return 0 - def wait(self): - return None + if hasattr(module._run_single_attempt, "_attempt_counter"): + delattr(module._run_single_attempt, "_attempt_counter") + + def fake_run_single_attempt(args, console, prompt, model, variant, thinking_on, base_url, auth_token, workspace_dir, existing_session_id=None): + return ( + 0, + "", # empty session ID + module.RunResult( + any_step_finish_seen=True, + step_finish_count=1, + last_finish_reason="stop", + ), + tmp_path / "transcript.jsonl", + ) - monkeypatch.setattr(module.subprocess, "Popen", FakePopen) - monkeypatch.setattr(module.os, "killpg", lambda *a: None) + monkeypatch.setattr(module, "_run_single_attempt", fake_run_single_attempt) class FakeResult: def __init__(self, rc, out="", err=""): @@ -1606,8 +1532,6 @@ def fake_run(cmd, *args, **kwargs): return FakeResult(0, out="opencode 1.15.0\n") if any("check-frontmatter" in str(c) for c in cmd): return FakeResult(1, err="bad 
frontmatter") - if "db" in cmd: - return FakeResult(0, out="") return FakeResult(0) monkeypatch.setattr(module.subprocess, "run", fake_run) @@ -1621,45 +1545,38 @@ def fake_run(cmd, *args, **kwargs): rc = module.main() assert rc == 2 - assert len(popen_calls) == 1 @pytest.mark.component def test_iteration_limit_triggers_auto_resume(monkeypatch, tmp_path): """When the stream ends with a mid-turn finish reason (tool-calls) and graceful forgiveness does not apply, run-agent resumes once then exits.""" - import io, json - - module = load_tool_module("run_agent_iter_resume", "tools/run-agent.py") + module = load_tool_module("run_agent_iter_resume_serve", "tools/run-agent.py") monkeypatch.setattr(module, "HAVE_RICH", False) + monkeypatch.setattr(module, "check_opencode_version", lambda: None) + monkeypatch.setattr(module, "ROOT", tmp_path) monkeypatch.setenv("CODECOME_MAX_ITERATION_RETRIES", "1") - def _make_stream(session_id: str, reason: str) -> io.StringIO: - events = [ - {"sessionID": session_id, "type": "step_finish", - "part": {"reason": reason, "tokens": {}}}, - ] - return io.StringIO("".join(json.dumps(e) + "\n" for e in events)) - - popen_calls: list[list] = [] - - class FakePopen: - def __init__(self, cmd, *args, **kwargs): - popen_calls.append(list(cmd)) - self.returncode = 0 - self.pid = 9999 - self.stdin = io.StringIO() - # Always return tool-calls (iteration limit hit) so we test the - # retry path; after the retry the retry counter is exhausted and - # we exit with code 2. 
- self.stdout = _make_stream("ses_iter_xyz", "tool-calls") - def poll(self): - return 0 - def wait(self): - return None - - monkeypatch.setattr(module.subprocess, "Popen", FakePopen) - monkeypatch.setattr(module.os, "killpg", lambda *a: None) + if hasattr(module._run_single_attempt, "_attempt_counter"): + delattr(module._run_single_attempt, "_attempt_counter") + + calls: list[tuple] = [] + + def fake_run_single_attempt(args, console, prompt, model, variant, thinking_on, base_url, auth_token, workspace_dir, existing_session_id=None): + calls.append((existing_session_id, prompt)) + return ( + 0, + "ses_iter_xyz", + module.RunResult( + any_step_finish_seen=True, + step_finish_count=1, + last_finish_reason="tool-calls", + ), + tmp_path / f"transcript-{len(calls)}.jsonl", + ) + + monkeypatch.setattr(module, "_run_single_attempt", fake_run_single_attempt) + monkeypatch.setattr(module, "check_phase_graceful_completion", lambda *a, **kw: False) class FakeResult: def __init__(self, rc, out="", err=""): @@ -1668,6 +1585,8 @@ def __init__(self, rc, out="", err=""): def fake_run(cmd, *args, **kwargs): if "--version" in cmd: return FakeResult(0, out="opencode 1.15.0\n") + if any("check-frontmatter" in str(c) for c in cmd): + return FakeResult(0) return FakeResult(0) monkeypatch.setattr(module.subprocess, "run", fake_run) @@ -1682,17 +1601,13 @@ def fake_run(cmd, *args, **kwargs): rc = module.main() - # After 1 retry (2 total Popen calls) the retry budget is exhausted → exit 2 - assert len(popen_calls) == 2, f"expected 2 Popen calls, got {len(popen_calls)}" + # After 1 retry (2 total attempts) the retry budget is exhausted → exit 2 + assert len(calls) == 2, f"expected 2 attempts, got {len(calls)}" assert rc == 2 - resume_cmd = popen_calls[1] - assert "--session" in resume_cmd - assert "ses_iter_xyz" in resume_cmd - assert "Your previous response was cut off by the model/provider" in resume_cmd[-1] - assert "Observed finish reason: tool-calls." 
in resume_cmd[-1] - assert "Phase 4 completion checklist:" in resume_cmd[-1] - assert "Ensure validation evidence exists under itemdb/evidence/CC-9999/, including README.md." in resume_cmd[-1] + # Verify the retry reused the same session and included the resume prompt. + assert calls[1][0] == "ses_iter_xyz" + assert "Your previous response was cut off by the model/provider" in calls[1][1] # --------------------------------------------------------------------------- @@ -1810,39 +1725,30 @@ def test_check_phase_graceful_completion_mtime(monkeypatch, tmp_path): @pytest.mark.unit def test_stream_session_id_and_step_finish_count(monkeypatch, tmp_path): - """Verify that the main loop captures sessionID from the first event and - counts step_finish events accurately.""" - import io, json - - module = load_tool_module("run_agent_stream_tracking", "tools/run-agent.py") + """Verify that the main loop captures sessionID and step_finish count + from the RunResult returned by _run_single_attempt.""" + module = load_tool_module("run_agent_stream_tracking_serve", "tools/run-agent.py") monkeypatch.setattr(module, "HAVE_RICH", False) + monkeypatch.setattr(module, "check_opencode_version", lambda: None) + monkeypatch.setattr(module, "ROOT", tmp_path) - SESSION = "ses_stream_test_001" - - events = [ - {"sessionID": SESSION, "type": "step_start", "part": {}}, - {"sessionID": SESSION, "type": "step_finish", "part": {"reason": "tool-calls", "tokens": {}}}, - {"sessionID": SESSION, "type": "step_start", "part": {}}, - {"sessionID": SESSION, "type": "step_finish", "part": {"reason": "tool-calls", "tokens": {}}}, - {"sessionID": SESSION, "type": "step_start", "part": {}}, - {"sessionID": SESSION, "type": "step_finish", "part": {"reason": "stop", "tokens": {}}}, - ] - stream = io.StringIO("".join(json.dumps(e) + "\n" for e in events)) - - popen_calls: list[list] = [] - - class FakePopen: - def __init__(self, cmd, *args, **kwargs): - popen_calls.append(list(cmd)) - self.returncode = 0 - 
self.pid = 1234 - self.stdin = io.StringIO() - self.stdout = stream - def poll(self): return 0 - def wait(self): return None + if hasattr(module._run_single_attempt, "_attempt_counter"): + delattr(module._run_single_attempt, "_attempt_counter") + + def fake_run_single_attempt(args, console, prompt, model, variant, thinking_on, base_url, auth_token, workspace_dir, existing_session_id=None): + return ( + 0, + "ses_stream_test_001", + module.RunResult( + any_step_finish_seen=True, + step_finish_count=3, + last_finish_reason="stop", + last_finish_tokens={"input": 10, "output": 20}, + ), + tmp_path / "transcript.jsonl", + ) - monkeypatch.setattr(module.subprocess, "Popen", FakePopen) - monkeypatch.setattr(module.os, "killpg", lambda *a: None) + monkeypatch.setattr(module, "_run_single_attempt", fake_run_single_attempt) class FakeResult: def __init__(self, rc, out="", err=""): @@ -1867,5 +1773,6 @@ def fake_run(cmd, *args, **kwargs): rc = module.main() assert rc == 0 - # The session terminated with 'stop', no frontmatter errors → single Popen call - assert len(popen_calls) == 1 + # The session terminated with 'stop', no frontmatter errors → single attempt + # (We cannot introspect the loop variables directly, but the clean exit + # with rc=0 proves the RunResult signals were consumed correctly.) diff --git a/tools/events/__init__.py b/tools/events/__init__.py new file mode 100644 index 0000000..74e21f1 --- /dev/null +++ b/tools/events/__init__.py @@ -0,0 +1,393 @@ +# Copyright (C) 2025-2026 Pablo Ruiz García +# SPDX-License-Identifier: GPL-3.0-or-later OR AGPL-3.0-or-later + +""" +Event loop coordinator: consumes SSE, accumulates state, maps events, +and emits them to the existing render pipeline. 
+ +Entry point: + event_loop = EventLoop(base_url, session_id, console, phase, label) + result = event_loop.run(render_event_fn) +""" + +from __future__ import annotations + +import dataclasses +import json +import time +import urllib.error +import urllib.request +from typing import Any, Callable + +from events.sse_client import SseClient, SseClientError +from events.state_tracker import StateTracker +from events.emitters import emit_event + + +@dataclasses.dataclass(frozen=True) +class RunResult: + """ Signals returned by EventLoop.run() for termination logic. """ + any_step_finish_seen: bool = False + step_finish_count: int = 0 + last_finish_reason: str | None = None + last_finish_tokens: dict[str, Any] = dataclasses.field(default_factory=dict) + last_permission_error: str | None = None + last_session_id: str | None = None + + +class EventLoop: + """ Consume the SSE stream for a single session and drive rendering. """ + + def __init__( + self, + base_url: str, + session_id: str, + console: Any, + phase: str, + label: str, + *, + auth_token: str | None = None, + workspace_dir: str | None = None, + ) -> None: + self.base_url = base_url.rstrip("/") + self.session_id = session_id + self.console = console + self.phase = phase + self.label = label + self.auth_token = auth_token + self.workspace_dir = workspace_dir + + self._tracker = StateTracker() + self._client: SseClient | None = None + self._stopped = False + self._seen_message_ids: set[str] = set() + self._last_message_sync_at = 0.0 + self._pending_recovery_sync = False + self._emitted_signatures: set[tuple[str, str]] = set() + self._idle_event_to_sync_and_emit: dict[str, Any] | None = None + + # ------------------------------------------------------------------ + # Public API + # ------------------------------------------------------------------ + + def run( + self, + render_fn: Callable[[Any, str, str, dict[str, Any]], None], + ) -> RunResult: + """ Block until the session reaches idle or a terminal error. 
+ + Args: + render_fn: the existing render_event() function from run-agent.py + + Returns: + RunResult with all signals needed by termination logic. + """ + # Use a mutable builder for accumulation. + _any_step_finish_seen = False + _step_finish_count = 0 + _last_finish_reason: str | None = None + _last_finish_tokens: dict[str, Any] = {} + _last_permission_error: str | None = None + + self._client = SseClient( + self.base_url, + auth_token=self.auth_token, + workspace_dir=self.workspace_dir, + reconnect=True, + max_reconnects=10, + on_reconnect=self.trigger_recovery_sync, + ) + + try: + for event in self._client.events(): + if self._stopped: + break + + # Filter by session (the global stream includes all sessions). + if not self._belongs_to_session(event): + continue + + # Handle permissions first (need HTTP reply). + if event.get("type") == "permission.asked": + self._handle_permission(event) + perm_err = self._extract_permission_error(event) + if perm_err: + _last_permission_error = perm_err + continue + + # Capture idle events for deferred sync-and-emit + _is_idle = self._is_session_idle(event) + if _is_idle and self._idle_event_to_sync_and_emit is None: + self._idle_event_to_sync_and_emit = event + + # Let the tracker accumulate deltas and produce finalized events. 
+ finalized_events = self._tracker.ingest(event) + + if self._should_sync_session_messages(event): + finalized_events.extend(self._sync_session_messages()) + + # Filter out idle events from finalized_events if we have a deferred idle + # (to avoid double-emitting: once from tracker, once from idle handler) + if self._idle_event_to_sync_and_emit is not None: + finalized_events = [ + fe for fe in finalized_events + if not ( + fe.get("type") == "session.idle" or + (fe.get("type") == "session.status" and fe.get("properties", {}).get("status", {}).get("type") == "idle") + ) + ] + + for fe in finalized_events: + sig = (fe.get("type", ""), fe.get("part", {}).get("id", "")) + if sig[1] and sig in self._emitted_signatures: + continue + self._emitted_signatures.add(sig) + _any_step_finish_seen, _step_finish_count, _last_finish_reason, _last_finish_tokens = self._update_result( + fe, _any_step_finish_seen, _step_finish_count, _last_finish_reason, _last_finish_tokens + ) + emit_event(render_fn, self.console, self.phase, self.label, fe) + + # Stop consuming when session goes idle - but sync and then emit the idle event. + if self._is_session_idle(event): + idle_event = self._idle_event_to_sync_and_emit + self._idle_event_to_sync_and_emit = None + # Sync to catch any final events SSE might have missed + self._sync_session_messages() + # Now emit the idle event + idle_sig = (event.get("type", ""), event.get("properties", {}).get("sessionID", "")) + if idle_sig[1] and idle_sig in self._emitted_signatures: + pass # already emitted via finalize path + else: + if idle_sig[1]: + self._emitted_signatures.add(idle_sig) + emit_event(render_fn, self.console, self.phase, self.label, event) + return self._build_result( + _any_step_finish_seen, + _step_finish_count, + _last_finish_reason, + _last_finish_tokens, + _last_permission_error, + self.session_id, + ) + + except SseClientError as exc: + # Reconnect exhausted or fatal stream error. 
+ # We return what we have; caller decides whether to retry. + pass + + return self._build_result( + any_step_finish_seen=_any_step_finish_seen, + step_finish_count=_step_finish_count, + last_finish_reason=_last_finish_reason, + last_finish_tokens=_last_finish_tokens, + last_permission_error=_last_permission_error, + last_session_id=self.session_id, + ) + + def stop(self) -> None: + """ Signal the event loop to exit after the next event. """ + self._stopped = True + if self._client is not None: + self._client.stop() + + def trigger_recovery_sync(self) -> None: + """ Signal that a recovery sync is needed after SSE reconnection. """ + self._pending_recovery_sync = True + + @staticmethod + def _build_result( + any_step_finish_seen: bool, + step_finish_count: int, + last_finish_reason: str | None, + last_finish_tokens: dict[str, Any], + last_permission_error: str | None, + last_session_id: str | None, + ) -> RunResult: + """ Build a RunResult from accumulated signals. """ + return RunResult( + any_step_finish_seen=any_step_finish_seen, + step_finish_count=step_finish_count, + last_finish_reason=last_finish_reason, + last_finish_tokens=last_finish_tokens, + last_permission_error=last_permission_error, + last_session_id=last_session_id, + ) + + # ------------------------------------------------------------------ + # Internal + # ------------------------------------------------------------------ + + def _get_headers(self) -> dict[str, str]: + headers = {"Content-Type": "application/json"} + if self.auth_token: + import base64 + encoded = base64.b64encode(f"opencode:{self.auth_token}".encode("utf-8")).decode("utf-8") + headers["Authorization"] = f"Basic {encoded}" + if self.workspace_dir: + headers["x-opencode-directory"] = self.workspace_dir + return headers + + @staticmethod + def _is_session_idle(event: dict[str, Any]) -> bool: + """Return True if this event signals the session reached idle. 
+ + Supports both the canonical ``session.status`` with + ``status.type == "idle"`` and the deprecated ``session.idle``. + """ + event_type = event.get("type", "") + if event_type == "session.idle": + return True + if event_type == "session.status": + status = event.get("properties", {}).get("status", {}) + return status.get("type") == "idle" + return False + + def _belongs_to_session(self, event: dict[str, Any]) -> bool: + """ Return True if this event belongs to our tracked session. """ + props = event.get("properties", {}) + sid = props.get("sessionID") + if sid and sid != self.session_id: + return False + # server.connected / server.heartbeat have no sessionID — pass through. + return True + + def _handle_permission(self, event: dict[str, Any]) -> None: + """ Auto-reject the permission via POST /permission/{requestID}/reply. """ + props = event.get("properties", {}) + perm_id = props.get("id") + if not perm_id: + return + url = f"{self.base_url}/permission/{perm_id}/reply" + data = json.dumps({ + "reply": "reject", + "message": "Auto-rejected by CodeCome configuration", + }).encode("utf-8") + req = urllib.request.Request( + url, + data=data, + headers=self._get_headers(), + method="POST", + ) + try: + urllib.request.urlopen(req, timeout=10.0) + except urllib.error.HTTPError: + # Log but don't crash; the session may already have moved on. + pass + + def _extract_permission_error(self, event: dict[str, Any]) -> str | None: + """ Build a human-readable permission rejection summary. """ + props = event.get("properties", {}) + tool = props.get("tool", "tool") + return f"tool permission rejected: {tool}" + + def _should_sync_session_messages(self, event: dict[str, Any]) -> bool: + """Return True when a session snapshot sync may reveal finalized parts. + + Sync is only triggered in two cases: + 1. After SSE reconnection (recovery sync via _pending_recovery_sync flag) + 2. 
Explicit idle event - but caller handles idle emission, not us + """ + if self._pending_recovery_sync: + self._pending_recovery_sync = False + return True + + event_type = event.get("type", "") + if event_type == "session.idle": + return True + if event_type == "session.status": + status = event.get("properties", {}).get("status", {}) + if status.get("type") == "idle": + return True + return False + + def _sync_session_messages(self) -> list[dict[str, Any]]: + """Fetch current session messages and synthesize finalized compatibility events. + + The HTTP SSE stream may emit `message.part.delta` without corresponding + `message.part.updated` events. The session snapshot API does contain the + completed assistant messages and parts, so we poll it and emit unseen + message/part events in the same ND-JSON-compatible shapes expected by + the existing renderer. + """ + self._last_message_sync_at = time.time() + events: list[dict[str, Any]] = [] + try: + req = urllib.request.Request( + f"{self.base_url}/session/{self.session_id}/message", + headers=self._get_headers(), + method="GET", + ) + with urllib.request.urlopen(req, timeout=10.0) as resp: + messages = json.loads(resp.read().decode("utf-8")) + except Exception: # noqa: BLE001 + return [] + + if not isinstance(messages, list): + return [] + + for item in messages: + if not isinstance(item, dict): + continue + info = item.get("info") + parts = item.get("parts") + if not isinstance(info, dict) or not isinstance(parts, list): + continue + if info.get("role") != "assistant": + continue + if info.get("sessionID") != self.session_id: + continue + + message_id = info.get("id") + if isinstance(message_id, str) and message_id and message_id not in self._seen_message_ids: + events.append({ + "type": "message.updated", + "timestamp": int(time.time() * 1000), + "sessionID": self.session_id, + "info": info, + }) + self._seen_message_ids.add(message_id) + + for part in parts: + if not isinstance(part, dict): + continue + part_id = 
part.get("id") + if isinstance(part_id, str) and self._tracker.has_seen(part_id): + self._tracker.mark_seen(part_id) + continue + synthesized = { + "type": "message.part.updated", + "timestamp": int(time.time() * 1000), + "properties": { + "sessionID": self.session_id, + "part": part, + }, + } + events.extend(self._tracker.ingest(synthesized)) + + return events + + def _update_result( + self, + event: dict[str, Any], + any_step_finish_seen: bool, + step_finish_count: int, + last_finish_reason: str | None, + last_finish_tokens: dict[str, Any], + ) -> tuple[bool, int, str | None, dict[str, Any]]: + """ Update mutable result signals based on the mapped event. + + Returns the updated tuple of (any_seen, count, reason, tokens). + """ + event_type = event.get("type", "") + if event_type == "step_finish": + any_step_finish_seen = True + step_finish_count += 1 + part = event.get("part", {}) + reason = part.get("reason") + if isinstance(reason, str): + last_finish_reason = reason + tokens = part.get("tokens") + if isinstance(tokens, dict): + last_finish_tokens = tokens + + return any_step_finish_seen, step_finish_count, last_finish_reason, last_finish_tokens diff --git a/tools/events/emitters.py b/tools/events/emitters.py new file mode 100644 index 0000000..9705b42 --- /dev/null +++ b/tools/events/emitters.py @@ -0,0 +1,32 @@ +# Copyright (C) 2025-2026 Pablo Ruiz García +# SPDX-License-Identifier: GPL-3.0-or-later OR AGPL-3.0-or-later + +""" +Thin wrappers that bridge mapped events to existing render_event() in run-agent.py. + +This module avoids circular imports by accepting the render_event function +as a callable argument rather than importing it directly. +""" + +from __future__ import annotations + +from typing import Any, Callable + + +def emit_event( + render_fn: Callable[[Any, str, str, dict[str, Any]], None], + console: Any, + phase: str, + label: str, + event: dict[str, Any], +) -> None: + """ Forward a mapped ND-JSON event to the existing render_event(). 
+ + Args: + render_fn: typically run_agent.render_event + console: rich Console or None + phase: phase number string + label: human-readable phase label + event: the mapped ND-JSON event dict + """ + render_fn(console, phase, label, event) diff --git a/tools/events/sse_client.py b/tools/events/sse_client.py new file mode 100644 index 0000000..454c8dd --- /dev/null +++ b/tools/events/sse_client.py @@ -0,0 +1,200 @@ +# Copyright (C) 2025-2026 Pablo Ruiz García +# SPDX-License-Identifier: GPL-3.0-or-later OR AGPL-3.0-or-later + +""" +Server-Sent Events (SSE) client for opencode serve. + +Consumes the global /event stream, parses data: lines, +reconnects on drops with exponential backoff, +and monitors heartbeats. +""" + +from __future__ import annotations + +import json +import time +import urllib.request +from typing import Any, Callable, Iterator + + +# Exponential backoff config for reconnect. +_BACKOFF_INITIAL_S = 3.0 +_BACKOFF_MAX_S = 30.0 +_BACKOFF_MULTIPLIER = 2.0 + +# If no heartbeat for this long, treat as dead. +_HEARTBEAT_TIMEOUT_S = 15.0 + +# Read timeout for the SSE connection. +_SSE_READ_TIMEOUT_S = 30.0 + + +import base64 + +def _build_sse_request(base_url: str, auth_token: str | None = None, workspace_dir: str | None = None) -> urllib.request.Request: + """Return a GET /event request with SSE headers.""" + headers = { + "Accept": "text/event-stream", + "Cache-Control": "no-cache", + } + if auth_token: + encoded = base64.b64encode(f"opencode:{auth_token}".encode("utf-8")).decode("utf-8") + headers["Authorization"] = f"Basic {encoded}" + if workspace_dir: + headers["x-opencode-directory"] = workspace_dir + + return urllib.request.Request( + f"{base_url}/event", + headers=headers, + method="GET", + ) + + +class SseClientError(Exception): + """ Raised when the SSE stream cannot be established or sustained. """ + pass + + +class SseClient: + """ Open, consume, and auto-reconnect to the opencode SSE stream. 
""" + + def __init__( + self, + base_url: str, + *, + auth_token: str | None = None, + workspace_dir: str | None = None, + reconnect: bool = True, + max_reconnects: int = 10, + on_reconnect: Callable[[], None] | None = None, + ) -> None: + self.base_url = base_url.rstrip("/") + self.auth_token = auth_token + self.workspace_dir = workspace_dir + self.reconnect = reconnect + self.max_reconnects = max_reconnects + self.on_reconnect = on_reconnect + + self._started = False + self._stopped = False + self._last_heartbeat = 0.0 + self._reconnect_count = 0 + self._first_connection_done = False + + def events(self) -> Iterator[dict]: + """ Yield parsed SSE event JSON dicts. + + This is a blocking generator that stays alive until + stop() is called or reconnect budget is exhausted. + """ + if self._started: + raise RuntimeError("events() can only be consumed once per instance") + self._started = True + self._last_heartbeat = time.time() + + while not self._stopped: + try: + for event in self._open_stream(): + if self._stopped: + return + self._on_event(event) + if self._first_connection_done and self.on_reconnect: + self.on_reconnect() + self._first_connection_done = True + yield event + except SseClientError: + if not self.reconnect or self._stopped: + raise + if self._reconnect_count >= self.max_reconnects: + raise SseClientError( + f"SSE reconnect budget exhausted ({self.max_reconnects} attempts)" + ) + self._reconnect_count += 1 + self._wait_backoff() + except Exception as exc: # noqa: BLE001 + # Unexpected error during stream consumption. + if not self.reconnect or self._stopped: + raise SseClientError(f"SSE stream error: {exc}") from exc + if self._reconnect_count >= self.max_reconnects: + raise SseClientError( + f"SSE reconnect budget exhausted ({self.max_reconnects} attempts)" + ) from exc + self._reconnect_count += 1 + self._wait_backoff() + + def stop(self) -> None: + """ Signal the generator to exit after the next event. 
""" + self._stopped = True + + # ------------------------------------------------------------------ + # Internal + # ------------------------------------------------------------------ + + def _on_event(self, event: dict) -> None: + """ Book-keeping on every consumed event. """ + if event.get("type") == "server.heartbeat": + self._last_heartbeat = time.time() + self._reconnect_count = 0 # Reset on successful read. + + # Heartbeat timeout check. + elapsed = time.time() - self._last_heartbeat + if elapsed > _HEARTBEAT_TIMEOUT_S: + raise SseClientError( + f"No server heartbeat for {elapsed:.1f}s (timeout {_HEARTBEAT_TIMEOUT_S}s)" + ) + + def _open_stream(self) -> Iterator[dict]: + """ Open the SSE connection and yield parsed events. """ + req = _build_sse_request(self.base_url, self.auth_token, self.workspace_dir) + try: + resp = urllib.request.urlopen(req, timeout=_SSE_READ_TIMEOUT_S) + except urllib.error.HTTPError as exc: + raise SseClientError(f"HTTP {exc.code}: {exc.reason}") from exc + except urllib.error.URLError as exc: + raise SseClientError(f"Connection failed: {exc.reason}") from exc + + # Read SSE lines. + buffer = [] + try: + for byte_line in resp: + if self._stopped: + return + line = byte_line.decode("utf-8", errors="replace").rstrip("\r\n") + if not line: + # Empty line → flush buffer. + if buffer: + event = self._parse_buffer(buffer) + buffer = [] + if event is not None: + yield event + continue + buffer.append(line) + finally: + resp.close() + + @staticmethod + def _parse_buffer(lines: list[str]) -> dict | None: + """ Parse accumulated SSE lines into a JSON event dict. + + Returns None for comment lines or non-data events we don't care about. + """ + data_parts: list[str] = [] + for line in lines: + if line.startswith("data:"): + data_parts.append(line[5:].lstrip()) + # We ignore event:, id:, retry: — the JSON payload is self-describing. 
+ if not data_parts: + return None + payload = "\n".join(data_parts) + try: + return json.loads(payload) + except json.JSONDecodeError: + return None + + def _wait_backoff(self) -> None: + """ Sleep with exponential backoff before reconnect attempt. """ + delay = min( + _BACKOFF_INITIAL_S * (_BACKOFF_MULTIPLIER ** (self._reconnect_count - 1)), + _BACKOFF_MAX_S, + ) + time.sleep(delay) diff --git a/tools/events/state_tracker.py b/tools/events/state_tracker.py new file mode 100644 index 0000000..566839a --- /dev/null +++ b/tools/events/state_tracker.py @@ -0,0 +1,203 @@ +# Copyright (C) 2025-2026 Pablo Ruiz García +# SPDX-License-Identifier: GPL-3.0-or-later OR AGPL-3.0-or-later + +""" +Accumulate SSE streaming deltas and track part state transitions. + +The server sends `message.part.delta` events with tiny text fragments. +This module accumulates them by partID and produces finalized part +snapshots when the corresponding `message.part.updated` arrives. +""" + +from __future__ import annotations + +from typing import Any + + +class StateTracker: + """ Accumulate deltas, track part versions, detect finalized parts. """ + + def __init__(self) -> None: + # Map partID -> accumulated text buffer. + self._delta_buffers: dict[str, str] = {} + # Set of partIDs we have already "finalized" (yielded as updated). + self._seen_part_ids: set[str] = set() + # Set of partIDs for which we saw delta but not yet updated. + self._pending_part_ids: set[str] = set() + + # ------------------------------------------------------------------ + # Ingestion + # ------------------------------------------------------------------ + + def ingest(self, event: dict[str, Any]) -> list[dict[str, Any]]: + """ Process one SSE event and return zero or more *finalized* events. + + A finalized event is one whose part has reached a stable state + (e.g. text part with time.end, tool part with status completed, + step-start, step-finish). 
+ """ + event_type = event.get("type", "") + if event_type == "message.part.delta": + self._handle_delta(event) + return [] + + if event_type == "message.part.updated": + finalized = self._handle_updated(event) + return finalized + + if event_type == "session.error": + props = event.get("properties", {}) + return [{ + "type": "error", + "timestamp": event.get("timestamp", 0), + "sessionID": props.get("sessionID", ""), + "error": props.get("error"), + }] + + if event_type == "session.diff": + mapped = self._map_session_diff(event) + return [mapped] if mapped else [] + + if event_type == "session.updated": + return [] + + # Pass-through events that don't need accumulation. + if event_type in ("session.status", "session.idle", + "permission.asked", "server.connected", "server.heartbeat"): + return [event] + + # Unknown event type: pass through as-is so callers can decide. + return [event] + + # ------------------------------------------------------------------ + # Delta accumulation + # ------------------------------------------------------------------ + + def _handle_delta(self, event: dict[str, Any]) -> None: + """ Append a text/field delta to the buffer for its partID. """ + props = event.get("properties", {}) + part_id = props.get("partID") + field = props.get("field", "text") + delta = props.get("delta", "") + if not part_id or field != "text": + return + if delta: + self._delta_buffers[part_id] = self._delta_buffers.get(part_id, "") + delta + self._pending_part_ids.add(part_id) + + def _handle_updated(self, event: dict[str, Any]) -> list[dict[str, Any]]: + """ Inject accumulated deltas into the updated part and return finalized event(s). """ + props = event.get("properties", {}) + part = props.get("part", {}) + part_id = part.get("id") + + if part_id and part_id in self._delta_buffers: + part["text"] = self._delta_buffers.get(part_id, "") + + # Build the finalized event. 
+ finalized = self._build_finalized_event(event) + + if finalized: + # Track that we've seen this part so we don't re-emit on reconnect. + if part_id: + self._seen_part_ids.add(part_id) + # Now it's safe to clear the buffer + if part_id in self._delta_buffers: + del self._delta_buffers[part_id] + self._pending_part_ids.discard(part_id) + return [finalized] + + return [] + + def _build_finalized_event(self, event: dict[str, Any]) -> dict[str, Any] | None: + """ Convert a message.part.updated into the ND-JSON shape expected by render_event(). + + Returns None for event types we don't translate yet (e.g. async progress). + """ + props = event.get("properties", {}) + part = props.get("part", {}) + part_type = part.get("type", "") + + if part_type == "step-start": + return { + "type": "step_start", + "timestamp": event.get("timestamp", 0), + "sessionID": props.get("sessionID", ""), + "part": part, + } + + if part_type == "step-finish": + return { + "type": "step_finish", + "timestamp": event.get("timestamp", 0), + "sessionID": props.get("sessionID", ""), + "part": part, + } + + if part_type == "text": + # Only emit when finalized (time.end exists). + if part.get("time", {}).get("end"): + return { + "type": "text", + "timestamp": event.get("timestamp", 0), + "sessionID": props.get("sessionID", ""), + "part": part, + } + return None + + if part_type == "reasoning": + if part.get("time", {}).get("end"): + return { + "type": "reasoning", + "timestamp": event.get("timestamp", 0), + "sessionID": props.get("sessionID", ""), + "part": part, + } + return None + + if part_type == "tool": + state = part.get("state", {}) + if state.get("status") in ("completed", "error"): + return { + "type": "tool_use", + "timestamp": event.get("timestamp", 0), + "sessionID": props.get("sessionID", ""), + "part": part, + } + return None + + # Pass through unknown part types as raw event. 
+ return event + + def _map_session_diff(self, event: dict[str, Any]) -> dict[str, Any] | None: + """Map non-empty session.diff into a compact compatibility event.""" + props = event.get("properties", {}) + diff = props.get("diff") + if not isinstance(diff, list) or not diff: + return None + return { + "type": "session.diff", + "timestamp": event.get("timestamp", 0), + "sessionID": props.get("sessionID", ""), + "properties": props, + } + + # ------------------------------------------------------------------ + # State queries + # ------------------------------------------------------------------ + + def has_seen(self, part_id: str) -> bool: + """ Return True if we have already processed a finalized event for this part. """ + return part_id in self._seen_part_ids + + def mark_seen(self, part_id: str) -> None: + """ Record that we have processed this part. """ + self._seen_part_ids.add(part_id) + + def is_pending(self, part_id: str) -> bool: + """ Return True if we have seen deltas but not yet the updated event. """ + return part_id in self._pending_part_ids + + def get_pending_part_ids(self) -> set[str]: + """ Return set of partIDs with buffered deltas awaiting finalization. """ + return set(self._pending_part_ids) diff --git a/tools/mock-llm-parity.py b/tools/mock-llm-parity.py new file mode 100644 index 0000000..4643461 --- /dev/null +++ b/tools/mock-llm-parity.py @@ -0,0 +1,490 @@ +#!/usr/bin/env python3 +# Copyright (C) 2025-2026 Pablo Ruiz García +# SPDX-License-Identifier: GPL-3.0-or-later OR AGPL-3.0-or-later + +"""Deterministic parity test between opencode run and opencode serve using a mock LLM. 
+ +Usage: + python tools/mock-llm-parity.py --script tools/mock_llm_scripts/basic.json +""" + +from __future__ import annotations + +import argparse +import copy +import difflib +import json +import os +import socket +import subprocess +import sys +import time +import urllib.request +from pathlib import Path +from typing import Any + +ROOT = Path(__file__).resolve().parents[1] +sys.path.insert(0, str(ROOT / "tools")) + +from events import EventLoop # noqa: E402 +from opencode.serve import ServerRunner # noqa: E402 + +DEFAULT_PROMPT = "Say hello and then stop." +DEFAULT_MODEL = "test/mockmodel" +DEFAULT_AGENT = "test" +DEFAULT_TIMEOUT_S = 30.0 +MOCK_HOST = "127.0.0.1" + +# Events that only appear in the serve path and should be ignored for parity. +# Note: session.status (retry/busy) is NOT serve-only when _CODECOME_INSIDE_HARNESS=1 +# because the status-forwarder plugin emits them to stdout. +# session.idle is deprecated and serve-only. +_SERVE_ONLY_TYPES = {"server.connected", "server.heartbeat", "session.idle", "message.updated", "file.edited", "file.watcher.updated", "todo.updated"} + + +def _step_sort_key(ev: dict[str, Any]) -> tuple[int, str]: + """Return a sort key that orders events within a single step deterministically.""" + t = ev.get("type", "") + if t == "step_start": + return (0, "") + if t == "text": + return (1, ev.get("part", {}).get("text", "")[:50]) + if t == "tool_use": + call_id = str(ev.get("part", {}).get("callID", "")) + return (2, call_id) + if t == "step_finish": + return (3, "") + # session.status and error events sort after tool_use but before step_finish + # to keep them grouped with the step they occur in. + if t in ("session.status", "error"): + return (2.5, "") + return (4, "") + + +def _sort_events_by_step(events: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Group events by step (delimited by step_start) and sort within each step. 
+ Also deduplicate session.status events by status_type to handle + transient status changes that don't affect parity. + """ + # Deduplicate session.status events by (status_type, status_message). + # Both run and serve may emit slightly different counts of busy/idle + # events depending on timing, but the important ones (retry on error) + # should match. + seen_status: set[tuple] = set() + deduped: list[dict[str, Any]] = [] + for ev in events: + if ev.get("type") == "session.status": + key = (ev.get("status_type"), ev.get("status_message")) + if key in seen_status: + continue + seen_status.add(key) + deduped.append(ev) + events = deduped + + groups: list[list[dict[str, Any]]] = [] + current: list[dict[str, Any]] = [] + for ev in events: + if ev.get("type") == "step_start": + if current: + groups.append(current) + current = [ev] + else: + current.append(ev) + if current: + groups.append(current) + + result: list[dict[str, Any]] = [] + for group in groups: + result.extend(sorted(group, key=_step_sort_key)) + return result + + +def _write_json(path: Path, payload: Any) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8") + + +class MockServerInfo: + """Lightweight wrapper around a running mock server process.""" + + __slots__ = ("proc", "port") + + def __init__(self, proc: subprocess.Popen[Any], port: int) -> None: + self.proc = proc + self.port = port + + +def _find_free_port(host: str = MOCK_HOST) -> int: + """Find a free TCP port on the given host.""" + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + s.bind((host, 0)) + s.listen(1) + return int(s.getsockname()[1]) + + +def start_mock_server(script_path: Path, host: str = MOCK_HOST, after_429: int = -1, after_500: int = -1) -> MockServerInfo: + port = _find_free_port(host) + cmd = [ + sys.executable, + str(ROOT / "tools" / "mock-llm-server.py"), + "--port", + str(port), + "--script", + str(script_path), + ] 
+ if after_429 >= 0: + cmd.extend(["--429-after", str(after_429)]) + if after_500 >= 0: + cmd.extend(["--500-after", str(after_500)]) + proc = subprocess.Popen( + cmd, + stdout=subprocess.DEVNULL, + stderr=subprocess.PIPE, + bufsize=1, + text=True, + ) + + # Poll health check until the server is ready. + health_deadline = time.time() + 10.0 + while time.time() < health_deadline: + if proc.poll() is not None: + stderr = proc.stderr.read() if proc.stderr else "" + raise RuntimeError( + f"Mock LLM server exited early (code {proc.returncode}). stderr: {stderr}" + ) + try: + req = urllib.request.Request(f"http://{host}:{port}/v1/models", method="GET") + with urllib.request.urlopen(req, timeout=1.0) as resp: + if resp.status == 200: + return MockServerInfo(proc, port) + except Exception: + pass + time.sleep(0.1) + + proc.terminate() + try: + proc.wait(timeout=5.0) + except subprocess.TimeoutExpired: + proc.kill() + proc.wait() + raise RuntimeError("Mock LLM server failed health check after startup.") + + +def stop_mock_server(info: MockServerInfo) -> None: + info.proc.terminate() + try: + info.proc.wait(timeout=5.0) + except subprocess.TimeoutExpired: + info.proc.kill() + info.proc.wait() + # Drain stdout/stderr so the OS buffers get closed (prevents BufferedReader leak). 
+ if info.proc.stdout: + try: + info.proc.stdout.read() + except Exception: + pass + if info.proc.stderr: + try: + info.proc.stderr.read() + except Exception: + pass + + +def _post_json(url: str, payload: dict[str, Any], timeout: float = 30.0, auth_token: str | None = None, workspace_dir: str | None = None) -> Any: + headers = {"Content-Type": "application/json"} + if auth_token: + import base64 + encoded = base64.b64encode(f"opencode:{auth_token}".encode("utf-8")).decode("utf-8") + headers["Authorization"] = f"Basic {encoded}" + if workspace_dir: + headers["x-opencode-directory"] = workspace_dir + req = urllib.request.Request( + url, + data=json.dumps(payload).encode("utf-8"), + headers=headers, + method="POST", + ) + with urllib.request.urlopen(req, timeout=timeout) as resp: + body = resp.read().decode("utf-8") + return json.loads(body) if body else None + + +def run_reference(prompt: str, model: str, agent: str, timeout: float) -> list[dict[str, Any]]: + cmd = [ + "opencode", + "run", + "--format", + "json", + "--agent", + agent, + "--model", + model, + prompt, + ] + env = os.environ.copy() + env["_CODECOME_INSIDE_HARNESS"] = "1" + result = subprocess.run(cmd, cwd=ROOT, capture_output=True, text=True, timeout=timeout, env=env) + events: list[dict[str, Any]] = [] + for line in result.stdout.splitlines(): + line = line.strip() + if not line: + continue + try: + events.append(json.loads(line)) + except json.JSONDecodeError: + continue + return events + + +def _create_model_payload(model: str, *, create: bool) -> dict[str, str]: + parts = model.split("/", 1) + if len(parts) == 2: + if create: + return {"providerID": parts[0], "id": parts[1]} + return {"providerID": parts[0], "modelID": parts[1]} + key = "id" if create else "modelID" + return {key: model} + + +def run_serve(prompt: str, model: str, agent: str, timeout: float) -> list[dict[str, Any]]: + runner = ServerRunner() + info = runner.start(hostname="127.0.0.1", log_level="WARN") + base_url = info.base_url + + 
# Placeholder written in place of oversized tool output. It is deliberately
# non-empty so a truncated value never compares equal to a genuinely empty one.
_TRUNCATED_PLACEHOLDER = "<truncated>"
# String values longer than this many characters are replaced by the placeholder.
_TRUNCATE_OVER_CHARS = 200
# Event-level keys stripped before comparison.
_VOLATILE_EVENT_KEYS = ("timestamp", "sessionID", "id")
# Part-level keys stripped before comparison.
_VOLATILE_PART_KEYS = ("time", "id", "messageID", "sessionID")


def _drop_keys(mapping: dict[str, Any], keys: tuple[str, ...]) -> None:
    """Remove every key in *keys* from *mapping* in place, ignoring absent ones."""
    for key in keys:
        mapping.pop(key, None)


def _truncate_long_strings(mapping: dict[str, Any], keys: tuple[str, ...]) -> dict[str, Any]:
    """Return a shallow copy of *mapping* with over-long strings under *keys* replaced."""
    trimmed = dict(mapping)
    for key in keys:
        value = trimmed.get(key)
        if isinstance(value, str) and len(value) > _TRUNCATE_OVER_CHARS:
            trimmed[key] = _TRUNCATED_PLACEHOLDER
    return trimmed


def _as_dict(value: Any) -> dict[str, Any]:
    """Return *value* when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def normalize_event(ev: dict[str, Any]) -> dict[str, Any] | None:
    """Return a comparable copy of *ev*, or ``None`` for serve-only events.

    The input event is never mutated.  Events whose type is listed in
    ``_SERVE_ONLY_TYPES`` are dropped.  For the rest, volatile identifiers and
    timestamps are removed, and long tool output is replaced by a fixed
    placeholder so that only structural differences show up in the diff.
    """
    ev_type = ev.get("type", "")
    if ev_type in _SERVE_ONLY_TYPES:
        return None
    out = dict(ev)

    # Normalize session.error to the "error" type to match the run path.
    # Run emits:   {"type": "error", error: {...}}
    # Serve emits: {"type": "session.error", properties: {sessionID, error: {...}}}
    if ev_type == "session.error":
        props = _as_dict(out.pop("properties", None))
        out["type"] = "error"
        out["error"] = props.get("error")
        out.pop("timestamp", None)
        return out

    # Flatten session.status so both paths compare field by field.
    if ev_type == "session.status":
        status = _as_dict(_as_dict(out.pop("properties", None)).get("status"))
        out["status_type"] = status.get("type")
        out["status_attempt"] = status.get("attempt")
        out["status_message"] = status.get("message")
        out["status_next"] = status.get("next")
        _drop_keys(out, _VOLATILE_EVENT_KEYS)
        return out

    _drop_keys(out, _VOLATILE_EVENT_KEYS)
    part = out.get("part")
    if isinstance(part, dict):
        part = dict(part)
        _drop_keys(part, _VOLATILE_PART_KEYS)
        state = part.get("state")
        if ev_type == "tool_use" and isinstance(state, dict):
            # Large tool output/preview only adds noise to the diff.
            state = _truncate_long_strings(state, ("output", "error"))
            metadata = state.get("metadata")
            if isinstance(metadata, dict):
                state["metadata"] = _truncate_long_strings(metadata, ("preview", "output"))
            # Execution timing differs on every run.
            state.pop("time", None)
            part["state"] = state
        out["part"] = part
    return out


def compare_events(
    run_events: list[dict[str, Any]], serve_events: list[dict[str, Any]]
) -> tuple[bool, str]:
    """Compare the two event streams after normalization.

    Returns ``(True, "")`` when both streams serialize to the same lines, and
    ``(False, unified_diff_text)`` otherwise.
    """

    def _comparable_lines(events: list[dict[str, Any]]) -> list[str]:
        # Normalize each event exactly once; None marks a dropped event.
        kept = [norm for norm in map(normalize_event, events) if norm is not None]
        return [json.dumps(event, sort_keys=True) for event in _sort_events_by_step(kept)]

    run_lines = _comparable_lines(run_events)
    serve_lines = _comparable_lines(serve_events)
    if run_lines == serve_lines:
        return True, ""

    diff = difflib.unified_diff(
        run_lines,
        serve_lines,
        fromfile="opencode-run",
        tofile="opencode-serve",
        lineterm="",
    )
    return False, "\n".join(diff)


def main() -> int:
    """Run the run-vs-serve parity check and return a process exit code.

    ``opencode.json`` is temporarily pointed at the mock LLM server and is
    restored byte-for-byte afterwards, even when a run fails.  Returns 0 when
    the normalized event streams match and 1 otherwise.
    """
    parser = argparse.ArgumentParser(
        description="Deterministic parity test between opencode run and opencode serve"
    )
    parser.add_argument(
        "--script",
        type=Path,
        default=ROOT / "tools" / "mock_llm_scripts" / "basic.json",
    )
    parser.add_argument("--prompt", default=DEFAULT_PROMPT)
    parser.add_argument("--model", default=DEFAULT_MODEL)
    parser.add_argument("--agent", default=DEFAULT_AGENT)
    parser.add_argument("--timeout", type=float, default=DEFAULT_TIMEOUT_S)
    parser.add_argument("--429-after", type=int, default=-1, help="Make mock server return 429 after this many requests (-1 = disabled)")
    parser.add_argument("--500-after", type=int, default=-1, help="Make mock server return 500 after this many requests (-1 = disabled)")
    parser.add_argument(
        "--out-dir",
        type=Path,
        default=ROOT / "tmp" / "mock-llm-parity",
    )
    args = parser.parse_args()

    out_dir = args.out_dir
    out_dir.mkdir(parents=True, exist_ok=True)

    _write_json(
        out_dir / "meta.json",
        {
            "script": str(args.script),
            "prompt": args.prompt,
            "model": args.model,
            "agent": args.agent,
            "timeout": args.timeout,
        },
    )

    config_path = ROOT / "opencode.json"
    # Keep the exact original text so the restore cannot reformat the file.
    original_text = config_path.read_text(encoding="utf-8")
    config = json.loads(original_text)
    mock_info = None

    try:
        # --- Start mock server and rewrite provider URL -------------------
        # argparse derives the dest "429_after" from "--429-after"; it is not
        # a valid identifier, hence getattr instead of attribute access.
        mock_info = start_mock_server(
            args.script,
            after_429=getattr(args, "429_after"),
            after_500=getattr(args, "500_after"),
        )
        options = config["provider"]["test"].setdefault("options", {})
        options["baseURL"] = f"http://{MOCK_HOST}:{mock_info.port}/v1"
        config_path.write_text(json.dumps(config, indent=2) + "\n", encoding="utf-8")

        run_events = run_reference(args.prompt, args.model, args.agent, args.timeout)

        # Clean up files created by run_reference so serve starts with a clean
        # workspace; leftovers would change the write tool's 'exists' metadata.
        for leftover in ROOT.glob("tmp/parity-*.txt"):
            leftover.unlink()

        serve_events = run_serve(args.prompt, args.model, args.agent, args.timeout)
    finally:
        # --- Restore original provider URL --------------------------------
        try:
            if mock_info is not None:
                stop_mock_server(mock_info)
        finally:
            # Runs even if stopping the mock server fails.
            config_path.write_text(original_text, encoding="utf-8")

    _write_json(out_dir / "run.json", run_events)
    _write_json(out_dir / "serve.json", serve_events)

    ok, diff = compare_events(run_events, serve_events)
    if ok:
        print("Parity OK")
        return 0

    print("Parity FAILED", file=sys.stderr)
    diff_path = out_dir / "diff.txt"
    diff_path.write_text(diff, encoding="utf-8")
    print(f"Diff written to {diff_path}", file=sys.stderr)
    print(diff, file=sys.stderr)
    return 1


if __name__ == "__main__":
    raise SystemExit(main())
def _parse_script_into_turns(script: list[dict]) -> list[list[dict]]:
    """Split a linear action script into per-request turns.

    A turn is optional leading text followed by every consecutive tool call.
    A text action that comes after tool calls opens the next turn.  The first
    ``done`` action becomes a one-element sentinel turn and ends parsing, so
    any actions after it are ignored.  Unknown action types are kept in the
    current turn unchanged.
    """
    turns: list[list[dict]] = []
    current: list[dict] = []
    in_tool_block = False

    for action in script:
        action_type = action.get("type", "")

        if action_type == "done":
            if current:
                turns.append(current)
            turns.append([action])
            return turns

        if action_type == "text":
            if in_tool_block and current:
                # The previous turn ended with tools; this text starts a new one.
                turns.append(current)
                current = []
            in_tool_block = False
        elif action_type == "tool_call":
            in_tool_block = True
        current.append(action)

    # Script without a trailing "done": keep whatever was collected.
    if current:
        turns.append(current)
    return turns


def _completion_chunk(delta: dict, finish_reason, chunk_id: str = "mock-chunk") -> str:
    """Serialize one ``chat.completion.chunk`` carrying *delta*."""
    return json.dumps({
        "id": chunk_id,
        "object": "chat.completion.chunk",
        "created": int(time.time()),
        "model": "mockmodel",
        "choices": [
            {"index": 0, "delta": delta, "finish_reason": finish_reason}
        ],
    })


def _build_chunks(turns: list[list[dict]], turn_index: int) -> list[str]:
    """Build the JSON-encoded SSE chunks for ``turns[turn_index]``.

    The result is a role chunk, one chunk per text action, one chunk per tool
    call, and a final chunk with the finish reason.  The finish reason is
    ``"tool_calls"`` when the turn has tool calls and no ``done`` action, and
    ``"stop"`` otherwise.  An empty *turns* list or an index past the end
    yields a single empty ``"stop"`` chunk.
    """
    if not turns or turn_index >= len(turns):
        return [_completion_chunk({}, "stop", chunk_id="mock-chunk-empty")]

    turn = turns[turn_index]
    text_actions = [a for a in turn if a.get("type") == "text"]
    tool_actions = [a for a in turn if a.get("type") == "tool_call"]
    is_done_turn = any(a.get("type") == "done" for a in turn)

    chunks = [_completion_chunk({"role": "assistant"}, None, chunk_id="mock-chunk-0")]
    chunks.extend(
        _completion_chunk({"content": action["content"]}, None)
        for action in text_actions
    )
    # All tool calls of a turn belong to the same assistant message and are
    # told apart by their "index".
    for idx, action in enumerate(tool_actions):
        tool_call = {
            "index": idx,
            "id": action.get("id", f"call_{idx+1}"),
            "type": "function",
            "function": {
                "name": action["name"],
                "arguments": json.dumps(action.get("arguments", {})),
            },
        }
        chunks.append(_completion_chunk({"tool_calls": [tool_call]}, None))

    finish_reason = "tool_calls" if tool_actions and not is_done_turn else "stop"
    chunks.append(_completion_chunk({}, finish_reason))
    return chunks


# HTTP status -> (error type, message) for the injected failure responses.
_INJECTED_FAILURES = {
    429: ("rate_limit", "Too Many Requests"),
    500: ("internal_error", "Internal error"),
}


class MockLLMHandler(BaseHTTPRequestHandler):
    """Handle OpenAI-compatible requests with deterministic scripted responses.

    Configuration lives in class attributes because ``socketserver`` creates a
    new handler instance per request: ``turns`` is the parsed script,
    ``after_429``/``after_500`` enable failure injection (-1 disables it), and
    ``request_count`` counts chat-completion requests seen so far.
    """

    script: list[dict] = []
    turns: list[list[dict]] = []
    server_version = "MockLLM/1.0"
    request_count: int = 0
    after_429: int = -1
    after_500: int = -1

    def log_message(self, format: str, *args: object) -> None:
        """Silence the default per-request logging."""

    def _send_json(self, data: dict, status: int = 200) -> None:
        """Send *data* as a JSON response with an explicit Content-Length."""
        body = json.dumps(data).encode("utf-8")
        self.send_response(status)
        self.send_header("Content-Type", "application/json")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def _send_sse(self, chunks: list[str]) -> None:
        """Stream *chunks* as ``data:`` events followed by ``data: [DONE]``."""
        self.send_response(200)
        self.send_header("Content-Type", "text/event-stream")
        self.send_header("Cache-Control", "no-cache")
        self.send_header("Connection", "close")
        self.end_headers()
        for chunk in chunks:
            self.wfile.write(f"data: {chunk}\n\n".encode("utf-8"))
            self.wfile.flush()
        self.wfile.write(b"data: [DONE]\n\n")
        self.wfile.flush()

    def _read_json_body(self) -> dict:
        """Return the request body as a dict; ``{}`` when missing or malformed."""
        try:
            length = int(self.headers.get("Content-Length", "0"))
            payload = json.loads(self.rfile.read(length).decode("utf-8")) if length else {}
        except (ValueError, OSError):
            # Best effort: an unreadable body is treated as an empty history.
            return {}
        return payload if isinstance(payload, dict) else {}

    @classmethod
    def _next_failure_status(cls):
        """Count one request and return 429, 500 or ``None`` (serve normally).

        A threshold of N lets the first N requests succeed; 429 takes
        precedence when both thresholds have been crossed.
        """
        cls.request_count += 1
        if cls.after_429 >= 0 and cls.request_count > cls.after_429:
            return 429
        if cls.after_500 >= 0 and cls.request_count > cls.after_500:
            return 500
        return None

    @classmethod
    def _build_chunks_for_turn(cls, turn_index: int) -> list[str]:
        """Build chunks for *turn_index*, re-serving the last turn when past the end."""
        turns = cls.turns
        if turn_index >= len(turns):
            turn_index = len(turns) - 1 if turns else 0
        return _build_chunks(turns, turn_index)

    def do_GET(self) -> None:
        """Serve ``GET /v1/models``; every other path is a 404."""
        if self.path == "/v1/models":
            self._send_json({
                "object": "list",
                "data": [{"id": "mockmodel", "object": "model"}],
            })
        else:
            self.send_response(404)
            self.end_headers()

    def do_POST(self) -> None:
        """Serve ``POST /v1/chat/completions``; every other path is a 404."""
        if self.path != "/v1/chat/completions":
            self.send_response(404)
            self.end_headers()
            return

        payload = self._read_json_body()

        failure = self._next_failure_status()
        if failure is not None:
            error_type, message = _INJECTED_FAILURES[failure]
            self._send_json({"error": {"type": error_type, "message": message}}, status=failure)
            return

        # Stateless dispatch: each assistant message already in the history
        # corresponds to one completed turn.
        messages = payload.get("messages", [])
        if not isinstance(messages, list):
            messages = []
        assistant_count = sum(
            1 for m in messages if isinstance(m, dict) and m.get("role") == "assistant"
        )
        self._send_sse(self._build_chunks_for_turn(assistant_count))


def main() -> int:
    """Parse CLI arguments, load the script and serve until interrupted.

    Returns 1 when the script file is missing or is not a JSON list, and 0
    after a clean shutdown.
    """
    parser = argparse.ArgumentParser(description="Mock OpenAI-compatible LLM server")
    parser.add_argument("--port", type=int, default=0, help="Port (0 = ephemeral)")
    parser.add_argument("--host", default="127.0.0.1")
    parser.add_argument("--script", type=Path, required=True)
    parser.add_argument("--429-after", type=int, default=-1, help="Return 429 after this many requests (-1 = disabled)")
    parser.add_argument("--500-after", type=int, default=-1, help="Return 500 after this many requests (-1 = disabled)")
    args = parser.parse_args()

    if not args.script.exists():
        print(f"Script not found: {args.script}", file=sys.stderr)
        return 1

    with args.script.open("r", encoding="utf-8") as fh:
        script = json.load(fh)
    if not isinstance(script, list):
        print(f"Script must be a JSON list of actions: {args.script}", file=sys.stderr)
        return 1

    MockLLMHandler.script = script
    MockLLMHandler.turns = _parse_script_into_turns(script)
    # argparse derives the dest "429_after" from "--429-after"; it is not a
    # valid identifier, hence getattr instead of attribute access.
    MockLLMHandler.after_429 = getattr(args, "429_after")
    MockLLMHandler.after_500 = getattr(args, "500_after")
    MockLLMHandler.request_count = 0

    with socketserver.ThreadingTCPServer((args.host, args.port), MockLLMHandler) as httpd:
        actual_port = httpd.server_address[1]
        print(f"MockLLM serving on http://{args.host}:{actual_port} (script: {args.script})")
        sys.stdout.flush()
        try:
            httpd.serve_forever()
        except KeyboardInterrupt:
            pass
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
"content": "Hello world!"}, + {"type": "done"} +] diff --git a/tools/mock_llm_scripts/comprehensive.json b/tools/mock_llm_scripts/comprehensive.json new file mode 100644 index 0000000..7063ec3 --- /dev/null +++ b/tools/mock_llm_scripts/comprehensive.json @@ -0,0 +1,16 @@ +[ + {"type": "text", "content": "I'll read files."}, + {"type": "tool_call", "id": "call_1", "name": "read", "arguments": {"filePath": "README.md"}}, + {"type": "tool_call", "id": "call_2", "name": "read", "arguments": {"filePath": "AGENTS.md"}}, + {"type": "text", "content": "Let me search."}, + {"type": "tool_call", "id": "call_3", "name": "glob", "arguments": {"pattern": "src/**/*.c"}}, + {"type": "tool_call", "id": "call_4", "name": "grep", "arguments": {"pattern": "main", "path": "src"}}, + {"type": "text", "content": "Now I'll write a file."}, + {"type": "tool_call", "id": "call_5", "name": "write", "arguments": {"filePath": "tmp/parity-test.txt", "content": "original\n"}}, + {"type": "text", "content": "Running a command."}, + {"type": "tool_call", "id": "call_7", "name": "bash", "arguments": {"command": "echo hello", "description": "Say hello"}}, + {"type": "text", "content": "Creating todos."}, + {"type": "tool_call", "id": "call_8", "name": "todowrite", "arguments": {"todos": [{"content":"test","status":"completed","priority":"high"}]}}, + {"type": "text", "content": "Done!"}, + {"type": "done"} +] \ No newline at end of file diff --git a/tools/mock_llm_scripts/internal_error.json b/tools/mock_llm_scripts/internal_error.json new file mode 100644 index 0000000..2936d55 --- /dev/null +++ b/tools/mock_llm_scripts/internal_error.json @@ -0,0 +1,6 @@ +[ + {"type": "text", "content": "Starting work."}, + {"type": "done"}, + {"type": "text", "content": "Continuing."}, + {"type": "done"} +] \ No newline at end of file diff --git a/tools/mock_llm_scripts/rate_limit_retry.json b/tools/mock_llm_scripts/rate_limit_retry.json new file mode 100644 index 0000000..4f249db --- /dev/null +++ 
b/tools/mock_llm_scripts/rate_limit_retry.json @@ -0,0 +1,7 @@ +[ + {"type": "text", "content": "I'll try that."}, + {"type": "done"}, + {"type": "text", "content": "Actually let me look that up."}, + {"type": "tool_call", "id": "call_1", "name": "grep", "arguments": {"pattern": "main", "path": "src"}}, + {"type": "done"} +] \ No newline at end of file diff --git a/tools/mock_llm_scripts/with_permission.json b/tools/mock_llm_scripts/with_permission.json new file mode 100644 index 0000000..23d17d5 --- /dev/null +++ b/tools/mock_llm_scripts/with_permission.json @@ -0,0 +1,6 @@ +[ + {"type": "text", "content": "Reading secret file."}, + {"type": "tool_call", "id": "call_1", "name": "read", "arguments": {"filePath": "secret.env"}}, + {"type": "text", "content": "Permission denied, moving on."}, + {"type": "done"} +] diff --git a/tools/mock_llm_scripts/with_permission_multi.json b/tools/mock_llm_scripts/with_permission_multi.json new file mode 100644 index 0000000..b266eb2 --- /dev/null +++ b/tools/mock_llm_scripts/with_permission_multi.json @@ -0,0 +1,8 @@ +[ + {"type": "text", "content": "Reading secret file."}, + {"type": "tool_call", "id": "call_1", "name": "read", "arguments": {"filePath": "secret.env"}}, + {"type": "text", "content": "Permission denied. 
Let me read allowed file."}, + {"type": "tool_call", "id": "call_2", "name": "read", "arguments": {"filePath": "README.md"}}, + {"type": "text", "content": "Done."}, + {"type": "done"} +] \ No newline at end of file diff --git a/tools/mock_llm_scripts/with_tool.json b/tools/mock_llm_scripts/with_tool.json new file mode 100644 index 0000000..4229306 --- /dev/null +++ b/tools/mock_llm_scripts/with_tool.json @@ -0,0 +1,6 @@ +[ + {"type": "text", "content": "Reading a file."}, + {"type": "tool_call", "id": "call_1", "name": "read", "arguments": {"filePath": "README.md"}}, + {"type": "text", "content": "Done reading."}, + {"type": "done"} +] diff --git a/tools/opencode/__init__.py b/tools/opencode/__init__.py new file mode 100644 index 0000000..421c046 --- /dev/null +++ b/tools/opencode/__init__.py @@ -0,0 +1,23 @@ +# Copyright (C) 2025-2026 Pablo Ruiz García +# SPDX-License-Identifier: GPL-3.0-or-later OR AGPL-3.0-or-later + +""" +Manage opencode serve lifecycle (start, stop, health check). + +Usage as a module: + from opencode.serve import ServerRunner + runner = ServerRunner() + info = runner.start(port=0, hostname="127.0.0.1", log_level="WARN") + ... + runner.stop() + +Convenience CLI: + python -m opencode.serve start --port 8080 --log-level DEBUG + python -m opencode.serve stop --pid 12345 +""" + +from __future__ import annotations + +from opencode.serve import ServerRunner, ServerInfo, ServerRunnerError + +__all__ = ["ServerRunner", "ServerInfo", "ServerRunnerError"] diff --git a/tools/opencode/serve.py b/tools/opencode/serve.py new file mode 100644 index 0000000..08e8b15 --- /dev/null +++ b/tools/opencode/serve.py @@ -0,0 +1,333 @@ +#!/usr/bin/env python3 +# Copyright (C) 2025-2026 Pablo Ruiz García +# SPDX-License-Identifier: GPL-3.0-or-later OR AGPL-3.0-or-later + +""" +Manage opencode serve lifecycle (start, stop, health check). 
def _find_free_port(hostname: str = "127.0.0.1") -> int:
    """Return a TCP port that was free on *hostname* when probed.

    The probing socket is closed before returning, so another process can take
    the port before the caller binds it.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        probe.bind((hostname, 0))
        return int(probe.getsockname()[1])


def _build_log_path() -> Path:
    """Create ``ROOT/tmp`` if needed and return a log path named by pid and time."""
    tmp_dir = ROOT / "tmp"
    tmp_dir.mkdir(parents=True, exist_ok=True)
    return tmp_dir / f"opencode-serve-{os.getpid()}-{int(time.time())}.log"


@dataclasses.dataclass(frozen=True)
class ServerInfo:
    """Immutable snapshot of a running opencode serve instance."""

    proc: subprocess.Popen[Any]
    pid: int
    base_url: str
    port: int
    log_path: Path
    password: str


class ServerRunnerError(Exception):
    """Raised when the server cannot be started or reached."""


def _basic_auth_headers(auth_token: Optional[str]) -> dict:
    """Return a Basic auth header for user ``opencode``, or ``{}`` without a token."""
    if not auth_token:
        return {}
    import base64  # local import: only this helper needs it

    encoded = base64.b64encode(f"opencode:{auth_token}".encode("utf-8")).decode("utf-8")
    return {"Authorization": f"Basic {encoded}"}


def _try_fetch_json(url: str, timeout: float, auth_token: Optional[str] = None) -> Optional[dict]:
    """Best-effort GET returning parsed JSON, or ``None`` on any failure."""
    try:
        req = urllib.request.Request(url, headers=_basic_auth_headers(auth_token), method="GET")
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return json.loads(resp.read().decode("utf-8"))
    except Exception:  # noqa: BLE001 - used for health polling, failures are expected
        return None


def _send_json(method: str, base_url: str, path: str, payload: dict, auth_token: Optional[str]) -> dict:
    """Send *payload* as JSON with *method*; return the parsed reply or ``{}`` if empty."""
    headers = {"Content-Type": "application/json"}
    headers.update(_basic_auth_headers(auth_token))
    req = urllib.request.Request(
        f"{base_url}{path}",
        data=json.dumps(payload).encode("utf-8"),
        headers=headers,
        method=method,
    )
    with urllib.request.urlopen(req, timeout=30.0) as resp:
        body = resp.read().decode("utf-8")
    return json.loads(body) if body else {}


def _post_json(base_url: str, path: str, payload: dict, *, auth_token: Optional[str] = None) -> dict:
    """POST JSON and return the parsed JSON response.

    Pass *auth_token* (``ServerInfo.password``) for a password-protected server.
    """
    return _send_json("POST", base_url, path, payload, auth_token)


def _patch_json(base_url: str, path: str, payload: dict, *, auth_token: Optional[str] = None) -> dict:
    """PATCH JSON and return the parsed JSON response.

    Pass *auth_token* (``ServerInfo.password``) for a password-protected server.
    """
    return _send_json("PATCH", base_url, path, payload, auth_token)


class ServerRunner:
    """Spawn and manage a local opencode serve process."""

    # Attempts made to get a healthy server before giving up.
    _START_ATTEMPTS = 3
    # Number of trailing log lines quoted in the startup-failure message.
    _LOG_TAIL_LINES = 30

    _info: "Optional[ServerInfo]" = None

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def start(
        self,
        *,
        hostname: str = "127.0.0.1",
        port: Optional[int] = None,
        log_level: str = "WARN",
        cwd: Optional[Path] = None,
    ) -> "ServerInfo":
        """Start opencode serve and return its ``ServerInfo`` once healthy.

        A free ephemeral port is picked when *port* is ``None`` or 0.  Server
        stdout/stderr go to a log file instead of a pipe, so an unread pipe
        can never block the child.  A random password is generated and passed
        to the server through ``OPENCODE_SERVER_PASSWORD``.

        Raises:
            ServerRunnerError: if already started, if the process cannot be
                spawned, or if no attempt becomes healthy.
        """
        if self._info is not None:
            raise ServerRunnerError("Server already started")

        password = secrets.token_hex(16)
        log_path = _build_log_path()
        env = dict(os.environ)
        env["OPENCODE_SERVER_PASSWORD"] = password
        base_cmd = [
            "opencode", "serve",
            "--hostname", hostname,
            "--log-level", log_level,
        ]

        last_err: Optional[Exception] = None
        for _attempt in range(self._START_ATTEMPTS):
            # A fresh port per attempt covers the probe/bind race.
            actual_port = port if port else _find_free_port(hostname)
            proc = self._spawn(base_cmd + ["--port", str(actual_port)], log_path, cwd or ROOT, env)
            base_url = f"http://{hostname}:{actual_port}"
            try:
                last_err = self._await_health(proc, f"{base_url}/global/health", password)
            except BaseException:
                # Do not orphan the child on Ctrl-C or an unexpected error.
                self._kill(proc)
                raise
            if last_err is None:
                self._info = ServerInfo(
                    proc=proc,
                    pid=proc.pid,
                    base_url=base_url,
                    port=actual_port,
                    log_path=log_path,
                    password=password,
                )
                return self._info
            # This attempt failed: make sure the process is gone, then retry.
            self._kill(proc)

        raise ServerRunnerError(
            f"opencode serve failed to start after {self._START_ATTEMPTS} attempts. Last error: {last_err}. "
            f"Log file: {log_path}\n"
            f"Last lines:\n{self._read_log_tail(log_path) or '(empty)'}"
        )

    def stop(self) -> None:
        """Gracefully stop the server; no-op if not started."""
        info = self._info
        if info is None:
            return
        self._kill(info.proc)
        self._info = None

    @property
    def info(self) -> "Optional[ServerInfo]":
        """``ServerInfo`` of the running server, or ``None`` when not started."""
        return self._info

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _spawn(cmd: list, log_path: Path, cwd: Path, env: dict) -> subprocess.Popen[Any]:
        """Launch *cmd* in its own session with output appended to *log_path*."""
        try:
            # The child inherits its own copy of the descriptor, so the
            # parent's handle can be closed as soon as Popen returns.
            with log_path.open("a") as log_file:
                return subprocess.Popen(
                    cmd,
                    stdout=log_file,
                    stderr=subprocess.STDOUT,
                    cwd=cwd,
                    env=env,
                    start_new_session=True,
                )
        except FileNotFoundError:
            raise ServerRunnerError(
                "opencode command not found. Is OpenCode installed and in PATH?"
            ) from None
        except OSError as exc:
            raise ServerRunnerError(
                f"Failed to start opencode serve: {exc}"
            ) from exc

    @staticmethod
    def _await_health(proc: subprocess.Popen[Any], health_url: str, password: str) -> Optional[Exception]:
        """Poll *health_url* until healthy; return ``None`` on success, else the reason."""
        # Monotonic clock: the deadline must not move with wall-clock changes.
        deadline = time.monotonic() + _HEALTH_TIMEOUT_S
        while time.monotonic() < deadline:
            if proc.poll() is not None:
                return ServerRunnerError(f"opencode serve exited early (exit code {proc.returncode}).")
            data = _try_fetch_json(health_url, timeout=2.0, auth_token=password)
            if isinstance(data, dict) and data.get("healthy") is True:
                return None
            time.sleep(_HEALTH_INTERVAL_S)
        return ServerRunnerError(
            f"opencode serve did not report healthy within {_HEALTH_TIMEOUT_S:.0f}s."
        )

    @classmethod
    def _read_log_tail(cls, log_path: Path) -> str:
        """Return the last lines of *log_path*, or ``""`` if it cannot be read."""
        try:
            with open(log_path, "r", errors="replace") as fh:
                return "".join(fh.readlines()[-cls._LOG_TAIL_LINES:])
        except OSError:
            return ""

    @staticmethod
    def _kill(proc: subprocess.Popen[Any]) -> None:
        """Send SIGTERM to the process group, wait, then SIGKILL if still alive."""
        try:
            if proc.poll() is None:
                try:
                    # The child was started with start_new_session=True, so
                    # its pid is also its process-group id.
                    os.killpg(proc.pid, signal.SIGTERM)
                except ProcessLookupError:
                    proc.terminate()
                try:
                    proc.wait(timeout=_GRACEFUL_SHUTDOWN_S)
                except subprocess.TimeoutExpired:
                    try:
                        os.killpg(proc.pid, signal.SIGKILL)
                    except ProcessLookupError:
                        proc.kill()
                    proc.wait()
        except ProcessLookupError:
            pass


# ------------------------------------------------------------------
# Convenience CLI (not the primary entry point)
# ------------------------------------------------------------------

def _cli() -> int:
    """Run the ``start``/``stop`` convenience CLI and return an exit code."""
    parser = argparse.ArgumentParser(
        description="Convenience CLI for opencode serve management"
    )
    sub = parser.add_subparsers(dest="cmd", required=True)

    start_p = sub.add_parser("start", help="Start opencode serve")
    start_p.add_argument("--port", type=int, default=None)
    start_p.add_argument("--hostname", default="127.0.0.1")
    start_p.add_argument("--log-level", default="WARN")
    start_p.add_argument("--cwd", type=Path, default=ROOT)

    stop_p = sub.add_parser("stop", help="Stop opencode serve by PID")
    stop_p.add_argument("--pid", type=int, required=True)

    args = parser.parse_args()

    if args.cmd == "start":
        runner = ServerRunner()
        try:
            info = runner.start(
                hostname=args.hostname,
                port=args.port,
                log_level=args.log_level,
                cwd=args.cwd,
            )
        except ServerRunnerError as exc:
            print(f"Error: {exc}", file=sys.stderr)
            return 1
        print(f"Server running at {info.base_url} (pid={info.pid})")
        print(f"Log file: {info.log_path}")
        print("Press Ctrl-C to stop...")
        try:
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            pass
        finally:
            runner.stop()
            print("Server stopped.")
        return 0

    if args.cmd == "stop":
        try:
            os.kill(args.pid, signal.SIGTERM)
        except ProcessLookupError:
            print(f"No process with pid {args.pid}", file=sys.stderr)
            return 1
        except PermissionError:
            print(f"Not permitted to signal pid {args.pid}", file=sys.stderr)
            return 1
        print(f"Sent SIGTERM to pid {args.pid}")
        return 0

    return 1


if __name__ == "__main__":
    raise SystemExit(_cli())
-Minimum supported OpenCode version: 1.14.39 +Minimum supported OpenCode version: 1.14.50 """ from __future__ import annotations import argparse +import dataclasses import difflib import json import os @@ -19,16 +20,22 @@ import signal import subprocess import sys +import threading import time +import traceback +import urllib.error +import urllib.request +from collections import OrderedDict from dataclasses import dataclass from functools import lru_cache -from collections import OrderedDict from pathlib import Path from typing import Any sys.path.insert(0, str(Path(__file__).resolve().parent)) import _colors as C +from opencode.serve import ServerRunner, ServerRunnerError +from events import EventLoop, RunResult try: from rich.console import Console, Group @@ -50,7 +57,7 @@ HAVE_RICH = False ROOT = Path(__file__).resolve().parents[1] -MINIMUM_OPENCODE_VERSION = "1.14.39" +MINIMUM_OPENCODE_VERSION = "1.14.50" def check_opencode_version() -> None: @@ -674,6 +681,7 @@ def render_permission_error_plain(message: str) -> None: _GREP_HIGHLIGHT = os.environ.get("CODECOME_GREP_HIGHLIGHT", "1") not in ("0", "false", "False", "no") _REASONING_MAX_CHARS = int(os.environ.get("CODECOME_REASONING_MAX_CHARS", "4000")) _RENDER_REASONING = os.environ.get("CODECOME_RENDER_REASONING", "1") not in ("0", "false", "False", "no") +_DEBUG_UNKNOWN_EVENTS = os.environ.get("CODECOME_DEBUG_UNKNOWN_EVENTS", "0") not in ("", "0", "false", "False", "no") _SANDBOX_RENDER = os.environ.get("CODECOME_SANDBOX_RENDER", "1") not in ("0", "false", "False", "no") _SANDBOX_VALIDATE_STDERR_LINES = int(os.environ.get("CODECOME_SANDBOX_VALIDATE_STDERR_LINES", "20")) _SANDBOX_FILES_CAP = int(os.environ.get("CODECOME_SANDBOX_FILES_CAP", "15")) @@ -3861,11 +3869,63 @@ def render_step_finish(console: Console, event: dict[str, Any]) -> None: def render_unknown(console: Console, event: dict[str, Any]) -> None: - message = f"unknown event type: {event.get('type', '')}" + event_type = event.get("type", "") + # For 
message.part.updated, surface the actual unknown part type. + if event_type == "message.part.updated": + part_type = event.get("part", {}).get("type", "") + message = f"unknown part type: {part_type}" + else: + message = f"unknown event type: {event_type}" if HAVE_RICH: console.print(Text(message, style="dim")) else: print(message) + if _DEBUG_UNKNOWN_EVENTS: + payload = json.dumps(event, indent=2, default=str) + if HAVE_RICH: + console.print(Text(payload, style="dim")) + else: + print(payload) + + +def render_server_connected(console: Console, event: dict[str, Any]) -> None: + message = "connected to opencode event stream" + if HAVE_RICH: + console.print(Text(message, style="dim")) + else: + print(C.info(message)) + + +def render_server_heartbeat(console: Console, event: dict[str, Any]) -> None: + message = "server heartbeat" + if HAVE_RICH: + console.print(Text(message, style="dim")) + else: + print(C.info(message)) + + +def render_session_diff(console: Console, event: dict[str, Any]) -> None: + properties = event.get("properties", {}) + diff = properties.get("diff", []) + if not isinstance(diff, list) or not diff: + return + count = len(diff) + message = f"session diff updated: {count} file{'s' if count != 1 else ''}" + if HAVE_RICH: + console.print(Text(message, style="dim")) + else: + print(C.info(message)) + + +def render_message_updated(console: Console, event: dict[str, Any]) -> None: + info = event.get("info", {}) if isinstance(event.get("info"), dict) else {} + agent = str(info.get("agent", "assistant")) + model_id = str(info.get("modelID", info.get("model", ""))) + message = f"> {agent} · {model_id}" if model_id else f"> {agent}" + if HAVE_RICH: + console.print(Text(message, style="bold blue")) + else: + print(C.header(message)) def render_error(console: Console, event: dict[str, Any]) -> None: @@ -3911,7 +3971,13 @@ def render_error(console: Console, event: dict[str, Any]) -> None: def render_event(console: Console, phase: str, label: str, event: 
dict[str, Any]) -> None: event_type = event.get("type") - if event_type == "step_start": + if event_type == "server.connected": + render_server_connected(console, event) + elif event_type == "server.heartbeat": + render_server_heartbeat(console, event) + elif event_type == "message.updated": + render_message_updated(console, event) + elif event_type == "step_start": render_step_start(console, phase, label, event) elif event_type == "text": render_text(console, event) @@ -3925,6 +3991,8 @@ def render_event(console: Console, phase: str, label: str, event: dict[str, Any] render_error(console, event) elif event_type == "session.status": render_session_status(console, event) + elif event_type == "session.diff": + render_session_diff(console, event) elif event_type == "subagent.status": render_subagent_status(console, event) else: @@ -3944,6 +4012,18 @@ def render_session_status(console: Console, event: dict[str, Any]) -> None: console.print(Text(text, style="bold yellow")) else: print(C.warn(text)) + elif status_type == "busy": + text = "session status: busy" + if HAVE_RICH: + console.print(Text(text, style="dim")) + else: + print(C.info(text)) + elif status_type == "idle": + text = "session status: idle" + if HAVE_RICH: + console.print(Text(text, style="dim")) + else: + print(C.info(text)) def render_subagent_status(console: Console, event: dict[str, Any]) -> None: @@ -4095,48 +4175,6 @@ def _resolve_thinking_decision( return enabled, "provider-default" -def build_child_command(args: argparse.Namespace) -> tuple[list[str], Optional[str], Optional[str], str, str, bool, str]: - """Return the child command and the resolved model/variant + sources. - - Appends --model/--variant from env or codecome.yml only when - OPENCODE_ARGS does not already pass them. Also resolves the - --thinking decision per provider and returns it for the banner. 
- """ - cmd = ["opencode", "run", "--format", "json", "--agent", args.agent] - - extra_args = shlex.split(os.environ.get("OPENCODE_ARGS", "")) - cmd.extend(extra_args) - - model, variant, model_source, variant_source = resolve_model_and_variant( - args.agent, extra_args - ) - - # Append --model/--variant to enforce env/yaml-resolved values when - # OPENCODE_ARGS did not already pass them. OPENCODE_ARGS (and any - # earlier --model/-m/--variant in cmd) always wins because we never - # touch values that came from there. Discovered defaults - # ('opencode session history') are display-only and are NOT - # enforced — opencode will pick its own default anyway, and - # forcing it would surprise users when they switch models in the - # TUI between phases. - _ENFORCING_SOURCES = {"env CODECOME_MODEL", "codecome.yml"} - _ENFORCING_VARIANT_SOURCES = {"env CODECOME_MODEL_VARIANT", "codecome.yml"} - - if model and model_source in _ENFORCING_SOURCES: - cmd.extend(["--model", model]) - if variant and variant_source in _ENFORCING_VARIANT_SOURCES: - cmd.extend(["--variant", variant]) - - # Decide --thinking based on env override, OPENCODE_ARGS, or - # per-provider default. Skip appending if it was already in - # extra_args (already added via cmd.extend(extra_args) above). 
- thinking_on, thinking_source = _resolve_thinking_decision(model, extra_args) - if thinking_on and "--thinking" not in extra_args: - cmd.append("--thinking") - - return cmd, model, variant, model_source, variant_source, thinking_on, thinking_source - - def resolve_runtime_model_for_banner( args: argparse.Namespace, command: list[str], @@ -4409,6 +4447,204 @@ def check_phase_graceful_completion(phase: str, finding: str | None, run_start_t pass return False +def _get_headers(auth_token: str | None, workspace_dir: str | None) -> dict[str, str]: + headers = {"Content-Type": "application/json"} + if auth_token: + import base64 + encoded = base64.b64encode(f"opencode:{auth_token}".encode("utf-8")).decode("utf-8") + headers["Authorization"] = f"Basic {encoded}" + if workspace_dir: + headers["x-opencode-directory"] = workspace_dir + return headers + +def _send_prompt_to_session( + base_url: str, + session_id: str, + prompt: str, + agent: str, + model: str | None, + variant: str | None, + auth_token: str | None, + workspace_dir: str | None, +) -> None: + """Send a prompt text to a session via POST /session/{id}/prompt_async.""" + url = f"{base_url}/session/{session_id}/prompt_async" + payload: dict[str, Any] = { + "parts": [{"type": "text", "text": prompt}], + "agent": agent, + } + if model: + parts = model.split("/", 1) + if len(parts) == 2: + payload["model"] = {"providerID": parts[0], "modelID": parts[1]} + else: + payload["model"] = {"modelID": model} + if variant: + payload["variant"] = variant + data = json.dumps(payload).encode("utf-8") + req = urllib.request.Request( + url, + data=data, + headers=_get_headers(auth_token, workspace_dir), + method="POST", + ) + try: + with urllib.request.urlopen(req, timeout=30.0) as resp: + pass # 204 expected + except urllib.error.HTTPError as exc: + raise RuntimeError(f"Failed to send prompt: HTTP {exc.code}") from exc + + +def _create_session(base_url: str, phase: str, agent: str, model: str | None, auth_token: str | None, 
workspace_dir: str | None) -> str: + """Create a session via POST /session and return its ID.""" + payload: dict[str, Any] = {"title": f"CodeCome Phase {phase}", "agent": agent} + if model: + parts = model.split("/", 1) + if len(parts) == 2: + payload["model"] = {"providerID": parts[0], "id": parts[1]} + else: + payload["model"] = {"id": model} + req = urllib.request.Request( + f"{base_url}/session", + data=json.dumps(payload).encode("utf-8"), + headers=_get_headers(auth_token, workspace_dir), + method="POST", + ) + resp = urllib.request.urlopen(req, timeout=10.0) + data = json.loads(resp.read().decode("utf-8")) + sid = str(data.get("id", "")) + if not sid: + raise RuntimeError("Server returned empty session ID") + return sid + + +def _consume_events( + base_url: str, + session_id: str, + console: Any, + phase: str, + label: str, + args: argparse.Namespace, + transcript_fp: Any | None, + thinking_on: bool, + auth_token: str | None, + workspace_dir: str | None, +) -> RunResult: + """Create an EventLoop, consume SSE until idle, and return RunResult.""" + event_loop = EventLoop( + base_url=base_url, + session_id=session_id, + console=console, + phase=phase, + label=label, + auth_token=auth_token, + workspace_dir=workspace_dir, + ) + + def _render_and_log(console_: Any, phase_: str, label_: str, event: dict[str, Any]) -> None: + if transcript_fp is not None: + try: + transcript_fp.write(json.dumps(event) + "\n") + except OSError: + pass + if args.debug: + sys.stderr.write(json.dumps(event) + "\n") + sys.stderr.flush() + if not thinking_on and event.get("type") == "reasoning": + return + render_event(console_, phase_, label_, event) + + return event_loop.run(_render_and_log) + + +def _run_single_attempt( + args: argparse.Namespace, + console: Any, + prompt: str, + model: str | None, + variant: str | None, + thinking_on: bool, + base_url: str, + auth_token: str | None, + workspace_dir: str | None, + existing_session_id: str | None = None, +) -> tuple[int, str, RunResult, 
Path]: + """Run or resume a single phase attempt via opencode serve. + + If existing_session_id is provided, reuses that session (resume). + Otherwise creates a new session. + + Returns (returncode, session_id, run_result, transcript_path). + """ + finding_tag = (args.finding or "no-finding").replace("/", "_") + transcript_dir = ROOT / "tmp" + transcript_dir.mkdir(parents=True, exist_ok=True) + + # Use a module-level counter for attempt numbers across resume attempts. + counter = getattr(_run_single_attempt, "_attempt_counter", 1) + transcript_path = transcript_dir / f"last-phase-{args.phase}-{finding_tag}-attempt-{counter}.jsonl" + setattr(_run_single_attempt, "_attempt_counter", counter + 1) + + transcript_fp = None + try: + transcript_fp = transcript_path.open("w", encoding="utf-8") + except OSError as exc: + if HAVE_RICH: + console.print(Text(f"warning: could not open transcript {transcript_path}: {exc}", style="yellow")) + else: + print(C.warn(f"warning: could not open transcript {transcript_path}: {exc}")) + + try: + if existing_session_id: + session_id = existing_session_id + else: + session_id = _create_session(base_url, str(args.phase), args.agent, model, auth_token, workspace_dir) + + run_result_box: dict[str, Any] = {} + consume_error_box: dict[str, Exception] = {} + + def _consume() -> None: + try: + run_result_box["result"] = _consume_events( + base_url, + session_id, + console, + str(args.phase), + str(args.label), + args, + transcript_fp, + thinking_on, + auth_token, + workspace_dir, + ) + except Exception as exc: # noqa: BLE001 + consume_error_box["error"] = exc + + consumer = threading.Thread(target=_consume, name=f"codecome-events-{session_id}", daemon=True) + consumer.start() + + _send_prompt_to_session(base_url, session_id, prompt, args.agent, model, variant, auth_token, workspace_dir) + consumer.join() + + if "error" in consume_error_box: + raise consume_error_box["error"] + run_result = run_result_box.get("result") + if not 
isinstance(run_result, RunResult): + raise RuntimeError("Event loop ended without a RunResult") + except Exception as exc: + _emit_fatal_error(console, "Server Error", str(exc)) + return 1, existing_session_id or "", RunResult(), transcript_path + finally: + if transcript_fp is not None: + try: + transcript_fp.flush() + transcript_fp.close() + except OSError: + pass + + return 0, session_id, run_result, transcript_path + + def show_model_table(agent_name: str) -> int: """Print the model-resolution table for an agent and exit.""" extra_args = shlex.split(os.environ.get("OPENCODE_ARGS", "")) @@ -4448,6 +4684,14 @@ def fmt(v: Optional[str]) -> str: return 0 +def _emit_fatal_error(console: Any, title: str, message: str) -> None: + """Show fatal startup/runtime errors in the UI and on stderr.""" + formatted = C.fail(f"{title}: {message}") + if HAVE_RICH: + console.print(Panel(Text(message, style="red"), title=title, border_style="red")) + print(formatted, file=sys.stderr) + + def main() -> int: RUN_START_TIME = time.time() iteration_retry_count = 0 @@ -4485,30 +4729,17 @@ def main() -> int: console = build_console(color_mode) prompt_file = ROOT / args.prompt_file prompt = load_prompt(prompt_file, args.finding, phase=args.phase) - command, model, variant, model_source, variant_source, thinking_on, thinking_source = build_child_command(args) - base_command = list(command) - model, variant, model_source, variant_source = resolve_runtime_model_for_banner( - args, command, model, variant, model_source, variant_source + # Model resolution is still needed for banner display. + extra_args = shlex.split(os.environ.get("OPENCODE_ARGS", "")) + model, variant, model_source, variant_source = resolve_model_and_variant( + args.agent, extra_args ) - - # If the runtime probe revealed a different provider and the user - # didn't explicitly pin --thinking via env or OPENCODE_ARGS, redo - # the per-provider decision now. This keeps the banner truthful. 
- if thinking_source == "provider-default": - new_thinking_on, _ = _resolve_thinking_decision(model, shlex.split(os.environ.get("OPENCODE_ARGS", ""))) - if new_thinking_on != thinking_on: - # Adjust the command for the late discovery: add or remove --thinking. - if new_thinking_on and "--thinking" not in command: - command.append("--thinking") - elif not new_thinking_on and "--thinking" in command: - command.remove("--thinking") - thinking_on = new_thinking_on + thinking_on, thinking_source = _resolve_thinking_decision(model, extra_args) model_label = model or "(unknown)" variant_label = variant or "(unknown)" - # Build the single-line banner. Order: agent model variant? prompt - # followed by a trailing parenthetical with the resolution source(s). + # Build the single-line banner. parts = [f"agent={args.agent}", f"model={model_label}"] if variant is not None: parts.append(f"variant={variant_label}") @@ -4549,151 +4780,63 @@ def main() -> int: print(C.warn("rich is not installed; using plain structured output fallback")) attempt_number = 0 - while True: - attempt_number += 1 - process: subprocess.Popen[str] | None = None - interrupted = False - - def forward_signal(signum: int, _frame: Any) -> None: - nonlocal interrupted - interrupted = True - if process is not None and process.poll() is None: - try: - os.killpg(process.pid, signum) - except ProcessLookupError: - pass - - previous_sigint = signal.signal(signal.SIGINT, forward_signal) - previous_sigterm = signal.signal(signal.SIGTERM, forward_signal) - - try: - env = os.environ.copy() - env["_CODECOME_INSIDE_HARNESS"] = "1" - process = subprocess.Popen( - command, - cwd=ROOT, - env=env, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=None, - text=True, - bufsize=1, - preexec_fn=os.setsid, - ) + last_session_id: str = "" + last_finish_reason: Optional[str] = None + last_finish_tokens: dict[str, Any] = {} + last_permission_error: Optional[str] = None + any_step_finish_seen = False + step_finish_count = 0 
+ transcript_path: Path = Path() + + # Signal to local opencode plugins (e.g. status-forwarder) that we are + # running inside the run-agent harness. + os.environ["_CODECOME_INSIDE_HARNESS"] = "1" + + # Start the server once for this phase + runner = ServerRunner() + server_info: Any = None + try: + server_info = runner.start(hostname="127.0.0.1", log_level="WARN") + except ServerRunnerError as exc: + _emit_fatal_error(console, "Server Error", str(exc)) + return 1 - assert process.stdin is not None - process.stdin.write(prompt) - process.stdin.close() - - assert process.stdout is not None - late_model_announced = False - finding_tag = (args.finding or "no-finding").replace("/", "_") - transcript_dir = ROOT / "tmp" - transcript_dir.mkdir(parents=True, exist_ok=True) - transcript_path = transcript_dir / f"last-phase-{args.phase}-{finding_tag}-attempt-{attempt_number}.jsonl" - last_finish_reason: Optional[str] = None - last_finish_tokens: dict[str, Any] = {} - last_permission_error: Optional[str] = None - any_step_finish_seen = False - step_finish_count = 0 - stream_session_id: Optional[str] = None - - try: - transcript_fp: Optional[Any] = transcript_path.open("w", encoding="utf-8") - except OSError as exc: - transcript_fp = None - if HAVE_RICH: - console.print(Text(f"warning: could not open transcript {transcript_path}: {exc}", style="yellow")) - else: - print(C.warn(f"warning: could not open transcript {transcript_path}: {exc}")) + base_url = server_info.base_url + # Forward Ctrl+C / SIGTERM to the server process group so children die too. 
+ def _forward_signal(signum: int, _frame: Any) -> None: + info = runner.info + if info is not None: try: - for raw_line in process.stdout: - if transcript_fp is not None: - try: - transcript_fp.write(raw_line) - except OSError: - pass - line = raw_line.strip() - if not line: - continue - if args.debug: - sys.stderr.write(line + "\n") - sys.stderr.flush() - try: - event = json.loads(line) - if stream_session_id is None and "sessionID" in event: - stream_session_id = event["sessionID"] - except json.JSONDecodeError: - if args.debug: - sys.stderr.write(f"json-parse-error: {line}\n") - sys.stderr.flush() - continue + os.killpg(info.pid, signum) + except ProcessLookupError: + pass + signal.signal(signum, signal.SIG_DFL) + os.kill(os.getpid(), signum) - if not late_model_announced: - discovered_in_stream = _scan_event_for_model(event) - if discovered_in_stream and discovered_in_stream != model: - late_model_announced = True - msg = ( - f"[model resolved from stream] {discovered_in_stream} " - f"(banner showed {model_label})" - ) - if HAVE_RICH: - console.print(Text(msg, style="yellow")) - else: - print(C.warn(msg)) - - if event.get("type") == "step_finish": - any_step_finish_seen = True - step_finish_count += 1 - part = event.get("part") or {} - reason = part.get("reason") - if isinstance(reason, str): - last_finish_reason = reason - tokens = part.get("tokens") - if isinstance(tokens, dict): - last_finish_tokens = tokens - - permission_error = _extract_tool_permission_error(event) - if permission_error is not None: - last_permission_error = permission_error - if HAVE_RICH and console is not None: - render_permission_error_rich(console, permission_error) - else: - render_permission_error_plain(permission_error) - - render_event(console, args.phase, args.label, event) - finally: - if transcript_fp is not None: - try: - transcript_fp.flush() - transcript_fp.close() - except OSError: - pass - - process.wait() - returncode = process.returncode - except Exception as exc: - if 
HAVE_RICH: - console.print(Panel(Text(str(exc), style="red"), title="Wrapper Error", border_style="red")) - else: - print(C.fail(str(exc)), file=sys.stderr) - return 1 - finally: - signal.signal(signal.SIGINT, previous_sigint) - signal.signal(signal.SIGTERM, previous_sigterm) + previous_sigint = signal.signal(signal.SIGINT, _forward_signal) + previous_sigterm = signal.signal(signal.SIGTERM, _forward_signal) - if returncode is None: - returncode = 1 + try: + while True: + attempt_number += 1 + returncode, session_id, run_result, transcript_path = _run_single_attempt( + args, console, prompt, model, variant, thinking_on, base_url, + server_info.password, str(ROOT), + existing_session_id=last_session_id or None + ) - if returncode < 0: - returncode = 128 + abs(returncode) + if returncode != 0: + break - if interrupted and returncode == 0: - returncode = 130 + last_session_id = session_id + last_finish_reason = run_result.last_finish_reason + last_finish_tokens = run_result.last_finish_tokens + last_permission_error = run_result.last_permission_error + any_step_finish_seen = run_result.any_step_finish_seen + step_finish_count = run_result.step_finish_count - finish_warning: Optional[str] = None - if returncode == 0: + finish_warning: Optional[str] = None if not any_step_finish_seen: finish_warning = ( "CodeCome observed no step_finish events in the JSON stream, so the model/provider did not emit a " @@ -4727,141 +4870,106 @@ def forward_signal(signum: int, _frame: Any) -> None: "the run as incomplete rather than assuming success." ) - if finish_warning is not None and returncode == 0: - if ( - last_finish_reason in _FINISH_MID_TURN - and last_permission_error is None - and check_phase_graceful_completion(args.phase, args.finding, RUN_START_TIME) - ): - msg = ( - f"CodeCome observed a mid-turn model/provider cutoff for Phase {args.phase} after {step_finish_count} " - "completed loops, but the required durable artifacts were already written. 
Treating the phase as complete." - ) - if HAVE_RICH: - console.print(Text(msg, style="bold green")) + if finish_warning is not None: + if ( + last_finish_reason in _FINISH_MID_TURN + and last_permission_error is None + and check_phase_graceful_completion(args.phase, args.finding, RUN_START_TIME) + ): + msg = ( + f"CodeCome observed a mid-turn model/provider cutoff for Phase {args.phase} after {step_finish_count} " + "completed loops, but the required durable artifacts were already written. Treating the phase as complete." + ) + if HAVE_RICH: + console.print(Text(msg, style="bold green")) + else: + print(C.ok(msg)) + finish_warning = None + last_finish_reason = "graceful_forgiveness" else: - print(C.ok(msg)) - finish_warning = None - last_finish_reason = "graceful_forgiveness" - else: - returncode = 2 - - # ----------------------------------------------------- - # Auto-Resume Logic - # ----------------------------------------------------- - - last_session_id = stream_session_id - if not last_session_id: - try: - db_query = subprocess.run( - ["opencode", "db", "SELECT id FROM session ORDER BY time_updated DESC LIMIT 1", "--format", "tsv"], - capture_output=True, text=True, timeout=1.0 + returncode = 2 + + # Frontmatter Resume (only if returncode == 0) + if returncode == 0: + validation_result = subprocess.run( + [sys.executable, "tools/check-frontmatter.py"], + cwd=ROOT, + capture_output=True, + text=True ) - if db_query.returncode == 0 and db_query.stdout.strip(): - last_session_id = db_query.stdout.strip().splitlines()[-1].strip() - except Exception: - pass + if validation_result.returncode != 0: + max_frontmatter_retries = 2 + validation_output = (validation_result.stderr or validation_result.stdout).strip() or "(no validator output)" + if frontmatter_retry_count < max_frontmatter_retries: + frontmatter_retry_count += 1 + msg = ( + "\n[Auto-Correction] The model completed a turn, but its output failed local frontmatter " + f"validation. 
CodeCome will resume the same session and ask for a minimal repair " + f"(retry {frontmatter_retry_count}/{max_frontmatter_retries})." + ) + if HAVE_RICH: + console.print(Text(msg, style="bold yellow")) + else: + print(C.warn(msg)) + if last_session_id and last_session_id != "id": + prompt = _build_frontmatter_resume_prompt(args.phase, args.finding, validation_output) + continue + else: + returncode = 2 + finish_warning = ( + "The model output failed local frontmatter validation, and CodeCome could not determine a " + "session ID to resume for repair. Treating the phase as incomplete so the validator output " + "can be reported back with the saved transcript." + ) + else: + returncode = 2 + finish_warning = ( + f"The model output still fails local frontmatter validation after {max_frontmatter_retries} " + "auto-repair attempts. Treating the phase as incomplete so the validation errors can be reported back." + ) + msg = f"\n[Warning] Frontmatter errors persist after {max_frontmatter_retries} auto-retries." + if HAVE_RICH: + console.print(Text(msg, style="bold red")) + else: + print(C.fail(msg)) + print(validation_output) + break + break - # Frontmatter Resume - if returncode == 0: - validation_result = subprocess.run( - [sys.executable, "tools/check-frontmatter.py"], - cwd=ROOT, - capture_output=True, - text=True - ) - if validation_result.returncode != 0: - max_frontmatter_retries = 2 - validation_output = (validation_result.stderr or validation_result.stdout).strip() or "(no validator output)" - if frontmatter_retry_count < max_frontmatter_retries: - frontmatter_retry_count += 1 + # Iteration Limit Resume + if returncode == 2 and last_finish_reason in _FINISH_MID_TURN: + max_iteration_retries = int(os.environ.get("CODECOME_MAX_ITERATION_RETRIES", "1")) + if iteration_retry_count < max_iteration_retries: + iteration_retry_count += 1 msg = ( - "\n[Auto-Correction] The model completed a turn, but its output failed local frontmatter " - f"validation. 
CodeCome will resume the same session and ask for a minimal repair " - f"(retry {frontmatter_retry_count}/{max_frontmatter_retries})." + "\n[Auto-Resume] CodeCome observed a mid-turn model/provider cutoff and will resume the same " + f"session once to let the model finish the interrupted work (retry {iteration_retry_count}/{max_iteration_retries})." ) if HAVE_RICH: console.print(Text(msg, style="bold yellow")) else: print(C.warn(msg)) - if last_session_id and last_session_id != "id": - prompt = _build_frontmatter_resume_prompt( - args.phase, - args.finding, - validation_output, + prompt = _build_phase_resume_prompt( + args.phase, args.finding, last_finish_reason, step_finish_count ) - command = _build_resume_command(base_command, last_session_id, prompt) - if HAVE_RICH: - console.print(Text(f"Resuming session {last_session_id}...", style="dim")) - continue # loop back and retry + continue else: - returncode = 2 finish_warning = ( - "The model output failed local frontmatter validation, and CodeCome could not determine a " - "session ID to resume for repair. Treating the phase as incomplete so the validator output " - "can be reported back with the saved transcript." + "CodeCome correctly detected that the model/provider stopped mid-turn, but it could not determine " + "a session ID for automatic continuation. Treating the phase as incomplete." ) if HAVE_RICH: console.print(Text("Could not determine session ID to resume.", style="red")) else: print(C.fail("Could not determine session ID to resume.")) - else: - returncode = 2 - finish_warning = ( - f"The model output still fails local frontmatter validation after {max_frontmatter_retries} " - "auto-repair attempts. Treating the phase as incomplete so the validation errors can be reported back." - ) - msg = f"\n[Warning] Frontmatter errors persist after {max_frontmatter_retries} auto-retries." 
- if HAVE_RICH: - console.print(Text(msg, style="bold red")) - else: - print(C.fail(msg)) - print(validation_output) - - # If no frontmatter errors or we ran out of retries, we are done - break + break - # Iteration Limit Resume - if returncode == 2 and last_finish_reason in _FINISH_MID_TURN: - max_iteration_retries = int(os.environ.get("CODECOME_MAX_ITERATION_RETRIES", "1")) - if iteration_retry_count < max_iteration_retries: - iteration_retry_count += 1 - msg = ( - "\n[Auto-Resume] CodeCome observed a mid-turn model/provider cutoff and will resume the same " - f"session once to let the model finish the interrupted work (retry {iteration_retry_count}/{max_iteration_retries})." - ) - if HAVE_RICH: - console.print(Text(msg, style="bold yellow")) - else: - print(C.warn(msg)) - - if last_session_id and last_session_id != "id": - prompt = _build_phase_resume_prompt( - args.phase, - args.finding, - last_finish_reason, - step_finish_count, - ) - command = _build_resume_command(base_command, last_session_id, prompt) - if HAVE_RICH: - console.print(Text(f"Resuming session {last_session_id}...", style="dim")) - continue # loop back and retry - else: - finish_warning = ( - "CodeCome correctly detected that the model/provider stopped mid-turn, but it could not determine " - "a session ID for automatic continuation. Treating the phase as incomplete." - ) - if HAVE_RICH: - console.print(Text("Could not determine session ID to resume.", style="red")) - else: - print(C.fail("Could not determine session ID to resume.")) - - # If we run out of iteration retries, we break out break - - # Any other return code (e.g. 
failure, interrupt), we break - break + finally: + signal.signal(signal.SIGINT, previous_sigint) + signal.signal(signal.SIGTERM, previous_sigterm) + runner.stop() if returncode == 0: if HAVE_RICH: @@ -4930,5 +5038,16 @@ def forward_signal(signum: int, _frame: Any) -> None: return returncode + if __name__ == "__main__": - raise SystemExit(main()) + try: + raise SystemExit(main()) + except KeyboardInterrupt: + raise SystemExit(130) + except SystemExit: + raise + except Exception as exc: # noqa: BLE001 + print(C.fail(f"Fatal Error: {exc}"), file=sys.stderr) + if truthy_env("CODECOME_DEBUG"): + traceback.print_exc(file=sys.stderr) + raise SystemExit(1)