diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 38b0d8e..4a7b9f0 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -5,6 +5,15 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.8.5] - 2026-05-23 + +Bugfix release. Removes the last concurrent-writer path that could still reach +`SQLITE_BUSY` after 0.8.4 (RC5 in the SQLite busy incident postmortem). + +### Fixed + +- **Foreground `index_documents` could write concurrently with the file-watcher** — `server.callIndexDocuments` called `Engine.IndexDocuments` directly, bypassing the `indexWithLock` guard (`re.indexing`) that only the startup and file-watcher paths used. So a manual `index_documents` and a background watcher index could write to `vectors.db` at the same time. 0.8.4 made them queue politely (busy_timeout up to 5s), but a heavy index holding a write transaction longer than that could still hit `SQLITE_BUSY`. `IndexDocuments` now holds a dedicated `Engine.indexMu` for its whole duration, serialising every indexing run regardless of caller. The watcher's `indexing` flag is unchanged (it still coalesces debounced ticks); foreground callers now wait on the mutex and index for real instead of being skipped. Regression test `TestIndexDocumentsSerialisesWriters` asserts a second call blocks while the lock is held and completes once released. See `06-planning/2026-05-05-sqlite-busy-incident.md` §7 RC5. + ## [0.8.4] - 2026-05-23 Bugfix release. Closes the SQLITE_BUSY recurrence on `index_documents` that the diff --git a/internal/rag/rag.go b/internal/rag/rag.go index e736bde..b676384 100644 --- a/internal/rag/rag.go +++ b/internal/rag/rag.go @@ -95,6 +95,14 @@ type Engine struct { vecService *vectorService mu sync.Mutex indexing bool + // indexMu serialises the whole IndexDocuments write path so two indexing + // runs never write to vectors.db concurrently. The file-watcher path + // (indexWithLock) coalesces debounced ticks via the `indexing` flag, but + // the foreground index_documents MCP tool calls IndexDocuments directly — + // without this mutex the two could race and hit SQLITE_BUSY (RC5 in + // 06-planning/2026-05-05-sqlite-busy-incident.md). Foreground callers wait + // here and then index for real, rather than being skipped. + indexMu sync.Mutex lastIndexCheck time.Time stopWatcher chan struct{} stopOnce sync.Once @@ -405,6 +413,11 @@ func (re *Engine) IndexDocuments(ctx context.Context) error { return fmt.Errorf("RAG engine not available") } + // Serialise all indexing runs (foreground tool + background watcher) so + // they never write to vectors.db concurrently. See Engine.indexMu. + re.indexMu.Lock() + defer re.indexMu.Unlock() + startTime := time.Now().UTC() // chunkerVersion bumps trigger a full rebuild on next IndexDocuments. // char-v1 — naive char-budget splitter (initial) diff --git a/internal/rag/rag_test.go b/internal/rag/rag_test.go index 7bc90f5..0237a0c 100644 --- a/internal/rag/rag_test.go +++ b/internal/rag/rag_test.go @@ -182,6 +182,84 @@ func TestIndexDocumentsRecoversAfterDirtyCommitFailure(t *testing.T) { } } +// TestIndexDocumentsSerialisesWriters guards RC5: IndexDocuments must hold the +// engine's write lock for its whole duration so the foreground index_documents +// tool and the background file-watcher never write to vectors.db concurrently. +// We hold indexMu (simulating an in-progress run) and assert a second call +// blocks until it is released, then completes for real. +func TestIndexDocumentsSerialisesWriters(t *testing.T) { + repoRoot := t.TempDir() + if err := os.MkdirAll(filepath.Join(repoRoot, "docs"), 0o755); err != nil { + t.Fatalf("mkdir docs: %v", err) + } + if err := os.WriteFile(filepath.Join(repoRoot, "docs", "runbook.md"), []byte("# Runbook\nrollback ingress safely"), 0o644); err != nil { + t.Fatalf("write runbook: %v", err) + } + + embeddingServer := newTestEmbeddingServer(t) + defer embeddingServer.Close() + + emb, err := embedder.New(embedder.Config{ + OllamaBaseURL: embeddingServer.URL, + Dimension: 2, + Mode: "local-only", + MaxRetries: 0, + Timeout: time.Second, + }, zap.NewNop()) + if err != nil { + t.Fatalf("embedder.New: %v", err) + } + + store, err := vectorstore.NewSQLiteStore(filepath.Join(t.TempDir(), "vectors.db"), 2, zap.NewNop()) + if err != nil { + t.Fatalf("NewSQLiteStore: %v", err) + } + t.Cleanup(func() { _ = store.Close() }) + + engine := &Engine{ + config: config.Config{RootPath: repoRoot, RAGMaxResults: 10}, + repoRoot: repoRoot, + logger: zap.NewNop(), + docService: newDocumentService(docServiceConfig{ + RepoRoot: repoRoot, + IndexDirs: []string{"docs"}, + ChunkSize: 2000, + ChunkOverlap: 200, + }, zap.NewNop()), + vecService: &vectorService{ + config: vecServiceConfig{Embedder: emb, MaxResults: 10}, + logger: zap.NewNop(), + store: store, + }, + stopWatcher: make(chan struct{}), + } + + // Simulate an indexing run already holding the writer lock. + engine.indexMu.Lock() + + done := make(chan error, 1) + go func() { done <- engine.IndexDocuments(context.Background()) }() + + select { + case <-done: + engine.indexMu.Unlock() + t.Fatal("IndexDocuments ran while indexMu was held — writers are not serialised") + case <-time.After(150 * time.Millisecond): + // Expected: the second run is blocked on indexMu. + } + + engine.indexMu.Unlock() + + select { + case err := <-done: + if err != nil { + t.Fatalf("IndexDocuments after lock release: %v", err) + } + case <-time.After(3 * time.Second): + t.Fatal("IndexDocuments did not complete after indexMu was released") + } +} + func TestCalculateIndexChanges_NewFiles(t *testing.T) { e := &Engine{}