diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 04620f4..38b0d8e 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -5,6 +5,15 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.8.4] - 2026-05-23 + +Bugfix release. Closes the SQLITE_BUSY recurrence on `index_documents` that the +0.8.0 incident fix (`internal/dbutil`) did not fully resolve. + +### Fixed + +- **SQLITE_BUSY recurred because `busy_timeout` never reached the whole pool** — `dbutil.OpenSQLite` applied the pragmas via `db.Exec` after Open. `journal_mode=WAL` is persisted at the database-file level (so WAL did engage), but `busy_timeout` is **per-connection**: a single `Exec` only configures the one pooled connection that served it, and every other connection `database/sql` opened for concurrent work defaulted to `busy_timeout=0` → instant `SQLITE_BUSY`. Observed when the background file-watcher index raced a write (logs 2026-05-22 14:54 `delete chunks`, 2026-05-23 00:24 `upsert chunk`, `trigger: file_watcher`). `OpenSQLite` now passes the pragmas through the DSN (`_pragma=busy_timeout(5000)&_pragma=journal_mode(WAL)&_pragma=synchronous(NORMAL)`), which `modernc.org/sqlite` runs on **every** new pool connection (`Driver.Open`), plus `_txlock=immediate` so writers take the write lock at `BEGIN` — `busy_timeout` is not honored when a deferred transaction fails to upgrade a read lock to a write lock (SQLite returns `SQLITE_BUSY` immediately without invoking the busy handler). Regression test `TestOpenSQLite_BusyTimeoutPerConnection` asserts the timeout holds on a second pooled connection. The original `?_journal_mode=WAL` DSN form (incident RC2) was also a non-existent `modernc` parameter — the driver only honors `_pragma=...` — which is why the rollback journal appeared on disk in the first place. See `06-planning/2026-05-05-sqlite-busy-incident.md` §7. Known follow-ups (tracked in the incident doc): foreground `index_documents` bypasses the file-watcher's write guard (RC5), and heavy indexing can still hit the embedding HTTP timeout. + ## [0.8.3] - 2026-05-22 Bugfix release. The llama.cpp backend shipped in 0.8.2 never reached the RAG diff --git a/internal/dbutil/sqlite.go b/internal/dbutil/sqlite.go index 12678f6..19ccdfa 100644 --- a/internal/dbutil/sqlite.go +++ b/internal/dbutil/sqlite.go @@ -1,53 +1,65 @@ // Package dbutil centralises the SQLite Open + PRAGMA setup shared between -// memory store and vector store. The connection-string PRAGMAs of -// modernc.org/sqlite are unreliable (see 06-planning/2026-05-05-sqlite-busy-incident.md): -// vectors.db-journal was observed on disk despite ?_journal_mode=WAL in the DSN, -// meaning rollback-journal mode silently kept the database under exclusive -// write lock and caused a 25h SQLITE_BUSY stall on index_documents. +// memory store and vector store. // -// OpenSQLite applies the pragmas explicitly via PRAGMA statements after Open -// and verifies that journal_mode actually switched to WAL. +// History (see 06-planning/2026-05-05-sqlite-busy-incident.md): a 25h +// SQLITE_BUSY stall on index_documents was traced to rollback-journal mode +// silently keeping the DB under an exclusive write lock. The first fix applied +// the pragmas via explicit db.Exec after Open — but busy_timeout is a +// PER-CONNECTION setting, and a single db.Exec only configures whichever +// pooled connection happened to serve it. New connections the pool opened for +// concurrent operations defaulted to busy_timeout=0 and kept returning instant +// SQLITE_BUSY under writer contention (file_watcher index racing a foreground +// index_documents). +// +// OpenSQLite now passes the pragmas through the DSN _pragma parameter, which +// modernc.org/sqlite runs as "PRAGMA ..." on EVERY new pool connection +// (Driver.Open). _txlock=immediate makes write transactions acquire the write +// lock at BEGIN so busy_timeout actually covers writer contention — SQLite does +// not invoke the busy handler when a deferred transaction fails to upgrade a +// read lock to a write lock, it returns SQLITE_BUSY immediately. package dbutil import ( "database/sql" "fmt" + "net/url" "go.uber.org/zap" _ "modernc.org/sqlite" // SQLite driver ) -// OpenSQLite opens a SQLite database at dbPath and applies the standard -// pragmas: busy_timeout=5000, journal_mode=WAL, synchronous=NORMAL. -// Returns an error if the database cannot be opened or pragmas fail. -// Logs a warning (logger may be nil) if WAL mode was requested but the -// driver fell back to a different journal mode. +// buildDSN appends the standard PRAGMA + txlock query parameters to a plain +// SQLite file path. The parameters are applied per-connection by the driver, +// so every connection in the database/sql pool gets busy_timeout, WAL and +// synchronous=NORMAL — not just the one that served an initial Exec. +func buildDSN(dbPath string) string { + q := url.Values{} + q.Add("_pragma", "busy_timeout(5000)") + q.Add("_pragma", "journal_mode(WAL)") + q.Add("_pragma", "synchronous(NORMAL)") + q.Set("_txlock", "immediate") + return dbPath + "?" + q.Encode() +} + +// OpenSQLite opens a SQLite database at dbPath with the standard pragmas +// (busy_timeout=5000, journal_mode=WAL, synchronous=NORMAL) applied to every +// pooled connection via the DSN, and BEGIN IMMEDIATE transaction locking. +// Returns an error if the database cannot be opened. Logs a warning (logger +// may be nil) if WAL mode was requested but the driver fell back to a +// different journal mode. func OpenSQLite(dbPath string, logger *zap.Logger) (*sql.DB, error) { - db, err := sql.Open("sqlite", dbPath) + db, err := sql.Open("sqlite", buildDSN(dbPath)) if err != nil { return nil, fmt.Errorf("open %s: %w", dbPath, err) } - if err := ApplyPragmas(db, logger); err != nil { - _ = db.Close() - return nil, err - } - return db, nil -} -// ApplyPragmas runs the standard pragmas on an already-opened database. -// Exposed for tests that need a custom Open path. -func ApplyPragmas(db *sql.DB, logger *zap.Logger) error { - // busy_timeout: queue concurrent writers up to 5s instead of returning - // SQLITE_BUSY immediately on contention. - if _, err := db.Exec("PRAGMA busy_timeout=5000"); err != nil { - return fmt.Errorf("set busy_timeout: %w", err) - } - - // journal_mode=WAL: concurrent reader+writer; rollback journal locks - // the entire database under any write transaction. + // Force a real connection (sql.Open is lazy) so an unusable path fails + // here, and verify WAL actually engaged — rollback-journal mode is the + // failure that caused the original incident. var mode string - if err := db.QueryRow("PRAGMA journal_mode=WAL").Scan(&mode); err != nil { - return fmt.Errorf("set journal_mode=WAL: %w", err) + if err := db.QueryRow("PRAGMA journal_mode").Scan(&mode); err != nil { + _ = db.Close() + return nil, fmt.Errorf("verify journal_mode on %s: %w", dbPath, err) } if mode != "wal" && logger != nil { logger.Warn("SQLite did not switch to WAL journal_mode", @@ -56,10 +68,5 @@ func ApplyPragmas(db *sql.DB, logger *zap.Logger) error { ) } - // synchronous=NORMAL: balanced durability when paired with WAL. - if _, err := db.Exec("PRAGMA synchronous=NORMAL"); err != nil { - return fmt.Errorf("set synchronous=NORMAL: %w", err) - } - - return nil + return db, nil } diff --git a/internal/dbutil/sqlite_test.go b/internal/dbutil/sqlite_test.go index 0f8dfc5..c8801b6 100644 --- a/internal/dbutil/sqlite_test.go +++ b/internal/dbutil/sqlite_test.go @@ -1,6 +1,8 @@ package dbutil import ( + "context" + "database/sql" "path/filepath" "testing" ) @@ -41,6 +43,42 @@ func TestOpenSQLite_AppliesWALAndBusyTimeout(t *testing.T) { } } +// TestOpenSQLite_BusyTimeoutPerConnection guards the regression that caused the +// SQLITE_BUSY recurrence: busy_timeout is a per-connection setting, so it must +// be applied to every pooled connection, not just the first one. We pin two +// connections open simultaneously to force the pool to hand out a second +// connection, then assert busy_timeout holds on it. +func TestOpenSQLite_BusyTimeoutPerConnection(t *testing.T) { + dir := t.TempDir() + db, err := OpenSQLite(filepath.Join(dir, "test.db"), nil) + if err != nil { + t.Fatalf("OpenSQLite: %v", err) + } + defer func() { _ = db.Close() }() + + ctx := context.Background() + c1, err := db.Conn(ctx) + if err != nil { + t.Fatalf("conn1: %v", err) + } + defer func() { _ = c1.Close() }() + c2, err := db.Conn(ctx) // distinct connection while c1 is held + if err != nil { + t.Fatalf("conn2: %v", err) + } + defer func() { _ = c2.Close() }() + + for i, c := range []*sql.Conn{c1, c2} { + var timeout int + if err := c.QueryRowContext(ctx, "PRAGMA busy_timeout").Scan(&timeout); err != nil { + t.Fatalf("conn%d busy_timeout: %v", i+1, err) + } + if timeout != 5000 { + t.Errorf("conn%d busy_timeout = %d, want 5000", i+1, timeout) + } + } +} + func TestOpenSQLite_BadPath(t *testing.T) { // "/" is a directory — sql.Open succeeds (lazy), but the first PRAGMA exec must fail. _, err := OpenSQLite("/", nil)