Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions docs/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,15 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.8.4] - 2026-05-23

Bugfix release. Closes the SQLITE_BUSY recurrence on `index_documents` that the
0.8.0 incident fix (`internal/dbutil`) did not fully resolve.

### Fixed

- **SQLITE_BUSY recurred because `busy_timeout` never reached the whole pool** — `dbutil.OpenSQLite` applied the pragmas via `db.Exec` after Open. `journal_mode=WAL` is persisted at the database-file level (so WAL did engage), but `busy_timeout` is **per-connection**: a single `Exec` only configures the one pooled connection that served it, and every other connection `database/sql` opened for concurrent work defaulted to `busy_timeout=0` → instant `SQLITE_BUSY`. Observed when the background file-watcher index raced a write (logs 2026-05-22 14:54 `delete chunks`, 2026-05-23 00:24 `upsert chunk`, `trigger: file_watcher`). `OpenSQLite` now passes the pragmas through the DSN (`_pragma=busy_timeout(5000)&_pragma=journal_mode(WAL)&_pragma=synchronous(NORMAL)`), which `modernc.org/sqlite` runs on **every** new pool connection (`Driver.Open`), plus `_txlock=immediate` so writers take the write lock at `BEGIN` — `busy_timeout` is not honored when a deferred transaction fails to upgrade a read lock to a write lock (SQLite returns `SQLITE_BUSY` immediately without invoking the busy handler). Regression test `TestOpenSQLite_BusyTimeoutPerConnection` asserts the timeout holds on a second pooled connection. The original `?_journal_mode=WAL` DSN form (incident RC2) was also a non-existent `modernc` parameter — the driver only honors `_pragma=...` — which is why the rollback journal appeared on disk in the first place. See `06-planning/2026-05-05-sqlite-busy-incident.md` §7. Known follow-ups (tracked in the incident doc): foreground `index_documents` bypasses the file-watcher's write guard (RC5), and heavy indexing can still hit the embedding HTTP timeout.

## [0.8.3] - 2026-05-22

Bugfix release. The llama.cpp backend shipped in 0.8.2 never reached the RAG
Expand Down
83 changes: 45 additions & 38 deletions internal/dbutil/sqlite.go
Original file line number Diff line number Diff line change
@@ -1,53 +1,65 @@
// Package dbutil centralises the SQLite Open + PRAGMA setup shared between
// memory store and vector store. The connection-string PRAGMAs of
// modernc.org/sqlite are unreliable (see 06-planning/2026-05-05-sqlite-busy-incident.md):
// vectors.db-journal was observed on disk despite ?_journal_mode=WAL in the DSN,
// meaning rollback-journal mode silently kept the database under exclusive
// write lock and caused a 25h SQLITE_BUSY stall on index_documents.
// memory store and vector store.
//
// OpenSQLite applies the pragmas explicitly via PRAGMA statements after Open
// and verifies that journal_mode actually switched to WAL.
// History (see 06-planning/2026-05-05-sqlite-busy-incident.md): a 25h
// SQLITE_BUSY stall on index_documents was traced to rollback-journal mode
// silently keeping the DB under an exclusive write lock. The first fix applied
// the pragmas via explicit db.Exec after Open — but busy_timeout is a
// PER-CONNECTION setting, and a single db.Exec only configures whichever
// pooled connection happened to serve it. New connections the pool opened for
// concurrent operations defaulted to busy_timeout=0 and kept returning instant
// SQLITE_BUSY under writer contention (file_watcher index racing a foreground
// index_documents).
//
// OpenSQLite now passes the pragmas through the DSN _pragma parameter, which
// modernc.org/sqlite runs as "PRAGMA ..." on EVERY new pool connection
// (Driver.Open). _txlock=immediate makes write transactions acquire the write
// lock at BEGIN so busy_timeout actually covers writer contention — SQLite does
// not invoke the busy handler when a deferred transaction fails to upgrade a
// read lock to a write lock, it returns SQLITE_BUSY immediately.
package dbutil

import (
"database/sql"
"fmt"
"net/url"

"go.uber.org/zap"
_ "modernc.org/sqlite" // SQLite driver
)

// OpenSQLite opens a SQLite database at dbPath and applies the standard
// pragmas: busy_timeout=5000, journal_mode=WAL, synchronous=NORMAL.
// Returns an error if the database cannot be opened or pragmas fail.
// Logs a warning (logger may be nil) if WAL mode was requested but the
// driver fell back to a different journal mode.
// buildDSN appends the standard PRAGMA + txlock query parameters to a plain
// SQLite file path. The parameters are applied per-connection by the driver,
// so every connection in the database/sql pool gets busy_timeout, WAL and
// synchronous=NORMAL — not just the one that served an initial Exec.
func buildDSN(dbPath string) string {
q := url.Values{}
q.Add("_pragma", "busy_timeout(5000)")
q.Add("_pragma", "journal_mode(WAL)")
q.Add("_pragma", "synchronous(NORMAL)")
q.Set("_txlock", "immediate")
return dbPath + "?" + q.Encode()
}

// OpenSQLite opens a SQLite database at dbPath with the standard pragmas
// (busy_timeout=5000, journal_mode=WAL, synchronous=NORMAL) applied to every
// pooled connection via the DSN, and BEGIN IMMEDIATE transaction locking.
// Returns an error if the database cannot be opened. Logs a warning (logger
// may be nil) if WAL mode was requested but the driver fell back to a
// different journal mode.
func OpenSQLite(dbPath string, logger *zap.Logger) (*sql.DB, error) {
db, err := sql.Open("sqlite", dbPath)
db, err := sql.Open("sqlite", buildDSN(dbPath))
if err != nil {
return nil, fmt.Errorf("open %s: %w", dbPath, err)
}
if err := ApplyPragmas(db, logger); err != nil {
_ = db.Close()
return nil, err
}
return db, nil
}

// ApplyPragmas runs the standard pragmas on an already-opened database.
// Exposed for tests that need a custom Open path.
func ApplyPragmas(db *sql.DB, logger *zap.Logger) error {
// busy_timeout: queue concurrent writers up to 5s instead of returning
// SQLITE_BUSY immediately on contention.
if _, err := db.Exec("PRAGMA busy_timeout=5000"); err != nil {
return fmt.Errorf("set busy_timeout: %w", err)
}

// journal_mode=WAL: concurrent reader+writer; rollback journal locks
// the entire database under any write transaction.
// Force a real connection (sql.Open is lazy) so an unusable path fails
// here, and verify WAL actually engaged — rollback-journal mode is the
// failure that caused the original incident.
var mode string
if err := db.QueryRow("PRAGMA journal_mode=WAL").Scan(&mode); err != nil {
return fmt.Errorf("set journal_mode=WAL: %w", err)
if err := db.QueryRow("PRAGMA journal_mode").Scan(&mode); err != nil {
_ = db.Close()
return nil, fmt.Errorf("verify journal_mode on %s: %w", dbPath, err)
}
if mode != "wal" && logger != nil {
logger.Warn("SQLite did not switch to WAL journal_mode",
Expand All @@ -56,10 +68,5 @@ func ApplyPragmas(db *sql.DB, logger *zap.Logger) error {
)
}

// synchronous=NORMAL: balanced durability when paired with WAL.
if _, err := db.Exec("PRAGMA synchronous=NORMAL"); err != nil {
return fmt.Errorf("set synchronous=NORMAL: %w", err)
}

return nil
return db, nil
}
38 changes: 38 additions & 0 deletions internal/dbutil/sqlite_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package dbutil

import (
"context"
"database/sql"
"path/filepath"
"testing"
)
Expand Down Expand Up @@ -41,6 +43,42 @@ func TestOpenSQLite_AppliesWALAndBusyTimeout(t *testing.T) {
}
}

// TestOpenSQLite_BusyTimeoutPerConnection guards the regression that caused the
// SQLITE_BUSY recurrence: busy_timeout is a per-connection setting, so it must
// be applied to every pooled connection, not just the first one. We pin two
// connections open simultaneously to force the pool to hand out a second
// connection, then assert busy_timeout holds on it.
func TestOpenSQLite_BusyTimeoutPerConnection(t *testing.T) {
dir := t.TempDir()
db, err := OpenSQLite(filepath.Join(dir, "test.db"), nil)
if err != nil {
t.Fatalf("OpenSQLite: %v", err)
}
defer func() { _ = db.Close() }()

ctx := context.Background()
c1, err := db.Conn(ctx)
if err != nil {
t.Fatalf("conn1: %v", err)
}
defer func() { _ = c1.Close() }()
c2, err := db.Conn(ctx) // distinct connection while c1 is held
if err != nil {
t.Fatalf("conn2: %v", err)
}
defer func() { _ = c2.Close() }()

for i, c := range []*sql.Conn{c1, c2} {
var timeout int
if err := c.QueryRowContext(ctx, "PRAGMA busy_timeout").Scan(&timeout); err != nil {
t.Fatalf("conn%d busy_timeout: %v", i+1, err)
}
if timeout != 5000 {
t.Errorf("conn%d busy_timeout = %d, want 5000", i+1, timeout)
}
}
}

func TestOpenSQLite_BadPath(t *testing.T) {
// "/" is a directory — sql.Open succeeds (lazy), but the first PRAGMA exec must fail.
_, err := OpenSQLite("/", nil)
Expand Down
Loading