diff --git a/CHANGELOG.md b/CHANGELOG.md index e6b5a25..ba62e16 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,21 @@ Format follows [Keep a Changelog](https://keepachangelog.com/). Versions follow --- +## [0.1.4] - 2026-03-19 + +### Added +- Phase-1 retrieval ranking pipeline with reciprocal rank fusion (RRF), recency boost, and importance weighting controls. +- Retrieval config keys and environment overrides for `rrfK`, `recencyBoost`, `recencyHalfLifeHours`, and `importanceWeight`. +- Foundation and regression coverage for RRF scoring behavior and phase-1 ranking config defaults/overrides. +- New OpenSpec capability `memory-retrieval-ranking-phase1` with archived implementation change record. + +### Changed +- Hybrid retrieval ranking now fuses vector and BM25 channels via rank-based RRF instead of direct weighted-score summation. +- Main specs for `memory-auto-capture-and-recall` and `memory-provider-config` now include phase-1 ranking requirements. +- Validation and operations docs now include low-feedback interpretation guidance and proxy-metric review workflows. + +--- + ## [0.1.3] - 2026-03-17 ### Added diff --git a/README.md b/README.md index 83f91b6..419f37a 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,11 @@ If you already use other plugins, keep them and append `"lancedb-opencode-pro"`. 
"mode": "hybrid", "vectorWeight": 0.7, "bm25Weight": 0.3, - "minScore": 0.2 + "minScore": 0.2, + "rrfK": 60, + "recencyBoost": true, + "recencyHalfLifeHours": 72, + "importanceWeight": 0.4 }, "includeGlobalScope": true, "minCaptureChars": 80, @@ -173,7 +177,11 @@ Create `~/.config/opencode/lancedb-opencode-pro.json`: "mode": "hybrid", "vectorWeight": 0.7, "bm25Weight": 0.3, - "minScore": 0.2 + "minScore": 0.2, + "rrfK": 60, + "recencyBoost": true, + "recencyHalfLifeHours": 72, + "importanceWeight": 0.4 }, "includeGlobalScope": true, "minCaptureChars": 80, @@ -216,6 +224,10 @@ Supported environment variables: - `LANCEDB_OPENCODE_PRO_VECTOR_WEIGHT` - `LANCEDB_OPENCODE_PRO_BM25_WEIGHT` - `LANCEDB_OPENCODE_PRO_MIN_SCORE` +- `LANCEDB_OPENCODE_PRO_RRF_K` +- `LANCEDB_OPENCODE_PRO_RECENCY_BOOST` +- `LANCEDB_OPENCODE_PRO_RECENCY_HALF_LIFE_HOURS` +- `LANCEDB_OPENCODE_PRO_IMPORTANCE_WEIGHT` - `LANCEDB_OPENCODE_PRO_INCLUDE_GLOBAL_SCOPE` - `LANCEDB_OPENCODE_PRO_MIN_CAPTURE_CHARS` - `LANCEDB_OPENCODE_PRO_MAX_ENTRIES_PER_SCOPE` @@ -298,6 +310,22 @@ Key fields: - `feedback.falsePositiveRate`: wrong-memory reports divided by stored memories. - `feedback.falseNegativeRate`: missing-memory reports relative to capture attempts. +### Interpreting Low-Feedback Results + +In real OpenCode usage, auto-capture and recall happen in the background, so explicit `memory_feedback_*` events are often sparse. + +- Treat `capture.*` and `recall.*` as system-health metrics: they show whether the memory pipeline is running. +- Treat repeated-context reduction, clarification burden, manual memory rescue, correction signals, and sampled audits as product-value signals: they show whether memory actually helped the user. +- Treat `feedback.* = 0` as insufficient evidence, not proof that memory quality is good. +- Treat a high `recall.hitRate` or `recall.injectionRate` as recall availability only; those values do not prove usefulness by themselves. 
+ +Recommended review order in low-feedback environments: + +1. Check `capture.successRate`, `capture.skipReasons`, `recall.hitRate`, and `recall.injectionRate` for operational health. +2. Review whether users repeated background context less often or needed fewer clarification turns. +3. Check whether users still needed manual rescue through `memory_search` or issued correction-like responses. +4. Run a bounded audit of recalled memories or skipped captures before concluding the system is helping. + ## OpenAI Embedding Configuration Default behavior stays on Ollama. To use OpenAI embeddings, set `embedding.provider` to `openai` and provide API key + model. @@ -318,7 +346,11 @@ Example sidecar: "mode": "hybrid", "vectorWeight": 0.7, "bm25Weight": 0.3, - "minScore": 0.2 + "minScore": 0.2, + "rrfK": 60, + "recencyBoost": true, + "recencyHalfLifeHours": 72, + "importanceWeight": 0.4 }, "includeGlobalScope": true, "minCaptureChars": 80, diff --git a/docs/VALIDATION_README.md b/docs/VALIDATION_README.md index ecbc51b..87d2b0a 100644 --- a/docs/VALIDATION_README.md +++ b/docs/VALIDATION_README.md @@ -114,6 +114,21 @@ Documentation & error messages | Helpful recall rate | Reported | 7 | User feedback | | False-positive / false-negative counts | Reported | 7 | User feedback | +### Low-Feedback Proxy Metrics +| Metric | Target | Phase | Reference | +|--------|--------|-------|-----------| +| Repeated-context reduction | Reviewed | 7 | Low-feedback framework | +| Clarification burden reduction | Reviewed | 7 | Low-feedback framework | +| Manual memory rescue rate | Reviewed | 7 | Low-feedback framework | +| Correction-signal rate | Reviewed | 7 | Low-feedback framework | +| Sampled recall usefulness | Reviewed | 7 | Low-feedback framework | + +Interpretation rules: + +- High `recall.hitRate` indicates retrieval availability, not proven usefulness. +- Zero explicit feedback counts indicate missing labels unless a proxy-metric review or sample audit says otherwise. 
+- Release review should pair runtime summaries with manual proxy-metric inspection whenever feedback volume is sparse. + --- ## 🔍 Critical Tests (Must Pass Before Release) diff --git a/docs/acceptance-checklist.md b/docs/acceptance-checklist.md index 1b66bd0..003abd4 100644 --- a/docs/acceptance-checklist.md +++ b/docs/acceptance-checklist.md @@ -22,6 +22,12 @@ - [ ] Users can report whether recalled memory was helpful. - [ ] Operators can inspect machine-readable effectiveness summary output. +## Low-Feedback Evaluation + +- [ ] Operators separate system-health metrics from product-value conclusions. +- [ ] Zero explicit feedback is treated as insufficient signal, not as proof of good memory quality. +- [ ] Proxy metrics or sampled audits are reviewed when explicit feedback is sparse. + ## Build And Packaging - [ ] `docker compose build --no-cache && docker compose up -d` succeeds. diff --git a/docs/memory-validation-checklist.md b/docs/memory-validation-checklist.md index f799674..1372f77 100644 --- a/docs/memory-validation-checklist.md +++ b/docs/memory-validation-checklist.md @@ -348,6 +348,49 @@ - Acceptance: Message explains problem + suggests fix - Example: "Vector dimension mismatch: expected 384, got 768. Run memory_clear to reset." 
+### 7.3 Runtime Effectiveness Summary +- [ ] **System-Health Metrics Are Reported** + - Test: Run `memory_effectiveness` after a realistic write/recall workflow + - Measurement: Verify capture success, skip reasons, recall hit rate, and recall injection rate are present + - Acceptance: Summary includes all runtime fields needed to judge operational health + +- [ ] **Zero Feedback Is Treated As Unknown Quality** + - Test: Review a summary with sparse or zero `feedback.*` counts + - Measurement: Confirm release guidance does not treat zero counts as success + - Acceptance: Review docs require proxy metrics or sample audits before claiming usefulness + +### 7.4 Low-Feedback Proxy Metrics +- [ ] **Repeated-Context Reduction Review** + - Test: Compare follow-up sessions before/after memory use + - Measurement: Whether users repeat less project context manually + - Acceptance: Review process documents whether context repetition decreases, stays flat, or worsens + +- [ ] **Clarification Burden Review** + - Test: Inspect conversations after recall injection + - Measurement: Count reminder or context-recovery questions that should have been avoided + - Acceptance: Review process can identify whether memory reduced clarification turns + +- [ ] **Manual Memory Rescue Review** + - Test: Inspect whether operators still need `memory_search` after automatic recall + - Measurement: Manual search frequency relative to recall-heavy workflows + - Acceptance: Review process can describe whether automatic recall still required manual rescue + +- [ ] **Correction-Signal Review** + - Test: Inspect `memory_feedback_wrong`, `memory_feedback_missing`, and correction-like conversation turns + - Measurement: Frequency of stale, wrong, or irrelevant recall corrections + - Acceptance: Review process can identify whether memory introduced prompt contamination or stale context + +### 7.5 Sample Audit Workflow +- [ ] **Sampled Recall Audit** + - Test: Review 10-20 recent recall injections from 
one active project scope + - Measurement: Classify each as relevant, neutral noise, or misleading + - Acceptance: Audit result is documented before release claims are made in sparse-feedback environments + +- [ ] **Sampled Skipped-Capture Audit** + - Test: Review 10-20 skipped captures, especially `no-positive-signal` + - Measurement: Determine whether durable decisions, facts, or preferences were missed + - Acceptance: Audit result identifies whether capture heuristics are too strict for real usage + --- ## IMPLEMENTATION ROADMAP @@ -478,4 +521,3 @@ async function profileLatency(fn: () => Promise, iterations: number) { - **Monitor Tail Performance**: p99 latency matters more than average for interactive tools - **Scope Isolation is Critical**: Multi-project support depends on bulletproof scope enforcement - **Embedding Provider Abstraction**: Design tests to support future providers (OpenAI, local models, etc.) - diff --git a/docs/operations.md b/docs/operations.md index e228463..634360a 100644 --- a/docs/operations.md +++ b/docs/operations.md @@ -27,6 +27,13 @@ - User feedback is recorded through `memory_feedback_missing`, `memory_feedback_wrong`, and `memory_feedback_useful`. - Operators can inspect the aggregated machine-readable summary with `memory_effectiveness` for the active project scope. +### System Health vs Product Value + +- **System health metrics**: `capture.successRate`, `capture.skipReasons`, `recall.hitRate`, and `recall.injectionRate`. +- **Product value metrics**: repeated-context reduction, clarification burden reduction, manual memory rescue rate, correction-signal rate, and sampled recall usefulness. +- High recall availability means the store can return something; it does not prove that the injected memory helped the conversation. +- Zero `feedback.*` counts mean the workflow lacks direct labels, not that memory quality is confirmed. 
+ ### Example Workflow ```text @@ -43,3 +50,25 @@ Expected summary fields: - `recall.requested`, `recall.returnedResults`, `recall.injected` - `feedback.missing`, `feedback.wrong`, `feedback.useful` - `feedback.falsePositiveRate`, `feedback.falseNegativeRate` + +### Low-Feedback Proxy Metrics + +Use these proxy metrics when users rarely submit `memory_feedback_*` commands: + +| Proxy metric | What it means | Current evidence source | +|---|---|---| +| Repeated-context reduction | Users repeat less project context across sessions or follow-up turns | Manual conversation review; not instrumented yet | +| Clarification burden | Agent asks fewer reminder or context-recovery questions | Manual conversation review; not instrumented yet | +| Manual memory rescue rate | Users still need `memory_search` after automatic recall | Search activity + session review; not instrumented as a dedicated rate | +| Correction-signal rate | Users say the recalled context is wrong, stale, or irrelevant | `memory_feedback_wrong`, `memory_feedback_missing`, or conversation review | +| Sampled recall usefulness | Audited recalled memories appear relevant and actually help move work forward | Sample audit of recalled memories | + +### Sample Audit Workflow + +When explicit feedback is sparse, run a bounded audit instead of assuming quality: + +1. Sample 10-20 recent recall injections from the same project scope. +2. For each sample, inspect the recalled memory text and the next assistant reply. +3. Mark whether the memory was relevant, neutral noise, or misleading. +4. Sample 10-20 skipped captures, especially `no-positive-signal`, and check whether important durable knowledge was missed. +5. Treat the audit as release input alongside `memory_effectiveness`, not as a replacement for runtime metrics. 
diff --git a/docs/release-readiness.md b/docs/release-readiness.md index 113d51a..6d9352f 100644 --- a/docs/release-readiness.md +++ b/docs/release-readiness.md @@ -77,6 +77,7 @@ Still manual or not yet automated: - FTS degradation fault injection validation - embedding-backend-unavailable fault-path validation - broader phase items outside current change scope (phase 2/5+/scalability extremes) +- low-feedback proxy metrics remain documentation-driven and require reviewer judgment or sampling ## Manual-Only Items (Current) @@ -85,6 +86,20 @@ Before archive/ship, retain these as explicit manual checks: 1. Force an FTS-index failure scenario and verify retrieval fallback behavior. 2. Force embedding backend outage and verify hook-level graceful behavior. 3. Run real OpenCode directory-switch scenario end-to-end to validate scope transition behavior in live integration. +4. If explicit `memory_feedback_*` counts are sparse, review proxy metrics or run a bounded audit of recalled memories and skipped captures. + +## Low-Feedback Evaluation Guidance + +Interpret `memory_effectiveness` in two layers: + +- **System health**: capture success, skip reasons, recall hit rate, and recall injection rate. +- **Product value**: repeated-context reduction, clarification burden reduction, manual memory rescue rate, correction-signal rate, and sampled recall usefulness. + +Review rules: + +- Zero feedback counts are insufficient evidence, not proof of zero defects. +- High `recall.hitRate` or `recall.injectionRate` means memory was available, not necessarily useful. +- When feedback volume is sparse, release reviewers should document either proxy-metric observations or the outcome of a sampled audit. ## Archive / Ship Gate @@ -93,3 +108,4 @@ Treat release as ready when all conditions are true: 1. `docker compose exec app npm run verify:full` passes. 2. No new failing items in the manual-only checklist above. 3. 
Any unresolved manual-only item is explicitly documented in release notes. +4. Sparse-feedback releases include a low-feedback interpretation note or sample-audit outcome. diff --git a/docs/validation-priority-summary.md b/docs/validation-priority-summary.md index e04a0d2..5353897 100644 --- a/docs/validation-priority-summary.md +++ b/docs/validation-priority-summary.md @@ -276,6 +276,8 @@ async function profileLatency(fn: () => Promise, iterations: number) { | p99 latency > 1000ms | Profile search algorithm + index structure | | Scope isolation failure | STOP - data privacy issue, fix before release | | Vector dimension mismatch | STOP - data integrity issue, fix before release | +| Recall hit rate is high but feedback is near zero | Treat as insufficient evidence; review proxy metrics or run a sample audit | +| Users still repeat background context after recall | Investigate product-value gap even if system-health metrics look good | --- @@ -286,4 +288,3 @@ See `memory-validation-checklist.md` for: - Complete measurement methodology - Implementation roadmap (4 sprints) - Success criteria for v0.1.0 - diff --git a/openspec/changes/add-low-feedback-memory-evaluation/.openspec.yaml b/openspec/changes/add-low-feedback-memory-evaluation/.openspec.yaml new file mode 100644 index 0000000..4e61834 --- /dev/null +++ b/openspec/changes/add-low-feedback-memory-evaluation/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-03-19 diff --git a/openspec/changes/add-low-feedback-memory-evaluation/design.md b/openspec/changes/add-low-feedback-memory-evaluation/design.md new file mode 100644 index 0000000..15b3551 --- /dev/null +++ b/openspec/changes/add-low-feedback-memory-evaluation/design.md @@ -0,0 +1,77 @@ +## Context + +The current project records capture, recall, and explicit feedback events, then exposes those aggregates through `memory_effectiveness`. 
This is useful for system-health visibility, but the interaction model is heavily background-driven: auto-capture happens without explicit user action, recall injection happens inside the system prompt, and users usually do not see which memory ids were involved unless they inspect raw memory output. As a result, explicit feedback counts are structurally sparse. + +The design challenge is not to replace event metrics, but to redefine how maintainers interpret them. In a low-feedback environment, the project needs a framework that treats explicit feedback as optional high-value evidence while using behavior-based proxy metrics and periodic sample audits as the primary source of product-value assessment. + +## Goals / Non-Goals + +**Goals:** +- Define a low-feedback evaluation model that separates system-health metrics from user-value metrics. +- Establish proxy metrics that can be reviewed even when `memory_feedback_*` usage is near zero. +- Define sample-audit workflows so maintainers can validate capture and recall quality without requiring continuous user labeling. +- Clarify summary interpretation rules so zero feedback is treated as unknown quality, not success. + +**Non-Goals:** +- Redesigning the existing event schema in this change. +- Guaranteeing fully automatic ground-truth measurement of memory usefulness. +- Replacing explicit feedback commands; they remain useful when available. +- Implementing dashboards or analytics infrastructure in this design-only change. + +## Decisions + +### Decision: Split effectiveness interpretation into system health and product value +The framework will define two separate evaluation layers. + +- **System health**: capture success, skip reasons, recall hit rate, recall injection rate. +- **Product value**: repeated-context reduction, clarification burden reduction, manual memory rescue rate, correction-signal rate, and sampled recall usefulness. 
+ +Rationale: +- Existing metrics already describe whether the memory pipeline is operational. +- Users need a distinct lens for judging whether memory changed interaction cost in a beneficial way. +- This separation prevents high recall-hit rates from being misread as evidence of usefulness. + +Alternatives considered: +- Continue treating a single `memory_effectiveness` summary as a complete quality signal: rejected because it overstates certainty when user feedback is sparse. + +### Decision: Treat explicit feedback as sparse high-confidence evidence, not as the main KPI source +Explicit feedback commands remain important, but low feedback volume must be interpreted as insufficient signal. + +Rationale: +- Background auto-capture and background recall mean most users cannot easily observe storage or injection moments. +- Sparse feedback is therefore expected even in healthy usage. +- When explicit feedback does exist, it is still high-value evidence and should influence quality review. + +Alternatives considered: +- Ignore explicit feedback entirely: rejected because it is the strongest direct signal when present. +- Treat zero feedback as zero defects: rejected because it collapses missing observability into false confidence. + +### Decision: Use proxy metrics and sample audits as the default low-feedback evaluation method +Maintainers will review proxy metrics and periodic sampled sessions or events instead of waiting for large volumes of user feedback. + +Rationale: +- Proxy metrics can be collected passively from real usage. +- Sample audits allow teams to inspect actual recall usefulness and skipped-capture quality with bounded effort. +- This is more realistic for a background memory system than requiring constant manual annotation. + +Alternatives considered: +- Require users to rate every memory interaction: rejected as too disruptive and unlikely to succeed in CLI workflows. 
+ +## Risks / Trade-offs + +- [Proxy metrics are less direct than explicit labels] -> Mitigation: keep proxy metrics paired with periodic human sample review. +- [Teams may over-interpret high recall hit rates] -> Mitigation: explicitly document that recall availability does not prove usefulness. +- [Sample audits may be inconsistent across reviewers] -> Mitigation: define a lightweight review rubric with fixed questions for captured and recalled examples. +- [Low-feedback evaluation could drift into qualitative opinions] -> Mitigation: anchor reviews in repeatable proxy metrics plus explicit audit checklists. + +## Migration Plan + +1. Add OpenSpec requirements and design guidance for low-feedback evaluation. +2. Update docs and reporting guidance so maintainers classify metrics into system-health and product-value layers. +3. If needed later, extend runtime tooling to compute or expose the proxy metrics defined here. + +## Open Questions + +- Which proxy metrics can be derived from existing event streams without adding new runtime instrumentation? +- Should sampled audits focus first on recalled memories, skipped captures, or both? +- What minimum sample size should release reviewers use before drawing quality conclusions in low-feedback projects? diff --git a/openspec/changes/add-low-feedback-memory-evaluation/proposal.md b/openspec/changes/add-low-feedback-memory-evaluation/proposal.md new file mode 100644 index 0000000..7ce2770 --- /dev/null +++ b/openspec/changes/add-low-feedback-memory-evaluation/proposal.md @@ -0,0 +1,25 @@ +## Why + +The project now records memory effectiveness events and exposes explicit feedback commands, but real OpenCode usage relies on background auto-capture and background recall. In that workflow, users rarely see which memories were stored or injected, so sparse feedback cannot be treated as proof that memory quality is good. 
We need a low-feedback evaluation framework that defines how operators judge memory value when explicit user reports are missing. + +## What Changes + +- Extend memory effectiveness evaluation to distinguish operational metrics from product-value proxies in low-feedback environments. +- Define behavior-based proxy metrics such as repeated-context reduction, clarification burden, manual memory rescue rate, and correction-signal rate. +- Define review workflows that use event summaries plus periodic sample audits instead of relying on direct feedback volume alone. +- Clarify that zero feedback counts mean insufficient signal rather than confirmed quality. + +## Capabilities + +### New Capabilities +- `low-feedback-memory-evaluation`: Defines how maintainers evaluate long-memory usefulness when explicit user feedback is rare or unavailable. + +### Modified Capabilities +- `memory-effectiveness-evaluation`: Expand effectiveness summaries and interpretation guidance to separate system health from product value and to treat missing feedback as unknown rather than good. +- `memory-validation-harness`: Extend validation/reporting expectations so maintainers review proxy metrics and sample-audit workflows in addition to raw event totals. + +## Impact + +- Affected design and reporting docs will include new guidance for interpreting `memory_effectiveness` outputs under sparse-feedback conditions. +- Future implementation work may add derived metrics, review tooling, and event collection for behavior-based proxies. +- Release-readiness and evaluation practices will need to reference low-feedback proxy metrics instead of over-weighting explicit feedback counts. 
diff --git a/openspec/changes/add-low-feedback-memory-evaluation/specs/low-feedback-memory-evaluation/spec.md b/openspec/changes/add-low-feedback-memory-evaluation/specs/low-feedback-memory-evaluation/spec.md new file mode 100644 index 0000000..69ea242 --- /dev/null +++ b/openspec/changes/add-low-feedback-memory-evaluation/specs/low-feedback-memory-evaluation/spec.md @@ -0,0 +1,22 @@ +## ADDED Requirements + +### Requirement: Low-feedback evaluation framework +The project MUST define a low-feedback evaluation framework for long-memory quality so maintainers can assess usefulness when explicit `memory_feedback_*` reports are sparse or absent. + +#### Scenario: Feedback counts are sparse +- **WHEN** explicit missing, wrong, and useful feedback counts are low or zero +- **THEN** the evaluation framework treats those counts as insufficient signal rather than proof that memory quality is good + +### Requirement: Product-value proxy metrics +The project MUST define product-value proxy metrics that can be reviewed alongside explicit feedback in low-feedback environments. + +#### Scenario: Maintainer reviews long-memory value without direct labels +- **WHEN** maintainers evaluate memory usefulness in a workflow where capture and recall happen automatically in the background +- **THEN** they can review proxy metrics including repeated-context reduction, clarification burden, manual memory rescue rate, correction-signal rate, and sampled recall usefulness + +### Requirement: Sample-audit workflow +The project MUST define a periodic sample-audit workflow for reviewing recalled memories and skipped captures under low-feedback conditions. 
+ +#### Scenario: Maintainer performs a bounded audit +- **WHEN** maintainers need to validate whether recalled memories were useful or whether skipped captures hid important information +- **THEN** the framework provides a repeatable sampled review process instead of requiring continuous user labeling diff --git a/openspec/changes/add-low-feedback-memory-evaluation/specs/memory-effectiveness-evaluation/spec.md b/openspec/changes/add-low-feedback-memory-evaluation/specs/memory-effectiveness-evaluation/spec.md new file mode 100644 index 0000000..13931be --- /dev/null +++ b/openspec/changes/add-low-feedback-memory-evaluation/specs/memory-effectiveness-evaluation/spec.md @@ -0,0 +1,16 @@ +## MODIFIED Requirements + +### Requirement: Quantitative effectiveness summaries +The system MUST support quantitative summaries for long-memory effectiveness that combine capture funnel, recall funnel, and feedback-confirmed quality signals, and the project MUST define how those summaries are interpreted when explicit feedback is sparse. 
+ +#### Scenario: Operator requests effectiveness summary +- **WHEN** an operator runs the documented effectiveness reporting workflow +- **THEN** the system returns machine-readable summary fields for capture success, skip reasons, recall hit rate, helpful recall rate, false-positive rate, and false-negative rate + +#### Scenario: Summary distinguishes operational and product metrics +- **WHEN** effectiveness metrics are reported +- **THEN** the report separates operational indicators from product-outcome proxies such as repeated-context reduction or manual-search-after-recall rate + +#### Scenario: Zero feedback is interpreted as unknown quality +- **WHEN** explicit feedback counts are zero or too sparse to support statistical confidence +- **THEN** maintainers treat feedback-derived rates as insufficient evidence rather than as confirmation that memory quality is good diff --git a/openspec/changes/add-low-feedback-memory-evaluation/specs/memory-validation-harness/spec.md b/openspec/changes/add-low-feedback-memory-evaluation/specs/memory-validation-harness/spec.md new file mode 100644 index 0000000..c41ebfa --- /dev/null +++ b/openspec/changes/add-low-feedback-memory-evaluation/specs/memory-validation-harness/spec.md @@ -0,0 +1,15 @@ +## ADDED Requirements + +### Requirement: Low-feedback review guidance +The project MUST document a validation and review workflow for low-feedback environments that combines runtime summaries, proxy metrics, and sampled audits. + +#### Scenario: Maintainer reviews release readiness with sparse user feedback +- **WHEN** maintainers review long-memory effectiveness and explicit feedback counts are sparse +- **THEN** the documented workflow instructs them to review proxy metrics and sampled audits instead of relying on feedback totals alone + +### Requirement: Proxy-metric evidence mapping +The project MUST map low-feedback proxy metrics and sample-audit expectations into the effectiveness review process. 
+ +#### Scenario: Team evaluates whether memory reduced interaction cost +- **WHEN** the team asks whether long memory helped in real OpenCode usage +- **THEN** the review path includes evidence for reduced repeated context, reduced clarification burden, reduced manual rescue behavior, or stable correction-signal rates diff --git a/openspec/changes/add-low-feedback-memory-evaluation/tasks.md b/openspec/changes/add-low-feedback-memory-evaluation/tasks.md new file mode 100644 index 0000000..1af51bc --- /dev/null +++ b/openspec/changes/add-low-feedback-memory-evaluation/tasks.md @@ -0,0 +1,14 @@ +## 1. Evaluation Framework + +- [x] 1.1 Update effectiveness-reporting docs to separate system-health metrics from product-value metrics. +- [x] 1.2 Document low-feedback interpretation rules, including that zero feedback means insufficient evidence rather than zero defects. + +## 2. Proxy Metrics And Audits + +- [x] 2.1 Define the proxy metrics that should be reviewed in low-feedback environments and map each metric to available evidence sources. +- [x] 2.2 Document a sampled audit workflow for recalled memories and skipped captures. + +## 3. Validation Guidance + +- [x] 3.1 Update validation and release-readiness guidance to require proxy-metric review or sample audits when explicit feedback is sparse. +- [x] 3.2 Add or update examples showing how maintainers interpret `memory_effectiveness` outputs under low-feedback conditions. 
diff --git a/openspec/changes/archive/2026-03-19-improve-memory-retrieval-ranking-phase1/.openspec.yaml b/openspec/changes/archive/2026-03-19-improve-memory-retrieval-ranking-phase1/.openspec.yaml new file mode 100644 index 0000000..4e61834 --- /dev/null +++ b/openspec/changes/archive/2026-03-19-improve-memory-retrieval-ranking-phase1/.openspec.yaml @@ -0,0 +1,2 @@ +schema: spec-driven +created: 2026-03-19 diff --git a/openspec/changes/archive/2026-03-19-improve-memory-retrieval-ranking-phase1/design.md b/openspec/changes/archive/2026-03-19-improve-memory-retrieval-ranking-phase1/design.md new file mode 100644 index 0000000..0039c51 --- /dev/null +++ b/openspec/changes/archive/2026-03-19-improve-memory-retrieval-ranking-phase1/design.md @@ -0,0 +1,81 @@ +## Context + +The retrieval path currently ranks memories using a weighted sum of cosine similarity and BM25-like score, then applies a minimum score threshold. This is simple but sensitive to score-scale drift across lexical and semantic channels. The data model already stores `timestamp` and `importance`, but ranking does not currently consume either field. We need a low-risk phase-1 ranking upgrade that improves result ordering without introducing external reranker dependencies or changing storage backend. + +## Goals / Non-Goals + +**Goals:** +- Replace weighted-sum fusion with rank-based RRF fusion for vector and BM25 retrieval channels. +- Apply recency and importance factors as final ranking boosts with configurable controls. +- Preserve current operational behavior (scope filtering, min-score safety, graceful embedding fallback). +- Keep implementation lightweight and testable in current TypeScript/LanceDB architecture. + +**Non-Goals:** +- Introducing cross-encoder reranking or external reranker APIs. +- Adding MMR diversity selection in this phase. +- Redesigning memory extraction, storage schema, or lifecycle event model. +- Migrating retrieval to a new storage engine. 
+ +## Decisions + +### Decision: Use RRF for channel fusion instead of weighted score sum +We will compute independent rankings for vector score and BM25 score, then combine ranks with RRF: + +`rrfScore = 1 / (k + rankVector) + 1 / (k + rankBm25)` + +where `k` defaults to `60` and is configurable. + +Rationale: +- Rank-based fusion is robust to score-range mismatch between cosine and BM25 channels. +- It is computationally cheap and deterministic. +- It avoids fragile per-channel score calibration. + +Alternatives considered: +- Keep weighted-sum fusion and tune weights: rejected for scale-sensitivity and lower robustness under mixed query styles. + +### Decision: Apply recency boost as multiplicative time decay +Final score will multiply by a recency factor computed from memory age and configurable half-life hours. + +Rationale: +- Multiplicative boost preserves base relevance ordering while preferring fresher memories. +- Half-life parameter is interpretable and stable for operators. + +Alternatives considered: +- Additive recency bonus: rejected because it can over-promote weakly relevant recent memories. + +### Decision: Apply importance weighting as multiplicative factor +Final score will multiply by an importance-based factor controlled by `importanceWeight`. + +Rationale: +- Importance is already captured at write time and should influence recall ranking. +- Multiplicative weighting stays bounded and composes cleanly with recency. + +Alternatives considered: +- Hard filter by minimum importance: rejected because it can suppress useful low-importance records. + +### Decision: Keep existing minScore and scope safety filters +The retrieval pipeline will preserve dimension-compatibility filtering, scope filtering, and minimum score threshold behavior. + +Rationale: +- These guards are already production-proven and required by existing specs. +- Phase-1 should improve ranking quality without reducing retrieval safety. 
+ +## Risks / Trade-offs + +- [RRF may change ranking order for known prompts] -> Mitigation: add deterministic regression tests with fixed fixtures for rank expectations. +- [Recency decay may over-prioritize fresh but less relevant memories] -> Mitigation: expose half-life config and keep conservative default. +- [Importance weighting may amplify noisy extraction labels] -> Mitigation: keep default weight moderate and validate with retrieval tests. +- [New config knobs increase operator complexity] -> Mitigation: provide safe defaults and backward-compatible behavior when omitted. + +## Migration Plan + +1. Add retrieval phase-1 config fields with defaults in runtime config resolution. +2. Implement RRF + recency + importance scoring in store search path. +3. Update tests to assert new ranking behavior and default compatibility. +4. Run existing verification and retrieval workflows in Docker environment. +5. Keep rollback path by disabling recency/importance via config and retaining deterministic RRF default. + +## Open Questions + +- Should recency decay apply uniformly across all categories, or should `decision` memories decay slower in a future phase? +- Should phase-2 add cross-encoder reranking only for top-N candidates behind a separate mode flag? diff --git a/openspec/changes/archive/2026-03-19-improve-memory-retrieval-ranking-phase1/proposal.md b/openspec/changes/archive/2026-03-19-improve-memory-retrieval-ranking-phase1/proposal.md new file mode 100644 index 0000000..0f04308 --- /dev/null +++ b/openspec/changes/archive/2026-03-19-improve-memory-retrieval-ranking-phase1/proposal.md @@ -0,0 +1,25 @@ +## Why + +Current retrieval uses a single weighted-sum score (`vector + BM25`) and ignores already-stored recency and importance metadata during ranking. This causes unstable ranking quality in mixed query types and misses low-cost relevance gains that can be delivered without changing storage architecture. 
+ +## What Changes + +- Replace weighted-sum hybrid fusion with rank-based reciprocal rank fusion (RRF) for vector and BM25 candidate lists. +- Add configurable recency boost and importance weighting in the final retrieval score. +- Extend retrieval configuration contract with phase-1 ranking controls (RRF constant, recency half-life, recency toggle, importance weight). +- Add regression coverage to verify ranking order behavior for RRF + recency + importance paths. + +## Capabilities + +### New Capabilities +- `memory-retrieval-ranking-phase1`: Defines phase-1 ranking pipeline behavior (RRF fusion, recency boost, importance weighting) and ranking-specific acceptance scenarios. + +### Modified Capabilities +- `memory-auto-capture-and-recall`: Retrieval requirements change from weighted-sum-only ranking to configurable phase-1 ranking pipeline behavior. +- `memory-provider-config`: Retrieval config requirements expand to include ranking-phase controls and defaults. + +## Impact + +- Affected code: `src/store.ts`, `src/config.ts`, `src/types.ts`, `src/index.ts` (if search API shape changes), and retrieval-related tests. +- Affected behavior: memory ranking order in automatic recall and `memory_search` tool output. +- Backward compatibility: existing retrieval mode remains supported; new controls default to enabled-safe values for phase-1 ranking. 
diff --git a/openspec/changes/archive/2026-03-19-improve-memory-retrieval-ranking-phase1/specs/memory-auto-capture-and-recall/spec.md b/openspec/changes/archive/2026-03-19-improve-memory-retrieval-ranking-phase1/specs/memory-auto-capture-and-recall/spec.md new file mode 100644 index 0000000..39f378e --- /dev/null +++ b/openspec/changes/archive/2026-03-19-improve-memory-retrieval-ranking-phase1/specs/memory-auto-capture-and-recall/spec.md @@ -0,0 +1,12 @@ +## MODIFIED Requirements + +### Requirement: Hybrid retrieval for context injection +The system MUST support hybrid retrieval combining vector similarity and BM25 lexical matching with reciprocal rank fusion (RRF) and configurable ranking controls, and the project MUST provide a retrieval-quality workflow that measures ranked-result quality against documented thresholds. + +#### Scenario: Hybrid retrieval returns ranked matches +- **WHEN** user submits a new troubleshooting prompt with memory provider enabled in hybrid mode +- **THEN** the system computes vector and BM25 candidates, applies RRF fusion plus configured recency and importance ranking controls, and returns ranked memories for context injection + +#### Scenario: Retrieval quality workflow reports recall metrics +- **WHEN** maintainers run the retrieval-quality workflow against the defined query set +- **THEN** the workflow reports recall and robustness metrics that can be compared with the documented release thresholds diff --git a/openspec/changes/archive/2026-03-19-improve-memory-retrieval-ranking-phase1/specs/memory-provider-config/spec.md b/openspec/changes/archive/2026-03-19-improve-memory-retrieval-ranking-phase1/specs/memory-provider-config/spec.md new file mode 100644 index 0000000..45ffb94 --- /dev/null +++ b/openspec/changes/archive/2026-03-19-improve-memory-retrieval-ranking-phase1/specs/memory-provider-config/spec.md @@ -0,0 +1,20 @@ +## MODIFIED Requirements + +### Requirement: Memory provider configuration contract +The system MUST 
support a memory configuration contract in sidecar config and environment variables with provider id, storage path, embedding settings, and retrieval settings, including both `ollama` and `openai` embedding providers and phase-1 ranking controls (`rrfK`, recency toggle, recency half-life hours, and importance weight). + +#### Scenario: Valid provider configuration is loaded +- **WHEN** memory config contains `provider = "lancedb-opencode-pro"` with valid `dbPath`, `embedding`, and `retrieval` fields +- **THEN** the provider configuration is accepted and initialized without fallback + +#### Scenario: Missing optional retrieval values use defaults +- **WHEN** `memory.retrieval` omits optional mode, threshold, or phase-1 ranking control fields +- **THEN** the system applies documented defaults including `mode = hybrid`, `rrfK = 60`, recency boost enabled with a conservative half-life default, and moderate importance weighting + +#### Scenario: Embedding provider defaults to ollama +- **WHEN** `memory.embedding.provider` is omitted +- **THEN** the system defaults embedding provider to `ollama` to preserve backward compatibility + +#### Scenario: Environment variable overrides embedding provider settings +- **WHEN** OpenAI or Ollama embedding settings are provided in supported environment variables +- **THEN** environment variable values override sidecar configuration according to documented precedence diff --git a/openspec/changes/archive/2026-03-19-improve-memory-retrieval-ranking-phase1/specs/memory-retrieval-ranking-phase1/spec.md b/openspec/changes/archive/2026-03-19-improve-memory-retrieval-ranking-phase1/specs/memory-retrieval-ranking-phase1/spec.md new file mode 100644 index 0000000..3533b17 --- /dev/null +++ b/openspec/changes/archive/2026-03-19-improve-memory-retrieval-ranking-phase1/specs/memory-retrieval-ranking-phase1/spec.md @@ -0,0 +1,20 @@ +## ADDED Requirements + +### Requirement: Phase-1 ranking pipeline with RRF, recency, and importance +The system
MUST support a phase-1 retrieval ranking pipeline that fuses vector and BM25 channels using reciprocal rank fusion (RRF), then applies recency and importance multipliers before final ranking output. + +#### Scenario: RRF fusion combines lexical and semantic ranks +- **WHEN** a retrieval query produces vector and BM25 candidate rankings for the active scope set +- **THEN** the system computes a fused ranking score using RRF with the configured constant and returns results sorted by fused score descending + +#### Scenario: Recency boost prefers fresher memories +- **WHEN** two memories have equivalent base fused relevance but different timestamps +- **THEN** the newer memory receives a higher final score when recency boost is enabled + +#### Scenario: Importance weighting influences final ordering +- **WHEN** two memories have equivalent fused relevance and recency factors but different stored importance values +- **THEN** the higher-importance memory receives a higher final score when importance weighting is enabled + +#### Scenario: Safety filters remain in effect +- **WHEN** retrieval processes candidate memories under phase-1 ranking +- **THEN** scope filtering, vector-dimension compatibility filtering, and minimum-score threshold filtering continue to apply before output diff --git a/openspec/changes/archive/2026-03-19-improve-memory-retrieval-ranking-phase1/tasks.md b/openspec/changes/archive/2026-03-19-improve-memory-retrieval-ranking-phase1/tasks.md new file mode 100644 index 0000000..e3341df --- /dev/null +++ b/openspec/changes/archive/2026-03-19-improve-memory-retrieval-ranking-phase1/tasks.md @@ -0,0 +1,16 @@ +## 1. Retrieval Config Contract + +- [x] 1.1 Extend retrieval runtime types and config resolution to include phase-1 ranking controls (`rrfK`, recency boost toggle, recency half-life hours, importance weight) with documented defaults. +- [x] 1.2 Add environment-variable support and validation/clamping for new phase-1 ranking controls. + +## 2. 
Search Scoring Pipeline + +- [x] 2.1 Refactor store search scoring to compute independent vector and BM25 rankings, then fuse with RRF using configured `rrfK`. +- [x] 2.2 Apply recency multiplier and importance multiplier to fused scores before threshold filtering and final ordering. +- [x] 2.3 Preserve existing scope filtering, vector-dimension compatibility filtering, and minimum-score behavior. + +## 3. Verification Coverage + +- [x] 3.1 Add/adjust tests that verify deterministic ranking behavior for RRF fusion across mixed lexical/semantic query fixtures. +- [x] 3.2 Add/adjust tests that verify recency and importance controls affect ranking as configured while preserving safety filters. +- [x] 3.3 Run containerized verification workflows (`typecheck`, `build`, and targeted tests) and capture pass results. diff --git a/openspec/specs/memory-auto-capture-and-recall/spec.md b/openspec/specs/memory-auto-capture-and-recall/spec.md index b82be9f..70f561f 100644 --- a/openspec/specs/memory-auto-capture-and-recall/spec.md +++ b/openspec/specs/memory-auto-capture-and-recall/spec.md @@ -15,11 +15,11 @@ The system MUST automatically capture durable memory candidates from successful - **THEN** the workflow verifies that qualifying assistant output is captured, short output below the configured minimum is skipped, and stored category metadata matches the extraction rules ### Requirement: Hybrid retrieval for context injection -The system MUST support hybrid retrieval combining vector similarity and BM25 lexical matching with configurable weights, and the project MUST provide a retrieval-quality workflow that measures ranked-result quality against documented thresholds. +The system MUST support hybrid retrieval combining vector similarity and BM25 lexical matching with reciprocal rank fusion (RRF) and configurable ranking controls, and the project MUST provide a retrieval-quality workflow that measures ranked-result quality against documented thresholds. 
#### Scenario: Hybrid retrieval returns ranked matches - **WHEN** user submits a new troubleshooting prompt with memory provider enabled in hybrid mode -- **THEN** the system computes vector and BM25 candidates, applies configured weighting, and returns ranked memories for context injection +- **THEN** the system computes vector and BM25 candidates, applies RRF fusion plus configured recency and importance ranking controls, and returns ranked memories for context injection #### Scenario: Retrieval quality workflow reports recall metrics - **WHEN** maintainers run the retrieval-quality workflow against the defined query set diff --git a/openspec/specs/memory-provider-config/spec.md b/openspec/specs/memory-provider-config/spec.md index e963f20..e9ec177 100644 --- a/openspec/specs/memory-provider-config/spec.md +++ b/openspec/specs/memory-provider-config/spec.md @@ -4,15 +4,15 @@ TBD - created by archiving change add-lancedb-memory-provider. Update Purpose after archive. ## Requirements ### Requirement: Memory provider configuration contract -The system MUST support a memory configuration contract in sidecar config and environment variables with provider id, storage path, embedding settings, and retrieval settings, including both `ollama` and `openai` embedding providers. +The system MUST support a memory configuration contract in sidecar config and environment variables with provider id, storage path, embedding settings, and retrieval settings, including both `ollama` and `openai` embedding providers and phase-1 ranking controls (`rrfK`, recency toggle, recency half-life hours, and importance weight). 
#### Scenario: Valid provider configuration is loaded - **WHEN** memory config contains `provider = "lancedb-opencode-pro"` with valid `dbPath`, `embedding`, and `retrieval` fields - **THEN** the provider configuration is accepted and initialized without fallback #### Scenario: Missing optional retrieval values uses defaults -- **WHEN** `memory.retrieval` omits weights or mode fields -- **THEN** the system applies documented defaults including `mode = hybrid`, `vectorWeight = 0.7`, and `bm25Weight = 0.3` +- **WHEN** `memory.retrieval` omits optional mode, threshold, or phase-1 ranking control fields +- **THEN** the system applies documented defaults including `mode = hybrid`, `rrfK = 60`, recency boost enabled with a conservative half-life default, and moderate importance weighting #### Scenario: Embedding provider defaults to ollama - **WHEN** `memory.embedding.provider` is omitted diff --git a/openspec/specs/memory-retrieval-ranking-phase1/spec.md b/openspec/specs/memory-retrieval-ranking-phase1/spec.md new file mode 100644 index 0000000..0f9f8aa --- /dev/null +++ b/openspec/specs/memory-retrieval-ranking-phase1/spec.md @@ -0,0 +1,24 @@ +# memory-retrieval-ranking-phase1 Specification + +## Purpose +TBD - created by archiving change improve-memory-retrieval-ranking-phase1. Update Purpose after archive. +## Requirements +### Requirement: Phase-1 ranking pipeline with RRF, recency, and importance +The system MUST support a phase-1 retrieval ranking pipeline that fuses vector and BM25 channels using reciprocal rank fusion (RRF), then applies recency and importance multipliers before final ranking output. 
+ +#### Scenario: RRF fusion combines lexical and semantic ranks +- **WHEN** a retrieval query produces vector and BM25 candidate rankings for the active scope set +- **THEN** the system computes a fused ranking score using RRF with the configured constant and returns results sorted by fused score descending + +#### Scenario: Recency boost prefers fresher memories +- **WHEN** two memories have equivalent base fused relevance but different timestamps +- **THEN** the newer memory receives a higher final score when recency boost is enabled + +#### Scenario: Importance weighting influences final ordering +- **WHEN** two memories have equivalent fused relevance and recency factors but different stored importance values +- **THEN** the higher-importance memory receives a higher final score when importance weighting is enabled + +#### Scenario: Safety filters remain in effect +- **WHEN** retrieval processes candidate memories under phase-1 ranking +- **THEN** scope filtering, vector-dimension compatibility filtering, and minimum-score threshold filtering continue to apply before output + diff --git a/package-lock.json b/package-lock.json index e331b19..064f807 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "lancedb-opencode-pro", - "version": "0.1.3", + "version": "0.1.4", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "lancedb-opencode-pro", - "version": "0.1.3", + "version": "0.1.4", "license": "MIT", "dependencies": { "@lancedb/lancedb": "^0.26.2", diff --git a/package.json b/package.json index 83eca35..e0d8a66 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "lancedb-opencode-pro", - "version": "0.1.3", + "version": "0.1.4", "description": "LanceDB-backed long-term memory provider for OpenCode", "type": "module", "main": "dist/index.js", diff --git a/src/config.ts b/src/config.ts index d5bf21b..fffc51d 100644 --- a/src/config.ts +++ b/src/config.ts @@ -28,6 +28,17 @@ export function 
resolveMemoryConfig(config: Config | undefined, worktree?: strin const weightSum = vectorWeight + bm25Weight; const normalizedVectorWeight = weightSum > 0 ? vectorWeight / weightSum : 0.7; const normalizedBm25Weight = weightSum > 0 ? bm25Weight / weightSum : 0.3; + const rrfK = Math.max(1, Math.floor(toNumber(process.env.LANCEDB_OPENCODE_PRO_RRF_K ?? retrievalRaw.rrfK, 60))); + const recencyBoost = toBoolean(process.env.LANCEDB_OPENCODE_PRO_RECENCY_BOOST ?? retrievalRaw.recencyBoost, true); + const recencyHalfLifeHours = Math.max( + 1, + toNumber(process.env.LANCEDB_OPENCODE_PRO_RECENCY_HALF_LIFE_HOURS ?? retrievalRaw.recencyHalfLifeHours, 72), + ); + const importanceWeight = clamp( + toNumber(process.env.LANCEDB_OPENCODE_PRO_IMPORTANCE_WEIGHT ?? retrievalRaw.importanceWeight, 0.4), + 0, + 2, + ); const embeddingProvider = resolveEmbeddingProvider( firstString(process.env.LANCEDB_OPENCODE_PRO_EMBEDDING_PROVIDER, embeddingRaw.provider), @@ -72,6 +83,10 @@ export function resolveMemoryConfig(config: Config | undefined, worktree?: strin vectorWeight: normalizedVectorWeight, bm25Weight: normalizedBm25Weight, minScore: clamp(toNumber(process.env.LANCEDB_OPENCODE_PRO_MIN_SCORE ?? retrievalRaw.minScore, 0.2), 0, 1), + rrfK, + recencyBoost, + recencyHalfLifeHours, + importanceWeight, }, includeGlobalScope: toBoolean(process.env.LANCEDB_OPENCODE_PRO_INCLUDE_GLOBAL_SCOPE ?? raw.includeGlobalScope, true), minCaptureChars: Math.max( diff --git a/src/index.ts b/src/index.ts index ce06096..fc6e7d1 100644 --- a/src/index.ts +++ b/src/index.ts @@ -63,6 +63,10 @@ const plugin: Plugin = async (input) => { vectorWeight: state.config.retrieval.mode === "vector" ? 1 : state.config.retrieval.vectorWeight, bm25Weight: state.config.retrieval.mode === "vector" ? 
0 : state.config.retrieval.bm25Weight, minScore: state.config.retrieval.minScore, + rrfK: state.config.retrieval.rrfK, + recencyBoost: state.config.retrieval.recencyBoost, + recencyHalfLifeHours: state.config.retrieval.recencyHalfLifeHours, + importanceWeight: state.config.retrieval.importanceWeight, }); await state.store.putEvent({ @@ -118,6 +122,10 @@ const plugin: Plugin = async (input) => { vectorWeight: state.config.retrieval.mode === "vector" ? 1 : state.config.retrieval.vectorWeight, bm25Weight: state.config.retrieval.mode === "vector" ? 0 : state.config.retrieval.bm25Weight, minScore: state.config.retrieval.minScore, + rrfK: state.config.retrieval.rrfK, + recencyBoost: state.config.retrieval.recencyBoost, + recencyHalfLifeHours: state.config.retrieval.recencyHalfLifeHours, + importanceWeight: state.config.retrieval.importanceWeight, }); if (results.length === 0) return "No relevant memory found."; diff --git a/src/store.ts b/src/store.ts index f1410f2..a940ea1 100644 --- a/src/store.ts +++ b/src/store.ts @@ -138,22 +138,65 @@ export class MemoryStore { vectorWeight: number; bm25Weight: number; minScore: number; + rrfK?: number; + recencyBoost?: boolean; + recencyHalfLifeHours?: number; + importanceWeight?: number; }): Promise { const cached = await this.getCachedScopes(params.scopes); if (cached.records.length === 0) return []; const queryTokens = tokenize(params.query); - const queryNorm = vecNorm(params.queryVector); - - const scored = cached.records + const useVectorChannel = params.queryVector.length > 0 && params.vectorWeight > 0; + const useBm25Channel = queryTokens.length > 0 && params.bm25Weight > 0; + const { vectorWeight, bm25Weight } = normalizeChannelWeights( + useVectorChannel ? params.vectorWeight : 0, + useBm25Channel ? params.bm25Weight : 0, + ); + const rrfK = Math.max(1, Math.floor(params.rrfK ?? 60)); + const recencyBoostEnabled = params.recencyBoost ?? true; + const recencyHalfLifeHours = Math.max(1, params.recencyHalfLifeHours ?? 
72); + const importanceWeight = clampImportanceWeight(params.importanceWeight ?? 0.4); + + const candidates = cached.records .filter((record) => params.queryVector.length === 0 || record.vector.length === params.queryVector.length) .map((record, index) => { const recordNorm = cached.norms.get(record.id) ?? vecNorm(record.vector); - const vectorScore = fastCosine(params.queryVector, record.vector, queryNorm, recordNorm); - const bm25Score = bm25LikeScore(queryTokens, cached.tokenized[index], cached.idf); - const score = params.vectorWeight * vectorScore + params.bm25Weight * bm25Score; - return { record, score, vectorScore, bm25Score }; + const vectorScore = useVectorChannel ? fastCosine(params.queryVector, record.vector, queryNorm, recordNorm) : 0; + const bm25Score = useBm25Channel ? bm25LikeScore(queryTokens, cached.tokenized[index], cached.idf) : 0; + return { record, vectorScore, bm25Score }; + }); + + if (candidates.length === 0) return []; + + const vectorRanks = useVectorChannel ? buildRankMap(candidates, (item) => item.vectorScore) : null; + const bm25Ranks = useBm25Channel ? buildRankMap(candidates, (item) => item.bm25Score) : null; + + const scored = candidates + .map((item) => { + let rrfScore = 0; + if (vectorRanks) { + const rank = vectorRanks.get(item.record.id); + if (rank !== undefined) rrfScore += vectorWeight / (rrfK + rank); + } + if (bm25Ranks) { + const rank = bm25Ranks.get(item.record.id); + if (rank !== undefined) rrfScore += bm25Weight / (rrfK + rank); + } + rrfScore *= rrfK + 1; + + const recencyFactor = recencyBoostEnabled + ? 
computeRecencyMultiplier(item.record.timestamp, recencyHalfLifeHours) + : 1; + const importanceFactor = 1 + importanceWeight * clampImportance(item.record.importance); + const score = rrfScore * recencyFactor * importanceFactor; + return { + record: item.record, + score, + vectorScore: item.vectorScore, + bm25Score: item.bm25Score, + }; }) .filter((item) => item.score >= params.minScore) .sort((a, b) => b.score - a.score) @@ -514,6 +557,48 @@ function escapeSql(value: string): string { return value.replace(/'/g, "''"); } +function buildRankMap( + items: T[], + scoreOf: (item: T) => number, +): Map { + const ranked = [...items].sort((a, b) => scoreOf(b) - scoreOf(a)); + const ranks = new Map(); + for (let i = 0; i < ranked.length; i += 1) { + ranks.set(ranked[i].record.id, i + 1); + } + return ranks; +} + +function normalizeChannelWeights(vectorWeight: number, bm25Weight: number): { vectorWeight: number; bm25Weight: number } { + const sum = vectorWeight + bm25Weight; + if (sum <= 0) { + return { vectorWeight: 0.5, bm25Weight: 0.5 }; + } + return { + vectorWeight: vectorWeight / sum, + bm25Weight: bm25Weight / sum, + }; +} + +function computeRecencyMultiplier(timestamp: number, halfLifeHours: number): number { + const now = Date.now(); + const ageMs = Math.max(0, now - timestamp); + const ageHours = ageMs / 3_600_000; + if (ageHours === 0) return 1; + const decay = Math.pow(0.5, ageHours / halfLifeHours); + return 0.5 + 0.5 * decay; +} + +function clampImportance(value: number): number { + if (!Number.isFinite(value)) return 0; + return Math.max(0, Math.min(1, value)); +} + +function clampImportanceWeight(value: number): number { + if (!Number.isFinite(value)) return 0.4; + return Math.max(0, Math.min(2, value)); +} + function computeIdf(docs: string[][]): Map { const df = new Map(); for (const doc of docs) { diff --git a/src/types.ts b/src/types.ts index 34343e9..7d1ced1 100644 --- a/src/types.ts +++ b/src/types.ts @@ -29,6 +29,10 @@ export interface RetrievalConfig 
{ vectorWeight: number; bm25Weight: number; minScore: number; + rrfK: number; + recencyBoost: boolean; + recencyHalfLifeHours: number; + importanceWeight: number; } export interface MemoryRuntimeConfig { diff --git a/test/foundation/foundation.test.ts b/test/foundation/foundation.test.ts index cc0ba3f..bc6fdbe 100644 --- a/test/foundation/foundation.test.ts +++ b/test/foundation/foundation.test.ts @@ -241,3 +241,146 @@ test("effectiveness summary aggregates capture recall and feedback metrics", asy await cleanupDbPath(dbPath); } }); + +test("search scoring uses normalized RRF fusion when recency and importance boosts are disabled", async () => { + const { store, dbPath } = await createTestStore(); + + try { + const scope = "project:rrf"; + const queryVector = createVector(384, 0.9); + const records = [ + createTestRecord({ + id: "rrf-a", + scope, + text: "alpha alpha alpha", + vector: createVector(384, 0.91), + vectorDim: 384, + importance: 0.5, + timestamp: 10_000, + }), + createTestRecord({ + id: "rrf-b", + scope, + text: "alpha", + vector: createVector(384, 0.85), + vectorDim: 384, + importance: 0.5, + timestamp: 10_000, + }), + createTestRecord({ + id: "rrf-c", + scope, + text: "alpha alpha", + vector: createVector(384, 0.8), + vectorDim: 384, + importance: 0.5, + timestamp: 10_000, + }), + ]; + + for (const record of records) { + await store.put(record); + } + + const results = await store.search({ + query: "alpha", + queryVector, + scopes: [scope], + limit: 10, + vectorWeight: 0.7, + bm25Weight: 0.3, + minScore: 0, + rrfK: 10, + recencyBoost: false, + importanceWeight: 0, + }); + + assert.equal(results.length, 3); + + const vectorRank = new Map( + [...results] + .sort((a, b) => b.vectorScore - a.vectorScore) + .map((item, index) => [item.record.id, index + 1] as const), + ); + const bm25Rank = new Map( + [...results] + .sort((a, b) => b.bm25Score - a.bm25Score) + .map((item, index) => [item.record.id, index + 1] as const), + ); + + for (const item of results) 
{ + const vr = vectorRank.get(item.record.id) ?? 0; + const br = bm25Rank.get(item.record.id) ?? 0; + const expected = 11 * (0.7 / (10 + vr) + 0.3 / (10 + br)); + assert.ok(Math.abs(item.score - expected) < 1e-9, `unexpected RRF score for ${item.record.id}`); + } + } finally { + await cleanupDbPath(dbPath); + } +}); + +test("recency and importance multipliers influence ranking order", async () => { + const { store, dbPath } = await createTestStore(); + + try { + const scope = "project:boost"; + await store.put( + createTestRecord({ + id: "boost-old-high-importance", + scope, + text: "gateway timeout resolved", + vector: createVector(384, 0.7), + vectorDim: 384, + timestamp: Date.now() - 14 * 24 * 3_600_000, + importance: 1, + }), + ); + await store.put( + createTestRecord({ + id: "boost-new-low-importance", + scope, + text: "gateway timeout resolved", + vector: createVector(384, 0.7), + vectorDim: 384, + timestamp: Date.now(), + importance: 0, + }), + ); + + const query = "gateway timeout resolved"; + const queryVector = createVector(384, 0.7); + + const noRecency = await store.search({ + query, + queryVector, + scopes: [scope], + limit: 2, + vectorWeight: 0.5, + bm25Weight: 0.5, + minScore: 0, + recencyBoost: false, + importanceWeight: 1, + rrfK: 60, + }); + + assert.equal(noRecency[0]?.record.id, "boost-old-high-importance"); + + const withRecency = await store.search({ + query, + queryVector, + scopes: [scope], + limit: 2, + vectorWeight: 0.5, + bm25Weight: 0.5, + minScore: 0, + recencyBoost: true, + recencyHalfLifeHours: 24, + importanceWeight: 0, + rrfK: 60, + }); + + assert.equal(withRecency[0]?.record.id, "boost-new-low-importance"); + } finally { + await cleanupDbPath(dbPath); + } +}); diff --git a/test/regression/plugin.test.ts b/test/regression/plugin.test.ts index 0ce1f93..b987932 100644 --- a/test/regression/plugin.test.ts +++ b/test/regression/plugin.test.ts @@ -360,6 +360,48 @@ test("environment overrides can switch embedding provider to openai", 
async () = ); }); +test("resolveMemoryConfig provides phase-1 retrieval defaults", () => { + const resolved = resolveMemoryConfig( + { + memory: { + provider: "lancedb-opencode-pro", + }, + } as unknown as Parameters[0], + undefined, + ); + + assert.equal(resolved.retrieval.rrfK, 60); + assert.equal(resolved.retrieval.recencyBoost, true); + assert.equal(resolved.retrieval.recencyHalfLifeHours, 72); + assert.equal(resolved.retrieval.importanceWeight, 0.4); +}); + +test("resolveMemoryConfig applies phase-1 retrieval environment overrides", async () => { + await withPatchedEnv( + { + LANCEDB_OPENCODE_PRO_RRF_K: "30", + LANCEDB_OPENCODE_PRO_RECENCY_BOOST: "false", + LANCEDB_OPENCODE_PRO_RECENCY_HALF_LIFE_HOURS: "24", + LANCEDB_OPENCODE_PRO_IMPORTANCE_WEIGHT: "1.2", + }, + async () => { + const resolved = resolveMemoryConfig( + { + memory: { + provider: "lancedb-opencode-pro", + }, + } as unknown as Parameters[0], + undefined, + ); + + assert.equal(resolved.retrieval.rrfK, 30); + assert.equal(resolved.retrieval.recencyBoost, false); + assert.equal(resolved.retrieval.recencyHalfLifeHours, 24); + assert.equal(resolved.retrieval.importanceWeight, 1.2); + }, + ); +}); + test("resolveMemoryConfig rejects invalid embedding provider values", async () => { await withPatchedEnv( {