Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
15 commits
Select commit Hold shift + click to select a range
ff73802
Add OKP RAG quality benchmarks and system config for LCORE regression…
alessandralanz Jun 17, 2026
7e29a9a
Add baseline comparison script for regression gating
alessandralanz Jun 18, 2026
74768c9
Revised benchmarks to adhere to Evaluation Data Collection Standard +…
alessandralanz Jun 18, 2026
5e1f772
Add version-controlled baseline for regression gating
alessandralanz Jun 18, 2026
5d59325
Add three run A/B/C comparison script and regression tests. Attribute…
alessandralanz Jun 22, 2026
c03b462
LCORE-2802: Update baseline from 102 to 506 evaluations
alessandralanz Jun 29, 2026
261f0b0
Remove context-requiring metrics from turns where RAG returns no cont…
alessandralanz Jun 30, 2026
f67c2d8
Update baseline to 491 evaluations after removing inapplicable contex…
alessandralanz Jun 30, 2026
7d4c4f2
fix coderabbit review findings: missing-metric handling, check-only s…
alessandralanz Jun 30, 2026
6f44c8b
switch to context-only RAGAS metrics, replace response-quality metric…
alessandralanz Jun 30, 2026
e7ab2ae
Increase max_threads to 4 and disable skip_on_failure
alessandralanz Jun 30, 2026
6ee6bb9
update baseline bc we updated config
alessandralanz Jun 30, 2026
c82599a
remove OKP eval data from public repo and strip baseline to aggregates
alessandralanz Jul 1, 2026
b382b29
Merge branch 'lightspeed-core:main' into lcore-regression
alessandralanz Jul 1, 2026
db2d156
Remove A/B/C comparison script since weekly/PR gate jobs isolate vari…
alessandralanz Jul 2, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 114 additions & 0 deletions baselines/lcore_regression/current_baseline_summary.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
{
"timestamp": "2026-06-30T21:09:26.159970+00:00",
"total_evaluations": 432,
"summary_stats": {
"overall": {
"TOTAL": 432,
"PASS": 160,
"FAIL": 188,
"ERROR": 84,
"SKIPPED": 0,
"pass_rate": 37.03703703703704,
"fail_rate": 43.51851851851852,
"error_rate": 19.444444444444446
},
"by_metric": {
"ragas:context_precision_with_reference": {
"pass": 36,
"fail": 51,
"error": 21,
"skipped": 0,
"pass_rate": 33.33333333333333,
"fail_rate": 47.22222222222222,
"error_rate": 19.444444444444446,
"skipped_rate": 0.0,
"score_statistics": {
"count": 87,
"mean": 0.4434865900147031,
"median": 0.0,
"std": 0.47947104466841306,
"min": 0.0,
"max": 0.9999999999666667,
"confidence_interval": {
"low": 0.3438697317798228,
"mean": 0.4425287356098659,
"high": 0.5421455938391834,
"confidence_level": 95.0
}
}
},
"ragas:context_precision_without_reference": {
"pass": 50,
"fail": 37,
"error": 21,
"skipped": 0,
"pass_rate": 46.2962962962963,
"fail_rate": 34.25925925925926,
"error_rate": 19.444444444444446,
"skipped_rate": 0.0,
"score_statistics": {
"count": 87,
"mean": 0.6091954022673371,
"median": 0.9999999999,
"std": 0.46281465738414,
"min": 0.0,
"max": 0.9999999999666667,
"confidence_interval": {
"low": 0.5143678160670976,
"mean": 0.6101532566731322,
"high": 0.7059386972799353,
"confidence_level": 95.0
}
}
},
"ragas:context_recall": {
"pass": 26,
"fail": 61,
"error": 21,
"skipped": 0,
"pass_rate": 24.074074074074073,
"fail_rate": 56.481481481481474,
"error_rate": 19.444444444444446,
"skipped_rate": 0.0,
"score_statistics": {
"count": 87,
"mean": 0.4013409961685824,
"median": 0.0,
"std": 0.4450649459822112,
"min": 0.0,
"max": 1.0,
"confidence_interval": {
"low": 0.3065134099616859,
"mean": 0.4013409961685824,
"high": 0.4932950191570882,
"confidence_level": 95.0
}
}
},
"ragas:context_relevance": {
"pass": 48,
"fail": 39,
"error": 21,
"skipped": 0,
"pass_rate": 44.44444444444444,
"fail_rate": 36.11111111111111,
"error_rate": 19.444444444444446,
"skipped_rate": 0.0,
"score_statistics": {
"count": 87,
"mean": 0.6522988505747126,
"median": 1.0,
"std": 0.4059089746437468,
"min": 0.0,
"max": 1.0,
"confidence_interval": {
"low": 0.5689655172413792,
"mean": 0.6522988505747126,
"high": 0.7385057471264367,
"confidence_level": 95.0
}
}
}
}
}
}
109 changes: 109 additions & 0 deletions config/lcore_regression/system-config-pr-gate.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# System configuration for LCORE regression PR gate evaluation.
#
# This config tells the evaluation framework:
# - How to call the lightspeed-stack API (api section)
# - Which LLM judge to use for scoring (llm_pool; this file defines no judge_panel section)
# - Which embedding model to use for semantic metrics (embedding)
# - Default thresholds for each metric (metrics_metadata)
# - Where to write results (storage)
#
# The stack under test runs at localhost:8080 (started by docker-compose).
# The judge LLM is a separate OpenAI call (gpt-4o-mini) used only for scoring.

# Evaluation run settings.
core:
  max_threads: 4
  fail_on_invalid_data: true
  skip_on_failure: false

# Judge LLM pool: the model used only for scoring, called separately from the
# stack under test.
llm_pool:
  # Settings shared by every model listed under `models`.
  defaults:
    cache_enabled: false
    timeout: 120
    num_retries: 3
    parameters:
      temperature: 0.0
      max_completion_tokens: 4096
  models:
    judge_gpt_4o_mini:
      provider: openai
      model: gpt-4o-mini

# Embedding model used for semantic metrics.
embedding:
  provider: openai
  model: text-embedding-3-small
  cache_enabled: false

# How to call the lightspeed-stack API; the stack under test listens on
# localhost:8080.
api:
  enabled: true
  api_base: http://localhost:8080
  version: v1
  endpoint_type: query
  timeout: 300
  provider: openai
  model: gpt-4o-mini
  cache_enabled: false

# Default pass/fail threshold for each metric.
# NOTE(review): `default: false` presumably means a metric runs only when the
# evaluation data requests it for a turn - confirm against the framework docs.
metrics_metadata:
  turn_level:
    ragas:context_recall:
      threshold: 0.8
      description: Did we fetch every fact the answer needs?
      default: false

    ragas:context_precision_with_reference:
      threshold: 0.7
      description: How precise the retrieved context is (with reference)
      default: false

    ragas:context_precision_without_reference:
      threshold: 0.7
      description: How precise the retrieved context is (without reference)
      default: false

    ragas:context_relevance:
      threshold: 0.7
      description: Is what we retrieved actually relevant to user query?
      default: false

# Where results are written and which columns the CSV output contains.
storage:
  - type: file
    output_dir: ./eval_output/lcore_regression
    base_filename: evaluation
    enabled_outputs:
      - csv
      - json
      - txt
    csv_columns:
      - conversation_group_id
      - turn_id
      - metric_identifier
      - result
      - score
      - threshold
      - reason
      - query
      - response
      - contexts
      - expected_response
      - expected_keywords
      - api_input_tokens
      - api_output_tokens

visualization:
  figsize:
    - 14
    - 10
  dpi: 150
  enabled_graphs:
    - pass_rates
    - score_distribution
    - status_breakdown

environment:
  LITELLM_LOG: ERROR

logging:
  source_level: INFO
  package_level: WARNING
  log_format: "%(asctime)s - %(levelname)s - %(message)s"
  show_timestamps: true
1 change: 1 addition & 0 deletions script/regression/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Regression comparison scripts for evaluation gating."""
Loading
Loading