lightspeed-core · alessandralanz · Jun 17, 2026 · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026
diff --git a/baselines/lcore_regression/current_baseline_summary.json b/baselines/lcore_regression/current_baseline_summary.json
@@ -0,0 +1,114 @@
+{
+  "timestamp": "2026-06-30T21:09:26.159970+00:00",
+  "total_evaluations": 432,
+  "summary_stats": {
+    "overall": {
+      "TOTAL": 432,
+      "PASS": 160,
+      "FAIL": 188,
+      "ERROR": 84,
+      "SKIPPED": 0,
+      "pass_rate": 37.03703703703704,
+      "fail_rate": 43.51851851851852,
+      "error_rate": 19.444444444444446
+    },
+    "by_metric": {
+      "ragas:context_precision_with_reference": {
+        "pass": 36,
+        "fail": 51,
+        "error": 21,
+        "skipped": 0,
+        "pass_rate": 33.33333333333333,
+        "fail_rate": 47.22222222222222,
+        "error_rate": 19.444444444444446,
+        "skipped_rate": 0.0,
+        "score_statistics": {
+          "count": 87,
+          "mean": 0.4434865900147031,
+          "median": 0.0,
+          "std": 0.47947104466841306,
+          "min": 0.0,
+          "max": 0.9999999999666667,
+          "confidence_interval": {
+            "low": 0.3438697317798228,
+            "mean": 0.4425287356098659,
+            "high": 0.5421455938391834,
+            "confidence_level": 95.0
+          }
+        }
+      },
+      "ragas:context_precision_without_reference": {
+        "pass": 50,
+        "fail": 37,
+        "error": 21,
+        "skipped": 0,
+        "pass_rate": 46.2962962962963,
+        "fail_rate": 34.25925925925926,
+        "error_rate": 19.444444444444446,
+        "skipped_rate": 0.0,
+        "score_statistics": {
+          "count": 87,
+          "mean": 0.6091954022673371,
+          "median": 0.9999999999,
+          "std": 0.46281465738414,
+          "min": 0.0,
+          "max": 0.9999999999666667,
+          "confidence_interval": {
+            "low": 0.5143678160670976,
+            "mean": 0.6101532566731322,
+            "high": 0.7059386972799353,
+            "confidence_level": 95.0
+          }
+        }
+      },
+      "ragas:context_recall": {
+        "pass": 26,
+        "fail": 61,
+        "error": 21,
+        "skipped": 0,
+        "pass_rate": 24.074074074074073,
+        "fail_rate": 56.481481481481474,
+        "error_rate": 19.444444444444446,
+        "skipped_rate": 0.0,
+        "score_statistics": {
+          "count": 87,
+          "mean": 0.4013409961685824,
+          "median": 0.0,
+          "std": 0.4450649459822112,
+          "min": 0.0,
+          "max": 1.0,
+          "confidence_interval": {
+            "low": 0.3065134099616859,
+            "mean": 0.4013409961685824,
+            "high": 0.4932950191570882,
+            "confidence_level": 95.0
+          }
+        }
+      },
+      "ragas:context_relevance": {
+        "pass": 48,
+        "fail": 39,
+        "error": 21,
+        "skipped": 0,
+        "pass_rate": 44.44444444444444,
+        "fail_rate": 36.11111111111111,
+        "error_rate": 19.444444444444446,
+        "skipped_rate": 0.0,
+        "score_statistics": {
+          "count": 87,
+          "mean": 0.6522988505747126,
+          "median": 1.0,
+          "std": 0.4059089746437468,
+          "min": 0.0,
+          "max": 1.0,
+          "confidence_interval": {
+            "low": 0.5689655172413792,
+            "mean": 0.6522988505747126,
+            "high": 0.7385057471264367,
+            "confidence_level": 95.0
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/config/lcore_regression/system-config-pr-gate.yaml b/config/lcore_regression/system-config-pr-gate.yaml
@@ -0,0 +1,109 @@
+# System configuration for LCORE regression PR gate evaluation.
+#
+# This config tells the evaluation framework:
+#   - How to call the lightspeed-stack API (api section)
+#   - Which LLM judge to use for scoring (llm_pool; TODO confirm: no judge_panel section is defined below)
+#   - Which embedding model to use for semantic metrics (embedding)
+#   - Default thresholds for each metric (metrics_metadata)
+#   - Where to write results (storage)
+#
+# The stack under test runs at localhost:8080 (started by docker-compose).
+# The judge LLM is a separate OpenAI call (gpt-4o-mini) used only for scoring.
+
+core:  # run-wide settings: failure handling, input validation, thread cap
+  skip_on_failure: false
+  fail_on_invalid_data: true
+  max_threads: 4
+
+llm_pool:  # judge LLM used only for scoring, separate from the stack under test
+  models:
+    judge_gpt_4o_mini:
+      model: gpt-4o-mini
+      provider: openai
+  defaults:
+    num_retries: 3
+    timeout: 120
+    cache_enabled: false
+    parameters:
+      max_completion_tokens: 4096
+      temperature: 0.0
+
+embedding:  # embedding model for the semantic metrics
+  model: text-embedding-3-small
+  provider: openai
+  cache_enabled: false
+
+api:  # how the evaluator calls the lightspeed-stack API under test
+  enabled: true
+  api_base: 'http://localhost:8080'
+  endpoint_type: query
+  version: v1
+  model: gpt-4o-mini
+  provider: openai
+  timeout: 300
+  cache_enabled: false
+
+metrics_metadata:  # default threshold per metric
+  turn_level:
+    'ragas:context_recall':
+      description: Did we fetch every fact the answer needs?
+      default: false
+      threshold: 0.8
+
+    'ragas:context_precision_with_reference':
+      description: How precise the retrieved context is (with reference)
+      default: false
+      threshold: 0.7
+
+    'ragas:context_precision_without_reference':
+      description: How precise the retrieved context is (without reference)
+      default: false
+      threshold: 0.7
+
+    'ragas:context_relevance':
+      description: Is what we retrieved actually relevant to user query?
+      default: false
+      threshold: 0.7
+
+storage:  # where evaluation results are written
+  - type: file
+    base_filename: evaluation
+    output_dir: './eval_output/lcore_regression'
+    enabled_outputs:
+      - csv
+      - json
+      - txt
+    csv_columns:  # sequence order kept exactly as originally listed
+      - conversation_group_id
+      - turn_id
+      - metric_identifier
+      - result
+      - score
+      - threshold
+      - reason
+      - query
+      - response
+      - contexts
+      - expected_response
+      - expected_keywords
+      - api_input_tokens
+      - api_output_tokens
+
+visualization:  # graph selection and figure size/resolution
+  enabled_graphs:
+    - pass_rates
+    - score_distribution
+    - status_breakdown
+  dpi: 150
+  figsize:
+    - 14
+    - 10
+
+environment:  # presumably exported as environment variables for the run; confirm
+  LITELLM_LOG: 'ERROR'
+
+logging:  # log levels and message format
+  show_timestamps: true
+  log_format: '%(asctime)s - %(levelname)s - %(message)s'
+  package_level: WARNING
+  source_level: INFO
diff --git a/script/regression/__init__.py b/script/regression/__init__.py
@@ -0,0 +1 @@
+"""Regression comparison scripts for evaluation gating."""
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		"""Regression comparison scripts for evaluation gating."""