Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,7 @@ storage:
| CSV column name | Description |
|-----------------|-------------|
| conversation_group_id | Conversation group id |
| tag | Tag for grouping eval conversations |
| tag | Tag(s) for grouping and filtering conversations (string or list of strings; stored as JSON array) |
| turn_id | Turn id |
| metric_identifier | Metric name |
| result | Result -- PASS/FAIL/ERROR/SKIPPED |
Expand Down
39 changes: 33 additions & 6 deletions src/lightspeed_evaluation/core/models/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,21 @@
logger = logging.getLogger(__name__)


def _normalize_tag(v: str | list) -> list[str]:
"""Normalize tag to list[str] for backward compatibility with single string."""
if isinstance(v, str):
items: list = [v]
else:
items = v
invalid = [item for item in items if not isinstance(item, str)]
if invalid:
raise ValueError(f"tag items must be strings, got: {invalid}")
result = [item for item in items if item.strip()]
if not result:
raise ValueError("tag must contain at least one non-empty string")
return result


class ConversationMetadata(BaseModel):
"""Optional user-defined metadata for a conversation group."""

Expand Down Expand Up @@ -520,10 +535,10 @@ class EvaluationData(BaseModel):
min_length=1,
description="Optional description of the conversation group",
)
tag: str = Field(
default="eval",
tag: list[str] = Field(
default=["eval"],
min_length=1,
description="Tag for grouping and filtering conversations",
description="Tag(s) for grouping and filtering conversations",
)
Comment thread
xmican10 marked this conversation as resolved.
skip: bool = Field(
default=False,
Expand Down Expand Up @@ -586,6 +601,12 @@ def is_metric_invalid(self, metric: str) -> bool:
"""Returns True if the metric didn't pass the validation."""
return metric in self._invalid_metrics

@field_validator("tag", mode="before")
@classmethod
def _validate_tag(cls, v: str | list) -> list[str]:
    """Coerce the raw ``tag`` input into a list of strings via ``_normalize_tag``."""
    normalized_tags = _normalize_tag(v)
    return normalized_tags

@field_validator("conversation_metrics")
@classmethod
def validate_conversation_metrics(
Expand Down Expand Up @@ -659,10 +680,10 @@ class EvaluationResult(MetricResult, StreamingMetricsMixin):
conversation_group_id: str = Field(
..., min_length=1, description="Conversation group identifier"
)
tag: str = Field(
default="eval",
tag: list[str] = Field(
default=["eval"],
min_length=1,
description="Tag for grouping and filtering results",
description="Tag(s) for grouping and filtering results",
)
turn_id: Optional[str] = Field(
default=None, description="Turn ID if turn-level evaluation"
Expand Down Expand Up @@ -714,6 +735,12 @@ class EvaluationResult(MetricResult, StreamingMetricsMixin):
default=None, description="Expected tool calls formatted as string"
)

@field_validator("tag", mode="before")
@classmethod
def _validate_tag(cls, v: str | list) -> list[str]:
    """Coerce the raw ``tag`` input into a list of strings via ``_normalize_tag``."""
    normalized_tags = _normalize_tag(v)
    return normalized_tags


class EvaluationScope(BaseModel):
"""Scope and parameters for metric evaluation."""
Expand Down
3 changes: 2 additions & 1 deletion src/lightspeed_evaluation/core/output/statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,8 @@ def compute_tag_stats(

grouped: dict[str, list[EvaluationResult]] = {}
for r in results:
grouped.setdefault(r.tag, []).append(r)
for t in set(r.tag):
grouped.setdefault(t, []).append(r)
Comment thread
xmican10 marked this conversation as resolved.

tag_stats: dict[str, TagStats] = {}
for tag in sorted(grouped):
Expand Down
2 changes: 1 addition & 1 deletion src/lightspeed_evaluation/core/storage/langfuse_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@

from __future__ import annotations

import importlib
import importlib.util
import logging
from typing import Any, Optional

Expand Down
4 changes: 2 additions & 2 deletions src/lightspeed_evaluation/core/storage/sql_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ class EvaluationResultDB(Base): # pylint: disable=too-few-public-methods
timestamp = Column(DateTime, nullable=False, index=True)

conversation_group_id = Column(String(255), nullable=False, index=True)
tag = Column(String(100), nullable=True)
tag = Column(Text, nullable=True)
turn_id = Column(String(100), nullable=True)
metric_identifier = Column(String(255), nullable=False, index=True)
metric_metadata = Column(Text, nullable=True)
Expand Down Expand Up @@ -309,7 +309,7 @@ def _result_to_db_record(self, result: EvaluationResult) -> EvaluationResultDB:
run_id=self._run_info.run_id,
timestamp=datetime.now(UTC),
conversation_group_id=result.conversation_group_id,
tag=result.tag,
tag=json.dumps(result.tag),
turn_id=result.turn_id,
metric_identifier=result.metric_identifier,
metric_metadata=result.metric_metadata,
Expand Down
2 changes: 1 addition & 1 deletion src/lightspeed_evaluation/core/system/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,7 +408,7 @@ def _filter_by_scope(
filtered = [
conv_data
for conv_data in evaluation_data
if conv_data.tag in tag_set
if any(t in tag_set for t in conv_data.tag)
or conv_data.conversation_group_id in conv_id_set
]

Expand Down
12 changes: 6 additions & 6 deletions src/lightspeed_evaluation/pipeline/evaluation/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def _create_result( # pylint: disable=too-many-arguments,too-many-positional-ar
reason: str,
result_status: str,
*,
tag: str = "eval",
tag: list[str],
turn_id: Optional[str] = None,
query: str = "",
) -> EvaluationResult:
Expand All @@ -33,7 +33,7 @@ def _create_result( # pylint: disable=too-many-arguments,too-many-positional-ar
metric_id: Metric identifier
reason: Reason for the result
result_status: Result status (ERROR, SKIPPED, etc.)
tag: Tag for grouping and filtering results
tag: Tag(s) for grouping and filtering results
turn_id: Turn ID (None for conversation-level)
query: Query text
"""
Expand All @@ -53,7 +53,7 @@ def create_error_result( # pylint: disable=too-many-arguments,too-many-position
metric_id: str,
reason: str,
*,
tag: str = "eval",
tag: list[str],
turn_id: Optional[str] = None,
query: str = "",
) -> EvaluationResult:
Expand All @@ -63,7 +63,7 @@ def create_error_result( # pylint: disable=too-many-arguments,too-many-position
conv_id: Conversation group ID
metric_id: Metric identifier
reason: Error reason
tag: Tag for grouping and filtering results
tag: Tag(s) for grouping and filtering results
turn_id: Turn ID (None for conversation-level)
query: Query text
"""
Expand All @@ -77,7 +77,7 @@ def create_skipped_result( # pylint: disable=too-many-arguments,too-many-positi
metric_id: str,
reason: str,
*,
tag: str = "eval",
tag: list[str],
turn_id: Optional[str] = None,
query: str = "",
) -> EvaluationResult:
Expand All @@ -87,7 +87,7 @@ def create_skipped_result( # pylint: disable=too-many-arguments,too-many-positi
conv_id: Conversation group ID
metric_id: Metric identifier
reason: Skip reason
tag: Tag for grouping and filtering results
tag: Tag(s) for grouping and filtering results
turn_id: Turn ID (None for conversation-level)
query: Query text
"""
Expand Down
6 changes: 5 additions & 1 deletion src/lightspeed_evaluation/pipeline/evaluation/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,7 @@ def _evaluate_turn(
conv_data.conversation_group_id,
metric_identifier,
error_reason,
tag=conv_data.tag,
turn_id=turn_data.turn_id,
query=turn_data.query or "",
)
Expand Down Expand Up @@ -311,7 +312,10 @@ def _evaluate_conversation(
logger.error(error_reason)
results.append(
self.components.error_handler.create_error_result(
conv_data.conversation_group_id, metric_identifier, error_reason
conv_data.conversation_group_id,
metric_identifier,
error_reason,
tag=conv_data.tag,
)
)
continue
Expand Down
84 changes: 72 additions & 12 deletions tests/unit/core/models/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -436,12 +436,12 @@ def test_valid_creation(self) -> None:
conversation_group_id="conv1",
turns=turns,
description="Test conversation",
tag="test_tag",
tag=["test_tag"],
conversation_metrics=["deepeval:conversation_completeness"],
)

assert eval_data.conversation_group_id == "conv1"
assert eval_data.tag == "test_tag"
assert eval_data.tag == ["test_tag"]
assert len(eval_data.turns) == 2
assert eval_data.description == "Test conversation"
assert eval_data.conversation_metrics is not None
Expand All @@ -452,14 +452,50 @@ def test_default_tag_value(self) -> None:
turn = TurnData(turn_id="turn1", query="Query")
eval_data = EvaluationData(conversation_group_id="conv1", turns=[turn])

assert eval_data.tag == "eval"
assert eval_data.tag == ["eval"]

def test_empty_tag_rejected(self) -> None:
"""Test that empty tag is rejected."""
def test_single_string_tag_normalized_to_list(self) -> None:
    """A bare string passed as ``tag`` ends up as a one-element list."""
    only_turn = TurnData(turn_id="turn1", query="Query")

    data = EvaluationData(
        conversation_group_id="conv1",
        turns=[only_turn],
        tag="basic",  # type: ignore[arg-type]
    )

    assert data.tag == ["basic"]
Comment thread
xmican10 marked this conversation as resolved.

def test_list_tag_accepted(self) -> None:
    """A list of tag strings is kept as given."""
    only_turn = TurnData(turn_id="turn1", query="Query")

    data = EvaluationData(
        conversation_group_id="conv1",
        turns=[only_turn],
        tag=["basic", "advanced"],
    )

    assert data.tag == ["basic", "advanced"]

def test_empty_tag_list_rejected(self) -> None:
"""Test that empty list tag is rejected."""
turn = TurnData(turn_id="turn1", query="Query")

with pytest.raises(ValidationError):
EvaluationData(conversation_group_id="conv1", turns=[turn], tag="")
EvaluationData(conversation_group_id="conv1", turns=[turn], tag=[])

def test_whitespace_only_tag_rejected(self) -> None:
    """A tag list holding only whitespace strings fails validation."""
    only_turn = TurnData(turn_id="turn1", query="Query")

    with pytest.raises(ValidationError):
        EvaluationData(
            conversation_group_id="conv1",
            turns=[only_turn],
            tag=[" "],
        )

def test_non_string_tag_items_rejected(self) -> None:
    """A tag list containing a non-string item fails validation."""
    only_turn = TurnData(turn_id="turn1", query="Query")

    with pytest.raises(ValidationError):
        EvaluationData(
            tag=[1, "prod"],  # type: ignore[list-item]
            conversation_group_id="conv1",
            turns=[only_turn],
        )

def test_empty_conversation_id_rejected(self) -> None:
"""Test that empty conversation_group_id is rejected."""
Expand Down Expand Up @@ -500,7 +536,7 @@ def test_default_values(self) -> None:
)

# Test meaningful defaults
assert result.tag == "eval"
assert result.tag == ["eval"]
assert result.score is None
assert result.reason == ""
assert result.evaluation_latency == 0
Expand All @@ -509,21 +545,45 @@ def test_explicit_tag_value(self) -> None:
"""Test EvaluationResult with explicit tag value."""
result = EvaluationResult(
conversation_group_id="conv1",
tag="custom_tag",
tag=["custom_tag"],
turn_id="turn1",
metric_identifier="metric1",
result="PASS",
threshold=0.7,
)

assert result.tag == "custom_tag"
assert result.tag == ["custom_tag"]

def test_empty_tag_list_rejected(self) -> None:
    """An empty tag list on EvaluationResult fails validation."""
    with pytest.raises(ValidationError):
        EvaluationResult(
            tag=[],
            conversation_group_id="conv1",
            turn_id="turn1",
            metric_identifier="metric1",
            result="PASS",
            threshold=0.7,
        )

def test_whitespace_only_tag_rejected(self) -> None:
    """A tag list holding only whitespace strings fails validation."""
    with pytest.raises(ValidationError):
        EvaluationResult(
            tag=[" "],
            conversation_group_id="conv1",
            turn_id="turn1",
            metric_identifier="metric1",
            result="PASS",
            threshold=0.7,
        )

def test_empty_tag_rejected(self) -> None:
"""Test that empty tag is rejected."""
def test_non_string_tag_items_rejected(self) -> None:
"""Test that non-string items in tag list are rejected."""
with pytest.raises(ValidationError):
EvaluationResult(
conversation_group_id="conv1",
tag="",
tag=[1, "prod"], # type: ignore[list-item]
turn_id="turn1",
metric_identifier="metric1",
result="PASS",
Expand Down
29 changes: 26 additions & 3 deletions tests/unit/core/output/test_statistics_detailed.py
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ def test_compute_detailed_stats_by_tag(self) -> None:
results = [
EvaluationResult(
conversation_group_id="conv1",
tag="production",
tag=["production"],
turn_id="turn1",
metric_identifier="metric1",
result="PASS",
Expand All @@ -329,7 +329,7 @@ def test_compute_detailed_stats_by_tag(self) -> None:
),
EvaluationResult(
conversation_group_id="conv2",
tag="production",
tag=["production"],
turn_id="turn1",
metric_identifier="metric1",
result="PASS",
Expand All @@ -339,7 +339,7 @@ def test_compute_detailed_stats_by_tag(self) -> None:
),
EvaluationResult(
conversation_group_id="conv3",
tag="staging",
tag=["staging"],
turn_id="turn1",
metric_identifier="metric1",
result="FAIL",
Expand Down Expand Up @@ -372,6 +372,29 @@ def test_compute_detailed_stats_by_tag(self) -> None:
assert staging_stats["fail_rate"] == 100.0
assert "score_statistics" in staging_stats

def test_compute_detailed_stats_multi_tag_result_counted_in_each_bucket(
    self,
) -> None:
    """One result carrying two tags is counted once under each tag."""
    multi_tagged = EvaluationResult(
        conversation_group_id="conv1",
        tag=["production", "staging"],
        turn_id="turn1",
        metric_identifier="metric1",
        result="PASS",
        score=0.9,
        threshold=0.7,
    )

    by_tag = compute_detailed_stats([multi_tagged]).model_dump()["by_tag"]

    # Both buckets must exist before their counters are inspected.
    for tag_name in ("production", "staging"):
        assert tag_name in by_tag
    for tag_name in ("production", "staging"):
        assert by_tag[tag_name]["passed"] == 1
        assert by_tag[tag_name]["failed"] == 0

def test_compute_detailed_stats_default_tag(self) -> None:
"""Test compute_detailed_stats with default 'eval' tag."""
results = [
Expand Down
Loading
Loading