From 2df33800f1e2b9663b6d55fcb37983a8523e8e9f Mon Sep 17 00:00:00 2001 From: Eva Micankova Date: Wed, 1 Jul 2026 13:41:38 +0200 Subject: [PATCH] Adding support for multiple tags --- docs/configuration.md | 2 +- src/lightspeed_evaluation/core/models/data.py | 39 +++++++-- .../core/output/statistics.py | 3 +- .../core/storage/langfuse_storage.py | 2 +- .../core/storage/sql_storage.py | 4 +- .../core/system/validator.py | 2 +- .../pipeline/evaluation/errors.py | 12 +-- .../pipeline/evaluation/processor.py | 6 +- tests/unit/core/models/test_data.py | 84 ++++++++++++++++--- .../core/output/test_statistics_detailed.py | 29 ++++++- tests/unit/core/storage/test_sql_storage.py | 5 +- tests/unit/core/system/test_validator.py | 35 ++++++-- tests/unit/pipeline/evaluation/conftest.py | 4 +- 13 files changed, 182 insertions(+), 45 deletions(-) diff --git a/docs/configuration.md b/docs/configuration.md index 46b9d98d..06f67930 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -459,7 +459,7 @@ storage: | CSV column name | Description | |-----------------|-------------| | conversation_group_id | Conversation group id | -| tag | Tag for grouping eval conversations | +| tag | Tag(s) for grouping and filtering conversations (string or list of strings; stored as JSON array) | | turn_id | Turn id | | metric_identifier | Metric name | | result | Result -- PASS/FAIL/ERROR/SKIPPED | diff --git a/src/lightspeed_evaluation/core/models/data.py b/src/lightspeed_evaluation/core/models/data.py index c9f4a999..cff90fdd 100644 --- a/src/lightspeed_evaluation/core/models/data.py +++ b/src/lightspeed_evaluation/core/models/data.py @@ -12,6 +12,21 @@ logger = logging.getLogger(__name__) +def _normalize_tag(v: str | list) -> list[str]: + """Normalize tag to list[str] for backward compatibility with single string.""" + if isinstance(v, str): + items: list = [v] + else: + items = v + invalid = [item for item in items if not isinstance(item, str)] + if invalid: + raise ValueError(f"tag items must be strings, got: {invalid}") + result = [item for item in items if item.strip()] + if not result: + raise ValueError("tag must contain at least one non-empty string") + return result + + class ConversationMetadata(BaseModel): """Optional user-defined metadata for a conversation group.""" @@ -520,10 +535,10 @@ class EvaluationData(BaseModel): min_length=1, description="Optional description of the conversation group", ) - tag: str = Field( - default="eval", + tag: list[str] = Field( + default=["eval"], min_length=1, - description="Tag for grouping and filtering conversations", + description="Tag(s) for grouping and filtering conversations", ) skip: bool = Field( default=False, @@ -586,6 +601,12 @@ def is_metric_invalid(self, metric: str) -> bool: """Returns True if the metric didn't pass the validation.""" return metric in self._invalid_metrics + @field_validator("tag", mode="before") + @classmethod + def _validate_tag(cls, v: str | list) -> list[str]: + """Normalize tag to list[str] for backward compatibility with single string.""" + return _normalize_tag(v) + @field_validator("conversation_metrics") @classmethod def validate_conversation_metrics( @@ -659,10 +680,10 @@ class EvaluationResult(MetricResult, StreamingMetricsMixin): conversation_group_id: str = Field( ..., min_length=1, description="Conversation group identifier" ) - tag: str = Field( - default="eval", + tag: list[str] = Field( + default=["eval"], min_length=1, - description="Tag for grouping and filtering results", + description="Tag(s) for grouping and filtering results", ) turn_id: Optional[str] = Field( default=None, description="Turn ID if turn-level evaluation" @@ -714,6 +735,12 @@ class EvaluationResult(MetricResult, StreamingMetricsMixin): default=None, description="Expected tool calls formatted as string" ) + @field_validator("tag", mode="before") + @classmethod + def _validate_tag(cls, v: str | list) -> list[str]: + """Normalize tag to list[str] for backward compatibility with single string.""" + return _normalize_tag(v) + class EvaluationScope(BaseModel): """Scope and parameters for metric evaluation.""" diff --git a/src/lightspeed_evaluation/core/output/statistics.py b/src/lightspeed_evaluation/core/output/statistics.py index ad4c35af..88ffad4c 100644 --- a/src/lightspeed_evaluation/core/output/statistics.py +++ b/src/lightspeed_evaluation/core/output/statistics.py @@ -222,7 +222,8 @@ def compute_tag_stats( grouped: dict[str, list[EvaluationResult]] = {} for r in results: - grouped.setdefault(r.tag, []).append(r) + for t in set(r.tag): + grouped.setdefault(t, []).append(r) tag_stats: dict[str, TagStats] = {} for tag in sorted(grouped): diff --git a/src/lightspeed_evaluation/core/storage/langfuse_storage.py b/src/lightspeed_evaluation/core/storage/langfuse_storage.py index 08a02568..38ad0cf8 100644 --- a/src/lightspeed_evaluation/core/storage/langfuse_storage.py +++ b/src/lightspeed_evaluation/core/storage/langfuse_storage.py @@ -22,7 +22,7 @@ from __future__ import annotations -import importlib +import importlib.util import logging from typing import Any, Optional diff --git a/src/lightspeed_evaluation/core/storage/sql_storage.py b/src/lightspeed_evaluation/core/storage/sql_storage.py index 1f28b5fe..f91bdc1c 100644 --- a/src/lightspeed_evaluation/core/storage/sql_storage.py +++ b/src/lightspeed_evaluation/core/storage/sql_storage.py @@ -47,7 +47,7 @@ class EvaluationResultDB(Base): # pylint: disable=too-few-public-methods timestamp = Column(DateTime, nullable=False, index=True) conversation_group_id = Column(String(255), nullable=False, index=True) - tag = Column(String(100), nullable=True) + tag = Column(Text, nullable=True) turn_id = Column(String(100), nullable=True) metric_identifier = Column(String(255), nullable=False, index=True) metric_metadata = Column(Text, nullable=True) @@ -309,7 +309,7 @@ def _result_to_db_record(self, result: EvaluationResult) -> EvaluationResultDB: run_id=self._run_info.run_id, timestamp=datetime.now(UTC), conversation_group_id=result.conversation_group_id, - tag=result.tag, + tag=json.dumps(result.tag), turn_id=result.turn_id, metric_identifier=result.metric_identifier, metric_metadata=result.metric_metadata, diff --git a/src/lightspeed_evaluation/core/system/validator.py b/src/lightspeed_evaluation/core/system/validator.py index bdbf4755..950139e6 100644 --- a/src/lightspeed_evaluation/core/system/validator.py +++ b/src/lightspeed_evaluation/core/system/validator.py @@ -408,7 +408,7 @@ def _filter_by_scope( filtered = [ conv_data for conv_data in evaluation_data - if conv_data.tag in tag_set + if any(t in tag_set for t in conv_data.tag) or conv_data.conversation_group_id in conv_id_set ] diff --git a/src/lightspeed_evaluation/pipeline/evaluation/errors.py b/src/lightspeed_evaluation/pipeline/evaluation/errors.py index 5a352906..7820d29b 100644 --- a/src/lightspeed_evaluation/pipeline/evaluation/errors.py +++ b/src/lightspeed_evaluation/pipeline/evaluation/errors.py @@ -22,7 +22,7 @@ def _create_result( # pylint: disable=too-many-arguments,too-many-positional-ar reason: str, result_status: str, *, - tag: str = "eval", + tag: list[str], turn_id: Optional[str] = None, query: str = "", ) -> EvaluationResult: @@ -33,7 +33,7 @@ def _create_result( # pylint: disable=too-many-arguments,too-many-positional-ar metric_id: Metric identifier reason: Reason for the result result_status: Result status (ERROR, SKIPPED, etc.) - tag: Tag for grouping and filtering results + tag: Tag(s) for grouping and filtering results turn_id: Turn ID (None for conversation-level) query: Query text """ @@ -53,7 +53,7 @@ def create_error_result( # pylint: disable=too-many-arguments,too-many-position metric_id: str, reason: str, *, - tag: str = "eval", + tag: list[str], turn_id: Optional[str] = None, query: str = "", ) -> EvaluationResult: @@ -63,7 +63,7 @@ def create_error_result( # pylint: disable=too-many-arguments,too-many-position conv_id: Conversation group ID metric_id: Metric identifier reason: Error reason - tag: Tag for grouping and filtering results + tag: Tag(s) for grouping and filtering results turn_id: Turn ID (None for conversation-level) query: Query text """ @@ -77,7 +77,7 @@ def create_skipped_result( # pylint: disable=too-many-arguments,too-many-positi metric_id: str, reason: str, *, - tag: str = "eval", + tag: list[str], turn_id: Optional[str] = None, query: str = "", ) -> EvaluationResult: @@ -87,7 +87,7 @@ def create_skipped_result( # pylint: disable=too-many-arguments,too-many-positi conv_id: Conversation group ID metric_id: Metric identifier reason: Skip reason - tag: Tag for grouping and filtering results + tag: Tag(s) for grouping and filtering results turn_id: Turn ID (None for conversation-level) query: Query text """ diff --git a/src/lightspeed_evaluation/pipeline/evaluation/processor.py b/src/lightspeed_evaluation/pipeline/evaluation/processor.py index 4f4adaab..9375df7b 100644 --- a/src/lightspeed_evaluation/pipeline/evaluation/processor.py +++ b/src/lightspeed_evaluation/pipeline/evaluation/processor.py @@ -283,6 +283,7 @@ def _evaluate_turn( conv_data.conversation_group_id, metric_identifier, error_reason, + tag=conv_data.tag, turn_id=turn_data.turn_id, query=turn_data.query or "", ) @@ -311,7 +312,10 @@ def _evaluate_conversation( logger.error(error_reason) results.append( self.components.error_handler.create_error_result( - conv_data.conversation_group_id, metric_identifier, error_reason + conv_data.conversation_group_id, + metric_identifier, + error_reason, + tag=conv_data.tag, ) ) continue diff --git a/tests/unit/core/models/test_data.py b/tests/unit/core/models/test_data.py index 9977dfb5..69a584aa 100644 --- a/tests/unit/core/models/test_data.py +++ b/tests/unit/core/models/test_data.py @@ -436,12 +436,12 @@ def test_valid_creation(self) -> None: conversation_group_id="conv1", turns=turns, description="Test conversation", - tag="test_tag", + tag=["test_tag"], conversation_metrics=["deepeval:conversation_completeness"], ) assert eval_data.conversation_group_id == "conv1" - assert eval_data.tag == "test_tag" + assert eval_data.tag == ["test_tag"] assert len(eval_data.turns) == 2 assert eval_data.description == "Test conversation" assert eval_data.conversation_metrics is not None @@ -452,14 +452,50 @@ def test_default_tag_value(self) -> None: turn = TurnData(turn_id="turn1", query="Query") eval_data = EvaluationData(conversation_group_id="conv1", turns=[turn]) - assert eval_data.tag == "eval" + assert eval_data.tag == ["eval"] - def test_empty_tag_rejected(self) -> None: - """Test that empty tag is rejected.""" + def test_single_string_tag_normalized_to_list(self) -> None: + """Test that a single string tag is normalized to a list.""" + turn = TurnData(turn_id="turn1", query="Query") + eval_data = EvaluationData( + conversation_group_id="conv1", turns=[turn], tag="basic" # type: ignore[arg-type] + ) + + assert eval_data.tag == ["basic"] + + def test_list_tag_accepted(self) -> None: + """Test that a list of tags is accepted.""" + turn = TurnData(turn_id="turn1", query="Query") + eval_data = EvaluationData( + conversation_group_id="conv1", turns=[turn], tag=["basic", "advanced"] + ) + + assert eval_data.tag == ["basic", "advanced"] + + def test_empty_tag_list_rejected(self) -> None: + """Test that empty list tag is rejected.""" turn = TurnData(turn_id="turn1", query="Query") with pytest.raises(ValidationError): - EvaluationData(conversation_group_id="conv1", turns=[turn], tag="") + EvaluationData(conversation_group_id="conv1", turns=[turn], tag=[]) + + def test_whitespace_only_tag_rejected(self) -> None: + """Test that a list of only whitespace strings is rejected.""" + turn = TurnData(turn_id="turn1", query="Query") + + with pytest.raises(ValidationError): + EvaluationData(conversation_group_id="conv1", turns=[turn], tag=[" "]) + + def test_non_string_tag_items_rejected(self) -> None: + """Test that non-string items in tag list are rejected.""" + turn = TurnData(turn_id="turn1", query="Query") + + with pytest.raises(ValidationError): + EvaluationData( + conversation_group_id="conv1", + turns=[turn], + tag=[1, "prod"], # type: ignore[list-item] + ) def test_empty_conversation_id_rejected(self) -> None: """Test that empty conversation_group_id is rejected.""" @@ -500,7 +536,7 @@ def test_default_values(self) -> None: ) # Test meaningful defaults - assert result.tag == "eval" + assert result.tag == ["eval"] assert result.score is None assert result.reason == "" assert result.evaluation_latency == 0 @@ -509,21 +545,45 @@ def test_explicit_tag_value(self) -> None: """Test EvaluationResult with explicit tag value.""" result = EvaluationResult( conversation_group_id="conv1", - tag="custom_tag", + tag=["custom_tag"], turn_id="turn1", metric_identifier="metric1", result="PASS", threshold=0.7, ) - assert result.tag == "custom_tag" + assert result.tag == ["custom_tag"] + + def test_empty_tag_list_rejected(self) -> None: + """Test that empty tag list is rejected.""" + with pytest.raises(ValidationError): + EvaluationResult( + conversation_group_id="conv1", + tag=[], + turn_id="turn1", + metric_identifier="metric1", + result="PASS", + threshold=0.7, + ) + + def test_whitespace_only_tag_rejected(self) -> None: + """Test that a list of only whitespace strings is rejected.""" + with pytest.raises(ValidationError): + EvaluationResult( + conversation_group_id="conv1", + tag=[" "], + turn_id="turn1", + metric_identifier="metric1", + result="PASS", + threshold=0.7, + ) - def test_empty_tag_rejected(self) -> None: - """Test that empty tag is rejected.""" + def test_non_string_tag_items_rejected(self) -> None: + """Test that non-string items in tag list are rejected.""" with pytest.raises(ValidationError): EvaluationResult( conversation_group_id="conv1", - tag="", + tag=[1, "prod"], # type: ignore[list-item] turn_id="turn1", metric_identifier="metric1", result="PASS", diff --git a/tests/unit/core/output/test_statistics_detailed.py b/tests/unit/core/output/test_statistics_detailed.py index 63b128f1..1c404113 100644 --- a/tests/unit/core/output/test_statistics_detailed.py +++ b/tests/unit/core/output/test_statistics_detailed.py @@ -319,7 +319,7 @@ def test_compute_detailed_stats_by_tag(self) -> None: results = [ EvaluationResult( conversation_group_id="conv1", - tag="production", + tag=["production"], turn_id="turn1", metric_identifier="metric1", result="PASS", @@ -329,7 +329,7 @@ def test_compute_detailed_stats_by_tag(self) -> None: ), EvaluationResult( conversation_group_id="conv2", - tag="production", + tag=["production"], turn_id="turn1", metric_identifier="metric1", result="PASS", @@ -339,7 +339,7 @@ def test_compute_detailed_stats_by_tag(self) -> None: ), EvaluationResult( conversation_group_id="conv3", - tag="staging", + tag=["staging"], turn_id="turn1", metric_identifier="metric1", result="FAIL", @@ -372,6 +372,29 @@ def test_compute_detailed_stats_by_tag(self) -> None: assert staging_stats["fail_rate"] == 100.0 assert "score_statistics" in staging_stats + def test_compute_detailed_stats_multi_tag_result_counted_in_each_bucket( + self, + ) -> None: + """Test a single result with multiple tags appears in each tag's bucket.""" + result = EvaluationResult( + conversation_group_id="conv1", + tag=["production", "staging"], + turn_id="turn1", + metric_identifier="metric1", + result="PASS", + score=0.9, + threshold=0.7, + ) + + stats = compute_detailed_stats([result]).model_dump() + + assert "production" in stats["by_tag"] + assert "staging" in stats["by_tag"] + assert stats["by_tag"]["production"]["passed"] == 1 + assert stats["by_tag"]["production"]["failed"] == 0 + assert stats["by_tag"]["staging"]["passed"] == 1 + assert stats["by_tag"]["staging"]["failed"] == 0 + def test_compute_detailed_stats_default_tag(self) -> None: """Test compute_detailed_stats with default 'eval' tag.""" results = [ diff --git a/tests/unit/core/storage/test_sql_storage.py b/tests/unit/core/storage/test_sql_storage.py index 3f24adfa..15755e47 100644 --- a/tests/unit/core/storage/test_sql_storage.py +++ b/tests/unit/core/storage/test_sql_storage.py @@ -40,7 +40,7 @@ def sample_result() -> EvaluationResult: """Create a sample evaluation result for testing.""" return EvaluationResult( conversation_group_id="conv_001", - tag="test", + tag=["test"], turn_id="turn_1", metric_identifier="ragas:faithfulness", metric_metadata='{"threshold": 0.8}', @@ -270,7 +270,7 @@ def test_all_fields_stored(self, temp_db_url: str) -> None: """Test all EvaluationResult fields are stored.""" result = EvaluationResult( conversation_group_id="conv_full", - tag="integration", + tag=["integration"], turn_id="turn_5", metric_identifier="custom:metric", metric_metadata='{"key": "value"}', @@ -311,6 +311,7 @@ def test_all_fields_stored(self, temp_db_url: str) -> None: assert row.score == 0.92 assert row.evaluation_latency == 2.5 assert row.tool_calls == '[{"name": "search"}]' + assert row.tag == '["integration"]' def test_null_fields_handled(self, temp_db_url: str) -> None: """Test null/optional fields are handled correctly.""" diff --git a/tests/unit/core/system/test_validator.py b/tests/unit/core/system/test_validator.py index d4ce588d..2d82c9a9 100644 --- a/tests/unit/core/system/test_validator.py +++ b/tests/unit/core/system/test_validator.py @@ -532,23 +532,23 @@ def test_filter_by_scope_tags_only(self) -> None: data = [ EvaluationData( conversation_group_id="conv_1", - tag="basic", + tag=["basic"], turns=[TurnData(turn_id="t1", query="Q", response="A")], ), EvaluationData( conversation_group_id="conv_2", - tag="advanced", + tag=["advanced"], turns=[TurnData(turn_id="t1", query="Q", response="A")], ), EvaluationData( conversation_group_id="conv_3", - tag="basic", + tag=["basic"], turns=[TurnData(turn_id="t1", query="Q", response="A")], ), ] result = validator._filter_by_scope(data, tags=["basic"]) assert len(result) == 2 - assert all(c.tag == "basic" for c in result) + assert all("basic" in c.tag for c in result) def test_filter_by_scope_conv_ids_only(self) -> None: """Test filtering by conversation IDs only.""" @@ -577,17 +577,17 @@ def test_filter_by_scope_tags_and_conv_ids(self) -> None: data = [ EvaluationData( conversation_group_id="conv_1", - tag="basic", + tag=["basic"], turns=[TurnData(turn_id="t1", query="Q", response="A")], ), EvaluationData( conversation_group_id="conv_2", - tag="advanced", + tag=["advanced"], turns=[TurnData(turn_id="t1", query="Q", response="A")], ), EvaluationData( conversation_group_id="conv_3", - tag="tools", + tag=["tools"], turns=[TurnData(turn_id="t1", query="Q", response="A")], ), ] @@ -600,13 +600,32 @@ def test_filter_by_scope_no_match_returns_empty(self) -> None: data = [ EvaluationData( conversation_group_id="conv_1", - tag="basic", + tag=["basic"], turns=[TurnData(turn_id="t1", query="Q", response="A")], ), ] result = validator._filter_by_scope(data, tags=["nonexistent"]) assert len(result) == 0 + def test_filter_by_scope_multi_tag_matches_any(self) -> None: + """Test that a conversation with multiple tags is matched by any of them.""" + validator = DataValidator() + data = [ + EvaluationData( + conversation_group_id="conv_1", + tag=["basic", "advanced"], + turns=[TurnData(turn_id="t1", query="Q", response="A")], + ), + EvaluationData( + conversation_group_id="conv_2", + tag=["tools"], + turns=[TurnData(turn_id="t1", query="Q", response="A")], + ), + ] + result = validator._filter_by_scope(data, tags=["advanced"]) + assert len(result) == 1 + assert result[0].conversation_group_id == "conv_1" + class TestMetricsFilter: """Tests for --metrics filter in load_evaluation_data.""" diff --git a/tests/unit/pipeline/evaluation/conftest.py b/tests/unit/pipeline/evaluation/conftest.py index b4f192d6..b96969b8 100644 --- a/tests/unit/pipeline/evaluation/conftest.py +++ b/tests/unit/pipeline/evaluation/conftest.py @@ -1,4 +1,4 @@ -# pylint: disable=redefined-outer-name +# pylint: disable=redefined-outer-name,too-many-arguments """Pytest configuration and fixtures for evaluation tests.""" @@ -212,11 +212,13 @@ def create_error_result_side_effect( metric_id: str, reason: str, *, + tag: list[str], turn_id: str | None = None, query: str = "", ) -> EvaluationResult: return EvaluationResult( conversation_group_id=conv_id, + tag=tag, turn_id=turn_id, metric_identifier=metric_id, result="ERROR",