diff --git a/pyproject.toml b/pyproject.toml index 7fe96929e..55ba56f77 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "uipath" -version = "2.7.0" +version = "2.7.1" description = "Python SDK and CLI for UiPath Platform, enabling programmatic interaction with automation services, process management, and deployment tools." readme = { file = "README.md", content-type = "text/markdown" } requires-python = ">=3.11" diff --git a/samples/calculator/evaluations/evaluators/custom/correct_operator.py b/samples/calculator/evaluations/evaluators/custom/correct_operator.py index 74c679735..6e702798d 100644 --- a/samples/calculator/evaluations/evaluators/custom/correct_operator.py +++ b/samples/calculator/evaluations/evaluators/custom/correct_operator.py @@ -1,6 +1,7 @@ import json -from uipath.eval.evaluators import BaseEvaluator, BaseEvaluationCriteria, BaseEvaluatorConfig +from uipath.eval.evaluators import BaseEvaluationCriteria, BaseEvaluatorConfig +from uipath.eval.evaluators.base_evaluator import BaseEvaluator from uipath.eval.models import AgentExecution, EvaluationResult, NumericEvaluationResult from opentelemetry.sdk.trace import ReadableSpan diff --git a/src/uipath/_cli/_evals/_console_progress_reporter.py b/src/uipath/_cli/_evals/_console_progress_reporter.py index 9028a2397..baa01e3a8 100644 --- a/src/uipath/_cli/_evals/_console_progress_reporter.py +++ b/src/uipath/_cli/_evals/_console_progress_reporter.py @@ -15,7 +15,7 @@ EvalSetRunUpdatedEvent, EvaluationEvents, ) -from uipath.eval.evaluators import BaseEvaluator +from uipath.eval.evaluators.base_evaluator import GenericBaseEvaluator from uipath.eval.models import ScoreType logger = logging.getLogger(__name__) @@ -26,7 +26,7 @@ class ConsoleProgressReporter: def __init__(self): self.console = Console() - self.evaluators: dict[str, BaseEvaluator[Any, Any, Any]] = {} + self.evaluators: dict[str, GenericBaseEvaluator[Any, Any, Any]] = {} self.display_started = False self.eval_results_by_name: dict[str, list[Any]] = {} diff --git a/src/uipath/_cli/_evals/_evaluator_factory.py b/src/uipath/_cli/_evals/_evaluator_factory.py index 3ff3ad458..4c53a2c44 100644 --- a/src/uipath/_cli/_evals/_evaluator_factory.py +++ b/src/uipath/_cli/_evals/_evaluator_factory.py @@ -9,85 +9,20 @@ from uipath._cli._evals._helpers import ( # type: ignore # Remove after gnarly fix try_extract_file_and_class_name, ) -from uipath._cli._evals._models._evaluator import ( - EvaluatorConfig, - LegacyEqualsEvaluatorParams, - LegacyEvaluator, - LegacyJsonSimilarityEvaluatorParams, - LegacyLLMEvaluatorParams, - LegacyTrajectoryEvaluatorParams, -) -from uipath._cli._evals._models._evaluator_base_params import EvaluatorBaseParams +from uipath._cli._evals._models._evaluator import CodedEvaluator, LegacyEvaluator from uipath._utils.constants import EVALS_FOLDER from uipath.eval.evaluators import ( BaseEvaluator, - LegacyBaseEvaluator, + BaseLegacyEvaluator, LegacyContextPrecisionEvaluator, - LegacyExactMatchEvaluator, LegacyFaithfulnessEvaluator, - LegacyJsonSimilarityEvaluator, LegacyLlmAsAJudgeEvaluator, LegacyTrajectoryEvaluator, ) -from uipath.eval.evaluators.base_evaluator import BaseEvaluatorConfig -from uipath.eval.evaluators.contains_evaluator import ( - ContainsEvaluator, - ContainsEvaluatorConfig, -) -from uipath.eval.evaluators.exact_match_evaluator import ( - ExactMatchEvaluator, - ExactMatchEvaluatorConfig, -) -from uipath.eval.evaluators.json_similarity_evaluator import ( - JsonSimilarityEvaluator, - JsonSimilarityEvaluatorConfig, 
-) -from uipath.eval.evaluators.llm_judge_output_evaluator import ( - LLMJudgeOutputEvaluator, - LLMJudgeOutputEvaluatorConfig, - LLMJudgeStrictJSONSimilarityOutputEvaluator, - LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig, -) -from uipath.eval.evaluators.llm_judge_trajectory_evaluator import ( - LLMJudgeTrajectoryEvaluator, - LLMJudgeTrajectoryEvaluatorConfig, - LLMJudgeTrajectorySimulationEvaluator, - LLMJudgeTrajectorySimulationEvaluatorConfig, -) -from uipath.eval.evaluators.tool_call_args_evaluator import ( - ToolCallArgsEvaluator, - ToolCallArgsEvaluatorConfig, -) -from uipath.eval.evaluators.tool_call_count_evaluator import ( - ToolCallCountEvaluator, - ToolCallCountEvaluatorConfig, -) -from uipath.eval.evaluators.tool_call_order_evaluator import ( - ToolCallOrderEvaluator, - ToolCallOrderEvaluatorConfig, -) -from uipath.eval.evaluators.tool_call_output_evaluator import ( - ToolCallOutputEvaluator, - ToolCallOutputEvaluatorConfig, -) -from uipath.eval.models import LegacyEvaluatorType +from uipath.eval.evaluators.base_evaluator import GenericBaseEvaluator logger = logging.getLogger(__name__) -EVALUATOR_SCHEMA_TO_EVALUATOR_CLASS = { - ContainsEvaluatorConfig: ContainsEvaluator, - ExactMatchEvaluatorConfig: ExactMatchEvaluator, - JsonSimilarityEvaluatorConfig: JsonSimilarityEvaluator, - LLMJudgeOutputEvaluatorConfig: LLMJudgeOutputEvaluator, - LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig: LLMJudgeStrictJSONSimilarityOutputEvaluator, - LLMJudgeTrajectoryEvaluatorConfig: LLMJudgeTrajectoryEvaluator, - LLMJudgeTrajectorySimulationEvaluatorConfig: LLMJudgeTrajectorySimulationEvaluator, - ToolCallArgsEvaluatorConfig: ToolCallArgsEvaluator, - ToolCallCountEvaluatorConfig: ToolCallCountEvaluator, - ToolCallOrderEvaluatorConfig: ToolCallOrderEvaluator, - ToolCallOutputEvaluatorConfig: ToolCallOutputEvaluator, -} - class EvaluatorFactory: """Factory class for creating evaluator instances based on configuration.""" @@ -130,7 +65,7 @@ def create_evaluator( data: dict[str, Any], evaluators_dir: Path | None = None, agent_model: str | None = None, - ) -> BaseEvaluator[Any, Any, Any]: + ) -> GenericBaseEvaluator[Any, Any, Any]: if data.get("version", None) == "1.0": return cls._create_evaluator_internal(data, evaluators_dir) else: @@ -147,31 +82,20 @@ def _create_evaluator_internal( evaluator_schema ) if success: - return EvaluatorFactory._create_coded_evaluator_internal( + return EvaluatorFactory._create_custom_coded_evaluator_internal( data, file_path, class_name, evaluators_dir ) - - config: BaseEvaluatorConfig[Any] = TypeAdapter(EvaluatorConfig).validate_python( - data - ) - evaluator_class = EVALUATOR_SCHEMA_TO_EVALUATOR_CLASS.get(type(config)) - if not evaluator_class: - raise ValueError(f"Unknown evaluator configuration: {config}") - return TypeAdapter(evaluator_class).validate_python( - { - "id": data.get("id"), - "config": EvaluatorFactory._prepare_evaluator_config(data), - } - ) + else: + return TypeAdapter(CodedEvaluator).validate_python(data) @staticmethod - def _create_coded_evaluator_internal( + def _create_custom_coded_evaluator_internal( data: dict[str, Any], file_path_str: str, class_name: str, evaluators_dir: Path | None = None, ) -> BaseEvaluator[Any, Any, Any]: - """Create a coded evaluator by dynamically loading from a Python file. + """Create a custom coded evaluator by dynamically loading from a Python file. 
Args: data: Dictionary containing evaluator configuration with evaluatorTypeId @@ -242,18 +166,13 @@ def _create_coded_evaluator_internal( evaluator_id = data.get("id") if not evaluator_id or not isinstance(evaluator_id, str): raise ValueError("Evaluator 'id' must be a non-empty string") - return TypeAdapter(evaluator_class).validate_python( - { - "id": evaluator_id, - "config": EvaluatorFactory._prepare_evaluator_config(data), - } - ) + return TypeAdapter(evaluator_class).validate_python(data) @staticmethod def _create_legacy_evaluator_internal( data: dict[str, Any], agent_model: str | None = None, - ) -> LegacyBaseEvaluator[Any]: + ) -> BaseLegacyEvaluator[Any]: """Create an evaluator instance from configuration data. Args: @@ -267,97 +186,25 @@ def _create_legacy_evaluator_internal( Raises: ValueError: If category is unknown or required fields are missing """ - params: EvaluatorBaseParams = TypeAdapter(LegacyEvaluator).validate_python(data) - - match params: - case LegacyEqualsEvaluatorParams(): - return EvaluatorFactory._create_legacy_exact_match_evaluator(params) - case LegacyJsonSimilarityEvaluatorParams(): - return EvaluatorFactory._create_legacy_json_similarity_evaluator(params) - case LegacyLLMEvaluatorParams(): - return EvaluatorFactory._create_legacy_llm_as_judge_evaluator( - params, agent_model - ) - case LegacyTrajectoryEvaluatorParams(): - return EvaluatorFactory._create_legacy_trajectory_evaluator( - params, agent_model - ) - case _: - raise ValueError(f"Unknown evaluator category: {params}") - - @staticmethod - def _create_legacy_exact_match_evaluator( - params: LegacyEqualsEvaluatorParams, - ) -> LegacyExactMatchEvaluator: - """Create a deterministic evaluator.""" - return LegacyExactMatchEvaluator(**params.model_dump(), config={}) - - @staticmethod - def _create_legacy_json_similarity_evaluator( - params: LegacyJsonSimilarityEvaluatorParams, - ) -> LegacyJsonSimilarityEvaluator: - """Create a deterministic evaluator.""" - return LegacyJsonSimilarityEvaluator(**params.model_dump(), config={}) - - @staticmethod - def _create_legacy_llm_as_judge_evaluator( - params: LegacyLLMEvaluatorParams, - agent_model: str | None = None, - ) -> LegacyBaseEvaluator[Any]: - """Create an LLM-as-a-judge evaluator or context precision evaluator based on type.""" - if not params.model: - raise ValueError("LLM evaluator must include 'model' field") - - # Resolve 'same-as-agent' to actual agent model - if params.model == "same-as-agent": - if not agent_model: - raise ValueError( - "'same-as-agent' model option requires agent settings. " - "Ensure agent.json contains valid model settings." 
- ) - logger.info( - f"Resolving 'same-as-agent' to agent model: {agent_model} " - f"for evaluator '{params.name}'" - ) - params = params.model_copy(update={"model": agent_model}) - - # Check evaluator type to determine which evaluator to create - if params.evaluator_type == LegacyEvaluatorType.ContextPrecision: - return LegacyContextPrecisionEvaluator(**params.model_dump(), config={}) - elif params.evaluator_type == LegacyEvaluatorType.Faithfulness: - return LegacyFaithfulnessEvaluator(**params.model_dump(), config={}) - else: - if not params.prompt: - raise ValueError("LLM evaluator must include 'prompt' field") - - return LegacyLlmAsAJudgeEvaluator(**params.model_dump(), config={}) - - @staticmethod - def _create_legacy_trajectory_evaluator( - params: LegacyTrajectoryEvaluatorParams, - agent_model: str | None = None, - ) -> LegacyTrajectoryEvaluator: - """Create a trajectory evaluator.""" - if not params.prompt: - raise ValueError("Trajectory evaluator must include 'prompt' field") - - if not params.model: - raise ValueError("Trajectory evaluator must include 'model' field") - - # Resolve 'same-as-agent' to actual agent model - if params.model == "same-as-agent": - if not agent_model: - raise ValueError( - "'same-as-agent' model option requires agent settings. " - "Ensure agent.json contains valid model settings." + evaluator: LegacyEvaluator = TypeAdapter(LegacyEvaluator).validate_python(data) + + if isinstance( + evaluator, + LegacyTrajectoryEvaluator + | LegacyLlmAsAJudgeEvaluator + | LegacyContextPrecisionEvaluator + | LegacyFaithfulnessEvaluator, + ): + if evaluator.model == "same-as-agent": + if not agent_model: + raise ValueError( + "'same-as-agent' model option requires agent settings. " + "Ensure agent.json contains valid model settings." 
+ ) + logger.info( + f"Resolving 'same-as-agent' to agent model: {agent_model} " + f"for evaluator '{evaluator.name}'" ) - logger.info( - f"Resolving 'same-as-agent' to agent model: {agent_model} " - f"for evaluator '{params.name}'" - ) - params = params.model_copy(update={"model": agent_model}) + evaluator.model = agent_model - logger.info( - f"Creating trajectory evaluator '{params.name}' with model: {params.model}" - ) - return LegacyTrajectoryEvaluator(**params.model_dump(), config={}) + return evaluator diff --git a/src/uipath/_cli/_evals/_models/_evaluator.py b/src/uipath/_cli/_evals/_models/_evaluator.py index 4aeaf973b..7e11b4459 100644 --- a/src/uipath/_cli/_evals/_models/_evaluator.py +++ b/src/uipath/_cli/_evals/_models/_evaluator.py @@ -1,33 +1,26 @@ -from typing import Annotated, Any, Literal, Union - -from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag - -from uipath.eval.evaluators.base_evaluator import BaseEvaluatorConfig -from uipath.eval.evaluators.contains_evaluator import ContainsEvaluatorConfig -from uipath.eval.evaluators.exact_match_evaluator import ExactMatchEvaluatorConfig -from uipath.eval.evaluators.json_similarity_evaluator import ( - JsonSimilarityEvaluatorConfig, -) -from uipath.eval.evaluators.llm_judge_output_evaluator import ( - LLMJudgeOutputEvaluatorConfig, - LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig, -) -from uipath.eval.evaluators.llm_judge_trajectory_evaluator import ( - LLMJudgeTrajectoryEvaluatorConfig, - LLMJudgeTrajectorySimulationEvaluatorConfig, -) -from uipath.eval.evaluators.tool_call_args_evaluator import ( - ToolCallArgsEvaluatorConfig, -) -from uipath.eval.evaluators.tool_call_count_evaluator import ( - ToolCallCountEvaluatorConfig, -) -from uipath.eval.evaluators.tool_call_order_evaluator import ( - ToolCallOrderEvaluatorConfig, -) -from uipath.eval.evaluators.tool_call_output_evaluator import ( - ToolCallOutputEvaluatorConfig, +from typing import Annotated, Any, Union + +from pydantic import ConfigDict, Discriminator, Field, Tag + +from uipath.eval.evaluators import ( + BaseLegacyEvaluator, + ContainsEvaluator, + ExactMatchEvaluator, + JsonSimilarityEvaluator, + LegacyExactMatchEvaluator, + LegacyJsonSimilarityEvaluator, + LegacyLlmAsAJudgeEvaluator, + LegacyTrajectoryEvaluator, + LLMJudgeOutputEvaluator, + LLMJudgeStrictJSONSimilarityOutputEvaluator, + LLMJudgeTrajectoryEvaluator, + LLMJudgeTrajectorySimulationEvaluator, + ToolCallArgsEvaluator, + ToolCallCountEvaluator, + ToolCallOrderEvaluator, + ToolCallOutputEvaluator, ) +from uipath.eval.evaluators.base_evaluator import BaseEvaluator, BaseEvaluatorConfig from uipath.eval.models import ( EvaluatorType, LegacyEvaluatorCategory, @@ -35,57 +28,18 @@ ) -class LegacyEvaluatorBaseParams(BaseModel): - """Parameters for initializing the base evaluator.""" - - id: str - name: str - description: str - evaluator_type: LegacyEvaluatorType = Field(..., alias="type") - created_at: str = Field(..., alias="createdAt") - updated_at: str = Field(..., alias="updatedAt") - target_output_key: str = Field(..., alias="targetOutputKey") - file_name: str = Field(..., alias="fileName") - +class UnknownLegacyEvaluator(BaseLegacyEvaluator[Any]): + pass -class LegacyLLMEvaluatorParams(LegacyEvaluatorBaseParams): - category: Literal[LegacyEvaluatorCategory.LlmAsAJudge] = Field( - ..., alias="category" - ) - prompt: str = Field(..., alias="prompt") - model: str = Field(..., alias="model") +class UnknownEvaluatorConfig(BaseEvaluatorConfig[Any]): model_config = ConfigDict( 
validate_by_name=True, validate_by_alias=True, extra="allow" ) -class LegacyTrajectoryEvaluatorParams(LegacyEvaluatorBaseParams): - category: Literal[LegacyEvaluatorCategory.Trajectory] = Field(..., alias="category") - prompt: str = Field(..., alias="prompt") - model: str = Field(..., alias="model") - - model_config = ConfigDict( - validate_by_name=True, validate_by_alias=True, extra="allow" - ) - - -class LegacyEqualsEvaluatorParams(LegacyEvaluatorBaseParams): - model_config = ConfigDict( - validate_by_name=True, validate_by_alias=True, extra="allow" - ) - - -class LegacyJsonSimilarityEvaluatorParams(LegacyEvaluatorBaseParams): - model_config = ConfigDict( - validate_by_name=True, validate_by_alias=True, extra="allow" - ) - - -class LegacyUnknownEvaluatorParams(LegacyEvaluatorBaseParams): - model_config = ConfigDict( - validate_by_name=True, validate_by_alias=True, extra="allow" - ) +class UnknownCodedEvaluator(BaseEvaluator[Any, Any, Any]): + pass def legacy_evaluator_discriminator(data: Any) -> str: @@ -94,138 +48,154 @@ def legacy_evaluator_discriminator(data: Any) -> str: evaluator_type = data.get("type") match category: case LegacyEvaluatorCategory.LlmAsAJudge: - return "LegacyLLMEvaluatorParams" + return "LegacyLLMEvaluator" case LegacyEvaluatorCategory.Trajectory: - return "LegacyTrajectoryEvaluatorParams" + return "LegacyTrajectoryEvaluator" case LegacyEvaluatorCategory.Deterministic: match evaluator_type: case LegacyEvaluatorType.Equals: - return "LegacyEqualsEvaluatorParams" + return "LegacyEqualsEvaluator" case LegacyEvaluatorType.JsonSimilarity: - return "LegacyJsonSimilarityEvaluatorParams" + return "LegacyJsonSimilarityEvaluator" case _: - return "LegacyUnknownEvaluatorParams" + return "LegacyUnknownEvaluator" case _: - return "LegacyUnknownEvaluatorParams" + return "LegacyUnknownEvaluator" else: - return "LegacyUnknownLegacyEvaluatorParams" + return "LegacyUnknownLegacyEvaluator" LegacyEvaluator = Annotated[ Union[ Annotated[ - LegacyLLMEvaluatorParams, - Tag("LegacyLLMEvaluatorParams"), + LegacyLlmAsAJudgeEvaluator, + Tag("LegacyLLMEvaluator"), ], Annotated[ - LegacyTrajectoryEvaluatorParams, - Tag("LegacyTrajectoryEvaluatorParams"), + LegacyTrajectoryEvaluator, + Tag("LegacyTrajectoryEvaluator"), ], Annotated[ - LegacyEqualsEvaluatorParams, - Tag("LegacyEqualsEvaluatorParams"), + LegacyExactMatchEvaluator, + Tag("LegacyEqualsEvaluator"), ], Annotated[ - LegacyJsonSimilarityEvaluatorParams, - Tag("LegacyJsonSimilarityEvaluatorParams"), + LegacyJsonSimilarityEvaluator, + Tag("LegacyJsonSimilarityEvaluator"), ], Annotated[ - LegacyUnknownEvaluatorParams, - Tag("LegacyUnknownEvaluatorParams"), + UnknownLegacyEvaluator, + Tag("LegacyUnknownEvaluator"), ], ], Field(discriminator=Discriminator(legacy_evaluator_discriminator)), ] -class UnknownEvaluatorConfig(BaseEvaluatorConfig[Any]): - model_config = ConfigDict( - validate_by_name=True, validate_by_alias=True, extra="allow" - ) - - -def evaluator_config_discriminator(data: Any) -> str: +def coded_evaluator_discriminator(data: Any) -> str: if isinstance(data, dict): evaluator_type_id = data.get("evaluatorTypeId") match evaluator_type_id: case EvaluatorType.CONTAINS: - return "ContainsEvaluatorConfig" + return "ContainsEvaluator" case EvaluatorType.EXACT_MATCH: - return "ExactMatchEvaluatorConfig" + return "ExactMatchEvaluator" case EvaluatorType.JSON_SIMILARITY: - return "JsonSimilarityEvaluatorConfig" + return "JsonSimilarityEvaluator" case EvaluatorType.LLM_JUDGE_OUTPUT_SEMANTIC_SIMILARITY: - return 
"LLMJudgeOutputEvaluatorConfig" + return "LLMJudgeOutputEvaluator" case EvaluatorType.LLM_JUDGE_OUTPUT_STRICT_JSON_SIMILARITY: - return "LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig" + return "LLMJudgeStrictJSONSimilarityOutputEvaluator" case EvaluatorType.LLM_JUDGE_TRAJECTORY_SIMILARITY: - return "LLMJudgeTrajectoryEvaluatorConfig" + return "LLMJudgeTrajectoryEvaluator" case EvaluatorType.LLM_JUDGE_TRAJECTORY_SIMULATION: - return "LLMJudgeTrajectorySimulationEvaluatorConfig" + return "LLMJudgeTrajectorySimulationEvaluator" case EvaluatorType.TOOL_CALL_ARGS: - return "ToolCallArgsEvaluatorConfig" + return "ToolCallArgsEvaluator" case EvaluatorType.TOOL_CALL_COUNT: - return "ToolCallCountEvaluatorConfig" + return "ToolCallCountEvaluator" case EvaluatorType.TOOL_CALL_ORDER: - return "ToolCallOrderEvaluatorConfig" + return "ToolCallOrderEvaluator" case EvaluatorType.TOOL_CALL_OUTPUT: - return "ToolCallOutputEvaluatorConfig" + return "ToolCallOutputEvaluator" case _: - return "UnknownEvaluatorConfig" + return "UnknownEvaluator" else: - return "UnknownEvaluatorConfig" + return "UnknownEvaluator" -EvaluatorConfig = Annotated[ +CodedEvaluator = Annotated[ Union[ Annotated[ - ContainsEvaluatorConfig, - Tag("ContainsEvaluatorConfig"), + ContainsEvaluator, + Tag("ContainsEvaluator"), ], Annotated[ - ExactMatchEvaluatorConfig, - Tag("ExactMatchEvaluatorConfig"), + ExactMatchEvaluator, + Tag("ExactMatchEvaluator"), ], Annotated[ - JsonSimilarityEvaluatorConfig, - Tag("JsonSimilarityEvaluatorConfig"), + JsonSimilarityEvaluator, + Tag("JsonSimilarityEvaluator"), ], Annotated[ - LLMJudgeOutputEvaluatorConfig, - Tag("LLMJudgeOutputEvaluatorConfig"), + LLMJudgeOutputEvaluator, + Tag("LLMJudgeOutputEvaluator"), ], Annotated[ - LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig, - Tag("LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig"), + LLMJudgeStrictJSONSimilarityOutputEvaluator, + Tag("LLMJudgeStrictJSONSimilarityOutputEvaluator"), ], Annotated[ - LLMJudgeTrajectoryEvaluatorConfig, - Tag("LLMJudgeTrajectoryEvaluatorConfig"), + LLMJudgeTrajectoryEvaluator, + Tag("LLMJudgeTrajectoryEvaluator"), ], Annotated[ - ToolCallArgsEvaluatorConfig, - Tag("ToolCallArgsEvaluatorConfig"), + ToolCallArgsEvaluator, + Tag("ToolCallArgsEvaluator"), ], Annotated[ - ToolCallCountEvaluatorConfig, - Tag("ToolCallCountEvaluatorConfig"), + ToolCallCountEvaluator, + Tag("ToolCallCountEvaluator"), ], Annotated[ - ToolCallOrderEvaluatorConfig, - Tag("ToolCallOrderEvaluatorConfig"), + ToolCallOrderEvaluator, + Tag("ToolCallOrderEvaluator"), ], Annotated[ - ToolCallOutputEvaluatorConfig, - Tag("ToolCallOutputEvaluatorConfig"), + ToolCallOutputEvaluator, + Tag("ToolCallOutputEvaluator"), ], Annotated[ - LLMJudgeTrajectorySimulationEvaluatorConfig, - Tag("LLMJudgeTrajectorySimulationEvaluatorConfig"), + LLMJudgeTrajectorySimulationEvaluator, + Tag("LLMJudgeTrajectorySimulationEvaluator"), + ], + Annotated[ + UnknownCodedEvaluator, + Tag("UnknownEvaluator"), + ], + ], + Field(discriminator=Discriminator(coded_evaluator_discriminator)), +] + + +def evaluator_discriminator(data: Any) -> str: + if "version" in data: + return "CodedEvaluator" + else: + return "LegacyEvaluator" + + +Evaluator = Annotated[ + Union[ + Annotated[ + LegacyEvaluator, + Tag("LegacyEvaluator"), ], Annotated[ - UnknownEvaluatorConfig, - Tag("UnknownEvaluatorConfig"), + CodedEvaluator, + Tag("CodedEvaluator"), ], ], - Field(discriminator=Discriminator(evaluator_config_discriminator)), + Field(discriminator=Discriminator(evaluator_discriminator)), ] diff --git 
a/src/uipath/_cli/_evals/_models/_evaluator_base_params.py b/src/uipath/_cli/_evals/_models/_evaluator_base_params.py deleted file mode 100644 index b4e578b9b..000000000 --- a/src/uipath/_cli/_evals/_models/_evaluator_base_params.py +++ /dev/null @@ -1,16 +0,0 @@ -from pydantic import BaseModel - -from uipath.eval.models.models import LegacyEvaluatorCategory, LegacyEvaluatorType - - -class EvaluatorBaseParams(BaseModel): - """Parameters for initializing the base evaluator.""" - - id: str - category: LegacyEvaluatorCategory - evaluator_type: LegacyEvaluatorType - name: str - description: str - created_at: str - updated_at: str - target_output_key: str diff --git a/src/uipath/_cli/_evals/_progress_reporter.py b/src/uipath/_cli/_evals/_progress_reporter.py index ff7d6a355..25d8c9204 100644 --- a/src/uipath/_cli/_evals/_progress_reporter.py +++ b/src/uipath/_cli/_evals/_progress_reporter.py @@ -37,8 +37,9 @@ ) from uipath.eval.evaluators import ( BaseEvaluator, - LegacyBaseEvaluator, + BaseLegacyEvaluator, ) +from uipath.eval.evaluators.base_evaluator import GenericBaseEvaluator from uipath.eval.models import EvalItemResult, ScoreType from uipath.platform import UiPath from uipath.platform.common import UiPathConfig @@ -312,7 +313,7 @@ def _get_endpoint_prefix(self) -> str: return "agentsruntime_/api/" def _is_coded_evaluator( - self, evaluators: list[BaseEvaluator[Any, Any, Any]] + self, evaluators: list[GenericBaseEvaluator[Any, Any, Any]] ) -> bool: """Check if evaluators are coded (BaseEvaluator) vs legacy (LegacyBaseEvaluator). @@ -325,7 +326,7 @@ def _is_coded_evaluator( if not evaluators: return False # Check the first evaluator type - return not isinstance(evaluators[0], LegacyBaseEvaluator) + return not isinstance(evaluators[0], BaseLegacyEvaluator) def _extract_usage_from_spans( self, spans: list[Any] @@ -395,7 +396,7 @@ async def create_eval_set_run_sw( eval_set_id: str, agent_snapshot: StudioWebAgentSnapshot, no_of_evals: int, - evaluators: list[LegacyBaseEvaluator[Any]], + evaluators: list[BaseLegacyEvaluator[Any]], is_coded: bool = False, ) -> str: """Create a new evaluation set run in StudioWeb.""" @@ -458,12 +459,12 @@ async def update_eval_run( ): """Update an evaluation run with results.""" coded_evaluators: dict[str, BaseEvaluator[Any, Any, Any]] = {} - legacy_evaluators: dict[str, LegacyBaseEvaluator[Any]] = {} + legacy_evaluators: dict[str, BaseLegacyEvaluator[Any]] = {} evaluator_runs: list[dict[str, Any]] = [] evaluator_scores: list[dict[str, Any]] = [] for k, v in evaluators.items(): - if isinstance(v, LegacyBaseEvaluator): + if isinstance(v, BaseLegacyEvaluator): legacy_evaluators[k] = v elif isinstance(v, BaseEvaluator): coded_evaluators[k] = v @@ -922,7 +923,7 @@ def _extract_agent_snapshot(self, entrypoint: str | None) -> StudioWebAgentSnaps def _collect_results( self, eval_results: list[EvalItemResult], - evaluators: dict[str, LegacyBaseEvaluator[Any]], + evaluators: dict[str, BaseLegacyEvaluator[Any]], spans: list[Any], ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]: assertion_runs: list[dict[str, Any]] = [] @@ -972,9 +973,7 @@ def _collect_results( "promptTokens": usage_metrics["promptTokens"] or 0, }, "assertionSnapshot": { - "assertionType": evaluators[ - eval_result.evaluator_id - ].evaluator_type.name, + "assertionType": evaluators[eval_result.evaluator_id].type.name, "outputKey": evaluators[ eval_result.evaluator_id ].target_output_key, diff --git a/src/uipath/_cli/_evals/_runtime.py b/src/uipath/_cli/_evals/_runtime.py index 3120a77d4..48c64139b 
100644 --- a/src/uipath/_cli/_evals/_runtime.py +++ b/src/uipath/_cli/_evals/_runtime.py @@ -66,7 +66,7 @@ EvalSetRunUpdatedEvent, EvaluationEvents, ) -from ...eval.evaluators import BaseEvaluator +from ...eval.evaluators.base_evaluator import GenericBaseEvaluator from ...eval.models import EvaluationResult from ...eval.models.models import AgentExecution, EvalItemResult from .._utils._parallelization import execute_parallel @@ -193,7 +193,7 @@ class UiPathEvalContext: # Required Fields runtime_schema: UiPathRuntimeSchema evaluation_set: EvaluationSet - evaluators: list[BaseEvaluator[Any, Any, Any]] + evaluators: list[GenericBaseEvaluator[Any, Any, Any]] execution_id: str # Optional Fields @@ -276,7 +276,7 @@ async def initiate_evaluation( self, ) -> Tuple[ EvaluationSet, - list[BaseEvaluator[Any, Any, Any]], + list[GenericBaseEvaluator[Any, Any, Any]], Iterable[Awaitable[EvaluationRunResult]], ]: # Validate that resume mode is not used with multiple evaluations @@ -487,7 +487,7 @@ async def execute(self) -> UiPathRuntimeResult: async def _execute_eval( self, eval_item: EvaluationItem, - evaluators: list[BaseEvaluator[Any, Any, Any]], + evaluators: list[GenericBaseEvaluator[Any, Any, Any]], ) -> EvaluationRunResult: execution_id = str(eval_item.id) @@ -664,11 +664,12 @@ async def _execute_eval( evaluator=evaluator, execution_output=agent_execution_output, eval_item=eval_item, + # If evaluation criteria is None, validate_and_evaluate defaults to the default evaluation_criteria=evaluator.evaluation_criteria_type( **evaluation_criteria ) if evaluation_criteria - else evaluator.evaluator_config.default_evaluation_criteria, + else None, ) dto_result = EvaluationResultDto.from_evaluation_result( @@ -906,7 +907,7 @@ def _setup_execution_logging( async def run_evaluator( self, - evaluator: BaseEvaluator[Any, Any, Any], + evaluator: GenericBaseEvaluator[Any, Any, Any], execution_output: UiPathEvalRunExecutionOutput, eval_item: EvaluationItem, *, diff --git a/src/uipath/_cli/_utils/_eval_set.py b/src/uipath/_cli/_utils/_eval_set.py index ccad1e89a..bbc9d5047 100644 --- a/src/uipath/_cli/_utils/_eval_set.py +++ b/src/uipath/_cli/_utils/_eval_set.py @@ -14,7 +14,7 @@ ) from uipath._cli._evals.mocks.types import InputMockingStrategy, LLMMockingStrategy from uipath._cli._utils._console import ConsoleLogger -from uipath.eval.evaluators import BaseEvaluator +from uipath.eval.evaluators.base_evaluator import GenericBaseEvaluator console = ConsoleLogger() @@ -183,7 +183,7 @@ async def load_evaluators( eval_set_path: str, evaluation_set: EvaluationSet, agent_model: str | None = None, - ) -> list[BaseEvaluator[Any, Any, Any]]: + ) -> list[GenericBaseEvaluator[Any, Any, Any]]: """Load evaluators referenced by the evaluation set.""" evaluators = [] if evaluation_set is None: diff --git a/src/uipath/_events/_events.py b/src/uipath/_events/_events.py index 3ac0c4967..df3a21921 100644 --- a/src/uipath/_events/_events.py +++ b/src/uipath/_events/_events.py @@ -6,7 +6,7 @@ from pydantic import BaseModel, ConfigDict, SkipValidation, model_validator from uipath._cli._evals._models._evaluation_set import EvaluationItem -from uipath.eval.evaluators import BaseEvaluator +from uipath.eval.evaluators.base_evaluator import GenericBaseEvaluator from uipath.eval.models import EvalItemResult @@ -24,7 +24,7 @@ class EvalSetRunCreatedEvent(BaseModel): eval_set_run_id: str | None = None no_of_evals: int # skip validation to avoid abstract class instantiation - evaluators: SkipValidation[list[BaseEvaluator[Any, Any, Any]]] + 
evaluators: SkipValidation[list[GenericBaseEvaluator[Any, Any, Any]]] class EvalRunCreatedEvent(BaseModel): diff --git a/src/uipath/eval/evaluators/__init__.py b/src/uipath/eval/evaluators/__init__.py index 248b5d571..529ff1dd2 100644 --- a/src/uipath/eval/evaluators/__init__.py +++ b/src/uipath/eval/evaluators/__init__.py @@ -4,12 +4,12 @@ # Current coded evaluators from .base_evaluator import BaseEvaluationCriteria, BaseEvaluator, BaseEvaluatorConfig +from .base_legacy_evaluator import BaseLegacyEvaluator + +# Legacy evaluators from .contains_evaluator import ContainsEvaluator from .exact_match_evaluator import ExactMatchEvaluator from .json_similarity_evaluator import JsonSimilarityEvaluator - -# Legacy evaluators -from .legacy_base_evaluator import LegacyBaseEvaluator from .legacy_context_precision_evaluator import LegacyContextPrecisionEvaluator from .legacy_exact_match_evaluator import LegacyExactMatchEvaluator from .legacy_faithfulness_evaluator import LegacyFaithfulnessEvaluator @@ -44,16 +44,15 @@ ToolCallCountEvaluator, ToolCallOutputEvaluator, ] - __all__ = [ # Legacy evaluators - "LegacyBaseEvaluator", + "BaseLegacyEvaluator", "LegacyContextPrecisionEvaluator", "LegacyExactMatchEvaluator", "LegacyFaithfulnessEvaluator", - "LegacyJsonSimilarityEvaluator", "LegacyLlmAsAJudgeEvaluator", "LegacyTrajectoryEvaluator", + "LegacyJsonSimilarityEvaluator", # Current coded evaluators "BaseEvaluator", "ContainsEvaluator", diff --git a/src/uipath/eval/evaluators/base_evaluator.py b/src/uipath/eval/evaluators/base_evaluator.py index e1a3a28c4..217ff4fef 100644 --- a/src/uipath/eval/evaluators/base_evaluator.py +++ b/src/uipath/eval/evaluators/base_evaluator.py @@ -11,7 +11,10 @@ from .._helpers.helpers import track_evaluation_metrics from ..models import AgentExecution, EvaluationResult -from ..models.models import UiPathEvaluationError, UiPathEvaluationErrorCategory +from ..models.models import ( + UiPathEvaluationError, + UiPathEvaluationErrorCategory, +) class BaseEvaluationCriteria(BaseModel): @@ -51,7 +54,7 @@ class BaseEvaluatorJustification(BaseModel): J = TypeVar("J", bound=Union[str, None, BaseEvaluatorJustification]) -class BaseEvaluator(BaseModel, Generic[T, C, J], ABC): +class GenericBaseEvaluator(BaseModel, Generic[T, C, J], ABC): """Abstract base class for all evaluators. 
Generic Parameters: @@ -77,16 +80,17 @@ class BaseEvaluator(BaseModel, Generic[T, C, J], ABC): model_config = ConfigDict(arbitrary_types_allowed=True) id: str - config: dict[str, Any] = Field(description="The config dictionary") - config_type: type[C] = Field(description="The config type class") + name: str = Field(default="", description="The name of the evaluator") + description: str = Field(default="", description="The description of the evaluator") + + config_type: type[C] = Field(description="The config type class", exclude=True) evaluation_criteria_type: type[T] = Field( - description="The type used for evaluation criteria validation and creation" + description="The type used for evaluation criteria validation and creation", + exclude=True, ) justification_type: type[J] = Field( - description="The type used for justification validation and creation" - ) - evaluator_config: C = Field( - exclude=True, description="The validated config object instance" + description="The type used for justification validation and creation", + exclude=True, ) def __init_subclass__(cls, **kwargs: Any): @@ -100,26 +104,6 @@ def __init_subclass__(cls, **kwargs: Any): new_evaluation_method._has_metrics_decorator = True # type: ignore[attr-defined] # probably a better way to do this cls.evaluate = new_evaluation_method # type: ignore[method-assign] # probably a better way to do this - @property - def name(self) -> str: - """Evaluator's name.""" - return self.evaluator_config.name - - @name.setter - def name(self, value: str) -> None: - """Set the evaluator's name.""" - self.evaluator_config.name = value - - @property - def description(self) -> str: - """Evaluator's description.""" - return self.evaluator_config.description - - @description.setter - def description(self, value: str) -> None: - """Set the evaluator's description.""" - self.evaluator_config.description = value - @model_validator(mode="before") @classmethod def validate_model(cls, values: Any) -> Any: @@ -137,6 +121,10 @@ def validate_model(cls, values: Any) -> Any: ValueError: If types cannot be determined or are inconsistent """ if isinstance(values, dict): + if "description" in values and "evaluatorConfig" in values: + values["evaluatorConfig"]["description"] = values.pop("description") + if "name" in values and "evaluatorConfig" in values: + values["evaluatorConfig"]["name"] = values.pop("name") # Always extract and set evaluation_criteria_type criteria_type = cls._extract_evaluation_criteria_type() values["evaluation_criteria_type"] = criteria_type @@ -574,21 +562,12 @@ def generate_json_type(cls) -> dict[str, Any]: "justificationSchema": cls.get_justification_schema(), } + @abstractmethod async def validate_and_evaluate_criteria( self, agent_execution: AgentExecution, evaluation_criteria: Any ) -> EvaluationResult: """Evaluate the given data and return a result from a raw evaluation criteria.""" - if evaluation_criteria is None: - evaluation_criteria = self.evaluator_config.default_evaluation_criteria - if evaluation_criteria is None: - raise UiPathEvaluationError( - code="NO_EVALUATION_CRITERIA_PROVIDED", - title="No evaluation criteria provided and no default evaluation criteria configured", - detail="No evaluation criteria provided and no default evaluation criteria configured", - category=UiPathEvaluationErrorCategory.SYSTEM, - ) - criteria = self.validate_evaluation_criteria(evaluation_criteria) - return await self.evaluate(agent_execution, criteria) + pass @abstractmethod async def evaluate( @@ -608,3 +587,45 @@ async def evaluate( 
EvaluationResult containing the score and details """ pass + + +class BaseEvaluator(GenericBaseEvaluator[T, C, J]): + """Abstract base class for all coded evaluators. Not naming this BaseCodedEvaluator for backwards compatibility.""" + + version: str = Field(default="1.0", description="Version of the evaluator") + evaluator_type_id: str = Field( + default="", alias="evaluatorTypeId", description="Type of the evaluator" + ) + evaluator_config: C = Field( + alias="evaluatorConfig", description="The validated config object instance" + ) + + name: str = Field(default="", description="The name of the evaluator", exclude=True) + description: str = Field( + default="", description="The description of the evaluator", exclude=True + ) + + def model_post_init(self, __context: Any) -> None: + """Post initialization of the evaluator.""" + if not self.evaluator_type_id: + self.evaluator_type_id = type(self).get_evaluator_id() + if not self.name: + self.name = self.evaluator_config.name + if not self.description: + self.description = self.evaluator_config.description + + async def validate_and_evaluate_criteria( + self, agent_execution: AgentExecution, evaluation_criteria: Any + ) -> EvaluationResult: + """Evaluate the given data and return a result from a raw evaluation criteria.""" + if evaluation_criteria is None: + evaluation_criteria = self.evaluator_config.default_evaluation_criteria + if evaluation_criteria is None: + raise UiPathEvaluationError( + code="NO_EVALUATION_CRITERIA_PROVIDED", + title="No evaluation criteria provided and no default evaluation criteria configured", + detail="No evaluation criteria provided and no default evaluation criteria configured", + category=UiPathEvaluationErrorCategory.SYSTEM, + ) + criteria = self.validate_evaluation_criteria(evaluation_criteria) + return await self.evaluate(agent_execution, criteria) diff --git a/src/uipath/eval/evaluators/legacy_base_evaluator.py b/src/uipath/eval/evaluators/base_legacy_evaluator.py similarity index 76% rename from src/uipath/eval/evaluators/legacy_base_evaluator.py rename to src/uipath/eval/evaluators/base_legacy_evaluator.py index 36a68c3be..706fdd0fd 100644 --- a/src/uipath/eval/evaluators/legacy_base_evaluator.py +++ b/src/uipath/eval/evaluators/base_legacy_evaluator.py @@ -16,7 +16,11 @@ LegacyEvaluatorType, ) -from .base_evaluator import BaseEvaluationCriteria, BaseEvaluator, BaseEvaluatorConfig +from .base_evaluator import ( + BaseEvaluationCriteria, + BaseEvaluatorConfig, + GenericBaseEvaluator, +) def track_evaluation_metrics(func: Callable[..., Any]) -> Callable[..., Any]: @@ -59,8 +63,8 @@ class LegacyEvaluationCriteria(BaseEvaluationCriteria): T = TypeVar("T", bound=LegacyEvaluatorConfig) -class LegacyBaseEvaluator( - BaseEvaluator[LegacyEvaluationCriteria, T, str], Generic[T], ABC +class BaseLegacyEvaluator( + GenericBaseEvaluator[LegacyEvaluationCriteria, T, str], Generic[T], ABC ): """Abstract base class for all legacy evaluators. @@ -70,12 +74,15 @@ class LegacyBaseEvaluator( model_config = ConfigDict(arbitrary_types_allowed=True) - # Legacy-specific fields (in addition to inherited fields from BaseEvaluator) - target_output_key: str = "*" - created_at: str - updated_at: str - category: LegacyEvaluatorCategory - evaluator_type: LegacyEvaluatorType + # Required Fields + category: LegacyEvaluatorCategory = Field(...) + type: LegacyEvaluatorType = Field(...) 
+ + # Optional Fields + file_name: str = Field(default="", alias="fileName") + target_output_key: str = Field(default="*", alias="targetOutputKey") + created_at: str = Field(..., alias="createdAt") + updated_at: str = Field(..., alias="updatedAt") # Note: __init_subclass__ is inherited from BaseEvaluator and handles metrics tracking @@ -93,6 +100,15 @@ def get_evaluator_id(cls) -> str: """ return "legacy-evaluator" + async def validate_and_evaluate_criteria( + self, + agent_execution: AgentExecution, + evaluation_criteria: LegacyEvaluationCriteria, + ) -> EvaluationResult: + """Evaluate the given data and return a result from a raw evaluation criteria.""" + criteria = self.validate_evaluation_criteria(evaluation_criteria) + return await self.evaluate(agent_execution, criteria) + @abstractmethod async def evaluate( self, diff --git a/src/uipath/eval/evaluators/legacy_context_precision_evaluator.py b/src/uipath/eval/evaluators/legacy_context_precision_evaluator.py index 090d42bfc..81a4f8cb3 100644 --- a/src/uipath/eval/evaluators/legacy_context_precision_evaluator.py +++ b/src/uipath/eval/evaluators/legacy_context_precision_evaluator.py @@ -8,8 +8,8 @@ from ...platform.chat import UiPathLlmChatService from ..models.models import AgentExecution, EvaluationResult -from .legacy_base_evaluator import ( - LegacyBaseEvaluator, +from .base_legacy_evaluator import ( + BaseLegacyEvaluator, LegacyEvaluationCriteria, LegacyEvaluatorConfig, track_evaluation_metrics, @@ -22,7 +22,9 @@ class LegacyContextPrecisionEvaluatorConfig(LegacyEvaluatorConfig): name: str = "LegacyContextPrecisionEvaluator" model: str = "" - prompt: str = """You are an expert evaluator assessing the relevance of context chunks to a given query. + + +PROMPT = """You are an expert evaluator assessing the relevance of context chunks to a given query. TASK: Evaluate how relevant each provided context chunk is to answering the query. Your scoring should be deterministic - the same chunk-query pair should always receive the same score. @@ -86,7 +88,7 @@ class LegacyContextPrecisionEvaluatorConfig(LegacyEvaluatorConfig): class LegacyContextPrecisionEvaluator( - LegacyBaseEvaluator[LegacyContextPrecisionEvaluatorConfig] + BaseLegacyEvaluator[LegacyContextPrecisionEvaluatorConfig] ): """Legacy evaluator that assesses context precision using an LLM. 
@@ -282,9 +284,9 @@ async def _evaluate_context_grounding( """ # Create evaluation prompt chunks_text = "\n".join(chunks) - prompt = self.evaluator_config.prompt.replace( - self.query_placeholder, query - ).replace(self.chunks_placeholder, chunks_text) + prompt = PROMPT.replace(self.query_placeholder, query).replace( + self.chunks_placeholder, chunks_text + ) # Get LLM response response_obj = await self._get_structured_llm_response(prompt) diff --git a/src/uipath/eval/evaluators/legacy_deterministic_evaluator_base.py b/src/uipath/eval/evaluators/legacy_deterministic_evaluator_base.py index 798029148..faa5cac17 100644 --- a/src/uipath/eval/evaluators/legacy_deterministic_evaluator_base.py +++ b/src/uipath/eval/evaluators/legacy_deterministic_evaluator_base.py @@ -4,12 +4,12 @@ from abc import ABC from typing import Any, Generic, TypeVar -from .legacy_base_evaluator import LegacyBaseEvaluator, LegacyEvaluatorConfig +from .base_legacy_evaluator import BaseLegacyEvaluator, LegacyEvaluatorConfig T = TypeVar("T", bound=LegacyEvaluatorConfig) -class DeterministicEvaluatorBase(LegacyBaseEvaluator[T], Generic[T], ABC): +class BaseLegacyDeterministicEvaluator(BaseLegacyEvaluator[T], Generic[T], ABC): """Base class for evaluators that produce deterministic, reproducible results. This class provides utility methods for canonical JSON comparison and number normalization diff --git a/src/uipath/eval/evaluators/legacy_exact_match_evaluator.py b/src/uipath/eval/evaluators/legacy_exact_match_evaluator.py index b54c30fdf..1ba80455a 100644 --- a/src/uipath/eval/evaluators/legacy_exact_match_evaluator.py +++ b/src/uipath/eval/evaluators/legacy_exact_match_evaluator.py @@ -3,8 +3,8 @@ from uipath.eval.models import BooleanEvaluationResult, EvaluationResult from ..models.models import AgentExecution -from .legacy_base_evaluator import LegacyEvaluationCriteria, LegacyEvaluatorConfig -from .legacy_deterministic_evaluator_base import DeterministicEvaluatorBase +from .base_legacy_evaluator import LegacyEvaluationCriteria, LegacyEvaluatorConfig +from .legacy_deterministic_evaluator_base import BaseLegacyDeterministicEvaluator class LegacyExactMatchEvaluatorConfig(LegacyEvaluatorConfig): @@ -14,7 +14,7 @@ class LegacyExactMatchEvaluatorConfig(LegacyEvaluatorConfig): class LegacyExactMatchEvaluator( - DeterministicEvaluatorBase[LegacyExactMatchEvaluatorConfig] + BaseLegacyDeterministicEvaluator[LegacyExactMatchEvaluatorConfig] ): """Evaluator that performs exact structural matching between expected and actual outputs. diff --git a/src/uipath/eval/evaluators/legacy_faithfulness_evaluator.py b/src/uipath/eval/evaluators/legacy_faithfulness_evaluator.py index b3b547ade..cfa2ab90a 100644 --- a/src/uipath/eval/evaluators/legacy_faithfulness_evaluator.py +++ b/src/uipath/eval/evaluators/legacy_faithfulness_evaluator.py @@ -7,8 +7,8 @@ from uipath.platform.chat import UiPathLlmChatService from ..models.models import AgentExecution, EvaluationResult -from .legacy_base_evaluator import ( - LegacyBaseEvaluator, +from .base_legacy_evaluator import ( + BaseLegacyEvaluator, LegacyEvaluationCriteria, LegacyEvaluatorConfig, track_evaluation_metrics, @@ -27,7 +27,7 @@ class LegacyFaithfulnessEvaluatorConfig(LegacyEvaluatorConfig): class LegacyFaithfulnessEvaluator( - LegacyBaseEvaluator[LegacyFaithfulnessEvaluatorConfig] + BaseLegacyEvaluator[LegacyFaithfulnessEvaluatorConfig] ): """Legacy evaluator that assesses faithfulness using an LLM. 
diff --git a/src/uipath/eval/evaluators/legacy_json_similarity_evaluator.py b/src/uipath/eval/evaluators/legacy_json_similarity_evaluator.py index 09b467482..23c7dc396 100644 --- a/src/uipath/eval/evaluators/legacy_json_similarity_evaluator.py +++ b/src/uipath/eval/evaluators/legacy_json_similarity_evaluator.py @@ -6,8 +6,8 @@ from uipath.eval.models import EvaluationResult, NumericEvaluationResult from ..models.models import AgentExecution -from .legacy_base_evaluator import LegacyEvaluationCriteria, LegacyEvaluatorConfig -from .legacy_deterministic_evaluator_base import DeterministicEvaluatorBase +from .base_legacy_evaluator import LegacyEvaluationCriteria, LegacyEvaluatorConfig +from .legacy_deterministic_evaluator_base import BaseLegacyDeterministicEvaluator T = TypeVar("T") @@ -19,7 +19,7 @@ class LegacyJsonSimilarityEvaluatorConfig(LegacyEvaluatorConfig): class LegacyJsonSimilarityEvaluator( - DeterministicEvaluatorBase[LegacyJsonSimilarityEvaluatorConfig] + BaseLegacyDeterministicEvaluator[LegacyJsonSimilarityEvaluatorConfig] ): """Legacy deterministic evaluator that scores structural JSON similarity between expected and actual output. diff --git a/src/uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py b/src/uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py index 8ec2a1146..3dde8c6e1 100644 --- a/src/uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py +++ b/src/uipath/eval/evaluators/legacy_llm_as_judge_evaluator.py @@ -10,8 +10,8 @@ from ...platform.chat import UiPathLlmChatService from ...platform.chat.llm_gateway import RequiredToolChoice from ..models.models import AgentExecution, EvaluationResult, LLMResponse -from .legacy_base_evaluator import ( - LegacyBaseEvaluator, +from .base_legacy_evaluator import ( + BaseLegacyEvaluator, LegacyEvaluationCriteria, LegacyEvaluatorConfig, ) @@ -24,7 +24,7 @@ class LegacyLlmAsAJudgeEvaluatorConfig(LegacyEvaluatorConfig): name: str = "LegacyLlmAsAJudgeEvaluator" -class LegacyLlmAsAJudgeEvaluator(LegacyBaseEvaluator[LegacyLlmAsAJudgeEvaluatorConfig]): +class LegacyLlmAsAJudgeEvaluator(BaseLegacyEvaluator[LegacyLlmAsAJudgeEvaluatorConfig]): """Legacy evaluator that uses an LLM to judge the quality of agent output.""" prompt: str diff --git a/src/uipath/eval/evaluators/legacy_trajectory_evaluator.py b/src/uipath/eval/evaluators/legacy_trajectory_evaluator.py index 27e8e73be..bdd37b79a 100644 --- a/src/uipath/eval/evaluators/legacy_trajectory_evaluator.py +++ b/src/uipath/eval/evaluators/legacy_trajectory_evaluator.py @@ -16,8 +16,8 @@ NumericEvaluationResult, TrajectoryEvaluationTrace, ) -from .legacy_base_evaluator import ( - LegacyBaseEvaluator, +from .base_legacy_evaluator import ( + BaseLegacyEvaluator, LegacyEvaluationCriteria, LegacyEvaluatorConfig, ) @@ -30,7 +30,7 @@ class LegacyTrajectoryEvaluatorConfig(LegacyEvaluatorConfig): name: str = "LegacyTrajectoryEvaluator" -class LegacyTrajectoryEvaluator(LegacyBaseEvaluator[LegacyTrajectoryEvaluatorConfig]): +class LegacyTrajectoryEvaluator(BaseLegacyEvaluator[LegacyTrajectoryEvaluatorConfig]): """Legacy evaluator that analyzes the trajectory/path taken to reach outputs.""" prompt: str diff --git a/src/uipath/eval/evaluators/llm_judge_output_evaluator.py b/src/uipath/eval/evaluators/llm_judge_output_evaluator.py index 1e8c6919c..400f93205 100644 --- a/src/uipath/eval/evaluators/llm_judge_output_evaluator.py +++ b/src/uipath/eval/evaluators/llm_judge_output_evaluator.py @@ -51,6 +51,7 @@ class LLMJudgeStrictJSONSimilarityOutputEvaluatorConfig(LLMJudgeOutputEvaluatorC OC = 
TypeVar("OC", bound=LLMJudgeOutputEvaluatorConfig) +# NOTE: This evaluator is only used in coded evaluators class BaseLLMOutputEvaluator( OutputEvaluator[OutputEvaluationCriteria, OC, str], LLMJudgeMixin[OutputEvaluationCriteria, OC], diff --git a/src/uipath/eval/evaluators/output_evaluator.py b/src/uipath/eval/evaluators/output_evaluator.py index f88743db8..1a4036cda 100644 --- a/src/uipath/eval/evaluators/output_evaluator.py +++ b/src/uipath/eval/evaluators/output_evaluator.py @@ -41,6 +41,7 @@ class OutputEvaluatorConfig(BaseEvaluatorConfig[T]): J = TypeVar("J", bound=Union[str, None, BaseEvaluatorJustification]) +# NOTE: This evaluator is only used in coded evaluators class BaseOutputEvaluator(BaseEvaluator[T, C, J]): """Abstract base class for all output evaluators. @@ -105,6 +106,7 @@ def _get_expected_output(self, evaluation_criteria: T) -> Any: return expected_output +# NOTE: This evaluator is only used in coded evaluators. class OutputEvaluator(BaseOutputEvaluator[T_OutputCriteria, C, J]): """Abstract base class for all output evaluators. diff --git a/src/uipath/platform/context_grounding/context_grounding_payloads.py b/src/uipath/platform/context_grounding/context_grounding_payloads.py index 9060a8dcb..420665245 100644 --- a/src/uipath/platform/context_grounding/context_grounding_payloads.py +++ b/src/uipath/platform/context_grounding/context_grounding_payloads.py @@ -15,7 +15,7 @@ ) -class DataSourceBase(BaseModel): +class BaseDataSource(BaseModel): """Base model for data source configurations.""" folder: str = Field(alias="folder", description="Folder path") @@ -25,7 +25,7 @@ class DataSourceBase(BaseModel): directory_path: str = Field(alias="directoryPath", description="Directory path") -class BucketDataSource(DataSourceBase): +class BucketDataSource(BaseDataSource): """Data source configuration for storage buckets.""" odata_type: str = Field( @@ -35,7 +35,7 @@ class BucketDataSource(DataSourceBase): bucket_name: str = Field(alias="bucketName", description="Storage bucket name") -class GoogleDriveDataSource(DataSourceBase): +class GoogleDriveDataSource(BaseDataSource): """Data source configuration for Google Drive.""" odata_type: str = Field( @@ -47,7 +47,7 @@ class GoogleDriveDataSource(DataSourceBase): leaf_folder_id: str = Field(alias="leafFolderId", description="Leaf folder ID") -class DropboxDataSource(DataSourceBase): +class DropboxDataSource(BaseDataSource): """Data source configuration for Dropbox.""" odata_type: str = Field( @@ -58,7 +58,7 @@ class DropboxDataSource(DataSourceBase): connection_name: str = Field(alias="connectionName", description="Connection name") -class OneDriveDataSource(DataSourceBase): +class OneDriveDataSource(BaseDataSource): """Data source configuration for OneDrive.""" odata_type: str = Field( @@ -70,7 +70,7 @@ class OneDriveDataSource(DataSourceBase): leaf_folder_id: str = Field(alias="leafFolderId", description="Leaf folder ID") -class ConfluenceDataSource(DataSourceBase): +class ConfluenceDataSource(BaseDataSource): """Data source configuration for Confluence.""" odata_type: str = Field( diff --git a/tests/cli/evaluators/test_json_similarity_evaluator.py b/tests/cli/evaluators/test_json_similarity_evaluator.py index d3ed11829..0b74ae07c 100644 --- a/tests/cli/evaluators/test_json_similarity_evaluator.py +++ b/tests/cli/evaluators/test_json_similarity_evaluator.py @@ -4,12 +4,12 @@ """ import json +from typing import Any import pytest -from uipath._cli._evals._models._evaluator_base_params import EvaluatorBaseParams from 
uipath.eval.evaluators import LegacyJsonSimilarityEvaluator -from uipath.eval.evaluators.legacy_base_evaluator import LegacyEvaluationCriteria +from uipath.eval.evaluators.base_legacy_evaluator import LegacyEvaluationCriteria from uipath.eval.models.models import ( AgentExecution, LegacyEvaluatorCategory, @@ -17,26 +17,24 @@ ) -def _make_base_params() -> EvaluatorBaseParams: - return EvaluatorBaseParams( - id="json-sim", - category=LegacyEvaluatorCategory.Deterministic, - evaluator_type=LegacyEvaluatorType.JsonSimilarity, - name="JSON Similarity", - description="Compares JSON structures", - created_at="2025-01-01T00:00:00Z", - updated_at="2025-01-01T00:00:00Z", - target_output_key="*", - ) +def _make_base_params() -> dict[str, Any]: + """Create base parameters for faithfulness evaluator.""" + return { + "id": "json-sim", + "category": LegacyEvaluatorCategory.Deterministic, + "type": LegacyEvaluatorType.JsonSimilarity, + "name": "JSON Similarity", + "description": "Compares JSON structures", + "createdAt": "2025-01-01T00:00:00Z", + "updatedAt": "2025-01-01T00:00:00Z", + "targetOutputKey": "*", + } class TestJsonSimilarityEvaluator: @pytest.mark.asyncio async def test_json_similarity_exact_score_1(self) -> None: - evaluator = LegacyJsonSimilarityEvaluator( - **_make_base_params().model_dump(), - config={}, - ) + evaluator = LegacyJsonSimilarityEvaluator(**_make_base_params()) expected_json = """ { "user": { @@ -82,10 +80,7 @@ async def test_json_similarity_exact_score_1(self) -> None: @pytest.mark.asyncio async def test_json_similarity_exact_score_2(self) -> None: - evaluator = LegacyJsonSimilarityEvaluator( - **_make_base_params().model_dump(), - config={}, - ) + evaluator = LegacyJsonSimilarityEvaluator(**_make_base_params()) expected_json = """ { "users": [ @@ -122,10 +117,7 @@ async def test_json_similarity_exact_score_2(self) -> None: @pytest.mark.asyncio async def test_json_similarity_exact_score_3(self) -> None: - evaluator = LegacyJsonSimilarityEvaluator( - **_make_base_params().model_dump(), - config={}, - ) + evaluator = LegacyJsonSimilarityEvaluator(**_make_base_params()) expected_json = """ { "name": "Alice", @@ -159,10 +151,7 @@ async def test_json_similarity_exact_score_3(self) -> None: @pytest.mark.asyncio async def test_json_similarity_exact_score_4(self) -> None: - evaluator = LegacyJsonSimilarityEvaluator( - **_make_base_params().model_dump(), - config={}, - ) + evaluator = LegacyJsonSimilarityEvaluator(**_make_base_params()) expected_json = """ { "user": { diff --git a/tests/cli/evaluators/test_legacy_context_precision_evaluator.py b/tests/cli/evaluators/test_legacy_context_precision_evaluator.py index b3bf7f6eb..d6d425bee 100644 --- a/tests/cli/evaluators/test_legacy_context_precision_evaluator.py +++ b/tests/cli/evaluators/test_legacy_context_precision_evaluator.py @@ -5,13 +5,13 @@ import json from types import MappingProxyType +from typing import Any from unittest.mock import AsyncMock, patch import pytest -from uipath._cli._evals._models._evaluator_base_params import EvaluatorBaseParams from uipath.eval.evaluators import LegacyContextPrecisionEvaluator -from uipath.eval.evaluators.legacy_base_evaluator import LegacyEvaluationCriteria +from uipath.eval.evaluators.base_legacy_evaluator import LegacyEvaluationCriteria from uipath.eval.models.models import ( AgentExecution, LegacyEvaluatorCategory, @@ -19,18 +19,18 @@ ) -def _make_base_params() -> EvaluatorBaseParams: +def _make_base_params() -> dict[str, Any]: """Create base parameters for context precision evaluator.""" 
- return EvaluatorBaseParams( - id="context-precision", - category=LegacyEvaluatorCategory.LlmAsAJudge, - evaluator_type=LegacyEvaluatorType.ContextPrecision, - name="Context Precision", - description="Evaluates context chunk relevance", - created_at="2025-01-01T00:00:00Z", - updated_at="2025-01-01T00:00:00Z", - target_output_key="*", - ) + return { + "id": "context-precision", + "category": LegacyEvaluatorCategory.LlmAsAJudge, + "type": LegacyEvaluatorType.ContextPrecision, + "name": "Context Precision", + "description": "Evaluates context chunk relevance", + "createdAt": "2025-01-01T00:00:00Z", + "updatedAt": "2025-01-01T00:00:00Z", + "targetOutputKey": "*", + } @pytest.fixture(autouse=True) @@ -46,8 +46,7 @@ def mock_uipath_platform(): def evaluator_with_mocked_llm(): """Fixture to create evaluator with mocked LLM service.""" evaluator = LegacyContextPrecisionEvaluator( - **_make_base_params().model_dump(), - config={}, + **_make_base_params(), model="gpt-4.1-2025-04-14", ) return evaluator diff --git a/tests/cli/evaluators/test_legacy_exact_match_evaluator.py b/tests/cli/evaluators/test_legacy_exact_match_evaluator.py index 5419f4f0e..199058c43 100644 --- a/tests/cli/evaluators/test_legacy_exact_match_evaluator.py +++ b/tests/cli/evaluators/test_legacy_exact_match_evaluator.py @@ -4,13 +4,13 @@ canonical JSON normalization, and number normalization. """ +from typing import Any from unittest.mock import patch import pytest -from uipath._cli._evals._models._evaluator_base_params import EvaluatorBaseParams from uipath.eval.evaluators import LegacyExactMatchEvaluator -from uipath.eval.evaluators.legacy_base_evaluator import LegacyEvaluationCriteria +from uipath.eval.evaluators.base_legacy_evaluator import LegacyEvaluationCriteria from uipath.eval.models.models import ( AgentExecution, LegacyEvaluatorCategory, @@ -18,28 +18,25 @@ ) -def _make_base_params(target_output_key: str = "*") -> EvaluatorBaseParams: - """Create base parameters for exact match evaluator.""" - return EvaluatorBaseParams( - id="exact_match", - category=LegacyEvaluatorCategory.Deterministic, - evaluator_type=LegacyEvaluatorType.Equals, - name="ExactMatch", - description="Evaluates exact match of outputs", - created_at="2025-01-01T00:00:00Z", - updated_at="2025-01-01T00:00:00Z", - target_output_key=target_output_key, - ) +def _make_base_params(target_output_key: str = "*") -> dict[str, Any]: + """Create base parameters for faithfulness evaluator.""" + return { + "id": "exact_match", + "category": LegacyEvaluatorCategory.Deterministic, + "type": LegacyEvaluatorType.Equals, + "name": "ExactMatch", + "description": "Evaluates exact match of outputs", + "createdAt": "2025-01-01T00:00:00Z", + "updatedAt": "2025-01-01T00:00:00Z", + "targetOutputKey": target_output_key, + } @pytest.fixture def evaluator(): """Fixture to create evaluator.""" with patch("uipath.platform.UiPath"): - return LegacyExactMatchEvaluator( - **_make_base_params().model_dump(), - config={}, - ) + return LegacyExactMatchEvaluator(**_make_base_params()) @pytest.fixture @@ -47,8 +44,7 @@ def evaluator_with_target_key(): """Fixture to create evaluator with a specific target output key.""" with patch("uipath.platform.UiPath"): return LegacyExactMatchEvaluator( - **_make_base_params(target_output_key="result").model_dump(), - config={}, + **_make_base_params(target_output_key="result") ) diff --git a/tests/cli/evaluators/test_legacy_faithfulness_evaluator.py b/tests/cli/evaluators/test_legacy_faithfulness_evaluator.py index c5822f257..c95fd6c57 100644 --- 
a/tests/cli/evaluators/test_legacy_faithfulness_evaluator.py +++ b/tests/cli/evaluators/test_legacy_faithfulness_evaluator.py @@ -10,9 +10,8 @@ import pytest -from uipath._cli._evals._models._evaluator_base_params import EvaluatorBaseParams from uipath.eval.evaluators import LegacyFaithfulnessEvaluator -from uipath.eval.evaluators.legacy_base_evaluator import LegacyEvaluationCriteria +from uipath.eval.evaluators.base_legacy_evaluator import LegacyEvaluationCriteria from uipath.eval.models.models import ( AgentExecution, LegacyEvaluatorCategory, @@ -20,18 +19,18 @@ ) -def _make_base_params() -> EvaluatorBaseParams: +def _make_base_params() -> dict[str, Any]: """Create base parameters for faithfulness evaluator.""" - return EvaluatorBaseParams( - id="faithfulness", - category=LegacyEvaluatorCategory.LlmAsAJudge, - evaluator_type=LegacyEvaluatorType.Faithfulness, - name="Faithfulness", - description="Evaluates faithfulness of claims against context", - created_at="2025-01-01T00:00:00Z", - updated_at="2025-01-01T00:00:00Z", - target_output_key="*", - ) + return { + "id": "faithfulness", + "category": LegacyEvaluatorCategory.LlmAsAJudge, + "type": LegacyEvaluatorType.Faithfulness, + "name": "Faithfulness", + "description": "Evaluates faithfulness of claims against context", + "createdAt": "2025-01-01T00:00:00Z", + "updatedAt": "2025-01-01T00:00:00Z", + "targetOutputKey": "*", + } @pytest.fixture(autouse=True) @@ -47,8 +46,7 @@ def mock_uipath_platform(): def evaluator_with_mocked_llm(): """Fixture to create evaluator with mocked LLM service.""" evaluator = LegacyFaithfulnessEvaluator( - **_make_base_params().model_dump(), - config={}, + **_make_base_params(), model="gpt-4.1-2025-04-14", ) return evaluator diff --git a/tests/cli/evaluators/test_legacy_llm_as_judge_placeholder_validation.py b/tests/cli/evaluators/test_legacy_llm_as_judge_placeholder_validation.py index 694e8d2a9..34f4c7cdb 100644 --- a/tests/cli/evaluators/test_legacy_llm_as_judge_placeholder_validation.py +++ b/tests/cli/evaluators/test_legacy_llm_as_judge_placeholder_validation.py @@ -4,9 +4,9 @@ automatically added to prompts with XML tags for clear delimitation. 
""" +from typing import Any from unittest.mock import patch -from uipath._cli._evals._models._evaluator_base_params import EvaluatorBaseParams from uipath.eval.evaluators.legacy_llm_as_judge_evaluator import ( LegacyLlmAsAJudgeEvaluator, ) @@ -16,18 +16,18 @@ ) -def _make_base_params() -> EvaluatorBaseParams: - """Create base parameters for LLM judge evaluator.""" - return EvaluatorBaseParams( - id="test", - category=LegacyEvaluatorCategory.LlmAsAJudge, - evaluator_type=LegacyEvaluatorType.Custom, - name="TestEvaluator", - description="Test", - created_at="2025-01-01T00:00:00Z", - updated_at="2025-01-01T00:00:00Z", - target_output_key="*", - ) +def _make_base_params() -> dict[str, Any]: + """Create base parameters for faithfulness evaluator.""" + return { + "id": "test", + "category": LegacyEvaluatorCategory.LlmAsAJudge, + "type": LegacyEvaluatorType.Custom, + "name": "TestEvaluator", + "description": "Test", + "createdAt": "2025-01-01T00:00:00Z", + "updatedAt": "2025-01-01T00:00:00Z", + "targetOutputKey": "*", + } class TestLegacyLlmAsAJudgePlaceholderValidation: @@ -43,8 +43,7 @@ def test_both_placeholders_present_no_modification(self): with patch("uipath.platform.UiPath"): evaluator = LegacyLlmAsAJudgeEvaluator( - **_make_base_params().model_dump(), - config={}, + **_make_base_params(), prompt=original_prompt, model="gpt-4", ) @@ -62,8 +61,7 @@ def test_missing_expected_output_placeholder_added_with_tags(self): with patch("uipath.platform.UiPath"): evaluator = LegacyLlmAsAJudgeEvaluator( - **_make_base_params().model_dump(), - config={}, + **_make_base_params(), prompt=original_prompt, model="gpt-4", ) @@ -92,8 +90,7 @@ def test_missing_actual_output_placeholder_added_with_tags(self): with patch("uipath.platform.UiPath"): evaluator = LegacyLlmAsAJudgeEvaluator( - **_make_base_params().model_dump(), - config={}, + **_make_base_params(), prompt=original_prompt, model="gpt-4", ) @@ -119,8 +116,7 @@ def test_both_placeholders_missing_both_added_with_tags(self): with patch("uipath.platform.UiPath"): evaluator = LegacyLlmAsAJudgeEvaluator( - **_make_base_params().model_dump(), - config={}, + **_make_base_params(), prompt=original_prompt, model="gpt-4", ) @@ -158,8 +154,7 @@ def test_placeholder_order_actual_then_expected(self): with patch("uipath.platform.UiPath"): evaluator = LegacyLlmAsAJudgeEvaluator( - **_make_base_params().model_dump(), - config={}, + **_make_base_params(), prompt=original_prompt, model="gpt-4", ) @@ -177,8 +172,7 @@ def test_xml_tags_properly_nested(self): with patch("uipath.platform.UiPath"): evaluator = LegacyLlmAsAJudgeEvaluator( - **_make_base_params().model_dump(), - config={}, + **_make_base_params(), prompt=original_prompt, model="gpt-4", ) @@ -202,8 +196,7 @@ def test_custom_placeholder_delimiters_not_affected(self): with patch("uipath.platform.UiPath"): evaluator = LegacyLlmAsAJudgeEvaluator( - **_make_base_params().model_dump(), - config={}, + **_make_base_params(), prompt=original_prompt, model="gpt-4", ) @@ -221,8 +214,7 @@ def test_sections_appended_not_prepended(self): with patch("uipath.platform.UiPath"): evaluator = LegacyLlmAsAJudgeEvaluator( - **_make_base_params().model_dump(), - config={}, + **_make_base_params(), prompt=original_prompt, model="gpt-4", ) @@ -249,8 +241,7 @@ def test_multiline_prompt_with_missing_placeholders(self): with patch("uipath.platform.UiPath"): evaluator = LegacyLlmAsAJudgeEvaluator( - **_make_base_params().model_dump(), - config={}, + **_make_base_params(), prompt=original_prompt, model="gpt-4", ) diff --git 
a/tests/evaluators/test_documentation_examples.py b/tests/evaluators/test_documentation_examples.py index f9f767c99..a9e840df2 100644 --- a/tests/evaluators/test_documentation_examples.py +++ b/tests/evaluators/test_documentation_examples.py @@ -8,6 +8,7 @@ from typing import Any import pytest +from pydantic import TypeAdapter from pytest_mock.plugin import MockerFixture from uipath.eval.evaluators import ( @@ -39,13 +40,15 @@ async def test_getting_started_example(self) -> None: ) # Create evaluator - evaluator = ExactMatchEvaluator( # type: ignore[call-arg] - id="exact-match-1", - config={ - "name": "ExactMatchEvaluator", - "case_sensitive": False, - "target_output_key": "result", - }, + evaluator = TypeAdapter(ExactMatchEvaluator).validate_python( + dict( + id="exact-match-1", + evaluatorConfig={ + "name": "ExactMatchEvaluator", + "case_sensitive": False, + "target_output_key": "result", + }, + ) ) # Evaluate @@ -67,13 +70,15 @@ async def test_basic_usage(self) -> None: from uipath.eval.models import AgentExecution # Create evaluator - extracts "response" field for comparison - evaluator = ContainsEvaluator( # type: ignore[call-arg] - id="contains-check", - config={ - "name": "ContainsEvaluator", - "case_sensitive": False, - "target_output_key": "response", # Extract the "response" field - }, + evaluator = TypeAdapter(ContainsEvaluator).validate_python( + dict( + id="contains-check", + evaluatorConfig={ + "name": "ContainsEvaluator", + "case_sensitive": False, + "target_output_key": "response", # Extract the "response" field + }, + ) ) # agent_output must be a dict @@ -94,13 +99,15 @@ async def test_basic_usage(self) -> None: @pytest.mark.asyncio async def test_case_sensitive_search(self) -> None: """Test case-sensitive search example.""" - evaluator = ContainsEvaluator( # type: ignore[call-arg] - id="contains-case-sensitive", - config={ - "name": "ContainsEvaluator", - "case_sensitive": True, - "target_output_key": "message", # Extract the "message" field - }, + evaluator = TypeAdapter(ContainsEvaluator).validate_python( + dict( + id="contains-case-sensitive", + evaluatorConfig={ + "name": "ContainsEvaluator", + "case_sensitive": True, + "target_output_key": "message", # Extract the "message" field + }, + ) ) agent_execution = AgentExecution( @@ -120,13 +127,15 @@ async def test_case_sensitive_search(self) -> None: @pytest.mark.asyncio async def test_negated_search(self) -> None: """Test negated search example.""" - evaluator = ContainsEvaluator( # type: ignore[call-arg] - id="contains-negated", - config={ - "name": "ContainsEvaluator", - "negated": True, - "target_output_key": "status", # Extract the "status" field - }, + evaluator = TypeAdapter(ContainsEvaluator).validate_python( + dict( + id="contains-negated", + evaluatorConfig={ + "name": "ContainsEvaluator", + "negated": True, + "target_output_key": "status", # Extract the "status" field + }, + ) ) agent_execution = AgentExecution( @@ -146,9 +155,14 @@ async def test_negated_search(self) -> None: @pytest.mark.asyncio async def test_target_specific_output_field(self) -> None: """Test targeting specific output field example.""" - evaluator = ContainsEvaluator( # type: ignore[call-arg] - id="contains-targeted", - config={"name": "ContainsEvaluator", "target_output_key": "message"}, + evaluator = TypeAdapter(ContainsEvaluator).validate_python( + dict( + id="contains-targeted", + evaluatorConfig={ + "name": "ContainsEvaluator", + "target_output_key": "message", + }, + ) ) agent_execution = AgentExecution( @@ -186,13 +200,15 @@ async 
def test_basic_usage(self) -> None: ) # Create evaluator - extracts "result" field for comparison - evaluator = ExactMatchEvaluator( # type: ignore[call-arg] - id="exact-match-1", - config={ - "name": "ExactMatchEvaluator", - "case_sensitive": False, - "target_output_key": "result", # Extract the "result" field - }, + evaluator = TypeAdapter(ExactMatchEvaluator).validate_python( + dict( + id="exact-match-1", + evaluatorConfig={ + "name": "ExactMatchEvaluator", + "case_sensitive": False, + "target_output_key": "result", # Extract the "result" field + }, + ) ) # Evaluate - compares just the "result" field value @@ -212,13 +228,15 @@ async def test_case_sensitive_matching(self) -> None: agent_trace=[], ) - evaluator = ExactMatchEvaluator( # type: ignore[call-arg] - id="exact-match-case", - config={ - "name": "ExactMatchEvaluator", - "case_sensitive": True, - "target_output_key": "status", # Extract the "status" field - }, + evaluator = TypeAdapter(ExactMatchEvaluator).validate_python( + dict( + id="exact-match-case", + evaluatorConfig={ + "name": "ExactMatchEvaluator", + "case_sensitive": True, + "target_output_key": "status", # Extract the "status" field + }, + ) ) # Fails due to case mismatch @@ -246,12 +264,14 @@ async def test_matching_structured_outputs(self) -> None: agent_trace=[], ) - evaluator = ExactMatchEvaluator( # type: ignore[call-arg] - id="exact-match-dict", - config={ - "name": "ExactMatchEvaluator", - "target_output_key": "*", # Compare entire output (default) - }, + evaluator = TypeAdapter(ExactMatchEvaluator).validate_python( + dict( + id="exact-match-dict", + evaluatorConfig={ + "name": "ExactMatchEvaluator", + "target_output_key": "*", # Compare entire output (default) + }, + ) ) # Entire dict structure must match @@ -271,13 +291,15 @@ async def test_negated_mode(self) -> None: agent_trace=[], ) - evaluator = ExactMatchEvaluator( # type: ignore[call-arg] - id="exact-match-negated", - config={ - "name": "ExactMatchEvaluator", - "negated": True, - "target_output_key": "result", # Extract the "result" field - }, + evaluator = TypeAdapter(ExactMatchEvaluator).validate_python( + dict( + id="exact-match-negated", + evaluatorConfig={ + "name": "ExactMatchEvaluator", + "negated": True, + "target_output_key": "result", # Extract the "result" field + }, + ) ) # Passes because outputs do NOT match @@ -297,13 +319,17 @@ async def test_using_default_criteria(self) -> None: agent_trace=[], ) - evaluator = ExactMatchEvaluator( # type: ignore[call-arg] - id="exact-match-default", - config={ - "name": "ExactMatchEvaluator", - "target_output_key": "status", # Extract the "status" field - "default_evaluation_criteria": {"expected_output": {"status": "OK"}}, - }, + evaluator = TypeAdapter(ExactMatchEvaluator).validate_python( + dict( + id="exact-match-default", + evaluatorConfig={ + "name": "ExactMatchEvaluator", + "target_output_key": "status", # Extract the "status" field + "default_evaluation_criteria": { + "expected_output": {"status": "OK"} + }, + }, + ) ) # Use default criteria @@ -329,12 +355,14 @@ async def test_basic_json_comparison(self) -> None: agent_trace=[], ) - evaluator = JsonSimilarityEvaluator( # type: ignore[call-arg] - id="json-sim-1", - config={ - "name": "JsonSimilarityEvaluator" - # target_output_key defaults to "*" - compares entire output dict - }, + evaluator = TypeAdapter(JsonSimilarityEvaluator).validate_python( + dict( + id="json-sim-1", + evaluatorConfig={ + "name": "JsonSimilarityEvaluator" + # target_output_key defaults to "*" - compares entire output dict + 
}, + ) ) result = await evaluator.validate_and_evaluate_criteria( @@ -361,9 +389,11 @@ async def test_numeric_tolerance(self) -> None: agent_trace=[], ) - evaluator = JsonSimilarityEvaluator( # type: ignore[call-arg] - id="json-sim-numeric", - config={"name": "JsonSimilarityEvaluator"}, + evaluator = TypeAdapter(JsonSimilarityEvaluator).validate_python( + dict( + id="json-sim-numeric", + evaluatorConfig={"name": "JsonSimilarityEvaluator"}, + ) ) # Slightly different numbers @@ -386,9 +416,11 @@ async def test_string_similarity(self) -> None: agent_trace=[], ) - evaluator = JsonSimilarityEvaluator( # type: ignore[call-arg] - id="json-sim-string", - config={"name": "JsonSimilarityEvaluator"}, + evaluator = TypeAdapter(JsonSimilarityEvaluator).validate_python( + dict( + id="json-sim-string", + evaluatorConfig={"name": "JsonSimilarityEvaluator"}, + ) ) # Similar but not exact string @@ -415,9 +447,11 @@ async def test_nested_structures(self) -> None: agent_trace=[], ) - evaluator = JsonSimilarityEvaluator( # type: ignore[call-arg] - id="json-sim-nested", - config={"name": "JsonSimilarityEvaluator"}, + evaluator = TypeAdapter(JsonSimilarityEvaluator).validate_python( + dict( + id="json-sim-nested", + evaluatorConfig={"name": "JsonSimilarityEvaluator"}, + ) ) result = await evaluator.validate_and_evaluate_criteria( @@ -444,9 +478,11 @@ async def test_array_comparison(self) -> None: agent_trace=[], ) - evaluator = JsonSimilarityEvaluator( # type: ignore[call-arg] - id="json-sim-array", - config={"name": "JsonSimilarityEvaluator"}, + evaluator = TypeAdapter(JsonSimilarityEvaluator).validate_python( + dict( + id="json-sim-array", + evaluatorConfig={"name": "JsonSimilarityEvaluator"}, + ) ) # Partial match (2 out of 3 correct) @@ -474,9 +510,11 @@ async def test_handling_extra_keys(self) -> None: agent_trace=[], ) - evaluator = JsonSimilarityEvaluator( # type: ignore[call-arg] - id="json-sim-extra", - config={"name": "JsonSimilarityEvaluator"}, + evaluator = TypeAdapter(JsonSimilarityEvaluator).validate_python( + dict( + id="json-sim-extra", + evaluatorConfig={"name": "JsonSimilarityEvaluator"}, + ) ) # Only expected keys are evaluated @@ -499,12 +537,14 @@ async def test_target_specific_field(self) -> None: agent_trace=[], ) - evaluator = JsonSimilarityEvaluator( # type: ignore[call-arg] - id="json-sim-targeted", - config={ - "name": "JsonSimilarityEvaluator", - "target_output_key": "result", - }, + evaluator = TypeAdapter(JsonSimilarityEvaluator).validate_python( + dict( + id="json-sim-targeted", + evaluatorConfig={ + "name": "JsonSimilarityEvaluator", + "target_output_key": "result", + }, + ) ) # Only compares the "result" field @@ -552,15 +592,17 @@ async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any: agent_trace=[], ) - evaluator = LLMJudgeOutputEvaluator( # type: ignore[call-arg] - id="llm-judge-1", - config={ - "name": "LLMJudgeOutputEvaluator", - "model": "gpt-4", - "temperature": 0.0, - "target_output_key": "answer", # Extract the "answer" field - }, - llm_service=mock_chat_completions, + evaluator = TypeAdapter(LLMJudgeOutputEvaluator).validate_python( + dict( + id="llm-judge-1", + evaluatorConfig={ + "name": "LLMJudgeOutputEvaluator", + "model": "gpt-4", + "temperature": 0.0, + "target_output_key": "answer", # Extract the "answer" field + }, + llm_service=mock_chat_completions, + ) ) result = await evaluator.validate_and_evaluate_criteria( @@ -612,16 +654,18 @@ async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any: agent_trace=[], ) - evaluator = 
LLMJudgeOutputEvaluator( # type: ignore[call-arg] - id="llm-judge-custom", - config={ - "name": "LLMJudgeOutputEvaluator", - "model": "gpt-4", - "prompt": custom_prompt, - "temperature": 0.0, - "target_output_key": "message", # Extract the "message" field - }, - llm_service=mock_chat_completions, + evaluator = TypeAdapter(LLMJudgeOutputEvaluator).validate_python( + dict( + id="llm-judge-custom", + evaluatorConfig={ + "name": "LLMJudgeOutputEvaluator", + "model": "gpt-4", + "prompt": custom_prompt, + "temperature": 0.0, + "target_output_key": "message", # Extract the "message" field + }, + llm_service=mock_chat_completions, + ) ) result = await evaluator.validate_and_evaluate_criteria( @@ -672,15 +716,17 @@ async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any: agent_trace=[], ) - evaluator = LLMJudgeOutputEvaluator( # type: ignore[call-arg] - id="llm-judge-quality", - config={ - "name": "LLMJudgeOutputEvaluator", - "model": "gpt-4o", - "temperature": 0.0, - "target_output_key": "email", # Extract the "email" field - }, - llm_service=mock_chat_completions, + evaluator = TypeAdapter(LLMJudgeOutputEvaluator).validate_python( + dict( + id="llm-judge-quality", + evaluatorConfig={ + "name": "LLMJudgeOutputEvaluator", + "model": "gpt-4o", + "temperature": 0.0, + "target_output_key": "email", # Extract the "email" field + }, + llm_service=mock_chat_completions, + ) ) result = await evaluator.validate_and_evaluate_criteria( @@ -731,14 +777,18 @@ async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any: agent_trace=[], ) - evaluator = LLMJudgeStrictJSONSimilarityOutputEvaluator( # type: ignore[call-arg] - id="llm-json-strict", - config={ - "name": "LLMJudgeStrictJSONSimilarityOutputEvaluator", - "model": "gpt-4", - "temperature": 0.0, - }, - llm_service=mock_chat_completions, + evaluator = TypeAdapter( + LLMJudgeStrictJSONSimilarityOutputEvaluator + ).validate_python( + dict( + id="llm-json-strict", + evaluatorConfig={ + "name": "LLMJudgeStrictJSONSimilarityOutputEvaluator", + "model": "gpt-4", + "temperature": 0.0, + }, + llm_service=mock_chat_completions, + ) ) result = await evaluator.validate_and_evaluate_criteria( @@ -793,14 +843,16 @@ async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any: ], ) - evaluator = LLMJudgeTrajectoryEvaluator( # type: ignore[call-arg] - id="trajectory-judge-1", - config={ - "name": "LLMJudgeTrajectoryEvaluator", - "model": "gpt-4", - "temperature": 0.0, - }, - llm_service=mock_chat_completions, + evaluator = TypeAdapter(LLMJudgeTrajectoryEvaluator).validate_python( + dict( + id="trajectory-judge-1", + evaluatorConfig={ + "name": "LLMJudgeTrajectoryEvaluator", + "model": "gpt-4", + "temperature": 0.0, + }, + llm_service=mock_chat_completions, + ) ) result = await evaluator.validate_and_evaluate_criteria( @@ -848,14 +900,16 @@ async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any: ], ) - evaluator = LLMJudgeTrajectoryEvaluator( # type: ignore[call-arg] - id="trajectory-tools", - config={ - "name": "LLMJudgeTrajectoryEvaluator", - "model": "gpt-4o", - "temperature": 0.0, - }, - llm_service=mock_chat_completions, + evaluator = TypeAdapter(LLMJudgeTrajectoryEvaluator).validate_python( + dict( + id="trajectory-tools", + evaluatorConfig={ + "name": "LLMJudgeTrajectoryEvaluator", + "model": "gpt-4o", + "temperature": 0.0, + }, + llm_service=mock_chat_completions, + ) ) result = await evaluator.validate_and_evaluate_criteria( @@ -912,14 +966,16 @@ async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any: """, ) - evaluator 
= LLMJudgeTrajectorySimulationEvaluator( # type: ignore[call-arg] - id="sim-trajectory-1", - config={ - "name": "LLMJudgeTrajectorySimulationEvaluator", - "model": "gpt-4", - "temperature": 0.0, - }, - llm_service=mock_chat_completions, + evaluator = TypeAdapter(LLMJudgeTrajectorySimulationEvaluator).validate_python( + dict( + id="sim-trajectory-1", + evaluatorConfig={ + "name": "LLMJudgeTrajectorySimulationEvaluator", + "model": "gpt-4", + "temperature": 0.0, + }, + llm_service=mock_chat_completions, + ) ) result = await evaluator.validate_and_evaluate_criteria( @@ -984,9 +1040,11 @@ async def test_basic_tool_call_order(self) -> None: agent_trace=mock_spans, ) - evaluator = ToolCallOrderEvaluator( # type: ignore[call-arg] - id="order-check-1", - config={"name": "ToolCallOrderEvaluator", "strict": False}, + evaluator = TypeAdapter(ToolCallOrderEvaluator).validate_python( + dict( + id="order-check-1", + evaluatorConfig={"name": "ToolCallOrderEvaluator", "strict": False}, + ) ) result = await evaluator.validate_and_evaluate_criteria( @@ -1033,9 +1091,11 @@ async def test_strict_order_validation(self) -> None: agent_trace=mock_spans, ) - evaluator = ToolCallOrderEvaluator( # type: ignore[call-arg] - id="order-strict", - config={"name": "ToolCallOrderEvaluator", "strict": True}, + evaluator = TypeAdapter(ToolCallOrderEvaluator).validate_python( + dict( + id="order-strict", + evaluatorConfig={"name": "ToolCallOrderEvaluator", "strict": True}, + ) ) result = await evaluator.validate_and_evaluate_criteria( @@ -1087,9 +1147,11 @@ async def test_partial_credit_with_lcs(self) -> None: agent_trace=mock_spans, ) - evaluator = ToolCallOrderEvaluator( # type: ignore[call-arg] - id="order-lcs", - config={"name": "ToolCallOrderEvaluator", "strict": False}, + evaluator = TypeAdapter(ToolCallOrderEvaluator).validate_python( + dict( + id="order-lcs", + evaluatorConfig={"name": "ToolCallOrderEvaluator", "strict": False}, + ) ) # Expected sequence @@ -1145,9 +1207,11 @@ async def test_database_transaction_sequence(self) -> None: agent_trace=mock_spans, ) - evaluator = ToolCallOrderEvaluator( # type: ignore[call-arg] - id="db-transaction", - config={"name": "ToolCallOrderEvaluator", "strict": True}, + evaluator = TypeAdapter(ToolCallOrderEvaluator).validate_python( + dict( + id="db-transaction", + evaluatorConfig={"name": "ToolCallOrderEvaluator", "strict": True}, + ) ) result = await evaluator.validate_and_evaluate_criteria( @@ -1211,9 +1275,11 @@ async def test_api_integration_workflow(self) -> None: agent_trace=mock_spans, ) - evaluator = ToolCallOrderEvaluator( # type: ignore[call-arg] - id="api-workflow", - config={"name": "ToolCallOrderEvaluator", "strict": False}, + evaluator = TypeAdapter(ToolCallOrderEvaluator).validate_python( + dict( + id="api-workflow", + evaluatorConfig={"name": "ToolCallOrderEvaluator", "strict": False}, + ) ) result = await evaluator.validate_and_evaluate_criteria( @@ -1265,15 +1331,17 @@ async def test_using_default_criteria(self) -> None: agent_trace=mock_spans, ) - evaluator = ToolCallOrderEvaluator( # type: ignore[call-arg] - id="order-default", - config={ - "name": "ToolCallOrderEvaluator", - "strict": False, - "default_evaluation_criteria": { - "tool_calls_order": ["init", "process", "cleanup"] + evaluator = TypeAdapter(ToolCallOrderEvaluator).validate_python( + dict( + id="order-default", + evaluatorConfig={ + "name": "ToolCallOrderEvaluator", + "strict": False, + "default_evaluation_criteria": { + "tool_calls_order": ["init", "process", "cleanup"] + }, }, - }, + ) ) # Use 
default criteria @@ -1346,9 +1414,11 @@ async def test_basic_count_validation(self) -> None: agent_trace=mock_spans, ) - evaluator = ToolCallCountEvaluator( # type: ignore[call-arg] - id="count-check-1", - config={"name": "ToolCallCountEvaluator", "strict": False}, + evaluator = TypeAdapter(ToolCallCountEvaluator).validate_python( + dict( + id="count-check-1", + evaluatorConfig={"name": "ToolCallCountEvaluator", "strict": False}, + ) ) result = await evaluator.validate_and_evaluate_criteria( @@ -1404,9 +1474,11 @@ async def test_using_comparison_operators(self) -> None: agent_trace=mock_spans, ) - evaluator = ToolCallCountEvaluator( # type: ignore[call-arg] - id="count-operators", - config={"name": "ToolCallCountEvaluator", "strict": False}, + evaluator = TypeAdapter(ToolCallCountEvaluator).validate_python( + dict( + id="count-operators", + evaluatorConfig={"name": "ToolCallCountEvaluator", "strict": False}, + ) ) result = await evaluator.validate_and_evaluate_criteria( @@ -1457,9 +1529,11 @@ async def test_strict_mode_all_or_nothing(self) -> None: agent_trace=mock_spans, ) - evaluator = ToolCallCountEvaluator( # type: ignore[call-arg] - id="count-strict", - config={"name": "ToolCallCountEvaluator", "strict": True}, + evaluator = TypeAdapter(ToolCallCountEvaluator).validate_python( + dict( + id="count-strict", + evaluatorConfig={"name": "ToolCallCountEvaluator", "strict": True}, + ) ) result = await evaluator.validate_and_evaluate_criteria( @@ -1517,9 +1591,11 @@ async def test_preventing_redundant_calls(self) -> None: agent_trace=mock_spans, ) - evaluator = ToolCallCountEvaluator( # type: ignore[call-arg] - id="prevent-redundant", - config={"name": "ToolCallCountEvaluator", "strict": False}, + evaluator = TypeAdapter(ToolCallCountEvaluator).validate_python( + dict( + id="prevent-redundant", + evaluatorConfig={"name": "ToolCallCountEvaluator", "strict": False}, + ) ) # Ensure expensive operations aren't called too many times @@ -1578,9 +1654,11 @@ async def test_loop_validation(self) -> None: agent_trace=mock_spans, ) - evaluator = ToolCallCountEvaluator( # type: ignore[call-arg] - id="loop-validation", - config={"name": "ToolCallCountEvaluator", "strict": False}, + evaluator = TypeAdapter(ToolCallCountEvaluator).validate_python( + dict( + id="loop-validation", + evaluatorConfig={"name": "ToolCallCountEvaluator", "strict": False}, + ) ) # Verify loop processed correct number of items @@ -1627,13 +1705,15 @@ async def test_basic_argument_validation(self) -> None: agent_trace=mock_spans, ) - evaluator = ToolCallArgsEvaluator( # type: ignore[call-arg] - id="args-check-1", - config={ - "name": "ToolCallArgsEvaluator", - "strict": False, - "subset": False, - }, + evaluator = TypeAdapter(ToolCallArgsEvaluator).validate_python( + dict( + id="args-check-1", + evaluatorConfig={ + "name": "ToolCallArgsEvaluator", + "strict": False, + "subset": False, + }, + ) ) result = await evaluator.validate_and_evaluate_criteria( @@ -1679,9 +1759,15 @@ async def test_strict_mode_exact_matching(self) -> None: agent_trace=mock_spans, ) - evaluator = ToolCallArgsEvaluator( # type: ignore[call-arg] - id="args-strict", - config={"name": "ToolCallArgsEvaluator", "strict": True, "subset": False}, + evaluator = TypeAdapter(ToolCallArgsEvaluator).validate_python( + dict( + id="args-strict", + evaluatorConfig={ + "name": "ToolCallArgsEvaluator", + "strict": True, + "subset": False, + }, + ) ) result = await evaluator.validate_and_evaluate_criteria( @@ -1727,9 +1813,15 @@ async def test_subset_mode_partial_validation(self) 
-> None: agent_trace=mock_spans, ) - evaluator = ToolCallArgsEvaluator( # type: ignore[call-arg] - id="args-subset", - config={"name": "ToolCallArgsEvaluator", "strict": False, "subset": True}, + evaluator = TypeAdapter(ToolCallArgsEvaluator).validate_python( + dict( + id="args-subset", + evaluatorConfig={ + "name": "ToolCallArgsEvaluator", + "strict": False, + "subset": True, + }, + ) ) # Only validate critical fields @@ -1790,9 +1882,15 @@ async def test_multiple_tool_calls(self) -> None: agent_trace=mock_spans, ) - evaluator = ToolCallArgsEvaluator( # type: ignore[call-arg] - id="args-multiple", - config={"name": "ToolCallArgsEvaluator", "strict": False, "subset": False}, + evaluator = TypeAdapter(ToolCallArgsEvaluator).validate_python( + dict( + id="args-multiple", + evaluatorConfig={ + "name": "ToolCallArgsEvaluator", + "strict": False, + "subset": False, + }, + ) ) result = await evaluator.validate_and_evaluate_criteria( @@ -1826,25 +1924,31 @@ async def test_nested_arguments(self) -> None: mock_spans = [ ReadableSpan( name="configure_service", start_time=0, end_time=1, attributes={ "tool.name": "configure_service", "input.value": "{'service': 'api', 'config': {'host': 'api.example.com', 'port': 443, 'ssl': {'enabled': True, 'cert_path': '/path/to/cert'}}}", }, ), ] agent_execution = AgentExecution( agent_input={"task": "Configure API service"}, agent_output={"status": "configured"}, agent_trace=mock_spans, ) - evaluator = ToolCallArgsEvaluator( # type: ignore[call-arg] - id="args-nested", - config={"name": "ToolCallArgsEvaluator", "strict": False, "subset": False}, + evaluator = TypeAdapter(ToolCallArgsEvaluator).validate_python( + dict( + id="args-nested", + evaluatorConfig={ + "name": "ToolCallArgsEvaluator", + "strict": False, + "subset": False, + }, + ) ) result = await evaluator.validate_and_evaluate_criteria( @@ -1913,13 +2017,15 @@ async def test_non_strict_proportional_scoring(self) -> None: agent_trace=mock_spans, ) - evaluator = ToolCallArgsEvaluator( # type: ignore[call-arg] - id="args-proportional", - config={ - "name": "ToolCallArgsEvaluator", - "strict": False, # Proportional scoring - "subset": False, - }, + evaluator = TypeAdapter(ToolCallArgsEvaluator).validate_python( + dict( + id="args-proportional", + evaluatorConfig={ + "name": "ToolCallArgsEvaluator", + "strict": False, # Proportional scoring + "subset": False, + }, + ) ) result = await evaluator.validate_and_evaluate_criteria( @@ -1978,9 +2084,11 @@ async def test_basic_output_validation(self) -> None: agent_trace=mock_spans, ) - evaluator = ToolCallOutputEvaluator( # type: ignore[call-arg] - id="output-check-1", - config={"name": "ToolCallOutputEvaluator", "strict": False}, + evaluator = TypeAdapter(ToolCallOutputEvaluator).validate_python( + dict( + id="output-check-1", + evaluatorConfig={"name": "ToolCallOutputEvaluator",
"strict": False}, + ) ) result = await evaluator.validate_and_evaluate_criteria( @@ -2022,9 +2130,11 @@ async def test_strict_mode_exact_output_matching(self) -> None: agent_trace=mock_spans, ) - evaluator = ToolCallOutputEvaluator( # type: ignore[call-arg] - id="output-strict", - config={"name": "ToolCallOutputEvaluator", "strict": True}, + evaluator = TypeAdapter(ToolCallOutputEvaluator).validate_python( + dict( + id="output-strict", + evaluatorConfig={"name": "ToolCallOutputEvaluator", "strict": True}, + ) ) result = await evaluator.validate_and_evaluate_criteria( @@ -2075,9 +2185,11 @@ async def test_multiple_tool_outputs(self) -> None: agent_trace=mock_spans, ) - evaluator = ToolCallOutputEvaluator( # type: ignore[call-arg] - id="output-multiple", - config={"name": "ToolCallOutputEvaluator", "strict": False}, + evaluator = TypeAdapter(ToolCallOutputEvaluator).validate_python( + dict( + id="output-multiple", + evaluatorConfig={"name": "ToolCallOutputEvaluator", "strict": False}, + ) ) result = await evaluator.validate_and_evaluate_criteria( @@ -2117,9 +2229,11 @@ async def test_error_handling_validation(self) -> None: agent_trace=mock_spans, ) - evaluator = ToolCallOutputEvaluator( # type: ignore[call-arg] - id="error-validation", - config={"name": "ToolCallOutputEvaluator", "strict": False}, + evaluator = TypeAdapter(ToolCallOutputEvaluator).validate_python( + dict( + id="error-validation", + evaluatorConfig={"name": "ToolCallOutputEvaluator", "strict": False}, + ) ) result = await evaluator.validate_and_evaluate_criteria( @@ -2181,12 +2295,14 @@ async def test_non_strict_proportional_scoring(self) -> None: agent_trace=mock_spans, ) - evaluator = ToolCallOutputEvaluator( # type: ignore[call-arg] - id="output-proportional", - config={ - "name": "ToolCallOutputEvaluator", - "strict": False, # Proportional scoring - }, + evaluator = TypeAdapter(ToolCallOutputEvaluator).validate_python( + dict( + id="output-proportional", + evaluatorConfig={ + "name": "ToolCallOutputEvaluator", + "strict": False, # Proportional scoring + }, + ) ) result = await evaluator.validate_and_evaluate_criteria( diff --git a/tests/evaluators/test_evaluator_methods.py b/tests/evaluators/test_evaluator_methods.py index f724e8d10..fa017dbe3 100644 --- a/tests/evaluators/test_evaluator_methods.py +++ b/tests/evaluators/test_evaluator_methods.py @@ -15,6 +15,7 @@ import pytest from opentelemetry.sdk.trace import ReadableSpan +from pydantic import ValidationError from pytest_mock.plugin import MockerFixture from uipath.eval.evaluators.contains_evaluator import ( @@ -139,7 +140,7 @@ async def test_exact_match_string_success( "default_evaluation_criteria": {"expected_output": "test"}, } evaluator = ExactMatchEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = OutputEvaluationCriteria(expected_output={"output": "Test output"}) # pyright: ignore[reportCallIssue] @@ -159,7 +160,7 @@ async def test_exact_match_string_failure( "default_evaluation_criteria": {"expected_output": "test"}, } evaluator = ExactMatchEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = OutputEvaluationCriteria( expected_output={"output": "Different output"} # pyright: ignore[reportCallIssue] @@ -181,7 +182,7 @@ async def test_exact_match_negated( "negated": True, } evaluator = ExactMatchEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": 
config, "id": str(uuid.uuid4())} ) criteria = OutputEvaluationCriteria( expected_output={"output": "Test output"}, # pyright: ignore[reportCallIssue] @@ -202,7 +203,7 @@ async def test_exact_match_validate_and_evaluate_criteria( "case_sensitive": True, } evaluator = ExactMatchEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) raw_criteria = {"expected_output": {"output": "Test output"}} @@ -228,7 +229,7 @@ async def test_contains_evaluator( "default_evaluation_criteria": {"search_text": "Test output"}, } evaluator = ContainsEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = ContainsEvaluationCriteria(search_text="Test output") result = await evaluator.evaluate(sample_agent_execution, criteria) @@ -248,7 +249,7 @@ async def test_contains_evaluator_negated( "default_evaluation_criteria": {"search_text": "Test output"}, } evaluator = ContainsEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = ContainsEvaluationCriteria(search_text="Test output") result = await evaluator.evaluate(sample_agent_execution, criteria) @@ -267,7 +268,7 @@ async def test_contains_evaluator_validate_and_evaluate_criteria( "default_evaluation_criteria": {"search_text": "Test output"}, } evaluator = ContainsEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = ContainsEvaluationCriteria(search_text="Test output") result = await evaluator.validate_and_evaluate_criteria( @@ -292,7 +293,7 @@ async def test_json_similarity_identical(self) -> None: "name": "JsonSimilarityTest", } evaluator = JsonSimilarityEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = OutputEvaluationCriteria( expected_output={"name": "John", "age": 30, "city": "NYC"} # pyright: ignore[reportCallIssue] @@ -316,7 +317,7 @@ async def test_json_similarity_partial_match(self) -> None: "default_evaluation_criteria": {"expected_output": "test"}, } evaluator = JsonSimilarityEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = OutputEvaluationCriteria( expected_output={"name": "John", "age": 30, "city": "NYC"} # pyright: ignore[reportCallIssue] @@ -339,7 +340,7 @@ async def test_json_similarity_validate_and_evaluate_criteria(self) -> None: "name": "JsonSimilarityTest", } evaluator = JsonSimilarityEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) raw_criteria = {"expected_output": {"name": "John", "age": 30, "city": "NYC"}} @@ -364,7 +365,7 @@ async def test_tool_call_order_perfect_match( } evaluator = ToolCallOrderEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = ToolCallOrderEvaluationCriteria( tool_calls_order=["tool1", "tool2", "tool1", "tool2"] @@ -387,7 +388,7 @@ async def test_tool_call_order_no_perfect_match( } evaluator = ToolCallOrderEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = ToolCallOrderEvaluationCriteria( tool_calls_order=["tool1", "tool1", "tool2", "tool2"] @@ -409,7 +410,7 @@ 
async def test_tool_call_order_lcs_match( "strict": False, } evaluator = ToolCallOrderEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = ToolCallOrderEvaluationCriteria( tool_calls_order=["tool1", "tool1", "tool2", "tool2"] @@ -430,7 +431,7 @@ async def test_tool_call_order_validate_and_evaluate_criteria( "strict": True, } evaluator = ToolCallOrderEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) raw_criteria = {"tool_calls_order": ["tool1", "tool2", "tool1", "tool2"]} @@ -455,7 +456,7 @@ async def test_tool_call_count_exact_match( "strict": True, } evaluator = ToolCallCountEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = ToolCallCountEvaluationCriteria( tool_calls_count={"tool1": ("=", 2), "tool2": ("=", 2)} @@ -476,7 +477,7 @@ async def test_tool_call_count_with_gt( "strict": True, } evaluator = ToolCallCountEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = ToolCallCountEvaluationCriteria( tool_calls_count={"tool1": (">", 1), "tool2": (">", 1)} @@ -497,7 +498,7 @@ async def test_tool_call_count_no_exact_match( "strict": True, } evaluator = ToolCallCountEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = ToolCallCountEvaluationCriteria( tool_calls_count={"tool1": ("=", 2), "tool2": ("=", 1)} @@ -518,7 +519,7 @@ async def test_tool_call_count_partial_match( "strict": False, } evaluator = ToolCallCountEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = ToolCallCountEvaluationCriteria( tool_calls_count={"tool1": ("=", 2), "tool2": ("=", 1)} @@ -539,7 +540,7 @@ async def test_tool_call_count_validate_and_evaluate_criteria( "strict": True, } evaluator = ToolCallCountEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) raw_criteria = {"tool_calls_count": {"tool1": ("=", 2), "tool2": ("=", 2)}} @@ -564,7 +565,7 @@ async def test_tool_call_args_perfect_match( "strict": True, } evaluator = ToolCallArgsEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = ToolCallArgsEvaluationCriteria( tool_calls=[ @@ -590,7 +591,7 @@ async def test_tool_call_args_partial_match( "strict": False, } evaluator = ToolCallArgsEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = ToolCallArgsEvaluationCriteria( tool_calls=[ @@ -616,7 +617,7 @@ async def test_tool_call_args_validate_and_evaluate_criteria( "strict": True, } evaluator = ToolCallArgsEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) raw_criteria = { "tool_calls": [ @@ -648,7 +649,7 @@ async def test_tool_call_output_perfect_match( "strict": True, } evaluator = ToolCallOutputEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = ToolCallOutputEvaluationCriteria( tool_outputs=[ @@ -674,7 +675,7 @@ async def 
test_tool_call_output_partial_match( "strict": False, } evaluator = ToolCallOutputEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = ToolCallOutputEvaluationCriteria( tool_outputs=[ @@ -700,7 +701,7 @@ async def test_tool_call_output_no_match_strict( "strict": True, } evaluator = ToolCallOutputEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = ToolCallOutputEvaluationCriteria( tool_outputs=[ @@ -726,7 +727,7 @@ async def test_tool_call_output_partial_match_non_strict( "strict": False, } evaluator = ToolCallOutputEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = ToolCallOutputEvaluationCriteria( tool_outputs=[ @@ -750,7 +751,7 @@ async def test_tool_call_output_empty_criteria( "strict": False, } evaluator = ToolCallOutputEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = ToolCallOutputEvaluationCriteria(tool_outputs=[]) @@ -769,7 +770,7 @@ async def test_tool_call_output_validate_and_evaluate_criteria( "strict": True, } evaluator = ToolCallOutputEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) raw_criteria = { "tool_outputs": [ @@ -831,7 +832,7 @@ async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any: "model": "gpt-4o-2024-08-06", } evaluator = LLMJudgeOutputEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = OutputEvaluationCriteria(expected_output="Expected output") # pyright: ignore[reportCallIssue] @@ -875,7 +876,7 @@ async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any: } evaluator = LLMJudgeOutputEvaluator.model_validate( { - "config": config, + "evaluatorConfig": config, "llm_service": mock_chat_completions, "id": str(uuid.uuid4()), } @@ -931,7 +932,7 @@ async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any: "model": "gpt-4", } evaluator = LLMJudgeOutputEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) raw_criteria = {"expected_output": "Expected output"} @@ -989,7 +990,7 @@ async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any: "model": "gpt-4", } evaluator = LLMJudgeTrajectoryEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = TrajectoryEvaluationCriteria( @@ -1044,7 +1045,7 @@ async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any: "model": "gpt-4", } evaluator = LLMJudgeTrajectoryEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) raw_criteria = {"expected_agent_behavior": "Agent should respond helpfully"} @@ -1069,7 +1070,7 @@ async def test_invalid_criteria_type(self) -> None: "default_evaluation_criteria": {"expected_output": "test"}, } evaluator = ExactMatchEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) with pytest.raises(UiPathEvaluationError): @@ -1088,10 +1089,10 @@ async def test_missing_config_fields(self, mocker: MockerFixture) -> None: "default_evaluation_criteria": {}, } - 
with pytest.raises(UiPathEvaluationError, match="Field required"): + with pytest.raises(ValidationError): # Missing required field 'model' LLMJudgeOutputEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) @@ -1107,7 +1108,7 @@ async def test_evaluators_return_results_with_scores( "name": "Test", } evaluator = ExactMatchEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = OutputEvaluationCriteria(expected_output={"output": "Test output"}) # pyright: ignore[reportCallIssue] @@ -1130,7 +1131,7 @@ async def test_exact_match_evaluator_justification( "case_sensitive": True, } evaluator = ExactMatchEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = OutputEvaluationCriteria(expected_output={"output": "Test output"}) # pyright: ignore[reportCallIssue] @@ -1157,7 +1158,7 @@ async def test_json_similarity_evaluator_justification(self) -> None: "name": "JsonSimilarityTest", } evaluator = JsonSimilarityEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = OutputEvaluationCriteria( expected_output={"name": "John", "age": 30, "city": "NYC"} # pyright: ignore[reportCallIssue] @@ -1184,7 +1185,7 @@ async def test_tool_call_order_evaluator_justification( "strict": True, } evaluator = ToolCallOrderEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = ToolCallOrderEvaluationCriteria( tool_calls_order=["tool1", "tool2", "tool1", "tool2"] @@ -1211,7 +1212,7 @@ async def test_tool_call_count_evaluator_justification( "strict": True, } evaluator = ToolCallCountEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = ToolCallCountEvaluationCriteria( tool_calls_count={"tool1": ("=", 2), "tool2": ("=", 2)} @@ -1238,7 +1239,7 @@ async def test_tool_call_args_evaluator_justification( "strict": True, } evaluator = ToolCallArgsEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = ToolCallArgsEvaluationCriteria( tool_calls=[ @@ -1270,7 +1271,7 @@ async def test_tool_call_output_evaluator_justification( "strict": True, } evaluator = ToolCallOutputEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = ToolCallOutputEvaluationCriteria( tool_outputs=[ @@ -1331,7 +1332,7 @@ async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any: "model": "gpt-4o-2024-08-06", } evaluator = LLMJudgeOutputEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = OutputEvaluationCriteria(expected_output="Expected output") # pyright: ignore[reportCallIssue] @@ -1387,7 +1388,7 @@ async def mock_chat_completions(*args: Any, **kwargs: Any) -> Any: "model": "gpt-4", } evaluator = LLMJudgeTrajectoryEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) criteria = TrajectoryEvaluationCriteria( expected_agent_behavior="Agent should respond helpfully" @@ -1412,7 +1413,7 @@ def test_justification_validation_edge_cases(self, 
mocker: MockerFixture) -> Non "default_evaluation_criteria": {"expected_output": "test"}, } none_evaluator = ExactMatchEvaluator.model_validate( - {"config": config_dict, "id": str(uuid.uuid4())} + {"evaluatorConfig": config_dict, "id": str(uuid.uuid4())} ) # All inputs should return None for None type evaluators @@ -1431,7 +1432,7 @@ def test_justification_validation_edge_cases(self, mocker: MockerFixture) -> Non mock_llm_service = mocker.MagicMock() str_evaluator = LLMJudgeOutputEvaluator.model_validate( { - "config": llm_config_dict, + "evaluatorConfig": llm_config_dict, "llm_service": mock_llm_service, "id": str(uuid.uuid4()), } @@ -1540,7 +1541,7 @@ async def capture_chat_completions(**kwargs: Any) -> Any: # max_tokens is intentionally omitted (defaults to None) } evaluator = LLMJudgeOutputEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) # Evaluate diff --git a/tests/evaluators/test_evaluator_schemas.py b/tests/evaluators/test_evaluator_schemas.py index f7c7de4bd..ba11e5d69 100644 --- a/tests/evaluators/test_evaluator_schemas.py +++ b/tests/evaluators/test_evaluator_schemas.py @@ -347,7 +347,7 @@ def test_config_validation_exact_match(self) -> None: "default_evaluation_criteria": {"expected_output": "test"}, } evaluator = ExactMatchEvaluator.model_validate( - {"config": config_dict, "id": str(uuid.uuid4())} + {"evaluatorConfig": config_dict, "id": str(uuid.uuid4())} ) assert isinstance(evaluator.evaluator_config, ExactMatchEvaluatorConfig) @@ -361,7 +361,7 @@ def test_criteria_validation_exact_match(self) -> None: "default_evaluation_criteria": {"expected_output": "test"}, } evaluator = ExactMatchEvaluator.model_validate( - {"config": config_dict, "id": str(uuid.uuid4())} + {"evaluatorConfig": config_dict, "id": str(uuid.uuid4())} ) # Test dict validation @@ -379,7 +379,7 @@ def test_criteria_validation_tool_call_order(self) -> None: "default_evaluation_criteria": {"tool_calls_order": ["tool1", "tool2"]}, } evaluator = ToolCallOrderEvaluator.model_validate( - {"config": config_dict, "id": str(uuid.uuid4())} + {"evaluatorConfig": config_dict, "id": str(uuid.uuid4())} ) # Test dict validation @@ -400,7 +400,7 @@ def test_config_validation_tool_call_output(self) -> None: }, } evaluator = ToolCallOutputEvaluator.model_validate( - {"config": config_dict, "id": str(uuid.uuid4())} + {"evaluatorConfig": config_dict, "id": str(uuid.uuid4())} ) assert isinstance(evaluator.evaluator_config, ToolCallOutputEvaluatorConfig) @@ -417,7 +417,7 @@ def test_criteria_validation_tool_call_output(self) -> None: }, } evaluator = ToolCallOutputEvaluator.model_validate( - {"config": config_dict, "id": str(uuid.uuid4())} + {"evaluatorConfig": config_dict, "id": str(uuid.uuid4())} ) # Test dict validation @@ -446,7 +446,7 @@ def test_criteria_validation_llm_judge_output(self, mocker: MockerFixture) -> No mock_llm_service = mocker.MagicMock() evaluator = LLMJudgeOutputEvaluator.model_validate( { - "config": config_dict, + "evaluatorConfig": config_dict, "llm_service": mock_llm_service, "id": str(uuid.uuid4()), } @@ -467,7 +467,7 @@ def test_automatic_type_detection(self) -> None: "default_evaluation_criteria": {"expected_output": "test"}, } evaluator = JsonSimilarityEvaluator.model_validate( - {"config": config_dict, "id": str(uuid.uuid4())} + {"evaluatorConfig": config_dict, "id": str(uuid.uuid4())} ) # Types should be set correctly @@ -481,7 +481,7 @@ def test_justification_validation_none_type(self) -> None: 
"default_evaluation_criteria": {"expected_output": "test"}, } evaluator = ExactMatchEvaluator.model_validate( - {"config": config_dict, "id": str(uuid.uuid4())} + {"evaluatorConfig": config_dict, "id": str(uuid.uuid4())} ) # Test None validation @@ -498,7 +498,7 @@ def test_justification_validation_str_type(self, mocker: MockerFixture) -> None: mock_llm_service = mocker.MagicMock() evaluator = LLMJudgeOutputEvaluator.model_validate( { - "config": config_dict, + "evaluatorConfig": config_dict, "llm_service": mock_llm_service, "id": str(uuid.uuid4()), } @@ -520,7 +520,7 @@ def test_justification_type_consistency(self, mocker: MockerFixture) -> None: "default_evaluation_criteria": {"expected_output": "test"}, } exact_match_evaluator = ExactMatchEvaluator.model_validate( - {"config": config_dict, "id": str(uuid.uuid4())} + {"evaluatorConfig": config_dict, "id": str(uuid.uuid4())} ) assert exact_match_evaluator.justification_type is type(None) @@ -533,7 +533,7 @@ def test_justification_type_consistency(self, mocker: MockerFixture) -> None: mock_llm_service = mocker.MagicMock() llm_evaluator = LLMJudgeOutputEvaluator.model_validate( { - "config": llm_config_dict, + "evaluatorConfig": llm_config_dict, "llm_service": mock_llm_service, "id": str(uuid.uuid4()), } @@ -552,7 +552,7 @@ def test_instance_config_access(self) -> None: "default_evaluation_criteria": {"expected_output": "test"}, } evaluator = ExactMatchEvaluator.model_validate( - {"config": config_data, "id": str(uuid.uuid4())} + {"evaluatorConfig": config_data, "id": str(uuid.uuid4())} ) # Test direct config access @@ -569,7 +569,7 @@ def test_instance_schema_access(self) -> None: "default_evaluation_criteria": {"expected_output": "test"}, } evaluator = JsonSimilarityEvaluator.model_validate( - {"config": config_dict, "id": str(uuid.uuid4())} + {"evaluatorConfig": config_dict, "id": str(uuid.uuid4())} ) # Should be able to get schemas from instances diff --git a/tests/evaluators/test_llm_judge_placeholder_validation.py b/tests/evaluators/test_llm_judge_placeholder_validation.py index 3c15043b5..71108fdbe 100644 --- a/tests/evaluators/test_llm_judge_placeholder_validation.py +++ b/tests/evaluators/test_llm_judge_placeholder_validation.py @@ -31,7 +31,7 @@ def test_both_placeholders_present_no_modification(self): with patch("uipath.platform.UiPath"): evaluator = LLMJudgeOutputEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) # Prompt should remain unchanged @@ -53,7 +53,7 @@ def test_missing_expected_output_placeholder_added_with_tags(self): with patch("uipath.platform.UiPath"): evaluator = LLMJudgeOutputEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) # Check that ExpectedOutput section was added @@ -89,7 +89,7 @@ def test_missing_actual_output_placeholder_added_with_tags(self): with patch("uipath.platform.UiPath"): evaluator = LLMJudgeOutputEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) # Check that ActualOutput section was added @@ -122,7 +122,7 @@ def test_both_placeholders_missing_both_added_with_tags(self): with patch("uipath.platform.UiPath"): evaluator = LLMJudgeOutputEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) # Check that both sections were added @@ -167,7 +167,7 @@ def 
test_placeholder_order_actual_then_expected(self): with patch("uipath.platform.UiPath"): evaluator = LLMJudgeOutputEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) # Find positions of the sections @@ -189,7 +189,7 @@ def test_xml_tags_properly_nested(self): with patch("uipath.platform.UiPath"): evaluator = LLMJudgeOutputEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) # Check proper nesting for ActualOutput @@ -219,7 +219,7 @@ def test_custom_placeholder_delimiters_not_affected(self): with patch("uipath.platform.UiPath"): evaluator = LLMJudgeOutputEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) # Double braces should be preserved @@ -245,7 +245,7 @@ def test_sections_appended_not_prepended(self): with patch("uipath.platform.UiPath"): evaluator = LLMJudgeOutputEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) # Original prompt should be at the start @@ -278,7 +278,7 @@ def test_multiline_prompt_with_missing_placeholders(self): with patch("uipath.platform.UiPath"): evaluator = LLMJudgeOutputEvaluator.model_validate( - {"config": config, "id": str(uuid.uuid4())} + {"evaluatorConfig": config, "id": str(uuid.uuid4())} ) # Original multiline content should be preserved diff --git a/uv.lock b/uv.lock index 0cac305ce..a54ea1c17 100644 --- a/uv.lock +++ b/uv.lock @@ -2491,7 +2491,7 @@ wheels = [ [[package]] name = "uipath" -version = "2.7.0" +version = "2.7.1" source = { editable = "." } dependencies = [ { name = "applicationinsights" },
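For reference, a minimal sketch of the construction pattern the documentation-example and evaluator-method tests above migrate to: evaluators are now built from plain data through pydantic's TypeAdapter (or model_validate) with an "evaluatorConfig" key, instead of passing config= to the constructor. The import paths mirror those used in the test files; the id, sample agent data, and criteria below are illustrative assumptions, not values taken from this change.

import asyncio

from pydantic import TypeAdapter

from uipath.eval.evaluators import ExactMatchEvaluator
from uipath.eval.models import AgentExecution


async def main() -> None:
    # Build the evaluator from plain data; "evaluatorConfig" replaces the former "config" key.
    evaluator = TypeAdapter(ExactMatchEvaluator).validate_python(
        dict(
            id="exact-match-example",  # illustrative id
            evaluatorConfig={
                "name": "ExactMatchEvaluator",
                "case_sensitive": False,
                "target_output_key": "result",  # compare only the "result" field
            },
        )
    )

    # A synthetic agent run; real runs would also carry a recorded trace.
    execution = AgentExecution(
        agent_input={"question": "What is 2 + 2?"},
        agent_output={"result": "4"},
        agent_trace=[],
    )

    # validate_and_evaluate_criteria accepts raw criteria dicts, as in the tests above.
    result = await evaluator.validate_and_evaluate_criteria(
        agent_execution=execution,
        evaluation_criteria={"expected_output": {"result": "4"}},
    )
    print(result)


asyncio.run(main())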