diff --git a/eval_protocol/cli.py b/eval_protocol/cli.py index a2278a31..ac8a8d9d 100644 --- a/eval_protocol/cli.py +++ b/eval_protocol/cli.py @@ -3,38 +3,17 @@ """ import argparse -import inspect -import json import logging import os import sys from pathlib import Path -from typing import Any, cast -from .cli_commands.utils import add_args_from_callable_signature from fireworks import Fireworks -logger = logging.getLogger(__name__) - - from .cli_commands.common import setup_logging +from .cli_commands.utils import add_args_from_callable_signature -# Re-export deploy_command for backward compatibility with tests importing from eval_protocol.cli -try: # pragma: no cover - import-time alias for tests - from .cli_commands import deploy as _deploy_mod - - deploy_command = _deploy_mod.deploy_command # type: ignore[attr-defined] -except Exception: # pragma: no cover - # If import fails in constrained environments, tests that import it will surface the issue - deploy_command = None # type: ignore[assignment] - -# Re-export preview_command for backward compatibility with tests importing from eval_protocol.cli -try: # pragma: no cover - import-time alias for tests - from .cli_commands import preview as _preview_mod - - preview_command = _preview_mod.preview_command # type: ignore[attr-defined] -except Exception: # pragma: no cover - preview_command = None # type: ignore[assignment] +logger = logging.getLogger(__name__) def build_parser() -> argparse.ArgumentParser: @@ -55,257 +34,6 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse subparsers = parser.add_subparsers(dest="command", help="Command to run") - # NOTE: The following commands are hidden/disabled. Uncomment to re-enable. - # # Preview command - # preview_parser = subparsers.add_parser("preview", help="Preview an evaluator with sample data") - # preview_parser.add_argument( - # "--metrics-folders", - # "-m", - # nargs="+", - # help="Metric folders in format 'name=path', e.g., 'clarity=./metrics/clarity'", - # ) - # - # # Make samples optional to allow HF dataset option - # preview_parser.add_argument( - # "--samples", - # "-s", - # required=False, - # help="Path to JSONL file containing sample data", - # ) - # preview_parser.add_argument( - # "--max-samples", - # type=int, - # default=5, - # help="Maximum number of samples to process (default: 5)", - # ) - # - # # Add HuggingFace dataset options - # hf_group = preview_parser.add_argument_group("HuggingFace Dataset Options") - # hf_group.add_argument( - # "--huggingface-dataset", - # "--hf", - # help="HuggingFace dataset name (e.g., 'deepseek-ai/DeepSeek-ProverBench')", - # ) - # hf_group.add_argument( - # "--huggingface-split", - # default="train", - # help="Dataset split to use (default: 'train')", - # ) - # hf_group.add_argument( - # "--huggingface-prompt-key", - # default="prompt", - # help="Key in the dataset containing the prompt text (default: 'prompt')", - # ) - # hf_group.add_argument( - # "--huggingface-response-key", - # default="response", - # help="Key in the dataset containing the response text (default: 'response')", - # ) - # hf_group.add_argument( - # "--huggingface-key-map", - # help="JSON mapping of dataset keys to Eval Protocol message keys", - # ) - # preview_parser.add_argument( - # "--remote-url", - # help="URL of a remote reward function endpoint to preview against. 
If provided, metrics-folders might be ignored.", - # ) - # - # # Deploy command - # deploy_parser = subparsers.add_parser("deploy", help="Create and deploy an evaluator, or register a remote one") - # deploy_parser.add_argument("--id", required=True, help="ID for the evaluator") - # deploy_parser.add_argument( - # "--metrics-folders", - # "-m", - # nargs="+", - # required=False, # No longer strictly required if --remote-url is used - # help="Metric folders in format 'name=path', e.g., 'clarity=./metrics/clarity'. Required if not using --remote-url.", - # ) - # deploy_parser.add_argument( - # "--display-name", - # help="Display name for the evaluator (defaults to ID if not provided)", - # ) - # deploy_parser.add_argument("--description", help="Description for the evaluator") - # deploy_parser.add_argument( - # "--force", - # "-f", - # action="store_true", - # help="Force update if evaluator already exists", - # ) - # - # # Add HuggingFace dataset options to deploy command - # hf_deploy_group = deploy_parser.add_argument_group("HuggingFace Dataset Options") - # hf_deploy_group.add_argument( - # "--huggingface-dataset", - # "--hf", - # help="HuggingFace dataset name (e.g., 'deepseek-ai/DeepSeek-ProverBench')", - # ) - # hf_deploy_group.add_argument( - # "--huggingface-split", - # default="train", - # help="Dataset split to use (default: 'train')", - # ) - # hf_deploy_group.add_argument( - # "--huggingface-prompt-key", - # default="prompt", - # help="Key in the dataset containing the prompt text (default: 'prompt')", - # ) - # hf_deploy_group.add_argument( - # "--huggingface-response-key", - # default="response", - # help="Key in the dataset containing the response text (default: 'response')", - # ) - # hf_deploy_group.add_argument( - # "--huggingface-key-map", - # help="JSON mapping of dataset keys to Eval Protocol message keys", - # ) - # deploy_parser.add_argument( - # "--remote-url", - # help="URL of a pre-deployed remote reward function. If provided, deploys by registering this URL with Fireworks AI.", - # ) - # - # # Deployment target options - # target_group = deploy_parser.add_argument_group("Deployment Target Options") - # target_group.add_argument( - # "--target", - # choices=["fireworks", "gcp-cloud-run", "local-serve"], - # default="fireworks", - # help="Deployment target. 'fireworks' for standard Fireworks platform deployment, 'gcp-cloud-run' for Google Cloud Run, 'local-serve' for local serving with Serveo tunneling.", - # ) - # target_group.add_argument( - # "--function-ref", - # help="Reference to the reward function to deploy (e.g., 'my_module.reward_func'). Required for 'gcp-cloud-run' and 'local-serve' targets.", - # ) - # - # # Local serving options (relevant if --target is local-serve) - # local_serve_group = deploy_parser.add_argument_group("Local Serving Options (used if --target is local-serve)") - # local_serve_group.add_argument( - # "--local-port", - # type=int, - # default=8001, - # help="Port for the local reward function server to listen on (default: 8001). Used with --target local-serve.", - # ) - # - # # GCP deployment options - # gcp_group = deploy_parser.add_argument_group( - # "GCP Cloud Run Deployment Options (used if --target is gcp-cloud-run)" - # ) - # # --function-ref is now in target_group - # gcp_group.add_argument( - # "--gcp-project", - # required=False, - # help="Google Cloud Project ID. 
Must be provided via CLI or rewardkit.yaml.", - # ) - # gcp_group.add_argument( - # "--gcp-region", - # required=False, - # help="Google Cloud Region for deployment (e.g., 'us-central1'). Must be provided via CLI or rewardkit.yaml.", - # ) - # gcp_group.add_argument( - # "--gcp-ar-repo", - # required=False, - # help="Google Artifact Registry repository name. Optional, defaults to value in rewardkit.yaml or 'eval-protocol-evaluators' if not specified.", - # ) - # gcp_group.add_argument( - # "--service-account", - # help="Email of the GCP service account to run the Cloud Run service. Optional.", - # ) - # gcp_group.add_argument( - # "--entry-point", - # default="reward_function", - # help="The name of the entry point function within your --function-ref module (default: reward_function). Only for gcp-cloud-run.", - # ) - # gcp_group.add_argument( - # "--runtime", - # default="python311", # Or a sensible default - # help="The Cloud Functions/Run runtime (e.g., python311). Only for gcp-cloud-run.", - # ) - # gcp_group.add_argument( - # "--gcp-auth-mode", - # choices=["open", "api-key"], # Add 'iam' later - # default=None, # Default will be resolved in deploy_command - # help="Authentication mode for the deployed GCP Cloud Run service. " - # "'open': Publicly accessible. " - # "'api-key': Service is publicly accessible but requires an API key in requests (handled by the application). " - # "If not specified, defaults to value in rewardkit.yaml or 'api-key'. Optional.", - # ) - # - # # Deploy MCP command - # deploy_mcp_parser = subparsers.add_parser("deploy-mcp", help="Deploy an MCP server to Google Cloud Run") - # deploy_mcp_parser.add_argument("--id", required=True, help="Unique ID for the MCP server deployment") - # deploy_mcp_parser.add_argument( - # "--mcp-server-module", - # help="Python module containing the MCP server (e.g., 'examples.frozen_lake_mcp.frozen_lake_mcp_server'). Required if --dockerfile is not provided.", - # ) - # deploy_mcp_parser.add_argument( - # "--dockerfile", - # help="Path to Dockerfile to use for deployment (recommended for tested local Dockerfiles). When provided, --mcp-server-module is not required.", - # ) - # deploy_mcp_parser.add_argument( - # "--gcp-project", - # help="Google Cloud Project ID. Can also be set in rewardkit.yaml", - # ) - # deploy_mcp_parser.add_argument( - # "--gcp-region", - # help="Google Cloud Region (e.g., 'us-central1'). Can also be set in rewardkit.yaml", - # ) - # deploy_mcp_parser.add_argument( - # "--gcp-ar-repo", - # help="Google Artifact Registry repository name. Defaults to 'eval-protocol-mcp-servers'", - # ) - # deploy_mcp_parser.add_argument( - # "--port", - # type=int, - # default=8000, - # help="Port for the MCP server to listen on (default: 8000)", - # ) - # deploy_mcp_parser.add_argument( - # "--python-version", - # default="3.11", - # help="Python version for the container (default: 3.11)", - # ) - # deploy_mcp_parser.add_argument("--requirements", help="Additional pip requirements (newline separated)") - # deploy_mcp_parser.add_argument("--env-vars", nargs="*", help="Environment variables in KEY=VALUE format") - # - # # Agent-eval command - # agent_eval_parser = subparsers.add_parser( - # "agent-eval", help="Run agent evaluation using the ForkableResource framework." 
- # ) - # agent_eval_parser.add_argument( - # "--task-def", - # required=True, - # help="Path to task definition file or directory containing task definitions.", - # ) - # agent_eval_parser.add_argument( - # "--parallel", - # action="store_true", - # help="Execute tasks in parallel when multiple tasks are specified.", - # ) - # agent_eval_parser.add_argument( - # "--max-concurrency", - # type=int, - # default=3, - # help="Maximum number of tasks to execute in parallel (default: 3).", - # ) - # agent_eval_parser.add_argument( - # "--filter", - # nargs="+", - # help="Run only tasks matching the specified task IDs.", - # ) - # agent_eval_parser.add_argument( - # "--output-dir", - # default="./agent_runs", - # help="Directory to store agent evaluation run results (default: ./agent_runs).", - # ) - # agent_eval_parser.add_argument( - # "--model", - # help="Override MODEL_AGENT environment variable (format: provider/model_name).", - # ) - # agent_eval_parser.add_argument( - # "--num-rollouts", - # type=int, - # help="Override the number of parallel rollouts to execute for each task.", - # ) - # Logs command logs_parser = subparsers.add_parser("logs", help="Serve logs with file watching and real-time updates") logs_parser.add_argument("--port", type=int, default=8000, help="Port to bind to (default: 8000)") @@ -332,6 +60,8 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse "upload", help="Scan for evaluation tests, select, and upload as Fireworks evaluators", ) + + # CLI workflow flags (not part of the SDK create() signature) upload_parser.add_argument( "--path", default=".", @@ -341,23 +71,6 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse "--entry", help="Entrypoint of evaluation test to upload (module:function or path::function). 
For multiple, separate by commas.", ) - upload_parser.add_argument( - "--id", - help="Evaluator ID to use (if multiple selections, a numeric suffix is appended)", - ) - upload_parser.add_argument( - "--display-name", - help="Display name for evaluator (defaults to ID)", - ) - upload_parser.add_argument( - "--description", - help="Description for evaluator", - ) - upload_parser.add_argument( - "--force", - action="store_true", - help="Overwrite existing evaluator with the same ID", - ) upload_parser.add_argument( "--yes", "-y", @@ -368,6 +81,48 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse "--env-file", help="Path to .env file containing secrets to upload (default: .env in current directory)", ) + upload_parser.add_argument( + "--force", + action="store_true", + help="Overwrite existing evaluator with the same ID", + ) + + # Auto-generate flags from SDK Fireworks().evaluators.create() signature + create_evaluator_fn = Fireworks().evaluators.create + + upload_skip_fields = { + "__top_level__": { + "account_id", # auto-detected + "extra_headers", + "extra_query", + "extra_body", + "timeout", + }, + "evaluator": { + "commit_hash", # should be auto-detected from git + "source", # not relevant for CLI flow + }, + } + upload_aliases = { + "evaluator_id": ["--id"], + "evaluator.display_name": ["--name"], + } + upload_help_overrides = { + "evaluator_id": "Evaluator ID to use (if multiple selections, a numeric suffix is appended)", + "evaluator.display_name": "Display name for evaluator (defaults to ID)", + "evaluator.description": "Description for evaluator", + "evaluator.requirements": "Requirements for evaluator (auto-detected from requirements.txt if not provided)", + "evaluator.entry_point": "Pytest-style entrypoint (e.g., test_file.py::test_func). Auto-detected if not provided.", + "evaluator.default_dataset": "Default dataset to use with this evaluator", + } + + add_args_from_callable_signature( + upload_parser, + create_evaluator_fn, + skip_fields=upload_skip_fields, + aliases=upload_aliases, + help_overrides=upload_help_overrides, + ) # Create command group create_parser = subparsers.add_parser( @@ -484,14 +239,6 @@ def _configure_parser(parser: argparse.ArgumentParser) -> argparse.ArgumentParse help="Extra flags to pass to 'docker run' (quoted string, e.g. \"--env-file .env --memory=8g\")", ) - # # Run command (for Hydra-based evaluations) - # # This subparser intentionally defines no arguments itself. - # # All arguments after 'run' will be passed to Hydra by parse_known_args. - # subparsers.add_parser( - # "run", - # help="Run an evaluation using a Hydra configuration. All arguments after 'run' are passed to Hydra.", - # ) - # Hidden command: export-docs (for generating CLI reference documentation) export_docs_parser = subparsers.add_parser("export-docs", help=argparse.SUPPRESS) export_docs_parser.add_argument( @@ -583,30 +330,10 @@ def _extract_flag_value(argv_list, flag_name): logger.debug("Using Fireworks API base: %s", normalized) # Now parse args normally (so help/commands work), after globals applied - # Store original sys.argv[0] because Hydra might manipulate it - # and we need it if we're not calling a Hydra app. - original_script_name = sys.argv[0] - args, remaining_argv = parse_args() # Use parse_known_args + args, _ = parse_args() setup_logging(args.verbose, getattr(args, "debug", False)) - # NOTE: The following command handlers are disabled. Uncomment to re-enable. 
- # if args.command == "preview": - # if preview_command is None: - # raise ImportError("preview_command is unavailable") - # return preview_command(args) - # elif args.command == "deploy": - # if deploy_command is None: - # raise ImportError("deploy_command is unavailable") - # return deploy_command(args) - # elif args.command == "deploy-mcp": - # from .cli_commands.deploy_mcp import deploy_mcp_command - # - # return deploy_mcp_command(args) - # elif args.command == "agent-eval": - # from .cli_commands.agent_eval_cmd import agent_eval_command - # - # return agent_eval_command(args) if args.command == "logs": from .cli_commands.logs import logs_command @@ -630,92 +357,9 @@ def _extract_flag_value(argv_list, flag_name): from .cli_commands.export_docs import export_docs_command return export_docs_command(args) - # elif args.command == "run": - # # For the 'run' command, Hydra takes over argument parsing. - # - # # Filter out the initial '--' if present in remaining_argv, which parse_known_args might add - # hydra_specific_args = [arg for arg in remaining_argv if arg != "--"] - # - # # Auto-detect local conf directory and add it to config path if not explicitly provided - # has_config_path = any(arg.startswith("--config-path") for arg in hydra_specific_args) - # current_dir = os.getcwd() - # local_conf_dir = os.path.join(current_dir, "conf") - # - # if not has_config_path and os.path.isdir(local_conf_dir): - # logger.info("Auto-detected local conf directory: %s", local_conf_dir) - # hydra_specific_args = [ - # "--config-path", - # local_conf_dir, - # ] + hydra_specific_args - # - # processed_hydra_args = [] - # i = 0 - # while i < len(hydra_specific_args): - # arg = hydra_specific_args[i] - # if arg == "--config-path": - # processed_hydra_args.append(arg) - # i += 1 - # if i < len(hydra_specific_args): - # path_val = hydra_specific_args[i] - # abs_path = os.path.abspath(path_val) - # logger.debug( - # "Converting relative --config-path '%s' (space separated) to absolute '%s'", - # path_val, - # abs_path, - # ) - # processed_hydra_args.append(abs_path) - # else: - # logger.error("--config-path specified without a value.") - # elif arg.startswith("--config-path="): - # flag_part, path_val = arg.split("=", 1) - # processed_hydra_args.append(flag_part) - # abs_path = os.path.abspath(path_val) - # logger.debug( - # "Converting relative --config-path '%s' (equals separated) to absolute '%s'", - # path_val, - # abs_path, - # ) - # processed_hydra_args.append(abs_path) - # else: - # processed_hydra_args.append(arg) - # i += 1 - # - # sys.argv = [sys.argv[0]] + processed_hydra_args - # logger.info("SYSCALL_ARGV_FOR_HYDRA (after potential abspath conversion): %s", sys.argv) - # - # try: - # from .cli_commands.run_eval_cmd import hydra_cli_entry_point - # - # hydra_entry = cast(Any, hydra_cli_entry_point) - # hydra_entry() # type: ignore # pylint: disable=no-value-for-parameter - # return 0 - # except Exception as e: # pylint: disable=broad-except - # error_msg = str(e) - # logger.error("Evaluation failed: %s", e) - # - # # Provide helpful suggestions for common Hydra/config errors - # if "Cannot find primary config" in error_msg: - # logger.error("HINT: Configuration file not found.") - # logger.error("SOLUTION: Ensure you have a config file in ./conf/ directory") - # logger.error("Try: eval-protocol run --config-name simple_uipath_eval") - # elif "missing from config" in error_msg or "MissingMandatoryValue" in error_msg: - # logger.error("HINT: Required configuration values are missing.") - # 
logger.error("SOLUTION: Check your config file for missing required fields") - # elif "Config search path" in error_msg: - # logger.error("HINT: Hydra cannot find the configuration directory.") - # logger.error("SOLUTION: Create a ./conf directory with your config files") - # elif "ValidationError" in error_msg: - # logger.error("HINT: Configuration validation failed.") - # logger.error("SOLUTION: Run 'eval-protocol validate-data --file your_data.jsonl' to check data") - # - # logger.error("\nQuick fix suggestions:") - # logger.error("1. Use the simplified setup: eval-protocol run --config-name simple_uipath_eval") - # logger.error("2. Validate your data first: eval-protocol validate-data --file data.jsonl --schema agent") - # logger.error("3. Ensure you have: ./conf/simple_uipath_eval.yaml and ./uipath_reward.py") - # return 1 else: - temp_parser = argparse.ArgumentParser(prog=os.path.basename(original_script_name)) - temp_parser.print_help() + parser = build_parser() + parser.print_help() return 1 diff --git a/eval_protocol/cli_commands/deploy.py b/eval_protocol/cli_commands/deploy.py deleted file mode 100644 index 1ae0313b..00000000 --- a/eval_protocol/cli_commands/deploy.py +++ /dev/null @@ -1,509 +0,0 @@ -""" -CLI command for creating and deploying an evaluator, -or registering a pre-deployed remote evaluator. -""" - -import importlib # For dynamically importing modules -import json -import os # For os.path.join, os.makedirs, os.getcwd (already imported but good to be explicit if used extensively) -import secrets # For API key generation (already imported but good to be explicit) -import sys # For sys.executable -import time # For sleep -from pathlib import Path # For path operations -from typing import Any, Dict - -import yaml # For saving config if save_config helper doesn't exist - -# TODO: Consider moving subprocess_manager functions to a more central location if used by core CLI -try: - # Import functions with explicit names to match expected signatures - from development.utils.subprocess_manager import ( - start_ngrok_and_get_url as _start_ngrok_and_get_url, - start_process as _start_process, - start_serveo_and_get_url as _start_serveo_and_get_url, - stop_process as _stop_process, - ) -except ImportError: - # Fallback implementations when development module is not available - import signal - import socket - import subprocess - - def _fallback_start_process(command, log_path, env=None): - """Fallback process starter.""" - try: - with open(log_path, "w") as log_file: - process = subprocess.Popen(command, stdout=log_file, stderr=subprocess.STDOUT, env=env) - return process - except Exception as e: - print(f"Error starting process: {e}") - return None - - def _fallback_stop_process(pid): - """Fallback process stopper.""" - try: - import os - - os.kill(pid, signal.SIGTERM) - except Exception: - pass - - def _fallback_start_serveo_and_get_url(local_port, log_path): - """Fallback serveo tunnel - returns None to indicate unavailable.""" - print("Serveo tunneling not available - development module not found") - return None, None - - def _fallback_start_ngrok_and_get_url(local_port, log_path): - """Fallback ngrok tunnel - returns None to indicate unavailable.""" - print("ngrok tunneling not available - development module not found") - return None, None - - # Expose unified names using fallbacks - start_process = _fallback_start_process - stop_process = _fallback_stop_process - start_serveo_and_get_url = _fallback_start_serveo_and_get_url - start_ngrok_and_get_url = 
_fallback_start_ngrok_and_get_url -else: - # Wrap imported helpers to present consistent, simple signatures used below - def start_process(command, log_path, env=None): - return _start_process(command=command, log_file_path=log_path, env=env) - - def stop_process(pid): - return _stop_process(pid) - - def start_serveo_and_get_url(local_port, log_path): - return _start_serveo_and_get_url(local_port=local_port, log_file_path=log_path) - - def start_ngrok_and_get_url(local_port, log_path): - return _start_ngrok_and_get_url(local_port=local_port, ngrok_log_file=log_path) - - -from eval_protocol.auth import get_fireworks_account_id -from eval_protocol.config import ( - GCPCloudRunConfig, - RewardKitConfig, - _config_file_path as global_loaded_config_path, - get_config, -) -from eval_protocol.evaluation import create_evaluation -from eval_protocol.gcp_tools import ( - build_and_push_docker_image, - deploy_to_cloud_run, - ensure_artifact_registry_repo_exists, - ensure_gcp_secret, -) -from eval_protocol.packaging import generate_dockerfile_content -from eval_protocol.platform_api import ( # For catching errors from create_evaluation - PlatformAPIError, - create_or_update_fireworks_secret, -) - -from .common import check_environment - - -def _establish_local_server_and_tunnel(args): - """ - Handles starting the local generic server and establishing a public tunnel - using Serveo, with a fallback to ngrok. - Returns: (public_url, tunnel_provider_name, local_server_pid, tunnel_process_pid) - Returns (None, None, server_pid_or_None, None) if tunneling fails. - """ - if not args.function_ref: - print("Error: --function-ref is required for local-serve target.") - return None, None, None, None - - evaluator_id = args.id - function_ref = args.function_ref - local_server_port = args.local_port - - log_dir = os.path.join(os.getcwd(), "logs", "eval-protocol-local") - os.makedirs(log_dir, exist_ok=True) - generic_server_log_path = os.path.join(log_dir, f"generic_server_{evaluator_id}.log") - - server_env = None # Run local server without API key protection - print(f"Note: Local server for '{evaluator_id}' will run without API key protection.") - - print(f"Starting local reward function server for '{function_ref}' on port {local_server_port}...") - server_command = [ - sys.executable, - "-m", - "eval_protocol.generic_server", - function_ref, - "--port", - str(local_server_port), - ] - - local_server_process = start_process(server_command, generic_server_log_path, env=server_env) - - if not local_server_process or local_server_process.poll() is not None: - print(f"Error: Failed to start local generic server. Check log: {generic_server_log_path}") - return None, None, None, None # No server, no tunnel - - local_server_pid = local_server_process.pid - print(f"Local server started (PID: {local_server_pid}). Log: {generic_server_log_path}") - print("Waiting for server to initialize...") - time.sleep(5) - - # Attempt Serveo first - print(f"Attempting Serveo tunnel for local port {local_server_port}...") - serveo_log_path = os.path.join(log_dir, f"serveo_{evaluator_id}.log") - serveo_tunnel_process, serveo_url = start_serveo_and_get_url(local_server_port, serveo_log_path) - - if serveo_url and serveo_tunnel_process: - print(f"Serveo tunnel established: {serveo_url} (PID: {serveo_tunnel_process.pid}). Log: {serveo_log_path}") - return serveo_url, "serveo", local_server_pid, serveo_tunnel_process.pid - else: - print(f"Serveo tunnel failed. 
Check log: {serveo_log_path}") - print("Attempting fallback to ngrok...") - - ngrok_log_path = os.path.join(log_dir, f"ngrok_{evaluator_id}.log") - # Assuming ngrok authtoken is pre-configured by the user or via NGROK_AUTHTOKEN env var - ngrok_tunnel_process, ngrok_url = start_ngrok_and_get_url(local_server_port, ngrok_log_path) - - if ngrok_url and ngrok_tunnel_process: - print(f"ngrok tunnel established: {ngrok_url} (PID: {ngrok_tunnel_process.pid}). Log: {ngrok_log_path}") - return ngrok_url, "ngrok", local_server_pid, ngrok_tunnel_process.pid - else: - print(f"ngrok tunnel also failed. Check log: {ngrok_log_path}") - # Both failed, stop the local server we started - if local_server_pid: - stop_process(local_server_pid) - return ( - None, - None, - local_server_pid, - None, - ) # URL, provider, server_pid, tunnel_pid - - -def _deploy_to_gcp_cloud_run(args, current_config, gcp_config_from_yaml): - """Handles the logic for --target gcp-cloud-run up to service deployment.""" - print(f"Starting GCP Cloud Run deployment for evaluator '{args.id}'...") - - # Resolve function_ref (must be from CLI for GCP) - if not args.function_ref: # This check is also in main, but good for helper too - print("Error: --function-ref is required for GCP Cloud Run deployment.") - return None - - # Dynamically import the reward function to get its requirements - inline_requirements_content = None - try: - module_name, func_name = args.function_ref.rsplit(".", 1) - module = importlib.import_module(module_name) - reward_func = getattr(module, func_name) - if hasattr(reward_func, "_reward_function_requirements"): - inline_requirements_content = reward_func._reward_function_requirements - if inline_requirements_content: - print(f"Found inline requirements for {args.function_ref}") - except Exception as e: - print(f"Warning: Could not load reward function {args.function_ref} to check for inline requirements: {e}") - # Continue without inline requirements if loading fails - - # Resolve GCP project_id - gcp_project_id = args.gcp_project - if not gcp_project_id and gcp_config_from_yaml: - gcp_project_id = gcp_config_from_yaml.project_id - if not gcp_project_id: - print("Error: GCP Project ID must be provided via --gcp-project argument or in rewardkit.yaml.") - return None - - # Resolve GCP region - gcp_region = args.gcp_region - if not gcp_region and gcp_config_from_yaml: - gcp_region = gcp_config_from_yaml.region - if not gcp_region: - print("Error: GCP Region must be provided via --gcp-region argument or in rewardkit.yaml.") - return None - - # Resolve GCP AR repo name - gcp_ar_repo_name = args.gcp_ar_repo - if not gcp_ar_repo_name and gcp_config_from_yaml: - gcp_ar_repo_name = gcp_config_from_yaml.artifact_registry_repository - if not gcp_ar_repo_name: - gcp_ar_repo_name = "eval-protocol-evaluators" - - print(f"Using GCP Project: {gcp_project_id}, Region: {gcp_region}, AR Repo: {gcp_ar_repo_name}") - - if not ensure_artifact_registry_repo_exists( - project_id=gcp_project_id, region=gcp_region, repo_name=gcp_ar_repo_name - ): - print(f"Failed to ensure Artifact Registry repository '{gcp_ar_repo_name}' exists. 
Aborting.") - return None - - dockerfile_content = generate_dockerfile_content( - function_ref=args.function_ref, - python_version=( - f"{args.runtime[6]}.{args.runtime[7:]}" - if args.runtime.startswith("python") and len(args.runtime) > 7 - else args.runtime.replace("python", "") - ), - eval_protocol_install_source=".", - user_requirements_path=None, # Explicitly None, inline_requirements_content will be used - inline_requirements_content=inline_requirements_content, - service_port=8080, - ) - if not dockerfile_content: - print("Failed to generate Dockerfile content. Aborting.") - return None - - image_tag = "latest" - image_name_tag = f"{gcp_region}-docker.pkg.dev/{gcp_project_id}/{gcp_ar_repo_name}/{args.id}:{image_tag}" - build_context_dir = os.getcwd() - - if not build_and_push_docker_image( - image_name_tag=image_name_tag, - dockerfile_content=dockerfile_content, - build_context_dir=build_context_dir, - gcp_project_id=gcp_project_id, - ): - print(f"Failed to build and push Docker image {image_name_tag}. Aborting.") - return None - print(f"Successfully built and pushed Docker image: {image_name_tag}") - - gcp_env_vars: Dict[str, str] = {} - parsed_gcp_secrets: Dict[str, Any] = {} - allow_unauthenticated_gcp = True - - resolved_auth_mode = "api-key" - if gcp_config_from_yaml and gcp_config_from_yaml.default_auth_mode: - resolved_auth_mode = gcp_config_from_yaml.default_auth_mode - if args.gcp_auth_mode is not None: - resolved_auth_mode = args.gcp_auth_mode - print(f"Using GCP Auth Mode for service: {resolved_auth_mode}") - - if resolved_auth_mode == "api-key": - print("Configuring GCP Cloud Run service for API key authentication (application layer).") - evaluator_id = args.id - api_key_for_service = None # This is the key the service itself will use - config_path = global_loaded_config_path - - if current_config.evaluator_endpoint_keys and evaluator_id in current_config.evaluator_endpoint_keys: - api_key_for_service = current_config.evaluator_endpoint_keys[evaluator_id] - print(f"Using existing API key for '{evaluator_id}' from configuration for the service.") - else: - api_key_for_service = secrets.token_hex(32) - print(f"Generated new API key for '{evaluator_id}' for the service.") - if not current_config.evaluator_endpoint_keys: - current_config.evaluator_endpoint_keys = {} - current_config.evaluator_endpoint_keys[evaluator_id] = api_key_for_service - if config_path: - _save_config(current_config, config_path) - else: - print(f"Warning: No rewardkit.yaml found to save API key for '{evaluator_id}'.") - - gcp_sanitized_eval_id = "".join(filter(lambda char: char.isalnum() or char in ["-", "_"], args.id)) - if not gcp_sanitized_eval_id: - gcp_sanitized_eval_id = "evalprotocol-evaluator" - secret_id_for_auth_key = f"rk-eval-{gcp_sanitized_eval_id}-authkey" - secret_labels = {"managed-by": "eval-protocol", "evaluator-id": evaluator_id} - - api_key_secret_version_id = ensure_gcp_secret( - project_id=gcp_project_id, - secret_id=secret_id_for_auth_key, - secret_value=api_key_for_service, - labels=secret_labels, - ) - if not api_key_secret_version_id: - print(f"Error: Failed to store API key in GCP Secret Manager for '{evaluator_id}'. 
Aborting.") - return None - print(f"API key for service stored in GCP Secret Manager: {secret_id_for_auth_key}") - parsed_gcp_secrets["RK_ENDPOINT_API_KEY"] = api_key_secret_version_id - - # Register this key with Fireworks secrets for the shim - fireworks_account_id_for_secret = get_fireworks_account_id() - if fireworks_account_id_for_secret: - fw_eval_id_sanitized = args.id.lower() - fw_eval_id_sanitized = "".join(filter(lambda char: char.isalnum() or char == "-", fw_eval_id_sanitized)) - fw_eval_id_sanitized = "-".join(filter(None, fw_eval_id_sanitized.split("-"))) - if not fw_eval_id_sanitized: - fw_eval_id_sanitized = "evaluator" - fw_eval_id_sanitized = fw_eval_id_sanitized[:40] - fw_secret_key_name = f"rkeval-{fw_eval_id_sanitized}-shim-key" - print(f"Registering API key on Fireworks platform as secret '{fw_secret_key_name}' for shim...") - if create_or_update_fireworks_secret( - account_id=fireworks_account_id_for_secret, - key_name=fw_secret_key_name, - secret_value=api_key_for_service, - ): - print(f"Successfully registered/updated secret '{fw_secret_key_name}' on Fireworks platform.") - else: - print(f"Warning: Failed to register/update secret '{fw_secret_key_name}' on Fireworks platform.") - else: - print("Warning: Fireworks Account ID not found, cannot store shim API key on Fireworks platform.") - - cloud_run_service_url = deploy_to_cloud_run( - service_name=args.id, - image_name_tag=image_name_tag, - gcp_project_id=gcp_project_id, - gcp_region=gcp_region, - allow_unauthenticated=allow_unauthenticated_gcp, # True if api-key mode, app handles auth - env_vars=gcp_env_vars if gcp_env_vars else None, - secrets_to_mount=parsed_gcp_secrets, - ) - - if not cloud_run_service_url: - print("Failed to deploy to Cloud Run or retrieve service URL. Aborting.") - return None - - print(f"Successfully deployed to Cloud Run. 
Service URL: {cloud_run_service_url}") - return cloud_run_service_url - - -# Helper to save config (can be moved to config.py later) -def _save_config(config_data: RewardKitConfig, path: str): - # Basic save, ideally config.py would provide a robust method - try: - with open(path, "w") as f: - yaml.dump(config_data.model_dump(exclude_none=True), f, sort_keys=False) - print(f"Config updated and saved to {path}") - except Exception as e: - print(f"Warning: Failed to save updated config to {path}: {e}") - - -def deploy_command(args): - """Create and deploy an evaluator or register a remote one.""" - - # Check environment variables - if not check_environment(): - return 1 - - if not args.id: # ID is always required - print("Error: Evaluator ID (--id) is required.") - return 1 - - # Process HuggingFace key mapping if provided - huggingface_message_key_map = None - if args.huggingface_key_map: - try: - huggingface_message_key_map = json.loads(args.huggingface_key_map) - except json.JSONDecodeError: - print("Error: Invalid JSON format for --huggingface-key-map") - return 1 - - # Initialize variables for URL registration path - service_url_to_register = None - # api_key_for_shim = None # Not currently used by create_evaluation for shim auth directly - - # PIDs for cleanup if registration fails for local-serve - local_server_pid_to_clean = None - # serveo_pid_to_clean = None # This was old, replaced by local_tunnel_pid_to_clean - local_tunnel_pid_to_clean = None # Initialize here - - if args.target == "gcp-cloud-run": - current_config = get_config() # Needed by the helper - gcp_config_from_yaml = current_config.gcp_cloud_run if current_config.gcp_cloud_run else None - - cloud_run_service_url = _deploy_to_gcp_cloud_run(args, current_config, gcp_config_from_yaml) - if not cloud_run_service_url: - return 1 # Error already printed by helper - service_url_to_register = cloud_run_service_url - - elif args.target == "local-serve": - # Renamed helper and updated return values - url, tunnel_provider, server_pid, tunnel_pid = _establish_local_server_and_tunnel(args) - if not url: - # _establish_local_server_and_tunnel handles cleanup of server if tunnel fails completely - return 1 # Error already printed by helper - service_url_to_register = url - local_server_pid_to_clean = server_pid - # serveo_pid_to_clean was specific, now it's generic tunnel_pid - # Let's rename it for clarity in the cleanup logic - local_tunnel_pid_to_clean = tunnel_pid - print(f"Tunnel established using {tunnel_provider}.") - - elif args.remote_url: - # This is for --target fireworks (default) but with --remote-url - print(f"Registering remote URL: {args.remote_url} for evaluator '{args.id}'") - if not (args.remote_url.startswith("http://") or args.remote_url.startswith("https://")): - print(f"Error: Invalid --remote-url '{args.remote_url}'. Must start with http:// or https://") - return 1 - if args.metrics_folders: # This check might be redundant if --target is explicit - print("Info: --metrics-folders are ignored when deploying with --remote-url.") - service_url_to_register = args.remote_url - # No specific shim auth provided by this path. 
- - # Common registration step for targets that produce a URL - if service_url_to_register: - try: - print(f"Registering URL '{service_url_to_register}' with Fireworks AI for evaluator '{args.id}'...") - evaluator = create_evaluation( - evaluator_id=args.id, - remote_url=service_url_to_register, - display_name=args.display_name or args.id, - description=args.description or f"Evaluator for {args.id} at {service_url_to_register}", - force=args.force, - huggingface_dataset=args.huggingface_dataset, - huggingface_split=args.huggingface_split, - huggingface_message_key_map=huggingface_message_key_map, - huggingface_prompt_key=args.huggingface_prompt_key, - huggingface_response_key=args.huggingface_response_key, - # remote_auth_header_name="X-Api-Key" if api_key_for_shim else None, # No API key for shim for now - # remote_auth_header_value=api_key_for_shim # No API key for shim for now - ) - evaluator_name = evaluator.get("name", args.id) - print( - f"Successfully registered evaluator '{evaluator_name}' on Fireworks AI, pointing to '{service_url_to_register}'." - ) - if args.target == "local-serve": - # tunnel_provider is defined in the local-serve block - # We need to ensure it's accessible here or pass it through. - # For now, let's assume tunnel_provider was defined in the calling scope of this block. - # This will require a small adjustment to how tunnel_provider is scoped. - # Let's fetch it from args if we store it there, or pass it. - # Simpler: just make the message generic or re-fetch from the PIDs. - # The variable `tunnel_provider` is set in the `elif args.target == "local-serve":` block. - # It needs to be available here. - # For now, I'll adjust the print statement to be more generic or rely on the PIDs. - # The `tunnel_provider` variable is indeed set in the correct scope. - print( - f"Local server (PID: {local_server_pid_to_clean}) and {tunnel_provider} tunnel (PID: {local_tunnel_pid_to_clean}) are running." - ) - print("They will be stopped automatically when this command exits (e.g., Ctrl+C).") - return 0 - except PlatformAPIError as e: - print(f"Error registering URL with Fireworks AI: {str(e)}") - except Exception as e: - print(f"An unexpected error occurred during Fireworks AI registration: {str(e)}") - finally: - # If registration fails for local-serve, clean up the started processes - if args.target == "local-serve" and ("evaluator" not in locals() or not locals().get("evaluator")): - print("Registration failed or was interrupted for local-serve. Cleaning up local processes...") - if local_tunnel_pid_to_clean: # Use the new generic tunnel PID variable - stop_process(local_tunnel_pid_to_clean) - if local_server_pid_to_clean: - stop_process(local_server_pid_to_clean) - return 1 - - # Fallback to original behavior: Deploying by packaging local metrics_folders (target=fireworks, no remote_url) - # This is when args.target == "fireworks" (default) AND args.remote_url is NOT provided. 
- elif args.target == "fireworks" and not args.remote_url: - if not args.metrics_folders: - print("Error: --metrics-folders are required for 'fireworks' target if --remote-url is not provided.") - return 1 - for folder_spec in args.metrics_folders: - if "=" not in folder_spec: - print(f"Error: Metric folder format should be 'name=path', got '{folder_spec}'") - return 1 - try: - print(f"Packaging and deploying metrics for evaluator '{args.id}' to Fireworks AI...") - evaluator = create_evaluation( - evaluator_id=args.id, - metric_folders=args.metrics_folders, - display_name=args.display_name or args.id, - description=args.description or f"Evaluator: {args.id}", - force=args.force, - huggingface_dataset=args.huggingface_dataset, - huggingface_split=args.huggingface_split, - huggingface_message_key_map=huggingface_message_key_map, - huggingface_prompt_key=args.huggingface_prompt_key, - huggingface_response_key=args.huggingface_response_key, - ) - evaluator_name = evaluator.get("name", args.id) - print(f"Successfully created/updated evaluator: {evaluator_name}") - return 0 - except PlatformAPIError as e: - print(f"Error creating/updating evaluator '{args.id}': {str(e)}") - return 1 - except Exception as e: - print(f"Error creating/updating evaluator '{args.id}': {str(e)}") - return 1 diff --git a/eval_protocol/cli_commands/deploy_mcp.py b/eval_protocol/cli_commands/deploy_mcp.py deleted file mode 100644 index 34cb6a6f..00000000 --- a/eval_protocol/cli_commands/deploy_mcp.py +++ /dev/null @@ -1,290 +0,0 @@ -""" -CLI command for deploying MCP servers to Google Cloud Run. -""" - -import importlib -import os -import sys -import tempfile -from pathlib import Path -from typing import Dict, Optional - -from eval_protocol.config import ( - GCPCloudRunConfig, - RewardKitConfig, - _config_file_path as global_loaded_config_path, - get_config, -) -from eval_protocol.gcp_tools import ( - build_and_push_docker_image, - deploy_to_cloud_run, - ensure_artifact_registry_repo_exists, -) - -from .common import check_environment - - -def _generate_mcp_dockerfile_content( - mcp_server_module: str, - python_version: str = "3.11", - service_port: int = 8000, - additional_requirements: Optional[str] = None, -) -> str: - """ - Generate Dockerfile content for MCP server deployment. 
- - Args: - mcp_server_module: The Python module containing the MCP server (e.g., 'frozen_lake_mcp_server') - python_version: Python version to use in the container - service_port: Port the MCP server will listen on - additional_requirements: Additional pip requirements - - Returns: - Dockerfile content as string - """ - - # Base requirements for MCP servers - matching setup.py dependencies - base_requirements = [ - "fastmcp>=0.1.0", - # Core Eval Protocol dependencies from setup.py - "requests>=2.25.0", - "pydantic>=2.0.0", - "dataclasses-json>=0.5.7", - "fastapi>=0.68.0", - "uvicorn>=0.15.0", - "python-dotenv>=0.19.0", - "openai==1.78.1", - "aiosqlite", - "aiohttp", - "mcp>=1.9.2", - "PyYAML>=5.0", - "datasets==3.6.0", - "fsspec==2025.3.0", - "hydra-core>=1.3.2", - "omegaconf>=2.3.0", - "gymnasium>=0.29.0", - "httpx>=0.24.0", - "fireworks-ai>=0.17.19", - ] - - if additional_requirements: - requirements = base_requirements + [req.strip() for req in additional_requirements.split("\n") if req.strip()] - else: - requirements = base_requirements - - # Generate pip install lines with proper escaping - pip_install_lines = [] - for req in requirements[:-1]: - pip_install_lines.append(f" {req} \\") - pip_install_lines.append(f" {requirements[-1]}") - pip_install_section = "\n".join(pip_install_lines) - - dockerfile_content = f"""# Multi-stage build for MCP server deployment -FROM python:{python_version}-slim as builder - -WORKDIR /app - -# Install system dependencies -RUN apt-get update && apt-get install -y \\ - build-essential \\ - && rm -rf /var/lib/apt/lists/* - -# Copy requirements and install Python dependencies -RUN pip install --no-cache-dir --upgrade pip - -# Install MCP server requirements -RUN pip install --no-cache-dir \\ -{pip_install_section} - -# Production stage -FROM python:{python_version}-slim - -WORKDIR /app - -# Install runtime dependencies -RUN apt-get update && apt-get install -y \\ - && rm -rf /var/lib/apt/lists/* - -# Copy Python packages from builder -COPY --from=builder /usr/local/lib/python{python_version}/site-packages /usr/local/lib/python{python_version}/site-packages -COPY --from=builder /usr/local/bin /usr/local/bin - -# Copy the MCP server code -COPY . . 
- -# Set environment variables for Cloud Run -# FastMCP uses HOST and PORT environment variables for streamable-http transport -ENV HOST=0.0.0.0 -ENV PORT={service_port} -ENV PYTHONPATH=/app -ENV PYTHONUNBUFFERED=1 - -# Expose the port -EXPOSE {service_port} - -# Run the MCP server with proper host and port for Cloud Run -CMD ["python", "-m", "{mcp_server_module}", "--host", "0.0.0.0", "--port", "{service_port}"] -""" - - return dockerfile_content - - -def _deploy_mcp_to_gcp_cloud_run(args, current_config, gcp_config_from_yaml): - """Deploy MCP server to GCP Cloud Run.""" - print(f"Starting MCP server deployment to GCP Cloud Run for '{args.id}'...") - - # Validate required arguments - either dockerfile or mcp-server-module must be provided - if not args.dockerfile and not args.mcp_server_module: - print("Error: Either --dockerfile or --mcp-server-module is required for MCP server deployment.") - return None - - # Resolve GCP configuration - gcp_project_id = args.gcp_project - if not gcp_project_id and gcp_config_from_yaml: - gcp_project_id = gcp_config_from_yaml.project_id - if not gcp_project_id: - print("Error: GCP Project ID must be provided via --gcp-project or rewardkit.yaml.") - return None - - gcp_region = args.gcp_region - if not gcp_region and gcp_config_from_yaml: - gcp_region = gcp_config_from_yaml.region - if not gcp_region: - print("Error: GCP Region must be provided via --gcp-region or rewardkit.yaml.") - return None - - gcp_ar_repo_name = args.gcp_ar_repo - if not gcp_ar_repo_name and gcp_config_from_yaml: - gcp_ar_repo_name = gcp_config_from_yaml.artifact_registry_repository - if not gcp_ar_repo_name: - gcp_ar_repo_name = "eval-protocol-mcp-servers" - - print(f"Using GCP Project: {gcp_project_id}, Region: {gcp_region}, AR Repo: {gcp_ar_repo_name}") - - # Ensure Artifact Registry repository exists - if not ensure_artifact_registry_repo_exists( - project_id=gcp_project_id, region=gcp_region, repo_name=gcp_ar_repo_name - ): - print(f"Failed to ensure Artifact Registry repository '{gcp_ar_repo_name}' exists. Aborting.") - return None - - # Determine Dockerfile content - use provided file or generate - dockerfile_content = None - if hasattr(args, "dockerfile") and args.dockerfile: - # Use provided Dockerfile - dockerfile_path = Path(args.dockerfile) - if not dockerfile_path.exists(): - print(f"Error: Dockerfile not found at {dockerfile_path}") - return None - print(f"Using provided Dockerfile: {dockerfile_path}") - try: - with open(dockerfile_path, "r") as f: - dockerfile_content = f.read() - except Exception as e: - print(f"Error reading Dockerfile at {dockerfile_path}: {e}") - return None - else: - # Generate Dockerfile content (legacy approach) - print("Generating Dockerfile content from mcp-server-module...") - dockerfile_content = _generate_mcp_dockerfile_content( - mcp_server_module=args.mcp_server_module, - python_version=getattr(args, "python_version", "3.11"), - service_port=getattr(args, "port", 8000), - additional_requirements=getattr(args, "requirements", None), - ) - - if not dockerfile_content: - print("Failed to obtain Dockerfile content. 
Aborting.") - return None - - # Build and push Docker image - image_tag = "latest" - image_name_tag = f"{gcp_region}-docker.pkg.dev/{gcp_project_id}/{gcp_ar_repo_name}/{args.id}:{image_tag}" - build_context_dir = os.getcwd() - - if not build_and_push_docker_image( - image_name_tag=image_name_tag, - dockerfile_content=dockerfile_content, - build_context_dir=build_context_dir, - gcp_project_id=gcp_project_id, - ): - print(f"Failed to build and push Docker image {image_name_tag}. Aborting.") - return None - - print(f"Successfully built and pushed Docker image: {image_name_tag}") - - # Deploy to Cloud Run - service_port = getattr(args, "port", 8000) - env_vars = {} - - # Add any custom environment variables - if hasattr(args, "env_vars") and args.env_vars: - for env_pair in args.env_vars: - if "=" in env_pair: - key, value = env_pair.split("=", 1) - env_vars[key] = value - - cloud_run_service_url = deploy_to_cloud_run( - service_name=args.id, - image_name_tag=image_name_tag, - gcp_project_id=gcp_project_id, - gcp_region=gcp_region, - allow_unauthenticated=True, # MCP servers typically need to be publicly accessible - env_vars=env_vars if env_vars else None, - service_port=service_port, - ) - - if not cloud_run_service_url: - print("Failed to deploy to Cloud Run or retrieve service URL. Aborting.") - return None - - print("🚀 Successfully deployed MCP server to Cloud Run!") - print(f"📍 Service URL: {cloud_run_service_url}") - print(f"🔗 MCP Connection URL: {cloud_run_service_url}") - print(f"📋 Service Name: {args.id}") - deployment_method = ( - "local Dockerfile" if (hasattr(args, "dockerfile") and args.dockerfile) else "auto-generated Dockerfile" - ) - print(f"🐳 Deployment Method: {deployment_method}") - print() - print("🎯 Next steps:") - print(f" 1. Test your MCP server: curl {cloud_run_service_url}/health") - print(f" 2. Connect MCP clients to: {cloud_run_service_url}") - print( - f" 3. Monitor logs: gcloud logging read 'resource.type=cloud_run_revision AND resource.labels.service_name={args.id}' --project {gcp_project_id}" - ) - - return cloud_run_service_url - - -def deploy_mcp_command(args): - """Main entry point for MCP server deployment command.""" - - # Check environment (similar to existing deploy command) - if not check_environment(): - print("Environment check failed. Please resolve the issues above before deploying.") - return False - - try: - # Load configuration - current_config = get_config() - gcp_config_from_yaml: Optional[GCPCloudRunConfig] = None - if current_config and current_config.gcp_cloud_run: - gcp_config_from_yaml = current_config.gcp_cloud_run - - # Deploy to GCP Cloud Run - service_url = _deploy_mcp_to_gcp_cloud_run(args, current_config, gcp_config_from_yaml) - - if service_url: - print(f"✅ MCP server '{args.id}' successfully deployed!") - return True - else: - print(f"❌ Failed to deploy MCP server '{args.id}'") - return False - - except Exception as e: - print(f"Error during MCP server deployment: {e}") - import traceback - - traceback.print_exc() - return False diff --git a/eval_protocol/cli_commands/preview.py b/eval_protocol/cli_commands/preview.py deleted file mode 100644 index ef438496..00000000 --- a/eval_protocol/cli_commands/preview.py +++ /dev/null @@ -1,186 +0,0 @@ -""" -CLI command for previewing an evaluator. 
-""" - -import json -import sys # For sys.exit -from pathlib import Path -from typing import Any, Dict, Iterator, List, Optional, Union - -import requests # For making HTTP requests - -from eval_protocol.evaluation import preview_evaluation - -# Assuming EvaluationRequest is defined in generic_server. -# For loose coupling, it might be better in models.py or a shared types module. -from eval_protocol.generic_server import EvaluationRequest -from eval_protocol.models import EvaluateResult, Message - -# Assuming these helper functions exist or will be created in .common -# If not, their logic for loading samples would need to be integrated here or called differently. -from .common import ( - check_environment, - load_samples_from_file, - load_samples_from_huggingface, -) - - -def preview_command(args): - """Preview an evaluator with sample data""" - - # Check environment variables - if not check_environment(): - return 1 - - # Validate --remote-url and --metrics-folders usage - if args.remote_url and args.metrics_folders: - print("Info: --metrics-folders are ignored when --remote-url is specified.") - - if not args.remote_url and not args.metrics_folders: - print("Error: Either --remote-url or --metrics-folders must be specified.") - return 1 - - # Ensure either samples or huggingface_dataset is provided (still needed for remote_url) - if not args.samples and not args.huggingface_dataset: - print("Error: Either sample file (--samples) or HuggingFace dataset (--huggingface-dataset) is required.") - return 1 - - # If using samples file, verify it exists - if args.samples and not Path(args.samples).exists(): - print(f"Error: Sample file '{args.samples}' not found") - return 1 - - # Process HuggingFace key mapping if provided - huggingface_message_key_map = None - if args.huggingface_key_map: - try: - huggingface_message_key_map = json.loads(args.huggingface_key_map) - except json.JSONDecodeError: - print("Error: Invalid JSON format for --huggingface-key-map") - return 1 - - if args.remote_url: - # Handle previewing against a remote URL - print(f"Previewing against remote URL: {args.remote_url}") - - # Ensure the remote URL is a valid base URL - if not (args.remote_url.startswith("http://") or args.remote_url.startswith("https://")): - print(f"Error: Invalid --remote-url '{args.remote_url}'. Must start with http:// or https://") - return 1 - - evaluate_endpoint = f"{args.remote_url.rstrip('/')}/evaluate" - - samples_iterator: Union[List[Any], Iterator[Dict[str, Any]]] = [] - try: - if args.samples: - # Assuming load_samples_from_file yields dicts with "messages" and optional "ground_truth" - samples_iterator = load_samples_from_file(args.samples, args.max_samples) - elif args.huggingface_dataset: - # Assuming load_samples_from_huggingface yields dicts with "messages" and optional "ground_truth" - samples_iterator = load_samples_from_huggingface( - dataset_name=args.huggingface_dataset, - split=args.huggingface_split, - prompt_key=args.huggingface_prompt_key, - response_key=args.huggingface_response_key, - key_map=huggingface_message_key_map, - max_samples=args.max_samples, - ) - except Exception as e: - print(f"Error loading samples: {e}") - return 1 - - results_count = 0 - for i, sample_data in enumerate(samples_iterator): - # The load_samples_from_* helpers should ideally cap at max_samples, - # but we double-check here. 
- if i >= args.max_samples: - break - results_count += 1 - - messages_payload = sample_data.get("messages", []) - ground_truth_payload = sample_data.get("ground_truth") - # Allow passing other sample fields as kwargs to the reward function - sample_kwargs = {k: v for k, v in sample_data.items() if k not in ["messages", "ground_truth"]} - - processed_messages = [] - for msg_item in messages_payload: - if isinstance(msg_item, Message): # If helpers return Message objects - processed_messages.append(msg_item.model_dump(exclude_none=True)) - elif isinstance(msg_item, dict): # If helpers return dicts - processed_messages.append(msg_item) - else: - print( - f"Warning: Sample {i + 1} has unexpected message item type: {type(msg_item)}. Skipping this message item." - ) - - try: - request_obj = EvaluationRequest( - messages=processed_messages, - ground_truth=ground_truth_payload, - kwargs=sample_kwargs, - ) - except Exception as e: # Pydantic validation for EvaluationRequest - print(f"\n--- Sample {i + 1} ---") - print(f" Error creating request payload for sample: {e}") - print(f" Sample data: {sample_data}") - print("--- End Sample ---") - continue # Skip to next sample - - print(f"\n--- Sample {i + 1} ---") - - try: - response = requests.post( - evaluate_endpoint, - json=request_obj.model_dump(), # Use model_dump() for Pydantic v2, or .dict() for v1 - timeout=30, - ) - response.raise_for_status() - - result_json = response.json() - evaluate_result = EvaluateResult(**result_json) - - print(f" Score: {evaluate_result.score}") - print(f" Reason: {evaluate_result.reason if evaluate_result.reason else 'N/A'}") - print(f" Is Valid: {evaluate_result.is_score_valid}") - if evaluate_result.metrics: - for k, v_metric in evaluate_result.metrics.items(): - print( - f" Metric '{k}': Score={v_metric.score}, Valid={v_metric.is_score_valid}, Reason={v_metric.reason}" - ) - - except requests.exceptions.RequestException as e: - print(f" Error calling remote URL '{evaluate_endpoint}': {e}") - except json.JSONDecodeError: - print( - f" Error: Invalid JSON response from remote URL. Status: {response.status_code}. Response text: {response.text[:200]}..." - ) - except Exception as e: - print(f" Error processing response from remote URL: {e}") - print("--- End Sample ---") - - if results_count == 0: - print("No samples were processed. Check sample source or loading functions.") - return 0 - - else: - # Original behavior: preview using local metrics_folders - # This path is taken if args.remote_url is None (or empty string) - # We already checked above that if not remote_url, then metrics_folders must be present. 
- - try: - preview_result = preview_evaluation( - metric_folders=args.metrics_folders, - sample_file=args.samples if args.samples else None, - max_samples=args.max_samples, - huggingface_dataset=args.huggingface_dataset, - huggingface_split=args.huggingface_split, - huggingface_prompt_key=args.huggingface_prompt_key, - huggingface_response_key=args.huggingface_response_key, - huggingface_message_key_map=huggingface_message_key_map, - ) - - preview_result.display() - return 0 - except Exception as e: - print(f"Error previewing evaluator (local mode): {str(e)}") - return 1 diff --git a/eval_protocol/cli_commands/upload.py b/eval_protocol/cli_commands/upload.py index c978d48c..9e079c3e 100644 --- a/eval_protocol/cli_commands/upload.py +++ b/eval_protocol/cli_commands/upload.py @@ -136,8 +136,6 @@ def _resolve_entry_to_qual_and_source(entry: str, cwd: str) -> tuple[str, str]: def _load_secrets_from_env_file(env_file_path: str) -> Dict[str, str]: """ Load secrets from a .env file that should be uploaded to Fireworks. - - Returns a dictionary of secret key-value pairs that contain 'API_KEY' in the name. """ if not os.path.exists(env_file_path): return {} @@ -152,14 +150,7 @@ def _load_secrets_from_env_file(env_file_path: str) -> Dict[str, str]: key = key.strip() value = value.strip().strip('"').strip("'") # Remove quotes env_vars[key] = value - - # Filter for secrets that look like API keys - secrets = {} - for key, value in env_vars.items(): - if "API_KEY" in key.upper() and value: - secrets[key] = value - - return secrets + return env_vars def _mask_secret_value(value: str) -> str: @@ -193,13 +184,6 @@ def upload_command(args: argparse.Namespace) -> int: selected_tests = _discover_and_select_tests(root, non_interactive=non_interactive) if not selected_tests: return 1 - # Warn about parameterized tests - parameterized_tests = [t for t in selected_tests if t.has_parametrize] - if parameterized_tests: - print("\nNote: Parameterized tests will be uploaded as a single evaluator that") - print(" handles all parameter combinations. 
The evaluator will work with") - print(" the same logic regardless of which model/parameters are used.") - selected_specs = [(t.qualname, t.file_path) for t in selected_tests] base_id = getattr(args, "id", None) @@ -284,11 +268,8 @@ def upload_command(args: argparse.Namespace) -> int: print(f"\nUploading evaluator '{evaluator_id}' for {qualname.split('.')[-1]}...") try: - test_dir = root - metric_name = os.path.basename(test_dir) or "metric" result = create_evaluation( evaluator_id=evaluator_id, - metric_folders=[f"{metric_name}={test_dir}"], display_name=display_name or evaluator_id, description=description or f"Evaluator for {qualname}", force=force, diff --git a/eval_protocol/evaluation.py b/eval_protocol/evaluation.py index 6123f15d..9c84d34e 100644 --- a/eval_protocol/evaluation.py +++ b/eval_protocol/evaluation.py @@ -1,575 +1,36 @@ -import ast # Added for AST parsing -import importlib.util # Added for dynamic module loading -import json import logging import os -import sys # Added for path manipulation import time -import types -from pathlib import Path -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union, cast - -if TYPE_CHECKING: - # For type checking only - import datasets +from typing import List, Optional +import fireworks import requests +from fireworks import Fireworks from eval_protocol.auth import ( get_fireworks_account_id, get_fireworks_api_key, verify_api_key_and_get_account_id, ) -from eval_protocol.common_utils import get_user_agent -from eval_protocol.typed_interface import EvaluationMode - from eval_protocol.get_pep440_version import get_pep440_version logger = logging.getLogger(__name__) -# Flag to track if the preview API was successfully used -used_preview_api = False - - -def huggingface_dataset_to_jsonl( - dataset_name: str, - split: str = "train", - output_file: Optional[str] = None, - max_samples: int = 100, - message_key_map: Optional[Dict[str, str]] = None, - response_key: str = "response", - prompt_key: str = "prompt", -) -> str: - """ - Converts a HuggingFace dataset to JSONL format suitable for Eval Protocol evaluation. - - Args: - dataset_name: The name of the HuggingFace dataset (e.g., "deepseek-ai/DeepSeek-ProverBench") - split: The dataset split to use (default: "train") - output_file: Optional file path to save the JSONL output (if None, generates a temp file) - max_samples: Maximum number of samples to include - message_key_map: Optional mapping of dataset keys to Eval Protocol message keys - response_key: Key in the dataset containing the response text (default: "response") - prompt_key: Key in the dataset containing the prompt text (default: "prompt") - - Returns: - Path to the generated JSONL file - """ - try: - from datasets import load_dataset # pyright: ignore[reportAttributeAccessIssue] - except ImportError: - raise ImportError( - "The 'datasets' package is required to use this function. 
" - "Please install it with 'pip install \"eval-protocol[deepseek]\"'" - ) - - import tempfile - - logger.info(f"Loading dataset {dataset_name} (split: {split})") - dataset = load_dataset(dataset_name, split=split) - - if not output_file: - temp_dir = tempfile.gettempdir() - dataset_basename = dataset_name.split("/")[-1] - output_file = os.path.join(temp_dir, f"{dataset_basename}_{split}_{int(time.time())}.jsonl") - - os.makedirs(os.path.dirname(os.path.abspath(output_file)), exist_ok=True) - - if message_key_map is None: - message_key_map = {} - - processed_samples = 0 - # Initialize i to handle empty dataset case for logging - i = -1 - with open(output_file, "w") as f: - for i, item in enumerate(dataset): - if processed_samples >= max_samples: - break - - if prompt_key not in item and "statement" not in item: - logger.debug(f"Skipping sample {i} due to missing prompt/statement key.") - continue - - prompt_text = item.get(prompt_key, item.get("statement", "")) - response_text = item.get( - response_key, - item.get("reference_solution", item.get("expected_proof", "")), - ) - - if not prompt_text or not response_text: - logger.debug(f"Skipping sample {i} due to missing prompt or response text.") - continue - - messages = [ - {"role": "user", "content": prompt_text}, - {"role": "assistant", "content": response_text}, - ] - entry = {"messages": messages} - - for ds_key, rk_key in message_key_map.items(): - if ds_key in item: - entry[rk_key] = item[ds_key] - - for key, value in item.items(): - if key not in [prompt_key, response_key] and key not in message_key_map: - entry[key] = value - - f.write(json.dumps(entry) + "\n") - processed_samples += 1 - - if processed_samples == 0 and i == -1: - logger.info(f"No samples converted to JSONL format: {output_file}") - else: - logger.info(f"Converted {processed_samples} samples to JSONL format: {output_file}") - return output_file - - -class EvaluatorPreviewResult: - def __init__(self): - self.results = [] - self.total_samples = 0 - self.total_runtime_ms = 0 - - def add_result(self, sample_index, success, score, per_metric_evals): - result_obj = types.SimpleNamespace( - index=sample_index, - success=success, - score=score, - per_metric_evals=per_metric_evals, - ) - self.results.append(result_obj) - - def display(self): - print("Evaluation Preview Results") - print("------------------------") - print(f"Total Samples: {self.total_samples}") - print(f"Total Runtime: {self.total_runtime_ms} ms\n") - print("Individual Results:") - print("------------------") - for i, result_obj in enumerate(self.results): - print(f"Sample {result_obj.index + 1}:") - print(f" Success: {result_obj.success}") - print(f" Score: {result_obj.score}") - if hasattr(result_obj, "per_metric_evals") and isinstance(result_obj.per_metric_evals, dict): - for metric, value in result_obj.per_metric_evals.items(): - print(f" {metric}: {value}") - elif hasattr(result_obj, "per_metric_evals"): - print(f" Per-Metric Evals: {result_obj.per_metric_evals}") - if i < len(self.results) - 1: - print() - class Evaluator: def __init__( self, - multi_metrics=False, # Relates to output structure (dict of metrics vs single) - remote_url: Optional[str] = None, - ts_mode_config: Optional[Dict[str, Any]] = None, - reward_function_mode: EvaluationMode = "pointwise", # New parameter for input processing mode account_id: Optional[str] = None, api_key: Optional[str] = None, entry_point: Optional[str] = None, ): - self.multi_metrics = multi_metrics - self.remote_url = remote_url - self.ts_mode_config = 
ts_mode_config - self.reward_function_mode = reward_function_mode - self.code_files = {} - self.metric_folders: Dict[str, Dict[str, Any]] = {} # Changed to store path and requirements self.account_id = account_id self.api_key = api_key self.description = "" self.display_name = "" self.api_base = os.environ.get("FIREWORKS_API_BASE", "https://api.fireworks.ai") - # Optional requirements string for multi-metric mode (when loaded differently) - self._loaded_multi_metric_requirements_str: Optional[str] = None - # Optional entry point metadata (module::function or path::function) self.entry_point: Optional[str] = entry_point - if self.ts_mode_config: - python_code = self.ts_mode_config.get("python_code") - file_name = self.ts_mode_config.get("file_name", "main.py") - if not python_code: - raise ValueError("python_code is required in ts_mode_config") - self.code_files[file_name] = python_code - # ts_mode implies multiMetrics: true for the payload structure - # but it's distinct from folder-based multi_metrics for loading. - # The original self.multi_metrics flag is for folder loading. - # The payload's multiMetrics field will be set to True if ts_mode_config is active. - # The check for (metric_folders or folder) is not applicable in __init__ and was causing an error. - # If ts_mode_config is active, it takes precedence for code definition. - # The multi_metrics flag passed to __init__ is for folder-based loading if ts_mode_config is not used. - - def _should_include_file(self, filename: str) -> bool: - """Check if a file should be included in the evaluator upload.""" - return ( - filename.endswith(".py") - or filename.endswith(".txt") - or filename.endswith(".toml") - or os.path.basename(filename) == "Dockerfile" - ) - - def _load_python_files_from_folder(self, folder_path: str) -> Dict[str, str]: - """ - Recursively loads Python, text, and TOML files from a given folder (excluding common ignored dirs). - - Args: - folder_path: Absolute path to the folder. - - Returns: - A dictionary mapping relative file paths (within folder) to their content. - - Raises: - ValueError: If folder_path is invalid or not a directory. 
- """ - if not os.path.exists(folder_path): - raise ValueError(f"Folder does not exist: {folder_path}") - - if not os.path.isdir(folder_path): - raise ValueError(f"Not a directory: {folder_path}") - - files: Dict[str, str] = {} - ignored_dirs = {".git", "__pycache__", "node_modules", "venv", ".venv", "dist", "build", "vendor"} - base_path = Path(folder_path) - for dirpath, dirnames, filenames in os.walk(folder_path): - # prune ignored directories - dirnames[:] = [d for d in dirnames if d not in ignored_dirs and not d.startswith(".")] - for name in filenames: - if not self._should_include_file(name): - continue - abs_path = Path(dirpath) / name - rel_path = str(abs_path.relative_to(base_path)) - with open(abs_path, "r", encoding="utf-8") as f: - content = f.read() - files[rel_path] = content - if not files: - raise ValueError(f"No Python, text, or TOML files found in {folder_path}") - return files - - def load_metric_folder(self, metric_name, folder_path): - """ - Load code files from a metric folder - - Args: - metric_name: Name of the metric - folder_path: Path to the folder containing code files - - Returns: - Dict mapping filenames to their contents - """ - folder_path = os.path.abspath(folder_path) - files = self._load_python_files_from_folder(folder_path) # Reads all .py files into a dict - metric_requirements_list: Optional[List[str]] = None - - main_py_content = files.get("main.py") - if main_py_content: - try: - tree = ast.parse(main_py_content) - for node in ast.walk(tree): - if isinstance(node, ast.FunctionDef) and node.name == "evaluate": - for decorator_node in node.decorator_list: - if ( - isinstance(decorator_node, ast.Call) - and isinstance(decorator_node.func, ast.Name) - and decorator_node.func.id == "reward_function" - ): - for keyword in decorator_node.keywords: - if keyword.arg == "requirements": - if isinstance(keyword.value, ast.List): - reqs: List[str] = [] - for elt in keyword.value.elts: - if isinstance(elt, ast.Constant): # Python 3.8+ - if isinstance(elt.value, str): - reqs.append(cast(str, elt.value)) - elif isinstance(elt, ast.Str): # Python < 3.8 - reqs.append(cast(str, elt.s)) - if reqs: - metric_requirements_list = cast(List[str], reqs) - elif isinstance(keyword.value, ast.Constant) and isinstance( - keyword.value.value, str - ): # Python 3.8+ (single req string) - metric_requirements_list = [cast(str, keyword.value.value)] - elif isinstance(keyword.value, ast.Str): # Python < 3.8 (single req string) - metric_requirements_list = [cast(str, keyword.value.s)] - break - if metric_requirements_list: - break - if metric_requirements_list: - logger.info( - f"Found requirements for metric '{metric_name}' via AST: {metric_requirements_list}" - ) - break - except SyntaxError as e: - logger.error(f"Syntax error parsing main.py for metric '{metric_name}' to find requirements: {e}") - except Exception as e: - logger.error(f"Error parsing main.py AST for metric '{metric_name}': {e}") - - self.metric_folders[metric_name] = { - "path": folder_path, - "requirements": metric_requirements_list, # This is now a list of strings or None - } - - for filename, content in files.items(): - self.code_files[f"{metric_name}/{filename}"] = content - - logger.info(f"Loaded {len(files)} files for metric '{metric_name}' from {folder_path}") - return files - - def load_multi_metrics_folder(self, folder_path): - """ - Load code files from a folder with multiple metrics - - Args: - folder_path: Path to the folder containing code files - - Returns: - Dict mapping filenames to their contents - 
""" - folder_path = os.path.abspath(folder_path) - files = self._load_python_files_from_folder(folder_path) - - self.code_files = files - logger.info(f"Loaded {len(files)} files from {folder_path} for multi-metrics evaluation") - return files - - def load_samples_from_jsonl(self, sample_file, max_samples=5): - if not os.path.exists(sample_file): - raise ValueError(f"Sample file does not exist: {sample_file}") - samples = [] - with open(sample_file, "r") as f: - for i, line in enumerate(f): - if i >= max_samples: - break - line = line.strip() - if not line: - continue - try: - sample = json.loads(line) - samples.append(sample) - except json.JSONDecodeError: - logger.warning(f"Invalid JSON on line {i + 1}, skipping") - logger.info(f"Loaded {len(samples)} samples from {sample_file}") - return samples - - def preview(self, sample_file, max_samples=5): - if not self.remote_url and not self.ts_mode_config and not self.code_files: - raise ValueError("No code files loaded. Load metric folder(s) or provide ts_mode_config/remote_url first.") - - # If not remote and not ts_mode, then main.py check applies to loaded code_files - if not self.remote_url and not self.ts_mode_config: - if "main.py" not in self.code_files and not any(k.endswith("/main.py") for k in self.code_files): - raise ValueError("No main.py found in loaded code files for folder-based evaluation.") - - samples = self.load_samples_from_jsonl(sample_file, max_samples) - if not samples: - raise ValueError(f"No valid samples found in {sample_file}") - - auth_token = self.api_key or get_fireworks_api_key() - account_id = self.account_id or get_fireworks_account_id() - if not account_id and auth_token: - account_id = verify_api_key_and_get_account_id(api_key=auth_token, api_base=self.api_base) - logger.debug(f"Preview using account_id: {account_id}") - - if not account_id or not auth_token: - logger.error("Authentication error: Missing Fireworks Account ID or API Key.") - raise ValueError("Missing Fireworks Account ID or API Key.") - - # Keep multiMetrics/rollupSettings for backward compatibility with tests - payload_multi_metrics = True - payload_rollup_settings = {"skipRollup": True} - - # For preview, evaluator_id might not be as critical for shim's env var name, - # but pass it for consistency. Use display_name as a proxy if no specific ID. 
- preview_evaluator_id_for_shim = self.display_name or "preview_evaluator" - evaluator_payload_data = { - "displayName": self.display_name or "Preview Evaluator", - "description": self.description or "Preview Evaluator", - "multiMetrics": payload_multi_metrics, - "criteria": self._construct_criteria(criteria_data={}), - "requirements": self._get_combined_requirements(), - "rollupSettings": payload_rollup_settings, - } - - sample_strings = [json.dumps(sample) for sample in samples] - payload = { - "evaluator": evaluator_payload_data, - "sampleData": sample_strings, - "maxSamples": max_samples, - } - - api_base = os.environ.get("FIREWORKS_API_BASE", "https://api.fireworks.ai") - - if "dev.api.fireworks.ai" in api_base and account_id == "fireworks": - account_id = "pyroworks-dev" - - url = f"{api_base}/v1/accounts/{account_id}/evaluators:previewEvaluator" - headers = { - "Authorization": f"Bearer {auth_token}", - "Content-Type": "application/json", - "User-Agent": get_user_agent(), - } - logger.info(f"Previewing evaluator using API endpoint: {url} with account: {account_id}") - logger.debug(f"Preview API Request URL: {url}") - logger.debug(f"Preview API Request Headers: {json.dumps(headers, indent=2)}") - logger.debug(f"Preview API Request Payload: {json.dumps(payload, indent=2)}") - - global used_preview_api - try: - response = requests.post(url, json=payload, headers=headers) - response.raise_for_status() - result = response.json() - used_preview_api = True - preview_result_obj = EvaluatorPreviewResult() - preview_result_obj.total_samples = result.get("totalSamples", len(samples)) - preview_result_obj.total_runtime_ms = int(result.get("totalRuntimeMs", 0)) - sample_results = result.get("results", []) - for i, sample_result_item in enumerate(sample_results): - preview_result_obj.add_result( - sample_index=i, - success=sample_result_item.get("success", False), - score=sample_result_item.get("score", 0.0), - per_metric_evals=sample_result_item.get("perMetricEvals", {}), - ) - return preview_result_obj - except Exception as e: - logger.error(f"Error previewing evaluator: {str(e)}") - if isinstance(e, requests.exceptions.HTTPError) and hasattr(e, "response"): - logger.error(f"Response: {e.response.text}") - used_preview_api = False - logger.warning("Falling back to simulated preview mode") - return self._simulated_preview(samples) - - def _get_combined_requirements(self) -> str: - """Combines requirements from all loaded metrics.""" - all_requirements_set = set() - for metric_data in self.metric_folders.values(): - req_list_or_str = metric_data.get("requirements") - if req_list_or_str: - if isinstance(req_list_or_str, list): - for req_item in req_list_or_str: - if isinstance(req_item, str): - all_requirements_set.add(req_item.strip()) - elif isinstance(req_list_or_str, str): # Fallback if somehow a string is still passed - items = [r.strip() for r in req_list_or_str.splitlines() if r.strip()] - for item in items: - all_requirements_set.add(item) - - # For multi_metrics loaded directly into self.code_files (not via metric_folders) - # This part is more complex as it requires loading the 'main.py' from self.code_files - # if self.multi_metrics and not self.metric_folders and "main.py" in self.code_files: - # We would need a temporary way to load this main.py to get its requirements. - # For now, focusing on metric_folders which is the primary path for --metrics-folders. 
- # If a multi_metrics folder is loaded via load_multi_metrics_folder, it also needs a similar - # dynamic import logic to fetch requirements from its main 'evaluate' function. - # This part is NOT YET IMPLEMENTED for multi_metrics folders. - - if not all_requirements_set and hasattr(self, "_loaded_multi_metric_requirements_str"): - # Fallback for multi_metrics if requirements were loaded differently (hypothetical) - # This attribute doesn't exist yet, placeholder for future enhancement if needed. - if self._loaded_multi_metric_requirements_str: # type: ignore - requirements_list = [ - r.strip() for r in self._loaded_multi_metric_requirements_str.splitlines() if r.strip() - ] # type: ignore - for req_item in requirements_list: - all_requirements_set.add(req_item) - - logger.info(f"Combined unique requirements: {all_requirements_set}") - return "\n".join(sorted(list(all_requirements_set))) - - def _simulated_preview(self, samples): - preview_result = EvaluatorPreviewResult() - preview_result.total_samples = len(samples) - start_time = time.time() - for i, sample in enumerate(samples): - try: - if "messages" not in sample: - raise ValueError(f"Sample {i + 1} is missing 'messages' field") - _ = sample.get("messages", []) - _ = sample.get("ground_truth", []) - _ = sample.get("tools", []) - _ = { - k: v - for k, v in sample.items() - if k - not in [ - "messages", - "ground_truth", - "tools", - ] - } - - if self.multi_metrics or self.ts_mode_config: # ts_mode also implies a single set of results - per_metric_evals = {"quality": 0.8, "relevance": 0.7, "safety": 0.9} - else: - per_metric_evals = {metric_name: 0.75 for metric_name in self.metric_folders} - - score = sum(per_metric_evals.values()) / len(per_metric_evals) if per_metric_evals else 0.0 - preview_result.add_result( - sample_index=i, - success=True, - score=score, - per_metric_evals=per_metric_evals, - ) - except Exception as e: - logger.error(f"Error processing sample {i + 1}: {str(e)}") - preview_result.add_result( - sample_index=i, - success=False, - score=0.0, - per_metric_evals={"error": str(e)}, - ) - end_time = time.time() - preview_result.total_runtime_ms = max(1, int((end_time - start_time) * 1000)) - return preview_result - - def _build_minimal_criteria(self) -> List[Dict[str, str]]: - """Build minimal criteria (name, type, description) without code snippets.""" - - # Remote URL mode - if self.remote_url: - return [ - { - "name": "remote_eval_proxy", - "type": "CODE_SNIPPETS", - "description": f"Proxies evaluation to remote URL: {self.remote_url}", - } - ] - - # TS mode (direct code snippet) - elif self.ts_mode_config: - criterion_name = self.ts_mode_config.get("criterion_name", "default_code_criterion") - description = self.ts_mode_config.get("description", "Python code execution") - return [ - { - "name": criterion_name, - "type": "CODE_SNIPPETS", - "description": description, - } - ] - - # Multi-metrics mode - elif self.multi_metrics: - return [ - { - "name": "eval", - "type": "CODE_SNIPPETS", - "description": self.description or "Multi-metric evaluation", - } - ] - - # Single metric folders - else: - criteria = [] - for metric_name in self.metric_folders: - criteria.append( - { - "name": metric_name, - "type": "CODE_SNIPPETS", - "description": self.description or f"Evaluation metric: {metric_name}", - } - ) - return criteria - @staticmethod def _parse_ignore_file(ignore_path: str) -> List[str]: """Parse .gitignore or .dockerignore and return patterns.""" @@ -694,9 +155,6 @@ def _create_tar_gz_with_ignores(output_path: str, 
source_dir: str) -> int: return size_bytes def create(self, evaluator_id, display_name=None, description=None, force=False): - if not self.remote_url and not self.ts_mode_config and not self.code_files: - raise ValueError("No code files loaded. Load metric folder(s) or provide ts_mode_config/remote_url first.") - auth_token = self.api_key or get_fireworks_api_key() account_id = self.account_id or get_fireworks_account_id() if not account_id and auth_token: @@ -706,94 +164,65 @@ def create(self, evaluator_id, display_name=None, description=None, force=False) logger.error("Authentication error: API credentials appear to be invalid or incomplete.") raise ValueError("Invalid or missing API credentials.") + client = Fireworks(api_key=auth_token, base_url=self.api_base, account_id=account_id) + self.display_name = display_name or evaluator_id self.description = description or f"Evaluator created from {evaluator_id}" - # Keep multiMetrics/rollupSettings for backward compatibility with tests - payload_multi_metrics = True - payload_rollup_settings = {"skipRollup": True} - parent = f"accounts/{account_id}" - try: version_str = get_pep440_version() except Exception: version_str = None - payload_data = { - "parent": parent, - "evaluator": { - "displayName": self.display_name, - "description": self.description, - "multiMetrics": payload_multi_metrics, - "commitHash": version_str, - "criteria": self._build_minimal_criteria(), - "requirements": "", - "rollupSettings": payload_rollup_settings, - }, - "evaluatorId": evaluator_id, - } + # Build evaluator params for SDK + from fireworks.types import evaluator_create_params - # Include optional entry point when provided + evaluator_params: evaluator_create_params.Evaluator = { + "display_name": self.display_name, + "description": self.description, + } + if version_str: + evaluator_params["commit_hash"] = version_str if self.entry_point: - payload_data["evaluator"]["entryPoint"] = self.entry_point + evaluator_params["entry_point"] = self.entry_point logger.info(f"Including entryPoint in payload: {self.entry_point}") - # Debug log the create payload structure (without sample data) + # Debug log the create payload structure try: - logger.info(f"Create API Request Payload: {json.dumps(payload_data, indent=2)}") + logger.info(f"Create API Request: evaluator_id={evaluator_id}, evaluator={evaluator_params}") except Exception: - # If serialization fails for any reason, skip debug dump pass - if "dev.api.fireworks.ai" in self.api_base and account_id == "fireworks": - account_id = "pyroworks-dev" - - base_url = f"{self.api_base}/v1/{parent}/evaluatorsV2" - headers = { - "Authorization": f"Bearer {auth_token}", - "Content-Type": "application/json", - "User-Agent": get_user_agent(), - } - self._ensure_requirements_present(os.getcwd()) logger.info(f"Creating evaluator '{evaluator_id}' for account '{account_id}'...") try: if force: - check_url = f"{self.api_base}/v1/{parent}/evaluators/{evaluator_id}" try: - logger.info(f"Checking if evaluator exists: {check_url}") - check_response = requests.get(check_url, headers=headers) - - if check_response.status_code == 200: + logger.info("Checking if evaluator exists") + existing_evaluator = client.evaluators.get(evaluator_id=evaluator_id) + if existing_evaluator: logger.info(f"Evaluator '{evaluator_id}' already exists, deleting and recreating...") - delete_url = f"{self.api_base}/v1/{parent}/evaluators/{evaluator_id}" try: - delete_response = requests.delete(delete_url, headers=headers) - if delete_response.status_code < 400: - 
logger.info(f"Successfully deleted evaluator '{evaluator_id}'") - else: - logger.warning( - f"Unable to delete evaluator '{evaluator_id}', status: {delete_response.status_code}" - ) - except Exception as e_del: - logger.warning(f"Error deleting evaluator: {str(e_del)}") - response = requests.post(base_url, json=payload_data, headers=headers) - else: - response = requests.post(base_url, json=payload_data, headers=headers) - except requests.exceptions.RequestException: - response = requests.post(base_url, json=payload_data, headers=headers) - else: - logger.info(f"Creating evaluator at: {base_url}") - response = requests.post(base_url, json=payload_data, headers=headers) - - response.raise_for_status() - result = response.json() + client.evaluators.delete(evaluator_id=evaluator_id) + logger.info(f"Successfully deleted evaluator '{evaluator_id}'") + except fireworks.NotFoundError: + logger.info(f"Evaluator '{evaluator_id}' not found, creating...") + except fireworks.APIError as e: + logger.warning(f"Error deleting evaluator: {str(e)}") + except fireworks.NotFoundError: + logger.info(f"Evaluator '{evaluator_id}' does not exist, creating...") + + # Create evaluator using SDK + result = client.evaluators.create( + evaluator_id=evaluator_id, + evaluator=evaluator_params, + ) logger.info(f"Successfully created evaluator '{evaluator_id}'") # Upload code as tar.gz to GCS - evaluator_name = result.get("name") # e.g., "accounts/pyroworks/evaluators/test-123" + evaluator_name = result.name # e.g., "accounts/pyroworks/evaluators/test-123" if not evaluator_name: raise ValueError( @@ -810,20 +239,18 @@ def create(self, evaluator_id, display_name=None, description=None, force=False) tar_size = self._create_tar_gz_with_ignores(tar_path, cwd) - # Call GetEvaluatorUploadEndpoint - upload_endpoint_url = f"{self.api_base}/v1/{evaluator_name}:getUploadEndpoint" - upload_payload = {"name": evaluator_name, "filename_to_size": {tar_filename: tar_size}} - + # Call GetEvaluatorUploadEndpoint using SDK logger.info(f"Requesting upload endpoint for {tar_filename}") - upload_response = requests.post(upload_endpoint_url, json=upload_payload, headers=headers) - upload_response.raise_for_status() + upload_response = client.evaluators.get_upload_endpoint( + evaluator_id=evaluator_id, + filename_to_size={tar_filename: str(tar_size)}, + ) # Check for signed URLs - upload_response_data = upload_response.json() - signed_urls = upload_response_data.get("filenameToSignedUrls", {}) + signed_urls = upload_response.filename_to_signed_urls or {} if not signed_urls: - raise ValueError(f"GetUploadEndpoint returned no signed URLs. Response: {upload_response_data}") + raise ValueError(f"GetUploadEndpoint returned no signed URLs. 
Response: {upload_response}") signed_url = signed_urls.get(tar_filename) @@ -894,14 +321,11 @@ def create(self, evaluator_id, display_name=None, description=None, force=False) logger.error(f"Upload failed after {max_retries} attempts") raise - # Step 3: Validate upload - validate_url = f"{self.api_base}/v1/{evaluator_name}:validateUpload" - validate_payload = {"name": evaluator_name} - validate_response = requests.post(validate_url, json=validate_payload, headers=headers) - validate_response.raise_for_status() - - validate_data = validate_response.json() - + # Step 3: Validate upload using SDK + client.evaluators.validate_upload( + evaluator_id=evaluator_id, + body={}, + ) logger.info("Upload validated successfully") # Clean up tar file @@ -913,275 +337,14 @@ def create(self, evaluator_id, display_name=None, description=None, force=False) # Don't fail - evaluator is created, just code upload failed return result # Return after attempting upload + except fireworks.APIStatusError as e: + logger.error(f"Error creating evaluator: {str(e)}") + logger.error(f"Status code: {e.status_code}, Response: {e.response.text}") + raise except Exception as e: logger.error(f"Error creating evaluator: {str(e)}") - if isinstance(e, requests.exceptions.HTTPError) and hasattr(e, "response"): - logger.error(f"Response: {e.response.text}") raise - def _construct_criteria(self, criteria_data: Any) -> Any: - assertions = [] - if self.remote_url: - shim_main_py_content = f""" -import json -import os -import requests - -REMOTE_EVALUATOR_URL = "{self.remote_url}" - -def evaluate(messages, ground_truth: Optional[Union[str, List[Dict[str, Any]]]] = None, tools=None, **kwargs): - payload = {{ - "messages": messages, - "ground_truth": ground_truth, - "tools": tools, - "kwargs": kwargs - }} - headers = {{"Content-Type": "application/json"}} - try: - response = requests.post(REMOTE_EVALUATOR_URL, json=payload, headers=headers, timeout=30) - response.raise_for_status() - return response.json() - except requests.exceptions.RequestException as e: - error_info = {{ - "error": f"Failed to call remote evaluator at {{REMOTE_EVALUATOR_URL}}: {{str(e)}}", - "status_code": getattr(e.response, 'status_code', None), - "response_text": getattr(e.response, 'text', None) - }} - return {{ - "score": 0.0, "reason": f"Error calling remote evaluator: {{str(e)}}", - "is_score_valid": False, "metrics": {{"remote_call_error": {{"score": 0.0, "is_score_valid": False, "reason": json.dumps(error_info)}}}} - }} - except Exception as e: - return {{ - "score": 0.0, "reason": f"Unexpected error in remote evaluator shim: {{str(e)}}", - "is_score_valid": False, "metrics": {{"shim_error": {{"score": 0.0, "is_score_valid": False, "reason": str(e)}}}} - }} -""" - file_contents = {"main.py": shim_main_py_content} - assertions.append( - { - "codeSnippets": { - "language": "python", - "fileContents": file_contents, - }, - "name": "remote_eval_proxy", - "type": "CODE_SNIPPETS", - "description": f"Proxies evaluation to remote URL: {self.remote_url}", - } - ) - elif self.ts_mode_config: - python_code = self.ts_mode_config.get("python_code") - file_name = self.ts_mode_config.get("file_name", "main.py") - criterion_name = self.ts_mode_config.get("criterion_name", "default_code_criterion") - description = self.ts_mode_config.get("description", "Python code execution") - if not python_code: - raise ValueError("python_code is required in ts_mode_config") - entry_func = "evaluate" - try: - if self.entry_point and "::" in self.entry_point: - entry_func = 
self.entry_point.split("::", 1)[1] - except Exception: - entry_func = "evaluate" - assertions.append( - { - "type": "CODE_SNIPPETS", - "name": criterion_name, - "description": description, - "codeSnippets": { - "language": "python", - "fileContents": {file_name: python_code}, - "entryFile": file_name, - "entryFunc": entry_func, - }, - } - ) - elif self.multi_metrics: - file_contents = {} - for filename, content in self.code_files.items(): - if filename.endswith(".py"): - file_contents[filename] = self._update_evaluate_signature(content) - elif self._should_include_file(filename) and not filename.endswith(".py"): - file_contents[filename] = content - if not file_contents: - raise ValueError("No files found for multi-metrics mode.") - # Determine entry file from entry_point if provided; otherwise detect - entry_file = None - if self.entry_point and "::" in self.entry_point: - try: - ep_file = self.entry_point.split("::", 1)[0] - if ep_file in file_contents: - entry_file = ep_file - else: - ep_base = os.path.basename(ep_file) - for fname in file_contents.keys(): - if os.path.basename(fname) == ep_base: - entry_file = fname - break - except Exception: - entry_file = None - if not entry_file: - try: - for fname, content in file_contents.items(): - for line in content.split("\n"): - s = line.lstrip() - if s.startswith("def evaluate(") or s.startswith("async def evaluate("): - entry_file = fname - break - if entry_file: - break - except Exception: - entry_file = None - if not entry_file: - entry_file = "main.py" if "main.py" in file_contents else list(file_contents.keys())[0] - entry_func = "evaluate" - try: - if self.entry_point and "::" in self.entry_point: - entry_func = self.entry_point.split("::", 1)[1] - except Exception: - entry_func = "evaluate" - assertions.append( - { - "codeSnippets": { - "language": "python", - "fileContents": file_contents, - "entryFile": entry_file, - "entryFunc": entry_func, - }, - "name": "eval", - "type": "CODE_SNIPPETS", - "description": self.description or "Multi-metric evaluation", - } - ) - else: # Folder-based, non-multi_metrics - for metric_name in self.metric_folders: - file_contents = {} - # Include all discovered files for this metric folder, preserving filenames - for filename, content in self.code_files.items(): - if filename.startswith(f"{metric_name}/"): - # Use the file name within the metric folder for clarity - short_name = filename.split(f"{metric_name}/", 1)[1] - if filename.endswith(".py"): - file_contents[short_name] = self._update_evaluate_signature(content) - elif self._should_include_file(filename) and not filename.endswith(".py"): - file_contents[short_name] = content - if not file_contents: - logger.warning(f"No files prepared for metric '{metric_name}', skipping this metric for criteria.") - continue - # Determine entry file within this metric's files using entry_point if present - entry_file = None - if self.entry_point and "::" in self.entry_point: - try: - ep_file = self.entry_point.split("::", 1)[0] - if ep_file in file_contents: - entry_file = ep_file - else: - ep_base = os.path.basename(ep_file) - for fname in file_contents.keys(): - if os.path.basename(fname) == ep_base: - entry_file = fname - break - except Exception: - entry_file = None - if not entry_file: - try: - for fname, content in file_contents.items(): - for line in content.split("\n"): - s = line.lstrip() - if s.startswith("def evaluate(") or s.startswith("async def evaluate("): - entry_file = fname - break - if entry_file: - break - except Exception: - entry_file = 
None - if not entry_file: - entry_file = "main.py" if "main.py" in file_contents else list(file_contents.keys())[0] - - entry_func = "evaluate" - try: - if self.entry_point and "::" in self.entry_point: - entry_func = self.entry_point.split("::", 1)[1] - except Exception: - entry_func = "evaluate" - assertions.append( - { - "codeSnippets": { - "language": "python", - "fileContents": file_contents, - "entryFile": entry_file, - "entryFunc": entry_func, - }, - "name": metric_name, - "type": "CODE_SNIPPETS", - "description": f"Metric: {metric_name}", - } - ) - - if not assertions: - raise ValueError("No valid criteria could be constructed.") - return assertions - - def _update_evaluate_signature(self, content): - import re - - # Simple regex to match the old evaluate function signature - old_pattern = r"def\s+evaluate\s*\(\s*entry\s*(?::\s*dict)?\s*\)" - # Regex to match the signature we are changing from (original_messages) - current_signature_pattern = ( - r"def\s+evaluate\s*\(\s*messages,\s*original_messages\s*=\s*None,\s*tools\s*=\s*None,\s*\*\*kwargs\s*\)" - ) - new_signature = "def evaluate(messages, ground_truth: Optional[Union[str, List[Dict[str, Any]]]] = None, tools=None, **kwargs)" - - # Check if the old pattern (entry-based) exists - if re.search(old_pattern, content): - updated_content = re.sub(old_pattern, new_signature, content, count=1) - - # Add a compatibility layer for the 'entry' style - compat_layer = """ - # Compatibility layer for old 'entry' format - if ground_truth is None: # Default ground_truth from messages if not provided - ground_truth = messages - # Assuming 'entry' dict was constructed from messages, original_messages (now ground_truth), tools, kwargs - # This part might need more context on how 'entry' was used. - # For now, we'll assume ground_truth takes precedence or is derived. -""" - # Check if the current signature (with original_messages) exists - elif re.search(current_signature_pattern, content): - updated_content = re.sub(current_signature_pattern, new_signature, content, count=1) - # No specific compatibility layer needed here as it's a direct parameter rename - compat_layer = "" # No additional layer for this direct change - else: - # If neither known signature is found, return content as is - return content - - # Find the function body indent level if a change was made - if "updated_content" in locals() and compat_layer: # Only add layer if it's defined - func_match = re.search(r"def\s+evaluate.*?:\s*\n(\s+)", updated_content, re.DOTALL) - if func_match: - indent = func_match.group(1) - # Adjust indentation of compatibility layer - indented_compat_layer = "\n".join(indent + line for line in compat_layer.strip().split("\n")) - - # Insert compatibility layer after function definition - updated_content = re.sub( - re.escape(new_signature) + r"\s*:", - new_signature + ":" + indented_compat_layer, - updated_content, - count=1, - ) - return updated_content - elif "updated_content" in locals(): - return updated_content - return content - - def _get_combined_code(self): # This method seems unused now, consider removal - # ... (implementation unchanged, but likely dead code) - pass - - def _get_code_from_files(self, files): # This method seems unused now, consider removal - # ... 
(implementation unchanged, but likely dead code) - pass - def _get_authentication(self): account_id = get_fireworks_account_id() auth_token = get_fireworks_api_key() @@ -1195,277 +358,31 @@ def _get_authentication(self): # Helper functions for CLI commands -def preview_evaluation( - metric_folders: Optional[List[str]] = None, - multi_metrics: bool = False, - folder: Optional[str] = None, - python_code_to_evaluate: Optional[str] = None, - python_file_name_for_code: str = "main.py", - criterion_name_for_code: str = "default_code_criterion", - criterion_description_for_code: str = "Python code execution", - sample_file: Optional[str] = None, - max_samples: int = 5, - huggingface_dataset: Optional[str] = None, - huggingface_split: str = "train", - huggingface_message_key_map: Optional[Dict[str, str]] = None, - huggingface_response_key: str = "response", - huggingface_prompt_key: str = "prompt", - reward_function_mode: EvaluationMode = "pointwise", # Added for consistency - account_id: Optional[str] = None, - api_key: Optional[str] = None, -): - ts_mode_config = None - if python_code_to_evaluate: - if metric_folders or folder: # Removed multi_metrics from this check as it's handled by Evaluator init - raise ValueError( - "Cannot use python_code_to_evaluate with folder-based parameters (metric_folders, folder)." - ) - ts_mode_config = { - "python_code": python_code_to_evaluate, - "file_name": python_file_name_for_code, - "criterion_name": criterion_name_for_code, - "description": criterion_description_for_code, - } - # When python_code_to_evaluate is used, multi_metrics in Evaluator constructor is effectively True - # due to how ts_mode_config is handled (sets self.multi_metrics = True for payload). - # The multi_metrics flag passed to Evaluator here should be the original one for folder logic. - evaluator = Evaluator( - multi_metrics=multi_metrics, - ts_mode_config=ts_mode_config, - reward_function_mode=reward_function_mode, - account_id=account_id, - api_key=api_key, - ) - else: - evaluator = Evaluator( - multi_metrics=multi_metrics, - reward_function_mode=reward_function_mode, - account_id=account_id, - api_key=api_key, - ) # Pass mode to Evaluator - if multi_metrics: - if not folder: - raise ValueError("`folder` must be specified for multi_metrics mode.") - evaluator.load_multi_metrics_folder(folder) - else: - if not metric_folders: - raise ValueError("At least one metric_folder must be specified.") - for pair in metric_folders: - if "=" not in pair: - raise ValueError(f"Invalid metric-folder format: {pair}.") - metric_name, folder_path = pair.split("=", 1) - evaluator.load_metric_folder(metric_name, folder_path) - - if huggingface_dataset: - if sample_file: - logger.warning("Both sample_file and huggingface_dataset specified. 
Using HuggingFace dataset.") - sample_file = huggingface_dataset_to_jsonl( - dataset_name=huggingface_dataset, - split=huggingface_split, - max_samples=max_samples, - message_key_map=huggingface_message_key_map, - response_key=huggingface_response_key, - prompt_key=huggingface_prompt_key, - ) - logger.info(f"Converted dataset saved to: {sample_file}") - - if not sample_file: - raise ValueError("Either sample_file or huggingface_dataset must be specified.") - return evaluator.preview(sample_file, max_samples) - - -def preview_folder_evaluation( # This function might become redundant or need to align with the new preview_evaluation - evaluator_folder, - sample_file=None, - max_samples=5, - multi_metrics=False, # original multi_metrics - huggingface_dataset=None, - huggingface_split="train", - huggingface_message_key_map=None, - huggingface_response_key="response", - huggingface_prompt_key="prompt", -): - evaluator_folder = os.path.abspath(evaluator_folder) - if not os.path.exists(evaluator_folder): - raise ValueError(f"Evaluator folder does not exist: {evaluator_folder}") - if not os.path.isdir(evaluator_folder): - raise ValueError(f"Not a directory: {evaluator_folder}") - - has_main_py = os.path.exists(os.path.join(evaluator_folder, "main.py")) - # Auto-detect multi_metrics if not specified by caller - detected_multi_metrics = multi_metrics - if has_main_py and not multi_metrics: - py_files = list(Path(evaluator_folder).glob("*.py")) - if len(py_files) > 1: - logger.info("Auto-detecting multi-metrics mode based on folder structure for preview_folder_evaluation") - detected_multi_metrics = True - - # Call the unified preview_evaluation - # This function doesn't directly support ts_mode_config, so python_code_to_evaluate is None - return preview_evaluation( - metric_folders=( - None if detected_multi_metrics else [f"{os.path.basename(evaluator_folder)}={evaluator_folder}"] - ), # Simplified for now - multi_metrics=detected_multi_metrics, - folder=evaluator_folder if detected_multi_metrics else None, - python_code_to_evaluate=None, # Not applicable for this helper - sample_file=sample_file, - max_samples=max_samples, - huggingface_dataset=huggingface_dataset, - huggingface_split=huggingface_split, - huggingface_message_key_map=huggingface_message_key_map, - huggingface_response_key=huggingface_response_key, - huggingface_prompt_key=huggingface_prompt_key, - ) - - def create_evaluation( evaluator_id: str, - metric_folders: Optional[List[str]] = None, - multi_metrics: bool = False, # Original folder-based multi_metrics flag - folder: Optional[str] = None, - python_code_to_evaluate: Optional[str] = None, - python_file_name_for_code: str = "main.py", - criterion_name_for_code: str = "default_code_criterion", - criterion_description_for_code: str = "Python code execution", display_name: Optional[str] = None, description: Optional[str] = None, force: bool = False, - huggingface_dataset: Optional[str] = None, - huggingface_split: str = "train", - huggingface_message_key_map: Optional[Dict[str, str]] = None, - huggingface_response_key: str = "response", - huggingface_prompt_key: str = "prompt", - remote_url: Optional[str] = None, - reward_function_mode: EvaluationMode = "pointwise", # Added account_id: Optional[str] = None, api_key: Optional[str] = None, entry_point: Optional[str] = None, ): - ts_mode_config = None - if python_code_to_evaluate: - if metric_folders or folder: # Removed multi_metrics from this check - raise ValueError("Cannot use python_code_to_evaluate with folder-based parameters.") - 
ts_mode_config = { - "python_code": python_code_to_evaluate, - "file_name": python_file_name_for_code, - "criterion_name": criterion_name_for_code, - "description": criterion_description_for_code, - } + """ + Create an evaluator on the Fireworks platform. + Args: + evaluator_id: Unique identifier for the evaluator + display_name: Display name for the evaluator + description: Description for the evaluator + force: If True, delete and recreate if evaluator exists + account_id: Optional Fireworks account ID + api_key: Optional Fireworks API key + entry_point: Optional entry point (module::function or path::function) + """ evaluator = Evaluator( - multi_metrics=multi_metrics, - remote_url=remote_url, - ts_mode_config=ts_mode_config, - reward_function_mode=reward_function_mode, account_id=account_id, api_key=api_key, entry_point=entry_point, ) - if remote_url: - logger.info(f"Configuring evaluator to use remote URL: {remote_url}") - if ( - metric_folders or folder or python_code_to_evaluate - ): # If remote_url, other code sources are ignored for execution - logger.warning( - "When remote_url is provided, other code sources (folders, python_code_to_evaluate) are ignored for execution logic by the platform." - ) - elif ts_mode_config: - # ts_mode_config already handled in Evaluator.__init__ for self.code_files - logger.info("Configuring evaluator with direct Python code snippet (ts_mode).") - elif multi_metrics: # Folder-based multi_metrics - if not folder: - raise ValueError("`folder` must be specified for folder-based multi_metrics mode.") - evaluator.load_multi_metrics_folder(folder) - else: # Folder-based single/multiple metrics (non-multi_metrics structure) - if not metric_folders: - raise ValueError("At least one metric_folder must be specified.") - for pair in metric_folders: - if "=" not in pair: - raise ValueError(f"Invalid metric-folder format: {pair}.") - metric_name, folder_path = pair.split("=", 1) - evaluator.load_metric_folder(metric_name, folder_path) - - if huggingface_dataset: - logger.info(f"HuggingFace dataset specified: {huggingface_dataset} (currently for preview only).") - return evaluator.create(evaluator_id, display_name, description, force) - - -def deploy_folder_evaluation( # This function might become redundant or need to align with the new create_evaluation - evaluator_id, - evaluator_folder, - display_name=None, - description=None, - force=False, - multi_metrics=False, # original multi_metrics - huggingface_dataset=None, - huggingface_split="train", - huggingface_message_key_map=None, - huggingface_response_key="response", - huggingface_prompt_key="prompt", - remote_url: Optional[str] = None, -): - evaluator_folder_abs = os.path.abspath(evaluator_folder) if evaluator_folder else None - - # If remote_url is provided, evaluator_folder is less relevant for code loading - # but might still be used for context/metadata if the function design implies it. - # For now, if remote_url, we don't load from folder. 
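# --- Editor's note: illustrative usage sketch, not part of the diff above. ---
# It shows how the simplified create_evaluation() helper introduced in this
# change is expected to be called. The evaluator id, names, and entry point
# below are hypothetical placeholders; credentials may also be resolved from
# the environment (FIREWORKS_API_KEY / account lookup) inside Evaluator, as
# the changed code does.
from eval_protocol.evaluation import create_evaluation

result = create_evaluation(
    evaluator_id="my-eval",  # hypothetical evaluator id
    display_name="My Eval",
    description="Evaluator uploaded from the current working directory",
    force=True,  # delete and recreate if the evaluator already exists
    entry_point="tests/test_my_eval.py::test_my_eval",  # hypothetical path::function ref
)
# Per the new Evaluator.create(): the evaluator is registered via the Fireworks
# SDK, the working directory is packaged as a tar.gz (honoring ignore files),
# uploaded to the signed URL from get_upload_endpoint, and then validated.
print(result.name)  # e.g. "accounts/<account>/evaluators/my-eval"
# --- End editor's note. ---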
- - python_code_to_evaluate = None # This helper doesn't take direct code string - - if not remote_url and not evaluator_folder_abs: - raise ValueError("evaluator_folder must be specified if not using remote_url.") - - if evaluator_folder_abs: - if not os.path.exists(evaluator_folder_abs): - raise ValueError(f"Evaluator folder does not exist: {evaluator_folder_abs}") - if not os.path.isdir(evaluator_folder_abs): - raise ValueError(f"Not a directory: {evaluator_folder_abs}") - - # Auto-detect multi_metrics if not specified and not remote_url and folder is given - detected_multi_metrics = multi_metrics - folder_for_loading = None - metric_folders_for_loading = None - - if not remote_url and evaluator_folder_abs: - has_main_py = os.path.exists(os.path.join(evaluator_folder_abs, "main.py")) - if has_main_py and not multi_metrics: # If user says not multi_metrics, but main.py is at root - py_files = list(Path(evaluator_folder_abs).glob("*.py")) - if len(py_files) > 1: # Heuristic: if multiple .py files at root with main.py, likely multi-metric - logger.info("Auto-detecting multi-metrics mode for deploy_folder_evaluation.") - detected_multi_metrics = True - - if detected_multi_metrics: - folder_for_loading = evaluator_folder_abs - else: # Prepare metric_folders list - metric_folders_for_loading = [] - if has_main_py: # Single metric in the root folder - metric_folders_for_loading.append(f"{os.path.basename(evaluator_folder_abs)}={evaluator_folder_abs}") - else: # Look for subdirectories - for item in os.listdir(evaluator_folder_abs): - item_path = os.path.join(evaluator_folder_abs, item) - if os.path.isdir(item_path) and os.path.exists(os.path.join(item_path, "main.py")): - metric_folders_for_loading.append(f"{item}={item_path}") - if not metric_folders_for_loading: - raise ValueError( - f"No valid metrics found in {evaluator_folder_abs} for non-multi-metric deployment." 
- ) - - return create_evaluation( - evaluator_id=evaluator_id, - metric_folders=metric_folders_for_loading, - multi_metrics=detected_multi_metrics, # Use the detected or passed-in multi_metrics - folder=folder_for_loading, - python_code_to_evaluate=python_code_to_evaluate, # None for this helper - display_name=display_name, - description=description, - force=force, - huggingface_dataset=huggingface_dataset, - huggingface_split=huggingface_split, - huggingface_message_key_map=huggingface_message_key_map, - huggingface_response_key=huggingface_response_key, - huggingface_prompt_key=huggingface_prompt_key, - remote_url=remote_url, - ) diff --git a/eval_protocol/platform_api.py b/eval_protocol/platform_api.py index bf608be0..60743ccb 100644 --- a/eval_protocol/platform_api.py +++ b/eval_protocol/platform_api.py @@ -1,9 +1,8 @@ # eval_protocol/platform_api.py import logging import sys -from typing import Any, Dict, Optional +from typing import Optional -import requests from dotenv import find_dotenv, load_dotenv from eval_protocol.auth import ( @@ -11,7 +10,8 @@ get_fireworks_api_base, get_fireworks_api_key, ) -from eval_protocol.common_utils import get_user_agent +from fireworks.types import Secret +from fireworks import Fireworks, FireworksError, NotFoundError, InternalServerError logger = logging.getLogger(__name__) @@ -88,47 +88,31 @@ def create_or_update_fireworks_secret( resolved_api_key = api_key or get_fireworks_api_key() resolved_api_base = api_base or get_fireworks_api_base() resolved_account_id = account_id # Must be provided + client = Fireworks(api_key=resolved_api_key, account_id=resolved_account_id, base_url=resolved_api_base) if not all([resolved_api_key, resolved_api_base, resolved_account_id]): logger.error("Missing Fireworks API key, base URL, or account ID for creating/updating secret.") return False - headers = { - "Authorization": f"Bearer {resolved_api_key}", - "Content-Type": "application/json", - "User-Agent": get_user_agent(), - } - - # The secret_id for GET/PATCH/DELETE operations is the key_name. - # The 'name' field in the gatewaySecret model for POST/PATCH is a bit ambiguous. - # For POST (create), the body is gatewaySecret, which has 'name', 'keyName', 'value'. - # 'name' in POST body is likely just the 'keyName' or 'secret_id' for creation context, - # as the full resource name 'accounts/.../secrets/...' is server-generated. - # Let's assume for POST, we send 'keyName' and 'value'. - # For PATCH, the path contains {secret_id} which is the key_name. The body is also gatewaySecret. - # Check if secret exists using GET (path uses normalized resource id) resource_id = _normalize_secret_resource_id(key_name) secret_exists = False try: - url = f"{resolved_api_base}/v1/accounts/{resolved_account_id}/secrets/{resource_id}" - response = requests.get(url, headers=headers, timeout=10) - if response.status_code == 200: + secret = client.secrets.get(resource_id) + if secret: secret_exists = True logger.info(f"Secret '{key_name}' already exists. Will attempt to update.") - elif response.status_code == 404: - logger.info(f"Secret '{key_name}' does not exist. Will attempt to create.") - secret_exists = False - elif response.status_code == 500: # As per user feedback, 500 on GET might mean not found - logger.warning( - f"Received 500 error when checking for secret '{key_name}'. Assuming it does not exist and will attempt to create. 
Response: {response.text}" - ) - secret_exists = False - else: - logger.error(f"Error checking for secret '{key_name}': {response.status_code} - {response.text}") - return False - except requests.exceptions.RequestException as e: - logger.error(f"Request exception while checking for secret '{key_name}': {e}") + except NotFoundError: + # Secret doesn't exist, proceed with creation + secret_exists = False + except InternalServerError as e: + # As per user feedback, 500 on GET might mean not found, treat as not found + logger.warning( + f"Received 500 error when checking for secret '{key_name}'. Assuming it doesn't exist. Response: {e}" + ) + secret_exists = False + except FireworksError as e: + logger.error(f"Error checking for secret '{key_name}': {e}") return False if secret_exists: @@ -144,31 +128,15 @@ def create_or_update_fireworks_secret( ) payload_key_name = "EP_SECRET" # Fallback, though unlikely needed with .upper() - payload = {"keyName": payload_key_name, "value": secret_value} try: - logger.debug(f"PATCH payload for '{key_name}': {payload}") - url = f"{resolved_api_base}/v1/accounts/{resolved_account_id}/secrets/{resource_id}" - response = requests.patch(url, json=payload, headers=headers, timeout=30) - response.raise_for_status() + logger.debug(f"PATCH payload for '{key_name}': key_name={payload_key_name}") + client.secrets.update(resource_id, key_name=payload_key_name, value=secret_value) logger.info(f"Successfully updated secret '{key_name}' on Fireworks platform.") return True - except requests.exceptions.HTTPError as e: - logger.error(f"HTTP error updating secret '{key_name}': {e.response.status_code} - {e.response.text}") - return False - except requests.exceptions.RequestException as e: - logger.error(f"Request exception updating secret '{key_name}': {e}") + except FireworksError as e: + logger.error(f"Error updating secret '{key_name}': {e}") return False else: - # Create new secret (POST) - # Body for POST is gatewaySecret. 'name' field in payload is the resource path. - # Let's assume for POST, the 'name' in payload can be omitted or is the key_name. - # The API should ideally use 'keyName' from URL or a specific 'secretId' in payload for creation if 'name' is server-assigned. - # Given the Swagger, 'name' is required in gatewaySecret. - # Let's try with 'name' being the 'key_name' for the payload, as the full path is not known yet. - # This might need adjustment based on actual API behavior. - # Construct the full 'name' path for the POST payload as per Swagger's title for 'name' - full_resource_name_for_payload = f"accounts/{resolved_account_id}/secrets/{resource_id}" - # Transform key_name for payload "keyName" field: uppercase and underscores payload_key_name = key_name.upper().replace("-", "_") if not payload_key_name or not payload_key_name[0].isupper(): @@ -177,26 +145,12 @@ def create_or_update_fireworks_secret( ) payload_key_name = "EP_SECRET" - payload = { - "name": full_resource_name_for_payload, # This 'name' is the resource path - "keyName": payload_key_name, # This 'keyName' is the specific field with new rules - "value": secret_value, - } try: - logger.debug(f"POST payload for '{key_name}': {payload}") - url = f"{resolved_api_base}/v1/accounts/{resolved_account_id}/secrets" - response = requests.post(url, json=payload, headers=headers, timeout=30) - response.raise_for_status() - logger.info( - f"Successfully created secret '{key_name}' on Fireworks platform. 
Full name: {response.json().get('name')}" - ) + logger.debug(f"POST payload for '{key_name}': {payload_key_name}") + client.secrets.create(key_name=payload_key_name, value=secret_value, name=resource_id) return True - except requests.exceptions.HTTPError as e: - logger.error(f"HTTP error creating secret '{key_name}': {e.response.status_code} - {e.response.text}") - # If error is due to 'name' field, this log will show it. - return False - except requests.exceptions.RequestException as e: - logger.error(f"Request exception creating secret '{key_name}': {e}") + except FireworksError as e: + logger.error(f"Error creating secret '{key_name}': {e}") return False @@ -205,7 +159,7 @@ def get_fireworks_secret( key_name: str, # This is the identifier for the secret api_key: Optional[str] = None, api_base: Optional[str] = None, -) -> Optional[Dict[str, Any]]: +) -> Optional[Secret]: """ Retrieves a secret from the Fireworks AI platform by its keyName. Note: This typically does not return the secret's actual value for security reasons, @@ -219,26 +173,28 @@ def get_fireworks_secret( logger.error("Missing Fireworks API key, base URL, or account ID for getting secret.") return None - headers = { - "Authorization": f"Bearer {resolved_api_key}", - "User-Agent": get_user_agent(), - } + client = Fireworks(api_key=resolved_api_key, account_id=resolved_account_id, base_url=resolved_api_base) resource_id = _normalize_secret_resource_id(key_name) try: - url = f"{resolved_api_base}/v1/accounts/{resolved_account_id}/secrets/{resource_id}" - response = requests.get(url, headers=headers, timeout=10) - if response.status_code == 200: + secret = client.secrets.get(resource_id) + if secret: logger.info(f"Successfully retrieved secret '{key_name}'.") - return response.json() - elif response.status_code == 404: - logger.info(f"Secret '{key_name}' not found.") - return None + return secret else: - logger.error(f"Error getting secret '{key_name}': {response.status_code} - {response.text}") + logger.warning(f"Secret '{key_name}' lookup succeeded but returned empty/falsy value.") return None - except requests.exceptions.RequestException as e: - logger.error(f"Request exception while getting secret '{key_name}': {e}") + except NotFoundError: + logger.info(f"Secret '{key_name}' not found.") + return None + except InternalServerError as e: + # As per user feedback, 500 on GET might mean not found + logger.warning( + f"Received 500 error when getting secret '{key_name}'. Assuming it doesn't exist. 
Response: {e}" + ) + return None + except FireworksError as e: + logger.error(f"Error getting secret '{key_name}': {e}") return None @@ -259,33 +215,24 @@ def delete_fireworks_secret( logger.error("Missing Fireworks API key, base URL, or account ID for deleting secret.") return False - headers = { - "Authorization": f"Bearer {resolved_api_key}", - "User-Agent": get_user_agent(), - } + client = Fireworks(api_key=resolved_api_key, account_id=resolved_account_id, base_url=resolved_api_base) resource_id = _normalize_secret_resource_id(key_name) try: - url = f"{resolved_api_base}/v1/accounts/{resolved_account_id}/secrets/{resource_id}" - response = requests.delete(url, headers=headers, timeout=30) - if response.status_code == 200 or response.status_code == 204: # 204 No Content is also success for DELETE - logger.info(f"Successfully deleted secret '{key_name}'.") - return True - elif response.status_code == 404: - logger.info(f"Secret '{key_name}' not found, nothing to delete.") - return True - elif ( - response.status_code == 500 - ): # As per user feedback, 500 on GET might mean not found, apply same logic for DELETE - logger.warning( - f"Received 500 error when deleting secret '{key_name}'. Assuming it might not have existed. Response: {response.text}" - ) - return True # Consider deletion successful if it results in non-existence - else: - logger.error(f"Error deleting secret '{key_name}': {response.status_code} - {response.text}") - return False - except requests.exceptions.RequestException as e: - logger.error(f"Request exception while deleting secret '{key_name}': {e}") + client.secrets.delete(resource_id, account_id=resolved_account_id) + logger.info(f"Successfully deleted secret '{key_name}'.") + return True + except NotFoundError: + logger.info(f"Secret '{key_name}' not found, nothing to delete.") + return True + except InternalServerError as e: + # As per user feedback, 500 on GET might mean not found, apply same logic for DELETE + logger.warning( + f"Received 500 error when deleting secret '{key_name}'. Assuming it might not have existed. Response: {e}" + ) + return True # Consider deletion successful if it results in non-existence + except FireworksError as e: + logger.error(f"Error deleting secret '{key_name}': {e}") return False @@ -319,8 +266,6 @@ def delete_fireworks_secret( logger.error( "CRITICAL: FIREWORKS_API_KEY and FIREWORKS_API_BASE must be correctly set in environment or .env file to run this test." ) - import sys # Make sure sys is imported if using sys.exit - sys.exit(1) test_secret_key_name = "rewardkit-test-secret-delete-me" # Changed to be valid @@ -331,7 +276,7 @@ def delete_fireworks_secret( # 1. Ensure it doesn't exist initially (or delete if it does from a previous failed run) logger.info(f"\n[Test Step 0] Attempting to delete '{test_secret_key_name}' if it exists (cleanup)...") - delete_fireworks_secret(test_account_id, test_secret_key_name) + delete_fireworks_secret(account_id=test_account_id, key_name=test_secret_key_name) retrieved = get_fireworks_secret(test_account_id, test_secret_key_name) if retrieved is None: logger.info(f"Confirmed secret '{test_secret_key_name}' does not exist before creation test.") @@ -341,7 +286,7 @@ def delete_fireworks_secret( # 2. 
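The refactored secret helpers above keep the same call signatures while routing all HTTP traffic through the Fireworks SDK client, so existing callers are unchanged. A minimal round trip might look like the sketch below; it is illustrative only, credentials are assumed to be resolved from the environment or a .env file as in the module, and the account ID and key name are placeholders.

from eval_protocol.platform_api import (
    create_or_update_fireworks_secret,
    delete_fireworks_secret,
    get_fireworks_secret,
)

# Placeholder identifiers; substitute a real Fireworks account ID and key name.
account_id = "my-account"
key_name = "my-endpoint-api-key"  # keyName is uppercased with dashes mapped to underscores

if create_or_update_fireworks_secret(account_id, key_name, "s3cr3t-value"):
    # GET returns Secret metadata; the stored value is normally not echoed back.
    secret = get_fireworks_secret(account_id, key_name)
    if secret is not None:
        print(secret.key_name)
    # Deletion is also treated as success if the secret turns out not to exist.
    delete_fireworks_secret(account_id=account_id, key_name=key_name)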
Create secret logger.info(f"\n[Test Step 1] Creating secret '{test_secret_key_name}' with value '{test_secret_value}'...") - success_create = create_or_update_fireworks_secret(test_account_id, test_secret_key_name, test_secret_value) + success_create: bool = create_or_update_fireworks_secret(test_account_id, test_secret_key_name, test_secret_value) logger.info(f"Create operation success: {success_create}") # 3. Get secret (to verify creation, though value won't be returned) @@ -351,8 +296,11 @@ def delete_fireworks_secret( logger.info(f"Retrieved secret metadata: {retrieved_after_create}") # Assert against the transformed keyName that's expected in the payload/response body expected_payload_key_name = test_secret_key_name.upper().replace("-", "_") - assert retrieved_after_create.get("keyName") == expected_payload_key_name - assert retrieved_after_create.get("value") == test_secret_value # Also check value if returned + assert retrieved_after_create.key_name == expected_payload_key_name + # Note: value is typically not returned in GET responses for security reasons + # The value field will be None or empty string, so we don't assert on it + if retrieved_after_create.value: + logger.info(f"Note: Secret value was returned (unusual): {retrieved_after_create.value[:10]}...") else: logger.error(f"Failed to retrieve secret '{test_secret_key_name}' after creation.") diff --git a/pyproject.toml b/pyproject.toml index 400e8d40..bad2683c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ dependencies = [ "pytest-asyncio>=0.21.0", "peewee>=3.18.2", "backoff>=2.2.0", + "fireworks-ai==1.0.0a20", "questionary>=2.0.0", # Dependencies for vendored tau2 package "toml>=0.10.0", @@ -47,7 +48,6 @@ dependencies = [ "deepdiff>=6.0.0", "websockets>=15.0.1", "fastapi>=0.116.1", - "fireworks-ai==1.0.0a18", ] [project.urls] diff --git a/tests/cli_commands/test_deploy_cmd.py b/tests/cli_commands/test_deploy_cmd.py deleted file mode 100644 index fbd38ae8..00000000 --- a/tests/cli_commands/test_deploy_cmd.py +++ /dev/null @@ -1,507 +0,0 @@ -import json -from unittest.mock import MagicMock, patch - -import pytest - -# Module to be tested -from eval_protocol.cli_commands.deploy import deploy_command -from eval_protocol.platform_api import PlatformAPIError # Import for exception testing - - -# --- Mocking argparse.Namespace to simulate parsed CLI arguments --- -class MockArgs: - def __init__(self, **kwargs): - self.verbose = False - self.id = None - self.metrics_folders = None - self.display_name = None - self.description = None - self.force = False - self.huggingface_dataset = None - self.huggingface_split = "train" - self.huggingface_prompt_key = "prompt" - self.huggingface_response_key = "response" - self.huggingface_key_map = None - self.remote_url = None - # For GCP - self.target = "fireworks" # Default target - self.function_ref = None - self.gcp_project = None - self.gcp_region = None - self.gcp_ar_repo = None - self.service_account = None - self.entry_point = "reward_function" - self.runtime = "python311" - self.gcp_auth_mode = None - self.__dict__.update(kwargs) - - -@pytest.fixture -def mock_check_environment(): - with patch("eval_protocol.cli_commands.deploy.check_environment", return_value=True) as mock_check: - yield mock_check - - -@pytest.fixture -def mock_gcp_tools(): - with ( - patch("eval_protocol.cli_commands.deploy.ensure_artifact_registry_repo_exists") as mock_ensure_repo, - patch("eval_protocol.cli_commands.deploy.generate_dockerfile_content") as mock_gen_dockerfile, - 
patch("eval_protocol.cli_commands.deploy.build_and_push_docker_image") as mock_build_push, - patch("eval_protocol.cli_commands.deploy.deploy_to_cloud_run") as mock_deploy_run, - patch("eval_protocol.cli_commands.deploy.ensure_gcp_secret") as mock_ensure_gcp_secret, - ): - mock_ensure_repo.return_value = True - mock_gen_dockerfile.return_value = "DOCKERFILE CONTENT" - mock_build_push.return_value = True - mock_deploy_run.return_value = "http://mock-cloud-run-url.com/service" - mock_ensure_gcp_secret.return_value = "projects/test-proj/secrets/mocksecret/versions/1" - yield { - "ensure_repo": mock_ensure_repo, - "gen_dockerfile": mock_gen_dockerfile, - "build_push": mock_build_push, - "deploy_run": mock_deploy_run, - "ensure_gcp_secret": mock_ensure_gcp_secret, - } - - -class TestDeployCommandRemoteUrl: - @patch("eval_protocol.cli_commands.deploy.create_evaluation") - def test_deploy_remote_url_success(self, mock_create_evaluation_call, mock_check_environment, capsys): - """Test successful registration of a remote URL via create_evaluation.""" - args = MockArgs( - id="my-remote-eval", - remote_url="http://my-evaluator.com/evaluate", - display_name="My Remote Eval", - description="A cool remote evaluator.", - target="fireworks", # Explicitly set target for this path - ) - mock_create_evaluation_call.return_value = { - "name": args.id, # Simulate platform API returning full name - "id": args.id, # Simulate platform API returning id - } - - return_code = deploy_command(args) - assert return_code == 0 - - mock_create_evaluation_call.assert_called_once_with( - evaluator_id=args.id, - remote_url=args.remote_url, - display_name=args.display_name or args.id, - description=args.description - or f"Evaluator for {args.id} at {args.remote_url}", # Updated description format - force=args.force, - huggingface_dataset=args.huggingface_dataset, - huggingface_split=args.huggingface_split, - huggingface_message_key_map=None, - huggingface_prompt_key=args.huggingface_prompt_key, - huggingface_response_key=args.huggingface_response_key, - ) - - captured = capsys.readouterr() - assert ( - f"Registering remote URL: {args.remote_url} for evaluator '{args.id}'" # Updated initial message - in captured.out - ) - assert ( - f"Successfully registered evaluator '{args.id}' on Fireworks AI, pointing to '{args.remote_url}'." # Updated success message - in captured.out - ) - - @patch("eval_protocol.cli_commands.deploy.create_evaluation") - def test_deploy_remote_url_with_metrics_folders_warning(self, mock_create_eval, mock_check_environment, capsys): - args = MockArgs( - id="my-remote-eval", - remote_url="http://my-evaluator.com/evaluate", - metrics_folders=["mf=path"], - target="fireworks", # Explicitly set target - ) - mock_create_eval.return_value = {"name": args.id} - deploy_command(args) - captured = capsys.readouterr() - assert ( - "Info: --metrics-folders are ignored when deploying with --remote-url." 
# Updated "not packaged" to "ignored" - in captured.out - ) - - def test_deploy_remote_url_invalid_url_format(self, mock_check_environment, capsys): - args = MockArgs(id="my-eval", remote_url="ftp://invalid.com", target="fireworks") - return_code = deploy_command(args) - assert return_code == 1 - captured = capsys.readouterr() - assert "Error: Invalid --remote-url 'ftp://invalid.com'" in captured.out - - @patch("eval_protocol.cli_commands.deploy.create_evaluation") - def test_deploy_remote_url_platform_api_error(self, mock_create_eval, mock_check_environment, capsys): - args = MockArgs( - id="my-remote-eval-fail", - remote_url="http://my-evaluator.com/evaluate", - target="fireworks", - ) - # Simulate the full error string from PlatformAPIError's __str__ - error_message = "Platform connection failed (Status: 500, Response: N/A)" - mock_create_eval.side_effect = PlatformAPIError( - "Platform connection failed", status_code=500, response_text="N/A" - ) - - return_code = deploy_command(args) - assert return_code == 1 - - captured = capsys.readouterr() - # Updated error message to match common registration block - assert f"Error registering URL with Fireworks AI: {error_message}" in captured.out - - @patch("eval_protocol.cli_commands.deploy.create_evaluation") - def test_deploy_remote_url_unexpected_error(self, mock_create_eval, mock_check_environment, capsys): - args = MockArgs( - id="my-remote-eval-generic-fail", - remote_url="http://my-evaluator.com/evaluate", - target="fireworks", - ) - mock_create_eval.side_effect = Exception("Something broke") - - return_code = deploy_command(args) - assert return_code == 1 - - captured = capsys.readouterr() - # Updated error message to match common registration block - assert "An unexpected error occurred during Fireworks AI registration: Something broke" in captured.out - - -class TestDeployCommandLocalMode: # This class tests the "fireworks" target (packaging metrics) - @patch("eval_protocol.cli_commands.deploy.create_evaluation") - def test_deploy_local_mode_success( # Renaming to reflect it tests "fireworks" target - self, mock_create_eval, mock_check_environment, capsys - ): - mock_create_eval.return_value = {"name": "my-fireworks-eval"} # Adjusted for clarity - args = MockArgs( - id="my-fireworks-eval", - metrics_folders=["mf=./path"], - display_name="My Fireworks Eval", - description="A packaged one.", - target="fireworks", # Explicitly "fireworks" target - ) - return_code = deploy_command(args) - assert return_code == 0 - expected_hf_message_key_map = None - mock_create_eval.assert_called_once_with( - evaluator_id=args.id, - metric_folders=args.metrics_folders, - display_name=args.display_name or args.id, - description=args.description or f"Evaluator: {args.id}", - force=args.force, - huggingface_dataset=args.huggingface_dataset, - huggingface_split=args.huggingface_split, - huggingface_message_key_map=expected_hf_message_key_map, - huggingface_prompt_key=args.huggingface_prompt_key, - huggingface_response_key=args.huggingface_response_key, - ) - captured = capsys.readouterr() - assert "Packaging and deploying metrics for evaluator 'my-fireworks-eval' to Fireworks AI..." 
in captured.out - assert "Successfully created/updated evaluator: my-fireworks-eval" in captured.out - - def test_deploy_local_mode_missing_metrics_folders( # Renaming to reflect "fireworks" target - self, mock_check_environment, capsys - ): - args = MockArgs( - id="my-fireworks-eval-fail", target="fireworks", remote_url=None - ) # Explicit target, no remote_url - return_code = deploy_command(args) - assert return_code == 1 - captured = capsys.readouterr() - # Updated error message to be specific to "fireworks" target - assert ( - "Error: --metrics-folders are required for 'fireworks' target if --remote-url is not provided." - in captured.out - ) - - @patch("eval_protocol.cli_commands.deploy.create_evaluation") - def test_deploy_local_mode_create_evaluation_fails( # Renaming - self, mock_create_eval, mock_check_environment, capsys - ): - error_message = "Platform API error (Status: 503, Response: N/A)" - mock_create_eval.side_effect = PlatformAPIError("Platform API error", status_code=503, response_text="N/A") - args = MockArgs(id="my-fireworks-eval", metrics_folders=["mf=./path"], target="fireworks") - return_code = deploy_command(args) - assert return_code == 1 - captured = capsys.readouterr() - assert f"Error creating/updating evaluator 'my-fireworks-eval': {error_message}" in captured.out - - @patch("eval_protocol.cli_commands.deploy.create_evaluation") - def test_deploy_local_mode_create_evaluation_fails_generic_exception( # Renaming - self, mock_create_eval, mock_check_environment, capsys - ): - mock_create_eval.side_effect = Exception("Generic error") - args = MockArgs(id="my-fireworks-eval", metrics_folders=["mf=./path"], target="fireworks") - return_code = deploy_command(args) - assert return_code == 1 - captured = capsys.readouterr() - assert "Error creating/updating evaluator 'my-fireworks-eval': Generic error" in captured.out - - -class TestDeployCommandGCPMode: - @patch("eval_protocol.cli_commands.deploy.create_evaluation") - def test_deploy_gcp_mode_success( - self, - mock_create_evaluation_final_step, - mock_check_environment, - mock_gcp_tools, - capsys, - ): - args = MockArgs( - target="gcp-cloud-run", - id="gcp-eval-test", - function_ref="my_module.my_func", - gcp_project="test-proj", - gcp_region="us-central1", - gcp_ar_repo="test-repo", - runtime="python310", - gcp_auth_mode="api-key", - ) - mock_create_evaluation_final_step.return_value = {"name": args.id} # Simulate platform API returning full name - - return_code = deploy_command(args) - assert return_code == 0 - - mock_gcp_tools["ensure_repo"].assert_called_once() - mock_gcp_tools["gen_dockerfile"].assert_called_once() - mock_gcp_tools["build_push"].assert_called_once() - mock_gcp_tools["ensure_gcp_secret"].assert_called_once() - mock_gcp_tools["deploy_run"].assert_called_once() - mock_create_evaluation_final_step.assert_called_once() - - captured = capsys.readouterr() - # Check initial message from helper - assert f"Starting GCP Cloud Run deployment for evaluator '{args.id}'..." in captured.out - assert "Successfully built and pushed Docker image" in captured.out - assert ( - f"Successfully deployed to Cloud Run. Service URL: {mock_gcp_tools['deploy_run'].return_value}" - in captured.out - ) - # Check common registration success message - assert ( - f"Successfully registered evaluator '{args.id}' on Fireworks AI, pointing to '{mock_gcp_tools['deploy_run'].return_value}'." 
- in captured.out - ) - - @patch("eval_protocol.cli_commands.deploy.get_config") - def test_deploy_gcp_mode_missing_args(self, mock_get_config, mock_check_environment, capsys): - # Mock empty config to test missing project/region scenarios - from eval_protocol.config import RewardKitConfig - - mock_get_config.return_value = RewardKitConfig() - - args = MockArgs(target="gcp-cloud-run", id="gcp-eval-incomplete") - # function_ref is missing, gcp_project, gcp_region also - - # Test missing function_ref - temp_args_dict = args.__dict__.copy() - temp_args_dict.pop("function_ref", None) - current_args = MockArgs(**temp_args_dict) - return_code = deploy_command(current_args) - assert return_code == 1 - captured = capsys.readouterr() - assert "Error: --function-ref is required for GCP Cloud Run deployment." in captured.out - - # Test missing gcp_project - temp_args_dict = args.__dict__.copy() - temp_args_dict["function_ref"] = "a.b" - temp_args_dict.pop("gcp_project", None) - current_args = MockArgs(**temp_args_dict) - return_code = deploy_command(current_args) - assert return_code == 1 - captured = capsys.readouterr() - assert "Error: GCP Project ID must be provided" in captured.out - - # Test missing gcp_region - temp_args_dict["gcp_project"] = "proj" - temp_args_dict.pop("gcp_region", None) - current_args = MockArgs(**temp_args_dict) - return_code = deploy_command(current_args) - assert return_code == 1 - captured = capsys.readouterr() - assert "Error: GCP Region must be provided" in captured.out - - @patch( - "eval_protocol.cli_commands.deploy.ensure_artifact_registry_repo_exists", - return_value=False, - ) - def test_deploy_gcp_mode_ensure_repo_fails(self, mock_ensure_repo_fails, mock_check_environment, capsys): - args = MockArgs( - target="gcp-cloud-run", - id="gcp-eval", - function_ref="a.b", - gcp_project="p", - gcp_region="r", - gcp_ar_repo="repo", - ) - return_code = deploy_command(args) - assert return_code == 1 - captured = capsys.readouterr() - assert "Failed to ensure Artifact Registry repository" in captured.out - - @patch( - "eval_protocol.cli_commands.deploy.ensure_artifact_registry_repo_exists", - return_value=True, - ) - @patch( - "eval_protocol.cli_commands.deploy.generate_dockerfile_content", - return_value=None, - ) - def test_deploy_gcp_mode_gen_dockerfile_fails( - self, - mock_gen_dockerfile_fails, - mock_ensure_repo, - mock_check_environment, - capsys, - ): - args = MockArgs( - target="gcp-cloud-run", - id="gcp-eval", - function_ref="a.b", - gcp_project="p", - gcp_region="r", - gcp_ar_repo="repo", - ) - return_code = deploy_command(args) - assert return_code == 1 - captured = capsys.readouterr() - assert "Failed to generate Dockerfile content. Aborting." 
in captured.out - - @patch( - "eval_protocol.cli_commands.deploy.ensure_artifact_registry_repo_exists", - return_value=True, - ) - @patch( - "eval_protocol.cli_commands.deploy.generate_dockerfile_content", - return_value="Dockerfile", - ) - @patch( - "eval_protocol.cli_commands.deploy.build_and_push_docker_image", - return_value=False, - ) - def test_deploy_gcp_mode_build_fails( - self, - mock_build_fails, - mock_gen_dockerfile, - mock_ensure_repo, - mock_check_environment, - capsys, - ): - args = MockArgs( - target="gcp-cloud-run", - id="gcp-eval", - function_ref="a.b", - gcp_project="p", - gcp_region="r", - gcp_ar_repo="repo", - ) - return_code = deploy_command(args) - assert return_code == 1 - captured = capsys.readouterr() - assert "Failed to build and push Docker image" in captured.out - - @patch( - "eval_protocol.cli_commands.deploy.ensure_artifact_registry_repo_exists", - return_value=True, - ) - @patch( - "eval_protocol.cli_commands.deploy.generate_dockerfile_content", - return_value="Dockerfile", - ) - @patch( - "eval_protocol.cli_commands.deploy.build_and_push_docker_image", - return_value=True, - ) - @patch("eval_protocol.cli_commands.deploy.deploy_to_cloud_run", return_value=None) - @patch( - "eval_protocol.cli_commands.deploy.ensure_gcp_secret", - return_value="projects/p/secrets/mocksecret/versions/1", - ) - def test_deploy_gcp_mode_cloud_run_deploy_fails( - self, - mock_ensure_gcp_secret_individual, - mock_deploy_run_fails, - mock_build_push, - mock_gen_dockerfile, - mock_ensure_repo, - mock_check_environment, - capsys, - ): - args = MockArgs( - target="gcp-cloud-run", - id="gcp-eval", - function_ref="a.b", - gcp_project="p", - gcp_region="r", - gcp_ar_repo="repo", - gcp_auth_mode="api-key", - ) - return_code = deploy_command(args) - assert return_code == 1 - captured = capsys.readouterr() - assert "Failed to deploy to Cloud Run or retrieve service URL. Aborting." 
in captured.out - - @patch("eval_protocol.cli_commands.deploy.create_evaluation") - @patch( - "eval_protocol.cli_commands.deploy.ensure_gcp_secret", - return_value="projects/p/secrets/mocksecret/versions/1", - ) - def test_deploy_gcp_mode_final_registration_fails_platform_error( - self, - mock_ensure_gcp_secret_individual, - mock_create_evaluation_final_step, - mock_check_environment, - mock_gcp_tools, - capsys, - ): - args = MockArgs( - target="gcp-cloud-run", - id="gcp-eval-reg-fail", - function_ref="a.b", - gcp_project="p", - gcp_region="r", - gcp_ar_repo="repo", - gcp_auth_mode="api-key", - ) - error_message = "Registration failed (Status: 400, Response: N/A)" - mock_create_evaluation_final_step.side_effect = PlatformAPIError( - "Registration failed", status_code=400, response_text="N/A" - ) - return_code = deploy_command(args) - assert return_code == 1 - captured = capsys.readouterr() - # Updated error message to match common registration block - assert f"Error registering URL with Fireworks AI: {error_message}" in captured.out - - @patch("eval_protocol.cli_commands.deploy.create_evaluation") - @patch( - "eval_protocol.cli_commands.deploy.ensure_gcp_secret", - return_value="projects/p/secrets/mocksecret/versions/1", - ) - def test_deploy_gcp_mode_final_registration_fails_generic_error( - self, - mock_ensure_gcp_secret_individual, - mock_create_evaluation_final_step, - mock_check_environment, - mock_gcp_tools, - capsys, - ): - args = MockArgs( - target="gcp-cloud-run", - id="gcp-eval-reg-fail-gen", - function_ref="a.b", - gcp_project="p", - gcp_region="r", - gcp_ar_repo="repo", - gcp_auth_mode="api-key", - ) - mock_create_evaluation_final_step.side_effect = Exception("Unexpected registration issue") - return_code = deploy_command(args) - assert return_code == 1 - captured = capsys.readouterr() - # Updated error message to match common registration block - assert ( - "An unexpected error occurred during Fireworks AI registration: Unexpected registration issue" - in captured.out - ) diff --git a/tests/cli_commands/test_preview_cmd.py b/tests/cli_commands/test_preview_cmd.py deleted file mode 100644 index 20cf0416..00000000 --- a/tests/cli_commands/test_preview_cmd.py +++ /dev/null @@ -1,218 +0,0 @@ -import json -from pathlib import Path -from unittest.mock import MagicMock, patch - -import pytest -import requests - -from eval_protocol.cli_commands import preview as preview_cmd_module -from eval_protocol.cli_commands.preview import preview_command -from eval_protocol.generic_server import EvaluationRequest -from eval_protocol.models import EvaluateResult, Message, MetricResult - -try: - from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict - - DATASETS_AVAILABLE = True -except ImportError: - DATASETS_AVAILABLE = False - - class _DummyDSType: - pass - - IterableDataset = _DummyDSType - - -class MockArgs: - def __init__(self, **kwargs): - self.verbose = False - self.metrics_folders = None - self.samples = None - self.max_samples = 5 - self.huggingface_dataset = None - self.huggingface_split = "train" - self.huggingface_prompt_key = "prompt" - self.huggingface_response_key = "response" - self.huggingface_key_map = None - self.remote_url = None - self.__dict__.update(kwargs) - - -@pytest.fixture -def mock_check_environment(): - with patch("eval_protocol.cli_commands.preview.check_environment", return_value=True) as mock_check: - yield mock_check - - -def create_temp_jsonl(tmp_path: Path, samples_data: list) -> str: - sample_file_path = tmp_path / "temp_samples.jsonl" 
- with open(sample_file_path, "w", encoding="utf-8") as f: - for sample in samples_data: - f.write(json.dumps(sample) + "\n") - return str(sample_file_path) - - -class TestPreviewCommandRemoteUrl: - @patch("requests.post") - def test_preview_remote_url_success_with_file(self, mock_post, mock_check_environment, tmp_path, capsys): - mock_response = MagicMock() - mock_response.status_code = 200 - sample_data_for_file = [ - { - "messages": [ - {"role": "user", "content": "User prompt 1"}, - {"role": "assistant", "content": "Assistant response 1"}, - ], - "ground_truth": "GT 1", - "custom_kwarg": "custom_val_1", - } - ] - temp_sample_file = create_temp_jsonl(tmp_path, sample_data_for_file) - - eval_result_payload = EvaluateResult( - score=0.8, - reason="Remote success", - is_score_valid=True, - metrics={ - "accuracy": MetricResult(score=0.9, reason="High acc", is_score_valid=True) - }, # This already has metrics - ).model_dump() - mock_response.json.return_value = eval_result_payload - mock_post.return_value = mock_response - - args = MockArgs( - remote_url="http://fake-remote-eval.com", - samples=temp_sample_file, - max_samples=1, - ) - return_code = preview_command(args) - assert return_code == 0 - - expected_endpoint = "http://fake-remote-eval.com/evaluate" - expected_payload_sample1 = EvaluationRequest( - messages=sample_data_for_file[0]["messages"], - ground_truth=sample_data_for_file[0]["ground_truth"], - kwargs={"custom_kwarg": sample_data_for_file[0]["custom_kwarg"]}, - ).model_dump() - mock_post.assert_called_once_with(expected_endpoint, json=expected_payload_sample1, timeout=30) - - captured = capsys.readouterr() - assert "Previewing against remote URL: http://fake-remote-eval.com" in captured.out - assert "--- Sample 1 ---" in captured.out - assert "Score: 0.8" in captured.out - - @pytest.mark.skipif(not DATASETS_AVAILABLE, reason="datasets library not installed") - @patch("datasets.load_dataset") - @patch("requests.post") - def test_preview_remote_url_success_with_hf(self, mock_post, mock_hf_load_dataset, mock_check_environment, capsys): - hf_sample_data = [ - { - "prompt": "HF User prompt", - "response": "HF Assistant response", - "ground_truth_col": "HF GT", - } - ] - mock_iterable_ds = MagicMock(spec=IterableDataset) - mock_iterable_ds.__iter__.return_value = iter(hf_sample_data) - mock_hf_load_dataset.return_value = mock_iterable_ds - - mock_response = MagicMock() - mock_response.status_code = 200 - # Corrected: Explicitly provide metrics={} - eval_result_payload = EvaluateResult(score=0.7, reason="HF Remote success", metrics={}).model_dump() - mock_response.json.return_value = eval_result_payload - mock_post.return_value = mock_response - - args = MockArgs( - remote_url="http://fake-hf-eval.com", - huggingface_dataset="test/hf-dataset", - huggingface_prompt_key="prompt", - huggingface_response_key="response", - huggingface_key_map=json.dumps({"ground_truth_col": "ground_truth"}), - max_samples=1, - ) - return_code = preview_command(args) - assert return_code == 0 - - expected_payload = EvaluationRequest( - messages=[ - {"role": "user", "content": "HF User prompt"}, - {"role": "assistant", "content": "HF Assistant response"}, - ], - ground_truth="HF GT", - kwargs={}, - ).model_dump() - mock_post.assert_called_once_with("http://fake-hf-eval.com/evaluate", json=expected_payload, timeout=30) - captured = capsys.readouterr() - assert "Score: 0.7" in captured.out - - @patch("requests.post") - def test_preview_remote_url_http_error(self, mock_post, mock_check_environment, tmp_path, 
capsys): - sample_data = [{"messages": [{"role": "user", "content": "Test"}]}] - temp_sample_file = create_temp_jsonl(tmp_path, sample_data) - mock_post.side_effect = requests.exceptions.HTTPError("403 Client Error: Forbidden for url") - - args = MockArgs( - remote_url="http://fake-remote-eval.com", - samples=temp_sample_file, - max_samples=1, - ) - return_code = preview_command(args) - assert return_code == 0 - - captured = capsys.readouterr() - assert "Error calling remote URL" in captured.out - assert "403 Client Error: Forbidden for url" in captured.out - - def test_preview_remote_url_invalid_url_format(self, mock_check_environment, tmp_path, capsys): - sample_data = [{"messages": [{"role": "user", "content": "Test"}]}] - temp_sample_file = create_temp_jsonl(tmp_path, sample_data) - args = MockArgs(remote_url="ftp://invalid-url.com", samples=temp_sample_file) - return_code = preview_command(args) - assert return_code == 1 - captured = capsys.readouterr() - assert "Error: Invalid --remote-url 'ftp://invalid-url.com'" in captured.out - - def test_preview_remote_url_no_samples_provided(self, mock_check_environment, capsys): - args = MockArgs(remote_url="http://fake-remote-eval.com") - return_code = preview_command(args) - assert return_code == 1 - captured = capsys.readouterr() - assert ( - "Error: Either sample file (--samples) or HuggingFace dataset (--huggingface-dataset) is required." - in captured.out - ) - - def test_preview_remote_url_sample_file_not_found(self, mock_check_environment, capsys): - args = MockArgs(remote_url="http://fake-remote-eval.com", samples="non_existent.jsonl") - return_code = preview_command(args) - assert return_code == 1 - captured = capsys.readouterr() - assert "Error: Sample file 'non_existent.jsonl' not found" in captured.out - - -class TestPreviewCommandLocalMode: - @patch("eval_protocol.cli_commands.preview.preview_evaluation") - def test_preview_local_mode_success(self, mock_preview_eval, mock_check_environment, tmp_path, capsys): - sample_data = [{"messages": [{"role": "user", "content": "Test"}]}] - temp_sample_file = create_temp_jsonl(tmp_path, sample_data) - - mock_preview_result = MagicMock() - mock_preview_eval.return_value = mock_preview_result - - args = MockArgs(metrics_folders=["mf=path"], samples=temp_sample_file) - return_code = preview_command(args) - - assert return_code == 0 - mock_preview_eval.assert_called_once() - mock_preview_result.display.assert_called_once() - - def test_preview_local_mode_missing_metrics_folders(self, mock_check_environment, tmp_path, capsys): - sample_data = [{"messages": [{"role": "user", "content": "Test"}]}] - temp_sample_file = create_temp_jsonl(tmp_path, sample_data) - args = MockArgs(samples=temp_sample_file) - - return_code = preview_command(args) - assert return_code == 1 - captured = capsys.readouterr() - assert "Error: Either --remote-url or --metrics-folders must be specified." 
in captured.out diff --git a/tests/test_cli.py b/tests/test_cli.py deleted file mode 100644 index 050b98d6..00000000 --- a/tests/test_cli.py +++ /dev/null @@ -1,171 +0,0 @@ -import argparse -import os -import sys -from unittest.mock import MagicMock, patch - -import pytest - -from eval_protocol.cli import deploy_command, main, parse_args, preview_command - - -class TestCLI: - """Tests for the CLI functionality.""" - - @pytest.mark.skip(reason="preview and deploy commands are currently disabled in cli.py") - def test_parse_args(self): - """Test the argument parser.""" - # Test preview command - # Note: This test is less comprehensive than tests/test_cli_args.py - # It doesn't check for --remote-url here. - args, _ = parse_args( # Unpack tuple - ["preview", "--samples", "test.jsonl", "--metrics-folders", "m=p"] - ) # Added metrics folders to pass new check - assert args.command == "preview" - assert args.samples == "test.jsonl" - assert args.max_samples == 5 # default value - - # Test deploy command - args, _ = parse_args(["deploy", "--id", "test-eval", "--metrics-folders", "test=./test"]) # Unpack tuple - assert args.command == "deploy" - assert args.id == "test-eval" - assert args.metrics_folders == ["test=./test"] - assert not args.force # default value - - @patch("eval_protocol.cli_commands.preview.check_environment", return_value=True) - @patch("eval_protocol.cli_commands.preview.preview_evaluation") - def test_preview_command(self, mock_preview_eval, mock_preview_check_env): - """Test the preview command (local mode).""" - mock_preview_result = MagicMock() - mock_preview_result.display = MagicMock() - mock_preview_eval.return_value = mock_preview_result - - args = argparse.Namespace() - args.metrics_folders = ["test=./test"] - args.samples = "test.jsonl" - args.max_samples = 5 - args.huggingface_dataset = None - args.huggingface_split = "train" - args.huggingface_prompt_key = "prompt" - args.huggingface_response_key = "response" - args.huggingface_key_map = None - args.remote_url = None # Added for compatibility with updated preview_command - - with patch("eval_protocol.cli_commands.preview.Path.exists", return_value=True): - result = preview_command(args) - - assert result == 0 - mock_preview_check_env.assert_called_once() - mock_preview_eval.assert_called_once_with( - metric_folders=["test=./test"], - sample_file="test.jsonl", - max_samples=5, - huggingface_dataset=None, - huggingface_split="train", - huggingface_prompt_key="prompt", - huggingface_response_key="response", - huggingface_message_key_map=None, - ) - mock_preview_result.display.assert_called_once() - - @patch("eval_protocol.cli_commands.deploy.check_environment", return_value=True) - @patch("eval_protocol.cli_commands.deploy.create_evaluation") - def test_deploy_command(self, mock_create_eval, mock_deploy_check_env): - """Test the deploy command (local mode).""" - mock_create_eval.return_value = {"name": "test-evaluator"} - - args = argparse.Namespace() - args.metrics_folders = ["test=./test"] - args.id = "test-eval" - args.display_name = "Test Evaluator" - args.description = "Test description" - args.force = True - args.huggingface_dataset = None - args.huggingface_split = "train" - args.huggingface_prompt_key = "prompt" - args.huggingface_response_key = "response" - args.huggingface_key_map = None - args.remote_url = None - - # Add attributes accessed by deploy_command, with defaults for non-GCP target - args.target = "fireworks" # Explicitly set for this local mode test - args.function_ref = None - args.gcp_project = 
None - args.gcp_region = None - args.gcp_ar_repo = None - args.service_account = None - args.entry_point = "reward_function" # Default from parser - args.runtime = "python311" # Default from parser - args.gcp_auth_mode = None # Default from parser - - # For local deploy, metrics_folders is required. This is checked inside deploy_command. - # The test_parse_args in test_cli_args.py covers parser-level requirement changes. - - result = deploy_command(args) - - assert result == 0 - mock_deploy_check_env.assert_called_once() - mock_create_eval.assert_called_once_with( - evaluator_id="test-eval", - metric_folders=["test=./test"], - display_name="Test Evaluator", - description="Test description", - force=True, - huggingface_dataset=None, - huggingface_split="train", - huggingface_message_key_map=None, # This is derived from args.huggingface_key_map - huggingface_prompt_key="prompt", - huggingface_response_key="response", - # remote_url=None removed as it relies on default - ) - - @patch("eval_protocol.cli_commands.deploy.check_environment", return_value=False) - @patch("eval_protocol.cli_commands.preview.check_environment", return_value=False) - def test_command_environment_check(self, mock_preview_check_env, mock_deploy_check_env): - """Test that commands check the environment and fail if check_environment returns False.""" - preview_args = argparse.Namespace() - # For preview_command to proceed to check_environment, it needs either remote_url or metrics_folders, - # and also sample sources. - preview_args.metrics_folders = ["test=./test"] - preview_args.samples = "test.jsonl" - preview_args.max_samples = 1 - preview_args.huggingface_dataset = None - preview_args.huggingface_split = "train" - preview_args.huggingface_prompt_key = "prompt" - preview_args.huggingface_response_key = "response" - preview_args.huggingface_key_map = None - preview_args.remote_url = None # Added for compatibility - - deploy_args = argparse.Namespace() - deploy_args.id = "test-eval" - # For deploy_command to proceed to check_environment, it needs id. - # If not remote_url, it also needs metrics_folders. 
- deploy_args.metrics_folders = ["test=./test"] - deploy_args.display_name = None - deploy_args.description = None - deploy_args.force = False - deploy_args.huggingface_dataset = None - deploy_args.huggingface_split = "train" - deploy_args.huggingface_prompt_key = "prompt" - deploy_args.huggingface_response_key = "response" - deploy_args.huggingface_key_map = None - deploy_args.remote_url = None - deploy_args.target = "fireworks" # Ensure target is set - deploy_args.function_ref = None - deploy_args.gcp_project = None - deploy_args.gcp_region = None - deploy_args.gcp_ar_repo = None - deploy_args.service_account = None - deploy_args.entry_point = "reward_function" - deploy_args.runtime = "python311" - deploy_args.gcp_auth_mode = None - - # Mock Path.exists for preview_args if it uses samples file - with patch("eval_protocol.cli_commands.preview.Path.exists", return_value=True): - preview_result = preview_command(preview_args) - - deploy_result = deploy_command(deploy_args) - - assert preview_result == 1 - assert deploy_result == 1 - mock_preview_check_env.assert_called_once() - mock_deploy_check_env.assert_called_once() diff --git a/tests/test_cli_args.py b/tests/test_cli_args.py index 7ba917c4..e0b6c22f 100644 --- a/tests/test_cli_args.py +++ b/tests/test_cli_args.py @@ -1,4 +1,3 @@ -import argparse import subprocess import sys @@ -32,152 +31,13 @@ def test_create_rft_help_does_not_error(): assert "--dry-run" in combined -@pytest.mark.skip(reason="preview and deploy commands are currently disabled in cli.py") -class TestCliArgParsing: - # --- Tests for 'preview' command --- - def test_preview_with_remote_url_and_samples(self): - args_list = [ - "preview", - "--remote-url", - "http://example.com/eval", - "--samples", - "dummy.jsonl", - ] - parsed, _ = parse_args(args_list) - assert parsed.command == "preview" - assert parsed.remote_url == "http://example.com/eval" - assert parsed.samples == "dummy.jsonl" - assert parsed.metrics_folders is None # Should be None if not provided +def test_verbose_flag(): + """Test verbose flag with upload command.""" + parsed_verbose_short, _ = parse_args(["-v", "upload", "--path", "."]) + assert parsed_verbose_short.verbose is True - def test_preview_with_remote_url_and_hf_dataset(self): - args_list = [ - "preview", - "--remote-url", - "http://example.com/eval", - "--hf", - "dataset_name", - ] - parsed, _ = parse_args(args_list) - assert parsed.command == "preview" - assert parsed.remote_url == "http://example.com/eval" - assert parsed.huggingface_dataset == "dataset_name" + parsed_verbose_long, _ = parse_args(["--verbose", "upload", "--path", "."]) + assert parsed_verbose_long.verbose is True - def test_preview_with_remote_url_and_metrics_folders(self): - """Metrics folders should be accepted by argparse but logic in command might ignore/warn.""" - args_list = [ - "preview", - "--remote-url", - "http://example.com/eval", - "--metrics-folders", - "mf=path", - "--samples", - "s.jsonl", - ] - parsed, _ = parse_args(args_list) - assert parsed.command == "preview" - assert parsed.remote_url == "http://example.com/eval" - assert parsed.metrics_folders == ["mf=path"] - - def test_preview_without_remote_url_requires_metrics_folders_or_command_logic_handles( - self, - ): - """Argparse allows no metrics_folders, command logic should enforce if needed.""" - args_list = [ - "preview", - "--samples", - "dummy.jsonl", - ] # No --remote-url, no --metrics-folders - parsed, _ = parse_args(args_list) - assert parsed.command == "preview" - assert parsed.remote_url is None - 
assert parsed.metrics_folders is None - # The command logic in preview.py now checks: - # if not args.remote_url and not args.metrics_folders: error - - def test_preview_traditional_with_metrics_folders(self): - args_list = [ - "preview", - "--metrics-folders", - "mf=path", - "--samples", - "dummy.jsonl", - ] - parsed, _ = parse_args(args_list) - assert parsed.command == "preview" - assert parsed.metrics_folders == ["mf=path"] - assert parsed.remote_url is None - - # --- Tests for 'deploy' command --- - def test_deploy_with_remote_url(self): - args_list = [ - "deploy", - "--id", - "my-eval", - "--remote-url", - "http://example.com/deploy-eval", - ] - parsed, _ = parse_args(args_list) - assert parsed.command == "deploy" - assert parsed.id == "my-eval" - assert parsed.remote_url == "http://example.com/deploy-eval" - assert parsed.metrics_folders is None # Not required, should be None if not given - - def test_deploy_with_remote_url_and_metrics_folders(self): - """Metrics folders should be accepted by argparse but logic in command might ignore/warn.""" - args_list = [ - "deploy", - "--id", - "my-eval", - "--remote-url", - "http://example.com/eval", - "--metrics-folders", - "mf=path", - ] - parsed, _ = parse_args(args_list) - assert parsed.command == "deploy" - assert parsed.id == "my-eval" - assert parsed.remote_url == "http://example.com/eval" - assert parsed.metrics_folders == ["mf=path"] - - def test_deploy_traditional_without_remote_url(self): - args_list = ["deploy", "--id", "my-eval", "--metrics-folders", "mf=path"] - parsed, _ = parse_args(args_list) - assert parsed.command == "deploy" - assert parsed.id == "my-eval" - assert parsed.metrics_folders == ["mf=path"] - assert parsed.remote_url is None - - def test_deploy_traditional_metrics_folders_still_optional_at_parser_level(self): - """ - --metrics-folders is required=False at parser level. - The command logic in deploy.py enforces it if --remote-url is not present. - """ - args_list = [ - "deploy", - "--id", - "my-eval", - ] # No --metrics-folders, no --remote-url - # This should parse fine, but deploy_command will raise error. 
- parsed, _ = parse_args(args_list) - assert parsed.command == "deploy" - assert parsed.id == "my-eval" - assert parsed.metrics_folders is None - assert parsed.remote_url is None - - def test_deploy_id_is_required(self): - with pytest.raises(SystemExit): # argparse exits on missing required arg - parse_args(["deploy"]) # Missing --id - - # General verbose flag - def test_verbose_flag(self): - # Global flags like -v or --verbose should typically come before the subcommand - parsed_verbose_short, _ = parse_args(["-v", "preview", "--samples", "s.jsonl", "--metrics-folders", "m=p"]) - assert parsed_verbose_short.verbose is True - - parsed_verbose_long, _ = parse_args( - ["--verbose", "preview", "--samples", "s.jsonl", "--metrics-folders", "m=p"] - ) - assert parsed_verbose_long.verbose is True - - parsed_not_verbose, _ = parse_args(["preview", "--samples", "s.jsonl", "--metrics-folders", "m=p"]) - assert parsed_not_verbose.verbose is False + parsed_not_verbose, _ = parse_args(["upload", "--path", "."]) + assert parsed_not_verbose.verbose is False diff --git a/tests/test_deploy_integration.py b/tests/test_deploy_integration.py deleted file mode 100644 index 1841c9db..00000000 --- a/tests/test_deploy_integration.py +++ /dev/null @@ -1,215 +0,0 @@ -import argparse -import importlib.util -import json -import os -import sys -from pathlib import Path -from unittest.mock import ANY, MagicMock, patch - -import pytest - -from eval_protocol.cli_commands.deploy import deploy_command -from eval_protocol.config import GCPCloudRunConfig, RewardKitConfig - -# Constants for a dummy reward function module -# This module will be created and deleted by tests needing it. -DUMMY_DEPLOY_TEST_MODULE_NAME = "dummy_deploy_test_module" -DUMMY_DEPLOY_TEST_MODULE_FILENAME = f"{DUMMY_DEPLOY_TEST_MODULE_NAME}.py" -DUMMY_DEPLOY_TEST_FUNCTION_NAME = "my_dummy_deploy_reward_func" -DUMMY_DEPLOY_FUNCTION_REF = f"{DUMMY_DEPLOY_TEST_MODULE_NAME}.{DUMMY_DEPLOY_TEST_FUNCTION_NAME}" -DUMMY_DEPLOY_REQUIREMENTS = "requests==2.25.0\nfastapi==0.70.0" - -DUMMY_DEPLOY_MODULE_CONTENT = f""" -from eval_protocol.typed_interface import reward_function - -@reward_function(id="test-deploy-func", requirements='''{DUMMY_DEPLOY_REQUIREMENTS}''') -def {DUMMY_DEPLOY_TEST_FUNCTION_NAME}(messages, ground_truth=None, **kwargs): - return {{"score": 0.5, "reason": "Deployed dummy"}} -""" - -# Ensure the CWD (project root) is in sys.path for module loading during tests -if Path.cwd().as_posix() not in sys.path: - sys.path.insert(0, Path.cwd().as_posix()) - - -@pytest.fixture(scope="function") -def create_dummy_reward_module_for_deploy(): - # Create the dummy module file - with open(DUMMY_DEPLOY_TEST_MODULE_FILENAME, "w") as f: - f.write(DUMMY_DEPLOY_MODULE_CONTENT) - - # Ensure the module can be imported by clearing any cached versions - if DUMMY_DEPLOY_TEST_MODULE_NAME in sys.modules: - del sys.modules[DUMMY_DEPLOY_TEST_MODULE_NAME] - - yield DUMMY_DEPLOY_FUNCTION_REF # Provide the function reference to the test - - # Cleanup: remove the dummy module file - if os.path.exists(DUMMY_DEPLOY_TEST_MODULE_FILENAME): - os.remove(DUMMY_DEPLOY_TEST_MODULE_FILENAME) - # Cleanup: remove from sys.modules if it was loaded - if DUMMY_DEPLOY_TEST_MODULE_NAME in sys.modules: - del sys.modules[DUMMY_DEPLOY_TEST_MODULE_NAME] - - -# Load the deploy_example module directly from the examples folder -def load_module_from_path(name, path): - spec = importlib.util.spec_from_file_location(name, path) - if spec is None: - raise ImportError(f"Could not load spec for module {name} 
from {path}") - module = importlib.util.module_from_spec(spec) - if spec.loader is None: - raise ImportError(f"Spec for module {name} has no loader") - spec.loader.exec_module(module) - return module - - -@pytest.fixture -def deploy_example(): - # Path to the deploy_example.py file - file_path = os.path.join( - os.path.dirname(os.path.dirname(__file__)), - "examples", - "deploy_example.py", - ) - - # Load the module - return load_module_from_path("deploy_example", file_path) - - -@pytest.fixture -def mock_env_variables(monkeypatch): - """Set environment variables for testing""" - monkeypatch.setenv("FIREWORKS_API_KEY", "test_api_key") - monkeypatch.setenv("FIREWORKS_API_BASE", "https://api.fireworks.ai") - # Account id is derived from API key; mock deploy module lookup to keep tests offline. - monkeypatch.setattr("eval_protocol.cli_commands.deploy.get_fireworks_account_id", lambda: "test_account") - - -@pytest.fixture -def mock_requests_post(): - """Mock requests.post method""" - with patch("requests.post") as mock_post: - mock_post.return_value = MagicMock() - mock_post.return_value.status_code = 200 - mock_post.return_value.json.return_value = { - "name": "accounts/test_account/evaluators/informativeness-v1", - "displayName": "informativeness-v1", - "description": "Evaluates response informativeness based on specificity and content density", - } - yield mock_post - - -@pytest.fixture -def mock_requests_get(): - """Mock requests.get method""" - with patch("requests.get") as mock_get: - mock_get.return_value = MagicMock() - mock_get.return_value.status_code = 404 # Evaluator doesn't exist - yield mock_get - - -def test_deploy_gcp_with_inline_requirements( - mock_env_variables, # Ensures FIREWORKS_API_KEY etc. are set - create_dummy_reward_module_for_deploy, # Creates and cleans up the dummy module -): - """ - Test the deploy_command with --target gcp-cloud-run, ensuring inline requirements - from the @reward_function decorator are passed to generate_dockerfile_content. 
- """ - function_ref = create_dummy_reward_module_for_deploy - evaluator_id = "test-gcp-evaluator-with-reqs" - - args = argparse.Namespace( - id=evaluator_id, - target="gcp-cloud-run", - function_ref=function_ref, - metrics_folders=None, - remote_url=None, - display_name=None, - description=None, - force=False, - huggingface_dataset=None, - huggingface_split=None, - huggingface_key_map=None, # This is what argparse would create from --huggingface-key-map - huggingface_prompt_key=None, - huggingface_response_key=None, - local_port=8001, # Default, not used for GCP - runtime="python3.10", # Example runtime - gcp_project="test-gcp-project", - gcp_region="us-central1", - gcp_ar_repo=None, # Will default - gcp_auth_mode="api-key", - ) - - # Mock all external dependencies of _deploy_to_gcp_cloud_run and deploy_command - with ( - patch("eval_protocol.cli_commands.deploy.check_environment", return_value=True) as mock_check_env, - patch("eval_protocol.cli_commands.deploy.get_config") as mock_get_config, - patch( - "eval_protocol.cli_commands.deploy.ensure_artifact_registry_repo_exists", - return_value=True, - ) as mock_ensure_ar, - patch( - "eval_protocol.cli_commands.deploy.generate_dockerfile_content", - return_value="DOCKERFILE CONTENT", - ) as mock_gen_dockerfile, - patch( - "eval_protocol.cli_commands.deploy.build_and_push_docker_image", - return_value=True, - ) as mock_build_push, - patch( - "eval_protocol.cli_commands.deploy.ensure_gcp_secret", - return_value="projects/p/secrets/s/versions/1", - ) as mock_ensure_secret, - patch( - "eval_protocol.cli_commands.deploy.create_or_update_fireworks_secret", - return_value=True, - ) as mock_fw_secret, - patch( - "eval_protocol.cli_commands.deploy.deploy_to_cloud_run", - return_value="https://service-url.run.app", - ) as mock_deploy_cr, - patch( - "eval_protocol.cli_commands.deploy.create_evaluation", - return_value={"name": evaluator_id}, - ) as mock_create_eval, - ): - # Configure mock_get_config to return a basic config - mock_config_instance = RewardKitConfig( - gcp_cloud_run=GCPCloudRunConfig( - project_id="test-gcp-project-yaml", # Test CLI override - region="us-west1-yaml", # Test CLI override - default_auth_mode="api-key", - ), - evaluator_endpoint_keys={}, - ) - mock_get_config.return_value = mock_config_instance - - # Call the deploy command - result_code = deploy_command(args) - - assert result_code == 0 - mock_check_env.assert_called_once() - - # Key assertion: generate_dockerfile_content was called with the correct inline_requirements_content - mock_gen_dockerfile.assert_called_once() - call_args, call_kwargs = mock_gen_dockerfile.call_args - assert call_kwargs.get("function_ref") == function_ref - assert call_kwargs.get("inline_requirements_content") == DUMMY_DEPLOY_REQUIREMENTS - assert call_kwargs.get("user_requirements_path") is None # Ensure it's not trying to use both - - mock_ensure_ar.assert_called_once_with( - project_id=args.gcp_project, - region=args.gcp_region, - repo_name="eval-protocol-evaluators", # Default repo name - ) - mock_build_push.assert_called_once() - mock_deploy_cr.assert_called_once() - mock_create_eval.assert_called_once() - - # Check that the dynamically loaded module's requirements were used - # This is implicitly tested by checking mock_gen_dockerfile's call_kwargs - - # Ensure the dummy module is cleaned up by the fixture - # No explicit cleanup needed here due to yield in fixture diff --git a/tests/test_ep_upload_e2e.py b/tests/test_ep_upload_e2e.py index a1521a96..56de5fea 100644 --- 
a/tests/test_ep_upload_e2e.py +++ b/tests/test_ep_upload_e2e.py @@ -61,24 +61,6 @@ def mock_env_variables(monkeypatch): monkeypatch.setattr("eval_protocol.evaluation.get_fireworks_account_id", lambda: "test_account") -@pytest.fixture -def mock_requests_get(): - """Mock requests.get for force flow check""" - with patch("requests.get") as mock_get: - mock_get.return_value.status_code = 404 # Evaluator doesn't exist - mock_get.return_value.raise_for_status = MagicMock() - yield mock_get - - -@pytest.fixture -def mock_requests_delete(): - """Mock requests.delete for force flow""" - with patch("requests.delete") as mock_delete: - mock_delete.return_value.status_code = 200 - mock_delete.return_value.raise_for_status = MagicMock() - yield mock_delete - - @pytest.fixture def mock_gcs_upload(): """Mock the GCS upload via requests.Session""" @@ -96,45 +78,77 @@ def mock_gcs_upload(): @pytest.fixture -def mock_requests_post(): - """Mock requests.post for all API endpoints""" - with patch("requests.post") as mock_post: - validate_response = {"success": True, "valid": True} - create_response = { - "name": "accounts/test_account/evaluators/test-eval", - "displayName": "Test Evaluator", - "description": "Test description", - } - - def side_effect(*args, **kwargs): - url = args[0] - payload = kwargs.get("json", {}) - response = mock_post.return_value - - if "getUploadEndpoint" in url: - # Dynamically create signed URLs for whatever filenames are requested - filename_to_size = payload.get("filename_to_size", {}) - signed_urls = {} - for filename in filename_to_size.keys(): - signed_urls[filename] = f"https://storage.googleapis.com/test-bucket/{filename}?signed=true" - response.json.return_value = {"filenameToSignedUrls": signed_urls} - elif "validateUpload" in url: - response.json.return_value = validate_response - else: - # Create evaluator endpoint - response.json.return_value = create_response - - response.status_code = 200 +def mock_fireworks_client(): + """Mock the Fireworks SDK client used in evaluation.py""" + with patch("eval_protocol.evaluation.Fireworks") as mock_fw_class: + mock_client = MagicMock() + mock_fw_class.return_value = mock_client + + # Mock evaluators.create response + mock_create_response = MagicMock() + mock_create_response.name = "accounts/test_account/evaluators/test-eval" + mock_create_response.display_name = "Test Evaluator" + mock_create_response.description = "Test description" + mock_client.evaluators.create.return_value = mock_create_response + + # Mock evaluators.get_upload_endpoint response - will be set dynamically + def get_upload_endpoint_side_effect(evaluator_id, filename_to_size): + response = MagicMock() + signed_urls = {} + for filename in filename_to_size.keys(): + signed_urls[filename] = f"https://storage.googleapis.com/test-bucket/{filename}?signed=true" + response.filename_to_signed_urls = signed_urls return response - mock_post.side_effect = side_effect - mock_post.return_value.status_code = 200 - mock_post.return_value.raise_for_status = MagicMock() - yield mock_post + mock_client.evaluators.get_upload_endpoint.side_effect = get_upload_endpoint_side_effect + + # Mock evaluators.validate_upload response + mock_validate_response = MagicMock() + mock_validate_response.success = True + mock_validate_response.valid = True + mock_client.evaluators.validate_upload.return_value = mock_validate_response + + # Mock evaluators.get (for force flow - raises NotFoundError by default) + import fireworks + + mock_client.evaluators.get.side_effect = fireworks.NotFoundError( + 
"Evaluator not found", + response=MagicMock(status_code=404), + body={"error": "not found"}, + ) + + # Mock evaluators.delete + mock_client.evaluators.delete.return_value = None + + yield mock_client + + +@pytest.fixture +def mock_platform_api_client(): + """Mock the Fireworks SDK client used in platform_api.py for secrets""" + with patch("eval_protocol.platform_api.Fireworks") as mock_fw_class: + mock_client = MagicMock() + mock_fw_class.return_value = mock_client + + # Mock secrets.get - raise NotFoundError to simulate secret doesn't exist + from fireworks import NotFoundError + + mock_client.secrets.get.side_effect = NotFoundError( + "Secret not found", + response=MagicMock(status_code=404), + body={"error": "not found"}, + ) + + # Mock secrets.create - successful + mock_create_response = MagicMock() + mock_create_response.name = "accounts/test_account/secrets/test-secret" + mock_client.secrets.create.return_value = mock_create_response + + yield mock_client def test_ep_upload_discovers_and_uploads_evaluation_test( - mock_env_variables, mock_requests_post, mock_requests_get, mock_gcs_upload, monkeypatch + mock_env_variables, mock_fireworks_client, mock_platform_api_client, mock_gcs_upload, monkeypatch ): """ Test the complete ep upload flow: @@ -213,20 +227,17 @@ async def test_simple_evaluation(row: EvaluationRow) -> EvaluationRow: # 4. VERIFY SUCCESS assert exit_code == 0, "Upload command should return 0 (success)" - # 5. VERIFY ALL API CALLS IN UPLOAD FLOW - post_calls = [call[0][0] for call in mock_requests_post.call_args_list] - - # Step 1: Create evaluator (V2 endpoint) - create_calls = [url for url in post_calls if "evaluatorsV2" in url] - assert len(create_calls) >= 1, "Should call V2 create endpoint" + # 5. VERIFY ALL API CALLS IN UPLOAD FLOW via Fireworks SDK + # Step 1: Create evaluator + assert mock_fireworks_client.evaluators.create.called, "Should call evaluators.create" # Step 2: Get upload endpoint - upload_endpoint_calls = [url for url in post_calls if "getUploadEndpoint" in url] - assert len(upload_endpoint_calls) >= 1, "Should call getUploadEndpoint" + assert mock_fireworks_client.evaluators.get_upload_endpoint.called, ( + "Should call evaluators.get_upload_endpoint" + ) # Step 3: Validate upload - validate_calls = [url for url in post_calls if "validateUpload" in url] - assert len(validate_calls) >= 1, "Should call validateUpload" + assert mock_fireworks_client.evaluators.validate_upload.called, "Should call evaluators.validate_upload" # Step 4: GCS upload assert mock_gcs_upload.send.called, "Should upload tar.gz to GCS" @@ -235,32 +246,22 @@ async def test_simple_evaluation(row: EvaluationRow) -> EvaluationRow: assert "storage.googleapis.com" in gcs_request.url, "Should upload to GCS" # 6. 
VERIFY CREATE PAYLOAD STRUCTURE - create_payload = None - for call in mock_requests_post.call_args_list: - url = call[0][0] - if "evaluatorsV2" in url: - create_payload = call[1].get("json") - break + create_call = mock_fireworks_client.evaluators.create.call_args + assert create_call is not None - assert create_payload is not None - assert "evaluator" in create_payload - assert create_payload["evaluatorId"] == "test-simple-eval" + # Check evaluator_id + assert create_call.kwargs.get("evaluator_id") == "test-simple-eval" - evaluator_data = create_payload["evaluator"] - assert evaluator_data["displayName"] == "Simple Word Count Eval" - assert evaluator_data["description"] == "E2E test evaluator" + # Check evaluator params + evaluator_params = create_call.kwargs.get("evaluator", {}) + assert evaluator_params.get("display_name") == "Simple Word Count Eval" + assert evaluator_params.get("description") == "E2E test evaluator" # Verify entry point is included - assert "entryPoint" in evaluator_data, "Should include entry point" - entry_point = evaluator_data["entryPoint"] + assert "entry_point" in evaluator_params, "Should include entry point" + entry_point = evaluator_params["entry_point"] assert "test_simple_eval.py::test_simple_evaluation" in entry_point - # Verify criteria structure (minimal, no embedded code) - criteria = evaluator_data["criteria"] - assert len(criteria) > 0 - assert criteria[0]["type"] == "CODE_SNIPPETS" - # Code is uploaded as tar.gz, not embedded in criteria - finally: # Restore original directory os.chdir(original_cwd) @@ -273,8 +274,8 @@ async def test_simple_evaluation(row: EvaluationRow) -> EvaluationRow: def test_ep_upload_with_parametrized_test( mock_env_variables, - mock_requests_post, - mock_requests_get, + mock_fireworks_client, + mock_platform_api_client, mock_gcs_upload, ): """ @@ -335,11 +336,10 @@ async def test_multi_model_eval(row: EvaluationRow) -> EvaluationRow: assert exit_code == 0 - # Verify upload flow completed - post_calls = [call[0][0] for call in mock_requests_post.call_args_list] - assert any("evaluatorsV2" in url for url in post_calls) - assert any("getUploadEndpoint" in url for url in post_calls) - assert any("validateUpload" in url for url in post_calls) + # Verify upload flow completed via Fireworks SDK + assert mock_fireworks_client.evaluators.create.called + assert mock_fireworks_client.evaluators.get_upload_endpoint.called + assert mock_fireworks_client.evaluators.validate_upload.called assert mock_gcs_upload.send.called finally: @@ -440,8 +440,8 @@ async def test_quickstart_eval(row: EvaluationRow) -> EvaluationRow: def test_ep_upload_complete_workflow_with_entry_point_validation( mock_env_variables, - mock_requests_post, - mock_requests_get, + mock_fireworks_client, + mock_platform_api_client, mock_gcs_upload, ): """ @@ -515,14 +515,12 @@ async def test_math_correctness(row: EvaluationRow) -> EvaluationRow: assert exit_code == 0 - # 3. VERIFY 5-STEP UPLOAD FLOW - post_calls = [call[0][0] for call in mock_requests_post.call_args_list] - + # 3. 
VERIFY 5-STEP UPLOAD FLOW via Fireworks SDK # Step 1: Create evaluator - assert any("evaluatorsV2" in url for url in post_calls), "Missing create call" + assert mock_fireworks_client.evaluators.create.called, "Missing create call" # Step 2: Get upload endpoint - assert any("getUploadEndpoint" in url for url in post_calls), "Missing getUploadEndpoint call" + assert mock_fireworks_client.evaluators.get_upload_endpoint.called, "Missing getUploadEndpoint call" # Step 3: Upload to GCS assert mock_gcs_upload.send.called, "Missing GCS upload" @@ -531,51 +529,34 @@ async def test_math_correctness(row: EvaluationRow) -> EvaluationRow: assert "storage.googleapis.com" in gcs_request.url # Step 4: Validate - assert any("validateUpload" in url for url in post_calls), "Missing validateUpload call" + assert mock_fireworks_client.evaluators.validate_upload.called, "Missing validateUpload call" # 4. VERIFY PAYLOAD DETAILS - create_payload = None - for call in mock_requests_post.call_args_list: - url = call[0][0] - if "evaluatorsV2" in url: - create_payload = call[1].get("json") - break - - assert create_payload is not None + create_call = mock_fireworks_client.evaluators.create.call_args + assert create_call is not None # Verify evaluator ID auto-generated from filename + test name - evaluator_id = create_payload["evaluatorId"] + evaluator_id = create_call.kwargs.get("evaluator_id", "") assert "test-math-eval" in evaluator_id or "math-correctness" in evaluator_id # Verify entry point is path-based (not module-based) - evaluator_data = create_payload["evaluator"] - assert "entryPoint" in evaluator_data, "Should include entry point" - entry_point = evaluator_data["entryPoint"] + evaluator_params = create_call.kwargs.get("evaluator", {}) + assert "entry_point" in evaluator_params, "Should include entry point" + entry_point = evaluator_params["entry_point"] assert "test_math_eval.py::test_math_correctness" in entry_point - # Verify criteria is minimal - criteria = evaluator_data["criteria"] - assert len(criteria) > 0 - assert criteria[0]["type"] == "CODE_SNIPPETS" - # Code is in tar.gz, not in payload - # 5. 
VERIFY TAR.GZ WAS CREATED AND UPLOADED # Check getUploadEndpoint call payload - upload_endpoint_payload = None - for call in mock_requests_post.call_args_list: - url = call[0][0] - if "getUploadEndpoint" in url: - upload_endpoint_payload = call[1].get("json") - break - - assert upload_endpoint_payload is not None - assert "filename_to_size" in upload_endpoint_payload + upload_call = mock_fireworks_client.evaluators.get_upload_endpoint.call_args + assert upload_call is not None + filename_to_size = upload_call.kwargs.get("filename_to_size", {}) + assert filename_to_size, "Should have filename_to_size" # Tar filename is dynamic (based on directory name) - tar_files = list(upload_endpoint_payload["filename_to_size"].keys()) + tar_files = list(filename_to_size.keys()) assert len(tar_files) == 1, "Should have exactly one tar file" tar_filename = tar_files[0] assert tar_filename.endswith(".tar.gz"), "Should be a tar.gz file" - tar_size = upload_endpoint_payload["filename_to_size"][tar_filename] + tar_size = int(filename_to_size[tar_filename]) assert tar_size > 0, "Tar file should have non-zero size" finally: @@ -587,8 +568,8 @@ async def test_math_correctness(row: EvaluationRow) -> EvaluationRow: def test_ep_upload_force_flag_triggers_delete_flow( mock_env_variables, - mock_requests_post, mock_gcs_upload, + mock_platform_api_client, ): """ Test that --force flag triggers the check/delete/recreate flow @@ -611,39 +592,64 @@ async def test_force_eval(row: EvaluationRow) -> EvaluationRow: try: os.chdir(test_project_dir) - # Mock requests.get to return 200 (evaluator exists) - with patch("requests.get") as mock_get: - mock_get.return_value.status_code = 200 - mock_get.return_value.raise_for_status = MagicMock() + # Mock the Fireworks client with evaluator existing (for force flow) + with patch("eval_protocol.evaluation.Fireworks") as mock_fw_class: + mock_client = MagicMock() + mock_fw_class.return_value = mock_client + + # Mock evaluators.get to return an existing evaluator (not raise NotFoundError) + mock_existing_evaluator = MagicMock() + mock_existing_evaluator.name = "accounts/test_account/evaluators/test-force" + mock_client.evaluators.get.return_value = mock_existing_evaluator + + # Mock evaluators.delete + mock_client.evaluators.delete.return_value = None + + # Mock evaluators.create response + mock_create_response = MagicMock() + mock_create_response.name = "accounts/test_account/evaluators/test-force" + mock_client.evaluators.create.return_value = mock_create_response + + # Mock get_upload_endpoint + def get_upload_endpoint_side_effect(evaluator_id, filename_to_size): + response = MagicMock() + signed_urls = {} + for filename in filename_to_size.keys(): + signed_urls[filename] = f"https://storage.googleapis.com/test-bucket/{filename}?signed=true" + response.filename_to_signed_urls = signed_urls + return response + + mock_client.evaluators.get_upload_endpoint.side_effect = get_upload_endpoint_side_effect + + # Mock validate_upload + mock_client.evaluators.validate_upload.return_value = MagicMock() - # Mock requests.delete - with patch("requests.delete") as mock_delete: - mock_delete.return_value.status_code = 200 - mock_delete.return_value.raise_for_status = MagicMock() + discovered_tests = _discover_tests(test_project_dir) - discovered_tests = _discover_tests(test_project_dir) + args = argparse.Namespace( + path=test_project_dir, + entry=None, + id="test-force", + display_name=None, + description=None, + force=True, # Force flag enabled + yes=True, + ) - args = argparse.Namespace( - 
path=test_project_dir, - entry=None, - id="test-force", - display_name=None, - description=None, - force=True, # Force flag enabled - yes=True, - ) + with patch("eval_protocol.cli_commands.upload._prompt_select") as mock_select: + mock_select.return_value = discovered_tests + exit_code = upload_command(args) - with patch("eval_protocol.cli_commands.upload._prompt_select") as mock_select: - mock_select.return_value = discovered_tests - exit_code = upload_command(args) + assert exit_code == 0 - assert exit_code == 0 + # Verify check happened (evaluators.get was called) + assert mock_client.evaluators.get.called, "Should check if evaluator exists" - # Verify check happened - assert mock_get.called, "Should check if evaluator exists" + # Verify delete happened (since evaluator existed) + assert mock_client.evaluators.delete.called, "Should delete existing evaluator" - # Verify delete happened (since mock_get returned 200) - assert mock_delete.called, "Should delete existing evaluator" + # Verify create happened after delete + assert mock_client.evaluators.create.called, "Should create evaluator after delete" finally: os.chdir(original_cwd) diff --git a/tests/test_evaluation.py b/tests/test_evaluation.py index e251d6d9..942c1962 100644 --- a/tests/test_evaluation.py +++ b/tests/test_evaluation.py @@ -1,15 +1,9 @@ -import json import os import shutil import tempfile -from pathlib import Path from unittest.mock import MagicMock, patch -import pytest -import requests - -from eval_protocol.evaluation import Evaluator, create_evaluation, preview_evaluation -from eval_protocol.models import MetricResult +from eval_protocol.evaluation import create_evaluation def create_test_folder(): @@ -37,299 +31,6 @@ def evaluate(messages, original_messages=None, tools=None, **kwargs): return tmp_dir -def create_sample_file(): - fd, path = tempfile.mkstemp(suffix=".jsonl") - samples = [ - { - "messages": [ - {"role": "user", "content": "Hello"}, - {"role": "assistant", "content": "Hi there! 
How can I help you today?"}, - ] - }, - { - "messages": [ - {"role": "user", "content": "What is AI?"}, - { - "role": "assistant", - "content": "AI stands for Artificial Intelligence.", - }, - ], - "original_messages": [ - {"role": "user", "content": "What is AI?"}, - { - "role": "assistant", - "content": "AI stands for Artificial Intelligence.", - }, - ], - "tools": [ - { - "type": "function", - "function": { - "name": "search", - "description": "Search for information", - }, - } - ], - }, - ] - with os.fdopen(fd, "w") as f: - for sample in samples: - f.write(json.dumps(sample) + "\n") - return path - - -def test_evaluator_load_metric_folder(): - tmp_dir = create_test_folder() - try: - evaluator = Evaluator() - files = evaluator.load_metric_folder("test_metric", tmp_dir) - assert "main.py" in files - assert "test_metric" in evaluator.metric_folders - assert "test_metric/main.py" in evaluator.code_files - assert "evaluate" in evaluator.code_files["test_metric/main.py"] - finally: - shutil.rmtree(tmp_dir, ignore_errors=True) - - -def test_evaluator_load_multi_metrics_folder(): - tmp_dir = create_test_folder() - try: - evaluator = Evaluator(multi_metrics=True) - files = evaluator.load_multi_metrics_folder(tmp_dir) - assert "main.py" in files - assert "main.py" in evaluator.code_files - assert "evaluate" in evaluator.code_files["main.py"] - finally: - shutil.rmtree(tmp_dir, ignore_errors=True) - - -def test_evaluator_update_evaluate_signature(): - evaluator = Evaluator() - old_code = """ -def evaluate(entry): - messages = entry.get('messages', []) - if not messages: return {'score': 0.0, 'reason': 'No messages found'} - last_message = messages[-1] - content = last_message.get('content', '') - word_count = len(content.split()) - score = min(word_count / 100, 1.0) - return {'score': score, 'reason': f'Word count: {word_count}'} - """ - updated_code = evaluator._update_evaluate_signature(old_code) - assert ( - "def evaluate(messages, ground_truth: Optional[Union[str, List[Dict[str, Any]]]] = None, tools=None, **kwargs)" - in updated_code - ) - # The "entry = {" line is no longer part of the compatibility layer for the old_pattern. - # The compatibility layer now focuses on handling ground_truth. 
- assert ( - "if ground_truth is None: # Default ground_truth from messages if not provided" in updated_code - ) # Check for new compat layer logic - new_code = """ -def evaluate(messages, ground_truth: Optional[Union[str, List[Dict[str, Any]]]] = None, tools=None, **kwargs): - if not messages: return {'score': 0.0, 'reason': 'No messages found'} - last_message = messages[-1] - content = last_message.get('content', '') - word_count = len(content.split()) - score = min(word_count / 100, 1.0) - return {'score': score, 'reason': f'Word count: {word_count}'} - """ - unchanged_code = evaluator._update_evaluate_signature(new_code) - assert new_code == unchanged_code - - -@patch("eval_protocol.evaluation.requests.post") -def test_evaluator_preview(mock_requests_post, monkeypatch): - mock_response = MagicMock() - mock_response.status_code = 200 - mock_response.json.return_value = { - "totalSamples": 2, - "totalRuntimeMs": 123, - "results": [ - { - "index": 0, - "success": True, - "score": 0.5, - "reason": "Reason 1", - "perMetricEvals": { - "test_metric": MetricResult(score=0.5, reason="Metric reason 1", is_score_valid=True).model_dump() - }, - }, - { - "index": 1, - "success": True, - "score": 0.8, - "reason": "Reason 2", - "perMetricEvals": { - "test_metric": MetricResult(score=0.8, reason="Metric reason 2", is_score_valid=True).model_dump() - }, - }, - ], - } - mock_requests_post.return_value = mock_response - - monkeypatch.setenv("FIREWORKS_API_KEY", "test_preview_api_key") - monkeypatch.setattr("eval_protocol.evaluation.get_fireworks_account_id", lambda: "test_preview_account") - # Using a mock API base to prevent real calls - monkeypatch.setenv("FIREWORKS_API_BASE", "http://mock-api-server") # Changed to avoid actual localhost call - - # Mock requests.post for the preview call - class MockResponsePreview: - def __init__(self, json_data, status_code=200): - self.json_data = json_data - self.status_code = status_code - self.text = json.dumps(json_data) - - def json(self): - return self.json_data - - def raise_for_status(self): - if self.status_code != 200: - raise requests.exceptions.HTTPError(f"Mock API Error: {self.status_code}") - - def mock_post_preview(*args, **kwargs): - expected_url_preview = "http://mock-api-server/v1/accounts/test_preview_account/evaluators:previewEvaluator" - if args[0] == expected_url_preview: - # Simulate a successful preview API response - return MockResponsePreview( - { - "totalSamples": 2, - "totalRuntimeMs": 150, # Example runtime - "results": [ - { - "success": True, - "score": 0.75, - "perMetricEvals": {"test_metric": 0.75}, - }, - { - "success": True, - "score": 0.85, - "perMetricEvals": {"test_metric": 0.85}, - }, - ], - } - ) - # Fallback for other URLs if any, though not expected in this test - return MockResponsePreview({"error": "Unexpected URL"}, 404) - - monkeypatch.setattr("requests.post", mock_post_preview) - - tmp_dir = create_test_folder() - sample_file = create_sample_file() - try: - evaluator = Evaluator() - evaluator.load_metric_folder("test_metric", tmp_dir) - preview_result = evaluator.preview(sample_file, max_samples=2) - assert preview_result.total_samples == 2 - assert preview_result.total_runtime_ms >= 0 - assert len(preview_result.results) == 2 - assert preview_result.results[0].index == 0 - assert preview_result.results[0].success is True - assert hasattr(preview_result.results[0], "score") - assert preview_result.results[0].score == 0.75 - assert hasattr(preview_result.results[0], "per_metric_evals") # Attribute name in Python object - 
assert "test_metric" in preview_result.results[0].per_metric_evals - finally: - shutil.rmtree(tmp_dir, ignore_errors=True) - os.unlink(sample_file) - - -@patch("eval_protocol.evaluation.requests.post") -def test_preview_evaluation_helper(mock_requests_post, monkeypatch): - mock_response = MagicMock() - mock_response.status_code = 200 - mock_response.json.return_value = { - "totalSamples": 2, - "totalRuntimeMs": 100, - "results": [ - { - "index": 0, - "success": True, - "score": 0.6, - "reason": "Helper Reason 1", - "perMetricEvals": { - "test_metric": MetricResult( - score=0.6, reason="Helper Metric reason 1", is_score_valid=True - ).model_dump() - }, - }, - { - "index": 1, - "success": True, - "score": 0.7, - "reason": "Helper Reason 2", - "perMetricEvals": { - "test_metric": MetricResult( - score=0.7, reason="Helper Metric reason 2", is_score_valid=True - ).model_dump() - }, - }, - ], - } - mock_requests_post.return_value = mock_response - - monkeypatch.setenv("FIREWORKS_API_KEY", "test_helper_api_key") - monkeypatch.setattr("eval_protocol.evaluation.get_fireworks_account_id", lambda: "test_helper_account") - # Using a mock API base to prevent real calls - monkeypatch.setenv("FIREWORKS_API_BASE", "http://mock-api-server-helper") # Changed - - # Mock requests.post for the preview_evaluation helper call - class MockResponseHelperPreview: # Renamed to avoid conflict if in same scope, though not strictly necessary here - def __init__(self, json_data, status_code=200): - self.json_data = json_data - self.status_code = status_code - self.text = json.dumps(json_data) - - def json(self): - return self.json_data - - def raise_for_status(self): - if self.status_code != 200: - raise requests.exceptions.HTTPError(f"Mock API Error: {self.status_code}") - - def mock_post_helper_preview(*args, **kwargs): - expected_url_helper_preview = ( - "http://mock-api-server-helper/v1/accounts/test_helper_account/evaluators:previewEvaluator" - ) - if args[0] == expected_url_helper_preview: - return MockResponseHelperPreview( - { - "totalSamples": 2, - "totalRuntimeMs": 160, - "results": [ - { - "success": True, - "score": 0.65, - "perMetricEvals": {"test_metric": 0.65}, - }, - { - "success": True, - "score": 0.70, - "perMetricEvals": {"test_metric": 0.70}, - }, - ], - } - ) - return MockResponseHelperPreview({"error": "Unexpected URL for helper"}, 404) - - monkeypatch.setattr("requests.post", mock_post_helper_preview) - - tmp_dir = create_test_folder() - sample_file = create_sample_file() - try: - preview_result = preview_evaluation( - metric_folders=[f"test_metric={tmp_dir}"], - sample_file=sample_file, - max_samples=2, - ) - assert preview_result.total_samples == 2 - assert len(preview_result.results) == 2 - assert preview_result.results[0].score == 0.65 - finally: - shutil.rmtree(tmp_dir, ignore_errors=True) - os.unlink(sample_file) - - def test_create_evaluation_helper(monkeypatch): tmp_dir = create_test_folder() monkeypatch.setenv("FIREWORKS_API_KEY", "test_api_key") @@ -338,94 +39,80 @@ def test_create_evaluation_helper(monkeypatch): original_cwd = os.getcwd() - class MockResponse: - def __init__(self, json_data, status_code=200): - self.json_data = json_data - self.status_code = status_code - self.text = json.dumps(json_data) - - def json(self): - return self.json_data - - def raise_for_status(self): # pragma: no cover - if self.status_code != 200: - raise Exception("API Error") - + # Track SDK calls create_called = False upload_endpoint_called = False validate_called = False - def mock_post(*args, 
**kwargs): - nonlocal create_called, upload_endpoint_called, validate_called - url = args[0] - payload = kwargs.get("json", {}) - - # Handle different endpoints in the upload flow - if "getUploadEndpoint" in url: - upload_endpoint_called = True - # Dynamically create signed URLs for whatever filenames are requested - filename_to_size = payload.get("filename_to_size", {}) - signed_urls = {} - for filename in filename_to_size.keys(): - signed_urls[filename] = f"https://storage.googleapis.com/test-bucket/{filename}?signed=true" - return MockResponse({"filenameToSignedUrls": signed_urls}) - elif "validateUpload" in url: - validate_called = True - return MockResponse({"success": True, "valid": True}) - else: - # Create evaluator endpoint - create_called = True - assert "evaluator" in payload - assert "evaluatorId" in payload - evaluator_data = payload["evaluator"] - assert "criteria" in evaluator_data - criteria = evaluator_data["criteria"] - assert len(criteria) > 0 - criterion = criteria[0] - assert criterion["type"] == "CODE_SNIPPETS" - # Code is now uploaded as tar.gz, not in criteria - - return MockResponse( - { - "name": "accounts/test_account/evaluators/test-eval", - "displayName": "Test Evaluator", - "description": "Test description", - "multiMetrics": False, - } - ) - - # Mock GCS upload - from unittest.mock import MagicMock - + # Mock the Fireworks SDK client methods + mock_evaluator_result = MagicMock() + mock_evaluator_result.name = "accounts/test_account/evaluators/test-eval" + mock_evaluator_result.display_name = "Test Evaluator" + mock_evaluator_result.description = "Test description" + + def mock_create(evaluator_id, evaluator): + nonlocal create_called + create_called = True + # Verify the evaluator params + assert evaluator_id == "test-eval" + assert "display_name" in evaluator + assert evaluator["display_name"] == "Test Evaluator" + assert "description" in evaluator + assert evaluator["description"] == "Test description" + return mock_evaluator_result + + def mock_get_upload_endpoint(evaluator_id, filename_to_size): + nonlocal upload_endpoint_called + upload_endpoint_called = True + mock_response = MagicMock() + signed_urls = {} + for filename in filename_to_size.keys(): + signed_urls[filename] = f"https://storage.googleapis.com/test-bucket/{filename}?signed=true" + mock_response.filename_to_signed_urls = signed_urls + return mock_response + + def mock_validate_upload(evaluator_id, body): + nonlocal validate_called + validate_called = True + return MagicMock() + + # Mock GCS upload (still uses requests.Session) mock_session = MagicMock() mock_gcs_response = MagicMock() mock_gcs_response.status_code = 200 mock_gcs_response.raise_for_status = MagicMock() mock_session.send.return_value = mock_gcs_response - monkeypatch.setattr("requests.post", mock_post) - monkeypatch.setattr("requests.Session", lambda: mock_session) - - try: - os.chdir(tmp_dir) - api_response = create_evaluation( - evaluator_id="test-eval", - metric_folders=[f"test_metric={tmp_dir}"], - display_name="Test Evaluator", - description="Test description", - ) + # Patch the Fireworks client + with patch("eval_protocol.evaluation.Fireworks") as mock_fireworks_class: + mock_client = MagicMock() + mock_fireworks_class.return_value = mock_client + mock_client.evaluators.create = mock_create + mock_client.evaluators.get_upload_endpoint = mock_get_upload_endpoint + mock_client.evaluators.validate_upload = mock_validate_upload + + # Patch requests.Session for GCS upload + monkeypatch.setattr("requests.Session", lambda: 
mock_session) + + try: + os.chdir(tmp_dir) + api_response = create_evaluation( + evaluator_id="test-eval", + display_name="Test Evaluator", + description="Test description", + ) - # Verify response - assert api_response["name"] == "accounts/test_account/evaluators/test-eval" - assert api_response["displayName"] == "Test Evaluator" - assert api_response["description"] == "Test description" + # Verify response (SDK returns an object, not dict) + assert api_response.name == "accounts/test_account/evaluators/test-eval" + assert api_response.display_name == "Test Evaluator" + assert api_response.description == "Test description" - # Verify full upload flow was executed - assert create_called, "Create endpoint should be called" - assert upload_endpoint_called, "GetUploadEndpoint should be called" - assert validate_called, "ValidateUpload should be called" - assert mock_session.send.called, "GCS upload should happen" + # Verify full upload flow was executed + assert create_called, "Create endpoint should be called" + assert upload_endpoint_called, "GetUploadEndpoint should be called" + assert validate_called, "ValidateUpload should be called" + assert mock_session.send.called, "GCS upload should happen" - finally: - os.chdir(original_cwd) - shutil.rmtree(tmp_dir, ignore_errors=True) + finally: + os.chdir(original_cwd) + shutil.rmtree(tmp_dir, ignore_errors=True) diff --git a/tests/test_evaluation_integration.py b/tests/test_evaluation_integration.py deleted file mode 100644 index de97000b..00000000 --- a/tests/test_evaluation_integration.py +++ /dev/null @@ -1,365 +0,0 @@ -import json -import os -import shutil -import tempfile -from pathlib import Path -from unittest.mock import MagicMock, patch - -import pytest - -from eval_protocol.evaluation import Evaluator, create_evaluation, preview_evaluation - - -def create_test_folder(): - """Create a temporary folder with a main.py file for testing""" - tmp_dir = tempfile.mkdtemp() - with open(os.path.join(tmp_dir, "main.py"), "w") as f: - f.write( - """ -def evaluate(messages, ground_truth=None, tools=None, **kwargs): # Changed original_messages to ground_truth - if not messages: - return {'score': 0.0, 'reason': 'No messages found'} - last_message = messages[-1] - content = last_message.get('content', '') - word_count = len(content.split()) - score = min(word_count / 100, 1.0) - return { - 'score': score, - 'reason': f'Word count: {word_count}' - } -""" - ) - # Create requirements.txt (required for upload) - with open(os.path.join(tmp_dir, "requirements.txt"), "w") as f: - f.write("eval-protocol>=0.1.0\n") - return tmp_dir - - -def create_sample_file(): - fd, path = tempfile.mkstemp(suffix=".jsonl") - samples = [ - { - "messages": [ - {"role": "user", "content": "Hello"}, - {"role": "assistant", "content": "Hi there! 
How can I help you today?"}, - ] - }, - { - "messages": [ - {"role": "user", "content": "What is AI?"}, - { - "role": "assistant", - "content": "AI stands for Artificial Intelligence.", - }, - ], - "tools": [ - { - "type": "function", - "function": { - "name": "search", - "description": "Search for information", - }, - } - ], - }, - ] - with os.fdopen(fd, "w") as f: - for sample in samples: - f.write(json.dumps(sample) + "\n") - return path - - -@pytest.fixture -def mock_env_variables(monkeypatch): - monkeypatch.setenv("FIREWORKS_API_KEY", "test_api_key") - monkeypatch.setenv("FIREWORKS_API_BASE", "https://api.fireworks.ai") - monkeypatch.setattr("eval_protocol.evaluation.get_fireworks_account_id", lambda: "test_account") - - -@pytest.fixture -def mock_requests_get(): - """Mock requests.get for force flow check""" - with patch("requests.get") as mock_get: - mock_get.return_value.status_code = 404 # Evaluator doesn't exist - mock_get.return_value.raise_for_status = MagicMock() - yield mock_get - - -@pytest.fixture -def mock_requests_delete(): - """Mock requests.delete for force flow""" - with patch("requests.delete") as mock_delete: - mock_delete.return_value.status_code = 200 - mock_delete.return_value.raise_for_status = MagicMock() - yield mock_delete - - -@pytest.fixture -def mock_gcs_upload(): - """Mock the GCS upload via requests.Session""" - with patch("requests.Session") as mock_session_class: - mock_session = MagicMock() - mock_session_class.return_value = mock_session - - # Mock successful GCS upload - mock_gcs_response = MagicMock() - mock_gcs_response.status_code = 200 - mock_gcs_response.raise_for_status = MagicMock() - mock_session.send.return_value = mock_gcs_response - - yield mock_session - - -@pytest.fixture -def mock_requests_post(): - with patch("requests.post") as mock_post: - default_response = { - "name": "accounts/test_account/evaluators/test-eval", - "displayName": "Test Evaluator", - "description": "Test description", - "multiMetrics": False, - } - preview_response = { - "totalSamples": 2, - "totalRuntimeMs": 1234, - "results": [ - { - "success": True, - "score": 0.7, - "perMetricEvals": {"quality": 0.8, "relevance": 0.7, "safety": 0.9}, - }, - { - "success": True, - "score": 0.5, - "perMetricEvals": {"quality": 0.6, "relevance": 0.4, "safety": 0.8}, - }, - ], - } - validate_response = {"success": True, "valid": True} - - def side_effect(*args, **kwargs): - url = args[0] - payload = kwargs.get("json", {}) - response = mock_post.return_value - if "previewEvaluator" in url: - response.json.return_value = preview_response - elif "getUploadEndpoint" in url: - # Dynamically create signed URLs for whatever filenames are requested - filename_to_size = payload.get("filename_to_size", {}) - signed_urls = {} - for filename in filename_to_size.keys(): - signed_urls[filename] = f"https://storage.googleapis.com/test-bucket/{filename}?signed=true" - response.json.return_value = {"filenameToSignedUrls": signed_urls} - elif "validateUpload" in url: - response.json.return_value = validate_response - else: - response.json.return_value = default_response - return response - - mock_post.side_effect = side_effect - mock_post.return_value.status_code = 200 - mock_post.return_value.json.return_value = default_response - mock_post.return_value.raise_for_status = MagicMock() - yield mock_post - - -def test_integration_single_metric(mock_env_variables, mock_requests_post, mock_gcs_upload): - tmp_dir = create_test_folder() - sample_file = create_sample_file() - original_cwd = os.getcwd() - 
try: - os.chdir(tmp_dir) - preview_result = preview_evaluation( - metric_folders=[f"test_metric={tmp_dir}"], - sample_file=sample_file, - max_samples=2, - ) - assert preview_result.total_samples == 2 - assert len(preview_result.results) == 2 - evaluator = create_evaluation( - evaluator_id="test-eval", - metric_folders=[f"test_metric={tmp_dir}"], - display_name="Test Evaluator", - description="Test description", - ) - assert evaluator["name"] == "accounts/test_account/evaluators/test-eval" - assert evaluator["displayName"] == "Test Evaluator" - - # Verify all API calls in the new upload flow - post_calls = [call[0][0] for call in mock_requests_post.call_args_list] - - # 1. Create evaluator call (V2 endpoint) - assert any("evaluatorsV2" in url for url in post_calls), "Should call V2 create endpoint" - - # 2. Get upload endpoint call - assert any("getUploadEndpoint" in url for url in post_calls), "Should call getUploadEndpoint" - - # 3. Validate upload call - assert any("validateUpload" in url for url in post_calls), "Should call validateUpload" - - # 4. Verify GCS upload happened - assert mock_gcs_upload.send.called, "Should upload tar.gz to GCS" - gcs_request = mock_gcs_upload.send.call_args[0][0] - assert gcs_request.method == "PUT", "GCS upload should use PUT" - assert "storage.googleapis.com" in gcs_request.url, "Should upload to GCS" - - # Verify create payload structure - create_call_payload = None - for call in mock_requests_post.call_args_list: - url = call[0][0] - if "evaluatorsV2" in url: - create_call_payload = call[1].get("json") - break - - assert create_call_payload is not None, "Should have create payload" - assert "evaluator" in create_call_payload - assert "evaluatorId" in create_call_payload and create_call_payload["evaluatorId"] == "test-eval" - assert "criteria" in create_call_payload["evaluator"] - assert len(create_call_payload["evaluator"]["criteria"]) > 0 - assert create_call_payload["evaluator"]["criteria"][0]["type"] == "CODE_SNIPPETS" - finally: - os.chdir(original_cwd) - shutil.rmtree(tmp_dir, ignore_errors=True) - os.unlink(sample_file) - - -def test_integration_multi_metrics(mock_env_variables, mock_requests_post, mock_gcs_upload): - tmp_dir = create_test_folder() - sample_file = create_sample_file() - original_cwd = os.getcwd() - try: - os.chdir(tmp_dir) - preview_result = preview_evaluation(multi_metrics=True, folder=tmp_dir, sample_file=sample_file, max_samples=2) - assert preview_result.total_samples == 2 - assert len(preview_result.results) == 2 - assert hasattr(preview_result.results[0], "per_metric_evals") - assert "quality" in preview_result.results[0].per_metric_evals - mock_requests_post.reset_mock() - mock_requests_post.return_value.json.return_value = { - "name": "accounts/test_account/evaluators/test-eval", - "displayName": "Multi Metrics Evaluator", - "description": "Test multi-metrics evaluator", - "multiMetrics": True, - } - evaluator = create_evaluation( - evaluator_id="multi-metrics-eval", - multi_metrics=True, - folder=tmp_dir, - display_name="Multi Metrics Evaluator", - description="Test multi-metrics evaluator", - ) - assert evaluator["name"] == "accounts/test_account/evaluators/test-eval" - - # Verify all API calls in the new upload flow - post_calls = [call[0][0] for call in mock_requests_post.call_args_list] - assert any("evaluatorsV2" in url for url in post_calls), "Should call V2 create endpoint" - assert any("getUploadEndpoint" in url for url in post_calls), "Should call getUploadEndpoint" - assert any("validateUpload" in url for url 
in post_calls), "Should call validateUpload" - - # Verify GCS upload happened - assert mock_gcs_upload.send.called, "Should upload tar.gz to GCS" - - # Verify create payload uses V2 format - create_call_payload = None - for call in mock_requests_post.call_args_list: - url = call[0][0] - if "evaluatorsV2" in url: - create_call_payload = call[1].get("json") - break - - assert create_call_payload is not None - assert "evaluator" in create_call_payload - assert create_call_payload["evaluatorId"] == "multi-metrics-eval" - assert create_call_payload["evaluator"]["multiMetrics"] is True - finally: - import shutil - - os.chdir(original_cwd) - shutil.rmtree(tmp_dir, ignore_errors=True) - os.unlink(sample_file) - - -@patch("sys.exit") -def test_integration_cli_commands(mock_sys_exit, mock_env_variables, mock_requests_post): # Corrected parameter name - from eval_protocol.cli import deploy_command, preview_command - - mock_sys_exit.side_effect = lambda code=0: None - - tmp_dir = create_test_folder() - sample_file = create_sample_file() - original_cwd = os.getcwd() - try: - os.chdir(tmp_dir) - # Test preview command - with patch("eval_protocol.cli_commands.preview.preview_evaluation") as mock_preview_eval_func: - mock_preview_result = MagicMock() - mock_preview_result.display = MagicMock() - mock_preview_eval_func.return_value = mock_preview_result - args = MagicMock() - args.metrics_folders = [f"test_metric={tmp_dir}"] - args.samples = sample_file - args.max_samples = 2 - args.huggingface_dataset = None - args.huggingface_split = "train" - args.huggingface_prompt_key = "prompt" - args.huggingface_response_key = "response" - args.huggingface_key_map = None - args.remote_url = None # Explicitly set for local path - - with patch("eval_protocol.cli_commands.preview.Path.exists", return_value=True): - result = preview_command(args) - assert result == 0 - mock_preview_eval_func.assert_called_once_with( - metric_folders=[f"test_metric={tmp_dir}"], - sample_file=sample_file, - max_samples=2, - huggingface_dataset=None, - huggingface_split="train", - huggingface_prompt_key="prompt", - huggingface_response_key="response", - huggingface_message_key_map=None, - ) - mock_preview_result.display.assert_called_once() - - # Test deploy command - with patch("eval_protocol.cli_commands.deploy.create_evaluation") as mock_create_eval_func: - mock_create_eval_func.return_value = { - "name": "accounts/test_account/evaluators/test-eval", - "displayName": "Test Evaluator", - "description": "Test description", - "multiMetrics": False, - } - args = MagicMock() - args.metrics_folders = [f"test_metric={tmp_dir}"] - args.id = "test-eval" - args.display_name = "Test Evaluator" - args.description = "Test description" - args.force = False - args.huggingface_dataset = None - args.huggingface_split = "train" - args.huggingface_prompt_key = "prompt" - args.huggingface_response_key = "response" - args.huggingface_key_map = None - args.remote_url = None # Explicitly set for local path - args.target = "fireworks" # Explicitly set target for this test path - - result = deploy_command(args) - assert result == 0 - mock_create_eval_func.assert_called_once_with( - evaluator_id="test-eval", - metric_folders=[f"test_metric={tmp_dir}"], - display_name="Test Evaluator", - description="Test description", - force=False, - huggingface_dataset=None, - huggingface_split="train", - huggingface_message_key_map=None, - huggingface_prompt_key="prompt", - huggingface_response_key="response", - ) - finally: - os.chdir(original_cwd) - import shutil - - 
shutil.rmtree(tmp_dir, ignore_errors=True) - os.unlink(sample_file) diff --git a/tests/test_evaluation_preview_integration.py b/tests/test_evaluation_preview_integration.py deleted file mode 100644 index d7b3b266..00000000 --- a/tests/test_evaluation_preview_integration.py +++ /dev/null @@ -1,470 +0,0 @@ -import importlib.util -import json -import os -import sys -import tempfile -from pathlib import Path -from unittest.mock import MagicMock, patch - -import pytest - - -# Load the evaluation_preview_example module directly from the examples folder -def load_module_from_path(name, path): - spec = importlib.util.spec_from_file_location(name, path) - if spec is None: - raise ImportError(f"Could not load spec for module {name} from {path}") - module = importlib.util.module_from_spec(spec) - if spec.loader is None: - raise ImportError(f"Spec for module {name} has no loader") - spec.loader.exec_module(module) - return module - - -@pytest.fixture -def evaluation_preview_example(): - # Path to the evaluation_preview_example.py file - file_path = os.path.join( - os.path.dirname(os.path.dirname(__file__)), - "examples", - "evaluation_preview_example.py", - ) - - # Load the module - return load_module_from_path("evaluation_preview_example", file_path) - - -@pytest.fixture -def mock_env_variables(monkeypatch): - """Set environment variables for testing""" - monkeypatch.setenv("FIREWORKS_API_KEY", "test_api_key") - monkeypatch.setenv("FIREWORKS_API_BASE", "https://api.fireworks.ai") - monkeypatch.setattr("eval_protocol.evaluation.get_fireworks_account_id", lambda: "test_account") - - -@pytest.fixture -def mock_preview_api(): - """Mock the preview API calls""" - with patch("requests.post") as mock_post: - # Set up mock for preview API - preview_response = { - "totalSamples": 2, - "totalRuntimeMs": 1234, - "results": [ - { - "success": True, - "score": 0.26, - "perMetricEvals": { - "word_count": { - "score": 0.26, - "reason": "Word count: 26", - } - }, - }, - { - "success": True, - "score": 0.22, - "perMetricEvals": { - "word_count": { - "score": 0.22, - "reason": "Word count: 22", - } - }, - }, - ], - } - - mock_post.return_value = MagicMock() - mock_post.return_value.status_code = 200 - mock_post.return_value.json.return_value = preview_response - - yield mock_post - - -@pytest.fixture -def mock_create_api(): - """Mock the create API calls""" - with patch("requests.post") as mock_post: - create_response = { - "name": "accounts/test_account/evaluators/word-count-eval", - "displayName": "Word Count Evaluator", - "description": "Evaluates responses based on word count", - } - - def side_effect(*args, **kwargs): - url = args[0] - payload = kwargs.get("json", {}) - response = mock_post.return_value - - if "getUploadEndpoint" in url: - # Return signed URL for upload - filename_to_size = payload.get("filename_to_size", {}) - signed_urls = {} - for filename in filename_to_size.keys(): - signed_urls[filename] = f"https://storage.googleapis.com/test-bucket/{filename}?signed=true" - response.json.return_value = {"filenameToSignedUrls": signed_urls} - elif "validateUpload" in url: - response.json.return_value = {"success": True, "valid": True} - else: - response.json.return_value = create_response - - response.status_code = 200 - return response - - mock_post.side_effect = side_effect - mock_post.return_value = MagicMock() - mock_post.return_value.status_code = 200 - mock_post.return_value.json.return_value = create_response - mock_post.return_value.raise_for_status = MagicMock() - - yield mock_post - - 
-@pytest.fixture -def mock_gcs_upload(): - """Mock the GCS upload via requests.Session""" - with patch("requests.Session") as mock_session_class: - mock_session = MagicMock() - mock_session_class.return_value = mock_session - - # Mock successful GCS upload - mock_gcs_response = MagicMock() - mock_gcs_response.status_code = 200 - mock_gcs_response.raise_for_status = MagicMock() - mock_session.send.return_value = mock_gcs_response - - yield mock_session - - -@pytest.fixture -def mock_word_count_metric(): - """Create a temporary directory with a word count metric""" - tmp_dir = tempfile.mkdtemp() - - # Create the metrics/word_count directory - os.makedirs(os.path.join(tmp_dir, "metrics", "word_count"), exist_ok=True) - - # Create main.py in the word_count directory - with open(os.path.join(tmp_dir, "metrics", "word_count", "main.py"), "w") as f: - f.write( - """ -def evaluate(messages, ground_truth=None, tools=None, **kwargs): - if not messages: - return {'score': 0.0, 'reason': 'No messages found'} - - last_message = messages[-1] - content = last_message.get('content', '') - - word_count = len(content.split()) - score = min(word_count / 100, 1.0) - - return { - 'score': score, - 'reason': f'Word count: {word_count}' - } -""" - ) - - # Create a samples directory and sample file - os.makedirs(os.path.join(tmp_dir, "samples"), exist_ok=True) - - # Create a sample file - with open(os.path.join(tmp_dir, "samples", "samples.jsonl"), "w") as f: - f.write( - json.dumps( - { - "messages": [ - {"role": "user", "content": "Hello"}, - { - "role": "assistant", - "content": "Hi there! How can I help you today?", - }, - ] - } - ) - + "\n" - ) - f.write( - json.dumps( - { - "messages": [ - {"role": "user", "content": "What is AI?"}, - { - "role": "assistant", - "content": "AI stands for Artificial Intelligence.", - }, - ] - } - ) - + "\n" - ) - - yield tmp_dir - - # Clean up - import shutil - - shutil.rmtree(tmp_dir) - - -def test_preview_evaluation(mock_env_variables, mock_preview_api, monkeypatch): - """Test the preview_evaluation function in isolation""" - from eval_protocol.evaluation import preview_evaluation - - # Create a temporary directory for the test - with tempfile.TemporaryDirectory() as tmp_dir: - # Create a metrics directory with word_count - os.makedirs(os.path.join(tmp_dir, "word_count"), exist_ok=True) - - # Create main.py in the word_count directory - with open(os.path.join(tmp_dir, "word_count", "main.py"), "w") as f: - f.write( - """ -def evaluate(messages, ground_truth=None, tools=None, **kwargs): - if not messages: - return {'score': 0.0, 'reason': 'No messages found'} - - last_message = messages[-1] - content = last_message.get('content', '') - - word_count = len(content.split()) - score = min(word_count / 100, 1.0) - - return { - 'score': score, - 'reason': f'Word count: {word_count}' - } -""" - ) - - # Create a temporary sample file - sample_fd, sample_path = tempfile.mkstemp(suffix=".jsonl") - with os.fdopen(sample_fd, "w") as f: - f.write( - json.dumps( - { - "messages": [ - {"role": "user", "content": "Hello"}, - { - "role": "assistant", - "content": "Hi there! 
How can I help you today?", - }, - ] - } - ) - + "\n" - ) - f.write( - json.dumps( - { - "messages": [ - {"role": "user", "content": "What is AI?"}, - { - "role": "assistant", - "content": "AI stands for Artificial Intelligence.", - }, - ] - } - ) - + "\n" - ) - - # Set used_preview_api flag to simulate successful preview - import eval_protocol.evaluation - - eval_protocol.evaluation.used_preview_api = True - - # Call preview_evaluation - result = preview_evaluation( - metric_folders=[f"word_count={os.path.join(tmp_dir, 'word_count')}"], - sample_file=sample_path, - max_samples=2, - ) - - # Clean up - os.unlink(sample_path) - - # Verify results - assert result.total_samples == 2 - assert len(result.results) == 2 - # Assuming result.results[0] is an object, use attribute access - assert result.results[0].score == 0.26 - assert hasattr(result.results[0], "per_metric_evals") - assert "word_count" in result.results[0].per_metric_evals - - -def test_create_evaluation(mock_env_variables, mock_create_api, mock_gcs_upload, monkeypatch): - """Test the create_evaluation function in isolation""" - from eval_protocol.evaluation import create_evaluation - - # Create a temporary directory for the test - with tempfile.TemporaryDirectory() as tmp_dir: - # Create a metrics directory with word_count - os.makedirs(os.path.join(tmp_dir, "word_count"), exist_ok=True) - - # Create main.py in the word_count directory - with open(os.path.join(tmp_dir, "word_count", "main.py"), "w") as f: - f.write( - """ -def evaluate(messages, ground_truth=None, tools=None, **kwargs): - if not messages: - return {'score': 0.0, 'reason': 'No messages found'} - - last_message = messages[-1] - content = last_message.get('content', '') - - word_count = len(content.split()) - score = min(word_count / 100, 1.0) - - return { - 'score': score, - 'reason': f'Word count: {word_count}' - } -""" - ) - - # Create requirements.txt - with open(os.path.join(tmp_dir, "requirements.txt"), "w") as f: - f.write("eval-protocol>=0.1.0\n") - - # Change to temp directory - original_cwd = os.getcwd() - os.chdir(tmp_dir) - - try: - # Call create_evaluation - result = create_evaluation( - evaluator_id="word-count-eval", - metric_folders=[f"word_count={os.path.join(tmp_dir, 'word_count')}"], - display_name="Word Count Evaluator", - description="Evaluates responses based on word count", - force=True, - ) - - # Verify results - assert result["name"] == "accounts/test_account/evaluators/word-count-eval" - assert result["displayName"] == "Word Count Evaluator" - assert result["description"] == "Evaluates responses based on word count" - finally: - os.chdir(original_cwd) - - -def test_preview_then_create(monkeypatch, mock_env_variables, mock_preview_api, mock_create_api, mock_gcs_upload): - """Test the full example flow (simulated)""" - # Patch input to always return 'y' - monkeypatch.setattr("builtins.input", lambda _: "y") - - with tempfile.TemporaryDirectory() as tmp_dir: - # Create a metrics directory with word_count - os.makedirs(os.path.join(tmp_dir, "word_count"), exist_ok=True) - - # Create main.py in the word_count directory - with open(os.path.join(tmp_dir, "word_count", "main.py"), "w") as f: - f.write( - """ -def evaluate(messages, ground_truth=None, tools=None, **kwargs): - if not messages: - return {'score': 0.0, 'reason': 'No messages found'} - - last_message = messages[-1] - content = last_message.get('content', '') - - word_count = len(content.split()) - score = min(word_count / 100, 1.0) - - return { - 'score': score, - 'reason': f'Word count: 
{word_count}' - } -""" - ) - - # Create requirements.txt - with open(os.path.join(tmp_dir, "requirements.txt"), "w") as f: - f.write("eval-protocol>=0.1.0\n") - - # Create a temporary sample file - sample_fd, sample_path = tempfile.mkstemp(suffix=".jsonl") - with os.fdopen(sample_fd, "w") as f: - f.write( - json.dumps( - { - "messages": [ - {"role": "user", "content": "Hello"}, - { - "role": "assistant", - "content": "Hi there! How can I help you today?", - }, - ] - } - ) - + "\n" - ) - f.write( - json.dumps( - { - "messages": [ - {"role": "user", "content": "What is AI?"}, - { - "role": "assistant", - "content": "AI stands for Artificial Intelligence.", - }, - ] - } - ) - + "\n" - ) - - # Create a patched example module with modified paths - from eval_protocol.evaluation import create_evaluation, preview_evaluation - - # Change to temp directory - original_cwd = os.getcwd() - os.chdir(tmp_dir) - - try: - # Define a patched main function - def patched_main(): - # Preview the evaluation using metrics folder and samples file - print("Previewing evaluation...") - preview_result = preview_evaluation( - metric_folders=[f"word_count={os.path.join(tmp_dir, 'word_count')}"], - sample_file=sample_path, - max_samples=2, - ) - - preview_result.display() - - # Check if 'used_preview_api' attribute exists and is True - import eval_protocol.evaluation as evaluation_module - - # For testing, always assume the API was used successfully - evaluation_module.used_preview_api = True - - print("\nCreating evaluation...") - try: - evaluator = create_evaluation( - evaluator_id="word-count-eval", - metric_folders=[f"word_count={os.path.join(tmp_dir, 'word_count')}"], - display_name="Word Count Evaluator", - description="Evaluates responses based on word count", - force=True, - ) - print(f"Created evaluator: {evaluator['name']}") - return evaluator - except Exception as e: - print(f"Error creating evaluator: {str(e)}") - print("Make sure you have proper Fireworks API credentials set up.") - return None - - # Run the patched main function - result = patched_main() - - # Clean up - os.unlink(sample_path) - - # Verify the result - assert result is not None - assert result["name"] == "accounts/test_account/evaluators/word-count-eval" - finally: - os.chdir(original_cwd) diff --git a/uv.lock b/uv.lock index 972f90b1..917c1ff6 100644 --- a/uv.lock +++ b/uv.lock @@ -1309,7 +1309,7 @@ requires-dist = [ { name = "dspy", marker = "extra == 'dspy'", specifier = ">=3.0.0" }, { name = "e2b", marker = "extra == 'dev'" }, { name = "fastapi", specifier = ">=0.116.1" }, - { name = "fireworks-ai", specifier = "==1.0.0a18" }, + { name = "fireworks-ai", specifier = "==1.0.0a20" }, { name = "google-auth", marker = "extra == 'bigquery'", specifier = ">=2.0.0" }, { name = "google-cloud-bigquery", marker = "extra == 'bigquery'", specifier = ">=3.0.0" }, { name = "gymnasium", marker = "extra == 'dev'", specifier = ">=1.2.0" }, @@ -1578,7 +1578,7 @@ wheels = [ [[package]] name = "fireworks-ai" -version = "1.0.0a18" +version = "1.0.0a20" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, @@ -1590,9 +1590,9 @@ dependencies = [ { name = "sniffio" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/d5/c8/2d8454b01facbc3db73ec1a30d087ef2a3f6eee42b2817ca984cda5e789f/fireworks_ai-1.0.0a18.tar.gz", hash = "sha256:68a80a7ab15803a03cca96efc7078099e229901f867d43cbb463963ed2353ba2", size = 563750, upload-time = "2025-12-20T08:45:24.388Z" } +sdist = { url = 
"https://files.pythonhosted.org/packages/1d/c6/cdc6c152876ee1253491e6f72c65c2cdaf7b22b320be0cec7ac5778d3b1c/fireworks_ai-1.0.0a20.tar.gz", hash = "sha256:c84f702445679ea768461dba8fb027175b82255021832a89f9ece65821a2ab25", size = 564097, upload-time = "2025-12-23T19:21:17.891Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/49/b0/eaa7b865a99307e80fd6852348dc0aa1135c0e8c5cb91b050f93dde4919d/fireworks_ai-1.0.0a18-py3-none-any.whl", hash = "sha256:7f6b46a2b9928464bc515f3f68282a97e2df2f6e230ea1acc26d9c4d404a0f6e", size = 307477, upload-time = "2025-12-20T08:45:22.98Z" }, + { url = "https://files.pythonhosted.org/packages/c5/a4/e2bc9c4af291786bc7fe364ae63503ba2c8161c2e71223d570a77f0a1415/fireworks_ai-1.0.0a20-py3-none-any.whl", hash = "sha256:b5e199978f71b564b2e19cf55a71c1ac20906d9a7b4ae75135fdccb245227722", size = 304153, upload-time = "2025-12-23T19:21:15.943Z" }, ] [[package]]