117 changes: 62 additions & 55 deletions eval_protocol/cli_commands/upload.py
@@ -184,23 +184,32 @@ def upload_command(args: argparse.Namespace) -> int:
entries_arg = getattr(args, "entry", None)
non_interactive: bool = bool(getattr(args, "yes", False))
if entries_arg:
entries = [e.strip() for e in re.split(r"[,\s]+", entries_arg) if e.strip()]
selected_specs: list[tuple[str, str]] = []
for e in entries:
qualname, resolved_path = _resolve_entry_to_qual_and_source(e, root)
selected_specs.append((qualname, resolved_path))
# Only support single entry, not comma-separated values
entry = entries_arg.strip()
if "," in entry:
print("Error: --entry only supports uploading one evaluator at a time.")
print("Please specify a single entry in the format: module::function or path::function")
return 1
qualname, resolved_path = _resolve_entry_to_qual_and_source(entry, root)
selected_specs: list[tuple[str, str]] = [(qualname, resolved_path)]
else:
selected_tests = _discover_and_select_tests(root, non_interactive=non_interactive)
if not selected_tests:
return 1

# Enforce single selection
if len(selected_tests) > 1:
print(f"Error: Multiple tests selected ({len(selected_tests)}), but only one can be uploaded at a time.")
print("Please select exactly one test to upload.")
return 1

# Warn about parameterized tests
parameterized_tests = [t for t in selected_tests if t.has_parametrize]
if parameterized_tests:
print("\nNote: Parameterized tests will be uploaded as a single evaluator that")
if selected_tests[0].has_parametrize:
print("\nNote: This parameterized test will be uploaded as a single evaluator that")
print(" handles all parameter combinations. The evaluator will work with")
print(" the same logic regardless of which model/parameters are used.")

selected_specs = [(t.qualname, t.file_path) for t in selected_tests]
selected_specs = [(selected_tests[0].qualname, selected_tests[0].file_path)]

base_id = getattr(args, "id", None)
display_name = getattr(args, "display_name", None)
@@ -256,53 +265,51 @@ def upload_command(args: argparse.Namespace) -> int:
except Exception as e:
print(f"Warning: Skipped Fireworks secret registration due to error: {e}")

exit_code = 0
for i, (qualname, source_file_path) in enumerate(selected_specs):
# Generate a short default ID from just the test function name
if base_id:
evaluator_id = base_id
if len(selected_specs) > 1:
evaluator_id = f"{base_id}-{i + 1}"
# selected_specs is guaranteed to have exactly 1 item at this point
qualname, source_file_path = selected_specs[0]

# Generate evaluator ID
if base_id:
evaluator_id = base_id
else:
# Extract just the test function name from qualname
test_func_name = qualname.split(".")[-1]
# Extract source file name (e.g., "test_gpqa.py" -> "test_gpqa")
if source_file_path:
source_file_name = Path(source_file_path).stem
else:
# Extract just the test function name from qualname
test_func_name = qualname.split(".")[-1]
# Extract source file name (e.g., "test_gpqa.py" -> "test_gpqa")
if source_file_path:
source_file_name = Path(source_file_path).stem
else:
source_file_name = "eval"
# Create a shorter ID: filename-testname
evaluator_id = f"{source_file_name}-{test_func_name}"
source_file_name = "eval"
# Create a shorter ID: filename-testname
evaluator_id = f"{source_file_name}-{test_func_name}"

# Normalize the evaluator ID to meet Fireworks requirements
evaluator_id = _normalize_evaluator_id(evaluator_id)
# Normalize the evaluator ID to meet Fireworks requirements
evaluator_id = _normalize_evaluator_id(evaluator_id)

# Compute entry point metadata for backend as a pytest nodeid usable with `pytest <entrypoint>`
# Always prefer a path-based nodeid to work in plain pytest environments (server may not use --pyargs)
func_name = qualname.split(".")[-1]
entry_point = _build_entry_point(root, source_file_path, func_name)
# Compute entry point metadata for backend as a pytest nodeid usable with `pytest <entrypoint>`
# Always prefer a path-based nodeid to work in plain pytest environments (server may not use --pyargs)
func_name = qualname.split(".")[-1]
entry_point = _build_entry_point(root, source_file_path, func_name)

print(f"\nUploading evaluator '{evaluator_id}' for {qualname.split('.')[-1]}...")
try:
test_dir = root
metric_name = os.path.basename(test_dir) or "metric"
result = create_evaluation(
evaluator_id=evaluator_id,
metric_folders=[f"{metric_name}={test_dir}"],
display_name=display_name or evaluator_id,
description=description or f"Evaluator for {qualname}",
force=force,
entry_point=entry_point,
)
name = result.get("name", evaluator_id) if isinstance(result, dict) else evaluator_id

# Print success message with Fireworks dashboard link
print(f"\n✅ Successfully uploaded evaluator: {evaluator_id}")
print("📊 View in Fireworks Dashboard:")
dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
print(f" {dashboard_url}\n")
except Exception as e:
print(f"Failed to upload {qualname}: {e}")
exit_code = 2

return exit_code
print(f"\nUploading evaluator '{evaluator_id}' for {qualname.split('.')[-1]}...")
try:
test_dir = root
metric_name = os.path.basename(test_dir) or "metric"
result = create_evaluation(
evaluator_id=evaluator_id,
metric_folders=[f"{metric_name}={test_dir}"],
display_name=display_name or evaluator_id,
description=description or f"Evaluator for {qualname}",
force=force,
entry_point=entry_point,
)
name = result.get("name", evaluator_id) if isinstance(result, dict) else evaluator_id

# Print success message with Fireworks dashboard link
print(f"\n✅ Successfully uploaded evaluator: {evaluator_id}")
print("📊 View in Fireworks Dashboard:")
dashboard_url = _build_evaluator_dashboard_url(evaluator_id)
print(f" {dashboard_url}\n")
return 0
except Exception as e:
print(f"Failed to upload {qualname}: {e}")
return 2
6 changes: 5 additions & 1 deletion eval_protocol/cli_commands/utils.py
@@ -334,7 +334,11 @@ def _prompt_select_fallback(tests: list[DiscoveredTest]) -> list[DiscoveredTest]:
def _prompt_select(tests: list[DiscoveredTest], non_interactive: bool) -> list[DiscoveredTest]:
"""Prompt user to select tests to upload."""
if non_interactive:
return tests
# In non-interactive mode, auto-select only the first test
if len(tests) > 1:
print(f"Note: {len(tests)} tests discovered. Auto-selecting first test in non-interactive mode:")
print(f" {_format_test_choice(tests[0], 1)}")
return [tests[0]]
Comment on lines 334 to +341

P1: Non-interactive selection masks ambiguous test choice

In _prompt_select, non-interactive mode now always returns [tests[0]], so _discover_and_select_tests no longer surfaces the case where multiple evaluation tests are discovered. create_rft._resolve_evaluator (eval_protocol/cli_commands/create_rft.py:319-341) depends on len(selected_tests) != 1 to force disambiguation; with this change, ep create rft --yes in a repo containing multiple tests will silently choose whichever test _discover_tests returns first and proceed to create datasets and jobs for it instead of erroring, risking creating resources for the wrong evaluator.
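
One way to address this would be to refuse to auto-select when the choice is ambiguous rather than silently picking tests[0]. A minimal sketch (not the PR's code; it reuses the PR's DiscoveredTest, _format_test_choice, and _prompt_select_interactive names and assumes callers such as create_rft._resolve_evaluator treat an empty return as a failed selection):

```python
def _prompt_select(tests: list[DiscoveredTest], non_interactive: bool) -> list[DiscoveredTest]:
    """Prompt user to select tests to upload."""
    if non_interactive:
        if len(tests) > 1:
            # Surface the ambiguity instead of guessing, so non-interactive
            # callers are still forced to disambiguate explicitly.
            print(f"Error: {len(tests)} tests discovered; specify one explicitly in non-interactive mode:")
            for i, t in enumerate(tests, 1):
                print(f"  {_format_test_choice(t, i)}")
            return []
        return list(tests)
    return _prompt_select_interactive(tests)
```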

Empty list access causes crash in non-interactive mode

The _prompt_select function now accesses tests[0] without checking if the list is empty. Previously, return tests would safely handle an empty list by returning [], but return [tests[0]] raises an IndexError if tests is empty. Although the current caller _discover_and_select_tests validates that tests is non-empty before calling this function, this change makes _prompt_select less robust and could cause crashes if a future refactor or new caller passes an empty list.
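
A defensive variant that keeps the new auto-select behavior but restores the old empty-list handling (a sketch of the suggested guard, not part of the PR; it assumes returning an empty list is the desired fallback):

```python
def _prompt_select(tests: list[DiscoveredTest], non_interactive: bool) -> list[DiscoveredTest]:
    """Prompt user to select tests to upload."""
    if non_interactive:
        if not tests:
            # Mirror the previous `return tests` behavior for an empty
            # discovery result instead of raising IndexError on tests[0].
            return []
        if len(tests) > 1:
            print(f"Note: {len(tests)} tests discovered. Auto-selecting first test in non-interactive mode:")
            print(f"  {_format_test_choice(tests[0], 1)}")
        return [tests[0]]
    return _prompt_select_interactive(tests)
```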

return _prompt_select_interactive(tests)
