From f7a648c2467c6e0138f3df704efd12f35f074e20 Mon Sep 17 00:00:00 2001 From: "Yufei (Benny) Chen" <1585539+benjibc@users.noreply.github.com> Date: Sat, 27 Sep 2025 11:14:14 -0700 Subject: [PATCH] Route benchmark datasets through data loaders --- eval_protocol/benchmarks/test_aime25.py | 30 +++++++++++++++---- .../benchmarks/test_tau_bench_airline.py | 15 ++++++++-- .../benchmarks/test_tau_bench_retail.py | 15 ++++++++-- 3 files changed, 51 insertions(+), 9 deletions(-) diff --git a/eval_protocol/benchmarks/test_aime25.py b/eval_protocol/benchmarks/test_aime25.py index 91a67f77..932f93e5 100644 --- a/eval_protocol/benchmarks/test_aime25.py +++ b/eval_protocol/benchmarks/test_aime25.py @@ -1,5 +1,7 @@ from typing import Any, Dict, List, Optional +from eval_protocol.common_utils import load_jsonl +from eval_protocol.data_loader import DynamicDataLoader from eval_protocol.models import ( EvaluateResult, EvaluationRow, @@ -11,6 +13,7 @@ SingleTurnRolloutProcessor, ) from eval_protocol.pytest.evaluation_test import evaluation_test +from eval_protocol.pytest.utils import parse_ep_max_rows SYSTEM_PROMPT = ( "You are a helpful math assistant. Please reason step by step, and put your final answer within \\boxed{...}." @@ -71,12 +74,29 @@ def aime2025_dataset_adapter(rows: List[Dict[str, Any]]) -> List[EvaluationRow]: return converted +_AIME2025_DATASET_URLS: List[str] = [ + "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl", + "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl", +] + + +def aime2025_data_generator() -> List[EvaluationRow]: + """Load the AIME 2025 datasets and convert them into evaluation rows.""" + dataset_rows: List[Dict[str, Any]] = [] + for dataset_url in _AIME2025_DATASET_URLS: + dataset_rows.extend(load_jsonl(dataset_url)) + + max_rows = parse_ep_max_rows(2) + if max_rows is not None: + dataset_rows = dataset_rows[:max_rows] + + return aime2025_dataset_adapter(dataset_rows) + + @evaluation_test( - input_dataset=[ - "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl", - "https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl", - ], - dataset_adapter=aime2025_dataset_adapter, + data_loaders=DynamicDataLoader( + generators=[aime2025_data_generator], + ), completion_params=[ { "max_tokens": 131000, diff --git a/eval_protocol/benchmarks/test_tau_bench_airline.py b/eval_protocol/benchmarks/test_tau_bench_airline.py index 77cfec0c..e09d3752 100644 --- a/eval_protocol/benchmarks/test_tau_bench_airline.py +++ b/eval_protocol/benchmarks/test_tau_bench_airline.py @@ -10,6 +10,8 @@ from pathlib import Path from typing import Any, Dict, List +from eval_protocol.common_utils import load_jsonl +from eval_protocol.data_loader import DynamicDataLoader from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata, Message from eval_protocol.pytest import evaluation_test, ExceptionHandlerConfig from eval_protocol.pytest.default_mcp_gym_rollout_processor import MCPGymRolloutProcessor @@ -69,6 +71,14 @@ def _get_airline_dataset_path() -> str: return str(Path(__file__).parent / "data" / "airline_dataset.jsonl") +def tau_bench_airline_data_generator() -> List[EvaluationRow]: + """Load and adapt the airline dataset into evaluation rows.""" + dataset_rows: List[Dict[str, Any]] = [] + for dataset_path in [_get_airline_dataset_path()]: + dataset_rows.extend(load_jsonl(dataset_path)) + return tau_bench_airline_to_evaluation_row(dataset_rows) + + def _get_server_script_path() -> str: """Get the tau2 mcp server script path.""" from eval_protocol.mcp_servers.tau2 import get_server_script_path @@ -107,8 +117,9 @@ def tau_bench_airline_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Eval @evaluation_test( - input_dataset=[_get_airline_dataset_path()], - dataset_adapter=tau_bench_airline_to_evaluation_row, + data_loaders=DynamicDataLoader( + generators=[tau_bench_airline_data_generator], + ), completion_params=[ { "temperature": 0.8, diff --git a/eval_protocol/benchmarks/test_tau_bench_retail.py b/eval_protocol/benchmarks/test_tau_bench_retail.py index 68ec8430..f2354be9 100644 --- a/eval_protocol/benchmarks/test_tau_bench_retail.py +++ b/eval_protocol/benchmarks/test_tau_bench_retail.py @@ -10,6 +10,8 @@ from pathlib import Path from typing import Any, Dict, List +from eval_protocol.common_utils import load_jsonl +from eval_protocol.data_loader import DynamicDataLoader from eval_protocol.models import EvaluateResult, EvaluationRow, InputMetadata, Message from eval_protocol.pytest import evaluation_test, ExceptionHandlerConfig from eval_protocol.pytest.default_mcp_gym_rollout_processor import MCPGymRolloutProcessor @@ -69,6 +71,14 @@ def _get_retail_dataset_path() -> str: return str(Path(__file__).parent / "data" / "retail_dataset.jsonl") +def tau_bench_retail_data_generator() -> List[EvaluationRow]: + """Load and adapt the retail dataset into evaluation rows.""" + dataset_rows: List[Dict[str, Any]] = [] + for dataset_path in [_get_retail_dataset_path()]: + dataset_rows.extend(load_jsonl(dataset_path)) + return tau_bench_retail_to_evaluation_row(dataset_rows) + + def tau_bench_retail_to_evaluation_row(data: List[Dict[str, Any]]) -> List[EvaluationRow]: """ Convert entries from retail dataset to EvaluationRow objects. @@ -98,8 +108,9 @@ def tau_bench_retail_to_evaluation_row(data: List[Dict[str, Any]]) -> List[Evalu @evaluation_test( - input_dataset=[_get_retail_dataset_path()], - dataset_adapter=tau_bench_retail_to_evaluation_row, + data_loaders=DynamicDataLoader( + generators=[tau_bench_retail_data_generator], + ), completion_params=[ { "temperature": 0.8,