diff --git a/tests/pytest/datasets/gmail_inbox.jsonl b/tests/pytest/datasets/gmail_inbox.jsonl deleted file mode 100644 index 789a7dde..00000000 --- a/tests/pytest/datasets/gmail_inbox.jsonl +++ /dev/null @@ -1 +0,0 @@ -{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Gmail. You have access to tools to help you find information." }, { "role": "user", "content": "Find the first 5 emails title in my inbox." } ], "ground_truth": "The first 5 emails contain meeting between Benny and Zheng"} diff --git a/tests/pytest/datasets/klavis_mcp_test.jsonl b/tests/pytest/datasets/klavis_mcp_test.jsonl new file mode 100644 index 00000000..9cee59a7 --- /dev/null +++ b/tests/pytest/datasets/klavis_mcp_test.jsonl @@ -0,0 +1,15 @@ +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Gmail. You have access to Gmail to help you find information." }, { "role": "user", "content": "Fetch the first 5 emails in my gmail inbox and get the sender. Output the sender only." } ], "ground_truth": "The response contains 5 email addresses." } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to Notion to help you find information." }, { "role": "user", "content": "How many notion pages are in MCPMark Source Hub?" } ], "ground_truth": "10" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to Notion to help you find information." }, { "role": "user", "content": "In the notion Toronto guide, help me find a pizza restaurant that offers takeout." } ], "ground_truth": "Pizzeria Badiali" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to Notion to help you find information." 
}, { "role": "user", "content": "Check Japan Travel Planner page, help me to calculate how much I spent on accommodation." } ], "ground_truth": "$373.63" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to Notion to help you find information." }, { "role": "user", "content": "Check Japan Travel Planner page, help me to check how many tokyo attractions I've visited." } ], "ground_truth": "2" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to Notion to help you find information." }, { "role": "user", "content": "Check Company In A Box page, help me to find how many presses we had during 2018. You can find the presses in company wiki." } ], "ground_truth": "3" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to Notion to help you find information." }, { "role": "user", "content": "Check Company In A Box page, help me to figure out how many FAQ items are under training & upskilling category."} ], "ground_truth": "4" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have today?" } ], "ground_truth": "3" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have during business days this week?" } ], "ground_truth": "8" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." 
}, { "role": "user", "content": "How much available time (in hours) do I have during my business hours on the next working day?" } ], "ground_truth": "2 hour" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have on Oct 15 2025?" } ], "ground_truth": "4" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have during business days of the week of Oct 15 2025?" } ], "ground_truth": "9" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have on next week's Thursday?" } ], "ground_truth": "2" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have on next week's business day?" 
} ], "ground_truth": "5" } + diff --git a/tests/pytest/mcp_configurations/klavis_strata_mcp.json b/tests/pytest/mcp_configurations/klavis_strata_mcp.json index fd9e6923..b6becb87 100644 --- a/tests/pytest/mcp_configurations/klavis_strata_mcp.json +++ b/tests/pytest/mcp_configurations/klavis_strata_mcp.json @@ -2,7 +2,7 @@ "mcpServers": { "klavis-strata": { "url": "https://strata.klavis.ai/mcp/", - "authorization": "Bearer ${KLAVIS_API_KEY}" + "authorization": "Bearer ${KLAVIS_AUTH_TOKEN}" } } } diff --git a/tests/pytest/test_pytest_klavis_mcp.py b/tests/pytest/test_pytest_klavis_mcp.py index 00f48c9c..926051da 100644 --- a/tests/pytest/test_pytest_klavis_mcp.py +++ b/tests/pytest/test_pytest_klavis_mcp.py @@ -12,9 +12,12 @@ class ResponseFormat(BaseModel): score: float - +''' +Setup for the Notion test cases: duplicate https://painted-tennis-ebc.notion.site/MCPMark-Source-Hub-23181626b6d7805fb3a7d59c63033819 +into your own Notion workspace before running this test. +''' @evaluation_test( - input_dataset=["tests/pytest/datasets/gmail_inbox.jsonl"], + input_dataset=["tests/pytest/datasets/klavis_mcp_test.jsonl"], rollout_processor=AgentRolloutProcessor(), completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}], mode="pointwise",