From e559c16b3b149f272eba95981acece01cd35e23a Mon Sep 17 00:00:00 2001 From: zhengliu Date: Thu, 30 Oct 2025 16:43:21 -0700 Subject: [PATCH 1/8] First Commit of Klavis Strata MCP --- .../mcp_configurations/klavis_strata_mcp.json | 7 +++ tests/pytest/test_pytest_klavis_mcp.py | 43 +++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 tests/pytest/mcp_configurations/klavis_strata_mcp.json create mode 100644 tests/pytest/test_pytest_klavis_mcp.py diff --git a/tests/pytest/mcp_configurations/klavis_strata_mcp.json b/tests/pytest/mcp_configurations/klavis_strata_mcp.json new file mode 100644 index 00000000..b4c2f3e5 --- /dev/null +++ b/tests/pytest/mcp_configurations/klavis_strata_mcp.json @@ -0,0 +1,7 @@ +{ + "mcpServers": { + "klavis-strata": { + "url": "https://strata.klavis.ai/mcp/" + } + } +} diff --git a/tests/pytest/test_pytest_klavis_mcp.py b/tests/pytest/test_pytest_klavis_mcp.py new file mode 100644 index 00000000..8ec70395 --- /dev/null +++ b/tests/pytest/test_pytest_klavis_mcp.py @@ -0,0 +1,43 @@ +from eval_protocol.models import EvaluateResult, EvaluationRow, Message +from eval_protocol.pytest import AgentRolloutProcessor, evaluation_test + + +@evaluation_test( + input_messages=[ + [ + [ + Message( + role="system", + content=( + "You are a helpful assistant that can answer questions about Gmail. 
You have access to tools to help you find information.\n" + ), + ), + Message( + role="user", + content=("Find the first 5 emails title in my inbox."), + ), + ] + ] + ], + rollout_processor=AgentRolloutProcessor(), + completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}], + mode="pointwise", + mcp_config_path="tests/pytest/mcp_configurations/klavis_strata_mcp.json", +) +def test_pytest_klavis_mcp(row: EvaluationRow) -> EvaluationRow: + # filter for all tool calls + tool_calls = [msg for msg in row.messages if msg.role == "tool"] + + if len(tool_calls) == 0: + logger.warning("No tool calls made - returning score 0") + row.evaluation_result = EvaluateResult( + score=0, + reason="No tool calls made", + ) + return row + + row.evaluation_result = EvaluateResult( + score=1, + reason="At least one tool call was made", + ) + return row From 34bc64c49f42b2410d08325e78a0e32540434520 Mon Sep 17 00:00:00 2001 From: zhengliu Date: Mon, 3 Nov 2025 11:49:18 -0800 Subject: [PATCH 2/8] Add auth header for Klavis MCP --- tests/pytest/mcp_configurations/klavis_strata_mcp.json | 7 ++++--- tests/pytest/test_pytest_klavis_mcp.py | 2 -- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/pytest/mcp_configurations/klavis_strata_mcp.json b/tests/pytest/mcp_configurations/klavis_strata_mcp.json index b4c2f3e5..02bf542d 100644 --- a/tests/pytest/mcp_configurations/klavis_strata_mcp.json +++ b/tests/pytest/mcp_configurations/klavis_strata_mcp.json @@ -1,7 +1,8 @@ { "mcpServers": { - "klavis-strata": { - "url": "https://strata.klavis.ai/mcp/" + "klavis-strata": { + "url": "https://strata.klavis.ai/mcp/", + "authorization": "Bearer ${KLAVIS_API_KEY}" + } } - } } diff --git a/tests/pytest/test_pytest_klavis_mcp.py b/tests/pytest/test_pytest_klavis_mcp.py index 8ec70395..bec5b4b0 100644 --- a/tests/pytest/test_pytest_klavis_mcp.py +++ b/tests/pytest/test_pytest_klavis_mcp.py @@ -27,9 +27,7 @@ def test_pytest_klavis_mcp(row: EvaluationRow) -> 
EvaluationRow: # filter for all tool calls tool_calls = [msg for msg in row.messages if msg.role == "tool"] - if len(tool_calls) == 0: - logger.warning("No tool calls made - returning score 0") row.evaluation_result = EvaluateResult( score=0, reason="No tool calls made", From 95ca90c6a2018da102b6cddcd3a6229216cae080 Mon Sep 17 00:00:00 2001 From: zhengliu Date: Wed, 5 Nov 2025 16:23:13 -0800 Subject: [PATCH 3/8] Update the simple email use case --- tests/pytest/datasets/gmail_inbox.jsonl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pytest/datasets/gmail_inbox.jsonl b/tests/pytest/datasets/gmail_inbox.jsonl index 789a7dde..29abcf79 100644 --- a/tests/pytest/datasets/gmail_inbox.jsonl +++ b/tests/pytest/datasets/gmail_inbox.jsonl @@ -1 +1 @@ -{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Gmail. You have access to tools to help you find information." }, { "role": "user", "content": "Find the first 5 emails title in my inbox." } ], "ground_truth": "The first 5 emails contain meeting between Benny and Zheng"} +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Gmail. You have access to tools to help you find information." }, { "role": "user", "content": "Fetch the first 5 emails in my gmail inbox and get the sender. Output the sender only." } ], "ground_truth": "The response contains 5 email addresses." 
} From f444a75861336edb1e50e154fb3fb78b20fed47c Mon Sep 17 00:00:00 2001 From: zhengliu Date: Thu, 13 Nov 2025 16:29:54 -0800 Subject: [PATCH 4/8] Add notion task --- tests/pytest/datasets/gmail_inbox.jsonl | 1 + tests/pytest/test_pytest_klavis_mcp.py | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/pytest/datasets/gmail_inbox.jsonl b/tests/pytest/datasets/gmail_inbox.jsonl index 29abcf79..706eb2c6 100644 --- a/tests/pytest/datasets/gmail_inbox.jsonl +++ b/tests/pytest/datasets/gmail_inbox.jsonl @@ -1 +1,2 @@ {"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Gmail. You have access to tools to help you find information." }, { "role": "user", "content": "Fetch the first 5 emails in my gmail inbox and get the sender. Output the sender only." } ], "ground_truth": "The response contains 5 email addresses." } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to tools to help you find information." }, { "role": "user", "content": "In the notion Toronto guide, help me to find a pizza restaurant which is able to takeout" } ], "ground_truth": "Pizzeria Badiali" } diff --git a/tests/pytest/test_pytest_klavis_mcp.py b/tests/pytest/test_pytest_klavis_mcp.py index 00f48c9c..30e412ef 100644 --- a/tests/pytest/test_pytest_klavis_mcp.py +++ b/tests/pytest/test_pytest_klavis_mcp.py @@ -13,6 +13,7 @@ class ResponseFormat(BaseModel): score: float +# You should copy https://painted-tennis-ebc.notion.site/MCPMark-Source-Hub-23181626b6d7805fb3a7d59c63033819 into your Notion for the notion test. 
@evaluation_test( input_dataset=["tests/pytest/datasets/gmail_inbox.jsonl"], rollout_processor=AgentRolloutProcessor(), From 4c3db56298863d97049366f687050518bfa093db Mon Sep 17 00:00:00 2001 From: zhengliu Date: Thu, 13 Nov 2025 16:36:02 -0800 Subject: [PATCH 5/8] change dataset file name --- .../datasets/{gmail_inbox.jsonl => klavis_mcp_test.jsonl} | 0 tests/pytest/test_pytest_klavis_mcp.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename tests/pytest/datasets/{gmail_inbox.jsonl => klavis_mcp_test.jsonl} (100%) diff --git a/tests/pytest/datasets/gmail_inbox.jsonl b/tests/pytest/datasets/klavis_mcp_test.jsonl similarity index 100% rename from tests/pytest/datasets/gmail_inbox.jsonl rename to tests/pytest/datasets/klavis_mcp_test.jsonl diff --git a/tests/pytest/test_pytest_klavis_mcp.py b/tests/pytest/test_pytest_klavis_mcp.py index 30e412ef..0ebc8352 100644 --- a/tests/pytest/test_pytest_klavis_mcp.py +++ b/tests/pytest/test_pytest_klavis_mcp.py @@ -15,7 +15,7 @@ class ResponseFormat(BaseModel): # You should copy https://painted-tennis-ebc.notion.site/MCPMark-Source-Hub-23181626b6d7805fb3a7d59c63033819 into your Notion for the notion test. 
@evaluation_test( - input_dataset=["tests/pytest/datasets/gmail_inbox.jsonl"], + input_dataset=["tests/pytest/datasets/klavis_mcp_test.jsonl"], rollout_processor=AgentRolloutProcessor(), completion_params=[{"model": "fireworks_ai/accounts/fireworks/models/kimi-k2-instruct"}], mode="pointwise", From 5dacb42d4cfc1e014bb553470ba9efdd1ac286c7 Mon Sep 17 00:00:00 2001 From: zhengliu Date: Mon, 17 Nov 2025 17:22:49 -0800 Subject: [PATCH 6/8] Add more use cases --- tests/pytest/datasets/klavis_mcp_test.jsonl | 17 +++++++++++++++-- tests/pytest/test_pytest_klavis_mcp.py | 6 ++++-- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/tests/pytest/datasets/klavis_mcp_test.jsonl b/tests/pytest/datasets/klavis_mcp_test.jsonl index 706eb2c6..7a0e6b89 100644 --- a/tests/pytest/datasets/klavis_mcp_test.jsonl +++ b/tests/pytest/datasets/klavis_mcp_test.jsonl @@ -1,2 +1,15 @@ -{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Gmail. You have access to tools to help you find information." }, { "role": "user", "content": "Fetch the first 5 emails in my gmail inbox and get the sender. Output the sender only." } ], "ground_truth": "The response contains 5 email addresses." } -{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to tools to help you find information." }, { "role": "user", "content": "In the notion Toronto guide, help me to find a pizza restaurant which is able to takeout" } ], "ground_truth": "Pizzeria Badiali" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Gmail. You have access to Gmail to help you find information." }, { "role": "user", "content": "Fetch the first 5 emails in my gmail inbox and get the sender. Output the sender only." } ], "ground_truth": "The response contains 5 email addresses." 
} +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to tools to Notion you find information." }, { "role": "user", "content": "How many notion pages are in MCPMark Source Hub?" } ], "ground_truth": "10" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to tools to Notion you find information." }, { "role": "user", "content": "In the notion Toronto guide, help me to find a pizza restaurant which is able to takeout." } ], "ground_truth": "Pizzeria Badiali" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to tools to Notion you find information." }, { "role": "user", "content": "Check Japan Travel Planner page, help me to calculate how much did I spend in accommodation." } ], "ground_truth": "$373.63" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to tools to Notion you find information." }, { "role": "user", "content": "Check Japan Travel Planner page, help me to check how many tokyo attractions I've visited." } ], "ground_truth": "2" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to tools to Notion you find information." }, { "role": "user", "content": "Check Company In A Box page, help me to find how many presses did we have during 2018. You can find the presses in company wiki." } ], "ground_truth": "3" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to tools to Notion you find information." 
}, { "role": "user", "content": "Check Company In A Box page, help me to figure out how many FAQ items under training & upskilling category."} ], "ground_truth": "4" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have today?" } ], "ground_truth": "3" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have during business days this week?" } ], "ground_truth": "8" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many available time(in hour) do I have during my business hour tomorrow?" } ], "ground_truth": "3 hour" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have on Oct 15 2025?" } ], "ground_truth": "4" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have during business days of the week of Oct 15 2025?" } ], "ground_truth": "9" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." 
}, { "role": "user", "content": "How many events do I have on next week's Thursday?" } ], "ground_truth": "2" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have on next week's business day?" } ], "ground_truth": "5" } + diff --git a/tests/pytest/test_pytest_klavis_mcp.py b/tests/pytest/test_pytest_klavis_mcp.py index 0ebc8352..926051da 100644 --- a/tests/pytest/test_pytest_klavis_mcp.py +++ b/tests/pytest/test_pytest_klavis_mcp.py @@ -12,8 +12,10 @@ class ResponseFormat(BaseModel): score: float - -# You should copy https://painted-tennis-ebc.notion.site/MCPMark-Source-Hub-23181626b6d7805fb3a7d59c63033819 into your Notion for the notion test. +''' +You should copy https://painted-tennis-ebc.notion.site/MCPMark-Source-Hub-23181626b6d7805fb3a7d59c63033819 +into your Notion for the notion test. 
+''' @evaluation_test( input_dataset=["tests/pytest/datasets/klavis_mcp_test.jsonl"], rollout_processor=AgentRolloutProcessor(), From 4c431e35ade286d3e9797b1b13eb40d84cbb87b2 Mon Sep 17 00:00:00 2001 From: zhengliu Date: Tue, 18 Nov 2025 16:36:27 -0800 Subject: [PATCH 7/8] change the API key to auth token --- tests/pytest/mcp_configurations/klavis_strata_mcp.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pytest/mcp_configurations/klavis_strata_mcp.json b/tests/pytest/mcp_configurations/klavis_strata_mcp.json index fd9e6923..b6becb87 100644 --- a/tests/pytest/mcp_configurations/klavis_strata_mcp.json +++ b/tests/pytest/mcp_configurations/klavis_strata_mcp.json @@ -2,7 +2,7 @@ "mcpServers": { "klavis-strata": { "url": "https://strata.klavis.ai/mcp/", - "authorization": "Bearer ${KLAVIS_API_KEY}" + "authorization": "Bearer ${KLAVIS_AUTH_TOKEN}" } } } From 78dbd7537c2b4234961d9a6b1885c54404e403cf Mon Sep 17 00:00:00 2001 From: zhengliu Date: Tue, 18 Nov 2025 20:34:54 -0800 Subject: [PATCH 8/8] Modify test case --- tests/pytest/datasets/klavis_mcp_test.jsonl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pytest/datasets/klavis_mcp_test.jsonl b/tests/pytest/datasets/klavis_mcp_test.jsonl index 7a0e6b89..9cee59a7 100644 --- a/tests/pytest/datasets/klavis_mcp_test.jsonl +++ b/tests/pytest/datasets/klavis_mcp_test.jsonl @@ -7,7 +7,7 @@ {"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Notion. You have access to tools to Notion you find information." }, { "role": "user", "content": "Check Company In A Box page, help me to figure out how many FAQ items under training & upskilling category."} ], "ground_truth": "4" } {"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." 
}, { "role": "user", "content": "How many events do I have today?" } ], "ground_truth": "3" } {"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have during business days this week?" } ], "ground_truth": "8" } -{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many available time(in hour) do I have during my business hour tomorrow?" } ], "ground_truth": "3 hour" } +{"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many available time(in hour) do I have during my business hour the next working day?" } ], "ground_truth": "2 hour" } {"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have on Oct 15 2025?" } ], "ground_truth": "4" } {"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have during business days of the week of Oct 15 2025?" } ], "ground_truth": "9" } {"messages": [ { "role": "system", "content": "You are a helpful assistant that can answer questions about Outlook Calendar. You have access to Outlook Calendar to help you find information." }, { "role": "user", "content": "How many events do I have on next week's Thursday?" 
} ], "ground_truth": "2" }