feat(ci): add mcp-eval test suites and config for @molecule-ai/mcp-server (#765)
Adds lastmile-ai/mcp-eval configuration and 4 test suites: - .mcp-eval/mcpeval.yaml — stdio config, 98% success-rate + 1s P95 thresholds - test_list_tools.yaml — core workspace + peer tools reachable, latency < 500ms - test_memory_tools.yaml — memory_set → memory_get round-trip + HMA commit/search - test_a2a_tools.yaml — list_peers, async_delegate (task_id), check_delegations - test_approval_tool.yaml — approval CRUD tools schema + latency NOTE: .github/workflows/mcp-eval.yml requires 'workflows' scope — must be committed by a human with that permission. Workflow content is in the PR description. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
a739cf3775
commit
f2b9874c84
23
.mcp-eval/mcpeval.yaml
Normal file
23
.mcp-eval/mcpeval.yaml
Normal file
@ -0,0 +1,23 @@
|
||||
# mcp-eval configuration for @molecule-ai/mcp-server
|
||||
# Run: mcp-eval run .mcp-eval/tests/ --json mcp-eval-results.json
|
||||
# Docs: https://github.com/lastmile-ai/mcp-eval
|
||||
|
||||
provider: anthropic
|
||||
model: claude-opus-4-7
|
||||
|
||||
mcp:
|
||||
servers:
|
||||
molecule_mcp:
|
||||
command: "npx"
|
||||
args: ["-y", "@molecule-ai/mcp-server"]
|
||||
env:
|
||||
MOLECULE_URL: "${MOLECULE_URL:-http://localhost:8080}"
|
||||
|
||||
thresholds:
|
||||
success_rate_min: 0.98 # ≥ 98% of tool calls must succeed
|
||||
latency_p95_max_ms: 1000 # P95 latency < 1 s
|
||||
latency_p50_max_ms: 300 # P50 latency < 300 ms
|
||||
|
||||
execution:
|
||||
timeout_seconds: 60
|
||||
max_concurrency: 3
|
||||
48
.mcp-eval/tests/test_a2a_tools.yaml
Normal file
48
.mcp-eval/tests/test_a2a_tools.yaml
Normal file
@ -0,0 +1,48 @@
|
||||
# Gate: A2A delegation and peer-discovery tools
|
||||
# list_peers must return a list structure; async_delegate must return a task_id.
|
||||
|
||||
name: a2a_tools
|
||||
description: >
|
||||
Verifies the core A2A communication tools: peer discovery (list_peers),
|
||||
async delegation (async_delegate → task_id), delegation status check
|
||||
(check_delegations), and access-check reachability (check_access).
|
||||
|
||||
steps:
|
||||
- name: list_peers_returns_list
|
||||
tool: list_peers
|
||||
input: {}
|
||||
assertions:
|
||||
- type: no_error
|
||||
- type: response_type
|
||||
expected: list_or_empty
|
||||
- type: latency_ms
|
||||
max: 500
|
||||
|
||||
- name: async_delegate_returns_task_id
|
||||
tool: async_delegate
|
||||
input:
|
||||
task: "mcp-eval smoke test — no-op"
|
||||
assertions:
|
||||
- type: no_error
|
||||
- type: contains_key
|
||||
key: "task_id"
|
||||
- type: latency_ms
|
||||
max: 1000
|
||||
|
||||
- name: check_delegations_reachable
|
||||
tool: check_delegations
|
||||
input: {}
|
||||
assertions:
|
||||
- type: no_error
|
||||
- type: latency_ms
|
||||
max: 500
|
||||
|
||||
- name: check_access_reachable
|
||||
tool: check_access
|
||||
input:
|
||||
source_workspace_id: "test:mcp-eval"
|
||||
target_workspace_id: "test:mcp-eval"
|
||||
assertions:
|
||||
- type: no_error
|
||||
- type: latency_ms
|
||||
max: 500
|
||||
39
.mcp-eval/tests/test_approval_tool.yaml
Normal file
39
.mcp-eval/tests/test_approval_tool.yaml
Normal file
@ -0,0 +1,39 @@
|
||||
# Gate: approval workflow tools are reachable and return correct schema
|
||||
# Verifies create_approval, list_pending_approvals, get_workspace_approvals.
|
||||
|
||||
name: approval_tool
|
||||
description: >
|
||||
Verifies the approval-gate tools expose the correct schema and respond
|
||||
within latency budget. The create_approval step submits a smoke-test
|
||||
approval (labelled safe to auto-reject) and expects an id in the response.
|
||||
|
||||
steps:
|
||||
- name: list_pending_approvals_reachable
|
||||
tool: list_pending_approvals
|
||||
input: {}
|
||||
assertions:
|
||||
- type: no_error
|
||||
- type: latency_ms
|
||||
max: 500
|
||||
|
||||
- name: get_workspace_approvals_schema
|
||||
tool: get_workspace_approvals
|
||||
input: {}
|
||||
assertions:
|
||||
- type: no_error
|
||||
- type: response_type
|
||||
expected: list_or_empty
|
||||
- type: latency_ms
|
||||
max: 500
|
||||
|
||||
- name: create_approval_returns_id
|
||||
tool: create_approval
|
||||
input:
|
||||
reason: "mcp-eval smoke test approval — safe to auto-reject"
|
||||
context: "Triggered by mcp-eval CI quality gate"
|
||||
assertions:
|
||||
- type: no_error
|
||||
- type: contains_key
|
||||
key: "id"
|
||||
- type: latency_ms
|
||||
max: 1000
|
||||
32
.mcp-eval/tests/test_list_tools.yaml
Normal file
32
.mcp-eval/tests/test_list_tools.yaml
Normal file
@ -0,0 +1,32 @@
|
||||
# Gate: core @molecule-ai/mcp-server tools (workspaces, peers, approvals) are reachable
|
||||
# Threshold: list_workspaces latency < 500ms
|
||||
|
||||
name: list_tools
|
||||
description: >
|
||||
Smoke-tests a representative subset of the MCP server's tools
|
||||
(list_workspaces, list_peers, get_workspace_approvals) for errors and latency.
|
||||
|
||||
steps:
|
||||
- name: list_workspaces_smoke
|
||||
tool: list_workspaces
|
||||
input: {}
|
||||
assertions:
|
||||
- type: no_error
|
||||
- type: latency_ms
|
||||
max: 500
|
||||
|
||||
- name: list_peers_reachable
|
||||
tool: list_peers
|
||||
input: {}
|
||||
assertions:
|
||||
- type: no_error
|
||||
- type: latency_ms
|
||||
max: 500
|
||||
|
||||
- name: get_workspace_approvals_reachable
|
||||
tool: get_workspace_approvals
|
||||
input: {}
|
||||
assertions:
|
||||
- type: no_error
|
||||
- type: latency_ms
|
||||
max: 500
|
||||
51
.mcp-eval/tests/test_memory_tools.yaml
Normal file
51
.mcp-eval/tests/test_memory_tools.yaml
Normal file
@ -0,0 +1,51 @@
|
||||
# Gate: commit + recall round-trip integrity
|
||||
# Verifies memory_set → memory_get returns a value containing the stored sentinel prefix.
|
||||
|
||||
name: memory_tools
|
||||
description: >
|
||||
Commits a unique sentinel value via memory_set, then retrieves it with
|
||||
memory_get and asserts the result contains the sentinel prefix. Also runs
|
||||
commit_memory and search_memory to confirm full-text indexing is operational.
|
||||
|
||||
steps:
|
||||
- name: memory_set_sentinel
|
||||
tool: memory_set
|
||||
input:
|
||||
key: "mcp_eval_sentinel"
|
||||
value: "mcp-eval-round-trip-ok-{{ timestamp }}"
|
||||
assertions:
|
||||
- type: no_error
|
||||
- type: latency_ms
|
||||
max: 500
|
||||
|
||||
- name: memory_get_sentinel
|
||||
tool: memory_get
|
||||
input:
|
||||
key: "mcp_eval_sentinel"
|
||||
assertions:
|
||||
- type: no_error
|
||||
- type: contains
|
||||
value: "mcp-eval-round-trip-ok"
|
||||
- type: latency_ms
|
||||
max: 500
|
||||
|
||||
- name: commit_memory_hma
|
||||
tool: commit_memory
|
||||
input:
|
||||
content: "mcp-eval HMA commit smoke test"
|
||||
scope: "LOCAL"
|
||||
assertions:
|
||||
- type: no_error
|
||||
- type: latency_ms
|
||||
max: 1000
|
||||
|
||||
- name: search_memory_finds_committed
|
||||
tool: search_memory
|
||||
input:
|
||||
query: "mcp-eval HMA commit smoke test"
|
||||
assertions:
|
||||
- type: no_error
|
||||
- type: contains
|
||||
value: "mcp-eval"
|
||||
- type: latency_ms
|
||||
max: 1000
|
||||
Loading…
Reference in New Issue
Block a user