From f2b9874c84a8443bff7eb20bd08ee31f49100f73 Mon Sep 17 00:00:00 2001
From: Molecule AI Triage Operator
Date: Fri, 17 Apr 2026 16:32:11 +0000
Subject: [PATCH] feat(ci): add mcp-eval test suites and config for
 @molecule-ai/mcp-server (#765)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds lastmile-ai/mcp-eval configuration and 4 test suites:
- .mcp-eval/mcpeval.yaml — stdio config, 98% success-rate + 1s P95 thresholds
- test_list_tools.yaml — core workspace + peer tools reachable, latency < 500ms
- test_memory_tools.yaml — memory_set → memory_get round-trip + HMA commit/search
- test_a2a_tools.yaml — list_peers, async_delegate (task_id), check_delegations,
  check_access
- test_approval_tool.yaml — approval CRUD tools schema + latency

NOTE: .github/workflows/mcp-eval.yml requires 'workflows' scope — must be
committed by a human with that permission. Workflow content is in the PR
description.

Co-Authored-By: Claude Sonnet 4.6
---
 .mcp-eval/mcpeval.yaml                  | 23 +++++++++++
 .mcp-eval/tests/test_a2a_tools.yaml     | 48 +++++++++++++++++++++++
 .mcp-eval/tests/test_approval_tool.yaml | 39 +++++++++++++++++++
 .mcp-eval/tests/test_list_tools.yaml    | 32 ++++++++++++++++
 .mcp-eval/tests/test_memory_tools.yaml  | 51 +++++++++++++++++++++++++
 5 files changed, 193 insertions(+)
 create mode 100644 .mcp-eval/mcpeval.yaml
 create mode 100644 .mcp-eval/tests/test_a2a_tools.yaml
 create mode 100644 .mcp-eval/tests/test_approval_tool.yaml
 create mode 100644 .mcp-eval/tests/test_list_tools.yaml
 create mode 100644 .mcp-eval/tests/test_memory_tools.yaml

diff --git a/.mcp-eval/mcpeval.yaml b/.mcp-eval/mcpeval.yaml
new file mode 100644
index 00000000..30fd6ddc
--- /dev/null
+++ b/.mcp-eval/mcpeval.yaml
@@ -0,0 +1,23 @@
+# mcp-eval configuration for @molecule-ai/mcp-server
+# Run: mcp-eval run .mcp-eval/tests/ --json mcp-eval-results.json
+# Docs: https://github.com/lastmile-ai/mcp-eval
+
+provider: anthropic
+model: claude-opus-4-7
+
+mcp:
+  servers:
+    molecule_mcp:
+      command: "npx"
+      args: ["-y", "@molecule-ai/mcp-server"]
+      env:
+        MOLECULE_URL: "${MOLECULE_URL:-http://localhost:8080}"
+
+thresholds:
+  success_rate_min: 0.98    # ≥ 98% tool calls must succeed
+  latency_p95_max_ms: 1000  # P95 latency < 1 s
+  latency_p50_max_ms: 300   # P50 latency < 300 ms
+
+execution:
+  timeout_seconds: 60
+  max_concurrency: 3
diff --git a/.mcp-eval/tests/test_a2a_tools.yaml b/.mcp-eval/tests/test_a2a_tools.yaml
new file mode 100644
index 00000000..2a9aafa0
--- /dev/null
+++ b/.mcp-eval/tests/test_a2a_tools.yaml
@@ -0,0 +1,48 @@
+# Gate: A2A delegation and peer-discovery tools
+# list_peers must return a list structure; async_delegate must return a task_id.
+
+name: a2a_tools
+description: >
+  Verifies the core A2A communication tools: peer discovery (list_peers),
+  async delegation (async_delegate → task_id), delegation status check
+  (check_delegations), and access-check reachability (check_access).
+
+steps:
+  - name: list_peers_returns_list
+    tool: list_peers
+    input: {}
+    assertions:
+      - type: no_error
+      - type: response_type
+        expected: list_or_empty
+      - type: latency_ms
+        max: 500
+
+  - name: async_delegate_returns_task_id
+    tool: async_delegate
+    input:
+      task: "mcp-eval smoke test — no-op"
+    assertions:
+      - type: no_error
+      - type: contains_key
+        key: "task_id"
+      - type: latency_ms
+        max: 1000
+
+  - name: check_delegations_reachable
+    tool: check_delegations
+    input: {}
+    assertions:
+      - type: no_error
+      - type: latency_ms
+        max: 500
+
+  - name: check_access_reachable
+    tool: check_access
+    input:
+      source_workspace_id: "test:mcp-eval"
+      target_workspace_id: "test:mcp-eval"
+    assertions:
+      - type: no_error
+      - type: latency_ms
+        max: 500
diff --git a/.mcp-eval/tests/test_approval_tool.yaml b/.mcp-eval/tests/test_approval_tool.yaml
new file mode 100644
index 00000000..ccf9572a
--- /dev/null
+++ b/.mcp-eval/tests/test_approval_tool.yaml
@@ -0,0 +1,39 @@
+# Gate: approval workflow tools are reachable and return correct schema
+# Verifies create_approval, list_pending_approvals, get_workspace_approvals.
+
+name: approval_tool
+description: >
+  Verifies the approval-gate tools expose the correct schema and respond
+  within latency budget. NOTE: create_approval is called without a dry-run
+  flag, so it may leave a pending approval (marked safe to auto-reject).
+
+steps:
+  - name: list_pending_approvals_reachable
+    tool: list_pending_approvals
+    input: {}
+    assertions:
+      - type: no_error
+      - type: latency_ms
+        max: 500
+
+  - name: get_workspace_approvals_schema
+    tool: get_workspace_approvals
+    input: {}
+    assertions:
+      - type: no_error
+      - type: response_type
+        expected: list_or_empty
+      - type: latency_ms
+        max: 500
+
+  - name: create_approval_returns_id
+    tool: create_approval
+    input:
+      reason: "mcp-eval smoke test approval — safe to auto-reject"
+      context: "Triggered by mcp-eval CI quality gate"
+    assertions:
+      - type: no_error
+      - type: contains_key
+        key: "id"
+      - type: latency_ms
+        max: 1000
diff --git a/.mcp-eval/tests/test_list_tools.yaml b/.mcp-eval/tests/test_list_tools.yaml
new file mode 100644
index 00000000..5f260171
--- /dev/null
+++ b/.mcp-eval/tests/test_list_tools.yaml
@@ -0,0 +1,32 @@
+# Gate: representative @molecule-ai/mcp-server tools are present and reachable
+# Threshold: every step must answer within 500 ms (latency_ms max: 500)
+
+name: list_tools
+description: >
+  Smoke-tests three representative tools (list_workspaces, list_peers,
+  get_workspace_approvals): each must return without error within 500 ms.
+
+steps:
+  - name: list_workspaces_smoke
+    tool: list_workspaces
+    input: {}
+    assertions:
+      - type: no_error
+      - type: latency_ms
+        max: 500
+
+  - name: list_peers_reachable
+    tool: list_peers
+    input: {}
+    assertions:
+      - type: no_error
+      - type: latency_ms
+        max: 500
+
+  - name: get_workspace_approvals_reachable
+    tool: get_workspace_approvals
+    input: {}
+    assertions:
+      - type: no_error
+      - type: latency_ms
+        max: 500
diff --git a/.mcp-eval/tests/test_memory_tools.yaml b/.mcp-eval/tests/test_memory_tools.yaml
new file mode 100644
index 00000000..1507cacb
--- /dev/null
+++ b/.mcp-eval/tests/test_memory_tools.yaml
@@ -0,0 +1,51 @@
+# Gate: commit + recall round-trip integrity
+# Verifies memory_set → memory_get returns a value containing the sentinel prefix.
+
+name: memory_tools
+description: >
+  Stores a timestamped sentinel via memory_set, reads it back with memory_get
+  and asserts the result contains the sentinel prefix (not the full value).
+  Also commits a LOCAL memory and checks that search_memory can find it.
+
+steps:
+  - name: memory_set_sentinel
+    tool: memory_set
+    input:
+      key: "mcp_eval_sentinel"
+      value: "mcp-eval-round-trip-ok-{{ timestamp }}"
+    assertions:
+      - type: no_error
+      - type: latency_ms
+        max: 500
+
+  - name: memory_get_sentinel
+    tool: memory_get
+    input:
+      key: "mcp_eval_sentinel"
+    assertions:
+      - type: no_error
+      - type: contains
+        value: "mcp-eval-round-trip-ok"
+      - type: latency_ms
+        max: 500
+
+  - name: commit_memory_hma
+    tool: commit_memory
+    input:
+      content: "mcp-eval HMA commit smoke test"
+      scope: "LOCAL"
+    assertions:
+      - type: no_error
+      - type: latency_ms
+        max: 1000
+
+  - name: search_memory_finds_committed
+    tool: search_memory
+    input:
+      query: "mcp-eval HMA commit smoke test"
+    assertions:
+      - type: no_error
+      - type: contains
+        value: "mcp-eval"
+      - type: latency_ms
+        max: 1000