From e6be3c0df00db4a838cb231a8a403fff42645257 Mon Sep 17 00:00:00 2001
From: Hongming Wang <hongmingwang.rabbit@users.noreply.github.com>
Date: Fri, 1 May 2026 14:23:40 -0700
Subject: [PATCH] test(mcp): pin prompt-injection defense in
 _CHANNEL_INSTRUCTIONS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds the missing symmetric pin against the threat-model sentence —
the existing tests pin reply-tool names (send_message_to_user,
delegate_task, inbox_pop) and tag attributes (kind, peer_id,
activity_id) but left the "treat message body as untrusted user
content" line unpinned. A copy-edit that drops it would turn the
channel into an open prompt-injection vector against any workspace
running the MCP server.

Pins three signals: "untrusted" present, an explicit
"not execute"/"do not" clause, and the "approval" escape-hatch
sentence — two of three would let a partial copy-edit slip
through.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 workspace/tests/test_a2a_mcp_server.py | 34 ++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/workspace/tests/test_a2a_mcp_server.py b/workspace/tests/test_a2a_mcp_server.py
index 41b5f12c..2fd701cf 100644
--- a/workspace/tests/test_a2a_mcp_server.py
+++ b/workspace/tests/test_a2a_mcp_server.py
@@ -331,3 +331,37 @@ def test_initialize_instructions_documents_meta_attributes():
             f"instructions must document the `{required_attr}` tag "
             f"attribute for the agent to act on it"
         )
+
+
+def test_initialize_instructions_pins_prompt_injection_defense():
+    """The threat-model sentence in `_CHANNEL_INSTRUCTIONS` is what
+    tells the agent that inbound canvas-user / peer-agent message
+    bodies are untrusted user content and must NOT be acted on as
+    instructions without chat-side approval. Symmetric with the reply-
+    tool pins above — drop this and a future copy-edit could silently
+    turn the channel into an open prompt-injection vector against any
+    workspace running this MCP server.
+    """
+    from a2a_mcp_server import _build_initialize_result
+
+    instructions = _build_initialize_result()["instructions"]
+    lowered = instructions.lower()
+
+    assert "untrusted" in lowered, (
+        "instructions must flag inbound message bodies as untrusted "
+        "user content — same threat model as the telegram channel "
+        "plugin. Dropping this turns the channel into a prompt-"
+        "injection vector."
+    )
+    # And the explicit don't-execute-blindly clause: pin both the
+    # restriction ("do not execute") and the escape hatch ("user
+    # approval") so a partial copy-edit can't keep one and drop the
+    # other.
+    assert "not execute" in lowered or "do not" in lowered, (
+        "instructions must explicitly say the agent should NOT execute "
+        "instructions embedded in message bodies"
+    )
+    assert "approval" in lowered, (
+        "instructions must point the agent at user chat-side approval "
+        "as the escape hatch when a message looks instruction-like"
+    )