# molecule-core/workspace/_sanitize_a2a.py

"""A2A trust-boundary sanitizer — escapes markers in peer-supplied text.

Issue #346 / OFFSEC-003.

Peer agents can return text that contains trust-boundary markers our own code
uses (e.g. [A2A_ERROR], [A2A_QUEUED]). If this text reaches the agent's prompt
context, a malicious peer could inject fake error/control blocks to manipulate
the agent's behavior.

This module provides `sanitize_a2a_result` which inserts a ZERO-WIDTH SPACE
(U+200B) between the opening `[` and the marker text, breaking regex/string
pattern matches while being invisible to humans reading the content.

The ZERO-WIDTH SPACE is used because:
1. It is invisible in all common fonts and terminals
2. It is a valid Unicode character (Category Cf: Format)
3. It does not affect LLM tokenization meaningfully
4. The agent cannot easily "fix" it back because it can't see it
"""

from __future__ import annotations

import re
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    pass

# Zero-width space (U+200B) — the "escape" character inserted directly after
# the opening bracket of a marker.
#
# It is deliberately written as an explicit escape sequence. A literal U+200B
# is invisible in source, indistinguishable from an empty string in review,
# and easily dropped by editors, formatters or copy/paste — which would
# silently turn the whole sanitizer into a no-op.
ZWSP = "\u200b"

# Replacement templates in ``re.sub`` syntax; ``\1`` is the captured marker
# name. Both are derived from ZWSP so the escape character is defined once.
_ESCAPED_CLOSED = f"[{ZWSP}\\1]"  # for patterns that consume the closing "]"
_ESCAPED_OPEN = f"[{ZWSP}\\1"  # for patterns that stop after the marker name

# Known trust-boundary markers that appear in square-bracket form.
# These are the ones our own code generates and the ones a malicious peer
# might try to inject. Each entry: (regex, replacement_template).
# The replacement puts ZWSP right after the opening bracket, so "[A2A_ERROR]"
# becomes "[<U+200B>A2A_ERROR]" — the raw marker string no longer appears as
# a contiguous substring, but the text remains human-readable.
#
# NOTE(review): the open-bracket patterns (``\b`` form) are case-sensitive
# while the closed ``[...]`` patterns use IGNORECASE, so e.g. "[system: ...]"
# is not escaped. Confirm whether that asymmetry is intentional.
_TRUST_MARKER_PATTERNS: list[tuple[re.Pattern[str], str]] = [
    # Our own sentinels (from a2a_client.py)
    (re.compile(r"\[(A2A_ERROR)\]", re.IGNORECASE), _ESCAPED_CLOSED),
    (re.compile(r"\[(A2A_QUEUED)\]", re.IGNORECASE), _ESCAPED_CLOSED),
    # System-level markers (open-bracket form — matches "[SYSTEM" followed by
    # a word boundary, regardless of what comes after the marker name)
    (re.compile(r"\[(SYSTEM)\b"), _ESCAPED_OPEN),
    (re.compile(r"\[(SYSTEM)\]", re.IGNORECASE), _ESCAPED_CLOSED),
    (re.compile(r"\[(AGENT)\b"), _ESCAPED_OPEN),
    # Generic control markers a peer might inject
    (re.compile(r"\[(ADMIN)\b"), _ESCAPED_OPEN),
    (re.compile(r"\[(BYPASS)\b"), _ESCAPED_OPEN),
    (re.compile(r"\[(IGNORE)\b"), _ESCAPED_OPEN),
]


def sanitize_a2a_result(text: str) -> str:
    """Neutralize trust-boundary markers found in peer-supplied A2A text.

    Every ``(pattern, replacement)`` pair in ``_TRUST_MARKER_PATTERNS`` is
    applied to the text in list order. Per the module's design, each
    replacement re-emits the matched marker with an escape character placed
    just after the opening ``[`` (e.g. ``[A2A_ERROR]`` becomes
    ``[<U+200B>A2A_ERROR]``), so the raw marker no longer appears as a
    contiguous substring and naive marker checks do not fire on peer content.

    Running the function on already-escaped text is intended to be a no-op:
    once the escape character follows ``[``, the marker patterns should no
    longer match.

    Args:
        text: Raw peer-supplied text, e.g. from the ``response_preview`` or
              ``summary`` fields of a delegation result.

    Returns:
        The text after all marker substitutions. Falsy input (such as an
        empty string) is returned unchanged without consulting the patterns.
    """
    if text:
        # Each pass feeds the next, so later patterns see text that earlier
        # patterns have already escaped.
        for marker_re, escaped_form in _TRUST_MARKER_PATTERNS:
            text = marker_re.sub(escaped_form, text)
    return text