# molecule-core/workspace/_sanitize_a2a.py

"""OFFSEC-003: A2A peer-result sanitization — shared across delegation tools.

This module is intentionally a LEAF (no imports from the molecule-runtime
package) to avoid circular dependency cycles. Both ``a2a_tools_delegation``
and ``a2a_tools`` can import from here without creating import loops.

Trust-boundary design (OFFSEC-003):
    A2A peer responses are untrusted third-party content. Before passing
    them to the agent context, they MUST be wrapped in a trust-boundary
    marker pair so the calling agent knows the content is external.

Boundary markers:
    - _A2A_BOUNDARY_START = "[A2A_RESULT_FROM_PEER]"
    - _A2A_BOUNDARY_END   = "[/A2A_RESULT_FROM_PEER]"

The boundary is the PRIMARY security control. A peer that sends
"[A2A_RESULT_FROM_PEER]evil[/A2A_RESULT_FROM_PEER]safe" can make "safe"
appear inside the trusted context unless the markers themselves are
escaped before wrapping — see _escape_boundary_markers() below.

Defense-in-depth (secondary):
    Known prompt-injection control-words are also escaped so that even
    if a calling agent ignores the boundary marker, embedded attack
    patterns (SYSTEM:, OVERRIDE:, etc.) lose their special meaning.
    This is not a complete injection sanitizer — do not rely on it as
    the primary control.
"""

from __future__ import annotations

import re

# ── Trust-boundary markers ────────────────────────────────────────────────────

_A2A_BOUNDARY_START = "[A2A_RESULT_FROM_PEER]"
_A2A_BOUNDARY_END = "[/A2A_RESULT_FROM_PEER]"

# ── Boundary-marker escaping ─────────────────────────────────────────────────
# Peer text ends up wrapped between the two markers above. If the raw text
# carried its own "[/A2A_RESULT_FROM_PEER]", whatever followed it would read
# as though it sat outside the untrusted region. Both markers are therefore
# rewritten BEFORE wrapping. The escaped forms start with "[/ " (note the
# space), which keeps them visually distinct from a genuine marker.


def _escape_boundary_markers(text: str) -> str:
    """Neutralise literal boundary markers found in raw peer text.

    Every exact (case-sensitive) occurrence of the opening marker becomes
    ``[/ A2A_RESULT_FROM_PEER]`` and every occurrence of the closing marker
    becomes ``[/ /A2A_RESULT_FROM_PEER]``. After this pass the text holds
    neither real marker, so a peer cannot close the boundary early or
    forge a new opener.

    Args:
        text: Untrusted peer text, not yet wrapped.

    Returns:
        ``text`` with both markers replaced by their escaped forms.
    """
    # Opener first, then closer. Neither stand-in contains the other marker,
    # so the second pass cannot undo or re-trigger the first.
    stand_ins = (
        (_A2A_BOUNDARY_START, "[/ A2A_RESULT_FROM_PEER]"),
        (_A2A_BOUNDARY_END, "[/ /A2A_RESULT_FROM_PEER]"),
    )
    neutralised = text
    for marker, stand_in in stand_ins:
        neutralised = neutralised.replace(marker, stand_in)
    return neutralised


# ── Defense-in-depth: injection pattern escaping ───────────────────────────────
# These patterns cover common prompt-injection phrasings. They are NOT a
# complete sanitizer — see module docstring. The boundary marker is the
# primary control; these are purely defense-in-depth.

# Ordered list of (compiled pattern, replacement template) pairs. Each
# replacement back-references group 1 so that whatever preceded the control
# word (nothing at string start, or a single delimiter character) is kept
# intact — the escaping must only rewrite the control word itself.
_INJECTION_PATTERNS = [
    # Single-word patterns: require start-of-string or a non-word character
    # before the word, and a word boundary after it, so they don't match
    # inside other words (e.g. "SYSTEM" in "mySYSTEMatic").
    (re.compile(r"(^|[^\w])SYSTEM\b", re.IGNORECASE), r"\1[ESCAPED_SYSTEM]"),
    (re.compile(r"(^|[^\w])OVERRIDE\b", re.IGNORECASE), r"\1[ESCAPED_OVERRIDE]"),
    # "INSTRUCTION(S)" is only escaped at the very start of the text or
    # directly after a newline. The captured newline is re-emitted via \1 so
    # the peer's line structure is preserved and no stray leading space is
    # introduced at the start of the text.
    (re.compile(r"(^|\n)INSTRUCTIONS?\b", re.IGNORECASE), r"\1[ESCAPED_INSTRUCTIONS]"),
    # Multi-word phrases: any run of whitespace may separate the words.
    (re.compile(r"(^|[^\w])IGNORE\s+ALL\b", re.IGNORECASE), r"\1[ESCAPED_IGNORE_ALL]"),
    (re.compile(r"(^|[^\w])YOU\s+ARE\s+NOW\b", re.IGNORECASE), r"\1[ESCAPED_YOU_ARE_NOW]"),
]


def sanitize_a2a_result(text: str) -> str:
    """Neutralise untrusted A2A peer text before it reaches the agent (OFFSEC-003).

    Two passes are applied, in this order:
      1. Literal boundary markers in the raw text are escaped, so the peer
         cannot break out of the trust boundary from inside its response.
      2. Each known prompt-injection pattern is rewritten to an
         ``[ESCAPED_...]`` token (defense-in-depth only).

    Falsy input (empty string or ``None``) is returned as-is.

    This function does NOT add the boundary wrapper. A caller that wants a
    trust boundary must place the returned text, on its own lines, between
    ``[A2A_RESULT_FROM_PEER]`` and ``[/A2A_RESULT_FROM_PEER]``; the module
    points to ``a2a_tools_delegation.py:tool_delegate_task`` as the
    canonical example of that wrapping.

    Args:
        text: Raw text received from an A2A peer.

    Returns:
        The sanitized text, or the original falsy value unchanged.
    """
    if text:
        # Pass 1: boundary markers must be handled first — they are the
        # primary control.
        cleaned = _escape_boundary_markers(text)

        # Pass 2: apply every injection pattern in its declared order.
        for regex, substitute in _INJECTION_PATTERNS:
            cleaned = regex.sub(substitute, cleaned)

        return cleaned

    # Nothing to sanitize; hand the empty/None value straight back.
    return text