# molecule-core/workspace/adapters/smolagents/env_sanitize.py

"""Allowlist-based environment sanitization for smolagents (#826 — C3 CRITICAL).

Security model
--------------
We use an **allowlist** (not a denylist) — only variables explicitly
enumerated as safe are passed through to agent-executed code.  Any key not
on the list is silently dropped.

This is intentionally strict: adding a new safe variable is a deliberate
engineering act that surfaces in code review, rather than hoping a regex
denylist catches every new secret name.

Thread safety
-------------
``SafeLocalPythonExecutor.__call__`` mutates ``os.environ`` temporarily.
``_ENV_PATCH_LOCK`` serialises concurrent calls so simultaneous executions
do not see each other's env patches.

Extending the allowlist
-----------------------
Set ``SMOLAGENTS_ENV_EXTRA_ALLOWLIST`` to a comma-separated list of
additional env var names that should be passed through (names are
upper-cased before matching).  This is intended for workspace-specific,
non-secret variables that you know are safe, for example:

    SMOLAGENTS_ENV_EXTRA_ALLOWLIST="MY_COMPANY_ENV,REGION"

Never add secret names here — use workspace secrets injection instead.
"""

from __future__ import annotations

import os
import threading
from typing import Any, Dict, List, Optional

# ---------------------------------------------------------------------------
# Allowlist configuration
# ---------------------------------------------------------------------------

# Built-in set of environment variable names that may be forwarded to
# agent-executed code.  Each entry is a non-secret system or runtime setting
# that agent code may legitimately need (e.g. PATH for subprocess-free tools,
# PYTHONPATH for module resolution, TZ for datetime ops).
_SAFE_ENV_ALLOWLIST: frozenset = frozenset(
    {
        # -- shell / system fundamentals --
        "PATH", "HOME", "USER", "LOGNAME", "SHELL", "TERM",
        "TZ", "TMPDIR", "TEMP", "TMP",
        # -- language / locale --
        "LANG", "LANGUAGE",
        "LC_ALL", "LC_CTYPE", "LC_MESSAGES", "LC_NUMERIC", "LC_TIME",
        # -- Python runtime --
        "PYTHONPATH", "PYTHONHOME",
        "PYTHONDONTWRITEBYTECODE", "PYTHONUNBUFFERED", "PYTHONIOENCODING",
        # -- Molecule workspace identity (non-secret) --
        "WORKSPACE_ID", "WORKSPACE_NAME", "PLATFORM_URL",
    }
)

# Module names that are never placed on the executor's authorized-import
# list, even when a caller requests them.  These are well-known
# sandbox-escape vectors.
_BANNED_IMPORTS: frozenset = frozenset(
    {
        "subprocess",
        "socket",
        "ctypes",
        "importlib",
        "importlib.util",
    }
)

# Baseline imports every SafeLocalPythonExecutor allows — pure-computation
# modules with no I/O escape surface.
#
# ``io`` is deliberately NOT listed: ``io.open`` is an alias of the builtin
# ``open`` and ``io.FileIO`` opens arbitrary paths, so authorising the module
# hands agent code file-system access (e.g. ``/proc/self/environ`` on Linux),
# which side-steps the environment sanitisation.  Callers that only need
# in-memory buffers can opt in explicitly via ``additional_imports``.
_BASELINE_SAFE_IMPORTS: List[str] = [
    "math",
    "json",
    "re",
    "datetime",
    "collections",
    "itertools",
    "functools",
    "typing",
    "string",
    "textwrap",
    "decimal",
    "fractions",
    "statistics",
    "random",
    "hashlib",
    "base64",
    "urllib.parse",
    "copy",
    "dataclasses",
    "enum",
    "abc",
]

# Module-level lock guarding the temporary ``os.environ`` patch applied in
# ``SafeLocalPythonExecutor.__call__``.  It is a single plain (non-reentrant)
# ``threading.Lock`` shared by every executor instance in this module, so
# patched executions run one at a time.
_ENV_PATCH_LOCK = threading.Lock()


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------


def make_safe_env(
    extra_allowed: Optional[List[str]] = None,
) -> Dict[str, str]:
    """Build a filtered copy of the process environment.

    Only variables whose names are allowlisted survive; everything else is
    silently left out.  ``os.environ`` itself is read but **never mutated**.

    The effective allowlist is the union of three sources:

    * the built-in ``_SAFE_ENV_ALLOWLIST``;
    * ``extra_allowed`` (each name upper-cased);
    * the comma-separated ``SMOLAGENTS_ENV_EXTRA_ALLOWLIST`` env var (each
      name stripped of surrounding whitespace and upper-cased; empty items
      are ignored).

    Parameters
    ----------
    extra_allowed:
        Additional variable names to include beyond the built-in allowlist.

    Returns
    -------
    dict
        A new ``{name: value}`` dict holding only the allowlisted entries of
        ``os.environ``.
    """
    # Names supplied by the caller.
    from_caller = {name.upper() for name in extra_allowed} if extra_allowed else set()

    # Names supplied through configuration; blank items collapse to "" and
    # are discarded.
    configured = os.environ.get("SMOLAGENTS_ENV_EXTRA_ALLOWLIST", "")
    from_config = {item.strip().upper() for item in configured.split(",")} - {""}

    permitted = _SAFE_ENV_ALLOWLIST | from_caller | from_config
    return {name: value for name, value in os.environ.items() if name in permitted}


class SafeLocalPythonExecutor:
    """Allowlist-gated wrapper around smolagents ``LocalPythonExecutor``.

    While agent-generated code runs, every variable that is not allowlisted
    is removed from ``os.environ``, so secrets such as ``ANTHROPIC_API_KEY``,
    ``GH_TOKEN`` or ``DATABASE_URL`` are absent from it.  The full
    environment is restored when execution finishes, even on exception.

    ``os.environ`` is process-global, so the patch is visible to the whole
    process for the duration of a call; calls are serialised through
    ``_ENV_PATCH_LOCK``.

    Parameters
    ----------
    additional_imports:
        Extra module names to allow beyond ``_BASELINE_SAFE_IMPORTS``.
        ``_BANNED_IMPORTS`` takes precedence — banned names, their
        submodules / sub-wildcards (``ctypes.util``, ``ctypes.*``) and the
        catch-all wildcard ``*`` are silently removed.
    extra_allowed_env:
        Extra variable names to pass through beyond the core allowlist.
    _inner:
        Inject a mock ``LocalPythonExecutor`` for tests.  When ``None``,
        the real smolagents executor is constructed lazily.
    """

    def __init__(
        self,
        additional_imports: Optional[List[str]] = None,
        extra_allowed_env: Optional[List[str]] = None,
        *,
        _inner: Any = None,
    ) -> None:
        """Compute the authorized import list and store the configuration."""
        # Final import list = baseline + caller extras − banned.
        combined = list(_BASELINE_SAFE_IMPORTS)
        if additional_imports:
            combined.extend(
                imp for imp in additional_imports if not self._is_banned_import(imp)
            )

        self._authorized_imports: List[str] = combined
        self._extra_allowed_env: Optional[List[str]] = extra_allowed_env
        self._inner = _inner  # may be None until first call

    @staticmethod
    def _is_banned_import(name: str) -> bool:
        """Return True if authorising ``name`` would expose a banned module.

        An exact-match check is not enough: a submodule such as
        ``importlib.machinery`` or a pattern such as ``ctypes.*`` still
        reaches the banned package, and a bare ``*`` authorises every import.
        """
        if name == "*":
            return True
        return any(
            name == banned or name.startswith(banned + ".")
            for banned in _BANNED_IMPORTS
        )

    def _get_inner(self) -> Any:
        """Lazy-construct the real executor on first use (avoids import errors in tests)."""
        if self._inner is None:
            from smolagents import LocalPythonExecutor  # type: ignore[import]

            self._inner = LocalPythonExecutor(
                additional_authorized_imports=self._authorized_imports
            )
        return self._inner

    def __call__(self, code: str, *args: Any, **kwargs: Any) -> Any:
        """Execute ``code`` with only allowlisted env vars visible.

        All keys not on the allowlist are removed from ``os.environ`` for
        the duration of execution and restored afterward, even on exception.
        Any changes the executed code makes to ``os.environ`` are discarded
        by the restore.

        Returns whatever the wrapped executor returns; exceptions raised by
        it propagate unchanged.
        """
        with _ENV_PATCH_LOCK:
            # Everything that depends on the *real* environment must happen
            # under the lock.  Outside it, another thread's call may have
            # os.environ stripped, and the allowlist would then be computed
            # without SMOLAGENTS_ENV_EXTRA_ALLOWLIST.  Constructing the inner
            # executor here also keeps lazy initialisation from racing.
            inner = self._get_inner()
            safe_env = make_safe_env(self._extra_allowed_env)

            # Snapshot the full environment so it can be restored verbatim.
            original_env = dict(os.environ)
            for key in original_env:
                if key not in safe_env:
                    del os.environ[key]
            try:
                return inner(code, *args, **kwargs)
            finally:
                # Always restore, dropping anything the executed code set.
                os.environ.clear()
                os.environ.update(original_env)