forked from molecule-ai/molecule-core
Renames: - platform/ → workspace-server/ (Go module path stays as "platform" for external dep compat — will update after plugin module republish) - workspace-template/ → workspace/ Removed (moved to separate repos or deleted): - PLAN.md — internal roadmap (move to private project board) - HANDOFF.md, AGENTS.md — one-time internal session docs - .claude/ — gitignored entirely (local agent config) - infra/cloudflare-worker/ → Molecule-AI/molecule-tenant-proxy - org-templates/molecule-dev/ → standalone template repo - .mcp-eval/ → molecule-mcp-server repo - test-results/ — ephemeral, gitignored Security scrubbing: - Cloudflare account/zone/KV IDs → placeholders - Real EC2 IPs → <EC2_IP> in all docs - CF token prefix, Neon project ID, Fly app names → redacted - Langfuse dev credentials → parameterized - Personal runner username/machine name → generic Community files: - CONTRIBUTING.md — build, test, branch conventions - CODE_OF_CONDUCT.md — Contributor Covenant 2.1 All Dockerfiles, CI workflows, docker-compose, railway.toml, render.yaml, README, CLAUDE.md updated for new directory names. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
679 lines
25 KiB
Python
679 lines
25 KiB
Python
"""Tests for the sandbox run_code tool — subprocess, docker-routing, and e2b backends.
|
|
|
|
The e2b backend tests use a fully mocked e2b_code_interpreter to avoid
|
|
requiring a real E2B_API_KEY or network access in CI.
|
|
|
|
Design notes:
|
|
- sandbox.py lives in tools/ alongside other tool modules.
|
|
- conftest.py stubs sys.modules["tools"] so a plain `import builtin_tools.sandbox`
|
|
would hit the stub. We load sandbox.py via its file path instead.
|
|
- SANDBOX_BACKEND is captured as a module-level constant on load, so
|
|
_load_sandbox() must be called with it set.
|
|
- E2B_API_KEY and e2b_code_interpreter are read at call-time inside
|
|
_run_e2b(), so they must be present in os.environ / sys.modules during
|
|
the actual async call (use monkeypatch or patch.dict).
|
|
"""
|
|
|
|
import asyncio
|
|
import importlib.util
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
import pytest
|
|
|
|
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Absolute path to the real sandbox.py. Loaded by file path in _load_sandbox()
# because conftest stubs sys.modules["tools"], so a normal import would hit
# the stub (see module docstring).
_SANDBOX_PATH = Path(__file__).parent.parent / "builtin_tools" / "sandbox.py"
|
|
|
|
|
|
def _load_sandbox(sandbox_backend: str = "subprocess", extra_env: dict | None = None):
    """
    Load (or reload) tools/sandbox.py from its real file path.

    Only SANDBOX_BACKEND needs to be set at load time — it's a module-level
    constant. Other env vars (E2B_API_KEY etc.) are read at call-time and
    should be set by the caller via monkeypatch or patch.dict.

    Args:
        sandbox_backend: Value SANDBOX_BACKEND has while the module executes.
        extra_env: Additional env vars to set for the duration of the load.

    Returns:
        The freshly executed sandbox module object.

    Raises:
        ImportError: If the sandbox module spec cannot be created.
    """
    # Evict any previously cached copy so exec_module re-runs top-level code
    # (module-level constants like SANDBOX_BACKEND are re-captured).
    for key in list(sys.modules.keys()):
        if "sandbox_mod" in key:
            del sys.modules[key]

    # Snapshot every env var we are about to touch (None == "was unset") so the
    # caller's environment is restored exactly. The original code restored
    # SANDBOX_BACKEND but unconditionally popped extra_env keys, clobbering any
    # pre-existing value (e.g. SANDBOX_TIMEOUT exported by a CI job).
    to_set = {"SANDBOX_BACKEND": sandbox_backend, **(extra_env or {})}
    saved = {k: os.environ.get(k) for k in to_set}
    os.environ.update(to_set)

    try:
        spec = importlib.util.spec_from_file_location("sandbox_mod", _SANDBOX_PATH)
        if spec is None or spec.loader is None:
            # Fail loudly instead of an opaque AttributeError below.
            raise ImportError(f"Cannot load sandbox module from {_SANDBOX_PATH}")
        mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(mod)
    finally:
        # Restore the environment exactly as it was before the call.
        for k, old in saved.items():
            if old is None:
                os.environ.pop(k, None)
            else:
                os.environ[k] = old

    return mod
|
|
|
|
|
|
def _make_e2b_mock(stdout_text: str = "hello e2b\n", stderr_text: str = ""):
|
|
"""Build a mock e2b Sandbox that returns a plausible execution result."""
|
|
result_obj = MagicMock()
|
|
result_obj.text = stdout_text
|
|
result_obj.error = None
|
|
|
|
logs_obj = MagicMock()
|
|
logs_obj.stdout = []
|
|
logs_obj.stderr = [stderr_text] if stderr_text else []
|
|
|
|
exec_obj = MagicMock()
|
|
exec_obj.results = [result_obj]
|
|
exec_obj.logs = logs_obj
|
|
|
|
sandbox_instance = MagicMock()
|
|
sandbox_instance.run_code.return_value = exec_obj
|
|
sandbox_instance.kill.return_value = None
|
|
|
|
sandbox_cls = MagicMock(return_value=sandbox_instance)
|
|
return sandbox_cls, sandbox_instance
|
|
|
|
|
|
def _run_sync(coro):
|
|
return asyncio.run(coro)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# subprocess backend
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestSubprocessBackend:
    """Exercise _run_subprocess against real local child processes."""

    def test_python_hello(self):
        mod = _load_sandbox("subprocess")
        out = _run_sync(mod._run_subprocess('print("hello subprocess")', "python"))
        assert out["exit_code"] == 0
        assert "hello subprocess" in out["stdout"]
        assert out["backend"] == "subprocess"

    def test_stderr_nonzero_exit(self):
        # sys.exit(2) must be surfaced as the literal exit code.
        mod = _load_sandbox("subprocess")
        out = _run_sync(mod._run_subprocess("import sys; sys.exit(2)", "python"))
        assert out["exit_code"] == 2

    def test_unsupported_language(self):
        mod = _load_sandbox("subprocess")
        out = _run_sync(mod._run_subprocess("code", "cobol"))
        assert out["exit_code"] == -1
        assert "Unsupported" in out["error"]

    def test_syntax_error_captured_in_stderr(self):
        # Broken source should fail the child interpreter, not the test harness.
        mod = _load_sandbox("subprocess")
        out = _run_sync(mod._run_subprocess("def broken(:", "python"))
        assert out["exit_code"] != 0

    def test_timeout(self):
        mod = _load_sandbox("subprocess", {"SANDBOX_TIMEOUT": "1"})
        # SANDBOX_TIMEOUT was captured at module load; override the constant
        # directly so the 1-second limit actually applies to this call.
        mod.SANDBOX_TIMEOUT = 1
        out = _run_sync(mod._run_subprocess("import time; time.sleep(10)", "python"))
        assert out["exit_code"] == -1
        assert "Timeout" in out["error"]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# E2B backend
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestE2BBackend:
    """
    All tests mock e2b_code_interpreter to avoid real network calls.
    E2B_API_KEY must be present in os.environ for the duration of _run_e2b
    (it's read at call-time, not module-load time).
    """

    def _call_e2b(self, code: str, language: str, sandbox_cls, api_key: str = "test-key"):
        # Shared driver: load the e2b backend, inject a fake e2b_code_interpreter
        # module exposing `sandbox_cls` as Sandbox, set E2B_API_KEY, and invoke
        # _run_e2b. Returns (result_dict, loaded_module, sandbox_cls).
        sb = _load_sandbox("e2b")
        mock_mod = MagicMock()
        mock_mod.Sandbox = sandbox_cls
        with patch.dict(os.environ, {"E2B_API_KEY": api_key}):
            with patch.dict("sys.modules", {"e2b_code_interpreter": mock_mod}):
                return _run_sync(sb._run_e2b(code, language)), sb, sandbox_cls

    def test_python_success(self):
        # Happy path: result text becomes stdout; backend/language echoed back.
        sandbox_cls, sandbox_instance = _make_e2b_mock(stdout_text="42\n")
        result, _, _ = self._call_e2b("print(6 * 7)", "python", sandbox_cls)

        assert result["exit_code"] == 0
        assert result["backend"] == "e2b"
        assert result["language"] == "python"
        assert result["stdout"] == "42\n"
        # The sandbox must always be torn down after a run.
        sandbox_instance.kill.assert_called_once()

    def test_javascript_success(self):
        sandbox_cls, sandbox_instance = _make_e2b_mock(stdout_text="hello js\n")
        result, _, _ = self._call_e2b('console.log("hi")', "javascript", sandbox_cls)

        assert result["exit_code"] == 0
        assert result["language"] == "javascript"
        # E2B kernel must be remapped: "javascript" → "js"
        call_args = sandbox_instance.run_code.call_args
        # Accept the kernel either as the `language=` kwarg or as the second
        # positional argument — both calling conventions are valid.
        called_kernel = (
            call_args.kwargs.get("language")
            or (call_args.args[1] if len(call_args.args) > 1 else None)
        )
        assert called_kernel == "js", f"Expected kernel 'js', got {called_kernel!r}"

    def test_stderr_produces_nonzero_exit(self):
        # Kernel stderr output should map to exit_code 1.
        sandbox_cls, _ = _make_e2b_mock(
            stdout_text="", stderr_text="NameError: name 'x' is not defined"
        )
        result, _, _ = self._call_e2b("print(x)", "python", sandbox_cls)

        assert result["exit_code"] == 1
        assert "NameError" in result["stderr"]

    def test_missing_api_key_returns_error(self):
        sb = _load_sandbox("e2b")
        sandbox_cls, _ = _make_e2b_mock()
        mock_mod = MagicMock()
        mock_mod.Sandbox = sandbox_cls
        # Do NOT set E2B_API_KEY
        with patch.dict("sys.modules", {"e2b_code_interpreter": mock_mod}):
            with patch.dict(os.environ, {}, clear=False):
                # patch.dict snapshots os.environ, so popping the key here is
                # undone when the context exits.
                os.environ.pop("E2B_API_KEY", None)
                result = _run_sync(sb._run_e2b("print(1)", "python"))

        assert result["exit_code"] == -1
        assert "E2B_API_KEY" in result["error"]

    def test_missing_package_returns_error(self):
        sb = _load_sandbox("e2b")
        with patch.dict(os.environ, {"E2B_API_KEY": "key"}):
            # Simulate ImportError by putting None in sys.modules
            with patch.dict("sys.modules", {"e2b_code_interpreter": None}):
                result = _run_sync(sb._run_e2b("print(1)", "python"))

        assert result["exit_code"] == -1
        # Error message should name the pip package to install.
        assert "e2b-code-interpreter" in result["error"]

    def test_unsupported_language_returns_error(self):
        sandbox_cls, _ = _make_e2b_mock()
        result, _, _ = self._call_e2b("echo hi", "shell", sandbox_cls)

        assert result["exit_code"] == -1
        assert "not supported by the e2b backend" in result["error"]

    def test_sandbox_always_killed_on_exception(self):
        """sandbox.kill() is called even when run_code raises."""
        sandbox_instance = MagicMock()
        sandbox_instance.run_code.side_effect = RuntimeError("network error")
        sandbox_instance.kill.return_value = None
        sandbox_cls = MagicMock(return_value=sandbox_instance)

        result, _, _ = self._call_e2b("print(1)", "python", sandbox_cls)

        assert result["exit_code"] == -1
        assert "network error" in result["error"]
        sandbox_instance.kill.assert_called_once()

    def test_output_truncated_at_max_output(self):
        # Oversized stdout is clamped to the module's MAX_OUTPUT limit.
        big = "x" * 20_000
        sandbox_cls, _ = _make_e2b_mock(stdout_text=big)
        result, sb, _ = self._call_e2b("print('x' * 20000)", "python", sandbox_cls)

        assert "stdout" in result
        assert len(result["stdout"]) <= sb.MAX_OUTPUT

    def test_api_key_forwarded_to_constructor(self):
        """E2B_API_KEY from env is passed to Sandbox(api_key=...)."""
        sandbox_cls, _ = _make_e2b_mock()
        _, _, used_cls = self._call_e2b("print(1)", "python", sandbox_cls, api_key="my-secret")

        call_kwargs = used_cls.call_args.kwargs
        assert call_kwargs.get("api_key") == "my-secret"

    def test_timeout_forwarded_to_constructor(self):
        """SANDBOX_TIMEOUT is forwarded as the sandbox timeout kwarg."""
        sandbox_cls, _ = _make_e2b_mock()
        sb = _load_sandbox("e2b", {"SANDBOX_TIMEOUT": "45"})
        # Override the load-time constant so the forwarded value is deterministic.
        sb.SANDBOX_TIMEOUT = 45

        mock_mod = MagicMock()
        mock_mod.Sandbox = sandbox_cls
        with patch.dict(os.environ, {"E2B_API_KEY": "key"}):
            with patch.dict("sys.modules", {"e2b_code_interpreter": mock_mod}):
                _run_sync(sb._run_e2b("print(1)", "python"))

        call_kwargs = sandbox_cls.call_args.kwargs
        assert call_kwargs.get("timeout") == 45
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Dispatcher routing — verify SANDBOX_BACKEND selects the right function
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestRunCodeDispatcher:
    """Verify run_code routes to the backend named by SANDBOX_BACKEND."""

    def test_subprocess_backend_dispatched(self):
        mod = _load_sandbox("subprocess")
        assert mod.SANDBOX_BACKEND == "subprocess"
        outcome = _run_sync(mod._run_subprocess("1 + 1", "python"))
        assert outcome["exit_code"] == 0

    def test_e2b_backend_dispatched(self):
        """run_code routes to _run_e2b when SANDBOX_BACKEND=e2b."""
        mod = _load_sandbox("e2b")
        assert mod.SANDBOX_BACKEND == "e2b"

        recorded = []

        async def fake_e2b(code, language):
            recorded.append((code, language))
            return {"exit_code": 0, "stdout": "ok", "backend": "e2b"}

        with patch.object(mod, "_run_e2b", fake_e2b):
            # conftest mocks @tool as identity, so run_code is the raw async fn
            outcome = _run_sync(mod.run_code("print(1)", "python"))

        assert recorded == [("print(1)", "python")]
        assert outcome["backend"] == "e2b"

    def test_docker_backend_dispatched(self):
        """run_code routes to _run_docker when SANDBOX_BACKEND=docker."""
        mod = _load_sandbox("docker")
        assert mod.SANDBOX_BACKEND == "docker"

        recorded = []

        async def fake_docker(code, language):
            recorded.append((code, language))
            return {"exit_code": 0, "stdout": "ok", "backend": "docker"}

        with patch.object(mod, "_run_docker", fake_docker):
            outcome = _run_sync(mod.run_code("echo hi", "shell"))

        assert recorded == [("echo hi", "shell")]
        assert outcome["backend"] == "docker"

    def test_subprocess_backend_routes_to_run_subprocess(self):
        """run_code with SANDBOX_BACKEND=subprocess calls _run_subprocess."""
        mod = _load_sandbox("subprocess")

        recorded = []

        async def fake_subprocess(code, language):
            recorded.append((code, language))
            return {"exit_code": 0, "stdout": "ok", "backend": "subprocess"}

        with patch.object(mod, "_run_subprocess", fake_subprocess):
            outcome = _run_sync(mod.run_code("print(1)", "python"))

        assert recorded == [("print(1)", "python")]
        assert outcome["backend"] == "subprocess"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Additional subprocess backend edge-cases
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestSubprocessEdgeCases:
    """Edge cases in _run_subprocess's timeout and error handling paths."""

    def test_process_lookup_error_on_kill(self):
        """ProcessLookupError during proc.kill() after timeout is silently ignored."""
        sb = _load_sandbox("subprocess")
        sb.SANDBOX_TIMEOUT = 1

        # We need the real timeout path but with proc.kill() raising ProcessLookupError.
        # Patch asyncio.wait_for to raise TimeoutError then patch proc.kill to raise.
        import asyncio as _asyncio

        # NOTE(review): original_create is never used — candidate for removal.
        original_create = _asyncio.create_subprocess_exec

        async def fake_create(*args, **kwargs):
            # Fake process: communicate() times out, kill() raises, wait() no-ops.
            proc = MagicMock()
            proc.returncode = None

            async def _communicate():
                raise _asyncio.TimeoutError()

            proc.communicate = _communicate

            def _kill():
                raise ProcessLookupError("no such process")

            proc.kill = _kill

            async def _wait():
                pass

            proc.wait = _wait
            return proc

        with patch("asyncio.create_subprocess_exec", fake_create):
            result = _run_sync(sb._run_subprocess("import time; time.sleep(100)", "python"))

        # Timeout result dict is still produced despite the kill() failure.
        assert result["exit_code"] == -1
        assert "Timeout" in result["error"]

    def test_general_exception_in_subprocess_exec(self):
        """Exception from asyncio.create_subprocess_exec is caught and returned."""
        sb = _load_sandbox("subprocess")

        async def fake_create(*args, **kwargs):
            raise OSError("no such executable")

        with patch("asyncio.create_subprocess_exec", fake_create):
            result = _run_sync(sb._run_subprocess("print(1)", "python"))

        assert result["exit_code"] == -1
        assert "no such executable" in result["error"]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Docker backend
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestDockerBackend:
    """Exercise _run_docker with fully faked asyncio subprocess machinery."""

    def _make_docker_proc(self, stdout: bytes = b"", stderr: bytes = b"", returncode: int = 0):
        """Return a fake asyncio subprocess-like object."""
        proc = MagicMock()
        proc.returncode = returncode

        async def _communicate():
            return (stdout, stderr)

        proc.communicate = _communicate
        return proc

    def test_run_docker_unsupported_language(self):
        sb = _load_sandbox("docker")
        result = _run_sync(sb._run_docker("code", "cobol"))
        assert result["exit_code"] == -1
        assert "Unsupported" in result["error"]

    def test_run_docker_success(self):
        """_run_docker returns exit_code=0 and correct stdout on success."""
        # NOTE(review): this alias import is unused here — candidate for removal.
        import asyncio as _asyncio

        sb = _load_sandbox("docker")
        fake_proc = self._make_docker_proc(stdout=b"hello docker\n", stderr=b"")

        # Pass-through wait_for so no real timeout applies.
        async def fake_wait_for(coro, timeout):
            return await coro

        async def fake_create(*args, **kwargs):
            return fake_proc

        with patch("asyncio.create_subprocess_exec", fake_create), \
             patch("asyncio.wait_for", fake_wait_for):
            result = _run_sync(sb._run_docker('print("hello docker")', "python"))

        assert result["exit_code"] == 0
        assert "hello docker" in result["stdout"]
        assert result["backend"] == "docker"
        assert result["language"] == "python"

    def test_run_docker_timeout(self):
        """asyncio.wait_for TimeoutError → returns timeout error dict."""
        import asyncio as _asyncio

        sb = _load_sandbox("docker")
        sb.SANDBOX_TIMEOUT = 1

        async def fake_create(*args, **kwargs):
            proc = MagicMock()
            return proc

        async def fake_wait_for(coro, timeout):
            raise _asyncio.TimeoutError()

        with patch("asyncio.create_subprocess_exec", fake_create), \
             patch("asyncio.wait_for", fake_wait_for):
            result = _run_sync(sb._run_docker("code", "python"))

        assert result["exit_code"] == -1
        assert "Timeout" in result["error"]

    def test_run_docker_general_exception(self):
        """Generic exception in create_subprocess_exec → returns error dict."""
        sb = _load_sandbox("docker")

        async def fake_create(*args, **kwargs):
            raise RuntimeError("docker not available")

        with patch("asyncio.create_subprocess_exec", fake_create):
            result = _run_sync(sb._run_docker("code", "python"))

        assert result["exit_code"] == -1
        assert "docker not available" in result["error"]

    def test_run_docker_cleanup_on_success(self, tmp_path, monkeypatch):
        """Temp file is removed after successful run."""
        # NOTE(review): _asyncio alias, tmp_path and monkeypatch fixtures are
        # unused in this test — candidates for removal.
        import asyncio as _asyncio
        import tempfile
        import os

        sb = _load_sandbox("docker")

        created_files = []
        original_mkstemp = tempfile.mkstemp

        # Record every temp file the backend creates so we can assert cleanup.
        def fake_mkstemp(suffix="", prefix="", dir=None, text=False):
            fd, path = original_mkstemp(suffix=suffix, prefix=prefix)
            created_files.append(path)
            return fd, path

        fake_proc = self._make_docker_proc(stdout=b"done\n", stderr=b"")

        async def fake_wait_for(coro, timeout):
            return await coro

        async def fake_create(*args, **kwargs):
            return fake_proc

        with patch("tempfile.mkstemp", fake_mkstemp), \
             patch("asyncio.create_subprocess_exec", fake_create), \
             patch("asyncio.wait_for", fake_wait_for):
            result = _run_sync(sb._run_docker("print('done')", "python"))

        assert result["exit_code"] == 0
        for f in created_files:
            assert not os.path.exists(f), f"temp file {f} was not cleaned up"

    def test_run_docker_cleanup_on_exception(self, tmp_path, monkeypatch):
        """Temp file is removed even when an exception is raised."""
        import tempfile
        import os

        sb = _load_sandbox("docker")

        created_files = []
        original_mkstemp = tempfile.mkstemp

        def fake_mkstemp(suffix="", prefix="", dir=None, text=False):
            fd, path = original_mkstemp(suffix=suffix, prefix=prefix)
            created_files.append(path)
            return fd, path

        async def fake_create(*args, **kwargs):
            raise RuntimeError("crash")

        with patch("tempfile.mkstemp", fake_mkstemp), \
             patch("asyncio.create_subprocess_exec", fake_create):
            result = _run_sync(sb._run_docker("print(1)", "python"))

        assert result["exit_code"] == -1
        for f in created_files:
            assert not os.path.exists(f), f"temp file {f} was not cleaned up after exception"

    def test_run_docker_cleanup_oserror_swallowed(self, tmp_path):
        """Lines 165-166: os.unlink raises OSError in finally block — swallowed, result still returned."""
        import tempfile
        import os

        sb = _load_sandbox("docker")
        fake_proc = self._make_docker_proc(stdout=b"ok\n", stderr=b"")

        created_files = []
        original_mkstemp = tempfile.mkstemp

        def fake_mkstemp(suffix="", prefix="", dir=None, text=False):
            fd, path = original_mkstemp(suffix=suffix, prefix=prefix)
            created_files.append(path)
            return fd, path

        async def fake_wait_for(coro, timeout):
            return await coro

        async def fake_create(*args, **kwargs):
            return fake_proc

        # NOTE(review): original_unlink is never used — candidate for removal.
        # Also: because unlink is forced to fail, the recorded temp files leak
        # on disk after this test.
        original_unlink = os.unlink
        unlink_calls = []

        def raising_unlink(path):
            unlink_calls.append(path)
            raise OSError("permission denied")

        with patch("tempfile.mkstemp", fake_mkstemp), \
             patch("asyncio.create_subprocess_exec", fake_create), \
             patch("asyncio.wait_for", fake_wait_for), \
             patch("os.unlink", raising_unlink):
            result = _run_sync(sb._run_docker("print('ok')", "python"))

        # OSError is swallowed; result is still returned
        assert result["exit_code"] == 0
        assert len(unlink_calls) > 0
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Gap 4: E2B backend — additional coverage paths
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestE2BBackendGapCoverage:
    """Cover lines 242, 248, 268-269, 280-281 in _run_e2b."""

    def _call_e2b(self, code, language, mock_e2b_mod, api_key="test-key"):
        # Variant of the TestE2BBackend helper that takes the whole fake module
        # (not just the Sandbox class). Returns (result_dict, loaded_module).
        sb = _load_sandbox("e2b")
        with patch.dict(os.environ, {"E2B_API_KEY": api_key}):
            with patch.dict("sys.modules", {"e2b_code_interpreter": mock_e2b_mod}):
                return _run_sync(sb._run_e2b(code, language)), sb

    def test_result_error_attribute_captured(self):
        """Line 242: result.error in execution.results → captured in stderr."""
        result_obj = MagicMock()
        result_obj.text = None
        result_obj.error = "NameError: x not defined"

        logs_obj = MagicMock()
        logs_obj.stdout = []
        logs_obj.stderr = []

        exec_obj = MagicMock()
        exec_obj.results = [result_obj]
        exec_obj.logs = logs_obj

        sandbox_instance = MagicMock()
        sandbox_instance.run_code.return_value = exec_obj
        sandbox_instance.kill.return_value = None
        sandbox_cls = MagicMock(return_value=sandbox_instance)

        mock_mod = MagicMock()
        mock_mod.Sandbox = sandbox_cls

        result, _ = self._call_e2b("print(x)", "python", mock_mod)

        assert result["exit_code"] == 1
        assert "NameError" in result["stderr"]

    def test_logs_stdout_captured(self):
        """Line 248: execution.logs.stdout → appended to stdout_parts."""
        # result.text is None here, so stdout must come from the logs stream.
        result_obj = MagicMock()
        result_obj.text = None
        result_obj.error = None

        logs_obj = MagicMock()
        logs_obj.stdout = ["hello from logs\n"]
        logs_obj.stderr = []

        exec_obj = MagicMock()
        exec_obj.results = [result_obj]
        exec_obj.logs = logs_obj

        sandbox_instance = MagicMock()
        sandbox_instance.run_code.return_value = exec_obj
        sandbox_instance.kill.return_value = None
        sandbox_cls = MagicMock(return_value=sandbox_instance)

        mock_mod = MagicMock()
        mock_mod.Sandbox = sandbox_cls

        result, _ = self._call_e2b("print('hello from logs')", "python", mock_mod)

        assert result["exit_code"] == 0
        assert "hello from logs" in result["stdout"]

    def test_e2b_timeout_returns_error(self):
        """Lines 268-269: asyncio.TimeoutError raised → returns timeout error dict."""
        import asyncio as _asyncio

        # Sandbox constructor itself raises TimeoutError via wait_for
        sandbox_instance = MagicMock()
        sandbox_cls = MagicMock(return_value=sandbox_instance)

        mock_mod = MagicMock()
        mock_mod.Sandbox = sandbox_cls

        sb = _load_sandbox("e2b")

        original_wait_for = _asyncio.wait_for

        call_count = {"n": 0}

        # Only the first wait_for call times out; any later calls delegate to
        # the real implementation so unrelated awaits still work.
        async def raising_wait_for(coro, timeout):
            call_count["n"] += 1
            if call_count["n"] == 1:
                raise _asyncio.TimeoutError()
            return await original_wait_for(coro, timeout)

        with patch.dict(os.environ, {"E2B_API_KEY": "test-key"}):
            with patch.dict("sys.modules", {"e2b_code_interpreter": mock_mod}):
                with patch("asyncio.wait_for", raising_wait_for):
                    result = _run_sync(sb._run_e2b("print(1)", "python"))

        assert result["exit_code"] == -1
        assert "Timeout" in result["error"]

    def test_e2b_cleanup_exception_swallowed(self):
        """Lines 280-281: sandbox.kill raises in finally → exception swallowed."""
        result_obj = MagicMock()
        result_obj.text = "42\n"
        result_obj.error = None

        logs_obj = MagicMock()
        logs_obj.stdout = []
        logs_obj.stderr = []

        exec_obj = MagicMock()
        exec_obj.results = [result_obj]
        exec_obj.logs = logs_obj

        sandbox_instance = MagicMock()
        sandbox_instance.run_code.return_value = exec_obj
        # Make kill raise an exception
        sandbox_instance.kill.side_effect = RuntimeError("kill failed")
        sandbox_cls = MagicMock(return_value=sandbox_instance)

        mock_mod = MagicMock()
        mock_mod.Sandbox = sandbox_cls

        result, _ = self._call_e2b("print(42)", "python", mock_mod)

        # Result is still returned despite kill() failing
        assert result["exit_code"] == 0
        assert "42" in result["stdout"]
|