molecule-core/workspace/tests/test_sandbox.py
Hongming Wang d8026347e5 chore: open-source restructure — rename dirs, remove internal files, scrub secrets
Renames:
- platform/ → workspace-server/ (Go module path stays as "platform" for
  external dep compat — will update after plugin module republish)
- workspace-template/ → workspace/

Removed (moved to separate repos or deleted):
- PLAN.md — internal roadmap (move to private project board)
- HANDOFF.md, AGENTS.md — one-time internal session docs
- .claude/ — gitignored entirely (local agent config)
- infra/cloudflare-worker/ → Molecule-AI/molecule-tenant-proxy
- org-templates/molecule-dev/ → standalone template repo
- .mcp-eval/ → molecule-mcp-server repo
- test-results/ — ephemeral, gitignored

Security scrubbing:
- Cloudflare account/zone/KV IDs → placeholders
- Real EC2 IPs → <EC2_IP> in all docs
- CF token prefix, Neon project ID, Fly app names → redacted
- Langfuse dev credentials → parameterized
- Personal runner username/machine name → generic

Community files:
- CONTRIBUTING.md — build, test, branch conventions
- CODE_OF_CONDUCT.md — Contributor Covenant 2.1

All Dockerfiles, CI workflows, docker-compose, railway.toml, render.yaml,
README, CLAUDE.md updated for new directory names.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-18 00:24:44 -07:00

679 lines
25 KiB
Python

"""Tests for the sandbox run_code tool — subprocess, docker-routing, and e2b backends.
The e2b backend tests use a fully mocked e2b_code_interpreter to avoid
requiring a real E2B_API_KEY or network access in CI.
Design notes:
- sandbox.py lives in builtin_tools/ alongside other tool modules.
- conftest.py stubs sys.modules["tools"] so a plain `import builtin_tools.sandbox`
would hit the stub. We load sandbox.py via its file path instead.
- SANDBOX_BACKEND is captured as a module-level constant on load, so
_load_sandbox() must be called with it set.
- E2B_API_KEY and e2b_code_interpreter are read at call-time inside
_run_e2b(), so they must be present in os.environ / sys.modules during
the actual async call (use monkeypatch or patch.dict).
"""
import asyncio
import importlib.util
import os
import sys
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
_SANDBOX_PATH = Path(__file__).parent.parent / "builtin_tools" / "sandbox.py"
def _load_sandbox(sandbox_backend: str = "subprocess", extra_env: dict | None = None):
"""
Load (or reload) tools/sandbox.py from its real file path.
Only SANDBOX_BACKEND needs to be set at load time — it's a module-level
constant. Other env vars (E2B_API_KEY etc.) are read at call-time and
should be set by the caller via monkeypatch or patch.dict.
"""
# Evict any previously cached copy.
for key in list(sys.modules.keys()):
if "sandbox_mod" in key:
del sys.modules[key]
saved = os.environ.get("SANDBOX_BACKEND")
os.environ["SANDBOX_BACKEND"] = sandbox_backend
for k, v in (extra_env or {}).items():
os.environ[k] = v
try:
spec = importlib.util.spec_from_file_location("sandbox_mod", _SANDBOX_PATH)
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
finally:
if saved is None:
os.environ.pop("SANDBOX_BACKEND", None)
else:
os.environ["SANDBOX_BACKEND"] = saved
for k in (extra_env or {}):
os.environ.pop(k, None)
return mod
def _make_e2b_mock(stdout_text: str = "hello e2b\n", stderr_text: str = ""):
    """Create a stand-in e2b ``Sandbox`` class together with its instance.

    The instance's ``run_code`` returns one execution object holding a single
    result whose ``text`` is *stdout_text* and whose ``error`` is ``None``.
    Its logs carry an empty ``stdout`` list and a ``stderr`` list containing
    *stderr_text* only when that string is non-empty.

    Returns:
        ``(sandbox_cls, sandbox_instance)`` — calling ``sandbox_cls`` yields
        ``sandbox_instance``.
    """
    cell_result = MagicMock(text=stdout_text, error=None)
    captured_logs = MagicMock(
        stdout=[],
        stderr=[stderr_text] if stderr_text else [],
    )
    execution = MagicMock(results=[cell_result], logs=captured_logs)

    sandbox_instance = MagicMock()
    sandbox_instance.run_code.return_value = execution
    sandbox_instance.kill.return_value = None

    sandbox_cls = MagicMock(return_value=sandbox_instance)
    return sandbox_cls, sandbox_instance
def _run_sync(coro):
    """Run *coro* to completion via ``asyncio.run`` and return its result."""
    outcome = asyncio.run(coro)
    return outcome
# ---------------------------------------------------------------------------
# subprocess backend
# ---------------------------------------------------------------------------
class TestSubprocessBackend:
    """Tests for ``_run_subprocess`` with nothing mocked."""

    def test_python_hello(self):
        """A trivial print exits 0 and its text shows up in stdout."""
        sandbox = _load_sandbox("subprocess")
        outcome = _run_sync(
            sandbox._run_subprocess('print("hello subprocess")', "python")
        )
        assert outcome["exit_code"] == 0
        assert "hello subprocess" in outcome["stdout"]
        assert outcome["backend"] == "subprocess"

    def test_stderr_nonzero_exit(self):
        """The exit status passed to sys.exit is reported as exit_code."""
        sandbox = _load_sandbox("subprocess")
        snippet = "import sys; sys.exit(2)"
        outcome = _run_sync(sandbox._run_subprocess(snippet, "python"))
        assert outcome["exit_code"] == 2

    def test_unsupported_language(self):
        """An unknown language yields exit_code -1 and an 'Unsupported' error."""
        sandbox = _load_sandbox("subprocess")
        outcome = _run_sync(sandbox._run_subprocess("code", "cobol"))
        assert outcome["exit_code"] == -1
        assert "Unsupported" in outcome["error"]

    def test_syntax_error_captured_in_stderr(self):
        """Code that does not parse produces a non-zero exit_code."""
        sandbox = _load_sandbox("subprocess")
        outcome = _run_sync(sandbox._run_subprocess("def broken(:", "python"))
        assert outcome["exit_code"] != 0

    def test_timeout(self):
        """Code that outlives SANDBOX_TIMEOUT is reported as a timeout."""
        sandbox = _load_sandbox("subprocess", {"SANDBOX_TIMEOUT": "1"})
        # Manually set the module-level constant that was captured at load time
        sandbox.SANDBOX_TIMEOUT = 1
        slow_snippet = "import time; time.sleep(10)"
        outcome = _run_sync(sandbox._run_subprocess(slow_snippet, "python"))
        assert outcome["exit_code"] == -1
        assert "Timeout" in outcome["error"]
# ---------------------------------------------------------------------------
# E2B backend
# ---------------------------------------------------------------------------
class TestE2BBackend:
    """
    All tests mock e2b_code_interpreter to avoid real network calls.
    E2B_API_KEY must be present in os.environ for the duration of _run_e2b
    (it's read at call-time, not module-load time).
    """

    def _call_e2b(self, code: str, language: str, sandbox_cls, api_key: str = "test-key"):
        """Run ``_run_e2b`` with a fake e2b module and *api_key* in the environment.

        Returns ``(result, sandbox_module, sandbox_cls)``.
        """
        sb = _load_sandbox("e2b")
        fake_module = MagicMock()
        fake_module.Sandbox = sandbox_cls
        env_patch = patch.dict(os.environ, {"E2B_API_KEY": api_key})
        modules_patch = patch.dict("sys.modules", {"e2b_code_interpreter": fake_module})
        with env_patch, modules_patch:
            outcome = _run_sync(sb._run_e2b(code, language))
        return outcome, sb, sandbox_cls

    def test_python_success(self):
        """Python code returns the mocked stdout and the sandbox is killed once."""
        sandbox_cls, sandbox_instance = _make_e2b_mock(stdout_text="42\n")
        outcome, _, _ = self._call_e2b("print(6 * 7)", "python", sandbox_cls)
        assert outcome["exit_code"] == 0
        assert outcome["backend"] == "e2b"
        assert outcome["language"] == "python"
        assert outcome["stdout"] == "42\n"
        sandbox_instance.kill.assert_called_once()

    def test_javascript_success(self):
        """JavaScript succeeds and is sent to the e2b kernel named 'js'."""
        sandbox_cls, sandbox_instance = _make_e2b_mock(stdout_text="hello js\n")
        outcome, _, _ = self._call_e2b('console.log("hi")', "javascript", sandbox_cls)
        assert outcome["exit_code"] == 0
        assert outcome["language"] == "javascript"
        # E2B kernel must be remapped: "javascript" → "js"
        invocation = sandbox_instance.run_code.call_args
        positional_kernel = invocation.args[1] if len(invocation.args) > 1 else None
        called_kernel = invocation.kwargs.get("language") or positional_kernel
        assert called_kernel == "js", f"Expected kernel 'js', got {called_kernel!r}"

    def test_stderr_produces_nonzero_exit(self):
        """Text in the execution's stderr logs turns into exit_code 1."""
        sandbox_cls, _ = _make_e2b_mock(
            stdout_text="",
            stderr_text="NameError: name 'x' is not defined",
        )
        outcome, _, _ = self._call_e2b("print(x)", "python", sandbox_cls)
        assert outcome["exit_code"] == 1
        assert "NameError" in outcome["stderr"]

    def test_missing_api_key_returns_error(self):
        """Without E2B_API_KEY the call fails with an error naming the variable."""
        sb = _load_sandbox("e2b")
        sandbox_cls, _ = _make_e2b_mock()
        fake_module = MagicMock()
        fake_module.Sandbox = sandbox_cls
        # Do NOT set E2B_API_KEY
        with patch.dict("sys.modules", {"e2b_code_interpreter": fake_module}):
            # patch.dict puts os.environ back on exit, so the pop is temporary.
            with patch.dict(os.environ, {}, clear=False):
                os.environ.pop("E2B_API_KEY", None)
                outcome = _run_sync(sb._run_e2b("print(1)", "python"))
        assert outcome["exit_code"] == -1
        assert "E2B_API_KEY" in outcome["error"]

    def test_missing_package_returns_error(self):
        """An unimportable e2b package is reported by its pip name."""
        sb = _load_sandbox("e2b")
        with patch.dict(os.environ, {"E2B_API_KEY": "key"}):
            # Simulate ImportError by putting None in sys.modules
            with patch.dict("sys.modules", {"e2b_code_interpreter": None}):
                outcome = _run_sync(sb._run_e2b("print(1)", "python"))
        assert outcome["exit_code"] == -1
        assert "e2b-code-interpreter" in outcome["error"]

    def test_unsupported_language_returns_error(self):
        """'shell' is rejected by the e2b backend."""
        sandbox_cls, _ = _make_e2b_mock()
        outcome, _, _ = self._call_e2b("echo hi", "shell", sandbox_cls)
        assert outcome["exit_code"] == -1
        assert "not supported by the e2b backend" in outcome["error"]

    def test_sandbox_always_killed_on_exception(self):
        """sandbox.kill() is called even when run_code raises."""
        sandbox_instance = MagicMock()
        sandbox_instance.run_code.side_effect = RuntimeError("network error")
        sandbox_instance.kill.return_value = None
        sandbox_cls = MagicMock(return_value=sandbox_instance)
        outcome, _, _ = self._call_e2b("print(1)", "python", sandbox_cls)
        assert outcome["exit_code"] == -1
        assert "network error" in outcome["error"]
        sandbox_instance.kill.assert_called_once()

    def test_output_truncated_at_max_output(self):
        """stdout longer than MAX_OUTPUT is cut down to at most that length."""
        oversized = "x" * 20_000
        sandbox_cls, _ = _make_e2b_mock(stdout_text=oversized)
        outcome, sb, _ = self._call_e2b("print('x' * 20000)", "python", sandbox_cls)
        assert "stdout" in outcome
        assert len(outcome["stdout"]) <= sb.MAX_OUTPUT

    def test_api_key_forwarded_to_constructor(self):
        """E2B_API_KEY from env is passed to Sandbox(api_key=...)."""
        sandbox_cls, _ = _make_e2b_mock()
        _, _, used_cls = self._call_e2b(
            "print(1)", "python", sandbox_cls, api_key="my-secret"
        )
        constructor_kwargs = used_cls.call_args.kwargs
        assert constructor_kwargs.get("api_key") == "my-secret"

    def test_timeout_forwarded_to_constructor(self):
        """SANDBOX_TIMEOUT is forwarded as the sandbox timeout kwarg."""
        sandbox_cls, _ = _make_e2b_mock()
        sb = _load_sandbox("e2b", {"SANDBOX_TIMEOUT": "45"})
        sb.SANDBOX_TIMEOUT = 45
        fake_module = MagicMock()
        fake_module.Sandbox = sandbox_cls
        env_patch = patch.dict(os.environ, {"E2B_API_KEY": "key"})
        modules_patch = patch.dict("sys.modules", {"e2b_code_interpreter": fake_module})
        with env_patch, modules_patch:
            _run_sync(sb._run_e2b("print(1)", "python"))
        constructor_kwargs = sandbox_cls.call_args.kwargs
        assert constructor_kwargs.get("timeout") == 45
# ---------------------------------------------------------------------------
# Dispatcher routing — verify SANDBOX_BACKEND selects the right function
# ---------------------------------------------------------------------------
class TestRunCodeDispatcher:
    """Check that the loaded SANDBOX_BACKEND decides which runner ``run_code`` calls."""

    def test_subprocess_backend_dispatched(self):
        """Loading with 'subprocess' records that backend and _run_subprocess works."""
        sandbox = _load_sandbox("subprocess")
        assert sandbox.SANDBOX_BACKEND == "subprocess"
        outcome = _run_sync(sandbox._run_subprocess("1 + 1", "python"))
        assert outcome["exit_code"] == 0

    def test_e2b_backend_dispatched(self):
        """run_code routes to _run_e2b when SANDBOX_BACKEND=e2b."""
        sandbox = _load_sandbox("e2b")
        assert sandbox.SANDBOX_BACKEND == "e2b"
        seen_calls = []

        async def fake_e2b(code, language):
            seen_calls.append((code, language))
            return {"exit_code": 0, "stdout": "ok", "backend": "e2b"}

        with patch.object(sandbox, "_run_e2b", fake_e2b):
            # conftest mocks @tool as identity, so run_code is the raw async fn
            outcome = _run_sync(sandbox.run_code("print(1)", "python"))
        assert seen_calls == [("print(1)", "python")]
        assert outcome["backend"] == "e2b"

    def test_docker_backend_dispatched(self):
        """run_code routes to _run_docker when SANDBOX_BACKEND=docker."""
        sandbox = _load_sandbox("docker")
        assert sandbox.SANDBOX_BACKEND == "docker"
        seen_calls = []

        async def fake_docker(code, language):
            seen_calls.append((code, language))
            return {"exit_code": 0, "stdout": "ok", "backend": "docker"}

        with patch.object(sandbox, "_run_docker", fake_docker):
            outcome = _run_sync(sandbox.run_code("echo hi", "shell"))
        assert seen_calls == [("echo hi", "shell")]
        assert outcome["backend"] == "docker"

    def test_subprocess_backend_routes_to_run_subprocess(self):
        """run_code with SANDBOX_BACKEND=subprocess calls _run_subprocess."""
        sandbox = _load_sandbox("subprocess")
        seen_calls = []

        async def fake_subprocess(code, language):
            seen_calls.append((code, language))
            return {"exit_code": 0, "stdout": "ok", "backend": "subprocess"}

        with patch.object(sandbox, "_run_subprocess", fake_subprocess):
            outcome = _run_sync(sandbox.run_code("print(1)", "python"))
        assert seen_calls == [("print(1)", "python")]
        assert outcome["backend"] == "subprocess"
# ---------------------------------------------------------------------------
# Additional subprocess backend edge-cases
# ---------------------------------------------------------------------------
class TestSubprocessEdgeCases:
    """Failure paths of ``_run_subprocess`` driven through a stubbed process factory."""

    def test_process_lookup_error_on_kill(self):
        """ProcessLookupError during proc.kill() after timeout is silently ignored."""
        sandbox = _load_sandbox("subprocess")
        sandbox.SANDBOX_TIMEOUT = 1

        # The fake process times out in communicate() and then refuses to be
        # killed, which is the combination this test is about.
        async def timing_out_communicate():
            raise asyncio.TimeoutError()

        def vanished_kill():
            raise ProcessLookupError("no such process")

        async def noop_wait():
            pass

        async def fake_create(*args, **kwargs):
            fake_proc = MagicMock()
            fake_proc.returncode = None
            fake_proc.communicate = timing_out_communicate
            fake_proc.kill = vanished_kill
            fake_proc.wait = noop_wait
            return fake_proc

        with patch("asyncio.create_subprocess_exec", fake_create):
            outcome = _run_sync(
                sandbox._run_subprocess("import time; time.sleep(100)", "python")
            )
        assert outcome["exit_code"] == -1
        assert "Timeout" in outcome["error"]

    def test_general_exception_in_subprocess_exec(self):
        """Exception from asyncio.create_subprocess_exec is caught and returned."""
        sandbox = _load_sandbox("subprocess")

        async def fake_create(*args, **kwargs):
            raise OSError("no such executable")

        with patch("asyncio.create_subprocess_exec", fake_create):
            outcome = _run_sync(sandbox._run_subprocess("print(1)", "python"))
        assert outcome["exit_code"] == -1
        assert "no such executable" in outcome["error"]
# ---------------------------------------------------------------------------
# Docker backend
# ---------------------------------------------------------------------------
class TestDockerBackend:
    """Tests for ``_run_docker`` with ``asyncio.create_subprocess_exec`` stubbed out."""

    def _make_docker_proc(self, stdout: bytes = b"", stderr: bytes = b"", returncode: int = 0):
        """Return a fake asyncio subprocess-like object."""
        proc = MagicMock()
        proc.returncode = returncode

        async def _communicate():
            return (stdout, stderr)

        proc.communicate = _communicate
        return proc

    @staticmethod
    def _tracking_mkstemp():
        """Build a ``tempfile.mkstemp`` stand-in that records each path it creates.

        Must be called before ``tempfile.mkstemp`` is patched, because it
        captures the real function at call time.

        Returns:
            ``(fake_mkstemp, created_files)`` where ``created_files`` is the
            live list the fake appends to.
        """
        import tempfile

        created_files = []
        original_mkstemp = tempfile.mkstemp

        def fake_mkstemp(suffix="", prefix="", dir=None, text=False):
            # dir and text are accepted but not forwarded; only suffix and
            # prefix reach the real mkstemp.
            fd, path = original_mkstemp(suffix=suffix, prefix=prefix)
            created_files.append(path)
            return fd, path

        return fake_mkstemp, created_files

    def test_run_docker_unsupported_language(self):
        """An unknown language yields exit_code -1 and an 'Unsupported' error."""
        sb = _load_sandbox("docker")
        result = _run_sync(sb._run_docker("code", "cobol"))
        assert result["exit_code"] == -1
        assert "Unsupported" in result["error"]

    def test_run_docker_success(self):
        """_run_docker returns exit_code=0 and correct stdout on success."""
        sb = _load_sandbox("docker")
        fake_proc = self._make_docker_proc(stdout=b"hello docker\n", stderr=b"")

        async def fake_wait_for(coro, timeout):
            return await coro

        async def fake_create(*args, **kwargs):
            return fake_proc

        with patch("asyncio.create_subprocess_exec", fake_create), \
                patch("asyncio.wait_for", fake_wait_for):
            result = _run_sync(sb._run_docker('print("hello docker")', "python"))
        assert result["exit_code"] == 0
        assert "hello docker" in result["stdout"]
        assert result["backend"] == "docker"
        assert result["language"] == "python"

    def test_run_docker_timeout(self):
        """asyncio.wait_for TimeoutError → returns timeout error dict."""
        sb = _load_sandbox("docker")
        sb.SANDBOX_TIMEOUT = 1

        async def fake_create(*args, **kwargs):
            proc = MagicMock()
            return proc

        async def fake_wait_for(coro, timeout):
            raise asyncio.TimeoutError()

        with patch("asyncio.create_subprocess_exec", fake_create), \
                patch("asyncio.wait_for", fake_wait_for):
            result = _run_sync(sb._run_docker("code", "python"))
        assert result["exit_code"] == -1
        assert "Timeout" in result["error"]

    def test_run_docker_general_exception(self):
        """Generic exception in create_subprocess_exec → returns error dict."""
        sb = _load_sandbox("docker")

        async def fake_create(*args, **kwargs):
            raise RuntimeError("docker not available")

        with patch("asyncio.create_subprocess_exec", fake_create):
            result = _run_sync(sb._run_docker("code", "python"))
        assert result["exit_code"] == -1
        assert "docker not available" in result["error"]

    def test_run_docker_cleanup_on_success(self, tmp_path, monkeypatch):
        """Temp file is removed after successful run."""
        sb = _load_sandbox("docker")
        fake_mkstemp, created_files = self._tracking_mkstemp()
        fake_proc = self._make_docker_proc(stdout=b"done\n", stderr=b"")

        async def fake_wait_for(coro, timeout):
            return await coro

        async def fake_create(*args, **kwargs):
            return fake_proc

        with patch("tempfile.mkstemp", fake_mkstemp), \
                patch("asyncio.create_subprocess_exec", fake_create), \
                patch("asyncio.wait_for", fake_wait_for):
            result = _run_sync(sb._run_docker("print('done')", "python"))
        assert result["exit_code"] == 0
        for f in created_files:
            assert not os.path.exists(f), f"temp file {f} was not cleaned up"

    def test_run_docker_cleanup_on_exception(self, tmp_path, monkeypatch):
        """Temp file is removed even when an exception is raised."""
        sb = _load_sandbox("docker")
        fake_mkstemp, created_files = self._tracking_mkstemp()

        async def fake_create(*args, **kwargs):
            raise RuntimeError("crash")

        with patch("tempfile.mkstemp", fake_mkstemp), \
                patch("asyncio.create_subprocess_exec", fake_create):
            result = _run_sync(sb._run_docker("print(1)", "python"))
        assert result["exit_code"] == -1
        for f in created_files:
            assert not os.path.exists(f), f"temp file {f} was not cleaned up after exception"

    def test_run_docker_cleanup_oserror_swallowed(self, tmp_path):
        """os.unlink raising OSError during cleanup is swallowed; the result is still returned."""
        sb = _load_sandbox("docker")
        fake_proc = self._make_docker_proc(stdout=b"ok\n", stderr=b"")
        fake_mkstemp, created_files = self._tracking_mkstemp()

        async def fake_wait_for(coro, timeout):
            return await coro

        async def fake_create(*args, **kwargs):
            return fake_proc

        # Keep the real unlink: the patched one below never deletes anything.
        original_unlink = os.unlink
        unlink_calls = []

        def raising_unlink(path):
            unlink_calls.append(path)
            raise OSError("permission denied")

        try:
            with patch("tempfile.mkstemp", fake_mkstemp), \
                    patch("asyncio.create_subprocess_exec", fake_create), \
                    patch("asyncio.wait_for", fake_wait_for), \
                    patch("os.unlink", raising_unlink):
                result = _run_sync(sb._run_docker("print('ok')", "python"))
            # OSError is swallowed; result is still returned
            assert result["exit_code"] == 0
            assert len(unlink_calls) > 0
        finally:
            # Because unlink was forced to fail, the temp file is still on
            # disk; remove it here so the test does not litter the temp dir.
            for leftover in created_files:
                try:
                    original_unlink(leftover)
                except OSError:
                    # Best-effort: already gone or not removable.
                    pass
# ---------------------------------------------------------------------------
# Gap 4: E2B backend — additional coverage paths
# ---------------------------------------------------------------------------
class TestE2BBackendGapCoverage:
    """Cover lines 242, 248, 268-269, 280-281 in _run_e2b."""

    @staticmethod
    def _fake_e2b_module(text, error, stdout_lines, kill_effect=None):
        """Assemble a fake ``e2b_code_interpreter`` module.

        Its ``Sandbox()`` returns an instance whose ``run_code`` yields one
        result with the given *text* / *error*, logs whose ``stdout`` is
        *stdout_lines* and whose ``stderr`` is empty. ``kill`` returns
        ``None`` unless *kill_effect* is given, in which case it is used as
        the mock's ``side_effect``.
        """
        cell = MagicMock()
        cell.text = text
        cell.error = error

        logs = MagicMock()
        logs.stdout = stdout_lines
        logs.stderr = []

        execution = MagicMock()
        execution.results = [cell]
        execution.logs = logs

        instance = MagicMock()
        instance.run_code.return_value = execution
        if kill_effect is None:
            instance.kill.return_value = None
        else:
            instance.kill.side_effect = kill_effect

        module = MagicMock()
        module.Sandbox = MagicMock(return_value=instance)
        return module

    def _call_e2b(self, code, language, mock_e2b_mod, api_key="test-key"):
        """Run ``_run_e2b`` against *mock_e2b_mod*; return ``(result, sandbox_module)``."""
        sb = _load_sandbox("e2b")
        env_patch = patch.dict(os.environ, {"E2B_API_KEY": api_key})
        modules_patch = patch.dict("sys.modules", {"e2b_code_interpreter": mock_e2b_mod})
        with env_patch, modules_patch:
            outcome = _run_sync(sb._run_e2b(code, language))
        return outcome, sb

    def test_result_error_attribute_captured(self):
        """Line 242: result.error in execution.results → captured in stderr."""
        fake_module = self._fake_e2b_module(
            text=None, error="NameError: x not defined", stdout_lines=[]
        )
        outcome, _ = self._call_e2b("print(x)", "python", fake_module)
        assert outcome["exit_code"] == 1
        assert "NameError" in outcome["stderr"]

    def test_logs_stdout_captured(self):
        """Line 248: execution.logs.stdout → appended to stdout_parts."""
        fake_module = self._fake_e2b_module(
            text=None, error=None, stdout_lines=["hello from logs\n"]
        )
        outcome, _ = self._call_e2b("print('hello from logs')", "python", fake_module)
        assert outcome["exit_code"] == 0
        assert "hello from logs" in outcome["stdout"]

    def test_e2b_timeout_returns_error(self):
        """Lines 268-269: asyncio.TimeoutError raised → returns timeout error dict."""
        fake_module = MagicMock()
        fake_module.Sandbox = MagicMock(return_value=MagicMock())
        sb = _load_sandbox("e2b")

        real_wait_for = asyncio.wait_for
        calls_seen = {"n": 0}

        async def raising_wait_for(coro, timeout):
            # Only the first wait_for call times out; later ones are real.
            calls_seen["n"] += 1
            if calls_seen["n"] == 1:
                raise asyncio.TimeoutError()
            return await real_wait_for(coro, timeout)

        with patch.dict(os.environ, {"E2B_API_KEY": "test-key"}):
            with patch.dict("sys.modules", {"e2b_code_interpreter": fake_module}):
                with patch("asyncio.wait_for", raising_wait_for):
                    outcome = _run_sync(sb._run_e2b("print(1)", "python"))
        assert outcome["exit_code"] == -1
        assert "Timeout" in outcome["error"]

    def test_e2b_cleanup_exception_swallowed(self):
        """Lines 280-281: sandbox.kill raises in finally → exception swallowed."""
        fake_module = self._fake_e2b_module(
            text="42\n",
            error=None,
            stdout_lines=[],
            # Make kill raise an exception
            kill_effect=RuntimeError("kill failed"),
        )
        outcome, _ = self._call_e2b("print(42)", "python", fake_module)
        # Result is still returned despite kill() failing
        assert outcome["exit_code"] == 0
        assert "42" in outcome["stdout"]