Merge pull request #2473 from Molecule-AI/feat/universal-turn-smoke-runtime-wedge
feat(smoke): consult runtime_wedge after execute() to catch SDK init wedges
This commit is contained in:
commit
b39dc62de6
@ -15,6 +15,15 @@ times out — that's a *pass*. If a lazy import is broken, the call
|
||||
raises `ImportError` / `ModuleNotFoundError` from inside the executor
|
||||
body — that's a *fail*.
|
||||
|
||||
Universal wedge gate (task #131): timeout-as-pass alone misses init
|
||||
wedges where the SDK process spins for 60s+ on a malformed argv
|
||||
(claude-agent-sdk PR #25 class). After every provisional-PASS path, the smoke
|
||||
consults `runtime_wedge.is_wedged()` — adapters opt-in by calling
|
||||
`runtime_wedge.mark_wedged(reason)` from their executor's wedge catch
|
||||
arm, and the smoke upgrades the provisional PASS to FAIL when the
|
||||
flag is set. Non-opt-in adapters keep working as before — the check
|
||||
is additive.
|
||||
|
||||
Activated by setting `MOLECULE_SMOKE_MODE=1` in the env. Wired into
|
||||
`main.py` after `executor = await adapter.create_executor(...)` so the
|
||||
full adapter setup path runs first; the smoke just adds one more
|
||||
@ -23,7 +32,10 @@ exercise step before exit.
|
||||
CI usage (intended for `molecule-ci/.github/workflows/publish-template-image.yml`):
|
||||
docker run --rm \
|
||||
-e WORKSPACE_ID=fake -e MOLECULE_SMOKE_MODE=1 \
|
||||
-e MOLECULE_SMOKE_TIMEOUT_SECS=90 \
|
||||
"$IMAGE" molecule-runtime
|
||||
The 90s timeout is calibrated to claude-agent-sdk's 60s
|
||||
`initialize()` handshake — adapters with shorter init can lower it.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
@ -81,20 +93,52 @@ def _build_stub_context() -> tuple[Any, Any]:
|
||||
return context, queue
|
||||
|
||||
|
||||
def _check_runtime_wedge() -> str | None:
    """Return the wedge reason if an adapter marked the runtime wedged, else None.

    Universal turn-smoke (task #131): adapters that hit an unrecoverable
    init wedge (e.g. claude-agent-sdk's `Control request timeout:
    initialize` after a malformed CLI argv) call
    `runtime_wedge.mark_wedged(reason)`. The smoke gate consults this
    flag after execute() so that a provisional PASS (timeout, clean
    return, non-import exception) is turned into FAIL when the flag is
    set, instead of silently shipping a broken image to GHCR.

    Fail-open: the runtime may be installed without a usable
    runtime_wedge (e.g. a corrupt-rolling-deploy state). Any exception
    raised while importing the module OR while reading its state is
    treated as "no wedge info" and reads as "assume healthy", so a
    broken wedge module cannot crash the smoke gate. The swallowed
    error is reported on stderr so the fallback is visible in CI logs.

    Returns:
        The reason string reported by `wedge_reason()` when the runtime
        is wedged ("<unspecified>" if that reason is empty), or None
        when the runtime is healthy or its wedge state cannot be read.
    """
    try:
        # Lazy import: keeps this module importable when runtime_wedge
        # is absent, and lets the failure be handled right here.
        from runtime_wedge import is_wedged, wedge_reason

        if not is_wedged():
            return None
        # An empty/None reason still has to read as "wedged" to callers
        # that test `is not None`, hence the placeholder.
        return wedge_reason() or "<unspecified>"
    except Exception as err:  # noqa: BLE001 - deliberate fail-open
        print(
            f"[smoke-mode] runtime_wedge unavailable "
            f"({type(err).__name__}: {err}); assuming runtime healthy",
            file=sys.stderr,
        )
        return None
|
||||
|
||||
|
||||
async def run_executor_smoke(executor: Any) -> int:
|
||||
"""Invoke executor.execute() once with stub deps. Return an exit code.
|
||||
|
||||
Returns:
|
||||
0 — import tree healthy. Either execution timed out (the
|
||||
expected outcome — we hit a network boundary like an LLM
|
||||
call) or completed cleanly. Either way, no broken imports.
|
||||
1 — broken lazy import detected. Re-raised as a clear log line
|
||||
so the publish gate's stderr captures the offending symbol.
|
||||
0 — import tree healthy AND no adapter marked the runtime wedged.
|
||||
Either execution timed out (the expected outcome — we hit a
|
||||
network boundary like an LLM call) or completed cleanly.
|
||||
1 — broken lazy import detected, OR an adapter marked the
|
||||
runtime wedged via runtime_wedge.mark_wedged(). Re-raised
|
||||
as a clear log line so the publish gate's stderr captures
|
||||
the offending symbol or wedge reason.
|
||||
|
||||
The 5-second timeout comes from `MOLECULE_SMOKE_TIMEOUT_SECS` env
|
||||
(default 5.0). Bump it via env if a slow adapter setup overlaps the
|
||||
first execute call. Don't make it too long — the publish workflow
|
||||
multiplies this across N templates.
|
||||
(default 5.0). Bump it via env when the failure mode under test is
|
||||
an init handshake that takes longer than 5s to give up — e.g.
|
||||
claude-agent-sdk's 60s `initialize()` timeout needs ~90s here so
|
||||
the SDK marks itself wedged before our outer wait_for fires.
|
||||
The publish workflow sets this value per-template via env.
|
||||
"""
|
||||
print(
|
||||
f"[smoke-mode] invoking executor.execute(stub_ctx, stub_queue) "
|
||||
@ -114,6 +158,11 @@ async def run_executor_smoke(executor: Any) -> int:
|
||||
)
|
||||
return 1
|
||||
|
||||
# Outcome of executor.execute() — narrowed to exit code by the
|
||||
# post-run wedge check below. Pre-wedge-check exit code: 0 for
|
||||
# PASS-shaped paths (timeout, clean return, non-import exception),
|
||||
# while FAIL-shaped paths (import error) return 1 early. Wedge check upgrades
|
||||
# PASS → FAIL when the runtime self-reports wedged.
|
||||
try:
|
||||
await asyncio.wait_for(
|
||||
executor.execute(context, queue),
|
||||
@ -121,9 +170,11 @@ async def run_executor_smoke(executor: Any) -> int:
|
||||
)
|
||||
except (asyncio.TimeoutError, asyncio.CancelledError):
|
||||
# Timeout = imports healthy, execution was proceeding and hit
|
||||
# a network boundary or long await. Pass.
|
||||
print("[smoke-mode] PASS: timed out past import-tree (imports healthy)")
|
||||
return 0
|
||||
# a network boundary or long await. Provisionally PASS — but
|
||||
# also check runtime_wedge below: an adapter whose init wedge
|
||||
# fires inside the timeout window still needs to FAIL the gate.
|
||||
pre_wedge_code = 0
|
||||
pre_wedge_msg = "timed out past import-tree (imports healthy)"
|
||||
except (ImportError, ModuleNotFoundError) as imp_err:
|
||||
# The exact regression class issue #2275 exists to catch.
|
||||
print(
|
||||
@ -134,13 +185,33 @@ async def run_executor_smoke(executor: Any) -> int:
|
||||
return 1
|
||||
except Exception as other_err: # noqa: BLE001
|
||||
# Anything else (auth errors, validation errors, runtime bugs)
|
||||
# is downstream of the import gate. Pass — these are caught by
|
||||
# the relevant adapter-level tests, not by this smoke.
|
||||
print(
|
||||
f"[smoke-mode] PASS: execute() raised "
|
||||
f"{type(other_err).__name__} past import-tree (not an import error)"
|
||||
# is downstream of the import gate. Provisionally PASS — these
|
||||
# are caught by adapter-level tests, NOT by this gate, EXCEPT
|
||||
# when the adapter also called runtime_wedge.mark_wedged() on
|
||||
# the way out (the PR-25-class wedge — SDK init failure inside
|
||||
# execute()). The post-run wedge check below catches that.
|
||||
pre_wedge_code = 0
|
||||
pre_wedge_msg = (
|
||||
f"execute() raised {type(other_err).__name__} "
|
||||
"past import-tree (not an import error)"
|
||||
)
|
||||
return 0
|
||||
else:
|
||||
print("[smoke-mode] PASS: execute() completed within timeout (imports + body OK)")
|
||||
return 0
|
||||
pre_wedge_code = 0
|
||||
pre_wedge_msg = "execute() completed within timeout (imports + body OK)"
|
||||
|
||||
wedge_reason_str = _check_runtime_wedge()
|
||||
if wedge_reason_str is not None:
|
||||
# Adapter self-reported wedge — overrides any provisional PASS.
|
||||
# This is the path that catches the PR-25-class regression
|
||||
# (claude_agent_sdk init wedge from a malformed CLI argv) that
|
||||
# otherwise looks like a benign network-call timeout to the
|
||||
# outer wait_for.
|
||||
print(
|
||||
f"[smoke-mode] FAIL: runtime self-reported wedged after execute(): "
|
||||
f"{wedge_reason_str}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return 1
|
||||
|
||||
print(f"[smoke-mode] PASS: {pre_wedge_msg}")
|
||||
return pre_wedge_code
|
||||
|
||||
@ -209,3 +209,140 @@ async def test_smoke_fails_when_stub_context_build_breaks(monkeypatch: pytest.Mo
|
||||
monkeypatch.setattr(smoke_mode, "_build_stub_context", _fail_build)
|
||||
code = await smoke_mode.run_executor_smoke(_CleanExecutor())
|
||||
assert code == 1
|
||||
|
||||
|
||||
# ─── runtime_wedge integration (universal turn-smoke, task #131) ───────
|
||||
#
|
||||
# These tests pin the post-execute wedge-check that upgrades a
|
||||
# provisional PASS to FAIL when an adapter has marked the runtime
|
||||
# wedged via `runtime_wedge.mark_wedged()`. Without this gate, the
|
||||
# PR-25-class regression (claude_agent_sdk init wedge from a malformed
|
||||
# CLI argv) shipped to GHCR because the smoke saw the outer wait_for
|
||||
# timeout as "imports healthy, hit a network boundary."
|
||||
|
||||
|
||||
class _MarkWedgedThenRaiseExecutor:
|
||||
"""Mimics the claude_sdk_executor wedge path: catches the SDK's
|
||||
`Control request timeout: initialize`, calls
|
||||
`runtime_wedge.mark_wedged()` from the catch arm, then re-raises
|
||||
a sanitized error. The smoke must surface this as FAIL even
|
||||
though the outer exception class (`RuntimeError` here) would
|
||||
otherwise be a PASS-on-non-import-error.
|
||||
"""
|
||||
|
||||
def __init__(self, reason: str):
|
||||
self._reason = reason
|
||||
|
||||
async def execute(self, context, event_queue) -> None: # noqa: ARG002
|
||||
import runtime_wedge
|
||||
runtime_wedge.mark_wedged(self._reason)
|
||||
raise RuntimeError("sanitized adapter error after wedge")
|
||||
|
||||
|
||||
class _MarkWedgedThenBlockExecutor:
|
||||
"""Mimics a wedge that fires inside a still-running execute() —
|
||||
the adapter marks wedged, then continues to await something
|
||||
network-shaped that the outer wait_for cuts short. The pre-fix
|
||||
smoke returned 0 here ('timed out past import-tree') even though
|
||||
the runtime had already self-reported wedged.
|
||||
"""
|
||||
|
||||
def __init__(self, reason: str):
|
||||
self._reason = reason
|
||||
|
||||
async def execute(self, context, event_queue) -> None: # noqa: ARG002
|
||||
import runtime_wedge
|
||||
runtime_wedge.mark_wedged(self._reason)
|
||||
await asyncio.Event().wait()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def reset_runtime_wedge():
|
||||
"""Ensure each wedge-test starts and ends with the runtime healthy.
|
||||
|
||||
The wedge is module-scoped state (`_DEFAULT` in runtime_wedge.py),
|
||||
so a leak from one test would contaminate every subsequent smoke
|
||||
test in the same pytest process. Reset on both sides so an early
|
||||
failure doesn't poison the rest of the file either.
|
||||
"""
|
||||
import runtime_wedge
|
||||
runtime_wedge.reset_for_test()
|
||||
yield
|
||||
runtime_wedge.reset_for_test()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_smoke_fails_when_adapter_marked_wedged_via_exception(
|
||||
stub_build, reset_runtime_wedge,
|
||||
):
|
||||
"""PR-25 regression class: adapter catches SDK init wedge, marks
|
||||
runtime_wedge, raises a sanitized error. Outer exception class
|
||||
(`RuntimeError`) is non-import → would have been PASS pre-fix.
|
||||
Post-fix: post-run wedge check overrides PASS → FAIL."""
|
||||
code = await smoke_mode.run_executor_smoke(
|
||||
_MarkWedgedThenRaiseExecutor("claude SDK init timeout — restart workspace"),
|
||||
)
|
||||
assert code == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_smoke_fails_when_adapter_marked_wedged_then_blocks(
|
||||
stub_build, reset_runtime_wedge, monkeypatch: pytest.MonkeyPatch,
|
||||
):
|
||||
"""Same wedge class as above but the adapter doesn't raise — it
|
||||
keeps awaiting (e.g. waiting on a control-message reply that will
|
||||
never come). Outer wait_for cuts short → would have been PASS-on-
|
||||
timeout pre-fix. Post-fix: wedge check upgrades to FAIL.
|
||||
"""
|
||||
monkeypatch.setattr(smoke_mode, "_SMOKE_TIMEOUT_SECS", 0.1)
|
||||
code = await smoke_mode.run_executor_smoke(
|
||||
_MarkWedgedThenBlockExecutor("hermes init handshake timed out"),
|
||||
)
|
||||
assert code == 1
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_smoke_passes_when_runtime_wedge_is_clean_after_clean_execute(
|
||||
stub_build, reset_runtime_wedge,
|
||||
):
|
||||
"""Belt-and-braces: wedge-clean + clean execute() must still PASS.
|
||||
Pins that the new check is additive — it doesn't accidentally
|
||||
fail healthy executions (e.g. by treating "no runtime_wedge import"
|
||||
as a wedge)."""
|
||||
code = await smoke_mode.run_executor_smoke(_CleanExecutor())
|
||||
assert code == 0
|
||||
|
||||
|
||||
def test_check_runtime_wedge_returns_none_when_module_missing(
|
||||
monkeypatch: pytest.MonkeyPatch,
|
||||
):
|
||||
"""Direct test for the import-resilience contract — the helper
|
||||
must swallow ImportError (and any other exception while reading
|
||||
the module) so a corrupt install doesn't crash the smoke gate."""
|
||||
import builtins
|
||||
real_import = builtins.__import__
|
||||
|
||||
def _raising_import(name, *args, **kwargs):
|
||||
if name == "runtime_wedge":
|
||||
raise ImportError("simulated: runtime_wedge unavailable")
|
||||
return real_import(name, *args, **kwargs)
|
||||
|
||||
monkeypatch.setattr(builtins, "__import__", _raising_import)
|
||||
assert smoke_mode._check_runtime_wedge() is None
|
||||
|
||||
|
||||
def test_check_runtime_wedge_returns_reason_when_marked(reset_runtime_wedge):
|
||||
"""When an adapter has called runtime_wedge.mark_wedged(reason),
|
||||
the helper returns that reason verbatim so the smoke can surface
|
||||
it in the FAIL log line."""
|
||||
import runtime_wedge
|
||||
runtime_wedge.mark_wedged("explicit test reason")
|
||||
assert smoke_mode._check_runtime_wedge() == "explicit test reason"
|
||||
|
||||
|
||||
def test_check_runtime_wedge_returns_none_when_clean(reset_runtime_wedge):
|
||||
"""Pre-condition for the additive contract: helper must return
|
||||
None (not the empty string from `wedge_reason()`) when no adapter
|
||||
has marked the runtime wedged, so the caller's `is not None`
|
||||
check works."""
|
||||
assert smoke_mode._check_runtime_wedge() is None
|
||||
|
||||
Loading…
Reference in New Issue
Block a user