Merge pull request #2473 from Molecule-AI/feat/universal-turn-smoke-runtime-wedge

feat(smoke): consult runtime_wedge after execute() to catch SDK init wedges
This commit is contained in:
Hongming Wang 2026-05-02 00:52:31 +00:00 committed by GitHub
commit b39dc62de6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 227 additions and 19 deletions

View File

@ -15,6 +15,15 @@ times out — that's a *pass*. If a lazy import is broken, the call
raises `ImportError` / `ModuleNotFoundError` from inside the executor
body — that's a *fail*.
Universal wedge gate (task #131): timeout-as-pass alone misses init
wedges where the SDK process spins for 60s+ on a malformed argv
(claude-agent-sdk PR #25 class). After every result path, the smoke
consults `runtime_wedge.is_wedged()` — adapters opt in by calling
`runtime_wedge.mark_wedged(reason)` from their executor's wedge catch
arm, and the smoke upgrades the provisional PASS to FAIL when the
flag is set. Non-opt-in adapters keep working as before — the check
is additive.
Activated by setting `MOLECULE_SMOKE_MODE=1` in the env. Wired into
`main.py` after `executor = await adapter.create_executor(...)` so the
full adapter setup path runs first; the smoke just adds one more
@ -23,7 +32,10 @@ exercise step before exit.
CI usage (intended for `molecule-ci/.github/workflows/publish-template-image.yml`):
docker run --rm \
-e WORKSPACE_ID=fake -e MOLECULE_SMOKE_MODE=1 \
-e MOLECULE_SMOKE_TIMEOUT_SECS=90 \
"$IMAGE" molecule-runtime
The 90s timeout is calibrated to claude-agent-sdk's 60s
`initialize()` handshake — adapters with shorter init can lower it.
"""
from __future__ import annotations
@ -81,20 +93,52 @@ def _build_stub_context() -> tuple[Any, Any]:
return context, queue
def _check_runtime_wedge() -> str | None:
    """Report whether an adapter flagged the runtime as wedged.

    Universal turn-smoke (task #131): an adapter that hits an
    unrecoverable init wedge (e.g. claude-agent-sdk's `Control request
    timeout: initialize` after a malformed CLI argv) calls
    `runtime_wedge.mark_wedged(reason)`. The smoke gate reads that flag
    at the end of every result path so a provisional PASS — from the
    timeout branch or from a non-import exception — is turned into a
    FAIL instead of shipping a broken image.

    Returns:
        The recorded wedge reason when the flag is set, the literal
        ``"<unspecified>"`` when the flag is set but the reason is
        falsy, or ``None`` when the runtime is not flagged or the
        `runtime_wedge` module cannot be imported.
    """
    try:
        # Imported lazily on purpose: the runtime may be installed
        # without runtime_wedge (corrupt rolling deploy). In that case
        # "no wedge info" is read as "assume healthy" — the same
        # fail-open posture heartbeat.py takes.
        from runtime_wedge import is_wedged as _flag_is_set, wedge_reason as _recorded_reason
    except Exception:
        return None

    if not _flag_is_set():
        return None
    return _recorded_reason() or "<unspecified>"
async def run_executor_smoke(executor: Any) -> int:
"""Invoke executor.execute() once with stub deps. Return an exit code.
Returns:
0 import tree healthy. Either execution timed out (the
expected outcome we hit a network boundary like an LLM
call) or completed cleanly. Either way, no broken imports.
1 broken lazy import detected. Re-raised as a clear log line
so the publish gate's stderr captures the offending symbol.
0 import tree healthy AND no adapter marked the runtime wedged.
Either execution timed out (the expected outcome — we hit a
network boundary like an LLM call) or completed cleanly.
1 broken lazy import detected, OR an adapter marked the
runtime wedged via runtime_wedge.mark_wedged(). Re-raised
as a clear log line so the publish gate's stderr captures
the offending symbol or wedge reason.
The 5-second timeout comes from `MOLECULE_SMOKE_TIMEOUT_SECS` env
(default 5.0). Bump it via env if a slow adapter setup overlaps the
first execute call. Don't make it too long — the publish workflow
multiplies this across N templates.
(default 5.0). Bump it via env when the failure mode under test is
an init handshake that takes longer than 5s to give up — e.g.
claude-agent-sdk's 60s `initialize()` timeout needs ~90s here so
the SDK marks itself wedged before our outer wait_for fires.
The publish workflow sets this value per-template via env.
"""
print(
f"[smoke-mode] invoking executor.execute(stub_ctx, stub_queue) "
@ -114,6 +158,11 @@ async def run_executor_smoke(executor: Any) -> int:
)
return 1
# Outcome of executor.execute() — narrowed to exit code by the
# post-run wedge check below. Pre-wedge-check exit code: 0 for
# PASS-shaped paths (timeout, clean return, non-import exception),
# 1 for FAIL-shaped paths (import error). Wedge check upgrades
# PASS → FAIL when the runtime self-reports wedged.
try:
await asyncio.wait_for(
executor.execute(context, queue),
@ -121,9 +170,11 @@ async def run_executor_smoke(executor: Any) -> int:
)
except (asyncio.TimeoutError, asyncio.CancelledError):
# Timeout = imports healthy, execution was proceeding and hit
# a network boundary or long await. Pass.
print("[smoke-mode] PASS: timed out past import-tree (imports healthy)")
return 0
# a network boundary or long await. Provisionally PASS — but
# also check runtime_wedge below: an adapter whose init wedge
# fires inside the timeout window still needs to FAIL the gate.
pre_wedge_code = 0
pre_wedge_msg = "timed out past import-tree (imports healthy)"
except (ImportError, ModuleNotFoundError) as imp_err:
# The exact regression class issue #2275 exists to catch.
print(
@ -134,13 +185,33 @@ async def run_executor_smoke(executor: Any) -> int:
return 1
except Exception as other_err: # noqa: BLE001
# Anything else (auth errors, validation errors, runtime bugs)
# is downstream of the import gate. Pass — these are caught by
# the relevant adapter-level tests, not by this smoke.
print(
f"[smoke-mode] PASS: execute() raised "
f"{type(other_err).__name__} past import-tree (not an import error)"
# is downstream of the import gate. Provisionally PASS — these
# are caught by adapter-level tests, NOT by this gate, EXCEPT
# when the adapter also called runtime_wedge.mark_wedged() on
# the way out (the PR-25-class wedge — SDK init failure inside
# execute()). The post-run wedge check below catches that.
pre_wedge_code = 0
pre_wedge_msg = (
f"execute() raised {type(other_err).__name__} "
"past import-tree (not an import error)"
)
return 0
else:
print("[smoke-mode] PASS: execute() completed within timeout (imports + body OK)")
return 0
pre_wedge_code = 0
pre_wedge_msg = "execute() completed within timeout (imports + body OK)"
wedge_reason_str = _check_runtime_wedge()
if wedge_reason_str is not None:
# Adapter self-reported wedge — overrides any provisional PASS.
# This is the path that catches the PR-25-class regression
# (claude_agent_sdk init wedge from a malformed CLI argv) that
# otherwise looks like a benign network-call timeout to the
# outer wait_for.
print(
f"[smoke-mode] FAIL: runtime self-reported wedged after execute(): "
f"{wedge_reason_str}",
file=sys.stderr,
)
return 1
print(f"[smoke-mode] PASS: {pre_wedge_msg}")
return pre_wedge_code

View File

@ -209,3 +209,140 @@ async def test_smoke_fails_when_stub_context_build_breaks(monkeypatch: pytest.Mo
monkeypatch.setattr(smoke_mode, "_build_stub_context", _fail_build)
code = await smoke_mode.run_executor_smoke(_CleanExecutor())
assert code == 1
# ─── runtime_wedge integration (universal turn-smoke, task #131) ───────
#
# These tests pin the post-execute wedge-check that upgrades a
# provisional PASS to FAIL when an adapter has marked the runtime
# wedged via `runtime_wedge.mark_wedged()`. Without this gate, the
# PR-25-class regression (claude_agent_sdk init wedge from a malformed
# CLI argv) shipped to GHCR because the smoke saw the outer wait_for
# timeout as "imports healthy, hit a network boundary."
class _MarkWedgedThenRaiseExecutor:
    """Stub executor that flags a wedge, then fails with a generic error.

    Models the claude_sdk_executor wedge path: the adapter catches the
    SDK's `Control request timeout: initialize`, calls
    `runtime_wedge.mark_wedged()` from the catch arm, and re-raises a
    sanitized error. The outer exception class (`RuntimeError` here) is
    not an import error, so on its own it would be a PASS — the smoke
    must still report FAIL because of the wedge flag.
    """

    def __init__(self, reason: str) -> None:
        # Reason string handed to runtime_wedge.mark_wedged() in execute().
        self._reason = reason

    async def execute(self, context, event_queue) -> None:  # noqa: ARG002
        """Mark the runtime wedged with the stored reason, then raise."""
        import runtime_wedge as wedge_state

        wedge_state.mark_wedged(self._reason)
        failure = RuntimeError("sanitized adapter error after wedge")
        raise failure
class _MarkWedgedThenBlockExecutor:
    """Stub executor that flags a wedge and then never returns.

    Models a wedge raised inside a still-running execute(): the adapter
    marks the runtime wedged and then keeps awaiting something
    network-shaped until the outer wait_for cuts it short. Before the
    wedge check existed the smoke returned 0 here ('timed out past
    import-tree') even though the runtime had already self-reported
    wedged.
    """

    def __init__(self, reason: str) -> None:
        # Reason string handed to runtime_wedge.mark_wedged() in execute().
        self._reason = reason

    async def execute(self, context, event_queue) -> None:  # noqa: ARG002
        """Mark the runtime wedged, then wait on an event nobody sets."""
        import runtime_wedge as wedge_state

        wedge_state.mark_wedged(self._reason)
        # Fresh event with no setter: this await only ends when the
        # caller cancels it (e.g. via asyncio.wait_for's timeout).
        never_set = asyncio.Event()
        await never_set.wait()
@pytest.fixture
def reset_runtime_wedge():
    """Clear the runtime wedge flag before and after the requesting test.

    The wedge is module-scoped state (`_DEFAULT` in runtime_wedge.py),
    so a flag left set by one test would contaminate every later smoke
    test in the same pytest process. Resetting on both sides of the
    yield keeps an early failure from poisoning the rest of the file.
    """
    import runtime_wedge as wedge_state

    # Setup: start from a healthy runtime.
    wedge_state.reset_for_test()
    yield
    # Teardown: leave a healthy runtime behind for the next test.
    wedge_state.reset_for_test()
@pytest.mark.asyncio
async def test_smoke_fails_when_adapter_marked_wedged_via_exception(
    stub_build, reset_runtime_wedge,
):
    """PR-25 regression class: wedge flagged, then a non-import error.

    The adapter catches the SDK init wedge, marks runtime_wedge, and
    raises a sanitized error. The outer exception class
    (`RuntimeError`) is not an import error, so it would have been a
    PASS before the wedge check; the post-run wedge check must turn
    that PASS into a FAIL.
    """
    wedged_executor = _MarkWedgedThenRaiseExecutor(
        "claude SDK init timeout — restart workspace",
    )
    exit_code = await smoke_mode.run_executor_smoke(wedged_executor)
    assert exit_code == 1
@pytest.mark.asyncio
async def test_smoke_fails_when_adapter_marked_wedged_then_blocks(
    stub_build, reset_runtime_wedge, monkeypatch: pytest.MonkeyPatch,
):
    """Wedge flagged, then execute() hangs until the outer timeout.

    Same wedge class as the raise-after-wedge case, but the adapter
    does not raise — it keeps awaiting (e.g. a control-message reply
    that will never come). The outer wait_for cuts it short, which
    would have been a PASS-on-timeout before the wedge check; the wedge
    check must upgrade it to FAIL.
    """
    # Shrink the smoke timeout so the blocking executor is cut off fast.
    monkeypatch.setattr(smoke_mode, "_SMOKE_TIMEOUT_SECS", 0.1)
    blocking_executor = _MarkWedgedThenBlockExecutor(
        "hermes init handshake timed out",
    )
    exit_code = await smoke_mode.run_executor_smoke(blocking_executor)
    assert exit_code == 1
@pytest.mark.asyncio
async def test_smoke_passes_when_runtime_wedge_is_clean_after_clean_execute(
    stub_build, reset_runtime_wedge,
):
    """Belt-and-braces: no wedge flag plus a clean execute() still PASSes.

    Pins that the wedge check is additive — it must not fail healthy
    executions (e.g. by treating "no runtime_wedge import" as a wedge).
    """
    healthy_executor = _CleanExecutor()
    exit_code = await smoke_mode.run_executor_smoke(healthy_executor)
    assert exit_code == 0
def test_check_runtime_wedge_returns_none_when_module_missing(
    monkeypatch: pytest.MonkeyPatch,
):
    """Import-resilience contract for `_check_runtime_wedge`.

    The helper must swallow ImportError (and any other exception raised
    while reading the module) so a corrupt install does not crash the
    smoke gate.
    """
    import builtins

    original_import = builtins.__import__

    def _raising_import(name, *args, **kwargs):
        # Only the runtime_wedge import is sabotaged; every other
        # import is delegated to the real machinery.
        if name != "runtime_wedge":
            return original_import(name, *args, **kwargs)
        raise ImportError("simulated: runtime_wedge unavailable")

    monkeypatch.setattr(builtins, "__import__", _raising_import)
    assert smoke_mode._check_runtime_wedge() is None
def test_check_runtime_wedge_returns_reason_when_marked(reset_runtime_wedge):
    """A marked wedge is reported with its reason, verbatim.

    After an adapter calls runtime_wedge.mark_wedged(reason), the
    helper must hand back that exact reason so the smoke can surface it
    in the FAIL log line.
    """
    import runtime_wedge as wedge_state

    expected_reason = "explicit test reason"
    wedge_state.mark_wedged(expected_reason)
    assert smoke_mode._check_runtime_wedge() == expected_reason
def test_check_runtime_wedge_returns_none_when_clean(reset_runtime_wedge):
    """Pre-condition for the additive contract: clean runtime gives None.

    With no adapter having marked the runtime wedged, the helper must
    return None (not the empty string from `wedge_reason()`), so the
    caller's `is not None` check works.
    """
    observed = smoke_mode._check_runtime_wedge()
    assert observed is None