diff --git a/workspace/smoke_mode.py b/workspace/smoke_mode.py index 79399946..bc65c986 100644 --- a/workspace/smoke_mode.py +++ b/workspace/smoke_mode.py @@ -15,6 +15,15 @@ times out — that's a *pass*. If a lazy import is broken, the call raises `ImportError` / `ModuleNotFoundError` from inside the executor body — that's a *fail*. +Universal wedge gate (task #131): timeout-as-pass alone misses init +wedges where the SDK process spins for 60s+ on a malformed argv +(claude-agent-sdk PR #25 class). After every result path, the smoke +consults `runtime_wedge.is_wedged()` — adapters opt-in by calling +`runtime_wedge.mark_wedged(reason)` from their executor's wedge catch +arm, and the smoke upgrades the provisional PASS to FAIL when the +flag is set. Non-opt-in adapters keep working as before — the check +is additive. + Activated by setting `MOLECULE_SMOKE_MODE=1` in the env. Wired into `main.py` after `executor = await adapter.create_executor(...)` so the full adapter setup path runs first; the smoke just adds one more @@ -23,7 +32,10 @@ exercise step before exit. CI usage (intended for `molecule-ci/.github/workflows/publish-template-image.yml`): docker run --rm \ -e WORKSPACE_ID=fake -e MOLECULE_SMOKE_MODE=1 \ + -e MOLECULE_SMOKE_TIMEOUT_SECS=90 \ "$IMAGE" molecule-runtime +The 90s timeout is calibrated to claude-agent-sdk's 60s +`initialize()` handshake — adapters with shorter init can lower it. """ from __future__ import annotations @@ -81,20 +93,52 @@ def _build_stub_context() -> tuple[Any, Any]: return context, queue +def _check_runtime_wedge() -> str | None: + """Return the wedge reason if any adapter has marked the runtime + wedged during this smoke run, or None when healthy. + + Universal turn-smoke (task #131): adapters that hit an unrecoverable + init wedge (e.g. claude-agent-sdk's `Control request timeout: + initialize` after a malformed CLI argv) call + `runtime_wedge.mark_wedged(reason)`. The smoke gate consults this + flag at the end of every result path — pre-existing PASS branches + are upgraded to FAIL when the flag is set, so a wedge that was + triggered inside a still-running execute() (timeout branch) or + inside a non-import exception (PASS-on-other-error branch) gets + surfaced instead of silently shipping a broken image to GHCR. + + Lazy import: the runtime may be installed without runtime_wedge in + a corrupt-rolling-deploy state, in which case "no wedge info" + reads as "assume healthy" — same fail-open posture heartbeat.py + takes for the same reason. + """ + try: + from runtime_wedge import is_wedged, wedge_reason + except Exception: + return None + if is_wedged(): + return wedge_reason() or "" + return None + + async def run_executor_smoke(executor: Any) -> int: """Invoke executor.execute() once with stub deps. Return an exit code. Returns: - 0 — import tree healthy. Either execution timed out (the - expected outcome — we hit a network boundary like an LLM - call) or completed cleanly. Either way, no broken imports. - 1 — broken lazy import detected. Re-raised as a clear log line - so the publish gate's stderr captures the offending symbol. + 0 — import tree healthy AND no adapter marked the runtime wedged. + Either execution timed out (the expected outcome — we hit a + network boundary like an LLM call) or completed cleanly. + 1 — broken lazy import detected, OR an adapter marked the + runtime wedged via runtime_wedge.mark_wedged(). Re-raised + as a clear log line so the publish gate's stderr captures + the offending symbol or wedge reason. The 5-second timeout comes from `MOLECULE_SMOKE_TIMEOUT_SECS` env - (default 5.0). Bump it via env if a slow adapter setup overlaps the - first execute call. Don't make it too long — the publish workflow - multiplies this across N templates. + (default 5.0). Bump it via env when the failure mode under test is + an init handshake that takes longer than 5s to give up — e.g. + claude-agent-sdk's 60s `initialize()` timeout needs ~90s here so + the SDK marks itself wedged before our outer wait_for fires. + The publish workflow sets this value per-template via env. """ print( f"[smoke-mode] invoking executor.execute(stub_ctx, stub_queue) " @@ -114,6 +158,11 @@ async def run_executor_smoke(executor: Any) -> int: ) return 1 + # Outcome of executor.execute() — narrowed to exit code by the + # post-run wedge check below. Pre-wedge-check exit code: 0 for + # PASS-shaped paths (timeout, clean return, non-import exception), + # 1 for FAIL-shaped paths (import error). Wedge check upgrades + # PASS → FAIL when the runtime self-reports wedged. try: await asyncio.wait_for( executor.execute(context, queue), @@ -121,9 +170,11 @@ async def run_executor_smoke(executor: Any) -> int: ) except (asyncio.TimeoutError, asyncio.CancelledError): # Timeout = imports healthy, execution was proceeding and hit - # a network boundary or long await. Pass. - print("[smoke-mode] PASS: timed out past import-tree (imports healthy)") - return 0 + # a network boundary or long await. Provisionally PASS — but + # also check runtime_wedge below: an adapter whose init wedge + # fires inside the timeout window still needs to FAIL the gate. + pre_wedge_code = 0 + pre_wedge_msg = "timed out past import-tree (imports healthy)" except (ImportError, ModuleNotFoundError) as imp_err: # The exact regression class issue #2275 exists to catch. print( @@ -134,13 +185,33 @@ async def run_executor_smoke(executor: Any) -> int: return 1 except Exception as other_err: # noqa: BLE001 # Anything else (auth errors, validation errors, runtime bugs) - # is downstream of the import gate. Pass — these are caught by - # the relevant adapter-level tests, not by this smoke. - print( - f"[smoke-mode] PASS: execute() raised " - f"{type(other_err).__name__} past import-tree (not an import error)" + # is downstream of the import gate. Provisionally PASS — these + # are caught by adapter-level tests, NOT by this gate, EXCEPT + # when the adapter also called runtime_wedge.mark_wedged() on + # the way out (the PR-25-class wedge — SDK init failure inside + # execute()). The post-run wedge check below catches that. + pre_wedge_code = 0 + pre_wedge_msg = ( + f"execute() raised {type(other_err).__name__} " + "past import-tree (not an import error)" ) - return 0 else: - print("[smoke-mode] PASS: execute() completed within timeout (imports + body OK)") - return 0 + pre_wedge_code = 0 + pre_wedge_msg = "execute() completed within timeout (imports + body OK)" + + wedge_reason_str = _check_runtime_wedge() + if wedge_reason_str is not None: + # Adapter self-reported wedge — overrides any provisional PASS. + # This is the path that catches the PR-25-class regression + # (claude_agent_sdk init wedge from a malformed CLI argv) that + # otherwise looks like a benign network-call timeout to the + # outer wait_for. + print( + f"[smoke-mode] FAIL: runtime self-reported wedged after execute(): " + f"{wedge_reason_str}", + file=sys.stderr, + ) + return 1 + + print(f"[smoke-mode] PASS: {pre_wedge_msg}") + return pre_wedge_code diff --git a/workspace/tests/test_smoke_mode.py b/workspace/tests/test_smoke_mode.py index 9721024f..aeae6ad6 100644 --- a/workspace/tests/test_smoke_mode.py +++ b/workspace/tests/test_smoke_mode.py @@ -209,3 +209,140 @@ async def test_smoke_fails_when_stub_context_build_breaks(monkeypatch: pytest.Mo monkeypatch.setattr(smoke_mode, "_build_stub_context", _fail_build) code = await smoke_mode.run_executor_smoke(_CleanExecutor()) assert code == 1 + + +# ─── runtime_wedge integration (universal turn-smoke, task #131) ─────── +# +# These tests pin the post-execute wedge-check that upgrades a +# provisional PASS to FAIL when an adapter has marked the runtime +# wedged via `runtime_wedge.mark_wedged()`. Without this gate, the +# PR-25-class regression (claude_agent_sdk init wedge from a malformed +# CLI argv) shipped to GHCR because the smoke saw the outer wait_for +# timeout as "imports healthy, hit a network boundary." + + +class _MarkWedgedThenRaiseExecutor: + """Mimics the claude_sdk_executor wedge path: catches the SDK's + `Control request timeout: initialize`, calls + `runtime_wedge.mark_wedged()` from the catch arm, then re-raises + a sanitized error. The smoke must surface this as FAIL even + though the outer exception class (`RuntimeError` here) would + otherwise be a PASS-on-non-import-error. + """ + + def __init__(self, reason: str): + self._reason = reason + + async def execute(self, context, event_queue) -> None: # noqa: ARG002 + import runtime_wedge + runtime_wedge.mark_wedged(self._reason) + raise RuntimeError("sanitized adapter error after wedge") + + +class _MarkWedgedThenBlockExecutor: + """Mimics a wedge that fires inside a still-running execute() — + the adapter marks wedged, then continues to await something + network-shaped that the outer wait_for cuts short. The pre-fix + smoke returned 0 here ('timed out past import-tree') even though + the runtime had already self-reported wedged. + """ + + def __init__(self, reason: str): + self._reason = reason + + async def execute(self, context, event_queue) -> None: # noqa: ARG002 + import runtime_wedge + runtime_wedge.mark_wedged(self._reason) + await asyncio.Event().wait() + + +@pytest.fixture +def reset_runtime_wedge(): + """Ensure each wedge-test starts and ends with the runtime healthy. + + The wedge is module-scoped state (`_DEFAULT` in runtime_wedge.py), + so a leak from one test would contaminate every subsequent smoke + test in the same pytest process. Reset on both sides so an early + failure doesn't poison the rest of the file either. + """ + import runtime_wedge + runtime_wedge.reset_for_test() + yield + runtime_wedge.reset_for_test() + + +@pytest.mark.asyncio +async def test_smoke_fails_when_adapter_marked_wedged_via_exception( + stub_build, reset_runtime_wedge, +): + """PR-25 regression class: adapter catches SDK init wedge, marks + runtime_wedge, raises a sanitized error. Outer exception class + (`RuntimeError`) is non-import → would have been PASS pre-fix. + Post-fix: post-run wedge check overrides PASS → FAIL.""" + code = await smoke_mode.run_executor_smoke( + _MarkWedgedThenRaiseExecutor("claude SDK init timeout — restart workspace"), + ) + assert code == 1 + + +@pytest.mark.asyncio +async def test_smoke_fails_when_adapter_marked_wedged_then_blocks( + stub_build, reset_runtime_wedge, monkeypatch: pytest.MonkeyPatch, +): + """Same wedge class as above but the adapter doesn't raise — it + keeps awaiting (e.g. waiting on a control-message reply that will + never come). Outer wait_for cuts short → would have been PASS-on- + timeout pre-fix. Post-fix: wedge check upgrades to FAIL. + """ + monkeypatch.setattr(smoke_mode, "_SMOKE_TIMEOUT_SECS", 0.1) + code = await smoke_mode.run_executor_smoke( + _MarkWedgedThenBlockExecutor("hermes init handshake timed out"), + ) + assert code == 1 + + +@pytest.mark.asyncio +async def test_smoke_passes_when_runtime_wedge_is_clean_after_clean_execute( + stub_build, reset_runtime_wedge, +): + """Belt-and-braces: wedge-clean + clean execute() must still PASS. + Pins that the new check is additive — it doesn't accidentally + fail healthy executions (e.g. by treating "no runtime_wedge import" + as a wedge).""" + code = await smoke_mode.run_executor_smoke(_CleanExecutor()) + assert code == 0 + + +def test_check_runtime_wedge_returns_none_when_module_missing( + monkeypatch: pytest.MonkeyPatch, +): + """Direct test for the import-resilience contract — the helper + must swallow ImportError (and any other exception while reading + the module) so a corrupt install doesn't crash the smoke gate.""" + import builtins + real_import = builtins.__import__ + + def _raising_import(name, *args, **kwargs): + if name == "runtime_wedge": + raise ImportError("simulated: runtime_wedge unavailable") + return real_import(name, *args, **kwargs) + + monkeypatch.setattr(builtins, "__import__", _raising_import) + assert smoke_mode._check_runtime_wedge() is None + + +def test_check_runtime_wedge_returns_reason_when_marked(reset_runtime_wedge): + """When an adapter has called runtime_wedge.mark_wedged(reason), + the helper returns that reason verbatim so the smoke can surface + it in the FAIL log line.""" + import runtime_wedge + runtime_wedge.mark_wedged("explicit test reason") + assert smoke_mode._check_runtime_wedge() == "explicit test reason" + + +def test_check_runtime_wedge_returns_none_when_clean(reset_runtime_wedge): + """Pre-condition for the additive contract: helper must return + None (not the empty string from `wedge_reason()`) when no adapter + has marked the runtime wedged, so the caller's `is not None` + check works.""" + assert smoke_mode._check_runtime_wedge() is None