Merge pull request #2473 from Molecule-AI/feat/universal-turn-smoke-runtime-wedge

feat(smoke): consult runtime_wedge after execute() to catch SDK init wedges
2026-05-02 00:52:31 +00:00 · 2026-05-02 00:52:31 +00:00 · b39dc62de6
commit b39dc62de6
parent 2297c083c8 59f0a449bd
2 changed files with 227 additions and 19 deletions
--- a/workspace/smoke_mode.py
+++ b/workspace/smoke_mode.py
@ -15,6 +15,15 @@ times out — that's a *pass*. If a lazy import is broken, the call
 raises `ImportError` / `ModuleNotFoundError` from inside the executor
 body — that's a *fail*.

+Universal wedge gate (task #131): timeout-as-pass alone misses init
+wedges where the SDK process spins for 60s+ on a malformed argv
+(claude-agent-sdk PR #25 class). After every provisional-PASS path,
+the smoke consults `runtime_wedge.is_wedged()` — adapters opt in by
+calling `runtime_wedge.mark_wedged(reason)` from their executor's
+wedge catch arm, and the smoke turns the provisional PASS into a FAIL
+when the flag is set. Adapters that do not opt in keep working as
+before — the check is additive.
+
 Activated by setting `MOLECULE_SMOKE_MODE=1` in the env. Wired into
 `main.py` after `executor = await adapter.create_executor(...)` so the
 full adapter setup path runs first; the smoke just adds one more
@ -23,7 +32,10 @@ exercise step before exit.
 CI usage (intended for `molecule-ci/.github/workflows/publish-template-image.yml`):
  docker run --rm \
    -e WORKSPACE_ID=fake -e MOLECULE_SMOKE_MODE=1 \
+    -e MOLECULE_SMOKE_TIMEOUT_SECS=90 \
    "$IMAGE" molecule-runtime
+The 90s timeout is calibrated to claude-agent-sdk's 60s
+`initialize()` handshake — adapters with shorter init can lower it.
 """
 from __future__ import annotations

@ -81,20 +93,52 @@ def _build_stub_context() -> tuple[Any, Any]:
    return context, queue


+def _check_runtime_wedge() -> str | None:
+    """Return the wedge reason if any adapter has marked the runtime
+    wedged during this smoke run, or None when healthy.
+
+    Universal turn-smoke (task #131): adapters that hit an unrecoverable
+    init wedge (e.g. claude-agent-sdk's `Control request timeout:
+    initialize` after a malformed CLI argv) call
+    `runtime_wedge.mark_wedged(reason)`. The smoke gate consults this
+    flag at the end of every provisional-PASS path — those branches
+    are turned into FAIL when the flag is set, so a wedge that was
+    triggered inside a still-running execute() (timeout branch) or
+    inside a non-import exception (PASS-on-other-error branch) gets
+    surfaced instead of silently shipping a broken image to GHCR.
+
+    Lazy import: the runtime may be installed without runtime_wedge in
+    a corrupt-rolling-deploy state, in which case "no wedge info"
+    reads as "assume healthy" — same fail-open posture heartbeat.py
+    takes for the same reason.
+    """
+    try:
+        from runtime_wedge import is_wedged, wedge_reason
+    except Exception:  # noqa: BLE001 — deliberate fail-open, see docstring
+        return None
+    if is_wedged():
+        return wedge_reason() or "<unspecified>"  # falsy reason -> placeholder
+    return None
+
+
 async def run_executor_smoke(executor: Any) -> int:
    """Invoke executor.execute() once with stub deps. Return an exit code.

    Returns:
-      0 — import tree healthy. Either execution timed out (the
-          expected outcome — we hit a network boundary like an LLM
-          call) or completed cleanly. Either way, no broken imports.
-      1 — broken lazy import detected. Re-raised as a clear log line
-          so the publish gate's stderr captures the offending symbol.
+      0 — import tree healthy AND no adapter marked the runtime wedged.
+          Either execution timed out (the expected outcome — we hit a
+          network boundary like an LLM call) or completed cleanly.
+      1 — broken lazy import detected, OR an adapter marked the
+          runtime wedged via runtime_wedge.mark_wedged(). Re-raised
+          as a clear log line so the publish gate's stderr captures
+          the offending symbol or wedge reason.

    The 5-second timeout comes from `MOLECULE_SMOKE_TIMEOUT_SECS` env
-    (default 5.0). Bump it via env if a slow adapter setup overlaps the
-    first execute call. Don't make it too long — the publish workflow
-    multiplies this across N templates.
+    (default 5.0). Bump it via env when the failure mode under test is
+    an init handshake that takes longer than 5s to give up — e.g.
+    claude-agent-sdk's 60s `initialize()` timeout needs ~90s here so
+    the SDK marks itself wedged before our outer wait_for fires.
+    The publish workflow sets this value per-template via env.
    """
    print(
        f"[smoke-mode] invoking executor.execute(stub_ctx, stub_queue) "
@ -114,6 +158,11 @@ async def run_executor_smoke(executor: Any) -> int:
        )
        return 1

+    # Outcome of executor.execute(). PASS-shaped paths (timeout, clean
+    # return, non-import exception) only record a provisional exit code
+    # of 0 and fall through to the post-run wedge check below, which
+    # turns PASS into FAIL when the runtime self-reports wedged.
+    # FAIL-shaped paths (import error) return 1 immediately.
    try:
        await asyncio.wait_for(
            executor.execute(context, queue),
@ -121,9 +170,11 @@ async def run_executor_smoke(executor: Any) -> int:
        )
    except (asyncio.TimeoutError, asyncio.CancelledError):
        # Timeout = imports healthy, execution was proceeding and hit
-        # a network boundary or long await. Pass.
-        print("[smoke-mode] PASS: timed out past import-tree (imports healthy)")
-        return 0
+        # a network boundary or long await. Provisionally PASS — but
+        # also check runtime_wedge below: an adapter whose init wedge
+        # fires inside the timeout window still needs to FAIL the gate.
+        pre_wedge_code = 0
+        pre_wedge_msg = "timed out past import-tree (imports healthy)"
    except (ImportError, ModuleNotFoundError) as imp_err:
        # The exact regression class issue #2275 exists to catch.
        print(
@ -134,13 +185,33 @@ async def run_executor_smoke(executor: Any) -> int:
        return 1
    except Exception as other_err:  # noqa: BLE001
        # Anything else (auth errors, validation errors, runtime bugs)
-        # is downstream of the import gate. Pass — these are caught by
-        # the relevant adapter-level tests, not by this smoke.
-        print(
-            f"[smoke-mode] PASS: execute() raised "
-            f"{type(other_err).__name__} past import-tree (not an import error)"
+        # is downstream of the import gate. Provisionally PASS — these
+        # are caught by adapter-level tests, NOT by this gate, EXCEPT
+        # when the adapter also called runtime_wedge.mark_wedged() on
+        # the way out (the PR-25-class wedge — SDK init failure inside
+        # execute()). The post-run wedge check below catches that.
+        pre_wedge_code = 0
+        pre_wedge_msg = (
+            f"execute() raised {type(other_err).__name__} "
+            "past import-tree (not an import error)"
        )
-        return 0
    else:
-        print("[smoke-mode] PASS: execute() completed within timeout (imports + body OK)")
-        return 0
+        pre_wedge_code = 0
+        pre_wedge_msg = "execute() completed within timeout (imports + body OK)"
+
+    wedge_reason_str = _check_runtime_wedge()
+    if wedge_reason_str is not None:
+        # Adapter self-reported wedge — overrides any provisional PASS.
+        # This is the path that catches the PR-25-class regression
+        # (claude_agent_sdk init wedge from a malformed CLI argv) that
+        # otherwise looks like a benign network-call timeout to the
+        # outer wait_for.
+        print(
+            f"[smoke-mode] FAIL: runtime self-reported wedged after execute(): "
+            f"{wedge_reason_str}",
+            file=sys.stderr,
+        )
+        return 1
+
+    print(f"[smoke-mode] PASS: {pre_wedge_msg}")
+    return pre_wedge_code
--- a/workspace/tests/test_smoke_mode.py
+++ b/workspace/tests/test_smoke_mode.py
@ -209,3 +209,140 @@ async def test_smoke_fails_when_stub_context_build_breaks(monkeypatch: pytest.Mo
    monkeypatch.setattr(smoke_mode, "_build_stub_context", _fail_build)
    code = await smoke_mode.run_executor_smoke(_CleanExecutor())
    assert code == 1
+
+
+# ─── runtime_wedge integration (universal turn-smoke, task #131) ───────
+#
+# These tests pin the post-execute wedge-check that upgrades a
+# provisional PASS to FAIL when an adapter has marked the runtime
+# wedged via `runtime_wedge.mark_wedged()`. Without this gate, the
+# PR-25-class regression (claude_agent_sdk init wedge from a malformed
+# CLI argv) shipped to GHCR because the smoke saw the outer wait_for
+# timeout as "imports healthy, hit a network boundary."
+
+
+class _MarkWedgedThenRaiseExecutor:
+    """Mimics the claude_sdk_executor wedge path: catches the SDK's
+    `Control request timeout: initialize`, calls
+    `runtime_wedge.mark_wedged()` from the catch arm, then re-raises
+    a sanitized error. The smoke must surface this as FAIL even
+    though the outer exception class (`RuntimeError` here) would
+    otherwise be a PASS-on-non-import-error.
+    """
+
+    def __init__(self, reason: str):
+        self._reason = reason
+
+    async def execute(self, context, event_queue) -> None:  # noqa: ARG002
+        import runtime_wedge
+        runtime_wedge.mark_wedged(self._reason)
+        raise RuntimeError("sanitized adapter error after wedge")
+
+
+class _MarkWedgedThenBlockExecutor:
+    """Mimics a wedge that fires inside a still-running execute() —
+    the adapter marks wedged, then continues to await something
+    network-shaped that the outer wait_for cuts short. The pre-fix
+    smoke returned 0 here ('timed out past import-tree') even though
+    the runtime had already self-reported wedged.
+    """
+
+    def __init__(self, reason: str):
+        self._reason = reason
+
+    async def execute(self, context, event_queue) -> None:  # noqa: ARG002
+        import runtime_wedge
+        runtime_wedge.mark_wedged(self._reason)
+        await asyncio.Event().wait()
+
+
+@pytest.fixture
+def reset_runtime_wedge():
+    """Ensure each wedge-test starts and ends with the runtime healthy.
+
+    The wedge is module-scoped state (`_DEFAULT` in runtime_wedge.py),
+    so a leak from one test would contaminate every subsequent smoke
+    test in the same pytest process. Reset on both sides so an early
+    failure doesn't poison the rest of the file either.
+    """
+    import runtime_wedge
+    runtime_wedge.reset_for_test()
+    yield
+    runtime_wedge.reset_for_test()
+
+
+@pytest.mark.asyncio
+async def test_smoke_fails_when_adapter_marked_wedged_via_exception(
+    stub_build, reset_runtime_wedge,
+):
+    """PR-25 regression class: adapter catches SDK init wedge, marks
+    runtime_wedge, raises a sanitized error. Outer exception class
+    (`RuntimeError`) is non-import → would have been PASS pre-fix.
+    Post-fix: post-run wedge check overrides PASS → FAIL."""
+    code = await smoke_mode.run_executor_smoke(
+        _MarkWedgedThenRaiseExecutor("claude SDK init timeout — restart workspace"),
+    )
+    assert code == 1
+
+
+@pytest.mark.asyncio
+async def test_smoke_fails_when_adapter_marked_wedged_then_blocks(
+    stub_build, reset_runtime_wedge, monkeypatch: pytest.MonkeyPatch,
+):
+    """Same wedge class as above but the adapter doesn't raise — it
+    keeps awaiting (e.g. waiting on a control-message reply that will
+    never come). Outer wait_for cuts short → would have been PASS-on-
+    timeout pre-fix. Post-fix: wedge check upgrades to FAIL.
+    """
+    monkeypatch.setattr(smoke_mode, "_SMOKE_TIMEOUT_SECS", 0.1)
+    code = await smoke_mode.run_executor_smoke(
+        _MarkWedgedThenBlockExecutor("hermes init handshake timed out"),
+    )
+    assert code == 1
+
+
+@pytest.mark.asyncio
+async def test_smoke_passes_when_runtime_wedge_is_clean_after_clean_execute(
+    stub_build, reset_runtime_wedge,
+):
+    """Belt-and-braces: wedge-clean + clean execute() must still PASS.
+    Pins that the new check is additive — it doesn't accidentally
+    fail healthy executions (e.g. by treating "no runtime_wedge import"
+    as a wedge)."""
+    code = await smoke_mode.run_executor_smoke(_CleanExecutor())
+    assert code == 0
+
+
+def test_check_runtime_wedge_returns_none_when_module_missing(
+    monkeypatch: pytest.MonkeyPatch,
+):
+    """Direct test for the import-resilience contract — the helper
+    must swallow ImportError (and any other exception while reading
+    the module) so a corrupt install doesn't crash the smoke gate."""
+    import builtins
+    real_import = builtins.__import__
+
+    def _raising_import(name, *args, **kwargs):
+        if name == "runtime_wedge":
+            raise ImportError("simulated: runtime_wedge unavailable")
+        return real_import(name, *args, **kwargs)
+
+    monkeypatch.setattr(builtins, "__import__", _raising_import)
+    assert smoke_mode._check_runtime_wedge() is None
+
+
+def test_check_runtime_wedge_returns_reason_when_marked(reset_runtime_wedge):
+    """When an adapter has called runtime_wedge.mark_wedged(reason),
+    the helper returns that reason verbatim so the smoke can surface
+    it in the FAIL log line."""
+    import runtime_wedge
+    runtime_wedge.mark_wedged("explicit test reason")
+    assert smoke_mode._check_runtime_wedge() == "explicit test reason"
+
+
+def test_check_runtime_wedge_returns_none_when_clean(reset_runtime_wedge):
+    """Pre-condition for the additive contract: helper must return
+    None (not the empty string from `wedge_reason()`) when no adapter
+    has marked the runtime wedged, so the caller's `is not None`
+    check works."""
+    assert smoke_mode._check_runtime_wedge() is None