Merge pull request #2154 from Molecule-AI/refactor/extract-wedge-state-from-claude-sdk
refactor(wedge): extract claude_sdk_executor wedge state into runtime_wedge module
This commit is contained in:
commit
66b9c04057
@ -116,11 +116,21 @@ class BaseAdapter(ABC):
|
||||
"""Interface every agent infrastructure adapter must implement.
|
||||
|
||||
To add a new agent infra:
|
||||
1. Create workspace/adapters/<your_infra>/
|
||||
1. Create a standalone template repo (molecule-ai-workspace-template-<infra>)
|
||||
2. Implement adapter.py with a class extending BaseAdapter
|
||||
3. Add requirements.txt with your infra's dependencies
|
||||
4. Export as Adapter in __init__.py
|
||||
5. Submit a PR
|
||||
3. Add requirements.txt with your infra's dependencies + molecule-runtime
|
||||
4. Set ADAPTER_MODULE in the Dockerfile to your adapter module path
|
||||
|
||||
Cross-cutting capabilities your adapter can opt into:
|
||||
- capabilities() — declare native ownership of heartbeat, scheduler,
|
||||
session, status mgmt, etc. (see RuntimeCapabilities above)
|
||||
- idle_timeout_override() — extend the platform's per-dispatch
|
||||
silence window for SDKs with long synth turns
|
||||
- runtime_wedge.mark_wedged() / clear_wedge() — flip the workspace
|
||||
to `degraded` + auto-recover when your SDK hits a non-recoverable
|
||||
error class. Import directly from `runtime_wedge`; the heartbeat
|
||||
forwards the state to the platform automatically. See the
|
||||
runtime_wedge module docstring for the integration recipe.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
|
||||
@ -87,71 +87,29 @@ _RETRYABLE_PATTERNS = (
|
||||
"try again",
|
||||
)
|
||||
|
||||
# Module-level SDK-wedge flag. When claude_agent_sdk's `query.initialize()`
|
||||
# raises `Control request timeout: initialize`, the SDK's internal client-
|
||||
# process state is corrupted for the rest of the Python process — every
|
||||
# subsequent `_run_query()` call hits the same wedge and re-throws. The
|
||||
# executor itself can't auto-recover (the underlying CLI subprocess and
|
||||
# its read pipe are in an unrecoverable state); only a workspace restart
|
||||
# clears it.
|
||||
#
|
||||
# The heartbeat task reads these helpers and reports
|
||||
# `runtime_state="wedged"` to the platform, which flips the workspace to
|
||||
# `degraded` so the canvas surfaces a Restart hint instead of leaving
|
||||
# the user staring at a green dot while every chat hangs.
|
||||
#
|
||||
# Module scope (not instance scope) is deliberate: the wedge is a
|
||||
# property of the Python process, not the executor. A future per-org
|
||||
# multi-executor design could move this to a shared registry, but with
|
||||
# one executor per workspace process today the simplest lock-free
|
||||
# read+write fits.
|
||||
_sdk_wedged_reason: str | None = None
|
||||
# Wedge state moved to runtime_wedge (see that module's docstring for
|
||||
# the rationale + the broader "Compatibility shim" note). This block
|
||||
# re-exports under the historical names so the in-file call sites in
|
||||
# _run_query stay terse and any external consumer that imported them
|
||||
# from claude_sdk_executor keeps working for one release cycle.
|
||||
from runtime_wedge import ( # noqa: E402
|
||||
clear_wedge as _clear_sdk_wedge_on_success,
|
||||
is_wedged,
|
||||
mark_wedged as _mark_sdk_wedged,
|
||||
reset_for_test as _reset_sdk_wedge_for_test,
|
||||
wedge_reason,
|
||||
)
|
||||
|
||||
|
||||
def is_wedged() -> bool:
|
||||
"""True if the Claude SDK has hit a non-recoverable init wedge in
|
||||
this process. Sticky until process restart."""
|
||||
return _sdk_wedged_reason is not None
|
||||
|
||||
|
||||
def wedge_reason() -> str:
|
||||
"""Human-readable description of the wedge cause, or empty string
|
||||
when not wedged. Surfaced to the canvas via heartbeat sample_error."""
|
||||
return _sdk_wedged_reason or ""
|
||||
|
||||
|
||||
def _mark_sdk_wedged(reason: str) -> None:
|
||||
"""Internal — flag the SDK as wedged. Only the first call wins
|
||||
(subsequent identical wedges shouldn't overwrite a more specific
|
||||
reason). Tests use `_reset_sdk_wedge_for_test()` to clear."""
|
||||
global _sdk_wedged_reason
|
||||
if _sdk_wedged_reason is None:
|
||||
_sdk_wedged_reason = reason
|
||||
logger.error("SDK wedge detected: %s — workspace will report degraded until a successful query clears it", reason)
|
||||
|
||||
|
||||
def _clear_sdk_wedge_on_success() -> None:
|
||||
"""Auto-recovery — called from _run_query after a successful
|
||||
completion. The original wedge could be transient (a single network
|
||||
blip during the SDK's first-message handshake), and a sticky-only
|
||||
flag would lock the workspace into degraded forever even after the
|
||||
SDK started working again. Clearing on observed success means the
|
||||
next heartbeat after a working query reports `runtime_state` empty
|
||||
and the platform flips status back to online.
|
||||
|
||||
No-op when not wedged (the common case)."""
|
||||
global _sdk_wedged_reason
|
||||
if _sdk_wedged_reason is not None:
|
||||
logger.info("SDK wedge cleared after successful query — workspace will recover to online on next heartbeat")
|
||||
_sdk_wedged_reason = None
|
||||
|
||||
|
||||
def _reset_sdk_wedge_for_test() -> None:
|
||||
"""Test-only escape hatch. Production code clears the wedge via
|
||||
`_clear_sdk_wedge_on_success` when a query succeeds; this helper
|
||||
is for unit tests that need to reset between cases."""
|
||||
global _sdk_wedged_reason
|
||||
_sdk_wedged_reason = None
|
||||
# Names below are re-exported (not consumed inside this file) for
|
||||
# backwards compatibility with third-party adapters that imported them
|
||||
# from claude_sdk_executor before the wedge state moved to runtime_wedge.
|
||||
# Listing them in __all__ marks the intent explicitly and stops static
|
||||
# analysis from flagging the imports as unused.
|
||||
__all__ = [
|
||||
"is_wedged",
|
||||
"wedge_reason",
|
||||
"_reset_sdk_wedge_for_test",
|
||||
]
|
||||
|
||||
|
||||
# Per-tool-use summarizers. Reads the most-useful argument from each
|
||||
|
||||
@ -22,16 +22,25 @@ from platform_auth import auth_headers, refresh_cache, self_source_headers
|
||||
|
||||
def _runtime_state_payload() -> dict:
|
||||
"""Build the {runtime_state, sample_error} portion of the heartbeat
|
||||
body when the Claude SDK has hit a wedge. Returns an empty dict
|
||||
when the runtime is healthy so the heartbeat payload doesn't grow
|
||||
fields the platform doesn't need.
|
||||
body when SOME adapter executor has marked itself wedged. Returns
|
||||
an empty dict when the runtime is healthy so the heartbeat payload
|
||||
doesn't grow fields the platform doesn't need.
|
||||
|
||||
Imported lazily so workspaces running non-Claude runtimes (where
|
||||
`claude_sdk_executor` may not be importable at all) keep working —
|
||||
a missing import means "no Claude wedge possible here, healthy."
|
||||
Source of truth is runtime_wedge (lives in molecule-runtime,
|
||||
independent of any specific adapter). Pre task #87 this imported
|
||||
from claude_sdk_executor — that worked because the executor was
|
||||
bundled into molecule-runtime, but blocked moving it to the
|
||||
claude-code template repo. The runtime_wedge module is now the
|
||||
cross-cutting wedge-state holder; adapters mark/clear via it,
|
||||
heartbeat reads it.
|
||||
|
||||
Imported lazily so a workspace whose runtime image somehow ships
|
||||
without runtime_wedge (corrupt install, mid-rolling-deploy state)
|
||||
keeps heartbeating — a missing import means "no wedge info; assume
|
||||
healthy."
|
||||
"""
|
||||
try:
|
||||
from claude_sdk_executor import is_wedged, wedge_reason
|
||||
from runtime_wedge import is_wedged, wedge_reason
|
||||
except Exception:
|
||||
return {}
|
||||
if not is_wedged():
|
||||
|
||||
203
workspace/runtime_wedge.py
Normal file
203
workspace/runtime_wedge.py
Normal file
@ -0,0 +1,203 @@
|
||||
"""Per-process runtime-wedge state.
|
||||
|
||||
Adapter executors that hit a non-recoverable wedge (e.g. claude-agent-sdk's
|
||||
`Control request timeout: initialize` corrupting the client process's
|
||||
internal state) call mark_wedged(reason). The heartbeat task reads
|
||||
is_wedged() / wedge_reason() and forwards them in the heartbeat payload's
|
||||
runtime_state field — the platform then flips workspace status to
|
||||
`degraded` so the canvas surfaces a Restart hint instead of leaving the
|
||||
user staring at a green dot while every chat hangs.
|
||||
|
||||
Module scope (not instance scope) is deliberate: the wedge is a property
|
||||
of the Python process, not any particular executor. With one executor
|
||||
per workspace process today this is the simplest lock-free
|
||||
read+write fit. A future per-org multi-executor design could move this
|
||||
to a shared registry.
|
||||
|
||||
This module lives in molecule-runtime (NOT in any adapter / template
|
||||
repo) because:
|
||||
|
||||
1. workspace/heartbeat.py reads it on every heartbeat — cross-cutting
|
||||
concern, runtime owns it.
|
||||
2. Multiple adapter executors can mark themselves wedged with their
|
||||
own reason; the runtime aggregates one flag for the platform.
|
||||
3. Decoupling from claude_sdk_executor is the prerequisite for the
|
||||
universal-runtime refactor (molecule-core task #87) — without
|
||||
this extraction, claude_sdk_executor.py couldn't move to its
|
||||
template repo because heartbeat would lose access to the wedge
|
||||
state.
|
||||
|
||||
Public API: mark_wedged(reason), clear_wedge(), is_wedged(),
|
||||
wedge_reason(). The reset_for_test() helper is for unit tests only.
|
||||
|
||||
How to use from a NEW adapter (template repo)
|
||||
---------------------------------------------
|
||||
|
||||
Hermes, Codex, LangGraph, or any future adapter that wants the same
|
||||
"flip-to-degraded-on-fatal-wedge" UX should call mark_wedged + clear_wedge
|
||||
from its executor. The runtime imports + heartbeat plumbing are already
|
||||
in place — adapters do not change anything in molecule-runtime.
|
||||
|
||||
Minimum integration (~6 LOC inside the executor):
|
||||
|
||||
# Import path:
|
||||
# - In a TEMPLATE repo (the common case for new adapters), the
|
||||
# runtime is installed via PyPI as `molecule-ai-workspace-runtime`,
|
||||
# so the import is `from molecule_runtime.runtime_wedge import …`.
|
||||
# - In molecule-core itself (when editing this repo's own
|
||||
# workspace/ tree), the module is at the top level — import as
|
||||
# `from runtime_wedge import …`.
|
||||
from molecule_runtime.runtime_wedge import mark_wedged, clear_wedge
|
||||
|
||||
async def execute(self, ctx, queue):
|
||||
try:
|
||||
result = await self._run_query(ctx)
|
||||
except SomeFatalSdkError as e:
|
||||
# Pick a short, operator-actionable reason. This becomes the
|
||||
# banner text on the canvas's degraded card — keep it under
|
||||
# ~80 chars and name the recovery action when possible.
|
||||
mark_wedged(f"hermes init timeout — restart workspace ({e})")
|
||||
raise
|
||||
clear_wedge() # observed-success → next heartbeat reports healthy
|
||||
return result
|
||||
|
||||
What you get for free:
|
||||
- Heartbeat payload sets runtime_state="wedged" + sample_error=<reason>
|
||||
on the next 30s tick.
|
||||
- registry.go's evaluateStatus flips the workspace to `degraded` and
|
||||
broadcasts WORKSPACE_DEGRADED so the canvas card turns yellow with
|
||||
your reason as the subtitle.
|
||||
- clear_wedge() on the next successful turn flips the workspace back
|
||||
to `online` automatically — no manual operator action.
|
||||
|
||||
What NOT to do:
|
||||
- Don't store wedge state in your adapter module. The platform-side
|
||||
consumer (heartbeat) imports from runtime_wedge by name; an adapter-
|
||||
local copy won't be observed.
|
||||
- Don't call mark_wedged for transient errors (rate limits, single
|
||||
failed network call). The whole point is "the SDK process is in a
|
||||
state that can only be cleared by restart" — false positives
|
||||
train operators to ignore the degraded banner.
|
||||
- Don't write your own clear logic. clear_wedge() is the only path
|
||||
the heartbeat watches; a custom flag won't propagate.
|
||||
|
||||
When wedge is the WRONG primitive: if the failure is per-request (the
|
||||
SDK works for some inputs but not others), surface as a normal A2A
|
||||
error response, not a wedge. Wedge means "every subsequent request in
|
||||
this process will fail until restart."
|
||||
|
||||
Compatibility shim (will be removed once #87 Phase 2 lands)
|
||||
-----------------------------------------------------------
|
||||
|
||||
claude_sdk_executor.py re-exports the four functions under the historical
|
||||
names (is_wedged, wedge_reason, _mark_sdk_wedged, _clear_sdk_wedge_on_success)
|
||||
for one release cycle. New adapter code should import from runtime_wedge
|
||||
directly; the shim only exists so existing third-party adapters that
|
||||
copied our claude_sdk_executor wedge convention have time to migrate.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class _WedgeState:
|
||||
"""Internal carrier for the wedge flag. Exposed only via the module-
|
||||
level helpers below; adapters never see this class.
|
||||
|
||||
Wrapping the state in a class (instead of a bare module-level global)
|
||||
is forward-cover for the day a runtime hosts multiple executors per
|
||||
process — a future per-scope variant can hand out keyed instances
|
||||
without changing the public mark_wedged / clear_wedge / is_wedged /
|
||||
wedge_reason API. Today there's exactly one instance (_DEFAULT).
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
# None = healthy; non-empty string = wedged with that human-
|
||||
# readable reason. Surfaced verbatim as the canvas's degraded-
|
||||
# card banner text via heartbeat.sample_error.
|
||||
self._reason: str | None = None
|
||||
|
||||
def is_wedged(self) -> bool:
|
||||
return self._reason is not None
|
||||
|
||||
def reason(self) -> str:
|
||||
return self._reason or ""
|
||||
|
||||
def mark(self, reason: str) -> None:
|
||||
# First-write-wins: a subsequent identical-class wedge can't
|
||||
# overwrite a more specific initial reason so the operator-
|
||||
# visible banner stays stable.
|
||||
if self._reason is None:
|
||||
self._reason = reason
|
||||
logger.error(
|
||||
"runtime wedge detected: %s — workspace will report degraded until cleared",
|
||||
reason,
|
||||
)
|
||||
|
||||
def clear(self) -> None:
|
||||
# No-op when not wedged (the common case — adapters call this
|
||||
# on every successful query).
|
||||
if self._reason is not None:
|
||||
logger.info(
|
||||
"runtime wedge cleared after successful operation — workspace will recover to online on next heartbeat",
|
||||
)
|
||||
self._reason = None
|
||||
|
||||
def reset(self) -> None:
|
||||
# Unconditional clear — for test fixtures only. Skips the
|
||||
# info-level log line the production clear() path emits.
|
||||
self._reason = None
|
||||
|
||||
|
||||
# Single shared instance backing the module-level helpers. Today there's
|
||||
# one executor per workspace process so this fits perfectly; the class
|
||||
# wrap above is the seam for any future per-scope variant.
|
||||
_DEFAULT = _WedgeState()
|
||||
|
||||
|
||||
def is_wedged() -> bool:
|
||||
"""True if some adapter executor in this process has marked itself
|
||||
wedged. Sticky until the same executor calls clear_wedge() on
|
||||
observed recovery (or the process restarts)."""
|
||||
return _DEFAULT.is_wedged()
|
||||
|
||||
|
||||
def wedge_reason() -> str:
|
||||
"""Human-readable description of the wedge cause, or empty string
|
||||
when not wedged. Surfaced to the canvas via heartbeat sample_error."""
|
||||
return _DEFAULT.reason()
|
||||
|
||||
|
||||
def mark_wedged(reason: str) -> None:
|
||||
"""Flag the runtime as wedged. Only the FIRST call wins so a
|
||||
subsequent identical-class wedge can't overwrite a more specific
|
||||
initial reason — the operator-visible banner stays stable.
|
||||
|
||||
Adapters call this from their executor's exception path when the
|
||||
SDK has hit a non-recoverable error class. Safe to call multiple
|
||||
times; the no-op when already wedged is intentional.
|
||||
"""
|
||||
_DEFAULT.mark(reason)
|
||||
|
||||
|
||||
def clear_wedge() -> None:
|
||||
"""Auto-recovery: adapter calls this after an observed successful
|
||||
operation. The original wedge could be transient (single network
|
||||
blip during the SDK's first-message handshake), and a sticky-only
|
||||
flag would lock the workspace into degraded forever even after the
|
||||
SDK started working again. Clearing on observed success means the
|
||||
next heartbeat after a working query reports runtime_state empty
|
||||
and the platform flips status back to online.
|
||||
|
||||
No-op when not wedged (the common case)."""
|
||||
_DEFAULT.clear()
|
||||
|
||||
|
||||
def reset_for_test() -> None:
|
||||
"""Test-only escape hatch. Production code clears the wedge via
|
||||
clear_wedge() on observed success; this helper is for unit tests
|
||||
that need to reset between cases without going through the full
|
||||
SDK round-trip."""
|
||||
_DEFAULT.reset()
|
||||
103
workspace/tests/test_runtime_wedge.py
Normal file
103
workspace/tests/test_runtime_wedge.py
Normal file
@ -0,0 +1,103 @@
|
||||
"""Tests for runtime_wedge — the runtime-side wedge-state module that
|
||||
heartbeat reads + adapter executors write. Extracted from claude_sdk_
|
||||
executor (task #87 universal-runtime refactor) so the executor can move
|
||||
to its template repo without breaking heartbeat.
|
||||
|
||||
The behavior is identical to the prior in-executor implementation; tests
|
||||
pin the contract so the re-export shim in claude_sdk_executor.py can
|
||||
later be deleted without surprise."""
|
||||
import pytest
|
||||
|
||||
import runtime_wedge
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def _reset():
|
||||
"""Each test starts with a clean wedge state — production wedges are
|
||||
sticky-per-process, but cross-test bleed would couple unrelated cases."""
|
||||
runtime_wedge.reset_for_test()
|
||||
yield
|
||||
runtime_wedge.reset_for_test()
|
||||
|
||||
|
||||
class TestRuntimeWedge:
|
||||
def test_starts_unwedged(self):
|
||||
assert runtime_wedge.is_wedged() is False
|
||||
assert runtime_wedge.wedge_reason() == ""
|
||||
|
||||
def test_mark_wedged_sets_flag_and_reason(self):
|
||||
runtime_wedge.mark_wedged("SDK init timeout")
|
||||
assert runtime_wedge.is_wedged() is True
|
||||
assert runtime_wedge.wedge_reason() == "SDK init timeout"
|
||||
|
||||
def test_first_mark_wins(self):
|
||||
# Stable banner text is more important than the most-recent
|
||||
# cause. A second wedge while already wedged should NOT
|
||||
# overwrite — operator sees the original (more diagnosable)
|
||||
# reason, not whatever the SDK said next.
|
||||
runtime_wedge.mark_wedged("SDK init timeout")
|
||||
runtime_wedge.mark_wedged("Subsequent identical-class wedge")
|
||||
assert runtime_wedge.wedge_reason() == "SDK init timeout"
|
||||
|
||||
def test_clear_wedge_restores_healthy(self):
|
||||
# Auto-recovery: when the SDK starts working again, the next
|
||||
# heartbeat must report empty runtime_state so the platform
|
||||
# flips status from degraded back to online.
|
||||
runtime_wedge.mark_wedged("transient blip")
|
||||
runtime_wedge.clear_wedge()
|
||||
assert runtime_wedge.is_wedged() is False
|
||||
assert runtime_wedge.wedge_reason() == ""
|
||||
|
||||
def test_clear_wedge_when_not_wedged_is_noop(self):
|
||||
# No-op safety — production calls clear_wedge() on every
|
||||
# successful query (~thousands of times per session); throwing
|
||||
# or logging when not wedged would spam.
|
||||
runtime_wedge.clear_wedge()
|
||||
runtime_wedge.clear_wedge() # still safe twice in a row
|
||||
assert runtime_wedge.is_wedged() is False
|
||||
|
||||
def test_re_marking_after_clear_is_allowed(self):
|
||||
# Real production path: SDK wedges, recovers, wedges again.
|
||||
# Each cycle should land cleanly (not silently drop).
|
||||
runtime_wedge.mark_wedged("first wedge")
|
||||
runtime_wedge.clear_wedge()
|
||||
runtime_wedge.mark_wedged("second wedge — different reason")
|
||||
assert runtime_wedge.is_wedged() is True
|
||||
assert runtime_wedge.wedge_reason() == "second wedge — different reason"
|
||||
|
||||
|
||||
class TestClaudeSdkExecutorReExportShim:
|
||||
"""claude_sdk_executor.py keeps re-exporting the old names for one
|
||||
release cycle so any third-party adapter copying our wedge convention
|
||||
has time to migrate. These tests pin the shim — when removed, the
|
||||
test file goes too."""
|
||||
|
||||
def test_is_wedged_re_exported(self):
|
||||
from claude_sdk_executor import is_wedged
|
||||
assert is_wedged is runtime_wedge.is_wedged
|
||||
|
||||
def test_wedge_reason_re_exported(self):
|
||||
from claude_sdk_executor import wedge_reason
|
||||
assert wedge_reason is runtime_wedge.wedge_reason
|
||||
|
||||
def test_internal_helpers_re_exported(self):
|
||||
# Keep the underscore names too — claude_sdk_executor's own
|
||||
# _run_query calls _mark_sdk_wedged / _clear_sdk_wedge_on_success
|
||||
# via these re-exports.
|
||||
from claude_sdk_executor import (
|
||||
_mark_sdk_wedged,
|
||||
_clear_sdk_wedge_on_success,
|
||||
_reset_sdk_wedge_for_test,
|
||||
)
|
||||
assert _mark_sdk_wedged is runtime_wedge.mark_wedged
|
||||
assert _clear_sdk_wedge_on_success is runtime_wedge.clear_wedge
|
||||
assert _reset_sdk_wedge_for_test is runtime_wedge.reset_for_test
|
||||
|
||||
def test_re_export_state_is_shared(self):
|
||||
# The shim isn't a copy — both names refer to the same module
|
||||
# state. Marking via the executor name must be observable via
|
||||
# the runtime_wedge name (and vice versa).
|
||||
from claude_sdk_executor import _mark_sdk_wedged
|
||||
_mark_sdk_wedged("via executor shim")
|
||||
assert runtime_wedge.is_wedged() is True
|
||||
assert runtime_wedge.wedge_reason() == "via executor shim"
|
||||
Loading…
Reference in New Issue
Block a user