From 06240ab67b32a2564edbabed07f7e0a1283fb172 Mon Sep 17 00:00:00 2001
From: Hongming Wang <hongmingwang.rabbit@users.noreply.github.com>
Date: Sun, 3 May 2026 03:44:05 -0700
Subject: [PATCH 1/6] fix(preflight): skip required_env check in
 MOLECULE_SMOKE_MODE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Boot smoke (#2275) exercises executor.execute() against stub deps
and never hits the real provider, so missing auth env is not a real
blocker. Without this bypass, every adapter that introduces a new
auth env var must be mirrored into molecule-ci's fake-env list — a
maintenance treadmill that just bit hermes-template:

- 2026-05-03 09:47 UTC: hermes publish-image smoke fails on
  HERMES_API_KEY preflight (workflow injects CLAUDE_CODE_OAUTH_TOKEN,
  ANTHROPIC_API_KEY, GEMINI_API_KEY, OPENAI_API_KEY but not
  HERMES_API_KEY or OPENROUTER_API_KEY). Failed for two cycles
  before being noticed.

The bypass demotes Required-env failures to warnings when
MOLECULE_SMOKE_MODE is truthy, so the unset env stays visible in
the boot log without blocking. Production paths are unchanged
(env unset → fail).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 workspace/preflight.py            | 32 ++++++++++++++++----
 workspace/tests/test_preflight.py | 49 +++++++++++++++++++++++++++++++
 2 files changed, 76 insertions(+), 5 deletions(-)

diff --git a/workspace/preflight.py b/workspace/preflight.py
index d6123f25..e1929f3e 100644
--- a/workspace/preflight.py
+++ b/workspace/preflight.py
@@ -180,16 +180,38 @@ def run_preflight(config: WorkspaceConfig, config_path: str) -> PreflightReport:
                 required_env = list(entry.get("required_env") or [])
             break
 
+    # Smoke mode skips the auth-env block: the boot smoke (CI publish-image,
+    # issue #2275) exercises executor.execute() against stub deps, never
+    # hits the real provider, and CI cannot enumerate every adapter's auth
+    # env without forming a maintenance treadmill. Hermes 2026-05-03 outage:
+    # template smoke crashed for two cycles because molecule-ci injected
+    # CLAUDE_CODE_OAUTH_TOKEN/ANTHROPIC_API_KEY/etc. but not HERMES_API_KEY.
+    # Bypass here means new templates can ship without the workflow
+    # learning their env names.
+    smoke_mode = os.environ.get("MOLECULE_SMOKE_MODE", "").strip().lower() in (
+        "1", "true", "yes", "on",
+    )
     for env_var in required_env:
-        if not os.environ.get(env_var):
-            report.failures.append(
+        if os.environ.get(env_var):
+            continue
+        if smoke_mode:
+            report.warnings.append(
                 PreflightIssue(
-                    severity="fail",
+                    severity="warn",
                     title="Required env",
-                    detail=f"Missing required environment variable: {env_var}",
-                    fix=f"Set {env_var} via the secrets API (global or workspace-level).",
+                    detail=f"Missing {env_var} (skipped — MOLECULE_SMOKE_MODE)",
+                    fix="",
                 )
             )
+            continue
+        report.failures.append(
+            PreflightIssue(
+                severity="fail",
+                title="Required env",
+                detail=f"Missing required environment variable: {env_var}",
+                fix=f"Set {env_var} via the secrets API (global or workspace-level).",
+            )
+        )
 
     # Backward compat: if legacy auth_token_file is set, warn but don't block
     # if the token is available via required_env or auth_token_env.
diff --git a/workspace/tests/test_preflight.py b/workspace/tests/test_preflight.py
index febf536a..063dcb8f 100644
--- a/workspace/tests/test_preflight.py
+++ b/workspace/tests/test_preflight.py
@@ -286,6 +286,55 @@ def test_required_env_empty_list_passes(tmp_path):
     assert report.ok is True
 
 
+def test_required_env_skipped_in_smoke_mode(tmp_path, monkeypatch):
+    """MOLECULE_SMOKE_MODE=1 demotes Required-env failures to warnings.
+
+    Boot smoke (issue #2275) exercises executor.execute() against stub
+    deps and never hits the real provider, so missing auth env is not
+    a real blocker. Without this bypass, every adapter that introduces
+    a new auth env var (HERMES_API_KEY, OPENROUTER_API_KEY, etc.)
+    would silently break the publish-image gate until molecule-ci's
+    fake-env list catches up — the 2026-05-03 hermes outage. The
+    warning still surfaces in the report so unset env doesn't go
+    completely silent.
+    """
+    monkeypatch.delenv("HERMES_API_KEY", raising=False)
+    monkeypatch.setenv("MOLECULE_SMOKE_MODE", "1")
+
+    config = make_config(
+        runtime_config=RuntimeConfig(required_env=["HERMES_API_KEY"]),
+    )
+
+    report = run_preflight(config, str(tmp_path))
+
+    assert report.ok is True
+    assert any(
+        issue.title == "Required env" and "HERMES_API_KEY" in issue.detail
+        for issue in report.warnings
+    ), "smoke-mode bypass should still warn so unset env stays visible"
+    assert not any(
+        issue.title == "Required env" for issue in report.failures
+    )
+
+
+def test_required_env_smoke_mode_off_still_fails(tmp_path, monkeypatch):
+    """Sanity: smoke bypass is OFF when MOLECULE_SMOKE_MODE is unset."""
+    monkeypatch.delenv("HERMES_API_KEY", raising=False)
+    monkeypatch.delenv("MOLECULE_SMOKE_MODE", raising=False)
+
+    config = make_config(
+        runtime_config=RuntimeConfig(required_env=["HERMES_API_KEY"]),
+    )
+
+    report = run_preflight(config, str(tmp_path))
+
+    assert report.ok is False
+    assert any(
+        issue.title == "Required env" and "HERMES_API_KEY" in issue.detail
+        for issue in report.failures
+    )
+
+
 # ---------- Per-model required_env (models[] override) ----------
 
 

From 09010212a0a887071d1c1f855b586641d8cca939 Mon Sep 17 00:00:00 2001
From: Hongming Wang <hongmingwangalt@gmail.com>
Date: Sun, 3 May 2026 03:52:39 -0700
Subject: [PATCH 2/6] feat(ci): structural drift gate for cascade list vs
 manifest (RFC #388 PR-3)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes the recurrence path of PR #2556. The data fix realigned 8→4
templates in publish-runtime.yml's TEMPLATES variable, but the
underlying drift hazard was unguarded — the next manifest change
could silently leave cascade out of sync again.

This gate fails any PR that changes manifest.json or
publish-runtime.yml in a way that makes the cascade list diverge
from manifest workspace_templates (suffix-stripped). Either
direction is caught:

  missing-from-cascade  templates that won't auto-rebuild on a new
                       wheel publish (the codex-stuck-on-stale-runtime
                       bug class — PR #2512 added codex to manifest,
                       cascade wasn't updated, codex stayed pinned to
                       its last-built runtime version for weeks).

  extra-in-cascade     cascade dispatches to deprecated templates
                       (the wasted-API-calls + dead-CI-noise class —
                       PR #2536 pruned 5 templates from manifest;
                       cascade kept dispatching to all 8 until
                       PR #2556).

Triggers narrowly: only on PRs that touch manifest.json,
publish-runtime.yml, or the script itself. Fast (a short
jq/grep/sed/comm pipeline, no Go build).

Surfaced during the RFC #388 prior-art audit; folded in as the
structural follow-up to the data fix #2556 promised.

Self-tested both failure modes locally before commit:
  - Drop codex from cascade → script fails with "MISSING: codex"
  - Add langgraph to cascade → script fails with "EXTRA: langgraph"

Refs: https://github.com/Molecule-AI/molecule-controlplane/issues/388

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/cascade-list-drift-gate.yml | 39 ++++++++
 scripts/check-cascade-list-vs-manifest.sh     | 95 +++++++++++++++++++
 2 files changed, 134 insertions(+)
 create mode 100644 .github/workflows/cascade-list-drift-gate.yml
 create mode 100755 scripts/check-cascade-list-vs-manifest.sh

diff --git a/.github/workflows/cascade-list-drift-gate.yml b/.github/workflows/cascade-list-drift-gate.yml
new file mode 100644
index 00000000..284a68d8
--- /dev/null
+++ b/.github/workflows/cascade-list-drift-gate.yml
@@ -0,0 +1,39 @@
+name: cascade-list-drift-gate
+
+# Structural gate: TEMPLATES list in publish-runtime.yml must match
+# manifest.json's workspace_templates exactly. Closes the recurrence
+# path of PR #2556 (the data fix) and is the first concrete deliverable
+# of RFC #388 PR-3.
+#
+# Why a gate, not just discipline: PR #2536 pruned the manifest, but the
+# cascade list wasn't updated for weeks before someone (PR #2556)
+# noticed during an unrelated audit. During that window, codex never
+# rebuilt on a runtime publish. A structural gate catches the drift
+# the same day either file changes.
+#
+# Triggers narrowly to keep CI quiet: only on PRs that actually change
+# one of the paths listed below. The path-filtered split +
+# always-emit-result pattern (memory: "Required check names need a job
+# that always runs") is unnecessary here because the workflow IS the
+# check name and it is not yet a required check. Future-proof: if this
+# becomes a required check, add a no-op aggregator with always() so the
+# name still emits when paths don't match.
+
+on:
+  pull_request:
+    branches: [staging, main]
+    paths:
+      - manifest.json
+      - .github/workflows/publish-runtime.yml
+      - scripts/check-cascade-list-vs-manifest.sh
+
+permissions:
+  contents: read
+
+jobs:
+  check:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
+      - name: Check cascade list matches manifest
+        run: bash scripts/check-cascade-list-vs-manifest.sh
diff --git a/scripts/check-cascade-list-vs-manifest.sh b/scripts/check-cascade-list-vs-manifest.sh
new file mode 100755
index 00000000..434069a5
--- /dev/null
+++ b/scripts/check-cascade-list-vs-manifest.sh
@@ -0,0 +1,95 @@
+#!/usr/bin/env bash
+# check-cascade-list-vs-manifest.sh — structural drift gate for the
+# publish-runtime cascade list vs manifest.json workspace_templates.
+#
+# WHY: PR #2536 pruned the manifest to 4 supported runtimes; PR #2556
+# realigned the cascade list to match. The underlying drift hazard
+# (cascade-list ≠ manifest) was unguarded — the data fix didn't prevent
+# recurrence. This script is the structural gate that does.
+#
+# Behavior-based per project pattern: derives the expected set from
+# manifest.json and the actual set from the workflow YAML, fails on
+# any divergence in either direction.
+#
+#   missing-from-cascade  → templates in manifest that publish-runtime.yml
+#                            won't auto-rebuild on a new wheel publish
+#                            (the codex-stuck-on-stale-runtime bug class)
+#   extra-in-cascade      → cascade dispatches to deprecated templates
+#                            (the wasted-API-calls + dead-CI-noise class)
+#
+# Suffix mapping: manifest names map to GHCR repos via
+#   {name without -default suffix} → molecule-ai-workspace-template-<suffix>
+# That's the same map publish-runtime.yml's TEMPLATES variable iterates.
+#
+# Exit:
+#   0  cascade matches manifest exactly
+#   1  drift detected (script prints the diff)
+#   2  bad usage / missing inputs
+
+set -eu
+
+MANIFEST="${1:-manifest.json}"
+WORKFLOW="${2:-.github/workflows/publish-runtime.yml}"
+
+if [ ! -f "$MANIFEST" ]; then
+    echo "::error::manifest not found: $MANIFEST" >&2
+    exit 2
+fi
+if [ ! -f "$WORKFLOW" ]; then
+    echo "::error::workflow not found: $WORKFLOW" >&2
+    exit 2
+fi
+
+# Expected cascade entries: manifest workspace_templates → suffix-only
+# (strip -default tail, e.g. claude-code-default → claude-code, since
+# publish-runtime.yml's TEMPLATES uses suffixes that match the
+# molecule-ai-workspace-template-<suffix> repo naming).
+EXPECTED=$(jq -r '.workspace_templates[].name' "$MANIFEST" \
+    | sed 's/-default$//' \
+    | sort -u)
+
+# Actual cascade entries: take the first TEMPLATES="…" assignment in the
+# workflow, pull the contents between the quotes, and split them into
+# one-per-line. The workflow stays the single source of truth. grep is
+# anchored with the same ^[[:space:]]* prefix that sed strips, so a line
+# such as OTHER_TEMPLATES="…" is never picked up and passed through raw.
+#
+# Why not \s in the regex: BSD sed (macOS) doesn't recognize \s as
+# whitespace — treats it as literal `s`. POSIX [[:space:]] works on
+# both BSD and GNU sed. The original draft of this script hit it: \s*
+# matched zero literal `s` chars, leaving the leading whitespace captured.
+ACTUAL=$(grep -E '^[[:space:]]*TEMPLATES="' "$WORKFLOW" \
+    | head -1 \
+    | sed -E 's/^[[:space:]]*TEMPLATES="([^"]*)".*$/\1/' \
+    | tr ' ' '\n' \
+    | grep -v '^$' \
+    | sort -u)
+
+if [ -z "$ACTUAL" ]; then
+    echo "::error::could not extract TEMPLATES=\"…\" from $WORKFLOW — has the variable name or quoting changed?" >&2
+    exit 2
+fi
+
+MISSING=$(comm -23 <(printf '%s\n' "$EXPECTED") <(printf '%s\n' "$ACTUAL"))
+EXTRA=$(comm -13 <(printf '%s\n' "$EXPECTED") <(printf '%s\n' "$ACTUAL"))
+
+if [ -z "$MISSING" ] && [ -z "$EXTRA" ]; then
+    echo "✓ cascade list matches manifest workspace_templates ($(echo "$EXPECTED" | wc -l | tr -d ' ') entries)"
+    exit 0
+fi
+
+echo "::error::cascade list drift detected between $MANIFEST and $WORKFLOW" >&2
+echo "" >&2
+if [ -n "$MISSING" ]; then
+    echo "  Templates in manifest but MISSING from cascade (won't auto-rebuild on wheel publish):" >&2
+    echo "$MISSING" | sed 's/^/    - /' >&2
+    echo "" >&2
+fi
+if [ -n "$EXTRA" ]; then
+    echo "  Templates in cascade but NOT in manifest (deprecated, wasting dispatch calls):" >&2
+    echo "$EXTRA" | sed 's/^/    - /' >&2
+    echo "" >&2
+fi
+echo "  Fix: edit the TEMPLATES=\"…\" line in $WORKFLOW so the set matches" >&2
+echo "  manifest.json's workspace_templates (suffix-stripped). See PR #2556 for context." >&2
+exit 1

From e1628c4d56d753ee38632a5d7dbdf10954fe4490 Mon Sep 17 00:00:00 2001
From: Hongming Wang <hongmingwang.rabbit@users.noreply.github.com>
Date: Sun, 3 May 2026 04:06:45 -0700
Subject: [PATCH 3/6] fix(a2a): route terminal Message via
 TaskUpdater.complete/failed in task mode
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR #2558 enqueued a Task at the start of new requests so the v1 SDK
would accept TaskUpdater.start_work() — fix #1 of the v0→v1 migration
gap (PR #2170). But after Task is enqueued, the executor enters
"task mode" and the SDK rejects raw Message enqueues at the terminal
step:

  {"code":-32603,"message":"Received Message object in task mode.
  Use TaskStatusUpdateEvent or TaskArtifactUpdateEvent instead."}

Synth-E2E 2026-05-03T11:00:34Z surfaced this on the very first run
after the prior fix cascaded. Validation site is the same
a2a/server/agent_execution/active_task.py — the framework's job is
to enforce the v1 invariant; we're catching up to it.

The fix routes both terminal events through TaskUpdater helpers:
- success: updater.complete(message=msg) wraps in
  TaskStatusUpdateEvent(state=COMPLETED, final=True)
- error: updater.failed(message=...) wraps in
  TaskStatusUpdateEvent(state=FAILED, final=True)

Both helpers exist in a2a-sdk ≥ 1.0; verified via
TaskUpdater.complete signature.

Tests:
- conftest TaskUpdater stub now records complete/failed calls AND
  routes the message back through event_queue.enqueue_event so the
  ~20 legacy tests asserting on enqueue_event keep working
- 2 new regression tests pin the contract:
  * test_terminal_success_routes_via_updater_complete
  * test_terminal_error_routes_via_updater_failed
- Both NEW tests verified to FAIL on staging-baseline (without this
  fix) and PASS with it — they'd catch the regression before staging
  if the wheel-smoke gate covered task-mode terminal events too
  (separate yak-shave for #131 follow-up)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 workspace/a2a_executor.py            | 21 ++++++--
 workspace/tests/conftest.py          | 26 +++++++---
 workspace/tests/test_a2a_executor.py | 78 ++++++++++++++++++++++++++++
 3 files changed, 114 insertions(+), 11 deletions(-)

diff --git a/workspace/a2a_executor.py b/workspace/a2a_executor.py
index 38860c03..9b4d9464 100644
--- a/workspace/a2a_executor.py
+++ b/workspace/a2a_executor.py
@@ -509,7 +509,15 @@ class LangGraphA2AExecutor(AgentExecutor):
                         # accept the assignment. See #1787 + commit dcbcf19
                         # for the original test-mock motivation.
                         logger.debug("metadata attach skipped (non-Message return from new_text_message)")
-                await event_queue.enqueue_event(msg)
+                # A2A v1 (a2a-sdk ≥ 1.0): once Task is enqueued (above, PR #2558),
+                # the executor is in task mode and raw Message enqueues are
+                # rejected with InvalidAgentResponseError("Received Message
+                # object in task mode. Use TaskStatusUpdateEvent or
+                # TaskArtifactUpdateEvent instead."). updater.complete()
+                # wraps the Message in a terminal TaskStatusUpdateEvent
+                # (state=COMPLETED, final=True) which both streaming and
+                # non-streaming clients accept.
+                await updater.complete(message=msg)
                 _result = final_text
 
             except Exception as e:
@@ -520,10 +528,13 @@ class LangGraphA2AExecutor(AgentExecutor):
                     task_span.set_status(StatusCode.ERROR, str(e))
                 except Exception:
                     pass
-                # Emit a Message so both streaming and non-streaming clients
-                # receive an error response rather than hanging.
-                await event_queue.enqueue_event(
-                    new_text_message(
+                # A2A v1: in task mode, terminal errors must publish a
+                # FAILED TaskStatusUpdateEvent (carrying the error Message)
+                # rather than a raw Message enqueue. updater.failed() does
+                # exactly this — both streaming and non-streaming clients
+                # receive the error and stop polling.
+                await updater.failed(
+                    message=new_text_message(
                         f"Agent error: {e}", task_id=task_id, context_id=context_id
                     )
                 )
diff --git a/workspace/tests/conftest.py b/workspace/tests/conftest.py
index 0d130a6f..cb1b75b4 100644
--- a/workspace/tests/conftest.py
+++ b/workspace/tests/conftest.py
@@ -35,27 +35,41 @@ def _make_a2a_mocks():
 
     events_mod.EventQueue = EventQueue
 
-    # a2a.server.tasks needs a TaskUpdater stub whose async methods are no-ops.
-    # In tests, TaskUpdater calls go to this stub rather than the real SDK so
-    # event_queue.enqueue_event is only called via explicit executor code paths.
+    # a2a.server.tasks needs a TaskUpdater stub whose async methods are no-ops
+    # for status transitions but ROUTE the terminal message back through
+    # event_queue.enqueue_event so legacy assertions on enqueue_event keep
+    # working. The wrapper preserves identity (the same Message object the
+    # executor passed in) so tests inspecting str(event_arg) still see the
+    # response text. complete()/failed() also append each call's message to
+    # lists on the event_queue itself (`_complete_calls`, `_failed_calls`)
+    # so the v1 contract regression tests (#262 follow-on to #2558) can pin
+    # that the proper path was taken — raw enqueue would NOT touch these.
     tasks_mod = ModuleType("a2a.server.tasks")
 
     class TaskUpdater:
-        """Stub TaskUpdater — no-op async methods for unit tests."""
+        """Stub TaskUpdater — terminal helpers route through event_queue."""
 
         def __init__(self, event_queue, task_id, context_id, *args, **kwargs):
             self.event_queue = event_queue
             self.task_id = task_id
             self.context_id = context_id
+            if not hasattr(event_queue, "_complete_calls"):
+                event_queue._complete_calls = []
+            if not hasattr(event_queue, "_failed_calls"):
+                event_queue._failed_calls = []
 
         async def start_work(self, message=None):
             pass
 
         async def complete(self, message=None):
-            pass
+            self.event_queue._complete_calls.append(message)
+            if message is not None:
+                await self.event_queue.enqueue_event(message)
 
         async def failed(self, message=None):
-            pass
+            self.event_queue._failed_calls.append(message)
+            if message is not None:
+                await self.event_queue.enqueue_event(message)
 
         async def add_artifact(
             self, parts, artifact_id=None, name=None, metadata=None,
diff --git a/workspace/tests/test_a2a_executor.py b/workspace/tests/test_a2a_executor.py
index 134c56ba..1835092c 100644
--- a/workspace/tests/test_a2a_executor.py
+++ b/workspace/tests/test_a2a_executor.py
@@ -1123,3 +1123,81 @@ async def test_no_task_enqueue_on_continuation():
         assert not isinstance(event, Task), (
             f"continuation must not re-enqueue Task, but got Task at {call}"
         )
+
+
+# ---------------------------------------------------------------------------
+# A2A v1 task-mode terminal-event contract (PR #2558 follow-up, task #262)
+# ---------------------------------------------------------------------------
+# After PR #2558 enqueues a Task at the start of new requests, the executor
+# is in v1 "task mode". The SDK then rejects any subsequent raw Message
+# enqueue with InvalidAgentResponseError("Received Message object in task
+# mode. Use TaskStatusUpdateEvent or TaskArtifactUpdateEvent instead.") —
+# see a2a/server/agent_execution/active_task.py validation site. Synth-E2E
+# 2026-05-03T11:00:34Z surfaced this. The fix routes the terminal Message
+# through TaskUpdater.complete()/failed() which wrap it in a
+# TaskStatusUpdateEvent. Both tests below pin that path so the regression
+# can't recur (raw enqueue at the terminal step would NOT touch
+# event_queue._complete_calls / _failed_calls).
+
+@pytest.mark.asyncio
+async def test_terminal_success_routes_via_updater_complete():
+    """A successful run must terminate via updater.complete(message=...) —
+    raw event_queue.enqueue_event(Message) crashes the v1 SDK in task mode."""
+    agent = MagicMock()
+    agent.astream_events = MagicMock(return_value=_stream(_text_chunk("Hello")))
+    executor = LangGraphA2AExecutor(agent)
+
+    part = MagicMock()
+    part.text = "Hi"
+
+    context = _make_context([part], "ctx-term-ok", task_id="task-term-ok")
+    context.current_task = None  # forces task-mode (Task gets enqueued)
+    eq = _make_event_queue()
+    # Pre-init real lists: the AsyncMock event_queue auto-creates child
+    # mocks for any attribute, so the conftest TaskUpdater stub's hasattr()
+    # guard would skip init and append() to a mock instead of a real list.
+    eq._complete_calls = []
+    eq._failed_calls = []
+
+    await executor.execute(context, eq)
+
+    assert eq._complete_calls, (
+        "terminal Message must route via updater.complete() in task mode — "
+        "raw event_queue.enqueue_event(Message) is rejected by a2a-sdk v1"
+    )
+    final_msg = eq._complete_calls[-1]
+    assert "Hello" in str(final_msg)
+
+
+@pytest.mark.asyncio
+async def test_terminal_error_routes_via_updater_failed():
+    """An agent crash must terminate via updater.failed(message=...) — raw
+    enqueue in task mode hits the same v1 contract violation."""
+    async def _error_stream(*args, **kwargs):
+        raise RuntimeError("model crashed")
+        yield  # pragma: no cover — makes this an async generator
+
+    agent = MagicMock()
+    agent.astream_events = MagicMock(return_value=_error_stream())
+    executor = LangGraphA2AExecutor(agent)
+
+    part = MagicMock()
+    part.text = "Break things"
+
+    context = _make_context([part], "ctx-term-err", task_id="task-term-err")
+    context.current_task = None  # forces task-mode
+    eq = _make_event_queue()
+    eq._complete_calls = []
+    eq._failed_calls = []
+
+    await executor.execute(context, eq)
+
+    assert eq._failed_calls, (
+        "terminal error Message must route via updater.failed() in task mode"
+    )
+    err_msg = eq._failed_calls[-1]
+    assert "model crashed" in str(err_msg)
+    # And complete() must NOT have been called on the failure path.
+    assert not eq._complete_calls, (
+        "complete() should not fire when execute() raises"
+    )

From df7edfcd3fcfa3a542c7a6953d81a5b5ab448527 Mon Sep 17 00:00:00 2001
From: Hongming Wang <hongmingwang.rabbit@users.noreply.github.com>
Date: Sun, 3 May 2026 04:11:35 -0700
Subject: [PATCH 4/6] fix(canvas): wire ReactFlow colorMode to resolvedTheme
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PR #2555 (Tailwind v4 + warm-paper) migrated all canvas chrome (toolbar,
side panel, modal layer) to semantic tokens, but missed the React Flow
viewport's `colorMode="dark"` literal — and two paired hardcoded dark
literals on the Background dot color and MiniMap mask. Net result on
prod: the user picked light mode, the toolbar flipped warm-paper, but
the canvas backplate, edges, dots, controls, and minimap stayed black —
visibly half-themed.

Three coordinated fixes inside the canvas viewport:
- ReactFlow `colorMode={resolvedTheme}` so the library's own dark/light
  styles flip with the user's choice.
- Background dot color picks the line-soft tone in light mode (zinc-800
  was invisible-on-cream).
- MiniMap maskColor warm-tints the off-viewport dim so the unselected
  region doesn't render as a hard black bar over warm-paper.

Verification:
- `npx tsc --noEmit` clean
- `npx vitest run` 188/188 pass
- (will browser-verify post-redeploy on hongming.moleculesai.app)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 canvas/src/components/Canvas.tsx | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/canvas/src/components/Canvas.tsx b/canvas/src/components/Canvas.tsx
index f677862a..ebd8a1d3 100644
--- a/canvas/src/components/Canvas.tsx
+++ b/canvas/src/components/Canvas.tsx
@@ -13,6 +13,7 @@ import {
 import "@xyflow/react/dist/style.css";
 
 import { useCanvasStore } from "@/store/canvas";
+import { useTheme } from "@/lib/theme-provider";
 import { A2ATopologyOverlay } from "./A2ATopologyOverlay";
 import { WorkspaceNode } from "./WorkspaceNode";
 import { SidePanel } from "./SidePanel";
@@ -69,6 +70,14 @@ export function Canvas() {
 }
 
 function CanvasInner() {
+  // ReactFlow's `colorMode` prop drives the styling of every viewport
+  // primitive it renders directly (background dots, edge defaults,
+  // selection rings, controls, minimap mask). Pre-fix this was hard-pinned
+  // to "dark" — so on light theme the chrome (toolbar, side panel) flipped
+  // to warm-paper but the canvas backplate + edges stayed black, leaving a
+  // half-themed page. Pull resolvedTheme so the canvas matches the user's
+  // selected mode (and the system preference when they pick "system").
+  const { resolvedTheme } = useTheme();
   const rawNodes = useCanvasStore((s) => s.nodes);
   const edges = useCanvasStore((s) => s.edges);
   const a2aEdges = useCanvasStore((s) => s.a2aEdges);
@@ -250,7 +259,7 @@ function CanvasInner() {
       </a>
       <main id="canvas-main" className="w-screen h-screen bg-surface">
         <ReactFlow
-          colorMode="dark"
+          colorMode={resolvedTheme}
           nodes={nodes}
           edges={allEdges}
           onNodesChange={onNodesChange}
@@ -273,7 +282,9 @@ function CanvasInner() {
             variant={BackgroundVariant.Dots}
             gap={24}
             size={1}
-            color="#27272a"
+            // Match the line token so dots fade with the surface.
+            // Hard-coded zinc-800 was invisible on warm-paper.
+            color={resolvedTheme === "dark" ? "#27272a" : "#d4d0c4"}
           />
           <Controls
             className="!bg-surface-sunken/90 !border-line/50 !rounded-lg !shadow-xl !shadow-black/20 [&>button]:!bg-surface-card [&>button]:!border-line/50 [&>button]:!text-ink-mid [&>button:hover]:!bg-surface-card [&>button:hover]:!text-ink"
@@ -281,7 +292,9 @@ function CanvasInner() {
           />
           <MiniMap
             className="!bg-surface-sunken/90 !border-line/50 !rounded-lg !shadow-xl !shadow-black/20"
-            maskColor="rgba(0, 0, 0, 0.7)"
+            // Mask dims off-viewport areas; tint matches the surface so
+            // the dimming doesn't show as a black bar in light mode.
+            maskColor={resolvedTheme === "dark" ? "rgba(0, 0, 0, 0.7)" : "rgba(232, 226, 211, 0.7)"}
             nodeColor={(node) => {
               // Parents show as a filled region — hierarchy visible at
               // a glance in the minimap without needing to zoom.

From 596e797dca895789e0dc7f5cab305f4782fc23c0 Mon Sep 17 00:00:00 2001
From: Hongming Wang <hongmingwang.rabbit@users.noreply.github.com>
Date: Sun, 3 May 2026 04:28:29 -0700
Subject: [PATCH 5/6] ci(deploy): broaden ephemeral-prefix matchers to cover
 rt-e2e-*
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The redeploy-tenants-on-staging soft-warn filter and the
sweep-stale-e2e-orgs janitor both hardcoded `^e2e-` to identify
ephemeral test tenants. Runtime-test harness fixtures (RFC #2251)
mint slugs prefixed with `rt-e2e-`, which neither matcher recognized.

Concrete impact observed today:
  - Two `rt-e2e-v{5,6}-*` tenants left orphaned 8h on staging
    (sweep-stale-e2e-orgs ignored them).
  - On the next staging redeploy their phantom EC2s returned
    `InvalidInstanceId: Instances not in a valid state for account`
    from SSM SendCommand → CP returned HTTP 500 + ok=false.
  - The redeploy soft-warn missed them too, so the workflow went
    red, which broke the auto-promote-staging chain feeding the
    canvas warm-paper rollout to prod.

Fix: switch both matchers to recognize the alternation
`^(e2e-|rt-e2e-)`. Long-lived prefixes (demo-prep, dryrun-*, dryrun2-*)
remain non-ephemeral and continue to hard-fail. Comment documents
the source-of-truth list and the cross-file invariant.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../workflows/redeploy-tenants-on-staging.yml | 40 +++++++++++--------
 .github/workflows/sweep-stale-e2e-orgs.yml    | 14 +++++--
 2 files changed, 34 insertions(+), 20 deletions(-)

diff --git a/.github/workflows/redeploy-tenants-on-staging.yml b/.github/workflows/redeploy-tenants-on-staging.yml
index caaeb56e..97392172 100644
--- a/.github/workflows/redeploy-tenants-on-staging.yml
+++ b/.github/workflows/redeploy-tenants-on-staging.yml
@@ -176,35 +176,41 @@ jobs:
           #
           # CP returns HTTP 500 + ok=false whenever ANY tenant in the
           # fleet failed SSM or healthz. In practice the recurring source
-          # of these is ephemeral e2e-* tenants (saas/canvas/ext) being
-          # torn down by their parent E2E run mid-redeploy: the EC2 dies →
-          # SSM exit=2 or healthz timeout → CP marks the fleet failed →
-          # this workflow goes red even though every operator-facing
-          # tenant rolled fine.
+          # of these is ephemeral test tenants being torn down by their
+          # parent E2E run mid-redeploy: the EC2 dies → SSM exit=2 or
+          # healthz timeout → CP marks the fleet failed → this workflow
+          # goes red even though every operator-facing tenant rolled fine.
           #
-          # Filter: if HTTP=500/ok=false AND every failed slug matches
-          # ^e2e-, treat as soft-warn and let the verify step downstream
-          # handle the unreachable-vs-stale distinction (it already knows
-          # the difference per #2402). Any non-e2e-* failure or a non-500
-          # HTTP response remains a hard failure.
+          # Ephemeral slug prefixes (kept in sync with sweep-stale-e2e-orgs.yml
+          # — see that file for the source-of-truth list and rationale):
+          #   - e2e-*       — canvas/saas/ext E2E suites
+          #   - rt-e2e-*    — runtime-test harness fixtures (RFC #2251)
+          # Long-lived prefixes that are NOT ephemeral and MUST hard-fail:
+          # demo-prep, dryrun-*, dryrun2-*, plus all human tenant slugs.
+          #
+          # Filter: if HTTP=500/ok=false AND every failed slug matches an
+          # ephemeral prefix, treat as soft-warn and let the verify step
+          # downstream handle unreachable-vs-stale (#2402). Any non-ephemeral
+          # failure or a non-500 HTTP response remains a hard failure.
           OK=$(jq -r '.ok // "false"' "$HTTP_RESPONSE")
           FAILED_SLUGS=$(jq -r '
             .results[]?
             | select((.healthz_ok != true) or (.ssm_status != "Success"))
             | .slug' "$HTTP_RESPONSE" 2>/dev/null || true)
-          NON_E2E_FAILED=$(printf '%s\n' "$FAILED_SLUGS" | grep -v '^$' | grep -v '^e2e-' || true)
+          EPHEMERAL_PREFIX_RE='^(e2e-|rt-e2e-)'
+          NON_EPHEMERAL_FAILED=$(printf '%s\n' "$FAILED_SLUGS" | grep -v '^$' | grep -Ev "$EPHEMERAL_PREFIX_RE" || true)
 
           if [ "$HTTP_CODE" = "200" ] && [ "$OK" = "true" ]; then
             : # happy path — fall through to verification
-          elif [ "$HTTP_CODE" = "500" ] && [ -z "$NON_E2E_FAILED" ] && [ -n "$FAILED_SLUGS" ]; then
-            COUNT=$(printf '%s\n' "$FAILED_SLUGS" | grep -c '^e2e-' || true)
-            echo "::warning::redeploy-fleet returned HTTP 500 but every failed tenant ($COUNT) is e2e-* ephemeral — treating as teardown race, soft-warning."
+          elif [ "$HTTP_CODE" = "500" ] && [ -z "$NON_EPHEMERAL_FAILED" ] && [ -n "$FAILED_SLUGS" ]; then
+            COUNT=$(printf '%s\n' "$FAILED_SLUGS" | grep -Ec "$EPHEMERAL_PREFIX_RE" || true)
+            echo "::warning::redeploy-fleet returned HTTP 500 but every failed tenant ($COUNT) is ephemeral (e2e-*/rt-e2e-*) — treating as teardown race, soft-warning."
             printf '%s\n' "$FAILED_SLUGS" | sed 's/^/::warning::  failed: /'
           elif [ "$HTTP_CODE" != "200" ]; then
             echo "::error::redeploy-fleet returned HTTP $HTTP_CODE"
-            if [ -n "$NON_E2E_FAILED" ]; then
-              echo "::error::non-e2e tenant(s) failed:"
-              printf '%s\n' "$NON_E2E_FAILED" | sed 's/^/::error::  /'
+            if [ -n "$NON_EPHEMERAL_FAILED" ]; then
+              echo "::error::non-ephemeral tenant(s) failed:"
+              printf '%s\n' "$NON_EPHEMERAL_FAILED" | sed 's/^/::error::  /'
             fi
             exit 1
           else
diff --git a/.github/workflows/sweep-stale-e2e-orgs.yml b/.github/workflows/sweep-stale-e2e-orgs.yml
index 6913cba2..5a0dce30 100644
--- a/.github/workflows/sweep-stale-e2e-orgs.yml
+++ b/.github/workflows/sweep-stale-e2e-orgs.yml
@@ -87,20 +87,28 @@ jobs:
             > orgs.json
 
           # Filter:
-          #   1. slug starts with 'e2e-' (covers e2e-, e2e-canary-,
-          #      e2e-canvas-* — all variants the test scripts mint)
+          #   1. slug starts with one of the ephemeral test prefixes:
+          #        - 'e2e-'    — covers e2e-canary-, e2e-canvas-*, etc.
+          #        - 'rt-e2e-' — runtime-test harness fixtures (RFC #2251);
+          #                      missing this prefix left two such tenants
+          #                      orphaned 8h on staging (2026-05-03), then
+          #                      hard-failed redeploy-tenants-on-staging
+          #                      and broke the staging→main auto-promote
+          #                      chain. Kept in sync with the EPHEMERAL_PREFIX_RE
+          #                      regex in redeploy-tenants-on-staging.yml.
           #   2. created_at is older than MAX_AGE_MINUTES ago
           # Output one slug per line to a file the next step reads.
           python3 > stale_slugs.txt <<'PY'
           import json, os
           from datetime import datetime, timezone, timedelta
+          EPHEMERAL_PREFIXES = ("e2e-", "rt-e2e-")
           with open("orgs.json") as f:
               data = json.load(f)
           max_age = int(os.environ["MAX_AGE_MINUTES"])
           cutoff = datetime.now(timezone.utc) - timedelta(minutes=max_age)
           for o in data.get("orgs", []):
               slug = o.get("slug", "")
-              if not slug.startswith("e2e-"):
+              if not slug.startswith(EPHEMERAL_PREFIXES):
                   continue
               created = o.get("created_at")
               if not created:

From 5e46ea70d639b194e2e29d30e9b340dcf0493396 Mon Sep 17 00:00:00 2001
From: Hongming Wang <hongmingwang.rabbit@users.noreply.github.com>
Date: Sun, 3 May 2026 04:43:07 -0700
Subject: [PATCH 6/6] ci(synth-e2e): wire MOLECULE_STAGING_OPENAI_KEY into
 provisioned tenant
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The synth-E2E (#2342) provisions a langgraph tenant whose default
model `openai:gpt-4.1-mini` requires OPENAI_API_KEY for the first LLM
call. Sibling workflows already wire this:
- e2e-staging-saas.yml:89
- canary-staging.yml:63

continuous-synth-e2e.yml was missing the same wiring. Result: the
tenant boots, accepts a2a messages, then returns:

  Agent error: "Could not resolve authentication method. Expected
  either api_key or auth_token to be set."

This has been masked since the workflow was created (2026-04-29) by
a2a-sdk v0→v1 contract violations. PR #2558 (Task-enqueue) and #2563
(TaskUpdater.complete/failed terminal events) cleared those, exposing
the underlying auth gap on the synth-E2E run that fired at 11:39 UTC
today.

The script tests/e2e/test_staging_full_saas.sh:325 already reads
E2E_OPENAI_API_KEY and persists it as a workspace_secret on tenant
create — only the workflow wiring was missing.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/continuous-synth-e2e.yml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/.github/workflows/continuous-synth-e2e.yml b/.github/workflows/continuous-synth-e2e.yml
index ba5f80ce..c6c482b8 100644
--- a/.github/workflows/continuous-synth-e2e.yml
+++ b/.github/workflows/continuous-synth-e2e.yml
@@ -88,6 +88,15 @@ jobs:
       E2E_KEEP_ORG: ${{ github.event.inputs.keep_org == 'true' && '1' || '' }}
       MOLECULE_CP_URL: ${{ vars.STAGING_CP_URL || 'https://staging-api.moleculesai.app' }}
       MOLECULE_ADMIN_TOKEN: ${{ secrets.CP_STAGING_ADMIN_API_TOKEN }}
+      # Provisioned tenant's default model (langgraph: openai:gpt-4.1-mini)
+      # needs OPENAI_API_KEY at first call. Sibling workflows
+      # e2e-staging-saas.yml + canary-staging.yml use the same secret;
+      # without this wire-up the tenant boots, accepts a2a messages,
+      # then returns "Could not resolve authentication method" — masked
+      # earlier by the a2a-sdk task-mode contract bugs PR #2558+#2563
+      # fixed. tests/e2e/test_staging_full_saas.sh:325 reads this and
+      # persists it as a workspace_secret on tenant create.
+      E2E_OPENAI_API_KEY: ${{ secrets.MOLECULE_STAGING_OPENAI_KEY }}
     steps:
       - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2