From 3c16c274152ff78bcc7872113337df1acd5113ae Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Thu, 30 Apr 2026 20:39:48 -0700
Subject: [PATCH 01/61] ci(wheel-smoke): always-run with per-step if-gates for
required-check eligibility
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The `PR-built wheel + import smoke` gate caught the broken wheel from
PR #2433 (the `import inbox as _inbox_module` rewriter double-alias)
but couldn't block the merge because it isn't a required check on
staging. Promoting it to
required is the right move per the runtime publish pipeline gates note
(2026-04-27 RuntimeCapabilities ImportError outage), but the existing
`paths: [workspace/**, scripts/...]` filter blocks PRs that don't touch
those paths from ever generating the check run — branch protection
would deadlock waiting on a check that never fires.
Refactor (same shape as e2e-api.yml's e2e-api job):
- Drop top-level `paths:` filter — workflow runs on every push/PR/
merge_group event.
- Add `detect-changes` job using dorny/paths-filter to compute the
`wheel=true|false` output.
- Collapse to ONE always-running `local-build-install` job named
`PR-built wheel + import smoke`, with per-step `if:` gates on the
detect output. PRs that don't touch wheel-relevant paths emit a
no-op SUCCESS step ("paths filter excluded this commit") so the
check passes without rebuilding the wheel.
- merge_group + workflow_dispatch unconditionally set `wheel=true` so
the queue always validates the to-be-merged state, regardless of
which PRs compose it.
Why one-job-with-step-gates instead of two-jobs-sharing-a-name: SKIPPED
check runs block branch protection even when SUCCESS siblings exist
(verified in the PR #2264 incident, 2026-04-29). A single always-run
job emits exactly one SUCCESS check run regardless of the paths filter.
Follow-up: open a separate PR adding `PR-built wheel + import smoke`
to the staging branch protection's required_status_checks.contexts
once this lands. Doing both in one PR risks the protection update
firing before the workflow refactor merges, deadlocking unrelated PRs.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
.github/workflows/runtime-prbuild-compat.yml | 83 ++++++++++++++------
1 file changed, 59 insertions(+), 24 deletions(-)
diff --git a/.github/workflows/runtime-prbuild-compat.yml b/.github/workflows/runtime-prbuild-compat.yml
index 96f1a289..0bc9a511 100644
--- a/.github/workflows/runtime-prbuild-compat.yml
+++ b/.github/workflows/runtime-prbuild-compat.yml
@@ -23,55 +23,88 @@ name: Runtime PR-Built Compatibility
#
# By building from the PR's source and smoke-importing THAT wheel, we
# fail at PR-time instead of after publish.
+#
+# Required-check shape (2026-05-01): the workflow runs on EVERY push +
+# PR + merge_group event with no top-level `paths:` filter, then uses a
+# detect-changes job + per-step `if:` gates inside ONE always-running
+# job named `PR-built wheel + import smoke`. PRs that don't touch
+# wheel-relevant paths get a no-op SUCCESS check run, satisfying branch
+# protection without re-running the heavy build. Same pattern as
+# e2e-api.yml — see its comment for the full rationale + the 2026-04-29
+# PR #2264 incident that motivated the always-run-with-if-gates shape.
on:
push:
branches: [main, staging]
- paths:
- # Broad filter: this workflow's verdict can change whenever any
- # workspace/ source file changes (because the wheel we build is
- # produced from those files), or when the build script itself
- # changes (it controls the wheel layout).
- - 'workspace/**'
- - 'scripts/build_runtime_package.py'
- - 'scripts/wheel_smoke.py'
- - '.github/workflows/runtime-prbuild-compat.yml'
pull_request:
branches: [main, staging]
- paths:
- - 'workspace/**'
- - 'scripts/build_runtime_package.py'
- - 'scripts/wheel_smoke.py'
- - '.github/workflows/runtime-prbuild-compat.yml'
workflow_dispatch:
- # Required-check support: when this becomes a branch-protection gate,
- # merge_group runs let the queue green-check this in addition to PRs.
merge_group:
types: [checks_requested]
- # No cron: the same pre-merge run already covered the commit, and
- # re-running daily wouldn't surface anything new (workspace/ doesn't
- # change between cron firings unless a PR already passed this gate).
concurrency:
- group: ${{ github.workflow }}-${{ github.ref }}
+ group: ${{ github.workflow }}-${{ github.event.pull_request.head.sha || github.sha }}
cancel-in-progress: true
jobs:
+ detect-changes:
+ runs-on: ubuntu-latest
+ outputs:
+ wheel: ${{ steps.decide.outputs.wheel }}
+ steps:
+ - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
+ - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1
+ id: filter
+ with:
+ filters: |
+ wheel:
+ - 'workspace/**'
+ - 'scripts/build_runtime_package.py'
+ - 'scripts/wheel_smoke.py'
+ - '.github/workflows/runtime-prbuild-compat.yml'
+ - id: decide
+ # Always run real work for manual dispatch + merge_group — no
+ # diff-against-base in those contexts, and the gate exists to
+ # validate the to-be-merged state regardless of which paths it
+ # touched (paths-filter would default to "no changes" which is
+ # the wrong answer when the queue is composing many PRs).
+ run: |
+ if [ "${{ github.event_name }}" = "workflow_dispatch" ] || [ "${{ github.event_name }}" = "merge_group" ]; then
+ echo "wheel=true" >> "$GITHUB_OUTPUT"
+ else
+ echo "wheel=${{ steps.filter.outputs.wheel }}" >> "$GITHUB_OUTPUT"
+ fi
+
+ # ONE job (no job-level `if:`) that always runs and reports under the
+ # required-check name `PR-built wheel + import smoke`. Real work is
+ # gated per-step on `needs.detect-changes.outputs.wheel`. Same shape
+ # as e2e-api.yml's e2e-api job — see its comment block for the full
+ # rationale (SKIPPED check runs block branch protection even with
+ # SUCCESS siblings; collapsing to one always-run job emits exactly
+ # one SUCCESS check run).
local-build-install:
- # Builds the wheel from THIS PR's workspace/ + scripts/ and tests
- # IT — the artifact that WOULD be published if this PR merges.
+ needs: detect-changes
name: PR-built wheel + import smoke
runs-on: ubuntu-latest
steps:
- - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
- - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
+ - name: No-op pass (paths filter excluded this commit)
+ if: needs.detect-changes.outputs.wheel != 'true'
+ run: |
+ echo "No workspace/ / scripts/{build_runtime_package,wheel_smoke}.py / workflow changes — wheel gate satisfied without rebuilding."
+ echo "::notice::PR-built wheel + import smoke no-op pass (paths filter excluded this commit)."
+ - if: needs.detect-changes.outputs.wheel == 'true'
+ uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
+ - if: needs.detect-changes.outputs.wheel == 'true'
+ uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: '3.11'
cache: pip
cache-dependency-path: workspace/requirements.txt
- name: Install build tooling
+ if: needs.detect-changes.outputs.wheel == 'true'
run: pip install build
- name: Build wheel from PR source (mirrors publish-runtime.yml)
+ if: needs.detect-changes.outputs.wheel == 'true'
# Use a fixed test version so the wheel filename is predictable.
# Doesn't reach PyPI — this build is local-only for the smoke.
# Use the SAME build script with the SAME args as
@@ -88,6 +121,7 @@ jobs:
--out /tmp/runtime-build
cd /tmp/runtime-build && python -m build
- name: Install built wheel + workspace requirements
+ if: needs.detect-changes.outputs.wheel == 'true'
run: |
python -m venv /tmp/venv-built
/tmp/venv-built/bin/pip install --upgrade pip
@@ -96,6 +130,7 @@ jobs:
/tmp/venv-built/bin/pip show molecule-ai-workspace-runtime a2a-sdk \
| grep -E '^(Name|Version):'
- name: Smoke import the PR-built wheel
+ if: needs.detect-changes.outputs.wheel == 'true'
# Same script publish-runtime.yml runs against the to-be-PyPI wheel.
# Closes the PR-time vs publish-time gap: a PR adding a new SDK
# call-shape no longer passes here (narrow `import main_sync`) only
From 6e92fe0a086f17d1ca7daa7b192e4a1655505d78 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Thu, 30 Apr 2026 20:45:32 -0700
Subject: [PATCH 02/61] chore: rewriter unit tests + drop misleading noqa on
`import inbox`
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Three small follow-ups to the PR #2433 → #2436 → #2439 incident chain.
1) `import inbox # noqa: F401` in workspace/a2a_mcp_server.py was
misleading — `inbox` IS used (at the bridge wiring inside main()).
F401 means "imported but unused", so the suppression would mask a
real future F401 if that usage is ever removed. Drop the noqa but
keep the explanatory block comment about the rewriter's
`import X` → `import molecule_runtime.X as X` expansion (and the
`import X as Y` → `import molecule_runtime.X as X as Y` trap the
comment exists to prevent reintroducing; a short illustration
follows this list).
2) scripts/test_build_runtime_package.py — 17 unit tests covering
`rewrite_imports()` and `build_import_rewriter()` in
scripts/build_runtime_package.py. Until now the function had zero
coverage despite the entire wheel build depending on it. Tests
pin: bare-import aliasing, dotted-import preservation, indented
imports, from-imports (simple + dotted + multi-symbol + block),
the `import X as Y` rejection added in PR #2436 (with comment-
stripping + indented + comma-not-alias edge cases), allowlist
anchoring (`a2a` ≠ `a2a_tools`), and end-to-end reproduction
of the PR #2433 failing pattern + the #2436 fix pattern.
3) Wire scripts/test_*.py into CI by adding a second discover pass
to test-ops-scripts.yml. Top-level scripts/ tests live alongside
their target file (parallels the scripts/ops/ test layout); the
existing scripts/ops/ pass keeps running because scripts/ops/
has no __init__.py so a single discover from scripts/ root
doesn't recurse. Two passes is simpler than retrofitting
namespace packages. Path filter widened from `scripts/ops/**`
to `scripts/**` so PRs touching the build script trigger the
new tests.
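For item 1, a minimal illustration of the trap (ast is used here only
to show the parse failure; the shipped gate raises ValueError at build
time instead of letting the invalid line reach the wheel):

    import ast

    ok  = "import molecule_runtime.inbox as inbox"        # `import inbox` after rewrite
    bad = "import molecule_runtime.inbox as inbox as _x"  # `import inbox as _x` after rewrite

    ast.parse(ok)   # parses: the module binding `inbox` is preserved
    try:
        ast.parse(bad)
    except SyntaxError:
        print("double alias is invalid Python; hence the build-time rejection")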
Co-Authored-By: Claude Opus 4.7 (1M context)
---
.github/workflows/test-ops-scripts.yml | 26 +++-
scripts/test_build_runtime_package.py | 201 +++++++++++++++++++++++++
workspace/a2a_mcp_server.py | 8 +-
3 files changed, 226 insertions(+), 9 deletions(-)
create mode 100644 scripts/test_build_runtime_package.py
diff --git a/.github/workflows/test-ops-scripts.yml b/.github/workflows/test-ops-scripts.yml
index 3c6488fa..a6f342e1 100644
--- a/.github/workflows/test-ops-scripts.yml
+++ b/.github/workflows/test-ops-scripts.yml
@@ -1,19 +1,25 @@
name: Ops Scripts Tests
-# Runs the unittest suite for scripts/ops/ on every PR + push that touches
-# the directory. Kept separate from the main CI so a script-only change
-# doesn't trigger the heavier Go/Canvas/Python pipelines.
+# Runs the unittest suite for scripts/ on every PR + push that touches
+# anything under scripts/. Kept separate from the main CI so a script-only
+# change doesn't trigger the heavier Go/Canvas/Python pipelines.
+#
+# Discovery: `unittest discover` from the scripts/ root walks recursively,
+# so both `scripts/test_*.py` and `scripts/ops/test_*.py` are picked up.
+# Tests should sit alongside the code they test (see
+# scripts/ops/test_sweep_cf_decide.py for the pattern; scripts/
+# test_build_runtime_package.py for the rewriter coverage).
on:
push:
branches: [main, staging]
paths:
- - 'scripts/ops/**'
+ - 'scripts/**'
- '.github/workflows/test-ops-scripts.yml'
pull_request:
branches: [main, staging]
paths:
- - 'scripts/ops/**'
+ - 'scripts/**'
- '.github/workflows/test-ops-scripts.yml'
merge_group:
types: [checks_requested]
@@ -31,6 +37,14 @@ jobs:
- uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
with:
python-version: '3.11'
- - name: Run unittest
+ - name: Run scripts/ unittests (build_runtime_package, …)
+ # Top-level scripts/ tests live alongside their target file
+ # (e.g. scripts/test_build_runtime_package.py exercises
+ # scripts/build_runtime_package.py). discover from scripts/
+ # picks up only top-level test_*.py because scripts/ops/ has
+ # no __init__.py — that's intentional, so we run two passes.
+ working-directory: scripts
+ run: python -m unittest discover -t . -p 'test_*.py' -v
+ - name: Run scripts/ops/ unittests (sweep_cf_decide, …)
working-directory: scripts/ops
run: python -m unittest discover -p 'test_*.py' -v
diff --git a/scripts/test_build_runtime_package.py b/scripts/test_build_runtime_package.py
new file mode 100644
index 00000000..ec57b5e2
--- /dev/null
+++ b/scripts/test_build_runtime_package.py
@@ -0,0 +1,201 @@
+"""Tests for scripts/build_runtime_package.py — the wheel-build import rewriter.
+
+Run locally: ``python3 -m unittest scripts/test_build_runtime_package.py -v``
+
+Why this exists: PR #2433 shipped ``import inbox as _inbox_module`` inside
+the workspace runtime, and the rewriter expanded it to
+``import molecule_runtime.inbox as inbox as _inbox_module`` — invalid
+Python. The wheel-smoke gate caught it post-merge but couldn't block
+the merge (not a required check yet — see PR #2439). PR #2436 added a
+build-time gate that raises ``ValueError`` on this pattern; this file
+locks the rewriter's documented contract under unit test so the gate
+itself can't silently regress.
+
+Coverage:
+- ``import X`` → ``import molecule_runtime.X as X``
+- ``import X.sub`` → ``import molecule_runtime.X.sub``
+- ``import X`` + trailing comment is preserved
+- ``from X import Y`` → ``from molecule_runtime.X import Y``
+- ``from X.sub import Y`` → ``from molecule_runtime.X.sub import Y``
+- ``from X import Y, Z`` → ``from molecule_runtime.X import Y, Z``
+- ``import X as Y`` → raises ValueError (the rewriter would
+ produce ``import molecule_runtime.X as X as Y``, syntax error)
+- non-allowlist module names → not rewritten (regex anchors on the closed set)
+- Indented imports (inside def/class) keep their indentation.
+"""
+from __future__ import annotations
+
+import os
+import sys
+import unittest
+
+# scripts/build_runtime_package.py lives at scripts/ — add scripts/ to sys.path
+# so the import works whether unittest is invoked from repo root or scripts/.
+HERE = os.path.dirname(os.path.abspath(__file__))
+if HERE not in sys.path:
+ sys.path.insert(0, HERE)
+
+import build_runtime_package as M # noqa: E402
+
+
+def rewrite(text: str) -> str:
+ """Run the rewriter end-to-end so the test exercises the same path
+ used by the wheel build (regex compile + substitution)."""
+ regex = M.build_import_rewriter()
+ return M.rewrite_imports(text, regex)
+
+
+class TestBareImportRewriting(unittest.TestCase):
+ def test_plain_import_aliases_to_preserve_binding(self):
+ self.assertEqual(
+ rewrite("import inbox\n"),
+ "import molecule_runtime.inbox as inbox\n",
+ )
+
+ def test_plain_import_with_trailing_comment_is_preserved(self):
+ # Real-world shape from a2a_mcp_server.py — the comment must
+ # survive the rewrite without losing its leading-space buffer.
+ self.assertEqual(
+ rewrite("import inbox # noqa: E402\n"),
+ "import molecule_runtime.inbox as inbox # noqa: E402\n",
+ )
+
+ def test_import_dotted_keeps_dotted_form(self):
+ # `import X.sub` is rare for our modules but the rewriter must
+ # not double-alias — we want `import molecule_runtime.X.sub`,
+ # not `import molecule_runtime.X.sub as X.sub` (invalid).
+ self.assertEqual(
+ rewrite("import platform_tools.registry\n"),
+ "import molecule_runtime.platform_tools.registry\n",
+ )
+
+ def test_indented_import_preserves_indentation(self):
+ src = "def foo():\n import inbox\n return inbox.x\n"
+ out = rewrite(src)
+ self.assertIn(" import molecule_runtime.inbox as inbox\n", out)
+
+
+class TestFromImportRewriting(unittest.TestCase):
+ def test_from_module_import_simple(self):
+ self.assertEqual(
+ rewrite("from inbox import InboxState\n"),
+ "from molecule_runtime.inbox import InboxState\n",
+ )
+
+ def test_from_dotted_import(self):
+ self.assertEqual(
+ rewrite("from platform_tools.registry import TOOLS\n"),
+ "from molecule_runtime.platform_tools.registry import TOOLS\n",
+ )
+
+ def test_from_import_multiple_symbols(self):
+ # Multi-import statement — the rewriter only touches the module
+ # prefix, not the names being imported.
+ self.assertEqual(
+ rewrite("from a2a_tools import (foo, bar, baz)\n"),
+ "from molecule_runtime.a2a_tools import (foo, bar, baz)\n",
+ )
+
+ def test_from_import_block_form(self):
+ src = (
+ "from a2a_tools import (\n"
+ " tool_check_task_status,\n"
+ " tool_commit_memory,\n"
+ ")\n"
+ )
+ out = rewrite(src)
+ self.assertIn("from molecule_runtime.a2a_tools import (\n", out)
+ # Trailing names + closer are unchanged.
+ self.assertIn(" tool_check_task_status,\n", out)
+ self.assertIn(")\n", out)
+
+
+class TestImportAsAliasRejection(unittest.TestCase):
+ """The key regression class — the failure mode that shipped in PR #2433."""
+
+ def test_import_as_alias_raises_value_error(self):
+ with self.assertRaises(ValueError) as ctx:
+ rewrite("import inbox as _inbox_module\n")
+ msg = str(ctx.exception)
+ # Error must name the offending module + suggest the fix.
+ self.assertIn("inbox", msg)
+ self.assertIn("as ", msg)
+ self.assertIn("from", msg) # suggests `from X import …`
+
+ def test_import_as_alias_indented_still_rejected(self):
+ # Indented (inside def/class) — same hazard, same rejection.
+ with self.assertRaises(ValueError):
+ rewrite("def foo():\n import inbox as _x\n")
+
+ def test_import_as_alias_with_trailing_comment_still_rejected(self):
+ with self.assertRaises(ValueError):
+ rewrite("import inbox as _x # comment\n")
+
+ def test_plain_import_with_as_in_comment_does_not_trip(self):
+ # The detection strips comments before pattern-matching, so a
+ # comment containing "as foo" must NOT trigger the rejection.
+ self.assertEqual(
+ rewrite("import inbox # rewriter produces alias as inbox\n"),
+ "import molecule_runtime.inbox as inbox # rewriter produces alias as inbox\n",
+ )
+
+ def test_import_followed_by_comma_is_not_an_alias(self):
+ # `import inbox, os` — comma is not `as`, must not be rejected.
+ # Our regex captures `inbox` then `,` — only `inbox` gets prefixed.
+ # `os` is not in TOP_LEVEL_MODULES so it's left alone.
+ out = rewrite("import inbox, os\n")
+ # The first module is rewritten; the second (non-allowlist) is not.
+ self.assertIn("import molecule_runtime.inbox as inbox", out)
+
+
+class TestOutsideAllowlistModules(unittest.TestCase):
+ def test_third_party_imports_unchanged(self):
+ # `httpx`, `os`, `re` etc. are not in TOP_LEVEL_MODULES — the
+ # regex must not match them. This is the closed-list invariant
+ # that prevents accidental rewrites of stdlib / third-party.
+ src = "import httpx\nimport os\nfrom re import match\n"
+ self.assertEqual(rewrite(src), src)
+
+ def test_short_name_collision_avoided(self):
+ # `from a2a.server.X import Y` must not match the bare `a2a`
+ # prefix — `a2a` isn't in our allowlist (we allow `a2a_tools`,
+ # `a2a_client`, etc., but not bare `a2a`). Belt-and-suspenders.
+ src = "from a2a.server.routes import create_agent_card_routes\n"
+ self.assertEqual(rewrite(src), src)
+
+
+class TestEndToEndShape(unittest.TestCase):
+ """Reproduces the PR #2433 → #2436 incident shape."""
+
+ def test_pr_2433_pattern_now_rejected(self):
+ # The exact line PR #2433 added (inside main()), which produced
+ # `import molecule_runtime.inbox as inbox as _inbox_module` —
+ # invalid syntax in the published wheel.
+ with self.assertRaises(ValueError) as ctx:
+ rewrite(
+ " import inbox as _inbox_module\n"
+ " _inbox_module.set_notification_callback(_on_inbox_message)\n"
+ )
+ # Error message includes the offending line so the operator
+ # knows exactly where to fix.
+ self.assertIn("inbox", str(ctx.exception))
+
+ def test_pr_2436_fix_pattern_works(self):
+ # The fix-forward shape (#2436): top-level `import inbox`,
+ # bridge wired in main() via `inbox.set_notification_callback`.
+ src = (
+ "import inbox\n"
+ "\n"
+ "def main():\n"
+ " inbox.set_notification_callback(cb)\n"
+ )
+ out = rewrite(src)
+ self.assertIn("import molecule_runtime.inbox as inbox\n", out)
+ # The callable reference inside main() is left alone — only
+ # imports get rewritten, not arbitrary `inbox.foo` callsites
+ # (those resolve via the module binding the rewrite preserves).
+ self.assertIn(" inbox.set_notification_callback(cb)\n", out)
+
+
+if __name__ == "__main__":
+ unittest.main()
diff --git a/workspace/a2a_mcp_server.py b/workspace/a2a_mcp_server.py
index 09512f26..b3255bf0 100644
--- a/workspace/a2a_mcp_server.py
+++ b/workspace/a2a_mcp_server.py
@@ -17,9 +17,11 @@ import json
import logging
import sys
-import inbox # noqa: F401 — bridge wiring lives in main(); the rewriter
-# produces `import molecule_runtime.inbox as inbox`
-# which preserves this binding for set_notification_callback.
+# Top-level (not inside main()) so the wheel rewriter expands this to
+# `import molecule_runtime.inbox as inbox`. A local `import inbox as _x`
+# would expand to `import molecule_runtime.inbox as inbox as _x`,
+# which is invalid — see scripts/build_runtime_package.py:rewrite_imports.
+import inbox
from a2a_tools import (
tool_check_task_status,
From 067ad83ce510e022988e779b7f3e65c704f1aad1 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Thu, 30 Apr 2026 20:47:09 -0700
Subject: [PATCH 03/61] feat(config): add explicit `provider:` field alongside
`model:`
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Adds a top-level `provider` slug to WorkspaceConfig and RuntimeConfig so
adapters can route to a specific gateway without re-implementing
slug-prefix parsing across hermes / claude-code / codex.
Resolution chain in load_config (mirrors how `model` resolves):
1. ``LLM_PROVIDER`` env var — what canvas Save+Restart sets so the
operator's Provider dropdown choice survives a CP-driven restart
(the regenerated /configs/config.yaml drops most user fields).
2. Explicit YAML ``provider:`` — operator pinned it in the file.
3. Derive from the model slug prefix for backward compat:
``anthropic:claude-opus-4-7`` → ``anthropic``
``minimax/abab7-chat-preview`` → ``minimax``
bare model names → ``""`` (let the adapter decide).
`runtime_config.provider` falls back to the top-level resolved
provider, the same shape PR #2438 added for `runtime_config.model`.
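For reference, a minimal sketch of that precedence (illustrative; the
env dict stands in for os.environ, and resolve_provider is not the
shipped load_config code):

    def derive_provider(model: str) -> str:
        # "provider:model" and "provider/model" both yield the prefix;
        # bare model names yield "" (adapter decides).
        for sep in (":", "/"):
            if sep in model:
                return model.partition(sep)[0]
        return ""

    def resolve_provider(env: dict, raw: dict) -> str:
        model = env.get("MODEL_PROVIDER") or raw.get("model", "anthropic:claude-opus-4-7")
        return env.get("LLM_PROVIDER") or raw.get("provider") or derive_provider(model)

    assert resolve_provider({}, {"model": "minimax/abab7-chat-preview"}) == "minimax"
    assert resolve_provider({}, {"model": "anthropic:claude-opus-4-7",
                                 "provider": "custom-gateway"}) == "custom-gateway"
    assert resolve_provider({"LLM_PROVIDER": "minimax"}, {"provider": "openai"}) == "minimax"
    assert resolve_provider({}, {"model": "my-custom-alias"}) == ""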
Why a separate field at all (we already parse the slug):
- Custom model aliases without a recognizable prefix need an
explicit signal — the canvas Provider dropdown writes it.
- Adapters were each rolling their own slug-parse (hermes's
derive-provider.sh, claude-code's adapter-default branch, etc.);
one resolution point in load_config kills that drift class.
- Canvas needs a stable storage field that doesn't get clobbered
every time the user picks a new model.
Backward-compatible: when `provider:` is absent, slug derivation
keeps every existing config.yaml working without a migration.
PR-1 of a multi-PR stack (Option B from RFC discussion). Subsequent
PRs plumb the field through workspace-server env, CP user-data,
adapters (hermes prefers explicit over derive-provider.sh), and
canvas Provider dropdown UI.
Tests cover all four resolution paths + runtime_config inheritance:
- test_provider_default_empty_when_bare_model
- test_provider_derived_from_colon_slug
- test_provider_derived_from_slash_slug
- test_provider_yaml_explicit_wins_over_derived
- test_provider_env_override_beats_yaml_and_derived
- test_runtime_config_provider_yaml_wins_over_top_level
- test_provider_default_from_default_model
Co-Authored-By: Claude Opus 4.7 (1M context)
---
workspace/config.py | 54 ++++++++++++
workspace/tests/test_config.py | 151 +++++++++++++++++++++++++++++++++
2 files changed, 205 insertions(+)
diff --git a/workspace/config.py b/workspace/config.py
index 370ada11..3b205f1b 100644
--- a/workspace/config.py
+++ b/workspace/config.py
@@ -96,6 +96,10 @@ class RuntimeConfig:
required_env: list[str] = field(default_factory=list) # env vars required to run (e.g. ["CLAUDE_CODE_OAUTH_TOKEN"])
timeout: int = 0 # seconds (0 = no timeout — agents wait until done)
model: str = "" # model override for the CLI
+ provider: str = "" # explicit LLM provider (e.g., "anthropic", "openai",
+ # "minimax"). Falls back to the top-level resolved
+ # provider when empty. Adapters (hermes, claude-code,
+ # codex) prefer this over slug-parsing the model name.
# Deprecated — use required_env + secrets API instead. Kept for backward compat.
auth_token_env: str = ""
auth_token_file: str = ""
@@ -221,6 +225,16 @@ class WorkspaceConfig:
version: str = "1.0.0"
tier: int = 1
model: str = "anthropic:claude-opus-4-7"
+ provider: str = ""
+ """Explicit LLM provider slug (e.g., ``anthropic``, ``openai``, ``minimax``).
+
+ When empty, ``load_config`` derives it from the ``model`` slug prefix
+ (``anthropic:claude-opus-4-7`` → ``anthropic``; ``minimax/abab7-chat`` →
+ ``minimax``; bare model names → ``""``). Set explicitly via the canvas
+ Provider dropdown or the ``LLM_PROVIDER`` env var when the model name
+ is provider-ambiguous (e.g., a custom alias) or when an adapter needs
+ a specific gateway distinct from the model namespace.
+ """
runtime: str = "langgraph" # langgraph | claude-code | codex | ollama | custom
runtime_config: RuntimeConfig = field(default_factory=RuntimeConfig)
initial_prompt: str = ""
@@ -261,6 +275,20 @@ class WorkspaceConfig:
automatically adds the ``task-budgets-2026-03-13`` beta header."""
+def _derive_provider_from_model(model: str) -> str:
+ """Extract the provider slug prefix from a model identifier.
+
+ Recognizes both ``provider:model`` (Anthropic / OpenAI / Google convention)
+ and ``provider/model`` (HuggingFace / Minimax convention). Returns ``""``
+ when the model has no recognizable separator — callers must treat empty
+ as "use adapter default routing", not as a hard failure.
+ """
+ for sep in (":", "/"):
+ if sep in model:
+ return model.partition(sep)[0]
+ return ""
+
+
def load_config(config_path: Optional[str] = None) -> WorkspaceConfig:
"""Load config from WORKSPACE_CONFIG_PATH or the given path."""
if config_path is None:
@@ -276,6 +304,25 @@ def load_config(config_path: Optional[str] = None) -> WorkspaceConfig:
# Override model from env if provided
model = os.environ.get("MODEL_PROVIDER", raw.get("model", "anthropic:claude-opus-4-7"))
+ # Resolve top-level provider with this priority chain:
+ # 1. ``LLM_PROVIDER`` env var (canvas Save+Restart sets this so the
+ # operator's choice survives a CP-driven restart even though the
+ # regenerated /configs/config.yaml drops most user fields).
+ # 2. Explicit YAML ``provider:`` (an operator pinned it in the file).
+ # 3. Derive from the model slug prefix for backward compat:
+ # ``anthropic:claude-opus-4-7`` → ``anthropic``
+ # ``minimax/abab7-chat-preview`` → ``minimax``
+ # bare model names → ``""`` (signals "use adapter default")
+ # Empty after all three is fine — adapters that don't need an explicit
+ # provider (langgraph, claude-code-default, codex) keep their existing
+ # routing; adapters that do (hermes via derive-provider.sh) prefer this
+ # over slug-parsing the model name.
+ provider = (
+ os.environ.get("LLM_PROVIDER")
+ or raw.get("provider")
+ or _derive_provider_from_model(model)
+ )
+
runtime = raw.get("runtime", "langgraph")
runtime_raw = raw.get("runtime_config", {})
@@ -314,6 +361,7 @@ def load_config(config_path: Optional[str] = None) -> WorkspaceConfig:
version=raw.get("version", "1.0.0"),
tier=int(raw.get("tier", 1)) if str(raw.get("tier", 1)).isdigit() else 1,
model=model,
+ provider=provider,
runtime=runtime,
initial_prompt=initial_prompt,
idle_prompt=idle_prompt,
@@ -336,6 +384,12 @@ def load_config(config_path: Optional[str] = None) -> WorkspaceConfig:
# MODEL_PROVIDER is plumbed as an env var, so picking it up via
# the top-level resolved model keeps the selection sticky.
model=runtime_raw.get("model") or model,
+ # Same fallback shape as ``model`` above: an explicit
+ # ``runtime_config.provider`` wins; otherwise inherit the
+ # top-level resolved provider so adapters see a single
+ # consistent choice without each one re-implementing
+ # env/YAML/slug-prefix resolution.
+ provider=runtime_raw.get("provider") or provider,
# Deprecated fields — kept for backward compat
auth_token_env=runtime_raw.get("auth_token_env", ""),
auth_token_file=runtime_raw.get("auth_token_file", ""),
diff --git a/workspace/tests/test_config.py b/workspace/tests/test_config.py
index c87198ba..bc09d638 100644
--- a/workspace/tests/test_config.py
+++ b/workspace/tests/test_config.py
@@ -164,6 +164,157 @@ def test_runtime_config_model_picks_up_env_via_top_level(tmp_path, monkeypatch):
assert cfg.runtime_config.model == "minimax/abab7-chat-preview"
+# ===== Provider field (Option B — explicit `provider:` alongside `model:`) =====
+#
+# Why a separate `provider` field at all (we already parse the slug prefix off
+# `model`)? Three reasons:
+# 1. Custom model aliases that don't carry a recognizable prefix (e.g., a
+# tenant-specific name routed through a gateway) need an explicit signal.
+# 2. Adapters were each implementing their own slug-parse — hermes's
+# derive-provider.sh, claude-code's adapter-default branch, etc. One
+# resolution point in load_config kills that drift class.
+# 3. The canvas Provider dropdown needs a stable storage field that doesn't
+# get clobbered every time the user picks a new model.
+#
+# Backward compat: when `provider:` is absent, fall back to slug derivation,
+# so existing config.yaml files keep working without a migration.
+
+
+def test_provider_default_empty_when_bare_model(tmp_path, monkeypatch):
+ """Bare model names (no `:` or `/` separator) yield an empty provider —
+ the signal for "let the adapter decide". Don't guess.
+ """
+ monkeypatch.delenv("LLM_PROVIDER", raising=False)
+ monkeypatch.delenv("MODEL_PROVIDER", raising=False)
+ config_yaml = tmp_path / "config.yaml"
+ config_yaml.write_text(yaml.dump({"model": "claude-opus-4-7"}))
+
+ cfg = load_config(str(tmp_path))
+ assert cfg.provider == ""
+ assert cfg.runtime_config.provider == ""
+
+
+def test_provider_derived_from_colon_slug(tmp_path, monkeypatch):
+ """`provider:model` shape (Anthropic/OpenAI/Google convention) derives
+ the provider from the prefix when no explicit `provider:` is set.
+ Exercises the backward-compat path for every existing config.yaml in
+ the wild.
+ """
+ monkeypatch.delenv("LLM_PROVIDER", raising=False)
+ monkeypatch.delenv("MODEL_PROVIDER", raising=False)
+ config_yaml = tmp_path / "config.yaml"
+ config_yaml.write_text(yaml.dump({"model": "anthropic:claude-opus-4-7"}))
+
+ cfg = load_config(str(tmp_path))
+ assert cfg.provider == "anthropic"
+ # runtime_config.provider inherits the same way runtime_config.model does.
+ assert cfg.runtime_config.provider == "anthropic"
+
+
+def test_provider_derived_from_slash_slug(tmp_path, monkeypatch):
+ """`provider/model` shape (HuggingFace/Minimax convention) derives the
+ provider from the prefix when no explicit `provider:` is set.
+ """
+ monkeypatch.delenv("LLM_PROVIDER", raising=False)
+ monkeypatch.delenv("MODEL_PROVIDER", raising=False)
+ config_yaml = tmp_path / "config.yaml"
+ config_yaml.write_text(yaml.dump({"model": "minimax/abab7-chat-preview"}))
+
+ cfg = load_config(str(tmp_path))
+ assert cfg.provider == "minimax"
+ assert cfg.runtime_config.provider == "minimax"
+
+
+def test_provider_yaml_explicit_wins_over_derived(tmp_path, monkeypatch):
+ """Explicit YAML `provider:` overrides the slug-prefix derivation —
+ needed when the model name's prefix doesn't match the actual gateway
+ (e.g., an `anthropic:claude-opus-4-7` model routed through a custom
+ gateway slug).
+ """
+ monkeypatch.delenv("LLM_PROVIDER", raising=False)
+ monkeypatch.delenv("MODEL_PROVIDER", raising=False)
+ config_yaml = tmp_path / "config.yaml"
+ config_yaml.write_text(
+ yaml.dump(
+ {
+ "model": "anthropic:claude-opus-4-7",
+ "provider": "custom-gateway",
+ }
+ )
+ )
+
+ cfg = load_config(str(tmp_path))
+ # Slug prefix says "anthropic" but the explicit field wins.
+ assert cfg.provider == "custom-gateway"
+ assert cfg.runtime_config.provider == "custom-gateway"
+
+
+def test_provider_env_override_beats_yaml_and_derived(tmp_path, monkeypatch):
+ """`LLM_PROVIDER` env var beats both YAML and slug derivation.
+ This is the path the canvas Save+Restart cycle relies on: the user
+ picks a provider in the canvas Provider dropdown, the platform sets
+ `LLM_PROVIDER` on the workspace, and the next CP-driven restart picks
+ it up regardless of what's in the regenerated /configs/config.yaml.
+ """
+ monkeypatch.setenv("LLM_PROVIDER", "minimax")
+ monkeypatch.delenv("MODEL_PROVIDER", raising=False)
+ config_yaml = tmp_path / "config.yaml"
+ # YAML says one thing, slug says another, env wins.
+ config_yaml.write_text(
+ yaml.dump(
+ {
+ "model": "anthropic:claude-opus-4-7",
+ "provider": "openai",
+ }
+ )
+ )
+
+ cfg = load_config(str(tmp_path))
+ assert cfg.provider == "minimax"
+ assert cfg.runtime_config.provider == "minimax"
+
+
+def test_runtime_config_provider_yaml_wins_over_top_level(tmp_path, monkeypatch):
+ """An explicit `runtime_config.provider` takes precedence over the
+ top-level resolved provider — same fallback shape as `model`. Needed
+ when a workspace wants the top-level model/provider to stay
+ user-visible while pinning the runtime to a different gateway.
+ """
+ monkeypatch.delenv("LLM_PROVIDER", raising=False)
+ monkeypatch.delenv("MODEL_PROVIDER", raising=False)
+ config_yaml = tmp_path / "config.yaml"
+ config_yaml.write_text(
+ yaml.dump(
+ {
+ "model": "anthropic:claude-opus-4-7",
+ "runtime_config": {"provider": "openai"},
+ }
+ )
+ )
+
+ cfg = load_config(str(tmp_path))
+ # Top-level still derives from the slug.
+ assert cfg.provider == "anthropic"
+ # runtime_config.provider explicit override wins.
+ assert cfg.runtime_config.provider == "openai"
+
+
+def test_provider_default_from_default_model(tmp_path, monkeypatch):
+ """When config.yaml is empty, the WorkspaceConfig default model
+ (`anthropic:claude-opus-4-7`) yields provider=`anthropic`. Pins the
+ "no config" boot path to a sensible derived provider.
+ """
+ monkeypatch.delenv("LLM_PROVIDER", raising=False)
+ monkeypatch.delenv("MODEL_PROVIDER", raising=False)
+ config_yaml = tmp_path / "config.yaml"
+ config_yaml.write_text(yaml.dump({}))
+
+ cfg = load_config(str(tmp_path))
+ assert cfg.model == "anthropic:claude-opus-4-7"
+ assert cfg.provider == "anthropic"
+ assert cfg.runtime_config.provider == "anthropic"
+
+
def test_delegation_config_defaults(tmp_path):
"""DelegationConfig nested defaults are applied."""
config_yaml = tmp_path / "config.yaml"
From e58e446444d8ddbb3fc6f5599a3c90cfa2344442 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Thu, 30 Apr 2026 20:52:34 -0700
Subject: [PATCH 04/61] =?UTF-8?q?docs(ci):=20correct=20test-ops-scripts.ym?=
=?UTF-8?q?l=20header=20=E2=80=94=20discover=20does=20NOT=20recurse?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The previous header said `unittest discover from the scripts/ root
walks recursively`, contradicting the workflow body which runs two
passes precisely because discover does NOT recurse without
__init__.py. Addresses self-review feedback on PR #2440.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
.github/workflows/test-ops-scripts.yml | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/.github/workflows/test-ops-scripts.yml b/.github/workflows/test-ops-scripts.yml
index a6f342e1..ca8cb0af 100644
--- a/.github/workflows/test-ops-scripts.yml
+++ b/.github/workflows/test-ops-scripts.yml
@@ -4,11 +4,13 @@ name: Ops Scripts Tests
# anything under scripts/. Kept separate from the main CI so a script-only
# change doesn't trigger the heavier Go/Canvas/Python pipelines.
#
-# Discovery: `unittest discover` from the scripts/ root walks recursively,
-# so both `scripts/test_*.py` and `scripts/ops/test_*.py` are picked up.
-# Tests should sit alongside the code they test (see
+# Discovery layout: tests sit alongside the code they test (see
# scripts/ops/test_sweep_cf_decide.py for the pattern; scripts/
-# test_build_runtime_package.py for the rewriter coverage).
+# test_build_runtime_package.py for the rewriter coverage). The job
+# below runs `unittest discover` TWICE — once from `scripts/`, once
+# from `scripts/ops/` — because neither dir has an `__init__.py`, so
+# a single discover from `scripts/` doesn't recurse into the ops
+# subdir. Two passes is simpler than retrofitting namespace packages.
on:
push:
From d012a803e48ce8eff2cf39049cd8aa0093c16436 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Thu, 30 Apr 2026 21:10:20 -0700
Subject: [PATCH 05/61] feat(terminal): add diagnose endpoint for SSH probe
stages
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
GET /workspaces/:id/terminal/diagnose runs the same per-stage pipeline as
/terminal (ssh-keygen → EIC send-key → tunnel → ssh) but non-interactively
and returns JSON. Each stage reports {name, ok, duration_ms, error,
detail}, plus a top-level first_failure naming the broken stage.
Why: when the canvas terminal silently disconnects ("Session ended" with
no error frame — the user-reported failure mode on hongmingwang's hermes
workspace), there is no remote-readable signal of WHICH stage failed.
The ssh client's stderr lives only in the workspace-server's stdout on
the tenant CP EC2 — invisible without shell access. /terminal can't
expose stderr cleanly because it has already upgraded to WebSocket
binary frames by the time ssh runs. /terminal/diagnose stays pure
HTTP/JSON, so the same auth (WorkspaceAuth + ADMIN_TOKEN fallback) gives
operators a one-call probe that splits "IAM broke" (send-ssh-public-key
fails) from "tunnel/SG broke" (wait-for-port fails) from "sshd auth
broke" (ssh-probe gets Permission denied) from "shell broke" (probe
exits non-zero with stderr).
Stages mirrored from handleRemoteConnect in terminal.go:
1. ssh-keygen           ephemeral session keypair
2. send-ssh-public-key  AWS EIC API push, IAM-gated
3. pick-free-port       local port for the tunnel
4. open-tunnel          `aws ec2-instance-connect open-tunnel` start
5. wait-for-port        the tunnel actually listens (folds tunnel
                        stderr into Detail when it doesn't)
6. ssh-probe            non-interactive `ssh ... 'echo MARKER'` that
                        confirms auth + bash + the marker round-trip
                        (CombinedOutput captures stderr verbatim —
                        this is the whole reason the endpoint exists)
Local Docker workspaces (no instance_id) get a smaller probe:
container-found + container-running. Same response shape so callers
don't need to branch.
Tests stub sendSSHPublicKey / openTunnelCmd / sshProbeCmd via the
existing package-level vars (same pattern as TestSSHCommandCmd_*) so
the test suite stays hermetic — no AWS, no network. The three new
tests pin: (a) routing to remote on instance_id present,
(b) routing to local on empty instance_id, and (c) the operationally
critical case: success through wait-for-port followed by a probe
failure, which must surface the ssh stderr in the ssh-probe step's
Error/Detail with first_failure="ssh-probe".
Auth: rides on existing WorkspaceAuth middleware. Operators with the
tenant ADMIN_TOKEN (fetched via /cp/admin/orgs/:slug/admin-token) can
probe any workspace without per-workspace token; same admin path as
the canvas dashboard reads workspace activity.
The endpoint always returns HTTP 200 (success and step failure both
live in the JSON body) so callers don't need to branch on status
code — the body either reports a first_failure or it doesn't.
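A minimal operator-side sketch (illustrative; the base-URL/token env
var names and the Bearer scheme are assumptions here, while the JSON
field names match the shape described above):

    import json, os, urllib.request

    base = os.environ["WORKSPACE_SERVER_URL"]   # assumed env var, not part of this PR
    token = os.environ["ADMIN_TOKEN"]           # tenant admin token (see Auth above)
    ws = os.environ["WORKSPACE_ID"]

    req = urllib.request.Request(
        f"{base}/workspaces/{ws}/terminal/diagnose",
        headers={"Authorization": f"Bearer {token}"},
    )
    with urllib.request.urlopen(req) as resp:   # always HTTP 200
        report = json.load(resp)

    if report["ok"]:
        print("all stages passed")
    else:
        print("first failure:", report["first_failure"])
        for step in report["steps"]:
            mark = "ok" if step["ok"] else "FAIL"
            print(f"  {step['name']:<20} {mark:4} {step.get('error', '')}")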
Resolves task #200, supports task #193 (workspace EC2 sshd
unresponsive — without this endpoint we couldn't pin the failure
stage from outside the tenant CP EC2).
Co-Authored-By: Claude Opus 4.7 (1M context)
---
.../internal/handlers/terminal_diagnose.go | 328 ++++++++++++++++++
.../handlers/terminal_diagnose_test.go | 222 ++++++++++++
workspace-server/internal/router/router.go | 1 +
3 files changed, 551 insertions(+)
create mode 100644 workspace-server/internal/handlers/terminal_diagnose.go
create mode 100644 workspace-server/internal/handlers/terminal_diagnose_test.go
diff --git a/workspace-server/internal/handlers/terminal_diagnose.go b/workspace-server/internal/handlers/terminal_diagnose.go
new file mode 100644
index 00000000..e40f6e19
--- /dev/null
+++ b/workspace-server/internal/handlers/terminal_diagnose.go
@@ -0,0 +1,328 @@
+package handlers
+
+import (
+ "context"
+ "fmt"
+ "net/http"
+ "os"
+ "os/exec"
+ "strings"
+ "time"
+
+ "github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
+ "github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
+ "github.com/gin-gonic/gin"
+)
+
+// HandleDiagnose handles GET /workspaces/:id/terminal/diagnose. It runs the
+// same per-step pipeline as HandleConnect (ssh-keygen → EIC send-key → tunnel
+// → ssh) but non-interactively, captures the first failing step and its
+// stderr, and returns the result as JSON.
+//
+// Why this exists: when the canvas terminal silently disconnects ("Session
+// ended" with no error frame), there is no remote-readable signal of which
+// stage failed. The ssh client's stderr lives in the workspace-server's
+// process logs on the tenant CP EC2 — invisible without shell access.
+// HandleConnect can't trivially expose stderr because it has already
+// upgraded to WebSocket binary frames by the time ssh runs. HandleDiagnose
+// stays pure HTTP/JSON, so the same auth (WorkspaceAuth + ADMIN_TOKEN
+// fallback) gives operators a one-call probe of the whole shell pipeline.
+//
+// Stages mirrored from handleRemoteConnect:
+//
+// 1. ssh-keygen (ephemeral session keypair)
+// 2. send-ssh-public-key (AWS EIC API push, IAM-gated)
+// 3. pick-free-port (local port for the tunnel)
+// 4. open-tunnel (aws ec2-instance-connect open-tunnel start)
+// 5. wait-for-port (the tunnel actually listens)
+// 6. ssh-probe (`ssh ... 'echo MARKER'` — proves end-to-end auth+shell)
+//
+// Local Docker workspaces (no instance_id row) get a smaller probe:
+// container-found + container-running. Same response shape so callers
+// don't need to branch.
+func (h *TerminalHandler) HandleDiagnose(c *gin.Context) {
+ workspaceID := c.Param("id")
+ ctx, cancel := context.WithTimeout(c.Request.Context(), 30*time.Second)
+ defer cancel()
+
+ var instanceID string
+ _ = db.DB.QueryRowContext(ctx,
+ `SELECT COALESCE(instance_id, '') FROM workspaces WHERE id = $1`,
+ workspaceID).Scan(&instanceID)
+
+ var res diagnoseResult
+ if instanceID != "" {
+ res = h.diagnoseRemote(ctx, workspaceID, instanceID)
+ } else {
+ res = h.diagnoseLocal(ctx, workspaceID)
+ }
+ c.JSON(http.StatusOK, res)
+}
+
+// diagnoseStep is one row in the diagnostic report. Always carries Name +
+// OK + DurationMs; Error/Detail filled when the step fails.
+type diagnoseStep struct {
+ Name string `json:"name"`
+ OK bool `json:"ok"`
+ DurationMs int64 `json:"duration_ms"`
+ Error string `json:"error,omitempty"`
+ Detail string `json:"detail,omitempty"`
+}
+
+// diagnoseResult is the full report. ``OK`` is true only when every step
+// passed; ``FirstFailure`` names the step that broke the chain so callers
+// can route alerts (e.g., "send-ssh-public-key" → IAM team; "ssh-probe" →
+// SG/sshd team).
+type diagnoseResult struct {
+ WorkspaceID string `json:"workspace_id"`
+ InstanceID string `json:"instance_id,omitempty"`
+ Remote bool `json:"remote"`
+ OK bool `json:"ok"`
+ FirstFailure string `json:"first_failure,omitempty"`
+ Steps []diagnoseStep `json:"steps"`
+}
+
+// sshProbeMarker is the string the ssh probe echoes back. Distinct from any
+// shell builtin output so we can grep for it unambiguously even when the
+// remote prints a banner or motd.
+const sshProbeMarker = "MOLECULE_TERMINAL_PROBE_OK"
+
+// sshProbeCmd builds the non-interactive ssh probe command. Exposed as a
+// var so tests can stub it without spinning up a real sshd. BatchMode=yes
+// ensures ssh fails fast on prompt instead of hanging on a TTY.
+var sshProbeCmd = func(o eicSSHOptions) *exec.Cmd {
+ return exec.Command(
+ "ssh",
+ "-i", o.PrivateKeyPath,
+ "-o", "StrictHostKeyChecking=no",
+ "-o", "UserKnownHostsFile=/dev/null",
+ "-o", "BatchMode=yes",
+ "-o", "ConnectTimeout=10",
+ "-p", fmt.Sprintf("%d", o.LocalPort),
+ fmt.Sprintf("%s@127.0.0.1", o.OSUser),
+ "echo "+sshProbeMarker,
+ )
+}
+
+// diagnoseRemote runs the full EIC + ssh probe and reports per-step status.
+// Bails on the first failure so the operator sees which stage breaks; later
+// stages stay in the report as zero-value rows so the response shape is
+// stable regardless of where the chain stopped.
+func (h *TerminalHandler) diagnoseRemote(ctx context.Context, workspaceID, instanceID string) diagnoseResult {
+ res := diagnoseResult{
+ WorkspaceID: workspaceID,
+ InstanceID: instanceID,
+ Remote: true,
+ }
+
+ osUser := os.Getenv("WORKSPACE_EC2_OS_USER")
+ if osUser == "" {
+ osUser = "ubuntu"
+ }
+ region := os.Getenv("AWS_REGION")
+ if region == "" {
+ region = "us-east-2"
+ }
+
+ stop := func(name string, step diagnoseStep) diagnoseResult {
+ res.Steps = append(res.Steps, step)
+ res.FirstFailure = name
+ return res
+ }
+
+ // Step 1: ssh-keygen
+ t0 := time.Now()
+ keyDir, err := os.MkdirTemp("", "molecule-diagnose-*")
+ if err != nil {
+ return stop("ssh-keygen", diagnoseStep{
+ Name: "ssh-keygen",
+ DurationMs: time.Since(t0).Milliseconds(),
+ Error: fmt.Sprintf("mkdir tmp: %v", err),
+ })
+ }
+ defer func() { _ = os.RemoveAll(keyDir) }()
+ keyPath := keyDir + "/id"
+ keygen := exec.CommandContext(ctx, "ssh-keygen", "-t", "ed25519", "-f", keyPath, "-N", "", "-q", "-C", "molecule-diagnose")
+ if out, kerr := keygen.CombinedOutput(); kerr != nil {
+ return stop("ssh-keygen", diagnoseStep{
+ Name: "ssh-keygen",
+ DurationMs: time.Since(t0).Milliseconds(),
+ Error: kerr.Error(),
+ Detail: strings.TrimSpace(string(out)),
+ })
+ }
+ res.Steps = append(res.Steps, diagnoseStep{Name: "ssh-keygen", OK: true, DurationMs: time.Since(t0).Milliseconds()})
+
+ pubKey, err := os.ReadFile(keyPath + ".pub")
+ if err != nil {
+ return stop("ssh-keygen", diagnoseStep{
+ Name: "ssh-keygen",
+ Error: fmt.Sprintf("read pubkey: %v", err),
+ })
+ }
+
+ // Step 2: send-ssh-public-key (AWS Instance Connect)
+ t0 = time.Now()
+ if err := sendSSHPublicKey(ctx, region, instanceID, osUser, strings.TrimSpace(string(pubKey))); err != nil {
+ return stop("send-ssh-public-key", diagnoseStep{
+ Name: "send-ssh-public-key",
+ DurationMs: time.Since(t0).Milliseconds(),
+ Error: err.Error(),
+ })
+ }
+ res.Steps = append(res.Steps, diagnoseStep{Name: "send-ssh-public-key", OK: true, DurationMs: time.Since(t0).Milliseconds()})
+
+ // Step 3: pick-free-port
+ t0 = time.Now()
+ localPort, err := pickFreePort()
+ if err != nil {
+ return stop("pick-free-port", diagnoseStep{
+ Name: "pick-free-port",
+ DurationMs: time.Since(t0).Milliseconds(),
+ Error: err.Error(),
+ })
+ }
+ res.Steps = append(res.Steps, diagnoseStep{
+ Name: "pick-free-port",
+ OK: true,
+ DurationMs: time.Since(t0).Milliseconds(),
+ Detail: fmt.Sprintf("port=%d", localPort),
+ })
+
+ // Step 4: open-tunnel (long-running subprocess; we hold its stderr so
+ // we can include it in failure detail for the next two stages).
+ opts := eicSSHOptions{
+ InstanceID: instanceID,
+ OSUser: osUser,
+ Region: region,
+ LocalPort: localPort,
+ PrivateKeyPath: keyPath,
+ }
+ t0 = time.Now()
+ tunnel := openTunnelCmd(opts)
+ tunnel.Env = os.Environ()
+ var tunnelStderr strings.Builder
+ tunnel.Stderr = &tunnelStderr
+ if err := tunnel.Start(); err != nil {
+ return stop("open-tunnel", diagnoseStep{
+ Name: "open-tunnel",
+ DurationMs: time.Since(t0).Milliseconds(),
+ Error: err.Error(),
+ Detail: tunnelStderr.String(),
+ })
+ }
+ defer func() {
+ if tunnel.Process != nil {
+ _ = tunnel.Process.Kill()
+ }
+ _ = tunnel.Wait()
+ }()
+ res.Steps = append(res.Steps, diagnoseStep{Name: "open-tunnel", OK: true, DurationMs: time.Since(t0).Milliseconds()})
+
+ // Step 5: wait-for-port — verifies the tunnel actually bound the port.
+ // Tunnel-side errors (auth, SG, missing endpoint) usually surface here
+ // because the subprocess exits before binding. Fold its stderr into the
+ // detail so the operator sees the real reason.
+ t0 = time.Now()
+ if err := waitForPort(ctx, "127.0.0.1", localPort, 10*time.Second); err != nil {
+ return stop("wait-for-port", diagnoseStep{
+ Name: "wait-for-port",
+ DurationMs: time.Since(t0).Milliseconds(),
+ Error: err.Error(),
+ Detail: tunnelStderr.String(),
+ })
+ }
+ res.Steps = append(res.Steps, diagnoseStep{Name: "wait-for-port", OK: true, DurationMs: time.Since(t0).Milliseconds()})
+
+ // Step 6: ssh-probe — non-interactive `ssh ... 'echo MARKER'`. Proves
+ // auth (key push reached sshd), shell ready (bash returns echo output),
+ // and the network path end-to-end. Captures combined output + exit
+ // error so we see "Permission denied", "Connection refused", or "Host
+ // key verification failed" verbatim.
+ t0 = time.Now()
+ probe := sshProbeCmd(opts)
+ probe.Env = os.Environ()
+ out, perr := probe.CombinedOutput()
+ outStr := strings.TrimSpace(string(out))
+ durMs := time.Since(t0).Milliseconds()
+ if perr != nil || !strings.Contains(outStr, sshProbeMarker) {
+ errStr := ""
+ if perr != nil {
+ errStr = perr.Error()
+ }
+ return stop("ssh-probe", diagnoseStep{
+ Name: "ssh-probe",
+ DurationMs: durMs,
+ Error: errStr,
+ Detail: outStr,
+ })
+ }
+ res.Steps = append(res.Steps, diagnoseStep{Name: "ssh-probe", OK: true, DurationMs: durMs})
+
+ res.OK = true
+ return res
+}
+
+// diagnoseLocal probes the Docker container path. Smaller surface: just
+// "is the named container running on this Docker daemon".
+func (h *TerminalHandler) diagnoseLocal(ctx context.Context, workspaceID string) diagnoseResult {
+ res := diagnoseResult{WorkspaceID: workspaceID, Remote: false}
+ if h.docker == nil {
+ res.Steps = append(res.Steps, diagnoseStep{
+ Name: "docker-available",
+ Error: "docker client not configured on this workspace-server",
+ })
+ res.FirstFailure = "docker-available"
+ return res
+ }
+
+ candidates := []string{provisioner.ContainerName(workspaceID), "ws-" + workspaceID}
+ var foundName string
+ var lastErr error
+ var running bool
+ var stateStatus string
+ t0 := time.Now()
+ for _, n := range candidates {
+ info, err := h.docker.ContainerInspect(ctx, n)
+ if err == nil {
+ foundName = n
+ running = info.State.Running
+ stateStatus = info.State.Status
+ break
+ }
+ lastErr = err
+ }
+ if foundName == "" {
+ errMsg := "no matching container"
+ if lastErr != nil {
+ errMsg = lastErr.Error()
+ }
+ res.Steps = append(res.Steps, diagnoseStep{
+ Name: "container-found",
+ DurationMs: time.Since(t0).Milliseconds(),
+ Error: errMsg,
+ Detail: fmt.Sprintf("tried: %s", strings.Join(candidates, ", ")),
+ })
+ res.FirstFailure = "container-found"
+ return res
+ }
+ res.Steps = append(res.Steps, diagnoseStep{
+ Name: "container-found",
+ OK: true,
+ DurationMs: time.Since(t0).Milliseconds(),
+ Detail: foundName,
+ })
+
+ if !running {
+ res.Steps = append(res.Steps, diagnoseStep{
+ Name: "container-running",
+ Error: "container not running",
+ Detail: stateStatus,
+ })
+ res.FirstFailure = "container-running"
+ return res
+ }
+ res.Steps = append(res.Steps, diagnoseStep{Name: "container-running", OK: true, Detail: stateStatus})
+ res.OK = true
+ return res
+}
diff --git a/workspace-server/internal/handlers/terminal_diagnose_test.go b/workspace-server/internal/handlers/terminal_diagnose_test.go
new file mode 100644
index 00000000..5cf672fe
--- /dev/null
+++ b/workspace-server/internal/handlers/terminal_diagnose_test.go
@@ -0,0 +1,222 @@
+package handlers
+
+import (
+ "context"
+ "encoding/json"
+ "errors"
+ "net/http/httptest"
+ "os/exec"
+ "testing"
+
+ "github.com/DATA-DOG/go-sqlmock"
+ "github.com/gin-gonic/gin"
+)
+
+// TestHandleDiagnose_RoutesToRemote pins the dispatch: a workspace row with
+// a non-empty instance_id takes the EIC + ssh probe path. We stub the
+// first-stage (send-ssh-public-key) to fail so the test stays
+// hermetic — no AWS calls, no network — and confirm:
+//
+// - first_failure is "send-ssh-public-key" (not the earlier ssh-keygen)
+// - the steps array includes the ssh-keygen pass + the failed
+// send-ssh-public-key step
+// - response is HTTP 200 (the endpoint always returns 200; failure is
+// in the JSON body so callers don't need branch-on-status)
+func TestHandleDiagnose_RoutesToRemote(t *testing.T) {
+ mock := setupTestDB(t)
+ setupTestRedis(t)
+
+ mock.ExpectQuery("SELECT COALESCE").
+ WithArgs("ws-remote").
+ WillReturnRows(sqlmock.NewRows([]string{"instance_id"}).AddRow("i-abc123"))
+
+ prev := sendSSHPublicKey
+ sendSSHPublicKey = func(ctx context.Context, region, instanceID, osUser, pubKey string) error {
+ return errors.New("AccessDeniedException: not authorized")
+ }
+ defer func() { sendSSHPublicKey = prev }()
+
+ h := NewTerminalHandler(nil)
+ w := httptest.NewRecorder()
+ c, _ := gin.CreateTestContext(w)
+ c.Params = gin.Params{{Key: "id", Value: "ws-remote"}}
+ c.Request = httptest.NewRequest("GET", "/workspaces/ws-remote/terminal/diagnose", nil)
+
+ h.HandleDiagnose(c)
+
+ if w.Code != 200 {
+ t.Fatalf("HandleDiagnose status: got %d, want 200 (body=%s)", w.Code, w.Body.String())
+ }
+ var got diagnoseResult
+ if err := json.Unmarshal(w.Body.Bytes(), &got); err != nil {
+ t.Fatalf("response not JSON: %v (body=%s)", err, w.Body.String())
+ }
+ if !got.Remote {
+ t.Errorf("Remote=false; expected true for instance_id-bearing workspace")
+ }
+ if got.OK {
+ t.Errorf("OK=true despite stubbed send-key failure")
+ }
+ if got.FirstFailure != "send-ssh-public-key" {
+ t.Errorf("FirstFailure=%q; want send-ssh-public-key", got.FirstFailure)
+ }
+ // ssh-keygen must run successfully before send-ssh-public-key fails.
+ if len(got.Steps) < 2 {
+ t.Fatalf("expected >=2 steps (ssh-keygen + send-ssh-public-key); got %d", len(got.Steps))
+ }
+ if got.Steps[0].Name != "ssh-keygen" || !got.Steps[0].OK {
+ t.Errorf("step[0]: want ssh-keygen ok=true; got %+v", got.Steps[0])
+ }
+ if got.Steps[1].Name != "send-ssh-public-key" || got.Steps[1].OK {
+ t.Errorf("step[1]: want send-ssh-public-key ok=false; got %+v", got.Steps[1])
+ }
+ // The IAM error message must surface in the step's Error field — that's
+ // the whole point of the endpoint.
+ if got.Steps[1].Error == "" {
+ t.Errorf("step[1].Error is empty; AWS error must surface verbatim")
+ }
+}
+
+// TestHandleDiagnose_RoutesToLocal — empty instance_id takes the Docker
+// path. With nil docker client, container-found can't even start, so we
+// fail at "docker-available". Confirms the local-vs-remote dispatch.
+func TestHandleDiagnose_RoutesToLocal(t *testing.T) {
+ mock := setupTestDB(t)
+ setupTestRedis(t)
+
+ mock.ExpectQuery("SELECT COALESCE").
+ WithArgs("ws-local").
+ WillReturnRows(sqlmock.NewRows([]string{"instance_id"}).AddRow(""))
+
+ h := NewTerminalHandler(nil)
+ w := httptest.NewRecorder()
+ c, _ := gin.CreateTestContext(w)
+ c.Params = gin.Params{{Key: "id", Value: "ws-local"}}
+ c.Request = httptest.NewRequest("GET", "/workspaces/ws-local/terminal/diagnose", nil)
+
+ h.HandleDiagnose(c)
+
+ if w.Code != 200 {
+ t.Fatalf("status: got %d, want 200", w.Code)
+ }
+ var got diagnoseResult
+ if err := json.Unmarshal(w.Body.Bytes(), &got); err != nil {
+ t.Fatalf("response not JSON: %v", err)
+ }
+ if got.Remote {
+ t.Errorf("Remote=true; expected false for empty-instance_id workspace")
+ }
+ if got.FirstFailure != "docker-available" {
+ t.Errorf("FirstFailure=%q; want docker-available (no docker client)", got.FirstFailure)
+ }
+}
+
+// TestDiagnoseRemote_StopsAtSSHProbe — full happy path through send-key,
+// pick-port, open-tunnel, wait-for-port, then stub the ssh probe to fail.
+// Confirms first_failure surfaces the actual ssh stderr ("Permission
+// denied") rather than the earlier successful steps. This is the
+// most operationally important behavior — the endpoint exists primarily
+// to differentiate "IAM broke" (send-key fails) from "sshd broke" (probe
+// fails) from "SG/network broke" (wait-for-port fails).
+func TestDiagnoseRemote_StopsAtSSHProbe(t *testing.T) {
+ mock := setupTestDB(t)
+ setupTestRedis(t)
+
+ mock.ExpectQuery("SELECT COALESCE").
+ WithArgs("ws-probe-fail").
+ WillReturnRows(sqlmock.NewRows([]string{"instance_id"}).AddRow("i-test"))
+
+ // Stub send-key to succeed.
+ prevSend := sendSSHPublicKey
+ sendSSHPublicKey = func(ctx context.Context, region, instanceID, osUser, pubKey string) error {
+ return nil
+ }
+ defer func() { sendSSHPublicKey = prevSend }()
+
+ // Stub openTunnelCmd to spawn `nc -l <port>` so waitForPort succeeds.
+ // We need the tunnel to actually bind the port; nc does that
+ // portably. macOS has BSD nc by default.
+ prevTun := openTunnelCmd
+ openTunnelCmd = func(o eicSSHOptions) *exec.Cmd {
+ // `nc -l <port>` binds the picked free port; the shell loop below
+ // restarts it after each client disconnect instead of relying on
+ // the non-portable -k flag (BSD nc on macOS doesn't have it), so
+ // waitForPort keeps seeing a listening socket.
+ return exec.Command("sh", "-c",
+ `port="$1"; while true; do nc -l "$port" >/dev/null 2>&1 || true; done`,
+ "sh", numToString(o.LocalPort))
+ }
+ defer func() { openTunnelCmd = prevTun }()
+
+ // Stub the ssh probe to return "Permission denied" with non-zero exit,
+ // the canonical "key wasn't authorized" failure.
+ prevProbe := sshProbeCmd
+ sshProbeCmd = func(o eicSSHOptions) *exec.Cmd {
+ return exec.Command("sh", "-c", "echo 'Permission denied (publickey).' >&2; exit 255")
+ }
+ defer func() { sshProbeCmd = prevProbe }()
+
+ h := NewTerminalHandler(nil)
+ w := httptest.NewRecorder()
+ c, _ := gin.CreateTestContext(w)
+ c.Params = gin.Params{{Key: "id", Value: "ws-probe-fail"}}
+ c.Request = httptest.NewRequest("GET", "/workspaces/ws-probe-fail/terminal/diagnose", nil)
+
+ h.HandleDiagnose(c)
+
+ if w.Code != 200 {
+ t.Fatalf("status: got %d", w.Code)
+ }
+ var got diagnoseResult
+ if err := json.Unmarshal(w.Body.Bytes(), &got); err != nil {
+ t.Fatalf("response not JSON: %v (body=%s)", err, w.Body.String())
+ }
+ if got.OK {
+ t.Errorf("OK=true despite stubbed probe failure")
+ }
+ if got.FirstFailure != "ssh-probe" {
+ t.Errorf("FirstFailure=%q; want ssh-probe (got body=%s)", got.FirstFailure, w.Body.String())
+ }
+ // The "Permission denied" message must be in the probe step's Detail —
+ // that's what tells the operator "this is sshd auth, not network".
+ var probeStep *diagnoseStep
+ for i := range got.Steps {
+ if got.Steps[i].Name == "ssh-probe" {
+ probeStep = &got.Steps[i]
+ break
+ }
+ }
+ if probeStep == nil {
+ t.Fatalf("no ssh-probe step in result: %+v", got.Steps)
+ }
+ if probeStep.OK {
+ t.Errorf("ssh-probe step OK=true despite failure stub")
+ }
+ if probeStep.Detail == "" && probeStep.Error == "" {
+ t.Errorf("ssh-probe step has no Error or Detail; ssh stderr is exactly what we want to expose")
+ }
+}
+
+// numToString is a tiny helper to avoid pulling fmt into the test for one
+// integer-to-string call. Same observable behavior as strconv.Itoa.
+func numToString(n int) string {
+ if n == 0 {
+ return "0"
+ }
+ var buf [20]byte
+ i := len(buf)
+ neg := n < 0
+ if neg {
+ n = -n
+ }
+ for n > 0 {
+ i--
+ buf[i] = byte('0' + n%10)
+ n /= 10
+ }
+ if neg {
+ i--
+ buf[i] = '-'
+ }
+ return string(buf[i:])
+}
diff --git a/workspace-server/internal/router/router.go b/workspace-server/internal/router/router.go
index 3d04b12e..5373ed0f 100644
--- a/workspace-server/internal/router/router.go
+++ b/workspace-server/internal/router/router.go
@@ -470,6 +470,7 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi
}
th := handlers.NewTerminalHandler(dockerCli)
wsAuth.GET("/terminal", th.HandleConnect)
+ wsAuth.GET("/terminal/diagnose", th.HandleDiagnose)
// Canvas Viewport — #166 + #168: GET stays fully open for bootstrap.
// PUT uses CanvasOrBearer (accepts Origin-match OR bearer token) so the
From b9311134cf3580ea36d89e4ed99b8802e9bf2379 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Thu, 30 Apr 2026 21:19:18 -0700
Subject: [PATCH 06/61] fix(terminal-diagnose): KI-005 hierarchy check +
race-free stderr capture
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Two fixes plus a nit cleanup from /code-review-and-quality on PR #2445:
1. **KI-005 hierarchy check parity with /terminal**
HandleConnect runs the KI-005 cross-workspace guard before dispatch
(terminal.go:85-106): when X-Workspace-ID is set and != :id, validate
the bearer's workspace binding then call canCommunicateCheck. Without
this, an org-level token holder in tenant Foo can probe any
workspace's diagnostic state by guessing the UUID — same enumeration
vector KI-005 closed for /terminal in #1609. Per-workspace bearer
tokens are URL-bound by WorkspaceAuth, so the gap is org tokens
within the same tenant.
Fix: copy the same gate into HandleDiagnose, before the
instance_id SELECT.
Test: TestHandleDiagnose_KI005_RejectsCrossWorkspace stubs
canCommunicateCheck=false and confirms 403 fires before the DB
lookup (sqlmock's ExpectationsWereMet pins that we never reached
the SELECT COALESCE). Mirrors the existing
TestTerminalConnect_KI005_RejectsUnauthorizedCrossWorkspace.
2. **Race-free tunnel stderr capture (syncBuf)**
strings.Builder isn't goroutine-safe. os/exec spawns a background
goroutine that copies the subprocess's stderr fd to cmd.Stderr's
Write, so reading the buffer's String() from the request goroutine
on wait-for-port timeout while the tunnel may still be writing is
a data race that `go test -race` flags. Worst-case impact in
production is a garbled Detail string (not a crash), but the fix
is small.
Fix: wrap bytes.Buffer in a sync.Mutex (syncBuf type). Same
io.Writer interface, no API changes elsewhere.
3. **Nit cleanup**
- read-pubkey failure now reports as its own step name instead of
a duplicated "ssh-keygen" entry — disambiguates two different
failure modes that previously shared a name.
- Replaced the hand-rolled numToString int-to-string helper with
strconv.Itoa in the test (the import-savings rationale never held).
Suite: 4 diagnose tests pass with -race; full handlers suite passes
in 3.95s. go vet clean.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
.../internal/handlers/terminal_diagnose.go | 58 ++++++++++++++-
.../handlers/terminal_diagnose_test.go | 73 +++++++++++++------
2 files changed, 104 insertions(+), 27 deletions(-)
diff --git a/workspace-server/internal/handlers/terminal_diagnose.go b/workspace-server/internal/handlers/terminal_diagnose.go
index e40f6e19..b78c8955 100644
--- a/workspace-server/internal/handlers/terminal_diagnose.go
+++ b/workspace-server/internal/handlers/terminal_diagnose.go
@@ -1,19 +1,48 @@
package handlers
import (
+ "bytes"
"context"
"fmt"
"net/http"
"os"
"os/exec"
"strings"
+ "sync"
"time"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
+ "github.com/Molecule-AI/molecule-monorepo/platform/internal/wsauth"
"github.com/gin-gonic/gin"
)
+// syncBuf is a goroutine-safe writer that wraps bytes.Buffer with a mutex.
+// Used to capture subprocess stderr without racing the os/exec stderr-copy
+// goroutine: assigning a plain io.Writer to cmd.Stderr makes os/exec spawn a goroutine that
+// reads from the subprocess's stderr fd and calls Write on our writer, so
+// reading the buffer from another goroutine (e.g., on wait-for-port
+// timeout while the tunnel may still be writing) without synchronization
+// is a data race that ``go test -race`` would flag. ``strings.Builder``
+// and bare ``bytes.Buffer`` aren't goroutine-safe; this tiny shim is the
+// cheapest fix.
+type syncBuf struct {
+ mu sync.Mutex
+ b bytes.Buffer
+}
+
+func (s *syncBuf) Write(p []byte) (int, error) {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+ return s.b.Write(p)
+}
+
+func (s *syncBuf) String() string {
+ s.mu.Lock()
+ defer s.mu.Unlock()
+ return s.b.String()
+}
+
// HandleDiagnose handles GET /workspaces/:id/terminal/diagnose. It runs the
// same per-step pipeline as HandleConnect (ssh-keygen → EIC send-key → tunnel
// → ssh) but non-interactively, captures the first failing step and its
@@ -45,6 +74,29 @@ func (h *TerminalHandler) HandleDiagnose(c *gin.Context) {
ctx, cancel := context.WithTimeout(c.Request.Context(), 30*time.Second)
defer cancel()
+ // KI-005 hierarchy check — same shape as HandleConnect. Without this,
+ // an org-level token holder can probe any workspace in their tenant by
+ // guessing the UUID, learning its diagnostic state (which IAM call
+ // fails, what sshd says) even when they don't own it. Per-workspace
+ // bearer tokens are already URL-bound by WorkspaceAuth, so the gap is
+ // org tokens — same vector KI-005 closed for /terminal (#1609).
+ callerID := c.GetHeader("X-Workspace-ID")
+ if callerID != "" && callerID != workspaceID {
+ tok := wsauth.BearerTokenFromHeader(c.GetHeader("Authorization"))
+ if tok != "" {
+ if err := wsauth.ValidateToken(ctx, db.DB, callerID, tok); err != nil {
+ if c.GetString("org_token_id") == "" {
+ c.JSON(http.StatusUnauthorized, gin.H{"error": "invalid token for claimed workspace"})
+ return
+ }
+ }
+ }
+ if !canCommunicateCheck(callerID, workspaceID) {
+ c.JSON(http.StatusForbidden, gin.H{"error": "not authorized to diagnose this workspace's terminal"})
+ return
+ }
+ }
+
var instanceID string
_ = db.DB.QueryRowContext(ctx,
`SELECT COALESCE(instance_id, '') FROM workspaces WHERE id = $1`,
@@ -155,8 +207,8 @@ func (h *TerminalHandler) diagnoseRemote(ctx context.Context, workspaceID, insta
pubKey, err := os.ReadFile(keyPath + ".pub")
if err != nil {
- return stop("ssh-keygen", diagnoseStep{
- Name: "ssh-keygen",
+ return stop("read-pubkey", diagnoseStep{
+ Name: "read-pubkey",
Error: fmt.Sprintf("read pubkey: %v", err),
})
}
@@ -201,7 +253,7 @@ func (h *TerminalHandler) diagnoseRemote(ctx context.Context, workspaceID, insta
t0 = time.Now()
tunnel := openTunnelCmd(opts)
tunnel.Env = os.Environ()
- var tunnelStderr strings.Builder
+ var tunnelStderr syncBuf
tunnel.Stderr = &tunnelStderr
if err := tunnel.Start(); err != nil {
return stop("open-tunnel", diagnoseStep{
diff --git a/workspace-server/internal/handlers/terminal_diagnose_test.go b/workspace-server/internal/handlers/terminal_diagnose_test.go
index 5cf672fe..15b94945 100644
--- a/workspace-server/internal/handlers/terminal_diagnose_test.go
+++ b/workspace-server/internal/handlers/terminal_diagnose_test.go
@@ -6,6 +6,7 @@ import (
"errors"
"net/http/httptest"
"os/exec"
+ "strconv"
"testing"
"github.com/DATA-DOG/go-sqlmock"
@@ -111,6 +112,53 @@ func TestHandleDiagnose_RoutesToLocal(t *testing.T) {
}
}
+// TestHandleDiagnose_KI005_RejectsCrossWorkspace — the diagnostic endpoint
+// has the same cross-workspace info-leak surface as /terminal had before
+// #1609. Without KI-005, an org-level token holder could probe any
+// workspace in their tenant by guessing the UUID, learning which IAM call
+// fails or which sshd error fires. This test pins that HandleDiagnose
+// applies the same hierarchy guard as HandleConnect (parity: ws-attacker
+// claiming X-Workspace-ID against /workspaces/ws-victim/terminal/diagnose
+// must 403, never reaching the SELECT COALESCE for instance_id).
+func TestHandleDiagnose_KI005_RejectsCrossWorkspace(t *testing.T) {
+ mock := setupTestDB(t)
+ setupTestRedis(t)
+
+ // Stub CanCommunicate to deny. Reset after — same pattern as the
+ // HandleConnect KI-005 tests.
+ prev := canCommunicateCheck
+ canCommunicateCheck = func(callerID, targetID string) bool { return false }
+ defer func() { canCommunicateCheck = prev }()
+
+ // Token validation: caller's bearer is bound to ws-attacker.
+ mock.ExpectQuery(`SELECT t\.id, t\.workspace_id\s+FROM workspace_auth_tokens t`).
+ WithArgs(sqlmock.AnyArg()).
+ WillReturnRows(sqlmock.NewRows([]string{"id", "workspace_id"}).AddRow("tok-1", "ws-attacker"))
+ mock.ExpectExec(`UPDATE workspace_auth_tokens SET last_used_at`).
+ WithArgs(sqlmock.AnyArg()).
+ WillReturnResult(sqlmock.NewResult(0, 1))
+
+ h := NewTerminalHandler(nil)
+ w := httptest.NewRecorder()
+ c, _ := gin.CreateTestContext(w)
+ c.Params = gin.Params{{Key: "id", Value: "ws-victim"}}
+ c.Request = httptest.NewRequest("GET", "/workspaces/ws-victim/terminal/diagnose", nil)
+ c.Request.Header.Set("X-Workspace-ID", "ws-attacker")
+ c.Request.Header.Set("Authorization", "Bearer attacker-token")
+
+ h.HandleDiagnose(c)
+
+ if w.Code != 403 {
+ t.Errorf("cross-workspace diagnose: got %d, want 403 (%s)", w.Code, w.Body.String())
+ }
+ // Critically: the SELECT COALESCE for instance_id must NOT have run —
+ // no expectation was set for it. ExpectationsWereMet ensures we
+ // rejected before reaching the DB lookup.
+ if err := mock.ExpectationsWereMet(); err != nil {
+ t.Errorf("unmet sqlmock expectations (rejection should fire before instance_id lookup): %v", err)
+ }
+}
+
// TestDiagnoseRemote_StopsAtSSHProbe — full happy path through send-key,
// pick-port, open-tunnel, wait-for-port, then stub the ssh probe to fail.
// Confirms first_failure surfaces the actual ssh stderr ("Permission
@@ -144,7 +192,7 @@ func TestDiagnoseRemote_StopsAtSSHProbe(t *testing.T) {
 		// window without relying on GNU-only flags like -k.
return exec.Command("sh", "-c",
`port="$1"; while true; do nc -l "$port" >/dev/null 2>&1 || true; done`,
- "sh", numToString(o.LocalPort))
+ "sh", strconv.Itoa(o.LocalPort))
}
defer func() { openTunnelCmd = prevTun }()
@@ -197,26 +245,3 @@ func TestDiagnoseRemote_StopsAtSSHProbe(t *testing.T) {
}
}
-// numToString is a tiny helper to avoid pulling fmt into the test for one
-// integer-to-string call. Same observable behavior as strconv.Itoa.
-func numToString(n int) string {
- if n == 0 {
- return "0"
- }
- var buf [20]byte
- i := len(buf)
- neg := n < 0
- if neg {
- n = -n
- }
- for n > 0 {
- i--
- buf[i] = byte('0' + n%10)
- n /= 10
- }
- if neg {
- i--
- buf[i] = '-'
- }
- return string(buf[i:])
-}
From aacaba024c13c37e258128397d1ce5976cece911 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Thu, 30 Apr 2026 21:21:18 -0700
Subject: [PATCH 07/61] feat(wheel-smoke): exercise executor.execute() to catch
lazy imports (#2275)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The existing wheel-publish smoke (`wheel_smoke.py`) only IMPORTS
`molecule_runtime.main` at module scope. Lazy imports buried inside
`async def execute(...)` bodies (e.g. `from a2a.types import FilePart`)
NEVER evaluate at static-import time — they crash at first message
delivery in production.
The 2026-04-2x v0→v1 a2a-sdk migration shipped 5 such regressions in
templates that all looked fine at module-load smoke. This change adds
`smoke_mode.py` plus a `MOLECULE_SMOKE_MODE=1` short-circuit in
`main.py`: after `adapter.create_executor(...)`, the boot path invokes
`executor.execute(stub_ctx, stub_queue)` once with a 5s timeout
(`MOLECULE_SMOKE_TIMEOUT_SECS`). Healthy import tree → execution
proceeds far enough to hit a network boundary and times out (exit 0).
Broken lazy import → `ImportError` / `ModuleNotFoundError` from inside
the executor body (exit 1). Other downstream errors (auth, validation)
pass — those are caught by adapter-level tests, not this gate.
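To make the regression class concrete, a hypothetical adapter executor
(illustrative only, not code from this repo) looks like:
    class MyExecutor:
        async def execute(self, context, event_queue):
            # evaluated only when execute() runs, never at module import
            from a2a.types import FilePart
            ...
A module-level import smoke never executes that body, so a renamed or
removed FilePart only surfaces at first message delivery; the boot smoke
calls execute() once and turns the same breakage into exit code 1 at
publish time.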
Stub `(RequestContext, EventQueue)` is built from the real a2a-sdk so
SendMessageRequest/RequestContext constructor changes also surface as
import-tree failures (the regression class also includes "SDK
refactored mid-publish"). The stub-build itself is wrapped — if it
raises, that's a smoke fail too.
Phase 2 (separate PR, molecule-ci) wires this into
publish-template-image.yml so the publish gate runs the boot smoke
against every template image before pushing the tag.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
scripts/build_runtime_package.py | 1 +
workspace/main.py | 14 +++
workspace/smoke_mode.py | 140 +++++++++++++++++++++
workspace/tests/test_smoke_mode.py | 190 +++++++++++++++++++++++++++++
4 files changed, 345 insertions(+)
create mode 100644 workspace/smoke_mode.py
create mode 100644 workspace/tests/test_smoke_mode.py
diff --git a/scripts/build_runtime_package.py b/scripts/build_runtime_package.py
index 910ea691..e6977e52 100755
--- a/scripts/build_runtime_package.py
+++ b/scripts/build_runtime_package.py
@@ -78,6 +78,7 @@ TOP_LEVEL_MODULES = {
"prompt",
"runtime_wedge",
"shared_runtime",
+ "smoke_mode",
"transcript_auth",
"watcher",
}
diff --git a/workspace/main.py b/workspace/main.py
index 093860c2..356080f3 100644
--- a/workspace/main.py
+++ b/workspace/main.py
@@ -136,6 +136,20 @@ async def main(): # pragma: no cover
await adapter.setup(adapter_config)
executor = await adapter.create_executor(adapter_config)
+ # 5a. Boot-smoke short-circuit (issue #2275): if MOLECULE_SMOKE_MODE
+ # is set, exercise the executor's full import tree by calling
+ # execute() once with stub deps + a short timeout. Skips platform
+ # registration + uvicorn entirely. Returns process exit code.
+ from smoke_mode import is_smoke_mode, run_executor_smoke
+ if is_smoke_mode():
+ exit_code = await run_executor_smoke(executor)
+ if hasattr(heartbeat, "stop"):
+ try:
+ await heartbeat.stop()
+ except Exception: # noqa: BLE001
+ pass
+ raise SystemExit(exit_code)
+
# 5b. Restore from pre-stop snapshot if one exists (GH#1391).
# The snapshot is scrubbed before being written, so secrets are
# already redacted — restore_state must not re-expose them.
diff --git a/workspace/smoke_mode.py b/workspace/smoke_mode.py
new file mode 100644
index 00000000..773e0cbe
--- /dev/null
+++ b/workspace/smoke_mode.py
@@ -0,0 +1,140 @@
+"""Boot smoke mode — exercises the executor's full import tree without touching real platforms.
+
+Why this exists (issue #2275): the existing `wheel_smoke.py` only IMPORTS
+`molecule_runtime.main` at module scope. Lazy imports buried inside
+`async def execute(...)` bodies (e.g. `from a2a.types import FilePart`)
+NEVER evaluate at static-import time — they crash at first message
+delivery in production.
+
+The 2026-04-2x v0→v1 a2a-sdk migration shipped 5 such regressions in
+templates that all looked fine at module-load smoke. This module fills
+the gap by actually invoking `executor.execute(stub_ctx, stub_queue)`
+once with a short timeout. If the import-tree is healthy the call
+proceeds far enough to hit a network boundary (LLM call, etc.) and
+times out — that's a *pass*. If a lazy import is broken, the call
+raises `ImportError` / `ModuleNotFoundError` from inside the executor
+body — that's a *fail*.
+
+Activated by setting `MOLECULE_SMOKE_MODE=1` in the env. Wired into
+`main.py` after `executor = await adapter.create_executor(...)` so the
+full adapter setup path runs first; the smoke just adds one more
+exercise step before exit.
+
+CI usage (intended for `molecule-ci/.github/workflows/publish-template-image.yml`):
+ docker run --rm \
+ -e WORKSPACE_ID=fake -e MOLECULE_SMOKE_MODE=1 \
+ "$IMAGE" molecule-runtime
+"""
+from __future__ import annotations
+
+import asyncio
+import logging
+import os
+import sys
+from typing import Any
+
+logger = logging.getLogger(__name__)
+
+
+_SMOKE_TIMEOUT_SECS = float(os.environ.get("MOLECULE_SMOKE_TIMEOUT_SECS", "5.0"))
+
+
+def is_smoke_mode() -> bool:
+ """True iff MOLECULE_SMOKE_MODE is set to a truthy value.
+
+ Recognises the standard truthy strings (`1`, `true`, `yes`,
+ case-insensitive). An unset / empty / `0` env reads as False so
+ the boot path takes the normal branch in production.
+ """
+ raw = os.environ.get("MOLECULE_SMOKE_MODE", "").strip().lower()
+ return raw in ("1", "true", "yes", "on")
+
+
+def _build_stub_context() -> tuple[Any, Any]:
+ """Build a (RequestContext, EventQueue) pair stuffed with a minimal
+ text message ("smoke test"). The Message is enough that
+ `extract_message_text(context)` returns non-empty input, so the
+ executor takes the "real" branch (not the empty-input early-exit)
+ and exercises any lazy imports along that path.
+
+ Imports happen at function scope so smoke_mode.py itself doesn't
+ pull a2a-sdk into every consumer of the runtime — the wheel still
+ boots without smoke mode active.
+ """
+ from a2a.helpers import new_text_message
+ from a2a.server.agent_execution import RequestContext
+ from a2a.server.context import ServerCallContext
+ from a2a.server.events import EventQueue
+ from a2a.types import SendMessageRequest
+
+ message = new_text_message("smoke test")
+ call_ctx = ServerCallContext()
+ request = SendMessageRequest(message=message)
+ context = RequestContext(call_ctx, request=request)
+ queue = EventQueue()
+ return context, queue
+
+
+async def run_executor_smoke(executor: Any) -> int:
+ """Invoke executor.execute() once with stub deps. Return an exit code.
+
+ Returns:
+ 0 — import tree healthy. Either execution timed out (the
+ expected outcome — we hit a network boundary like an LLM
+ call) or completed cleanly. Either way, no broken imports.
+ 1 — broken lazy import detected. Re-raised as a clear log line
+ so the publish gate's stderr captures the offending symbol.
+
+ The 5-second timeout comes from `MOLECULE_SMOKE_TIMEOUT_SECS` env
+ (default 5.0). Bump it via env if a slow adapter setup overlaps the
+ first execute call. Don't make it too long — the publish workflow
+ multiplies this across N templates.
+ """
+ print(
+ f"[smoke-mode] invoking executor.execute(stub_ctx, stub_queue) "
+ f"with {_SMOKE_TIMEOUT_SECS:.1f}s timeout to exercise lazy imports"
+ )
+
+ try:
+ context, queue = _build_stub_context()
+ except Exception as build_err: # noqa: BLE001
+ # If we can't even build the stub, the a2a-sdk import path is
+ # broken — that's exactly the regression class this gate exists
+ # for. Treat as a smoke failure.
+ print(
+ f"[smoke-mode] FAIL: stub-context build raised "
+ f"{type(build_err).__name__}: {build_err}",
+ file=sys.stderr,
+ )
+ return 1
+
+ try:
+ await asyncio.wait_for(
+ executor.execute(context, queue),
+ timeout=_SMOKE_TIMEOUT_SECS,
+ )
+ except (asyncio.TimeoutError, asyncio.CancelledError):
+ # Timeout = imports healthy, execution was proceeding and hit
+ # a network boundary or long await. Pass.
+ print("[smoke-mode] PASS: timed out past import-tree (imports healthy)")
+ return 0
+ except (ImportError, ModuleNotFoundError) as imp_err:
+ # The exact regression class issue #2275 exists to catch.
+ print(
+ f"[smoke-mode] FAIL: lazy import broken in execute(): "
+ f"{type(imp_err).__name__}: {imp_err}",
+ file=sys.stderr,
+ )
+ return 1
+ except Exception as other_err: # noqa: BLE001
+ # Anything else (auth errors, validation errors, runtime bugs)
+ # is downstream of the import gate. Pass — these are caught by
+ # the relevant adapter-level tests, not by this smoke.
+ print(
+ f"[smoke-mode] PASS: execute() raised "
+ f"{type(other_err).__name__} past import-tree (not an import error)"
+ )
+ return 0
+ else:
+ print("[smoke-mode] PASS: execute() completed within timeout (imports + body OK)")
+ return 0
diff --git a/workspace/tests/test_smoke_mode.py b/workspace/tests/test_smoke_mode.py
new file mode 100644
index 00000000..10edbe30
--- /dev/null
+++ b/workspace/tests/test_smoke_mode.py
@@ -0,0 +1,190 @@
+"""Tests for smoke_mode — the executor-stub boot smoke (issue #2275).
+
+These tests exercise the helper module directly. The end-to-end path
+(main.py invoking run_executor_smoke + sys.exit) is not unit-tested
+here because main() is `# pragma: no cover` and integration-shaped;
+that path is covered by the publish-template-image.yml smoke step
+(which is the production gate this helper exists for).
+
+Note on a2a-sdk: conftest.py stubs out a2a.* modules with minimal
+shims that don't include `a2a.server.context.ServerCallContext` or
+`a2a.types.SendMessageRequest` (the real-SDK-only symbols
+_build_stub_context needs). Tests that want to verify the
+`run_executor_smoke` control flow patch _build_stub_context to
+sidestep the real construction; tests that NEED the real SDK
+construction skip when those symbols aren't reachable.
+"""
+from __future__ import annotations
+
+import asyncio
+from unittest.mock import patch
+
+import pytest
+
+import smoke_mode
+
+
+def _real_a2a_sdk_available() -> bool:
+ """True when the real a2a-sdk types needed by _build_stub_context
+ are importable. The conftest's a2a stubs intentionally don't
+ include these — they're only present in the published wheel's
+ runtime env or when a2a-sdk is installed alongside the test."""
+ try:
+ from a2a.server.context import ServerCallContext # noqa: F401
+ from a2a.types import SendMessageRequest # noqa: F401
+ return True
+ except (ImportError, AttributeError):
+ return False
+
+
+# ─── is_smoke_mode ─────────────────────────────────────────────────────
+
+
+@pytest.mark.parametrize("env_value", ["1", "true", "yes", "on", "TRUE", "Yes", "ON"])
+def test_is_smoke_mode_truthy_values(env_value: str, monkeypatch: pytest.MonkeyPatch):
+ monkeypatch.setenv("MOLECULE_SMOKE_MODE", env_value)
+ assert smoke_mode.is_smoke_mode() is True
+
+
+@pytest.mark.parametrize("env_value", ["0", "false", "no", "off", "", " "])
+def test_is_smoke_mode_falsy_values(env_value: str, monkeypatch: pytest.MonkeyPatch):
+ monkeypatch.setenv("MOLECULE_SMOKE_MODE", env_value)
+ assert smoke_mode.is_smoke_mode() is False
+
+
+def test_is_smoke_mode_unset(monkeypatch: pytest.MonkeyPatch):
+ monkeypatch.delenv("MOLECULE_SMOKE_MODE", raising=False)
+ assert smoke_mode.is_smoke_mode() is False
+
+
+# ─── _build_stub_context (real-SDK-only) ───────────────────────────────
+
+
+@pytest.mark.skipif(
+ not _real_a2a_sdk_available(),
+ reason="conftest stubs a2a.* without ServerCallContext / SendMessageRequest; real SDK only",
+)
+def test_build_stub_context_returns_request_context_with_message():
+ """Stub must produce a RequestContext that has a non-empty message
+ payload — otherwise extract_message_text returns empty and the
+ executor takes the early-exit branch instead of exercising the
+ full import tree."""
+ context, _queue = smoke_mode._build_stub_context()
+ assert context.message is not None
+ parts = context.message.parts
+ assert len(parts) == 1
+ assert parts[0].text == "smoke test"
+
+
+@pytest.mark.skipif(
+ not _real_a2a_sdk_available(),
+ reason="conftest stubs a2a.* without ServerCallContext / SendMessageRequest; real SDK only",
+)
+def test_build_stub_context_returns_event_queue():
+ from a2a.server.events import EventQueue
+ _, queue = smoke_mode._build_stub_context()
+ assert isinstance(queue, EventQueue)
+
+
+# ─── run_executor_smoke — control flow with stubbed context ────────────
+#
+# These tests patch _build_stub_context to return sentinel objects, so
+# they don't depend on the real a2a-sdk being present. The executor
+# stubs ignore ctx + queue.
+
+
+class _RaisingExecutor:
+ def __init__(self, exc: Exception):
+ self._exc = exc
+
+ async def execute(self, context, event_queue) -> None: # noqa: ARG002
+ raise self._exc
+
+
+class _BlockingExecutor:
+ """Simulates an LLM network call that the smoke timeout cuts short."""
+
+ async def execute(self, context, event_queue) -> None: # noqa: ARG002
+ await asyncio.Event().wait()
+
+
+class _CleanExecutor:
+ async def execute(self, context, event_queue) -> None: # noqa: ARG002
+ return None
+
+
+@pytest.fixture
+def stub_build():
+ """Replace _build_stub_context with a no-op so execute() gets
+ sentinel ctx/queue. Tests can override this fixture's behavior
+ via monkeypatch when they need a different shape."""
+ sentinel_ctx = object()
+ sentinel_queue = object()
+ with patch.object(
+ smoke_mode, "_build_stub_context",
+ lambda: (sentinel_ctx, sentinel_queue),
+ ):
+ yield
+
+
+@pytest.mark.asyncio
+async def test_smoke_passes_on_timeout(stub_build, monkeypatch: pytest.MonkeyPatch):
+ monkeypatch.setattr(smoke_mode, "_SMOKE_TIMEOUT_SECS", 0.1)
+ code = await smoke_mode.run_executor_smoke(_BlockingExecutor())
+ assert code == 0
+
+
+@pytest.mark.asyncio
+async def test_smoke_passes_on_clean_return(stub_build):
+ code = await smoke_mode.run_executor_smoke(_CleanExecutor())
+ assert code == 0
+
+
+@pytest.mark.asyncio
+async def test_smoke_fails_on_import_error(stub_build):
+ """The exact regression class issue #2275 exists to catch — a lazy
+ import inside execute() that the static smoke missed."""
+ code = await smoke_mode.run_executor_smoke(
+ _RaisingExecutor(ImportError("cannot import name 'FilePart' from 'a2a.types'"))
+ )
+ assert code == 1
+
+
+@pytest.mark.asyncio
+async def test_smoke_fails_on_module_not_found_error(stub_build):
+ code = await smoke_mode.run_executor_smoke(
+ _RaisingExecutor(ModuleNotFoundError("No module named 'temporalio'"))
+ )
+ assert code == 1
+
+
+@pytest.mark.asyncio
+async def test_smoke_passes_on_non_import_runtime_error(stub_build):
+ """Auth errors, validation errors, anything-not-an-import-error
+ pass — those are caught by adapter-level tests, not by this gate."""
+ code = await smoke_mode.run_executor_smoke(
+ _RaisingExecutor(RuntimeError("ANTHROPIC_API_KEY missing"))
+ )
+ assert code == 0
+
+
+@pytest.mark.asyncio
+async def test_smoke_passes_on_value_error(stub_build):
+ code = await smoke_mode.run_executor_smoke(
+ _RaisingExecutor(ValueError("bad config"))
+ )
+ assert code == 0
+
+
+@pytest.mark.asyncio
+async def test_smoke_fails_when_stub_context_build_breaks(monkeypatch: pytest.MonkeyPatch):
+ """If a2a-sdk's own SendMessageRequest / RequestContext can't be
+ constructed (e.g. SDK migration broke the constructor), that's
+ exactly the regression class this gate exists for — fail loud."""
+
+ def _fail_build():
+ raise ImportError("simulated: a2a.types refactored mid-publish")
+
+ monkeypatch.setattr(smoke_mode, "_build_stub_context", _fail_build)
+ code = await smoke_mode.run_executor_smoke(_CleanExecutor())
+ assert code == 1
From 661eec2659bc0a7e517a6d0d8132a9ce4f2e9629 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Thu, 30 Apr 2026 21:31:08 -0700
Subject: [PATCH 08/61] chore(smoke-mode): harden module-load + drop dead
except clause
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Two follow-ups from the #2275 Phase 1 self-review:
1. `_SMOKE_TIMEOUT_SECS = float(os.environ.get(...))` was evaluated at
module load. main.py imports smoke_mode unconditionally — before
the is_smoke_mode() check — so a malformed
MOLECULE_SMOKE_TIMEOUT_SECS env value would crash every workspace
boot with an uncaught ValueError, not just smoke runs. Wrapped in try/except with a
5.0 fallback. Probability of a typo'd env var hitting production
is low (it's a CI-only knob), but the footgun is removed entirely.
Regression test reloads the module under a malformed env value.
2. `_real_a2a_sdk_available()` caught (ImportError, AttributeError).
`from X import Y` raises ImportError when Y is missing on X — never
AttributeError. Dropped the unreachable branch.
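To make item 2 concrete (throwaway illustration, not repo code):
    try:
        from os import name_that_does_not_exist  # missing name on a real module
    except ImportError:
        pass  # this is what fires; AttributeError never does here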
No behavior change for the happy path.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
workspace/smoke_mode.py | 8 +++++++-
workspace/tests/test_smoke_mode.py | 23 ++++++++++++++++++++++-
2 files changed, 29 insertions(+), 2 deletions(-)
diff --git a/workspace/smoke_mode.py b/workspace/smoke_mode.py
index 773e0cbe..79399946 100644
--- a/workspace/smoke_mode.py
+++ b/workspace/smoke_mode.py
@@ -36,7 +36,13 @@ from typing import Any
logger = logging.getLogger(__name__)
-_SMOKE_TIMEOUT_SECS = float(os.environ.get("MOLECULE_SMOKE_TIMEOUT_SECS", "5.0"))
+# Don't crash production boot if MOLECULE_SMOKE_TIMEOUT_SECS is malformed —
+# main.py imports smoke_mode unconditionally (before the is_smoke_mode()
+# check), so a typo'd value would otherwise crash every workspace at import.
+try:
+ _SMOKE_TIMEOUT_SECS = float(os.environ.get("MOLECULE_SMOKE_TIMEOUT_SECS", "5.0"))
+except ValueError:
+ _SMOKE_TIMEOUT_SECS = 5.0
def is_smoke_mode() -> bool:
diff --git a/workspace/tests/test_smoke_mode.py b/workspace/tests/test_smoke_mode.py
index 10edbe30..9721024f 100644
--- a/workspace/tests/test_smoke_mode.py
+++ b/workspace/tests/test_smoke_mode.py
@@ -33,7 +33,7 @@ def _real_a2a_sdk_available() -> bool:
from a2a.server.context import ServerCallContext # noqa: F401
from a2a.types import SendMessageRequest # noqa: F401
return True
- except (ImportError, AttributeError):
+ except ImportError:
return False
@@ -57,6 +57,27 @@ def test_is_smoke_mode_unset(monkeypatch: pytest.MonkeyPatch):
assert smoke_mode.is_smoke_mode() is False
+# ─── _SMOKE_TIMEOUT_SECS bad-env-var resilience ────────────────────────
+
+
+def test_smoke_timeout_falls_back_when_env_value_is_malformed(
+ monkeypatch: pytest.MonkeyPatch,
+):
+ """A typo'd MOLECULE_SMOKE_TIMEOUT_SECS must not crash production
+ boot. main.py imports smoke_mode unconditionally — before the
+    is_smoke_mode() check — so float()-at-module-load would crash
+    every workspace at import if the env value were bad."""
+ import importlib
+ monkeypatch.setenv("MOLECULE_SMOKE_TIMEOUT_SECS", "not-a-float")
+ reloaded = importlib.reload(smoke_mode)
+ try:
+ assert reloaded._SMOKE_TIMEOUT_SECS == 5.0
+ finally:
+ # Restore module to clean default for other tests.
+ monkeypatch.delenv("MOLECULE_SMOKE_TIMEOUT_SECS", raising=False)
+ importlib.reload(smoke_mode)
+
+
# ─── _build_stub_context (real-SDK-only) ───────────────────────────────
From 72f0079c106e21bd339d4389ec2f487726d80e84 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Thu, 30 Apr 2026 21:55:24 -0700
Subject: [PATCH 09/61] feat(workspace-server): GET /workspaces/:id returns 410
Gone when status='removed' (#2429)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Defense-in-depth at the endpoint level. Previously, GET /workspaces/:id
returned 200 OK with `status:"removed"` in the body for deleted
workspaces — silent-fail UX hit on the hongmingwang tenant 2026-04-30:
the channel bridge / molecule-mcp wheel had a dead workspace_id + token
in .env, get_workspace_info returned 200 → caller assumed everything
was fine, then every subsequent /registry/* call 401d because tokens
were revoked, and operators had no idea their workspace was gone.
#2425 fixed the steady-state heartbeat path (escalate to ERROR after
3 consecutive 401s). This change is the startup-time defense — fail
loud when the operator first probes the workspace instead of waiting
for the heartbeat to sour.
The 410 body includes:
{error: "workspace removed", id, removed_at, hint: "Regenerate ..."}
Audit-trail consumers that need the body shape of a removed workspace
(admin views, "show me deleted workspaces" tooling) opt into the
legacy 200 + body via ?include_removed=true. Without this opt-in path
the audit trail becomes invisible at the API layer.
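A minimal consumer-side probe sketch (hypothetical; assumes httpx plus
base_url / workspace_id variables):
    import httpx
    r = httpx.get(f"{base_url}/workspaces/{workspace_id}")
    if r.status_code == 410:
        raise SystemExit(f"workspace removed: {r.json().get('hint')}")  # fail loud at startup
    audit = httpx.get(f"{base_url}/workspaces/{workspace_id}",
                      params={"include_removed": "true"})
    # audit.json()["status"] == "removed": legacy body shape for admin/audit tooling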
Two new tests pinned:
- TestWorkspaceGet_RemovedReturns410
- TestWorkspaceGet_RemovedWithIncludeQueryReturns200
Follow-ups in separate PRs:
- Update workspace/a2a_client.py get_workspace_info to surface
"removed" specifically rather than collapsing into "not found"
- Update channel bridge getWorkspaceInfo (server.ts) to detect 410
→ log clear "workspace was deleted, re-onboard" error
- Audit canvas/* + admin tooling consumers that may rely on the
legacy 200 + status:"removed" shape; switch them to the
?include_removed=true opt-in if needed
- Update docs (runtime-mcp.mdx Troubleshooting + external-agents.mdx
lifecycle table)
Co-Authored-By: Claude Opus 4.7 (1M context)
---
.../internal/handlers/workspace.go | 25 ++++
.../internal/handlers/workspace_test.go | 120 ++++++++++++++++++
2 files changed, 145 insertions(+)
diff --git a/workspace-server/internal/handlers/workspace.go b/workspace-server/internal/handlers/workspace.go
index c4a3376f..23d67f44 100644
--- a/workspace-server/internal/handlers/workspace.go
+++ b/workspace-server/internal/handlers/workspace.go
@@ -14,6 +14,7 @@ import (
"os"
"path/filepath"
"strings"
+ "time"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/crypto"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/db"
@@ -649,6 +650,30 @@ func (h *WorkspaceHandler) Get(c *gin.Context) {
return
}
+ // #2429: workspaces with status='removed' return 410 Gone (not 200)
+ // so callers fail loudly at startup instead of after 60s of revoked-
+ // token heartbeats. The audit-trail consumers that need the body of
+ // a removed workspace opt in via ?include_removed=true.
+ //
+ // Why a query param and not a header: cheap to set in curl/canvas
+ // fetch alike, visible in access logs, and works without coupling
+ // to content negotiation.
+ if status, _ := ws["status"].(string); status == string(models.StatusRemoved) {
+ if c.Query("include_removed") != "true" {
+ var removedAt time.Time
+ _ = db.DB.QueryRowContext(c.Request.Context(),
+ `SELECT updated_at FROM workspaces WHERE id = $1`, id,
+ ).Scan(&removedAt)
+ c.JSON(http.StatusGone, gin.H{
+ "error": "workspace removed",
+ "id": id,
+ "removed_at": removedAt,
+ "hint": "Regenerate workspace + token from the canvas → Tokens tab",
+ })
+ return
+ }
+ }
+
// Strip sensitive fields — GET /workspaces/:id is on the open router.
// Any caller with a valid UUID would otherwise read operational data.
delete(ws, "budget_limit")
diff --git a/workspace-server/internal/handlers/workspace_test.go b/workspace-server/internal/handlers/workspace_test.go
index 9149b178..f1093191 100644
--- a/workspace-server/internal/handlers/workspace_test.go
+++ b/workspace-server/internal/handlers/workspace_test.go
@@ -9,6 +9,7 @@ import (
"os"
"path/filepath"
"testing"
+ "time"
"github.com/DATA-DOG/go-sqlmock"
"github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
@@ -97,6 +98,125 @@ func TestWorkspaceGet_NotFound(t *testing.T) {
}
}
+// #2429: GET /workspaces/:id returns 410 Gone when status='removed'.
+// Defense-in-depth at the endpoint level — without this, callers
+// holding stale workspace_id + token tuples (channel bridge .env,
+// captured curl scripts, etc.) get 200 + status:"removed" and have
+// no idea their tokens are revoked until the heartbeat fails 60s
+// later. 410 makes startup fail loud instead.
+func TestWorkspaceGet_RemovedReturns410(t *testing.T) {
+ mock := setupTestDB(t)
+ setupTestRedis(t)
+ broadcaster := newTestBroadcaster()
+ handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
+
+ id := "cccccccc-0010-0000-0000-000000000000"
+ removedAt := time.Date(2026, 4, 30, 12, 0, 0, 0, time.UTC)
+
+ columns := []string{
+ "id", "name", "role", "tier", "status", "agent_card", "url",
+ "parent_id", "active_tasks", "max_concurrent_tasks", "last_error_rate", "last_sample_error",
+ "uptime_seconds", "current_task", "runtime", "workspace_dir", "x", "y", "collapsed",
+ "budget_limit", "monthly_spend",
+ }
+ mock.ExpectQuery("SELECT w.id, w.name").
+ WithArgs(id).
+ WillReturnRows(sqlmock.NewRows(columns).
+ AddRow(id, "Old Agent", "worker", 1, string(models.StatusRemoved), []byte(`null`),
+ "", nil, 0, 1, 0.0, "", 0, "", "langgraph",
+ "", 0.0, 0.0, false,
+ nil, 0))
+ mock.ExpectQuery(`SELECT updated_at FROM workspaces`).
+ WithArgs(id).
+ WillReturnRows(sqlmock.NewRows([]string{"updated_at"}).AddRow(removedAt))
+
+ w := httptest.NewRecorder()
+ c, _ := gin.CreateTestContext(w)
+ c.Params = gin.Params{{Key: "id", Value: id}}
+ c.Request = httptest.NewRequest("GET", "/workspaces/"+id, nil)
+
+ handler.Get(c)
+
+ if w.Code != http.StatusGone {
+ t.Fatalf("expected 410 Gone, got %d: %s", w.Code, w.Body.String())
+ }
+
+ var resp map[string]interface{}
+ if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+ t.Fatalf("failed to parse 410 body: %v", err)
+ }
+ if resp["error"] != "workspace removed" {
+ t.Errorf("expected error 'workspace removed', got %v", resp["error"])
+ }
+ if resp["id"] != id {
+ t.Errorf("expected id %q, got %v", id, resp["id"])
+ }
+ if _, ok := resp["removed_at"]; !ok {
+ t.Errorf("expected removed_at in 410 body, got: %v", resp)
+ }
+ if _, ok := resp["hint"]; !ok {
+ t.Errorf("expected hint in 410 body, got: %v", resp)
+ }
+
+ if err := mock.ExpectationsWereMet(); err != nil {
+ t.Errorf("unmet sqlmock expectations: %v", err)
+ }
+}
+
+// Audit-trail consumers (admin views, "show me deleted workspaces"
+// tooling) opt into the legacy 200 + body shape via
+// ?include_removed=true. Without this opt-in path the audit trail
+// becomes invisible at the API layer.
+func TestWorkspaceGet_RemovedWithIncludeQueryReturns200(t *testing.T) {
+ mock := setupTestDB(t)
+ setupTestRedis(t)
+ broadcaster := newTestBroadcaster()
+ handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
+
+ id := "cccccccc-0011-0000-0000-000000000000"
+
+ columns := []string{
+ "id", "name", "role", "tier", "status", "agent_card", "url",
+ "parent_id", "active_tasks", "max_concurrent_tasks", "last_error_rate", "last_sample_error",
+ "uptime_seconds", "current_task", "runtime", "workspace_dir", "x", "y", "collapsed",
+ "budget_limit", "monthly_spend",
+ }
+ mock.ExpectQuery("SELECT w.id, w.name").
+ WithArgs(id).
+ WillReturnRows(sqlmock.NewRows(columns).
+ AddRow(id, "Audit Agent", "worker", 1, string(models.StatusRemoved), []byte(`null`),
+ "", nil, 0, 1, 0.0, "", 0, "", "langgraph",
+ "", 0.0, 0.0, false,
+ nil, 0))
+ // last_outbound_at follow-up query (existing path)
+ mock.ExpectQuery(`SELECT last_outbound_at FROM workspaces`).
+ WithArgs(id).
+ WillReturnRows(sqlmock.NewRows([]string{"last_outbound_at"}).AddRow(nil))
+
+ w := httptest.NewRecorder()
+ c, _ := gin.CreateTestContext(w)
+ c.Params = gin.Params{{Key: "id", Value: id}}
+ c.Request = httptest.NewRequest("GET", "/workspaces/"+id+"?include_removed=true", nil)
+
+ handler.Get(c)
+
+ if w.Code != http.StatusOK {
+ t.Fatalf("expected 200 OK with ?include_removed=true, got %d: %s", w.Code, w.Body.String())
+ }
+
+ var resp map[string]interface{}
+ if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+ t.Fatalf("failed to parse response: %v", err)
+ }
+ if resp["status"] != string(models.StatusRemoved) {
+ t.Errorf("expected status 'removed' in body, got %v", resp["status"])
+ }
+
+ if err := mock.ExpectationsWereMet(); err != nil {
+ t.Errorf("unmet sqlmock expectations: %v", err)
+ }
+}
+
func TestWorkspaceGet_DBError(t *testing.T) {
mock := setupTestDB(t)
setupTestRedis(t)
From 59902bce836f3cce810735898b2cab9e54d6cb12 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Thu, 30 Apr 2026 21:58:45 -0700
Subject: [PATCH 10/61] feat(config): add observability block schema (#119 PR-1
of 4)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Adds a Hermes-style declarative block that groups cadence + verbosity
knobs in one place. Schema-only in this PR — wiring into heartbeat.py and
main.py lands in PR-3 of the #119 stack.
Two fields with live consumers waiting:
- heartbeat_interval_seconds (default 30, clamped to [5, 300])
→ heartbeat.py:134 currently has hard-coded HEARTBEAT_INTERVAL = 30
- log_level (default "INFO", uppercased at parse)
→ main.py:465 currently has hard-coded log_level="info"
Clamp band [5, 300] is intentional: sub-5s flooded the platform during
IR-2026-03-11; >5min lets crashed workspaces look healthy long enough
to mask failure. Coerce at parse so adapters and heartbeat.py can read
the value without re-validating.
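A minimal sketch of the intended PR-3 consumption (wiring and names
assumed, not part of this PR):
    import logging
    from config import load_config
    cfg = load_config()
    interval = cfg.observability.heartbeat_interval_seconds  # already clamped to [5, 300]
    logging.getLogger().setLevel(cfg.observability.log_level)  # normalized to e.g. "DEBUG"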
Tests pin defaults, explicit YAML override, partial override, and
parametrized clamp behavior (10 cases including garbage strings + None).
Part of: task #119 (adopt hermes-style architecture)
Stack: PR-1 schema → PR-2 event_log → PR-3 wire consumers → PR-4 skill compat
---
workspace/config.py | 61 +++++++++++++++++
workspace/tests/test_config.py | 117 +++++++++++++++++++++++++++++++++
2 files changed, 178 insertions(+)
diff --git a/workspace/config.py b/workspace/config.py
index 3b205f1b..4e199c57 100644
--- a/workspace/config.py
+++ b/workspace/config.py
@@ -166,6 +166,43 @@ class SecurityScanConfig:
operators who require a CVE gate know the gate is absent. Closes #268."""
+@dataclass
+class ObservabilityConfig:
+ """Observability settings — heartbeat cadence and log verbosity.
+
+ Hermes-style block: groups platform-runtime knobs that operators
+ typically tune together (cadence, verbosity) into one declarative
+ section instead of scattering them across env vars and hard-coded
+ constants. Adopting this shape unblocks per-workspace tuning without
+ a code change and pre-positions the schema for tracing/event-log
+ settings that will land in follow-up PRs (#119 PR-2 / PR-3).
+
+ Today only ``heartbeat_interval_seconds`` and ``log_level`` have live
+ consumers; both fields are accepted but not yet wired to their final
+ sites in this PR (schema-only). Wiring lands in PR-3 of the series.
+
+ Example config.yaml snippet::
+
+ observability:
+ heartbeat_interval_seconds: 60
+ log_level: DEBUG
+ """
+
+ heartbeat_interval_seconds: int = 30
+ """Seconds between heartbeats sent to the platform. Default 30 matches
+ ``workspace/heartbeat.py``'s long-standing constant. Lower values
+ reduce platform-side detection latency for crashed workspaces; higher
+ values reduce platform write load. Bounds: clamped to [5, 300] at
+ parse time — outside that range the workspace either floods the
+ platform or looks dead before the next beat."""
+
+ log_level: str = "INFO"
+ """Python ``logging`` level for the workspace runtime. Accepts the
+ standard names (DEBUG, INFO, WARNING, ERROR, CRITICAL). Today the
+ runtime reads ``LOG_LEVEL`` env; PR-3 of the #119 stack switches to
+ this field with env still honored as an override for ops debugging."""
+
+
@dataclass
class ComplianceConfig:
"""OWASP Top 10 for Agentic Applications compliance settings.
@@ -264,6 +301,7 @@ class WorkspaceConfig:
governance: GovernanceConfig = field(default_factory=GovernanceConfig)
security_scan: SecurityScanConfig = field(default_factory=SecurityScanConfig)
compliance: ComplianceConfig = field(default_factory=ComplianceConfig)
+ observability: ObservabilityConfig = field(default_factory=ObservabilityConfig)
sub_workspaces: list[dict] = field(default_factory=list)
effort: str = ""
"""Claude output effort level for the agentic loop: low | medium | high | xhigh | max.
@@ -289,6 +327,22 @@ def _derive_provider_from_model(model: str) -> str:
return ""
+def _clamp_heartbeat(value: object) -> int:
+ """Coerce raw YAML/env input into the [5, 300]-second heartbeat band.
+
+ Outside that band the workspace either floods the platform with
+    sub-5s beats or looks dead long before the next one — both
+ real failure modes seen on incidents, neither benign. Coerce here
+ so adapters and ``heartbeat.py`` can read the value without
+ re-validating.
+ """
+ try:
+ n = int(value)
+ except (TypeError, ValueError):
+ return 30
+ return max(5, min(300, n))
+
+
def load_config(config_path: Optional[str] = None) -> WorkspaceConfig:
"""Load config from WORKSPACE_CONFIG_PATH or the given path."""
if config_path is None:
@@ -336,6 +390,7 @@ def load_config(config_path: Optional[str] = None) -> WorkspaceConfig:
_ss_raw = raw.get("security_scan", {})
security_scan_raw = _ss_raw if isinstance(_ss_raw, dict) else {"mode": str(_ss_raw)}
compliance_raw = raw.get("compliance", {})
+ observability_raw = raw.get("observability", {})
# Resolve initial_prompt: inline string or file reference
initial_prompt = raw.get("initial_prompt", "")
@@ -445,6 +500,12 @@ def load_config(config_path: Optional[str] = None) -> WorkspaceConfig:
max_tool_calls_per_task=int(compliance_raw.get("max_tool_calls_per_task", 50)),
max_task_duration_seconds=int(compliance_raw.get("max_task_duration_seconds", 300)),
),
+ observability=ObservabilityConfig(
+ heartbeat_interval_seconds=_clamp_heartbeat(
+ observability_raw.get("heartbeat_interval_seconds", 30)
+ ),
+ log_level=str(observability_raw.get("log_level", "INFO")).upper(),
+ ),
sub_workspaces=raw.get("sub_workspaces", []),
effort=str(raw.get("effort", "")),
task_budget=int(raw.get("task_budget", 0)),
diff --git a/workspace/tests/test_config.py b/workspace/tests/test_config.py
index bc09d638..5c790b04 100644
--- a/workspace/tests/test_config.py
+++ b/workspace/tests/test_config.py
@@ -9,6 +9,7 @@ from config import (
A2AConfig,
ComplianceConfig,
DelegationConfig,
+ ObservabilityConfig,
SandboxConfig,
WorkspaceConfig,
load_config,
@@ -523,3 +524,119 @@ def test_compliance_default_via_load_config(tmp_path, yaml_payload, expected_mod
# prompt_injection was never overridden in any payload — must stay at
# the dataclass default regardless of the mode value.
assert cfg.compliance.prompt_injection == "detect"
+
+
+# ===== Observability block (#119 PR-1) =====
+#
+# Hermes-style declarative block grouping cadence + verbosity knobs into one
+# place. Schema-only in this PR — wiring into heartbeat.py / main.py lands in
+# PR-3. These tests pin the schema so the wiring PR can rely on the parsed
+# values matching the documented contract (defaults, clamping bounds,
+# log-level normalization).
+
+
+def test_observability_dataclass_default():
+ """ObservabilityConfig() — no args — yields the documented defaults."""
+ cfg = ObservabilityConfig()
+ assert cfg.heartbeat_interval_seconds == 30
+ assert cfg.log_level == "INFO"
+
+
+def test_observability_default_when_yaml_omits_block(tmp_path):
+ """No ``observability:`` key in YAML → dataclass defaults."""
+ config_yaml = tmp_path / "config.yaml"
+ config_yaml.write_text(yaml.dump({}))
+
+ cfg = load_config(str(tmp_path))
+ assert cfg.observability.heartbeat_interval_seconds == 30
+ assert cfg.observability.log_level == "INFO"
+
+
+def test_observability_explicit_yaml_override(tmp_path):
+ """Explicit YAML values flow through load_config to ObservabilityConfig."""
+ config_yaml = tmp_path / "config.yaml"
+ config_yaml.write_text(
+ yaml.dump(
+ {
+ "observability": {
+ "heartbeat_interval_seconds": 60,
+ "log_level": "DEBUG",
+ }
+ }
+ )
+ )
+
+ cfg = load_config(str(tmp_path))
+ assert cfg.observability.heartbeat_interval_seconds == 60
+ assert cfg.observability.log_level == "DEBUG"
+
+
+def test_observability_partial_override_keeps_other_defaults(tmp_path):
+ """Setting only heartbeat preserves the log_level default — and vice versa."""
+ config_yaml = tmp_path / "config.yaml"
+ config_yaml.write_text(
+ yaml.dump({"observability": {"heartbeat_interval_seconds": 45}})
+ )
+
+ cfg = load_config(str(tmp_path))
+ assert cfg.observability.heartbeat_interval_seconds == 45
+ assert cfg.observability.log_level == "INFO"
+
+
+@pytest.mark.parametrize(
+ "raw, expected",
+ [
+ # In-band values pass through unchanged.
+ (5, 5),
+ (30, 30),
+ (300, 300),
+ # Below floor → clamped up to 5s. Sub-5s heartbeats flooded the
+ # platform during incident IR-2026-03-11 (workspace stuck in a
+ # tight loop emitting beats faster than the platform could ack).
+ (1, 5),
+ (0, 5),
+ (-7, 5),
+ # Above ceiling → clamped down to 300s. >5min beats let crashed
+ # workspaces look healthy long enough to mask the failure.
+ (301, 300),
+ (3600, 300),
+ # Non-integer YAML values fall back to the documented default
+ # rather than crashing the workspace at boot.
+ ("not-a-number", 30),
+ (None, 30),
+ ],
+ ids=[
+ "floor_in_band",
+ "default_in_band",
+ "ceiling_in_band",
+ "below_floor_one",
+ "below_floor_zero",
+ "below_floor_negative",
+ "above_ceiling_just",
+ "above_ceiling_far",
+ "garbage_string",
+ "null",
+ ],
+)
+def test_observability_heartbeat_clamp(tmp_path, raw, expected):
+ """heartbeat_interval_seconds is clamped to the [5, 300] band at parse."""
+ config_yaml = tmp_path / "config.yaml"
+ config_yaml.write_text(
+ yaml.dump({"observability": {"heartbeat_interval_seconds": raw}})
+ )
+
+ cfg = load_config(str(tmp_path))
+ assert cfg.observability.heartbeat_interval_seconds == expected
+
+
+def test_observability_log_level_uppercased(tmp_path):
+ """Lowercase or mixed-case log levels normalize to the canonical form
+ Python's ``logging`` module expects, so operators can write either
+ ``debug`` or ``DEBUG`` in YAML without surprise."""
+ config_yaml = tmp_path / "config.yaml"
+ config_yaml.write_text(
+ yaml.dump({"observability": {"log_level": "debug"}})
+ )
+
+ cfg = load_config(str(tmp_path))
+ assert cfg.observability.log_level == "DEBUG"
From 645c1862c489030a711eb011ac26a2e8a4391e0a Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Thu, 30 Apr 2026 22:08:08 -0700
Subject: [PATCH 11/61] feat(a2a-client): surface 410 Gone as 'removed' error
so callers can re-onboard (#2429)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Follow-up A to PR #2449 — that PR taught the platform to return 410
Gone for status='removed' workspaces; this PR teaches get_workspace_info
to consume that signal.
Before: every non-200 collapsed into {"error": "not found"}, which
made the 2026-04-30 incident impossible to diagnose — the operator
KNEW the workspace_id existed (they'd just registered it), but the
runtime kept reporting "not found" for a deleted-but-not-purged row.
After: 410 produces a distinct {"error": "removed", "id", "removed_at",
"hint"} dict so callers (heartbeat-loop, channel bridge, dashboard
tools) can surface "your workspace was deleted, re-onboard" instead
of "not found". Falls back to a default hint if the platform body
isn't parseable so the actionable signal doesn't depend on body
shape parity.
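A minimal caller-side sketch (hypothetical heartbeat-loop handling; the
logger and the surrounding async loop are assumed):
    info = await get_workspace_info()
    if info.get("error") == "removed":
        logger.error("workspace deleted on platform (removed_at=%s): %s",
                     info.get("removed_at"), info.get("hint"))
        return  # stop heartbeating; re-onboarding is required
    if info.get("error"):
        logger.warning("workspace info unavailable: %s", info["error"])  # transient or not-found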
Two new tests:
- TestGetWorkspaceInfo.test_410_returns_removed_with_hint
- TestGetWorkspaceInfo.test_410_with_unparseable_body_falls_back_to_default_hint
Co-Authored-By: Claude Opus 4.7 (1M context)
---
workspace/a2a_client.py | 30 ++++++++++++++++++++-
workspace/tests/test_a2a_client.py | 42 ++++++++++++++++++++++++++++++
2 files changed, 71 insertions(+), 1 deletion(-)
diff --git a/workspace/a2a_client.py b/workspace/a2a_client.py
index 83ad0c89..4a9be69e 100644
--- a/workspace/a2a_client.py
+++ b/workspace/a2a_client.py
@@ -340,7 +340,14 @@ async def get_peers() -> list[dict]:
async def get_workspace_info() -> dict:
- """Get this workspace's info from the platform."""
+ """Get this workspace's info from the platform.
+
+ Distinguishes three failure shapes so callers can handle them
+ distinctly (#2429):
+ - 410 Gone → workspace was deleted; re-onboard required
+ - 404 / other → workspace never existed (or transient)
+ - exception → network / auth failure
+ """
async with httpx.AsyncClient(timeout=10.0) as client:
try:
resp = await client.get(
@@ -349,6 +356,27 @@ async def get_workspace_info() -> dict:
)
if resp.status_code == 200:
return resp.json()
+ if resp.status_code == 410:
+ # #2429: platform returns 410 when status='removed'.
+ # Surface "removed" + the actionable hint so callers
+ # can prompt re-onboard instead of falling through to
+ # "not found" — which made the 2026-04-30 incident
+ # impossible to diagnose ("workspace not found" with
+ # a workspace_id we KNEW we'd just registered).
+ try:
+ body = resp.json()
+ except Exception:
+ body = {}
+ return {
+ "error": "removed",
+ "id": body.get("id", WORKSPACE_ID),
+ "removed_at": body.get("removed_at"),
+ "hint": body.get(
+ "hint",
+ "Workspace was deleted on the platform. "
+ "Regenerate workspace + token from the canvas → Tokens tab.",
+ ),
+ }
return {"error": "not found"}
except Exception as e:
return {"error": str(e)}
diff --git a/workspace/tests/test_a2a_client.py b/workspace/tests/test_a2a_client.py
index 446945f9..f667ed95 100644
--- a/workspace/tests/test_a2a_client.py
+++ b/workspace/tests/test_a2a_client.py
@@ -819,6 +819,48 @@ class TestGetWorkspaceInfo:
assert result == {"error": "not found"}
+ async def test_410_returns_removed_with_hint(self):
+ """410 Gone (#2429) → distinct error 'removed' so callers can
+ prompt re-onboard instead of falling through to 'not found'.
+ Body shape passes through removed_at + the platform hint."""
+ import a2a_client
+
+ body = {
+ "error": "workspace removed",
+ "id": "ws-deleted-uuid",
+ "removed_at": "2026-04-30T12:00:00Z",
+ "hint": "Regenerate workspace + token from the canvas → Tokens tab",
+ }
+ resp = _make_response(410, body)
+ mock_client = _make_mock_client(get_resp=resp)
+
+ with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
+ result = await a2a_client.get_workspace_info()
+
+ assert result["error"] == "removed"
+ assert result["id"] == "ws-deleted-uuid"
+ assert result["removed_at"] == "2026-04-30T12:00:00Z"
+ assert "Regenerate" in result["hint"]
+
+ async def test_410_with_unparseable_body_falls_back_to_default_hint(self):
+ """If the platform's 410 body isn't JSON for some reason, the
+ default hint still surfaces — the actionable signal must not
+ depend on body shape parity with the platform."""
+ import a2a_client
+
+ resp = MagicMock()
+ resp.status_code = 410
+ resp.json = MagicMock(side_effect=ValueError("not json"))
+ mock_client = _make_mock_client(get_resp=resp)
+
+ with patch("a2a_client.httpx.AsyncClient", return_value=mock_client):
+ result = await a2a_client.get_workspace_info()
+
+ assert result["error"] == "removed"
+ assert result["id"] == a2a_client.WORKSPACE_ID
+ assert result["removed_at"] is None
+ assert "Regenerate" in result["hint"]
+
async def test_exception_returns_error_dict_with_message(self):
"""Network exception → returns {'error': ''}."""
import a2a_client
From 364c70fc71b4eb065856063da9540a604be2f781 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Thu, 30 Apr 2026 22:24:59 -0700
Subject: [PATCH 12/61] fix(workspace-server): emit null removed_at when
timestamp fetch fails
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
#2429 review finding. The 410-Gone path issues a follow-up
`SELECT updated_at` after detecting status='removed'. If that query
fails (workspace row deleted between the two queries, transient DB
error, etc.), `removedAt` stays as Go's zero time and the JSON body
emits `"removed_at": "0001-01-01T00:00:00Z"` — a misleading timestamp
the client has to know to ignore.
Now we branch on `removedAt.IsZero()` and emit `null` for the failed
path. The actionable signal (the 410 + hint) is unchanged; only the
timestamp shape gets cleaner.
Pinned by `TestWorkspaceGet_RemovedReturns410WithNullRemovedAtOnTimestampFetchFailure`,
which simulates the row vanishing via `sqlmock`'s `WillReturnError(sql.ErrNoRows)`.
The original `_RemovedReturns410` test now also asserts that the
happy-path timestamp is a non-null value (was just checking the key
existed).
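For reference, a minimal sketch of the two 410 body shapes a client can
now expect (host is a placeholder; field order and timestamp formatting
follow Gin's JSON encoder):

```bash
# Happy path — the follow-up timestamp SELECT succeeded:
curl -s "https://workspace-server.example.invalid/workspaces/$WS_ID"
#   {"error":"workspace removed","hint":"Regenerate workspace + token from the canvas → Tokens tab",
#    "id":"<uuid>","removed_at":"2026-04-30T12:00:00Z"}
# Timestamp fetch failed (row vanished / transient DB error):
#   {"error":"workspace removed","hint":"...","id":"<uuid>","removed_at":null}
```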
Co-Authored-By: Claude Opus 4.7 (1M context)
---
.../internal/handlers/workspace.go | 24 +++++--
.../internal/handlers/workspace_test.go | 67 ++++++++++++++++++-
2 files changed, 83 insertions(+), 8 deletions(-)
diff --git a/workspace-server/internal/handlers/workspace.go b/workspace-server/internal/handlers/workspace.go
index 23d67f44..78181e61 100644
--- a/workspace-server/internal/handlers/workspace.go
+++ b/workspace-server/internal/handlers/workspace.go
@@ -660,16 +660,28 @@ func (h *WorkspaceHandler) Get(c *gin.Context) {
// to content negotiation.
if status, _ := ws["status"].(string); status == string(models.StatusRemoved) {
if c.Query("include_removed") != "true" {
+ // Best-effort fetch of the removal timestamp. If the row was
+ // deleted (or some transient DB error fired) between the
+ // scanWorkspaceRow above and this follow-up SELECT,
+ // removedAt stays as Go's zero time. Emit `null` in that
+ // case rather than the misleading `0001-01-01T00:00:00Z`
+ // the client would otherwise see — the actionable signal
+ // is the 410 + hint, not the timestamp.
var removedAt time.Time
_ = db.DB.QueryRowContext(c.Request.Context(),
`SELECT updated_at FROM workspaces WHERE id = $1`, id,
).Scan(&removedAt)
- c.JSON(http.StatusGone, gin.H{
- "error": "workspace removed",
- "id": id,
- "removed_at": removedAt,
- "hint": "Regenerate workspace + token from the canvas → Tokens tab",
- })
+ body := gin.H{
+ "error": "workspace removed",
+ "id": id,
+ "hint": "Regenerate workspace + token from the canvas → Tokens tab",
+ }
+ if removedAt.IsZero() {
+ body["removed_at"] = nil
+ } else {
+ body["removed_at"] = removedAt
+ }
+ c.JSON(http.StatusGone, body)
return
}
}
diff --git a/workspace-server/internal/handlers/workspace_test.go b/workspace-server/internal/handlers/workspace_test.go
index f1093191..4e17ca6a 100644
--- a/workspace-server/internal/handlers/workspace_test.go
+++ b/workspace-server/internal/handlers/workspace_test.go
@@ -151,8 +151,8 @@ func TestWorkspaceGet_RemovedReturns410(t *testing.T) {
if resp["id"] != id {
t.Errorf("expected id %q, got %v", id, resp["id"])
}
- if _, ok := resp["removed_at"]; !ok {
- t.Errorf("expected removed_at in 410 body, got: %v", resp)
+ if v, ok := resp["removed_at"]; !ok || v == nil {
+ t.Errorf("expected removed_at to be a real timestamp on the happy path, got: %v", v)
}
if _, ok := resp["hint"]; !ok {
t.Errorf("expected hint in 410 body, got: %v", resp)
@@ -163,6 +163,69 @@ func TestWorkspaceGet_RemovedReturns410(t *testing.T) {
}
}
+// If the follow-up `SELECT updated_at` query fails (workspace row
+// disappeared in the gap, transient DB error, etc.), removedAt stays
+// as Go's zero time. We emit JSON `null` for that case rather than
+// the misleading `"0001-01-01T00:00:00Z"` the client would otherwise
+// see — the actionable signal is the 410 + hint, not the timestamp.
+func TestWorkspaceGet_RemovedReturns410WithNullRemovedAtOnTimestampFetchFailure(t *testing.T) {
+ mock := setupTestDB(t)
+ setupTestRedis(t)
+ broadcaster := newTestBroadcaster()
+ handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir())
+
+ id := "cccccccc-0012-0000-0000-000000000000"
+
+ columns := []string{
+ "id", "name", "role", "tier", "status", "agent_card", "url",
+ "parent_id", "active_tasks", "max_concurrent_tasks", "last_error_rate", "last_sample_error",
+ "uptime_seconds", "current_task", "runtime", "workspace_dir", "x", "y", "collapsed",
+ "budget_limit", "monthly_spend",
+ }
+ mock.ExpectQuery("SELECT w.id, w.name").
+ WithArgs(id).
+ WillReturnRows(sqlmock.NewRows(columns).
+ AddRow(id, "Vanished", "worker", 1, string(models.StatusRemoved), []byte(`null`),
+ "", nil, 0, 1, 0.0, "", 0, "", "langgraph",
+ "", 0.0, 0.0, false,
+ nil, 0))
+ // Simulate the row vanishing between the two queries.
+ mock.ExpectQuery(`SELECT updated_at FROM workspaces`).
+ WithArgs(id).
+ WillReturnError(sql.ErrNoRows)
+
+ w := httptest.NewRecorder()
+ c, _ := gin.CreateTestContext(w)
+ c.Params = gin.Params{{Key: "id", Value: id}}
+ c.Request = httptest.NewRequest("GET", "/workspaces/"+id, nil)
+
+ handler.Get(c)
+
+ if w.Code != http.StatusGone {
+ t.Fatalf("expected 410 Gone, got %d: %s", w.Code, w.Body.String())
+ }
+
+ var resp map[string]interface{}
+ if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+ t.Fatalf("failed to parse 410 body: %v", err)
+ }
+ if resp["removed_at"] != nil {
+ t.Errorf(
+ "expected removed_at == null when timestamp fetch fails; got %v (type %T). "+
+ "Misleading 0001-01-01 timestamps in the JSON would confuse clients.",
+ resp["removed_at"], resp["removed_at"],
+ )
+ }
+ // Other fields must still be present.
+ if resp["error"] != "workspace removed" || resp["id"] != id || resp["hint"] == nil {
+ t.Errorf("expected error/id/hint to survive the timestamp fetch failure; got %v", resp)
+ }
+
+ if err := mock.ExpectationsWereMet(); err != nil {
+ t.Errorf("unmet sqlmock expectations: %v", err)
+ }
+}
+
// Audit-trail consumers (admin views, "show me deleted workspaces"
// tooling) opt into the legacy 200 + body shape via
// ?include_removed=true. Without this opt-in path the audit trail
From 258c6bea44910b75226295fd3b0a96de7e423457 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Thu, 30 Apr 2026 22:25:15 -0700
Subject: [PATCH 13/61] feat(workspace-server): PUT /provider endpoint for
explicit LLM provider (#196)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Mirror of PUT /model. Stores the provider slug as the LLM_PROVIDER
workspace secret so the canvas can update model + provider
independently — a user might keep the same model alias and switch
providers (route through a different gateway), or vice versa.
Forcing both into one endpoint imposes a single Save+Restart per
change; two endpoints let canvas update each as the user picks.
Plumbs through the existing chain: secret-load → envVars → CP
req.Env → user-data env exports → /configs/config.yaml (after
controlplane PR #364 lands the heredoc append).
Tests: 5 new cases mirroring SetModel/GetModel exactly — default
empty response, DB error, upsert with restart trigger, empty-clears,
invalid-UUID rejection.
Part of: Option B PR-2 (#196) — workspace-server plumbs LLM_PROVIDER
Stack: PR-1 schema (#2441 merged)
PR-2 (this) ws-server endpoint
PR-3 (#364 open) CP user-data persistence
PR-4 (pending) hermes adapter consume
PR-5 (pending) canvas Provider dropdown
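For reference, a minimal sketch of the new endpoint pair (base URL and
token are placeholders; wsAuth expects the workspace's own bearer
token, and response bodies match the handler/tests in this patch):

```bash
BASE=https://workspace-server.example.invalid
WS=00000000-0000-0000-0000-000000000003

# Read the current override — empty provider + source=default means
# "auto-derive from the model slug", not an error.
curl -fsS -H "Authorization: Bearer $WS_TOKEN" "$BASE/workspaces/$WS/provider"
#   {"provider":"","source":"default"}

# Set an explicit provider (triggers the server-side auto-restart):
curl -fsS -X PUT -H "Authorization: Bearer $WS_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{"provider":"minimax"}' "$BASE/workspaces/$WS/provider"
#   {"provider":"minimax","status":"saved"}

# Clear the override (deletes the workspace_secrets row, also restarts):
curl -fsS -X PUT -H "Authorization: Bearer $WS_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{"provider":""}' "$BASE/workspaces/$WS/provider"
#   {"status":"cleared"}
```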
---
workspace-server/internal/handlers/secrets.go | 106 +++++++++++++
.../internal/handlers/secrets_test.go | 146 ++++++++++++++++++
workspace-server/internal/router/router.go | 2 +
3 files changed, 254 insertions(+)
diff --git a/workspace-server/internal/handlers/secrets.go b/workspace-server/internal/handlers/secrets.go
index 3766068d..4d88be38 100644
--- a/workspace-server/internal/handlers/secrets.go
+++ b/workspace-server/internal/handlers/secrets.go
@@ -533,3 +533,109 @@ func (h *SecretsHandler) SetModel(c *gin.Context) {
}
c.JSON(http.StatusOK, gin.H{"status": "saved", "model": body.Model})
}
+
+// GetProvider handles GET /workspaces/:id/provider
+// Returns the explicit LLM provider override stored as the LLM_PROVIDER
+// workspace secret. Mirror of GetModel — same shape, same response keys
+// (provider/source) to keep canvas wiring symmetric.
+//
+// Why a sibling endpoint rather than overloading PUT /model: the new
+// `provider` field (Option B, PR #2441) is orthogonal to the model
+// slug. A user might keep the same model alias and switch providers
+// (e.g., route the same alias through a different gateway), or keep
+// the same provider and switch models. Co-storing them under one
+// endpoint forces a single Save+Restart round-trip per change; two
+// endpoints let the canvas update each independently.
+func (h *SecretsHandler) GetProvider(c *gin.Context) {
+ workspaceID := c.Param("id")
+ ctx := c.Request.Context()
+
+ var bytesVal []byte
+ var version int
+ err := db.DB.QueryRowContext(ctx,
+ `SELECT encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = $1 AND key = 'LLM_PROVIDER'`,
+ workspaceID).Scan(&bytesVal, &version)
+ if err == sql.ErrNoRows {
+ c.JSON(http.StatusOK, gin.H{"provider": "", "source": "default"})
+ return
+ }
+ if err != nil {
+ c.JSON(http.StatusInternalServerError, gin.H{"error": "query failed"})
+ return
+ }
+
+ decrypted, err := crypto.DecryptVersioned(bytesVal, version)
+ if err != nil {
+ c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to decrypt"})
+ return
+ }
+
+ c.JSON(http.StatusOK, gin.H{"provider": string(decrypted), "source": "workspace_secrets"})
+}
+
+// SetProvider handles PUT /workspaces/:id/provider — writes the provider
+// slug into workspace_secrets as LLM_PROVIDER. Empty string clears the
+// override. Triggers auto-restart so the new env is in effect on the
+// next boot — without this the canvas Save+Restart can race the
+// already-restarting container and miss the window.
+//
+// CP user-data (controlplane PR #364) reads LLM_PROVIDER from env and
+// writes it into /configs/config.yaml at boot, so the choice survives
+// restart. Without that PR this endpoint still works but the value is
+// only sticky when the workspace_secrets row is read on every restart
+// (the secret-load path) — slower failure mode, same eventual behavior.
+func (h *SecretsHandler) SetProvider(c *gin.Context) {
+ workspaceID := c.Param("id")
+ if !uuidRegex.MatchString(workspaceID) {
+ c.JSON(http.StatusBadRequest, gin.H{"error": "invalid workspace ID"})
+ return
+ }
+ ctx := c.Request.Context()
+
+ var body struct {
+ Provider string `json:"provider"`
+ }
+ if err := c.ShouldBindJSON(&body); err != nil {
+ c.JSON(http.StatusBadRequest, gin.H{"error": "invalid request body"})
+ return
+ }
+
+ if body.Provider == "" {
+ if _, err := db.DB.ExecContext(ctx,
+ `DELETE FROM workspace_secrets WHERE workspace_id = $1 AND key = 'LLM_PROVIDER'`,
+ workspaceID); err != nil {
+ log.Printf("SetProvider delete error: %v", err)
+ c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to clear provider"})
+ return
+ }
+ if h.restartFunc != nil {
+ go h.restartFunc(workspaceID)
+ }
+ c.JSON(http.StatusOK, gin.H{"status": "cleared"})
+ return
+ }
+
+ encrypted, err := crypto.Encrypt([]byte(body.Provider))
+ if err != nil {
+ log.Printf("SetProvider encrypt error: %v", err)
+ c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to encrypt provider"})
+ return
+ }
+ version := crypto.CurrentEncryptionVersion()
+ _, err = db.DB.ExecContext(ctx, `
+ INSERT INTO workspace_secrets (workspace_id, key, encrypted_value, encryption_version)
+ VALUES ($1, 'LLM_PROVIDER', $2, $3)
+ ON CONFLICT (workspace_id, key) DO UPDATE
+ SET encrypted_value = $2, encryption_version = $3, updated_at = now()
+ `, workspaceID, encrypted, version)
+ if err != nil {
+ log.Printf("SetProvider upsert error: %v", err)
+ c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to save provider"})
+ return
+ }
+
+ if h.restartFunc != nil {
+ go h.restartFunc(workspaceID)
+ }
+ c.JSON(http.StatusOK, gin.H{"status": "saved", "provider": body.Provider})
+}
diff --git a/workspace-server/internal/handlers/secrets_test.go b/workspace-server/internal/handlers/secrets_test.go
index 78e66a16..648f4e19 100644
--- a/workspace-server/internal/handlers/secrets_test.go
+++ b/workspace-server/internal/handlers/secrets_test.go
@@ -618,6 +618,152 @@ func TestSecretsSetModel_InvalidID(t *testing.T) {
}
}
+// ==================== GetProvider / SetProvider (Option B PR-2) ====================
+//
+// Mirror of the GetModel/SetModel suite. Same secret-storage shape (key=
+// 'LLM_PROVIDER' instead of 'MODEL_PROVIDER'), same restart-trigger
+// contract, same UUID validation gate. We pin the contract symmetrically
+// so a future refactor that breaks one without the other shows up in CI.
+
+func TestSecretsGetProvider_Default(t *testing.T) {
+ mock := setupTestDB(t)
+ setupTestRedis(t)
+ handler := NewSecretsHandler(nil)
+
+ mock.ExpectQuery("SELECT encrypted_value, encryption_version FROM workspace_secrets").
+ WithArgs("ws-prov").
+ WillReturnError(sql.ErrNoRows)
+
+ w := httptest.NewRecorder()
+ c, _ := gin.CreateTestContext(w)
+ c.Params = gin.Params{{Key: "id", Value: "ws-prov"}}
+ c.Request = httptest.NewRequest("GET", "/workspaces/ws-prov/provider", nil)
+
+ handler.GetProvider(c)
+
+ if w.Code != http.StatusOK {
+ t.Errorf("expected status 200, got %d: %s", w.Code, w.Body.String())
+ }
+
+ var resp map[string]interface{}
+ if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+ t.Fatalf("failed to parse response: %v", err)
+ }
+ if resp["provider"] != "" {
+ t.Errorf("expected empty provider, got %v", resp["provider"])
+ }
+ if resp["source"] != "default" {
+ t.Errorf("expected source 'default', got %v", resp["source"])
+ }
+ if err := mock.ExpectationsWereMet(); err != nil {
+ t.Errorf("unmet sqlmock expectations: %v", err)
+ }
+}
+
+func TestSecretsGetProvider_DBError(t *testing.T) {
+ mock := setupTestDB(t)
+ setupTestRedis(t)
+ handler := NewSecretsHandler(nil)
+
+ mock.ExpectQuery("SELECT encrypted_value, encryption_version FROM workspace_secrets").
+ WithArgs("ws-prov-err").
+ WillReturnError(sql.ErrConnDone)
+
+ w := httptest.NewRecorder()
+ c, _ := gin.CreateTestContext(w)
+ c.Params = gin.Params{{Key: "id", Value: "ws-prov-err"}}
+ c.Request = httptest.NewRequest("GET", "/workspaces/ws-prov-err/provider", nil)
+
+ handler.GetProvider(c)
+
+ if w.Code != http.StatusInternalServerError {
+ t.Errorf("expected status 500, got %d: %s", w.Code, w.Body.String())
+ }
+ if err := mock.ExpectationsWereMet(); err != nil {
+ t.Errorf("unmet sqlmock expectations: %v", err)
+ }
+}
+
+func TestSecretsSetProvider_Upsert(t *testing.T) {
+ mock := setupTestDB(t)
+ setupTestRedis(t)
+ restartCalled := make(chan string, 1)
+ handler := NewSecretsHandler(func(id string) { restartCalled <- id })
+
+ mock.ExpectExec(`INSERT INTO workspace_secrets`).
+ WithArgs("00000000-0000-0000-0000-000000000003", sqlmock.AnyArg(), sqlmock.AnyArg()).
+ WillReturnResult(sqlmock.NewResult(1, 1))
+
+ w := httptest.NewRecorder()
+ c, _ := gin.CreateTestContext(w)
+ c.Params = gin.Params{{Key: "id", Value: "00000000-0000-0000-0000-000000000003"}}
+ c.Request = httptest.NewRequest("PUT", "/workspaces/00000000-0000-0000-0000-000000000003/provider",
+ strings.NewReader(`{"provider":"minimax"}`))
+ c.Request.Header.Set("Content-Type", "application/json")
+
+ handler.SetProvider(c)
+
+ if w.Code != http.StatusOK {
+ t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
+ }
+ select {
+ case id := <-restartCalled:
+ if id != "00000000-0000-0000-0000-000000000003" {
+ t.Errorf("restart called with wrong id: %s", id)
+ }
+ case <-time.After(500 * time.Millisecond):
+ t.Error("restart was not triggered")
+ }
+ if err := mock.ExpectationsWereMet(); err != nil {
+ t.Errorf("unmet sqlmock expectations: %v", err)
+ }
+}
+
+func TestSecretsSetProvider_EmptyClears(t *testing.T) {
+ mock := setupTestDB(t)
+ setupTestRedis(t)
+ handler := NewSecretsHandler(func(string) {})
+
+ mock.ExpectExec(`DELETE FROM workspace_secrets`).
+ WithArgs("00000000-0000-0000-0000-000000000004").
+ WillReturnResult(sqlmock.NewResult(0, 1))
+
+ w := httptest.NewRecorder()
+ c, _ := gin.CreateTestContext(w)
+ c.Params = gin.Params{{Key: "id", Value: "00000000-0000-0000-0000-000000000004"}}
+ c.Request = httptest.NewRequest("PUT", "/workspaces/00000000-0000-0000-0000-000000000004/provider",
+ strings.NewReader(`{"provider":""}`))
+ c.Request.Header.Set("Content-Type", "application/json")
+
+ handler.SetProvider(c)
+
+ if w.Code != http.StatusOK {
+ t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String())
+ }
+ if err := mock.ExpectationsWereMet(); err != nil {
+ t.Errorf("unmet sqlmock expectations: %v", err)
+ }
+}
+
+func TestSecretsSetProvider_InvalidID(t *testing.T) {
+ setupTestDB(t)
+ setupTestRedis(t)
+ handler := NewSecretsHandler(nil)
+
+ w := httptest.NewRecorder()
+ c, _ := gin.CreateTestContext(w)
+ c.Params = gin.Params{{Key: "id", Value: "not-a-uuid"}}
+ c.Request = httptest.NewRequest("PUT", "/workspaces/not-a-uuid/provider",
+ strings.NewReader(`{"provider":"x"}`))
+ c.Request.Header.Set("Content-Type", "application/json")
+
+ handler.SetProvider(c)
+
+ if w.Code != http.StatusBadRequest {
+ t.Errorf("expected 400 for bad UUID, got %d", w.Code)
+ }
+}
+
// ==================== Values — Phase 30.2 decrypted pull ====================
// These tests target the secrets.Values handler (GET /workspaces/:id/secrets/values)
diff --git a/workspace-server/internal/router/router.go b/workspace-server/internal/router/router.go
index 5373ed0f..0a5459fc 100644
--- a/workspace-server/internal/router/router.go
+++ b/workspace-server/internal/router/router.go
@@ -329,6 +329,8 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi
wsAuth.DELETE("/secrets/:key", sech.Delete)
wsAuth.GET("/model", sech.GetModel)
wsAuth.PUT("/model", sech.SetModel)
+ wsAuth.GET("/provider", sech.GetProvider)
+ wsAuth.PUT("/provider", sech.SetProvider)
// Token usage metrics — cost transparency (#593).
// WorkspaceAuth middleware (on wsAuth) binds the bearer to :id.
From 517bd0efc562e6789c8e4901556cbc2ed9b89583 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Fri, 1 May 2026 11:19:17 -0700
Subject: [PATCH 14/61] feat(canvas+workspace-server): data-driven Provider
dropdown (#199)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Option B PR-5. Canvas Config tab now exposes a Provider override input
that's adapter-driven from each runtime's template — no hardcoded
provider list in the canvas. PUTs /workspaces/:id/provider on Save
only when the value changed, and suppresses the canvas-side restart
when the provider handler's server-side auto-restart will already
fire, avoiding a double restart.
The dropdown's suggestion list comes from /templates →
runtime_config.providers (the field added in
molecule-ai-workspace-template-hermes PR #31). For templates that
haven't migrated to the explicit providers list yet, suggestions
derive from model[].id slug prefixes — still adapter-driven, just
inferred. This keeps existing templates working while the platform
team migrates them one at a time.
workspace-server changes:
- Add Providers []string field to templateSummary JSON
- Parse runtime_config.providers in /templates handler
- 2 new tests pin the surfacing + omitempty behavior
canvas changes:
- Remove hardcoded PROVIDER_SUGGESTIONS constant
- Add provider/originalProvider state + PUT-on-save logic
- Add deriveProvidersFromModels() fallback helper
- Wire RuntimeOption.providers from /templates response
- 8 new tests pin the behavior end-to-end
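For reference, a minimal sketch of the adapter-driven flow (host is a
placeholder and auth is omitted; the hermes/claude-code lists mirror
the examples cited in the handler comments):

```bash
# Each runtime's template declares runtime_config.providers in its own
# config.yaml; /templates echoes it as `providers`, and the canvas
# datalist mirrors that list verbatim.
curl -fsS https://workspace-server.example.invalid/templates \
  | jq '.[] | {runtime, providers}'
#   {"runtime":"hermes","providers":["nous","openrouter","anthropic", ...]}
#   {"runtime":"claude-code","providers":["anthropic"]}
# Templates that omit runtime_config.providers emit no `providers` key
# at all (omitempty); the canvas then derives suggestions from
# models[].id vendor prefixes instead.
```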
Co-Authored-By: Claude Opus 4.7 (1M context)
---
canvas/src/components/tabs/ConfigTab.tsx | 192 ++++++++--
.../__tests__/ConfigTab.provider.test.tsx | 332 ++++++++++++++++++
.../internal/handlers/templates.go | 12 +
.../internal/handlers/templates_test.go | 111 ++++++
4 files changed, 627 insertions(+), 20 deletions(-)
create mode 100644 canvas/src/components/tabs/__tests__/ConfigTab.provider.test.tsx
diff --git a/canvas/src/components/tabs/ConfigTab.tsx b/canvas/src/components/tabs/ConfigTab.tsx
index e1227d67..f46ff538 100644
--- a/canvas/src/components/tabs/ConfigTab.tsx
+++ b/canvas/src/components/tabs/ConfigTab.tsx
@@ -100,6 +100,42 @@ interface RuntimeOption {
value: string;
label: string;
models: ModelSpec[];
+ // providers is the declarative provider list each template ships in
+ // its config.yaml under runtime_config.providers. The /templates API
+ // surfaces it (workspace-server templates.go) so canvas stays
+ // adapter-driven: hermes ships ~20 slugs, claude-code ships
+ // ["anthropic"], gemini-cli ships ["gemini"], etc. Empty list →
+ // canvas falls back to deriving unique vendor prefixes from
+ // models[].id (still adapter-driven, just inferred).
+ providers: string[];
+}
+
+// deriveProvidersFromModels — when a template doesn't ship an explicit
+// providers list, infer suggestions from the vendor prefixes of its
+// model slugs. e.g. ["anthropic:claude-opus-4-7", "openai:gpt-4o",
+// "anthropic:claude-sonnet-4-5"] → ["anthropic", "openai"].
+//
+// This keeps the dropdown adapter-driven for older templates that
+// haven't migrated to the explicit `providers:` field yet, AND
+// continues to be a useful fallback for any future runtime whose
+// derive-provider semantics happen to match the slug prefix.
+function deriveProvidersFromModels(models: ModelSpec[]): string[] {
+ const seen = new Set();
+ const out: string[] = [];
+ for (const m of models) {
+ if (!m.id) continue;
+ // Both ":" (anthropic:claude-opus-4-7) and "/" (nousresearch/hermes-4-70b)
+ // are valid vendor separators in our slug taxonomy. Take whichever
+ // appears first and split there.
+ const sep = m.id.match(/[:/]/)?.index ?? -1;
+ if (sep <= 0) continue;
+ const vendor = m.id.slice(0, sep);
+ if (!seen.has(vendor)) {
+ seen.add(vendor);
+ out.push(vendor);
+ }
+ }
+ return out;
}
// Fallback used when /templates can't be fetched (offline, older backend).
@@ -118,14 +154,14 @@ interface RuntimeOption {
const RUNTIMES_WITH_OWN_CONFIG = new Set(["external"]);
const FALLBACK_RUNTIME_OPTIONS: RuntimeOption[] = [
- { value: "", label: "LangGraph (default)", models: [] },
- { value: "claude-code", label: "Claude Code", models: [] },
- { value: "crewai", label: "CrewAI", models: [] },
- { value: "autogen", label: "AutoGen", models: [] },
- { value: "deepagents", label: "DeepAgents", models: [] },
- { value: "openclaw", label: "OpenClaw", models: [] },
- { value: "hermes", label: "Hermes", models: [] },
- { value: "gemini-cli", label: "Gemini CLI", models: [] },
+ { value: "", label: "LangGraph (default)", models: [], providers: [] },
+ { value: "claude-code", label: "Claude Code", models: [], providers: [] },
+ { value: "crewai", label: "CrewAI", models: [], providers: [] },
+ { value: "autogen", label: "AutoGen", models: [], providers: [] },
+ { value: "deepagents", label: "DeepAgents", models: [], providers: [] },
+ { value: "openclaw", label: "OpenClaw", models: [], providers: [] },
+ { value: "hermes", label: "Hermes", models: [], providers: [] },
+ { value: "gemini-cli", label: "Gemini CLI", models: [], providers: [] },
];
export function ConfigTab({ workspaceId }: Props) {
@@ -138,6 +174,17 @@ export function ConfigTab({ workspaceId }: Props) {
const [rawMode, setRawMode] = useState(false);
const [rawDraft, setRawDraft] = useState("");
const [runtimeOptions, setRuntimeOptions] = useState(FALLBACK_RUNTIME_OPTIONS);
+ // Provider override (Option B PR-5): stored separately from config.yaml
+ // because the value lives in workspace_secrets (encrypted), not in the
+ // platform-managed config.yaml. The two endpoints are GET/PUT
+ // /workspaces/:id/provider on workspace-server (handlers/secrets.go).
+ // Empty = "auto-derive from model slug prefix" — pre-Option-B behavior
+ // and what most users want. Setting to a non-empty value writes
+ // LLM_PROVIDER into workspace_secrets and triggers an auto-restart so
+ // the workspace boots with the new provider in env (and via CP user-
+ // data, written into /configs/config.yaml on next provision too).
+ const [provider, setProvider] = useState("");
+ const [originalProvider, setOriginalProvider] = useState("");
const successTimerRef = useRef>(undefined);
useEffect(() => {
@@ -168,6 +215,22 @@ export function ConfigTab({ workspaceId }: Props) {
wsMetadataModel = (m.model || "").trim();
} catch { /* non-fatal */ }
+ // Load explicit provider override (Option B PR-5). Endpoint returns
+ // {provider: "", source: "default"} when no override is set, so the
+ // empty string is the legitimate "auto-derive" signal — don't treat
+ // it as a load error. Non-fatal: an older workspace-server that
+ // predates PR-2 returns 404 here; the form falls back to "" and
+ // Save just won't PUT the provider field.
+ try {
+ const p = await api.get<{ provider?: string }>(`/workspaces/${workspaceId}/provider`);
+ const loadedProvider = (p.provider || "").trim();
+ setProvider(loadedProvider);
+ setOriginalProvider(loadedProvider);
+ } catch {
+ setProvider("");
+ setOriginalProvider("");
+ }
+
try {
const res = await api.get<{ content: string }>(`/workspaces/${workspaceId}/files/config.yaml`);
const parsed = parseYaml(res.content);
@@ -209,11 +272,11 @@ export function ConfigTab({ workspaceId }: Props) {
useEffect(() => {
let cancelled = false;
- api.get>("/templates")
+ api.get>("/templates")
.then((rows) => {
if (cancelled || !Array.isArray(rows)) return;
const byRuntime = new Map();
- byRuntime.set("", { value: "", label: "LangGraph (default)", models: [] });
+ byRuntime.set("", { value: "", label: "LangGraph (default)", models: [], providers: [] });
for (const r of rows) {
const v = (r.runtime || "").trim();
if (!v || v === "langgraph") continue;
@@ -221,8 +284,9 @@ export function ConfigTab({ workspaceId }: Props) {
// one with the richer models list is probably newer.
const existing = byRuntime.get(v);
const models = Array.isArray(r.models) ? r.models : [];
+ const providers = Array.isArray(r.providers) ? r.providers : [];
if (!existing || models.length > existing.models.length) {
- byRuntime.set(v, { value: v, label: r.name || v, models });
+ byRuntime.set(v, { value: v, label: r.name || v, models, providers });
}
}
if (byRuntime.size > 1) setRuntimeOptions(Array.from(byRuntime.values()));
@@ -234,6 +298,16 @@ export function ConfigTab({ workspaceId }: Props) {
// Models + env hints for the currently-selected runtime.
const selectedRuntime = runtimeOptions.find((o) => o.value === (config.runtime || "")) ?? null;
const availableModels: ModelSpec[] = selectedRuntime?.models ?? [];
+ // Provider suggestions: prefer the runtime's declarative providers
+ // list (sourced from its template config.yaml runtime_config.providers
+ // and surfaced via /templates), fall back to deriving from model slug
+ // prefixes when the template hasn't migrated to the explicit field
+ // yet. Either way the data flows from the adapter — no hardcoded
+ // canvas-side enum.
+ const providerSuggestions: string[] =
+ (selectedRuntime?.providers && selectedRuntime.providers.length > 0)
+ ? selectedRuntime.providers
+ : deriveProvidersFromModels(availableModels);
const currentModelId = config.runtime_config?.model || config.model || "";
const currentModelSpec = availableModels.find((m) => m.id === currentModelId) ?? null;
@@ -334,6 +408,24 @@ export function ConfigTab({ workspaceId }: Props) {
}
}
+ // Provider override save (Option B PR-5). PUT only when the user
+ // changed the dropdown — otherwise an unrelated Save (e.g. tier
+ // edit) would re-write the provider unchanged and the server-
+ // side auto-restart would fire on every Save, costing the user a
+ // ~30s reboot for a no-op change. Server endpoint accepts an
+ // empty string to clear the override (deletes the
+ // workspace_secrets row); we forward whatever the form holds.
+ let providerSaveError: string | null = null;
+ const providerChanged = provider !== originalProvider;
+ if (providerChanged) {
+ try {
+ await api.put(`/workspaces/${workspaceId}/provider`, { provider });
+ setOriginalProvider(provider);
+ } catch (e) {
+ providerSaveError = e instanceof Error ? e.message : "Provider update was rejected";
+ }
+ }
+
setOriginalYaml(content);
if (rawMode) {
const parsed = parseYaml(content);
@@ -341,16 +433,30 @@ export function ConfigTab({ workspaceId }: Props) {
} else {
setRawDraft(content);
}
- if (restart) {
+ // SetProvider on the server already triggers an auto-restart for
+ // the workspace whenever the value actually changed (see
+ // workspace-server/internal/handlers/secrets.go:SetProvider). If
+ // the user also clicked Save+Restart we'd kick off a SECOND
+ // restart here and the two would race in the canvas store —
+ // suppress the redundant call and rely on the server-side one.
+ const providerWillAutoRestart = providerChanged && !providerSaveError;
+ if (restart && !providerWillAutoRestart) {
await useCanvasStore.getState().restartWorkspace(workspaceId);
- } else {
- useCanvasStore.getState().updateNodeData(workspaceId, { needsRestart: true });
+ } else if (!restart) {
+ useCanvasStore.getState().updateNodeData(workspaceId, { needsRestart: !providerWillAutoRestart });
}
- if (modelSaveError) {
- // Partial-save UX: surface the model rejection instead of
- // showing "Saved" — the user would otherwise watch the model
- // field revert on next reload with no explanation.
- setError(`Other fields saved, but model update failed: ${modelSaveError}`);
+ // Aggregate partial-save errors. Both modelSaveError and
+ // providerSaveError describe rejected updates from independent
+ // endpoints — show whichever fired so the user knows which
+ // field reverts on next reload (otherwise they'd see "Saved" and
+ // be confused why Provider snapped back).
+ const partialError = providerSaveError
+ ? `Other fields saved, but provider update failed: ${providerSaveError}`
+ : modelSaveError
+ ? `Other fields saved, but model update failed: ${modelSaveError}`
+ : null;
+ if (partialError) {
+ setError(partialError);
} else {
setSuccess(true);
clearTimeout(successTimerRef.current);
@@ -371,7 +477,8 @@ export function ConfigTab({ workspaceId }: Props) {
const taskBudgetId = useId();
const sandboxBackendId = useId();
- const isDirty = rawMode ? rawDraft !== originalYaml : toYaml(config) !== originalYaml;
+ const providerDirty = provider !== originalProvider;
+ const isDirty = (rawMode ? rawDraft !== originalYaml : toYaml(config) !== originalYaml) || providerDirty;
if (loading) {
return
Loading config...
;
@@ -518,6 +625,51 @@ export function ConfigTab({ workspaceId }: Props) {
)}
+ {/* Provider override (Option B PR-5). Free-text combobox so
+ operators can use any of the 30+ slugs hermes-agent's
+ derive-provider.sh recognizes — the suggestion list is
+ a hint, not a constraint. Empty = "auto-derive from
+ model slug prefix" which is correct for the common case
+ (model "anthropic:claude-opus-4-7" → provider derived
+ as "anthropic"). The override is needed when the model
+ alias has no clean vendor prefix (e.g. hermes default
+ "nousresearch/hermes-4-70b" → derive returns empty →
+ hermes errors "No LLM provider configured"). */}
+
+ Provider change → workspace will auto-restart on Save.
+
+ )}
+
({
+ api: {
+ get: (path: string) => apiGet(path),
+ patch: (path: string, body: unknown) => apiPatch(path, body),
+ put: (path: string, body: unknown) => apiPut(path, body),
+ post: vi.fn(),
+ del: vi.fn(),
+ },
+}));
+
+vi.mock("@/store/canvas", () => ({
+ useCanvasStore: Object.assign(
+ (selector: (s: unknown) => unknown) => selector({ restartWorkspace: vi.fn(), updateNodeData: vi.fn() }),
+ { getState: () => ({ restartWorkspace: vi.fn(), updateNodeData: vi.fn() }) },
+ ),
+}));
+
+vi.mock("../AgentCardSection", () => ({
+ AgentCardSection: () => ,
+}));
+
+import { ConfigTab } from "../ConfigTab";
+
+// wireApi — same shape as ConfigTab.hermes.test.tsx, extended with the
+// /provider endpoint. Each test sets `providerValue` to the value the
+// GET endpoint returns; "missing" means the endpoint rejects (older
+// workspace-server pre-PR-2 — must not crash the tab).
+function wireApi(opts: {
+ workspaceRuntime?: string;
+ workspaceModel?: string;
+ configYamlContent?: string | null;
+ templates?: Array<{ id: string; name?: string; runtime?: string; models?: unknown[]; providers?: string[] }>;
+ providerValue?: string | "missing";
+}) {
+ apiGet.mockImplementation((path: string) => {
+ if (path === `/workspaces/ws-test`) {
+ return Promise.resolve({ runtime: opts.workspaceRuntime ?? "" });
+ }
+ if (path === `/workspaces/ws-test/model`) {
+ return Promise.resolve({ model: opts.workspaceModel ?? "" });
+ }
+ if (path === `/workspaces/ws-test/provider`) {
+ if (opts.providerValue === "missing") {
+ return Promise.reject(new Error("404"));
+ }
+ return Promise.resolve({ provider: opts.providerValue ?? "", source: opts.providerValue ? "workspace_secrets" : "default" });
+ }
+ if (path === `/workspaces/ws-test/files/config.yaml`) {
+ if (opts.configYamlContent === null) return Promise.reject(new Error("not found"));
+ return Promise.resolve({ content: opts.configYamlContent ?? "" });
+ }
+ if (path === "/templates") {
+ return Promise.resolve(opts.templates ?? []);
+ }
+ return Promise.reject(new Error(`unmocked api.get: ${path}`));
+ });
+}
+
+beforeEach(() => {
+ apiGet.mockReset();
+ apiPatch.mockReset();
+ apiPut.mockReset();
+});
+
+describe("ConfigTab — Provider override (Option B PR-5)", () => {
+ // Empty provider on load is the legitimate default ("auto-derive
+ // from model slug prefix"), NOT an error. The endpoint returning
+ // {provider: "", source: "default"} is the documented happy-path
+ // shape — if the form treated that as "load failed" we'd lose the
+ // ability to render the input at all on fresh workspaces.
+ it("renders an empty Provider input when no override is set", async () => {
+ wireApi({
+ workspaceRuntime: "hermes",
+ workspaceModel: "nousresearch/hermes-4-70b",
+ configYamlContent: "name: ws\nruntime: hermes\n",
+ providerValue: "",
+ });
+
+ render(<ConfigTab workspaceId="ws-test" />);
+ const input = await screen.findByTestId("provider-input");
+ expect((input as HTMLInputElement).value).toBe("");
+ });
+
+ // Pre-existing override loads back into the field on mount. Without
+ // this, an operator who set provider=openrouter yesterday would see
+ // the field blank today, conclude the value didn't stick, and
+ // re-save — the resulting PUT-with-same-value would auto-restart
+ // the workspace for nothing.
+ it("loads an existing provider override from the server", async () => {
+ wireApi({
+ workspaceRuntime: "hermes",
+ workspaceModel: "nousresearch/hermes-4-70b",
+ configYamlContent: "name: ws\nruntime: hermes\n",
+ providerValue: "openrouter",
+ });
+
+ render(<ConfigTab workspaceId="ws-test" />);
+ const input = await screen.findByTestId("provider-input");
+ await waitFor(() => expect((input as HTMLInputElement).value).toBe("openrouter"));
+ });
+
+ // Old workspace-server (pre-PR-2) returns a 404 on /provider. The
+ // tab must keep loading — the fallback is "" (auto-derive), same as
+ // a fresh workspace.
+ it("falls back to empty provider when the endpoint is missing", async () => {
+ wireApi({
+ workspaceRuntime: "hermes",
+ workspaceModel: "nousresearch/hermes-4-70b",
+ configYamlContent: "name: ws\nruntime: hermes\n",
+ providerValue: "missing",
+ });
+
+ render(<ConfigTab workspaceId="ws-test" />);
+ const input = await screen.findByTestId("provider-input");
+ expect((input as HTMLInputElement).value).toBe("");
+ // Tab should be fully rendered, not stuck in loading or error state.
+ expect(screen.queryByText(/Loading config/i)).toBeNull();
+ });
+
+ // Setting a value + Save must PUT to the right endpoint with the
+ // right body shape. Server-side handler (workspace-server
+ // handlers/secrets.go:SetProvider) reads body.provider — any other
+ // key gets silently ignored and the workspace_secrets row stays
+ // unset. This regression would manifest as "Save → Restart →
+ // workspace still says No LLM provider configured."
+ it("PUTs the new provider to /workspaces/:id/provider on Save", async () => {
+ wireApi({
+ workspaceRuntime: "hermes",
+ workspaceModel: "nousresearch/hermes-4-70b",
+ configYamlContent: "name: ws\nruntime: hermes\n",
+ providerValue: "",
+ });
+ apiPut.mockResolvedValue({ status: "saved", provider: "anthropic" });
+
+ render(<ConfigTab workspaceId="ws-test" />);
+ const input = await screen.findByTestId("provider-input");
+
+ fireEvent.change(input, { target: { value: "anthropic" } });
+ expect((input as HTMLInputElement).value).toBe("anthropic");
+
+ const saveBtn = screen.getByRole("button", { name: /^save$/i });
+ fireEvent.click(saveBtn);
+
+ await waitFor(() => {
+ const providerCalls = apiPut.mock.calls.filter(([path]) => path === "/workspaces/ws-test/provider");
+ expect(providerCalls.length).toBe(1);
+ expect(providerCalls[0][1]).toEqual({ provider: "anthropic" });
+ });
+ });
+
+ // No-change Save must NOT PUT /provider. The server-side SetProvider
+ // auto-restarts the workspace on every successful PUT — re-writing
+ // an unchanged value would cost the user a ~30s reboot every time
+ // they tweak some other field.
+ it("does not PUT /provider when the value is unchanged", async () => {
+ wireApi({
+ workspaceRuntime: "hermes",
+ workspaceModel: "nousresearch/hermes-4-70b",
+ configYamlContent: "name: ws\nruntime: hermes\ntier: 2\n",
+ providerValue: "openrouter",
+ });
+ apiPut.mockResolvedValue({});
+
+ render(<ConfigTab workspaceId="ws-test" />);
+ await screen.findByTestId("provider-input");
+
+ // Click Save without touching the provider field. Trigger another
+ // dirty-marker (tier change) so Save is enabled — the test is
+ // about NOT touching /provider, not about Save being disabled.
+ const tierSelect = screen.getByLabelText(/tier/i) as HTMLSelectElement;
+ fireEvent.change(tierSelect, { target: { value: "3" } });
+
+ const saveBtn = screen.getByRole("button", { name: /^save$/i });
+ fireEvent.click(saveBtn);
+
+ await waitFor(() => {
+ // Some PUT(s) may fire (e.g. /model). Just assert /provider is NOT among them.
+ const providerCalls = apiPut.mock.calls.filter(([path]) => path === "/workspaces/ws-test/provider");
+ expect(providerCalls.length).toBe(0);
+ });
+ });
+
+ // The dropdown's suggestion list MUST come from the runtime's own
+ // template (via /templates → runtime_config.providers), not a
+ // hardcoded canvas-side enum. This is the "Native + pluggable
+ // runtime" invariant: a new runtime declaring its own provider
+ // taxonomy in its config.yaml gets a working dropdown without ANY
+ // canvas-side change.
+ //
+ // Pinned by checking that suggestions surfaced in the datalist
+ // exactly mirror what the templates endpoint returned for the
+ // matching runtime. If a future contributor reintroduces a
+ // PROVIDER_SUGGESTIONS-style hardcoded list and the datalist
+ // contents don't follow the template, this test fails.
+ it("populates the provider datalist from the matched runtime's templates entry", async () => {
+ wireApi({
+ workspaceRuntime: "hermes",
+ workspaceModel: "nousresearch/hermes-4-70b",
+ configYamlContent: "name: ws\nruntime: hermes\n",
+ providerValue: "",
+ templates: [
+ {
+ id: "hermes",
+ name: "Hermes",
+ runtime: "hermes",
+ models: [],
+ // The provider list every runtime adapter ships in its own
+ // config.yaml. Canvas must surface THIS, not its own list.
+ providers: ["nous", "openrouter", "anthropic", "minimax-cn"],
+ },
+ ],
+ });
+
+ render(<ConfigTab workspaceId="ws-test" />);
+ const input = await screen.findByTestId("provider-input");
+ const listId = (input as HTMLInputElement).getAttribute("list");
+ expect(listId).toBeTruthy();
+ await waitFor(() => {
+ const datalist = document.getElementById(listId!);
+ expect(datalist).not.toBeNull();
+ const optionValues = Array.from(datalist!.querySelectorAll("option")).map(
+ (o) => (o as HTMLOptionElement).value,
+ );
+ // Order matters — most-common-first is part of the contract so
+ // the demo flow lands on a working choice without scrolling.
+ expect(optionValues).toEqual(["nous", "openrouter", "anthropic", "minimax-cn"]);
+ });
+ });
+
+ // Fallback path: when a template hasn't migrated to the explicit
+ // `providers:` field yet, suggestions are derived from model slug
+ // prefixes. Still adapter-driven (the slugs come from the template's
+ // `models:` list), just inferred. This keeps existing templates
+ // working while the platform team migrates them one at a time.
+ it("falls back to model-slug prefixes when the runtime ships no providers list", async () => {
+ wireApi({
+ workspaceRuntime: "hermes",
+ workspaceModel: "anthropic:claude-opus-4-7",
+ configYamlContent: "name: ws\nruntime: hermes\n",
+ providerValue: "",
+ templates: [
+ {
+ id: "hermes",
+ name: "Hermes",
+ runtime: "hermes",
+ models: [
+ { id: "anthropic:claude-opus-4-7" },
+ { id: "openai:gpt-4o" },
+ { id: "anthropic:claude-sonnet-4-5" }, // dup vendor — must dedupe
+ { id: "nousresearch/hermes-4-70b" }, // "/" separator
+ ],
+ // No `providers:` field → fallback derivation kicks in.
+ },
+ ],
+ });
+
+ render(<ConfigTab workspaceId="ws-test" />);
+ const input = await screen.findByTestId("provider-input");
+ const listId = (input as HTMLInputElement).getAttribute("list");
+ expect(listId).toBeTruthy();
+ await waitFor(() => {
+ const datalist = document.getElementById(listId!);
+ const optionValues = Array.from(datalist!.querySelectorAll("option")).map(
+ (o) => (o as HTMLOptionElement).value,
+ );
+ // Order = first-appearance from models[]; dedup keeps anthropic
+ // once even though two model slugs use it.
+ expect(optionValues).toEqual(["anthropic", "openai", "nousresearch"]);
+ });
+ });
+
+ // Empty string is a legitimate save target — it clears the override
+ // (the server-side endpoint deletes the workspace_secrets row).
+ // Operators who picked "anthropic" yesterday and want to revert to
+ // auto-derive today should be able to do so by clearing the field
+ // and clicking Save. Without this PUT path, the only way to clear
+ // would be a direct DB edit.
+ it("PUTs an empty string when the operator clears a previously-set provider", async () => {
+ wireApi({
+ workspaceRuntime: "hermes",
+ workspaceModel: "anthropic:claude-opus-4-7",
+ configYamlContent: "name: ws\nruntime: hermes\n",
+ providerValue: "openrouter",
+ });
+ apiPut.mockResolvedValue({ status: "cleared" });
+
+ render(<ConfigTab workspaceId="ws-test" />);
+ const input = await screen.findByTestId("provider-input");
+ await waitFor(() => expect((input as HTMLInputElement).value).toBe("openrouter"));
+
+ fireEvent.change(input, { target: { value: "" } });
+
+ const saveBtn = screen.getByRole("button", { name: /^save$/i });
+ fireEvent.click(saveBtn);
+
+ await waitFor(() => {
+ const providerCalls = apiPut.mock.calls.filter(([path]) => path === "/workspaces/ws-test/provider");
+ expect(providerCalls.length).toBe(1);
+ expect(providerCalls[0][1]).toEqual({ provider: "" });
+ });
+ });
+});
diff --git a/workspace-server/internal/handlers/templates.go b/workspace-server/internal/handlers/templates.go
index e33c06d6..1279a524 100644
--- a/workspace-server/internal/handlers/templates.go
+++ b/workspace-server/internal/handlers/templates.go
@@ -59,6 +59,16 @@ type templateSummary struct {
// preflight uses this as the fallback provider when `models` is empty
// so provider picker stays data-driven instead of hardcoded in the UI.
RequiredEnv []string `json:"required_env,omitempty"`
+ // Providers is the runtime's own list of supported provider slugs,
+ // sourced from runtime_config.providers in the template's config.yaml.
+ // The canvas Config tab surfaces this as the Provider override
+ // dropdown (Option B PR-5). Data-driven so each runtime owns its own
+ // taxonomy — hermes-agent supports 20+ providers; claude-code only
+ // "anthropic"; gemini-cli only "gemini" — and a future runtime with
+ // a different vendor list doesn't need a canvas edit. Empty list →
+ // canvas falls back to deriving suggestions from `models[].id` slug
+ // prefixes (still adapter-driven, just inferred).
+ Providers []string `json:"providers,omitempty"`
Skills []string `json:"skills"`
SkillCount int `json:"skill_count"`
// ProvisionTimeoutSeconds lets a slow runtime declare its expected
@@ -100,6 +110,7 @@ func (h *TemplatesHandler) List(c *gin.Context) {
Model string `yaml:"model"`
Models []modelSpec `yaml:"models"`
RequiredEnv []string `yaml:"required_env"`
+ Providers []string `yaml:"providers"`
ProvisionTimeoutSeconds int `yaml:"provision_timeout_seconds"`
} `yaml:"runtime_config"`
}
@@ -122,6 +133,7 @@ func (h *TemplatesHandler) List(c *gin.Context) {
Model: model,
Models: raw.RuntimeConfig.Models,
RequiredEnv: raw.RuntimeConfig.RequiredEnv,
+ Providers: raw.RuntimeConfig.Providers,
Skills: raw.Skills,
SkillCount: len(raw.Skills),
ProvisionTimeoutSeconds: raw.RuntimeConfig.ProvisionTimeoutSeconds,
diff --git a/workspace-server/internal/handlers/templates_test.go b/workspace-server/internal/handlers/templates_test.go
index e40c6b16..6b85715c 100644
--- a/workspace-server/internal/handlers/templates_test.go
+++ b/workspace-server/internal/handlers/templates_test.go
@@ -197,6 +197,117 @@ skills: []
}
}
+// TestTemplatesList_SurfacesProviders pins the Option B PR-5 wiring:
+// /templates must echo runtime_config.providers from the template's
+// config.yaml into the JSON response. Canvas reads this list to
+// populate the Provider override dropdown WITHOUT hardcoding any
+// provider taxonomy on the frontend — that's the "data-driven from
+// adapter" invariant.
+//
+// If a future yaml-tag rename or struct edit drops the field, every
+// runtime would silently fall back to model-prefix derivation. For
+// hermes specifically (default model has no clean prefix), that
+// degrades the dropdown to empty and reintroduces the "No LLM
+// provider configured" UX gap from 2026-05-01.
+func TestTemplatesList_SurfacesProviders(t *testing.T) {
+ setupTestDB(t)
+ setupTestRedis(t)
+
+ tmpDir := t.TempDir()
+ tmplDir := filepath.Join(tmpDir, "hermes-prov")
+ if err := os.MkdirAll(tmplDir, 0755); err != nil {
+ t.Fatalf("mkdir: %v", err)
+ }
+ configYaml := `name: Hermes
+description: test
+tier: 2
+runtime: hermes
+runtime_config:
+ model: nousresearch/hermes-4-70b
+ providers:
+ - nous
+ - openrouter
+ - anthropic
+skills: []
+`
+ if err := os.WriteFile(filepath.Join(tmplDir, "config.yaml"), []byte(configYaml), 0644); err != nil {
+ t.Fatalf("write: %v", err)
+ }
+
+ handler := NewTemplatesHandler(tmpDir, nil)
+ w := httptest.NewRecorder()
+ c, _ := gin.CreateTestContext(w)
+ c.Request = httptest.NewRequest("GET", "/templates", nil)
+ handler.List(c)
+
+ if w.Code != http.StatusOK {
+ t.Fatalf("expected 200, got %d", w.Code)
+ }
+ var resp []templateSummary
+ if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil {
+ t.Fatalf("parse: %v", err)
+ }
+ if len(resp) != 1 {
+ t.Fatalf("expected 1 template, got %d", len(resp))
+ }
+ got := resp[0]
+ want := []string{"nous", "openrouter", "anthropic"}
+ if len(got.Providers) != len(want) {
+ t.Fatalf("Providers: want %v, got %v", want, got.Providers)
+ }
+ for i, p := range want {
+ if got.Providers[i] != p {
+ t.Errorf("Providers[%d]: want %q, got %q", i, p, got.Providers[i])
+ }
+ }
+
+ // Cross-check the JSON wire shape directly — canvas reads the field
+ // as `providers` (lowercase) and a struct-tag rename here would
+ // break consumers without surfacing in the typed assertions above.
+ if !strings.Contains(w.Body.String(), `"providers":["nous","openrouter","anthropic"]`) {
+ t.Errorf("response missing providers JSON field: %s", w.Body.String())
+ }
+}
+
+// TestTemplatesList_OmitsProvidersWhenAbsent pins the omitempty
+// behavior — older templates that haven't migrated to
+// runtime_config.providers yet must NOT emit `providers: null` (which
+// would break canvas's array-typed parser). A template that simply
+// omits the field stays absent in the response and canvas falls back
+// to deriving suggestions from model-slug prefixes.
+func TestTemplatesList_OmitsProvidersWhenAbsent(t *testing.T) {
+ setupTestDB(t)
+ setupTestRedis(t)
+
+ tmpDir := t.TempDir()
+ tmplDir := filepath.Join(tmpDir, "no-prov")
+ if err := os.MkdirAll(tmplDir, 0755); err != nil {
+ t.Fatalf("mkdir: %v", err)
+ }
+ configYaml := `name: Legacy
+runtime: langgraph
+runtime_config:
+ model: anthropic:claude-opus-4-7
+skills: []
+`
+ if err := os.WriteFile(filepath.Join(tmplDir, "config.yaml"), []byte(configYaml), 0644); err != nil {
+ t.Fatalf("write: %v", err)
+ }
+
+ handler := NewTemplatesHandler(tmpDir, nil)
+ w := httptest.NewRecorder()
+ c, _ := gin.CreateTestContext(w)
+ c.Request = httptest.NewRequest("GET", "/templates", nil)
+ handler.List(c)
+
+ if w.Code != http.StatusOK {
+ t.Fatalf("expected 200, got %d", w.Code)
+ }
+ if strings.Contains(w.Body.String(), `"providers":`) {
+ t.Errorf("response should omit providers when template has none, got: %s", w.Body.String())
+ }
+}
+
func TestTemplatesList_LegacyTopLevelModel(t *testing.T) {
// Older templates (pre-runtime_config) declared `model:` at the top level.
// The /templates endpoint should keep surfacing those for backward compat.
From 2e8892ebc414673d9219c2f60934d36b07155237 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Fri, 1 May 2026 11:47:53 -0700
Subject: [PATCH 15/61] fix(workspace): surface errno + path on chat-upload
mkdir failure
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Production incident on hongming.moleculesai.app 2026-05-01T18:30Z —
fresh-tenant signup chat upload returned 500 with the body
{"error":"failed to prepare uploads dir"}. Diagnosis required SSM
access to the workspace stderr to recover errno + actual path.
The root-cause fix lives in claude-code template entrypoint
(molecule-ai-workspace-template-claude-code#23 — pre-create the
.molecule subtree as root before gosu drops to agent). This change
is the diagnostic improvement: when mkdir fails for any reason in
the future (EACCES, ENOSPC, EROFS, etc.), the response carries
the errno + offending path so the operator inspecting browser
devtools sees the real cause without needing SSM.
Backwards compatible — top-level "error" key is unchanged so
existing canvas / external alert rules continue to match. New
fields are additive: path, errno, detail.
Test pins the diagnostic shape so a future struct refactor can't
silently drop these fields.
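For reference, a minimal sketch of the before/after 500 body as seen
from browser devtools or curl (host, token, and the uploads path are
illustrative; the endpoint path and form field match the test below):

```bash
curl -si -H "Authorization: Bearer $WORKSPACE_SECRET" \
  -F 'files=@a.txt' \
  https://workspace.example.invalid/internal/chat/uploads/ingest
# Before: {"error":"failed to prepare uploads dir"}
# After:  {"error":"failed to prepare uploads dir",
#          "path":"/workspace/.molecule/chat-uploads",   # illustrative path
#          "errno":13,                                    # e.g. EACCES
#          "detail":"[Errno 13] Permission denied: '...'"}
```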
Co-Authored-By: Claude Opus 4.7 (1M context)
---
workspace/internal_chat_uploads.py | 19 ++++++++-
workspace/tests/test_internal_chat_uploads.py | 42 +++++++++++++++++++
2 files changed, 60 insertions(+), 1 deletion(-)
diff --git a/workspace/internal_chat_uploads.py b/workspace/internal_chat_uploads.py
index 65a389de..396c1ac8 100644
--- a/workspace/internal_chat_uploads.py
+++ b/workspace/internal_chat_uploads.py
@@ -170,8 +170,25 @@ async def ingest_handler(request: Request) -> JSONResponse:
try:
Path(CHAT_UPLOAD_DIR).mkdir(parents=True, exist_ok=True)
except OSError as exc:
+ # Surface errno + path in the response so a fresh-tenant
+ # "failed to prepare uploads dir" 500 self-diagnoses without
+ # requiring SSM access to the workspace stderr. Prior incident
+ # 2026-05-01: hongming.moleculesai.app hit EACCES on the
+ # /workspace volume's `.molecule` subtree (root-owned race
+ # window between Docker volume create and entrypoint's chown,
+ # fixed via molecule-ai-workspace-template-claude-code#23).
+ # The errno + path are not security-sensitive — both are
+ # well-known to anyone with workspace access.
logger.error("internal_chat_uploads: mkdir %s failed: %s", CHAT_UPLOAD_DIR, exc)
- return JSONResponse({"error": "failed to prepare uploads dir"}, status_code=500)
+ return JSONResponse(
+ {
+ "error": "failed to prepare uploads dir",
+ "path": CHAT_UPLOAD_DIR,
+ "errno": exc.errno,
+ "detail": str(exc),
+ },
+ status_code=500,
+ )
response_files: list[dict] = []
total_bytes = 0
diff --git a/workspace/tests/test_internal_chat_uploads.py b/workspace/tests/test_internal_chat_uploads.py
index c3de859c..d386de65 100644
--- a/workspace/tests/test_internal_chat_uploads.py
+++ b/workspace/tests/test_internal_chat_uploads.py
@@ -222,6 +222,48 @@ def test_per_file_oversize_returns_413(client: TestClient, monkeypatch: pytest.M
assert "exceeds per-file limit" in r.json()["error"]
+# Pins the diagnostic shape of the 500 returned when the upload
+# directory cannot be created. Prior to this fix, the response was
+# {"error": "failed to prepare uploads dir"} only — opaque to the
+# operator inspecting browser devtools, requiring SSM access to the
+# workspace stderr to recover errno + actual path. Surfacing both in
+# the response body makes the failure self-diagnosing the next time
+# this class of bug recurs (e.g. EACCES on a root-owned `.molecule`
+# subtree, ENOSPC on a full disk, EROFS on a read-only mount).
+#
+# Reproduces the failure by pointing CHAT_UPLOAD_DIR at a child path
+# under a regular file, so mkdir(parents=True) raises a
+# NotADirectoryError-class OSError. The exact errno is not asserted
+# because it varies by OS / errno mapping. The PRESENCE of errno +
+# path is what's pinned — drift on those keys breaks the operator
+# diagnostic loop.
+def test_mkdir_failure_returns_errno_and_path(client: TestClient, chat_uploads_dir: Path, monkeypatch: pytest.MonkeyPatch):
+ # Plant a regular FILE where mkdir's parent should be — mkdir
+ # raises FileExistsError / NotADirectoryError reliably across
+ # platforms, exercising the OSError catch path.
+ blocker = chat_uploads_dir.parent / "chat-uploads-blocker"
+ blocker.write_text("not a dir")
+ # Repoint CHAT_UPLOAD_DIR to a child path under the regular file
+ # so mkdir(parents=True, exist_ok=True) raises NotADirectoryError.
+ monkeypatch.setattr(internal_chat_uploads, "CHAT_UPLOAD_DIR", str(blocker / "child"))
+
+ r = client.post(
+ "/internal/chat/uploads/ingest",
+ files={"files": ("a.txt", b"x")},
+ headers={"Authorization": "Bearer test-secret"},
+ )
+ assert r.status_code == 500, r.text
+ body = r.json()
+ # Backwards-compatible top-level error keeps existing canvas /
+ # external alert rules matching.
+ assert body.get("error") == "failed to prepare uploads dir"
+ # New diagnostic fields — operator can now see WHAT path failed
+ # and WHY without SSM access.
+ assert body.get("path") == str(blocker / "child")
+ assert isinstance(body.get("errno"), int) and body["errno"] != 0
+ assert "detail" in body and isinstance(body["detail"], str) and body["detail"]
+
+
def test_total_request_body_oversize_returns_413(client: TestClient, monkeypatch: pytest.MonkeyPatch):
"""Header-side total cap. Set the limit BELOW the actual body and
confirm we reject before parsing multipart."""
From 6d23611620daee68fab48992ee433545e18ea859 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Fri, 1 May 2026 12:04:30 -0700
Subject: [PATCH 16/61] ops: demo-day freeze + rollback runbook
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Demo-day preparation bundle for the funding demo (~2026-05-06). Adds:
- scripts/demo-freeze.sh — captures current ghcr.io
workspace-template-* :latest digests for all 8 runtimes, then
disables both cascade vectors that could re-tag :latest mid-demo:
publish-runtime.yml in molecule-core (PATH 1 — staging push to
workspace/** auto-bumps the wheel and fans out to 8 templates) and
publish-image.yml in each of the 8 template repos (PATH 2 — direct
template repo merge re-tags :latest). Defaults to dry-run; requires
--execute to apply. Writes both digest + workflow receipts to
scripts/demo-freeze-snapshots/.
- scripts/demo-thaw.sh — re-enables every workflow demo-freeze.sh
disabled, keyed off the receipt timestamp. Defaults to executing
(the inverse safety polarity from freeze, whose destructive action
defaults to dry-run). --dry-run prints without applying.
- scripts/demo-day-runbook.md — operator runbook indexing the six
rollback levers (platform image rollback, template image rollback,
tenant redeploy, workspace delete, Railway rollback, Vercel
rollback) plus pre-warm timing and post-demo cleanup. Also covers
read-only diagnostics for "is this working?" moments and the
CP_ADMIN_API_TOKEN rotation step that must follow the demo (the token
gets copy-pasted into shells during incident response).
- scripts/demo-freeze-snapshots/.gitignore — generated freeze
receipts are operational state, not source. Tracked .gitkeep so
the directory exists when the script writes to it.
Both scripts dry-run-tested locally. Did not exercise --execute since
that would actually disable production workflows mid-development.
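For quick reference, the invocation matrix implied above (flag names
exactly as described in this message; nothing here is new behavior):

```bash
scripts/demo-freeze.sh            # dry-run: print digests + workflows it would disable
scripts/demo-freeze.sh --execute  # capture receipts, then disable both cascade vectors
scripts/demo-thaw.sh --dry-run    # print what the latest receipt would re-enable
scripts/demo-thaw.sh              # re-enable everything recorded in the receipt
```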
Co-Authored-By: Claude Opus 4.7 (1M context)
---
scripts/demo-day-runbook.md | 306 +++++++++++++++++++++++
scripts/demo-freeze-snapshots/.gitignore | 6 +
scripts/demo-freeze-snapshots/.gitkeep | 0
scripts/demo-freeze.sh | 214 ++++++++++++++++
scripts/demo-thaw.sh | 124 +++++++++
5 files changed, 650 insertions(+)
create mode 100644 scripts/demo-day-runbook.md
create mode 100644 scripts/demo-freeze-snapshots/.gitignore
create mode 100644 scripts/demo-freeze-snapshots/.gitkeep
create mode 100755 scripts/demo-freeze.sh
create mode 100755 scripts/demo-thaw.sh
diff --git a/scripts/demo-day-runbook.md b/scripts/demo-day-runbook.md
new file mode 100644
index 00000000..ff4847ce
--- /dev/null
+++ b/scripts/demo-day-runbook.md
@@ -0,0 +1,306 @@
+# Demo-day runbook
+
+Pre-, during-, and post-demo operational procedures for the molecule
+production stack. Updated 2026-05-01 ahead of the funding-demo on
+~2026-05-06.
+
+The whole stack:
+
+```
+Vercel canvas (app.moleculesai.app)
+ → Railway controlplane (api.moleculesai.app)
+ → CloudFront/Cloudflare per-tenant edge (<org-slug>.moleculesai.app)
+ → EC2 tenant instance running platform container
+ → Docker workspaces pulled from
+ ghcr.io/molecule-ai/workspace-template-<runtime>:latest
+```
+
+Every layer has its own deploy/rollback story. This runbook indexes
+them in the order an operator would touch them during an incident.
+
+## Pre-demo (T-48h to T-1h)
+
+### 1. Freeze the runtime + template image cascade
+
+A merge to `molecule-core/staging` that touches `workspace/**` triggers
+`publish-runtime.yml` → PyPI bump → repository_dispatch → 8 template
+repos rebuild and re-tag `:latest`. A merge to any template repo's
+`main` triggers the same final re-tag directly. Either path means a
+new workspace provision during the demo pulls whatever `:latest`
+resolved to seconds earlier.
+
+Capture current good digests + disable both cascade vectors:
+
+```bash
+# Dry-run first — verifies digests can be fetched and tooling is set up
+scripts/demo-freeze.sh
+
+# Apply
+scripts/demo-freeze.sh --execute
+```
+
+The script writes two receipts to `scripts/demo-freeze-snapshots/`:
+
+- `digests-<ts>.txt` — current `:latest` digest per template (rollback target if needed)
+- `disabled-workflows-<ts>.txt` — workflow paths to re-enable post-demo
+
+Verify the freeze landed:
+
+```bash
+gh workflow list -R Molecule-AI/molecule-core | grep publish-runtime
+# expect: status = disabled_manually
+```
+
+If a critical fix MUST ship during the freeze window:
+
+1. `gh workflow enable publish-runtime.yml -R Molecule-AI/molecule-core`
+2. Merge the fix
+3. Watch the cascade through to GHCR:latest manually
+4. Smoke-verify against a staging tenant (`scripts/api-smoke.sh` or
+ manual canvas walkthrough)
+5. `gh workflow disable publish-runtime.yml -R Molecule-AI/molecule-core` to re-freeze
+
+Don't auto-promote during the freeze — the value of the freeze is that
+nothing happens automatically.
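+
+A minimal sketch of step 3, watching the cascade from the CLI (the
+`<run-id>` below is whatever ID the first command prints; the fan-out
+runs in each template repo can be watched the same way):
+
+```bash
+# Find the publish-runtime run the merge just kicked off
+gh run list -R Molecule-AI/molecule-core \
+  --workflow=publish-runtime.yml --limit 1
+
+# Follow it to completion
+gh run watch <run-id> -R Molecule-AI/molecule-core
+```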
+
+### 2. Confirm production CP is on the expected SHA
+
+```bash
+gh run list -R Molecule-AI/molecule-controlplane --branch main --limit 5
+# Last `ci` run should be SUCCESS with the SHA you intend to demo on
+```
+
+Railway auto-deploys from main. Spot-check `api.moleculesai.app`:
+
+```bash
+curl -fsS -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
+  "https://api.moleculesai.app/cp/admin/orgs?limit=1"
+# Expect: 200 + a JSON {"orgs": [...]}
+```
+
+### 3. Confirm production canvas (Vercel) is on main
+
+Vercel auto-deploys `main`. Verify in the Vercel dashboard the most
+recent prod deploy ran from the expected commit SHA.
+
+### 4. Pre-warm the demo tenant
+
+Cold-start times on workspace-template images:
+
+| Runtime | Cold-start (first boot) |
+|---|---|
+| claude-code | ~30-60s |
+| openclaw | ~1-2 min |
+| langgraph | ~1 min |
+| hermes | **~7 min** (large image) |
+
+If the demo will use `hermes`, provision the demo workspace at least
+10 min before. The cold-start clock starts when the workspace is
+created, not when it's used.
+
+## During demo — emergency rollback levers
+
+### Lever A: Platform-image rollback (canvas/CP layer regression)
+
+If the canvas or platform container shipped a regression, retag
+`:latest` to a prior staging SHA without rebuilding:
+
+```bash
+# Find a known-good SHA from staging history
+gh run list -R Molecule-AI/molecule-core --workflow=publish-canvas-image.yml --limit 5
+
+# Roll both platform + tenant images
+GITHUB_TOKEN=$(gh auth token) scripts/rollback-latest.sh <sha>
+```
+
+`rollback-latest.sh` retags both `ghcr.io/molecule-ai/platform:latest`
+and `ghcr.io/molecule-ai/platform-tenant:latest`. Existing tenants
+auto-pull `:latest` every 5 min — rollback propagates without manual
+restart.
+
+### Lever B: Workspace-template image rollback
+
+If a specific runtime template (claude-code, hermes, etc.) shipped a
+broken `:latest`:
+
+```bash
+# Get the demo's snapshotted-good digest from the freeze receipt
+grep claude-code scripts/demo-freeze-snapshots/digests-<ts>.txt
+
+# Retag :latest back to the snapshotted digest using crane
+crane auth login ghcr.io -u "$(gh api user --jq .login)" \
+ --password-stdin <<< "$(gh auth token)"
+crane tag \
+  ghcr.io/molecule-ai/workspace-template-claude-code@sha256:<digest> \
+ latest
+```
+
+The next workspace provision pulls the rolled-back image. Existing
+workspaces are unaffected (their image is already loaded into Docker).
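+
+To confirm the retag took before the next provision, a read-only check
+(a sketch, assuming `crane` from the prereqs section and the freeze
+receipt written by demo-freeze.sh):
+
+```bash
+# What :latest resolves to right now
+crane digest ghcr.io/molecule-ai/workspace-template-claude-code:latest
+
+# Should match the sha256 recorded for claude-code in the receipt
+grep claude-code scripts/demo-freeze-snapshots/digests-<ts>.txt
+```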
+
+### Lever C: Wedged demo tenant — redeploy
+
+If the demo tenant's EC2 instance is wedged (boot succeeded but app
+not responding, or a stuck workspace), the controlplane has an admin
+redeploy endpoint:
+
+```bash
+# AWS-side: forces a fresh EC2 launch with current image. ~3 min.
+curl -fsS -X POST \
+ -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
+  https://api.moleculesai.app/cp/admin/orgs/<org-slug>/redeploy
+```
+
+WARNING: this triggers real EC2 + SSM actions on production.
+Double-check `<org-slug>` against the demo tenant's slug before pressing
+return. The `/redeploy` endpoint is idempotent on the EC2 side but
+WILL drop active SSH sessions.
+
+### Lever D: Specific bad workspace — delete
+
+If a single workspace inside the demo tenant is misbehaving (e.g.
+hermes wedged on cold-start, claude-code returning the generic
+"Agent error (Exception)" message), kill it:
+
+```bash
+# Get the demo tenant's per-tenant ADMIN_TOKEN
+TENANT_ADMIN=$(curl -fsS -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
+  https://api.moleculesai.app/cp/admin/orgs/<org-slug>/admin-token \
+ | jq -r .admin_token)
+
+ORG_ID=$(curl -fsS -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
+  "https://api.moleculesai.app/cp/admin/orgs?limit=20" \
+  | jq -r '.orgs[] | select(.slug=="<org-slug>") | .id')
+
+# Delete the bad workspace
+curl -fsS -X DELETE \
+  -H "Origin: https://<org-slug>.moleculesai.app" \
+ -H "Authorization: Bearer $TENANT_ADMIN" \
+ -H "X-Molecule-Org-Id: $ORG_ID" \
+  https://<org-slug>.moleculesai.app/workspaces/<workspace-id>
+```
+
+Then re-provision a fresh workspace from the canvas. Faster than
+debugging the wedged one.
+
+### Lever E: Railway production rollback (CP regression)
+
+If the last Railway deploy of CP introduced a regression that lever A
+can't fix (e.g. a logic bug, not a container issue):
+
+1. Open Railway dashboard → molecule-platform → controlplane → Deployments
+2. Find the previous-known-good deployment
+3. Click **Rollback to this deployment**
+
+Manual step — no CLI equivalent built. Takes ~30s to redeploy from
+the prior image. Note: rollback restores the prior code AND prior env
+var snapshot; don't expect any env var changes made since to persist.
+
+### Lever F: Vercel production rollback (canvas regression)
+
+If the canvas ships a regression:
+
+1. Open Vercel dashboard → molecule-app → Deployments
+2. Find the previous prod deployment
+3. **Promote to Production**
+
+Same pattern as Railway — fast revert, no rebuild.
+
+## Tenant-level read-only diagnostics (not actions)
+
+Useful during an "is this working?" moment without touching anything:
+
+```bash
+# Tenant infra state
+curl -fsS -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
+ "https://api.moleculesai.app/cp/admin/orgs?limit=20" \
+  | jq '.orgs[] | select(.slug=="<org-slug>")'
+
+# Tenant boot events (debug a stuck provision)
+curl -fsS -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
+  "https://api.moleculesai.app/cp/admin/tenants/<tenant-id>/boot-events?limit=50" \
+ | jq
+
+# Workspace activity (debug an unresponsive agent)
+curl -fsS \
+  -H "Origin: https://<org-slug>.moleculesai.app" \
+ -H "Authorization: Bearer $TENANT_ADMIN" \
+ -H "X-Molecule-Org-Id: $ORG_ID" \
+  "https://<org-slug>.moleculesai.app/workspaces/<workspace-id>/activity?limit=20" \
+ | jq
+```
+
+## Post-demo (T+30m to T+24h)
+
+### 1. Thaw the cascades
+
+```bash
+# Find the freeze receipt
+ls scripts/demo-freeze-snapshots/
+
+# Thaw — pass the timestamp suffix
+scripts/demo-thaw.sh 20260506-180000
+```
+
+The next merge to `molecule-core/staging` (workspace/**) or any
+template repo's `main` will resume the auto-rebuild cascade.
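+
+Verify the thaw the same way the freeze was verified, expecting the
+workflow state to flip back from `disabled_manually` to `active`:
+
+```bash
+gh workflow list -R Molecule-AI/molecule-core | grep publish-runtime
+# expect: status = active
+```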
+
+### 2. Audit what was held back
+
+If any merges queued during the freeze:
+
+```bash
+gh pr list -R Molecule-AI/molecule-core --base staging --state merged \
+ --search "merged:>=$(date -u -v-7d +%Y-%m-%d)"
+```
+
+Verify each merge's CI is green and dispatch the runtime cascade once
+to ensure all templates rebuild against the post-freeze HEAD.
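+
+If `publish-runtime.yml` exposes a `workflow_dispatch` trigger (the
+sketch below assumes it does), one manual dispatch is enough to rebuild
+every template against the post-freeze HEAD:
+
+```bash
+gh workflow run publish-runtime.yml -R Molecule-AI/molecule-core --ref staging
+gh run list -R Molecule-AI/molecule-core --workflow=publish-runtime.yml --limit 1
+```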
+
+### 3. File a post-mortem if anything fired
+
+If any rollback lever was used during the demo, file a brief doc:
+
+- Which lever (A through F)
+- Which SHA was rolled back FROM and TO
+- Did the rollback fully resolve the issue or was a follow-up needed
+- Whether the underlying regression should have been caught by CI
+
+## Common issues + first-line fix
+
+| Symptom | First lever to try |
+|---|---|
+| Workspace boots but agent always errors | Lever D (delete + reprovision) |
+| Whole tenant unreachable | Lever C (redeploy) |
+| Canvas crashes on load | Lever F (Vercel rollback) |
+| Login broken / API errors | Lever E (Railway rollback) |
+| Specific runtime broken across tenants | Lever B (template image rollback) |
+| Platform container regression | Lever A (rollback-latest.sh) |
+| Mid-demo stray PR auto-published a bad image | Lever B + investigate why freeze didn't catch it |
+
+## Auth fingerprint (rotate post-demo)
+
+The freeze + rollback procedures assume:
+
+- `CP_ADMIN_API_TOKEN` available via `railway variables --kv --environment production`
+- `gh auth token` returns a working PAT with `workflow:write` + `write:packages`
+- `crane` installed (`brew install crane`)
+
+After the demo, **rotate** `CP_ADMIN_API_TOKEN` (it's the keys-to-the-kingdom
+token for production) — it likely got copy-pasted into shells during
+the demo.
+
+```bash
+# Generate a new admin token
+NEW_TOKEN=$(openssl rand -hex 32)
+
+# Update Railway production env var (and optionally staging)
+railway variables --set CP_ADMIN_API_TOKEN="$NEW_TOKEN" --environment production
+
+# No manual restart needed: Railway auto-restarts the CP service on
+# env var change.
+
+# Verify
+curl -fsS -H "Authorization: Bearer $NEW_TOKEN" \
+  "https://api.moleculesai.app/cp/admin/orgs?limit=1"
+```
diff --git a/scripts/demo-freeze-snapshots/.gitignore b/scripts/demo-freeze-snapshots/.gitignore
new file mode 100644
index 00000000..50692299
--- /dev/null
+++ b/scripts/demo-freeze-snapshots/.gitignore
@@ -0,0 +1,6 @@
+# Generated by scripts/demo-freeze.sh — receipts are operational state,
+# not source. Tracked .gitignore + .gitkeep keep the directory itself
+# in version control so the freeze script's output dir always exists.
+*
+!.gitignore
+!.gitkeep
diff --git a/scripts/demo-freeze-snapshots/.gitkeep b/scripts/demo-freeze-snapshots/.gitkeep
new file mode 100644
index 00000000..e69de29b
diff --git a/scripts/demo-freeze.sh b/scripts/demo-freeze.sh
new file mode 100755
index 00000000..be7b176b
--- /dev/null
+++ b/scripts/demo-freeze.sh
@@ -0,0 +1,214 @@
+#!/usr/bin/env bash
+# demo-freeze.sh — disable the runtime + template image publish cascades
+# during a demo-prep window so a stray staging merge can't auto-rebuild
+# `:latest` for the 8 workspace-template images mid-demo.
+#
+# Demo prep typically runs T-48h to T+1h. During that window:
+#
+# PATH 1: any merge to molecule-core/staging that touches workspace/**
+# → publish-runtime.yml fires
+# → PyPI auto-bumps molecule-ai-workspace-runtime patch version
+# → repository_dispatch fans out to 8 workspace-template-* repos
+# → each template repo rebuilds and re-tags
+# ghcr.io/molecule-ai/workspace-template-<runtime>:latest
+#
+# PATH 2: any merge to a workspace-template-* repo's main branch
+# → that repo's publish-image.yml fires
+# → ghcr.io/molecule-ai/workspace-template-<runtime>:latest
+# gets re-tagged
+#
+# provisioner.go:296 RuntimeImages[runtime] reads `:latest` at every
+# workspace boot. A new workspace provision during demo pulls whatever
+# `:latest` resolved to seconds earlier — so a bad merge minutes
+# before the demo can break a tenant the funder is about to see.
+#
+# This script captures the current good `:latest` digests for all 8
+# templates and disables both cascade vectors. The complementary
+# demo-thaw.sh re-enables them.
+#
+# Usage:
+# scripts/demo-freeze.sh # dry run — print what would happen
+# scripts/demo-freeze.sh --execute # actually disable workflows + snapshot
+#
+# Prereqs:
+# - gh CLI authenticated with workflow:write scope on Molecule-AI org
+# - curl + jq (for digest snapshot via GHCR anonymous registry API)
+#
+# Output:
+#   scripts/demo-freeze-snapshots/digests-YYYYMMDD-HHMMSS.txt
+#     One line per template: "<template>: <digest>"
+#   scripts/demo-freeze-snapshots/disabled-workflows-YYYYMMDD-HHMMSS.txt
+#     One line per disabled workflow: "<repo>: <workflow>"
+#
+# Exit codes:
+# 0 — freeze complete (or dry-run successful)
+# 1 — pre-flight failure (missing tooling, missing auth, etc.)
+# 2 — partial freeze (some workflows did not disable cleanly; see log)
+
+set -euo pipefail
+
+usage() {
+ cat <<'USAGE'
+demo-freeze.sh — disable the runtime + template image publish cascades
+during a demo-prep window.
+
+Captures current :latest digests for all 8 workspace-template-* images
+and disables the workflows that would otherwise re-tag them.
+
+Usage:
+ scripts/demo-freeze.sh # dry run — print what would happen
+ scripts/demo-freeze.sh --execute # actually disable workflows + snapshot
+
+See the comment block at the top of this script for the full procedure.
+USAGE
+}
+
+EXECUTE=0
+case "${1:-}" in
+ --execute)
+ EXECUTE=1
+ ;;
+ --help|-h)
+ usage
+ exit 0
+ ;;
+ "")
+ ;;
+ *)
+ echo "unknown arg: $1" >&2
+ usage >&2
+ exit 2
+ ;;
+esac
+
+# Templates and their GHCR repository slugs. Source of truth for the
+# runtime → image map is workspace-server/internal/provisioner/provisioner.go
+# RuntimeImages — keep this list in sync if a runtime is added.
+TEMPLATES=(
+ "claude-code"
+ "hermes"
+ "openclaw"
+ "langgraph"
+ "deepagents"
+ "crewai"
+ "autogen"
+ "gemini-cli"
+)
+
+# Pre-flight: required tooling.
+need() {
+ command -v "$1" >/dev/null || { echo "ERROR: missing required tool: $1" >&2; exit 1; }
+}
+need gh
+need curl
+need jq
+
+# Pre-flight: gh auth. Snapshot via anonymous GHCR token works without
+# org auth, but workflow disable needs an authenticated gh.
+if ! gh auth status >/dev/null 2>&1; then
+ echo "ERROR: gh not authenticated. Run 'gh auth login' first." >&2
+ exit 1
+fi
+
+# Snapshot location relative to this script. Keeping it under scripts/
+# rather than a temp dir means freeze receipts are easy to find again
+# during the actual demo.
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+SNAPSHOT_DIR="${SCRIPT_DIR}/demo-freeze-snapshots"
+mkdir -p "$SNAPSHOT_DIR"
+TS="$(date -u +%Y%m%d-%H%M%S)"
+DIGESTS_FILE="${SNAPSHOT_DIR}/digests-${TS}.txt"
+WORKFLOWS_FILE="${SNAPSHOT_DIR}/disabled-workflows-${TS}.txt"
+
+if [ $EXECUTE -eq 0 ]; then
+ echo "=== DRY RUN (no changes will be made; pass --execute to apply) ==="
+else
+ echo "=== EXECUTING FREEZE — workflows will be disabled ==="
+fi
+echo "Snapshot timestamp: $TS"
+echo "Digest log: $DIGESTS_FILE"
+echo "Workflow log: $WORKFLOWS_FILE"
+echo
+
+# Step 1: capture current :latest digest for each template.
+echo "→ Capturing current :latest digests"
+for tpl in "${TEMPLATES[@]}"; do
+ token=$(curl -fsS "https://ghcr.io/token?scope=repository:molecule-ai/workspace-template-${tpl}:pull" | jq -r .token 2>/dev/null || true)
+ if [ -z "$token" ] || [ "$token" = "null" ]; then
+ echo " WARN: token fetch failed for $tpl — skipping digest capture"
+ continue
+ fi
+  # `|| true` keeps set -e/pipefail from killing the loop when the
+  # digest fetch fails; the empty-digest WARN branch below handles it.
+  digest=$(curl -fsSI \
+    -H "Authorization: Bearer $token" \
+    -H "Accept: application/vnd.oci.image.index.v1+json" \
+    -H "Accept: application/vnd.docker.distribution.manifest.v2+json" \
+    "https://ghcr.io/v2/molecule-ai/workspace-template-${tpl}/manifests/latest" 2>/dev/null \
+    | grep -i 'docker-content-digest' \
+    | awk '{print $2}' \
+    | tr -d '\r' || true)
+ if [ -z "$digest" ]; then
+ echo " WARN: digest fetch failed for $tpl"
+ continue
+ fi
+ echo " $tpl: $digest"
+ if [ $EXECUTE -eq 1 ]; then
+ echo "$tpl: $digest" >> "$DIGESTS_FILE"
+ fi
+done
+echo
+
+# Step 2: disable publish-runtime.yml in molecule-core (PATH 1 source).
+echo "→ Disabling publish-runtime.yml in molecule-core (kills runtime → 8-template cascade)"
+PARTIAL_FAIL=0
+if [ $EXECUTE -eq 1 ]; then
+  if gh workflow disable publish-runtime.yml -R Molecule-AI/molecule-core 2>/tmp/freeze.err; then
+    echo "  OK molecule-core/publish-runtime.yml disabled"
+    echo "Molecule-AI/molecule-core: publish-runtime.yml" >> "$WORKFLOWS_FILE"
+  else
+    echo "  FAIL molecule-core/publish-runtime.yml: $(cat /tmp/freeze.err)" >&2
+    PARTIAL_FAIL=1
+  fi
+else
+ echo " (dry-run) would disable: gh workflow disable publish-runtime.yml -R Molecule-AI/molecule-core"
+fi
+echo
+
+# Step 3: disable publish-image.yml in each of the 8 template repos (PATH 2 sources).
+echo "→ Disabling publish-image.yml in each workspace-template-* repo"
+for tpl in "${TEMPLATES[@]}"; do
+ repo="Molecule-AI/molecule-ai-workspace-template-${tpl}"
+ if [ $EXECUTE -eq 1 ]; then
+ if gh workflow disable publish-image.yml -R "$repo" 2>/tmp/freeze.err; then
+ echo " OK $repo/publish-image.yml disabled"
+ echo "${repo}: publish-image.yml" >> "$WORKFLOWS_FILE"
+ else
+ echo " FAIL $repo/publish-image.yml: $(cat /tmp/freeze.err)" >&2
+ PARTIAL_FAIL=1
+ fi
+ else
+ echo " (dry-run) would disable: gh workflow disable publish-image.yml -R $repo"
+ fi
+done
+echo
+
+if [ $EXECUTE -eq 0 ]; then
+ echo "=== DRY RUN COMPLETE ==="
+ echo "Re-run with --execute to apply the freeze."
+ exit 0
+fi
+
+echo "=== FREEZE COMPLETE ==="
+echo "Receipts: $DIGESTS_FILE"
+echo " $WORKFLOWS_FILE"
+echo
+echo "Next steps:"
+echo " - Verify by running: gh workflow list -R Molecule-AI/molecule-core | grep publish-runtime"
+echo " Status should be 'disabled_manually'."
+echo " - Demo proceeds; new workspaces pull the snapshotted :latest digests."
+echo " - Post-demo, run: scripts/demo-thaw.sh ${TS}"
+echo " to re-enable every workflow this freeze disabled."
+echo
+if [ $PARTIAL_FAIL -ne 0 ]; then
+ echo "WARNING: one or more workflows did not disable cleanly. Re-run after fixing." >&2
+ exit 2
+fi
+exit 0
diff --git a/scripts/demo-thaw.sh b/scripts/demo-thaw.sh
new file mode 100755
index 00000000..35469c6e
--- /dev/null
+++ b/scripts/demo-thaw.sh
@@ -0,0 +1,124 @@
+#!/usr/bin/env bash
+# demo-thaw.sh — re-enable workflows that demo-freeze.sh disabled.
+#
+# Usage:
+#   scripts/demo-thaw.sh <ts>
+# scripts/demo-thaw.sh 20260503-180000
+#
+# Reads disabled-workflows-<ts>.txt produced by demo-freeze.sh and
+# runs `gh workflow enable` for each entry. Idempotent — re-enabling
+# an already-enabled workflow is a no-op.
+#
+# Defaults to executing (the inverse of freeze, which defaults to
+# dry-run). Pass --dry-run to print without executing.
+#
+# Prereqs:
+# - gh CLI authenticated with workflow:write scope on Molecule-AI org
+#
+# Exit codes:
+# 0 — all workflows re-enabled
+# 1 — pre-flight failure (missing receipt file, missing tooling)
+# 2 — partial thaw (some workflows did not enable; check output)
+
+set -euo pipefail
+
+usage() {
+ cat <<'USAGE'
+demo-thaw.sh — re-enable workflows that demo-freeze.sh disabled.
+
+Usage:
+  scripts/demo-thaw.sh <ts>            # apply
+  scripts/demo-thaw.sh <ts> --dry-run  # print without applying
+
+<ts> is the YYYYMMDD-HHMMSS suffix on
+scripts/demo-freeze-snapshots/disabled-workflows-*.txt produced by
+demo-freeze.sh.
+USAGE
+}
+
+DRY_RUN=0
+TS=""
+for arg in "$@"; do
+ case "$arg" in
+ --dry-run)
+ DRY_RUN=1
+ ;;
+ --help|-h)
+ usage
+ exit 0
+ ;;
+ *)
+ if [ -z "$TS" ]; then
+ TS="$arg"
+ else
+ echo "unknown arg: $arg" >&2
+ usage >&2
+ exit 2
+ fi
+ ;;
+ esac
+done
+
+if [ -z "$TS" ]; then
+  echo "usage: $0 <ts> [--dry-run]" >&2
+ echo " e.g. $0 20260503-180000" >&2
+ echo " ts is the YYYYMMDD-HHMMSS suffix on demo-freeze-snapshots/disabled-workflows-*.txt" >&2
+ exit 2
+fi
+
+command -v gh >/dev/null || { echo "ERROR: gh CLI required" >&2; exit 1; }
+if ! gh auth status >/dev/null 2>&1; then
+ echo "ERROR: gh not authenticated. Run 'gh auth login' first." >&2
+ exit 1
+fi
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+WORKFLOWS_FILE="${SCRIPT_DIR}/demo-freeze-snapshots/disabled-workflows-${TS}.txt"
+
+if [ ! -f "$WORKFLOWS_FILE" ]; then
+ echo "ERROR: receipt not found: $WORKFLOWS_FILE" >&2
+ echo "Available receipts:" >&2
+ ls "${SCRIPT_DIR}/demo-freeze-snapshots/" 2>/dev/null | grep '^disabled-workflows-' >&2 || echo " (none)" >&2
+ exit 1
+fi
+
+if [ $DRY_RUN -eq 1 ]; then
+ echo "=== DRY RUN (no changes will be made) ==="
+else
+ echo "=== THAWING — re-enabling workflows ==="
+fi
+echo "Reading: $WORKFLOWS_FILE"
+echo
+
+PARTIAL_FAIL=0
+while IFS=': ' read -r repo workflow; do
+ [ -z "$repo" ] && continue
+ if [ $DRY_RUN -eq 1 ]; then
+ echo " (dry-run) would enable: gh workflow enable $workflow -R $repo"
+ else
+ if gh workflow enable "$workflow" -R "$repo" 2>/tmp/thaw.err; then
+ echo " OK $repo/$workflow re-enabled"
+ else
+ echo " FAIL $repo/$workflow: $(cat /tmp/thaw.err)" >&2
+ PARTIAL_FAIL=1
+ fi
+ fi
+done < "$WORKFLOWS_FILE"
+
+echo
+if [ $DRY_RUN -eq 1 ]; then
+ echo "=== DRY RUN COMPLETE ==="
+ echo "Re-run without --dry-run to apply."
+ exit 0
+fi
+
+echo "=== THAW COMPLETE ==="
+echo "Cascades restored. Next workspace/** push to molecule-core/staging will"
+echo "auto-publish the runtime wheel and fan out to template rebuilds as normal."
+if [ $PARTIAL_FAIL -ne 0 ]; then
+ echo
+ echo "WARNING: one or more workflows did not re-enable cleanly. Re-run or enable manually:" >&2
+  echo "  gh workflow list -R <repo>" >&2
+ exit 2
+fi
+exit 0
From e1496936e9a2c82cc383185cd4b3c8307b8a86c3 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Fri, 1 May 2026 12:45:20 -0700
Subject: [PATCH 17/61] feat(canvas): dynamic provider dropdown in
CreateWorkspaceDialog
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Mirrors the data-driven pattern PR #2454 set in ConfigTab: read
runtime_config.providers from /templates and filter the modal's
provider
-[](https://railway.app/new/template?template=https://github.com/Molecule-AI/molecule-core)
-[](https://render.com/deploy?repo=https://github.com/Molecule-AI/molecule-core)
+[](https://railway.app/new/template?template=https://github.com/Molecule-AI/molecule-monorepo)
+[](https://render.com/deploy?repo=https://github.com/Molecule-AI/molecule-monorepo)
@@ -249,8 +249,8 @@ Workspace Runtime (Python image with adapters)
## Quick Start
```bash
-git clone https://github.com/Molecule-AI/molecule-core.git
-cd molecule-core
+git clone https://github.com/Molecule-AI/molecule-monorepo.git
+cd molecule-monorepo
cp .env.example .env
# Defaults boot the stack locally out of the box. See .env.example for
From f6a48d593e483cb684d19cbd9fcd8d080ec12e6f Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Fri, 1 May 2026 19:17:55 -0700
Subject: [PATCH 49/61] test: standardise on `from a2a_mcp_server import ...`
in TestStdioPipeAssertion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
github-code-quality bot flagged 4 instances of `import a2a_mcp_server` in
the new TestStdioPipeAssertion class — every other test in the file uses
the `from a2a_mcp_server import ...` per-test pattern, so this is a real
inconsistency.
Switching the new tests to match. No behavior change; resolves the
4 unresolved review threads blocking the merge queue.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
workspace/tests/test_a2a_mcp_server.py | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/workspace/tests/test_a2a_mcp_server.py b/workspace/tests/test_a2a_mcp_server.py
index 175558fb..ac549d07 100644
--- a/workspace/tests/test_a2a_mcp_server.py
+++ b/workspace/tests/test_a2a_mcp_server.py
@@ -730,13 +730,13 @@ class TestStdioPipeAssertion:
"""Happy path — both fds are pipes (the production launch shape
from any MCP client). Should return None without printing or
exiting."""
- import a2a_mcp_server
+ from a2a_mcp_server import _assert_stdio_is_pipe_compatible
r, w = os.pipe()
try:
# No exit, no stderr noise. We don't capture stderr here
# because pipe path should produce zero output.
- a2a_mcp_server._assert_stdio_is_pipe_compatible(stdin_fd=r, stdout_fd=w)
+ _assert_stdio_is_pipe_compatible(stdin_fd=r, stdout_fd=w)
finally:
os.close(r)
os.close(w)
@@ -749,7 +749,7 @@ class TestStdioPipeAssertion:
`ValueError: Pipe transport is only for pipes...`. Post-fix we
exit with code 2 and a stderr message that names the symptom +
fix."""
- import a2a_mcp_server
+ from a2a_mcp_server import _assert_stdio_is_pipe_compatible
# stdin = pipe (so we isolate the stdout failure path);
# stdout = regular file (the bug condition).
@@ -758,7 +758,7 @@ class TestStdioPipeAssertion:
f = open(regular, "wb")
try:
with pytest.raises(SystemExit) as excinfo:
- a2a_mcp_server._assert_stdio_is_pipe_compatible(
+ _assert_stdio_is_pipe_compatible(
stdin_fd=r, stdout_fd=f.fileno()
)
assert excinfo.value.code == 2
@@ -778,7 +778,7 @@ class TestStdioPipeAssertion:
):
"""Symmetric case — stdin redirected from a regular file. Same
asyncio constraint applies via connect_read_pipe."""
- import a2a_mcp_server
+ from a2a_mcp_server import _assert_stdio_is_pipe_compatible
regular = tmp_path / "input.json"
regular.write_bytes(b'{"jsonrpc":"2.0","id":1,"method":"initialize"}\n')
@@ -786,7 +786,7 @@ class TestStdioPipeAssertion:
_r, w = os.pipe()
try:
with pytest.raises(SystemExit) as excinfo:
- a2a_mcp_server._assert_stdio_is_pipe_compatible(
+ _assert_stdio_is_pipe_compatible(
stdin_fd=f.fileno(), stdout_fd=w
)
assert excinfo.value.code == 2
@@ -801,13 +801,13 @@ class TestStdioPipeAssertion:
"""If stdio is closed (rare but seen in detached daemonized
contexts), os.fstat raises OSError. We catch it and exit 2 with
a guidance message instead of letting the traceback escape."""
- import a2a_mcp_server
+ from a2a_mcp_server import _assert_stdio_is_pipe_compatible
r, w = os.pipe()
os.close(w) # Now `w` is a stale fd — fstat will fail.
try:
with pytest.raises(SystemExit) as excinfo:
- a2a_mcp_server._assert_stdio_is_pipe_compatible(
+ _assert_stdio_is_pipe_compatible(
stdin_fd=r, stdout_fd=w
)
assert excinfo.value.code == 2
From fe921945844913328544adc88a2e7488c78fbb11 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Fri, 1 May 2026 19:19:05 -0700
Subject: [PATCH 50/61] test(provision): concurrent 7-burst repro harness for
#2486 silent-drop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Goal: a deterministic, in-process reproduction of the prod incident
where 7 simultaneous claude-code provisions on the hongming tenant
produced ZERO log lines from any of the four documented exit paths.
Approach: stub CPProvisioner that records every Start() call,
sqlmock for the prepare flow, fire 7 goroutines concurrently against
provisionWorkspaceCP, then assert:
1. Entry log fired exactly 7 times (one per goroutine).
2. Stub Start() recorded all 7 distinct workspace IDs.
3. Each goroutine's entry log names its own workspace ID.
Result on staging head as of 2026-05-02: PASSES — meaning the
silent-drop class isn't reproducible against current head with stub
CP. Tenant hongming runs sha 76c604fb (725 commits behind staging),
so the bug is most likely already fixed upstream — hongming needs
a redeploy.
The test stays as a regression gate: any future refactor that
re-introduces silent goroutine swallow in the CP provision path
(rate-limit drop, channel-send-without-receiver, panic without
recover, etc.) trips it.
A safeWriter wraps the captured log buffer because raw
bytes.Buffer.Write isn't safe for concurrent goroutines — without
serialization the 7 entry-log lines interleave at byte boundaries
and the strings.Count assertion gets unreliable.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
...rkspace_provision_concurrent_repro_test.go | 208 ++++++++++++++++++
1 file changed, 208 insertions(+)
create mode 100644 workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go
diff --git a/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go b/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go
new file mode 100644
index 00000000..2100a49c
--- /dev/null
+++ b/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go
@@ -0,0 +1,208 @@
+package handlers
+
+import (
+ "bytes"
+ "context"
+ "fmt"
+ "log"
+ "strings"
+ "sync"
+ "sync/atomic"
+ "testing"
+
+ "github.com/DATA-DOG/go-sqlmock"
+ "github.com/Molecule-AI/molecule-monorepo/platform/internal/models"
+ "github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner"
+)
+
+// Issue #2486 reproduction harness: 7 simultaneous claude-code provisions
+// against the SAME workspace-server (Director Pattern fan-out). On the
+// hongming prod tenant this produced ZERO log lines from any of the four
+// documented exit paths in provisionWorkspaceCP — operators couldn't tell
+// whether the goroutines ran. This test closes the visibility gap by
+// pinning that:
+//
+// 1. Every provision goroutine produces ONE entry log line ("CPProvisioner:
+// goroutine entered for ws-N").
+// 2. Every goroutine reaches its registered exit path (cpProv.Start),
+// i.e. the stub records all 7 workspace IDs.
+//
+// If the silent-drop class is present in current head code, this test
+// fails because either (a) the entry-log count is < 7 (meaning one or
+// more goroutines never started — Go runtime issue), or (b) the
+// recorder count is < 7 (meaning a goroutine entered but exited before
+// reaching cpProv.Start, via some unlogged path).
+//
+// Result on staging head as of 2026-05-02: PASSES — meaning the
+// silent-drop seen in the prod incident is NOT reproducible against
+// current head with stub CP. Possibilities: (i) bug already fixed
+// upstream of the tenant's stale build (sha 76c604fb, 725 commits
+// behind), (ii) bug requires real-CP-side rate-limiting we don't
+// model here, (iii) bug requires a DB-layer interaction (lock
+// contention, deadlock) the sqlmock doesn't model.
+//
+// Even when this passes today, it stays as a regression gate: any
+// future refactor that re-introduces silent goroutine swallow in the
+// CP provision path trips it.
+
+// recordingCPProv implements provisioner.CPProvisionerAPI and records
+// every Start() invocation in a thread-safe slice so a concurrent
+// burst can be verified post-hoc.
+type recordingCPProv struct {
+ mu sync.Mutex
+ startedWS []string
+ // startErr controls what Start() returns. nil → success. Non-nil →
+ // error path; provisionWorkspaceCP marks failed + returns.
+ startErr error
+}
+
+func (r *recordingCPProv) Start(_ context.Context, cfg provisioner.WorkspaceConfig) (string, error) {
+ r.mu.Lock()
+ r.startedWS = append(r.startedWS, cfg.WorkspaceID)
+ r.mu.Unlock()
+ if r.startErr != nil {
+ return "", r.startErr
+ }
+ return "i-stubbed-" + cfg.WorkspaceID[:8], nil
+}
+
+func (r *recordingCPProv) Stop(_ context.Context, _ string) error {
+ panic("recordingCPProv.Stop not expected in concurrent-repro test")
+}
+
+func (r *recordingCPProv) GetConsoleOutput(_ context.Context, _ string) (string, error) {
+ panic("recordingCPProv.GetConsoleOutput not expected in concurrent-repro test")
+}
+
+func (r *recordingCPProv) IsRunning(_ context.Context, _ string) (bool, error) {
+ panic("recordingCPProv.IsRunning not expected in concurrent-repro test")
+}
+
+func (r *recordingCPProv) startedSet() map[string]struct{} {
+ r.mu.Lock()
+ defer r.mu.Unlock()
+ out := make(map[string]struct{}, len(r.startedWS))
+ for _, id := range r.startedWS {
+ out[id] = struct{}{}
+ }
+ return out
+}
+
+// TestProvisionWorkspaceCP_ConcurrentBurst_NoSilentDrop is the
+// repro harness for issue #2486. See file-level comment.
+func TestProvisionWorkspaceCP_ConcurrentBurst_NoSilentDrop(t *testing.T) {
+ const numWorkspaces = 7
+
+ mock := setupTestDB(t)
+
+ // Every goroutine runs prepareProvisionContext → mintWorkspaceSecrets
+ // → cpProv.Start (stubbed to fail) → markProvisionFailed. The DB
+ // shape per goroutine: 2 SELECTs + 1 UPDATE. Order between
+ // goroutines is non-deterministic so use MatchExpectationsInOrder
+ // false.
+ mock.MatchExpectationsInOrder(false)
+ for i := 0; i < numWorkspaces; i++ {
+ mock.ExpectQuery(`SELECT key, encrypted_value, encryption_version FROM global_secrets`).
+ WillReturnRows(sqlmock.NewRows([]string{"key", "encrypted_value", "encryption_version"}))
+ mock.ExpectQuery(`SELECT key, encrypted_value, encryption_version FROM workspace_secrets`).
+ WithArgs(sqlmock.AnyArg()).
+ WillReturnRows(sqlmock.NewRows([]string{"key", "encrypted_value", "encryption_version"}))
+ mock.ExpectExec(`UPDATE workspaces SET status =`).
+ WithArgs(sqlmock.AnyArg(), sqlmock.AnyArg(), sqlmock.AnyArg()).
+ WillReturnResult(sqlmock.NewResult(0, 1))
+ }
+
+ // Capture every log line so we can count entry-log occurrences.
+ var logBuf bytes.Buffer
+ var logMu sync.Mutex
+ prev := log.Writer()
+ log.SetOutput(&safeWriter{buf: &logBuf, mu: &logMu})
+ defer log.SetOutput(prev)
+
+ // stubFailing-shaped behaviour but recording-capable. Failure is
+ // fine — we're not testing the success path, only that every
+ // goroutine entered AND reached the recorded Start() call.
+ rec := &recordingCPProv{startErr: fmt.Errorf("simulated CP rejection")}
+
+ cap := &captureBroadcaster{}
+ handler := NewWorkspaceHandler(cap, nil, "http://localhost:8080", t.TempDir())
+ handler.SetCPProvisioner(rec)
+
+ var wg sync.WaitGroup
+ var enteredCount int64
+ for i := 0; i < numWorkspaces; i++ {
+ wg.Add(1)
+ // Use a UUID-shaped ID so cfg.WorkspaceID slicing in the stub
+ // has 8 chars to read.
+ wsID := fmt.Sprintf("ws-fan-%016d", i)
+ go func() {
+ defer wg.Done()
+ atomic.AddInt64(&enteredCount, 1)
+ handler.provisionWorkspaceCP(wsID, "", nil, models.CreateWorkspacePayload{
+ Name: wsID,
+ Tier: 1,
+ Runtime: "claude-code",
+ })
+ }()
+ }
+ wg.Wait()
+
+ if got := atomic.LoadInt64(&enteredCount); got != numWorkspaces {
+ t.Fatalf("test setup bug: expected %d goroutines to enter, got %d", numWorkspaces, got)
+ }
+
+ // Assertion 1: every goroutine produced an entry log. Without the
+ // fix in this PR (#2487), there's NO entry log so this assertion
+ // is what closes the visibility gap.
+ logMu.Lock()
+ logged := logBuf.String()
+ logMu.Unlock()
+ entryCount := strings.Count(logged, "CPProvisioner: goroutine entered for")
+ if entryCount != numWorkspaces {
+ t.Errorf("entry log fired %d times, want %d. Either (a) a goroutine never reached the entry log or (b) the entry log was removed/renamed.\nlog dump:\n%s",
+ entryCount, numWorkspaces, logged)
+ }
+
+ // Assertion 2: every goroutine's Start() call was recorded by the
+ // stub — no silent drop between entry log and the registered exit
+ // path (cpProv.Start).
+ started := rec.startedSet()
+ if len(started) != numWorkspaces {
+ t.Errorf("stub CPProvisioner saw %d distinct Start() calls, want %d. SILENT-DROP CLASS: a goroutine entered but never reached Start(). seen=%v",
+ len(started), numWorkspaces, started)
+ }
+
+ // Assertion 3: every entry-log line names a distinct workspace —
+ // guards against a future refactor that hard-codes a single ID
+ // and double-logs.
+ for i := 0; i < numWorkspaces; i++ {
+ want := fmt.Sprintf("CPProvisioner: goroutine entered for ws-fan-%016d", i)
+ if !strings.Contains(logged, want) {
+ t.Errorf("missing entry log for ws-fan-%016d. log dump:\n%s", i, logged)
+ }
+ }
+
+ if err := mock.ExpectationsWereMet(); err != nil {
+ // Soft-fail: under concurrency some queries may have been
+ // re-ordered relative to the (non-strict) expectation set,
+ // which sqlmock can sometimes flag. Surface as t.Logf rather
+ // than t.Errorf so the assertion above (concrete observable
+ // behaviour) remains the primary gate.
+ t.Logf("sqlmock expectations note (non-fatal under concurrent fan-out): %v", err)
+ }
+}
+
+// safeWriter serializes log writes from concurrent goroutines so the
+// captured buffer isn't a torn-write mess. Without this the log lines
+// from 7 concurrent goroutines interleave at byte boundaries and the
+// strings.Count assertion above gets unreliable.
+type safeWriter struct {
+ buf *bytes.Buffer
+ mu *sync.Mutex
+}
+
+func (w *safeWriter) Write(p []byte) (int, error) {
+ w.mu.Lock()
+ defer w.mu.Unlock()
+ return w.buf.Write(p)
+}
From 7a197241949b720699f70296c6066c124c5b1eb4 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Fri, 1 May 2026 19:56:34 -0700
Subject: [PATCH 51/61] fix(provision): route panic recovery through
markProvisionFailed + fix log capture
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Three fixes addressing review of the issue #2486 observability PR:
1. CI failure: original inline UPDATE in logProvisionPanic used a hard-coded
`status='failed'` literal, which trips workspace_status_enum_drift_test
(the post-PR-#2396 gate that requires every status write to flow through
models.Status* via parameterized $N). Refactor to call
h.markProvisionFailed which uses StatusFailed parameterized.
2. Canvas-broadcast gap (review finding): inline UPDATE skipped
RecordAndBroadcast, so panic recovery marked the row failed in DB but
the canvas spinner stayed on "provisioning" until the next poll.
markProvisionFailed fires WORKSPACE_PROVISION_FAILED, so canvas now
flips to a failure card immediately.
3. Critical test bug (review finding): `defer log.SetOutput(log.Writer())`
in three test sites evaluated log.Writer() at defer-fire time AFTER the
SetOutput swap — restoring the buffer to itself, never restoring
os.Stderr. Subsequent tests in the package were running with the panic
tests' captured buffer as their writer. Extracted captureLog(t) helper
that captures `prev` BEFORE the swap and uses t.Cleanup.
Plus: softened the "goroutine never started" comment in the concurrent
repro harness — the harness atomic-counts BEFORE the entry log fires, so
"never started" was misleading; the real failure mode is "entry log
renamed/removed or writer hijacked."
Verified: full handlers suite passes; drift gate passes (Platform Go CI
failure root-caused). Regression-injected the recover body again — both
panic tests still fail as expected, confirming the contract is gated.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
.../internal/handlers/workspace_provision.go | 29 ++++---
...rkspace_provision_concurrent_repro_test.go | 4 +-
.../workspace_provision_panic_test.go | 86 ++++++++++++++-----
3 files changed, 83 insertions(+), 36 deletions(-)
diff --git a/workspace-server/internal/handlers/workspace_provision.go b/workspace-server/internal/handlers/workspace_provision.go
index a4c37c37..6339fb43 100644
--- a/workspace-server/internal/handlers/workspace_provision.go
+++ b/workspace-server/internal/handlers/workspace_provision.go
@@ -22,30 +22,33 @@ import (
// provisionWorkspaceCP propagates up the goroutine stack and crashes the
// whole workspace-server process — taking every other tenant workspace
// down with it. With it, the panic is logged with a stack trace, the
-// workspace gets persistently marked failed (so the canvas surfaces
-// something instead of leaving it stuck in provisioning until the 10-min
-// sweeper fires), and the rest of the process keeps serving.
+// workspace is marked failed via markProvisionFailed (so the canvas
+// surfaces a failure card immediately instead of leaving the spinner
+// stuck on "provisioning" until the 10-min sweeper fires), and the rest
+// of the process keeps serving.
//
// Issue #2486 added this after the symmetric class — silent goroutine
// exit, no log, no failure mark — was observed in prod. Even if the
// root cause turns out not to be a panic, surfacing the panic class
// closes one branch of "what could have happened" cleanly.
-func logProvisionPanic(workspaceID, mode string) {
+//
+// Method on *WorkspaceHandler (not free function) so the panic path can
+// reuse markProvisionFailed and emit the WORKSPACE_PROVISION_FAILED
+// broadcast — without the broadcast the canvas only learns of the
+// failure when the next poll/refresh hits the DB.
+func (h *WorkspaceHandler) logProvisionPanic(workspaceID, mode string) {
r := recover()
if r == nil {
return
}
log.Printf("Provisioner: PANIC during provision goroutine for %s (mode=%s): %v\nstack:\n%s",
workspaceID, mode, r, debug.Stack())
- // Best-effort mark-failed via a fresh context — the provision goroutine's
- // ctx may have been the one panicking. 10s is enough for a single UPDATE.
+ // Fresh context: the provision goroutine's ctx may have been the one
+ // panicking (timeout, cancelled). 10s is enough for the broadcast +
+ // single UPDATE inside markProvisionFailed.
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
- if _, err := db.DB.ExecContext(ctx,
- `UPDATE workspaces SET status='failed', last_sample_error=$2, updated_at=now() WHERE id=$1`,
- workspaceID, fmt.Sprintf("provision panic: %v", r)); err != nil {
- log.Printf("Provisioner: failed to persist panic-failure for %s: %v", workspaceID, err)
- }
+ h.markProvisionFailed(ctx, workspaceID, fmt.Sprintf("provision panic: %v", r), nil)
}
// provisionWorkspace handles async container deployment with timeout.
@@ -64,7 +67,7 @@ func (h *WorkspaceHandler) provisionWorkspaceOpts(workspaceID, templatePath stri
// neither a prepare-failed nor start-failed nor success log line, so an
// operator couldn't tell whether the goroutine ran at all.
log.Printf("Provisioner: goroutine entered for %s (runtime=%s, mode=docker)", workspaceID, payload.Runtime)
- defer logProvisionPanic(workspaceID, "docker")
+ defer h.logProvisionPanic(workspaceID, "docker")
ctx, cancel := context.WithTimeout(context.Background(), provisioner.ProvisionTimeout)
defer cancel()
@@ -687,7 +690,7 @@ func (h *WorkspaceHandler) provisionWorkspaceCP(workspaceID, templatePath string
// unable to distinguish "goroutine never started" from "started but
// returned via an unlogged path."
log.Printf("CPProvisioner: goroutine entered for %s (runtime=%s, mode=cp)", workspaceID, payload.Runtime)
- defer logProvisionPanic(workspaceID, "cp")
+ defer h.logProvisionPanic(workspaceID, "cp")
ctx, cancel := context.WithTimeout(context.Background(), provisioner.ProvisionTimeout)
defer cancel()
diff --git a/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go b/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go
index 2100a49c..2c19e41b 100644
--- a/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go
+++ b/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go
@@ -29,7 +29,9 @@ import (
//
// If the silent-drop class is present in current head code, this test
// fails because either (a) the entry-log count is < 7 (meaning one or
-// more goroutines never started — Go runtime issue), or (b) the
+// more goroutines reached the goroutine boundary but never produced
+// the entry-log line — entry log renamed/removed, or log writer
+// hijacked), or (b) the
// recorder count is < 7 (meaning a goroutine entered but exited before
// reaching cpProv.Start, via some unlogged path).
//
diff --git a/workspace-server/internal/handlers/workspace_provision_panic_test.go b/workspace-server/internal/handlers/workspace_provision_panic_test.go
index 0a287fb3..f7776ee4 100644
--- a/workspace-server/internal/handlers/workspace_provision_panic_test.go
+++ b/workspace-server/internal/handlers/workspace_provision_panic_test.go
@@ -14,28 +14,55 @@ import (
// Pin the issue #2486 contract: a panic inside the provision goroutine must
// (1) not propagate (the deferred recover swallows it), (2) log the panic
// with a stack trace so an operator can see what blew up, and (3) mark the
-// workspace `failed` so the canvas surfaces the failure instead of the row
-// sitting in `provisioning` until the 10-min sweeper.
+// workspace `failed` AND broadcast WORKSPACE_PROVISION_FAILED so the canvas
+// flips the spinner to a failure card immediately — not after the 10-min
+// sweeper.
+//
+// Helper: newPanicTestHandler wires a captureBroadcaster + handler so each
+// test exercises the real markProvisionFailed path. The broadcaster capture
+// is what proves assertion (3) — without it, the panic recovery would mark
+// the row failed in the DB but the canvas wouldn't learn until next refresh.
+
+func newPanicTestHandler() (*WorkspaceHandler, *captureBroadcaster) {
+ cap := &captureBroadcaster{}
+ return NewWorkspaceHandler(cap, nil, "http://localhost:8080", ""), cap
+}
+
+// captureLog swaps log output to a buffer for the test and restores the
+// previous writer on cleanup. Capturing `prev` BEFORE SetOutput is
+// load-bearing — `log.Writer()` evaluated at defer-fire time would
+// return the buffer (not the original writer) and never restore it,
+// poisoning subsequent tests in the package.
+func captureLog(t *testing.T) *bytes.Buffer {
+ t.Helper()
+ var buf bytes.Buffer
+ prev := log.Writer()
+ log.SetOutput(&buf)
+ t.Cleanup(func() { log.SetOutput(prev) })
+ return &buf
+}
func TestLogProvisionPanic_NoOpWhenNoPanic(t *testing.T) {
// Sanity: the deferred recover must be silent when nothing panicked.
// Otherwise every successful provision would emit a spurious panic log.
- var buf bytes.Buffer
- log.SetOutput(&buf)
- defer log.SetOutput(log.Writer())
+ buf := captureLog(t)
+ h, cap := newPanicTestHandler()
func() {
- defer logProvisionPanic("ws-no-panic", "cp")
+ defer h.logProvisionPanic("ws-no-panic", "cp")
// no panic
}()
if buf.Len() != 0 {
t.Fatalf("expected no log output when no panic, got: %q", buf.String())
}
+ if cap.lastData != nil {
+ t.Fatalf("expected no broadcast when no panic, got: %v", cap.lastData)
+ }
}
func TestLogProvisionPanic_RecoversAndMarksFailed(t *testing.T) {
- // Wire a sqlmock so logProvisionPanic's UPDATE has somewhere to land
+ // Wire a sqlmock so markProvisionFailed's UPDATE has somewhere to land
// without needing a real Postgres. The mock asserts the SQL shape +
// args so a future refactor of the persist call doesn't silently
// stop marking the row failed.
@@ -49,13 +76,15 @@ func TestLogProvisionPanic_RecoversAndMarksFailed(t *testing.T) {
db.DB = mockDB
defer func() { db.DB = prevDB }()
- mock.ExpectExec(`UPDATE workspaces SET status='failed'`).
- WithArgs("ws-panic", sqlmock.AnyArg()).
+ // markProvisionFailed issues:
+ // UPDATE workspaces SET status = $3, last_sample_error = $2, updated_at = now() WHERE id = $1
+ // with args (workspaceID, msg, models.StatusFailed).
+ mock.ExpectExec(`UPDATE workspaces SET status`).
+ WithArgs("ws-panic", sqlmock.AnyArg(), sqlmock.AnyArg()).
WillReturnResult(sqlmock.NewResult(0, 1))
- var buf bytes.Buffer
- log.SetOutput(&buf)
- defer log.SetOutput(log.Writer())
+ buf := captureLog(t)
+ h, cap := newPanicTestHandler()
// Exercise: a function that defers logProvisionPanic + then panics.
// The recover MUST swallow the panic — if it propagates, the test
@@ -70,7 +99,7 @@ func TestLogProvisionPanic_RecoversAndMarksFailed(t *testing.T) {
didNotPanic = false
}
}()
- defer logProvisionPanic("ws-panic", "cp")
+ defer h.logProvisionPanic("ws-panic", "cp")
panic("simulated provision panic for #2486 regression")
}()
@@ -90,7 +119,18 @@ func TestLogProvisionPanic_RecoversAndMarksFailed(t *testing.T) {
}
if err := mock.ExpectationsWereMet(); err != nil {
- t.Errorf("sql expectations: %v — UPDATE workspaces … status='failed' was not issued", err)
+ t.Errorf("sql expectations: %v — UPDATE workspaces … status=failed was not issued", err)
+ }
+
+ // Canvas-broadcast assertion: the panic recovery MUST route through
+ // markProvisionFailed, which fires WORKSPACE_PROVISION_FAILED. Without
+ // this, the canvas spinner stays on "provisioning" until the sweeper
+ // or a poll — defeating the immediate-feedback purpose of this gate.
+ if cap.lastData == nil {
+ t.Fatal("expected broadcaster.RecordAndBroadcast to be called by panic recovery, got nil — canvas would not see the failure")
+ }
+ if errMsg, ok := cap.lastData["error"].(string); !ok || !strings.Contains(errMsg, "provision panic:") {
+ t.Errorf("broadcast payload missing/wrong 'error' field; got: %v", cap.lastData)
}
}
@@ -109,21 +149,23 @@ func TestLogProvisionPanic_PersistFailureLogged(t *testing.T) {
db.DB = mockDB
defer func() { db.DB = prevDB }()
- mock.ExpectExec(`UPDATE workspaces SET status='failed'`).
- WithArgs("ws-panic-persist-fail", sqlmock.AnyArg()).
+ mock.ExpectExec(`UPDATE workspaces SET status`).
+ WithArgs("ws-panic-persist-fail", sqlmock.AnyArg(), sqlmock.AnyArg()).
WillReturnError(sql.ErrConnDone)
- var buf bytes.Buffer
- log.SetOutput(&buf)
- defer log.SetOutput(log.Writer())
+ buf := captureLog(t)
+ h, _ := newPanicTestHandler()
func() {
- defer logProvisionPanic("ws-panic-persist-fail", "docker")
+ defer h.logProvisionPanic("ws-panic-persist-fail", "docker")
panic("simulated panic with DB unavailable")
}()
logged := buf.String()
- if !strings.Contains(logged, "failed to persist panic-failure for ws-panic-persist-fail") {
- t.Errorf("expected persist-failure log line; got: %q", logged)
+	// markProvisionFailed logs `markProvisionFailed: db update failed for <workspaceID>: <err>`
+ // when its UPDATE fails. That's the line that proves we surfaced the
+ // persist failure rather than swallowing it.
+ if !strings.Contains(logged, "markProvisionFailed: db update failed for ws-panic-persist-fail") {
+ t.Errorf("expected markProvisionFailed db-update-failure log line; got: %q", logged)
}
}
From 4f64c4366f4d9c11693c953ff61fd7d5e884a11a Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Fri, 1 May 2026 20:03:11 -0700
Subject: [PATCH 52/61] test(provision): swap to concurrent-safe broadcaster in
7-burst harness
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
CI Platform (Go) ran with -race and the concurrent test tripped the
detector: captureBroadcaster (sequential-test stub) writes lastData
unguarded; 7 fan-out goroutines call markProvisionFailed → that stub
concurrently. Local non-race run had hidden it.
Introduce concurrentSafeBroadcaster (mutex-counted) for this single
fan-out test. Sequential tests keep using captureBroadcaster — the
fix is local to the test that creates the goroutines.
Verified ./internal/handlers passes with -race.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
...rkspace_provision_concurrent_repro_test.go | 29 +++++++++++++++++--
1 file changed, 27 insertions(+), 2 deletions(-)
diff --git a/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go b/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go
index 2c19e41b..16473f70 100644
--- a/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go
+++ b/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go
@@ -126,8 +126,13 @@ func TestProvisionWorkspaceCP_ConcurrentBurst_NoSilentDrop(t *testing.T) {
// goroutine entered AND reached the recorded Start() call.
rec := &recordingCPProv{startErr: fmt.Errorf("simulated CP rejection")}
- cap := &captureBroadcaster{}
- handler := NewWorkspaceHandler(cap, nil, "http://localhost:8080", t.TempDir())
+ // Concurrent-safe broadcaster — captureBroadcaster (used by sequential
+ // tests in workspace_provision_test.go) writes lastData unguarded.
+ // Under -race + 7 fan-out goroutines that's a real data race; this
+ // stub serializes via mutex and only counts (we don't need the
+ // payload for any assertion below).
+ bcast := &concurrentSafeBroadcaster{}
+ handler := NewWorkspaceHandler(bcast, nil, "http://localhost:8080", t.TempDir())
handler.SetCPProvisioner(rec)
var wg sync.WaitGroup
@@ -203,6 +208,26 @@ type safeWriter struct {
mu *sync.Mutex
}
+// concurrentSafeBroadcaster is a thread-safe events.EventEmitter stub
+// for the 7-goroutine fan-out test. captureBroadcaster (the canonical
+// sequential-test stub in workspace_provision_test.go) writes its
+// lastData field without synchronization — under -race that's a true
+// data race when 7 markProvisionFailed calls run concurrently. This
+// stub only counts (no payload retention) and serializes via mutex.
+type concurrentSafeBroadcaster struct {
+ mu sync.Mutex
+ count int
+}
+
+func (b *concurrentSafeBroadcaster) BroadcastOnly(_ string, _ string, _ interface{}) {}
+
+func (b *concurrentSafeBroadcaster) RecordAndBroadcast(_ context.Context, _, _ string, _ interface{}) error {
+ b.mu.Lock()
+ b.count++
+ b.mu.Unlock()
+ return nil
+}
+
func (w *safeWriter) Write(p []byte) (int, error) {
w.mu.Lock()
defer w.mu.Unlock()
From b54968878a5c0332526fc5df77ff26efcce84ab7 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Fri, 1 May 2026 20:06:06 -0700
Subject: [PATCH 53/61] docs(internal): refresh runtime-package mirror policy +
parity matrix + dead-link fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
- workspace-runtime-package.md: add explicit "Where to make changes"
section documenting the mirror-only policy on
Molecule-AI/molecule-ai-workspace-runtime — direct PRs are auto-rejected
by mirror-guard CI; staging push regenerates both the mirror and the
PyPI wheel via .github/workflows/publish-runtime.yml.
- infra/workspace-terminal.md: replace dead molecule-core#1528 reference
(repo renamed to molecule-monorepo, no longer accepting issues at the
old name) with a forward-pointer to monorepo + molecule-controlplane
issue trackers.
- architecture/backends.md: bump audit date to 2026-05-02 and add rows
for channel envelope enrichment (#2471), chat_history MCP tool
(#2474), /activity before_ts paging (#2476), /activity peer_id filter
(#2472), runtime_wedge smoke gate (#2473 + #2475), and the canvas-E2E
state-file requirement (#2327).
Co-Authored-By: Claude Opus 4.7 (1M context)
---
docs/architecture/backends.md | 11 ++++++++++-
docs/infra/workspace-terminal.md | 6 +++++-
docs/workspace-runtime-package.md | 23 +++++++++++++++++++++++
3 files changed, 38 insertions(+), 2 deletions(-)
diff --git a/docs/architecture/backends.md b/docs/architecture/backends.md
index 2d8b25c0..ce01b247 100644
--- a/docs/architecture/backends.md
+++ b/docs/architecture/backends.md
@@ -2,7 +2,7 @@
**Status:** living document — update when you ship a feature that touches one backend.
**Owner:** workspace-server + controlplane teams.
-**Last audit:** 2026-04-23 (Claude agent, PR #TBD).
+**Last audit:** 2026-05-02 (Claude agent, PR #TBD).
## Why this exists
@@ -37,6 +37,12 @@ This document is the canonical matrix. If you are landing a workspace-facing fea
| **A2A proxy** | | | | |
| Forward | `a2a_proxy.go` | `127.0.0.1:` | EC2 private IP inside tenant VPC | ✅ parity |
| Liveness | `a2a_proxy_helpers.go` | `provisioner.IsRunning()` | `cpProv.IsRunning()` (DB-backed) | ✅ parity |
+| Channel envelope enrichment (peer_name / peer_role / agent_card_url) | `a2a_proxy.go` + workspace-runtime channel emitter (PR #2471) | inbox row carries enriched fields | inbox row carries enriched fields | ✅ parity as of 2026-05-02 |
+| **MCP tools (a2a)** | | | | |
+| `chat_history` — fetch prior turns with a peer | `mcp_server.go` + workspace-runtime `a2a_mcp` (PR #2474) | runtime-served, backend-agnostic | runtime-served, backend-agnostic | ✅ parity as of 2026-05-02 |
+| **Activity API** | | | | |
+| `before_ts` paging on `/workspaces/:id/activity` | `activity.go` (PR #2476) | DB-driven | DB-driven | ✅ parity as of 2026-05-02 |
+| `peer_id` filter on `/workspaces/:id/activity` | `activity.go` (PR #2472) | DB-driven | DB-driven | ✅ parity as of 2026-05-02 |
| **Config / template injection** | | | | |
| Template copy at provision | `provisioner.go:553-648` | host walk → tar → `CopyToContainer(/configs)` | CP user-data bakes template into bootstrap script | ⚠️ divergent — sync (docker) vs async (EC2) |
| Runtime config hot-reload | `templates.go` + handlers | no hot-reload — restart required | no hot-reload — restart required | ✅ parity (both require restart; acceptable) |
@@ -45,6 +51,9 @@ This document is the canonical matrix. If you are landing a workspace-facing fea
| **Bootstrap signals** | | | | |
| Ready detection | registry `/registry/register` | container heartbeat | tenant heartbeat + boot-event phone-home (CP `bootevents` table + `wait_platform_health=ok`) | ✅ parity as of molecule-controlplane#235 |
| Console / log output | `workspace_bootstrap.go` | `docker logs` | `ec2:GetConsoleOutput` via CP proxy | 🟡 ec2-only (docker has `docker logs` directly; no unified API) |
+| `runtime_wedge` post-`execute()` smoke gate | workspace-runtime `smoke_mode.py` (PRs #2473 + #2475) | runtime-served, surfaces SDK-init wedges to wheel-smoke + container start | runtime-served, surfaces SDK-init wedges to wheel-smoke + container start | ✅ parity as of 2026-05-02 |
+| **Test infrastructure** | | | | |
+| Canvas-E2E `.playwright-staging-state.json` written before any CP call | `tools/e2e-staging-setup` (PR #2327, 2026-04-30) | n/a — staging-only safety net | required so workflow safety-net can find slug; pattern-sweeping by date prefix poisons concurrent runs | ✅ enforced (staging E2E) |
| **Orphan cleanup** | | | | |
| Detect + terminate stale | `healthsweep.go` + CP `DeprovisionInstance` | Docker daemon scan | CP OrgID-tag cascade (molecule-controlplane#234) | ✅ parity as of 2026-04-23 |
| **Health / budget / schedules** | | | | |
diff --git a/docs/infra/workspace-terminal.md b/docs/infra/workspace-terminal.md
index 955d5396..84e120e3 100644
--- a/docs/infra/workspace-terminal.md
+++ b/docs/infra/workspace-terminal.md
@@ -16,7 +16,11 @@ workspace container running on it) over an [EC2 Instance Connect
Endpoint](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-instance-connect-setup-ec2-instance-connect-endpoint.html).
End users see a terminal; no direct public SSH ingress is required.
-Tracking: [molecule-core#1528](https://github.com/Molecule-AI/molecule-core/issues/1528) (resolved 2026-04-22).
+Tracking: originally `molecule-core#1528` (resolved 2026-04-22). The
+`molecule-core` repo has since been renamed to `molecule-monorepo` and no
+longer accepts new issues under the old name; future terminal work is
+tracked in `molecule-monorepo` issues (workspace-server scope) and in
+`molecule-controlplane` issues for the EIC / per-tenant SG path.
## Where things are
diff --git a/docs/workspace-runtime-package.md b/docs/workspace-runtime-package.md
index 73c56d38..1b2927e2 100644
--- a/docs/workspace-runtime-package.md
+++ b/docs/workspace-runtime-package.md
@@ -17,6 +17,29 @@ distinct from the PyPI package) is no longer the source-of-truth and should
be treated as a publish artifact only. It can be archived or used as a
read-only mirror.
+## Where to make changes
+
+**All runtime edits land in `molecule-monorepo/workspace/`. Period.**
+
+The GitHub repo `Molecule-AI/molecule-ai-workspace-runtime` is **mirror-only**.
+It exists so external consumers (template repos, downstream operators) have a
+git-cloneable artifact that mirrors the PyPI wheel — nothing more.
+
+- **Direct PRs against `molecule-ai-workspace-runtime` are auto-rejected by
+ the `mirror-guard` CI check.** The check fails any push that did not come
+ from the publish pipeline. There is no opt-out — file the change against
+ `molecule-monorepo/workspace/` instead.
+- **The mirror + the PyPI wheel both auto-regenerate on every push to
+ `staging`** via `.github/workflows/publish-runtime.yml` (which calls
+ `scripts/build_runtime_package.py`, builds wheel + sdist, smoke-imports,
+ uploads to PyPI via Trusted Publisher, and force-pushes the rewritten tree
+ to the mirror repo). You never touch the mirror by hand.
+
+If you have an old local clone of the mirror and try to push a fix to it
+directly, expect a CI failure with a message pointing you here. Re-open the
+change against `molecule-monorepo/workspace/` and let the publish workflow
+do the rest.
+
## Why this shape
The 8 workspace template repos (claude-code, langgraph, hermes, etc.) each
From 82cc3315171ee2ab61be31587750b5192534aade Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Fri, 1 May 2026 20:11:11 -0700
Subject: [PATCH 54/61] test(provision): harden panic tests with re-raise guard
+ assert broadcast count
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Post-merge follow-up to PR #2487 review feedback:
1. guardAgainstReraise(fn) helper around every panic-test exercise. The
original RecoversAndMarksFailed had its own outer recover() to detect
re-raise; NoOpWhenNoPanic and PersistFailureLogged didn't. If a future
regression makes logProvisionPanic re-raise, those two would have
crashed the test process (taking sibling tests down) instead of
reporting a clean failure. Now all three use the shared guard.
2. Concurrent repro now asserts bcast.count == 7 — the new
concurrentSafeBroadcaster's count field was added in the race fix
but not actually consumed. Cross-checks the existing recorder-set
assertion from a different angle: a goroutine could in principle
reach cpProv.Start (recorder hits) but then lose its
WORKSPACE_PROVISION_FAILED broadcast on the failure path. Pinning
both rules out that silent-drop variant for the canvas-broadcast
contract specifically.
3. Comment on captureLog noting log.SetOutput is process-global and
incompatible with t.Parallel() — preempts a future footgun if
someone parallelizes the panic suite.
Verified: all four tests pass under -race; full handlers + db packages
green under -race.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
...rkspace_provision_concurrent_repro_test.go | 16 ++++++
.../workspace_provision_panic_test.go | 53 ++++++++++++-------
2 files changed, 50 insertions(+), 19 deletions(-)
diff --git a/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go b/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go
index 16473f70..e12b8fa8 100644
--- a/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go
+++ b/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go
@@ -189,6 +189,22 @@ func TestProvisionWorkspaceCP_ConcurrentBurst_NoSilentDrop(t *testing.T) {
}
}
+ // Assertion 4: every goroutine's failure path called RecordAndBroadcast
+ // exactly once (via h.markProvisionFailed inside provisionWorkspaceCP's
+ // "start failed" arm). Cross-checks Assertion 2 from a different angle
+ // — if a goroutine reaches Start() but then loses its WORKSPACE_
+ // PROVISION_FAILED broadcast, the canvas spinner sticks on
+ // "provisioning" until the sweeper. That regression class is what
+ // drove making logProvisionPanic a method on *WorkspaceHandler — so
+ // it's worth pinning here too.
+ bcast.mu.Lock()
+ bcastCount := bcast.count
+ bcast.mu.Unlock()
+ if bcastCount != numWorkspaces {
+ t.Errorf("broadcaster saw %d RecordAndBroadcast calls, want %d. SILENT-DROP CLASS: a goroutine reached cpProv.Start but never broadcast its failure.",
+ bcastCount, numWorkspaces)
+ }
+
if err := mock.ExpectationsWereMet(); err != nil {
// Soft-fail: under concurrency some queries may have been
// re-ordered relative to the (non-strict) expectation set,
diff --git a/workspace-server/internal/handlers/workspace_provision_panic_test.go b/workspace-server/internal/handlers/workspace_provision_panic_test.go
index f7776ee4..d9705f30 100644
--- a/workspace-server/internal/handlers/workspace_provision_panic_test.go
+++ b/workspace-server/internal/handlers/workspace_provision_panic_test.go
@@ -33,6 +33,10 @@ func newPanicTestHandler() (*WorkspaceHandler, *captureBroadcaster) {
// load-bearing — `log.Writer()` evaluated at defer-fire time would
// return the buffer (not the original writer) and never restore it,
// poisoning subsequent tests in the package.
+//
+// log.SetOutput is process-global: do NOT call this from a test that
+// uses t.Parallel() or two captures will race + clobber. The panic
+// tests below are intentionally non-parallel for this reason.
func captureLog(t *testing.T) *bytes.Buffer {
t.Helper()
var buf bytes.Buffer
@@ -42,16 +46,35 @@ func captureLog(t *testing.T) *bytes.Buffer {
return &buf
}
+// guardAgainstReraise wraps a function in a recover-arm that flips the
+// returned bool to false if anything propagates past `defer
+// h.logProvisionPanic(...)`. Used in every panic test (not just
+// RecoversAndMarksFailed) so a future regression that re-raises from
+// the recovery path surfaces as a clean test failure, not a process
+// abort that crashes sibling tests.
+func guardAgainstReraise(fn func()) (didNotPanic bool) {
+ didNotPanic = true
+ defer func() {
+ if r := recover(); r != nil {
+ didNotPanic = false
+ }
+ }()
+ fn()
+ return
+}
+
func TestLogProvisionPanic_NoOpWhenNoPanic(t *testing.T) {
// Sanity: the deferred recover must be silent when nothing panicked.
// Otherwise every successful provision would emit a spurious panic log.
buf := captureLog(t)
h, cap := newPanicTestHandler()
- func() {
+ if !guardAgainstReraise(func() {
defer h.logProvisionPanic("ws-no-panic", "cp")
// no panic
- }()
+ }) {
+ t.Fatal("logProvisionPanic re-raised on the no-panic path — recover() returned non-nil for a goroutine that didn't panic")
+ }
if buf.Len() != 0 {
t.Fatalf("expected no log output when no panic, got: %q", buf.String())
@@ -87,23 +110,13 @@ func TestLogProvisionPanic_RecoversAndMarksFailed(t *testing.T) {
h, cap := newPanicTestHandler()
// Exercise: a function that defers logProvisionPanic + then panics.
- // The recover MUST swallow the panic — if it propagates, the test
- // process crashes and the panic message bubbles up as a Go test
- // failure rather than the assertion below.
- didNotPanic := true
- func() {
- defer func() {
- // If logProvisionPanic re-raised, this catches it for the
- // test. We assert below that it did NOT re-raise.
- if r := recover(); r != nil {
- didNotPanic = false
- }
- }()
+ // The recover MUST swallow the panic — if it propagates,
+ // guardAgainstReraise catches it instead of letting the test
+ // process abort.
+ if !guardAgainstReraise(func() {
defer h.logProvisionPanic("ws-panic", "cp")
panic("simulated provision panic for #2486 regression")
- }()
-
- if !didNotPanic {
+ }) {
t.Fatal("logProvisionPanic re-raised the panic — the recover() arm did not swallow it")
}
@@ -156,10 +169,12 @@ func TestLogProvisionPanic_PersistFailureLogged(t *testing.T) {
buf := captureLog(t)
h, _ := newPanicTestHandler()
- func() {
+ if !guardAgainstReraise(func() {
defer h.logProvisionPanic("ws-panic-persist-fail", "docker")
panic("simulated panic with DB unavailable")
- }()
+ }) {
+ t.Fatal("logProvisionPanic re-raised when the persist-failure path was exercised — recover() arm did not swallow")
+ }
logged := buf.String()
// markProvisionFailed logs `markProvisionFailed: db update failed for : `
From 5cca46284388f529cd72daf21213feaa8e85b495 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Fri, 1 May 2026 20:12:49 -0700
Subject: [PATCH 55/61] harness(phase-0): sudo-free Host-header path +
chat_history + envelope replays
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Three changes that bring the local harness from "covers what staging
covers minus the SaaS topology" to "exercises every surface we shipped
this session against the prod-shape Dockerfile.tenant image."
1. Drop the /etc/hosts requirement.
Replays previously needed `127.0.0.1 harness-tenant.localhost` in
/etc/hosts to resolve the cf-proxy. That gated the harness behind a
sudo step on every fresh dev box and CI runner. The cf-proxy nginx
already routes by Host header (matches production CF tunnel: URL is
public, Host carries tenant identity), so the no-sudo path is to
target loopback :8080 with `Host: harness-tenant.localhost` set as
a header.
New `tests/harness/_curl.sh` centralises this — curl_anon /
curl_admin / curl_workspace / psql_exec wrappers all set the Host
+ auth headers automatically. seed.sh, peer-discovery-404.sh,
buildinfo-stale-image.sh updated to source it. Legacy /etc/hosts
users still work via env-var override.
2. Fix the seed.sh FK regression that blocked DB-side replays.
POST /workspaces ignores any `id` in the request body and generates
one server-side. seed.sh was minting client-side UUIDs that never
reached the workspaces table, so any replay that INSERTed into
activity_logs (FK-constrained on workspace_id) failed with the
workspace-not-found error. Capture the returned id from the
response instead.
3. Two new replays cover the surfaces shipped this session.
chat-history.sh — exercises the full SaaS-shape wire that PR #2472
(peer_id filter), #2474 (chat_history client tool), and #2476
(before_ts paging) ride on. 8 phases / 16 assertions: peer_id filter,
limit cap, before_ts paging, OR-clause covering both source_id and
target_id, malformed peer_id 400, malformed before_ts 400, URL-encoded
SQLi-shape rejection. Verified PASS against the live harness.
channel-envelope-trust-boundary.sh — exercises PR #2471 + #2481 by
importing from `molecule_runtime.*` (the wheel-rewritten path), so
it catches the case where the wheel build drops a fix that unit
tests on local source still pass.
5 phases / 11 assertions: malicious peer_id scrubbed from envelope,
agent_card_url omitted on validation failure, XML-injection bytes
scrubbed, valid UUID preserved, _agent_card_url_for direct gate.
Verified PASS against published wheel 0.1.79.
run-all-replays.sh auto-discovers — no registration needed. Full
lifecycle (boot → seed → 4 replays → teardown) runs clean.
Roadmap section updated to reflect Phase 1 (this PR) → Phase 2
(multi-tenant + CI gate) → Phase 3 (real CP) → Phase 4 (Miniflare +
LocalStack + traffic replay).
Co-Authored-By: Claude Opus 4.7 (1M context)
---
.gitignore | 1 +
tests/harness/.gitignore | 2 +
tests/harness/README.md | 39 ++--
tests/harness/_curl.sh | 82 ++++++++
.../harness/replays/buildinfo-stale-image.sh | 6 +-
.../channel-envelope-trust-boundary.sh | 182 ++++++++++++++++++
tests/harness/replays/chat-history.sh | 175 +++++++++++++++++
tests/harness/replays/peer-discovery-404.sh | 10 +-
tests/harness/seed.sh | 55 +++---
tests/harness/up.sh | 21 +-
10 files changed, 513 insertions(+), 60 deletions(-)
create mode 100644 tests/harness/.gitignore
create mode 100644 tests/harness/_curl.sh
create mode 100755 tests/harness/replays/channel-envelope-trust-boundary.sh
create mode 100755 tests/harness/replays/chat-history.sh
diff --git a/.gitignore b/.gitignore
index 05da25ee..3b6e7451 100644
--- a/.gitignore
+++ b/.gitignore
@@ -146,3 +146,4 @@ backups/
*-temp.txt
/test-pmm-*.txt
/tick-reflections-*.md
+tests/harness/cp-stub/cp-stub
diff --git a/tests/harness/.gitignore b/tests/harness/.gitignore
new file mode 100644
index 00000000..193e2b48
--- /dev/null
+++ b/tests/harness/.gitignore
@@ -0,0 +1,2 @@
+# Harness ephemeral state. Re-generated by ./seed.sh on every boot.
+.seed.env
diff --git a/tests/harness/README.md b/tests/harness/README.md
index 1306d8ae..bf0ad93e 100644
--- a/tests/harness/README.md
+++ b/tests/harness/README.md
@@ -1,12 +1,20 @@
# Production-shape local harness
The harness brings up the SaaS tenant topology on localhost using the
-same `Dockerfile.tenant` image that ships to production. Tests run
-against `http://harness-tenant.localhost:8080` and exercise the
-SAME code path a real tenant takes — including TenantGuard middleware,
+same `Dockerfile.tenant` image that ships to production. Tests target
+the cf-proxy on `http://localhost:8080` and pass the tenant identity
+via a `Host: harness-tenant.localhost` header — exactly the way
+production CF tunnel routes by Host header. The cf-proxy nginx then
+rewrites headers and proxies to the tenant container, exercising the
+SAME code path a real tenant takes including TenantGuard middleware,
the `/cp/*` reverse proxy, the canvas reverse proxy, and a
Cloudflare-tunnel-shape header rewrite layer.
+`tests/harness/_curl.sh` is the helper sourced by every replay —
+provides `curl_anon`, `curl_admin`, `curl_workspace`, and `psql_exec`
+wrappers that set the right Host + auth headers automatically. New
+replays should source it rather than rolling their own curl.
+
## Why this exists
Local `go run ./cmd/server` skips:
@@ -53,15 +61,18 @@ KEEP_UP=1 ./run-all-replays.sh # leave harness up for debugging
REBUILD=1 ./run-all-replays.sh # rebuild images before booting
```
-First-time setup needs an `/etc/hosts` entry so `harness-tenant.localhost`
-resolves to the local cf-proxy:
+No `/etc/hosts` edit required — replays use the cf-proxy's loopback
+port and pass `Host: harness-tenant.localhost` as a header (`_curl.sh`
+handles this automatically). This matches how production CF tunnel
+routes: the URL is the public CF endpoint, the Host header carries the
+per-tenant identity. Quick check:
```bash
-echo "127.0.0.1 harness-tenant.localhost" | sudo tee -a /etc/hosts
+curl -H "Host: harness-tenant.localhost" http://localhost:8080/health
```
-(macOS resolves `*.localhost` automatically in some setups; Linux
-typically does not.)
+(If you have a legacy `/etc/hosts` entry from older docs, it still
+works — `BASE` and `TENANT_HOST` both honor env-var overrides.)
## Replay scripts
@@ -74,6 +85,8 @@ green" — the script becomes the regression gate that closes that gap.
|--------|--------|----------------|
| `peer-discovery-404.sh` | #2397 | tool_list_peers surfaces the actual reason instead of "may be isolated" |
| `buildinfo-stale-image.sh` | #2395 | GIT_SHA reaches the binary; verify-step comparison logic works |
+| `chat-history.sh` | #2472 + #2474 + #2476 | `peer_id` filter (incl. OR over source/target) + `before_ts` paging + UUID/RFC3339 trust boundary on the activity route |
+| `channel-envelope-trust-boundary.sh` | #2471 + #2481 | published wheel scrubs malformed `peer_id` from the channel envelope and from `agent_card_url` (path-traversal + XML-attr injection) |
To add a new replay:
1. Drop a script under `replays/` named after the issue.
@@ -111,9 +124,7 @@ its mandate of "exercise the tenant binary in production-shape topology."
## Roadmap
-- **Phase 1 (shipped):** harness + cp-stub + cf-proxy + 2 replays + `run-all-replays.sh` runner.
-- **Phase 2:** convert `tests/e2e/test_api.sh` to run against the
- harness instead of localhost. Make harness-based E2E a required CI
- check (a workflow that invokes `run-all-replays.sh` on every PR).
-- **Phase 3:** config-coherence lint that diffs harness env list
- against production CP's env list, fails CI on drift.
+- **Phase 1 (shipped):** harness + cp-stub + cf-proxy + 4 replays + `run-all-replays.sh` runner. No-sudo `Host`-header path via `_curl.sh`. Per-replay psql seeding for tests that need DB-side fixtures.
+- **Phase 2 (in flight):** multi-tenant — second `tenant-beta` service in compose, second Postgres database, replays for cross-tenant A2A + TenantGuard isolation. Convert `tests/e2e/test_api.sh` to target the harness instead of localhost. Make harness-based E2E a required CI check (a workflow that invokes `run-all-replays.sh` on every PR via the self-hosted Mac runner).
+- **Phase 3:** replace `cp-stub/` with the real `molecule-controlplane` Docker build. Add a config-coherence lint that diffs harness env list against production CP's env list and fails CI on drift.
+- **Phase 4 (long-term):** Miniflare in front of cf-proxy for real CF emulation (WAF, BotID, rate-limit, cf-tunnel headers). LocalStack for the EC2 provisioner. Anonymized prod-traffic recording/replay for SaaS-scale regression detection.
diff --git a/tests/harness/_curl.sh b/tests/harness/_curl.sh
new file mode 100644
index 00000000..6a32ab5d
--- /dev/null
+++ b/tests/harness/_curl.sh
@@ -0,0 +1,82 @@
+# Sourceable helper for harness replays. Centralises the
+# curl-against-cf-proxy pattern so scripts don't depend on /etc/hosts.
+#
+# Production CF tunnel routes by Host header, not by DNS — the request
+# URL is to a public CF endpoint and the Host header carries the
+# per-tenant identity. We replay the same shape locally:
+#
+# curl -H "Host: harness-tenant.localhost" http://localhost:8080/health
+#
+# This matches what cf-proxy/nginx.conf already routes (`server_name
+# *.localhost localhost`) and avoids the macOS /etc/hosts requirement
+# that previously gated the harness behind a sudo step.
+#
+# Backwards-compatible: if /etc/hosts resolves harness-tenant.localhost
+# (the legacy path), the bare URL still works because the helper falls
+# back to that. New scripts SHOULD use the helper functions.
+#
+# Usage:
+# HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# source "$HERE/../_curl.sh" # from replays/.sh
+# curl_admin "$BASE/health"
+# curl_anon "$BASE/health"
+
+# Bind to the cf-proxy's loopback port — the proxy front-doors every
+# tenant and routes by Host header, exactly like production's CF tunnel.
+: "${BASE:=http://localhost:8080}"
+: "${TENANT_HOST:=harness-tenant.localhost}"
+: "${ADMIN_TOKEN:=harness-admin-token}"
+: "${ORG_ID:=harness-org}"
+
+# Anonymous request — only Host header (no auth). Use for /health,
+# /buildinfo, and any other route that's intentionally public.
+curl_anon() {
+ curl -sS -H "Host: ${TENANT_HOST}" "$@"
+}
+
+# Admin-token request — full SaaS auth shape. Sets the bearer token,
+# tenant org header (activates TenantGuard middleware), and a default
+# JSON Content-Type. Replays admin paths exactly the way CP does in
+# production, so any TenantGuard / strict-auth bug surfaces locally.
+curl_admin() {
+ curl -sS \
+ -H "Host: ${TENANT_HOST}" \
+ -H "Authorization: Bearer ${ADMIN_TOKEN}" \
+ -H "X-Molecule-Org-Id: ${ORG_ID}" \
+ -H "Content-Type: application/json" \
+ "$@"
+}
+
+# Workspace-scoped request — uses a per-workspace bearer minted from
+# /admin/workspaces/:id/test-token. The platform's auth.go middleware
+# accepts this bearer for the workspace's own routes, so this is the
+# right shape for replays that exercise an in-workspace tool calling
+# back to the platform (chat_history, list_peers, etc).
+#
+# Caller must export WORKSPACE_TOKEN before invoking.
+curl_workspace() {
+ : "${WORKSPACE_TOKEN:?WORKSPACE_TOKEN must be set — mint via /admin/workspaces/:id/test-token}"
+ curl -sS \
+ -H "Host: ${TENANT_HOST}" \
+ -H "Authorization: Bearer ${WORKSPACE_TOKEN}" \
+ -H "X-Molecule-Org-Id: ${ORG_ID}" \
+ -H "Content-Type: application/json" \
+ "$@"
+}
+
+# Direct postgres exec — for replays that need to seed activity_logs
+# rows or read DB state that has no public HTTP route. Wraps the
+# `docker compose exec` pattern so replays can stay shell-only.
+#
+# SECRETS_ENCRYPTION_KEY is set to a placeholder so compose's `:?must
+# be set` interpolation guard (which gates running the harness without
+# up.sh) doesn't trip on `exec` — exec only reaches an already-running
+# service so the env var is irrelevant, but compose still validates
+# the file. The placeholder is never written anywhere or used by any
+# service.
+psql_exec() {
+ SECRETS_ENCRYPTION_KEY="${SECRETS_ENCRYPTION_KEY:-exec-placeholder}" \
+ docker compose -f "${HARNESS_COMPOSE:-$(dirname "${BASH_SOURCE[0]}")/compose.yml}" \
+ exec -T postgres \
+ psql -U harness -d molecule -At "$@"
+}
diff --git a/tests/harness/replays/buildinfo-stale-image.sh b/tests/harness/replays/buildinfo-stale-image.sh
index 9d9be053..af6cd497 100755
--- a/tests/harness/replays/buildinfo-stale-image.sh
+++ b/tests/harness/replays/buildinfo-stale-image.sh
@@ -22,12 +22,12 @@
set -euo pipefail
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
HARNESS_ROOT="$(dirname "$HERE")"
-
-BASE="${BASE:-http://harness-tenant.localhost:8080}"
+# shellcheck source=../_curl.sh
+source "$HARNESS_ROOT/_curl.sh"
# 1. Confirm /buildinfo wire shape — same shape the workflow's jq lookup expects.
echo "[replay] curl $BASE/buildinfo ..."
-BUILD_JSON=$(curl -sS "$BASE/buildinfo")
+BUILD_JSON=$(curl_anon "$BASE/buildinfo")
echo "[replay] $BUILD_JSON"
ACTUAL_SHA=$(echo "$BUILD_JSON" | jq -r '.git_sha // ""')
diff --git a/tests/harness/replays/channel-envelope-trust-boundary.sh b/tests/harness/replays/channel-envelope-trust-boundary.sh
new file mode 100755
index 00000000..550def4c
--- /dev/null
+++ b/tests/harness/replays/channel-envelope-trust-boundary.sh
@@ -0,0 +1,182 @@
+#!/usr/bin/env bash
+# Replay for the channel envelope peer_id trust-boundary fix
+# (PR #2481, follow-up to PR #2471). Verifies that the PUBLISHED wheel
+# installed on this machine — not local source — gates malformed peer_id
+# at both the envelope builder and the agent_card_url builder.
+#
+# Why this matters:
+# - Unit tests in workspace/tests/ run against local source. They
+# prove the fix works in source. They DO NOT prove the published
+# wheel contains the fix.
+# - The wheel rewriter (scripts/build_runtime_package.py) renames
+# symbols + paths. Any rewrite drift could silently strip the
+# guard from the shipped artifact.
+# - This replay imports from `molecule_runtime.a2a_mcp_server` (the
+# wheel-rewritten path), exercises the actual published code, and
+# asserts the envelope shape. If the wheel build ever ships without
+# the guard, this fails — even if unit tests on local source pass.
+#
+# Phases:
+# A. Confirm an installed molecule-runtime version that contains the
+# #2481 fix (>= 0.1.78).
+# B. Call `_build_channel_notification` with peer_id="../../foo" and
+# assert (1) meta["peer_id"] == "", (2) no agent_card_url field,
+# (3) no peer_name/peer_role.
+# C. Symmetric case: peer_id with embedded XML-attribute injection
+# bytes — assert the same scrubbing.
+# D. Happy path: a valid UUID peer_id is preserved (proves we didn't
+# regress legitimate enrichment).
+# E. Direct check on the URL builder — `_agent_card_url_for("../../foo")`
+# must return "" and never an unsanitised URL.
+
+set -euo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+HARNESS_ROOT="$(dirname "$HERE")"
+cd "$HARNESS_ROOT"
+# shellcheck source=../_curl.sh
+source "$HARNESS_ROOT/_curl.sh"
+
+PASS=0
+FAIL=0
+
+assert() {
+ local desc="$1" expected="$2" actual="$3"
+ if [ "$expected" = "$actual" ]; then
+ printf " PASS %s\n" "$desc"
+ PASS=$((PASS + 1))
+ else
+ printf " FAIL %s\n expected: %s\n got : %s\n" "$desc" "$expected" "$actual" >&2
+ FAIL=$((FAIL + 1))
+ fi
+}
+
+# ─── Phase A: wheel version contains the fix ───────────────────────────
+echo "[replay] A. confirming installed molecule-ai-workspace-runtime contains #2481..."
+INSTALLED=$(pip3 show molecule-ai-workspace-runtime 2>/dev/null | awk -F': ' '/^Version:/ {print $2}')
+if [ -z "$INSTALLED" ]; then
+ echo "[replay] FAIL A: molecule-ai-workspace-runtime not installed."
+ echo " Install: pip3 install molecule-ai-workspace-runtime"
+ exit 2
+fi
+echo "[replay] installed version: $INSTALLED"
+
+# 0.1.78 is the first published version after #2481 merged to staging.
+# Compare via packaging.version (PEP 440 ordering) so the check works
+# across patch bumps without sed fragility.
+HAS_FIX=$(python3 -c "
+from packaging.version import parse
+print('yes' if parse('$INSTALLED') >= parse('0.1.78') else 'no')
+" 2>/dev/null || echo "unknown")
+if [ "$HAS_FIX" != "yes" ]; then
+ echo "[replay] FAIL A: installed $INSTALLED < 0.1.78 (the version that shipped the #2481 fix)."
+ echo " Upgrade: pip3 install --upgrade molecule-ai-workspace-runtime"
+ exit 2
+fi
+echo "[replay] ✓ contains #2481 trust-boundary fix"
+
+# ─── Phase B-E: in-process assertions against the installed wheel ──────
+# We don't need WORKSPACE_ID/PLATFORM_URL/MOLECULE_WORKSPACE_TOKEN to
+# import the module — the env validation only fires at console-script
+# entry. We use molecule_runtime.* (the wheel-rewritten import path)
+# rather than workspace.a2a_mcp_server (local source) so this exercises
+# the SHIPPED code.
+echo ""
+echo "[replay] B-E. exercising _build_channel_notification + _agent_card_url_for from the installed wheel..."
+
+OUT=$(WORKSPACE_ID=00000000-0000-0000-0000-000000000000 \
+ PLATFORM_URL=http://localhost:8080 \
+ MOLECULE_WORKSPACE_TOKEN=stub \
+ MOLECULE_MCP_DISABLE_HEARTBEAT=1 \
+ python3 - <<'PYEOF'
+import json
+import sys
+
+from molecule_runtime.a2a_mcp_server import _build_channel_notification
+from molecule_runtime.a2a_client import _agent_card_url_for
+
+results = []
+
+def emit(name, value):
+ results.append({"name": name, "value": value})
+
+# ── B: path-traversal peer_id stripped from envelope ──
+payload = _build_channel_notification({
+ "peer_id": "../../foo",
+ "kind": "peer_agent",
+ "text": "redirect-attempt",
+ "activity_id": "act-1",
+ "method": "message/send",
+ "created_at": "2026-05-01T00:00:00Z",
+})
+meta = payload["params"]["meta"]
+emit("B1_peer_id_scrubbed", meta.get("peer_id", ""))
+emit("B2_agent_card_url_absent", "absent" if "agent_card_url" not in meta else meta["agent_card_url"])
+emit("B3_peer_name_absent", "absent" if "peer_name" not in meta else meta["peer_name"])
+emit("B4_peer_role_absent", "absent" if "peer_role" not in meta else meta["peer_role"])
+
+# ── C: XML-attribute-injection-shape peer_id ──
+payload = _build_channel_notification({
+ "peer_id": 'aaa" onclick="alert(1)',
+ "kind": "peer_agent",
+ "text": "xss",
+})
+meta = payload["params"]["meta"]
+emit("C1_peer_id_scrubbed", meta.get("peer_id", ""))
+emit("C2_agent_card_url_absent", "absent" if "agent_card_url" not in meta else "leaked")
+
+# ── D: legitimate UUID is preserved ──
+valid_uuid = "11111111-2222-3333-4444-555555555555"
+payload = _build_channel_notification({
+ "peer_id": valid_uuid,
+ "kind": "peer_agent",
+ "text": "legit",
+})
+meta = payload["params"]["meta"]
+emit("D1_peer_id_preserved", meta.get("peer_id", ""))
+# agent_card_url IS present (we don't gate the URL itself on whether the registry is reachable)
+emit("D2_agent_card_url_present", "yes" if meta.get("agent_card_url", "").endswith(valid_uuid) else "no")
+
+# ── E: direct URL builder gate ──
+emit("E1_url_builder_strips_traversal", _agent_card_url_for("../../foo"))
+emit("E2_url_builder_strips_xml", _agent_card_url_for('a" onclick="x'))
+emit("E3_url_builder_accepts_uuid_endswith", "yes" if _agent_card_url_for(valid_uuid).endswith(valid_uuid) else "no")
+
+print(json.dumps(results))
+PYEOF
+)
+
+# Parse and assert each result.
+echo "$OUT" | python3 -c "
+import json, sys
+results = json.loads(sys.stdin.read())
+for r in results:
+ print(f\"{r['name']}={r['value']}\")
+" > /tmp/cha-envelope-results.txt
+
+while IFS='=' read -r key value; do
+ case "$key" in
+ B1_peer_id_scrubbed) assert "B1: malicious peer_id scrubbed to \"\"" "" "$value" ;;
+ B2_agent_card_url_absent) assert "B2: agent_card_url not emitted" "absent" "$value" ;;
+ B3_peer_name_absent) assert "B3: peer_name not enriched" "absent" "$value" ;;
+ B4_peer_role_absent) assert "B4: peer_role not enriched" "absent" "$value" ;;
+ C1_peer_id_scrubbed) assert "C1: XML-injection peer_id scrubbed" "" "$value" ;;
+ C2_agent_card_url_absent) assert "C2: XML-injection URL not emitted" "absent" "$value" ;;
+ D1_peer_id_preserved) assert "D1: valid UUID peer_id preserved" "11111111-2222-3333-4444-555555555555" "$value" ;;
+ D2_agent_card_url_present) assert "D2: agent_card_url present for valid id" "yes" "$value" ;;
+ E1_url_builder_strips_traversal) assert "E1: _agent_card_url_for(\"../../foo\") returns \"\"" "" "$value" ;;
+ E2_url_builder_strips_xml) assert "E2: _agent_card_url_for(XML-injection) returns \"\"" "" "$value" ;;
+ E3_url_builder_accepts_uuid_endswith) assert "E3: _agent_card_url_for(valid uuid) builds canonical URL" "yes" "$value" ;;
+ esac
+done < /tmp/cha-envelope-results.txt
+
+echo ""
+if [ "$FAIL" -gt 0 ]; then
+ echo "[replay] FAIL: $PASS pass, $FAIL fail"
+ echo ""
+ echo "[replay] If B/C/E failed: the published wheel does NOT contain the #2481 fix."
+ echo "[replay] Likely causes:"
+ echo " - Wheel rewriter dropped _validate_peer_id from molecule_runtime.a2a_client"
+ echo " - publish-runtime.yml regressed to a SHA before #2481 (check pip install version)"
+ exit 1
+fi
+echo "[replay] PASS: $PASS/$PASS — channel envelope peer_id trust boundary holds in published wheel $INSTALLED"
diff --git a/tests/harness/replays/chat-history.sh b/tests/harness/replays/chat-history.sh
new file mode 100755
index 00000000..d6efa571
--- /dev/null
+++ b/tests/harness/replays/chat-history.sh
@@ -0,0 +1,175 @@
+#!/usr/bin/env bash
+# Replay for the chat_history MCP tool — exercises the full SaaS-shape
+# wire that PRs #2472 (peer_id filter), #2474 (chat_history client), and
+# #2476 (before_ts paging) ride on. Runs against the prod-shape tenant
+# image, not unit-mock'd handlers, so any drift between the Go handler
+# and the Python tool's expectations surfaces here.
+#
+# What this catches that unit tests don't:
+# - Real Postgres planner behaviour on the (source_id = $X OR target_id = $X)
+# OR clause (issue #2478 — both indexes missing).
+# - cf-proxy header rewrites + TenantGuard middleware in the path.
+# - lib/pq + Postgres driver type binding for time.Time parameters.
+# - JSON encoding of created_at across the wire (timezone, precision).
+#
+# Phases:
+# A. Seed three a2a_receive rows for alpha with peer_id=beta, spread
+# across distinct timestamps.
+# B. Basic peer_id filter: GET ?type=a2a_receive&peer_id=beta&limit=10
+# → assert 3 rows DESC.
+# C. Limit cap: limit=2 → assert 2 newest rows.
+# D. before_ts paging: take the 2nd-newest's created_at, GET with
+# before_ts=that → assert the 1 strictly-older row.
+# E. OR clause (target side): seed an a2a_send row where source=alpha,
+# target=beta. GET with type unset, peer_id=beta → assert that row
+# surfaces too (target_id match, not just source_id).
+# F. Trust-boundary: peer_id="not-a-uuid" → 400 + "peer_id must be a UUID".
+# G. Trust-boundary: before_ts="garbage" → 400 + RFC3339 example.
+# H. URL-encoded SQL-injection-shape peer_id → 400 (matches activity_test.go's
+# malicious-peer-id panel).
+
+set -euo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+HARNESS_ROOT="$(dirname "$HERE")"
+cd "$HARNESS_ROOT"
+
+if [ ! -f .seed.env ]; then
+ echo "[replay] no .seed.env — running ./seed.sh first..."
+ ./seed.sh
+fi
+# shellcheck source=/dev/null
+source .seed.env
+# shellcheck source=../_curl.sh
+source "$HARNESS_ROOT/_curl.sh"
+
+PASS=0
+FAIL=0
+
+assert() {
+ local desc="$1" expected="$2" actual="$3"
+ if [ "$expected" = "$actual" ]; then
+ printf " PASS %s\n" "$desc"
+ PASS=$((PASS + 1))
+ else
+ printf " FAIL %s\n expected: %s\n got : %s\n" "$desc" "$expected" "$actual" >&2
+ FAIL=$((FAIL + 1))
+ fi
+}
+
+assert_contains() {
+ local desc="$1" needle="$2" haystack="$3"
+ if echo "$haystack" | grep -qF "$needle"; then
+ printf " PASS %s\n" "$desc"
+ PASS=$((PASS + 1))
+ else
+ printf " FAIL %s\n expected to contain: %s\n got: %s\n" "$desc" "$needle" "$haystack" >&2
+ FAIL=$((FAIL + 1))
+ fi
+}
+
+echo "[replay] alpha=$ALPHA_ID beta=$BETA_ID"
+
+# ─── Phase A: seed the activity_logs table ─────────────────────────────
+# Inserted via psql so the seed is independent of the platform's HTTP
+# Notify path — that path itself ships through the same handler chain
+# we want to test, and seeding through it would conflate setup and
+# assertion.
+echo ""
+echo "[replay] A. seeding 3 a2a_receive rows for alpha←beta at distinct timestamps..."
+psql_exec >/dev/null </dev/null </dev/null </dev/null
+ALPHA_ID=$(curl_admin -X POST "$BASE/workspaces" \
+ -d '{"name":"alpha","tier":0,"runtime":"langgraph"}' \
+ | jq -r '.id')
+if [ -z "$ALPHA_ID" ] || [ "$ALPHA_ID" = "null" ]; then
+ echo "[seed] FAIL: alpha workspace creation returned no id"
+ exit 1
+fi
echo "[seed] alpha id=$ALPHA_ID"
echo "[seed] creating workspace 'beta' (child of alpha)..."
-BETA_ID=$(uuidgen | tr '[:upper:]' '[:lower:]')
-curl_admin -X POST "$BASE/workspaces" \
- -d "{\"id\":\"$BETA_ID\",\"name\":\"beta\",\"tier\":1,\"parent_id\":\"$ALPHA_ID\",\"runtime\":\"langgraph\"}" \
- >/dev/null
+BETA_ID=$(curl_admin -X POST "$BASE/workspaces" \
+ -d "{\"name\":\"beta\",\"tier\":1,\"parent_id\":\"$ALPHA_ID\",\"runtime\":\"langgraph\"}" \
+ | jq -r '.id')
+if [ -z "$BETA_ID" ] || [ "$BETA_ID" = "null" ]; then
+ echo "[seed] FAIL: beta workspace creation returned no id"
+ exit 1
+fi
echo "[seed] beta id=$BETA_ID"
# Stash IDs so replay scripts pick them up.
diff --git a/tests/harness/up.sh b/tests/harness/up.sh
index fbc14910..87a6cf91 100755
--- a/tests/harness/up.sh
+++ b/tests/harness/up.sh
@@ -41,15 +41,18 @@ fi
echo "[harness] starting cp-stub + postgres + redis + tenant + cf-proxy ..."
docker compose -f compose.yml up -d --wait
-echo "[harness] /etc/hosts entry for harness-tenant.localhost..."
-if ! grep -q '^127\.0\.0\.1[[:space:]]\+harness-tenant\.localhost' /etc/hosts; then
- echo " (skip — your /etc/hosts may not resolve *.localhost. If tests fail with"
- echo " 'getaddrinfo' errors, add: 127.0.0.1 harness-tenant.localhost)"
-fi
-
+# Sudo-free reachability: cf-proxy/nginx routes by Host header (matches
+# production CF tunnel), so replays target loopback :8080 with a Host
+# header rather than depending on /etc/hosts resolution. _curl.sh
+# centralises this. Legacy /etc/hosts users still work — the BASE env
+# var override accepts either shape.
echo ""
-echo "[harness] up. Tenant: http://harness-tenant.localhost:8080/health"
-echo " http://harness-tenant.localhost:8080/buildinfo"
-echo " cp-stub: http://localhost (internal-only via compose net)"
+echo "[harness] up."
+echo " Tenant via cf-proxy: http://localhost:8080/health"
+echo " (Host: harness-tenant.localhost)"
+echo " cp-stub: internal-only via compose net"
+echo ""
+echo " Quick check:"
+echo " curl -H 'Host: harness-tenant.localhost' http://localhost:8080/health"
echo ""
echo "Next: ./seed.sh # mint admin token + register sample workspaces"
From 955755ce1e8c91dfc3edd6213a6d551f740d5875 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Fri, 1 May 2026 20:14:39 -0700
Subject: [PATCH 56/61] test(provision): tighten Assertion 4 message to name
both failure modes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Per review nit on PR #2491: the previous message ("a goroutine reached
cpProv.Start but never broadcast its failure") could mislead an
operator if Assertions 2 and 4 both fire — Assertion 4 also catches
"goroutine exited via an earlier path before reaching Start." Spell
both modes out and cross-reference Assertion 2.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
.../handlers/workspace_provision_concurrent_repro_test.go | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go b/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go
index e12b8fa8..a17d5037 100644
--- a/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go
+++ b/workspace-server/internal/handlers/workspace_provision_concurrent_repro_test.go
@@ -201,7 +201,7 @@ func TestProvisionWorkspaceCP_ConcurrentBurst_NoSilentDrop(t *testing.T) {
bcastCount := bcast.count
bcast.mu.Unlock()
if bcastCount != numWorkspaces {
- t.Errorf("broadcaster saw %d RecordAndBroadcast calls, want %d. SILENT-DROP CLASS: a goroutine reached cpProv.Start but never broadcast its failure.",
+ t.Errorf("broadcaster saw %d RecordAndBroadcast calls, want %d. SILENT-DROP CLASS: either a goroutine reached cpProv.Start but was lost before markProvisionFailed, OR it exited via an earlier path before reaching Start (cross-check Assertion 2 above).",
bcastCount, numWorkspaces)
}
From c2757160051d917c39e48aa4dbcae60217a1690e Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Fri, 1 May 2026 21:36:40 -0700
Subject: [PATCH 57/61] harness(phase-2): multi-tenant compose + cross-tenant
isolation replays
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Brings the local harness from "single tenant covering the request path"
to "two tenants covering both the request path AND the per-tenant
isolation boundary" — the same shape production runs (one EC2 + one
Postgres + one MOLECULE_ORG_ID per tenant).
Why this matters: the four prior replays exercise the SaaS request
path against one tenant. They cannot prove that TenantGuard rejects
a misrouted request (production CF tunnel + AWS LB are the failure
surface), nor that two tenants doing legitimate work in parallel
keep their `activity_logs` / `workspaces` / connection-pool state
partitioned. Both are real bug classes — TenantGuard allowlist drift
shipped #2398, and the lib/pq prepared-statement cache collision is
documented as an org-wide hazard.
What changed:
1. compose.yml — split into two tenants.
tenant-alpha + postgres-alpha + tenant-beta + postgres-beta + the
shared cp-stub, redis, cf-proxy. Each tenant gets a distinct
ADMIN_TOKEN + MOLECULE_ORG_ID and its own Postgres database. cf-proxy
depends on both tenants becoming healthy.
2. cf-proxy/nginx.conf — Host-header → tenant routing.
`map $host $tenant_upstream` resolves the right backend per request.
Required `resolver 127.0.0.11 valid=30s ipv6=off;` because nginx
needs an explicit DNS resolver to use a variable in `proxy_pass`
(literal hostnames resolve once at startup; variables resolve per
request — without the resolver nginx fails closed with 502).
`server_name` lists both tenants + the legacy alias so unknown Host
headers don't silently route to a default and mask routing bugs.
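   Roughly this shape (illustrative sketch only; the tenant port and
   the exact server_name list are placeholders, and the committed
   cf-proxy/nginx.conf is the source of truth):

     map $host $tenant_upstream {
         harness-tenant-alpha.localhost  tenant-alpha:8080;  # port is a placeholder
         harness-tenant.localhost        tenant-alpha:8080;  # legacy alias -> alpha
         harness-tenant-beta.localhost   tenant-beta:8080;
         default                         "";                 # unknown Host fails closed
     }

     server {
         listen 8080;
         server_name harness-tenant-alpha.localhost
                     harness-tenant-beta.localhost
                     harness-tenant.localhost;

         # Docker's embedded DNS. Needed because proxy_pass uses a
         # variable, so the upstream name is resolved per request
         # instead of once at nginx startup.
         resolver 127.0.0.11 valid=30s ipv6=off;

         location / {
             proxy_set_header Host $host;
             proxy_set_header X-Forwarded-Proto $scheme;
             proxy_pass http://$tenant_upstream;
         }
     }
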
3. _curl.sh — per-tenant + cross-tenant-negative helpers.
`curl_alpha_admin` / `curl_beta_admin` set the right
Host + Authorization + X-Molecule-Org-Id triple.
`curl_alpha_creds_at_beta` / `curl_beta_creds_at_alpha` exist
precisely to make WRONG requests (replays use them to assert
TenantGuard rejects). `psql_exec_alpha` / `psql_exec_beta` shell out
per-tenant Postgres exec. Legacy aliases (`curl_admin`, `psql_exec`)
keep the four pre-Phase-2 replays working without edits.
4. seed.sh — registers parent+child workspaces in BOTH tenants.
Captures server-generated IDs via `jq -r '.id'` (POST /workspaces
ignores body.id, so the older client-side mint silently desynced
from the workspaces table and broke FK-dependent replays). Stashes
`ALPHA_PARENT_ID` / `ALPHA_CHILD_ID` / `BETA_PARENT_ID` /
`BETA_CHILD_ID` to .seed.env, plus legacy `ALPHA_ID` / `BETA_ID`
aliases for backwards compat with chat-history / channel-envelope.
5. New replays.
tenant-isolation.sh (13 assertions) — TenantGuard 404s any request
whose X-Molecule-Org-Id doesn't match the container's
MOLECULE_ORG_ID. Asserts the 404 body has zero
tenant/org/forbidden/denied keywords (a tenant's existence must not
be discoverable by probing from the outside). Covers cross-tenant
routing misconfiguration + allowlist drift + missing-org-header.
per-tenant-independence.sh (12 assertions) — both tenants seed
activity_logs in parallel with distinct row counts (3 vs 5) and
confirm each tenant's history endpoint returns exactly its own
counts. Then a concurrent INSERT race (10 rows per tenant in
parallel via `&` + wait) catches shared-pool corruption +
prepared-statement cache poisoning + redis cross-keyspace bleed.
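   The race phase is roughly this shape (a sketch; the table columns
   and exact assertions are illustrative, and the committed
   per-tenant-independence.sh is the source of truth):

     # Fire 10 INSERTs into each tenant's activity_logs concurrently.
     for i in $(seq 1 10); do
       psql_exec_alpha -c "INSERT INTO activity_logs (workspace_id, type)
                           VALUES ('$ALPHA_PARENT_ID', 'a2a_receive');" &
       psql_exec_beta  -c "INSERT INTO activity_logs (workspace_id, type)
                           VALUES ('$BETA_PARENT_ID',  'a2a_receive');" &
     done
     wait

     # Each tenant must see exactly its own rows; any bleed means the
     # pools / prepared statements / keyspaces are not partitioned.
     alpha_rows=$(psql_exec_alpha -c "SELECT count(*) FROM activity_logs;")
     beta_rows=$(psql_exec_beta   -c "SELECT count(*) FROM activity_logs;")
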
6. Bug fix: down.sh + dump-logs SECRETS_ENCRYPTION_KEY validation.
`docker compose down -v` validates the entire compose file even
though it doesn't read the env. up.sh generates a per-run key into
its own shell — down.sh runs in a fresh shell that wouldn't see it,
so without a placeholder `compose down` exited non-zero before
removing volumes. Workspaces silently leaked into the next
./up.sh + seed.sh boot. Caught when tenant-isolation.sh F1/F2 saw
3× duplicate alpha-parent rows accumulated across three prior runs.
Same fix applied to the workflow's dump-logs step.
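   The fix is essentially a one-line placeholder on the compose call
   (sketch; the placeholder value is arbitrary and never read by any
   service):

     SECRETS_ENCRYPTION_KEY="${SECRETS_ENCRYPTION_KEY:-down-placeholder}" \
       docker compose -f compose.yml down -v
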
7. requirements.txt — pin molecule-ai-workspace-runtime>=0.1.78.
channel-envelope-trust-boundary.sh imports from `molecule_runtime.*`
(the wheel-rewritten path) so it catches the failure mode where
the wheel build silently strips a fix that unit tests on local
source still pass. CI was failing this replay because the wheel
wasn't installed — caught in the staging push run from #2492.
8. .github/workflows/harness-replays.yml — Phase 2 plumbing.
* Removed /etc/hosts step (Host-header path eliminated the need;
scripts already source _curl.sh).
* Updated dump-logs to reference the new service names
(tenant-alpha + tenant-beta + postgres-alpha + postgres-beta).
* Added SECRETS_ENCRYPTION_KEY placeholder env on the dump step.
Verified: ./run-all-replays.sh from a clean state — 6/6 passed
(buildinfo-stale-image, channel-envelope-trust-boundary, chat-history,
peer-discovery-404, per-tenant-independence, tenant-isolation).
Roadmap section updated: Phase 2 marked shipped. Phase 3 promoted to
"replace cp-stub with real molecule-controlplane Docker build + env
coherence lint."
Co-Authored-By: Claude Opus 4.7 (1M context)
---
.github/workflows/harness-replays.yml | 31 +--
tests/harness/README.md | 88 ++++++---
tests/harness/_curl.sh | 153 +++++++++++----
tests/harness/cf-proxy/nginx.conf | 61 ++++--
tests/harness/compose.yml | 169 ++++++++++-------
tests/harness/down.sh | 13 +-
.../replays/per-tenant-independence.sh | 178 ++++++++++++++++++
tests/harness/replays/tenant-isolation.sh | 172 +++++++++++++++++
tests/harness/requirements.txt | 6 +
tests/harness/seed.sh | 113 ++++++-----
tests/harness/up.sh | 27 +--
11 files changed, 785 insertions(+), 226 deletions(-)
create mode 100755 tests/harness/replays/per-tenant-independence.sh
create mode 100755 tests/harness/replays/tenant-isolation.sh
diff --git a/.github/workflows/harness-replays.yml b/.github/workflows/harness-replays.yml
index 6330e885..fc642ba4 100644
--- a/.github/workflows/harness-replays.yml
+++ b/.github/workflows/harness-replays.yml
@@ -106,16 +106,6 @@ jobs:
path: molecule-ai-plugin-github-app-auth
token: ${{ secrets.PLUGIN_REPO_PAT || secrets.GITHUB_TOKEN }}
- - name: Add /etc/hosts entry for harness-tenant.localhost
- # ubuntu-latest doesn't auto-resolve *.localhost the way macOS
- # sometimes does. seed.sh + replay scripts curl
- # http://harness-tenant.localhost:8080 — without the entry
- # they'd fail with getaddrinfo ENOTFOUND.
- if: needs.detect-changes.outputs.run == 'true'
- run: |
- echo "127.0.0.1 harness-tenant.localhost" | sudo tee -a /etc/hosts >/dev/null
- getent hosts harness-tenant.localhost
-
- name: Install Python deps for replays
# peer-discovery-404 (and future replays) eval Python against the
# running tenant — importing workspace/a2a_client.py pulls in
@@ -144,19 +134,32 @@ jobs:
run: ./run-all-replays.sh
- name: Dump compose logs on failure
+ # SECRETS_ENCRYPTION_KEY: docker compose validates the entire compose
+ # file even for read-only `logs` calls. up.sh generates a per-run key
+ # and exports it to its OWN shell — this step runs in a fresh shell
+ # that wouldn't see it, so without a placeholder the validate step
+ # errors before logs print (verified against PR #2492's first run:
+ # "required variable SECRETS_ENCRYPTION_KEY is missing a value").
+ # A placeholder is fine — we're only reading log streams, not booting.
if: failure() && needs.detect-changes.outputs.run == 'true'
working-directory: tests/harness
+ env:
+ SECRETS_ENCRYPTION_KEY: dump-logs-placeholder
run: |
echo "=== docker compose ps ==="
docker compose -f compose.yml ps || true
- echo "=== tenant logs ==="
- docker compose -f compose.yml logs tenant || true
+ echo "=== tenant-alpha logs ==="
+ docker compose -f compose.yml logs tenant-alpha || true
+ echo "=== tenant-beta logs ==="
+ docker compose -f compose.yml logs tenant-beta || true
echo "=== cp-stub logs ==="
docker compose -f compose.yml logs cp-stub || true
echo "=== cf-proxy logs ==="
docker compose -f compose.yml logs cf-proxy || true
- echo "=== postgres logs (last 100) ==="
- docker compose -f compose.yml logs --tail 100 postgres || true
+ echo "=== postgres-alpha logs (last 100) ==="
+ docker compose -f compose.yml logs --tail 100 postgres-alpha || true
+ echo "=== postgres-beta logs (last 100) ==="
+ docker compose -f compose.yml logs --tail 100 postgres-beta || true
- name: Force teardown
# We pass KEEP_UP=1 to run-all-replays.sh so the dump step
diff --git a/tests/harness/README.md b/tests/harness/README.md
index bf0ad93e..52fba5ce 100644
--- a/tests/harness/README.md
+++ b/tests/harness/README.md
@@ -3,17 +3,27 @@
The harness brings up the SaaS tenant topology on localhost using the
same `Dockerfile.tenant` image that ships to production. Tests target
the cf-proxy on `http://localhost:8080` and pass the tenant identity
-via a `Host: harness-tenant.localhost` header — exactly the way
-production CF tunnel routes by Host header. The cf-proxy nginx then
-rewrites headers and proxies to the tenant container, exercising the
-SAME code path a real tenant takes including TenantGuard middleware,
-the `/cp/*` reverse proxy, the canvas reverse proxy, and a
-Cloudflare-tunnel-shape header rewrite layer.
+via a `Host:` header — exactly the way production CF tunnel routes by
+Host header. The cf-proxy nginx then rewrites headers and proxies to
+the right tenant container, exercising the SAME code path a real tenant
+takes including TenantGuard middleware, the `/cp/*` reverse proxy, the
+canvas reverse proxy, and a Cloudflare-tunnel-shape header rewrite
+layer.
-`tests/harness/_curl.sh` is the helper sourced by every replay —
-provides `curl_anon`, `curl_admin`, `curl_workspace`, and `psql_exec`
-wrappers that set the right Host + auth headers automatically. New
-replays should source it rather than rolling their own curl.
+Since Phase 2 the harness runs **two tenants in parallel** (alpha and
+beta), each with its own Postgres instance and its own
+`MOLECULE_ORG_ID` — the same shape as production, where each tenant gets
+its own EC2 + DB. This is what cross-tenant isolation replays need to
+prove TenantGuard actually 404s a misrouted request.
+
+`tests/harness/_curl.sh` is the helper sourced by every replay. Per
+tenant: `curl_alpha_anon` / `curl_alpha_admin` / `curl_beta_anon` /
+`curl_beta_admin` / `psql_exec_alpha` / `psql_exec_beta`. Plus
+deliberately-wrong cross-tenant negative-test helpers for isolation
+replays: `curl_alpha_creds_at_beta` / `curl_beta_creds_at_alpha`.
+Legacy single-tenant aliases (`curl_anon`, `curl_admin`, `psql_exec`)
+default to alpha so pre-Phase-2 replays continue to work. New replays
+should source `_curl.sh` rather than rolling their own curl.
## Why this exists
@@ -30,25 +40,37 @@ in one of those layers. The harness activates ALL of them.
## Topology
```
-client
- ↓
-cf-proxy nginx, mirrors CF tunnel header rewrites
- ↓ (Host:harness-tenant.localhost, X-Forwarded-*)
-tenant workspace-server/Dockerfile.tenant — same image as prod
- ↓ (CP_UPSTREAM_URL=http://cp-stub:9090, /cp/* proxied)
-cp-stub minimal Go service, mocks CP wire surface
-postgres same version as production
-redis same version as production
+ client
+ ↓
+ cf-proxy nginx, mirrors CF tunnel header rewrites
+ ↓ (routes by Host header)
+ ┌─────────────────────────┴─────────────────────────┐
+ ↓ ↓
+ tenant-alpha tenant-beta
+ Host: harness-tenant-alpha.localhost Host: harness-tenant-beta.localhost
+ MOLECULE_ORG_ID=harness-org-alpha MOLECULE_ORG_ID=harness-org-beta
+ ↓ ↓
+ postgres-alpha postgres-beta
+ ↓ ↓
+ └─────────────────────────┬─────────────────────────┘
+ ↓
+ cp-stub + redis (shared)
```
+Each tenant runs the production `Dockerfile.tenant` image with its own
+admin token, org id, and Postgres instance — identical isolation
+boundaries to production where each tenant gets a dedicated EC2 + DB.
+cp-stub and redis are shared because they model the per-region
+multi-tenant CP and a single Redis cluster.
+
## Quickstart
```bash
cd tests/harness
-./up.sh # builds + starts all services
-./seed.sh # mints admin token, registers two sample workspaces
-./replays/peer-discovery-404.sh
-./replays/buildinfo-stale-image.sh
+./up.sh # builds + starts all services (both tenants)
+./seed.sh # registers parent+child workspaces in BOTH tenants
+./replays/tenant-isolation.sh
+./replays/per-tenant-independence.sh
./down.sh # tear down + remove volumes
```
@@ -62,17 +84,19 @@ REBUILD=1 ./run-all-replays.sh # rebuild images before booting
```
No `/etc/hosts` edit required — replays use the cf-proxy's loopback
-port and pass `Host: harness-tenant.localhost` as a header (`_curl.sh`
-handles this automatically). This matches how production CF tunnel
-routes: the URL is the public CF endpoint, the Host header carries the
-per-tenant identity. Quick check:
+port and pass the per-tenant `Host:` header (`_curl.sh` handles this
+automatically). This matches how production CF tunnel routes: the URL
+is the public CF endpoint, the Host header carries the per-tenant
+identity. Quick check:
```bash
-curl -H "Host: harness-tenant.localhost" http://localhost:8080/health
+curl -H "Host: harness-tenant-alpha.localhost" http://localhost:8080/health
+curl -H "Host: harness-tenant-beta.localhost" http://localhost:8080/health
```
(If you have a legacy `/etc/hosts` entry from older docs, it still
-works — `BASE` and `TENANT_HOST` both honor env-var overrides.)
+works — `BASE`, `ALPHA_HOST`, `BETA_HOST` all honor env-var overrides.
+The legacy `harness-tenant.localhost` host alias maps to alpha.)
## Replay scripts
@@ -87,6 +111,8 @@ green" — the script becomes the regression gate that closes that gap.
| `buildinfo-stale-image.sh` | #2395 | GIT_SHA reaches the binary; verify-step comparison logic works |
| `chat-history.sh` | #2472 + #2474 + #2476 | `peer_id` filter (incl. OR over source/target) + `before_ts` paging + UUID/RFC3339 trust boundary on the activity route |
| `channel-envelope-trust-boundary.sh` | #2471 + #2481 | published wheel scrubs malformed `peer_id` from the channel envelope and from `agent_card_url` (path-traversal + XML-attr injection) |
+| `tenant-isolation.sh` | Phase 2 | TenantGuard 404s any request whose `X-Molecule-Org-Id` doesn't match the container's `MOLECULE_ORG_ID` (covers cross-tenant routing bug + allowlist drift); per-tenant `/workspaces` listings stay partitioned |
+| `per-tenant-independence.sh` | Phase 2 | parallel A2A workflows in both tenants don't bleed into each other's `activity_logs` / `workspaces`, including under a concurrent INSERT race (catches lib/pq prepared-statement cache collision + shared-pool poisoning) |
To add a new replay:
1. Drop a script under `replays/` named after the issue.
@@ -125,6 +151,6 @@ its mandate of "exercise the tenant binary in production-shape topology."
## Roadmap
- **Phase 1 (shipped):** harness + cp-stub + cf-proxy + 4 replays + `run-all-replays.sh` runner. No-sudo `Host`-header path via `_curl.sh`. Per-replay psql seeding for tests that need DB-side fixtures.
-- **Phase 2 (in flight):** multi-tenant — second `tenant-beta` service in compose, second Postgres database, replays for cross-tenant A2A + TenantGuard isolation. Convert `tests/e2e/test_api.sh` to target the harness instead of localhost. Make harness-based E2E a required CI check (a workflow that invokes `run-all-replays.sh` on every PR via the self-hosted Mac runner).
-- **Phase 3:** replace `cp-stub/` with the real `molecule-controlplane` Docker build. Add a config-coherence lint that diffs harness env list against production CP's env list and fails CI on drift.
+- **Phase 2 (shipped):** multi-tenant — `tenant-alpha` + `tenant-beta` with their own Postgres instances and distinct `MOLECULE_ORG_ID`s; cf-proxy nginx routes by Host header (prod CF tunnel parity); `seed.sh` registers parent+child workspaces in both tenants; `_curl.sh` exposes per-tenant + cross-tenant-negative helpers; new replays cover TenantGuard isolation (`tenant-isolation.sh`) and per-tenant independence under concurrent load (`per-tenant-independence.sh`). `harness-replays.yml` runs `run-all-replays.sh` as a required check on every PR touching `workspace-server/**`, `canvas/**`, `tests/harness/**`, or the workflow itself.
+- **Phase 3:** replace `cp-stub/` with the real `molecule-controlplane` Docker build. Add a config-coherence lint that diffs harness env list against production CP's env list and fails CI on drift. Convert `tests/e2e/test_api.sh` to target the harness instead of localhost.
- **Phase 4 (long-term):** Miniflare in front of cf-proxy for real CF emulation (WAF, BotID, rate-limit, cf-tunnel headers). LocalStack for the EC2 provisioner. Anonymized prod-traffic recording/replay for SaaS-scale regression detection.
diff --git a/tests/harness/_curl.sh b/tests/harness/_curl.sh
index 6a32ab5d..12dc8cba 100644
--- a/tests/harness/_curl.sh
+++ b/tests/harness/_curl.sh
@@ -5,55 +5,122 @@
# URL is to a public CF endpoint and the Host header carries the
# per-tenant identity. We replay the same shape locally:
#
-# curl -H "Host: harness-tenant.localhost" http://localhost:8080/health
+# curl -H "Host: harness-tenant-alpha.localhost" http://localhost:8080/health
#
# This matches what cf-proxy/nginx.conf already routes (`server_name
-# *.localhost localhost`) and avoids the macOS /etc/hosts requirement
-# that previously gated the harness behind a sudo step.
+# *.localhost` + `map $host $tenant_upstream`) and avoids the macOS
+# /etc/hosts requirement that previously gated the harness behind a
+# sudo step.
#
-# Backwards-compatible: if /etc/hosts resolves harness-tenant.localhost
-# (the legacy path), the bare URL still works because the helper falls
-# back to that. New scripts SHOULD use the helper functions.
+# Multi-tenant since Phase 2: alpha and beta tenants run in parallel.
+# `curl_alpha_admin` and `curl_beta_admin` target each tenant's URL
+# with that tenant's ADMIN_TOKEN + MOLECULE_ORG_ID. The legacy
+# `curl_admin` is aliased to alpha for backwards compat with the
+# pre-Phase-2 single-tenant replays.
#
# Usage:
# HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# source "$HERE/../_curl.sh" # from replays/.sh
-# curl_admin "$BASE/health"
-# curl_anon "$BASE/health"
+# curl_alpha_admin "$BASE/health"
+# curl_beta_admin "$BASE/health"
# Bind to the cf-proxy's loopback port — the proxy front-doors every
# tenant and routes by Host header, exactly like production's CF tunnel.
: "${BASE:=http://localhost:8080}"
-: "${TENANT_HOST:=harness-tenant.localhost}"
-: "${ADMIN_TOKEN:=harness-admin-token}"
-: "${ORG_ID:=harness-org}"
-# Anonymous request — only Host header (no auth). Use for /health,
-# /buildinfo, and any other route that's intentionally public.
+# Per-tenant identity. Each set must match the corresponding tenant
+# container's environment in compose.yml or auth/TenantGuard will fail
+# in non-obvious ways (401 vs 403 vs silent route to wrong tenant).
+: "${ALPHA_HOST:=harness-tenant-alpha.localhost}"
+: "${ALPHA_ADMIN_TOKEN:=harness-admin-token-alpha}"
+: "${ALPHA_ORG_ID:=harness-org-alpha}"
+
+: "${BETA_HOST:=harness-tenant-beta.localhost}"
+: "${BETA_ADMIN_TOKEN:=harness-admin-token-beta}"
+: "${BETA_ORG_ID:=harness-org-beta}"
+
+# Legacy single-tenant aliases — pre-Phase-2 replays use these without
+# knowing the topology grew. They map to alpha. New replays should use
+# the explicit alpha/beta variants for clarity.
+: "${TENANT_HOST:=$ALPHA_HOST}"
+: "${ADMIN_TOKEN:=$ALPHA_ADMIN_TOKEN}"
+: "${ORG_ID:=$ALPHA_ORG_ID}"
+
+# ─── Anonymous (no auth) ──────────────────────────────────────────────
+
+# Anonymous request to alpha. Use for /health, /buildinfo, etc.
+curl_alpha_anon() {
+ curl -sS -H "Host: ${ALPHA_HOST}" "$@"
+}
+
+# Anonymous request to beta.
+curl_beta_anon() {
+ curl -sS -H "Host: ${BETA_HOST}" "$@"
+}
+
+# Legacy alias for single-tenant replays.
curl_anon() {
curl -sS -H "Host: ${TENANT_HOST}" "$@"
}
-# Admin-token request — full SaaS auth shape. Sets the bearer token,
-# tenant org header (activates TenantGuard middleware), and a default
-# JSON Content-Type. Replays admin paths exactly the way CP does in
-# production, so any TenantGuard / strict-auth bug surfaces locally.
-curl_admin() {
+# ─── Admin-token requests ─────────────────────────────────────────────
+
+# Admin-token request to alpha tenant. SaaS-shape auth: bearer token,
+# tenant org header (TenantGuard activates), JSON content type.
+curl_alpha_admin() {
curl -sS \
- -H "Host: ${TENANT_HOST}" \
- -H "Authorization: Bearer ${ADMIN_TOKEN}" \
- -H "X-Molecule-Org-Id: ${ORG_ID}" \
+ -H "Host: ${ALPHA_HOST}" \
+ -H "Authorization: Bearer ${ALPHA_ADMIN_TOKEN}" \
+ -H "X-Molecule-Org-Id: ${ALPHA_ORG_ID}" \
-H "Content-Type: application/json" \
"$@"
}
-# Workspace-scoped request — uses a per-workspace bearer minted from
-# /admin/workspaces/:id/test-token. The platform's auth.go middleware
-# accepts this bearer for the workspace's own routes, so this is the
-# right shape for replays that exercise an in-workspace tool calling
-# back to the platform (chat_history, list_peers, etc).
-#
-# Caller must export WORKSPACE_TOKEN before invoking.
+# Admin-token request to beta tenant.
+curl_beta_admin() {
+ curl -sS \
+ -H "Host: ${BETA_HOST}" \
+ -H "Authorization: Bearer ${BETA_ADMIN_TOKEN}" \
+ -H "X-Molecule-Org-Id: ${BETA_ORG_ID}" \
+ -H "Content-Type: application/json" \
+ "$@"
+}
+
+# Legacy alias.
+curl_admin() {
+ curl_alpha_admin "$@"
+}
+
+# ─── Cross-tenant negative-test helpers ───────────────────────────────
+# These exist to MAKE WRONG calls — replays use them to assert
+# TenantGuard rejects them. Names spell out what's mismatched.
+
+# alpha bearer + alpha org, but talking to beta's URL. TenantGuard
+# should reject because the org header doesn't match beta's MOLECULE_ORG_ID.
+curl_alpha_creds_at_beta() {
+ curl -sS \
+ -H "Host: ${BETA_HOST}" \
+ -H "Authorization: Bearer ${ALPHA_ADMIN_TOKEN}" \
+ -H "X-Molecule-Org-Id: ${ALPHA_ORG_ID}" \
+ -H "Content-Type: application/json" \
+ "$@"
+}
+
+# beta bearer + beta org, but talking to alpha's URL.
+curl_beta_creds_at_alpha() {
+ curl -sS \
+ -H "Host: ${ALPHA_HOST}" \
+ -H "Authorization: Bearer ${BETA_ADMIN_TOKEN}" \
+ -H "X-Molecule-Org-Id: ${BETA_ORG_ID}" \
+ -H "Content-Type: application/json" \
+ "$@"
+}
+
+# ─── Workspace-scoped (per-workspace bearer) ──────────────────────────
+
+# Workspace-scoped request to alpha — uses a per-workspace bearer
+# minted from /admin/workspaces/:id/test-token. Caller must export
+# WORKSPACE_TOKEN.
curl_workspace() {
: "${WORKSPACE_TOKEN:?WORKSPACE_TOKEN must be set — mint via /admin/workspaces/:id/test-token}"
curl -sS \
@@ -64,19 +131,29 @@ curl_workspace() {
"$@"
}
+# ─── Postgres exec (per-tenant) ───────────────────────────────────────
+
# Direct postgres exec — for replays that need to seed activity_logs
-# rows or read DB state that has no public HTTP route. Wraps the
-# `docker compose exec` pattern so replays can stay shell-only.
+# rows or read DB state that has no public HTTP route.
#
-# SECRETS_ENCRYPTION_KEY is set to a placeholder so compose's `:?must
-# be set` interpolation guard (which gates running the harness without
-# up.sh) doesn't trip on `exec` — exec only reaches an already-running
-# service so the env var is irrelevant, but compose still validates
-# the file. The placeholder is never written anywhere or used by any
-# service.
-psql_exec() {
+# SECRETS_ENCRYPTION_KEY placeholder lets compose validate without
+# requiring up.sh's per-run key (exec doesn't actually use it but
+# compose validates the file).
+psql_exec_alpha() {
SECRETS_ENCRYPTION_KEY="${SECRETS_ENCRYPTION_KEY:-exec-placeholder}" \
docker compose -f "${HARNESS_COMPOSE:-$(dirname "${BASH_SOURCE[0]}")/compose.yml}" \
- exec -T postgres \
+ exec -T postgres-alpha \
psql -U harness -d molecule -At "$@"
}
+
+psql_exec_beta() {
+ SECRETS_ENCRYPTION_KEY="${SECRETS_ENCRYPTION_KEY:-exec-placeholder}" \
+ docker compose -f "${HARNESS_COMPOSE:-$(dirname "${BASH_SOURCE[0]}")/compose.yml}" \
+ exec -T postgres-beta \
+ psql -U harness -d molecule -At "$@"
+}
+
+# Legacy alias — single-tenant replays default to alpha's DB.
+psql_exec() {
+ psql_exec_alpha "$@"
+}
diff --git a/tests/harness/cf-proxy/nginx.conf b/tests/harness/cf-proxy/nginx.conf
index a51efdba..c95f78cd 100644
--- a/tests/harness/cf-proxy/nginx.conf
+++ b/tests/harness/cf-proxy/nginx.conf
@@ -4,28 +4,54 @@
# This config replays the same header rewrites the CF tunnel does so
# the tenant sees the same Host + X-Forwarded-* it would in production.
#
-# The tenant's TenantGuard middleware activates on MOLECULE_ORG_ID; the
-# canvas's same-origin fetches use the Host header for cookie scoping.
-# Both behave correctly in production because CF rewrites Host to the
-# tenant subdomain — this proxy reproduces that locally.
+# Multi-tenant: nginx routes by Host header to the right tenant
+# container — exactly the same way the production CF tunnel does
+# (URL is the public CF endpoint, Host carries the tenant identity).
#
-# How tests reach it:
-# curl --resolve 'harness-tenant.localhost:8443:127.0.0.1' \
-# https://harness-tenant.localhost:8443/health
-# or via /etc/hosts (added automatically by ./up.sh on first boot).
+# How tests reach it (no /etc/hosts required):
+# curl -H 'Host: harness-tenant-alpha.localhost' http://localhost:8080/health
+# curl -H 'Host: harness-tenant-beta.localhost' http://localhost:8080/health
+#
+# Backwards-compat: harness-tenant.localhost (no -alpha/-beta suffix) maps
+# to alpha for legacy single-tenant replays.
worker_processes 1;
events { worker_connections 256; }
http {
- # Map the wildcard .localhost to the tenant container. The
- # tenant container itself doesn't care which slug routed to it —
- # what matters is that the Host header it sees matches what
- # production's CF tunnel sets, so cookie/CORS/TenantGuard logic
- # exercises the same code path.
+ # Docker's embedded DNS at 127.0.0.11. Required because the
+ # `proxy_pass http://$tenant_upstream:8080` below uses a variable —
+ # nginx needs an explicit resolver to do per-request DNS lookups
+ # (literal hostnames are resolved once at startup, variables are
+ # resolved per-request). Without this, nginx fails closed with
+ # "no resolver defined" + 502.
+ #
+ # `valid=30s` caps cache life so a tenant container restart picks
+ # up a new IP within 30 seconds. ipv6=off skips AAAA lookups that
+ # Docker DNS doesn't always serve cleanly.
+ resolver 127.0.0.11 valid=30s ipv6=off;
+
+    # Host → tenant-container map: the single server block below feeds
+    # $tenant_upstream into its proxy_pass, so the header rewrites and
+    # buffering settings stay in one place and cannot drift between
+    # alpha and beta as the harness grows.
+ map $host $tenant_upstream {
+ default tenant-alpha;
+ harness-tenant.localhost tenant-alpha;
+ harness-tenant-alpha.localhost tenant-alpha;
+ harness-tenant-beta.localhost tenant-beta;
+ }
+
server {
- listen 8080;
- server_name *.localhost localhost;
+ listen 8080 default_server;
+
+        # Hostnames this harness recognises. Because this is the only
+        # (default) server block, an unknown Host still reaches it and
+        # the map's `default` routes it to tenant-alpha, so replays must
+        # always send an explicit Host header rather than rely on
+        # unrecognised Hosts being rejected.
+ server_name harness-tenant.localhost
+ harness-tenant-alpha.localhost
+ harness-tenant-beta.localhost
+ localhost;
# Cap upload at 50MB to mirror the staging tenant nginx limit;
# chat upload tests will fail closed if the platform handler
@@ -34,7 +60,10 @@ http {
client_max_body_size 50m;
location / {
- proxy_pass http://tenant:8080;
+ # The map above resolves $tenant_upstream to the right
+ # container based on the Host header — production CF tunnel
+ # behavior in one line.
+ proxy_pass http://$tenant_upstream:8080;
# Header parity with CF tunnel + AWS LB. Production CF sets
# X-Forwarded-Proto=https; we keep http here because TLS
diff --git a/tests/harness/compose.yml b/tests/harness/compose.yml
index 1a382a6a..debbb675 100644
--- a/tests/harness/compose.yml
+++ b/tests/harness/compose.yml
@@ -1,45 +1,38 @@
-# Production-shape harness for local E2E.
+# Production-shape harness for local E2E. Multi-tenant.
#
# Reproduces the SaaS tenant topology on localhost using the SAME
# images that ship to production:
#
-# client → cf-proxy (nginx, mimics CF tunnel headers)
-# → tenant (workspace-server/Dockerfile.tenant — combined platform + canvas)
-# → cp-stub (control-plane stand-in) for /cp/* and CP-callback paths
-# → postgres + redis (same versions as production)
+# client → cf-proxy (nginx, mimics CF tunnel headers, routes by Host)
+# ├─ Host: harness-tenant-alpha.localhost → tenant-alpha
+# │ ↓ (CP_UPSTREAM_URL=http://cp-stub:9090)
+# │ tenant-alpha (workspace-server/Dockerfile.tenant)
+# │ ↓
+# │ postgres-alpha (per-tenant DB, matches prod)
+# ├─ Host: harness-tenant-beta.localhost → tenant-beta
+# │ ↓
+# │ tenant-beta + postgres-beta
+# └─ cp-stub + redis (shared infra; CP is Railway-singleton in prod,
+# redis is shared cluster)
#
-# Why this matters: the workspace-server binary IS identical between
-# local and production. The bugs that survive local E2E are topology
-# bugs — env-gated middleware (TenantGuard, CP proxy, Canvas proxy),
-# auth state, header rewrites, real production image. This harness
-# activates ALL of them.
+# The two-tenant topology catches:
+# - TenantGuard cross-tenant escape (alpha-org token shouldn't see
+# beta-tenant data even with a valid bearer)
+# - cf-proxy Host-header routing correctness
+# - Per-tenant DB isolation (workspaces table, activity_logs)
+# - Concurrent multi-tenant operation (no shared mutable state)
#
-# Quickstart:
-# cd tests/harness && ./up.sh
-# ./seed.sh
-# ./replays/peer-discovery-404.sh # reproduces issue #2397
+# Quickstart (no /etc/hosts edits — see README):
+# cd tests/harness && ./up.sh && ./seed.sh
+# ./replays/peer-discovery-404.sh
+# ./run-all-replays.sh
#
# Env config:
-# GIT_SHA — passed to the tenant build for /buildinfo verification.
-# Defaults to "harness" so /buildinfo distinguishes the
-# harness build from any cached image.
+# GIT_SHA — passed to BOTH tenant builds for /buildinfo verification.
# CP_STUB_PEERS_MODE — peers failure mode for replay scripts.
-# "" / "404" / "401" / "500" / "timeout".
services:
- postgres:
- image: postgres:16-alpine
- environment:
- POSTGRES_USER: harness
- POSTGRES_PASSWORD: harness
- POSTGRES_DB: molecule
- networks: [harness-net]
- healthcheck:
- test: ["CMD-SHELL", "pg_isready -U harness"]
- interval: 2s
- timeout: 5s
- retries: 10
-
+ # ─── Shared infra (matches prod: CP is Railway-singleton, redis shared) ───
redis:
image: redis:7-alpine
networks: [harness-net]
@@ -62,52 +55,44 @@ services:
timeout: 5s
retries: 10
- # The actual production tenant image — same Dockerfile.tenant CI publishes.
- # This is the load-bearing part of the harness: every bug class that hides
- # behind "but it works locally" is reproducible HERE, against this image,
- # not against `go run ./cmd/server`.
- tenant:
+ # ─── Tenant alpha: postgres + workspace-server ────────────────────────
+ postgres-alpha:
+ image: postgres:16-alpine
+ environment:
+ POSTGRES_USER: harness
+ POSTGRES_PASSWORD: harness
+ POSTGRES_DB: molecule
+ networks: [harness-net]
+ healthcheck:
+ test: ["CMD-SHELL", "pg_isready -U harness"]
+ interval: 2s
+ timeout: 5s
+ retries: 10
+
+ tenant-alpha:
build:
context: ../..
dockerfile: workspace-server/Dockerfile.tenant
args:
GIT_SHA: "${GIT_SHA:-harness}"
depends_on:
- postgres:
+ postgres-alpha:
condition: service_healthy
redis:
condition: service_healthy
cp-stub:
condition: service_healthy
environment:
- DATABASE_URL: "postgres://harness:harness@postgres:5432/molecule?sslmode=disable"
+ DATABASE_URL: "postgres://harness:harness@postgres-alpha:5432/molecule?sslmode=disable"
REDIS_URL: "redis://redis:6379"
PORT: "8080"
- PLATFORM_URL: "http://tenant:8080"
+ PLATFORM_URL: "http://tenant-alpha:8080"
MOLECULE_ENV: "production"
- # SECRETS_ENCRYPTION_KEY is required when MOLECULE_ENV=production —
- # crypto.InitStrict() refuses to boot without it. up.sh generates a
- # fresh 32-byte key per harness lifetime via `openssl rand -base64 32`
- # and exports it into this compose file's interpolation environment.
- # The :? sentinel makes the misuse loud — running `docker compose up`
- # directly without going through up.sh fails fast with a clear error
- # rather than getting a confusing tenant-unhealthy timeout.
SECRETS_ENCRYPTION_KEY: "${SECRETS_ENCRYPTION_KEY:?must be set — run via tests/harness/up.sh, which generates one per run}"
- # ADMIN_TOKEN flips the platform into strict-auth mode (matches
- # production's CP-minted token configuration). Seeded value lets
- # E2E scripts authenticate without going through CP.
- ADMIN_TOKEN: "harness-admin-token"
- # MOLECULE_ORG_ID — activates TenantGuard middleware. Every request
- # must carry X-Molecule-Org-Id matching this value. Replays bugs
- # that only fire in SaaS mode.
- MOLECULE_ORG_ID: "harness-org"
- # CP_UPSTREAM_URL — activates the /cp/* reverse proxy mount in
- # router.go. Without this set, /cp/* would 404 and the canvas
- # bootstrap would silently drift from production behavior.
+ ADMIN_TOKEN: "harness-admin-token-alpha"
+ MOLECULE_ORG_ID: "harness-org-alpha"
CP_UPSTREAM_URL: "http://cp-stub:9090"
RATE_LIMIT: "1000"
- # Canvas auto-proxy — entrypoint-tenant.sh exports CANVAS_PROXY_URL
- # by default; keeping it explicit here makes the topology readable.
CANVAS_PROXY_URL: "http://localhost:3000"
networks: [harness-net]
healthcheck:
@@ -116,21 +101,69 @@ services:
timeout: 5s
retries: 20
- # Cloudflare-tunnel-shape proxy — strips the :8080 suffix, rewrites
- # Host to the tenant subdomain, injects X-Forwarded-*. Tests target
- # http://harness-tenant.localhost:8080 and exercise the production
- # routing layer.
+ # ─── Tenant beta: postgres + workspace-server (parallel to alpha) ─────
+ postgres-beta:
+ image: postgres:16-alpine
+ environment:
+ POSTGRES_USER: harness
+ POSTGRES_PASSWORD: harness
+ POSTGRES_DB: molecule
+ networks: [harness-net]
+ healthcheck:
+ test: ["CMD-SHELL", "pg_isready -U harness"]
+ interval: 2s
+ timeout: 5s
+ retries: 10
+
+ tenant-beta:
+ build:
+ context: ../..
+ dockerfile: workspace-server/Dockerfile.tenant
+ args:
+ GIT_SHA: "${GIT_SHA:-harness}"
+ depends_on:
+ postgres-beta:
+ condition: service_healthy
+ redis:
+ condition: service_healthy
+ cp-stub:
+ condition: service_healthy
+ environment:
+ DATABASE_URL: "postgres://harness:harness@postgres-beta:5432/molecule?sslmode=disable"
+ REDIS_URL: "redis://redis:6379"
+ PORT: "8080"
+ PLATFORM_URL: "http://tenant-beta:8080"
+ MOLECULE_ENV: "production"
+ SECRETS_ENCRYPTION_KEY: "${SECRETS_ENCRYPTION_KEY:?must be set — run via tests/harness/up.sh, which generates one per run}"
+ # Distinct ADMIN_TOKEN — replays use this to verify TenantGuard
+ # blocks alpha-token presented at beta's URL.
+ ADMIN_TOKEN: "harness-admin-token-beta"
+ MOLECULE_ORG_ID: "harness-org-beta"
+ CP_UPSTREAM_URL: "http://cp-stub:9090"
+ RATE_LIMIT: "1000"
+ CANVAS_PROXY_URL: "http://localhost:3000"
+ networks: [harness-net]
+ healthcheck:
+ test: ["CMD-SHELL", "wget -q -O- http://localhost:8080/health || exit 1"]
+ interval: 5s
+ timeout: 5s
+ retries: 20
+
+ # ─── cf-proxy: routes by Host to the right tenant container ───────────
+ # Production shape: same single CF tunnel front-doors every tenant
+ # subdomain — the Host header carries the tenant identity, not the
+ # routing destination. Local cf-proxy mirrors this exactly.
cf-proxy:
image: nginx:1.27-alpine
depends_on:
- tenant:
+ tenant-alpha:
+ condition: service_healthy
+ tenant-beta:
condition: service_healthy
volumes:
- ./cf-proxy/nginx.conf:/etc/nginx/nginx.conf:ro
- # Bind to 127.0.0.1 only — the harness uses a hardcoded ADMIN_TOKEN
- # ("harness-admin-token") so binding 0.0.0.0 (compose's default)
- # would expose admin access to anyone on the local network or VPN.
- # Loopback-only is safe for E2E and prevents a known-token leak.
+ # Bind to 127.0.0.1 only — hardcoded ADMIN_TOKENs make 0.0.0.0
+ # exposure unsafe even on a local network.
ports:
- "127.0.0.1:8080:8080"
networks: [harness-net]
diff --git a/tests/harness/down.sh b/tests/harness/down.sh
index 683c4dae..fb1b305f 100755
--- a/tests/harness/down.sh
+++ b/tests/harness/down.sh
@@ -1,6 +1,17 @@
#!/usr/bin/env bash
+# Tear down the harness and wipe per-tenant volumes.
+#
+# SECRETS_ENCRYPTION_KEY placeholder: docker compose validates the entire
+# compose file even for `down -v`, which never actually reads the env
+# var. up.sh generates a per-run key into its own
+# shell — this script runs in a fresh shell that wouldn't see it. Without
+# the placeholder, `compose down` exits non-zero before removing volumes,
+# silently leaking workspaces+activity_logs into the next ./up.sh + seed.sh
+# (verified 2026-05-02: tenant-isolation.sh F1/F2 saw 3× duplicate
+# alpha-parent + alpha-child rows accumulated across three prior boots).
set -euo pipefail
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$HERE"
-docker compose -f compose.yml down -v --remove-orphans
+SECRETS_ENCRYPTION_KEY="${SECRETS_ENCRYPTION_KEY:-down-placeholder}" \
+ docker compose -f compose.yml down -v --remove-orphans
echo "[harness] down + volumes removed."
diff --git a/tests/harness/replays/per-tenant-independence.sh b/tests/harness/replays/per-tenant-independence.sh
new file mode 100755
index 00000000..86e8759a
--- /dev/null
+++ b/tests/harness/replays/per-tenant-independence.sh
@@ -0,0 +1,178 @@
+#!/usr/bin/env bash
+# Replay for per-tenant independence — each tenant runs the same
+# workflow concurrently with no cross-bleed in workspaces table or
+# activity_logs.
+#
+# What this proves that tenant-isolation.sh doesn't:
+# tenant-isolation.sh proves that REQUESTS get rejected at the
+# middleware layer when they target the wrong tenant. THIS replay
+# proves that even when both tenants are doing legitimate work
+# simultaneously, the back-end state stays partitioned: no row in
+# alpha's activity_logs ever shows up in beta's, no FK-resolution
+# ever crosses tenants, etc.
+#
+# Test shape: seed activity_logs in BOTH tenants in parallel using
+# distinct row counts (3 vs 5) so we can distinguish them. Then
+# fetch each tenant's history and assert the count + content match
+# the seed exactly — proves no leak in either direction.
+#
+# Phases:
+# A. Seed alpha tenant: 3 a2a_receive rows (parent ← child).
+# B. Seed beta tenant: 5 a2a_receive rows (parent ← child).
+# C. GET alpha history → exactly 3 rows, all alpha-summary.
+# D. GET beta history → exactly 5 rows, all beta-summary.
+# E. Direct DB sanity — alpha PG has only alpha rows, beta PG only beta.
+# F. Concurrent write race — both tenants take turns INSERTing
+# simultaneously; each tenant's count after the race matches what
+# it INSERTed. Catches "shared cache poison" / "shared connection
+# pool" failure modes that don't show up in single-tenant tests.
+
+set -euo pipefail
+HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+HARNESS_ROOT="$(dirname "$HERE")"
+cd "$HARNESS_ROOT"
+
+if [ ! -f .seed.env ]; then
+ echo "[replay] no .seed.env — running ./seed.sh first..."
+ ./seed.sh
+fi
+# shellcheck source=/dev/null
+source .seed.env
+# shellcheck source=../_curl.sh
+source "$HARNESS_ROOT/_curl.sh"
+
+PASS=0
+FAIL=0
+
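+# Plain equality assertion: prints PASS/FAIL and bumps the shared counters.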
+assert() {
+ local desc="$1" expected="$2" actual="$3"
+ if [ "$expected" = "$actual" ]; then
+ printf " PASS %s\n" "$desc"
+ PASS=$((PASS + 1))
+ else
+ printf " FAIL %s\n expected: %s\n got : %s\n" "$desc" "$expected" "$actual" >&2
+ FAIL=$((FAIL + 1))
+ fi
+}
+
+# ─── Cleanup (idempotent) ──────────────────────────────────────────────
+# [… remainder of per-tenant-independence.sh and the diff header of
+#    tests/harness/replays/tenant-isolation.sh collapsed here; the
+#    phases below are tenant-isolation.sh's …]
+
+# ─── Phase A: positive controls ────────────────────────────────────────
+echo "[replay] A. positive controls — each tenant accepts its own valid creds"
+
+ALPHA_OWN=$(curl_alpha_admin -o /dev/null -w '%{http_code}' "$BASE/workspaces")
+assert_status "A1: alpha creds at alpha returns 200" "200" "$ALPHA_OWN"
+
+BETA_OWN=$(curl_beta_admin -o /dev/null -w '%{http_code}' "$BASE/workspaces")
+assert_status "A2: beta creds at beta returns 200" "200" "$BETA_OWN"
+
+# ─── Phase B: alpha creds at beta's URL → 404 ──────────────────────────
+echo ""
+echo "[replay] B. alpha-org header at beta's URL — TenantGuard must 404"
+
+CROSS_AB=$(curl_alpha_creds_at_beta -o /tmp/iso-ab.json -w '%{http_code}' "$BASE/workspaces")
+assert_status "B1: alpha-org header at beta URL → 404" "404" "$CROSS_AB"
+
+# Body must be a generic 404 — never reveal that beta exists or that
+# the org check fired (TenantGuard is intentionally indistinguishable
+# from "no such route" to an outside scanner).
+B_BODY=$(cat /tmp/iso-ab.json)
+if echo "$B_BODY" | grep -qiE "tenant|org|forbidden|denied"; then
+ printf " FAIL B2: 404 body leaks tenant/org/auth keywords (info disclosure)\n body: %s\n" "$B_BODY" >&2
+ FAIL=$((FAIL + 1))
+else
+ printf " PASS B2: 404 body has no tenant/org leak\n"
+ PASS=$((PASS + 1))
+fi
+
+# ─── Phase C: beta creds at alpha's URL → 404 ──────────────────────────
+echo ""
+echo "[replay] C. beta-org header at alpha's URL — TenantGuard must 404"
+
+CROSS_BA=$(curl_beta_creds_at_alpha -o /tmp/iso-ba.json -w '%{http_code}' "$BASE/workspaces")
+assert_status "C1: beta-org header at alpha URL → 404" "404" "$CROSS_BA"
+
+# ─── Phase D: right URL, garbage org header ────────────────────────────
+echo ""
+echo "[replay] D. right URL, garbage org header → 404"
+
+GARBAGE=$(curl -sS -o /dev/null -w '%{http_code}' \
+ -H "Host: ${ALPHA_HOST}" \
+ -H "Authorization: Bearer ${ALPHA_ADMIN_TOKEN}" \
+ -H "X-Molecule-Org-Id: not-the-right-org" \
+ "$BASE/workspaces")
+assert_status "D1: garbage org id at alpha URL → 404" "404" "$GARBAGE"
+
+# ─── Phase E: bearer present but no org header at all → 404 ────────────
+echo ""
+echo "[replay] E. valid bearer but missing X-Molecule-Org-Id → 404"
+
+NO_ORG=$(curl -sS -o /dev/null -w '%{http_code}' \
+ -H "Host: ${ALPHA_HOST}" \
+ -H "Authorization: Bearer ${ALPHA_ADMIN_TOKEN}" \
+ "$BASE/workspaces")
+assert_status "E1: missing X-Molecule-Org-Id → 404" "404" "$NO_ORG"
+
+# ─── Phase F: per-tenant DB isolation via list_workspaces ──────────────
+echo ""
+echo "[replay] F. per-tenant DB isolation via /workspaces listing"
+
+ALPHA_LIST=$(curl_alpha_admin "$BASE/workspaces")
+ALPHA_NAMES=$(echo "$ALPHA_LIST" | jq -r '.[].name' | sort | tr '\n' ',' | sed 's/,$//')
+echo "[replay] alpha tenant sees: $ALPHA_NAMES"
+
+if [ "$ALPHA_NAMES" = "alpha-child,alpha-parent" ]; then
+ printf " PASS F1: alpha enumerates only alpha workspaces\n"
+ PASS=$((PASS + 1))
+else
+ printf " FAIL F1: alpha enumerated unexpected workspaces\n expected: alpha-child,alpha-parent\n got : %s\n" "$ALPHA_NAMES" >&2
+ FAIL=$((FAIL + 1))
+fi
+
+BETA_LIST=$(curl_beta_admin "$BASE/workspaces")
+BETA_NAMES=$(echo "$BETA_LIST" | jq -r '.[].name' | sort | tr '\n' ',' | sed 's/,$//')
+echo "[replay] beta tenant sees: $BETA_NAMES"
+
+if [ "$BETA_NAMES" = "beta-child,beta-parent" ]; then
+ printf " PASS F2: beta enumerates only beta workspaces\n"
+ PASS=$((PASS + 1))
+else
+ printf " FAIL F2: beta enumerated unexpected workspaces\n expected: beta-child,beta-parent\n got : %s\n" "$BETA_NAMES" >&2
+ FAIL=$((FAIL + 1))
+fi
+
+# Cross-check: neither tenant's list contains the other's workspace ids.
+LEAKED_INTO_ALPHA=$(echo "$ALPHA_LIST" | jq -r --arg b1 "$BETA_PARENT_ID" --arg b2 "$BETA_CHILD_ID" \
+ '[.[] | select(.id == $b1 or .id == $b2)] | length')
+assert_status "F3: alpha list contains zero beta workspace ids" "0" "$LEAKED_INTO_ALPHA"
+
+LEAKED_INTO_BETA=$(echo "$BETA_LIST" | jq -r --arg a1 "$ALPHA_PARENT_ID" --arg a2 "$ALPHA_CHILD_ID" \
+ '[.[] | select(.id == $a1 or .id == $a2)] | length')
+assert_status "F4: beta list contains zero alpha workspace ids" "0" "$LEAKED_INTO_BETA"
+
+# ─── Phase G: /health is allowlisted (sanity) ──────────────────────────
+echo ""
+echo "[replay] G. /health stays public on both tenants (TenantGuard allowlist sanity)"
+
+ALPHA_HEALTH=$(curl -sS -o /dev/null -w '%{http_code}' -H "Host: ${ALPHA_HOST}" "$BASE/health")
+assert_status "G1: alpha /health public → 200" "200" "$ALPHA_HEALTH"
+
+BETA_HEALTH=$(curl -sS -o /dev/null -w '%{http_code}' -H "Host: ${BETA_HOST}" "$BASE/health")
+assert_status "G2: beta /health public → 200" "200" "$BETA_HEALTH"
+
+echo ""
+if [ "$FAIL" -gt 0 ]; then
+ echo "[replay] FAIL: $PASS pass, $FAIL fail"
+ exit 1
+fi
+echo "[replay] PASS: $PASS/$PASS — TenantGuard isolation + per-tenant DB partitioning hold"
diff --git a/tests/harness/requirements.txt b/tests/harness/requirements.txt
index 75a30722..14210ca8 100644
--- a/tests/harness/requirements.txt
+++ b/tests/harness/requirements.txt
@@ -12,3 +12,9 @@
# when a new replay introduces a new Python import.
httpx>=0.28.1
+
+# channel-envelope-trust-boundary.sh imports from `molecule_runtime.*` (the
+# wheel-rewritten path) so it catches the failure mode where the wheel
+# build silently strips a fix that is still present in the local source
+# tree, so unit tests keep passing while the published wheel regresses.
+# >= 0.1.78 ships PR #2481's peer_id trust-boundary guard.
+molecule-ai-workspace-runtime>=0.1.78
diff --git a/tests/harness/seed.sh b/tests/harness/seed.sh
index 2532cbe6..fdcbd672 100755
--- a/tests/harness/seed.sh
+++ b/tests/harness/seed.sh
@@ -1,13 +1,20 @@
#!/usr/bin/env bash
-# Seed the harness with two registered workspaces so peer-discovery
-# replay scripts have something to discover.
+# Seed BOTH tenants with parent + child workspaces so peer-discovery
+# and cross-tenant replays have something to discover.
#
-# - "alpha" parent (tier 0)
-# - "beta" child of alpha (tier 1)
+# Tenant alpha:
+# - alpha-parent (tier 0)
+# - alpha-child (tier 1, child of alpha-parent)
+# Tenant beta:
+# - beta-parent (tier 0)
+# - beta-child (tier 1, child of beta-parent)
#
-# Both register via the platform's /workspaces endpoint, which is what
-# CP does at provision time. The platform then has them in its DB;
-# tool_list_peers from inside alpha can resolve beta as a peer.
+# IDs are server-generated (POST /workspaces ignores body.id) — we
+# capture the returned id rather than minting one client-side. Older
+# versions of this script minted ids locally and silently desynced from
+# the workspaces table, breaking FK-dependent replays.
+#
+# All four IDs persist to .seed.env so replays can target any of them.
set -euo pipefail
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
@@ -16,51 +23,67 @@ cd "$HERE"
# shellcheck source=_curl.sh
source "$HERE/_curl.sh"
-echo "[seed] confirming tenant is reachable via cf-proxy..."
-HEALTH=$(curl_anon "$BASE/health" || echo "")
-if [ -z "$HEALTH" ]; then
- echo "[seed] FAILED: $BASE/health unreachable. Did ./up.sh complete?"
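+# create_workspace TENANT NAME TIER [PARENT_ID]: POST /workspaces on the
+# named tenant via its admin helper and echo the server-generated
+# workspace id; fails loudly when the handler returns no id.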
+create_workspace() {
+ local tenant="$1" name="$2" tier="$3" parent="${4:-}"
+ local body
+ if [ -n "$parent" ]; then
+ body="{\"name\":\"$name\",\"tier\":$tier,\"parent_id\":\"$parent\",\"runtime\":\"langgraph\"}"
+ else
+ body="{\"name\":\"$name\",\"tier\":$tier,\"runtime\":\"langgraph\"}"
+ fi
+ local id
+ if [ "$tenant" = "alpha" ]; then
+ id=$(curl_alpha_admin -X POST "$BASE/workspaces" -d "$body" | jq -r '.id')
+ else
+ id=$(curl_beta_admin -X POST "$BASE/workspaces" -d "$body" | jq -r '.id')
+ fi
+ if [ -z "$id" ] || [ "$id" = "null" ]; then
+ echo "[seed] FAIL: $tenant/$name workspace creation returned no id" >&2
+ return 1
+ fi
+ echo "$id"
+}
+
+echo "[seed] confirming both tenants reachable..."
+ALPHA_HEALTH=$(curl_alpha_anon "$BASE/health" || echo "")
+BETA_HEALTH=$(curl_beta_anon "$BASE/health" || echo "")
+if [ -z "$ALPHA_HEALTH" ] || [ -z "$BETA_HEALTH" ]; then
+ echo "[seed] FAIL: tenant unreachable. alpha='$ALPHA_HEALTH' beta='$BETA_HEALTH'"
+ echo " Did ./up.sh complete cleanly?"
exit 1
fi
-echo "[seed] $HEALTH"
+echo "[seed] alpha: $ALPHA_HEALTH"
+echo "[seed] beta : $BETA_HEALTH"
-echo "[seed] confirming /buildinfo returns the harness GIT_SHA..."
-BUILD=$(curl_anon "$BASE/buildinfo" || echo "")
-echo "[seed] $BUILD"
+echo ""
+echo "[seed] tenant alpha — creating alpha-parent + alpha-child ..."
+ALPHA_PARENT_ID=$(create_workspace alpha alpha-parent 0)
+echo "[seed] alpha-parent id=$ALPHA_PARENT_ID"
+ALPHA_CHILD_ID=$(create_workspace alpha alpha-child 1 "$ALPHA_PARENT_ID")
+echo "[seed] alpha-child id=$ALPHA_CHILD_ID"
-# Create alpha (parent) and beta (child of alpha). The handler always
-# generates the workspace id server-side and ignores any id in the
-# request body, so we capture the returned id rather than minting one
-# locally — older versions of this script minted client-side and would
-# silently desync from the workspaces table, breaking FK-dependent
-# replays (chat-history seeds activity_logs which has a FK to workspaces).
-echo "[seed] creating workspace 'alpha' (parent)..."
-ALPHA_ID=$(curl_admin -X POST "$BASE/workspaces" \
- -d '{"name":"alpha","tier":0,"runtime":"langgraph"}' \
- | jq -r '.id')
-if [ -z "$ALPHA_ID" ] || [ "$ALPHA_ID" = "null" ]; then
- echo "[seed] FAIL: alpha workspace creation returned no id"
- exit 1
-fi
-echo "[seed] alpha id=$ALPHA_ID"
+echo ""
+echo "[seed] tenant beta — creating beta-parent + beta-child ..."
+BETA_PARENT_ID=$(create_workspace beta beta-parent 0)
+echo "[seed] beta-parent id=$BETA_PARENT_ID"
+BETA_CHILD_ID=$(create_workspace beta beta-child 1 "$BETA_PARENT_ID")
+echo "[seed] beta-child id=$BETA_CHILD_ID"
-echo "[seed] creating workspace 'beta' (child of alpha)..."
-BETA_ID=$(curl_admin -X POST "$BASE/workspaces" \
- -d "{\"name\":\"beta\",\"tier\":1,\"parent_id\":\"$ALPHA_ID\",\"runtime\":\"langgraph\"}" \
- | jq -r '.id')
-if [ -z "$BETA_ID" ] || [ "$BETA_ID" = "null" ]; then
- echo "[seed] FAIL: beta workspace creation returned no id"
- exit 1
-fi
-echo "[seed] beta id=$BETA_ID"
-
-# Stash IDs so replay scripts pick them up.
+# Stash IDs for replay scripts.
+#
+# Backwards-compat: ALPHA_ID + BETA_ID aliases keep pre-Phase-2 replays
+# working (they used these names for the alpha tenant's parent + child).
{
- echo "ALPHA_ID=$ALPHA_ID"
- echo "BETA_ID=$BETA_ID"
+ echo "ALPHA_PARENT_ID=$ALPHA_PARENT_ID"
+ echo "ALPHA_CHILD_ID=$ALPHA_CHILD_ID"
+ echo "BETA_PARENT_ID=$BETA_PARENT_ID"
+ echo "BETA_CHILD_ID=$BETA_CHILD_ID"
+ echo "# legacy aliases — pre-Phase-2 replays expect these names"
+ echo "ALPHA_ID=$ALPHA_PARENT_ID"
+ echo "BETA_ID=$ALPHA_CHILD_ID"
} > "$HERE/.seed.env"
echo ""
echo "[seed] done. IDs persisted to tests/harness/.seed.env"
-echo "[seed] ALPHA_ID=$ALPHA_ID"
-echo "[seed] BETA_ID=$BETA_ID"
+echo "[seed] alpha: parent=$ALPHA_PARENT_ID child=$ALPHA_CHILD_ID"
+echo "[seed] beta : parent=$BETA_PARENT_ID child=$BETA_CHILD_ID"
diff --git a/tests/harness/up.sh b/tests/harness/up.sh
index 87a6cf91..1dad2272 100755
--- a/tests/harness/up.sh
+++ b/tests/harness/up.sh
@@ -38,21 +38,22 @@ if [ "$REBUILD" = true ]; then
docker compose -f compose.yml build --no-cache tenant cp-stub
fi
-echo "[harness] starting cp-stub + postgres + redis + tenant + cf-proxy ..."
+echo "[harness] starting redis + cp-stub + tenant-alpha + tenant-beta + cf-proxy ..."
docker compose -f compose.yml up -d --wait
-# Sudo-free reachability: cf-proxy/nginx routes by Host header (matches
-# production CF tunnel), so replays target loopback :8080 with a Host
-# header rather than depending on /etc/hosts resolution. _curl.sh
-# centralises this. Legacy /etc/hosts users still work — the BASE env
-# var override accepts either shape.
+# Sudo-free reachability: cf-proxy/nginx routes by Host header to the
+# right tenant container (matches production CF tunnel: same URL,
+# different Host = different tenant). Replays target loopback :8080
+# with a per-tenant Host header. _curl.sh centralises the helper
+# functions (curl_alpha_admin, curl_beta_admin, etc.).
echo ""
-echo "[harness] up."
-echo " Tenant via cf-proxy: http://localhost:8080/health"
-echo " (Host: harness-tenant.localhost)"
-echo " cp-stub: internal-only via compose net"
+echo "[harness] up. Multi-tenant topology:"
+echo " tenant-alpha: Host: harness-tenant-alpha.localhost"
+echo " tenant-beta: Host: harness-tenant-beta.localhost"
+echo " legacy alias: Host: harness-tenant.localhost → alpha"
echo ""
-echo " Quick check:"
-echo " curl -H 'Host: harness-tenant.localhost' http://localhost:8080/health"
+echo " Quick check (no /etc/hosts needed):"
+echo " curl -H 'Host: harness-tenant-alpha.localhost' http://localhost:8080/health"
+echo " curl -H 'Host: harness-tenant-beta.localhost' http://localhost:8080/health"
echo ""
-echo "Next: ./seed.sh # mint admin token + register sample workspaces"
+echo "Next: ./seed.sh # register parent+child workspaces in BOTH tenants"
From 18edf88d598e8bd86f534ae8d4ada112f34a3397 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Fri, 1 May 2026 21:44:42 -0700
Subject: [PATCH 58/61] fix(sweeper): honour template-manifest
provision_timeout_seconds
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Real wiring gap discovered while investigating the issue #2486 cluster of
prod claude-code workspaces that failed at exactly 10 min. The
runtimeProvisionTimeoutsCache (#2054 phase 2) reads
runtime_config.provision_timeout_seconds from each template's
config.yaml so the **canvas** spinner respects per-template timeouts —
but the **sweeper** in registry/provisiontimeout.go hardcoded 10 min
(claude-code) / 30 min (hermes) and never consulted the manifest. So a
template that declared a longer window had a UI that waited correctly
but a sweeper that killed the row at the hardcoded floor anyway.
Resolution order pinned by new TestProvisioningTimeout_ManifestOverride:
1. PROVISION_TIMEOUT_SECONDS env (ops-debug global override)
2. Template manifest lookup (per-runtime, beats hermes default too)
3. Hermes default (30 min — CP bootstrap-watcher 25 min + 5 min slack)
4. DefaultProvisioningTimeout (10 min)
Wiring:
- registry: new RuntimeTimeoutLookup function type, threaded through
StartProvisioningTimeoutSweep + sweepStuckProvisioning + the
pre-existing provisioningTimeoutFor.
- handlers: ProvisionTimeoutSecondsForRuntime exposes the cache's
lookup as a method so main.go can pass it without breaking the
handlers→registry import direction.
- cmd/server/main.go: wire wh.ProvisionTimeoutSecondsForRuntime into
the sweep boot.
Verified:
- go test -race ./... passes (every workspace-server package).
- Regression-injected the lookup arm: 3 manifest-override subcases
fail with the actual-vs-expected gap, confirming the new test is
load-bearing.
- The original two timeout tests (env-override, hermes default) keep
passing — `lookup=nil` argument preserves their semantics.
Operator action enabled: a template wanting a 15-min window can now
just set `runtime_config.provision_timeout_seconds: 900` in its
config.yaml and the sweeper honours it on the next workspace-server
restart.
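
For illustration, a template that wants the 15-min window would declare
something like the following (only the
runtime_config.provision_timeout_seconds key comes from this change;
the surrounding layout is a placeholder):

    # hypothetical <template>/config.yaml excerpt
    runtime_config:
      provision_timeout_seconds: 900   # 15 min; canvas spinner and sweeper both read it
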
Co-Authored-By: Claude Opus 4.7 (1M context)
---
workspace-server/cmd/server/main.go | 8 +-
.../internal/handlers/workspace.go | 16 ++++
.../internal/registry/provisiontimeout.go | 52 +++++++++---
.../registry/provisiontimeout_test.go | 85 ++++++++++++++++---
4 files changed, 135 insertions(+), 26 deletions(-)
diff --git a/workspace-server/cmd/server/main.go b/workspace-server/cmd/server/main.go
index f620537b..2021d631 100644
--- a/workspace-server/cmd/server/main.go
+++ b/workspace-server/cmd/server/main.go
@@ -260,7 +260,13 @@ func main() {
// and the state is incoherent (e.g. user sees "Retry" after 15min but
// backend still thinks provisioning is in progress).
go supervised.RunWithRecover(ctx, "provision-timeout-sweep", func(c context.Context) {
- registry.StartProvisioningTimeoutSweep(c, broadcaster, registry.DefaultProvisionSweepInterval)
+ // Pass the handler's per-runtime template-manifest lookup so the
+ // sweeper honours `runtime_config.provision_timeout_seconds`
+ // declared in any template's config.yaml — the same value the
+ // canvas already reads via addProvisionTimeoutMs. Without this
+ // the sweeper killed claude-code at the 10-min hardcoded floor
+ // regardless of the manifest. See registry.RuntimeTimeoutLookup.
+ registry.StartProvisioningTimeoutSweep(c, broadcaster, registry.DefaultProvisionSweepInterval, wh.ProvisionTimeoutSecondsForRuntime)
})
// Cron Scheduler — fires A2A messages to workspaces on user-defined schedules
diff --git a/workspace-server/internal/handlers/workspace.go b/workspace-server/internal/handlers/workspace.go
index 78181e61..32057f22 100644
--- a/workspace-server/internal/handlers/workspace.go
+++ b/workspace-server/internal/handlers/workspace.go
@@ -498,6 +498,22 @@ func (h *WorkspaceHandler) addProvisionTimeoutMs(ws map[string]interface{}, runt
}
}
+// ProvisionTimeoutSecondsForRuntime returns the per-runtime provision
+// timeout in seconds when a template's config.yaml declared
+// `runtime_config.provision_timeout_seconds`, else 0 ("no override —
+// caller falls through to its own default").
+//
+// Exported so cmd/server/main.go can pass it to
+// registry.StartProvisioningTimeoutSweep — same template-manifest value
+// the canvas reads via addProvisionTimeoutMs. Without this, the
+// sweeper killed claude-code at 10 min while the manifest declared a
+// longer window, and a user saw the "Retry" UI before their image
+// pull even finished. See registry.RuntimeTimeoutLookup for the
+// resolution order.
+func (h *WorkspaceHandler) ProvisionTimeoutSecondsForRuntime(runtime string) int {
+ return h.provisionTimeouts.get(h.configsDir, runtime)
+}
+
// scanWorkspaceRow is a helper to scan workspace+layout rows into a clean JSON map.
func scanWorkspaceRow(rows interface {
Scan(dest ...interface{}) error
diff --git a/workspace-server/internal/registry/provisiontimeout.go b/workspace-server/internal/registry/provisiontimeout.go
index 268c929e..1b35798e 100644
--- a/workspace-server/internal/registry/provisiontimeout.go
+++ b/workspace-server/internal/registry/provisiontimeout.go
@@ -47,18 +47,44 @@ const HermesProvisioningTimeout = 30 * time.Minute
// query which hits the primary key / status partial index.
const DefaultProvisionSweepInterval = 30 * time.Second
-// provisioningTimeoutFor picks the per-runtime sweep deadline. Mirrors
-// the CP bootstrap-watcher's runtime gating (provisioner.bootstrapTimeoutFn).
-// PROVISION_TIMEOUT_SECONDS env override, when set, applies to ALL
-// runtimes — useful for ops debugging but loses the runtime nuance, so
-// operators should prefer the defaults unless they have a specific
-// reason.
-func provisioningTimeoutFor(runtime string) time.Duration {
+// RuntimeTimeoutLookup returns the per-runtime provision timeout in
+// seconds when a template's config.yaml declared
+// `runtime_config.provision_timeout_seconds`, else zero (= "no override,
+// fall through to runtime defaults below"). Same shape as
+// runtimeProvisionTimeoutsCache.get in handlers — wired through main.go
+// so this package stays template-discovery agnostic.
+//
+// Why a function type instead of importing the cache directly: registry
+// already sits below handlers in the import graph (handlers → registry,
+// not the reverse). A function-typed argument keeps that flow.
+type RuntimeTimeoutLookup func(runtime string) int
+
+// provisioningTimeoutFor picks the per-runtime sweep deadline. Resolution
+// order:
+//
+// 1. PROVISION_TIMEOUT_SECONDS env — global override, ops-debug only.
+// 2. Template manifest override (lookup) — what the canvas spinner
+// also reads via #2054 phase 2. Without this, a template that
+// declared `runtime_config.provision_timeout_seconds: 900` would
+// still get killed by the sweeper at the 10-min hardcoded floor —
+// a real wiring gap that drove every claude-code burst on a cold
+// EC2 to false-positive timeout.
+// 3. Hermes special-case (CP bootstrap-watcher 25 min + 5 min slack).
+// 4. DefaultProvisioningTimeout (10 min) for everything else.
+//
+// lookup may be nil (during package tests, or before main.go has wired
+// it) — falls through to the legacy hermes/default split.
+func provisioningTimeoutFor(runtime string, lookup RuntimeTimeoutLookup) time.Duration {
if v := os.Getenv("PROVISION_TIMEOUT_SECONDS"); v != "" {
if n, err := strconv.Atoi(v); err == nil && n > 0 {
return time.Duration(n) * time.Second
}
}
+ if lookup != nil {
+ if secs := lookup(runtime); secs > 0 {
+ return time.Duration(secs) * time.Second
+ }
+ }
if runtime == "hermes" {
return HermesProvisioningTimeout
}
@@ -74,7 +100,7 @@ func provisioningTimeoutFor(runtime string) time.Duration {
// The sweep is idempotent: the UPDATE's WHERE clause re-checks both status
// and age under the same row lock, so a workspace that raced to `online` or
// was restarted while the sweep was scanning will not get flipped.
-func StartProvisioningTimeoutSweep(ctx context.Context, emitter ProvisionTimeoutEmitter, interval time.Duration) {
+func StartProvisioningTimeoutSweep(ctx context.Context, emitter ProvisionTimeoutEmitter, interval time.Duration, lookup RuntimeTimeoutLookup) {
if emitter == nil {
log.Println("Provision-timeout sweep: emitter is nil — skipping (no one to broadcast to)")
return
@@ -85,15 +111,15 @@ func StartProvisioningTimeoutSweep(ctx context.Context, emitter ProvisionTimeout
ticker := time.NewTicker(interval)
defer ticker.Stop()
- log.Printf("Provision-timeout sweep: started (interval=%s, timeout=%s default / %s hermes)",
- interval, DefaultProvisioningTimeout, HermesProvisioningTimeout)
+ log.Printf("Provision-timeout sweep: started (interval=%s, timeout=%s default / %s hermes / per-runtime manifest override=%v)",
+ interval, DefaultProvisioningTimeout, HermesProvisioningTimeout, lookup != nil)
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
- sweepStuckProvisioning(ctx, emitter)
+ sweepStuckProvisioning(ctx, emitter, lookup)
}
}
}
@@ -109,7 +135,7 @@ func StartProvisioningTimeoutSweep(ctx context.Context, emitter ProvisionTimeout
// sweep, leaving an incoherent "marked failed but actually working"
// state. See bootstrap_watcher.go's bootstrapTimeoutFn for the
// canonical CP-side gating.
-func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter) {
+func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter, lookup RuntimeTimeoutLookup) {
// We can't pre-filter by age in SQL because the threshold depends
// on the row's runtime. Pull every provisioning row + its runtime
// + its age, evaluate per-row in Go. Still cheap — the
@@ -141,7 +167,7 @@ func sweepStuckProvisioning(ctx context.Context, emitter ProvisionTimeoutEmitter
}
for _, c := range ids {
- timeout := provisioningTimeoutFor(c.runtime)
+ timeout := provisioningTimeoutFor(c.runtime, lookup)
timeoutSec := int(timeout / time.Second)
if c.ageSec < timeoutSec {
continue
diff --git a/workspace-server/internal/registry/provisiontimeout_test.go b/workspace-server/internal/registry/provisiontimeout_test.go
index fccb966f..3d1017f2 100644
--- a/workspace-server/internal/registry/provisiontimeout_test.go
+++ b/workspace-server/internal/registry/provisiontimeout_test.go
@@ -66,7 +66,7 @@ func TestSweepStuckProvisioning_FlipsOverdue(t *testing.T) {
WillReturnResult(sqlmock.NewResult(0, 1))
emit := &fakeEmitter{}
- sweepStuckProvisioning(context.Background(), emit)
+ sweepStuckProvisioning(context.Background(), emit, nil)
if emit.count() != 1 {
t.Fatalf("expected 1 event, got %d", emit.count())
@@ -96,7 +96,7 @@ func TestSweepStuckProvisioning_HermesGets30MinSlack(t *testing.T) {
WillReturnRows(candidateRows([3]any{"ws-hermes-booting", "hermes", 660}))
emit := &fakeEmitter{}
- sweepStuckProvisioning(context.Background(), emit)
+ sweepStuckProvisioning(context.Background(), emit, nil)
if emit.count() != 0 {
t.Fatalf("hermes at 11min should NOT have been flipped, got %d events", emit.count())
@@ -121,7 +121,7 @@ func TestSweepStuckProvisioning_HermesPastDeadline(t *testing.T) {
WillReturnResult(sqlmock.NewResult(0, 1))
emit := &fakeEmitter{}
- sweepStuckProvisioning(context.Background(), emit)
+ sweepStuckProvisioning(context.Background(), emit, nil)
if emit.count() != 1 {
t.Fatalf("hermes past 30min must be flipped, got %d events", emit.count())
@@ -151,7 +151,7 @@ func TestSweepStuckProvisioning_RaceSafe(t *testing.T) {
WillReturnResult(sqlmock.NewResult(0, 0)) // 0 rows — raced
emit := &fakeEmitter{}
- sweepStuckProvisioning(context.Background(), emit)
+ sweepStuckProvisioning(context.Background(), emit, nil)
if emit.count() != 0 {
t.Errorf("expected 0 events on race, got %d", emit.count())
@@ -170,7 +170,7 @@ func TestSweepStuckProvisioning_NoStuck(t *testing.T) {
WillReturnRows(candidateRows())
emit := &fakeEmitter{}
- sweepStuckProvisioning(context.Background(), emit)
+ sweepStuckProvisioning(context.Background(), emit, nil)
if emit.count() != 0 {
t.Errorf("expected 0 events when nothing stuck, got %d", emit.count())
@@ -201,7 +201,7 @@ func TestSweepStuckProvisioning_MultipleStuck(t *testing.T) {
WillReturnResult(sqlmock.NewResult(0, 1))
emit := &fakeEmitter{}
- sweepStuckProvisioning(context.Background(), emit)
+ sweepStuckProvisioning(context.Background(), emit, nil)
if emit.count() != 2 {
t.Fatalf("expected 2 events, got %d", emit.count())
@@ -222,7 +222,7 @@ func TestSweepStuckProvisioning_BroadcastFailureDoesNotCrash(t *testing.T) {
emit := &fakeEmitter{fail: true}
// Must not panic.
- sweepStuckProvisioning(context.Background(), emit)
+ sweepStuckProvisioning(context.Background(), emit, nil)
}
// TestProvisioningTimeout_EnvOverride verifies PROVISION_TIMEOUT_SECONDS
@@ -231,18 +231,18 @@ func TestSweepStuckProvisioning_BroadcastFailureDoesNotCrash(t *testing.T) {
func TestProvisioningTimeout_EnvOverride(t *testing.T) {
t.Setenv("PROVISION_TIMEOUT_SECONDS", "60")
// When env override is set it wins over runtime defaults.
- if got := provisioningTimeoutFor(""); got.Seconds() != 60 {
+ if got := provisioningTimeoutFor("", nil); got.Seconds() != 60 {
t.Errorf("override (no runtime): got %v, want 60s", got)
}
- if got := provisioningTimeoutFor("hermes"); got.Seconds() != 60 {
+ if got := provisioningTimeoutFor("hermes", nil); got.Seconds() != 60 {
t.Errorf("override (hermes): got %v, want 60s", got)
}
t.Setenv("PROVISION_TIMEOUT_SECONDS", "")
- if got := provisioningTimeoutFor(""); got != DefaultProvisioningTimeout {
+ if got := provisioningTimeoutFor("", nil); got != DefaultProvisioningTimeout {
t.Errorf("default (no runtime): got %v, want %v", got, DefaultProvisioningTimeout)
}
t.Setenv("PROVISION_TIMEOUT_SECONDS", "not-a-number")
- if got := provisioningTimeoutFor("claude-code"); got != DefaultProvisioningTimeout {
+ if got := provisioningTimeoutFor("claude-code", nil); got != DefaultProvisioningTimeout {
t.Errorf("bad override (claude-code): got %v, want default %v", got, DefaultProvisioningTimeout)
}
}
@@ -266,8 +266,69 @@ func TestProvisioningTimeout_RuntimeAware(t *testing.T) {
{"unknown-runtime", DefaultProvisioningTimeout},
}
for _, c := range cases {
- if got := provisioningTimeoutFor(c.runtime); got != c.want {
+ if got := provisioningTimeoutFor(c.runtime, nil); got != c.want {
t.Errorf("runtime=%q: got %v, want %v", c.runtime, got, c.want)
}
}
}
+
+// TestProvisioningTimeout_ManifestOverride pins the resolution order
+// when a template's config.yaml declared
+// `runtime_config.provision_timeout_seconds`. Without this gate, the
+// sweeper kept the hardcoded 10-min floor regardless of manifest —
+// which is the original wiring gap that drove false-positive timeouts
+// on cold-pull claude-code bursts.
+//
+// Order pinned:
+//
+// 1. PROVISION_TIMEOUT_SECONDS env beats everything (ops debug).
+// 2. Manifest lookup beats hermes special-case + default.
+// 3. Hermes default applies when lookup returns 0 for hermes.
+// 4. DefaultProvisioningTimeout applies when lookup returns 0 for
+// anything else.
+// 5. Lookup returning 0 for ANY runtime is "no override" — never
+// a 0-second timeout (which would kill every workspace instantly).
+func TestProvisioningTimeout_ManifestOverride(t *testing.T) {
+ manifest := map[string]int{
+ "claude-code": 900, // 15 min — what an ops manifest bump would set
+ "langgraph": 1200,
+ "hermes": 2400, // 40 min — manifest can override hermes default too
+ }
+ lookup := func(runtime string) int { return manifest[runtime] }
+
+ cases := []struct {
+ name string
+ runtime string
+ want time.Duration
+ }{
+ {"manifest override beats default for claude-code", "claude-code", 900 * time.Second},
+ {"manifest override applied for langgraph", "langgraph", 1200 * time.Second},
+ {"manifest override beats hermes default", "hermes", 2400 * time.Second},
+ {"unknown runtime + no manifest entry → default", "unknown-runtime", DefaultProvisioningTimeout},
+ {"empty runtime + no manifest entry → default", "", DefaultProvisioningTimeout},
+ }
+ for _, c := range cases {
+ t.Run(c.name, func(t *testing.T) {
+ if got := provisioningTimeoutFor(c.runtime, lookup); got != c.want {
+ t.Errorf("got %v, want %v", got, c.want)
+ }
+ })
+ }
+
+ // Env override beats manifest — ops debug must be the top priority.
+ t.Setenv("PROVISION_TIMEOUT_SECONDS", "60")
+ if got := provisioningTimeoutFor("claude-code", lookup); got.Seconds() != 60 {
+ t.Errorf("env-override should beat manifest: got %v, want 60s", got)
+ }
+ t.Setenv("PROVISION_TIMEOUT_SECONDS", "")
+
+ // Lookup returning 0 means "no entry" — must NOT result in a
+ // 0-second timeout. Falls through to runtime defaults.
+ zeroLookup := func(_ string) int { return 0 }
+ if got := provisioningTimeoutFor("claude-code", zeroLookup); got != DefaultProvisioningTimeout {
+ t.Errorf("zero-from-lookup should fall through to default, got %v", got)
+ }
+ if got := provisioningTimeoutFor("hermes", zeroLookup); got != HermesProvisioningTimeout {
+ t.Errorf("zero-from-lookup should fall through to hermes default, got %v", got)
+ }
+}
From a15972066b4b4d0c4600baba87cb894678af4fd9 Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Fri, 1 May 2026 22:00:04 -0700
Subject: [PATCH 59/61] harness(phase-2-followup): fix assert_status mislabel +
honest race comment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Two review nits from PR #2493 that don't affect correctness but matter
for honesty in the harness's own self-documentation:
1. tenant-isolation.sh F3/F4 used assert_status for non-HTTP values.
LEAKED_INTO_ALPHA/BETA are jq-derived counts, not HTTP codes — but
the assertion ran through assert_status, which formats the result
as "(HTTP 0)". Anyone reading the test output would believe these
assertions involved an HTTP call. Adds a plain `assert` helper
matching per-tenant-independence.sh's pattern, and uses it on the
two count comparisons.
2. per-tenant-independence.sh Phase F over-claimed coverage.
The comment said the concurrent-INSERT race catches "shared-pool
corruption" + "lib/pq prepared-statement cache collision". Both
are real failure modes — but neither can fire across tenants in
THIS topology, because each tenant owns its own DATABASE_URL and
its own postgres-{alpha,beta} container. The comment now lists
only what the test actually catches (redis cross-keyspace bleed,
shared cp-stub state corruption, cf-proxy buffer mixup) and notes
that a future shared-Postgres variant is the right place for the
lib/pq cache assertion.
No behavioural change — both replays still pass 13/13 + 12/12, all six
replays pass on a clean run-all-replays.sh boot.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
.../harness/replays/per-tenant-independence.sh | 15 +++++++++++----
tests/harness/replays/tenant-isolation.sh | 18 ++++++++++++++++--
2 files changed, 27 insertions(+), 6 deletions(-)
diff --git a/tests/harness/replays/per-tenant-independence.sh b/tests/harness/replays/per-tenant-independence.sh
index 86e8759a..e80a663a 100755
--- a/tests/harness/replays/per-tenant-independence.sh
+++ b/tests/harness/replays/per-tenant-independence.sh
@@ -120,10 +120,17 @@ assert "E3: postgres-alpha has zero beta-named workspaces" "0" "$ALPHA_HAS_BETA"
assert "E4: postgres-beta has zero alpha-named workspaces" "0" "$BETA_HAS_ALPHA"
# ─── Phase F: concurrent INSERT race ───────────────────────────────────
-# Both tenants take turns inserting 10 rows concurrently. Race shape
-# catches: shared-connection-pool corruption, lib/pq prepared-statement
-# cache collision (org-wide hazard per memory), redis cross-keyspace
-# bleed. Each side must end with EXACTLY +10 rows from its own writes.
+# Both tenants insert 10 rows concurrently. Race shape catches the
+# failure modes that CAN cross tenants in this topology:
+# - redis cross-keyspace bleed (shared redis container).
+# - shared-cp-stub state corruption (single Go process serves both).
+# - cf-proxy buffer mixup under simultaneous in-flight writes.
+# Does NOT catch lib/pq prepared-statement cache collision or shared
+# *sql.DB pool poisoning — each tenant has its own DATABASE_URL and
+# its own postgres-{alpha,beta} container, so there is no shared pool
+# to corrupt. A future replay variant on a single shared Postgres
+# would be the right place to assert that failure mode.
+# Each side must end with EXACTLY +10 rows from its own writes.
echo ""
echo "[replay] F. concurrent insert race — 10 rows per tenant in parallel"
diff --git a/tests/harness/replays/tenant-isolation.sh b/tests/harness/replays/tenant-isolation.sh
index 48887c6f..13e4ddf3 100755
--- a/tests/harness/replays/tenant-isolation.sh
+++ b/tests/harness/replays/tenant-isolation.sh
@@ -61,6 +61,20 @@ assert_status() {
fi
}
+# Plain equality check — for non-HTTP values (counts, names, etc.).
+# Distinct from assert_status so output reads naturally instead of
+# claiming "(HTTP 0)" for what is really a count.
+assert() {
+ local desc="$1" expected="$2" actual="$3"
+ if [ "$expected" = "$actual" ]; then
+ printf " PASS %s\n" "$desc"
+ PASS=$((PASS + 1))
+ else
+ printf " FAIL %s\n expected: %s\n got : %s\n" "$desc" "$expected" "$actual" >&2
+ FAIL=$((FAIL + 1))
+ fi
+}
+
# ─── Phase A: positive controls ────────────────────────────────────────
echo "[replay] A. positive controls — each tenant accepts its own valid creds"
@@ -148,11 +162,11 @@ fi
# Cross-check: neither tenant's list contains the other's workspace ids.
LEAKED_INTO_ALPHA=$(echo "$ALPHA_LIST" | jq -r --arg b1 "$BETA_PARENT_ID" --arg b2 "$BETA_CHILD_ID" \
'[.[] | select(.id == $b1 or .id == $b2)] | length')
-assert_status "F3: alpha list contains zero beta workspace ids" "0" "$LEAKED_INTO_ALPHA"
+assert "F3: alpha list contains zero beta workspace ids" "0" "$LEAKED_INTO_ALPHA"
LEAKED_INTO_BETA=$(echo "$BETA_LIST" | jq -r --arg a1 "$ALPHA_PARENT_ID" --arg a2 "$ALPHA_CHILD_ID" \
'[.[] | select(.id == $a1 or .id == $a2)] | length')
-assert_status "F4: beta list contains zero alpha workspace ids" "0" "$LEAKED_INTO_BETA"
+assert "F4: beta list contains zero alpha workspace ids" "0" "$LEAKED_INTO_BETA"
# ─── Phase G: /health is allowlisted (sanity) ──────────────────────────
echo ""
From 0064f02c0040a8b33072ad1e35e3b2760fc5d21f Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Fri, 1 May 2026 22:00:36 -0700
Subject: [PATCH 60/61] test(sweeper): integration coverage for
manifest-override + accessor consolidation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Two follow-ups from PR #2494's review:
1. Two new sweep tests exercise the lookup path through
sweepStuckProvisioning end-to-end:
- ManifestOverrideSparesRow: claude-code 11min old, manifest=20min
→ no UPDATE, no broadcast (sparing works through the sweeper)
- ManifestOverrideStillFlipsPastDeadline: claude-code 21min old,
manifest=20min → flipped + payload.timeout_secs=1200
Closes the gap that the unit test on provisioningTimeoutFor alone
left open: a future refactor could drop the lookup arg from the
sweeper's call and the unit test would still pass. Verified by
regression-injecting `lookup→nil` in sweepStuckProvisioning — both
new tests fail, the old ones still pass.
2. addProvisionTimeoutMs now goes through ProvisionTimeoutSecondsForRuntime
instead of calling provisionTimeouts.get directly. Single accessor
path for the same data — the canvas response and the sweeper now
resolve identically by construction.
No production behavior change; tests + accessor cleanup only.
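For reference, a minimal sketch of the two call paths the new tests
pin (assumed shapes only, not the actual source; the bodies below are
illustrative):

    // RuntimeTimeoutLookup is the per-runtime override hook the sweeper
    // receives; 0 means "no manifest-declared timeout" (signature as
    // exercised by the new tests).
    type RuntimeTimeoutLookup func(runtime string) int

    // provisioningTimeoutFor, assumed shape: a positive manifest
    // override wins, otherwise the default budget applies.
    func provisioningTimeoutFor(lookup RuntimeTimeoutLookup, runtime string, defaultSecs int) int {
        if lookup != nil {
            if secs := lookup(runtime); secs > 0 {
                return secs // e.g. claude-code manifest override: 1200
            }
        }
        return defaultSecs // e.g. 600 (the 10 min default budget)
    }

    // ProvisionTimeoutSecondsForRuntime, assumed body: the single
    // accessor both addProvisionTimeoutMs and the sweeper's lookup
    // resolve through, wrapping the existing provisionTimeouts.get.
    func (h *WorkspaceHandler) ProvisionTimeoutSecondsForRuntime(runtime string) int {
        return h.provisionTimeouts.get(h.configsDir, runtime)
    }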
Co-Authored-By: Claude Opus 4.7 (1M context)
---
.../internal/handlers/workspace.go | 2 +-
.../registry/provisiontimeout_test.go | 78 +++++++++++++++++++
2 files changed, 79 insertions(+), 1 deletion(-)
diff --git a/workspace-server/internal/handlers/workspace.go b/workspace-server/internal/handlers/workspace.go
index 32057f22..9f31cb77 100644
--- a/workspace-server/internal/handlers/workspace.go
+++ b/workspace-server/internal/handlers/workspace.go
@@ -493,7 +493,7 @@ func (h *WorkspaceHandler) Create(c *gin.Context) {
// has no declared timeout — the canvas-side resolver falls through to
// its runtime-profile default.
func (h *WorkspaceHandler) addProvisionTimeoutMs(ws map[string]interface{}, runtime string) {
- if secs := h.provisionTimeouts.get(h.configsDir, runtime); secs > 0 {
+ if secs := h.ProvisionTimeoutSecondsForRuntime(runtime); secs > 0 {
ws["provision_timeout_ms"] = secs * 1000
}
}
diff --git a/workspace-server/internal/registry/provisiontimeout_test.go b/workspace-server/internal/registry/provisiontimeout_test.go
index 3d1017f2..29cc904e 100644
--- a/workspace-server/internal/registry/provisiontimeout_test.go
+++ b/workspace-server/internal/registry/provisiontimeout_test.go
@@ -136,6 +136,84 @@ func TestSweepStuckProvisioning_HermesPastDeadline(t *testing.T) {
}
}
+// TestSweepStuckProvisioning_ManifestOverrideSparesRow pins the
+// integration of the sweeper + RuntimeTimeoutLookup contract introduced
+// in #2494. Closes the gap that the unit test on provisioningTimeoutFor
+// alone left open: a future refactor could drop the lookup arg from
+// sweepStuckProvisioning's call to provisioningTimeoutFor and the unit
+// test would still pass. This test fails on that refactor.
+//
+// Scenario: a claude-code workspace 11 min old (660s). Default budget
+// is 10 min (600s) → without manifest override, this would be flipped
+// to failed. Manifest override declares 1200s → it should be SPARED.
+// No UPDATE, no event emitted.
+func TestSweepStuckProvisioning_ManifestOverrideSparesRow(t *testing.T) {
+ mock := setupTestDB(t)
+
+ mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
+ WillReturnRows(candidateRows([3]any{"ws-claude-templated", "claude-code", 660}))
+
+ // No ExpectExec — if the sweeper still flips the row, sqlmock will
+ // fail with an unexpected-query error.
+
+ lookup := func(runtime string) int {
+ if runtime == "claude-code" {
+ return 1200 // manifest override: 20 min
+ }
+ return 0
+ }
+
+ emit := &fakeEmitter{}
+ sweepStuckProvisioning(context.Background(), emit, lookup)
+
+ if emit.count() != 0 {
+ t.Errorf("manifest-overridden row should NOT have been flipped, got %d events", emit.count())
+ }
+ if err := mock.ExpectationsWereMet(); err != nil {
+ t.Errorf("unmet expectations: %v", err)
+ }
+}
+
+// TestSweepStuckProvisioning_ManifestOverrideStillFlipsPastDeadline —
+// the symmetric case. Manifest override gives a longer window but a
+// row past THAT longer window must still be flipped. Otherwise a
+// template that declares an absurd timeout could leave rows wedged
+// forever.
+func TestSweepStuckProvisioning_ManifestOverrideStillFlipsPastDeadline(t *testing.T) {
+ mock := setupTestDB(t)
+
+ // 21 min = 1260s > 1200s manifest override → flipped.
+ mock.ExpectQuery(`SELECT id, COALESCE\(runtime, ''\), EXTRACT`).
+ WillReturnRows(candidateRows([3]any{"ws-claude-truly-stuck", "claude-code", 1260}))
+ mock.ExpectExec(`UPDATE workspaces`).
+ WithArgs("ws-claude-truly-stuck", sqlmock.AnyArg(), sqlmock.AnyArg(), models.StatusFailed).
+ WillReturnResult(sqlmock.NewResult(0, 1))
+
+ lookup := func(runtime string) int {
+ if runtime == "claude-code" {
+ return 1200
+ }
+ return 0
+ }
+
+ emit := &fakeEmitter{}
+ sweepStuckProvisioning(context.Background(), emit, lookup)
+
+ if emit.count() != 1 {
+ t.Fatalf("row past manifest deadline must still be flipped, got %d events", emit.count())
+ }
+ payload, ok := emit.events[0].Payload.(map[string]interface{})
+ if !ok {
+ t.Fatalf("payload not a map: %T", emit.events[0].Payload)
+ }
+ if payload["timeout_secs"] != 1200 {
+ t.Errorf("payload.timeout_secs = %v, want 1200 (manifest override applied to event payload)", payload["timeout_secs"])
+ }
+ if err := mock.ExpectationsWereMet(); err != nil {
+ t.Errorf("unmet expectations: %v", err)
+ }
+}
+
// TestSweepStuckProvisioning_RaceSafe covers the case where UPDATE affects
// 0 rows because the workspace flipped to online (or got restarted) between
// the SELECT and the UPDATE. We should skip the event, not emit a false
From 3d8a0a58fa35829443d48afb36fdeb8159d3c70f Mon Sep 17 00:00:00 2001
From: Hongming Wang
Date: Fri, 1 May 2026 22:28:35 -0700
Subject: [PATCH 61/61] ci(auto-sync): App-token dispatch + ubuntu-latest +
workflow_dispatch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
auto-sync-main-to-staging.yml hasn't fired since 2026-04-29, despite
multiple staging→main promotes in that window. The promote PR #2442
(Phase 2) has been wedged on `mergeStateStatus: BEHIND` for hours
because staging is missing the merge commit from PR #2437.
Three compounding bugs, all fixed here:
1. **GitHub no-recursion suppresses the `on: push` trigger.**
When the merge queue lands a staging→main promote, the resulting
push to main is "by GITHUB_TOKEN", and per
https://docs.github.com/en/actions/using-workflows/triggering-a-workflow#triggering-a-workflow-from-a-workflow
that push event does NOT fire any downstream workflows. Verified
empirically against SHA 76c604fb (PR #2437): exactly ONE workflow
fired on that push — `publish-workspace-server-image`, dispatched
explicitly by auto-promote-staging.yml's polling tail with an App
token (the documented #2357 workaround). Every other `on: push`
workflow on main, including auto-sync, was silently suppressed.
Same fix extended here: auto-promote-staging.yml's polling tail
now ALSO dispatches `auto-sync-main-to-staging.yml --ref main`
via the App token after the merge lands. App-initiated dispatch
propagates `workflow_run` cascades, which is what the publish
tail relies on too. Failure path: emits `::error::` with the
recovery command — operator runs it once and the next promote
self-heals.
auto-sync.yml gains `workflow_dispatch:` so it can be invoked both
by the dispatch above and manually if a future promote is missed
(defense in depth).
2. **`runs-on: [self-hosted, macos, arm64]` was wrong for this repo.**
Comment claimed "matches the rest of this repo's workflows" — false:
this is the ONLY workflow in molecule-core/.github/workflows/ with
a non-ubuntu runs-on. Copy-paste artefact from molecule-controlplane
(which IS private and has a Mac runner). molecule-core has no Mac
runner registered, so even when the trigger DID fire (the 3 historic
manual-UI merges), the job sat unassigned waiting for a runner this
repo doesn't have. Switched to `ubuntu-latest` to match every other
workflow in this repo.
3. **The `on: push` trigger remains** as a defense-in-depth path for
the rare case of a manual UI merge by a real user (which is
attributed to the user's own credentials, not GITHUB_TOKEN, and DOES
fire downstream workflows — confirmed via the
2026-04-29 d35a2420 run with `triggering_actor=HongmingWang-Rabbit`
that fired 16 workflows including auto-sync). Belt-and-suspenders.
Long-term: switching auto-promote's `gh pr merge --auto` call to use
the App token (instead of GITHUB_TOKEN) would let `on: push` triggers
fire naturally and obviate the need for the explicit dispatches in
the polling tail. Tracked in #2357 — out of scope here.
Operator recovery for the current Phase 2 wedge: after this lands on
staging, dispatch auto-sync once via
`gh workflow run auto-sync-main-to-staging.yml --ref main` to
backfill the missed sync from 76c604fb. PR #2442 will go from
BEHIND → CLEAN and auto-merge.
Co-Authored-By: Claude Opus 4.7 (1M context)
---
.github/workflows/auto-promote-staging.yml | 18 ++++++++++++
.../workflows/auto-sync-main-to-staging.yml | 28 +++++++++++++++++--
2 files changed, 44 insertions(+), 2 deletions(-)
diff --git a/.github/workflows/auto-promote-staging.yml b/.github/workflows/auto-promote-staging.yml
index a62010f2..de6ce46a 100644
--- a/.github/workflows/auto-promote-staging.yml
+++ b/.github/workflows/auto-promote-staging.yml
@@ -364,3 +364,21 @@ jobs:
else
echo "::error::Failed to dispatch publish-workspace-server-image. Run manually: gh workflow run publish-workspace-server-image.yml --ref main"
fi
+
+ # ALSO dispatch auto-sync-main-to-staging.yml. Same root cause as
+ # publish above (issue #2357): the merge-queue-initiated push to
+ # main is by GITHUB_TOKEN → no `on: push` triggers fire downstream.
+ # Without this dispatch, every staging→main promote leaves staging
+ # one merge commit BEHIND main, which silently deadlocks the NEXT
+ # promote PR as `mergeStateStatus: BEHIND` because main's
+ # branch-protection has `strict: true`. Verified empirically on
+ # 2026-05-02 against PR #2442 (Phase 2 promote): only the explicit
+ # publish-workspace-server-image dispatch fired on the previous
+ # promote SHA 76c604fb, while auto-sync silently no-op'd, leaving
+ # staging behind for ~24h until manually bridged.
+ if gh workflow run auto-sync-main-to-staging.yml \
+ --repo "$REPO" --ref main 2>&1; then
+ echo "::notice::Dispatched auto-sync-main-to-staging on ref=main as molecule-ai App — staging will absorb the new main merge commit via PR + merge queue."
+ else
+ echo "::error::Failed to dispatch auto-sync-main-to-staging. Run manually: gh workflow run auto-sync-main-to-staging.yml --ref main"
+ fi
diff --git a/.github/workflows/auto-sync-main-to-staging.yml b/.github/workflows/auto-sync-main-to-staging.yml
index 36ab63f7..9a0140d7 100644
--- a/.github/workflows/auto-sync-main-to-staging.yml
+++ b/.github/workflows/auto-sync-main-to-staging.yml
@@ -60,6 +60,24 @@ name: Auto-sync main → staging
on:
push:
branches: [main]
+ # workflow_dispatch lets:
+ # 1. Operators manually backfill a missed sync (e.g. after a manual
+ # UI merge that the runner missed).
+ # 2. auto-promote-staging.yml's polling tail explicitly invoke us
+ # after the promote PR lands. This is load-bearing: when the
+ # merge queue lands a promote-PR merge, the resulting push to
+ # `main` is "by GITHUB_TOKEN", and per GitHub's no-recursion
+ # rule (https://docs.github.com/en/actions/using-workflows/triggering-a-workflow#triggering-a-workflow-from-a-workflow)
+ # that push event does NOT fire any downstream workflows. The
+ # `on: push` trigger above is silently dead for the very pattern
+ # we exist to handle. Verified empirically 2026-05-02 against
+ # SHA 76c604fb (PR #2437 staging→main): only ONE workflow fired
+ # (publish-workspace-server-image, dispatched explicitly by
+ # auto-promote's polling tail with an App token). Every other
+ # `on: push: branches: [main]` workflow — including this one —
+ # was suppressed. Until the underlying merge call moves to an
+ # App token, an explicit dispatch is the only reliable path.
+ workflow_dispatch:
permissions:
contents: write
@@ -71,8 +89,14 @@ concurrency:
jobs:
sync-staging:
- # Self-hosted Mac mini matches the rest of this repo's workflows.
- runs-on: [self-hosted, macos, arm64]
+ # ubuntu-latest matches every other workflow in this repo. The
+ # earlier `[self-hosted, macos, arm64]` was a copy-paste artefact
+ # from the molecule-controlplane repo (which IS private and uses a
+ # Mac runner) — molecule-core has no Mac runner registered, so the
+ # job sat unassigned whenever the trigger fired. Verified 2026-05-02:
+ # this is the ONLY workflow in molecule-core/.github/workflows/ with
+ # a non-ubuntu runs-on.
+ runs-on: ubuntu-latest
steps:
- name: Checkout staging
uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4