From 3c16c274152ff78bcc7872113337df1acd5113ae Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 30 Apr 2026 20:39:48 -0700 Subject: [PATCH 01/61] ci(wheel-smoke): always-run with per-step if-gates for required-check eligibility MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `PR-built wheel + import smoke` gate caught the broken wheel from PR #2433 (`import inbox as _inbox_module` collision) but couldn't block the merge because it isn't a required check on staging. Promoting it to required is the right move per the runtime publish pipeline gates note (2026-04-27 RuntimeCapabilities ImportError outage), but the existing `paths: [workspace/**, scripts/...]` filter blocks PRs that don't touch those paths from ever generating the check run — branch protection would deadlock waiting on a check that never fires. Refactor (same shape as e2e-api.yml's e2e-api job): - Drop top-level `paths:` filter — workflow runs on every push/PR/ merge_group event. - Add `detect-changes` job using dorny/paths-filter to compute the `wheel=true|false` output. - Collapse to ONE always-running `local-build-install` job named `PR-built wheel + import smoke`. Per-step `if:` gates on the detect output. PRs untouched by wheel-relevant paths emit a no-op SUCCESS step ("paths filter excluded this commit") so the check passes without rebuilding the wheel. - merge_group + workflow_dispatch unconditionally `wheel=true` so the queue always validates the to-be-merged state, regardless of which PR composed it. Why one-job-with-step-gates instead of two-jobs-sharing-name: SKIPPED check runs block branch protection even when SUCCESS siblings exist (verified PR #2264 incident, 2026-04-29). Single always-run job emits exactly one SUCCESS check run regardless of paths filter. Follow-up: open a separate PR adding `PR-built wheel + import smoke` to the staging branch protection's required_status_checks.contexts once this lands. 
Doing both in one PR risks the protection update firing before the workflow refactor merges, deadlocking unrelated PRs. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/runtime-prbuild-compat.yml | 83 ++++++++++++++------ 1 file changed, 59 insertions(+), 24 deletions(-) diff --git a/.github/workflows/runtime-prbuild-compat.yml b/.github/workflows/runtime-prbuild-compat.yml index 96f1a289..0bc9a511 100644 --- a/.github/workflows/runtime-prbuild-compat.yml +++ b/.github/workflows/runtime-prbuild-compat.yml @@ -23,55 +23,88 @@ name: Runtime PR-Built Compatibility # # By building from the PR's source and smoke-importing THAT wheel, we # fail at PR-time instead of after publish. +# +# Required-check shape (2026-05-01): the workflow runs on EVERY push + +# PR + merge_group event with no top-level `paths:` filter, then uses a +# detect-changes job + per-step `if:` gates inside ONE always-running +# job named `PR-built wheel + import smoke`. PRs that don't touch +# wheel-relevant paths get a no-op SUCCESS check run, satisfying branch +# protection without re-running the heavy build. Same pattern as +# e2e-api.yml — see its comment for the full rationale + the 2026-04-29 +# PR #2264 incident that motivated the always-run-with-if-gates shape. on: push: branches: [main, staging] - paths: - # Broad filter: this workflow's verdict can change whenever any - # workspace/ source file changes (because the wheel we build is - # produced from those files), or when the build script itself - # changes (it controls the wheel layout). 
- - 'workspace/**' - - 'scripts/build_runtime_package.py' - - 'scripts/wheel_smoke.py' - - '.github/workflows/runtime-prbuild-compat.yml' pull_request: branches: [main, staging] - paths: - - 'workspace/**' - - 'scripts/build_runtime_package.py' - - 'scripts/wheel_smoke.py' - - '.github/workflows/runtime-prbuild-compat.yml' workflow_dispatch: - # Required-check support: when this becomes a branch-protection gate, - # merge_group runs let the queue green-check this in addition to PRs. merge_group: types: [checks_requested] - # No cron: the same pre-merge run already covered the commit, and - # re-running daily wouldn't surface anything new (workspace/ doesn't - # change between cron firings unless a PR already passed this gate). concurrency: - group: ${{ github.workflow }}-${{ github.ref }} + group: ${{ github.workflow }}-${{ github.event.pull_request.head.sha || github.sha }} cancel-in-progress: true jobs: + detect-changes: + runs-on: ubuntu-latest + outputs: + wheel: ${{ steps.decide.outputs.wheel }} + steps: + - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + - uses: dorny/paths-filter@fbd0ab8f3e69293af611ebaee6363fc25e6d187d # v4.0.1 + id: filter + with: + filters: | + wheel: + - 'workspace/**' + - 'scripts/build_runtime_package.py' + - 'scripts/wheel_smoke.py' + - '.github/workflows/runtime-prbuild-compat.yml' + - id: decide + # Always run real work for manual dispatch + merge_group — no + # diff-against-base in those contexts, and the gate exists to + # validate the to-be-merged state regardless of which paths it + # touched (paths-filter would default to "no changes" which is + # the wrong answer when the queue is composing many PRs). 
+ run: | + if [ "${{ github.event_name }}" = "workflow_dispatch" ] || [ "${{ github.event_name }}" = "merge_group" ]; then + echo "wheel=true" >> "$GITHUB_OUTPUT" + else + echo "wheel=${{ steps.filter.outputs.wheel }}" >> "$GITHUB_OUTPUT" + fi + + # ONE job (no job-level `if:`) that always runs and reports under the + # required-check name `PR-built wheel + import smoke`. Real work is + # gated per-step on `needs.detect-changes.outputs.wheel`. Same shape + # as e2e-api.yml's e2e-api job — see its comment block for the full + # rationale (SKIPPED check runs block branch protection even with + # SUCCESS siblings; collapsing to one always-run job emits exactly + # one SUCCESS check run). local-build-install: - # Builds the wheel from THIS PR's workspace/ + scripts/ and tests - # IT — the artifact that WOULD be published if this PR merges. + needs: detect-changes name: PR-built wheel + import smoke runs-on: ubuntu-latest steps: - - uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 - - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 + - name: No-op pass (paths filter excluded this commit) + if: needs.detect-changes.outputs.wheel != 'true' + run: | + echo "No workspace/ / scripts/{build_runtime_package,wheel_smoke}.py / workflow changes — wheel gate satisfied without rebuilding." + echo "::notice::PR-built wheel + import smoke no-op pass (paths filter excluded this commit)." 
+ - if: needs.detect-changes.outputs.wheel == 'true' + uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4 + - if: needs.detect-changes.outputs.wheel == 'true' + uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: '3.11' cache: pip cache-dependency-path: workspace/requirements.txt - name: Install build tooling + if: needs.detect-changes.outputs.wheel == 'true' run: pip install build - name: Build wheel from PR source (mirrors publish-runtime.yml) + if: needs.detect-changes.outputs.wheel == 'true' # Use a fixed test version so the wheel filename is predictable. # Doesn't reach PyPI — this build is local-only for the smoke. # Use the SAME build script with the SAME args as @@ -88,6 +121,7 @@ jobs: --out /tmp/runtime-build cd /tmp/runtime-build && python -m build - name: Install built wheel + workspace requirements + if: needs.detect-changes.outputs.wheel == 'true' run: | python -m venv /tmp/venv-built /tmp/venv-built/bin/pip install --upgrade pip @@ -96,6 +130,7 @@ jobs: /tmp/venv-built/bin/pip show molecule-ai-workspace-runtime a2a-sdk \ | grep -E '^(Name|Version):' - name: Smoke import the PR-built wheel + if: needs.detect-changes.outputs.wheel == 'true' # Same script publish-runtime.yml runs against the to-be-PyPI wheel. # Closes the PR-time vs publish-time gap: a PR adding a new SDK # call-shape no longer passes here (narrow `import main_sync`) only From 6e92fe0a086f17d1ca7daa7b192e4a1655505d78 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 30 Apr 2026 20:45:32 -0700 Subject: [PATCH 02/61] chore: rewriter unit tests + drop misleading noqa on `import inbox` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three small follow-ups to the PR #2433 → #2436 → #2439 incident chain. 1) `import inbox # noqa: F401` in workspace/a2a_mcp_server.py was misleading — `inbox` IS used (at the bridge wiring inside main()).
F401 means "imported but unused", which would mask a real future F401 if the usage is removed. Drop the noqa, keep the explanatory block comment about the rewriter's `import X` → `import mr.X as X` expansion (and the `import X as Y` → `import mr.X as X as Y` trap the comment exists to prevent re-introducing). 2) scripts/test_build_runtime_package.py — 17 unit tests covering `rewrite_imports()` and `build_import_rewriter()` in scripts/build_runtime_package.py. Until now the function had zero coverage despite the entire wheel build depending on it. Tests pin: bare-import aliasing, dotted-import preservation, indented imports, from-imports (simple + dotted + multi-symbol + block), the `import X as Y` rejection added in PR #2436 (with comment- stripping + indented + comma-not-alias edge cases), allowlist anchoring (`a2a` ≠ `a2a_tools`), and end-to-end reproduction of the PR #2433 failing pattern + the #2436 fix pattern. 3) Wire scripts/test_*.py into CI by adding a second discover pass to test-ops-scripts.yml. Top-level scripts/ tests live alongside their target file (parallels the scripts/ops/ test layout); the existing scripts/ops/ pass keeps running because scripts/ops/ has no __init__.py so a single discover from scripts/ root doesn't recurse. Two passes is simpler than retrofitting namespace packages. Path filter widened from `scripts/ops/**` to `scripts/**` so PRs touching the build script trigger the new tests. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/test-ops-scripts.yml | 26 +++- scripts/test_build_runtime_package.py | 201 +++++++++++++++++++++++++ workspace/a2a_mcp_server.py | 8 +- 3 files changed, 226 insertions(+), 9 deletions(-) create mode 100644 scripts/test_build_runtime_package.py diff --git a/.github/workflows/test-ops-scripts.yml b/.github/workflows/test-ops-scripts.yml index 3c6488fa..a6f342e1 100644 --- a/.github/workflows/test-ops-scripts.yml +++ b/.github/workflows/test-ops-scripts.yml @@ -1,19 +1,25 @@ name: Ops Scripts Tests -# Runs the unittest suite for scripts/ops/ on every PR + push that touches -# the directory. Kept separate from the main CI so a script-only change -# doesn't trigger the heavier Go/Canvas/Python pipelines. +# Runs the unittest suite for scripts/ on every PR + push that touches +# anything under scripts/. Kept separate from the main CI so a script-only +# change doesn't trigger the heavier Go/Canvas/Python pipelines. +# +# Discovery: `unittest discover` from the scripts/ root walks recursively, +# so both `scripts/test_*.py` and `scripts/ops/test_*.py` are picked up. +# Tests should sit alongside the code they test (see +# scripts/ops/test_sweep_cf_decide.py for the pattern; scripts/ +# test_build_runtime_package.py for the rewriter coverage). on: push: branches: [main, staging] paths: - - 'scripts/ops/**' + - 'scripts/**' - '.github/workflows/test-ops-scripts.yml' pull_request: branches: [main, staging] paths: - - 'scripts/ops/**' + - 'scripts/**' - '.github/workflows/test-ops-scripts.yml' merge_group: types: [checks_requested] @@ -31,6 +37,14 @@ jobs: - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0 with: python-version: '3.11' - - name: Run unittest + - name: Run scripts/ unittests (build_runtime_package, …) + # Top-level scripts/ tests live alongside their target file + # (e.g. scripts/test_build_runtime_package.py exercises + # scripts/build_runtime_package.py). 
discover from scripts/ + # picks up only top-level test_*.py because scripts/ops/ has + # no __init__.py — that's intentional, so we run two passes. + working-directory: scripts + run: python -m unittest discover -t . -p 'test_*.py' -v + - name: Run scripts/ops/ unittests (sweep_cf_decide, …) working-directory: scripts/ops run: python -m unittest discover -p 'test_*.py' -v diff --git a/scripts/test_build_runtime_package.py b/scripts/test_build_runtime_package.py new file mode 100644 index 00000000..ec57b5e2 --- /dev/null +++ b/scripts/test_build_runtime_package.py @@ -0,0 +1,201 @@ +"""Tests for scripts/build_runtime_package.py — the wheel-build import rewriter. + +Run locally: ``python3 -m unittest scripts/test_build_runtime_package.py -v`` + +Why this exists: PR #2433 shipped ``import inbox as _inbox_module`` inside +the workspace runtime, and the rewriter expanded it to +``import molecule_runtime.inbox as inbox as _inbox_module`` — invalid +Python. The wheel-smoke gate caught it post-merge but couldn't block +the merge (not a required check yet — see PR #2439). PR #2436 added a +build-time gate that raises ``ValueError`` on this pattern; this file +locks the rewriter's documented contract under unit test so the gate +itself can't silently regress. + +Coverage: +- ``import X`` → ``import molecule_runtime.X as X`` +- ``import X.sub`` → ``import molecule_runtime.X.sub`` +- ``import X`` + trailing comment is preserved +- ``from X import Y`` → ``from molecule_runtime.X import Y`` +- ``from X.sub import Y`` → ``from molecule_runtime.X.sub import Y`` +- ``from X import Y, Z`` → ``from molecule_runtime.X import Y, Z`` +- ``import X as Y`` → raises ValueError (the rewriter would + produce ``import molecule_runtime.X as X as Y``, syntax error) +- non-allowlist module names → not rewritten (regex anchors on the closed set) +- Indented imports (inside def/class) keep their indentation. 
+""" +from __future__ import annotations + +import os +import sys +import unittest + +# scripts/build_runtime_package.py lives at scripts/ — add scripts/ to sys.path +# so the import works whether unittest is invoked from repo root or scripts/. +HERE = os.path.dirname(os.path.abspath(__file__)) +if HERE not in sys.path: + sys.path.insert(0, HERE) + +import build_runtime_package as M # noqa: E402 + + +def rewrite(text: str) -> str: + """Run the rewriter end-to-end so the test exercises the same path + used by the wheel build (regex compile + substitution).""" + regex = M.build_import_rewriter() + return M.rewrite_imports(text, regex) + + +class TestBareImportRewriting(unittest.TestCase): + def test_plain_import_aliases_to_preserve_binding(self): + self.assertEqual( + rewrite("import inbox\n"), + "import molecule_runtime.inbox as inbox\n", + ) + + def test_plain_import_with_trailing_comment_is_preserved(self): + # Real-world shape from a2a_mcp_server.py — the comment must + # survive the rewrite without losing its leading-space buffer. + self.assertEqual( + rewrite("import inbox # noqa: E402\n"), + "import molecule_runtime.inbox as inbox # noqa: E402\n", + ) + + def test_import_dotted_keeps_dotted_form(self): + # `import X.sub` is rare for our modules but the rewriter must + # not double-alias — we want `import molecule_runtime.X.sub`, + # not `import molecule_runtime.X.sub as X.sub` (invalid). 
+ self.assertEqual( + rewrite("import platform_tools.registry\n"), + "import molecule_runtime.platform_tools.registry\n", + ) + + def test_indented_import_preserves_indentation(self): + src = "def foo():\n import inbox\n return inbox.x\n" + out = rewrite(src) + self.assertIn(" import molecule_runtime.inbox as inbox\n", out) + + +class TestFromImportRewriting(unittest.TestCase): + def test_from_module_import_simple(self): + self.assertEqual( + rewrite("from inbox import InboxState\n"), + "from molecule_runtime.inbox import InboxState\n", + ) + + def test_from_dotted_import(self): + self.assertEqual( + rewrite("from platform_tools.registry import TOOLS\n"), + "from molecule_runtime.platform_tools.registry import TOOLS\n", + ) + + def test_from_import_multiple_symbols(self): + # Multi-import statement — the rewriter only touches the module + # prefix, not the names being imported. + self.assertEqual( + rewrite("from a2a_tools import (foo, bar, baz)\n"), + "from molecule_runtime.a2a_tools import (foo, bar, baz)\n", + ) + + def test_from_import_block_form(self): + src = ( + "from a2a_tools import (\n" + " tool_check_task_status,\n" + " tool_commit_memory,\n" + ")\n" + ) + out = rewrite(src) + self.assertIn("from molecule_runtime.a2a_tools import (\n", out) + # Trailing names + closer are unchanged. + self.assertIn(" tool_check_task_status,\n", out) + self.assertIn(")\n", out) + + +class TestImportAsAliasRejection(unittest.TestCase): + """The key regression class — the failure mode that shipped in PR #2433.""" + + def test_import_as_alias_raises_value_error(self): + with self.assertRaises(ValueError) as ctx: + rewrite("import inbox as _inbox_module\n") + msg = str(ctx.exception) + # Error must name the offending module + suggest the fix. 
+ self.assertIn("inbox", msg) + self.assertIn("as ", msg) + self.assertIn("from", msg) # suggests `from X import …` + + def test_import_as_alias_indented_still_rejected(self): + # Indented (inside def/class) — same hazard, same rejection. + with self.assertRaises(ValueError): + rewrite("def foo():\n import inbox as _x\n") + + def test_import_as_alias_with_trailing_comment_still_rejected(self): + with self.assertRaises(ValueError): + rewrite("import inbox as _x # comment\n") + + def test_plain_import_with_as_in_comment_does_not_trip(self): + # The detection strips comments before pattern-matching, so a + # comment containing "as foo" must NOT trigger the rejection. + self.assertEqual( + rewrite("import inbox # rewriter produces alias as inbox\n"), + "import molecule_runtime.inbox as inbox # rewriter produces alias as inbox\n", + ) + + def test_import_followed_by_comma_is_not_an_alias(self): + # `import inbox, os` — comma is not `as`, must not be rejected. + # Our regex captures `inbox` then `,` — only `inbox` gets prefixed. + # `os` is not in TOP_LEVEL_MODULES so it's left alone. + out = rewrite("import inbox, os\n") + # The first module is rewritten; the second (non-allowlist) is not. + self.assertIn("import molecule_runtime.inbox as inbox", out) + + +class TestOutsideAllowlistModules(unittest.TestCase): + def test_third_party_imports_unchanged(self): + # `httpx`, `os`, `re` etc. are not in TOP_LEVEL_MODULES — the + # regex must not match them. This is the closed-list invariant + # that prevents accidental rewrites of stdlib / third-party. + src = "import httpx\nimport os\nfrom re import match\n" + self.assertEqual(rewrite(src), src) + + def test_short_name_collision_avoided(self): + # `from a2a.server.X import Y` must not match the bare `a2a` + # prefix — `a2a` isn't in our allowlist (we allow `a2a_tools`, + # `a2a_client`, etc., but not bare `a2a`). Belt-and-suspenders. 
+ src = "from a2a.server.routes import create_agent_card_routes\n" + self.assertEqual(rewrite(src), src) + + +class TestEndToEndShape(unittest.TestCase): + """Reproduces the PR #2433 → #2436 incident shape.""" + + def test_pr_2433_pattern_now_rejected(self): + # The exact line PR #2433 added (inside main()), which produced + # `import molecule_runtime.inbox as inbox as _inbox_module` — + # invalid syntax in the published wheel. + with self.assertRaises(ValueError) as ctx: + rewrite( + " import inbox as _inbox_module\n" + " _inbox_module.set_notification_callback(_on_inbox_message)\n" + ) + # Error message includes the offending line so the operator + # knows exactly where to fix. + self.assertIn("inbox", str(ctx.exception)) + + def test_pr_2436_fix_pattern_works(self): + # The fix-forward shape (#2436): top-level `import inbox`, + # bridge wired in main() via `inbox.set_notification_callback`. + src = ( + "import inbox\n" + "\n" + "def main():\n" + " inbox.set_notification_callback(cb)\n" + ) + out = rewrite(src) + self.assertIn("import molecule_runtime.inbox as inbox\n", out) + # The callable reference inside main() is left alone — only + # imports get rewritten, not arbitrary `inbox.foo` callsites + # (those resolve via the module binding the rewrite preserves). + self.assertIn(" inbox.set_notification_callback(cb)\n", out) + + +if __name__ == "__main__": + unittest.main() diff --git a/workspace/a2a_mcp_server.py b/workspace/a2a_mcp_server.py index 09512f26..b3255bf0 100644 --- a/workspace/a2a_mcp_server.py +++ b/workspace/a2a_mcp_server.py @@ -17,9 +17,11 @@ import json import logging import sys -import inbox # noqa: F401 — bridge wiring lives in main(); the rewriter -# produces `import molecule_runtime.inbox as inbox` -# which preserves this binding for set_notification_callback. +# Top-level (not inside main()) so the wheel rewriter expands this to +# `import molecule_runtime.inbox as inbox`. 
A local `import inbox as _x` +# would expand to `import molecule_runtime.inbox as inbox as _x`, +# which is invalid — see scripts/build_runtime_package.py:rewrite_imports. +import inbox from a2a_tools import ( tool_check_task_status, From 067ad83ce510e022988e779b7f3e65c704f1aad1 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 30 Apr 2026 20:47:09 -0700 Subject: [PATCH 03/61] feat(config): add explicit `provider:` field alongside `model:` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a top-level `provider` slug to WorkspaceConfig and RuntimeConfig so adapters can route to a specific gateway without re-implementing slug-prefix parsing across hermes / claude-code / codex. Resolution chain in load_config (mirrors how `model` resolves): 1. ``LLM_PROVIDER`` env var — what canvas Save+Restart sets so the operator's Provider dropdown choice survives a CP-driven restart (the regenerated /configs/config.yaml drops most user fields). 2. Explicit YAML ``provider:`` — operator pinned it in the file. 3. Derive from the model slug prefix for backward compat: ``anthropic:claude-opus-4-7`` → ``anthropic`` ``minimax/abab7-chat-preview`` → ``minimax`` bare model names → ``""`` (let the adapter decide). `runtime_config.provider` falls back to the top-level resolved provider, the same shape PR #2438 added for `runtime_config.model`. Why a separate field at all (we already parse the slug): - Custom model aliases without a recognizable prefix need an explicit signal — the canvas Provider dropdown writes it. - Adapters were each rolling their own slug-parse (hermes's derive-provider.sh, claude-code's adapter-default branch, etc.); one resolution point in load_config kills that drift class. - Canvas needs a stable storage field that doesn't get clobbered every time the user picks a new model. Backward-compatible: when `provider:` is absent, slug derivation keeps every existing config.yaml working without a migration. 
PR-1 of a multi-PR stack (Option B from RFC discussion). Subsequent PRs plumb the field through workspace-server env, CP user-data, adapters (hermes prefers explicit over derive-provider.sh), and canvas Provider dropdown UI. Tests cover all four resolution paths + runtime_config inheritance: - test_provider_default_empty_when_bare_model - test_provider_derived_from_colon_slug - test_provider_derived_from_slash_slug - test_provider_yaml_explicit_wins_over_derived - test_provider_env_override_beats_yaml_and_derived - test_runtime_config_provider_yaml_wins_over_top_level - test_provider_default_from_default_model Co-Authored-By: Claude Opus 4.7 (1M context) --- workspace/config.py | 54 ++++++++++++ workspace/tests/test_config.py | 151 +++++++++++++++++++++++++++++++++ 2 files changed, 205 insertions(+) diff --git a/workspace/config.py b/workspace/config.py index 370ada11..3b205f1b 100644 --- a/workspace/config.py +++ b/workspace/config.py @@ -96,6 +96,10 @@ class RuntimeConfig: required_env: list[str] = field(default_factory=list) # env vars required to run (e.g. ["CLAUDE_CODE_OAUTH_TOKEN"]) timeout: int = 0 # seconds (0 = no timeout — agents wait until done) model: str = "" # model override for the CLI + provider: str = "" # explicit LLM provider (e.g., "anthropic", "openai", + # "minimax"). Falls back to the top-level resolved + # provider when empty. Adapters (hermes, claude-code, + # codex) prefer this over slug-parsing the model name. # Deprecated — use required_env + secrets API instead. Kept for backward compat. auth_token_env: str = "" auth_token_file: str = "" @@ -221,6 +225,16 @@ class WorkspaceConfig: version: str = "1.0.0" tier: int = 1 model: str = "anthropic:claude-opus-4-7" + provider: str = "" + """Explicit LLM provider slug (e.g., ``anthropic``, ``openai``, ``minimax``). 
+ + When empty, ``load_config`` derives it from the ``model`` slug prefix + (``anthropic:claude-opus-4-7`` → ``anthropic``; ``minimax/abab7-chat`` → + ``minimax``; bare model names → ``""``). Set explicitly via the canvas + Provider dropdown or the ``LLM_PROVIDER`` env var when the model name + is provider-ambiguous (e.g., a custom alias) or when an adapter needs + a specific gateway distinct from the model namespace. + """ runtime: str = "langgraph" # langgraph | claude-code | codex | ollama | custom runtime_config: RuntimeConfig = field(default_factory=RuntimeConfig) initial_prompt: str = "" @@ -261,6 +275,20 @@ class WorkspaceConfig: automatically adds the ``task-budgets-2026-03-13`` beta header.""" +def _derive_provider_from_model(model: str) -> str: + """Extract the provider slug prefix from a model identifier. + + Recognizes both ``provider:model`` (Anthropic / OpenAI / Google convention) + and ``provider/model`` (HuggingFace / Minimax convention). Returns ``""`` + when the model has no recognizable separator — callers must treat empty + as "use adapter default routing", not as a hard failure. + """ + for sep in (":", "/"): + if sep in model: + return model.partition(sep)[0] + return "" + + def load_config(config_path: Optional[str] = None) -> WorkspaceConfig: """Load config from WORKSPACE_CONFIG_PATH or the given path.""" if config_path is None: @@ -276,6 +304,25 @@ def load_config(config_path: Optional[str] = None) -> WorkspaceConfig: # Override model from env if provided model = os.environ.get("MODEL_PROVIDER", raw.get("model", "anthropic:claude-opus-4-7")) + # Resolve top-level provider with this priority chain: + # 1. ``LLM_PROVIDER`` env var (canvas Save+Restart sets this so the + # operator's choice survives a CP-driven restart even though the + # regenerated /configs/config.yaml drops most user fields). + # 2. Explicit YAML ``provider:`` (an operator pinned it in the file). + # 3. 
Derive from the model slug prefix for backward compat: + # ``anthropic:claude-opus-4-7`` → ``anthropic`` + # ``minimax/abab7-chat-preview`` → ``minimax`` + # bare model names → ``""`` (signals "use adapter default") + # Empty after all three is fine — adapters that don't need an explicit + # provider (langgraph, claude-code-default, codex) keep their existing + # routing; adapters that do (hermes via derive-provider.sh) prefer this + # over slug-parsing the model name. + provider = ( + os.environ.get("LLM_PROVIDER") + or raw.get("provider") + or _derive_provider_from_model(model) + ) + runtime = raw.get("runtime", "langgraph") runtime_raw = raw.get("runtime_config", {}) @@ -314,6 +361,7 @@ def load_config(config_path: Optional[str] = None) -> WorkspaceConfig: version=raw.get("version", "1.0.0"), tier=int(raw.get("tier", 1)) if str(raw.get("tier", 1)).isdigit() else 1, model=model, + provider=provider, runtime=runtime, initial_prompt=initial_prompt, idle_prompt=idle_prompt, @@ -336,6 +384,12 @@ def load_config(config_path: Optional[str] = None) -> WorkspaceConfig: # MODEL_PROVIDER is plumbed as an env var, so picking it up via # the top-level resolved model keeps the selection sticky. model=runtime_raw.get("model") or model, + # Same fallback shape as ``model`` above: an explicit + # ``runtime_config.provider`` wins; otherwise inherit the + # top-level resolved provider so adapters see a single + # consistent choice without each one re-implementing + # env/YAML/slug-prefix resolution. 
+ provider=runtime_raw.get("provider") or provider, # Deprecated fields — kept for backward compat auth_token_env=runtime_raw.get("auth_token_env", ""), auth_token_file=runtime_raw.get("auth_token_file", ""), diff --git a/workspace/tests/test_config.py b/workspace/tests/test_config.py index c87198ba..bc09d638 100644 --- a/workspace/tests/test_config.py +++ b/workspace/tests/test_config.py @@ -164,6 +164,157 @@ def test_runtime_config_model_picks_up_env_via_top_level(tmp_path, monkeypatch): assert cfg.runtime_config.model == "minimax/abab7-chat-preview" +# ===== Provider field (Option B — explicit `provider:` alongside `model:`) ===== +# +# Why a separate `provider` field at all (we already parse the slug prefix off +# `model`)? Three reasons: +# 1. Custom model aliases that don't carry a recognizable prefix (e.g., a +# tenant-specific name routed through a gateway) need an explicit signal. +# 2. Adapters were each implementing their own slug-parse — hermes's +# derive-provider.sh, claude-code's adapter-default branch, etc. One +# resolution point in load_config kills that drift class. +# 3. The canvas Provider dropdown needs a stable storage field that doesn't +# get clobbered every time the user picks a new model. +# +# Backward compat: when `provider:` is absent, fall back to slug derivation, +# so existing config.yaml files keep working without a migration. + + +def test_provider_default_empty_when_bare_model(tmp_path, monkeypatch): + """Bare model names (no `:` or `/` separator) yield an empty provider — + the signal for "let the adapter decide". Don't guess. 
+ """ + monkeypatch.delenv("LLM_PROVIDER", raising=False) + monkeypatch.delenv("MODEL_PROVIDER", raising=False) + config_yaml = tmp_path / "config.yaml" + config_yaml.write_text(yaml.dump({"model": "claude-opus-4-7"})) + + cfg = load_config(str(tmp_path)) + assert cfg.provider == "" + assert cfg.runtime_config.provider == "" + + +def test_provider_derived_from_colon_slug(tmp_path, monkeypatch): + """`provider:model` shape (Anthropic/OpenAI/Google convention) derives + the provider from the prefix when no explicit `provider:` is set. + Exercises the backward-compat path for every existing config.yaml in + the wild. + """ + monkeypatch.delenv("LLM_PROVIDER", raising=False) + monkeypatch.delenv("MODEL_PROVIDER", raising=False) + config_yaml = tmp_path / "config.yaml" + config_yaml.write_text(yaml.dump({"model": "anthropic:claude-opus-4-7"})) + + cfg = load_config(str(tmp_path)) + assert cfg.provider == "anthropic" + # runtime_config.provider inherits the same way runtime_config.model does. + assert cfg.runtime_config.provider == "anthropic" + + +def test_provider_derived_from_slash_slug(tmp_path, monkeypatch): + """`provider/model` shape (HuggingFace/Minimax convention) derives the + provider from the prefix when no explicit `provider:` is set. + """ + monkeypatch.delenv("LLM_PROVIDER", raising=False) + monkeypatch.delenv("MODEL_PROVIDER", raising=False) + config_yaml = tmp_path / "config.yaml" + config_yaml.write_text(yaml.dump({"model": "minimax/abab7-chat-preview"})) + + cfg = load_config(str(tmp_path)) + assert cfg.provider == "minimax" + assert cfg.runtime_config.provider == "minimax" + + +def test_provider_yaml_explicit_wins_over_derived(tmp_path, monkeypatch): + """Explicit YAML `provider:` overrides the slug-prefix derivation — + needed when the model name's prefix doesn't match the actual gateway + (e.g., an `anthropic:claude-opus-4-7` model routed through a custom + gateway slug). 
+ """ + monkeypatch.delenv("LLM_PROVIDER", raising=False) + monkeypatch.delenv("MODEL_PROVIDER", raising=False) + config_yaml = tmp_path / "config.yaml" + config_yaml.write_text( + yaml.dump( + { + "model": "anthropic:claude-opus-4-7", + "provider": "custom-gateway", + } + ) + ) + + cfg = load_config(str(tmp_path)) + # Slug prefix says "anthropic" but the explicit field wins. + assert cfg.provider == "custom-gateway" + assert cfg.runtime_config.provider == "custom-gateway" + + +def test_provider_env_override_beats_yaml_and_derived(tmp_path, monkeypatch): + """`LLM_PROVIDER` env var beats both YAML and slug derivation. + This is the path the canvas Save+Restart cycle relies on: the user + picks a provider in the canvas Provider dropdown, the platform sets + `LLM_PROVIDER` on the workspace, and the next CP-driven restart picks + it up regardless of what's in the regenerated /configs/config.yaml. + """ + monkeypatch.setenv("LLM_PROVIDER", "minimax") + monkeypatch.delenv("MODEL_PROVIDER", raising=False) + config_yaml = tmp_path / "config.yaml" + # YAML says one thing, slug says another, env wins. + config_yaml.write_text( + yaml.dump( + { + "model": "anthropic:claude-opus-4-7", + "provider": "openai", + } + ) + ) + + cfg = load_config(str(tmp_path)) + assert cfg.provider == "minimax" + assert cfg.runtime_config.provider == "minimax" + + +def test_runtime_config_provider_yaml_wins_over_top_level(tmp_path, monkeypatch): + """An explicit `runtime_config.provider` takes precedence over the + top-level resolved provider — same fallback shape as `model`. Needed + when a workspace wants the top-level model/provider to stay + user-visible while pinning the runtime to a different gateway. 
+ """ + monkeypatch.delenv("LLM_PROVIDER", raising=False) + monkeypatch.delenv("MODEL_PROVIDER", raising=False) + config_yaml = tmp_path / "config.yaml" + config_yaml.write_text( + yaml.dump( + { + "model": "anthropic:claude-opus-4-7", + "runtime_config": {"provider": "openai"}, + } + ) + ) + + cfg = load_config(str(tmp_path)) + # Top-level still derives from the slug. + assert cfg.provider == "anthropic" + # runtime_config.provider explicit override wins. + assert cfg.runtime_config.provider == "openai" + + +def test_provider_default_from_default_model(tmp_path, monkeypatch): + """When config.yaml is empty, the WorkspaceConfig default model + (`anthropic:claude-opus-4-7`) yields provider=`anthropic`. Pins the + "no config" boot path to a sensible derived provider. + """ + monkeypatch.delenv("LLM_PROVIDER", raising=False) + monkeypatch.delenv("MODEL_PROVIDER", raising=False) + config_yaml = tmp_path / "config.yaml" + config_yaml.write_text(yaml.dump({})) + + cfg = load_config(str(tmp_path)) + assert cfg.model == "anthropic:claude-opus-4-7" + assert cfg.provider == "anthropic" + assert cfg.runtime_config.provider == "anthropic" + + def test_delegation_config_defaults(tmp_path): """DelegationConfig nested defaults are applied.""" config_yaml = tmp_path / "config.yaml" From e58e446444d8ddbb3fc6f5599a3c90cfa2344442 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 30 Apr 2026 20:52:34 -0700 Subject: [PATCH 04/61] =?UTF-8?q?docs(ci):=20correct=20test-ops-scripts.ym?= =?UTF-8?q?l=20header=20=E2=80=94=20discover=20does=20NOT=20recurse?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous header said `unittest discover from the scripts/ root walks recursively`, contradicting the workflow body which runs two passes precisely because discover does NOT recurse without __init__.py. Fixed self-review feedback on PR #2440. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/test-ops-scripts.yml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test-ops-scripts.yml b/.github/workflows/test-ops-scripts.yml index a6f342e1..ca8cb0af 100644 --- a/.github/workflows/test-ops-scripts.yml +++ b/.github/workflows/test-ops-scripts.yml @@ -4,11 +4,13 @@ name: Ops Scripts Tests # anything under scripts/. Kept separate from the main CI so a script-only # change doesn't trigger the heavier Go/Canvas/Python pipelines. # -# Discovery: `unittest discover` from the scripts/ root walks recursively, -# so both `scripts/test_*.py` and `scripts/ops/test_*.py` are picked up. -# Tests should sit alongside the code they test (see +# Discovery layout: tests sit alongside the code they test (see # scripts/ops/test_sweep_cf_decide.py for the pattern; scripts/ -# test_build_runtime_package.py for the rewriter coverage). +# test_build_runtime_package.py for the rewriter coverage). The job +# below runs `unittest discover` TWICE — once from `scripts/`, once +# from `scripts/ops/` — because neither dir has an `__init__.py`, so +# a single discover from `scripts/` doesn't recurse into the ops +# subdir. Two passes is simpler than retrofitting namespace packages. on: push: From d012a803e48ce8eff2cf39049cd8aa0093c16436 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 30 Apr 2026 21:10:20 -0700 Subject: [PATCH 05/61] feat(terminal): add diagnose endpoint for SSH probe stages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GET /workspaces/:id/terminal/diagnose runs the same per-stage pipeline as /terminal (ssh-keygen → EIC send-key → tunnel → ssh) but non-interactively and returns JSON. Each stage reports {name, ok, duration_ms, error, detail}, plus a top-level first_failure naming the broken stage. 
Why: when the canvas terminal silently disconnects ("Session ended" with no error frame — the user-reported failure mode on hongmingwang's hermes workspace), there is no remote-readable signal of WHICH stage failed. The ssh client's stderr lives only in the workspace-server's stdout on the tenant CP EC2 — invisible without shell access. /terminal can't expose stderr cleanly because it has already upgraded to WebSocket binary frames by the time ssh runs. /terminal/diagnose stays pure HTTP/JSON, so the same auth (WorkspaceAuth + ADMIN_TOKEN fallback) gives operators a one-call probe that splits "IAM broke" (send-ssh-public-key fails) from "tunnel/SG broke" (wait-for-port fails) from "sshd auth broke" (ssh-probe gets Permission denied) from "shell broke" (probe exits non-zero with stderr). Stages mirrored from handleRemoteConnect in terminal.go: 1. ssh-keygen ephemeral session keypair 2. send-ssh-public-key AWS EIC API push, IAM-gated 3. pick-free-port local port for the tunnel 4. open-tunnel aws ec2-instance-connect open-tunnel start 5. wait-for-port the tunnel actually listens (folds tunnel stderr into Detail when it doesn't) 6. ssh-probe non-interactive `ssh ... 'echo MARKER'` that confirms auth + bash + the marker round-trip (CombinedOutput captures stderr verbatim — this is the whole reason the endpoint exists) Local Docker workspaces (no instance_id) get a smaller probe: container-found + container-running. Same response shape so callers don't need to branch. Tests stub sendSSHPublicKey / openTunnelCmd / sshProbeCmd via the existing package-level vars (same pattern as TestSSHCommandCmd_*) so the test suite stays hermetic — no AWS, no network. The three new tests pin: (a) routing to remote on instance_id present, (b) routing to local on empty instance_id, (c) the operationally critical case — full success through wait-for-port then a probe failure surfaces ssh stderr in the ssh-probe step's Error/Detail with first_failure="ssh-probe". 
Auth: rides on existing WorkspaceAuth middleware. Operators with the tenant ADMIN_TOKEN (fetched via /cp/admin/orgs/:slug/admin-token) can probe any workspace without per-workspace token; same admin path as the canvas dashboard reads workspace activity. Response always returns HTTP 200 (success or step failure are both in the JSON body) so callers don't need to branch on status code — the endpoint either reports a first_failure or doesn't. Resolves task #200, supports task #193 (workspace EC2 sshd unresponsive — without this endpoint we couldn't pin the failure stage from outside the tenant CP EC2). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/handlers/terminal_diagnose.go | 328 ++++++++++++++++++ .../handlers/terminal_diagnose_test.go | 222 ++++++++++++ workspace-server/internal/router/router.go | 1 + 3 files changed, 551 insertions(+) create mode 100644 workspace-server/internal/handlers/terminal_diagnose.go create mode 100644 workspace-server/internal/handlers/terminal_diagnose_test.go diff --git a/workspace-server/internal/handlers/terminal_diagnose.go b/workspace-server/internal/handlers/terminal_diagnose.go new file mode 100644 index 00000000..e40f6e19 --- /dev/null +++ b/workspace-server/internal/handlers/terminal_diagnose.go @@ -0,0 +1,328 @@ +package handlers + +import ( + "context" + "fmt" + "net/http" + "os" + "os/exec" + "strings" + "time" + + "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" + "github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner" + "github.com/gin-gonic/gin" +) + +// HandleDiagnose handles GET /workspaces/:id/terminal/diagnose. It runs the +// same per-step pipeline as HandleConnect (ssh-keygen → EIC send-key → tunnel +// → ssh) but non-interactively, captures the first failing step and its +// stderr, and returns the result as JSON. 
+// +// Why this exists: when the canvas terminal silently disconnects ("Session +// ended" with no error frame), there is no remote-readable signal of which +// stage failed. The ssh client's stderr lives in the workspace-server's +// process logs on the tenant CP EC2 — invisible without shell access. +// HandleConnect can't trivially expose stderr because it has already +// upgraded to WebSocket binary frames by the time ssh runs. HandleDiagnose +// stays pure HTTP/JSON, so the same auth (WorkspaceAuth + ADMIN_TOKEN +// fallback) gives operators a one-call probe of the whole shell pipeline. +// +// Stages mirrored from handleRemoteConnect: +// +// 1. ssh-keygen (ephemeral session keypair) +// 2. send-ssh-public-key (AWS EIC API push, IAM-gated) +// 3. pick-free-port (local port for the tunnel) +// 4. open-tunnel (aws ec2-instance-connect open-tunnel start) +// 5. wait-for-port (the tunnel actually listens) +// 6. ssh-probe (`ssh ... 'echo MARKER'` — proves end-to-end auth+shell) +// +// Local Docker workspaces (no instance_id row) get a smaller probe: +// container-found + container-running. Same response shape so callers +// don't need to branch. +func (h *TerminalHandler) HandleDiagnose(c *gin.Context) { + workspaceID := c.Param("id") + ctx, cancel := context.WithTimeout(c.Request.Context(), 30*time.Second) + defer cancel() + + var instanceID string + _ = db.DB.QueryRowContext(ctx, + `SELECT COALESCE(instance_id, '') FROM workspaces WHERE id = $1`, + workspaceID).Scan(&instanceID) + + var res diagnoseResult + if instanceID != "" { + res = h.diagnoseRemote(ctx, workspaceID, instanceID) + } else { + res = h.diagnoseLocal(ctx, workspaceID) + } + c.JSON(http.StatusOK, res) +} + +// diagnoseStep is one row in the diagnostic report. Always carries Name + +// OK + DurationMs; Error/Detail filled when the step fails. 
+type diagnoseStep struct { + Name string `json:"name"` + OK bool `json:"ok"` + DurationMs int64 `json:"duration_ms"` + Error string `json:"error,omitempty"` + Detail string `json:"detail,omitempty"` +} + +// diagnoseResult is the full report. ``OK`` is true only when every step +// passed; ``FirstFailure`` names the step that broke the chain so callers +// can route alerts (e.g., "send-ssh-public-key" → IAM team; "ssh-probe" → +// SG/sshd team). +type diagnoseResult struct { + WorkspaceID string `json:"workspace_id"` + InstanceID string `json:"instance_id,omitempty"` + Remote bool `json:"remote"` + OK bool `json:"ok"` + FirstFailure string `json:"first_failure,omitempty"` + Steps []diagnoseStep `json:"steps"` +} + +// sshProbeMarker is the string the ssh probe echoes back. Distinct from any +// shell builtin output so we can grep for it unambiguously even when the +// remote prints a banner or motd. +const sshProbeMarker = "MOLECULE_TERMINAL_PROBE_OK" + +// sshProbeCmd builds the non-interactive ssh probe command. Exposed as a +// var so tests can stub it without spinning up a real sshd. BatchMode=yes +// ensures ssh fails fast on prompt instead of hanging on a TTY. +var sshProbeCmd = func(o eicSSHOptions) *exec.Cmd { + return exec.Command( + "ssh", + "-i", o.PrivateKeyPath, + "-o", "StrictHostKeyChecking=no", + "-o", "UserKnownHostsFile=/dev/null", + "-o", "BatchMode=yes", + "-o", "ConnectTimeout=10", + "-p", fmt.Sprintf("%d", o.LocalPort), + fmt.Sprintf("%s@127.0.0.1", o.OSUser), + "echo "+sshProbeMarker, + ) +} + +// diagnoseRemote runs the full EIC + ssh probe and reports per-step status. +// Bails on the first failure so the operator sees which stage breaks; later +// stages stay in the report as zero-value rows so the response shape is +// stable regardless of where the chain stopped. 
+func (h *TerminalHandler) diagnoseRemote(ctx context.Context, workspaceID, instanceID string) diagnoseResult { + res := diagnoseResult{ + WorkspaceID: workspaceID, + InstanceID: instanceID, + Remote: true, + } + + osUser := os.Getenv("WORKSPACE_EC2_OS_USER") + if osUser == "" { + osUser = "ubuntu" + } + region := os.Getenv("AWS_REGION") + if region == "" { + region = "us-east-2" + } + + stop := func(name string, step diagnoseStep) diagnoseResult { + res.Steps = append(res.Steps, step) + res.FirstFailure = name + return res + } + + // Step 1: ssh-keygen + t0 := time.Now() + keyDir, err := os.MkdirTemp("", "molecule-diagnose-*") + if err != nil { + return stop("ssh-keygen", diagnoseStep{ + Name: "ssh-keygen", + DurationMs: time.Since(t0).Milliseconds(), + Error: fmt.Sprintf("mkdir tmp: %v", err), + }) + } + defer func() { _ = os.RemoveAll(keyDir) }() + keyPath := keyDir + "/id" + keygen := exec.CommandContext(ctx, "ssh-keygen", "-t", "ed25519", "-f", keyPath, "-N", "", "-q", "-C", "molecule-diagnose") + if out, kerr := keygen.CombinedOutput(); kerr != nil { + return stop("ssh-keygen", diagnoseStep{ + Name: "ssh-keygen", + DurationMs: time.Since(t0).Milliseconds(), + Error: kerr.Error(), + Detail: strings.TrimSpace(string(out)), + }) + } + res.Steps = append(res.Steps, diagnoseStep{Name: "ssh-keygen", OK: true, DurationMs: time.Since(t0).Milliseconds()}) + + pubKey, err := os.ReadFile(keyPath + ".pub") + if err != nil { + return stop("ssh-keygen", diagnoseStep{ + Name: "ssh-keygen", + Error: fmt.Sprintf("read pubkey: %v", err), + }) + } + + // Step 2: send-ssh-public-key (AWS Instance Connect) + t0 = time.Now() + if err := sendSSHPublicKey(ctx, region, instanceID, osUser, strings.TrimSpace(string(pubKey))); err != nil { + return stop("send-ssh-public-key", diagnoseStep{ + Name: "send-ssh-public-key", + DurationMs: time.Since(t0).Milliseconds(), + Error: err.Error(), + }) + } + res.Steps = append(res.Steps, diagnoseStep{Name: "send-ssh-public-key", OK: true, 
DurationMs: time.Since(t0).Milliseconds()}) + + // Step 3: pick-free-port + t0 = time.Now() + localPort, err := pickFreePort() + if err != nil { + return stop("pick-free-port", diagnoseStep{ + Name: "pick-free-port", + DurationMs: time.Since(t0).Milliseconds(), + Error: err.Error(), + }) + } + res.Steps = append(res.Steps, diagnoseStep{ + Name: "pick-free-port", + OK: true, + DurationMs: time.Since(t0).Milliseconds(), + Detail: fmt.Sprintf("port=%d", localPort), + }) + + // Step 4: open-tunnel (long-running subprocess; we hold its stderr so + // we can include it in failure detail for the next two stages). + opts := eicSSHOptions{ + InstanceID: instanceID, + OSUser: osUser, + Region: region, + LocalPort: localPort, + PrivateKeyPath: keyPath, + } + t0 = time.Now() + tunnel := openTunnelCmd(opts) + tunnel.Env = os.Environ() + var tunnelStderr strings.Builder + tunnel.Stderr = &tunnelStderr + if err := tunnel.Start(); err != nil { + return stop("open-tunnel", diagnoseStep{ + Name: "open-tunnel", + DurationMs: time.Since(t0).Milliseconds(), + Error: err.Error(), + Detail: tunnelStderr.String(), + }) + } + defer func() { + if tunnel.Process != nil { + _ = tunnel.Process.Kill() + } + _ = tunnel.Wait() + }() + res.Steps = append(res.Steps, diagnoseStep{Name: "open-tunnel", OK: true, DurationMs: time.Since(t0).Milliseconds()}) + + // Step 5: wait-for-port — verifies the tunnel actually bound the port. + // Tunnel-side errors (auth, SG, missing endpoint) usually surface here + // because the subprocess exits before binding. Fold its stderr into the + // detail so the operator sees the real reason. 
+ t0 = time.Now() + if err := waitForPort(ctx, "127.0.0.1", localPort, 10*time.Second); err != nil { + return stop("wait-for-port", diagnoseStep{ + Name: "wait-for-port", + DurationMs: time.Since(t0).Milliseconds(), + Error: err.Error(), + Detail: tunnelStderr.String(), + }) + } + res.Steps = append(res.Steps, diagnoseStep{Name: "wait-for-port", OK: true, DurationMs: time.Since(t0).Milliseconds()}) + + // Step 6: ssh-probe — non-interactive `ssh ... 'echo MARKER'`. Proves + // auth (key push reached sshd), shell ready (bash returns echo output), + // and the network path end-to-end. Captures combined output + exit + // error so we see "Permission denied", "Connection refused", or "Host + // key verification failed" verbatim. + t0 = time.Now() + probe := sshProbeCmd(opts) + probe.Env = os.Environ() + out, perr := probe.CombinedOutput() + outStr := strings.TrimSpace(string(out)) + durMs := time.Since(t0).Milliseconds() + if perr != nil || !strings.Contains(outStr, sshProbeMarker) { + errStr := "" + if perr != nil { + errStr = perr.Error() + } + return stop("ssh-probe", diagnoseStep{ + Name: "ssh-probe", + DurationMs: durMs, + Error: errStr, + Detail: outStr, + }) + } + res.Steps = append(res.Steps, diagnoseStep{Name: "ssh-probe", OK: true, DurationMs: durMs}) + + res.OK = true + return res +} + +// diagnoseLocal probes the Docker container path. Smaller surface: just +// "is the named container running on this Docker daemon". 
+func (h *TerminalHandler) diagnoseLocal(ctx context.Context, workspaceID string) diagnoseResult { + res := diagnoseResult{WorkspaceID: workspaceID, Remote: false} + if h.docker == nil { + res.Steps = append(res.Steps, diagnoseStep{ + Name: "docker-available", + Error: "docker client not configured on this workspace-server", + }) + res.FirstFailure = "docker-available" + return res + } + + candidates := []string{provisioner.ContainerName(workspaceID), "ws-" + workspaceID} + var foundName string + var lastErr error + var running bool + var stateStatus string + t0 := time.Now() + for _, n := range candidates { + info, err := h.docker.ContainerInspect(ctx, n) + if err == nil { + foundName = n + running = info.State.Running + stateStatus = info.State.Status + break + } + lastErr = err + } + if foundName == "" { + errMsg := "no matching container" + if lastErr != nil { + errMsg = lastErr.Error() + } + res.Steps = append(res.Steps, diagnoseStep{ + Name: "container-found", + DurationMs: time.Since(t0).Milliseconds(), + Error: errMsg, + Detail: fmt.Sprintf("tried: %s", strings.Join(candidates, ", ")), + }) + res.FirstFailure = "container-found" + return res + } + res.Steps = append(res.Steps, diagnoseStep{ + Name: "container-found", + OK: true, + DurationMs: time.Since(t0).Milliseconds(), + Detail: foundName, + }) + + if !running { + res.Steps = append(res.Steps, diagnoseStep{ + Name: "container-running", + Error: "container not running", + Detail: stateStatus, + }) + res.FirstFailure = "container-running" + return res + } + res.Steps = append(res.Steps, diagnoseStep{Name: "container-running", OK: true, Detail: stateStatus}) + res.OK = true + return res +} diff --git a/workspace-server/internal/handlers/terminal_diagnose_test.go b/workspace-server/internal/handlers/terminal_diagnose_test.go new file mode 100644 index 00000000..5cf672fe --- /dev/null +++ b/workspace-server/internal/handlers/terminal_diagnose_test.go @@ -0,0 +1,222 @@ +package handlers + +import ( + 
"context" + "encoding/json" + "errors" + "net/http/httptest" + "os/exec" + "testing" + + "github.com/DATA-DOG/go-sqlmock" + "github.com/gin-gonic/gin" +) + +// TestHandleDiagnose_RoutesToRemote pins the dispatch: a workspace row with +// a non-empty instance_id takes the EIC + ssh probe path. We stub the +// first-stage (send-ssh-public-key) to fail so the test stays +// hermetic — no AWS calls, no network — and confirm: +// +// - first_failure is "send-ssh-public-key" (not the earlier ssh-keygen) +// - the steps array includes the ssh-keygen pass + the failed +// send-ssh-public-key step +// - response is HTTP 200 (the endpoint always returns 200; failure is +// in the JSON body so callers don't need branch-on-status) +func TestHandleDiagnose_RoutesToRemote(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + + mock.ExpectQuery("SELECT COALESCE"). + WithArgs("ws-remote"). + WillReturnRows(sqlmock.NewRows([]string{"instance_id"}).AddRow("i-abc123")) + + prev := sendSSHPublicKey + sendSSHPublicKey = func(ctx context.Context, region, instanceID, osUser, pubKey string) error { + return errors.New("AccessDeniedException: not authorized") + } + defer func() { sendSSHPublicKey = prev }() + + h := NewTerminalHandler(nil) + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-remote"}} + c.Request = httptest.NewRequest("GET", "/workspaces/ws-remote/terminal/diagnose", nil) + + h.HandleDiagnose(c) + + if w.Code != 200 { + t.Fatalf("HandleDiagnose status: got %d, want 200 (body=%s)", w.Code, w.Body.String()) + } + var got diagnoseResult + if err := json.Unmarshal(w.Body.Bytes(), &got); err != nil { + t.Fatalf("response not JSON: %v (body=%s)", err, w.Body.String()) + } + if !got.Remote { + t.Errorf("Remote=false; expected true for instance_id-bearing workspace") + } + if got.OK { + t.Errorf("OK=true despite stubbed send-key failure") + } + if got.FirstFailure != "send-ssh-public-key" { + 
t.Errorf("FirstFailure=%q; want send-ssh-public-key", got.FirstFailure) + } + // ssh-keygen must run successfully before send-ssh-public-key fails. + if len(got.Steps) < 2 { + t.Fatalf("expected >=2 steps (ssh-keygen + send-ssh-public-key); got %d", len(got.Steps)) + } + if got.Steps[0].Name != "ssh-keygen" || !got.Steps[0].OK { + t.Errorf("step[0]: want ssh-keygen ok=true; got %+v", got.Steps[0]) + } + if got.Steps[1].Name != "send-ssh-public-key" || got.Steps[1].OK { + t.Errorf("step[1]: want send-ssh-public-key ok=false; got %+v", got.Steps[1]) + } + // The IAM error message must surface in the step's Error field — that's + // the whole point of the endpoint. + if got.Steps[1].Error == "" { + t.Errorf("step[1].Error is empty; AWS error must surface verbatim") + } +} + +// TestHandleDiagnose_RoutesToLocal — empty instance_id takes the Docker +// path. With nil docker client, container-found can't even start, so we +// fail at "docker-available". Confirms the local-vs-remote dispatch. +func TestHandleDiagnose_RoutesToLocal(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + + mock.ExpectQuery("SELECT COALESCE"). + WithArgs("ws-local"). 
+		WillReturnRows(sqlmock.NewRows([]string{"instance_id"}).AddRow(""))
+
+	h := NewTerminalHandler(nil)
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "ws-local"}}
+	c.Request = httptest.NewRequest("GET", "/workspaces/ws-local/terminal/diagnose", nil)
+
+	h.HandleDiagnose(c)
+
+	if w.Code != 200 {
+		t.Fatalf("status: got %d, want 200", w.Code)
+	}
+	var got diagnoseResult
+	if err := json.Unmarshal(w.Body.Bytes(), &got); err != nil {
+		t.Fatalf("response not JSON: %v", err)
+	}
+	if got.Remote {
+		t.Errorf("Remote=true; expected false for empty-instance_id workspace")
+	}
+	if got.FirstFailure != "docker-available" {
+		t.Errorf("FirstFailure=%q; want docker-available (no docker client)", got.FirstFailure)
+	}
+}
+
+// TestDiagnoseRemote_StopsAtSSHProbe — full happy path through send-key,
+// pick-port, open-tunnel, wait-for-port, then stub the ssh probe to fail.
+// Confirms first_failure surfaces the actual ssh stderr ("Permission
+// denied") rather than the earlier successful steps. This is the
+// most operationally important behavior — the endpoint exists primarily
+// to differentiate "IAM broke" (send-key fails) from "sshd broke" (probe
+// fails) from "SG/network broke" (wait-for-port fails).
+func TestDiagnoseRemote_StopsAtSSHProbe(t *testing.T) {
+	mock := setupTestDB(t)
+	setupTestRedis(t)
+
+	mock.ExpectQuery("SELECT COALESCE").
+		WithArgs("ws-probe-fail").
+		WillReturnRows(sqlmock.NewRows([]string{"instance_id"}).AddRow("i-test"))
+
+	// Stub send-key to succeed.
+	prevSend := sendSSHPublicKey
+	sendSSHPublicKey = func(ctx context.Context, region, instanceID, osUser, pubKey string) error {
+		return nil
+	}
+	defer func() { sendSSHPublicKey = prevSend }()
+
+	// Stub openTunnelCmd to spawn `nc -l <port>` so waitForPort succeeds.
+	// We need the tunnel to actually bind the port; nc does that
+	// portably. macOS has BSD nc by default.
+	prevTun := openTunnelCmd
+	openTunnelCmd = func(o eicSSHOptions) *exec.Cmd {
+		// `nc -l <port>` listens on the picked free port. -k keeps it
+		// alive across single-client disconnects on Linux nc; harmless
+		// on BSD nc which doesn't have it (we'd need -k for BSD too —
+		// fall back to a portable busy-wait).
+		return exec.Command("sh", "-c",
+			`port="$1"; while true; do nc -l "$port" >/dev/null 2>&1 || true; done`,
+			"sh", numToString(o.LocalPort))
+	}
+	defer func() { openTunnelCmd = prevTun }()
+
+	// Stub the ssh probe to return "Permission denied" with non-zero exit,
+	// the canonical "key wasn't authorized" failure.
+	prevProbe := sshProbeCmd
+	sshProbeCmd = func(o eicSSHOptions) *exec.Cmd {
+		return exec.Command("sh", "-c", "echo 'Permission denied (publickey).' >&2; exit 255")
+	}
+	defer func() { sshProbeCmd = prevProbe }()
+
+	h := NewTerminalHandler(nil)
+	w := httptest.NewRecorder()
+	c, _ := gin.CreateTestContext(w)
+	c.Params = gin.Params{{Key: "id", Value: "ws-probe-fail"}}
+	c.Request = httptest.NewRequest("GET", "/workspaces/ws-probe-fail/terminal/diagnose", nil)
+
+	h.HandleDiagnose(c)
+
+	if w.Code != 200 {
+		t.Fatalf("status: got %d", w.Code)
+	}
+	var got diagnoseResult
+	if err := json.Unmarshal(w.Body.Bytes(), &got); err != nil {
+		t.Fatalf("response not JSON: %v (body=%s)", err, w.Body.String())
+	}
+	if got.OK {
+		t.Errorf("OK=true despite stubbed probe failure")
+	}
+	if got.FirstFailure != "ssh-probe" {
+		t.Errorf("FirstFailure=%q; want ssh-probe (got body=%s)", got.FirstFailure, w.Body.String())
+	}
+	// The "Permission denied" message must be in the probe step's Detail —
+	// that's what tells the operator "this is sshd auth, not network".
+ var probeStep *diagnoseStep + for i := range got.Steps { + if got.Steps[i].Name == "ssh-probe" { + probeStep = &got.Steps[i] + break + } + } + if probeStep == nil { + t.Fatalf("no ssh-probe step in result: %+v", got.Steps) + } + if probeStep.OK { + t.Errorf("ssh-probe step OK=true despite failure stub") + } + if probeStep.Detail == "" && probeStep.Error == "" { + t.Errorf("ssh-probe step has no Error or Detail; ssh stderr is exactly what we want to expose") + } +} + +// numToString is a tiny helper to avoid pulling fmt into the test for one +// integer-to-string call. Same observable behavior as strconv.Itoa. +func numToString(n int) string { + if n == 0 { + return "0" + } + var buf [20]byte + i := len(buf) + neg := n < 0 + if neg { + n = -n + } + for n > 0 { + i-- + buf[i] = byte('0' + n%10) + n /= 10 + } + if neg { + i-- + buf[i] = '-' + } + return string(buf[i:]) +} diff --git a/workspace-server/internal/router/router.go b/workspace-server/internal/router/router.go index 3d04b12e..5373ed0f 100644 --- a/workspace-server/internal/router/router.go +++ b/workspace-server/internal/router/router.go @@ -470,6 +470,7 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi } th := handlers.NewTerminalHandler(dockerCli) wsAuth.GET("/terminal", th.HandleConnect) + wsAuth.GET("/terminal/diagnose", th.HandleDiagnose) // Canvas Viewport — #166 + #168: GET stays fully open for bootstrap. // PUT uses CanvasOrBearer (accepts Origin-match OR bearer token) so the From b9311134cf3580ea36d89e4ed99b8802e9bf2379 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 30 Apr 2026 21:19:18 -0700 Subject: [PATCH 06/61] fix(terminal-diagnose): KI-005 hierarchy check + race-free stderr capture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two fixes from /code-review-and-quality on PR #2445: 1. 
**KI-005 hierarchy check parity with /terminal** HandleConnect runs the KI-005 cross-workspace guard before dispatch (terminal.go:85-106): when X-Workspace-ID is set and != :id, validate the bearer's workspace binding then call canCommunicateCheck. Without this, an org-level token holder in tenant Foo can probe any workspace's diagnostic state by guessing the UUID — same enumeration vector KI-005 closed for /terminal in #1609. Per-workspace bearer tokens are URL-bound by WorkspaceAuth, so the gap is org tokens within the same tenant. Fix: copy the same gate into HandleDiagnose, before the instance_id SELECT. Test: TestHandleDiagnose_KI005_RejectsCrossWorkspace stubs canCommunicateCheck=false and confirms 403 fires before the DB lookup (sqlmock's ExpectationsWereMet pins that we never reached the SELECT COALESCE). Mirrors the existing TestTerminalConnect_KI005_RejectsUnauthorizedCrossWorkspace. 2. **Race-free tunnel stderr capture (syncBuf)** strings.Builder isn't goroutine-safe. os/exec spawns a background goroutine that copies the subprocess's stderr fd to cmd.Stderr's Write, so reading the buffer's String() from the request goroutine on wait-for-port timeout while the tunnel may still be writing is a data race that `go test -race` flags. Worst-case impact in production is a garbled Detail string (not a crash), but the fix is small. Fix: wrap bytes.Buffer in a sync.Mutex (syncBuf type). Same io.Writer interface, no API changes elsewhere. 3. **Nit cleanup** - read-pubkey failure now reports as its own step name instead of a duplicated "ssh-keygen" entry — disambiguates two different failure modes that previously shared a name. - Replaced numToString hand-rolled int-to-string with strconv.Itoa in the test (no import savings reason existed). Suite: 4 diagnose tests pass with -race; full handlers suite passes in 3.95s. go vet clean. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/handlers/terminal_diagnose.go | 58 ++++++++++++++- .../handlers/terminal_diagnose_test.go | 73 +++++++++++++------ 2 files changed, 104 insertions(+), 27 deletions(-) diff --git a/workspace-server/internal/handlers/terminal_diagnose.go b/workspace-server/internal/handlers/terminal_diagnose.go index e40f6e19..b78c8955 100644 --- a/workspace-server/internal/handlers/terminal_diagnose.go +++ b/workspace-server/internal/handlers/terminal_diagnose.go @@ -1,19 +1,48 @@ package handlers import ( + "bytes" "context" "fmt" "net/http" "os" "os/exec" "strings" + "sync" "time" "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" "github.com/Molecule-AI/molecule-monorepo/platform/internal/provisioner" + "github.com/Molecule-AI/molecule-monorepo/platform/internal/wsauth" "github.com/gin-gonic/gin" ) +// syncBuf is a goroutine-safe writer that wraps bytes.Buffer with a mutex. +// Used to capture subprocess stderr without racing the os/exec stderr-copy +// goroutine: ``cmd.Stderr = io.Writer`` spawns a background goroutine that +// reads from the subprocess's stderr fd and calls Write on our writer, so +// reading the buffer from another goroutine (e.g., on wait-for-port +// timeout while the tunnel may still be writing) without synchronization +// is a data race that ``go test -race`` would flag. ``strings.Builder`` +// and bare ``bytes.Buffer`` aren't goroutine-safe; this tiny shim is the +// cheapest fix. +type syncBuf struct { + mu sync.Mutex + b bytes.Buffer +} + +func (s *syncBuf) Write(p []byte) (int, error) { + s.mu.Lock() + defer s.mu.Unlock() + return s.b.Write(p) +} + +func (s *syncBuf) String() string { + s.mu.Lock() + defer s.mu.Unlock() + return s.b.String() +} + // HandleDiagnose handles GET /workspaces/:id/terminal/diagnose. 
It runs the // same per-step pipeline as HandleConnect (ssh-keygen → EIC send-key → tunnel // → ssh) but non-interactively, captures the first failing step and its @@ -45,6 +74,29 @@ func (h *TerminalHandler) HandleDiagnose(c *gin.Context) { ctx, cancel := context.WithTimeout(c.Request.Context(), 30*time.Second) defer cancel() + // KI-005 hierarchy check — same shape as HandleConnect. Without this, + // an org-level token holder can probe any workspace in their tenant by + // guessing the UUID, learning its diagnostic state (which IAM call + // fails, what sshd says) even when they don't own it. Per-workspace + // bearer tokens are already URL-bound by WorkspaceAuth, so the gap is + // org tokens — same vector KI-005 closed for /terminal (#1609). + callerID := c.GetHeader("X-Workspace-ID") + if callerID != "" && callerID != workspaceID { + tok := wsauth.BearerTokenFromHeader(c.GetHeader("Authorization")) + if tok != "" { + if err := wsauth.ValidateToken(ctx, db.DB, callerID, tok); err != nil { + if c.GetString("org_token_id") == "" { + c.JSON(http.StatusUnauthorized, gin.H{"error": "invalid token for claimed workspace"}) + return + } + } + } + if !canCommunicateCheck(callerID, workspaceID) { + c.JSON(http.StatusForbidden, gin.H{"error": "not authorized to diagnose this workspace's terminal"}) + return + } + } + var instanceID string _ = db.DB.QueryRowContext(ctx, `SELECT COALESCE(instance_id, '') FROM workspaces WHERE id = $1`, @@ -155,8 +207,8 @@ func (h *TerminalHandler) diagnoseRemote(ctx context.Context, workspaceID, insta pubKey, err := os.ReadFile(keyPath + ".pub") if err != nil { - return stop("ssh-keygen", diagnoseStep{ - Name: "ssh-keygen", + return stop("read-pubkey", diagnoseStep{ + Name: "read-pubkey", Error: fmt.Sprintf("read pubkey: %v", err), }) } @@ -201,7 +253,7 @@ func (h *TerminalHandler) diagnoseRemote(ctx context.Context, workspaceID, insta t0 = time.Now() tunnel := openTunnelCmd(opts) tunnel.Env = os.Environ() - var tunnelStderr 
strings.Builder + var tunnelStderr syncBuf tunnel.Stderr = &tunnelStderr if err := tunnel.Start(); err != nil { return stop("open-tunnel", diagnoseStep{ diff --git a/workspace-server/internal/handlers/terminal_diagnose_test.go b/workspace-server/internal/handlers/terminal_diagnose_test.go index 5cf672fe..15b94945 100644 --- a/workspace-server/internal/handlers/terminal_diagnose_test.go +++ b/workspace-server/internal/handlers/terminal_diagnose_test.go @@ -6,6 +6,7 @@ import ( "errors" "net/http/httptest" "os/exec" + "strconv" "testing" "github.com/DATA-DOG/go-sqlmock" @@ -111,6 +112,53 @@ func TestHandleDiagnose_RoutesToLocal(t *testing.T) { } } +// TestHandleDiagnose_KI005_RejectsCrossWorkspace — the diagnostic endpoint +// has the same cross-workspace info-leak surface as /terminal had before +// #1609. Without KI-005, an org-level token holder could probe any +// workspace in their tenant by guessing the UUID, learning which IAM call +// fails or which sshd error fires. This test pins that HandleDiagnose +// applies the same hierarchy guard as HandleConnect (parity: ws-attacker +// claiming X-Workspace-ID against /workspaces/ws-victim/terminal/diagnose +// must 403, never reaching the SELECT COALESCE for instance_id). +func TestHandleDiagnose_KI005_RejectsCrossWorkspace(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + + // Stub CanCommunicate to deny. Reset after — same pattern as the + // HandleConnect KI-005 tests. + prev := canCommunicateCheck + canCommunicateCheck = func(callerID, targetID string) bool { return false } + defer func() { canCommunicateCheck = prev }() + + // Token validation: caller's bearer is bound to ws-attacker. + mock.ExpectQuery(`SELECT t\.id, t\.workspace_id\s+FROM workspace_auth_tokens t`). + WithArgs(sqlmock.AnyArg()). + WillReturnRows(sqlmock.NewRows([]string{"id", "workspace_id"}).AddRow("tok-1", "ws-attacker")) + mock.ExpectExec(`UPDATE workspace_auth_tokens SET last_used_at`). + WithArgs(sqlmock.AnyArg()). 
+ WillReturnResult(sqlmock.NewResult(0, 1)) + + h := NewTerminalHandler(nil) + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-victim"}} + c.Request = httptest.NewRequest("GET", "/workspaces/ws-victim/terminal/diagnose", nil) + c.Request.Header.Set("X-Workspace-ID", "ws-attacker") + c.Request.Header.Set("Authorization", "Bearer attacker-token") + + h.HandleDiagnose(c) + + if w.Code != 403 { + t.Errorf("cross-workspace diagnose: got %d, want 403 (%s)", w.Code, w.Body.String()) + } + // Critically: the SELECT COALESCE for instance_id must NOT have run — + // no expectation was set for it. ExpectationsWereMet ensures we + // rejected before reaching the DB lookup. + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations (rejection should fire before instance_id lookup): %v", err) + } +} + // TestDiagnoseRemote_StopsAtSSHProbe — full happy path through send-key, // pick-port, open-tunnel, wait-for-port, then stub the ssh probe to fail. // Confirms first_failure surfaces the actual ssh stderr ("Permission @@ -144,7 +192,7 @@ func TestDiagnoseRemote_StopsAtSSHProbe(t *testing.T) { // fall back to a portable busy-wait). return exec.Command("sh", "-c", `port="$1"; while true; do nc -l "$port" >/dev/null 2>&1 || true; done`, - "sh", numToString(o.LocalPort)) + "sh", strconv.Itoa(o.LocalPort)) } defer func() { openTunnelCmd = prevTun }() @@ -197,26 +245,3 @@ func TestDiagnoseRemote_StopsAtSSHProbe(t *testing.T) { } } -// numToString is a tiny helper to avoid pulling fmt into the test for one -// integer-to-string call. Same observable behavior as strconv.Itoa. 
-func numToString(n int) string { - if n == 0 { - return "0" - } - var buf [20]byte - i := len(buf) - neg := n < 0 - if neg { - n = -n - } - for n > 0 { - i-- - buf[i] = byte('0' + n%10) - n /= 10 - } - if neg { - i-- - buf[i] = '-' - } - return string(buf[i:]) -} From aacaba024c13c37e258128397d1ce5976cece911 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 30 Apr 2026 21:21:18 -0700 Subject: [PATCH 07/61] feat(wheel-smoke): exercise executor.execute() to catch lazy imports (#2275) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The existing wheel-publish smoke (`wheel_smoke.py`) only IMPORTS `molecule_runtime.main` at module scope. Lazy imports buried inside `async def execute(...)` bodies (e.g. `from a2a.types import FilePart`) NEVER evaluate at static-import time — they crash at first message delivery in production. The 2026-04-2x v0→v1 a2a-sdk migration shipped 5 such regressions in templates that all looked fine at module-load smoke. This change adds `smoke_mode.py` plus a `MOLECULE_SMOKE_MODE=1` short-circuit in `main.py`: after `adapter.create_executor(...)`, the boot path invokes `executor.execute(stub_ctx, stub_queue)` once with a 5s timeout (`MOLECULE_SMOKE_TIMEOUT_SECS`). Healthy import tree → execution proceeds far enough to hit a network boundary and times out (exit 0). Broken lazy import → `ImportError` / `ModuleNotFoundError` from inside the executor body (exit 1). Other downstream errors (auth, validation) pass — those are caught by adapter-level tests, not this gate. Stub `(RequestContext, EventQueue)` is built from the real a2a-sdk so SendMessageRequest/RequestContext constructor changes also surface as import-tree failures (the regression class also includes "SDK refactored mid-publish"). The stub-build itself is wrapped — if it raises, that's a smoke fail too. 
Phase 2 (separate PR, molecule-ci) wires this into publish-template-image.yml so the publish gate runs the boot smoke against every template image before pushing the tag. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/build_runtime_package.py | 1 + workspace/main.py | 14 +++ workspace/smoke_mode.py | 140 +++++++++++++++++++++ workspace/tests/test_smoke_mode.py | 190 +++++++++++++++++++++++++++++ 4 files changed, 345 insertions(+) create mode 100644 workspace/smoke_mode.py create mode 100644 workspace/tests/test_smoke_mode.py diff --git a/scripts/build_runtime_package.py b/scripts/build_runtime_package.py index 910ea691..e6977e52 100755 --- a/scripts/build_runtime_package.py +++ b/scripts/build_runtime_package.py @@ -78,6 +78,7 @@ TOP_LEVEL_MODULES = { "prompt", "runtime_wedge", "shared_runtime", + "smoke_mode", "transcript_auth", "watcher", } diff --git a/workspace/main.py b/workspace/main.py index 093860c2..356080f3 100644 --- a/workspace/main.py +++ b/workspace/main.py @@ -136,6 +136,20 @@ async def main(): # pragma: no cover await adapter.setup(adapter_config) executor = await adapter.create_executor(adapter_config) + # 5a. Boot-smoke short-circuit (issue #2275): if MOLECULE_SMOKE_MODE + # is set, exercise the executor's full import tree by calling + # execute() once with stub deps + a short timeout. Skips platform + # registration + uvicorn entirely. Returns process exit code. + from smoke_mode import is_smoke_mode, run_executor_smoke + if is_smoke_mode(): + exit_code = await run_executor_smoke(executor) + if hasattr(heartbeat, "stop"): + try: + await heartbeat.stop() + except Exception: # noqa: BLE001 + pass + raise SystemExit(exit_code) + # 5b. Restore from pre-stop snapshot if one exists (GH#1391). # The snapshot is scrubbed before being written, so secrets are # already redacted — restore_state must not re-expose them. 
diff --git a/workspace/smoke_mode.py b/workspace/smoke_mode.py new file mode 100644 index 00000000..773e0cbe --- /dev/null +++ b/workspace/smoke_mode.py @@ -0,0 +1,140 @@ +"""Boot smoke mode — exercises the executor's full import tree without touching real platforms. + +Why this exists (issue #2275): the existing `wheel_smoke.py` only IMPORTS +`molecule_runtime.main` at module scope. Lazy imports buried inside +`async def execute(...)` bodies (e.g. `from a2a.types import FilePart`) +NEVER evaluate at static-import time — they crash at first message +delivery in production. + +The 2026-04-2x v0→v1 a2a-sdk migration shipped 5 such regressions in +templates that all looked fine at module-load smoke. This module fills +the gap by actually invoking `executor.execute(stub_ctx, stub_queue)` +once with a short timeout. If the import-tree is healthy the call +proceeds far enough to hit a network boundary (LLM call, etc.) and +times out — that's a *pass*. If a lazy import is broken, the call +raises `ImportError` / `ModuleNotFoundError` from inside the executor +body — that's a *fail*. + +Activated by setting `MOLECULE_SMOKE_MODE=1` in the env. Wired into +`main.py` after `executor = await adapter.create_executor(...)` so the +full adapter setup path runs first; the smoke just adds one more +exercise step before exit. + +CI usage (intended for `molecule-ci/.github/workflows/publish-template-image.yml`): + docker run --rm \ + -e WORKSPACE_ID=fake -e MOLECULE_SMOKE_MODE=1 \ + "$IMAGE" molecule-runtime +""" +from __future__ import annotations + +import asyncio +import logging +import os +import sys +from typing import Any + +logger = logging.getLogger(__name__) + + +_SMOKE_TIMEOUT_SECS = float(os.environ.get("MOLECULE_SMOKE_TIMEOUT_SECS", "5.0")) + + +def is_smoke_mode() -> bool: + """True iff MOLECULE_SMOKE_MODE is set to a truthy value. + + Recognises the standard truthy strings (`1`, `true`, `yes`, + case-insensitive). 
An unset / empty / `0` env reads as False so + the boot path takes the normal branch in production. + """ + raw = os.environ.get("MOLECULE_SMOKE_MODE", "").strip().lower() + return raw in ("1", "true", "yes", "on") + + +def _build_stub_context() -> tuple[Any, Any]: + """Build a (RequestContext, EventQueue) pair stuffed with a minimal + text message ("smoke test"). The Message is enough that + `extract_message_text(context)` returns non-empty input, so the + executor takes the "real" branch (not the empty-input early-exit) + and exercises any lazy imports along that path. + + Imports happen at function scope so smoke_mode.py itself doesn't + pull a2a-sdk into every consumer of the runtime — the wheel still + boots without smoke mode active. + """ + from a2a.helpers import new_text_message + from a2a.server.agent_execution import RequestContext + from a2a.server.context import ServerCallContext + from a2a.server.events import EventQueue + from a2a.types import SendMessageRequest + + message = new_text_message("smoke test") + call_ctx = ServerCallContext() + request = SendMessageRequest(message=message) + context = RequestContext(call_ctx, request=request) + queue = EventQueue() + return context, queue + + +async def run_executor_smoke(executor: Any) -> int: + """Invoke executor.execute() once with stub deps. Return an exit code. + + Returns: + 0 — import tree healthy. Either execution timed out (the + expected outcome — we hit a network boundary like an LLM + call) or completed cleanly. Either way, no broken imports. + 1 — broken lazy import detected. Re-raised as a clear log line + so the publish gate's stderr captures the offending symbol. + + The 5-second timeout comes from `MOLECULE_SMOKE_TIMEOUT_SECS` env + (default 5.0). Bump it via env if a slow adapter setup overlaps the + first execute call. Don't make it too long — the publish workflow + multiplies this across N templates. 
+ """ + print( + f"[smoke-mode] invoking executor.execute(stub_ctx, stub_queue) " + f"with {_SMOKE_TIMEOUT_SECS:.1f}s timeout to exercise lazy imports" + ) + + try: + context, queue = _build_stub_context() + except Exception as build_err: # noqa: BLE001 + # If we can't even build the stub, the a2a-sdk import path is + # broken — that's exactly the regression class this gate exists + # for. Treat as a smoke failure. + print( + f"[smoke-mode] FAIL: stub-context build raised " + f"{type(build_err).__name__}: {build_err}", + file=sys.stderr, + ) + return 1 + + try: + await asyncio.wait_for( + executor.execute(context, queue), + timeout=_SMOKE_TIMEOUT_SECS, + ) + except (asyncio.TimeoutError, asyncio.CancelledError): + # Timeout = imports healthy, execution was proceeding and hit + # a network boundary or long await. Pass. + print("[smoke-mode] PASS: timed out past import-tree (imports healthy)") + return 0 + except (ImportError, ModuleNotFoundError) as imp_err: + # The exact regression class issue #2275 exists to catch. + print( + f"[smoke-mode] FAIL: lazy import broken in execute(): " + f"{type(imp_err).__name__}: {imp_err}", + file=sys.stderr, + ) + return 1 + except Exception as other_err: # noqa: BLE001 + # Anything else (auth errors, validation errors, runtime bugs) + # is downstream of the import gate. Pass — these are caught by + # the relevant adapter-level tests, not by this smoke. + print( + f"[smoke-mode] PASS: execute() raised " + f"{type(other_err).__name__} past import-tree (not an import error)" + ) + return 0 + else: + print("[smoke-mode] PASS: execute() completed within timeout (imports + body OK)") + return 0 diff --git a/workspace/tests/test_smoke_mode.py b/workspace/tests/test_smoke_mode.py new file mode 100644 index 00000000..10edbe30 --- /dev/null +++ b/workspace/tests/test_smoke_mode.py @@ -0,0 +1,190 @@ +"""Tests for smoke_mode — the executor-stub boot smoke (issue #2275). + +These tests exercise the helper module directly. 
The end-to-end path +(main.py invoking run_executor_smoke + sys.exit) is not unit-tested +here because main() is `# pragma: no cover` and integration-shaped; +that path is covered by the publish-template-image.yml smoke step +(which is the production gate this helper exists for). + +Note on a2a-sdk: conftest.py stubs out a2a.* modules with minimal +shims that don't include `a2a.server.context.ServerCallContext` or +`a2a.types.SendMessageRequest` (the real-SDK-only symbols +_build_stub_context needs). Tests that want to verify the +`run_executor_smoke` control flow patch _build_stub_context to +sidestep the real construction; tests that NEED the real SDK +construction skip when those symbols aren't reachable. +""" +from __future__ import annotations + +import asyncio +from unittest.mock import patch + +import pytest + +import smoke_mode + + +def _real_a2a_sdk_available() -> bool: + """True when the real a2a-sdk types needed by _build_stub_context + are importable. The conftest's a2a stubs intentionally don't + include these — they're only present in the published wheel's + runtime env or when a2a-sdk is installed alongside the test.""" + try: + from a2a.server.context import ServerCallContext # noqa: F401 + from a2a.types import SendMessageRequest # noqa: F401 + return True + except (ImportError, AttributeError): + return False + + +# ─── is_smoke_mode ───────────────────────────────────────────────────── + + +@pytest.mark.parametrize("env_value", ["1", "true", "yes", "on", "TRUE", "Yes", "ON"]) +def test_is_smoke_mode_truthy_values(env_value: str, monkeypatch: pytest.MonkeyPatch): + monkeypatch.setenv("MOLECULE_SMOKE_MODE", env_value) + assert smoke_mode.is_smoke_mode() is True + + +@pytest.mark.parametrize("env_value", ["0", "false", "no", "off", "", " "]) +def test_is_smoke_mode_falsy_values(env_value: str, monkeypatch: pytest.MonkeyPatch): + monkeypatch.setenv("MOLECULE_SMOKE_MODE", env_value) + assert smoke_mode.is_smoke_mode() is False + + +def 
test_is_smoke_mode_unset(monkeypatch: pytest.MonkeyPatch): + monkeypatch.delenv("MOLECULE_SMOKE_MODE", raising=False) + assert smoke_mode.is_smoke_mode() is False + + +# ─── _build_stub_context (real-SDK-only) ─────────────────────────────── + + +@pytest.mark.skipif( + not _real_a2a_sdk_available(), + reason="conftest stubs a2a.* without ServerCallContext / SendMessageRequest; real SDK only", +) +def test_build_stub_context_returns_request_context_with_message(): + """Stub must produce a RequestContext that has a non-empty message + payload — otherwise extract_message_text returns empty and the + executor takes the early-exit branch instead of exercising the + full import tree.""" + context, _queue = smoke_mode._build_stub_context() + assert context.message is not None + parts = context.message.parts + assert len(parts) == 1 + assert parts[0].text == "smoke test" + + +@pytest.mark.skipif( + not _real_a2a_sdk_available(), + reason="conftest stubs a2a.* without ServerCallContext / SendMessageRequest; real SDK only", +) +def test_build_stub_context_returns_event_queue(): + from a2a.server.events import EventQueue + _, queue = smoke_mode._build_stub_context() + assert isinstance(queue, EventQueue) + + +# ─── run_executor_smoke — control flow with stubbed context ──────────── +# +# These tests patch _build_stub_context to return sentinel objects, so +# they don't depend on the real a2a-sdk being present. The executor +# stubs ignore ctx + queue. 
+ + +class _RaisingExecutor: + def __init__(self, exc: Exception): + self._exc = exc + + async def execute(self, context, event_queue) -> None: # noqa: ARG002 + raise self._exc + + +class _BlockingExecutor: + """Simulates an LLM network call that the smoke timeout cuts short.""" + + async def execute(self, context, event_queue) -> None: # noqa: ARG002 + await asyncio.Event().wait() + + +class _CleanExecutor: + async def execute(self, context, event_queue) -> None: # noqa: ARG002 + return None + + +@pytest.fixture +def stub_build(): + """Replace _build_stub_context with a no-op so execute() gets + sentinel ctx/queue. Tests can override this fixture's behavior + via monkeypatch when they need a different shape.""" + sentinel_ctx = object() + sentinel_queue = object() + with patch.object( + smoke_mode, "_build_stub_context", + lambda: (sentinel_ctx, sentinel_queue), + ): + yield + + +@pytest.mark.asyncio +async def test_smoke_passes_on_timeout(stub_build, monkeypatch: pytest.MonkeyPatch): + monkeypatch.setattr(smoke_mode, "_SMOKE_TIMEOUT_SECS", 0.1) + code = await smoke_mode.run_executor_smoke(_BlockingExecutor()) + assert code == 0 + + +@pytest.mark.asyncio +async def test_smoke_passes_on_clean_return(stub_build): + code = await smoke_mode.run_executor_smoke(_CleanExecutor()) + assert code == 0 + + +@pytest.mark.asyncio +async def test_smoke_fails_on_import_error(stub_build): + """The exact regression class issue #2275 exists to catch — a lazy + import inside execute() that the static smoke missed.""" + code = await smoke_mode.run_executor_smoke( + _RaisingExecutor(ImportError("cannot import name 'FilePart' from 'a2a.types'")) + ) + assert code == 1 + + +@pytest.mark.asyncio +async def test_smoke_fails_on_module_not_found_error(stub_build): + code = await smoke_mode.run_executor_smoke( + _RaisingExecutor(ModuleNotFoundError("No module named 'temporalio'")) + ) + assert code == 1 + + +@pytest.mark.asyncio +async def 
test_smoke_passes_on_non_import_runtime_error(stub_build): + """Auth errors, validation errors, anything-not-an-import-error + pass — those are caught by adapter-level tests, not by this gate.""" + code = await smoke_mode.run_executor_smoke( + _RaisingExecutor(RuntimeError("ANTHROPIC_API_KEY missing")) + ) + assert code == 0 + + +@pytest.mark.asyncio +async def test_smoke_passes_on_value_error(stub_build): + code = await smoke_mode.run_executor_smoke( + _RaisingExecutor(ValueError("bad config")) + ) + assert code == 0 + + +@pytest.mark.asyncio +async def test_smoke_fails_when_stub_context_build_breaks(monkeypatch: pytest.MonkeyPatch): + """If a2a-sdk's own SendMessageRequest / RequestContext can't be + constructed (e.g. SDK migration broke the constructor), that's + exactly the regression class this gate exists for — fail loud.""" + + def _fail_build(): + raise ImportError("simulated: a2a.types refactored mid-publish") + + monkeypatch.setattr(smoke_mode, "_build_stub_context", _fail_build) + code = await smoke_mode.run_executor_smoke(_CleanExecutor()) + assert code == 1 From 661eec2659bc0a7e517a6d0d8132a9ce4f2e9629 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 30 Apr 2026 21:31:08 -0700 Subject: [PATCH 08/61] chore(smoke-mode): harden module-load + drop dead except clause MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two follow-ups from the #2275 Phase 1 self-review: 1. `_SMOKE_TIMEOUT_SECS = float(os.environ.get(...))` was evaluated at module load. main.py imports smoke_mode unconditionally — before the is_smoke_mode() check — so a malformed MOLECULE_SMOKE_TIMEOUT_SECS env value would SystemExit every workspace boot, not just smoke runs. Wrapped in try/except with a 5.0 fallback. Probability of a typo'd env var hitting production is low (it's a CI-only knob), but the footgun is removed entirely. Regression test reloads the module under a malformed env value. 2. 
`_real_a2a_sdk_available()` caught (ImportError, AttributeError). `from X import Y` raises ImportError when Y is missing on X — never AttributeError. Dropped the unreachable branch. No behavior change for the happy path. Co-Authored-By: Claude Opus 4.7 (1M context) --- workspace/smoke_mode.py | 8 +++++++- workspace/tests/test_smoke_mode.py | 23 ++++++++++++++++++++++- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/workspace/smoke_mode.py b/workspace/smoke_mode.py index 773e0cbe..79399946 100644 --- a/workspace/smoke_mode.py +++ b/workspace/smoke_mode.py @@ -36,7 +36,13 @@ from typing import Any logger = logging.getLogger(__name__) -_SMOKE_TIMEOUT_SECS = float(os.environ.get("MOLECULE_SMOKE_TIMEOUT_SECS", "5.0")) +# Don't crash production boot if MOLECULE_SMOKE_TIMEOUT_SECS is malformed — +# main.py imports smoke_mode unconditionally (before the is_smoke_mode() +# check), so a typo'd value would otherwise SystemExit every workspace. +try: + _SMOKE_TIMEOUT_SECS = float(os.environ.get("MOLECULE_SMOKE_TIMEOUT_SECS", "5.0")) +except ValueError: + _SMOKE_TIMEOUT_SECS = 5.0 def is_smoke_mode() -> bool: diff --git a/workspace/tests/test_smoke_mode.py b/workspace/tests/test_smoke_mode.py index 10edbe30..9721024f 100644 --- a/workspace/tests/test_smoke_mode.py +++ b/workspace/tests/test_smoke_mode.py @@ -33,7 +33,7 @@ def _real_a2a_sdk_available() -> bool: from a2a.server.context import ServerCallContext # noqa: F401 from a2a.types import SendMessageRequest # noqa: F401 return True - except (ImportError, AttributeError): + except ImportError: return False @@ -57,6 +57,27 @@ def test_is_smoke_mode_unset(monkeypatch: pytest.MonkeyPatch): assert smoke_mode.is_smoke_mode() is False +# ─── _SMOKE_TIMEOUT_SECS bad-env-var resilience ──────────────────────── + + +def test_smoke_timeout_falls_back_when_env_value_is_malformed( + monkeypatch: pytest.MonkeyPatch, +): + """A typo'd MOLECULE_SMOKE_TIMEOUT_SECS must not crash production + boot. 
main.py imports smoke_mode unconditionally — before the + is_smoke_mode() check — so float()-at-module-load would SystemExit + every workspace if the env value were bad.""" + import importlib + monkeypatch.setenv("MOLECULE_SMOKE_TIMEOUT_SECS", "not-a-float") + reloaded = importlib.reload(smoke_mode) + try: + assert reloaded._SMOKE_TIMEOUT_SECS == 5.0 + finally: + # Restore module to clean default for other tests. + monkeypatch.delenv("MOLECULE_SMOKE_TIMEOUT_SECS", raising=False) + importlib.reload(smoke_mode) + + # ─── _build_stub_context (real-SDK-only) ─────────────────────────────── From 72f0079c106e21bd339d4389ec2f487726d80e84 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 30 Apr 2026 21:55:24 -0700 Subject: [PATCH 09/61] feat(workspace-server): GET /workspaces/:id returns 410 Gone when status='removed' (#2429) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Defense-in-depth at the endpoint level. Previously, GET /workspaces/:id returned 200 OK with `status:"removed"` in the body for deleted workspaces — silent-fail UX hit on the hongmingwang tenant 2026-04-30: the channel bridge / molecule-mcp wheel had a dead workspace_id + token in .env, get_workspace_info returned 200 → caller assumed everything was fine, then every subsequent /registry/* call 401d because tokens were revoked, and operators had no idea their workspace was gone. #2425 fixed the steady-state heartbeat path (escalate to ERROR after 3 consecutive 401s). This change is the startup-time defense — fail loud when the operator first probes the workspace instead of waiting for the heartbeat to sour. The 410 body includes: {error: "workspace removed", id, removed_at, hint: "Regenerate ..."} Audit-trail consumers that need the body shape of a removed workspace (admin views, "show me deleted workspaces" tooling) opt into the legacy 200 + body via ?include_removed=true. Without this opt-in path the audit trail becomes invisible at the API layer. 
Two new tests pinned: - TestWorkspaceGet_RemovedReturns410 - TestWorkspaceGet_RemovedWithIncludeQueryReturns200 Follow-ups in separate PRs: - Update workspace/a2a_client.py get_workspace_info to surface "removed" specifically rather than collapsing into "not found" - Update channel bridge getWorkspaceInfo (server.ts) to detect 410 → log clear "workspace was deleted, re-onboard" error - Audit canvas/* + admin tooling consumers that may rely on the legacy 200 + status:"removed" shape; switch them to the ?include_removed=true opt-in if needed - Update docs (runtime-mcp.mdx Troubleshooting + external-agents.mdx lifecycle table) Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/handlers/workspace.go | 25 ++++ .../internal/handlers/workspace_test.go | 120 ++++++++++++++++++ 2 files changed, 145 insertions(+) diff --git a/workspace-server/internal/handlers/workspace.go b/workspace-server/internal/handlers/workspace.go index c4a3376f..23d67f44 100644 --- a/workspace-server/internal/handlers/workspace.go +++ b/workspace-server/internal/handlers/workspace.go @@ -14,6 +14,7 @@ import ( "os" "path/filepath" "strings" + "time" "github.com/Molecule-AI/molecule-monorepo/platform/internal/crypto" "github.com/Molecule-AI/molecule-monorepo/platform/internal/db" @@ -649,6 +650,30 @@ func (h *WorkspaceHandler) Get(c *gin.Context) { return } + // #2429: workspaces with status='removed' return 410 Gone (not 200) + // so callers fail loudly at startup instead of after 60s of revoked- + // token heartbeats. The audit-trail consumers that need the body of + // a removed workspace opt in via ?include_removed=true. + // + // Why a query param and not a header: cheap to set in curl/canvas + // fetch alike, visible in access logs, and works without coupling + // to content negotiation. 
+ if status, _ := ws["status"].(string); status == string(models.StatusRemoved) { + if c.Query("include_removed") != "true" { + var removedAt time.Time + _ = db.DB.QueryRowContext(c.Request.Context(), + `SELECT updated_at FROM workspaces WHERE id = $1`, id, + ).Scan(&removedAt) + c.JSON(http.StatusGone, gin.H{ + "error": "workspace removed", + "id": id, + "removed_at": removedAt, + "hint": "Regenerate workspace + token from the canvas → Tokens tab", + }) + return + } + } + // Strip sensitive fields — GET /workspaces/:id is on the open router. // Any caller with a valid UUID would otherwise read operational data. delete(ws, "budget_limit") diff --git a/workspace-server/internal/handlers/workspace_test.go b/workspace-server/internal/handlers/workspace_test.go index 9149b178..f1093191 100644 --- a/workspace-server/internal/handlers/workspace_test.go +++ b/workspace-server/internal/handlers/workspace_test.go @@ -9,6 +9,7 @@ import ( "os" "path/filepath" "testing" + "time" "github.com/DATA-DOG/go-sqlmock" "github.com/Molecule-AI/molecule-monorepo/platform/internal/models" @@ -97,6 +98,125 @@ func TestWorkspaceGet_NotFound(t *testing.T) { } } +// #2429: GET /workspaces/:id returns 410 Gone when status='removed'. +// Defense-in-depth at the endpoint level — without this, callers +// holding stale workspace_id + token tuples (channel bridge .env, +// captured curl scripts, etc.) get 200 + status:"removed" and have +// no idea their tokens are revoked until the heartbeat fails 60s +// later. 410 makes startup fail loud instead. 
+func TestWorkspaceGet_RemovedReturns410(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + broadcaster := newTestBroadcaster() + handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()) + + id := "cccccccc-0010-0000-0000-000000000000" + removedAt := time.Date(2026, 4, 30, 12, 0, 0, 0, time.UTC) + + columns := []string{ + "id", "name", "role", "tier", "status", "agent_card", "url", + "parent_id", "active_tasks", "max_concurrent_tasks", "last_error_rate", "last_sample_error", + "uptime_seconds", "current_task", "runtime", "workspace_dir", "x", "y", "collapsed", + "budget_limit", "monthly_spend", + } + mock.ExpectQuery("SELECT w.id, w.name"). + WithArgs(id). + WillReturnRows(sqlmock.NewRows(columns). + AddRow(id, "Old Agent", "worker", 1, string(models.StatusRemoved), []byte(`null`), + "", nil, 0, 1, 0.0, "", 0, "", "langgraph", + "", 0.0, 0.0, false, + nil, 0)) + mock.ExpectQuery(`SELECT updated_at FROM workspaces`). + WithArgs(id). + WillReturnRows(sqlmock.NewRows([]string{"updated_at"}).AddRow(removedAt)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: id}} + c.Request = httptest.NewRequest("GET", "/workspaces/"+id, nil) + + handler.Get(c) + + if w.Code != http.StatusGone { + t.Fatalf("expected 410 Gone, got %d: %s", w.Code, w.Body.String()) + } + + var resp map[string]interface{} + if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil { + t.Fatalf("failed to parse 410 body: %v", err) + } + if resp["error"] != "workspace removed" { + t.Errorf("expected error 'workspace removed', got %v", resp["error"]) + } + if resp["id"] != id { + t.Errorf("expected id %q, got %v", id, resp["id"]) + } + if _, ok := resp["removed_at"]; !ok { + t.Errorf("expected removed_at in 410 body, got: %v", resp) + } + if _, ok := resp["hint"]; !ok { + t.Errorf("expected hint in 410 body, got: %v", resp) + } + + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet 
sqlmock expectations: %v", err) + } +} + +// Audit-trail consumers (admin views, "show me deleted workspaces" +// tooling) opt into the legacy 200 + body shape via +// ?include_removed=true. Without this opt-in path the audit trail +// becomes invisible at the API layer. +func TestWorkspaceGet_RemovedWithIncludeQueryReturns200(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + broadcaster := newTestBroadcaster() + handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()) + + id := "cccccccc-0011-0000-0000-000000000000" + + columns := []string{ + "id", "name", "role", "tier", "status", "agent_card", "url", + "parent_id", "active_tasks", "max_concurrent_tasks", "last_error_rate", "last_sample_error", + "uptime_seconds", "current_task", "runtime", "workspace_dir", "x", "y", "collapsed", + "budget_limit", "monthly_spend", + } + mock.ExpectQuery("SELECT w.id, w.name"). + WithArgs(id). + WillReturnRows(sqlmock.NewRows(columns). + AddRow(id, "Audit Agent", "worker", 1, string(models.StatusRemoved), []byte(`null`), + "", nil, 0, 1, 0.0, "", 0, "", "langgraph", + "", 0.0, 0.0, false, + nil, 0)) + // last_outbound_at follow-up query (existing path) + mock.ExpectQuery(`SELECT last_outbound_at FROM workspaces`). + WithArgs(id). 
+ WillReturnRows(sqlmock.NewRows([]string{"last_outbound_at"}).AddRow(nil)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: id}} + c.Request = httptest.NewRequest("GET", "/workspaces/"+id+"?include_removed=true", nil) + + handler.Get(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200 OK with ?include_removed=true, got %d: %s", w.Code, w.Body.String()) + } + + var resp map[string]interface{} + if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil { + t.Fatalf("failed to parse response: %v", err) + } + if resp["status"] != string(models.StatusRemoved) { + t.Errorf("expected status 'removed' in body, got %v", resp["status"]) + } + + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + func TestWorkspaceGet_DBError(t *testing.T) { mock := setupTestDB(t) setupTestRedis(t) From 59902bce836f3cce810735898b2cab9e54d6cb12 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 30 Apr 2026 21:58:45 -0700 Subject: [PATCH 10/61] feat(config): add observability block schema (#119 PR-1 of 4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hermes-style declarative block grouping cadence + verbosity knobs into one place. Schema-only in this PR — wiring into heartbeat.py and main.py lands in PR-3 of the #119 stack. Two fields with live consumers waiting: - heartbeat_interval_seconds (default 30, clamped to [5, 300]) → heartbeat.py:134 currently has hard-coded HEARTBEAT_INTERVAL = 30 - log_level (default "INFO", uppercased at parse) → main.py:465 currently has hard-coded log_level="info" Clamp band [5, 300] is intentional: sub-5s flooded the platform during IR-2026-03-11; >5min lets crashed workspaces look healthy long enough to mask failure. Coerce at parse so adapters and heartbeat.py can read the value without re-validating. 
Tests pin defaults, explicit YAML override, partial override, and parametrized clamp behavior (10 cases including garbage strings + None). Part of: task #119 (adopt hermes-style architecture) Stack: PR-1 schema → PR-2 event_log → PR-3 wire consumers → PR-4 skill compat --- workspace/config.py | 61 +++++++++++++++++ workspace/tests/test_config.py | 117 +++++++++++++++++++++++++++++++++ 2 files changed, 178 insertions(+) diff --git a/workspace/config.py b/workspace/config.py index 3b205f1b..4e199c57 100644 --- a/workspace/config.py +++ b/workspace/config.py @@ -166,6 +166,43 @@ class SecurityScanConfig: operators who require a CVE gate know the gate is absent. Closes #268.""" +@dataclass +class ObservabilityConfig: + """Observability settings — heartbeat cadence and log verbosity. + + Hermes-style block: groups platform-runtime knobs that operators + typically tune together (cadence, verbosity) into one declarative + section instead of scattering them across env vars and hard-coded + constants. Adopting this shape unblocks per-workspace tuning without + a code change and pre-positions the schema for tracing/event-log + settings that will land in follow-up PRs (#119 PR-2 / PR-3). + + Today only ``heartbeat_interval_seconds`` and ``log_level`` have live + consumers; both fields are accepted but not yet wired to their final + sites in this PR (schema-only). Wiring lands in PR-3 of the series. + + Example config.yaml snippet:: + + observability: + heartbeat_interval_seconds: 60 + log_level: DEBUG + """ + + heartbeat_interval_seconds: int = 30 + """Seconds between heartbeats sent to the platform. Default 30 matches + ``workspace/heartbeat.py``'s long-standing constant. Lower values + reduce platform-side detection latency for crashed workspaces; higher + values reduce platform write load. 
Bounds: clamped to [5, 300] at + parse time — outside that range the workspace either floods the + platform or looks dead before the next beat.""" + + log_level: str = "INFO" + """Python ``logging`` level for the workspace runtime. Accepts the + standard names (DEBUG, INFO, WARNING, ERROR, CRITICAL). Today the + runtime reads ``LOG_LEVEL`` env; PR-3 of the #119 stack switches to + this field with env still honored as an override for ops debugging.""" + + @dataclass class ComplianceConfig: """OWASP Top 10 for Agentic Applications compliance settings. @@ -264,6 +301,7 @@ class WorkspaceConfig: governance: GovernanceConfig = field(default_factory=GovernanceConfig) security_scan: SecurityScanConfig = field(default_factory=SecurityScanConfig) compliance: ComplianceConfig = field(default_factory=ComplianceConfig) + observability: ObservabilityConfig = field(default_factory=ObservabilityConfig) sub_workspaces: list[dict] = field(default_factory=list) effort: str = "" """Claude output effort level for the agentic loop: low | medium | high | xhigh | max. @@ -289,6 +327,22 @@ def _derive_provider_from_model(model: str) -> str: return "" +def _clamp_heartbeat(value: object) -> int: + """Coerce raw YAML/env input into the [5, 300]-second heartbeat band. + + Outside that band the workspace either floods the platform with + sub-second beats or looks dead long before the next one — both + real failure modes seen on incidents, neither benign. Coerce here + so adapters and ``heartbeat.py`` can read the value without + re-validating. 
+ """ + try: + n = int(value) + except (TypeError, ValueError): + return 30 + return max(5, min(300, n)) + + def load_config(config_path: Optional[str] = None) -> WorkspaceConfig: """Load config from WORKSPACE_CONFIG_PATH or the given path.""" if config_path is None: @@ -336,6 +390,7 @@ def load_config(config_path: Optional[str] = None) -> WorkspaceConfig: _ss_raw = raw.get("security_scan", {}) security_scan_raw = _ss_raw if isinstance(_ss_raw, dict) else {"mode": str(_ss_raw)} compliance_raw = raw.get("compliance", {}) + observability_raw = raw.get("observability", {}) # Resolve initial_prompt: inline string or file reference initial_prompt = raw.get("initial_prompt", "") @@ -445,6 +500,12 @@ def load_config(config_path: Optional[str] = None) -> WorkspaceConfig: max_tool_calls_per_task=int(compliance_raw.get("max_tool_calls_per_task", 50)), max_task_duration_seconds=int(compliance_raw.get("max_task_duration_seconds", 300)), ), + observability=ObservabilityConfig( + heartbeat_interval_seconds=_clamp_heartbeat( + observability_raw.get("heartbeat_interval_seconds", 30) + ), + log_level=str(observability_raw.get("log_level", "INFO")).upper(), + ), sub_workspaces=raw.get("sub_workspaces", []), effort=str(raw.get("effort", "")), task_budget=int(raw.get("task_budget", 0)), diff --git a/workspace/tests/test_config.py b/workspace/tests/test_config.py index bc09d638..5c790b04 100644 --- a/workspace/tests/test_config.py +++ b/workspace/tests/test_config.py @@ -9,6 +9,7 @@ from config import ( A2AConfig, ComplianceConfig, DelegationConfig, + ObservabilityConfig, SandboxConfig, WorkspaceConfig, load_config, @@ -523,3 +524,119 @@ def test_compliance_default_via_load_config(tmp_path, yaml_payload, expected_mod # prompt_injection was never overridden in any payload — must stay at # the dataclass default regardless of the mode value. 
assert cfg.compliance.prompt_injection == "detect" + + +# ===== Observability block (#119 PR-1) ===== +# +# Hermes-style declarative block grouping cadence + verbosity knobs into one +# place. Schema-only in this PR — wiring into heartbeat.py / main.py lands in +# PR-3. These tests pin the schema so the wiring PR can rely on the parsed +# values matching the documented contract (defaults, clamping bounds, +# log-level normalization). + + +def test_observability_dataclass_default(): + """ObservabilityConfig() — no args — yields the documented defaults.""" + cfg = ObservabilityConfig() + assert cfg.heartbeat_interval_seconds == 30 + assert cfg.log_level == "INFO" + + +def test_observability_default_when_yaml_omits_block(tmp_path): + """No ``observability:`` key in YAML → dataclass defaults.""" + config_yaml = tmp_path / "config.yaml" + config_yaml.write_text(yaml.dump({})) + + cfg = load_config(str(tmp_path)) + assert cfg.observability.heartbeat_interval_seconds == 30 + assert cfg.observability.log_level == "INFO" + + +def test_observability_explicit_yaml_override(tmp_path): + """Explicit YAML values flow through load_config to ObservabilityConfig.""" + config_yaml = tmp_path / "config.yaml" + config_yaml.write_text( + yaml.dump( + { + "observability": { + "heartbeat_interval_seconds": 60, + "log_level": "DEBUG", + } + } + ) + ) + + cfg = load_config(str(tmp_path)) + assert cfg.observability.heartbeat_interval_seconds == 60 + assert cfg.observability.log_level == "DEBUG" + + +def test_observability_partial_override_keeps_other_defaults(tmp_path): + """Setting only heartbeat preserves the log_level default — and vice versa.""" + config_yaml = tmp_path / "config.yaml" + config_yaml.write_text( + yaml.dump({"observability": {"heartbeat_interval_seconds": 45}}) + ) + + cfg = load_config(str(tmp_path)) + assert cfg.observability.heartbeat_interval_seconds == 45 + assert cfg.observability.log_level == "INFO" + + +@pytest.mark.parametrize( + "raw, expected", + [ + # In-band 
values pass through unchanged. + (5, 5), + (30, 30), + (300, 300), + # Below floor → clamped up to 5s. Sub-5s heartbeats flooded the + # platform during incident IR-2026-03-11 (workspace stuck in a + # tight loop emitting beats faster than the platform could ack). + (1, 5), + (0, 5), + (-7, 5), + # Above ceiling → clamped down to 300s. >5min beats let crashed + # workspaces look healthy long enough to mask the failure. + (301, 300), + (3600, 300), + # Non-integer YAML values fall back to the documented default + # rather than crashing the workspace at boot. + ("not-a-number", 30), + (None, 30), + ], + ids=[ + "floor_in_band", + "default_in_band", + "ceiling_in_band", + "below_floor_one", + "below_floor_zero", + "below_floor_negative", + "above_ceiling_just", + "above_ceiling_far", + "garbage_string", + "null", + ], +) +def test_observability_heartbeat_clamp(tmp_path, raw, expected): + """heartbeat_interval_seconds is clamped to the [5, 300] band at parse.""" + config_yaml = tmp_path / "config.yaml" + config_yaml.write_text( + yaml.dump({"observability": {"heartbeat_interval_seconds": raw}}) + ) + + cfg = load_config(str(tmp_path)) + assert cfg.observability.heartbeat_interval_seconds == expected + + +def test_observability_log_level_uppercased(tmp_path): + """Lowercase or mixed-case log levels normalize to the canonical form + Python's ``logging`` module expects, so operators can write either + ``debug`` or ``DEBUG`` in YAML without surprise.""" + config_yaml = tmp_path / "config.yaml" + config_yaml.write_text( + yaml.dump({"observability": {"log_level": "debug"}}) + ) + + cfg = load_config(str(tmp_path)) + assert cfg.observability.log_level == "DEBUG" From 645c1862c489030a711eb011ac26a2e8a4391e0a Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 30 Apr 2026 22:08:08 -0700 Subject: [PATCH 11/61] feat(a2a-client): surface 410 Gone as 'removed' error so callers can re-onboard (#2429) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Follow-up A to PR #2449 — that PR taught the platform to return 410 Gone for status='removed' workspaces; this PR teaches get_workspace_info to consume that signal. Before: every non-200 collapsed into {"error": "not found"}, which made the 2026-04-30 incident impossible to diagnose — the operator KNEW the workspace_id existed (they'd just registered it), but the runtime kept reporting "not found" for a deleted-but-not-purged row. After: 410 produces a distinct {"error": "removed", "id", "removed_at", "hint"} dict so callers (heartbeat-loop, channel bridge, dashboard tools) can surface "your workspace was deleted, re-onboard" instead of "not found". Falls back to a default hint if the platform body isn't parseable so the actionable signal doesn't depend on body shape parity. Two new tests: - TestGetWorkspaceInfo.test_410_returns_removed_with_hint - TestGetWorkspaceInfo.test_410_with_unparseable_body_falls_back_to_default_hint Co-Authored-By: Claude Opus 4.7 (1M context) --- workspace/a2a_client.py | 30 ++++++++++++++++++++- workspace/tests/test_a2a_client.py | 42 ++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+), 1 deletion(-) diff --git a/workspace/a2a_client.py b/workspace/a2a_client.py index 83ad0c89..4a9be69e 100644 --- a/workspace/a2a_client.py +++ b/workspace/a2a_client.py @@ -340,7 +340,14 @@ async def get_peers() -> list[dict]: async def get_workspace_info() -> dict: - """Get this workspace's info from the platform.""" + """Get this workspace's info from the platform. 
+ + Distinguishes three failure shapes so callers can handle them + distinctly (#2429): + - 410 Gone → workspace was deleted; re-onboard required + - 404 / other → workspace never existed (or transient) + - exception → network / auth failure + """ async with httpx.AsyncClient(timeout=10.0) as client: try: resp = await client.get( @@ -349,6 +356,27 @@ async def get_workspace_info() -> dict: ) if resp.status_code == 200: return resp.json() + if resp.status_code == 410: + # #2429: platform returns 410 when status='removed'. + # Surface "removed" + the actionable hint so callers + # can prompt re-onboard instead of falling through to + # "not found" — which made the 2026-04-30 incident + # impossible to diagnose ("workspace not found" with + # a workspace_id we KNEW we'd just registered). + try: + body = resp.json() + except Exception: + body = {} + return { + "error": "removed", + "id": body.get("id", WORKSPACE_ID), + "removed_at": body.get("removed_at"), + "hint": body.get( + "hint", + "Workspace was deleted on the platform. " + "Regenerate workspace + token from the canvas → Tokens tab.", + ), + } return {"error": "not found"} except Exception as e: return {"error": str(e)} diff --git a/workspace/tests/test_a2a_client.py b/workspace/tests/test_a2a_client.py index 446945f9..f667ed95 100644 --- a/workspace/tests/test_a2a_client.py +++ b/workspace/tests/test_a2a_client.py @@ -819,6 +819,48 @@ class TestGetWorkspaceInfo: assert result == {"error": "not found"} + async def test_410_returns_removed_with_hint(self): + """410 Gone (#2429) → distinct error 'removed' so callers can + prompt re-onboard instead of falling through to 'not found'. 
+ Body shape passes through removed_at + the platform hint.""" + import a2a_client + + body = { + "error": "workspace removed", + "id": "ws-deleted-uuid", + "removed_at": "2026-04-30T12:00:00Z", + "hint": "Regenerate workspace + token from the canvas → Tokens tab", + } + resp = _make_response(410, body) + mock_client = _make_mock_client(get_resp=resp) + + with patch("a2a_client.httpx.AsyncClient", return_value=mock_client): + result = await a2a_client.get_workspace_info() + + assert result["error"] == "removed" + assert result["id"] == "ws-deleted-uuid" + assert result["removed_at"] == "2026-04-30T12:00:00Z" + assert "Regenerate" in result["hint"] + + async def test_410_with_unparseable_body_falls_back_to_default_hint(self): + """If the platform's 410 body isn't JSON for some reason, the + default hint still surfaces — the actionable signal must not + depend on body shape parity with the platform.""" + import a2a_client + + resp = MagicMock() + resp.status_code = 410 + resp.json = MagicMock(side_effect=ValueError("not json")) + mock_client = _make_mock_client(get_resp=resp) + + with patch("a2a_client.httpx.AsyncClient", return_value=mock_client): + result = await a2a_client.get_workspace_info() + + assert result["error"] == "removed" + assert result["id"] == a2a_client.WORKSPACE_ID + assert result["removed_at"] is None + assert "Regenerate" in result["hint"] + async def test_exception_returns_error_dict_with_message(self): """Network exception → returns {'error': ''}.""" import a2a_client From 364c70fc71b4eb065856063da9540a604be2f781 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 30 Apr 2026 22:24:59 -0700 Subject: [PATCH 12/61] fix(workspace-server): emit null removed_at when timestamp fetch fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit #2429 review finding. The 410-Gone path issues a follow-up `SELECT updated_at` after detecting status='removed'. 
If that query fails (workspace row deleted between the two queries, transient DB error, etc.), `removedAt` stays as Go's zero time and the JSON body emits `"removed_at": "0001-01-01T00:00:00Z"` — a misleading timestamp the client has to know to ignore. Now we branch on `removedAt.IsZero()` and emit `null` for the failed path. The actionable signal (the 410 + hint) is unchanged; only the timestamp shape gets cleaner. Pinned by `TestWorkspaceGet_RemovedReturns410WithNullRemovedAtOnTimestampFetchFailure`, which simulates the row vanishing via `sqlmock`'s `WillReturnError(sql.ErrNoRows)`. The original `_RemovedReturns410` test now also asserts that the happy-path timestamp is a non-null value (was just checking the key existed). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../internal/handlers/workspace.go | 24 +++++-- .../internal/handlers/workspace_test.go | 67 ++++++++++++++++++- 2 files changed, 83 insertions(+), 8 deletions(-) diff --git a/workspace-server/internal/handlers/workspace.go b/workspace-server/internal/handlers/workspace.go index 23d67f44..78181e61 100644 --- a/workspace-server/internal/handlers/workspace.go +++ b/workspace-server/internal/handlers/workspace.go @@ -660,16 +660,28 @@ func (h *WorkspaceHandler) Get(c *gin.Context) { // to content negotiation. if status, _ := ws["status"].(string); status == string(models.StatusRemoved) { if c.Query("include_removed") != "true" { + // Best-effort fetch of the removal timestamp. If the row was + // deleted (or some transient DB error fired) between the + // scanWorkspaceRow above and this follow-up SELECT, + // removedAt stays as Go's zero time. Emit `null` in that + // case rather than the misleading `0001-01-01T00:00:00Z` + // the client would otherwise see — the actionable signal + // is the 410 + hint, not the timestamp. 
var removedAt time.Time _ = db.DB.QueryRowContext(c.Request.Context(), `SELECT updated_at FROM workspaces WHERE id = $1`, id, ).Scan(&removedAt) - c.JSON(http.StatusGone, gin.H{ - "error": "workspace removed", - "id": id, - "removed_at": removedAt, - "hint": "Regenerate workspace + token from the canvas → Tokens tab", - }) + body := gin.H{ + "error": "workspace removed", + "id": id, + "hint": "Regenerate workspace + token from the canvas → Tokens tab", + } + if removedAt.IsZero() { + body["removed_at"] = nil + } else { + body["removed_at"] = removedAt + } + c.JSON(http.StatusGone, body) return } } diff --git a/workspace-server/internal/handlers/workspace_test.go b/workspace-server/internal/handlers/workspace_test.go index f1093191..4e17ca6a 100644 --- a/workspace-server/internal/handlers/workspace_test.go +++ b/workspace-server/internal/handlers/workspace_test.go @@ -151,8 +151,8 @@ func TestWorkspaceGet_RemovedReturns410(t *testing.T) { if resp["id"] != id { t.Errorf("expected id %q, got %v", id, resp["id"]) } - if _, ok := resp["removed_at"]; !ok { - t.Errorf("expected removed_at in 410 body, got: %v", resp) + if v, ok := resp["removed_at"]; !ok || v == nil { + t.Errorf("expected removed_at to be a real timestamp on the happy path, got: %v", v) } if _, ok := resp["hint"]; !ok { t.Errorf("expected hint in 410 body, got: %v", resp) @@ -163,6 +163,69 @@ func TestWorkspaceGet_RemovedReturns410(t *testing.T) { } } +// If the follow-up `SELECT updated_at` query fails (workspace row +// disappeared in the gap, transient DB error, etc.), removedAt stays +// as Go's zero time. We emit JSON `null` for that case rather than +// the misleading `"0001-01-01T00:00:00Z"` the client would otherwise +// see — the actionable signal is the 410 + hint, not the timestamp. 
+func TestWorkspaceGet_RemovedReturns410WithNullRemovedAtOnTimestampFetchFailure(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + broadcaster := newTestBroadcaster() + handler := NewWorkspaceHandler(broadcaster, nil, "http://localhost:8080", t.TempDir()) + + id := "cccccccc-0012-0000-0000-000000000000" + + columns := []string{ + "id", "name", "role", "tier", "status", "agent_card", "url", + "parent_id", "active_tasks", "max_concurrent_tasks", "last_error_rate", "last_sample_error", + "uptime_seconds", "current_task", "runtime", "workspace_dir", "x", "y", "collapsed", + "budget_limit", "monthly_spend", + } + mock.ExpectQuery("SELECT w.id, w.name"). + WithArgs(id). + WillReturnRows(sqlmock.NewRows(columns). + AddRow(id, "Vanished", "worker", 1, string(models.StatusRemoved), []byte(`null`), + "", nil, 0, 1, 0.0, "", 0, "", "langgraph", + "", 0.0, 0.0, false, + nil, 0)) + // Simulate the row vanishing between the two queries. + mock.ExpectQuery(`SELECT updated_at FROM workspaces`). + WithArgs(id). + WillReturnError(sql.ErrNoRows) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: id}} + c.Request = httptest.NewRequest("GET", "/workspaces/"+id, nil) + + handler.Get(c) + + if w.Code != http.StatusGone { + t.Fatalf("expected 410 Gone, got %d: %s", w.Code, w.Body.String()) + } + + var resp map[string]interface{} + if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil { + t.Fatalf("failed to parse 410 body: %v", err) + } + if resp["removed_at"] != nil { + t.Errorf( + "expected removed_at == null when timestamp fetch fails; got %v (type %T). "+ + "Misleading 0001-01-01 timestamps in the JSON would confuse clients.", + resp["removed_at"], resp["removed_at"], + ) + } + // Other fields must still be present. 
+ if resp["error"] != "workspace removed" || resp["id"] != id || resp["hint"] == nil { + t.Errorf("expected error/id/hint to survive the timestamp fetch failure; got %v", resp) + } + + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + // Audit-trail consumers (admin views, "show me deleted workspaces" // tooling) opt into the legacy 200 + body shape via // ?include_removed=true. Without this opt-in path the audit trail From 258c6bea44910b75226295fd3b0a96de7e423457 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Thu, 30 Apr 2026 22:25:15 -0700 Subject: [PATCH 13/61] feat(workspace-server): PUT /provider endpoint for explicit LLM provider (#196) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirror of PUT /model. Stores the provider slug as the LLM_PROVIDER workspace secret so the canvas can update model + provider independently — a user might keep the same model alias and switch providers (route through a different gateway), or vice versa. Forcing both into one endpoint imposes a single Save+Restart per change; two endpoints let canvas update each as the user picks. Plumbs through the existing chain: secret-load → envVars → CP req.Env → user-data env exports → /configs/config.yaml (after controlplane PR #364 lands the heredoc append). Tests: 5 new cases mirroring SetModel/GetModel exactly — default empty response, DB error, upsert with restart trigger, empty-clears, invalid-UUID rejection. 
Part of: Option B PR-2 (#196) — workspace-server plumbs LLM_PROVIDER Stack: PR-1 schema (#2441 merged) PR-2 (this) ws-server endpoint PR-3 (#364 open) CP user-data persistence PR-4 (pending) hermes adapter consume PR-5 (pending) canvas Provider dropdown --- workspace-server/internal/handlers/secrets.go | 106 +++++++++++++ .../internal/handlers/secrets_test.go | 146 ++++++++++++++++++ workspace-server/internal/router/router.go | 2 + 3 files changed, 254 insertions(+) diff --git a/workspace-server/internal/handlers/secrets.go b/workspace-server/internal/handlers/secrets.go index 3766068d..4d88be38 100644 --- a/workspace-server/internal/handlers/secrets.go +++ b/workspace-server/internal/handlers/secrets.go @@ -533,3 +533,109 @@ func (h *SecretsHandler) SetModel(c *gin.Context) { } c.JSON(http.StatusOK, gin.H{"status": "saved", "model": body.Model}) } + +// GetProvider handles GET /workspaces/:id/provider +// Returns the explicit LLM provider override stored as the LLM_PROVIDER +// workspace secret. Mirror of GetModel — same shape, same response keys +// (provider/source) to keep canvas wiring symmetric. +// +// Why a sibling endpoint rather than overloading PUT /model: the new +// `provider` field (Option B, PR #2441) is orthogonal to the model +// slug. A user might keep the same model alias and switch providers +// (e.g., route the same alias through a different gateway), or keep +// the same provider and switch models. Co-storing them under one +// endpoint forces a single Save+Restart round-trip per change; two +// endpoints let the canvas update each independently. 
+func (h *SecretsHandler) GetProvider(c *gin.Context) { + workspaceID := c.Param("id") + ctx := c.Request.Context() + + var bytesVal []byte + var version int + err := db.DB.QueryRowContext(ctx, + `SELECT encrypted_value, encryption_version FROM workspace_secrets WHERE workspace_id = $1 AND key = 'LLM_PROVIDER'`, + workspaceID).Scan(&bytesVal, &version) + if err == sql.ErrNoRows { + c.JSON(http.StatusOK, gin.H{"provider": "", "source": "default"}) + return + } + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": "query failed"}) + return + } + + decrypted, err := crypto.DecryptVersioned(bytesVal, version) + if err != nil { + c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to decrypt"}) + return + } + + c.JSON(http.StatusOK, gin.H{"provider": string(decrypted), "source": "workspace_secrets"}) +} + +// SetProvider handles PUT /workspaces/:id/provider — writes the provider +// slug into workspace_secrets as LLM_PROVIDER. Empty string clears the +// override. Triggers auto-restart so the new env is in effect on the +// next boot — without this the canvas Save+Restart can race the +// already-restarting container and miss the window. +// +// CP user-data (controlplane PR #364) reads LLM_PROVIDER from env and +// writes it into /configs/config.yaml at boot, so the choice survives +// restart. Without that PR this endpoint still works but the value is +// only sticky when the workspace_secrets row is read on every restart +// (the secret-load path) — slower failure mode, same eventual behavior. 
+func (h *SecretsHandler) SetProvider(c *gin.Context) { + workspaceID := c.Param("id") + if !uuidRegex.MatchString(workspaceID) { + c.JSON(http.StatusBadRequest, gin.H{"error": "invalid workspace ID"}) + return + } + ctx := c.Request.Context() + + var body struct { + Provider string `json:"provider"` + } + if err := c.ShouldBindJSON(&body); err != nil { + c.JSON(http.StatusBadRequest, gin.H{"error": "invalid request body"}) + return + } + + if body.Provider == "" { + if _, err := db.DB.ExecContext(ctx, + `DELETE FROM workspace_secrets WHERE workspace_id = $1 AND key = 'LLM_PROVIDER'`, + workspaceID); err != nil { + log.Printf("SetProvider delete error: %v", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to clear provider"}) + return + } + if h.restartFunc != nil { + go h.restartFunc(workspaceID) + } + c.JSON(http.StatusOK, gin.H{"status": "cleared"}) + return + } + + encrypted, err := crypto.Encrypt([]byte(body.Provider)) + if err != nil { + log.Printf("SetProvider encrypt error: %v", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to encrypt provider"}) + return + } + version := crypto.CurrentEncryptionVersion() + _, err = db.DB.ExecContext(ctx, ` + INSERT INTO workspace_secrets (workspace_id, key, encrypted_value, encryption_version) + VALUES ($1, 'LLM_PROVIDER', $2, $3) + ON CONFLICT (workspace_id, key) DO UPDATE + SET encrypted_value = $2, encryption_version = $3, updated_at = now() + `, workspaceID, encrypted, version) + if err != nil { + log.Printf("SetProvider upsert error: %v", err) + c.JSON(http.StatusInternalServerError, gin.H{"error": "failed to save provider"}) + return + } + + if h.restartFunc != nil { + go h.restartFunc(workspaceID) + } + c.JSON(http.StatusOK, gin.H{"status": "saved", "provider": body.Provider}) +} diff --git a/workspace-server/internal/handlers/secrets_test.go b/workspace-server/internal/handlers/secrets_test.go index 78e66a16..648f4e19 100644 --- 
a/workspace-server/internal/handlers/secrets_test.go +++ b/workspace-server/internal/handlers/secrets_test.go @@ -618,6 +618,152 @@ func TestSecretsSetModel_InvalidID(t *testing.T) { } } +// ==================== GetProvider / SetProvider (Option B PR-2) ==================== +// +// Mirror of the GetModel/SetModel suite. Same secret-storage shape (key= +// 'LLM_PROVIDER' instead of 'MODEL_PROVIDER'), same restart-trigger +// contract, same UUID validation gate. We pin the contract symmetrically +// so a future refactor that breaks one without the other shows up in CI. + +func TestSecretsGetProvider_Default(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + handler := NewSecretsHandler(nil) + + mock.ExpectQuery("SELECT encrypted_value, encryption_version FROM workspace_secrets"). + WithArgs("ws-prov"). + WillReturnError(sql.ErrNoRows) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-prov"}} + c.Request = httptest.NewRequest("GET", "/workspaces/ws-prov/provider", nil) + + handler.GetProvider(c) + + if w.Code != http.StatusOK { + t.Errorf("expected status 200, got %d: %s", w.Code, w.Body.String()) + } + + var resp map[string]interface{} + if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil { + t.Fatalf("failed to parse response: %v", err) + } + if resp["provider"] != "" { + t.Errorf("expected empty provider, got %v", resp["provider"]) + } + if resp["source"] != "default" { + t.Errorf("expected source 'default', got %v", resp["source"]) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + +func TestSecretsGetProvider_DBError(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + handler := NewSecretsHandler(nil) + + mock.ExpectQuery("SELECT encrypted_value, encryption_version FROM workspace_secrets"). + WithArgs("ws-prov-err"). 
+ WillReturnError(sql.ErrConnDone) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "ws-prov-err"}} + c.Request = httptest.NewRequest("GET", "/workspaces/ws-prov-err/provider", nil) + + handler.GetProvider(c) + + if w.Code != http.StatusInternalServerError { + t.Errorf("expected status 500, got %d: %s", w.Code, w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + +func TestSecretsSetProvider_Upsert(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + restartCalled := make(chan string, 1) + handler := NewSecretsHandler(func(id string) { restartCalled <- id }) + + mock.ExpectExec(`INSERT INTO workspace_secrets`). + WithArgs("00000000-0000-0000-0000-000000000003", sqlmock.AnyArg(), sqlmock.AnyArg()). + WillReturnResult(sqlmock.NewResult(1, 1)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "00000000-0000-0000-0000-000000000003"}} + c.Request = httptest.NewRequest("PUT", "/workspaces/00000000-0000-0000-0000-000000000003/provider", + strings.NewReader(`{"provider":"minimax"}`)) + c.Request.Header.Set("Content-Type", "application/json") + + handler.SetProvider(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String()) + } + select { + case id := <-restartCalled: + if id != "00000000-0000-0000-0000-000000000003" { + t.Errorf("restart called with wrong id: %s", id) + } + case <-time.After(500 * time.Millisecond): + t.Error("restart was not triggered") + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + +func TestSecretsSetProvider_EmptyClears(t *testing.T) { + mock := setupTestDB(t) + setupTestRedis(t) + handler := NewSecretsHandler(func(string) {}) + + mock.ExpectExec(`DELETE FROM workspace_secrets`). + WithArgs("00000000-0000-0000-0000-000000000004"). 
+ WillReturnResult(sqlmock.NewResult(0, 1)) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "00000000-0000-0000-0000-000000000004"}} + c.Request = httptest.NewRequest("PUT", "/workspaces/00000000-0000-0000-0000-000000000004/provider", + strings.NewReader(`{"provider":""}`)) + c.Request.Header.Set("Content-Type", "application/json") + + handler.SetProvider(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String()) + } + if err := mock.ExpectationsWereMet(); err != nil { + t.Errorf("unmet sqlmock expectations: %v", err) + } +} + +func TestSecretsSetProvider_InvalidID(t *testing.T) { + setupTestDB(t) + setupTestRedis(t) + handler := NewSecretsHandler(nil) + + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Params = gin.Params{{Key: "id", Value: "not-a-uuid"}} + c.Request = httptest.NewRequest("PUT", "/workspaces/not-a-uuid/provider", + strings.NewReader(`{"provider":"x"}`)) + c.Request.Header.Set("Content-Type", "application/json") + + handler.SetProvider(c) + + if w.Code != http.StatusBadRequest { + t.Errorf("expected 400 for bad UUID, got %d", w.Code) + } +} + // ==================== Values — Phase 30.2 decrypted pull ==================== // These tests target the secrets.Values handler (GET /workspaces/:id/secrets/values) diff --git a/workspace-server/internal/router/router.go b/workspace-server/internal/router/router.go index 5373ed0f..0a5459fc 100644 --- a/workspace-server/internal/router/router.go +++ b/workspace-server/internal/router/router.go @@ -329,6 +329,8 @@ func Setup(hub *ws.Hub, broadcaster *events.Broadcaster, prov *provisioner.Provi wsAuth.DELETE("/secrets/:key", sech.Delete) wsAuth.GET("/model", sech.GetModel) wsAuth.PUT("/model", sech.SetModel) + wsAuth.GET("/provider", sech.GetProvider) + wsAuth.PUT("/provider", sech.SetProvider) // Token usage metrics — cost transparency (#593). 
// WorkspaceAuth middleware (on wsAuth) binds the bearer to :id. From 517bd0efc562e6789c8e4901556cbc2ed9b89583 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Fri, 1 May 2026 11:19:17 -0700 Subject: [PATCH 14/61] feat(canvas+workspace-server): data-driven Provider dropdown (#199) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Option B PR-5. Canvas Config tab now exposes a Provider override input that's adapter-driven from each runtime's template — no hardcoded provider list in the canvas. PUT /workspaces/:id/provider on Save when dirty; auto-restart suppression to avoid double-restart with the model handler's own restart. The dropdown's suggestion list comes from /templates → runtime_config.providers (the field added in molecule-ai-workspace-template-hermes PR #31). For templates that haven't migrated to the explicit providers list yet, suggestions derive from model[].id slug prefixes — still adapter-driven, just inferred. This keeps existing templates working while platform team migrates them one at a time. 
workspace-server changes: - Add Providers []string field to templateSummary JSON - Parse runtime_config.providers in /templates handler - 2 new tests pin the surfacing + omitempty behavior canvas changes: - Remove hardcoded PROVIDER_SUGGESTIONS constant - Add provider/originalProvider state + PUT-on-save logic - Add deriveProvidersFromModels() fallback helper - Wire RuntimeOption.providers from /templates response - 8 new tests pin the behavior end-to-end Co-Authored-By: Claude Opus 4.7 (1M context) --- canvas/src/components/tabs/ConfigTab.tsx | 192 ++++++++-- .../__tests__/ConfigTab.provider.test.tsx | 332 ++++++++++++++++++ .../internal/handlers/templates.go | 12 + .../internal/handlers/templates_test.go | 111 ++++++ 4 files changed, 627 insertions(+), 20 deletions(-) create mode 100644 canvas/src/components/tabs/__tests__/ConfigTab.provider.test.tsx diff --git a/canvas/src/components/tabs/ConfigTab.tsx b/canvas/src/components/tabs/ConfigTab.tsx index e1227d67..f46ff538 100644 --- a/canvas/src/components/tabs/ConfigTab.tsx +++ b/canvas/src/components/tabs/ConfigTab.tsx @@ -100,6 +100,42 @@ interface RuntimeOption { value: string; label: string; models: ModelSpec[]; + // providers is the declarative provider list each template ships in + // its config.yaml under runtime_config.providers. The /templates API + // surfaces it (workspace-server templates.go) so canvas stays + // adapter-driven: hermes ships ~20 slugs, claude-code ships + // ["anthropic"], gemini-cli ships ["gemini"], etc. Empty list → + // canvas falls back to deriving unique vendor prefixes from + // models[].id (still adapter-driven, just inferred). + providers: string[]; +} + +// deriveProvidersFromModels — when a template doesn't ship an explicit +// providers list, infer suggestions from the vendor prefixes of its +// model slugs. e.g. ["anthropic:claude-opus-4-7", "openai:gpt-4o", +// "anthropic:claude-sonnet-4-5"] → ["anthropic", "openai"]. 
+// +// This keeps the dropdown adapter-driven for older templates that +// haven't migrated to the explicit `providers:` field yet, AND +// continues to be a useful fallback for any future runtime whose +// derive-provider semantics happen to match the slug prefix. +function deriveProvidersFromModels(models: ModelSpec[]): string[] { + const seen = new Set(); + const out: string[] = []; + for (const m of models) { + if (!m.id) continue; + // Both ":" (anthropic:claude-opus-4-7) and "/" (nousresearch/hermes-4-70b) + // are valid vendor separators in our slug taxonomy. Take whichever + // appears first and split there. + const sep = m.id.match(/[:/]/)?.index ?? -1; + if (sep <= 0) continue; + const vendor = m.id.slice(0, sep); + if (!seen.has(vendor)) { + seen.add(vendor); + out.push(vendor); + } + } + return out; } // Fallback used when /templates can't be fetched (offline, older backend). @@ -118,14 +154,14 @@ interface RuntimeOption { const RUNTIMES_WITH_OWN_CONFIG = new Set(["external"]); const FALLBACK_RUNTIME_OPTIONS: RuntimeOption[] = [ - { value: "", label: "LangGraph (default)", models: [] }, - { value: "claude-code", label: "Claude Code", models: [] }, - { value: "crewai", label: "CrewAI", models: [] }, - { value: "autogen", label: "AutoGen", models: [] }, - { value: "deepagents", label: "DeepAgents", models: [] }, - { value: "openclaw", label: "OpenClaw", models: [] }, - { value: "hermes", label: "Hermes", models: [] }, - { value: "gemini-cli", label: "Gemini CLI", models: [] }, + { value: "", label: "LangGraph (default)", models: [], providers: [] }, + { value: "claude-code", label: "Claude Code", models: [], providers: [] }, + { value: "crewai", label: "CrewAI", models: [], providers: [] }, + { value: "autogen", label: "AutoGen", models: [], providers: [] }, + { value: "deepagents", label: "DeepAgents", models: [], providers: [] }, + { value: "openclaw", label: "OpenClaw", models: [], providers: [] }, + { value: "hermes", label: "Hermes", models: [], 
providers: [] }, + { value: "gemini-cli", label: "Gemini CLI", models: [], providers: [] }, ]; export function ConfigTab({ workspaceId }: Props) { @@ -138,6 +174,17 @@ export function ConfigTab({ workspaceId }: Props) { const [rawMode, setRawMode] = useState(false); const [rawDraft, setRawDraft] = useState(""); const [runtimeOptions, setRuntimeOptions] = useState(FALLBACK_RUNTIME_OPTIONS); + // Provider override (Option B PR-5): stored separately from config.yaml + // because the value lives in workspace_secrets (encrypted), not in the + // platform-managed config.yaml. The two endpoints are GET/PUT + // /workspaces/:id/provider on workspace-server (handlers/secrets.go). + // Empty = "auto-derive from model slug prefix" — pre-Option-B behavior + // and what most users want. Setting to a non-empty value writes + // LLM_PROVIDER into workspace_secrets and triggers an auto-restart so + // the workspace boots with the new provider in env (and via CP user- + // data, written into /configs/config.yaml on next provision too). + const [provider, setProvider] = useState(""); + const [originalProvider, setOriginalProvider] = useState(""); const successTimerRef = useRef>(undefined); useEffect(() => { @@ -168,6 +215,22 @@ export function ConfigTab({ workspaceId }: Props) { wsMetadataModel = (m.model || "").trim(); } catch { /* non-fatal */ } + // Load explicit provider override (Option B PR-5). Endpoint returns + // {provider: "", source: "default"} when no override is set, so the + // empty string is the legitimate "auto-derive" signal — don't treat + // it as a load error. Non-fatal: an older workspace-server that + // predates PR-2 returns 404 here; the form falls back to "" and + // Save just won't PUT the provider field. 
+ try { + const p = await api.get<{ provider?: string }>(`/workspaces/${workspaceId}/provider`); + const loadedProvider = (p.provider || "").trim(); + setProvider(loadedProvider); + setOriginalProvider(loadedProvider); + } catch { + setProvider(""); + setOriginalProvider(""); + } + try { const res = await api.get<{ content: string }>(`/workspaces/${workspaceId}/files/config.yaml`); const parsed = parseYaml(res.content); @@ -209,11 +272,11 @@ export function ConfigTab({ workspaceId }: Props) { useEffect(() => { let cancelled = false; - api.get>("/templates") + api.get>("/templates") .then((rows) => { if (cancelled || !Array.isArray(rows)) return; const byRuntime = new Map(); - byRuntime.set("", { value: "", label: "LangGraph (default)", models: [] }); + byRuntime.set("", { value: "", label: "LangGraph (default)", models: [], providers: [] }); for (const r of rows) { const v = (r.runtime || "").trim(); if (!v || v === "langgraph") continue; @@ -221,8 +284,9 @@ export function ConfigTab({ workspaceId }: Props) { // one with the richer models list is probably newer. const existing = byRuntime.get(v); const models = Array.isArray(r.models) ? r.models : []; + const providers = Array.isArray(r.providers) ? r.providers : []; if (!existing || models.length > existing.models.length) { - byRuntime.set(v, { value: v, label: r.name || v, models }); + byRuntime.set(v, { value: v, label: r.name || v, models, providers }); } } if (byRuntime.size > 1) setRuntimeOptions(Array.from(byRuntime.values())); @@ -234,6 +298,16 @@ export function ConfigTab({ workspaceId }: Props) { // Models + env hints for the currently-selected runtime. const selectedRuntime = runtimeOptions.find((o) => o.value === (config.runtime || "")) ?? null; const availableModels: ModelSpec[] = selectedRuntime?.models ?? 
[]; + // Provider suggestions: prefer the runtime's declarative providers + // list (sourced from its template config.yaml runtime_config.providers + // and surfaced via /templates), fall back to deriving from model slug + // prefixes when the template hasn't migrated to the explicit field + // yet. Either way the data flows from the adapter — no hardcoded + // canvas-side enum. + const providerSuggestions: string[] = + (selectedRuntime?.providers && selectedRuntime.providers.length > 0) + ? selectedRuntime.providers + : deriveProvidersFromModels(availableModels); const currentModelId = config.runtime_config?.model || config.model || ""; const currentModelSpec = availableModels.find((m) => m.id === currentModelId) ?? null; @@ -334,6 +408,24 @@ export function ConfigTab({ workspaceId }: Props) { } } + // Provider override save (Option B PR-5). PUT only when the user + // changed the dropdown — otherwise an unrelated Save (e.g. tier + // edit) would re-write the provider unchanged and the server- + // side auto-restart would fire on every Save, costing the user a + // ~30s reboot for a no-op change. Server endpoint accepts an + // empty string to clear the override (deletes the + // workspace_secrets row); we forward whatever the form holds. + let providerSaveError: string | null = null; + const providerChanged = provider !== originalProvider; + if (providerChanged) { + try { + await api.put(`/workspaces/${workspaceId}/provider`, { provider }); + setOriginalProvider(provider); + } catch (e) { + providerSaveError = e instanceof Error ? 
e.message : "Provider update was rejected"; + } + } + setOriginalYaml(content); if (rawMode) { const parsed = parseYaml(content); @@ -341,16 +433,30 @@ export function ConfigTab({ workspaceId }: Props) { } else { setRawDraft(content); } - if (restart) { + // SetProvider on the server already triggers an auto-restart for + // the workspace whenever the value actually changed (see + // workspace-server/internal/handlers/secrets.go:SetProvider). If + // the user also clicked Save+Restart we'd kick off a SECOND + // restart here and the two would race in the canvas store — + // suppress the redundant call and rely on the server-side one. + const providerWillAutoRestart = providerChanged && !providerSaveError; + if (restart && !providerWillAutoRestart) { await useCanvasStore.getState().restartWorkspace(workspaceId); - } else { - useCanvasStore.getState().updateNodeData(workspaceId, { needsRestart: true }); + } else if (!restart) { + useCanvasStore.getState().updateNodeData(workspaceId, { needsRestart: !providerWillAutoRestart }); } - if (modelSaveError) { - // Partial-save UX: surface the model rejection instead of - // showing "Saved" — the user would otherwise watch the model - // field revert on next reload with no explanation. - setError(`Other fields saved, but model update failed: ${modelSaveError}`); + // Aggregate partial-save errors. Both modelSaveError and + // providerSaveError describe rejected updates from independent + // endpoints — show whichever fired so the user knows which + // field reverts on next reload (otherwise they'd see "Saved" and + // be confused why Provider snapped back). + const partialError = providerSaveError + ? `Other fields saved, but provider update failed: ${providerSaveError}` + : modelSaveError + ? 
`Other fields saved, but model update failed: ${modelSaveError}` + : null; + if (partialError) { + setError(partialError); } else { setSuccess(true); clearTimeout(successTimerRef.current); @@ -371,7 +477,8 @@ export function ConfigTab({ workspaceId }: Props) { const taskBudgetId = useId(); const sandboxBackendId = useId(); - const isDirty = rawMode ? rawDraft !== originalYaml : toYaml(config) !== originalYaml; + const providerDirty = provider !== originalProvider; + const isDirty = (rawMode ? rawDraft !== originalYaml : toYaml(config) !== originalYaml) || providerDirty; if (loading) { return
Loading config...
; @@ -518,6 +625,51 @@ export function ConfigTab({ workspaceId }: Props) { )} + {/* Provider override (Option B PR-5). Free-text combobox so + operators can use any of the 30+ slugs hermes-agent's + derive-provider.sh recognizes — the suggestion list is + a hint, not a constraint. Empty = "auto-derive from + model slug prefix" which is correct for the common case + (model "anthropic:claude-opus-4-7" → provider derived + as "anthropic"). The override is needed when the model + alias has no clean vendor prefix (e.g. hermes default + "nousresearch/hermes-4-70b" → derive returns empty → + hermes errors "No LLM provider configured"). */} +
+ + 0 ? `${runtimeId}-providers` : undefined} + value={provider} + onChange={(e) => setProvider(e.target.value.trim())} + placeholder={ + providerSuggestions.length > 0 + ? `e.g. ${providerSuggestions.slice(0, 3).join(", ")} (empty = auto-derive)` + : "empty = auto-derive from model slug" + } + aria-label="LLM provider override" + data-testid="provider-input" + className="w-full bg-zinc-800 border border-zinc-700 rounded px-2 py-1 text-xs text-zinc-200 font-mono focus:outline-none focus:border-blue-500" + /> + {providerSuggestions.length > 0 && ( + + {providerSuggestions.map((p) => ( + + )} + {provider && provider !== originalProvider && ( +

+ Provider change → workspace will auto-restart on Save. +

+ )} +
({ + api: { + get: (path: string) => apiGet(path), + patch: (path: string, body: unknown) => apiPatch(path, body), + put: (path: string, body: unknown) => apiPut(path, body), + post: vi.fn(), + del: vi.fn(), + }, +})); + +vi.mock("@/store/canvas", () => ({ + useCanvasStore: Object.assign( + (selector: (s: unknown) => unknown) => selector({ restartWorkspace: vi.fn(), updateNodeData: vi.fn() }), + { getState: () => ({ restartWorkspace: vi.fn(), updateNodeData: vi.fn() }) }, + ), +})); + +vi.mock("../AgentCardSection", () => ({ + AgentCardSection: () =>
, +})); + +import { ConfigTab } from "../ConfigTab"; + +// wireApi — same shape as ConfigTab.hermes.test.tsx, extended with the +// /provider endpoint. Each test sets `providerValue` to the value the +// GET endpoint returns; "missing" means the endpoint rejects (older +// workspace-server pre-PR-2 — must not crash the tab). +function wireApi(opts: { + workspaceRuntime?: string; + workspaceModel?: string; + configYamlContent?: string | null; + templates?: Array<{ id: string; name?: string; runtime?: string; models?: unknown[]; providers?: string[] }>; + providerValue?: string | "missing"; +}) { + apiGet.mockImplementation((path: string) => { + if (path === `/workspaces/ws-test`) { + return Promise.resolve({ runtime: opts.workspaceRuntime ?? "" }); + } + if (path === `/workspaces/ws-test/model`) { + return Promise.resolve({ model: opts.workspaceModel ?? "" }); + } + if (path === `/workspaces/ws-test/provider`) { + if (opts.providerValue === "missing") { + return Promise.reject(new Error("404")); + } + return Promise.resolve({ provider: opts.providerValue ?? "", source: opts.providerValue ? "workspace_secrets" : "default" }); + } + if (path === `/workspaces/ws-test/files/config.yaml`) { + if (opts.configYamlContent === null) return Promise.reject(new Error("not found")); + return Promise.resolve({ content: opts.configYamlContent ?? "" }); + } + if (path === "/templates") { + return Promise.resolve(opts.templates ?? []); + } + return Promise.reject(new Error(`unmocked api.get: ${path}`)); + }); +} + +beforeEach(() => { + apiGet.mockReset(); + apiPatch.mockReset(); + apiPut.mockReset(); +}); + +describe("ConfigTab — Provider override (Option B PR-5)", () => { + // Empty provider on load is the legitimate default ("auto-derive + // from model slug prefix"), NOT an error. 
The endpoint returning + // {provider: "", source: "default"} is the documented happy-path + // shape — if the form treated that as "load failed" we'd lose the + // ability to render the input at all on fresh workspaces. + it("renders an empty Provider input when no override is set", async () => { + wireApi({ + workspaceRuntime: "hermes", + workspaceModel: "nousresearch/hermes-4-70b", + configYamlContent: "name: ws\nruntime: hermes\n", + providerValue: "", + }); + + render(); + const input = await screen.findByTestId("provider-input"); + expect((input as HTMLInputElement).value).toBe(""); + }); + + // Pre-existing override loads back into the field on mount. Without + // this, an operator who set provider=openrouter yesterday would see + // the field blank today, conclude the value didn't stick, and + // re-save — the resulting PUT-with-same-value would auto-restart + // the workspace for nothing. + it("loads an existing provider override from the server", async () => { + wireApi({ + workspaceRuntime: "hermes", + workspaceModel: "nousresearch/hermes-4-70b", + configYamlContent: "name: ws\nruntime: hermes\n", + providerValue: "openrouter", + }); + + render(); + const input = await screen.findByTestId("provider-input"); + await waitFor(() => expect((input as HTMLInputElement).value).toBe("openrouter")); + }); + + // Old workspace-server (pre-PR-2) returns a 404 on /provider. The + // tab must keep loading — the fallback is "" (auto-derive), same as + // a fresh workspace. + it("falls back to empty provider when the endpoint is missing", async () => { + wireApi({ + workspaceRuntime: "hermes", + workspaceModel: "nousresearch/hermes-4-70b", + configYamlContent: "name: ws\nruntime: hermes\n", + providerValue: "missing", + }); + + render(); + const input = await screen.findByTestId("provider-input"); + expect((input as HTMLInputElement).value).toBe(""); + // Tab should be fully rendered, not stuck in loading or error state. 
+ expect(screen.queryByText(/Loading config/i)).toBeNull(); + }); + + // Setting a value + Save must PUT to the right endpoint with the + // right body shape. Server-side handler (workspace-server + // handlers/secrets.go:SetProvider) reads body.provider — any other + // key gets silently ignored and the workspace_secrets row stays + // unset. This regression would manifest as "Save → Restart → + // workspace still says No LLM provider configured." + it("PUTs the new provider to /workspaces/:id/provider on Save", async () => { + wireApi({ + workspaceRuntime: "hermes", + workspaceModel: "nousresearch/hermes-4-70b", + configYamlContent: "name: ws\nruntime: hermes\n", + providerValue: "", + }); + apiPut.mockResolvedValue({ status: "saved", provider: "anthropic" }); + + render(); + const input = await screen.findByTestId("provider-input"); + + fireEvent.change(input, { target: { value: "anthropic" } }); + expect((input as HTMLInputElement).value).toBe("anthropic"); + + const saveBtn = screen.getByRole("button", { name: /^save$/i }); + fireEvent.click(saveBtn); + + await waitFor(() => { + const providerCalls = apiPut.mock.calls.filter(([path]) => path === "/workspaces/ws-test/provider"); + expect(providerCalls.length).toBe(1); + expect(providerCalls[0][1]).toEqual({ provider: "anthropic" }); + }); + }); + + // No-change Save must NOT PUT /provider. The server-side SetProvider + // auto-restarts the workspace on every successful PUT — re-writing + // an unchanged value would cost the user a ~30s reboot every time + // they tweak some other field. + it("does not PUT /provider when the value is unchanged", async () => { + wireApi({ + workspaceRuntime: "hermes", + workspaceModel: "nousresearch/hermes-4-70b", + configYamlContent: "name: ws\nruntime: hermes\ntier: 2\n", + providerValue: "openrouter", + }); + apiPut.mockResolvedValue({}); + + render(); + await screen.findByTestId("provider-input"); + + // Click Save without touching the provider field. 
Trigger another + // dirty-marker (tier change) so Save is enabled — the test is + // about NOT touching /provider, not about Save being disabled. + const tierSelect = screen.getByLabelText(/tier/i) as HTMLSelectElement; + fireEvent.change(tierSelect, { target: { value: "3" } }); + + const saveBtn = screen.getByRole("button", { name: /^save$/i }); + fireEvent.click(saveBtn); + + await waitFor(() => { + // Some PUT(s) may fire (e.g. /model). Just assert /provider is NOT among them. + const providerCalls = apiPut.mock.calls.filter(([path]) => path === "/workspaces/ws-test/provider"); + expect(providerCalls.length).toBe(0); + }); + }); + + // The dropdown's suggestion list MUST come from the runtime's own + // template (via /templates → runtime_config.providers), not a + // hardcoded canvas-side enum. This is the "Native + pluggable + // runtime" invariant: a new runtime declaring its own provider + // taxonomy in its config.yaml gets a working dropdown without ANY + // canvas-side change. + // + // Pinned by checking that suggestions surfaced in the datalist + // exactly mirror what the templates endpoint returned for the + // matching runtime. If a future contributor reintroduces a + // PROVIDER_SUGGESTIONS-style hardcoded list and the datalist + // contents don't follow the template, this test fails. + it("populates the provider datalist from the matched runtime's templates entry", async () => { + wireApi({ + workspaceRuntime: "hermes", + workspaceModel: "nousresearch/hermes-4-70b", + configYamlContent: "name: ws\nruntime: hermes\n", + providerValue: "", + templates: [ + { + id: "hermes", + name: "Hermes", + runtime: "hermes", + models: [], + // The provider list every runtime adapter ships in its own + // config.yaml. Canvas must surface THIS, not its own list. 
+ providers: ["nous", "openrouter", "anthropic", "minimax-cn"], + }, + ], + }); + + render(); + const input = await screen.findByTestId("provider-input"); + const listId = (input as HTMLInputElement).getAttribute("list"); + expect(listId).toBeTruthy(); + await waitFor(() => { + const datalist = document.getElementById(listId!); + expect(datalist).not.toBeNull(); + const optionValues = Array.from(datalist!.querySelectorAll("option")).map( + (o) => (o as HTMLOptionElement).value, + ); + // Order matters — most-common-first is part of the contract so + // the demo flow lands on a working choice without scrolling. + expect(optionValues).toEqual(["nous", "openrouter", "anthropic", "minimax-cn"]); + }); + }); + + // Fallback path: when a template hasn't migrated to the explicit + // `providers:` field yet, suggestions are derived from model slug + // prefixes. Still adapter-driven (the slugs come from the template's + // `models:` list), just inferred. This keeps existing templates + // working while the platform team migrates them one at a time. + it("falls back to model-slug prefixes when the runtime ships no providers list", async () => { + wireApi({ + workspaceRuntime: "hermes", + workspaceModel: "anthropic:claude-opus-4-7", + configYamlContent: "name: ws\nruntime: hermes\n", + providerValue: "", + templates: [ + { + id: "hermes", + name: "Hermes", + runtime: "hermes", + models: [ + { id: "anthropic:claude-opus-4-7" }, + { id: "openai:gpt-4o" }, + { id: "anthropic:claude-sonnet-4-5" }, // dup vendor — must dedupe + { id: "nousresearch/hermes-4-70b" }, // "/" separator + ], + // No `providers:` field → fallback derivation kicks in. 
+ }, + ], + }); + + render(); + const input = await screen.findByTestId("provider-input"); + const listId = (input as HTMLInputElement).getAttribute("list"); + expect(listId).toBeTruthy(); + await waitFor(() => { + const datalist = document.getElementById(listId!); + const optionValues = Array.from(datalist!.querySelectorAll("option")).map( + (o) => (o as HTMLOptionElement).value, + ); + // Order = first-appearance from models[]; dedup keeps anthropic + // once even though two model slugs use it. + expect(optionValues).toEqual(["anthropic", "openai", "nousresearch"]); + }); + }); + + // Empty string is a legitimate save target — it clears the override + // (the server-side endpoint deletes the workspace_secrets row). + // Operators who picked "anthropic" yesterday and want to revert to + // auto-derive today should be able to do so by clearing the field + // and clicking Save. Without this PUT path, the only way to clear + // would be a direct DB edit. + it("PUTs an empty string when the operator clears a previously-set provider", async () => { + wireApi({ + workspaceRuntime: "hermes", + workspaceModel: "anthropic:claude-opus-4-7", + configYamlContent: "name: ws\nruntime: hermes\n", + providerValue: "openrouter", + }); + apiPut.mockResolvedValue({ status: "cleared" }); + + render(); + const input = await screen.findByTestId("provider-input"); + await waitFor(() => expect((input as HTMLInputElement).value).toBe("openrouter")); + + fireEvent.change(input, { target: { value: "" } }); + + const saveBtn = screen.getByRole("button", { name: /^save$/i }); + fireEvent.click(saveBtn); + + await waitFor(() => { + const providerCalls = apiPut.mock.calls.filter(([path]) => path === "/workspaces/ws-test/provider"); + expect(providerCalls.length).toBe(1); + expect(providerCalls[0][1]).toEqual({ provider: "" }); + }); + }); +}); diff --git a/workspace-server/internal/handlers/templates.go b/workspace-server/internal/handlers/templates.go index e33c06d6..1279a524 100644 --- 
a/workspace-server/internal/handlers/templates.go +++ b/workspace-server/internal/handlers/templates.go @@ -59,6 +59,16 @@ type templateSummary struct { // preflight uses this as the fallback provider when `models` is empty // so provider picker stays data-driven instead of hardcoded in the UI. RequiredEnv []string `json:"required_env,omitempty"` + // Providers is the runtime's own list of supported provider slugs, + // sourced from runtime_config.providers in the template's config.yaml. + // The canvas Config tab surfaces this as the Provider override + // dropdown (Option B PR-5). Data-driven so each runtime owns its own + // taxonomy — hermes-agent supports 20+ providers; claude-code only + // "anthropic"; gemini-cli only "gemini" — and a future runtime with + // a different vendor list doesn't need a canvas edit. Empty list → + // canvas falls back to deriving suggestions from `models[].id` slug + // prefixes (still adapter-driven, just inferred). + Providers []string `json:"providers,omitempty"` Skills []string `json:"skills"` SkillCount int `json:"skill_count"` // ProvisionTimeoutSeconds lets a slow runtime declare its expected @@ -100,6 +110,7 @@ func (h *TemplatesHandler) List(c *gin.Context) { Model string `yaml:"model"` Models []modelSpec `yaml:"models"` RequiredEnv []string `yaml:"required_env"` + Providers []string `yaml:"providers"` ProvisionTimeoutSeconds int `yaml:"provision_timeout_seconds"` } `yaml:"runtime_config"` } @@ -122,6 +133,7 @@ func (h *TemplatesHandler) List(c *gin.Context) { Model: model, Models: raw.RuntimeConfig.Models, RequiredEnv: raw.RuntimeConfig.RequiredEnv, + Providers: raw.RuntimeConfig.Providers, Skills: raw.Skills, SkillCount: len(raw.Skills), ProvisionTimeoutSeconds: raw.RuntimeConfig.ProvisionTimeoutSeconds, diff --git a/workspace-server/internal/handlers/templates_test.go b/workspace-server/internal/handlers/templates_test.go index e40c6b16..6b85715c 100644 --- a/workspace-server/internal/handlers/templates_test.go +++ 
b/workspace-server/internal/handlers/templates_test.go @@ -197,6 +197,117 @@ skills: [] } } +// TestTemplatesList_SurfacesProviders pins the Option B PR-5 wiring: +// /templates must echo runtime_config.providers from the template's +// config.yaml into the JSON response. Canvas reads this list to +// populate the Provider override dropdown WITHOUT hardcoding any +// provider taxonomy on the frontend — that's the "data-driven from +// adapter" invariant. +// +// If a future yaml-tag rename or struct edit drops the field, every +// runtime would silently fall back to model-prefix derivation. For +// hermes specifically (default model has no clean prefix), that +// degrades the dropdown to empty and reintroduces the "No LLM +// provider configured" UX gap from 2026-05-01. +func TestTemplatesList_SurfacesProviders(t *testing.T) { + setupTestDB(t) + setupTestRedis(t) + + tmpDir := t.TempDir() + tmplDir := filepath.Join(tmpDir, "hermes-prov") + if err := os.MkdirAll(tmplDir, 0755); err != nil { + t.Fatalf("mkdir: %v", err) + } + configYaml := `name: Hermes +description: test +tier: 2 +runtime: hermes +runtime_config: + model: nousresearch/hermes-4-70b + providers: + - nous + - openrouter + - anthropic +skills: [] +` + if err := os.WriteFile(filepath.Join(tmplDir, "config.yaml"), []byte(configYaml), 0644); err != nil { + t.Fatalf("write: %v", err) + } + + handler := NewTemplatesHandler(tmpDir, nil) + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest("GET", "/templates", nil) + handler.List(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", w.Code) + } + var resp []templateSummary + if err := json.Unmarshal(w.Body.Bytes(), &resp); err != nil { + t.Fatalf("parse: %v", err) + } + if len(resp) != 1 { + t.Fatalf("expected 1 template, got %d", len(resp)) + } + got := resp[0] + want := []string{"nous", "openrouter", "anthropic"} + if len(got.Providers) != len(want) { + t.Fatalf("Providers: want %v, got %v", 
want, got.Providers) + } + for i, p := range want { + if got.Providers[i] != p { + t.Errorf("Providers[%d]: want %q, got %q", i, p, got.Providers[i]) + } + } + + // Cross-check the JSON wire shape directly — canvas reads the field + // as `providers` (lowercase) and a struct-tag rename here would + // break consumers without surfacing in the typed assertions above. + if !strings.Contains(w.Body.String(), `"providers":["nous","openrouter","anthropic"]`) { + t.Errorf("response missing providers JSON field: %s", w.Body.String()) + } +} + +// TestTemplatesList_OmitsProvidersWhenAbsent pins the omitempty +// behavior — older templates that haven't migrated to +// runtime_config.providers yet must NOT emit `providers: null` (which +// would break canvas's array-typed parser). A template that simply +// omits the field stays absent in the response and canvas falls back +// to deriving suggestions from model-slug prefixes. +func TestTemplatesList_OmitsProvidersWhenAbsent(t *testing.T) { + setupTestDB(t) + setupTestRedis(t) + + tmpDir := t.TempDir() + tmplDir := filepath.Join(tmpDir, "no-prov") + if err := os.MkdirAll(tmplDir, 0755); err != nil { + t.Fatalf("mkdir: %v", err) + } + configYaml := `name: Legacy +runtime: langgraph +runtime_config: + model: anthropic:claude-opus-4-7 +skills: [] +` + if err := os.WriteFile(filepath.Join(tmplDir, "config.yaml"), []byte(configYaml), 0644); err != nil { + t.Fatalf("write: %v", err) + } + + handler := NewTemplatesHandler(tmpDir, nil) + w := httptest.NewRecorder() + c, _ := gin.CreateTestContext(w) + c.Request = httptest.NewRequest("GET", "/templates", nil) + handler.List(c) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", w.Code) + } + if strings.Contains(w.Body.String(), `"providers":`) { + t.Errorf("response should omit providers when template has none, got: %s", w.Body.String()) + } +} + func TestTemplatesList_LegacyTopLevelModel(t *testing.T) { // Older templates (pre-runtime_config) declared `model:` at the 
top level. // The /templates endpoint should keep surfacing those for backward compat. From 2e8892ebc414673d9219c2f60934d36b07155237 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Fri, 1 May 2026 11:47:53 -0700 Subject: [PATCH 15/61] fix(workspace): surface errno + path on chat-upload mkdir failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Production incident on hongming.moleculesai.app 2026-05-01T18:30Z — fresh-tenant signup chat upload returned 500 with the body {"error":"failed to prepare uploads dir"}. Diagnosis required SSM access to the workspace stderr to recover errno + actual path. The root-cause fix lives in claude-code template entrypoint (molecule-ai-workspace-template-claude-code#23 — pre-create the .molecule subtree as root before gosu drops to agent). This change is the diagnostic improvement: when mkdir fails for any reason in the future (EACCES, ENOSPC, EROFS, etc.), the response carries the errno + offending path so the operator inspecting browser devtools sees the real cause without needing SSM. Backwards compatible — top-level "error" key is unchanged so existing canvas / external alert rules continue to match. New fields are additive: path, errno, detail. Test pins the diagnostic shape so a future struct refactor can't silently drop these fields. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- workspace/internal_chat_uploads.py | 19 ++++++++- workspace/tests/test_internal_chat_uploads.py | 42 +++++++++++++++++++ 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/workspace/internal_chat_uploads.py b/workspace/internal_chat_uploads.py index 65a389de..396c1ac8 100644 --- a/workspace/internal_chat_uploads.py +++ b/workspace/internal_chat_uploads.py @@ -170,8 +170,25 @@ async def ingest_handler(request: Request) -> JSONResponse: try: Path(CHAT_UPLOAD_DIR).mkdir(parents=True, exist_ok=True) except OSError as exc: + # Surface errno + path in the response so a fresh-tenant + # "failed to prepare uploads dir" 500 self-diagnoses without + # requiring SSM access to the workspace stderr. Prior incident + # 2026-05-01: hongming.moleculesai.app hit EACCES on the + # /workspace volume's `.molecule` subtree (root-owned race + # window between Docker volume create and entrypoint's chown, + # fixed via molecule-ai-workspace-template-claude-code#23). + # The errno + path are not security-sensitive — both are + # well-known to anyone with workspace access. 
logger.error("internal_chat_uploads: mkdir %s failed: %s", CHAT_UPLOAD_DIR, exc) - return JSONResponse({"error": "failed to prepare uploads dir"}, status_code=500) + return JSONResponse( + { + "error": "failed to prepare uploads dir", + "path": CHAT_UPLOAD_DIR, + "errno": exc.errno, + "detail": str(exc), + }, + status_code=500, + ) response_files: list[dict] = [] total_bytes = 0 diff --git a/workspace/tests/test_internal_chat_uploads.py b/workspace/tests/test_internal_chat_uploads.py index c3de859c..d386de65 100644 --- a/workspace/tests/test_internal_chat_uploads.py +++ b/workspace/tests/test_internal_chat_uploads.py @@ -222,6 +222,48 @@ def test_per_file_oversize_returns_413(client: TestClient, monkeypatch: pytest.M assert "exceeds per-file limit" in r.json()["error"] +# Pins the diagnostic shape of the 500 returned when the upload +# directory cannot be created. Prior to this fix, the response was +# {"error": "failed to prepare uploads dir"} only — opaque to the +# operator inspecting browser devtools, requiring SSM access to the +# workspace stderr to recover errno + actual path. Surfacing both in +# the response body makes the failure self-diagnosing the next time +# this class of bug recurs (e.g. EACCES on a root-owned `.molecule` +# subtree, ENOSPC on a full disk, EROFS on a read-only mount). +# +# Reproduces the failure by pointing CHAT_UPLOAD_DIR at a path whose +# parent the agent user can't write to. The exact errno in the test +# is 13 (EACCES) on a chmod-0 dir; values are not asserted exactly +# because they vary by OS / errno mapping. The PRESENCE of errno + +# path is what's pinned — drift on those keys breaks the operator +# diagnostic loop. +def test_mkdir_failure_returns_errno_and_path(client: TestClient, chat_uploads_dir: Path, monkeypatch: pytest.MonkeyPatch): + # Plant a regular FILE where mkdir's parent should be — mkdir + # raises FileExistsError / NotADirectoryError reliably across + # platforms, exercising the OSError catch path. 
+ blocker = chat_uploads_dir.parent / "chat-uploads-blocker" + blocker.write_text("not a dir") + # Repoint CHAT_UPLOAD_DIR to a child path under the regular file + # so mkdir(parents=True, exist_ok=True) raises NotADirectoryError. + monkeypatch.setattr(internal_chat_uploads, "CHAT_UPLOAD_DIR", str(blocker / "child")) + + r = client.post( + "/internal/chat/uploads/ingest", + files={"files": ("a.txt", b"x")}, + headers={"Authorization": "Bearer test-secret"}, + ) + assert r.status_code == 500, r.text + body = r.json() + # Backwards-compatible top-level error keeps existing canvas / + # external alert rules matching. + assert body.get("error") == "failed to prepare uploads dir" + # New diagnostic fields — operator can now see WHAT path failed + # and WHY without SSM access. + assert body.get("path") == str(blocker / "child") + assert isinstance(body.get("errno"), int) and body["errno"] != 0 + assert "detail" in body and isinstance(body["detail"], str) and body["detail"] + + def test_total_request_body_oversize_returns_413(client: TestClient, monkeypatch: pytest.MonkeyPatch): """Header-side total cap. Set the limit BELOW the actual body and confirm we reject before parsing multipart.""" From 6d23611620daee68fab48992ee433545e18ea859 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Fri, 1 May 2026 12:04:30 -0700 Subject: [PATCH 16/61] ops: demo-day freeze + rollback runbook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Demo-day preparation bundle for the funding demo (~2026-05-06). Adds: - scripts/demo-freeze.sh — captures current ghcr.io workspace-template-* :latest digests for all 8 runtimes, then disables both cascade vectors that could re-tag :latest mid-demo: publish-runtime.yml in molecule-core (PATH 1 — staging push to workspace/** auto-bumps the wheel and fans out to 8 templates) and publish-image.yml in each of the 8 template repos (PATH 2 — direct template repo merge re-tags :latest). 
Defaults to dry-run; requires --execute to apply. Writes both digest + workflow receipts to scripts/demo-freeze-snapshots/. - scripts/demo-thaw.sh — re-enables every workflow demo-freeze.sh disabled, keyed off the receipt timestamp. Defaults to executing (the inverse safety polarity from freeze, where the destructive default is dry-run). --dry-run prints without applying. - scripts/demo-day-runbook.md — operator runbook indexing the six rollback levers (platform image rollback, template image rollback, tenant redeploy, workspace delete, Railway rollback, Vercel rollback) plus pre-warm timing and post-demo cleanup. Also covers read-only diagnostics for "is this working?" moments and the CP_ADMIN_API_TOKEN rotation step that must follow demo (the token gets copy-pasted into shells during incident response). - scripts/demo-freeze-snapshots/.gitignore — generated freeze receipts are operational state, not source. Tracked .gitkeep so the directory exists when the script writes to it. Both scripts dry-run-tested locally. Did not exercise --execute since that would actually disable production workflows mid-development. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/demo-day-runbook.md | 306 +++++++++++++++++++++++ scripts/demo-freeze-snapshots/.gitignore | 6 + scripts/demo-freeze-snapshots/.gitkeep | 0 scripts/demo-freeze.sh | 214 ++++++++++++++++ scripts/demo-thaw.sh | 124 +++++++++ 5 files changed, 650 insertions(+) create mode 100644 scripts/demo-day-runbook.md create mode 100644 scripts/demo-freeze-snapshots/.gitignore create mode 100644 scripts/demo-freeze-snapshots/.gitkeep create mode 100755 scripts/demo-freeze.sh create mode 100755 scripts/demo-thaw.sh diff --git a/scripts/demo-day-runbook.md b/scripts/demo-day-runbook.md new file mode 100644 index 00000000..ff4847ce --- /dev/null +++ b/scripts/demo-day-runbook.md @@ -0,0 +1,306 @@ +# Demo-day runbook + +Pre-, during-, and post-demo operational procedures for the molecule +production stack. 
Updated 2026-05-01 ahead of the funding-demo on +~2026-05-06. + +The whole stack: + +``` +Vercel canvas (app.moleculesai.app) + → Railway controlplane (api.moleculesai.app) + → CloudFront/Cloudflare per-tenant edge (.moleculesai.app) + → EC2 tenant instance running platform container + → Docker workspaces pulled from + ghcr.io/molecule-ai/workspace-template-:latest +``` + +Every layer has its own deploy/rollback story. This runbook indexes +them in the order an operator would touch them during an incident. + +## Pre-demo (T-48h to T-1h) + +### 1. Freeze the runtime + template image cascade + +A merge to `molecule-core/staging` that touches `workspace/**` triggers +`publish-runtime.yml` → PyPI bump → repository_dispatch → 8 template +repos rebuild and re-tag `:latest`. A merge to any template repo's +`main` triggers the same final re-tag directly. Either path means a +new workspace provision during the demo pulls whatever `:latest` +resolved to seconds earlier. + +Capture current good digests + disable both cascade vectors: + +```bash +# Dry-run first — verifies digests can be fetched and tooling is set up +scripts/demo-freeze.sh + +# Apply +scripts/demo-freeze.sh --execute +``` + +The script writes two receipts to `scripts/demo-freeze-snapshots/`: + +- `digests-.txt` — current `:latest` digest per template (rollback target if needed) +- `disabled-workflows-.txt` — workflow paths to re-enable post-demo + +Verify the freeze landed: + +```bash +gh workflow list -R Molecule-AI/molecule-core | grep publish-runtime +# expect: status = disabled_manually +``` + +If a critical fix MUST ship during the freeze window: + +1. `gh workflow enable publish-runtime.yml -R Molecule-AI/molecule-core` +2. Merge the fix +3. Watch the cascade through to GHCR:latest manually +4. Smoke-verify against a staging tenant (`scripts/api-smoke.sh` or + manual canvas walkthrough) +5. 
`gh workflow disable publish-runtime.yml -R Molecule-AI/molecule-core` to re-freeze + +Don't auto-promote during the freeze — the value of the freeze is that +nothing happens automatically. + +### 2. Confirm production CP is on the expected SHA + +```bash +gh run list -R Molecule-AI/molecule-controlplane --branch main --limit 5 +# Last `ci` run should be SUCCESS with the SHA you intend to demo on +``` + +Railway auto-deploys from main. Spot-check `api.moleculesai.app`: + +```bash +curl -fsS -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \ + https://api.moleculesai.app/cp/admin/orgs?limit=1 +# Expect: 200 + a JSON {"orgs": [...]} +``` + +### 3. Confirm production canvas (Vercel) is on main + +Vercel auto-deploys `main`. Verify in the Vercel dashboard the most +recent prod deploy ran from the expected commit SHA. + +### 4. Pre-warm the demo tenant + +Cold-start times on workspace-template images: + +| Runtime | Cold-start (first boot) | +|---|---| +| claude-code | ~30-60s | +| openclaw | ~1-2 min | +| langgraph | ~1 min | +| hermes | **~7 min** (large image) | + +If the demo will use `hermes`, provision the demo workspace at least +10 min before. The cold-start clock starts when the workspace is +created, not when it's used. + +## During demo — emergency rollback levers + +### Lever A: Platform-image rollback (canvas/CP layer regression) + +If the canvas or platform container shipped a regression, retag +`:latest` to a prior staging SHA without rebuilding: + +```bash +# Find a known-good SHA from staging history +gh run list -R Molecule-AI/molecule-core --workflow=publish-canvas-image.yml --limit 5 + +# Roll both platform + tenant images +GITHUB_TOKEN=$(gh auth token) scripts/rollback-latest.sh +``` + +`rollback-latest.sh` retags both `ghcr.io/molecule-ai/platform:latest` +and `ghcr.io/molecule-ai/platform-tenant:latest`. Existing tenants +auto-pull `:latest` every 5 min — rollback propagates without manual +restart. 
+
+### Lever B: Workspace-template image rollback
+
+If a specific runtime template (claude-code, hermes, etc.) shipped a
+broken `:latest`:
+
+```bash
+# Get the demo's snapshotted-good digest from the freeze receipt
+grep claude-code scripts/demo-freeze-snapshots/digests-<ts>.txt
+
+# Retag :latest back to the snapshotted digest using crane
+crane auth login ghcr.io -u "$(gh api user --jq .login)" \
+  --password-stdin <<< "$(gh auth token)"
+crane tag \
+  ghcr.io/molecule-ai/workspace-template-claude-code@sha256:<digest> \
+  latest
+```
+
+The next workspace provision pulls the rolled-back image. Existing
+workspaces are unaffected (their image is already loaded into Docker).
+
+### Lever C: Wedged demo tenant — redeploy
+
+If the demo tenant's EC2 instance is wedged (boot succeeded but app
+not responding, or a stuck workspace), the controlplane has an admin
+redeploy endpoint:
+
+```bash
+# AWS-side: forces a fresh EC2 launch with current image. ~3 min.
+curl -fsS -X POST \
+  -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
+  https://api.moleculesai.app/cp/admin/orgs/<slug>/redeploy
+```
+
+WARNING per memory: this triggers real EC2 + SSM actions on production.
+Double-check `<slug>` against the demo tenant's slug before pressing
+return. The `/redeploy` endpoint is idempotent on the EC2 side but
+WILL drop active SSH sessions.
+
+### Lever D: Specific bad workspace — delete
+
+If a single workspace inside the demo tenant is misbehaving (e.g.
+hermes wedged on cold-start, claude-code returning the generic +"Agent error (Exception)" message), kill it: + +```bash +# Get the demo tenant's per-tenant ADMIN_TOKEN +TENANT_ADMIN=$(curl -fsS -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \ + https://api.moleculesai.app/cp/admin/orgs//admin-token \ + | jq -r .admin_token) + +ORG_ID=$(curl -fsS -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \ + https://api.moleculesai.app/cp/admin/orgs?limit=20 \ + | jq -r '.orgs[] | select(.slug=="") | .id') + +# Delete the bad workspace +curl -fsS -X DELETE \ + -H "Origin: https://.moleculesai.app" \ + -H "Authorization: Bearer $TENANT_ADMIN" \ + -H "X-Molecule-Org-Id: $ORG_ID" \ + https://.moleculesai.app/workspaces/ +``` + +Then re-provision a fresh workspace from the canvas. Faster than +debugging the wedged one. + +### Lever E: Railway production rollback (CP regression) + +If the last Railway deploy of CP introduced a regression that lever A +can't fix (e.g. a logic bug, not a container issue): + +1. Open Railway dashboard → molecule-platform → controlplane → Deployments +2. Find the previous-known-good deployment +3. Click **Rollback to this deployment** + +Manual step — no CLI equivalent built. Takes ~30s to redeploy from +the prior image. Note: rollback restores the prior code AND prior env +var snapshot; don't expect any env var changes made since to persist. + +### Lever F: Vercel production rollback (canvas regression) + +If the canvas ships a regression: + +1. Open Vercel dashboard → molecule-app → Deployments +2. Find the previous prod deployment +3. **Promote to Production** + +Same pattern as Railway — fast revert, no rebuild. + +## Tenant-level read-only diagnostics (not actions) + +Useful during a "is this working?" 
moment without touching anything:
+
+```bash
+# Tenant infra state
+curl -fsS -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
+  "https://api.moleculesai.app/cp/admin/orgs?limit=20" \
+  | jq '.orgs[] | select(.slug=="<slug>")'
+
+# Tenant boot events (debug a stuck provision)
+curl -fsS -H "Authorization: Bearer $CP_ADMIN_API_TOKEN" \
+  "https://api.moleculesai.app/cp/admin/tenants/<tenant-id>/boot-events?limit=50" \
+  | jq
+
+# Workspace activity (debug an unresponsive agent)
+curl -fsS \
+  -H "Origin: https://<tenant>.moleculesai.app" \
+  -H "Authorization: Bearer $TENANT_ADMIN" \
+  -H "X-Molecule-Org-Id: $ORG_ID" \
+  "https://<tenant>.moleculesai.app/workspaces/<workspace-id>/activity?limit=20" \
+  | jq
+```
+
+## Post-demo (T+30m to T+24h)
+
+### 1. Thaw the cascades
+
+```bash
+# Find the freeze receipt
+ls scripts/demo-freeze-snapshots/
+
+# Thaw — pass the timestamp suffix
+scripts/demo-thaw.sh 20260506-180000
+```
+
+The next merge to `molecule-core/staging` (workspace/**) or any
+template repo's `main` will resume the auto-rebuild cascade.
+
+### 2. Audit what was held back
+
+If any merges queued during the freeze:
+
+```bash
+gh pr list -R Molecule-AI/molecule-core --base staging --state merged \
+  --search "merged:>=$(date -u -v-7d +%Y-%m-%d)"
+```
+
+Verify each merge's CI is green and dispatch the runtime cascade once
+to ensure all templates rebuild against the post-freeze HEAD.
+
+### 3.
File a post-mortem if anything fired + +If any rollback lever was used during the demo, file a brief doc: + +- Which lever (A through F) +- Which SHA was rolled back FROM and TO +- Did the rollback fully resolve the issue or was a follow-up needed +- Whether the underlying regression should have been caught by CI + +## Common issues + first-line fix + +| Symptom | First lever to try | +|---|---| +| Workspace boots but agent always errors | Lever D (delete + reprovision) | +| Whole tenant unreachable | Lever C (redeploy) | +| Canvas crashes on load | Lever F (Vercel rollback) | +| Login broken / API errors | Lever E (Railway rollback) | +| Specific runtime broken across tenants | Lever B (template image rollback) | +| Platform container regression | Lever A (rollback-latest.sh) | +| Mid-demo stray PR auto-published a bad image | Lever B + investigate why freeze didn't catch it | + +## Auth fingerprint (rotate post-demo) + +The freeze + rollback procedures assume: + +- `CP_ADMIN_API_TOKEN` available via `railway variables --kv --environment production` +- `gh auth token` returns a working PAT with `workflow:write` + `write:packages` +- `crane` installed (`brew install crane`) + +After the demo, **rotate** `CP_ADMIN_API_TOKEN` (it's the keys-to-the-kingdom +token for production) — it likely got copy-pasted into shells during +the demo. 
+ +```bash +# Generate a new admin token +NEW_TOKEN=$(openssl rand -hex 32) + +# Update Railway production env var (and optionally staging) +railway variables --set CP_ADMIN_API_TOKEN="$NEW_TOKEN" --environment production + +# Restart CP service to pick up the change +# (Railway auto-restarts on env var change) + +# Verify +curl -fsS -H "Authorization: Bearer $NEW_TOKEN" \ + https://api.moleculesai.app/cp/admin/orgs?limit=1 +``` diff --git a/scripts/demo-freeze-snapshots/.gitignore b/scripts/demo-freeze-snapshots/.gitignore new file mode 100644 index 00000000..50692299 --- /dev/null +++ b/scripts/demo-freeze-snapshots/.gitignore @@ -0,0 +1,6 @@ +# Generated by scripts/demo-freeze.sh — receipts are operational state, +# not source. Tracked .gitignore + .gitkeep keep the directory itself +# in version control so the freeze script's output dir always exists. +* +!.gitignore +!.gitkeep diff --git a/scripts/demo-freeze-snapshots/.gitkeep b/scripts/demo-freeze-snapshots/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/scripts/demo-freeze.sh b/scripts/demo-freeze.sh new file mode 100755 index 00000000..be7b176b --- /dev/null +++ b/scripts/demo-freeze.sh @@ -0,0 +1,214 @@ +#!/usr/bin/env bash +# demo-freeze.sh — disable the runtime + template image publish cascades +# during a demo-prep window so a stray staging merge can't auto-rebuild +# `:latest` for the 8 workspace-template images mid-demo. +# +# Demo prep typically runs T-48h to T+1h. 
During that window: +# +# PATH 1: any merge to molecule-core/staging that touches workspace/** +# → publish-runtime.yml fires +# → PyPI auto-bumps molecule-ai-workspace-runtime patch version +# → repository_dispatch fans out to 8 workspace-template-* repos +# → each template repo rebuilds and re-tags +# ghcr.io/molecule-ai/workspace-template-:latest +# +# PATH 2: any merge to a workspace-template-* repo's main branch +# → that repo's publish-image.yml fires +# → ghcr.io/molecule-ai/workspace-template-:latest +# gets re-tagged +# +# provisioner.go:296 RuntimeImages[runtime] reads `:latest` at every +# workspace boot. A new workspace provision during demo pulls whatever +# `:latest` resolved to seconds earlier — so a bad merge minutes +# before the demo can break a tenant the funder is about to see. +# +# This script captures the current good `:latest` digests for all 8 +# templates and disables both cascade vectors. The complementary +# demo-thaw.sh re-enables them. +# +# Usage: +# scripts/demo-freeze.sh # dry run — print what would happen +# scripts/demo-freeze.sh --execute # actually disable workflows + snapshot +# +# Prereqs: +# - gh CLI authenticated with workflow:write scope on Molecule-AI org +# - curl + jq (for digest snapshot via GHCR anonymous registry API) +# +# Output: +# /digests-YYYYMMDD-HHMMSS.txt +# One line per template: ": " +# /disabled-workflows-YYYYMMDD-HHMMSS.txt +# One line per disabled workflow: ": " +# +# Exit codes: +# 0 — freeze complete (or dry-run successful) +# 1 — pre-flight failure (missing tooling, missing auth, etc.) +# 2 — partial freeze (some workflows did not disable cleanly; see log) + +set -euo pipefail + +usage() { + cat <<'USAGE' +demo-freeze.sh — disable the runtime + template image publish cascades +during a demo-prep window. + +Captures current :latest digests for all 8 workspace-template-* images +and disables the workflows that would otherwise re-tag them. 
+ +Usage: + scripts/demo-freeze.sh # dry run — print what would happen + scripts/demo-freeze.sh --execute # actually disable workflows + snapshot + +See the comment block at the top of this script for the full procedure. +USAGE +} + +EXECUTE=0 +case "${1:-}" in + --execute) + EXECUTE=1 + ;; + --help|-h) + usage + exit 0 + ;; + "") + ;; + *) + echo "unknown arg: $1" >&2 + usage >&2 + exit 2 + ;; +esac + +# Templates and their GHCR repository slugs. Source of truth for the +# runtime → image map is workspace-server/internal/provisioner/provisioner.go +# RuntimeImages — keep this list in sync if a runtime is added. +TEMPLATES=( + "claude-code" + "hermes" + "openclaw" + "langgraph" + "deepagents" + "crewai" + "autogen" + "gemini-cli" +) + +# Pre-flight: required tooling. +need() { + command -v "$1" >/dev/null || { echo "ERROR: missing required tool: $1" >&2; exit 1; } +} +need gh +need curl +need jq + +# Pre-flight: gh auth. Snapshot via anonymous GHCR token works without +# org auth, but workflow disable needs an authenticated gh. +if ! gh auth status >/dev/null 2>&1; then + echo "ERROR: gh not authenticated. Run 'gh auth login' first." >&2 + exit 1 +fi + +# Snapshot location relative to this script. Keeping it under scripts/ +# rather than a temp dir means freeze receipts are easy to find again +# during the actual demo. +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +SNAPSHOT_DIR="${SCRIPT_DIR}/demo-freeze-snapshots" +mkdir -p "$SNAPSHOT_DIR" +TS="$(date -u +%Y%m%d-%H%M%S)" +DIGESTS_FILE="${SNAPSHOT_DIR}/digests-${TS}.txt" +WORKFLOWS_FILE="${SNAPSHOT_DIR}/disabled-workflows-${TS}.txt" + +if [ $EXECUTE -eq 0 ]; then + echo "=== DRY RUN (no changes will be made; pass --execute to apply) ===" +else + echo "=== EXECUTING FREEZE — workflows will be disabled ===" +fi +echo "Snapshot timestamp: $TS" +echo "Digest log: $DIGESTS_FILE" +echo "Workflow log: $WORKFLOWS_FILE" +echo + +# Step 1: capture current :latest digest for each template. 
+echo "→ Capturing current :latest digests" +for tpl in "${TEMPLATES[@]}"; do + token=$(curl -fsS "https://ghcr.io/token?scope=repository:molecule-ai/workspace-template-${tpl}:pull" | jq -r .token 2>/dev/null || true) + if [ -z "$token" ] || [ "$token" = "null" ]; then + echo " WARN: token fetch failed for $tpl — skipping digest capture" + continue + fi + digest=$(curl -fsSI \ + -H "Authorization: Bearer $token" \ + -H "Accept: application/vnd.oci.image.index.v1+json" \ + -H "Accept: application/vnd.docker.distribution.manifest.v2+json" \ + "https://ghcr.io/v2/molecule-ai/workspace-template-${tpl}/manifests/latest" 2>/dev/null \ + | grep -i 'docker-content-digest' \ + | awk '{print $2}' \ + | tr -d '\r') + if [ -z "$digest" ]; then + echo " WARN: digest fetch failed for $tpl" + continue + fi + echo " $tpl: $digest" + if [ $EXECUTE -eq 1 ]; then + echo "$tpl: $digest" >> "$DIGESTS_FILE" + fi +done +echo + +# Step 2: disable publish-runtime.yml in molecule-core (PATH 1 source). +echo "→ Disabling publish-runtime.yml in molecule-core (kills runtime → 8-template cascade)" +if [ $EXECUTE -eq 1 ]; then + if gh workflow disable publish-runtime.yml -R Molecule-AI/molecule-core 2>/tmp/freeze.err; then + echo " OK molecule-core/publish-runtime.yml disabled" + echo "Molecule-AI/molecule-core: publish-runtime.yml" >> "$WORKFLOWS_FILE" + else + echo " FAIL molecule-core/publish-runtime.yml: $(cat /tmp/freeze.err)" >&2 + fi +else + echo " (dry-run) would disable: gh workflow disable publish-runtime.yml -R Molecule-AI/molecule-core" +fi +echo + +# Step 3: disable publish-image.yml in each of the 8 template repos (PATH 2 sources). 
+echo "→ Disabling publish-image.yml in each workspace-template-* repo" +PARTIAL_FAIL=0 +for tpl in "${TEMPLATES[@]}"; do + repo="Molecule-AI/molecule-ai-workspace-template-${tpl}" + if [ $EXECUTE -eq 1 ]; then + if gh workflow disable publish-image.yml -R "$repo" 2>/tmp/freeze.err; then + echo " OK $repo/publish-image.yml disabled" + echo "${repo}: publish-image.yml" >> "$WORKFLOWS_FILE" + else + echo " FAIL $repo/publish-image.yml: $(cat /tmp/freeze.err)" >&2 + PARTIAL_FAIL=1 + fi + else + echo " (dry-run) would disable: gh workflow disable publish-image.yml -R $repo" + fi +done +echo + +if [ $EXECUTE -eq 0 ]; then + echo "=== DRY RUN COMPLETE ===" + echo "Re-run with --execute to apply the freeze." + exit 0 +fi + +echo "=== FREEZE COMPLETE ===" +echo "Receipts: $DIGESTS_FILE" +echo " $WORKFLOWS_FILE" +echo +echo "Next steps:" +echo " - Verify by running: gh workflow list -R Molecule-AI/molecule-core | grep publish-runtime" +echo " Status should be 'disabled_manually'." +echo " - Demo proceeds; new workspaces pull the snapshotted :latest digests." +echo " - Post-demo, run: scripts/demo-thaw.sh ${TS}" +echo " to re-enable every workflow this freeze disabled." +echo +if [ $PARTIAL_FAIL -ne 0 ]; then + echo "WARNING: one or more workflows did not disable cleanly. Re-run after fixing." >&2 + exit 2 +fi +exit 0 diff --git a/scripts/demo-thaw.sh b/scripts/demo-thaw.sh new file mode 100755 index 00000000..35469c6e --- /dev/null +++ b/scripts/demo-thaw.sh @@ -0,0 +1,124 @@ +#!/usr/bin/env bash +# demo-thaw.sh — re-enable workflows that demo-freeze.sh disabled. +# +# Usage: +# scripts/demo-thaw.sh +# scripts/demo-thaw.sh 20260503-180000 +# +# Reads disabled-workflows-.txt produced by demo-freeze.sh and +# runs `gh workflow enable` for each entry. Idempotent — re-enabling +# an already-enabled workflow is a no-op. +# +# Defaults to executing (the inverse of freeze, which defaults to +# dry-run). Pass --dry-run to print without executing. 
+# +# Prereqs: +# - gh CLI authenticated with workflow:write scope on Molecule-AI org +# +# Exit codes: +# 0 — all workflows re-enabled +# 1 — pre-flight failure (missing receipt file, missing tooling) +# 2 — partial thaw (some workflows did not enable; check output) + +set -euo pipefail + +usage() { + cat <<'USAGE' +demo-thaw.sh — re-enable workflows that demo-freeze.sh disabled. + +Usage: + scripts/demo-thaw.sh # apply + scripts/demo-thaw.sh --dry-run # print without applying + +ts is the YYYYMMDD-HHMMSS suffix on +scripts/demo-freeze-snapshots/disabled-workflows-*.txt produced by +demo-freeze.sh. +USAGE +} + +DRY_RUN=0 +TS="" +for arg in "$@"; do + case "$arg" in + --dry-run) + DRY_RUN=1 + ;; + --help|-h) + usage + exit 0 + ;; + *) + if [ -z "$TS" ]; then + TS="$arg" + else + echo "unknown arg: $arg" >&2 + usage >&2 + exit 2 + fi + ;; + esac +done + +if [ -z "$TS" ]; then + echo "usage: $0 [--dry-run]" >&2 + echo " e.g. $0 20260503-180000" >&2 + echo " ts is the YYYYMMDD-HHMMSS suffix on demo-freeze-snapshots/disabled-workflows-*.txt" >&2 + exit 2 +fi + +command -v gh >/dev/null || { echo "ERROR: gh CLI required" >&2; exit 1; } +if ! gh auth status >/dev/null 2>&1; then + echo "ERROR: gh not authenticated. Run 'gh auth login' first." >&2 + exit 1 +fi + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +WORKFLOWS_FILE="${SCRIPT_DIR}/demo-freeze-snapshots/disabled-workflows-${TS}.txt" + +if [ ! 
-f "$WORKFLOWS_FILE" ]; then + echo "ERROR: receipt not found: $WORKFLOWS_FILE" >&2 + echo "Available receipts:" >&2 + ls "${SCRIPT_DIR}/demo-freeze-snapshots/" 2>/dev/null | grep '^disabled-workflows-' >&2 || echo " (none)" >&2 + exit 1 +fi + +if [ $DRY_RUN -eq 1 ]; then + echo "=== DRY RUN (no changes will be made) ===" +else + echo "=== THAWING — re-enabling workflows ===" +fi +echo "Reading: $WORKFLOWS_FILE" +echo + +PARTIAL_FAIL=0 +while IFS=': ' read -r repo workflow; do + [ -z "$repo" ] && continue + if [ $DRY_RUN -eq 1 ]; then + echo " (dry-run) would enable: gh workflow enable $workflow -R $repo" + else + if gh workflow enable "$workflow" -R "$repo" 2>/tmp/thaw.err; then + echo " OK $repo/$workflow re-enabled" + else + echo " FAIL $repo/$workflow: $(cat /tmp/thaw.err)" >&2 + PARTIAL_FAIL=1 + fi + fi +done < "$WORKFLOWS_FILE" + +echo +if [ $DRY_RUN -eq 1 ]; then + echo "=== DRY RUN COMPLETE ===" + echo "Re-run without --dry-run to apply." + exit 0 +fi + +echo "=== THAW COMPLETE ===" +echo "Cascades restored. Next workspace/** push to molecule-core/staging will" +echo "auto-publish the runtime wheel and fan out to template rebuilds as normal." +if [ $PARTIAL_FAIL -ne 0 ]; then + echo + echo "WARNING: one or more workflows did not re-enable cleanly. Re-run or enable manually:" >&2 + echo " gh workflow list -R " >&2 + exit 2 +fi +exit 0 From e1496936e9a2c82cc383185cd4b3c8307b8a86c3 Mon Sep 17 00:00:00 2001 From: Hongming Wang Date: Fri, 1 May 2026 12:45:20 -0700 Subject: [PATCH 17/61] feat(canvas): dynamic provider dropdown in CreateWorkspaceDialog MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirrors the data-driven pattern PR #2454 set in ConfigTab: read runtime_config.providers from /templates and filter the modal's provider so an operator can only pick a +// provider the template actually supports. 
+interface TemplateSpec { + id: string; + name?: string; + runtime?: string; + providers?: string[]; +} + interface HermesProvider { id: string; label: string; @@ -55,6 +68,13 @@ export function CreateWorkspaceButton() { const [creating, setCreating] = useState(false); const [error, setError] = useState(null); const [workspaces, setWorkspaces] = useState([]); + // Templates fetched from /api/templates — drives the dynamic provider + // filter below. Same data source ConfigTab uses (PR #2454). When the + // selected template declares `runtime_config.providers` in its + // config.yaml, the modal surfaces only those providers in the + // . Better to over-show than to lock the user out. + return filtered.length > 0 ? filtered : HERMES_PROVIDERS; + }, [selectedTemplateSpec]); + + // If the currently-selected provider is filtered out by a template + // change, snap back to the first available. Without this, the + // hermesProvider state could refer to a provider not in the dropdown + // — confusing UI + the API key field's envVar would be wrong. + useEffect(() => { + if (!isHermes) return; + if (availableProviders.length === 0) return; + if (!availableProviders.some((p) => p.id === hermesProvider)) { + setHermesProvider(availableProviders[0].id); + } + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [availableProviders, isHermes]); + // Auto-fill hermesModel with the provider's defaultModel whenever the // provider changes, but only if the user hasn't already typed their own // slug. Prevents the empty-model → "auto" → Anthropic-default 401 trap. @@ -163,6 +229,10 @@ export function CreateWorkspaceButton() { .get("/workspaces") .then((ws) => setWorkspaces(ws)) .catch(() => {}); + api + .get("/templates") + .then((rows) => setTemplateSpecs(Array.isArray(rows) ? rows : [])) + .catch(() => { /* keep empty — HERMES_PROVIDERS fallback below */ }); // defaultTier is stable for the session (derived from window.location), // safe to omit from deps. 
// eslint-disable-next-line react-hooks/exhaustive-deps @@ -405,7 +475,7 @@ export function CreateWorkspaceButton() { aria-label="Hermes provider" className="w-full bg-zinc-800/60 border border-zinc-700/50 rounded-lg px-3 py-2 text-sm text-zinc-100 focus:outline-none focus:border-violet-500/60 focus:ring-1 focus:ring-violet-500/20 transition-colors" > - {HERMES_PROVIDERS.map((p) => ( + {availableProviders.map((p) => ( diff --git a/canvas/src/components/__tests__/CreateWorkspaceDialog.test.tsx b/canvas/src/components/__tests__/CreateWorkspaceDialog.test.tsx index dd207743..4d441436 100644 --- a/canvas/src/components/__tests__/CreateWorkspaceDialog.test.tsx +++ b/canvas/src/components/__tests__/CreateWorkspaceDialog.test.tsx @@ -190,6 +190,91 @@ describe("CreateWorkspaceDialog — Hermes provider picker", () => { expect(ids).toContain("hermes"); }); + // Pins the dynamic-providers behavior: when the matched template's + // /templates row declares `providers`, the dropdown filters to that + // subset instead of showing the full HERMES_PROVIDERS catalog. Same + // data source ConfigTab uses (PR #2454) — keeps the modal and the + // settings tab honest about which providers a template supports. + it("hermes provider dropdown filters to template-declared providers when /templates ships them", async () => { + // Per-URL mock: /workspaces returns the existing fixture, /templates + // returns a hermes row that only allows anthropic + minimax + openai. 
+ mockGet.mockImplementation(async (url: string) => { + if (url === "/templates") { + return [ + { id: "hermes", name: "Hermes", runtime: "hermes", providers: ["anthropic", "minimax", "openai"] }, + // eslint-disable-next-line @typescript-eslint/no-explicit-any + ] as any; + } + // eslint-disable-next-line @typescript-eslint/no-explicit-any + return SAMPLE_WORKSPACES as any; + }); + + await openDialog(); + await setTemplate("hermes"); + await waitFor(() => + expect(document.querySelector("[data-testid='hermes-provider-section']")).toBeTruthy() + ); + const providerSelect = document.getElementById("hermes-provider-select") as HTMLSelectElement; + // Filtered list arrives async after /templates fetch resolves — + // keep waiting until the dropdown shrinks below the full catalog. + await waitFor(() => expect(providerSelect.options.length).toBe(3)); + const ids = Array.from(providerSelect.options).map((o) => o.value); + expect(ids).toEqual(expect.arrayContaining(["anthropic", "minimax", "openai"])); + expect(ids).not.toContain("gemini"); + expect(ids).not.toContain("deepseek"); + }); + + // Back-compat: a template that hasn't migrated to runtime_config.providers + // (older templates, self-hosted setups without /templates server) keeps + // showing the full provider catalog. Operators picking from those + // templates can't be locked out of providers we know hermes supports. + it("hermes provider dropdown falls back to all providers when template declares no providers list", async () => { + mockGet.mockImplementation(async (url: string) => { + if (url === "/templates") { + // No `providers` field — empty/missing → fall back to full catalog. 
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any + return [{ id: "hermes", name: "Hermes", runtime: "hermes" }] as any; + } + // eslint-disable-next-line @typescript-eslint/no-explicit-any + return SAMPLE_WORKSPACES as any; + }); + + await openDialog(); + await setTemplate("hermes"); + await waitFor(() => + expect(document.querySelector("[data-testid='hermes-provider-section']")).toBeTruthy() + ); + const providerSelect = document.getElementById("hermes-provider-select") as HTMLSelectElement; + expect(providerSelect.options.length).toBe(HERMES_PROVIDERS.length); + }); + + // Defensive: a template's declared list with NO matches against our + // static catalog (e.g. a brand-new provider id we don't have label/ + // envVar metadata for yet) must not render an empty setModel(e.target.value)} + placeholder="e.g. minimax/MiniMax-M2.7" + aria-label="Model slug" + autoComplete="off" + spellCheck={false} + list="provider-picker-model-suggestions" + className="w-full bg-zinc-900 border border-zinc-600 rounded px-2 py-1.5 text-[11px] text-zinc-100 font-mono focus:outline-none focus:border-blue-500 focus:ring-1 focus:ring-blue-500/20 transition-colors" + /> + + {modelSuggestions?.map((m) => ( + +

+ Slug determines provider routing at install time. +

+
+ )}
Provider @@ -303,12 +400,28 @@ function ProviderPickerModal({
{entry.key}
{entry.saved && ( - - - Saved - +
+ + + Saved + + {/* Allow override when the saved state came from a + pre-configured global secret — the user may want + to use a different key for this workspace. */} + {configuredKeys?.has(entry.key) && ( + + )} +
)} @@ -364,8 +477,12 @@ function ProviderPickerModal({ Cancel Deploy + @@ -95,6 +119,7 @@ function makeTemplate(over: Partial